/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 645 - (hide annotations) (download)
Sun Jul 31 17:02:18 2011 UTC (22 months, 2 weeks ago) by ph10
File MIME type: text/plain
File size: 194381 byte(s)
Pass *MARK name to callouts

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 ph10 625 /* Values for setting in md->match_function_type to indicate two special types
61     of call to match(). We do it this way to save on using another stack variable,
62 ph10 604 as stack usage is to be discouraged. */
63 nigel 77
64 ph10 604 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65     #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
74 ph10 210 negative to avoid the external error codes. */
75    
/* Of these, the backtracking-verb handlers near the top of match() produce the
following when the remainder of the pattern fails to match:

  MATCH_COMMIT    returned by the OP_COMMIT handler
  MATCH_PRUNE     returned by the OP_PRUNE and OP_PRUNE_ARG handlers
  MATCH_SKIP      returned by the OP_SKIP handler, with the current subject
                    position stored in md->start_match_ptr; also returned by
                    the OP_MARK handler when a MATCH_SKIP_ARG name equals the
                    mark's own name
  MATCH_SKIP_ARG  returned by the OP_SKIP_ARG handler, with a pointer to the
                    skip name stored in md->start_match_ptr
  MATCH_THEN      returned by the OP_THEN and OP_THEN_ARG handlers, with
                    md->start_match_ptr set to ecode - GET(ecode, 1)
*/

76 ph10 511 #define MATCH_ACCEPT (-999)
77     #define MATCH_COMMIT (-998)
78 ph10 604 #define MATCH_KETRPOS (-997)
79 ph10 618 #define MATCH_ONCE (-996)
80     #define MATCH_PRUNE (-995)
81     #define MATCH_SKIP (-994)
82     #define MATCH_SKIP_ARG (-993)
83     #define MATCH_THEN (-992)
84 ph10 210
85 ph10 510 /* This is a convenience macro for code that occurs many times. */
86    
/* MRRETURN(ra) copies the current value of markptr into md->mark and then
leaves through RRETURN(ra). It can therefore be used only where both "md" and
"markptr" are in scope. The expansion is a brace-enclosed block, not a single
expression statement. */

87     #define MRRETURN(ra) \
88     { \
89     md->mark = markptr; \
90     RRETURN(ra); \
91     }
92    
93 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
94     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95     because the offset vector is always a multiple of 3 long. */
96    
97     #define REC_STACK_SAVE_MAX 30
98    
99     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100    
101     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103    
104    
105    
#ifdef PCRE_DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Write a run of bytes to stdout for debugging. Bytes for which isprint() is
true are written as themselves; all others are written as \xhh escapes. When
the bytes come from the subject string, the count is cut back if necessary so
that printing stops at the end of the subject.

Arguments:
  p           points to the first byte to print
  length      number of bytes requested
  is_subject  TRUE if p points into the subject, in which case the count is
                limited to the bytes that remain before md->end_subject
  md          pointer to the match data block; it is dereferenced only when
                is_subject is TRUE

Returns:      nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
unsigned int ch;

/* Clamp the count to what is left of the subject, if so requested. */

if (is_subject)
  {
  if (md->end_subject - p < length) length = md->end_subject - p;
  }

for (; length > 0; length--)
  {
  ch = *p++;
  if (isprint(ch))
    printf("%c", ch);
  else
    printf("\\x%02x", ch);
  }
}
#endif
132    
133    
134    
135     /*************************************************
136     * Match a back-reference *
137     *************************************************/
138    
139 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
140     negative, so the match always fails. However, in JavaScript compatibility mode,
141 ph10 625 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 ph10 595 subject bytes matched may be different to the number of reference bytes.
143 nigel 77
144     Arguments:
145     offset index into the offset vector
146 ph10 595 eptr pointer into the subject
147     length length of reference to be matched (number of bytes)
148 nigel 77 md points to match data block
149 ph10 602 caseless TRUE if caseless
150 nigel 77
151 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 nigel 77 */
153    
154 ph10 595 static int
155 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 ph10 602 BOOL caseless)
157 nigel 77 {
158 ph10 595 USPTR eptr_start = eptr;
159     register USPTR p = md->start_subject + md->offset_vector[offset];
160 nigel 77
161 ph10 475 #ifdef PCRE_DEBUG
162 nigel 77 if (eptr >= md->end_subject)
163     printf("matching subject <null>");
164     else
165     {
166     printf("matching subject ");
167     pchars(eptr, length, TRUE, md);
168     }
169     printf(" against backref ");
170     pchars(p, length, FALSE, md);
171     printf("\n");
172     #endif
173    
174 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
175 nigel 77
176 ph10 595 if (length < 0) return -1;
177 nigel 77
178 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179     properly if Unicode properties are supported. Otherwise, we can check only
180     ASCII characters. */
181 nigel 77
182 ph10 602 if (caseless)
183 nigel 77 {
184 ph10 354 #ifdef SUPPORT_UTF8
185     #ifdef SUPPORT_UCP
186     if (md->utf8)
187     {
188 ph10 625 /* Match characters up to the end of the reference. NOTE: the number of
189 ph10 595 bytes matched may differ, because there are some characters whose upper and
190     lower case versions code as different numbers of bytes. For example, U+023A
191     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 ph10 625 the latter. It is important, therefore, to check the length along the
194 ph10 595 reference, not along the subject (earlier code did this wrong). */
195 ph10 625
196 ph10 595 USPTR endptr = p + length;
197     while (p < endptr)
198 ph10 354 {
199 ph10 358 int c, d;
200 ph10 597 if (eptr >= md->end_subject) return -1;
201 ph10 354 GETCHARINC(c, eptr);
202     GETCHARINC(d, p);
203 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 ph10 358 }
205     }
206 ph10 354 else
207     #endif
208     #endif
209    
210     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211     is no UCP support. */
212 ph10 597 {
213 ph10 625 if (eptr + length > md->end_subject) return -1;
214 ph10 597 while (length-- > 0)
215     { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 ph10 625 }
217 nigel 77 }
218 ph10 358
219 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
220     are in UTF-8 mode. */
221 ph10 358
222 nigel 77 else
223 ph10 625 {
224     if (eptr + length > md->end_subject) return -1;
225     while (length-- > 0) if (*p++ != *eptr++) return -1;
226 ph10 597 }
227 nigel 77
228 ph10 595 return eptr - eptr_start;
229 nigel 77 }
230    
231    
232    
233     /***************************************************************************
234     ****************************************************************************
235     RECURSION IN THE match() FUNCTION
236    
237 nigel 87 The match() function is highly recursive, though not every recursive call
238     increases the recursive depth. Nevertheless, some regular expressions can cause
239     it to recurse to a great depth. I was writing for Unix, so I just let it call
240     itself recursively. This uses the stack for saving everything that has to be
241     saved for a recursive call. On Unix, the stack can be large, and this works
242     fine.
243 nigel 77
244 nigel 87 It turns out that on some non-Unix-like systems there are problems with
245     programs that use a lot of stack. (This despite the fact that every last chip
246     has oodles of memory these days, and techniques for extending the stack have
247     been known for decades.) So....
248 nigel 77
249     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250     calls by keeping local variables that need to be preserved in blocks of memory
251 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
252 nigel 77 achieve this so that the actual code doesn't look very different to what it
253     always used to.
254 ph10 164
255 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
256 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
257     Switzer, the use of longjmp() has been abolished, at the cost of having to
258     provide a unique number for each call to RMATCH. There is no way of generating
259     a sequence of numbers at compile time in C. I have given them names, to make
260     them stand out more clearly.
261    
262     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
265     don't have indeterminate values; this has meant that the frame size can be
266 ph10 164 reduced because the result can be "passed back" by straight setting of the
267     variable instead of being passed in the frame.
268 nigel 77 ****************************************************************************
269     ***************************************************************************/
270    
271 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272     below must be updated in sync. */
273 nigel 77
/* One of these names is passed as the final ("rw") argument of every RMATCH
call. The NO_RECURSE version of RMATCH stores it in the Xwhere field of the
calling frame and also pastes it into a label name (L_RM1, L_RM2, ...) placed
immediately after the simulated call. The versions of RMATCH that really
recurse ignore the argument. Numbering starts at 1. */

274 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
275     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
276     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
277     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
278 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
279 ph10 527 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
280 ph10 623 RM61, RM62, RM63 };
281 ph10 164
282 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
283 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 ph10 501 actually used in this definition. */
285 nigel 77
286     #ifndef NO_RECURSE
287     #define REGISTER register
288 ph10 164
289 ph10 475 #ifdef PCRE_DEBUG
/* Debugging RMATCH: call match() recursively, leaving the result in rrc, and
print the source line number before and after the call. The values of mstart
and markptr are taken from the enclosing scope rather than from the argument
list, and the depth passed down is rdepth+1. The "rw" argument is not used.
The expansion is a brace-enclosed block. */
290 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 nigel 87 { \
292     printf("match() called in line %d\n", __LINE__); \
293 ph10 604 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 nigel 87 printf("to line %d\n", __LINE__); \
295     }
/* Debugging RRETURN: print the value and the source line, then return the
value. Note that "ra" appears twice in the expansion, so it is evaluated
twice. */
296     #define RRETURN(ra) \
297     { \
298     printf("match() returned %d from line %d ", ra, __LINE__); \
299     return ra; \
300     }
301     #else
/* Production RMATCH: a plain recursive call whose result is left in rrc. As
in the debugging version, mstart and markptr come from the enclosing scope,
the depth passed down is rdepth+1, and "rw" is not used. Unlike the debugging
version, the expansion is a bare assignment expression with no braces. */
302 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
303     rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
/* Production RRETURN: an ordinary return statement (without its semicolon). */
304 nigel 77 #define RRETURN(ra) return ra
305 nigel 87 #endif
306    
307 nigel 77 #else
308    
309    
310 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
311     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312     argument of match(), which never changes. */
313 nigel 77
314     #define REGISTER
315    
/* Heap version of RMATCH: simulate a recursive call to match() without using
the C stack. The steps are:

  . get a new heapframe from pcre_stack_malloc(); if that fails, leave through
    RRETURN with PCRE_ERROR_NOMEMORY;
  . record "rw" in the Xwhere field of the current (calling) frame;
  . fill in the argument fields of the new frame from ra, rb, rc and re, and
    from the calling frame's mstart and markptr, with a depth one greater than
    the caller's;
  . chain the new frame to the caller via Xprevframe and make it current;
  . jump to HEAP_RECURSE.

The label L_<rw> that follows the jump marks the point at which the calling
code carries on afterwards; the jump back to it is not made by this macro
(presumably it is made by the code at HEAP_RETURN - confirm there). The "rd"
argument is not used. */

316 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 nigel 77 {\
318 ph10 563 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 ph10 534 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 ph10 164 frame->Xwhere = rw; \
321     newframe->Xeptr = ra;\
322     newframe->Xecode = rb;\
323 ph10 168 newframe->Xmstart = mstart;\
324 ph10 501 newframe->Xmarkptr = markptr;\
325 ph10 164 newframe->Xoffset_top = rc;\
326 ph10 602 newframe->Xeptrb = re;\
327 ph10 164 newframe->Xrdepth = frame->Xrdepth + 1;\
328     newframe->Xprevframe = frame;\
329     frame = newframe;\
330     DPRINTF(("restarting from line %d\n", __LINE__));\
331     goto HEAP_RECURSE;\
332     L_##rw:\
333     DPRINTF(("jumped back to line %d\n", __LINE__));\
334 nigel 77 }
335    
/* Heap version of RRETURN: simulate a return from match(). The current frame
is unchained and handed to pcre_stack_free(), and its predecessor becomes the
current frame. If there is a predecessor, the value is put in rrc and control
goes to HEAP_RETURN; if there is none, this was the top-level frame and the
value is returned from the C function for real.

NOTE(review): "ra" is expanded only after "frame" has been moved back to the
previous frame, so an argument that names a frame-resident variable would be
read from the caller's frame rather than the one just freed. Check that no
call site depends on the freed frame's copy. */

336     #define RRETURN(ra)\
337     {\
338 ph10 527 heapframe *oldframe = frame;\
339     frame = oldframe->Xprevframe;\
340     (pcre_stack_free)(oldframe);\
341 nigel 77 if (frame != NULL)\
342     {\
343 ph10 164 rrc = ra;\
344     goto HEAP_RETURN;\
345 nigel 77 }\
346     return ra;\
347     }
348    
349    
350     /* Structure for remembering the local variables in a private frame */
351    
/* One heapframe is allocated at entry to match() and one more by each RMATCH
when NO_RECURSE is defined. Inside match() the ordinary variable names (eptr,
ecode, length, ...) are #defined to refer to the X-prefixed fields of the
current frame. Frames are chained through Xprevframe; match() sets it to NULL
in the frame it allocates at entry, to mark the top level. */

352     typedef struct heapframe {
353     struct heapframe *Xprevframe;
354    
355     /* Function arguments that may change */
356    
357 ph10 409 USPTR Xeptr;
358 nigel 77 const uschar *Xecode;
359 ph10 409 USPTR Xmstart;
360 ph10 501 USPTR Xmarkptr;
361 nigel 77 int Xoffset_top;
362     eptrblock *Xeptrb;
363 nigel 91 unsigned int Xrdepth;
364 nigel 77
365     /* Function local variables */
366    
367 ph10 409 USPTR Xcallpat;
368 ph10 406 #ifdef SUPPORT_UTF8
369 ph10 409 USPTR Xcharptr;
370 ph10 406 #endif
371 ph10 409 USPTR Xdata;
372     USPTR Xnext;
373     USPTR Xpp;
374     USPTR Xprev;
375     USPTR Xsaved_eptr;
376 nigel 77
377     recursion_info Xnew_recursive;
378    
379     BOOL Xcur_is_word;
380     BOOL Xcondition;
381     BOOL Xprev_is_word;
382    
383     #ifdef SUPPORT_UCP
384     int Xprop_type;
385 nigel 87 int Xprop_value;
386 nigel 77 int Xprop_fail_result;
387 ph10 123 int Xoclength;
388     uschar Xocchars[8];
389 nigel 77 #endif
390    
391 ph10 403 int Xcodelink;
392 nigel 77 int Xctype;
393 nigel 93 unsigned int Xfc;
394 nigel 77 int Xfi;
395     int Xlength;
396     int Xmax;
397     int Xmin;
398     int Xnumber;
399     int Xoffset;
400     int Xop;
401     int Xsave_capture_last;
402     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403     int Xstacksave[REC_STACK_SAVE_MAX];
404    
405     eptrblock Xnewptrb;
406    
407 ph10 164 /* Where to jump back to */
/* This is the RMn value that RMATCH stores before starting a simulated call
from this frame. */
408 nigel 77
409 ph10 164 int Xwhere;
410 ph10 165
411 nigel 77 } heapframe;
412    
413     #endif
414    
415    
416     /***************************************************************************
417     ***************************************************************************/
418    
419    
420    
421     /*************************************************
422     * Match from current position *
423     *************************************************/
424    
425 nigel 93 /* This function is called recursively in many circumstances. Whenever it
426 nigel 77 returns a negative (error) response, the outer incarnation must also return the
427 ph10 426 same response. */
428 nigel 77
429 ph10 426 /* These macros pack up tests that are used for partial matching, and which
430     appear several times in the code. We set the "hit end" flag if the pointer is
431     at the end of the subject and also past the start of the subject (i.e.
432 ph10 427 something has been matched). For hard partial matching, we then return
433     immediately. The second one is used when we already know we are past the end of
434     the subject. */
435 ph10 426
/* CHECK_PARTIAL(): if partial matching is enabled (md->partial != 0), eptr is
at or beyond md->end_subject, and eptr is beyond md->start_used_ptr, set
md->hitend; then, if md->partial is greater than 1, leave match() through
MRRETURN with PCRE_ERROR_PARTIAL. The expansion is a bare "if" statement that
uses md, eptr and (via MRRETURN) markptr from the enclosing scope. */

436     #define CHECK_PARTIAL()\
437 ph10 553 if (md->partial != 0 && eptr >= md->end_subject && \
438     eptr > md->start_used_ptr) \
439     { \
440     md->hitend = TRUE; \
441     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 ph10 427 }
443 ph10 426
/* SCHECK_PARTIAL(): the same as CHECK_PARTIAL() except that the comparison of
eptr with md->end_subject is omitted. */

444     #define SCHECK_PARTIAL()\
445 ph10 553 if (md->partial != 0 && eptr > md->start_used_ptr) \
446     { \
447     md->hitend = TRUE; \
448     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 ph10 427 }
450 ph10 426
451 ph10 427
452 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
453     the md structure (e.g. utf8, end_subject) into individual variables to improve
454 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455     made performance worse.
456    
457     Arguments:
458 nigel 93 eptr pointer to current character in subject
459     ecode pointer to current position in compiled code
460 ph10 168 mstart pointer to the current match start position (can be modified
461 ph10 172 by encountering \K)
462 ph10 501 markptr pointer to the most recent MARK name, or NULL
463 nigel 77 offset_top current top pointer
464     md pointer to "static" info for the match
465     eptrb pointer to chain of blocks containing eptr at start of
466     brackets - for testing for empty matches
467 nigel 87 rdepth the recursion depth
468 nigel 77
469     Returns: MATCH_MATCH if matched ) these values are >= 0
470     MATCH_NOMATCH if failed to match )
471 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 nigel 87 (e.g. stopped by repeated call or recursion limit)
474 nigel 77 */
475    
476     static int
477 ph10 510 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 ph10 625 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 ph10 604 unsigned int rdepth)
480 nigel 77 {
481     /* These variables do not need to be preserved over recursion in this function,
482 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
483     "register" because they are used a lot in loops. */
484 nigel 77
485 nigel 91 register int rrc; /* Returns from recursive calls */
486     register int i; /* Used for loops not involving calls to RMATCH() */
487 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489 nigel 77
490 nigel 93 BOOL minimize, possessive; /* Quantifier options */
491 ph10 602 BOOL caseless;
492 ph10 403 int condcode;
493 nigel 93
494 nigel 77 /* When recursion is not being used, all "local" variables that have to be
495     preserved over calls to RMATCH() are part of a "frame" which is obtained from
496     heap storage. Set up the top-level frame here; others are obtained from the
497     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498    
499     #ifdef NO_RECURSE
500 ph10 563 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
503    
504     /* Copy in the original argument variables */
505    
506     frame->Xeptr = eptr;
507     frame->Xecode = ecode;
508 ph10 168 frame->Xmstart = mstart;
509 ph10 501 frame->Xmarkptr = markptr;
510 nigel 77 frame->Xoffset_top = offset_top;
511     frame->Xeptrb = eptrb;
512 nigel 87 frame->Xrdepth = rdepth;
513 nigel 77
514     /* This is where control jumps back to to effect "recursion" */
515    
516     HEAP_RECURSE:
517    
518     /* Macros make the argument variables come from the current frame */
519    
520     #define eptr frame->Xeptr
521     #define ecode frame->Xecode
522 ph10 168 #define mstart frame->Xmstart
523 ph10 501 #define markptr frame->Xmarkptr
524 nigel 77 #define offset_top frame->Xoffset_top
525     #define eptrb frame->Xeptrb
526 nigel 87 #define rdepth frame->Xrdepth
527 nigel 77
528     /* Ditto for the local variables */
529    
530     #ifdef SUPPORT_UTF8
531     #define charptr frame->Xcharptr
532     #endif
533     #define callpat frame->Xcallpat
534 ph10 403 #define codelink frame->Xcodelink
535 nigel 77 #define data frame->Xdata
536     #define next frame->Xnext
537     #define pp frame->Xpp
538     #define prev frame->Xprev
539     #define saved_eptr frame->Xsaved_eptr
540    
541     #define new_recursive frame->Xnew_recursive
542    
543     #define cur_is_word frame->Xcur_is_word
544     #define condition frame->Xcondition
545     #define prev_is_word frame->Xprev_is_word
546    
547     #ifdef SUPPORT_UCP
548     #define prop_type frame->Xprop_type
549 nigel 87 #define prop_value frame->Xprop_value
550 nigel 77 #define prop_fail_result frame->Xprop_fail_result
551 ph10 115 #define oclength frame->Xoclength
552     #define occhars frame->Xocchars
553 nigel 77 #endif
554    
555     #define ctype frame->Xctype
556     #define fc frame->Xfc
557     #define fi frame->Xfi
558     #define length frame->Xlength
559     #define max frame->Xmax
560     #define min frame->Xmin
561     #define number frame->Xnumber
562     #define offset frame->Xoffset
563     #define op frame->Xop
564     #define save_capture_last frame->Xsave_capture_last
565     #define save_offset1 frame->Xsave_offset1
566     #define save_offset2 frame->Xsave_offset2
567     #define save_offset3 frame->Xsave_offset3
568     #define stacksave frame->Xstacksave
569    
570     #define newptrb frame->Xnewptrb
571    
572     /* When recursion is being used, local variables are allocated on the stack and
573     get preserved during recursion in the normal way. In this environment, fi and
574     i, and fc and c, can be the same variables. */
575    
576 nigel 93 #else /* NO_RECURSE not defined */
577 nigel 77 #define fi i
578     #define fc c
579    
580 ph10 604 /* Many of the following variables are used only in small blocks of the code.
581     My normal style of coding would have declared them within each of those blocks.
582     However, in order to accommodate the version of this code that uses an external
583     "stack" implemented on the heap, it is easier to declare them all here, so the
584     declarations can be cut out in a block. The only declarations within blocks
585     below are for variables that do not have to be preserved over a recursive call
586     to RMATCH(). */
587 nigel 77
588 ph10 625 #ifdef SUPPORT_UTF8
589     const uschar *charptr;
590     #endif
591     const uschar *callpat;
592     const uschar *data;
593     const uschar *next;
594     USPTR pp;
595     const uschar *prev;
596     USPTR saved_eptr;
597    
598     recursion_info new_recursive;
599    
600     BOOL cur_is_word;
601 nigel 87 BOOL condition;
602 nigel 77 BOOL prev_is_word;
603    
604     #ifdef SUPPORT_UCP
605     int prop_type;
606 nigel 87 int prop_value;
607 nigel 77 int prop_fail_result;
608 ph10 115 int oclength;
609     uschar occhars[8];
610 nigel 77 #endif
611    
612 ph10 399 int codelink;
613 nigel 77 int ctype;
614     int length;
615     int max;
616     int min;
617     int number;
618     int offset;
619     int op;
620     int save_capture_last;
621     int save_offset1, save_offset2, save_offset3;
622     int stacksave[REC_STACK_SAVE_MAX];
623    
624     eptrblock newptrb;
625 nigel 93 #endif /* NO_RECURSE */
626 nigel 77
627 ph10 625 /* To save space on the stack and in the heap frame, I have doubled up on some
628     of the local variables that are used only in localised parts of the code, but
629     still need to be preserved over recursive calls of match(). These macros define
630 ph10 604 the alternative names that are used. */
631    
632     #define allow_zero cur_is_word
633     #define cbegroup condition
634     #define code_offset codelink
635     #define condassert condition
636     #define matched_once prev_is_word
637    
638 nigel 77 /* These statements are here to stop the compiler complaining about uninitialized
639     variables. */
640    
641     #ifdef SUPPORT_UCP
642 nigel 87 prop_value = 0;
643 nigel 77 prop_fail_result = 0;
644     #endif
645    
646 nigel 93
647 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
648     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649     used. Thanks to Ian Taylor for noticing this possibility and sending the
650     original patch. */
651    
652     TAIL_RECURSE:
653    
654 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
655     are specified by the macro RMATCH and RRETURN is used to return. When
656     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
659     complicated macro. It has to be used in one particular way. This shouldn't,
660     however, impact performance when true recursion is being used. */
661 nigel 77
662 ph10 164 #ifdef SUPPORT_UTF8
663     utf8 = md->utf8; /* Local copy of the flag */
664     #else
665     utf8 = FALSE;
666     #endif
667    
668 nigel 87 /* First check that we haven't called match() too many times, or that we
669     haven't exceeded the recursive call limit. */
670    
671 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673 nigel 77
674 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
675 ph10 625 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676     done this way to save having to use another function argument, which would take
677 ph10 604 up space on the stack. See also MATCH_CONDASSERT below.
678 nigel 77
679 ph10 604 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680     such remembered pointers, to be checked when we hit the closing ket, in order
681     to break infinite loops that match no characters. When match() is called in
682     other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683     NOT be used with tail recursion, because the memory block that is used is on
684     the stack, so a new one may be required for each match(). */
685    
686     if (md->match_function_type == MATCH_CBEGROUP)
687 nigel 77 {
688 ph10 197 newptrb.epb_saved_eptr = eptr;
689     newptrb.epb_prev = eptrb;
690     eptrb = &newptrb;
691 ph10 604 md->match_function_type = 0;
692 nigel 77 }
693    
694 nigel 93 /* Now start processing the opcodes. */
695 nigel 77
696     for (;;)
697     {
698 nigel 93 minimize = possessive = FALSE;
699 nigel 77 op = *ecode;
700 ph10 625
701 nigel 93 switch(op)
702     {
703 ph10 510 case OP_MARK:
704     markptr = ecode + 2;
705     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 ph10 604 eptrb, RM55);
707 ph10 512
708     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709     argument, and we must check whether that argument matches this MARK's
710     argument. It is passed back in md->start_match_ptr (an overloading of that
711     variable). If it does match, we reset that variable to the current subject
712     position and return MATCH_SKIP. Otherwise, pass back the return code
713 ph10 510 unaltered. */
714 ph10 512
715     if (rrc == MATCH_SKIP_ARG &&
716 ph10 510 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717     {
718     md->start_match_ptr = eptr;
719     RRETURN(MATCH_SKIP);
720     }
721    
722 ph10 512 if (md->mark == NULL) md->mark = markptr;
723 ph10 510 RRETURN(rrc);
724    
725 ph10 210 case OP_FAIL:
726 ph10 510 MRRETURN(MATCH_NOMATCH);
727 ph10 211
728 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
729 ph10 553
730 ph10 510 case OP_COMMIT:
731     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 ph10 604 eptrb, RM52);
733 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735     rrc != MATCH_THEN)
736 ph10 551 RRETURN(rrc);
737 ph10 510 MRRETURN(MATCH_COMMIT);
738    
739 ph10 551 /* PRUNE overrides THEN */
740 ph10 553
741 ph10 210 case OP_PRUNE:
742     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 ph10 604 eptrb, RM51);
744 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 ph10 510 MRRETURN(MATCH_PRUNE);
746 ph10 211
747 ph10 510 case OP_PRUNE_ARG:
748     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 ph10 604 eptrb, RM56);
750 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 ph10 510 md->mark = ecode + 2;
752     RRETURN(MATCH_PRUNE);
753 ph10 211
754 ph10 551 /* SKIP overrides PRUNE and THEN */
755 ph10 553
756 ph10 210 case OP_SKIP:
757     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 ph10 604 eptrb, RM53);
759 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 ph10 551 RRETURN(rrc);
761 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
762 ph10 510 MRRETURN(MATCH_SKIP);
763 ph10 211
764 ph10 510 case OP_SKIP_ARG:
765     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 ph10 604 eptrb, RM57);
767 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 ph10 551 RRETURN(rrc);
769 ph10 512
770     /* Pass back the current skip name by overloading md->start_match_ptr and
771     returning the special MATCH_SKIP_ARG return code. This will either be
772     caught by a matching MARK, or get to the top, where it is treated the same
773 ph10 510 as PRUNE. */
774 ph10 512
775 ph10 510 md->start_match_ptr = ecode + 2;
776 ph10 512 RRETURN(MATCH_SKIP_ARG);
777 ph10 553
778 ph10 550 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
779 ph10 553 the alt that is at the start of the current branch. This makes it possible
780     to skip back past alternatives that precede the THEN within the current
781     branch. */
782 ph10 512
783 ph10 210 case OP_THEN:
784     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
785 ph10 604 eptrb, RM54);
786 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
787 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
788 ph10 510 MRRETURN(MATCH_THEN);
789    
790     case OP_THEN_ARG:
791 ph10 553 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
792 ph10 604 offset_top, md, eptrb, RM58);
793 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
795     md->mark = ecode + LINK_SIZE + 2;
796 ph10 212 RRETURN(MATCH_THEN);
797 ph10 211
798 ph10 604 /* Handle a capturing bracket, other than those that are possessive with an
799     unlimited repeat. If there is space in the offset vector, save the current
800     subject position in the working slot at the top of the vector. We mustn't
801     change the current values of the data slot, because they may be set from a
802     previous iteration of this group, and be referred to by a reference inside
803 ph10 625 the group. A failure to match might occur after the group has succeeded,
804 ph10 617 if something later on doesn't match. For this reason, we need to restore
805     the working value and also the values of the final offsets, in case they
806     were set by a previous iteration of the same bracket.
807 nigel 77
808 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
809     a non-capturing bracket. Don't worry about setting the flag for the error
810     case here; that is handled in the code for KET. */
811 nigel 77
812 nigel 93 case OP_CBRA:
813     case OP_SCBRA:
814     number = GET2(ecode, 1+LINK_SIZE);
815 nigel 77 offset = number << 1;
816 ph10 625
817 ph10 475 #ifdef PCRE_DEBUG
818 nigel 93 printf("start bracket %d\n", number);
819     printf("subject=");
820 nigel 77 pchars(eptr, 16, TRUE, md);
821     printf("\n");
822     #endif
823    
824     if (offset < md->offset_max)
825     {
826     save_offset1 = md->offset_vector[offset];
827     save_offset2 = md->offset_vector[offset+1];
828     save_offset3 = md->offset_vector[md->offset_end - number];
829     save_capture_last = md->capture_last;
830    
831     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
832 ph10 531 md->offset_vector[md->offset_end - number] =
833 ph10 530 (int)(eptr - md->start_subject);
834 nigel 77
835 ph10 604 for (;;)
836 nigel 77 {
837 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
838     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
839 ph10 604 eptrb, RM1);
840 ph10 618 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
841 ph10 550 if (rrc != MATCH_NOMATCH &&
842     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
843     RRETURN(rrc);
844 nigel 77 md->capture_last = save_capture_last;
845     ecode += GET(ecode, 1);
846 ph10 625 if (*ecode != OP_ALT) break;
847 nigel 77 }
848    
849     DPRINTF(("bracket %d failed\n", number));
850     md->offset_vector[offset] = save_offset1;
851     md->offset_vector[offset+1] = save_offset2;
852     md->offset_vector[md->offset_end - number] = save_offset3;
853 ph10 625
854     /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or
855 ph10 618 MATCH_THEN. */
856 nigel 77
857 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
858 ph10 618 RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));
859 nigel 77 }
860    
861 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
862     as a non-capturing bracket. */
863 nigel 77
864 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
865     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
866    
867 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
868 nigel 77
869 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
871    
872 ph10 618 /* Non-capturing or atomic group, except for possessive with unlimited
873     repeat. Loop for all the alternatives. When we get to the final alternative
874     within the brackets, we used to return the result of a recursive call to
875     match() whatever happened so it was possible to reduce stack usage by
876     turning this into a tail recursion, except in the case of a possibly empty
877     group. However, now that there is the possibility of (*THEN) occurring in
878 ph10 625 the final alternative, this optimization is no longer possible.
879    
880     MATCH_ONCE is returned when the end of an atomic group is successfully
881     reached, but subsequent matching fails. It passes back up the tree (causing
882     captured values to be reset) until the original atomic group level is
883 ph10 618 reached. This is tested by comparing md->once_target with the start of the
884     group. At this point, the return is converted into MATCH_NOMATCH so that
885     previous backup points can be taken. */
886 nigel 77
887 ph10 618 case OP_ONCE:
888 nigel 93 case OP_BRA:
889     case OP_SBRA:
890     DPRINTF(("start non-capturing bracket\n"));
891 ph10 618
892 nigel 91 for (;;)
893 nigel 77 {
894 ph10 618 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
895 ph10 625 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
896 ph10 604 RM2);
897 ph10 550 if (rrc != MATCH_NOMATCH &&
898     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
899 ph10 625 {
900 ph10 618 if (rrc == MATCH_ONCE)
901     {
902     const uschar *scode = ecode;
903     if (*scode != OP_ONCE) /* If not at start, find it */
904     {
905     while (*scode == OP_ALT) scode += GET(scode, 1);
906     scode -= GET(scode, 1);
907 ph10 625 }
908 ph10 618 if (md->once_target == scode) rrc = MATCH_NOMATCH;
909 ph10 625 }
910 ph10 550 RRETURN(rrc);
911 ph10 625 }
912 nigel 77 ecode += GET(ecode, 1);
913 ph10 625 if (*ecode != OP_ALT) break;
914 nigel 77 }
915 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
916     RRETURN(MATCH_NOMATCH);
917    
918 ph10 625 /* Handle possessive capturing brackets with an unlimited repeat. We come
919 ph10 604 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
920     handled similarly to the normal case above. However, the matching is
921     different. The end of these brackets will always be OP_KETRPOS, which
922     returns MATCH_KETRPOS without going further in the pattern. By this means
923     we can handle the group by iteration rather than recursion, thereby
924     reducing the amount of stack needed. */
925 ph10 625
926 ph10 604 case OP_CBRAPOS:
927     case OP_SCBRAPOS:
928     allow_zero = FALSE;
929 ph10 625
930 ph10 604 POSSESSIVE_CAPTURE:
931     number = GET2(ecode, 1+LINK_SIZE);
932     offset = number << 1;
933    
934     #ifdef PCRE_DEBUG
935     printf("start possessive bracket %d\n", number);
936     printf("subject=");
937     pchars(eptr, 16, TRUE, md);
938     printf("\n");
939     #endif
940    
941     if (offset < md->offset_max)
942     {
943     matched_once = FALSE;
944 ph10 625 code_offset = ecode - md->start_code;
945 ph10 604
946     save_offset1 = md->offset_vector[offset];
947     save_offset2 = md->offset_vector[offset+1];
948     save_offset3 = md->offset_vector[md->offset_end - number];
949     save_capture_last = md->capture_last;
950    
951     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
952 ph10 625
953     /* Each time round the loop, save the current subject position for use
954     when the group matches. For MATCH_MATCH, the group has matched, so we
955     restart it with a new subject starting position, remembering that we had
956     at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
957     usual. If we haven't matched any alternatives in any iteration, check to
958     see if a previous iteration matched. If so, the group has matched;
959     continue from afterwards. Otherwise it has failed; restore the previous
960 ph10 604 capture values before returning NOMATCH. */
961 ph10 625
962 ph10 604 for (;;)
963     {
964     md->offset_vector[md->offset_end - number] =
965     (int)(eptr - md->start_subject);
966 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
967 ph10 604 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
968     eptrb, RM63);
969     if (rrc == MATCH_KETRPOS)
970     {
971     offset_top = md->end_offset_top;
972     eptr = md->end_match_ptr;
973 ph10 625 ecode = md->start_code + code_offset;
974 ph10 604 save_capture_last = md->capture_last;
975 ph10 625 matched_once = TRUE;
976     continue;
977     }
978 ph10 604 if (rrc != MATCH_NOMATCH &&
979     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
980     RRETURN(rrc);
981     md->capture_last = save_capture_last;
982     ecode += GET(ecode, 1);
983 ph10 625 if (*ecode != OP_ALT) break;
984 ph10 604 }
985 ph10 610
986 ph10 604 if (!matched_once)
987 ph10 625 {
988 ph10 604 md->offset_vector[offset] = save_offset1;
989     md->offset_vector[offset+1] = save_offset2;
990     md->offset_vector[md->offset_end - number] = save_offset3;
991     }
992 ph10 625
993 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
994 ph10 604 if (allow_zero || matched_once)
995 ph10 625 {
996 ph10 604 ecode += 1 + LINK_SIZE;
997     break;
998 ph10 625 }
999    
1000 ph10 604 RRETURN(MATCH_NOMATCH);
1001     }
1002 ph10 625
1003 ph10 604 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1004     as a non-capturing bracket. */
1005    
1006     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1007     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1008    
1009     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1010    
1011     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1012     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1013    
1014 ph10 625 /* Non-capturing possessive bracket with unlimited repeat. We come here
1015 ph10 604 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1016     without the capturing complication. It is written out separately for speed
1017     and cleanliness. */
1018    
1019     case OP_BRAPOS:
1020     case OP_SBRAPOS:
1021 ph10 625 allow_zero = FALSE;
1022    
1023 ph10 604 POSSESSIVE_NON_CAPTURE:
1024     matched_once = FALSE;
1025 ph10 625 code_offset = ecode - md->start_code;
1026 ph10 604
1027     for (;;)
1028     {
1029 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1030 ph10 604 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1031 ph10 609 eptrb, RM48);
1032 ph10 604 if (rrc == MATCH_KETRPOS)
1033     {
1034 ph10 610 offset_top = md->end_offset_top;
1035 ph10 604 eptr = md->end_match_ptr;
1036 ph10 625 ecode = md->start_code + code_offset;
1037     matched_once = TRUE;
1038     continue;
1039     }
1040 ph10 604 if (rrc != MATCH_NOMATCH &&
1041     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1042     RRETURN(rrc);
1043     ecode += GET(ecode, 1);
1044 ph10 625 if (*ecode != OP_ALT) break;
1045 ph10 604 }
1046 ph10 625
1047     if (matched_once || allow_zero)
1048 ph10 604 {
1049     ecode += 1 + LINK_SIZE;
1050     break;
1051 ph10 625 }
1052 ph10 604 RRETURN(MATCH_NOMATCH);
1053    
1054     /* Control never reaches here. */
1055    
1056 nigel 77 /* Conditional group: compilation checked that there are no more than
1057     two branches. If the condition is false, skipping the first branch takes us
1058     past the end if there is only one branch, but that's OK because that is
1059 ph10 609 exactly what going to the ket would do. */
1060 nigel 77
1061     case OP_COND:
1062 nigel 93 case OP_SCOND:
1063 ph10 604 codelink = GET(ecode, 1);
1064 ph10 406
1065 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
1066     inserted between OP_COND and an assertion condition. */
1067 ph10 392
1068 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1069     {
1070     if (pcre_callout != NULL)
1071     {
1072     pcre_callout_block cb;
1073 ph10 645 cb.version = 2; /* Version 2 of the callout block */
1074 ph10 381 cb.callout_number = ecode[LINK_SIZE+2];
1075     cb.offset_vector = md->offset_vector;
1076     cb.subject = (PCRE_SPTR)md->start_subject;
1077 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1078     cb.start_match = (int)(mstart - md->start_subject);
1079     cb.current_position = (int)(eptr - md->start_subject);
1080 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1081     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1082     cb.capture_top = offset_top/2;
1083     cb.capture_last = md->capture_last;
1084     cb.callout_data = md->callout_data;
1085 ph10 645 cb.mark = markptr;
1086 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1087 ph10 381 if (rrc < 0) RRETURN(rrc);
1088     }
1089     ecode += _pcre_OP_lengths[OP_CALLOUT];
1090     }
1091 ph10 392
1092 ph10 399 condcode = ecode[LINK_SIZE+1];
1093 ph10 406
1094 ph10 381 /* Now see what the actual condition is */
1095 ph10 392
1096 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1097 nigel 77 {
1098 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
1099     {
1100 ph10 461 condition = FALSE;
1101     ecode += GET(ecode, 1);
1102     }
1103 ph10 459 else
1104 ph10 461 {
1105 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1106     condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1107 ph10 461
1108 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
1109     false, but the test was set up by name, scan the table to see if the
1110     name refers to any other numbers, and test them. The condition is true
1111     if any one is set. */
1112 ph10 461
1113 ph10 459 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1114     {
1115     uschar *slotA = md->name_table;
1116     for (i = 0; i < md->name_count; i++)
1117 ph10 461 {
1118     if (GET2(slotA, 0) == recno) break;
1119 ph10 459 slotA += md->name_entry_size;
1120     }
1121 ph10 461
1122 ph10 459 /* Found a name for the number - there can be only one; duplicate
1123     names for different numbers are allowed, but not vice versa. First
1124     scan down for duplicates. */
1125 ph10 461
1126 ph10 459 if (i < md->name_count)
1127 ph10 461 {
1128 ph10 459 uschar *slotB = slotA;
1129     while (slotB > md->name_table)
1130     {
1131     slotB -= md->name_entry_size;
1132     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1133     {
1134     condition = GET2(slotB, 0) == md->recursive->group_num;
1135 ph10 461 if (condition) break;
1136     }
1137 ph10 459 else break;
1138 ph10 461 }
1139    
1140 ph10 459 /* Scan up for duplicates */
1141 ph10 461
1142 ph10 459 if (!condition)
1143 ph10 461 {
1144 ph10 459 slotB = slotA;
1145     for (i++; i < md->name_count; i++)
1146     {
1147     slotB += md->name_entry_size;
1148     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1149     {
1150     condition = GET2(slotB, 0) == md->recursive->group_num;
1151     if (condition) break;
1152 ph10 461 }
1153 ph10 459 else break;
1154 ph10 461 }
1155     }
1156 ph10 459 }
1157 ph10 461 }
1158    
1159 ph10 459 /* Choose branch according to the condition */
1160 ph10 461
1161 ph10 459 ecode += condition? 3 : GET(ecode, 1);
1162     }
1163 ph10 461 }
1164 nigel 93
1165 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1166 nigel 93 {
1167 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1168 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1169 ph10 461
1170 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1171 ph10 461 scan the table to see if the name refers to any other numbers, and test
1172     them. The condition is true if any one is set. This is tediously similar
1173     to the code above, but not close enough to try to amalgamate. */
1174    
1175 ph10 459 if (!condition && condcode == OP_NCREF)
1176     {
1177 ph10 461 int refno = offset >> 1;
1178 ph10 459 uschar *slotA = md->name_table;
1179 ph10 461
1180 ph10 459 for (i = 0; i < md->name_count; i++)
1181 ph10 461 {
1182     if (GET2(slotA, 0) == refno) break;
1183 ph10 459 slotA += md->name_entry_size;
1184     }
1185 ph10 461
1186     /* Found a name for the number - there can be only one; duplicate names
1187     for different numbers are allowed, but not vice versa. First scan down
1188 ph10 459 for duplicates. */
1189 ph10 461
1190 ph10 459 if (i < md->name_count)
1191 ph10 461 {
1192 ph10 459 uschar *slotB = slotA;
1193     while (slotB > md->name_table)
1194     {
1195     slotB -= md->name_entry_size;
1196     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1197     {
1198     offset = GET2(slotB, 0) << 1;
1199 ph10 461 condition = offset < offset_top &&
1200 ph10 459 md->offset_vector[offset] >= 0;
1201 ph10 461 if (condition) break;
1202     }
1203 ph10 459 else break;
1204 ph10 461 }
1205    
1206 ph10 459 /* Scan up for duplicates */
1207 ph10 461
1208 ph10 459 if (!condition)
1209 ph10 461 {
1210 ph10 459 slotB = slotA;
1211     for (i++; i < md->name_count; i++)
1212     {
1213     slotB += md->name_entry_size;
1214     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1215     {
1216     offset = GET2(slotB, 0) << 1;
1217 ph10 461 condition = offset < offset_top &&
1218 ph10 459 md->offset_vector[offset] >= 0;
1219 ph10 461 if (condition) break;
1220     }
1221 ph10 459 else break;
1222 ph10 461 }
1223     }
1224 ph10 459 }
1225 ph10 461 }
1226    
1227 ph10 459 /* Choose branch according to the condition */
1228    
1229 nigel 93 ecode += condition? 3 : GET(ecode, 1);
1230 nigel 77 }
1231    
1232 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1233 nigel 93 {
1234     condition = FALSE;
1235     ecode += GET(ecode, 1);
1236     }
1237    
1238 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1239 ph10 604 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1240     an assertion. */
1241 nigel 77
1242     else
1243     {
1244 ph10 625 md->match_function_type = MATCH_CONDASSERT;
1245 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1246 nigel 77 if (rrc == MATCH_MATCH)
1247     {
1248 ph10 619 if (md->end_offset_top > offset_top)
1249     offset_top = md->end_offset_top; /* Captures may have happened */
1250 nigel 93 condition = TRUE;
1251     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1252 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1253     }
1254 ph10 550 else if (rrc != MATCH_NOMATCH &&
1255     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1256 nigel 77 {
1257     RRETURN(rrc); /* Need braces because of following else */
1258     }
1259 nigel 93 else
1260     {
1261     condition = FALSE;
1262 ph10 399 ecode += codelink;
1263 nigel 93 }
1264     }
1265 nigel 91
1266 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
1267 ph10 609 we used to use tail recursion to avoid using another stack frame, except
1268 ph10 625 when there was unlimited repeat of a possibly empty group. However, that
1269     strategy no longer works because of the possibility of (*THEN) being
1270 ph10 609 encountered in the branch. A recursive call to match() is always required,
1271     unless the second alternative doesn't exist, in which case we can just
1272     plough on. */
1273 nigel 91
1274 nigel 93 if (condition || *ecode == OP_ALT)
1275     {
1276 ph10 625 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1277 ph10 609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1278 ph10 625 if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1279 ph10 609 rrc = MATCH_NOMATCH;
1280     RRETURN(rrc);
1281 nigel 77 }
1282 ph10 395 else /* Condition false & no alternative */
1283 nigel 93 {
1284     ecode += 1 + LINK_SIZE;
1285     }
1286     break;
1287 nigel 77
1288 ph10 461
1289 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1290     to close any currently open capturing brackets. */
1291 ph10 461
1292 ph10 447 case OP_CLOSE:
1293 ph10 461 number = GET2(ecode, 1);
1294 ph10 447 offset = number << 1;
1295 ph10 461
1296 ph10 475 #ifdef PCRE_DEBUG
1297 ph10 447 printf("end bracket %d at *ACCEPT", number);
1298     printf("\n");
1299     #endif
1300 nigel 77
1301 ph10 447 md->capture_last = number;
1302     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1303     {
1304     md->offset_vector[offset] =
1305     md->offset_vector[md->offset_end - number];
1306 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1307 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1308     }
1309     ecode += 3;
1310 ph10 461 break;
1311 ph10 447
1312    
1313 ph10 619 /* End of the pattern, either real or forced. */
1314 nigel 77
1315 ph10 619 case OP_END:
1316 ph10 210 case OP_ACCEPT:
1317 ph10 625 case OP_ASSERT_ACCEPT:
1318    
1319 ph10 619 /* If we have matched an empty string, fail if not in an assertion and not
1320     in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1321 ph10 613 is set and we have matched at the start of the subject. In both cases,
1322     backtracking will then try other alternatives, if any. */
1323 ph10 443
1324 ph10 619 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1325 ph10 618 md->recursive == NULL &&
1326 ph10 619 (md->notempty ||
1327     (md->notempty_atstart &&
1328     mstart == md->start_subject + md->start_offset)))
1329 ph10 510 MRRETURN(MATCH_NOMATCH);
1330 ph10 443
1331 ph10 442 /* Otherwise, we have a match. */
1332 ph10 625
1333 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1334     md->end_offset_top = offset_top; /* and how many extracts were taken */
1335 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1336 nigel 77
1337 ph10 512 /* For some reason, the macros don't work properly if an expression is
1338     given as the argument to MRRETURN when the heap is in use. */
1339    
1340     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1341     MRRETURN(rrc);
1342    
1343 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1344     matching won't pass the KET for an assertion. If any one branch matches,
1345     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1346     start of each branch to move the current point backwards, so the code at
1347 ph10 625 this level is identical to the lookahead case. When the assertion is part
1348     of a condition, we want to return immediately afterwards. The caller of
1349     this incarnation of the match() function will have set MATCH_CONDASSERT in
1350     md->match_function_type, and one of these opcodes will be the first opcode
1351     that is processed. We use a local variable that is preserved over calls to
1352 ph10 604 match() to remember this case. */
1353 nigel 77
1354     case OP_ASSERT:
1355     case OP_ASSERTBACK:
1356 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1357     {
1358     condassert = TRUE;
1359     md->match_function_type = 0;
1360     }
1361 ph10 625 else condassert = FALSE;
1362    
1363 nigel 77 do
1364     {
1365 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1366 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1367 ph10 500 {
1368     mstart = md->start_match_ptr; /* In case \K reset it */
1369 ph10 630 markptr = md->mark;
1370 ph10 500 break;
1371 ph10 501 }
1372 ph10 550 if (rrc != MATCH_NOMATCH &&
1373     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1374     RRETURN(rrc);
1375 nigel 77 ecode += GET(ecode, 1);
1376     }
1377     while (*ecode == OP_ALT);
1378 ph10 625
1379 ph10 510 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1380 nigel 77
1381     /* If checking an assertion for a condition, return MATCH_MATCH. */
1382    
1383 ph10 604 if (condassert) RRETURN(MATCH_MATCH);
1384 nigel 77
1385     /* Continue from after the assertion, updating the offsets high water
1386     mark, since extracts may have been taken during the assertion. */
1387    
1388     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1389     ecode += 1 + LINK_SIZE;
1390     offset_top = md->end_offset_top;
1391     continue;
1392    
1393 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1394 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1395 ph10 473 branches. */
1396 nigel 77
1397     case OP_ASSERT_NOT:
1398     case OP_ASSERTBACK_NOT:
1399 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1400     {
1401     condassert = TRUE;
1402     md->match_function_type = 0;
1403     }
1404 ph10 625 else condassert = FALSE;
1405 ph10 604
1406 nigel 77 do
1407     {
1408 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1409 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1410 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1411     {
1412     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1413 ph10 482 break;
1414     }
1415 ph10 550 if (rrc != MATCH_NOMATCH &&
1416     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1417     RRETURN(rrc);
1418 nigel 77 ecode += GET(ecode,1);
1419     }
1420     while (*ecode == OP_ALT);
1421    
1422 ph10 604 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1423 ph10 625
1424 nigel 77 ecode += 1 + LINK_SIZE;
1425     continue;
1426    
1427     /* Move the subject pointer back. This occurs only at the start of
1428     each branch of a lookbehind assertion. If we are too close to the start to
1429     move back, this match function fails. When working with UTF-8 we move
1430     back a number of characters, not bytes. */
1431    
1432     case OP_REVERSE:
1433     #ifdef SUPPORT_UTF8
1434     if (utf8)
1435     {
1436 nigel 93 i = GET(ecode, 1);
1437     while (i-- > 0)
1438 nigel 77 {
1439     eptr--;
1440 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1441 ph10 207 BACKCHAR(eptr);
1442 nigel 77 }
1443     }
1444     else
1445     #endif
1446    
1447     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1448    
1449     {
1450 nigel 93 eptr -= GET(ecode, 1);
1451 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1452 nigel 77 }
1453    
1454 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1455 nigel 77
1456 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1457 nigel 77 ecode += 1 + LINK_SIZE;
1458     break;
1459    
1460     /* The callout item calls an external function, if one is provided, passing
1461     details of the match so far. This is mainly for debugging, though the
1462     function is able to force a failure. */
1463    
1464     case OP_CALLOUT:
1465     if (pcre_callout != NULL)
1466     {
1467     pcre_callout_block cb;
1468 ph10 645 cb.version = 2; /* Version 2 of the callout block */
1469 nigel 77 cb.callout_number = ecode[1];
1470     cb.offset_vector = md->offset_vector;
1471 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1472 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1473     cb.start_match = (int)(mstart - md->start_subject);
1474     cb.current_position = (int)(eptr - md->start_subject);
1475 nigel 77 cb.pattern_position = GET(ecode, 2);
1476     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1477     cb.capture_top = offset_top/2;
1478     cb.capture_last = md->capture_last;
1479     cb.callout_data = md->callout_data;
1480 ph10 645 cb.mark = markptr;
1481 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1482 nigel 77 if (rrc < 0) RRETURN(rrc);
1483     }
1484     ecode += 2 + 2*LINK_SIZE;
1485     break;
1486    
1487     /* Recursion either matches the current regex, or some subexpression. The
1488     offset data is the offset to the starting bracket from the start of the
1489     whole pattern. (This is so that it works from duplicated subpatterns.)
1490 ph10 625
1491 ph10 618 The state of the capturing groups is preserved over recursion, and
1492 ph10 625 re-instated afterwards. We don't know how many are started and not yet
1493 ph10 618 finished (offset_top records the completed total) so we just have to save
1494     all the potential data. There may be up to 65535 such values, which is too
1495     large to put on the stack, but using malloc for small numbers seems
1496     expensive. As a compromise, the stack is used when there are no more than
1497     REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1498 nigel 77
1499     There are also other values that have to be saved. We use a chained
1500     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1501 ph10 625 for the original version of this logic. It has, however, been hacked around
1502 ph10 618 a lot, so he is not to blame for the current way it works. */
1503 nigel 77
1504     case OP_RECURSE:
1505     {
1506 ph10 642 recursion_info *ri;
1507     int recno;
1508    
1509 nigel 77 callpat = md->start_code + GET(ecode, 1);
1510 ph10 642 recno = (callpat == md->start_code)? 0 :
1511     GET2(callpat, 1 + LINK_SIZE);
1512    
1513     /* Check for repeating a recursion without advancing the subject pointer.
1514     This should catch convoluted mutual recursions. (Some simple cases are
1515     caught at compile time.) */
1516    
1517     for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1518     if (recno == ri->group_num && eptr == ri->subject_position)
1519     RRETURN(PCRE_ERROR_RECURSELOOP);
1520 nigel 77
1521     /* Add to "recursing stack" */
1522    
1523 ph10 642 new_recursive.group_num = recno;
1524     new_recursive.subject_position = eptr;
1525 nigel 77 new_recursive.prevrec = md->recursive;
1526     md->recursive = &new_recursive;
1527    
1528 ph10 618 /* Where to continue from afterwards */
1529 nigel 77
1530     ecode += 1 + LINK_SIZE;
1531    
1532 ph10 618 /* Now save the offset data */
1533 nigel 77
1534     new_recursive.saved_max = md->offset_end;
1535     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1536     new_recursive.offset_save = stacksave;
1537     else
1538     {
1539     new_recursive.offset_save =
1540     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1541     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1542     }
1543     memcpy(new_recursive.offset_save, md->offset_vector,
1544     new_recursive.saved_max * sizeof(int));
1545 ph10 625
1546 ph10 618 /* OK, now we can do the recursion. After processing each alternative,
1547 ph10 625 restore the offset data. If there were nested recursions, md->recursive
1548 ph10 618 might be changed, so reset it before looping. */
1549 nigel 77
1550     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1551 ph10 604 cbegroup = (*callpat >= OP_SBRA);
1552 nigel 77 do
1553     {
1554 ph10 604 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1555 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1556 ph10 604 md, eptrb, RM6);
1557 ph10 618 memcpy(md->offset_vector, new_recursive.offset_save,
1558     new_recursive.saved_max * sizeof(int));
1559 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1560 nigel 77 {
1561 nigel 87 DPRINTF(("Recursion matched\n"));
1562 nigel 77 md->recursive = new_recursive.prevrec;
1563     if (new_recursive.offset_save != stacksave)
1564     (pcre_free)(new_recursive.offset_save);
1565 ph10 618
1566     /* Set where we got to in the subject, and reset the start in case
1567 ph10 625 it was changed by \K. This *is* propagated back out of a recursion,
1568     for Perl compatibility. */
1569    
1570 ph10 618 eptr = md->end_match_ptr;
1571     mstart = md->start_match_ptr;
1572     goto RECURSION_MATCHED; /* Exit loop; end processing */
1573 nigel 77 }
1574 ph10 550 else if (rrc != MATCH_NOMATCH &&
1575     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1576 nigel 87 {
1577     DPRINTF(("Recursion gave error %d\n", rrc));
1578 ph10 400 if (new_recursive.offset_save != stacksave)
1579     (pcre_free)(new_recursive.offset_save);
1580 nigel 87 RRETURN(rrc);
1581     }
1582 nigel 77
1583     md->recursive = &new_recursive;
1584     callpat += GET(callpat, 1);
1585     }
1586     while (*callpat == OP_ALT);
1587    
1588     DPRINTF(("Recursion didn't match\n"));
1589     md->recursive = new_recursive.prevrec;
1590     if (new_recursive.offset_save != stacksave)
1591     (pcre_free)(new_recursive.offset_save);
1592 ph10 510 MRRETURN(MATCH_NOMATCH);
1593 nigel 77 }
1594 ph10 625
1595 ph10 618 RECURSION_MATCHED:
1596     break;
1597 nigel 77
1598     /* An alternation is the end of a branch; scan along to find the end of the
1599     bracketed group and go to there. */
1600    
1601     case OP_ALT:
1602     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1603     break;
1604    
1605 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1606     indicating that it may occur zero times. It may repeat infinitely, or not
1607     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1608     with fixed upper repeat limits are compiled as a number of copies, with the
1609     optional ones preceded by BRAZERO or BRAMINZERO. */
1610 ph10 625
1611 nigel 77 case OP_BRAZERO:
1612 ph10 604 next = ecode + 1;
1613     RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1614     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1615     do next += GET(next, 1); while (*next == OP_ALT);
1616     ecode = next + 1 + LINK_SIZE;
1617 nigel 77 break;
1618 ph10 625
1619 nigel 77 case OP_BRAMINZERO:
1620 ph10 604 next = ecode + 1;
1621     do next += GET(next, 1); while (*next == OP_ALT);
1622     RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1623     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1624     ecode++;
1625 nigel 77 break;
1626    
1627 ph10 335 case OP_SKIPZERO:
1628 ph10 604 next = ecode+1;
1629     do next += GET(next,1); while (*next == OP_ALT);
1630     ecode = next + 1 + LINK_SIZE;
1631 ph10 335 break;
1632 ph10 625
1633 ph10 604 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1634     here; just jump to the group, with allow_zero set TRUE. */
1635 ph10 625
1636 ph10 604 case OP_BRAPOSZERO:
1637 ph10 625 op = *(++ecode);
1638 ph10 604 allow_zero = TRUE;
1639     if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1640     goto POSSESSIVE_NON_CAPTURE;
1641 ph10 335
1642 nigel 93 /* End of a group, repeated or non-repeating. */
1643 nigel 77
1644     case OP_KET:
1645     case OP_KETRMIN:
1646     case OP_KETRMAX:
1647 ph10 625 case OP_KETRPOS:
1648 nigel 91 prev = ecode - GET(ecode, 1);
1649 ph10 625
1650 nigel 93 /* If this was a group that remembered the subject start, in order to break
1651     infinite repeats of empty string matches, retrieve the subject start from
1652     the chain. Otherwise, set it NULL. */
1653 nigel 77
1654 ph10 618 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1655 nigel 93 {
1656     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1657     eptrb = eptrb->epb_prev; /* Backup to previous group */
1658     }
1659     else saved_eptr = NULL;
1660 nigel 77
1661 ph10 618 /* If we are at the end of an assertion group, stop matching and return
1662     MATCH_MATCH, but record the current high water mark for use by positive
1663     assertions. We also need to record the match start in case it was changed
1664     by \K. */
1665 nigel 93
1666 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1667 ph10 618 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
1668 nigel 91 {
1669     md->end_match_ptr = eptr; /* For ONCE */
1670     md->end_offset_top = offset_top;
1671 ph10 500 md->start_match_ptr = mstart;
1672 ph10 630 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1673 nigel 91 }
1674 nigel 77
1675 nigel 93 /* For capturing groups we have to check the group number back at the start
1676     and if necessary complete handling an extraction by setting the offsets and
1677 ph10 618 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1678     into group 0, so it won't be picked up here. Instead, we catch it when the
1679     OP_END is reached. Other recursion is handled here. We just have to record
1680     the current subject position and start match pointer and give a MATCH
1681     return. */
1682 nigel 77
1683 ph10 604 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1684     *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1685 nigel 91 {
1686 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1687 nigel 91 offset = number << 1;
1688 ph10 461
1689 ph10 475 #ifdef PCRE_DEBUG
1690 nigel 91 printf("end bracket %d", number);
1691     printf("\n");
1692 nigel 77 #endif
1693    
1694 ph10 618 /* Handle a recursively called group. */
1695    
1696     if (md->recursive != NULL && md->recursive->group_num == number)
1697     {
1698     md->end_match_ptr = eptr;
1699     md->start_match_ptr = mstart;
1700     RRETURN(MATCH_MATCH);
1701     }
1702    
1703     /* Deal with capturing */
1704    
1705 nigel 93 md->capture_last = number;
1706     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1707 nigel 91 {
1708 ph10 625 /* If offset is greater than offset_top, it means that we are
1709     "skipping" a capturing group, and that group's offsets must be marked
1710     unset. In earlier versions of PCRE, all the offsets were unset at the
1711     start of matching, but this doesn't work because atomic groups and
1712 ph10 615 assertions can cause a value to be set that should later be unset.
1713     Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1714 ph10 625 part of the atomic group, but this is not on the final matching path,
1715     so must be unset when 2 is set. (If there is no group 2, there is no
1716 ph10 615 problem, because offset_top will then be 2, indicating no capture.) */
1717 ph10 625
1718 ph10 615 if (offset > offset_top)
1719     {
1720     register int *iptr = md->offset_vector + offset_top;
1721     register int *iend = md->offset_vector + offset;
1722     while (iptr < iend) *iptr++ = -1;
1723 ph10 625 }
1724    
1725 ph10 615 /* Now make the extraction */
1726    
1727 nigel 93 md->offset_vector[offset] =
1728     md->offset_vector[md->offset_end - number];
1729 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1730 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1731     }
1732 nigel 91 }
1733 nigel 77
1734 ph10 618 /* For an ordinary non-repeating ket, just continue at this level. This
1735     also happens for a repeating ket if no characters were matched in the
1736     group. This is the forcible breaking of infinite loops as implemented in
1737 ph10 625 Perl 5.005. For a non-repeating atomic group, establish a backup point by
1738     processing the rest of the pattern at a lower level. If this results in a
1739     NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1740     bypassing intermediate backup points, but resetting any captures that
1741 ph10 618 happened along the way. */
1742 nigel 77
1743 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1744     {
1745 ph10 618 if (*prev == OP_ONCE)
1746     {
1747     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1748     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1749     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1750 ph10 625 RRETURN(MATCH_ONCE);
1751     }
1752 ph10 618 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1753 nigel 91 break;
1754     }
1755 ph10 625
1756     /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1757 ph10 604 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1758     at a time from the outer level, thus saving stack. */
1759 ph10 625
1760 ph10 604 if (*ecode == OP_KETRPOS)
1761 ph10 625 {
1762 ph10 604 md->end_match_ptr = eptr;
1763 ph10 625 md->end_offset_top = offset_top;
1764 ph10 604 RRETURN(MATCH_KETRPOS);
1765 ph10 625 }
1766 nigel 77
1767 ph10 604 /* The normal repeating kets try the rest of the pattern or restart from
1768     the preceding bracket, in the appropriate order. In the second case, we can
1769     use tail recursion to avoid using another stack frame, unless we have an
1770 ph10 618 atomic group or an unlimited repeat of a group that can match an empty
1771     string. */
1772 nigel 77
1773 nigel 91 if (*ecode == OP_KETRMIN)
1774     {
1775 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1776 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1777 ph10 618 if (*prev == OP_ONCE)
1778     {
1779 ph10 623 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1780 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1781     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1782 ph10 625 RRETURN(MATCH_ONCE);
1783     }
1784 ph10 604 if (*prev >= OP_SBRA) /* Could match an empty string */
1785 ph10 197 {
1786 ph10 625 md->match_function_type = MATCH_CBEGROUP;
1787 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1788 ph10 197 RRETURN(rrc);
1789     }
1790 nigel 91 ecode = prev;
1791     goto TAIL_RECURSE;
1792 nigel 77 }
1793 nigel 91 else /* OP_KETRMAX */
1794     {
1795 ph10 625 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1796 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1797 ph10 618 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1798 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1799 ph10 618 if (*prev == OP_ONCE)
1800     {
1801 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1802 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1803     md->once_target = prev;
1804 ph10 625 RRETURN(MATCH_ONCE);
1805     }
1806 nigel 91 ecode += 1 + LINK_SIZE;
1807     goto TAIL_RECURSE;
1808     }
1809     /* Control never gets here */
1810 nigel 77
1811 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
1812 nigel 77
1813     case OP_CIRC:
1814 ph10 510 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1815 ph10 625
1816 nigel 77 /* Start of subject assertion */
1817    
1818     case OP_SOD:
1819 ph10 510 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1820 nigel 77 ecode++;
1821     break;
1822 ph10 625
1823 ph10 602 /* Multiline mode: start of subject unless notbol, or after any newline. */
1824 nigel 77
1825 ph10 602 case OP_CIRCM:
1826     if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1827     if (eptr != md->start_subject &&
1828     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1829     MRRETURN(MATCH_NOMATCH);
1830     ecode++;
1831     break;
1832    
1833 nigel 77 /* Start of match assertion */
1834    
1835     case OP_SOM:
1836 ph10 510 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1837 nigel 77 ecode++;
1838     break;
1839 ph10 172
1840 ph10 168 /* Reset the start of match point */
1841 ph10 172
1842 ph10 168 case OP_SET_SOM:
1843     mstart = eptr;
1844 ph10 172 ecode++;
1845     break;
1846 nigel 77
1847 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
1848     unless noteol is set. */
1849 nigel 77
1850 ph10 602 case OP_DOLLM:
1851     if (eptr < md->end_subject)
1852     { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1853     else
1854 nigel 77 {
1855 ph10 510 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1856 ph10 602 SCHECK_PARTIAL();
1857 nigel 77 }
1858 ph10 602 ecode++;
1859     break;
1860 ph10 579
1861 ph10 625 /* Not multiline mode: assert before a terminating newline or before end of
1862 ph10 602 subject unless noteol is set. */
1863    
1864     case OP_DOLL:
1865     if (md->noteol) MRRETURN(MATCH_NOMATCH);
1866     if (!md->endonly) goto ASSERT_NL_OR_EOS;
1867    
1868 nigel 91 /* ... else fall through for endonly */
1869 nigel 77
1870     /* End of subject assertion (\z) */
1871    
1872     case OP_EOD:
1873 ph10 510 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1874 ph10 553 SCHECK_PARTIAL();
1875 nigel 77 ecode++;
1876     break;
1877    
1878     /* End of subject or ending \n assertion (\Z) */
1879    
1880     case OP_EODN:
1881 ph10 553 ASSERT_NL_OR_EOS:
1882     if (eptr < md->end_subject &&
1883 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1884 ph10 510 MRRETURN(MATCH_NOMATCH);
1885 ph10 579
1886 ph10 553 /* Either at end of string or \n before end. */
1887 ph10 579
1888 ph10 553 SCHECK_PARTIAL();
1889 nigel 77 ecode++;
1890     break;
1891    
1892     /* Word boundary assertions */
1893    
1894     case OP_NOT_WORD_BOUNDARY:
1895     case OP_WORD_BOUNDARY:
1896     {
1897    
1898     /* Find out if the previous and current characters are "word" characters.
1899     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1900 ph10 443 be "non-word" characters. Remember the earliest consulted character for
1901 ph10 435 partial matching. */
1902 nigel 77
1903     #ifdef SUPPORT_UTF8
1904     if (utf8)
1905     {
1906 ph10 518 /* Get status of previous character */
1907 ph10 527
1908 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
1909     {
1910 ph10 409 USPTR lastptr = eptr - 1;
1911 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1912 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1913 nigel 77 GETCHAR(c, lastptr);
1914 ph10 527 #ifdef SUPPORT_UCP
1915 ph10 518 if (md->use_ucp)
1916     {
1917     if (c == '_') prev_is_word = TRUE; else
1918 ph10 527 {
1919 ph10 518 int cat = UCD_CATEGORY(c);
1920     prev_is_word = (cat == ucp_L || cat == ucp_N);
1921 ph10 527 }
1922     }
1923     else
1924     #endif
1925 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1926     }
1927 ph10 527
1928 ph10 518 /* Get status of next character */
1929 ph10 527
1930 ph10 443 if (eptr >= md->end_subject)
1931 nigel 77 {
1932 ph10 443 SCHECK_PARTIAL();
1933     cur_is_word = FALSE;
1934 ph10 428 }
1935     else
1936     {
1937 nigel 77 GETCHAR(c, eptr);
1938 ph10 527 #ifdef SUPPORT_UCP
1939 ph10 518 if (md->use_ucp)
1940     {
1941     if (c == '_') cur_is_word = TRUE; else
1942 ph10 527 {
1943 ph10 518 int cat = UCD_CATEGORY(c);
1944     cur_is_word = (cat == ucp_L || cat == ucp_N);
1945 ph10 527 }
1946     }
1947     else
1948     #endif
1949 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1950     }
1951     }
1952     else
1953     #endif
1954    
1955 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1956 ph10 518 consistency with the behaviour of \w we do use it in this case. */
1957 nigel 77
1958     {
1959 ph10 518 /* Get status of previous character */
1960 ph10 527
1961 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
1962     {
1963 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1964 ph10 527 #ifdef SUPPORT_UCP
1965 ph10 518 if (md->use_ucp)
1966     {
1967 ph10 527 c = eptr[-1];
1968 ph10 518 if (c == '_') prev_is_word = TRUE; else
1969 ph10 527 {
1970 ph10 518 int cat = UCD_CATEGORY(c);
1971     prev_is_word = (cat == ucp_L || cat == ucp_N);
1972 ph10 527 }
1973     }
1974     else
1975     #endif
1976 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1977     }
1978 ph10 527
1979 ph10 518 /* Get status of next character */
1980 ph10 527
1981 ph10 443 if (eptr >= md->end_subject)
1982 ph10 428 {
1983 ph10 443 SCHECK_PARTIAL();
1984     cur_is_word = FALSE;
1985 ph10 428 }
1986 ph10 527 else
1987     #ifdef SUPPORT_UCP
1988 ph10 518 if (md->use_ucp)
1989     {
1990 ph10 527 c = *eptr;
1991 ph10 518 if (c == '_') cur_is_word = TRUE; else
1992 ph10 527 {
1993 ph10 518 int cat = UCD_CATEGORY(c);
1994     cur_is_word = (cat == ucp_L || cat == ucp_N);
1995 ph10 527 }
1996     }
1997     else
1998     #endif
1999 ph10 518 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2000 nigel 77 }
2001    
2002     /* Now see if the situation is what we want */
2003    
2004     if ((*ecode++ == OP_WORD_BOUNDARY)?
2005     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2006 ph10 510 MRRETURN(MATCH_NOMATCH);
2007 nigel 77 }
2008     break;
2009    
2010     /* Match a single character type; inline for speed */
2011    
2012     case OP_ANY:
2013 ph10 510 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2014 ph10 345 /* Fall through */
2015    
2016 ph10 341 case OP_ALLANY:
2017 ph10 443 if (eptr++ >= md->end_subject)
2018 ph10 428 {
2019 ph10 443 SCHECK_PARTIAL();
2020 ph10 510 MRRETURN(MATCH_NOMATCH);
2021 ph10 443 }
2022 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2023 nigel 77 ecode++;
2024     break;
2025    
2026     /* Match a single byte, even in UTF-8 mode. This opcode really does match
2027     any byte, even newline, independent of the setting of PCRE_DOTALL. */
2028    
2029     case OP_ANYBYTE:
2030 ph10 443 if (eptr++ >= md->end_subject)
2031 ph10 428 {
2032 ph10 443 SCHECK_PARTIAL();
2033 ph10 510 MRRETURN(MATCH_NOMATCH);
2034 ph10 443 }
2035 nigel 77 ecode++;
2036     break;
2037    
2038     case OP_NOT_DIGIT:
2039 ph10 443 if (eptr >= md->end_subject)
2040 ph10 428 {
2041 ph10 443 SCHECK_PARTIAL();
2042 ph10 510 MRRETURN(MATCH_NOMATCH);
2043 ph10 443 }
2044 nigel 77 GETCHARINCTEST(c, eptr);
2045     if (
2046     #ifdef SUPPORT_UTF8
2047     c < 256 &&
2048     #endif
2049     (md->ctypes[c] & ctype_digit) != 0
2050     )
2051 ph10 510 MRRETURN(MATCH_NOMATCH);
2052 nigel 77 ecode++;
2053     break;
2054    
2055     case OP_DIGIT:
2056 ph10 443 if (eptr >= md->end_subject)
2057 ph10 428 {
2058 ph10 443 SCHECK_PARTIAL();
2059 ph10 510 MRRETURN(MATCH_NOMATCH);
2060 ph10 443 }
2061 nigel 77 GETCHARINCTEST(c, eptr);
2062     if (
2063     #ifdef SUPPORT_UTF8
2064     c >= 256 ||
2065     #endif
2066     (md->ctypes[c] & ctype_digit) == 0
2067     )
2068 ph10 510 MRRETURN(MATCH_NOMATCH);
2069 nigel 77 ecode++;
2070     break;
2071    
2072     case OP_NOT_WHITESPACE:
2073 ph10 443 if (eptr >= md->end_subject)
2074 ph10 428 {
2075 ph10 443 SCHECK_PARTIAL();
2076 ph10 510 MRRETURN(MATCH_NOMATCH);
2077 ph10 443 }
2078 nigel 77 GETCHARINCTEST(c, eptr);
2079     if (
2080     #ifdef SUPPORT_UTF8
2081     c < 256 &&
2082     #endif
2083     (md->ctypes[c] & ctype_space) != 0
2084     )
2085 ph10 510 MRRETURN(MATCH_NOMATCH);
2086 nigel 77 ecode++;
2087     break;
2088    
2089     case OP_WHITESPACE:
2090 ph10 443 if (eptr >= md->end_subject)
2091 ph10 428 {
2092 ph10 443 SCHECK_PARTIAL();
2093 ph10 510 MRRETURN(MATCH_NOMATCH);
2094 ph10 443 }
2095 nigel 77 GETCHARINCTEST(c, eptr);
2096     if (
2097     #ifdef SUPPORT_UTF8
2098     c >= 256 ||
2099     #endif
2100     (md->ctypes[c] & ctype_space) == 0
2101     )
2102 ph10 510 MRRETURN(MATCH_NOMATCH);
2103 nigel 77 ecode++;
2104     break;
2105    
2106     case OP_NOT_WORDCHAR:
2107 ph10 443 if (eptr >= md->end_subject)
2108 ph10 428 {
2109 ph10 443 SCHECK_PARTIAL();
2110 ph10 510 MRRETURN(MATCH_NOMATCH);
2111 ph10 443 }
2112 nigel 77 GETCHARINCTEST(c, eptr);
2113     if (
2114     #ifdef SUPPORT_UTF8
2115     c < 256 &&
2116     #endif
2117     (md->ctypes[c] & ctype_word) != 0
2118     )
2119 ph10 510 MRRETURN(MATCH_NOMATCH);
2120 nigel 77 ecode++;
2121     break;
2122    
2123     case OP_WORDCHAR:
2124 ph10 443 if (eptr >= md->end_subject)
2125 ph10 428 {
2126 ph10 443 SCHECK_PARTIAL();
2127 ph10 510 MRRETURN(MATCH_NOMATCH);
2128 ph10 443 }
2129 nigel 77 GETCHARINCTEST(c, eptr);
2130     if (
2131     #ifdef SUPPORT_UTF8
2132     c >= 256 ||
2133     #endif
2134     (md->ctypes[c] & ctype_word) == 0
2135     )
2136 ph10 510 MRRETURN(MATCH_NOMATCH);
2137 nigel 77 ecode++;
2138     break;
2139    
2140 nigel 93 case OP_ANYNL:
2141 ph10 443 if (eptr >= md->end_subject)
2142 ph10 428 {
2143 ph10 443 SCHECK_PARTIAL();
2144 ph10 510 MRRETURN(MATCH_NOMATCH);
2145 ph10 443 }
2146 nigel 93 GETCHARINCTEST(c, eptr);
2147     switch(c)
2148     {
2149 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2150 ph10 625
2151 nigel 93 case 0x000d:
2152     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2153     break;
2154 ph10 231
2155 nigel 93 case 0x000a:
2156 ph10 231 break;
2157    
2158 nigel 93 case 0x000b:
2159     case 0x000c:
2160     case 0x0085:
2161     case 0x2028:
2162     case 0x2029:
2163 ph10 510 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2164 nigel 93 break;
2165     }
2166     ecode++;
2167     break;
2168    
2169 ph10 178 case OP_NOT_HSPACE:
2170 ph10 443 if (eptr >= md->end_subject)
2171 ph10 428 {
2172 ph10 443 SCHECK_PARTIAL();
2173 ph10 510 MRRETURN(MATCH_NOMATCH);
2174 ph10 443 }
2175 ph10 178 GETCHARINCTEST(c, eptr);
2176     switch(c)
2177     {
2178     default: break;
2179     case 0x09: /* HT */
2180     case 0x20: /* SPACE */
2181     case 0xa0: /* NBSP */
2182     case 0x1680: /* OGHAM SPACE MARK */
2183     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2184     case 0x2000: /* EN QUAD */
2185     case 0x2001: /* EM QUAD */
2186     case 0x2002: /* EN SPACE */
2187     case 0x2003: /* EM SPACE */
2188     case 0x2004: /* THREE-PER-EM SPACE */
2189     case 0x2005: /* FOUR-PER-EM SPACE */
2190     case 0x2006: /* SIX-PER-EM SPACE */
2191     case 0x2007: /* FIGURE SPACE */
2192     case 0x2008: /* PUNCTUATION SPACE */
2193     case 0x2009: /* THIN SPACE */
2194     case 0x200A: /* HAIR SPACE */
2195     case 0x202f: /* NARROW NO-BREAK SPACE */
2196     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2197     case 0x3000: /* IDEOGRAPHIC SPACE */
2198 ph10 510 MRRETURN(MATCH_NOMATCH);
2199 ph10 178 }
2200     ecode++;
2201     break;
2202    
2203     case OP_HSPACE:
2204 ph10 443 if (eptr >= md->end_subject)
2205 ph10 428 {
2206 ph10 443 SCHECK_PARTIAL();
2207 ph10 510 MRRETURN(MATCH_NOMATCH);
2208 ph10 443 }
2209 ph10 178 GETCHARINCTEST(c, eptr);
2210     switch(c)
2211     {
2212 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2213 ph10 178 case 0x09: /* HT */
2214     case 0x20: /* SPACE */
2215     case 0xa0: /* NBSP */
2216     case 0x1680: /* OGHAM SPACE MARK */
2217     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2218     case 0x2000: /* EN QUAD */
2219     case 0x2001: /* EM QUAD */
2220     case 0x2002: /* EN SPACE */
2221     case 0x2003: /* EM SPACE */
2222     case 0x2004: /* THREE-PER-EM SPACE */
2223     case 0x2005: /* FOUR-PER-EM SPACE */
2224     case 0x2006: /* SIX-PER-EM SPACE */
2225     case 0x2007: /* FIGURE SPACE */
2226     case 0x2008: /* PUNCTUATION SPACE */
2227     case 0x2009: /* THIN SPACE */
2228     case 0x200A: /* HAIR SPACE */
2229     case 0x202f: /* NARROW NO-BREAK SPACE */
2230     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2231     case 0x3000: /* IDEOGRAPHIC SPACE */
2232     break;
2233     }
2234     ecode++;
2235     break;
2236    
2237     case OP_NOT_VSPACE:
2238 ph10 443 if (eptr >= md->end_subject)
2239 ph10 428 {
2240 ph10 443 SCHECK_PARTIAL();
2241 ph10 510 MRRETURN(MATCH_NOMATCH);
2242 ph10 443 }
2243 ph10 178 GETCHARINCTEST(c, eptr);
2244     switch(c)
2245     {
2246     default: break;
2247     case 0x0a: /* LF */
2248     case 0x0b: /* VT */
2249     case 0x0c: /* FF */
2250     case 0x0d: /* CR */
2251     case 0x85: /* NEL */
2252     case 0x2028: /* LINE SEPARATOR */
2253     case 0x2029: /* PARAGRAPH SEPARATOR */
2254 ph10 510 MRRETURN(MATCH_NOMATCH);
2255 ph10 178 }
2256     ecode++;
2257     break;
2258    
2259     case OP_VSPACE:
2260 ph10 443 if (eptr >= md->end_subject)
2261 ph10 428 {
2262 ph10 443 SCHECK_PARTIAL();
2263 ph10 510 MRRETURN(MATCH_NOMATCH);
2264 ph10 443 }
2265 ph10 178 GETCHARINCTEST(c, eptr);
2266     switch(c)
2267     {
2268 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2269 ph10 178 case 0x0a: /* LF */
2270     case 0x0b: /* VT */
2271     case 0x0c: /* FF */
2272     case 0x0d: /* CR */
2273     case 0x85: /* NEL */
2274     case 0x2028: /* LINE SEPARATOR */
2275     case 0x2029: /* PARAGRAPH SEPARATOR */
2276     break;
2277     }
2278     ecode++;
2279     break;
2280    
2281 nigel 77 #ifdef SUPPORT_UCP
2282     /* Check the next character by Unicode property. We will get here only
2283     if the support is in the binary; otherwise a compile-time error occurs. */
2284    
2285     case OP_PROP:
2286     case OP_NOTPROP:
2287 ph10 443 if (eptr >= md->end_subject)
2288 ph10 428 {
2289 ph10 443 SCHECK_PARTIAL();
2290 ph10 510 MRRETURN(MATCH_NOMATCH);
2291 ph10 443 }
2292 nigel 77 GETCHARINCTEST(c, eptr);
2293     {
2294 ph10 384 const ucd_record *prop = GET_UCD(c);
2295 nigel 77
2296 nigel 87 switch(ecode[1])
2297     {
2298     case PT_ANY:
2299 ph10 510 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2300 nigel 87 break;
2301 nigel 77
2302 nigel 87 case PT_LAMP:
2303 ph10 349 if ((prop->chartype == ucp_Lu ||
2304     prop->chartype == ucp_Ll ||
2305     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2306 ph10 510 MRRETURN(MATCH_NOMATCH);
2307 ph10 517 break;
2308 nigel 87
2309     case PT_GC:
2310 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2311 ph10 510 MRRETURN(MATCH_NOMATCH);
2312 nigel 87 break;
2313    
2314     case PT_PC:
2315 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2316 ph10 510 MRRETURN(MATCH_NOMATCH);
2317 nigel 87 break;
2318    
2319     case PT_SC:
2320 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2321 ph10 510 MRRETURN(MATCH_NOMATCH);
2322 nigel 87 break;
2323 ph10 527
2324 ph10 517 /* These are specials */
2325 ph10 527
2326 ph10 517 case PT_ALNUM:
2327     if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2328     _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2329     MRRETURN(MATCH_NOMATCH);
2330 ph10 527 break;
2331    
2332 ph10 517 case PT_SPACE: /* Perl space */
2333     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2334     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2335     == (op == OP_NOTPROP))
2336     MRRETURN(MATCH_NOMATCH);
2337 ph10 527 break;
2338    
2339 ph10 517 case PT_PXSPACE: /* POSIX space */
2340     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2341 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2342 ph10 517 c == CHAR_FF || c == CHAR_CR)
2343     == (op == OP_NOTPROP))
2344     MRRETURN(MATCH_NOMATCH);
2345 ph10 527 break;
2346 nigel 87
2347 ph10 527 case PT_WORD:
2348 ph10 517 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2349 ph10 527 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2350 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2351     MRRETURN(MATCH_NOMATCH);
2352 ph10 527 break;
2353    
2354 ph10 517 /* This should never occur */
2355    
2356 nigel 87 default:
2357     RRETURN(PCRE_ERROR_INTERNAL);
2358 nigel 77 }
2359 nigel 87
2360     ecode += 3;
2361 nigel 77 }
2362     break;
2363    
2364     /* Match an extended Unicode sequence. We will get here only if the support
2365     is in the binary; otherwise a compile-time error occurs. */
2366    
2367     case OP_EXTUNI:
2368 ph10 443 if (eptr >= md->end_subject)
2369 ph10 428 {
2370 ph10 443 SCHECK_PARTIAL();
2371 ph10 510 MRRETURN(MATCH_NOMATCH);
2372 ph10 443 }
2373 nigel 77 GETCHARINCTEST(c, eptr);
2374 ph10 623 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2375     while (eptr < md->end_subject)
2376 nigel 77 {
2377 ph10 623 int len = 1;
2378     if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2379     if (UCD_CATEGORY(c) != ucp_M) break;
2380     eptr += len;
2381 nigel 77 }
2382     ecode++;
2383     break;
2384     #endif
2385    
2386    
2387     /* Match a back reference, possibly repeatedly. Look past the end of the
2388     item to see if there is repeat information following. The code is similar
2389     to that for character classes, but repeated for efficiency. Then obey
2390     similar code to character type repeats - written out again for speed.
2391     However, if the referenced string is the empty string, always treat
2392     it as matched, any number of times (otherwise there could be infinite
2393     loops). */
2394    
2395     case OP_REF:
2396 ph10 625 case OP_REFI:
2397     caseless = op == OP_REFI;
2398 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2399     ecode += 3;
2400 ph10 345
2401 ph10 595 /* If the reference is unset, there are two possibilities:
2402 ph10 345
2403 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2404     this ensures that every attempt at a match fails. We can't just fail
2405     here, because of the possibility of quantifiers with zero minima.
2406 ph10 345
2407 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2408     so that the back reference matches an empty string.
2409 ph10 345
2410 ph10 595 Otherwise, set the length to the length of what was matched by the
2411     referenced subpattern. */
2412 ph10 345
2413 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2414     length = (md->jscript_compat)? 0 : -1;
2415     else
2416     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2417 nigel 77
2418 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2419 nigel 77
2420 ph10 595 switch (*ecode)
2421     {
2422     case OP_CRSTAR:
2423     case OP_CRMINSTAR:
2424     case OP_CRPLUS:
2425     case OP_CRMINPLUS:
2426     case OP_CRQUERY:
2427     case OP_CRMINQUERY:
2428     c = *ecode++ - OP_CRSTAR;
2429     minimize = (c & 1) != 0;
2430     min = rep_min[c]; /* Pick up values from tables; */
2431     max = rep_max[c]; /* zero for max => infinity */
2432     if (max == 0) max = INT_MAX;
2433     break;
2434 nigel 77
2435 ph10 595 case OP_CRRANGE:
2436     case OP_CRMINRANGE:
2437     minimize = (*ecode == OP_CRMINRANGE);
2438     min = GET2(ecode, 1);
2439     max = GET2(ecode, 3);
2440     if (max == 0) max = INT_MAX;
2441     ecode += 5;
2442     break;
2443 nigel 77
2444 ph10 595 default: /* No repeat follows */
2445 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2446 ph10 595 {
2447     CHECK_PARTIAL();
2448     MRRETURN(MATCH_NOMATCH);
2449 nigel 77 }
2450 ph10 595 eptr += length;
2451     continue; /* With the main loop */
2452     }
2453 nigel 77
2454 ph10 595 /* Handle repeated back references. If the length of the reference is
2455     zero, just continue with the main loop. */
2456 ph10 443
2457 ph10 595 if (length == 0) continue;
2458 nigel 77
2459 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2460     the length of the reference string explicitly rather than passing the
2461     address of eptr, so that eptr can be a register variable. */
2462 nigel 77
2463 ph10 595 for (i = 1; i <= min; i++)
2464     {
2465 ph10 625 int slength;
2466 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2467 nigel 77 {
2468 ph10 595 CHECK_PARTIAL();
2469     MRRETURN(MATCH_NOMATCH);
2470 nigel 77 }
2471 ph10 595 eptr += slength;
2472     }
2473 nigel 77
2474 ph10 595 /* If min = max, continue at the same level without recursion.
2475     They are not both allowed to be zero. */
2476 nigel 77
2477 ph10 595 if (min == max) continue;
2478 nigel 77
2479 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2480 nigel 77
2481 ph10 595 if (minimize)
2482     {
2483     for (fi = min;; fi++)
2484 nigel 77 {
2485 ph10 625 int slength;
2486 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2487 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2488     if (fi >= max) MRRETURN(MATCH_NOMATCH);
2489 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2490 nigel 77 {
2491 ph10 595 CHECK_PARTIAL();
2492     MRRETURN(MATCH_NOMATCH);
2493 nigel 77 }
2494 ph10 595 eptr += slength;
2495 nigel 77 }
2496 ph10 595 /* Control never gets here */
2497     }
2498 nigel 77
2499 ph10 595 /* If maximizing, find the longest string and work backwards */
2500 nigel 77
2501 ph10 595 else
2502     {
2503     pp = eptr;
2504     for (i = min; i < max; i++)
2505 nigel 77 {
2506 ph10 625 int slength;
2507 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2508 nigel 77 {
2509 ph10 595 CHECK_PARTIAL();
2510     break;
2511 nigel 77 }
2512 ph10 595 eptr += slength;
2513 nigel 77 }
2514 ph10 595 while (eptr >= pp)
2515     {
2516 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2517 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2518     eptr -= length;
2519     }
2520     MRRETURN(MATCH_NOMATCH);
2521 nigel 77 }
2522     /* Control never gets here */
2523    
2524     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2525     used when all the characters in the class have values in the range 0-255,
2526     and either the matching is caseful, or the characters are in the range
2527     0-127 when UTF-8 processing is enabled. The only difference between
2528     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2529     encountered.
2530    
2531     First, look past the end of the item to see if there is repeat information
2532     following. Then obey similar code to character type repeats - written out
2533     again for speed. */
2534    
2535     case OP_NCLASS:
2536     case OP_CLASS:
2537     {
2538     data = ecode + 1; /* Save for matching */
2539     ecode += 33; /* Advance past the item */
2540    
2541     switch (*ecode)
2542     {
2543     case OP_CRSTAR:
2544     case OP_CRMINSTAR:
2545     case OP_CRPLUS:
2546     case OP_CRMINPLUS:
2547     case OP_CRQUERY:
2548     case OP_CRMINQUERY:
2549     c = *ecode++ - OP_CRSTAR;
2550     minimize = (c & 1) != 0;
2551     min = rep_min[c]; /* Pick up values from tables; */
2552     max = rep_max[c]; /* zero for max => infinity */
2553     if (max == 0) max = INT_MAX;
2554     break;
2555    
2556     case OP_CRRANGE:
2557     case OP_CRMINRANGE:
2558     minimize = (*ecode == OP_CRMINRANGE);
2559     min = GET2(ecode, 1);
2560     max = GET2(ecode, 3);
2561     if (max == 0) max = INT_MAX;
2562     ecode += 5;
2563     break;
2564    
2565     default: /* No repeat follows */
2566     min = max = 1;
2567     break;
2568     }
2569    
2570     /* First, ensure the minimum number of matches are present. */
2571    
2572     #ifdef SUPPORT_UTF8
2573     /* UTF-8 mode */
2574     if (utf8)
2575     {
2576     for (i = 1; i <= min; i++)
2577     {
2578 ph10 427 if (eptr >= md->end_subject)
2579 ph10 426 {
2580 ph10 428 SCHECK_PARTIAL();
2581 ph10 510 MRRETURN(MATCH_NOMATCH);
2582 ph10 427 }
2583 nigel 77 GETCHARINC(c, eptr);
2584     if (c > 255)
2585     {
2586 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2587 nigel 77 }
2588     else
2589     {
2590 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2591 nigel 77 }
2592     }
2593     }
2594     else
2595     #endif
2596     /* Not UTF-8 mode */
2597     {
2598     for (i = 1; i <= min; i++)
2599     {
2600 ph10 427 if (eptr >= md->end_subject)
2601 ph10 426 {
2602 ph10 428 SCHECK_PARTIAL();
2603 ph10 510 MRRETURN(MATCH_NOMATCH);
2604 ph10 427 }
2605 nigel 77 c = *eptr++;
2606 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2607 nigel 77 }
2608     }
2609    
2610     /* If max == min we can continue with the main loop without the
2611     need to recurse. */
2612    
2613     if (min == max) continue;
2614    
2615     /* If minimizing, keep testing the rest of the expression and advancing
2616     the pointer while it matches the class. */
2617    
2618     if (minimize)
2619     {
2620     #ifdef SUPPORT_UTF8
2621     /* UTF-8 mode */
2622     if (utf8)
2623     {
2624     for (fi = min;; fi++)
2625     {
2626 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2627 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2628 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2629 ph10 427 if (eptr >= md->end_subject)
2630 ph10 426 {
2631 ph10 427 SCHECK_PARTIAL();
2632 ph10 510 MRRETURN(MATCH_NOMATCH);
2633 ph10 427 }
2634 nigel 77 GETCHARINC(c, eptr);
2635     if (c > 255)
2636     {
2637 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2638 nigel 77 }
2639     else
2640     {
2641 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2642 nigel 77 }
2643     }
2644     }
2645     else
2646     #endif
2647     /* Not UTF-8 mode */
2648     {
2649     for (fi = min;; fi++)
2650     {
2651 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2652 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2653 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2654 ph10 427 if (eptr >= md->end_subject)
2655 ph10 426 {
2656 ph10 427 SCHECK_PARTIAL();
2657 ph10 510 MRRETURN(MATCH_NOMATCH);
2658 ph10 427 }
2659 nigel 77 c = *eptr++;
2660 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2661 nigel 77 }
2662     }
2663     /* Control never gets here */
2664     }
2665    
2666     /* If maximizing, find the longest possible run, then work backwards. */
2667    
2668     else
2669     {
2670     pp = eptr;
2671    
2672     #ifdef SUPPORT_UTF8
2673     /* UTF-8 mode */
2674     if (utf8)
2675     {
2676     for (i = min; i < max; i++)
2677     {
2678     int len = 1;
2679 ph10 463 if (eptr >= md->end_subject)
2680 ph10 462 {
2681 ph10 463 SCHECK_PARTIAL();
2682 ph10 462 break;
2683 ph10 463 }
2684 nigel 77 GETCHARLEN(c, eptr, len);
2685     if (c > 255)
2686     {
2687     if (op == OP_CLASS) break;
2688     }
2689     else
2690     {
2691     if ((data[c/8] & (1 << (c&7))) == 0) break;
2692     }
2693     eptr += len;
2694     }
2695     for (;;)
2696     {
2697 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2698 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2699     if (eptr-- == pp) break; /* Stop if tried at original pos */
2700     BACKCHAR(eptr);
2701     }
2702     }
2703     else
2704     #endif
2705     /* Not UTF-8 mode */
2706     {
2707     for (i = min; i < max; i++)
2708     {
2709 ph10 463 if (eptr >= md->end_subject)
2710 ph10 462 {
2711 ph10 463 SCHECK_PARTIAL();
2712 ph10 462 break;
2713 ph10 463 }
2714 nigel 77 c = *eptr;
2715     if ((data[c/8] & (1 << (c&7))) == 0) break;
2716     eptr++;
2717     }
2718     while (eptr >= pp)
2719     {
2720 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2721 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2722 nigel 77 eptr--;
2723     }
2724     }
2725    
2726 ph10 510 MRRETURN(MATCH_NOMATCH);
2727 nigel 77 }
2728     }
2729     /* Control never gets here */
2730    
2731    
2732     /* Match an extended character class. This opcode is encountered only
2733 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2734     mode, because Unicode properties are supported in non-UTF-8 mode. */
2735 nigel 77
2736     #ifdef SUPPORT_UTF8
2737     case OP_XCLASS:
2738     {
2739     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2740     ecode += GET(ecode, 1); /* Advance past the item */
2741    
2742     switch (*ecode)
2743     {
2744     case OP_CRSTAR:
2745     case OP_CRMINSTAR:
2746     case OP_CRPLUS:
2747     case OP_CRMINPLUS:
2748     case OP_CRQUERY:
2749     case OP_CRMINQUERY:
2750     c = *ecode++ - OP_CRSTAR;
2751     minimize = (c & 1) != 0;
2752     min = rep_min[c]; /* Pick up values from tables; */
2753     max = rep_max[c]; /* zero for max => infinity */
2754     if (max == 0) max = INT_MAX;
2755     break;
2756    
2757     case OP_CRRANGE:
2758     case OP_CRMINRANGE:
2759     minimize = (*ecode == OP_CRMINRANGE);
2760     min = GET2(ecode, 1);
2761     max = GET2(ecode, 3);
2762     if (max == 0) max = INT_MAX;
2763     ecode += 5;
2764     break;
2765    
2766     default: /* No repeat follows */
2767     min = max = 1;
2768     break;
2769     }
2770    
2771     /* First, ensure the minimum number of matches are present. */
2772    
2773     for (i = 1; i <= min; i++)
2774     {
2775 ph10 427 if (eptr >= md->end_subject)
2776 ph10 426 {
2777     SCHECK_PARTIAL();
2778 ph10 510 MRRETURN(MATCH_NOMATCH);
2779 ph10 427 }
2780 ph10 384 GETCHARINCTEST(c, eptr);
2781 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2782 nigel 77 }
2783    
2784     /* If max == min we can continue with the main loop without the
2785     need to recurse. */
2786    
2787     if (min == max) continue;
2788    
2789     /* If minimizing, keep testing the rest of the expression and advancing
2790     the pointer while it matches the class. */
2791    
2792     if (minimize)
2793     {
2794     for (fi = min;; fi++)
2795     {
2796 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2797 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2798 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2799 ph10 427 if (eptr >= md->end_subject)
2800 ph10 426 {
2801 ph10 427 SCHECK_PARTIAL();
2802 ph10 510 MRRETURN(MATCH_NOMATCH);
2803 ph10 427 }
2804 ph10 384 GETCHARINCTEST(c, eptr);
2805 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2806 nigel 77 }
2807     /* Control never gets here */
2808     }
2809    
2810     /* If maximizing, find the longest possible run, then work backwards. */
2811    
2812     else
2813     {
2814     pp = eptr;
2815     for (i = min; i < max; i++)
2816     {
2817     int len = 1;
2818 ph10 463 if (eptr >= md->end_subject)
2819 ph10 462 {
2820 ph10 463 SCHECK_PARTIAL();
2821 ph10 462 break;
2822 ph10 463 }
2823 ph10 384 GETCHARLENTEST(c, eptr, len);
2824 nigel 77 if (!_pcre_xclass(c, data)) break;
2825     eptr += len;
2826     }
2827     for(;;)
2828     {
2829 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2830 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2831     if (eptr-- == pp) break; /* Stop if tried at original pos */
2832 ph10 214 if (utf8) BACKCHAR(eptr);
2833 nigel 77 }
2834 ph10 510 MRRETURN(MATCH_NOMATCH);
2835 nigel 77 }
2836    
2837     /* Control never gets here */
2838     }
2839     #endif /* End of XCLASS */
2840    
2841     /* Match a single character, casefully */
2842    
2843     case OP_CHAR:
2844     #ifdef SUPPORT_UTF8
2845     if (utf8)
2846     {
2847     length = 1;
2848     ecode++;
2849     GETCHARLEN(fc, ecode, length);
2850 ph10 443 if (length > md->end_subject - eptr)
2851 ph10 428 {
2852     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2853 ph10 510 MRRETURN(MATCH_NOMATCH);
2854 ph10 443 }
2855 ph10 510 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2856 nigel 77 }
2857     else
2858     #endif
2859    
2860     /* Non-UTF-8 mode */
2861     {
2862 ph10 443 if (md->end_subject - eptr < 1)
2863 ph10 428 {
2864     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2865 ph10 510 MRRETURN(MATCH_NOMATCH);
2866 ph10 443 }
2867 ph10 510 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2868 nigel 77 ecode += 2;
2869     }
2870     break;
2871    
2872     /* Match a single character, caselessly */
2873    
2874 ph10 602 case OP_CHARI:
2875 nigel 77 #ifdef SUPPORT_UTF8
2876     if (utf8)
2877     {
2878     length = 1;
2879     ecode++;
2880     GETCHARLEN(fc, ecode, length);
2881    
2882 ph10 443 if (length > md->end_subject - eptr)
2883 ph10 428 {
2884     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2885 ph10 510 MRRETURN(MATCH_NOMATCH);
2886 ph10 443 }
2887 nigel 77
2888     /* If the pattern character's value is < 128, we have only one byte, and
2889     can use the fast lookup table. */
2890    
2891     if (fc < 128)
2892     {
2893 ph10 510 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2894 nigel 77 }
2895    
2896     /* Otherwise we must pick up the subject character */
2897    
2898     else
2899     {
2900 nigel 93 unsigned int dc;
2901 nigel 77 GETCHARINC(dc, eptr);
2902     ecode += length;
2903    
2904     /* If we have Unicode property support, we can use it to test the other
2905 nigel 87 case of the character, if there is one. */
2906 nigel 77
2907     if (fc != dc)
2908     {
2909     #ifdef SUPPORT_UCP
2910 ph10 349 if (dc != UCD_OTHERCASE(fc))
2911 nigel 77 #endif
2912 ph10 510 MRRETURN(MATCH_NOMATCH);
2913 nigel 77 }
2914     }
2915     }
2916     else
2917     #endif /* SUPPORT_UTF8 */
2918    
2919     /* Non-UTF-8 mode */
2920     {
2921 ph10 443 if (md->end_subject - eptr < 1)
2922 ph10 428 {
2923 ph10 443 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2924 ph10 510 MRRETURN(MATCH_NOMATCH);
2925 ph10 443 }
2926 ph10 510 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2927 nigel 77 ecode += 2;
2928     }
2929     break;
2930    
2931 nigel 93 /* Match a single character repeatedly. */
2932 nigel 77
2933     case OP_EXACT:
2934 ph10 602 case OP_EXACTI:
2935 nigel 77 min = max = GET2(ecode, 1);
2936     ecode += 3;
2937     goto REPEATCHAR;
2938    
2939 nigel 93 case OP_POSUPTO:
2940 ph10 602 case OP_POSUPTOI:
2941 nigel 93 possessive = TRUE;
2942     /* Fall through */
2943    
2944 nigel 77 case OP_UPTO:
2945 ph10 602 case OP_UPTOI:
2946 nigel 77 case OP_MINUPTO:
2947 ph10 602 case OP_MINUPTOI:
2948 nigel 77 min = 0;
2949     max = GET2(ecode, 1);
2950 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2951 nigel 77 ecode += 3;
2952     goto REPEATCHAR;
2953    
2954 nigel 93 case OP_POSSTAR:
2955 ph10 602 case OP_POSSTARI:
2956 nigel 93 possessive = TRUE;
2957     min = 0;
2958     max = INT_MAX;
2959     ecode++;
2960     goto REPEATCHAR;
2961    
2962     case OP_POSPLUS:
2963 ph10 602 case OP_POSPLUSI:
2964 nigel 93 possessive = TRUE;
2965     min = 1;
2966     max = INT_MAX;
2967     ecode++;
2968     goto REPEATCHAR;
2969    
2970     case OP_POSQUERY:
2971 ph10 602 case OP_POSQUERYI:
2972 nigel 93 possessive = TRUE;
2973     min = 0;
2974     max = 1;
2975     ecode++;
2976     goto REPEATCHAR;
2977    
2978 nigel 77 case OP_STAR:
2979 ph10 602 case OP_STARI:
2980 nigel 77 case OP_MINSTAR:
2981 ph10 602 case OP_MINSTARI:
2982 nigel 77 case OP_PLUS:
2983 ph10 602 case OP_PLUSI:
2984 nigel 77 case OP_MINPLUS:
2985 ph10 602 case OP_MINPLUSI:
2986 nigel 77 case OP_QUERY:
2987 ph10 602 case OP_QUERYI:
2988 nigel 77 case OP_MINQUERY:
2989 ph10 602 case OP_MINQUERYI:
2990     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
2991 nigel 77 minimize = (c & 1) != 0;
2992     min = rep_min[c]; /* Pick up values from tables; */
2993     max = rep_max[c]; /* zero for max => infinity */
2994     if (max == 0) max = INT_MAX;
2995    
2996 ph10 426 /* Common code for all repeated single-character matches. */
2997 nigel 77
2998     REPEATCHAR:
2999     #ifdef SUPPORT_UTF8
3000     if (utf8)
3001     {
3002     length = 1;
3003     charptr = ecode;
3004     GETCHARLEN(fc, ecode, length);
3005     ecode += length;
3006    
3007     /* Handle multibyte character matching specially here. There is
3008     support for caseless matching if UCP support is present. */
3009    
3010     if (length > 1)
3011     {
3012     #ifdef SUPPORT_UCP
3013 nigel 93 unsigned int othercase;
3014 ph10 602 if (op >= OP_STARI && /* Caseless */
3015 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
3016 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
3017 ph10 115 else oclength = 0;
3018 nigel 77 #endif /* SUPPORT_UCP */
3019    
3020     for (i = 1; i <= min; i++)
3021     {
3022 ph10 426 if (eptr <= md->end_subject - length &&
3023     memcmp(eptr, charptr, length) == 0) eptr += length;
3024 ph10 123 #ifdef SUPPORT_UCP
3025 ph10 426 else if (oclength > 0 &&
3026     eptr <= md->end_subject - oclength &&
3027     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3028     #endif /* SUPPORT_UCP */
3029 nigel 77 else
3030     {
3031 ph10 426 CHECK_PARTIAL();
3032 ph10 510 MRRETURN(MATCH_NOMATCH);
3033 nigel 77 }
3034     }
3035    
3036     if (min == max) continue;
3037    
3038     if (minimize)
3039     {
3040     for (fi = min;; fi++)
3041     {
3042 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3043 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3044 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3045 ph10 426 if (eptr <= md->end_subject - length &&
3046     memcmp(eptr, charptr, length) == 0) eptr += length;
3047 ph10 123 #ifdef SUPPORT_UCP
3048 ph10 426 else if (oclength > 0 &&
3049     eptr <= md->end_subject - oclength &&
3050     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3051     #endif /* SUPPORT_UCP */
3052 nigel 77 else
3053     {
3054 ph10 426 CHECK_PARTIAL();
3055 ph10 510 MRRETURN(MATCH_NOMATCH);
3056 nigel 77 }
3057     }
3058     /* Control never gets here */
3059     }
3060 nigel 93
3061     else /* Maximize */
3062 nigel 77 {
3063     pp = eptr;
3064     for (i = min; i < max; i++)
3065     {
3066 ph10 426 if (eptr <= md->end_subject - length &&
3067     memcmp(eptr, charptr, length) == 0) eptr += length;
3068 ph10 123 #ifdef SUPPORT_UCP
3069 ph10 426 else if (oclength > 0 &&
3070     eptr <= md->end_subject - oclength &&
3071     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3072     #endif /* SUPPORT_UCP */
3073 ph10 463 else
3074 ph10 462 {
3075 ph10 463 CHECK_PARTIAL();
3076 ph10 462 break;
3077 ph10 463 }
3078 nigel 77 }
3079 nigel 93
3080     if (possessive) continue;
3081 ph10 427
3082 ph10 120 for(;;)
3083 ph10 426 {
3084 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3085 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3086 ph10 510 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3087 ph10 115 #ifdef SUPPORT_UCP
3088 ph10 426 eptr--;
3089     BACKCHAR(eptr);
3090 ph10 123 #else /* without SUPPORT_UCP */
3091 ph10 426 eptr -= length;
3092 ph10 123 #endif /* SUPPORT_UCP */
3093 ph10 426 }
3094 nigel 77 }
3095     /* Control never gets here */
3096     }
3097    
3098     /* If the length of a UTF-8 character is 1, we fall through here, and
3099     obey the code as for non-UTF-8 characters below, though in this case the
3100     value of fc will always be < 128. */
3101     }
3102     else
3103     #endif /* SUPPORT_UTF8 */
3104    
3105     /* When not in UTF-8 mode, load a single-byte character. */
3106    
3107 ph10 426 fc = *ecode++;
3108 ph10 443
3109 nigel 77 /* The value of fc at this point is always less than 256, though we may or
3110     may not be in UTF-8 mode. The code is duplicated for the caseless and
3111     caseful cases, for speed, since matching characters is likely to be quite
3112     common. First, ensure the minimum number of matches are present. If min =
3113     max, continue at the same level without recursing. Otherwise, if
3114     minimizing, keep trying the rest of the expression and advancing one
3115     matching character if failing, up to the maximum. Alternatively, if
3116     maximizing, find the maximum number of characters and work backwards. */
3117    
3118     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3119     max, eptr));
3120    
3121 ph10 602 if (op >= OP_STARI) /* Caseless */
3122 nigel 77 {
3123     fc = md->lcc[fc];
3124     for (i = 1; i <= min; i++)
3125 ph10 426 {
3126     if (eptr >= md->end_subject)
3127     {
3128     SCHECK_PARTIAL();
3129 ph10 510 MRRETURN(MATCH_NOMATCH);
3130 ph10 426 }
3131 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3132 ph10 426 }
3133 nigel 77 if (min == max) continue;
3134     if (minimize)
3135     {
3136     for (fi = min;; fi++)
3137     {
3138 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3139 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3140 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3141 ph10 426 if (eptr >= md->end_subject)
3142     {
3143 ph10 427 SCHECK_PARTIAL();
3144 ph10 510 MRRETURN(MATCH_NOMATCH);
3145 ph10 426 }
3146 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3147 nigel 77 }
3148     /* Control never gets here */
3149     }
3150 nigel 93 else /* Maximize */
3151 nigel 77 {
3152     pp = eptr;
3153     for (i = min; i < max; i++)
3154     {
3155 ph10 463 if (eptr >= md->end_subject)
3156 ph10 462 {
3157     SCHECK_PARTIAL();
3158     break;
3159 ph10 463 }
3160 ph10 462 if (fc != md->lcc[*eptr]) break;
3161 nigel 77 eptr++;
3162     }
3163 ph10 427
3164 nigel 93 if (possessive) continue;
3165 ph10 427
3166 nigel 77 while (eptr >= pp)
3167     {
3168 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3169 nigel 77 eptr--;
3170     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3171     }
3172 ph10 510 MRRETURN(MATCH_NOMATCH);
3173 nigel 77 }
3174     /* Control never gets here */
3175     }
3176    
3177     /* Caseful comparisons (includes all multi-byte characters) */
3178    
3179     else
3180     {
3181 ph10 427 for (i = 1; i <= min; i++)
3182 ph10 426 {
3183     if (eptr >= md->end_subject)
3184     {
3185     SCHECK_PARTIAL();
3186 ph10 510 MRRETURN(MATCH_NOMATCH);
3187 ph10 426 }
3188 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3189 ph10 427 }
3190 ph10 443
3191 nigel 77 if (min == max) continue;
3192 ph10 443
3193 nigel 77 if (minimize)
3194     {
3195     for (fi = min;; fi++)
3196     {
3197 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3198 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3199 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3200 ph10 426 if (eptr >= md->end_subject)
3201 ph10 427 {
3202 ph10 426 SCHECK_PARTIAL();
3203 ph10 510 MRRETURN(MATCH_NOMATCH);
3204 ph10 427 }
3205 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3206 nigel 77 }
3207     /* Control never gets here */
3208     }
3209 nigel 93 else /* Maximize */
3210 nigel 77 {
3211     pp = eptr;
3212     for (i = min; i < max; i++)
3213     {
3214 ph10 463 if (eptr >= md->end_subject)
3215 ph10 462 {
3216 ph10 463 SCHECK_PARTIAL();
3217 ph10 462 break;
3218 ph10 463 }
3219 ph10 462 if (fc != *eptr) break;
3220 nigel 77 eptr++;
3221     }
3222 nigel 93 if (possessive) continue;
3223 ph10 443
3224 nigel 77 while (eptr >= pp)
3225     {
3226 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3227 nigel 77 eptr--;
3228     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3229     }
3230 ph10 510 MRRETURN(MATCH_NOMATCH);
3231 nigel 77 }
3232     }
3233     /* Control never gets here */
3234    
3235     /* Match a negated single one-byte character. The character we are
3236     checking can be multibyte. */
3237    
3238     case OP_NOT:
3239 ph10 625 case OP_NOTI:
3240 ph10 443 if (eptr >= md->end_subject)
3241 ph10 428 {
3242 ph10 443 SCHECK_PARTIAL();
3243 ph10 510 MRRETURN(MATCH_NOMATCH);
3244 ph10 443 }
3245 nigel 77 ecode++;
3246     GETCHARINCTEST(c, eptr);
3247 ph10 602 if (op == OP_NOTI) /* The caseless case */
3248 nigel 77 {
3249     #ifdef SUPPORT_UTF8
3250     if (c < 256)
3251     #endif
3252     c = md->lcc[c];
3253 ph10 510 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3254 nigel 77 }
3255 ph10 602 else /* Caseful */
3256 nigel 77 {
3257 ph10 510 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3258 nigel 77 }
3259     break;
3260    
3261     /* Match a negated single one-byte character repeatedly. This is almost a
3262     repeat of the code for a repeated single character, but I haven't found a
3263     nice way of commoning these up that doesn't require a test of the
3264     positive/negative option for each character match. Maybe that wouldn't add
3265     very much to the time taken, but character matching *is* what this is all
3266     about... */
3267    
3268     case OP_NOTEXACT:
3269 ph10 602 case OP_NOTEXACTI:
3270 nigel 77 min = max = GET2(ecode, 1);
3271     ecode += 3;
3272     goto REPEATNOTCHAR;
3273    
3274     case OP_NOTUPTO:
3275 ph10 602 case OP_NOTUPTOI:
3276 nigel 77 case OP_NOTMINUPTO:
3277 ph10 602 case OP_NOTMINUPTOI:
3278 nigel 77 min = 0;
3279     max = GET2(ecode, 1);
3280 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3281 nigel 77 ecode += 3;
3282     goto REPEATNOTCHAR;
3283    
3284 nigel 93 case OP_NOTPOSSTAR:
3285 ph10 602 case OP_NOTPOSSTARI:
3286 nigel 93 possessive = TRUE;
3287     min = 0;
3288     max = INT_MAX;
3289     ecode++;
3290     goto REPEATNOTCHAR;
3291    
3292     case OP_NOTPOSPLUS:
3293 ph10 602 case OP_NOTPOSPLUSI:
3294 nigel 93 possessive = TRUE;
3295     min = 1;
3296     max = INT_MAX;
3297     ecode++;
3298     goto REPEATNOTCHAR;
3299    
3300     case OP_NOTPOSQUERY:
3301 ph10 602 case OP_NOTPOSQUERYI:
3302 nigel 93 possessive = TRUE;
3303     min = 0;
3304     max = 1;
3305     ecode++;
3306     goto REPEATNOTCHAR;
3307    
3308     case OP_NOTPOSUPTO:
3309 ph10 602 case OP_NOTPOSUPTOI:
3310 nigel 93 possessive = TRUE;
3311     min = 0;
3312     max = GET2(ecode, 1);
3313     ecode += 3;
3314     goto REPEATNOTCHAR;
3315    
3316 nigel 77 case OP_NOTSTAR:
3317 ph10 602 case OP_NOTSTARI:
3318 nigel 77 case OP_NOTMINSTAR:
3319 ph10 602 case OP_NOTMINSTARI:
3320 nigel 77 case OP_NOTPLUS:
3321 ph10 602 case OP_NOTPLUSI:
3322 nigel 77 case OP_NOTMINPLUS:
3323 ph10 602 case OP_NOTMINPLUSI:
3324 nigel 77 case OP_NOTQUERY:
3325 ph10 602 case OP_NOTQUERYI:
3326 nigel 77 case OP_NOTMINQUERY:
3327 ph10 602 case OP_NOTMINQUERYI:
3328     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3329 nigel 77 minimize = (c & 1) != 0;
3330     min = rep_min[c]; /* Pick up values from tables; */
3331     max = rep_max[c]; /* zero for max => infinity */
3332     if (max == 0) max = INT_MAX;
3333    
3334 ph10 426 /* Common code for all repeated single-byte matches. */
3335 nigel 77
3336     REPEATNOTCHAR:
3337     fc = *ecode++;
3338    
3339     /* The code is duplicated for the caseless and caseful cases, for speed,
3340     since matching characters is likely to be quite common. First, ensure the
3341     minimum number of matches are present. If min = max, continue at the same
3342     level without recursing. Otherwise, if minimizing, keep trying the rest of
3343     the expression and advancing one matching character if failing, up to the
3344     maximum. Alternatively, if maximizing, find the maximum number of
3345     characters and work backwards. */
3346    
3347     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3348     max, eptr));
3349    
3350 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3351 nigel 77 {
3352     fc = md->lcc[fc];
3353    
3354     #ifdef SUPPORT_UTF8
3355     /* UTF-8 mode */
3356     if (utf8)
3357     {
3358 nigel 93 register unsigned int d;
3359 nigel 77 for (i = 1; i <= min; i++)
3360     {
3361 ph10 426 if (eptr >= md->end_subject)
3362     {
3363     SCHECK_PARTIAL();
3364 ph10 510 MRRETURN(MATCH_NOMATCH);
3365 ph10 427 }
3366 nigel 77 GETCHARINC(d, eptr);
3367     if (d < 256) d = md->lcc[d];
3368 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3369 nigel 77 }
3370     }
3371     else
3372     #endif
3373    
3374     /* Not UTF-8 mode */
3375     {
3376     for (i = 1; i <= min; i++)
3377 ph10 426 {
3378     if (eptr >= md->end_subject)
3379     {
3380     SCHECK_PARTIAL();
3381 ph10 510 MRRETURN(MATCH_NOMATCH);
3382 ph10 427 }
3383 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3384 ph10 427 }
3385 nigel 77 }
3386    
3387     if (min == max) continue;
3388    
3389     if (minimize)
3390     {
3391     #ifdef SUPPORT_UTF8
3392     /* UTF-8 mode */
3393     if (utf8)
3394     {
3395 nigel 93 register unsigned int d;
3396 nigel 77 for (fi = min;; fi++)
3397     {
3398 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3399 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3400 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3401 ph10 427 if (eptr >= md->end_subject)
3402 ph10 426 {
3403 ph10 427 SCHECK_PARTIAL();
3404 ph10 510 MRRETURN(MATCH_NOMATCH);
3405 ph10 427 }
3406 nigel 77 GETCHARINC(d, eptr);
3407     if (d < 256) d = md->lcc[d];
3408 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3409 nigel 77 }
3410     }
3411     else
3412     #endif
3413     /* Not UTF-8 mode */
3414     {
3415     for (fi = min;; fi++)
3416     {
3417 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3418 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3419 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3420 ph10 426 if (eptr >= md->end_subject)
3421     {
3422     SCHECK_PARTIAL();
3423 ph10 510 MRRETURN(MATCH_NOMATCH);
3424 ph10 426 }
3425 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3426 nigel 77 }
3427     }
3428     /* Control never gets here */
3429     }
3430    
3431     /* Maximize case */
3432    
3433     else
3434     {
3435     pp = eptr;
3436    
3437     #ifdef SUPPORT_UTF8
3438     /* UTF-8 mode */
3439     if (utf8)
3440     {
3441 nigel 93 register unsigned int d;
3442 nigel 77 for (i = min; i < max; i++)
3443     {
3444     int len = 1;
3445 ph10 463 if (eptr >= md->end_subject)
3446 ph10 462 {
3447 ph10 463 SCHECK_PARTIAL();
3448 ph10 462 break;
3449 ph10 463 }
3450 nigel 77 GETCHARLEN(d, eptr, len);
3451     if (d < 256) d = md->lcc[d];
3452     if (fc == d) break;
3453     eptr += len;
3454     }
3455 nigel 93 if (possessive) continue;
3456     for(;;)
3457 nigel 77 {
3458 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3459 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3460     if (eptr-- == pp) break; /* Stop if tried at original pos */
3461     BACKCHAR(eptr);
3462     }
3463     }
3464     else
3465     #endif
3466     /* Not UTF-8 mode */
3467     {
3468     for (i = min; i < max; i++)
3469     {
3470 ph10 463 if (eptr >= md->end_subject)
3471 ph10 462 {
3472     SCHECK_PARTIAL();
3473     break;
3474 ph10 463 }
3475 ph10 462 if (fc == md->lcc[*eptr]) break;
3476 nigel 77 eptr++;
3477     }
3478 nigel 93 if (possessive) continue;
3479 nigel 77 while (eptr >= pp)
3480     {
3481 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3482 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3483     eptr--;
3484     }
3485     }
3486    
3487 ph10 510 MRRETURN(MATCH_NOMATCH);
3488 nigel 77 }
3489     /* Control never gets here */
3490     }
3491    
3492     /* Caseful comparisons */
3493    
3494     else
3495     {
3496     #ifdef SUPPORT_UTF8
3497     /* UTF-8 mode */
3498     if (utf8)
3499     {
3500 nigel 93 register unsigned int d;
3501 nigel 77 for (i = 1; i <= min; i++)
3502     {
3503 ph10 426 if (eptr >= md->end_subject)
3504     {
3505     SCHECK_PARTIAL();
3506 ph10 510 MRRETURN(MATCH_NOMATCH);
3507 ph10 427 }
3508 nigel 77 GETCHARINC(d, eptr);
3509 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3510 nigel 77 }
3511     }
3512     else
3513     #endif
3514     /* Not UTF-8 mode */
3515     {
3516     for (i = 1; i <= min; i++)
3517 ph10 426 {
3518     if (eptr >= md->end_subject)
3519     {
3520     SCHECK_PARTIAL();
3521 ph10 510 MRRETURN(MATCH_NOMATCH);
3522 ph10 427 }
3523 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3524 ph10 427 }
3525 nigel 77 }
3526    
3527     if (min == max) continue;
3528    
3529     if (minimize)
3530     {
3531     #ifdef SUPPORT_UTF8
3532     /* UTF-8 mode */
3533     if (utf8)
3534     {
3535 nigel 93 register unsigned int d;
3536 nigel 77 for (fi = min;; fi++)
3537     {
3538 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3539 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3540 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3541 ph10 427 if (eptr >= md->end_subject)
3542 ph10 426 {
3543 ph10 427 SCHECK_PARTIAL();
3544 ph10 510 MRRETURN(MATCH_NOMATCH);
3545 ph10 427 }
3546 nigel 77 GETCHARINC(d, eptr);
3547 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3548 nigel 77 }
3549     }
3550     else
3551     #endif
3552     /* Not UTF-8 mode */
3553     {
3554     for (fi = min;; fi++)
3555     {
3556 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3557 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3558 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3559 ph10 426 if (eptr >= md->end_subject)
3560     {
3561     SCHECK_PARTIAL();
3562 ph10 510 MRRETURN(MATCH_NOMATCH);
3563 ph10 427 }
3564 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3565 nigel 77 }
3566     }
3567     /* Control never gets here */
3568     }
3569    
3570     /* Maximize case */
3571    
3572     else
3573     {
3574     pp = eptr;
3575    
3576     #ifdef SUPPORT_UTF8
3577     /* UTF-8 mode */
3578     if (utf8)
3579     {
3580 nigel 93 register unsigned int d;
3581 nigel 77 for (i = min; i < max; i++)
3582     {
3583     int len = 1;
3584 ph10 463 if (eptr >= md->end_subject)
3585 ph10 462 {
3586 ph10 463 SCHECK_PARTIAL();
3587 ph10 462 break;
3588 ph10 463 }
3589 nigel 77 GETCHARLEN(d, eptr, len);
3590     if (fc == d) break;
3591     eptr += len;
3592     }
3593 nigel 93 if (possessive) continue;
3594 nigel 77 for(;;)
3595     {
3596 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3597 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3598     if (eptr-- == pp) break; /* Stop if tried at original pos */
3599     BACKCHAR(eptr);
3600     }
3601     }
3602     else
3603     #endif
3604     /* Not UTF-8 mode */
3605     {
3606     for (i = min; i < max; i++)
3607     {
3608 ph10 463 if (eptr >= md->end_subject)
3609 ph10 462 {
3610 ph10 463 SCHECK_PARTIAL();
3611 ph10 462 break;
3612 ph10 463 }
3613 ph10 462 if (fc == *eptr) break;
3614 nigel 77 eptr++;
3615     }
3616 nigel 93 if (possessive) continue;
3617 nigel 77 while (eptr >= pp)
3618     {
3619 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3620 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3621     eptr--;
3622     }
3623     }
3624    
3625 ph10 510 MRRETURN(MATCH_NOMATCH);
3626 nigel 77 }
3627     }
3628     /* Control never gets here */
3629    
3630     /* Match a single character type repeatedly; several different opcodes
3631     share code. This is very similar to the code for single characters, but we
3632     repeat it in the interests of efficiency. */
3633    
3634     case OP_TYPEEXACT:
3635     min = max = GET2(ecode, 1);
3636     minimize = TRUE;
3637     ecode += 3;
3638     goto REPEATTYPE;
3639    
3640     case OP_TYPEUPTO:
3641     case OP_TYPEMINUPTO:
3642     min = 0;
3643     max = GET2(ecode, 1);
3644     minimize = *ecode == OP_TYPEMINUPTO;
3645     ecode += 3;
3646     goto REPEATTYPE;
3647    
3648 nigel 93 case OP_TYPEPOSSTAR:
3649     possessive = TRUE;
3650     min = 0;
3651     max = INT_MAX;
3652     ecode++;
3653     goto REPEATTYPE;
3654    
3655     case OP_TYPEPOSPLUS:
3656     possessive = TRUE;
3657     min = 1;
3658     max = INT_MAX;
3659     ecode++;
3660     goto REPEATTYPE;
3661    
3662     case OP_TYPEPOSQUERY:
3663     possessive = TRUE;
3664     min = 0;
3665     max = 1;
3666     ecode++;
3667     goto REPEATTYPE;
3668    
3669     case OP_TYPEPOSUPTO:
3670     possessive = TRUE;
3671     min = 0;
3672     max = GET2(ecode, 1);
3673     ecode += 3;
3674     goto REPEATTYPE;
3675    
3676 nigel 77 case OP_TYPESTAR:
3677     case OP_TYPEMINSTAR:
3678     case OP_TYPEPLUS:
3679     case OP_TYPEMINPLUS:
3680     case OP_TYPEQUERY:
3681     case OP_TYPEMINQUERY:
3682     c = *ecode++ - OP_TYPESTAR;
3683     minimize = (c & 1) != 0;
3684     min = rep_min[c]; /* Pick up values from tables; */
3685     max = rep_max[c]; /* zero for max => infinity */
3686     if (max == 0) max = INT_MAX;
3687    
3688     /* Common code for all repeated single character type matches. Note that
3689     in UTF-8 mode, '.' matches a character of any length, but for the other
3690     character types, the valid characters are all one-byte long. */
3691    
3692     REPEATTYPE:
3693     ctype = *ecode++; /* Code for the character type */
3694    
3695     #ifdef SUPPORT_UCP
3696     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3697     {
3698     prop_fail_result = ctype == OP_NOTPROP;
3699     prop_type = *ecode++;
3700 nigel 87 prop_value = *ecode++;
3701 nigel 77 }
3702     else prop_type = -1;
3703     #endif
3704    
3705     /* First, ensure the minimum number of matches is present. Use inline
3706     code for maximizing the speed, and do the type test once at the start
3707 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3708 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3709     and single-bytes. */
3710    
3711     if (min > 0)
3712     {
3713     #ifdef SUPPORT_UCP
3714 nigel 87 if (prop_type >= 0)
3715 nigel 77 {
3716 nigel 87 switch(prop_type)
3717 nigel 77 {
3718 nigel 87 case PT_ANY:
3719 ph10 510 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3720 nigel 87 for (i = 1; i <= min; i++)
3721     {
3722 ph10 427 if (eptr >= md->end_subject)
3723 ph10 426 {
3724 ph10 427 SCHECK_PARTIAL();
3725 ph10 510 MRRETURN(MATCH_NOMATCH);
3726 ph10 427 }
3727 ph10 184 GETCHARINCTEST(c, eptr);
3728 nigel 87 }
3729     break;
3730    
3731     case PT_LAMP:
3732     for (i = 1; i <= min; i++)
3733     {
3734 ph10 625 int chartype;
3735 ph10 427 if (eptr >= md->end_subject)
3736 ph10 426 {
3737 ph10 427 SCHECK_PARTIAL();
3738 ph10 510 MRRETURN(MATCH_NOMATCH);
3739 ph10 427 }
3740 ph10 184 GETCHARINCTEST(c, eptr);
3741 ph10 623 chartype = UCD_CHARTYPE(c);
3742     if ((chartype == ucp_Lu ||
3743     chartype == ucp_Ll ||
3744     chartype == ucp_Lt) == prop_fail_result)
3745 ph10 510 MRRETURN(MATCH_NOMATCH);
3746 nigel 87 }
3747     break;
3748    
3749     case PT_GC:
3750     for (i = 1; i <= min; i++)
3751     {
3752 ph10 427 if (eptr >= md->end_subject)
3753 ph10 426 {
3754 ph10 427 SCHECK_PARTIAL();
3755 ph10 510 MRRETURN(MATCH_NOMATCH);
3756 ph10 427 }
3757 ph10 184 GETCHARINCTEST(c, eptr);
3758 ph10 623 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3759 ph10 510 MRRETURN(MATCH_NOMATCH);
3760 nigel 87 }
3761     break;
3762    
3763     case PT_PC:
3764     for (i = 1; i <= min; i++)
3765     {
3766 ph10 427 if (eptr >= md->end_subject)
3767 ph10 426 {
3768 ph10 427 SCHECK_PARTIAL();
3769 ph10 510 MRRETURN(MATCH_NOMATCH);
3770 ph10 427 }
3771 ph10 184 GETCHARINCTEST(c, eptr);
3772 ph10 623 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3773 ph10 510 MRRETURN(MATCH_NOMATCH);
3774 nigel 87 }
3775     break;
3776    
3777     case PT_SC:
3778     for (i = 1; i <= min; i++)
3779     {
3780 ph10 427 if (eptr >= md->end_subject)
3781 ph10 426 {
3782 ph10 427 SCHECK_PARTIAL();
3783 ph10 510 MRRETURN(MATCH_NOMATCH);
3784 ph10 427 }
3785 ph10 184 GETCHARINCTEST(c, eptr);
3786 ph10 623 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3787 ph10 510 MRRETURN(MATCH_NOMATCH);
3788 nigel 87 }
3789     break;
3790 ph10 527
3791 ph10 517 case PT_ALNUM:
3792     for (i = 1; i <= min; i++)
3793     {
3794 ph10 625 int category;
3795 ph10 517 if (eptr >= md->end_subject)
3796     {
3797     SCHECK_PARTIAL();
3798     MRRETURN(MATCH_NOMATCH);
3799     }
3800     GETCHARINCTEST(c, eptr);
3801 ph10 623 category = UCD_CATEGORY(c);
3802     if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3803 ph10 517 MRRETURN(MATCH_NOMATCH);
3804     }
3805     break;
3806 ph10 527
3807 ph10 517 case PT_SPACE: /* Perl space */
3808     for (i = 1; i <= min; i++)
3809     {
3810     if (eptr >= md->end_subject)
3811     {
3812     SCHECK_PARTIAL();
3813     MRRETURN(MATCH_NOMATCH);
3814     }
3815     GETCHARINCTEST(c, eptr);
3816 ph10 623 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3817 ph10 527 c == CHAR_FF || c == CHAR_CR)
3818 ph10 517 == prop_fail_result)
3819     MRRETURN(MATCH_NOMATCH);
3820     }
3821     break;
3822 ph10 527
3823 ph10 517 case PT_PXSPACE: /* POSIX space */
3824     for (i = 1; i <= min; i++)
3825     {
3826     if (eptr >= md->end_subject)
3827     {
3828     SCHECK_PARTIAL();
3829     MRRETURN(MATCH_NOMATCH);
3830     }
3831     GETCHARINCTEST(c, eptr);
3832 ph10 623 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3833 ph10 527 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3834 ph10 517 == prop_fail_result)
3835     MRRETURN(MATCH_NOMATCH);
3836     }
3837     break;
3838 ph10 527
3839     case PT_WORD:
3840 ph10 517 for (i = 1; i <= min; i++)
3841     {
3842 ph10 625 int category;
3843 ph10 517 if (eptr >= md->end_subject)
3844     {
3845     SCHECK_PARTIAL();
3846     MRRETURN(MATCH_NOMATCH);
3847     }
3848     GETCHARINCTEST(c, eptr);
3849 ph10 623 category = UCD_CATEGORY(c);
3850     if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
3851 ph10 517 == prop_fail_result)
3852     MRRETURN(MATCH_NOMATCH);
3853     }
3854     break;
3855 ph10 527
3856 ph10 517 /* This should not occur */
3857 nigel 87
3858     default: