/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 602 - (hide annotations) (download)
Wed May 25 08:29:03 2011 UTC (3 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 188067 byte(s)
Remove OP_OPT by handling /i and /m entirely at compile time. Fixes bug with 
patterns like /(?i:([^b]))(?1)/, where the /i option was mishandled.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 nigel 77 /* Flag bits for the match() function */
61    
62 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
63     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64 nigel 77
65     /* Non-error returns from the match() function. Error returns are externally
66     defined PCRE_ERROR_xxx codes, which are all negative. */
67    
68     #define MATCH_MATCH 1
69     #define MATCH_NOMATCH 0
70    
71 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
72 ph10 210 negative to avoid the external error codes. */
73    
74 ph10 511 #define MATCH_ACCEPT (-999)
75     #define MATCH_COMMIT (-998)
76     #define MATCH_PRUNE (-997)
77     #define MATCH_SKIP (-996)
78     #define MATCH_SKIP_ARG (-995)
79     #define MATCH_THEN (-994)
80 ph10 210
81 ph10 510 /* This is a convenience macro for code that occurs many times. It copies the
current mark pointer into the match data block (md->mark = markptr) and then
leaves through RRETURN() with the given value. The expansion is a brace-enclosed
block that refers to the names md and markptr at the point of use. */
82    
83     #define MRRETURN(ra) \
84     { \
85     md->mark = markptr; \
86     RRETURN(ra); \
87     }
88    
89 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
90     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
91     because the offset vector is always a multiple of 3 long. */
92    
93     #define REC_STACK_SAVE_MAX 30
94    
95     /* Min and max values for the common repeats; for the maxima, 0 => infinity.
NOTE(review): the six entries presumably correspond, in order, to the repeat
forms *, *?, +, +?, ? and ?? - the code that indexes these tables is not visible
here, so confirm against the uses of rep_min[] and rep_max[]. */
96    
97     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
98     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
99    
100    
101    
102 ph10 475 #ifdef PCRE_DEBUG
103 nigel 77 /*************************************************
104     * Debugging function to print chars *
105     *************************************************/
106    
107     /* Print a sequence of chars in printable format, stopping at the end of the
108     subject if the requested.
109    
110     Arguments:
111     p points to characters
112     length number to print
113     is_subject TRUE if printing from within md->start_subject
114     md pointer to matching data block, if is_subject is TRUE
115    
116     Returns: nothing
117     */
118    
119     static void
120     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
121     {
122 nigel 93 unsigned int c;
123 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
124     while (length-- > 0)
125     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
126     }
127     #endif
128    
129    
130    
131     /*************************************************
132     * Match a back-reference *
133     *************************************************/
134    
135 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
136     negative, so the match always fails. However, in JavaScript compatibility mode,
137     the length passed is zero. Note that in caseless UTF-8 mode, the number of
138     subject bytes matched may be different to the number of reference bytes.
139 nigel 77
140     Arguments:
141     offset index into the offset vector
142 ph10 595 eptr pointer into the subject
143     length length of reference to be matched (number of bytes)
144 nigel 77 md points to match data block
145 ph10 602 caseless TRUE if caseless
146 nigel 77
147 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
148 nigel 77 */
149    
150 ph10 595 static int
151 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
152 ph10 602 BOOL caseless)
153 nigel 77 {
154 ph10 595 USPTR eptr_start = eptr;
155     register USPTR p = md->start_subject + md->offset_vector[offset];
156 nigel 77
157 ph10 475 #ifdef PCRE_DEBUG
158 nigel 77 if (eptr >= md->end_subject)
159     printf("matching subject <null>");
160     else
161     {
162     printf("matching subject ");
163     pchars(eptr, length, TRUE, md);
164     }
165     printf(" against backref ");
166     pchars(p, length, FALSE, md);
167     printf("\n");
168     #endif
169    
170 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
171 nigel 77
172 ph10 595 if (length < 0) return -1;
173 nigel 77
174 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
175     properly if Unicode properties are supported. Otherwise, we can check only
176     ASCII characters. */
177 nigel 77
178 ph10 602 if (caseless)
179 nigel 77 {
180 ph10 354 #ifdef SUPPORT_UTF8
181     #ifdef SUPPORT_UCP
182     if (md->utf8)
183     {
184 ph10 595 /* Match characters up to the end of the reference. NOTE: the number of
185     bytes matched may differ, because there are some characters whose upper and
186     lower case versions code as different numbers of bytes. For example, U+023A
187     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
188     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
189     the latter. It is important, therefore, to check the length along the
190     reference, not along the subject (earlier code did this wrong). */
191    
192     USPTR endptr = p + length;
193     while (p < endptr)
194 ph10 354 {
195 ph10 358 int c, d;
196 ph10 597 if (eptr >= md->end_subject) return -1;
197 ph10 354 GETCHARINC(c, eptr);
198     GETCHARINC(d, p);
199 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
200 ph10 358 }
201     }
202 ph10 354 else
203     #endif
204     #endif
205    
206     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
207     is no UCP support. */
208 ph10 597 {
209     if (eptr + length > md->end_subject) return -1;
210     while (length-- > 0)
211     { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
212     }
213 nigel 77 }
214 ph10 358
215 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
216     are in UTF-8 mode. */
217 ph10 358
218 nigel 77 else
219 ph10 597 {
220     if (eptr + length > md->end_subject) return -1;
221     while (length-- > 0) if (*p++ != *eptr++) return -1;
222     }
223 nigel 77
224 ph10 595 return eptr - eptr_start;
225 nigel 77 }
226    
227    
228    
229     /***************************************************************************
230     ****************************************************************************
231     RECURSION IN THE match() FUNCTION
232    
233 nigel 87 The match() function is highly recursive, though not every recursive call
234     increases the recursive depth. Nevertheless, some regular expressions can cause
235     it to recurse to a great depth. I was writing for Unix, so I just let it call
236     itself recursively. This uses the stack for saving everything that has to be
237     saved for a recursive call. On Unix, the stack can be large, and this works
238     fine.
239 nigel 77
240 nigel 87 It turns out that on some non-Unix-like systems there are problems with
241     programs that use a lot of stack. (This despite the fact that every last chip
242     has oodles of memory these days, and techniques for extending the stack have
243     been known for decades.) So....
244 nigel 77
245     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
246     calls by keeping local variables that need to be preserved in blocks of memory
247 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
248 nigel 77 achieve this so that the actual code doesn't look very different to what it
249     always used to.
250 ph10 164
251 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
252 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
253     Switzer, the use of longjmp() has been abolished, at the cost of having to
254     provide a unique number for each call to RMATCH. There is no way of generating
255     a sequence of numbers at compile time in C. I have given them names, to make
256     them stand out more clearly.
257    
258     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
259     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
260 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
261     don't have indeterminate values; this has meant that the frame size can be
262 ph10 164 reduced because the result can be "passed back" by straight setting of the
263     variable instead of being passed in the frame.
264 nigel 77 ****************************************************************************
265     ***************************************************************************/
266    
267 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
268     below must be updated in sync. Each number is passed as the final ("rw")
argument of one RMATCH call. The heap version of RMATCH stores it in the calling
frame's Xwhere field and defines the label L_<name> at the call site; the stack
version of RMATCH does not use it. */
269 nigel 77
270 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
271     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
272     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
273     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
274 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
275 ph10 527 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
276     RM61, RM62 };
277 ph10 164
278 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
279 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
280 ph10 501 actually used in this definition. RMATCH calls match() with ra, rb, rc, rd, re
and rf as its eptr, ecode, offset_top, md, eptrb and flags arguments, passes the
caller's mstart and markptr through unchanged, passes rdepth+1 as the depth, and
stores the result in rrc. RRETURN returns its argument (after a diagnostic
printf when PCRE_DEBUG is defined). */
281 nigel 77
282     #ifndef NO_RECURSE
283     #define REGISTER register
284 ph10 164
285 ph10 475 #ifdef PCRE_DEBUG
286 ph10 602 #define RMATCH(ra,rb,rc,rd,re,rf,rw) \
287 nigel 87 { \
288     printf("match() called in line %d\n", __LINE__); \
289 ph10 602 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rdepth+1); \
290 nigel 87 printf("to line %d\n", __LINE__); \
291     }
292     #define RRETURN(ra) \
293     { \
294     printf("match() returned %d from line %d ", ra, __LINE__); \
295     return ra; \
296     }
297     #else
298 ph10 602 #define RMATCH(ra,rb,rc,rd,re,rf,rw) \
299     rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rdepth+1)
300 nigel 77 #define RRETURN(ra) return ra
301 nigel 87 #endif
302    
303 nigel 77 #else
304    
305    
306 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
307     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
308     argument of match(), which never changes.

RMATCH obtains a new heapframe from pcre_stack_malloc (leaving through RRETURN
with PCRE_ERROR_NOMEMORY if that fails), records rw in the current frame's
Xwhere field, copies the arguments into the new frame, links the new frame to
the current one through Xprevframe, makes it the current frame, and jumps to
HEAP_RECURSE. The label L_<rw> that it defines marks the point at which the
calling code carries on afterwards.

RRETURN releases the current frame with pcre_stack_free and steps back to the
previous one. If there is a previous frame, the value is placed in rrc and
control jumps to HEAP_RETURN; otherwise the value is returned to the caller. */
309 nigel 77
310     #define REGISTER
311    
312 ph10 602 #define RMATCH(ra,rb,rc,rd,re,rf,rw)\
313 nigel 77 {\
314 ph10 563 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
315 ph10 534 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
316 ph10 164 frame->Xwhere = rw; \
317     newframe->Xeptr = ra;\
318     newframe->Xecode = rb;\
319 ph10 168 newframe->Xmstart = mstart;\
320 ph10 501 newframe->Xmarkptr = markptr;\
321 ph10 164 newframe->Xoffset_top = rc;\
322 ph10 602 newframe->Xeptrb = re;\
323     newframe->Xflags = rf;\
324 ph10 164 newframe->Xrdepth = frame->Xrdepth + 1;\
325     newframe->Xprevframe = frame;\
326     frame = newframe;\
327     DPRINTF(("restarting from line %d\n", __LINE__));\
328     goto HEAP_RECURSE;\
329     L_##rw:\
330     DPRINTF(("jumped back to line %d\n", __LINE__));\
331 nigel 77 }
332    
333     #define RRETURN(ra)\
334     {\
335 ph10 527 heapframe *oldframe = frame;\
336     frame = oldframe->Xprevframe;\
337     (pcre_stack_free)(oldframe);\
338 nigel 77 if (frame != NULL)\
339     {\
340 ph10 164 rrc = ra;\
341     goto HEAP_RETURN;\
342 nigel 77 }\
343     return ra;\
344     }
345    
346    
347     /* Structure for remembering the local variables in a private frame. Frames
form a singly linked list through Xprevframe: RMATCH links each new frame to the
frame that was current, and the top-level frame set up in match() has
Xprevframe set to NULL. When NO_RECURSE is defined, match() reaches each field
Xname through a #define that maps the plain name onto frame->Xname. */
348    
349     typedef struct heapframe {
350     struct heapframe *Xprevframe;
351    
352     /* Function arguments that may change. They are filled in from match()'s own
parameters for the top-level frame and by RMATCH for every later frame. */
353    
354 ph10 409 USPTR Xeptr;
355 nigel 77 const uschar *Xecode;
356 ph10 409 USPTR Xmstart;
357 ph10 501 USPTR Xmarkptr;
358 nigel 77 int Xoffset_top;
359     eptrblock *Xeptrb;
360     int Xflags;
361 nigel 91 unsigned int Xrdepth;
362 nigel 77
363     /* Function local variables */
364    
365 ph10 409 USPTR Xcallpat;
366 ph10 406 #ifdef SUPPORT_UTF8
367 ph10 409 USPTR Xcharptr;
368 ph10 406 #endif
369 ph10 409 USPTR Xdata;
370     USPTR Xnext;
371     USPTR Xpp;
372     USPTR Xprev;
373     USPTR Xsaved_eptr;
374 nigel 77
375     recursion_info Xnew_recursive;
376    
377     BOOL Xcur_is_word;
378     BOOL Xcondition;
379     BOOL Xprev_is_word;
380    
381     #ifdef SUPPORT_UCP
382     int Xprop_type;
383 nigel 87 int Xprop_value;
384 nigel 77 int Xprop_fail_result;
385     int Xprop_category;
386     int Xprop_chartype;
387 nigel 87 int Xprop_script;
388 ph10 123 int Xoclength;
389     uschar Xocchars[8];
390 nigel 77 #endif
391    
392 ph10 403 int Xcodelink;
393 nigel 77 int Xctype;
394 nigel 93 unsigned int Xfc;
395 nigel 77 int Xfi;
396     int Xlength;
397     int Xmax;
398     int Xmin;
399     int Xnumber;
400     int Xoffset;
401     int Xop;
402     int Xsave_capture_last;
403     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
404     int Xstacksave[REC_STACK_SAVE_MAX];
405    
406     eptrblock Xnewptrb;
407    
408 ph10 164 /* Where to jump back to: the RMn number that RMATCH stores here before it
switches to the new frame */
409 nigel 77
410 ph10 164 int Xwhere;
411 ph10 165
412 nigel 77 } heapframe;
413    
414     #endif
415    
416    
417     /***************************************************************************
418     ***************************************************************************/
419    
420    
421    
422     /*************************************************
423     * Match from current position *
424     *************************************************/
425    
426 nigel 93 /* This function is called recursively in many circumstances. Whenever it
427 nigel 77 returns a negative (error) response, the outer incarnation must also return the
428 ph10 426 same response. */
429 nigel 77
430 ph10 426 /* These macros pack up tests that are used for partial matching, and which
431     appear several times in the code. We set the "hit end" flag (md->hitend) if
432     partial matching is enabled (md->partial != 0), the pointer is at or past the
433 ph10 427 end of the subject, and it is also past md->start_used_ptr (i.e. something
434     has been inspected). For hard partial matching (md->partial > 1), we then
435     return PCRE_ERROR_PARTIAL immediately via MRRETURN. The second one omits the
end-of-subject test; it is used when we already know we are past the end of
the subject. */
436 ph10 426
437     #define CHECK_PARTIAL()\
438 ph10 553 if (md->partial != 0 && eptr >= md->end_subject && \
439     eptr > md->start_used_ptr) \
440     { \
441     md->hitend = TRUE; \
442     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
443 ph10 427 }
444 ph10 426
445     #define SCHECK_PARTIAL()\
446 ph10 553 if (md->partial != 0 && eptr > md->start_used_ptr) \
447     { \
448     md->hitend = TRUE; \
449     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
450 ph10 427 }
451 ph10 426
452 ph10 427
453 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
454     the md structure (e.g. utf8, end_subject) into individual variables to improve
455 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
456     made performance worse.
457    
458     Arguments:
459 nigel 93 eptr pointer to current character in subject
460     ecode pointer to current position in compiled code
461 ph10 168 mstart pointer to the current match start position (can be modified
462 ph10 172 by encountering \K)
463 ph10 501 markptr pointer to the most recent MARK name, or NULL
464 nigel 77 offset_top current top pointer
465     md pointer to "static" info for the match
466     eptrb pointer to chain of blocks containing eptr at start of
467     brackets - for testing for empty matches
468     flags can contain
469     match_condassert - this is an assertion condition
470 nigel 93 match_cbegroup - this is the start of an unlimited repeat
471     group that can match an empty string
472 nigel 87 rdepth the recursion depth
473 nigel 77
474     Returns: MATCH_MATCH if matched ) these values are >= 0
475     MATCH_NOMATCH if failed to match )
476 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
477 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
478 nigel 87 (e.g. stopped by repeated call or recursion limit)
479 nigel 77 */
480    
481     static int
482 ph10 510 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
483 ph10 602 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
484     int flags, unsigned int rdepth)
485 nigel 77 {
486     /* These variables do not need to be preserved over recursion in this function,
487 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
488     "register" because they are used a lot in loops. */
489 nigel 77
490 nigel 91 register int rrc; /* Returns from recursive calls */
491     register int i; /* Used for loops not involving calls to RMATCH() */
492 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
493 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
494 nigel 77
495 nigel 93 BOOL minimize, possessive; /* Quantifier options */
496 ph10 602 BOOL caseless;
497 ph10 403 int condcode;
498 nigel 93
499 nigel 77 /* When recursion is not being used, all "local" variables that have to be
500     preserved over calls to RMATCH() are part of a "frame" which is obtained from
501     heap storage. Set up the top-level frame here; others are obtained from the
502     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
503    
504     #ifdef NO_RECURSE
505 ph10 563 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
506 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
507 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
508    
509     /* Copy in the original argument variables */
510    
511     frame->Xeptr = eptr;
512     frame->Xecode = ecode;
513 ph10 168 frame->Xmstart = mstart;
514 ph10 501 frame->Xmarkptr = markptr;
515 nigel 77 frame->Xoffset_top = offset_top;
516     frame->Xeptrb = eptrb;
517     frame->Xflags = flags;
518 nigel 87 frame->Xrdepth = rdepth;
519 nigel 77
520     /* This is where control jumps back to to effect "recursion" */
521    
522     HEAP_RECURSE:
523    
524     /* Macros make the argument variables come from the current frame */
525    
526     #define eptr frame->Xeptr
527     #define ecode frame->Xecode
528 ph10 168 #define mstart frame->Xmstart
529 ph10 501 #define markptr frame->Xmarkptr
530 nigel 77 #define offset_top frame->Xoffset_top
531     #define eptrb frame->Xeptrb
532     #define flags frame->Xflags
533 nigel 87 #define rdepth frame->Xrdepth
534 nigel 77
535     /* Ditto for the local variables */
536    
537     #ifdef SUPPORT_UTF8
538     #define charptr frame->Xcharptr
539     #endif
540     #define callpat frame->Xcallpat
541 ph10 403 #define codelink frame->Xcodelink
542 nigel 77 #define data frame->Xdata
543     #define next frame->Xnext
544     #define pp frame->Xpp
545     #define prev frame->Xprev
546     #define saved_eptr frame->Xsaved_eptr
547    
548     #define new_recursive frame->Xnew_recursive
549    
550     #define cur_is_word frame->Xcur_is_word
551     #define condition frame->Xcondition
552     #define prev_is_word frame->Xprev_is_word
553    
554     #ifdef SUPPORT_UCP
555     #define prop_type frame->Xprop_type
556 nigel 87 #define prop_value frame->Xprop_value
557 nigel 77 #define prop_fail_result frame->Xprop_fail_result
558     #define prop_category frame->Xprop_category
559     #define prop_chartype frame->Xprop_chartype
560 nigel 87 #define prop_script frame->Xprop_script
561 ph10 115 #define oclength frame->Xoclength
562     #define occhars frame->Xocchars
563 nigel 77 #endif
564    
565     #define ctype frame->Xctype
566     #define fc frame->Xfc
567     #define fi frame->Xfi
568     #define length frame->Xlength
569     #define max frame->Xmax
570     #define min frame->Xmin
571     #define number frame->Xnumber
572     #define offset frame->Xoffset
573     #define op frame->Xop
574     #define save_capture_last frame->Xsave_capture_last
575     #define save_offset1 frame->Xsave_offset1
576     #define save_offset2 frame->Xsave_offset2
577     #define save_offset3 frame->Xsave_offset3
578     #define stacksave frame->Xstacksave
579    
580     #define newptrb frame->Xnewptrb
581    
582     /* When recursion is being used, local variables are allocated on the stack and
583     get preserved during recursion in the normal way. In this environment, fi and
584     i, and fc and c, can be the same variables. */
585    
586 nigel 93 #else /* NO_RECURSE not defined */
587 nigel 77 #define fi i
588     #define fc c
589    
590    
591 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
592     const uschar *charptr; /* in small blocks of the code. My normal */
593     #endif /* style of coding would have declared */
594     const uschar *callpat; /* them within each of those blocks. */
595     const uschar *data; /* However, in order to accommodate the */
596     const uschar *next; /* version of this code that uses an */
597     USPTR pp; /* external "stack" implemented on the */
598     const uschar *prev; /* heap, it is easier to declare them all */
599     USPTR saved_eptr; /* here, so the declarations can be cut */
600     /* out in a block. The only declarations */
601     recursion_info new_recursive; /* within blocks below are for variables */
602     /* that do not have to be preserved over */
603     BOOL cur_is_word; /* a recursive call to RMATCH(). */
604     BOOL condition;
605 nigel 77 BOOL prev_is_word;
606    
607     #ifdef SUPPORT_UCP
608     int prop_type;
609 nigel 87 int prop_value;
610 nigel 77 int prop_fail_result;
611     int prop_category;
612     int prop_chartype;
613 nigel 87 int prop_script;
614 ph10 115 int oclength;
615     uschar occhars[8];
616 nigel 77 #endif
617    
618 ph10 399 int codelink;
619 nigel 77 int ctype;
620     int length;
621     int max;
622     int min;
623     int number;
624     int offset;
625     int op;
626     int save_capture_last;
627     int save_offset1, save_offset2, save_offset3;
628     int stacksave[REC_STACK_SAVE_MAX];
629    
630     eptrblock newptrb;
631 nigel 93 #endif /* NO_RECURSE */
632 nigel 77
633     /* These statements are here to stop the compiler complaining about uninitialized
634     variables. */
635    
636     #ifdef SUPPORT_UCP
637 nigel 87 prop_value = 0;
638 nigel 77 prop_fail_result = 0;
639     #endif
640    
641 nigel 93
642 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
643     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
644     used. Thanks to Ian Taylor for noticing this possibility and sending the
645     original patch. */
646    
647     TAIL_RECURSE:
648    
649 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
650     are specified by the macro RMATCH and RRETURN is used to return. When
651     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
652 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
653 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
654     complicated macro. It has to be used in one particular way. This shouldn't,
655     however, impact performance when true recursion is being used. */
656 nigel 77
657 ph10 164 #ifdef SUPPORT_UTF8
658     utf8 = md->utf8; /* Local copy of the flag */
659     #else
660     utf8 = FALSE;
661     #endif
662    
663 nigel 87 /* First check that we haven't called match() too many times, or that we
664     haven't exceeded the recursive call limit. */
665    
666 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
667 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
668 nigel 77
669 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
670     string, the match_cbegroup flag is set. When this is the case, add the current
671     subject pointer to the chain of such remembered pointers, to be checked when we
672     hit the closing ket, in order to break infinite loops that match no characters.
673 ph10 197 When match() is called in other circumstances, don't add to the chain. The
674     match_cbegroup flag must NOT be used with tail recursion, because the memory
675     block that is used is on the stack, so a new one may be required for each
676     match(). */
677 nigel 77
678 nigel 93 if ((flags & match_cbegroup) != 0)
679 nigel 77 {
680 ph10 197 newptrb.epb_saved_eptr = eptr;
681     newptrb.epb_prev = eptrb;
682     eptrb = &newptrb;
683 nigel 77 }
684    
685 nigel 93 /* Now start processing the opcodes. */
686 nigel 77
687     for (;;)
688     {
689 nigel 93 minimize = possessive = FALSE;
690 nigel 77 op = *ecode;
691 ph10 443
692 nigel 93 switch(op)
693     {
694 ph10 510 case OP_MARK:
695     markptr = ecode + 2;
696     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
697 ph10 602 eptrb, flags, RM55);
698 ph10 512
699     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
700     argument, and we must check whether that argument matches this MARK's
701     argument. It is passed back in md->start_match_ptr (an overloading of that
702     variable). If it does match, we reset that variable to the current subject
703     position and return MATCH_SKIP. Otherwise, pass back the return code
704 ph10 510 unaltered. */
705 ph10 512
706     if (rrc == MATCH_SKIP_ARG &&
707 ph10 510 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
708     {
709     md->start_match_ptr = eptr;
710     RRETURN(MATCH_SKIP);
711     }
712    
713 ph10 512 if (md->mark == NULL) md->mark = markptr;
714 ph10 510 RRETURN(rrc);
715    
716 ph10 210 case OP_FAIL:
717 ph10 510 MRRETURN(MATCH_NOMATCH);
718 ph10 211
719 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
720 ph10 553
721 ph10 510 case OP_COMMIT:
722     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
723 ph10 602 eptrb, flags, RM52);
724 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
725 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
726     rrc != MATCH_THEN)
727 ph10 551 RRETURN(rrc);
728 ph10 510 MRRETURN(MATCH_COMMIT);
729    
730 ph10 551 /* PRUNE overrides THEN */
731 ph10 553
732 ph10 210 case OP_PRUNE:
733     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
734 ph10 602 eptrb, flags, RM51);
735 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
736 ph10 510 MRRETURN(MATCH_PRUNE);
737 ph10 211
738 ph10 510 case OP_PRUNE_ARG:
739     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
740 ph10 602 eptrb, flags, RM56);
741 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
742 ph10 510 md->mark = ecode + 2;
743     RRETURN(MATCH_PRUNE);
744 ph10 211
745 ph10 551 /* SKIP overrides PRUNE and THEN */
746 ph10 553
747 ph10 210 case OP_SKIP:
748     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
749 ph10 602 eptrb, flags, RM53);
750 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
751 ph10 551 RRETURN(rrc);
752 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
753 ph10 510 MRRETURN(MATCH_SKIP);
754 ph10 211
755 ph10 510 case OP_SKIP_ARG:
756     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 ph10 602 eptrb, flags, RM57);
758 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
759 ph10 551 RRETURN(rrc);
760 ph10 512
761     /* Pass back the current skip name by overloading md->start_match_ptr and
762     returning the special MATCH_SKIP_ARG return code. This will either be
763     caught by a matching MARK, or get to the top, where it is treated the same
764 ph10 510 as PRUNE. */
765 ph10 512
766 ph10 510 md->start_match_ptr = ecode + 2;
767 ph10 512 RRETURN(MATCH_SKIP_ARG);
768 ph10 553
769 ph10 550 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
770 ph10 553 the alt that is at the start of the current branch. This makes it possible
771     to skip back past alternatives that precede the THEN within the current
772     branch. */
773 ph10 512
774 ph10 210 case OP_THEN:
775     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
776 ph10 602 eptrb, flags, RM54);
777 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
778 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
779 ph10 510 MRRETURN(MATCH_THEN);
780    
781     case OP_THEN_ARG:
782 ph10 553 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
783 ph10 602 offset_top, md, eptrb, flags, RM58);
784 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
785 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
786     md->mark = ecode + LINK_SIZE + 2;
787 ph10 212 RRETURN(MATCH_THEN);
788 ph10 211
789 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
790     the current subject position in the working slot at the top of the vector.
791     We mustn't change the current values of the data slot, because they may be
792     set from a previous iteration of this group, and be referred to by a
793     reference inside the group.
794 nigel 77
795 nigel 93 If the bracket fails to match, we need to restore this value and also the
796     values of the final offsets, in case they were set by a previous iteration
797     of the same bracket.
798 nigel 77
799 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
800     a non-capturing bracket. Don't worry about setting the flag for the error
801     case here; that is handled in the code for KET. */
802 nigel 77
803 nigel 93 case OP_CBRA:
804     case OP_SCBRA:
805     number = GET2(ecode, 1+LINK_SIZE);
806 nigel 77 offset = number << 1;
807    
808 ph10 475 #ifdef PCRE_DEBUG
809 nigel 93 printf("start bracket %d\n", number);
810     printf("subject=");
811 nigel 77 pchars(eptr, 16, TRUE, md);
812     printf("\n");
813     #endif
814    
815     if (offset < md->offset_max)
816     {
817     save_offset1 = md->offset_vector[offset];
818     save_offset2 = md->offset_vector[offset+1];
819     save_offset3 = md->offset_vector[md->offset_end - number];
820     save_capture_last = md->capture_last;
821    
822     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
823 ph10 531 md->offset_vector[md->offset_end - number] =
824 ph10 530 (int)(eptr - md->start_subject);
825 nigel 77
826 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
827 nigel 77 do
828     {
829 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
830 ph10 602 eptrb, flags, RM1);
831 ph10 550 if (rrc != MATCH_NOMATCH &&
832     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
833     RRETURN(rrc);
834 nigel 77 md->capture_last = save_capture_last;
835     ecode += GET(ecode, 1);
836     }
837     while (*ecode == OP_ALT);
838    
839     DPRINTF(("bracket %d failed\n", number));
840    
841     md->offset_vector[offset] = save_offset1;
842     md->offset_vector[offset+1] = save_offset2;
843     md->offset_vector[md->offset_end - number] = save_offset3;
844    
845 ph10 510 if (rrc != MATCH_THEN) md->mark = markptr;
846 nigel 77 RRETURN(MATCH_NOMATCH);
847     }
848    
849 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
850     as a non-capturing bracket. */
851 nigel 77
852 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
853     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
854    
855 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
856 nigel 77
857 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
858     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
859    
860 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
861     final alternative within the brackets, we would return the result of a
862     recursive call to match() whatever happened. We can reduce stack usage by
863 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
864     is set.*/
865 nigel 77
866 nigel 93 case OP_BRA:
867     case OP_SBRA:
868     DPRINTF(("start non-capturing bracket\n"));
869     flags = (op >= OP_SBRA)? match_cbegroup : 0;
870 nigel 91 for (;;)
871 nigel 77 {
872 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
873 nigel 93 {
874 ph10 197 if (flags == 0) /* Not a possibly empty group */
875     {
876     ecode += _pcre_OP_lengths[*ecode];
877     DPRINTF(("bracket 0 tail recursion\n"));
878     goto TAIL_RECURSE;
879     }
880    
881     /* Possibly empty group; can't use tail recursion. */
882    
883 ph10 602 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
884     flags, RM48);
885 ph10 512 if (rrc == MATCH_NOMATCH) md->mark = markptr;
886     RRETURN(rrc);
887 nigel 93 }
888 nigel 91
889     /* For non-final alternatives, continue the loop for a NOMATCH result;
890     otherwise return. */
891    
892 ph10 602 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
893     flags, RM2);
894 ph10 550 if (rrc != MATCH_NOMATCH &&
895     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
896     RRETURN(rrc);
897 nigel 77 ecode += GET(ecode, 1);
898     }
899 nigel 91 /* Control never reaches here. */
900 nigel 77
901     /* Conditional group: compilation checked that there are no more than
902     two branches. If the condition is false, skipping the first branch takes us
903     past the end if there is only one branch, but that's OK because that is
904 nigel 91 exactly what going to the ket would do. As there is only one branch to be
905     obeyed, we can use tail recursion to avoid using another stack frame. */
906 nigel 77
907     case OP_COND:
908 nigel 93 case OP_SCOND:
909 ph10 399 codelink= GET(ecode, 1);
910 ph10 406
911 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
912     inserted between OP_COND and an assertion condition. */
913 ph10 392
914 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
915     {
916     if (pcre_callout != NULL)
917     {
918     pcre_callout_block cb;
919     cb.version = 1; /* Version 1 of the callout block */
920     cb.callout_number = ecode[LINK_SIZE+2];
921     cb.offset_vector = md->offset_vector;
922     cb.subject = (PCRE_SPTR)md->start_subject;
923 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
924     cb.start_match = (int)(mstart - md->start_subject);
925     cb.current_position = (int)(eptr - md->start_subject);
926 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
927     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
928     cb.capture_top = offset_top/2;
929     cb.capture_last = md->capture_last;
930     cb.callout_data = md->callout_data;
931 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
932 ph10 381 if (rrc < 0) RRETURN(rrc);
933     }
934     ecode += _pcre_OP_lengths[OP_CALLOUT];
935     }
936 ph10 392
937 ph10 399 condcode = ecode[LINK_SIZE+1];
938 ph10 406
939 ph10 381 /* Now see what the actual condition is */
940 ph10 392
941 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
942 nigel 77 {
943 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
944     {
945 ph10 461 condition = FALSE;
946     ecode += GET(ecode, 1);
947     }
948 ph10 459 else
949 ph10 461 {
950 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
951     condition = (recno == RREF_ANY || recno == md->recursive->group_num);
952 ph10 461
953 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
954     false, but the test was set up by name, scan the table to see if the
955     name refers to any other numbers, and test them. The condition is true
956     if any one is set. */
957 ph10 461
958 ph10 459 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
959     {
960     uschar *slotA = md->name_table;
961     for (i = 0; i < md->name_count; i++)
962 ph10 461 {
963     if (GET2(slotA, 0) == recno) break;
964 ph10 459 slotA += md->name_entry_size;
965     }
966 ph10 461
967 ph10 459 /* Found a name for the number - there can be only one; duplicate
968     names for different numbers are allowed, but not vice versa. First
969     scan down for duplicates. */
970 ph10 461
971 ph10 459 if (i < md->name_count)
972 ph10 461 {
973 ph10 459 uschar *slotB = slotA;
974     while (slotB > md->name_table)
975     {
976     slotB -= md->name_entry_size;
977     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
978     {
979     condition = GET2(slotB, 0) == md->recursive->group_num;
980 ph10 461 if (condition) break;
981     }
982 ph10 459 else break;
983 ph10 461 }
984    
985 ph10 459 /* Scan up for duplicates */
986 ph10 461
987 ph10 459 if (!condition)
988 ph10 461 {
989 ph10 459 slotB = slotA;
990     for (i++; i < md->name_count; i++)
991     {
992     slotB += md->name_entry_size;
993     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
994     {
995     condition = GET2(slotB, 0) == md->recursive->group_num;
996     if (condition) break;
997 ph10 461 }
998 ph10 459 else break;
999 ph10 461 }
1000     }
1001 ph10 459 }
1002 ph10 461 }
1003    
1004 ph10 459 /* Choose branch according to the condition */
1005 ph10 461
1006 ph10 459 ecode += condition? 3 : GET(ecode, 1);
1007     }
1008 ph10 461 }
1009 nigel 93
1010 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1011 nigel 93 {
1012 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1013 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1014 ph10 461
1015 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1016 ph10 461 scan the table to see if the name refers to any other numbers, and test
1017     them. The condition is true if any one is set. This is tediously similar
1018     to the code above, but not close enough to try to amalgamate. */
1019    
1020 ph10 459 if (!condition && condcode == OP_NCREF)
1021     {
1022 ph10 461 int refno = offset >> 1;
1023 ph10 459 uschar *slotA = md->name_table;
1024 ph10 461
1025 ph10 459 for (i = 0; i < md->name_count; i++)
1026 ph10 461 {
1027     if (GET2(slotA, 0) == refno) break;
1028 ph10 459 slotA += md->name_entry_size;
1029     }
1030 ph10 461
1031     /* Found a name for the number - there can be only one; duplicate names
1032     for different numbers are allowed, but not vice versa. First scan down
1033 ph10 459 for duplicates. */
1034 ph10 461
1035 ph10 459 if (i < md->name_count)
1036 ph10 461 {
1037 ph10 459 uschar *slotB = slotA;
1038     while (slotB > md->name_table)
1039     {
1040     slotB -= md->name_entry_size;
1041     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1042     {
1043     offset = GET2(slotB, 0) << 1;
1044 ph10 461 condition = offset < offset_top &&
1045 ph10 459 md->offset_vector[offset] >= 0;
1046 ph10 461 if (condition) break;
1047     }
1048 ph10 459 else break;
1049 ph10 461 }
1050    
1051 ph10 459 /* Scan up for duplicates */
1052 ph10 461
1053 ph10 459 if (!condition)
1054 ph10 461 {
1055 ph10 459 slotB = slotA;
1056     for (i++; i < md->name_count; i++)
1057     {
1058     slotB += md->name_entry_size;
1059     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1060     {
1061     offset = GET2(slotB, 0) << 1;
1062 ph10 461 condition = offset < offset_top &&
1063 ph10 459 md->offset_vector[offset] >= 0;
1064 ph10 461 if (condition) break;
1065     }
1066 ph10 459 else break;
1067 ph10 461 }
1068     }
1069 ph10 459 }
1070 ph10 461 }
1071    
1072 ph10 459 /* Choose branch according to the condition */
1073    
1074 nigel 93 ecode += condition? 3 : GET(ecode, 1);
1075 nigel 77 }
1076    
1077 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1078 nigel 93 {
1079     condition = FALSE;
1080     ecode += GET(ecode, 1);
1081     }
1082    
1083 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1084 nigel 93 the final argument match_condassert causes it to stop at the end of an
1085     assertion. */
1086 nigel 77
1087     else
1088     {
1089 ph10 602 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL,
1090 ph10 164 match_condassert, RM3);
1091 nigel 77 if (rrc == MATCH_MATCH)
1092     {
1093 nigel 93 condition = TRUE;
1094     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1095 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1096     }
1097 ph10 550 else if (rrc != MATCH_NOMATCH &&
1098     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1099 nigel 77 {
1100     RRETURN(rrc); /* Need braces because of following else */
1101     }
1102 nigel 93 else
1103     {
1104     condition = FALSE;
1105 ph10 399 ecode += codelink;
1106 nigel 93 }
1107     }
1108 nigel 91
1109 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
1110 ph10 197 we can use tail recursion to avoid using another stack frame, except when
1111     match_cbegroup is required for an unlimited repeat of a possibly empty
1112     group. If the second alternative doesn't exist, we can just plough on. */
1113 nigel 91
1114 nigel 93 if (condition || *ecode == OP_ALT)
1115     {
1116 nigel 91 ecode += 1 + LINK_SIZE;
1117 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
1118     {
1119 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, match_cbegroup, RM49);
1120 ph10 197 RRETURN(rrc);
1121     }
1122     else /* Group must match something */
1123     {
1124     flags = 0;
1125     goto TAIL_RECURSE;
1126     }
1127 nigel 77 }
1128 ph10 395 else /* Condition false & no alternative */
1129 nigel 93 {
1130     ecode += 1 + LINK_SIZE;
1131     }
1132     break;
1133 nigel 77
1134 ph10 461
1135 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1136     to close any currently open capturing brackets. */
1137 ph10 461
1138 ph10 447 case OP_CLOSE:
1139 ph10 461 number = GET2(ecode, 1);
1140 ph10 447 offset = number << 1;
1141 ph10 461
1142 ph10 475 #ifdef PCRE_DEBUG
1143 ph10 447 printf("end bracket %d at *ACCEPT", number);
1144     printf("\n");
1145     #endif
1146 nigel 77
1147 ph10 447 md->capture_last = number;
1148     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1149     {
1150     md->offset_vector[offset] =
1151     md->offset_vector[md->offset_end - number];
1152 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1153 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1154     }
1155     ecode += 3;
1156 ph10 461 break;
1157 ph10 447
1158    
1159 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
1160     recursion, we should restore the offsets appropriately and continue from
1161     after the call. */
1162 nigel 77
1163 ph10 210 case OP_ACCEPT:
1164 nigel 77 case OP_END:
1165     if (md->recursive != NULL && md->recursive->group_num == 0)
1166     {
1167     recursion_info *rec = md->recursive;
1168 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
1169 nigel 77 md->recursive = rec->prevrec;
1170     memmove(md->offset_vector, rec->offset_save,
1171     rec->saved_max * sizeof(int));
1172 ph10 461 offset_top = rec->save_offset_top;
1173 nigel 77 ecode = rec->after_call;
1174     break;
1175     }
1176    
1177 ph10 442 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1178     set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1179     the subject. In both cases, backtracking will then try other alternatives,
1180     if any. */
1181 ph10 443
1182 ph10 442 if (eptr == mstart &&
1183     (md->notempty ||
1184 ph10 443 (md->notempty_atstart &&
1185 ph10 442 mstart == md->start_subject + md->start_offset)))
1186 ph10 510 MRRETURN(MATCH_NOMATCH);
1187 ph10 443
1188 ph10 442 /* Otherwise, we have a match. */
1189 nigel 77
1190 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1191     md->end_offset_top = offset_top; /* and how many extracts were taken */
1192 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1193 nigel 77
1194 ph10 512 /* For some reason, the macros don't work properly if an expression is
1195     given as the argument to MRRETURN when the heap is in use. */
1196    
1197     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1198     MRRETURN(rrc);
1199    
1200 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1201     matching won't pass the KET for an assertion. If any one branch matches,
1202     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1203     start of each branch to move the current point backwards, so the code at
1204     this level is identical to the lookahead case. */
1205    
1206     case OP_ASSERT:
1207     case OP_ASSERTBACK:
1208     do
1209     {
1210 ph10 602 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, 0,
1211 ph10 164 RM4);
1212 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1213 ph10 500 {
1214     mstart = md->start_match_ptr; /* In case \K reset it */
1215     break;
1216 ph10 501 }
1217 ph10 550 if (rrc != MATCH_NOMATCH &&
1218     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1219     RRETURN(rrc);
1220 nigel 77 ecode += GET(ecode, 1);
1221     }
1222     while (*ecode == OP_ALT);
1223 ph10 510 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1224 nigel 77
1225     /* If checking an assertion for a condition, return MATCH_MATCH. */
1226    
1227     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1228    
1229     /* Continue from after the assertion, updating the offsets high water
1230     mark, since extracts may have been taken during the assertion. */
1231    
1232     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1233     ecode += 1 + LINK_SIZE;
1234     offset_top = md->end_offset_top;
1235     continue;
1236    
1237 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1238 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1239 ph10 473 branches. */
1240 nigel 77
1241     case OP_ASSERT_NOT:
1242     case OP_ASSERTBACK_NOT:
1243     do
1244     {
1245 ph10 602 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, 0,
1246 ph10 164 RM5);
1247 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1248 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1249     {
1250     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1251 ph10 482 break;
1252     }
1253 ph10 550 if (rrc != MATCH_NOMATCH &&
1254     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1255     RRETURN(rrc);
1256 nigel 77 ecode += GET(ecode,1);
1257     }
1258     while (*ecode == OP_ALT);
1259    
1260     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1261    
1262     ecode += 1 + LINK_SIZE;
1263     continue;
1264    
1265     /* Move the subject pointer back. This occurs only at the start of
1266     each branch of a lookbehind assertion. If we are too close to the start to
1267     move back, this match function fails. When working with UTF-8 we move
1268     back a number of characters, not bytes. */
1269    
1270     case OP_REVERSE:
1271     #ifdef SUPPORT_UTF8
1272     if (utf8)
1273     {
1274 nigel 93 i = GET(ecode, 1);
1275     while (i-- > 0)
1276 nigel 77 {
1277     eptr--;
1278 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1279 ph10 207 BACKCHAR(eptr);
1280 nigel 77 }
1281     }
1282     else
1283     #endif
1284    
1285     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1286    
1287     {
1288 nigel 93 eptr -= GET(ecode, 1);
1289 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1290 nigel 77 }
1291    
1292 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1293 nigel 77
1294 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1295 nigel 77 ecode += 1 + LINK_SIZE;
1296     break;
1297    
1298     /* The callout item calls an external function, if one is provided, passing
1299     details of the match so far. This is mainly for debugging, though the
1300     function is able to force a failure. */
1301    
1302     case OP_CALLOUT:
1303     if (pcre_callout != NULL)
1304     {
1305     pcre_callout_block cb;
1306     cb.version = 1; /* Version 1 of the callout block */
1307     cb.callout_number = ecode[1];
1308     cb.offset_vector = md->offset_vector;
1309 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1310 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1311     cb.start_match = (int)(mstart - md->start_subject);
1312     cb.current_position = (int)(eptr - md->start_subject);
1313 nigel 77 cb.pattern_position = GET(ecode, 2);
1314     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1315     cb.capture_top = offset_top/2;
1316     cb.capture_last = md->capture_last;
1317     cb.callout_data = md->callout_data;
1318 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1319 nigel 77 if (rrc < 0) RRETURN(rrc);
1320     }
1321     ecode += 2 + 2*LINK_SIZE;
1322     break;
1323    
1324     /* Recursion either matches the current regex, or some subexpression. The
1325     offset data is the offset to the starting bracket from the start of the
1326     whole pattern. (This is so that it works from duplicated subpatterns.)
1327    
1328     If there are any capturing brackets started but not finished, we have to
1329     save their starting points and reinstate them after the recursion. However,
1330     we don't know how many such there are (offset_top records the completed
1331     total) so we just have to save all the potential data. There may be up to
1332     65535 such values, which is too large to put on the stack, but using malloc
1333     for small numbers seems expensive. As a compromise, the stack is used when
1334     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1335     is used. A problem is what to do if the malloc fails ... there is no way of
1336     returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
1337     values on the stack, and accept that the rest may be wrong.
1338    
1339     There are also other values that have to be saved. We use a chained
1340     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1341     for the original version of this logic. */
1342    
1343     case OP_RECURSE:
1344     {
1345     callpat = md->start_code + GET(ecode, 1);
1346 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1347     GET2(callpat, 1 + LINK_SIZE);
1348 nigel 77
1349     /* Add to "recursing stack" */
1350    
1351     new_recursive.prevrec = md->recursive;
1352     md->recursive = &new_recursive;
1353    
1354     /* Find where to continue from afterwards */
1355    
1356     ecode += 1 + LINK_SIZE;
1357     new_recursive.after_call = ecode;
1358    
1359     /* Now save the offset data. */
1360    
1361     new_recursive.saved_max = md->offset_end;
1362     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1363     new_recursive.offset_save = stacksave;
1364     else
1365     {
1366     new_recursive.offset_save =
1367     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1368     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1369     }
1370    
1371     memcpy(new_recursive.offset_save, md->offset_vector,
1372     new_recursive.saved_max * sizeof(int));
1373 ph10 461 new_recursive.save_offset_top = offset_top;
1374 nigel 77
1375     /* OK, now we can do the recursion. For each top-level alternative we
1376     restore the offset and recursion data. */
1377    
1378     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1379 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1380 nigel 77 do
1381     {
1382 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1383 ph10 602 md, eptrb, flags, RM6);
1384 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1385 nigel 77 {
1386 nigel 87 DPRINTF(("Recursion matched\n"));
1387 nigel 77 md->recursive = new_recursive.prevrec;
1388     if (new_recursive.offset_save != stacksave)
1389     (pcre_free)(new_recursive.offset_save);
1390 ph10 510 MRRETURN(MATCH_MATCH);
1391 nigel 77 }
1392 ph10 550 else if (rrc != MATCH_NOMATCH &&
1393     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1394 nigel 87 {
1395     DPRINTF(("Recursion gave error %d\n", rrc));
1396 ph10 400 if (new_recursive.offset_save != stacksave)
1397     (pcre_free)(new_recursive.offset_save);
1398 nigel 87 RRETURN(rrc);
1399     }
1400 nigel 77
1401     md->recursive = &new_recursive;
1402     memcpy(md->offset_vector, new_recursive.offset_save,
1403     new_recursive.saved_max * sizeof(int));
1404     callpat += GET(callpat, 1);
1405     }
1406     while (*callpat == OP_ALT);
1407    
1408     DPRINTF(("Recursion didn't match\n"));
1409     md->recursive = new_recursive.prevrec;
1410     if (new_recursive.offset_save != stacksave)
1411     (pcre_free)(new_recursive.offset_save);
1412 ph10 510 MRRETURN(MATCH_NOMATCH);
1413 nigel 77 }
1414     /* Control never reaches here */
1415    
1416     /* "Once" brackets are like assertion brackets except that after a match,
1417     the point in the subject string is not moved back. Thus there can never be
1418     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1419     Check the alternative branches in turn - the matching won't pass the KET
1420     for this kind of subpattern. If any one branch matches, we carry on as at
1421 ph10 500 the end of a normal bracket, leaving the subject pointer, but resetting
1422     the start-of-match value in case it was changed by \K. */
1423 nigel 77
1424     case OP_ONCE:
1425 nigel 91 prev = ecode;
1426     saved_eptr = eptr;
1427    
1428     do
1429 nigel 77 {
1430 ph10 602 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, 0, RM7);
1431 ph10 511 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1432 ph10 500 {
1433     mstart = md->start_match_ptr;
1434     break;
1435 ph10 501 }
1436 ph10 550 if (rrc != MATCH_NOMATCH &&
1437     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1438     RRETURN(rrc);
1439 nigel 91 ecode += GET(ecode,1);
1440     }
1441     while (*ecode == OP_ALT);
1442 nigel 77
1443 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1444 nigel 77
1445 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1446 nigel 77
1447 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1448     mark, since extracts may have been taken. */
1449 nigel 77
1450 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1451 nigel 77
1452 nigel 91 offset_top = md->end_offset_top;
1453     eptr = md->end_match_ptr;
1454 nigel 77
1455 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1456     happens for a repeating ket if no characters were matched in the group.
1457     This is the forcible breaking of infinite loops as implemented in Perl
1458     5.005. If there is an options reset, it will get obeyed in the normal
1459     course of events. */
1460 nigel 77
1461 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1462     {
1463     ecode += 1+LINK_SIZE;
1464     break;
1465     }
1466 nigel 77
1467 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1468     preceding bracket, in the appropriate order. The second "call" of match()
1469 ph10 602 uses tail recursion, to avoid using another stack frame. */
1470 nigel 77
1471 nigel 91 if (*ecode == OP_KETRMIN)
1472     {
1473 ph10 602 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, 0, RM8);
1474 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1475     ecode = prev;
1476 ph10 197 flags = 0;
1477 nigel 91 goto TAIL_RECURSE;
1478 nigel 77 }
1479 nigel 91 else /* OP_KETRMAX */
1480     {
1481 ph10 602 RMATCH(eptr, prev, offset_top, md, eptrb, match_cbegroup, RM9);
1482 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1483     ecode += 1 + LINK_SIZE;
1484 ph10 197 flags = 0;
1485 nigel 91 goto TAIL_RECURSE;
1486     }
1487     /* Control never gets here */
1488 nigel 77
1489     /* An alternation is the end of a branch; scan along to find the end of the
1490     bracketed group and go to there. */
1491    
1492     case OP_ALT:
1493     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1494     break;
1495    
1496 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1497     indicating that it may occur zero times. It may repeat infinitely, or not
1498     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1499     with fixed upper repeat limits are compiled as a number of copies, with the
1500     optional ones preceded by BRAZERO or BRAMINZERO. */
1501 nigel 77
1502     case OP_BRAZERO:
1503     {
1504     next = ecode+1;
1505 ph10 602 RMATCH(eptr, next, offset_top, md, eptrb, 0, RM10);
1506 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1507     do next += GET(next,1); while (*next == OP_ALT);
1508 nigel 93 ecode = next + 1 + LINK_SIZE;
1509 nigel 77 }
1510     break;
1511    
1512     case OP_BRAMINZERO:
1513     {
1514     next = ecode+1;
1515 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1516 ph10 602 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, 0, RM11);
1517 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1518     ecode++;
1519     }
1520     break;
1521    
1522 ph10 335 case OP_SKIPZERO:
1523     {
1524     next = ecode+1;
1525     do next += GET(next,1); while (*next == OP_ALT);
1526     ecode = next + 1 + LINK_SIZE;
1527     }
1528     break;
1529    
1530 nigel 93 /* End of a group, repeated or non-repeating. */
1531 nigel 77
1532     case OP_KET:
1533     case OP_KETRMIN:
1534     case OP_KETRMAX:
1535 nigel 91 prev = ecode - GET(ecode, 1);
1536 nigel 77
1537 nigel 93 /* If this was a group that remembered the subject start, in order to break
1538     infinite repeats of empty string matches, retrieve the subject start from
1539     the chain. Otherwise, set it NULL. */
1540 nigel 77
1541 nigel 93 if (*prev >= OP_SBRA)
1542     {
1543     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1544     eptrb = eptrb->epb_prev; /* Backup to previous group */
1545     }
1546     else saved_eptr = NULL;
1547 nigel 77
1548 ph10 500 /* If we are at the end of an assertion group or an atomic group, stop
1549     matching and return MATCH_MATCH, but record the current high water mark for
1550     use by positive assertions. We also need to record the match start in case
1551     it was changed by \K. */
1552 nigel 93
1553 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1554     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1555     *prev == OP_ONCE)
1556     {
1557     md->end_match_ptr = eptr; /* For ONCE */
1558     md->end_offset_top = offset_top;
1559 ph10 500 md->start_match_ptr = mstart;
1560 ph10 510 MRRETURN(MATCH_MATCH);
1561 nigel 91 }
1562 nigel 77
1563 nigel 93 /* For capturing groups we have to check the group number back at the start
1564     and if necessary complete handling an extraction by setting the offsets and
1565     bumping the high water mark. Note that whole-pattern recursion is coded as
1566     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1567     when the OP_END is reached. Other recursion is handled here. */
1568 nigel 77
1569 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1570 nigel 91 {
1571 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1572 nigel 91 offset = number << 1;
1573 ph10 461
1574 ph10 475 #ifdef PCRE_DEBUG
1575 nigel 91 printf("end bracket %d", number);
1576     printf("\n");
1577 nigel 77 #endif
1578    
1579 nigel 93 md->capture_last = number;
1580     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1581 nigel 91 {
1582 nigel 93 md->offset_vector[offset] =
1583     md->offset_vector[md->offset_end - number];
1584 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1585 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1586     }
1587 nigel 77
1588 nigel 93 /* Handle a recursively called group. Restore the offsets
1589     appropriately and continue from after the call. */
1590 nigel 77
1591 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1592     {
1593     recursion_info *rec = md->recursive;
1594     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1595     md->recursive = rec->prevrec;
1596     memcpy(md->offset_vector, rec->offset_save,
1597     rec->saved_max * sizeof(int));
1598 ph10 461 offset_top = rec->save_offset_top;
1599 nigel 93 ecode = rec->after_call;
1600     break;
1601 nigel 77 }
1602 nigel 91 }
1603 nigel 77
1604 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1605     happens for a repeating ket if no characters were matched in the group.
1606     This is the forcible breaking of infinite loops as implemented in Perl
1607     5.005. If there is an options reset, it will get obeyed in the normal
1608     course of events. */
1609 nigel 77
1610 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1611     {
1612     ecode += 1 + LINK_SIZE;
1613     break;
1614     }
1615 nigel 77
1616 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1617     preceding bracket, in the appropriate order. In the second case, we can use
1618 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1619     unlimited repeat of a group that can match an empty string. */
1620 nigel 77
1621 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1622    
1623 nigel 91 if (*ecode == OP_KETRMIN)
1624     {
1625 ph10 602 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, 0, RM12);
1626 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1627 ph10 197 if (flags != 0) /* Could match an empty string */
1628     {
1629 ph10 602 RMATCH(eptr, prev, offset_top, md, eptrb, flags, RM50);
1630 ph10 197 RRETURN(rrc);
1631     }
1632 nigel 91 ecode = prev;
1633     goto TAIL_RECURSE;
1634 nigel 77 }
1635 nigel 91 else /* OP_KETRMAX */
1636     {
1637 ph10 602 RMATCH(eptr, prev, offset_top, md, eptrb, flags, RM13);
1638 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1639     ecode += 1 + LINK_SIZE;
1640 ph10 197 flags = 0;
1641 nigel 91 goto TAIL_RECURSE;
1642     }
1643     /* Control never gets here */
1644 nigel 77
1645 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
1646 nigel 77
1647     case OP_CIRC:
1648 ph10 510 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1649 ph10 602
1650 nigel 77 /* Start of subject assertion */
1651    
1652     case OP_SOD:
1653 ph10 510 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1654 nigel 77 ecode++;
1655     break;
1656 ph10 602
1657     /* Multiline mode: start of subject unless notbol, or after any newline. */
1658 nigel 77
1659 ph10 602 case OP_CIRCM:
1660     if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1661     if (eptr != md->start_subject &&
1662     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1663     MRRETURN(MATCH_NOMATCH);
1664     ecode++;
1665     break;
1666    
1667 nigel 77 /* Start of match assertion */
1668    
1669     case OP_SOM:
1670 ph10 510 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1671 nigel 77 ecode++;
1672     break;
1673 ph10 172
1674 ph10 168 /* Reset the start of match point */
1675 ph10 172
1676 ph10 168 case OP_SET_SOM:
1677     mstart = eptr;
1678 ph10 172 ecode++;
1679     break;
1680 nigel 77
1681 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
1682     unless noteol is set. */
1683 nigel 77
1684 ph10 602 case OP_DOLLM:
1685     if (eptr < md->end_subject)
1686     { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1687     else
1688 nigel 77 {
1689 ph10 510 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1690 ph10 602 SCHECK_PARTIAL();
1691 nigel 77 }
1692 ph10 602 ecode++;
1693     break;
1694 ph10 579
1695 ph10 602 /* Not multiline mode: assert before a terminating newline or before end of
1696     subject unless noteol is set. */
1697    
1698     case OP_DOLL:
1699     if (md->noteol) MRRETURN(MATCH_NOMATCH);
1700     if (!md->endonly) goto ASSERT_NL_OR_EOS;
1701    
1702 nigel 91 /* ... else fall through for endonly */
1703 nigel 77
1704     /* End of subject assertion (\z) */
1705    
1706     case OP_EOD:
1707 ph10 510 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1708 ph10 553 SCHECK_PARTIAL();
1709 nigel 77 ecode++;
1710     break;
1711    
1712     /* End of subject or ending \n assertion (\Z) */
1713    
1714     case OP_EODN:
1715 ph10 553 ASSERT_NL_OR_EOS:
1716     if (eptr < md->end_subject &&
1717 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1718 ph10 510 MRRETURN(MATCH_NOMATCH);
1719 ph10 579
1720 ph10 553 /* Either at end of string or \n before end. */
1721 ph10 579
1722 ph10 553 SCHECK_PARTIAL();
1723 nigel 77 ecode++;
1724     break;
1725    
1726     /* Word boundary assertions */
1727    
1728     case OP_NOT_WORD_BOUNDARY:
1729     case OP_WORD_BOUNDARY:
1730     {
1731    
1732     /* Find out if the previous and current characters are "word" characters.
1733     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1734 ph10 443 be "non-word" characters. Remember the earliest consulted character for
1735 ph10 435 partial matching. */
1736 nigel 77
1737     #ifdef SUPPORT_UTF8
1738     if (utf8)
1739     {
1740 ph10 518 /* Get status of previous character */
1741 ph10 527
1742 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
1743     {
1744 ph10 409 USPTR lastptr = eptr - 1;
1745 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1746 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1747 nigel 77 GETCHAR(c, lastptr);
1748 ph10 527 #ifdef SUPPORT_UCP
1749 ph10 518 if (md->use_ucp)
1750     {
1751     if (c == '_') prev_is_word = TRUE; else
1752 ph10 527 {
1753 ph10 518 int cat = UCD_CATEGORY(c);
1754     prev_is_word = (cat == ucp_L || cat == ucp_N);
1755 ph10 527 }
1756     }
1757     else
1758     #endif
1759 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1760     }
1761 ph10 527
1762 ph10 518 /* Get status of next character */
1763 ph10 527
1764 ph10 443 if (eptr >= md->end_subject)
1765 nigel 77 {
1766 ph10 443 SCHECK_PARTIAL();
1767     cur_is_word = FALSE;
1768 ph10 428 }
1769     else
1770     {
1771 nigel 77 GETCHAR(c, eptr);
1772 ph10 527 #ifdef SUPPORT_UCP
1773 ph10 518 if (md->use_ucp)
1774     {
1775     if (c == '_') cur_is_word = TRUE; else
1776 ph10 527 {
1777 ph10 518 int cat = UCD_CATEGORY(c);
1778     cur_is_word = (cat == ucp_L || cat == ucp_N);
1779 ph10 527 }
1780     }
1781     else
1782     #endif
1783 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1784     }
1785     }
1786     else
1787     #endif
1788    
1789 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1790 ph10 518 consistency with the behaviour of \w we do use it in this case. */
1791 nigel 77
1792     {
1793 ph10 518 /* Get status of previous character */
1794 ph10 527
1795 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
1796     {
1797 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1798 ph10 527 #ifdef SUPPORT_UCP
1799 ph10 518 if (md->use_ucp)
1800     {
1801 ph10 527 c = eptr[-1];
1802 ph10 518 if (c == '_') prev_is_word = TRUE; else
1803 ph10 527 {
1804 ph10 518 int cat = UCD_CATEGORY(c);
1805     prev_is_word = (cat == ucp_L || cat == ucp_N);
1806 ph10 527 }
1807     }
1808     else
1809     #endif
1810 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1811     }
1812 ph10 527
1813 ph10 518 /* Get status of next character */
1814 ph10 527
1815 ph10 443 if (eptr >= md->end_subject)
1816 ph10 428 {
1817 ph10 443 SCHECK_PARTIAL();
1818     cur_is_word = FALSE;
1819 ph10 428 }
1820 ph10 527 else
1821     #ifdef SUPPORT_UCP
1822 ph10 518 if (md->use_ucp)
1823     {
1824 ph10 527 c = *eptr;
1825 ph10 518 if (c == '_') cur_is_word = TRUE; else
1826 ph10 527 {
1827 ph10 518 int cat = UCD_CATEGORY(c);
1828     cur_is_word = (cat == ucp_L || cat == ucp_N);
1829 ph10 527 }
1830     }
1831     else
1832     #endif
1833 ph10 518 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1834 nigel 77 }
1835    
1836     /* Now see if the situation is what we want */
1837    
1838     if ((*ecode++ == OP_WORD_BOUNDARY)?
1839     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1840 ph10 510 MRRETURN(MATCH_NOMATCH);
1841 nigel 77 }
1842     break;
1843    
1844     /* Match a single character type; inline for speed */
1845    
1846     case OP_ANY:
1847 ph10 510 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1848 ph10 345 /* Fall through */
1849    
1850 ph10 341 case OP_ALLANY:
1851 ph10 443 if (eptr++ >= md->end_subject)
1852 ph10 428 {
1853 ph10 443 SCHECK_PARTIAL();
1854 ph10 510 MRRETURN(MATCH_NOMATCH);
1855 ph10 443 }
1856 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1857 nigel 77 ecode++;
1858     break;
1859    
1860     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1861     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1862    
1863     case OP_ANYBYTE:
1864 ph10 443 if (eptr++ >= md->end_subject)
1865 ph10 428 {
1866 ph10 443 SCHECK_PARTIAL();
1867 ph10 510 MRRETURN(MATCH_NOMATCH);
1868 ph10 443 }
1869 nigel 77 ecode++;
1870     break;
1871    
1872     case OP_NOT_DIGIT:
1873 ph10 443 if (eptr >= md->end_subject)
1874 ph10 428 {
1875 ph10 443 SCHECK_PARTIAL();
1876 ph10 510 MRRETURN(MATCH_NOMATCH);
1877 ph10 443 }
1878 nigel 77 GETCHARINCTEST(c, eptr);
1879     if (
1880     #ifdef SUPPORT_UTF8
1881     c < 256 &&
1882     #endif
1883     (md->ctypes[c] & ctype_digit) != 0
1884     )
1885 ph10 510 MRRETURN(MATCH_NOMATCH);
1886 nigel 77 ecode++;
1887     break;
1888    
1889     case OP_DIGIT:
1890 ph10 443 if (eptr >= md->end_subject)
1891 ph10 428 {
1892 ph10 443 SCHECK_PARTIAL();
1893 ph10 510 MRRETURN(MATCH_NOMATCH);
1894 ph10 443 }
1895 nigel 77 GETCHARINCTEST(c, eptr);
1896     if (
1897     #ifdef SUPPORT_UTF8
1898     c >= 256 ||
1899     #endif
1900     (md->ctypes[c] & ctype_digit) == 0
1901     )
1902 ph10 510 MRRETURN(MATCH_NOMATCH);
1903 nigel 77 ecode++;
1904     break;
1905    
1906     case OP_NOT_WHITESPACE:
1907 ph10 443 if (eptr >= md->end_subject)
1908 ph10 428 {
1909 ph10 443 SCHECK_PARTIAL();
1910 ph10 510 MRRETURN(MATCH_NOMATCH);
1911 ph10 443 }
1912 nigel 77 GETCHARINCTEST(c, eptr);
1913     if (
1914     #ifdef SUPPORT_UTF8
1915     c < 256 &&
1916     #endif
1917     (md->ctypes[c] & ctype_space) != 0
1918     )
1919 ph10 510 MRRETURN(MATCH_NOMATCH);
1920 nigel 77 ecode++;
1921     break;
1922    
1923     case OP_WHITESPACE:
1924 ph10 443 if (eptr >= md->end_subject)
1925 ph10 428 {
1926 ph10 443 SCHECK_PARTIAL();
1927 ph10 510 MRRETURN(MATCH_NOMATCH);
1928 ph10 443 }
1929 nigel 77 GETCHARINCTEST(c, eptr);
1930     if (
1931     #ifdef SUPPORT_UTF8
1932     c >= 256 ||
1933     #endif
1934     (md->ctypes[c] & ctype_space) == 0
1935     )
1936 ph10 510 MRRETURN(MATCH_NOMATCH);
1937 nigel 77 ecode++;
1938     break;
1939    
1940     case OP_NOT_WORDCHAR:
1941 ph10 443 if (eptr >= md->end_subject)
1942 ph10 428 {
1943 ph10 443 SCHECK_PARTIAL();
1944 ph10 510 MRRETURN(MATCH_NOMATCH);
1945 ph10 443 }
1946 nigel 77 GETCHARINCTEST(c, eptr);
1947     if (
1948     #ifdef SUPPORT_UTF8
1949     c < 256 &&
1950     #endif
1951     (md->ctypes[c] & ctype_word) != 0
1952     )
1953 ph10 510 MRRETURN(MATCH_NOMATCH);
1954 nigel 77 ecode++;
1955     break;
1956    
1957     case OP_WORDCHAR:
1958 ph10 443 if (eptr >= md->end_subject)
1959 ph10 428 {
1960 ph10 443 SCHECK_PARTIAL();
1961 ph10 510 MRRETURN(MATCH_NOMATCH);
1962 ph10 443 }
1963 nigel 77 GETCHARINCTEST(c, eptr);
1964     if (
1965     #ifdef SUPPORT_UTF8
1966     c >= 256 ||
1967     #endif
1968     (md->ctypes[c] & ctype_word) == 0
1969     )
1970 ph10 510 MRRETURN(MATCH_NOMATCH);
1971 nigel 77 ecode++;
1972     break;
1973    
1974 nigel 93 case OP_ANYNL:
1975 ph10 443 if (eptr >= md->end_subject)
1976 ph10 428 {
1977 ph10 443 SCHECK_PARTIAL();
1978 ph10 510 MRRETURN(MATCH_NOMATCH);
1979 ph10 443 }
1980 nigel 93 GETCHARINCTEST(c, eptr);
1981     switch(c)
1982     {
1983 ph10 510 default: MRRETURN(MATCH_NOMATCH);
1984 ph10 600
1985 nigel 93 case 0x000d:
1986     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1987     break;
1988 ph10 231
1989 nigel 93 case 0x000a:
1990 ph10 231 break;
1991    
1992 nigel 93 case 0x000b:
1993     case 0x000c:
1994     case 0x0085:
1995     case 0x2028:
1996     case 0x2029:
1997 ph10 510 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1998 nigel 93 break;
1999     }
2000     ecode++;
2001     break;
2002    
2003 ph10 178 case OP_NOT_HSPACE:
2004 ph10 443 if (eptr >= md->end_subject)
2005 ph10 428 {
2006 ph10 443 SCHECK_PARTIAL();
2007 ph10 510 MRRETURN(MATCH_NOMATCH);
2008 ph10 443 }
2009 ph10 178 GETCHARINCTEST(c, eptr);
2010     switch(c)
2011     {
2012     default: break;
2013     case 0x09: /* HT */
2014     case 0x20: /* SPACE */
2015     case 0xa0: /* NBSP */
2016     case 0x1680: /* OGHAM SPACE MARK */
2017     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2018     case 0x2000: /* EN QUAD */
2019     case 0x2001: /* EM QUAD */
2020     case 0x2002: /* EN SPACE */
2021     case 0x2003: /* EM SPACE */
2022     case 0x2004: /* THREE-PER-EM SPACE */
2023     case 0x2005: /* FOUR-PER-EM SPACE */
2024     case 0x2006: /* SIX-PER-EM SPACE */
2025     case 0x2007: /* FIGURE SPACE */
2026     case 0x2008: /* PUNCTUATION SPACE */
2027     case 0x2009: /* THIN SPACE */
2028     case 0x200A: /* HAIR SPACE */
2029     case 0x202f: /* NARROW NO-BREAK SPACE */
2030     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2031     case 0x3000: /* IDEOGRAPHIC SPACE */
2032 ph10 510 MRRETURN(MATCH_NOMATCH);
2033 ph10 178 }
2034     ecode++;
2035     break;
2036    
2037     case OP_HSPACE:
2038 ph10 443 if (eptr >= md->end_subject)
2039 ph10 428 {
2040 ph10 443 SCHECK_PARTIAL();
2041 ph10 510 MRRETURN(MATCH_NOMATCH);
2042 ph10 443 }
2043 ph10 178 GETCHARINCTEST(c, eptr);
2044     switch(c)
2045     {
2046 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2047 ph10 178 case 0x09: /* HT */
2048     case 0x20: /* SPACE */
2049     case 0xa0: /* NBSP */
2050     case 0x1680: /* OGHAM SPACE MARK */
2051     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2052     case 0x2000: /* EN QUAD */
2053     case 0x2001: /* EM QUAD */
2054     case 0x2002: /* EN SPACE */
2055     case 0x2003: /* EM SPACE */
2056     case 0x2004: /* THREE-PER-EM SPACE */
2057     case 0x2005: /* FOUR-PER-EM SPACE */
2058     case 0x2006: /* SIX-PER-EM SPACE */
2059     case 0x2007: /* FIGURE SPACE */
2060     case 0x2008: /* PUNCTUATION SPACE */
2061     case 0x2009: /* THIN SPACE */
2062     case 0x200A: /* HAIR SPACE */
2063     case 0x202f: /* NARROW NO-BREAK SPACE */
2064     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2065     case 0x3000: /* IDEOGRAPHIC SPACE */
2066     break;
2067     }
2068     ecode++;
2069     break;
2070    
2071     case OP_NOT_VSPACE:
2072 ph10 443 if (eptr >= md->end_subject)
2073 ph10 428 {
2074 ph10 443 SCHECK_PARTIAL();
2075 ph10 510 MRRETURN(MATCH_NOMATCH);
2076 ph10 443 }
2077 ph10 178 GETCHARINCTEST(c, eptr);
2078     switch(c)
2079     {
2080     default: break;
2081     case 0x0a: /* LF */
2082     case 0x0b: /* VT */
2083     case 0x0c: /* FF */
2084     case 0x0d: /* CR */
2085     case 0x85: /* NEL */
2086     case 0x2028: /* LINE SEPARATOR */
2087     case 0x2029: /* PARAGRAPH SEPARATOR */
2088 ph10 510 MRRETURN(MATCH_NOMATCH);
2089 ph10 178 }
2090     ecode++;
2091     break;
2092    
2093     case OP_VSPACE:
2094 ph10 443 if (eptr >= md->end_subject)
2095 ph10 428 {
2096 ph10 443 SCHECK_PARTIAL();
2097 ph10 510 MRRETURN(MATCH_NOMATCH);
2098 ph10 443 }
2099 ph10 178 GETCHARINCTEST(c, eptr);
2100     switch(c)
2101     {
2102 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2103 ph10 178 case 0x0a: /* LF */
2104     case 0x0b: /* VT */
2105     case 0x0c: /* FF */
2106     case 0x0d: /* CR */
2107     case 0x85: /* NEL */
2108     case 0x2028: /* LINE SEPARATOR */
2109     case 0x2029: /* PARAGRAPH SEPARATOR */
2110     break;
2111     }
2112     ecode++;
2113     break;
2114    
2115 nigel 77 #ifdef SUPPORT_UCP
2116     /* Check the next character by Unicode property. We will get here only
2117     if the support is in the binary; otherwise a compile-time error occurs. */
2118    
2119     case OP_PROP:
2120     case OP_NOTPROP:
2121 ph10 443 if (eptr >= md->end_subject)
2122 ph10 428 {
2123 ph10 443 SCHECK_PARTIAL();
2124 ph10 510 MRRETURN(MATCH_NOMATCH);
2125 ph10 443 }
2126 nigel 77 GETCHARINCTEST(c, eptr);
2127     {
2128 ph10 384 const ucd_record *prop = GET_UCD(c);
2129 nigel 77
2130 nigel 87 switch(ecode[1])
2131     {
2132     case PT_ANY:
2133 ph10 510 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2134 nigel 87 break;
2135 nigel 77
2136 nigel 87 case PT_LAMP:
2137 ph10 349 if ((prop->chartype == ucp_Lu ||
2138     prop->chartype == ucp_Ll ||
2139     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2140 ph10 510 MRRETURN(MATCH_NOMATCH);
2141 ph10 517 break;
2142 nigel 87
2143     case PT_GC:
2144 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2145 ph10 510 MRRETURN(MATCH_NOMATCH);
2146 nigel 87 break;
2147    
2148     case PT_PC:
2149 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2150 ph10 510 MRRETURN(MATCH_NOMATCH);
2151 nigel 87 break;
2152    
2153     case PT_SC:
2154 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2155 ph10 510 MRRETURN(MATCH_NOMATCH);
2156 nigel 87 break;
2157 ph10 527
2158 ph10 517 /* These are specials */
2159 ph10 527
2160 ph10 517 case PT_ALNUM:
2161     if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2162     _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2163     MRRETURN(MATCH_NOMATCH);
2164 ph10 527 break;
2165    
2166 ph10 517 case PT_SPACE: /* Perl space */
2167     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2168     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2169     == (op == OP_NOTPROP))
2170     MRRETURN(MATCH_NOMATCH);
2171 ph10 527 break;
2172    
2173 ph10 517 case PT_PXSPACE: /* POSIX space */
2174     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2175 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2176 ph10 517 c == CHAR_FF || c == CHAR_CR)
2177     == (op == OP_NOTPROP))
2178     MRRETURN(MATCH_NOMATCH);
2179 ph10 527 break;
2180 nigel 87
2181 ph10 527 case PT_WORD:
2182 ph10 517 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2183 ph10 527 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2184 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2185     MRRETURN(MATCH_NOMATCH);
2186 ph10 527 break;
2187    
2188 ph10 517 /* This should never occur */
2189    
2190 nigel 87 default:
2191     RRETURN(PCRE_ERROR_INTERNAL);
2192 nigel 77 }
2193 nigel 87
2194     ecode += 3;
2195 nigel 77 }
2196     break;
2197    
2198     /* Match an extended Unicode sequence. We will get here only if the support
2199     is in the binary; otherwise a compile-time error occurs. */
2200    
2201     case OP_EXTUNI:
2202 ph10 443 if (eptr >= md->end_subject)
2203 ph10 428 {
2204 ph10 443 SCHECK_PARTIAL();
2205 ph10 510 MRRETURN(MATCH_NOMATCH);
2206 ph10 443 }
2207 nigel 77 GETCHARINCTEST(c, eptr);
2208     {
2209 ph10 349 int category = UCD_CATEGORY(c);
2210 ph10 510 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2211 nigel 77 while (eptr < md->end_subject)
2212     {
2213     int len = 1;
2214     if (!utf8) c = *eptr; else
2215     {
2216     GETCHARLEN(c, eptr, len);
2217     }
2218 ph10 349 category = UCD_CATEGORY(c);
2219 nigel 77 if (category != ucp_M) break;
2220     eptr += len;
2221     }
2222     }
2223     ecode++;
2224     break;
2225     #endif
2226    
2227    
2228     /* Match a back reference, possibly repeatedly. Look past the end of the
2229     item to see if there is repeat information following. The code is similar
2230     to that for character classes, but repeated for efficiency. Then obey
2231     similar code to character type repeats - written out again for speed.
2232     However, if the referenced string is the empty string, always treat
2233     it as matched, any number of times (otherwise there could be infinite
2234     loops). */
2235    
2236     case OP_REF:
2237 ph10 602 case OP_REFI:
2238     caseless = op == OP_REFI;
2239 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2240     ecode += 3;
2241 ph10 345
2242 ph10 595 /* If the reference is unset, there are two possibilities:
2243 ph10 345
2244 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2245     this ensures that every attempt at a match fails. We can't just fail
2246     here, because of the possibility of quantifiers with zero minima.
2247 ph10 345
2248 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2249     so that the back reference matches an empty string.
2250 ph10 345
2251 ph10 595 Otherwise, set the length to the length of what was matched by the
2252     referenced subpattern. */
2253 ph10 345
2254 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2255     length = (md->jscript_compat)? 0 : -1;
2256     else
2257     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2258 nigel 77
2259 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2260 nigel 77
2261 ph10 595 switch (*ecode)
2262     {
2263     case OP_CRSTAR:
2264     case OP_CRMINSTAR:
2265     case OP_CRPLUS:
2266     case OP_CRMINPLUS:
2267     case OP_CRQUERY:
2268     case OP_CRMINQUERY:
2269     c = *ecode++ - OP_CRSTAR;
2270     minimize = (c & 1) != 0;
2271     min = rep_min[c]; /* Pick up values from tables; */
2272     max = rep_max[c]; /* zero for max => infinity */
2273     if (max == 0) max = INT_MAX;
2274     break;
2275 nigel 77
2276 ph10 595 case OP_CRRANGE:
2277     case OP_CRMINRANGE:
2278     minimize = (*ecode == OP_CRMINRANGE);
2279     min = GET2(ecode, 1);
2280     max = GET2(ecode, 3);
2281     if (max == 0) max = INT_MAX;
2282     ecode += 5;
2283     break;
2284 nigel 77
2285 ph10 595 default: /* No repeat follows */
2286 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2287 ph10 595 {
2288     CHECK_PARTIAL();
2289     MRRETURN(MATCH_NOMATCH);
2290 nigel 77 }
2291 ph10 595 eptr += length;
2292     continue; /* With the main loop */
2293     }
2294 nigel 77
2295 ph10 595 /* Handle repeated back references. If the length of the reference is
2296     zero, just continue with the main loop. */
2297 ph10 443
2298 ph10 595 if (length == 0) continue;
2299 nigel 77
2300 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2301     the length of the reference string explicitly rather than passing the
2302     address of eptr, so that eptr can be a register variable. */
2303 nigel 77
2304 ph10 595 for (i = 1; i <= min; i++)
2305     {
2306     int slength;
2307 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2308 nigel 77 {
2309 ph10 595 CHECK_PARTIAL();
2310     MRRETURN(MATCH_NOMATCH);
2311 nigel 77 }
2312 ph10 595 eptr += slength;
2313     }
2314 nigel 77
2315 ph10 595 /* If min = max, continue at the same level without recursion.
2316     They are not both allowed to be zero. */
2317 nigel 77
2318 ph10 595 if (min == max) continue;
2319 nigel 77
2320 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2321 nigel 77
2322 ph10 595 if (minimize)
2323     {
2324     for (fi = min;; fi++)
2325 nigel 77 {
2326 ph10 595 int slength;
2327 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM14);
2328 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2329     if (fi >= max) MRRETURN(MATCH_NOMATCH);
2330 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2331 nigel 77 {
2332 ph10 595 CHECK_PARTIAL();
2333     MRRETURN(MATCH_NOMATCH);
2334 nigel 77 }
2335 ph10 595 eptr += slength;
2336 nigel 77 }
2337 ph10 595 /* Control never gets here */
2338     }
2339 nigel 77
2340 ph10 595 /* If maximizing, find the longest string and work backwards */
2341 nigel 77
2342 ph10 595 else
2343     {
2344     pp = eptr;
2345     for (i = min; i < max; i++)
2346 nigel 77 {
2347 ph10 595 int slength;
2348 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2349 nigel 77 {
2350 ph10 595 CHECK_PARTIAL();
2351     break;
2352 nigel 77 }
2353 ph10 595 eptr += slength;
2354 nigel 77 }
2355 ph10 595 while (eptr >= pp)
2356     {
2357 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM15);
2358 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2359     eptr -= length;
2360     }
2361     MRRETURN(MATCH_NOMATCH);
2362 nigel 77 }
2363     /* Control never gets here */
2364    
2365     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2366     used when all the characters in the class have values in the range 0-255,
2367     and either the matching is caseful, or the characters are in the range
2368     0-127 when UTF-8 processing is enabled. The only difference between
2369     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2370     encountered.
2371    
2372     First, look past the end of the item to see if there is repeat information
2373     following. Then obey similar code to character type repeats - written out
2374     again for speed. */
2375    
2376     case OP_NCLASS:
2377     case OP_CLASS:
2378     {
2379     data = ecode + 1; /* Save for matching */
2380     ecode += 33; /* Advance past the item */
2381    
2382     switch (*ecode)
2383     {
2384     case OP_CRSTAR:
2385     case OP_CRMINSTAR:
2386     case OP_CRPLUS:
2387     case OP_CRMINPLUS:
2388     case OP_CRQUERY:
2389     case OP_CRMINQUERY:
2390     c = *ecode++ - OP_CRSTAR;
2391     minimize = (c & 1) != 0;
2392     min = rep_min[c]; /* Pick up values from tables; */
2393     max = rep_max[c]; /* zero for max => infinity */
2394     if (max == 0) max = INT_MAX;
2395     break;
2396    
2397     case OP_CRRANGE:
2398     case OP_CRMINRANGE:
2399     minimize = (*ecode == OP_CRMINRANGE);
2400     min = GET2(ecode, 1);
2401     max = GET2(ecode, 3);
2402     if (max == 0) max = INT_MAX;
2403     ecode += 5;
2404     break;
2405    
2406     default: /* No repeat follows */
2407     min = max = 1;
2408     break;
2409     }
2410    
2411     /* First, ensure the minimum number of matches are present. */
2412    
2413     #ifdef SUPPORT_UTF8
2414     /* UTF-8 mode */
2415     if (utf8)
2416     {
2417     for (i = 1; i <= min; i++)
2418     {
2419 ph10 427 if (eptr >= md->end_subject)
2420 ph10 426 {
2421 ph10 428 SCHECK_PARTIAL();
2422 ph10 510 MRRETURN(MATCH_NOMATCH);
2423 ph10 427 }
2424 nigel 77 GETCHARINC(c, eptr);
2425     if (c > 255)
2426     {
2427 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2428 nigel 77 }
2429     else
2430     {
2431 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2432 nigel 77 }
2433     }
2434     }
2435     else
2436     #endif
2437     /* Not UTF-8 mode */
2438     {
2439     for (i = 1; i <= min; i++)
2440     {
2441 ph10 427 if (eptr >= md->end_subject)
2442 ph10 426 {
2443 ph10 428 SCHECK_PARTIAL();
2444 ph10 510 MRRETURN(MATCH_NOMATCH);
2445 ph10 427 }
2446 nigel 77 c = *eptr++;
2447 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2448 nigel 77 }
2449     }
2450    
2451     /* If max == min we can continue with the main loop without the
2452     need to recurse. */
2453    
2454     if (min == max) continue;
2455    
2456     /* If minimizing, keep testing the rest of the expression and advancing
2457     the pointer while it matches the class. */
2458    
2459     if (minimize)
2460     {
2461     #ifdef SUPPORT_UTF8
2462     /* UTF-8 mode */
2463     if (utf8)
2464     {
2465     for (fi = min;; fi++)
2466     {
2467 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM16);
2468 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2469 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2470 ph10 427 if (eptr >= md->end_subject)
2471 ph10 426 {
2472 ph10 427 SCHECK_PARTIAL();
2473 ph10 510 MRRETURN(MATCH_NOMATCH);
2474 ph10 427 }
2475 nigel 77 GETCHARINC(c, eptr);
2476     if (c > 255)
2477     {
2478 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2479 nigel 77 }
2480     else
2481     {
2482 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2483 nigel 77 }
2484     }
2485     }
2486     else
2487     #endif
2488     /* Not UTF-8 mode */
2489     {
2490     for (fi = min;; fi++)
2491     {
2492 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM17);
2493 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2494 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2495 ph10 427 if (eptr >= md->end_subject)
2496 ph10 426 {
2497 ph10 427 SCHECK_PARTIAL();
2498 ph10 510 MRRETURN(MATCH_NOMATCH);
2499 ph10 427 }
2500 nigel 77 c = *eptr++;
2501 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2502 nigel 77 }
2503     }
2504     /* Control never gets here */
2505     }
2506    
2507     /* If maximizing, find the longest possible run, then work backwards. */
2508    
2509     else
2510     {
2511     pp = eptr;
2512    
2513     #ifdef SUPPORT_UTF8
2514     /* UTF-8 mode */
2515     if (utf8)
2516     {
2517     for (i = min; i < max; i++)
2518     {
2519     int len = 1;
2520 ph10 463 if (eptr >= md->end_subject)
2521 ph10 462 {
2522 ph10 463 SCHECK_PARTIAL();
2523 ph10 462 break;
2524 ph10 463 }
2525 nigel 77 GETCHARLEN(c, eptr, len);
2526     if (c > 255)
2527     {
2528     if (op == OP_CLASS) break;
2529     }
2530     else
2531     {
2532     if ((data[c/8] & (1 << (c&7))) == 0) break;
2533     }
2534     eptr += len;
2535     }
2536     for (;;)
2537     {
2538 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM18);
2539 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2540     if (eptr-- == pp) break; /* Stop if tried at original pos */
2541     BACKCHAR(eptr);
2542     }
2543     }
2544     else
2545     #endif
2546     /* Not UTF-8 mode */
2547     {
2548     for (i = min; i < max; i++)
2549     {
2550 ph10 463 if (eptr >= md->end_subject)
2551 ph10 462 {
2552 ph10 463 SCHECK_PARTIAL();
2553 ph10 462 break;
2554 ph10 463 }
2555 nigel 77 c = *eptr;
2556     if ((data[c/8] & (1 << (c&7))) == 0) break;
2557     eptr++;
2558     }
2559     while (eptr >= pp)
2560     {
2561 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM19);
2562 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2563 nigel 77 eptr--;
2564     }
2565     }
2566    
2567 ph10 510 MRRETURN(MATCH_NOMATCH);
2568 nigel 77 }
2569     }
2570     /* Control never gets here */
2571    
2572    
2573     /* Match an extended character class. This opcode is encountered only
2574 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2575     mode, because Unicode properties are supported in non-UTF-8 mode. */
2576 nigel 77
2577     #ifdef SUPPORT_UTF8
2578     case OP_XCLASS:
2579     {
2580     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2581     ecode += GET(ecode, 1); /* Advance past the item */
2582    
2583     switch (*ecode)
2584     {
2585     case OP_CRSTAR:
2586     case OP_CRMINSTAR:
2587     case OP_CRPLUS:
2588     case OP_CRMINPLUS:
2589     case OP_CRQUERY:
2590     case OP_CRMINQUERY:
2591     c = *ecode++ - OP_CRSTAR;
2592     minimize = (c & 1) != 0;
2593     min = rep_min[c]; /* Pick up values from tables; */
2594     max = rep_max[c]; /* zero for max => infinity */
2595     if (max == 0) max = INT_MAX;
2596     break;
2597    
2598     case OP_CRRANGE:
2599     case OP_CRMINRANGE:
2600     minimize = (*ecode == OP_CRMINRANGE);
2601     min = GET2(ecode, 1);
2602     max = GET2(ecode, 3);
2603     if (max == 0) max = INT_MAX;
2604     ecode += 5;
2605     break;
2606    
2607     default: /* No repeat follows */
2608     min = max = 1;
2609     break;
2610     }
2611    
2612     /* First, ensure the minimum number of matches are present. */
2613    
2614     for (i = 1; i <= min; i++)
2615     {
2616 ph10 427 if (eptr >= md->end_subject)
2617 ph10 426 {
2618     SCHECK_PARTIAL();
2619 ph10 510 MRRETURN(MATCH_NOMATCH);
2620 ph10 427 }
2621 ph10 384 GETCHARINCTEST(c, eptr);
2622 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2623 nigel 77 }
2624    
2625     /* If max == min we can continue with the main loop without the
2626     need to recurse. */
2627    
2628     if (min == max) continue;
2629    
2630     /* If minimizing, keep testing the rest of the expression and advancing
2631     the pointer while it matches the class. */
2632    
2633     if (minimize)
2634     {
2635     for (fi = min;; fi++)
2636     {
2637 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM20);
2638 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2639 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2640 ph10 427 if (eptr >= md->end_subject)
2641 ph10 426 {
2642 ph10 427 SCHECK_PARTIAL();
2643 ph10 510 MRRETURN(MATCH_NOMATCH);
2644 ph10 427 }
2645 ph10 384 GETCHARINCTEST(c, eptr);
2646 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2647 nigel 77 }
2648     /* Control never gets here */
2649     }
2650    
2651     /* If maximizing, find the longest possible run, then work backwards. */
2652    
2653     else
2654     {
2655     pp = eptr;
2656     for (i = min; i < max; i++)
2657     {
2658     int len = 1;
2659 ph10 463 if (eptr >= md->end_subject)
2660 ph10 462 {
2661 ph10 463 SCHECK_PARTIAL();
2662 ph10 462 break;
2663 ph10 463 }
2664 ph10 384 GETCHARLENTEST(c, eptr, len);
2665 nigel 77 if (!_pcre_xclass(c, data)) break;
2666     eptr += len;
2667     }
2668     for(;;)
2669     {
2670 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM21);
2671 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2672     if (eptr-- == pp) break; /* Stop if tried at original pos */
2673 ph10 214 if (utf8) BACKCHAR(eptr);
2674 nigel 77 }
2675 ph10 510 MRRETURN(MATCH_NOMATCH);
2676 nigel 77 }
2677    
2678     /* Control never gets here */
2679     }
2680     #endif /* End of XCLASS */
2681    
2682     /* Match a single character, casefully */
2683    
2684     case OP_CHAR:
2685     #ifdef SUPPORT_UTF8
2686     if (utf8)
2687     {
2688     length = 1;
2689     ecode++;
2690     GETCHARLEN(fc, ecode, length);
2691 ph10 443 if (length > md->end_subject - eptr)
2692 ph10 428 {
2693     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2694 ph10 510 MRRETURN(MATCH_NOMATCH);
2695 ph10 443 }
2696 ph10 510 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2697 nigel 77 }
2698     else
2699     #endif
2700    
2701     /* Non-UTF-8 mode */
2702     {
2703 ph10 443 if (md->end_subject - eptr < 1)
2704 ph10 428 {
2705     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2706 ph10 510 MRRETURN(MATCH_NOMATCH);
2707 ph10 443 }
2708 ph10 510 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2709 nigel 77 ecode += 2;
2710     }
2711     break;
2712    
2713     /* Match a single character, caselessly */
2714    
2715 ph10 602 case OP_CHARI:
2716 nigel 77 #ifdef SUPPORT_UTF8
2717     if (utf8)
2718     {
2719     length = 1;
2720     ecode++;
2721     GETCHARLEN(fc, ecode, length);
2722    
2723 ph10 443 if (length > md->end_subject - eptr)
2724 ph10 428 {
2725     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2726 ph10 510 MRRETURN(MATCH_NOMATCH);
2727 ph10 443 }
2728 nigel 77
2729     /* If the pattern character's value is < 128, we have only one byte, and
2730     can use the fast lookup table. */
2731    
2732     if (fc < 128)
2733     {
2734 ph10 510 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2735 nigel 77 }
2736    
2737     /* Otherwise we must pick up the subject character */
2738    
2739     else
2740     {
2741 nigel 93 unsigned int dc;
2742 nigel 77 GETCHARINC(dc, eptr);
2743     ecode += length;
2744    
2745     /* If we have Unicode property support, we can use it to test the other
2746 nigel 87 case of the character, if there is one. */
2747 nigel 77
2748     if (fc != dc)
2749     {
2750     #ifdef SUPPORT_UCP
2751 ph10 349 if (dc != UCD_OTHERCASE(fc))
2752 nigel 77 #endif
2753 ph10 510 MRRETURN(MATCH_NOMATCH);
2754 nigel 77 }
2755     }
2756     }
2757     else
2758     #endif /* SUPPORT_UTF8 */
2759    
2760     /* Non-UTF-8 mode */
2761     {
2762 ph10 443 if (md->end_subject - eptr < 1)
2763 ph10 428 {
2764 ph10 443 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2765 ph10 510 MRRETURN(MATCH_NOMATCH);
2766 ph10 443 }
2767 ph10 510 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2768 nigel 77 ecode += 2;
2769     }
2770     break;
2771    
2772 nigel 93 /* Match a single character repeatedly. */
2773 nigel 77
2774     case OP_EXACT:
2775 ph10 602 case OP_EXACTI:
2776 nigel 77 min = max = GET2(ecode, 1);
2777     ecode += 3;
2778     goto REPEATCHAR;
2779    
2780 nigel 93 case OP_POSUPTO:
2781 ph10 602 case OP_POSUPTOI:
2782 nigel 93 possessive = TRUE;
2783     /* Fall through */
2784    
2785 nigel 77 case OP_UPTO:
2786 ph10 602 case OP_UPTOI:
2787 nigel 77 case OP_MINUPTO:
2788 ph10 602 case OP_MINUPTOI:
2789 nigel 77 min = 0;
2790     max = GET2(ecode, 1);
2791 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2792 nigel 77 ecode += 3;
2793     goto REPEATCHAR;
2794    
2795 nigel 93 case OP_POSSTAR:
2796 ph10 602 case OP_POSSTARI:
2797 nigel 93 possessive = TRUE;
2798     min = 0;
2799     max = INT_MAX;
2800     ecode++;
2801     goto REPEATCHAR;
2802    
2803     case OP_POSPLUS:
2804 ph10 602 case OP_POSPLUSI:
2805 nigel 93 possessive = TRUE;
2806     min = 1;
2807     max = INT_MAX;
2808     ecode++;
2809     goto REPEATCHAR;
2810    
2811     case OP_POSQUERY:
2812 ph10 602 case OP_POSQUERYI:
2813 nigel 93 possessive = TRUE;
2814     min = 0;
2815     max = 1;
2816     ecode++;
2817     goto REPEATCHAR;
2818    
2819 nigel 77 case OP_STAR:
2820 ph10 602 case OP_STARI:
2821 nigel 77 case OP_MINSTAR:
2822 ph10 602 case OP_MINSTARI:
2823 nigel 77 case OP_PLUS:
2824 ph10 602 case OP_PLUSI:
2825 nigel 77 case OP_MINPLUS:
2826 ph10 602 case OP_MINPLUSI:
2827 nigel 77 case OP_QUERY:
2828 ph10 602 case OP_QUERYI:
2829 nigel 77 case OP_MINQUERY:
2830 ph10 602 case OP_MINQUERYI:
2831     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
2832 nigel 77 minimize = (c & 1) != 0;
2833     min = rep_min[c]; /* Pick up values from tables; */
2834     max = rep_max[c]; /* zero for max => infinity */
2835     if (max == 0) max = INT_MAX;
2836    
2837 ph10 426 /* Common code for all repeated single-character matches. */
2838 nigel 77
2839     REPEATCHAR:
2840     #ifdef SUPPORT_UTF8
2841     if (utf8)
2842     {
2843     length = 1;
2844     charptr = ecode;
2845     GETCHARLEN(fc, ecode, length);
2846     ecode += length;
2847    
2848     /* Handle multibyte character matching specially here. There is
2849     support for caseless matching if UCP support is present. */
2850    
2851     if (length > 1)
2852     {
2853     #ifdef SUPPORT_UCP
2854 nigel 93 unsigned int othercase;
2855 ph10 602 if (op >= OP_STARI && /* Caseless */
2856 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
2857 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2858 ph10 115 else oclength = 0;
2859 nigel 77 #endif /* SUPPORT_UCP */
2860    
2861     for (i = 1; i <= min; i++)
2862     {
2863 ph10 426 if (eptr <= md->end_subject - length &&
2864     memcmp(eptr, charptr, length) == 0) eptr += length;
2865 ph10 123 #ifdef SUPPORT_UCP
2866 ph10 426 else if (oclength > 0 &&
2867     eptr <= md->end_subject - oclength &&
2868     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2869     #endif /* SUPPORT_UCP */
2870 nigel 77 else
2871     {
2872 ph10 426 CHECK_PARTIAL();
2873 ph10 510 MRRETURN(MATCH_NOMATCH);
2874 nigel 77 }
2875     }
2876    
2877     if (min == max) continue;
2878    
2879     if (minimize)
2880     {
2881     for (fi = min;; fi++)
2882     {
2883 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM22);
2884 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2885 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2886 ph10 426 if (eptr <= md->end_subject - length &&
2887     memcmp(eptr, charptr, length) == 0) eptr += length;
2888 ph10 123 #ifdef SUPPORT_UCP
2889 ph10 426 else if (oclength > 0 &&
2890     eptr <= md->end_subject - oclength &&
2891     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2892     #endif /* SUPPORT_UCP */
2893 nigel 77 else
2894     {
2895 ph10 426 CHECK_PARTIAL();
2896 ph10 510 MRRETURN(MATCH_NOMATCH);
2897 nigel 77 }
2898     }
2899     /* Control never gets here */
2900     }
2901 nigel 93
2902     else /* Maximize */
2903 nigel 77 {
2904     pp = eptr;
2905     for (i = min; i < max; i++)
2906     {
2907 ph10 426 if (eptr <= md->end_subject - length &&
2908     memcmp(eptr, charptr, length) == 0) eptr += length;
2909 ph10 123 #ifdef SUPPORT_UCP
2910 ph10 426 else if (oclength > 0 &&
2911     eptr <= md->end_subject - oclength &&
2912     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2913     #endif /* SUPPORT_UCP */
2914 ph10 463 else
2915 ph10 462 {
2916 ph10 463 CHECK_PARTIAL();
2917 ph10 462 break;
2918 ph10 463 }
2919 nigel 77 }
2920 nigel 93
2921     if (possessive) continue;
2922 ph10 427
2923 ph10 120 for(;;)
2924 ph10 426 {
2925 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM23);
2926 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2927 ph10 510 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
2928 ph10 115 #ifdef SUPPORT_UCP
2929 ph10 426 eptr--;
2930     BACKCHAR(eptr);
2931 ph10 123 #else /* without SUPPORT_UCP */
2932 ph10 426 eptr -= length;
2933 ph10 123 #endif /* SUPPORT_UCP */
2934 ph10 426 }
2935 nigel 77 }
2936     /* Control never gets here */
2937     }
2938    
2939     /* If the length of a UTF-8 character is 1, we fall through here, and
2940     obey the code as for non-UTF-8 characters below, though in this case the
2941     value of fc will always be < 128. */
2942     }
2943     else
2944     #endif /* SUPPORT_UTF8 */
2945    
2946     /* When not in UTF-8 mode, load a single-byte character. */
2947    
2948 ph10 426 fc = *ecode++;
2949 ph10 443
2950 nigel 77 /* The value of fc at this point is always less than 256, though we may or
2951     may not be in UTF-8 mode. The code is duplicated for the caseless and
2952     caseful cases, for speed, since matching characters is likely to be quite
2953     common. First, ensure the minimum number of matches are present. If min =
2954     max, continue at the same level without recursing. Otherwise, if
2955     minimizing, keep trying the rest of the expression and advancing one
2956     matching character if failing, up to the maximum. Alternatively, if
2957     maximizing, find the maximum number of characters and work backwards. */
2958    
2959     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2960     max, eptr));
2961    
2962 ph10 602 if (op >= OP_STARI) /* Caseless */
2963 nigel 77 {
2964     fc = md->lcc[fc];
2965     for (i = 1; i <= min; i++)
2966 ph10 426 {
2967     if (eptr >= md->end_subject)
2968     {
2969     SCHECK_PARTIAL();
2970 ph10 510 MRRETURN(MATCH_NOMATCH);
2971 ph10 426 }
2972 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2973 ph10 426 }
2974 nigel 77 if (min == max) continue;
2975     if (minimize)
2976     {
2977     for (fi = min;; fi++)
2978     {
2979 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM24);
2980 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2981 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2982 ph10 426 if (eptr >= md->end_subject)
2983     {
2984 ph10 427 SCHECK_PARTIAL();
2985 ph10 510 MRRETURN(MATCH_NOMATCH);
2986 ph10 426 }
2987 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2988 nigel 77 }
2989     /* Control never gets here */
2990     }
2991 nigel 93 else /* Maximize */
2992 nigel 77 {
2993     pp = eptr;
2994     for (i = min; i < max; i++)
2995     {
2996 ph10 463 if (eptr >= md->end_subject)
2997 ph10 462 {
2998     SCHECK_PARTIAL();
2999     break;
3000 ph10 463 }
3001 ph10 462 if (fc != md->lcc[*eptr]) break;
3002 nigel 77 eptr++;
3003     }
3004 ph10 427
3005 nigel 93 if (possessive) continue;
3006 ph10 427
3007 nigel 77 while (eptr >= pp)
3008     {
3009 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM25);
3010 nigel 77 eptr--;
3011     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3012     }
3013 ph10 510 MRRETURN(MATCH_NOMATCH);
3014 nigel 77 }
3015     /* Control never gets here */
3016     }
3017    
3018     /* Caseful comparisons (includes all multi-byte characters) */
3019    
3020     else
3021     {
3022 ph10 427 for (i = 1; i <= min; i++)
3023 ph10 426 {
3024     if (eptr >= md->end_subject)
3025     {
3026     SCHECK_PARTIAL();
3027 ph10 510 MRRETURN(MATCH_NOMATCH);
3028 ph10 426 }
3029 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3030 ph10 427 }
3031 ph10 443
3032 nigel 77 if (min == max) continue;
3033 ph10 443
3034 nigel 77 if (minimize)
3035     {
3036     for (fi = min;; fi++)
3037     {
3038 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM26);
3039 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3040 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3041 ph10 426 if (eptr >= md->end_subject)
3042 ph10 427 {
3043 ph10 426 SCHECK_PARTIAL();
3044 ph10 510 MRRETURN(MATCH_NOMATCH);
3045 ph10 427 }
3046 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3047 nigel 77 }
3048     /* Control never gets here */
3049     }
3050 nigel 93 else /* Maximize */
3051 nigel 77 {
3052     pp = eptr;
3053     for (i = min; i < max; i++)
3054     {
3055 ph10 463 if (eptr >= md->end_subject)
3056 ph10 462 {
3057 ph10 463 SCHECK_PARTIAL();
3058 ph10 462 break;
3059 ph10 463 }
3060 ph10 462 if (fc != *eptr) break;
3061 nigel 77 eptr++;
3062     }
3063 nigel 93 if (possessive) continue;
3064 ph10 443
3065 nigel 77 while (eptr >= pp)
3066     {
3067 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM27);
3068 nigel 77 eptr--;
3069     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3070     }
3071 ph10 510 MRRETURN(MATCH_NOMATCH);
3072 nigel 77 }
3073     }
3074     /* Control never gets here */
3075    
3076     /* Match a negated single one-byte character. The character we are
3077     checking can be multibyte. */
3078    
3079     case OP_NOT:
3080 ph10 602 case OP_NOTI:
3081 ph10 443 if (eptr >= md->end_subject)
3082 ph10 428 {
3083 ph10 443 SCHECK_PARTIAL();
3084 ph10 510 MRRETURN(MATCH_NOMATCH);
3085 ph10 443 }
3086 nigel 77 ecode++;
3087     GETCHARINCTEST(c, eptr);
3088 ph10 602 if (op == OP_NOTI) /* The caseless case */
3089 nigel 77 {
3090     #ifdef SUPPORT_UTF8
3091     if (c < 256)
3092     #endif
3093     c = md->lcc[c];
3094 ph10 510 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3095 nigel 77 }
3096 ph10 602 else /* Caseful */
3097 nigel 77 {
3098 ph10 510 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3099 nigel 77 }
3100     break;
3101    
3102     /* Match a negated single one-byte character repeatedly. This is almost a
3103     repeat of the code for a repeated single character, but I haven't found a
3104     nice way of commoning these up that doesn't require a test of the
3105     positive/negative option for each character match. Maybe that wouldn't add
3106     very much to the time taken, but character matching *is* what this is all
3107     about... */
3108    
3109     case OP_NOTEXACT:
3110 ph10 602 case OP_NOTEXACTI:
3111 nigel 77 min = max = GET2(ecode, 1);
3112     ecode += 3;
3113     goto REPEATNOTCHAR;
3114    
3115     case OP_NOTUPTO:
3116 ph10 602 case OP_NOTUPTOI:
3117 nigel 77 case OP_NOTMINUPTO:
3118 ph10 602 case OP_NOTMINUPTOI:
3119 nigel 77 min = 0;
3120     max = GET2(ecode, 1);
3121 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3122 nigel 77 ecode += 3;
3123     goto REPEATNOTCHAR;
3124    
3125 nigel 93 case OP_NOTPOSSTAR:
3126 ph10 602 case OP_NOTPOSSTARI:
3127 nigel 93 possessive = TRUE;
3128     min = 0;
3129     max = INT_MAX;
3130     ecode++;
3131     goto REPEATNOTCHAR;
3132    
3133     case OP_NOTPOSPLUS:
3134 ph10 602 case OP_NOTPOSPLUSI:
3135 nigel 93 possessive = TRUE;
3136     min = 1;
3137     max = INT_MAX;
3138     ecode++;
3139     goto REPEATNOTCHAR;
3140    
3141     case OP_NOTPOSQUERY:
3142 ph10 602 case OP_NOTPOSQUERYI:
3143 nigel 93 possessive = TRUE;
3144     min = 0;
3145     max = 1;
3146     ecode++;
3147     goto REPEATNOTCHAR;
3148    
3149     case OP_NOTPOSUPTO:
3150 ph10 602 case OP_NOTPOSUPTOI:
3151 nigel 93 possessive = TRUE;
3152     min = 0;
3153     max = GET2(ecode, 1);
3154     ecode += 3;
3155     goto REPEATNOTCHAR;
3156    
3157 nigel 77 case OP_NOTSTAR:
3158 ph10 602 case OP_NOTSTARI:
3159 nigel 77 case OP_NOTMINSTAR:
3160 ph10 602 case OP_NOTMINSTARI:
3161 nigel 77 case OP_NOTPLUS:
3162 ph10 602 case OP_NOTPLUSI:
3163 nigel 77 case OP_NOTMINPLUS:
3164 ph10 602 case OP_NOTMINPLUSI:
3165 nigel 77 case OP_NOTQUERY:
3166 ph10 602 case OP_NOTQUERYI:
3167 nigel 77 case OP_NOTMINQUERY:
3168 ph10 602 case OP_NOTMINQUERYI:
3169     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3170 nigel 77 minimize = (c & 1) != 0;
3171     min = rep_min[c]; /* Pick up values from tables; */
3172     max = rep_max[c]; /* zero for max => infinity */
3173     if (max == 0) max = INT_MAX;
3174    
3175 ph10 426 /* Common code for all repeated single-byte matches. */
3176 nigel 77
3177     REPEATNOTCHAR:
3178     fc = *ecode++;
3179    
3180     /* The code is duplicated for the caseless and caseful cases, for speed,
3181     since matching characters is likely to be quite common. First, ensure the
3182     minimum number of matches are present. If min = max, continue at the same
3183     level without recursing. Otherwise, if minimizing, keep trying the rest of
3184     the expression and advancing one matching character if failing, up to the
3185     maximum. Alternatively, if maximizing, find the maximum number of
3186     characters and work backwards. */
3187    
3188     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3189     max, eptr));
3190    
3191 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3192 nigel 77 {
3193     fc = md->lcc[fc];
3194    
3195     #ifdef SUPPORT_UTF8
3196     /* UTF-8 mode */
3197     if (utf8)
3198     {
3199 nigel 93 register unsigned int d;
3200 nigel 77 for (i = 1; i <= min; i++)
3201     {
3202 ph10 426 if (eptr >= md->end_subject)
3203     {
3204     SCHECK_PARTIAL();
3205 ph10 510 MRRETURN(MATCH_NOMATCH);
3206 ph10 427 }
3207 nigel 77 GETCHARINC(d, eptr);
3208     if (d < 256) d = md->lcc[d];
3209 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3210 nigel 77 }
3211     }
3212     else
3213     #endif
3214    
3215     /* Not UTF-8 mode */
3216     {
3217     for (i = 1; i <= min; i++)
3218 ph10 426 {
3219     if (eptr >= md->end_subject)
3220     {
3221     SCHECK_PARTIAL();
3222 ph10 510 MRRETURN(MATCH_NOMATCH);
3223 ph10 427 }
3224 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3225 ph10 427 }
3226 nigel 77 }
3227    
3228     if (min == max) continue;
3229    
3230     if (minimize)
3231     {
3232     #ifdef SUPPORT_UTF8
3233     /* UTF-8 mode */
3234     if (utf8)
3235     {
3236 nigel 93 register unsigned int d;
3237 nigel 77 for (fi = min;; fi++)
3238     {
3239 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM28);
3240 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3241 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3242 ph10 427 if (eptr >= md->end_subject)
3243 ph10 426 {
3244 ph10 427 SCHECK_PARTIAL();
3245 ph10 510 MRRETURN(MATCH_NOMATCH);
3246 ph10 427 }
3247 nigel 77 GETCHARINC(d, eptr);
3248     if (d < 256) d = md->lcc[d];
3249 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3250 nigel 77 }
3251     }
3252     else
3253     #endif
3254     /* Not UTF-8 mode */
3255     {
3256     for (fi = min;; fi++)
3257     {
3258 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM29);
3259 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3260 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3261 ph10 426 if (eptr >= md->end_subject)
3262     {
3263     SCHECK_PARTIAL();
3264 ph10 510 MRRETURN(MATCH_NOMATCH);
3265 ph10 426 }
3266 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3267 nigel 77 }
3268     }
3269     /* Control never gets here */
3270     }
3271    
3272     /* Maximize case */
3273    
3274     else
3275     {
3276     pp = eptr;
3277    
3278     #ifdef SUPPORT_UTF8
3279     /* UTF-8 mode */
3280     if (utf8)
3281     {
3282 nigel 93 register unsigned int d;
3283 nigel 77 for (i = min; i < max; i++)
3284     {
3285     int len = 1;
3286 ph10 463 if (eptr >= md->end_subject)
3287 ph10 462 {
3288 ph10 463 SCHECK_PARTIAL();
3289 ph10 462 break;
3290 ph10 463 }
3291 nigel 77 GETCHARLEN(d, eptr, len);
3292     if (d < 256) d = md->lcc[d];
3293     if (fc == d) break;
3294     eptr += len;
3295     }
3296 nigel 93 if (possessive) continue;
3297     for(;;)
3298 nigel 77 {
3299 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM30);
3300 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3301     if (eptr-- == pp) break; /* Stop if tried at original pos */
3302     BACKCHAR(eptr);
3303     }
3304     }
3305     else
3306     #endif
3307     /* Not UTF-8 mode */
3308     {
3309     for (i = min; i < max; i++)
3310     {
3311 ph10 463 if (eptr >= md->end_subject)
3312 ph10 462 {
3313     SCHECK_PARTIAL();
3314     break;
3315 ph10 463 }
3316 ph10 462 if (fc == md->lcc[*eptr]) break;
3317 nigel 77 eptr++;
3318     }
3319 nigel 93 if (possessive) continue;
3320 nigel 77 while (eptr >= pp)
3321     {
3322 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM31);
3323 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3324     eptr--;
3325     }
3326     }
3327    
3328 ph10 510 MRRETURN(MATCH_NOMATCH);
3329 nigel 77 }
3330     /* Control never gets here */
3331     }
3332    
3333     /* Caseful comparisons */
3334    
3335     else
3336     {
3337     #ifdef SUPPORT_UTF8
3338     /* UTF-8 mode */
3339     if (utf8)
3340     {
3341 nigel 93 register unsigned int d;
3342 nigel 77 for (i = 1; i <= min; i++)
3343     {
3344 ph10 426 if (eptr >= md->end_subject)
3345     {
3346     SCHECK_PARTIAL();
3347 ph10 510 MRRETURN(MATCH_NOMATCH);
3348 ph10 427 }
3349 nigel 77 GETCHARINC(d, eptr);
3350 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3351 nigel 77 }
3352     }
3353     else
3354     #endif
3355     /* Not UTF-8 mode */
3356     {
3357     for (i = 1; i <= min; i++)
3358 ph10 426 {
3359     if (eptr >= md->end_subject)
3360     {
3361     SCHECK_PARTIAL();
3362 ph10 510 MRRETURN(MATCH_NOMATCH);
3363 ph10 427 }
3364 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3365 ph10 427 }
3366 nigel 77 }
3367    
3368     if (min == max) continue;
3369    
3370     if (minimize)
3371     {
3372     #ifdef SUPPORT_UTF8
3373     /* UTF-8 mode */
3374     if (utf8)
3375     {
3376 nigel 93 register unsigned int d;
3377 nigel 77 for (fi = min;; fi++)
3378     {
3379 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM32);
3380 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3381 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3382 ph10 427 if (eptr >= md->end_subject)
3383 ph10 426 {
3384 ph10 427 SCHECK_PARTIAL();
3385 ph10 510 MRRETURN(MATCH_NOMATCH);
3386 ph10 427 }
3387 nigel 77 GETCHARINC(d, eptr);
3388 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3389 nigel 77 }
3390     }
3391     else
3392     #endif
3393     /* Not UTF-8 mode */
3394     {
3395     for (fi = min;; fi++)
3396     {
3397 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM33);
3398 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3399 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3400 ph10 426 if (eptr >= md->end_subject)
3401     {
3402     SCHECK_PARTIAL();
3403 ph10 510 MRRETURN(MATCH_NOMATCH);
3404 ph10 427 }
3405 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3406 nigel 77 }
3407     }
3408     /* Control never gets here */
3409     }
3410    
3411     /* Maximize case */
3412    
3413     else
3414     {
3415     pp = eptr;
3416    
3417     #ifdef SUPPORT_UTF8
3418     /* UTF-8 mode */
3419     if (utf8)
3420     {
3421 nigel 93 register unsigned int d;
3422 nigel 77 for (i = min; i < max; i++)
3423     {
3424     int len = 1;
3425 ph10 463 if (eptr >= md->end_subject)
3426 ph10 462 {
3427 ph10 463 SCHECK_PARTIAL();
3428 ph10 462 break;
3429 ph10 463 }
3430 nigel 77 GETCHARLEN(d, eptr, len);
3431     if (fc == d) break;
3432     eptr += len;
3433     }
3434 nigel 93 if (possessive) continue;
3435 nigel 77 for(;;)
3436     {
3437 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM34);
3438 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3439     if (eptr-- == pp) break; /* Stop if tried at original pos */
3440     BACKCHAR(eptr);
3441     }
3442     }
3443     else
3444     #endif
3445     /* Not UTF-8 mode */
3446     {
3447     for (i = min; i < max; i++)
3448     {
3449 ph10 463 if (eptr >= md->end_subject)
3450 ph10 462 {
3451 ph10 463 SCHECK_PARTIAL();
3452 ph10 462 break;
3453 ph10 463 }
3454 ph10 462 if (fc == *eptr) break;
3455 nigel 77 eptr++;
3456     }
3457 nigel 93 if (possessive) continue;
3458 nigel 77 while (eptr >= pp)
3459     {
3460 ph10 602 RMATCH(eptr, ecode, offset_top, md, eptrb, 0, RM35);
3461 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3462     eptr--;
3463     }
3464     }
3465    
3466 ph10 510 MRRETURN(MATCH_NOMATCH);
3467 nigel 77 }
3468     }
3469     /* Control never gets here */
3470    
3471     /* Match a single character type repeatedly; several different opcodes
3472     share code. This is very similar to the code for single characters, but we
3473     repeat it in the interests of efficiency. */
3474    
3475     case OP_TYPEEXACT:
3476     min = max = GET2(ecode, 1);
3477     minimize = TRUE;
3478     ecode += 3;
3479     goto REPEATTYPE;
3480    
3481     case OP_TYPEUPTO:
3482     case OP_TYPEMINUPTO:
3483     min = 0;
3484     max = GET2(ecode, 1);
3485     minimize = *ecode == OP_TYPEMINUPTO;
3486     ecode += 3;
3487     goto REPEATTYPE;
3488    
3489 nigel 93 case OP_TYPEPOSSTAR:
3490     possessive = TRUE;
3491     min = 0;
3492     max = INT_MAX;
3493     ecode++;
3494     goto REPEATTYPE;
3495    
3496     case OP_TYPEPOSPLUS:
3497     possessive = TRUE;
3498     min = 1;
3499     max = INT_MAX;
3500     ecode++;
3501     goto REPEATTYPE;
3502    
3503     case OP_TYPEPOSQUERY:
3504     possessive = TRUE;
3505     min = 0;
3506     max = 1;
3507     ecode++;
3508     goto REPEATTYPE;
3509    
3510     case OP_TYPEPOSUPTO:
3511     possessive = TRUE;
3512     min = 0;
3513     max = GET2(ecode, 1);
3514     ecode += 3;
3515     goto REPEATTYPE;
3516    
3517 nigel 77 case OP_TYPESTAR:
3518     case OP_TYPEMINSTAR:
3519     case OP_TYPEPLUS:
3520     case OP_TYPEMINPLUS:
3521     case OP_TYPEQUERY:
3522     case OP_TYPEMINQUERY:
3523     c = *ecode++ - OP_TYPESTAR;
3524     minimize = (c & 1) != 0;
3525     min = rep_min[c]; /* Pick up values from tables; */
3526     max = rep_max[c]; /* zero for max => infinity */
3527     if (max == 0) max = INT_MAX;
3528    
3529     /* Common code for all repeated single character type matches. Note that
3530     in UTF-8 mode, '.' matches a character of any length, but for the other
3531     character types, the valid characters are all one-byte long. */
3532    
3533     REPEATTYPE:
3534     ctype = *ecode++; /* Code for the character type */
3535    
3536     #ifdef SUPPORT_UCP
3537     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3538     {
3539     prop_fail_result = ctype == OP_NOTPROP;
3540     prop_type = *ecode++;
3541 nigel 87 prop_value = *ecode++;
3542 nigel 77 }
3543     else prop_type = -1;
3544     #endif
3545    
3546     /* First, ensure the minimum number of matches are present. Use inline
3547     code for maximizing the speed, and do the type test once at the start
3548 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3549 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3550     and single-bytes. */
3551    
3552     if (min > 0)
3553     {
3554     #ifdef SUPPORT_UCP
3555 nigel 87 if (prop_type >= 0)
3556 nigel 77 {
3557 nigel 87 switch(prop_type)
3558 nigel 77 {
3559 nigel 87 case PT_ANY:
3560 ph10 510 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3561 nigel 87 for (i = 1; i <= min; i++)
3562     {
3563 ph10 427 if (eptr >= md->end_subject)
3564 ph10 426 {
3565 ph10 427 SCHECK_PARTIAL();
3566 ph10 510 MRRETURN(MATCH_NOMATCH);
3567 ph10 427 }
3568 ph10 184 GETCHARINCTEST(c, eptr);
3569 nigel 87 }
3570     break;
3571    
3572     case PT_LAMP:
3573     for (i = 1; i <= min; i++)
3574     {
3575 ph10 427 if (eptr >= md->end_subject)
3576 ph10 426 {
3577 ph10 427 SCHECK_PARTIAL();
3578 ph10 510 MRRETURN(MATCH_NOMATCH);
3579 ph10 427 }
3580 ph10 184 GETCHARINCTEST(c, eptr);
3581 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3582 nigel 87 if ((prop_chartype == ucp_Lu ||
3583     prop_chartype == ucp_Ll ||
3584     prop_chartype == ucp_Lt) == prop_fail_result)
3585 ph10 510 MRRETURN(MATCH_NOMATCH);
3586 nigel 87 }
3587     break;
3588    
3589     case PT_GC:
3590     for (i = 1; i <= min; i++)
3591     {
3592 ph10 427 if (eptr >= md->end_subject)
3593 ph10 426 {
3594 ph10 427 SCHECK_PARTIAL();
3595 ph10 510 MRRETURN(MATCH_NOMATCH);
3596 ph10 427 }
3597 ph10 184 GETCHARINCTEST(c, eptr);
3598 ph10 349 prop_category = UCD_CATEGORY(c);
3599 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3600 ph10 510 MRRETURN(MATCH_NOMATCH);
3601 nigel 87 }
3602     break;
3603    
3604     case PT_PC:
3605     for (i = 1; i <= min; i++)
3606     {
3607 ph10 427 if (eptr >= md->end_subject)
3608 ph10 426 {
3609 ph10 427 SCHECK_PARTIAL();
3610 ph10 510 MRRETURN(MATCH_NOMATCH);
3611 ph10 427 }
3612 ph10 184 GETCHARINCTEST(c, eptr);
3613 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3614 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3615 ph10 510 MRRETURN(MATCH_NOMATCH);
3616 nigel 87 }
3617     break;
3618    
3619     case PT_SC:
3620     for (i = 1; i <= min; i++)
3621     {
3622 ph10 427 if (eptr >= md->end_subject)
3623 ph10 426 {
3624 ph10 427 SCHECK_PARTIAL();
3625 ph10 510 MRRETURN(MATCH_NOMATCH);
3626 ph10 427 }
3627 ph10 184 GETCHARINCTEST(c, eptr);
3628 ph10 349 prop_script = UCD_SCRIPT(c);
3629 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3630 ph10 510 MRRETURN(MATCH_NOMATCH);
3631 nigel 87 }
3632     break;
3633 ph10 527
3634 ph10 517 case PT_ALNUM:
3635     for (i = 1; i <= min; i++)
3636     {
3637     if (eptr >= md->end_subject)
3638     {
3639     SCHECK_PARTIAL();
3640     MRRETURN(MATCH_NOMATCH);
3641     }
3642     GETCHARINCTEST(c, eptr);
3643 ph10 527 prop_category = UCD_CATEGORY(c);
3644     if ((prop_category == ucp_L || prop_category == ucp_N)
3645 ph10 517 == prop_fail_result)
3646     MRRETURN(MATCH_NOMATCH);
3647     }
3648     break;
3649 ph10 527
3650 ph10 517 case PT_SPACE: /* Perl space */
3651     for (i = 1; i <= min; i++)
3652     {
3653     if (eptr >= md->end_subject)
3654     {
3655     SCHECK_PARTIAL();
3656     MRRETURN(MATCH_NOMATCH);
3657     }
3658     GETCHARINCTEST(c, eptr);
3659 ph10 527 prop_category = UCD_CATEGORY(c);
3660     if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3661     c == CHAR_FF || c == CHAR_CR)
3662 ph10 517 == prop_fail_result)
3663     MRRETURN(MATCH_NOMATCH);
3664     }
3665     break;
3666 ph10 527
3667 ph10 517 case PT_PXSPACE: /* POSIX space */
3668     for (i = 1; i <= min; i++)
3669     {
3670     if (eptr >= md->end_subject)
3671     {
3672     SCHECK_PARTIAL();
3673     MRRETURN(MATCH_NOMATCH);
3674     }
3675     GETCHARINCTEST(c, eptr);
3676 ph10 527 prop_category = UCD_CATEGORY(c);
3677     if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3678     c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3679 ph10 517 == prop_fail_result)
3680     MRRETURN(MATCH_NOMATCH);
3681     }
3682     break;
3683 ph10 527
3684     case PT_WORD:
3685 ph10 517 for (i = 1; i <= min; i++)
3686     {
3687     if (eptr >= md->end_subject)
3688     {
3689     SCHECK_PARTIAL();
3690     MRRETURN(MATCH_NOMATCH);
3691     }
3692     GETCHARINCTEST(c, eptr);
3693 ph10 527 prop_category = UCD_CATEGORY(c);
3694 ph10 517 if ((prop_category == ucp_L || prop_category == ucp_N ||
3695 ph10 527 c == CHAR_UNDERSCORE)
3696 ph10 517 == prop_fail_result)
3697     MRRETURN(MATCH_NOMATCH);
3698     }
3699     break;
3700 ph10 527
3701 ph10 517 /* This should not occur */
3702 nigel 87
3703     default:
3704     RRETURN(PCRE_ERROR_INTERNAL);
3705 nigel 77 }
3706     }
3707    
3708     /* Match extended Unicode sequences. We will get here only if the
3709     support is in the binary; otherwise a compile-time error occurs. */
3710    
3711     else if (ctype == OP_EXTUNI)
3712     {
3713     for (i = 1; i <= min; i++)
3714     {
3715 ph10 427 if (eptr >= md->end_subject)
3716 ph10 426 {
3717 ph10 427 SCHECK_PARTIAL();
3718 ph10 510 MRRETURN(MATCH_NOMATCH);
3719 ph10 427 }
3720 nigel 77 GETCHARINCTEST(c, eptr);
3721 ph10 349 prop_category = UCD_CATEGORY(c);
3722 ph10 510 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3723 nigel 77 while (eptr < md->end_subject)
3724     {
3725     int len = 1;
3726 ph10 426 if (!utf8) c = *eptr;
3727     else { GETCHARLEN(c, eptr, len); }
3728 ph10 349 prop_category = UCD_CATEGORY(c);
3729 nigel 77 if (prop_category != ucp_M) break;
3730     eptr += len;
3731     }
3732     }
3733     }
3734    
3735     else
3736     #endif /* SUPPORT_UCP */
3737    
3738     /* Handle all other cases when the coding is UTF-8 */
3739    
3740     #ifdef SUPPORT_UTF8
3741     if (utf8) switch(ctype)
3742     {
3743     case OP_ANY:
3744     for (i = 1; i <= min; i++)
3745     {
3746 ph10 426 if (eptr >= md->end_subject)
3747     {
3748 ph10 427 SCHECK_PARTIAL();
3749 ph10 510 MRRETURN(MATCH_NOMATCH);
3750 ph10 427 }
3751 ph10 510 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3752 nigel 91 eptr++;
3753 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3754     }
3755     break;
3756    
3757 ph10 341 case OP_ALLANY:
3758     for (i = 1; i <= min; i++)
3759     {
3760 ph10 427 if (eptr >= md->end_subject)
3761 ph10 426 {
3762     SCHECK_PARTIAL();
3763 ph10 510 MRRETURN(MATCH_NOMATCH);
3764 ph10 427 }
3765 ph10 341 eptr++;
3766     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3767     }
3768     break;
3769    
3770 nigel 77 case OP_ANYBYTE:
3771 ph10 510 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3772 nigel 77 eptr += min;
3773     break;
3774    
3775 nigel 93 case OP_ANYNL:
3776     for (i = 1; i <= min; i++)
3777     {
3778 ph10 427 if (eptr >= md->end_subject)
3779 ph10 426 {
3780     SCHECK_PARTIAL();
3781 ph10 510 MRRETURN(MATCH_NOMATCH);
3782 ph10 427 }
3783 nigel 93 GETCHARINC(c, eptr);
3784     switch(c)
3785     {
3786 ph10 510 default: MRRETURN(MATCH_NOMATCH);
3787 ph10 600
3788 nigel 93 case 0x000d:
3789     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3790     break;
3791 ph10 231
3792 nigel 93 case 0x000a:
3793 ph10 231 break;
3794    
3795 nigel 93 case 0x000b:
3796     case 0x000c:
3797     case 0x0085:
3798     case 0x2028:
3799     case 0x2029:
3800 ph10 510 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3801 nigel 93 break;
3802     }
3803     }
3804     break;
3805    
3806 ph10 178 case OP_NOT_HSPACE:
3807     for (i = 1; i <= min; i++)
3808     {
3809 ph10 427 if (eptr >= md->end_subject)
3810 ph10 426 {
3811     SCHECK_PARTIAL();
3812 ph10 510 MRRETURN(MATCH_NOMATCH);
3813 ph10 427 }
3814 ph10 178 GETCHARINC(c, eptr);
3815     switch(c)
3816     {
3817     default: break;
3818     case 0x09: /* HT */
3819     case 0x20: /* SPACE */
3820     case 0xa0: /* NBSP */
3821     case 0x1680: /* OGHAM SPACE MARK */
3822     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3823     case 0x2000: /* EN QUAD */
3824     case 0x2001: /* EM QUAD */
3825     case 0x2002: /* EN SPACE */
3826     case 0x2003: /* EM SPACE */
3827