/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 446 - (hide annotations) (download)
Tue Sep 15 10:49:50 2009 UTC (3 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 161775 byte(s)
Correct returned capture count after recursion has matched more than outer.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 381 Copyright (c) 1997-2009 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
/* Flag bits for the match() function */

#define match_condassert     0x01  /* Called to check a condition assertion */
#define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes. */

#define MATCH_COMMIT       (-999)
#define MATCH_PRUNE        (-998)
#define MATCH_SKIP         (-997)
#define MATCH_THEN         (-996)

/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX 30

/* Min and max values for the common repeats; for the maxima, 0 => infinity.
The two tables are parallel: entry n of rep_min pairs with entry n of rep_max.
NOTE(review): the index is presumably derived from the repeat opcode (star,
minstar, plus, minplus, query, minquery) - confirm against the users of these
tables. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89    
90    
91    
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested. Printable bytes are written as themselves; any other
byte is written as a \xhh escape. Output goes to stdout.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE
              (it is not dereferenced when is_subject is FALSE)

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
unsigned int c;

/* When printing subject text, never run past the end of the subject. If p is
already beyond the end, the clamped length is negative and nothing is printed. */

if (is_subject && length > md->end_subject - p) length = md->end_subject - p;

while (length-- > 0)
  if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
}
#endif
118    
119    
120    
121     /*************************************************
122     * Match a back-reference *
123     *************************************************/
124    
125     /* If a back reference hasn't been set, the length that is passed is greater
126     than the number of characters left in the string, so the match fails.
127    
128     Arguments:
129     offset index into the offset vector
130     eptr points into the subject
131     length length to be matched
132     md points to match data block
133     ims the ims flags
134    
135     Returns: TRUE if matched
136     */
137    
138     static BOOL
139 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 nigel 77 unsigned long int ims)
141     {
142 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
143 nigel 77
144     #ifdef DEBUG
145     if (eptr >= md->end_subject)
146     printf("matching subject <null>");
147     else
148     {
149     printf("matching subject ");
150     pchars(eptr, length, TRUE, md);
151     }
152     printf(" against backref ");
153     pchars(p, length, FALSE, md);
154     printf("\n");
155     #endif
156    
157     /* Always fail if not enough characters left */
158    
159     if (length > md->end_subject - eptr) return FALSE;
160    
161 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162     properly if Unicode properties are supported. Otherwise, we can check only
163     ASCII characters. */
164 nigel 77
165     if ((ims & PCRE_CASELESS) != 0)
166     {
167 ph10 354 #ifdef SUPPORT_UTF8
168     #ifdef SUPPORT_UCP
169     if (md->utf8)
170     {
171 ph10 358 USPTR endptr = eptr + length;
172 ph10 354 while (eptr < endptr)
173     {
174 ph10 358 int c, d;
175 ph10 354 GETCHARINC(c, eptr);
176     GETCHARINC(d, p);
177     if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 ph10 358 }
179     }
180 ph10 354 else
181     #endif
182     #endif
183    
184     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185     is no UCP support. */
186 ph10 358
187 nigel 77 while (length-- > 0)
188 ph10 354 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 nigel 77 }
190 ph10 358
191 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
192     are in UTF-8 mode. */
193 ph10 358
194 nigel 77 else
195     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196    
197     return TRUE;
198     }
199    
200    
201    
202     /***************************************************************************
203     ****************************************************************************
204     RECURSION IN THE match() FUNCTION
205    
206 nigel 87 The match() function is highly recursive, though not every recursive call
207     increases the recursive depth. Nevertheless, some regular expressions can cause
208     it to recurse to a great depth. I was writing for Unix, so I just let it call
209     itself recursively. This uses the stack for saving everything that has to be
210     saved for a recursive call. On Unix, the stack can be large, and this works
211     fine.
212 nigel 77
213 nigel 87 It turns out that on some non-Unix-like systems there are problems with
214     programs that use a lot of stack. (This despite the fact that every last chip
215     has oodles of memory these days, and techniques for extending the stack have
216     been known for decades.) So....
217 nigel 77
218     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219     calls by keeping local variables that need to be preserved in blocks of memory
220 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
221 nigel 77 achieve this so that the actual code doesn't look very different to what it
222     always used to.
223 ph10 164
224 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
225 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
226     Switzer, the use of longjmp() has been abolished, at the cost of having to
227     provide a unique number for each call to RMATCH. There is no way of generating
228     a sequence of numbers at compile time in C. I have given them names, to make
229     them stand out more clearly.
230    
231     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
234     don't have indeterminate values; this has meant that the frame size can be
235 ph10 164 reduced because the result can be "passed back" by straight setting of the
236     variable instead of being passed in the frame.
237 nigel 77 ****************************************************************************
238     ***************************************************************************/
239    
/* Numbers for RMATCH calls. Each RMATCH call site passes one of these as its
final ("rw") argument; the heap-frame version of RMATCH stores it in the frame
and pastes it into a label name (L_RMn) that marks where to resume. When this
list is changed, the code at HEAP_RETURN below must be updated in sync. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54 };
249 ph10 164
/* These versions of the macros use the stack, as normal. There are debugging
versions and production versions. Note that the "rw" argument of RMATCH isn't
actually used in this definition.

RMATCH(ra,rb,rc,rd,re,rf,rg,rw) calls match() with ra, rb, the current mstart,
rc, rd, re, rf, rg and rdepth+1, leaving the result in rrc. RRETURN(ra) returns
ra from match(). */

#ifndef NO_RECURSE
#define REGISTER register

#ifdef DEBUG
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra
#endif

#else


/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes.

RMATCH obtains a new frame from pcre_stack_malloc(), records in the current
frame where to resume (rw), fills the new frame with the "arguments", links it
to the current frame, and jumps to HEAP_RECURSE. The label L_<rw> marks the
resumption point. If the frame cannot be obtained, the current incarnation
gives up with PCRE_ERROR_NOMEMORY instead of writing through a NULL pointer. */

#define REGISTER

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN releases the current frame and makes its predecessor current. If
there is a predecessor, the result is passed back in rrc and control goes to
HEAP_RETURN; otherwise this was the top-level frame and match() really
returns. The local is called "oldframe" so that it does not shadow RMATCH's
"newframe" when RRETURN is expanded inside RMATCH. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }


/* Structure for remembering the local variables in a private frame */

typedef struct heapframe {
  struct heapframe *Xprevframe;   /* Frame to return to; NULL at top level */

  /* Function arguments that may change */

  USPTR Xeptr;
  const uschar *Xecode;
  USPTR Xmstart;
  int Xoffset_top;
  long int Xims;
  eptrblock *Xeptrb;
  int Xflags;
  unsigned int Xrdepth;

  /* Function local variables */

  USPTR Xcallpat;
#ifdef SUPPORT_UTF8
  USPTR Xcharptr;
#endif
  USPTR Xdata;
  USPTR Xnext;
  USPTR Xpp;
  USPTR Xprev;
  USPTR Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xprev_is_word;

  unsigned long int Xoriginal_ims;

#ifdef SUPPORT_UCP
  int Xprop_type;
  int Xprop_value;
  int Xprop_fail_result;
  int Xprop_category;
  int Xprop_chartype;
  int Xprop_script;
  int Xoclength;
  uschar Xocchars[8];
#endif

  int Xcodelink;
  int Xctype;
  unsigned int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  int Xnumber;
  int Xoffset;
  int Xop;
  int Xsave_capture_last;
  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Where to jump back to */

  int Xwhere;

} heapframe;

#endif
388    
389    
390     /***************************************************************************
391     ***************************************************************************/
392    
393    
394    
395     /*************************************************
396     * Match from current position *
397     *************************************************/
398    
399 nigel 93 /* This function is called recursively in many circumstances. Whenever it
400 nigel 77 returns a negative (error) response, the outer incarnation must also return the
401 ph10 426 same response. */
402 nigel 77
/* These macros pack up tests that are used for partial matching, and which
appear several times in the code. We set the "hit end" flag if the pointer is
at the end of the subject and also past the start of the subject (i.e.
something has been matched). For hard partial matching (md->partial > 1), we
then return PCRE_ERROR_PARTIAL immediately. The second one is used when we
already know we are past the end of the subject, so it omits the end test.

Both expand to a bare "if" statement that uses md, eptr, mstart and RRETURN
from the surrounding code in match(). */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > mstart)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
    }
423 ph10 426
424 ph10 427
425 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
426     the md structure (e.g. utf8, end_subject) into individual variables to improve
427 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
428     made performance worse.
429    
430     Arguments:
431 nigel 93 eptr pointer to current character in subject
432     ecode pointer to current position in compiled code
433 ph10 168 mstart pointer to the current match start position (can be modified
434 ph10 172 by encountering \K)
435 nigel 77 offset_top current top pointer
436     md pointer to "static" info for the match
437     ims current /i, /m, and /s options
438     eptrb pointer to chain of blocks containing eptr at start of
439     brackets - for testing for empty matches
440     flags can contain
441     match_condassert - this is an assertion condition
442 nigel 93 match_cbegroup - this is the start of an unlimited repeat
443     group that can match an empty string
444 nigel 87 rdepth the recursion depth
445 nigel 77
446     Returns: MATCH_MATCH if matched ) these values are >= 0
447     MATCH_NOMATCH if failed to match )
448     a negative PCRE_ERROR_xxx value if aborted by an error condition
449 nigel 87 (e.g. stopped by repeated call or recursion limit)
450 nigel 77 */
451    
452     static int
453 ph10 409 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
454 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
455 nigel 91 int flags, unsigned int rdepth)
456 nigel 77 {
457     /* These variables do not need to be preserved over recursion in this function,
458 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
459     "register" because they are used a lot in loops. */
460 nigel 77
461 nigel 91 register int rrc; /* Returns from recursive calls */
462     register int i; /* Used for loops not involving calls to RMATCH() */
463 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
464 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
465 nigel 77
466 nigel 93 BOOL minimize, possessive; /* Quantifier options */
467 ph10 403 int condcode;
468 nigel 93
469 nigel 77 /* When recursion is not being used, all "local" variables that have to be
470     preserved over calls to RMATCH() are part of a "frame" which is obtained from
471     heap storage. Set up the top-level frame here; others are obtained from the
472     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
473    
474     #ifdef NO_RECURSE
475     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
476     frame->Xprevframe = NULL; /* Marks the top level */
477    
478     /* Copy in the original argument variables */
479    
480     frame->Xeptr = eptr;
481     frame->Xecode = ecode;
482 ph10 168 frame->Xmstart = mstart;
483 nigel 77 frame->Xoffset_top = offset_top;
484     frame->Xims = ims;
485     frame->Xeptrb = eptrb;
486     frame->Xflags = flags;
487 nigel 87 frame->Xrdepth = rdepth;
488 nigel 77
489     /* This is where control jumps back to in order to effect "recursion" */
490    
491     HEAP_RECURSE:
492    
493     /* Macros make the argument variables come from the current frame */
494    
495     #define eptr frame->Xeptr
496     #define ecode frame->Xecode
497 ph10 168 #define mstart frame->Xmstart
498 nigel 77 #define offset_top frame->Xoffset_top
499     #define ims frame->Xims
500     #define eptrb frame->Xeptrb
501     #define flags frame->Xflags
502 nigel 87 #define rdepth frame->Xrdepth
503 nigel 77
504     /* Ditto for the local variables */
505    
506     #ifdef SUPPORT_UTF8
507     #define charptr frame->Xcharptr
508     #endif
509     #define callpat frame->Xcallpat
510 ph10 403 #define codelink frame->Xcodelink
511 nigel 77 #define data frame->Xdata
512     #define next frame->Xnext
513     #define pp frame->Xpp
514     #define prev frame->Xprev
515     #define saved_eptr frame->Xsaved_eptr
516    
517     #define new_recursive frame->Xnew_recursive
518    
519     #define cur_is_word frame->Xcur_is_word
520     #define condition frame->Xcondition
521     #define prev_is_word frame->Xprev_is_word
522    
523     #define original_ims frame->Xoriginal_ims
524    
525     #ifdef SUPPORT_UCP
526     #define prop_type frame->Xprop_type
527 nigel 87 #define prop_value frame->Xprop_value
528 nigel 77 #define prop_fail_result frame->Xprop_fail_result
529     #define prop_category frame->Xprop_category
530     #define prop_chartype frame->Xprop_chartype
531 nigel 87 #define prop_script frame->Xprop_script
532 ph10 115 #define oclength frame->Xoclength
533     #define occhars frame->Xocchars
534 nigel 77 #endif
535    
536     #define ctype frame->Xctype
537     #define fc frame->Xfc
538     #define fi frame->Xfi
539     #define length frame->Xlength
540     #define max frame->Xmax
541     #define min frame->Xmin
542     #define number frame->Xnumber
543     #define offset frame->Xoffset
544     #define op frame->Xop
545     #define save_capture_last frame->Xsave_capture_last
546     #define save_offset1 frame->Xsave_offset1
547     #define save_offset2 frame->Xsave_offset2
548     #define save_offset3 frame->Xsave_offset3
549     #define stacksave frame->Xstacksave
550    
551     #define newptrb frame->Xnewptrb
552    
553     /* When recursion is being used, local variables are allocated on the stack and
554     get preserved during recursion in the normal way. In this environment, fi and
555     i, and fc and c, can be the same variables. */
556    
557 nigel 93 #else /* NO_RECURSE not defined */
558 nigel 77 #define fi i
559     #define fc c
560    
561    
562 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
563     const uschar *charptr; /* in small blocks of the code. My normal */
564     #endif /* style of coding would have declared */
565     const uschar *callpat; /* them within each of those blocks. */
566     const uschar *data; /* However, in order to accommodate the */
567     const uschar *next; /* version of this code that uses an */
568     USPTR pp; /* external "stack" implemented on the */
569     const uschar *prev; /* heap, it is easier to declare them all */
570     USPTR saved_eptr; /* here, so the declarations can be cut */
571     /* out in a block. The only declarations */
572     recursion_info new_recursive; /* within blocks below are for variables */
573     /* that do not have to be preserved over */
574     BOOL cur_is_word; /* a recursive call to RMATCH(). */
575     BOOL condition;
576 nigel 77 BOOL prev_is_word;
577    
578     unsigned long int original_ims;
579    
580     #ifdef SUPPORT_UCP
581     int prop_type;
582 nigel 87 int prop_value;
583 nigel 77 int prop_fail_result;
584     int prop_category;
585     int prop_chartype;
586 nigel 87 int prop_script;
587 ph10 115 int oclength;
588     uschar occhars[8];
589 nigel 77 #endif
590    
591 ph10 399 int codelink;
592 nigel 77 int ctype;
593     int length;
594     int max;
595     int min;
596     int number;
597     int offset;
598     int op;
599     int save_capture_last;
600     int save_offset1, save_offset2, save_offset3;
601     int stacksave[REC_STACK_SAVE_MAX];
602    
603     eptrblock newptrb;
604 nigel 93 #endif /* NO_RECURSE */
605 nigel 77
606     /* These statements are here to stop the compiler complaining about
607     uninitialized variables. */
608    
609     #ifdef SUPPORT_UCP
610 nigel 87 prop_value = 0;
611 nigel 77 prop_fail_result = 0;
612     #endif
613    
614 nigel 93
615 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
616     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
617     used. Thanks to Ian Taylor for noticing this possibility and sending the
618     original patch. */
619    
620     TAIL_RECURSE:
621    
622 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
623     are specified by the macro RMATCH and RRETURN is used to return. When
624     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
625     and a "return", respectively (possibly with some debugging if DEBUG is
626     defined). However, RMATCH isn't like a function call because it's quite a
627     complicated macro. It has to be used in one particular way. This shouldn't,
628     however, impact performance when true recursion is being used. */
629 nigel 77
630 ph10 164 #ifdef SUPPORT_UTF8
631     utf8 = md->utf8; /* Local copy of the flag */
632     #else
633     utf8 = FALSE;
634     #endif
635    
636 nigel 87 /* First check that we haven't called match() too many times, or that we
637     haven't exceeded the recursive call limit. */
638    
639 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
640 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
641 nigel 77
642     original_ims = ims; /* Save for resetting on ')' */
643 nigel 91
644 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
645     string, the match_cbegroup flag is set. When this is the case, add the current
646     subject pointer to the chain of such remembered pointers, to be checked when we
647     hit the closing ket, in order to break infinite loops that match no characters.
648 ph10 197 When match() is called in other circumstances, don't add to the chain. The
649     match_cbegroup flag must NOT be used with tail recursion, because the memory
650     block that is used is on the stack, so a new one may be required for each
651     match(). */
652 nigel 77
653 nigel 93 if ((flags & match_cbegroup) != 0)
654 nigel 77 {
655 ph10 197 newptrb.epb_saved_eptr = eptr;
656     newptrb.epb_prev = eptrb;
657     eptrb = &newptrb;
658 nigel 77 }
659    
660 nigel 93 /* Now start processing the opcodes. */
661 nigel 77
662     for (;;)
663     {
664 nigel 93 minimize = possessive = FALSE;
665 nigel 77 op = *ecode;
666 ph10 443
667 nigel 93 switch(op)
668     {
669 ph10 210 case OP_FAIL:
670 ph10 212 RRETURN(MATCH_NOMATCH);
671 ph10 211
672 ph10 210 case OP_PRUNE:
673     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
674     ims, eptrb, flags, RM51);
675     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
676 ph10 212 RRETURN(MATCH_PRUNE);
677 ph10 211
678 ph10 210 case OP_COMMIT:
679     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
680     ims, eptrb, flags, RM52);
681     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
682 ph10 212 RRETURN(MATCH_COMMIT);
683 ph10 211
684 ph10 210 case OP_SKIP:
685     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
686     ims, eptrb, flags, RM53);
687     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
688 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
689 ph10 212 RRETURN(MATCH_SKIP);
690 ph10 211
691 ph10 210 case OP_THEN:
692     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
693 ph10 212 ims, eptrb, flags, RM54);
694 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
695 ph10 212 RRETURN(MATCH_THEN);
696 ph10 211
697 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
698     the current subject position in the working slot at the top of the vector.
699     We mustn't change the current values of the data slot, because they may be
700     set from a previous iteration of this group, and be referred to by a
701     reference inside the group.
702 nigel 77
703 nigel 93 If the bracket fails to match, we need to restore this value and also the
704     values of the final offsets, in case they were set by a previous iteration
705     of the same bracket.
706 nigel 77
707 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
708     a non-capturing bracket. Don't worry about setting the flag for the error
709     case here; that is handled in the code for KET. */
710 nigel 77
711 nigel 93 case OP_CBRA:
712     case OP_SCBRA:
713     number = GET2(ecode, 1+LINK_SIZE);
714 nigel 77 offset = number << 1;
715    
716     #ifdef DEBUG
717 nigel 93 printf("start bracket %d\n", number);
718     printf("subject=");
719 nigel 77 pchars(eptr, 16, TRUE, md);
720     printf("\n");
721     #endif
722    
723     if (offset < md->offset_max)
724     {
725     save_offset1 = md->offset_vector[offset];
726     save_offset2 = md->offset_vector[offset+1];
727     save_offset3 = md->offset_vector[md->offset_end - number];
728     save_capture_last = md->capture_last;
729    
730     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
731     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
732    
733 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
734 nigel 77 do
735     {
736 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
737     ims, eptrb, flags, RM1);
738 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
739 nigel 77 md->capture_last = save_capture_last;
740     ecode += GET(ecode, 1);
741     }
742     while (*ecode == OP_ALT);
743    
744     DPRINTF(("bracket %d failed\n", number));
745    
746     md->offset_vector[offset] = save_offset1;
747     md->offset_vector[offset+1] = save_offset2;
748     md->offset_vector[md->offset_end - number] = save_offset3;
749    
750     RRETURN(MATCH_NOMATCH);
751     }
752    
753 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
754     as a non-capturing bracket. */
755 nigel 77
756 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
757     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
758    
759 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
760 nigel 77
761 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
762     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
763    
764 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
765     final alternative within the brackets, we would return the result of a
766     recursive call to match() whatever happened. We can reduce stack usage by
767 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
768     is set.*/
769 nigel 77
770 nigel 93 case OP_BRA:
771     case OP_SBRA:
772     DPRINTF(("start non-capturing bracket\n"));
773     flags = (op >= OP_SBRA)? match_cbegroup : 0;
774 nigel 91 for (;;)
775 nigel 77 {
776 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
777 nigel 93 {
778 ph10 197 if (flags == 0) /* Not a possibly empty group */
779     {
780     ecode += _pcre_OP_lengths[*ecode];
781     DPRINTF(("bracket 0 tail recursion\n"));
782     goto TAIL_RECURSE;
783     }
784    
785     /* Possibly empty group; can't use tail recursion. */
786    
787     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
788     eptrb, flags, RM48);
789     RRETURN(rrc);
790 nigel 93 }
791 nigel 91
792     /* For non-final alternatives, continue the loop for a NOMATCH result;
793     otherwise return. */
794    
795 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
796     eptrb, flags, RM2);
797 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
798 nigel 77 ecode += GET(ecode, 1);
799     }
800 nigel 91 /* Control never reaches here. */
801 nigel 77
802     /* Conditional group: compilation checked that there are no more than
803     two branches. If the condition is false, skipping the first branch takes us
804     past the end if there is only one branch, but that's OK because that is
805 nigel 91 exactly what going to the ket would do. As there is only one branch to be
806     obeyed, we can use tail recursion to avoid using another stack frame. */
807 nigel 77
808     case OP_COND:
809 nigel 93 case OP_SCOND:
810 ph10 399 codelink= GET(ecode, 1);
811 ph10 406
812 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
813     inserted between OP_COND and an assertion condition. */
814 ph10 392
815 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
816     {
817     if (pcre_callout != NULL)
818     {
819     pcre_callout_block cb;
820     cb.version = 1; /* Version 1 of the callout block */
821     cb.callout_number = ecode[LINK_SIZE+2];
822     cb.offset_vector = md->offset_vector;
823     cb.subject = (PCRE_SPTR)md->start_subject;
824     cb.subject_length = md->end_subject - md->start_subject;
825     cb.start_match = mstart - md->start_subject;
826     cb.current_position = eptr - md->start_subject;
827     cb.pattern_position = GET(ecode, LINK_SIZE + 3);
828     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
829     cb.capture_top = offset_top/2;
830     cb.capture_last = md->capture_last;
831     cb.callout_data = md->callout_data;
832     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
833     if (rrc < 0) RRETURN(rrc);
834     }
835     ecode += _pcre_OP_lengths[OP_CALLOUT];
836     }
837 ph10 392
838 ph10 399 condcode = ecode[LINK_SIZE+1];
839 ph10 406
840 ph10 381 /* Now see what the actual condition is */
841 ph10 392
842 ph10 399 if (condcode == OP_RREF) /* Recursion test */
843 nigel 77 {
844 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
845     condition = md->recursive != NULL &&
846     (offset == RREF_ANY || offset == md->recursive->group_num);
847     ecode += condition? 3 : GET(ecode, 1);
848     }
849    
850 ph10 399 else if (condcode == OP_CREF) /* Group used test */
851 nigel 93 {
852 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
853 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
854     ecode += condition? 3 : GET(ecode, 1);
855 nigel 77 }
856    
857 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
858 nigel 93 {
859     condition = FALSE;
860     ecode += GET(ecode, 1);
861     }
862    
863 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
864 nigel 93 the flags argument to match_condassert causes it to stop at the end of an
865     assertion. */
866 nigel 77
867     else
868     {
869 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
870     match_condassert, RM3);
871 nigel 77 if (rrc == MATCH_MATCH)
872     {
873 nigel 93 condition = TRUE;
874     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
875 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
876     }
877 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
878 nigel 77 {
879     RRETURN(rrc); /* Need braces because of following else */
880     }
881 nigel 93 else
882     {
883     condition = FALSE;
884 ph10 399 ecode += codelink;
885 nigel 93 }
886     }
887 nigel 91
888 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
889 ph10 197 we can use tail recursion to avoid using another stack frame, except when
890     match_cbegroup is required for an unlimited repeat of a possibly empty
891     group. If the second alternative doesn't exist, we can just plough on. */
892 nigel 91
893 nigel 93 if (condition || *ecode == OP_ALT)
894     {
895 nigel 91 ecode += 1 + LINK_SIZE;
896 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
897     {
898     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
899     RRETURN(rrc);
900     }
901     else /* Group must match something */
902     {
903     flags = 0;
904     goto TAIL_RECURSE;
905     }
906 nigel 77 }
907 ph10 395 else /* Condition false & no alternative */
908 nigel 93 {
909     ecode += 1 + LINK_SIZE;
910     }
911     break;
912 nigel 77
913    
914 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
915     recursion, we should restore the offsets appropriately and continue from
916     after the call. */
917 nigel 77
918 ph10 210 case OP_ACCEPT:
919 nigel 77 case OP_END:
920     if (md->recursive != NULL && md->recursive->group_num == 0)
921     {
922     recursion_info *rec = md->recursive;
923 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
924 nigel 77 md->recursive = rec->prevrec;
925     memmove(md->offset_vector, rec->offset_save,
926     rec->saved_max * sizeof(int));
927 ph10 446 offset_top = rec->offset_top;
928 ph10 168 mstart = rec->save_start;
929 nigel 77 ims = original_ims;
930     ecode = rec->after_call;
931     break;
932     }
933    
934 ph10 442 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
935     set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
936     the subject. In both cases, backtracking will then try other alternatives,
937     if any. */
938 ph10 443
939 ph10 442 if (eptr == mstart &&
940     (md->notempty ||
941 ph10 443 (md->notempty_atstart &&
942 ph10 442 mstart == md->start_subject + md->start_offset)))
943 ph10 443 RRETURN(MATCH_NOMATCH);
944    
945 ph10 442 /* Otherwise, we have a match. */
946 nigel 77
947 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
948     md->end_offset_top = offset_top; /* and how many extracts were taken */
949 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
950 nigel 77 RRETURN(MATCH_MATCH);
951    
952     /* Change option settings */
953    
954     case OP_OPT:
955     ims = ecode[1];
956     ecode += 2;
957     DPRINTF(("ims set to %02lx\n", ims));
958     break;
959    
960     /* Assertion brackets. Check the alternative branches in turn - the
961     matching won't pass the KET for an assertion. If any one branch matches,
962     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
963     start of each branch to move the current point backwards, so the code at
964     this level is identical to the lookahead case. */
965    
966     case OP_ASSERT:
967     case OP_ASSERTBACK:
968     do
969     {
970 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
971     RM4);
972 nigel 77 if (rrc == MATCH_MATCH) break;
973 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
974 nigel 77 ecode += GET(ecode, 1);
975     }
976     while (*ecode == OP_ALT);
977     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
978    
979     /* If checking an assertion for a condition, return MATCH_MATCH. */
980    
981     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
982    
983     /* Continue from after the assertion, updating the offsets high water
984     mark, since extracts may have been taken during the assertion. */
985    
986     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
987     ecode += 1 + LINK_SIZE;
988     offset_top = md->end_offset_top;
989     continue;
990    
991     /* Negative assertion: all branches must fail to match */
992    
993     case OP_ASSERT_NOT:
994     case OP_ASSERTBACK_NOT:
995     do
996     {
997 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
998     RM5);
999 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
1000 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1001 nigel 77 ecode += GET(ecode,1);
1002     }
1003     while (*ecode == OP_ALT);
1004    
1005     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1006    
1007     ecode += 1 + LINK_SIZE;
1008     continue;
1009    
1010     /* Move the subject pointer back. This occurs only at the start of
1011     each branch of a lookbehind assertion. If we are too close to the start to
1012     move back, this match function fails. When working with UTF-8 we move
1013     back a number of characters, not bytes. */
1014    
1015     case OP_REVERSE:
1016     #ifdef SUPPORT_UTF8
1017     if (utf8)
1018     {
1019 nigel 93 i = GET(ecode, 1);
1020     while (i-- > 0)
1021 nigel 77 {
1022     eptr--;
1023     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1024 ph10 207 BACKCHAR(eptr);
1025 nigel 77 }
1026     }
1027     else
1028     #endif
1029    
1030     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1031    
1032     {
1033 nigel 93 eptr -= GET(ecode, 1);
1034 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1035     }
1036    
1037 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1038 nigel 77
1039 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1040 nigel 77 ecode += 1 + LINK_SIZE;
1041     break;
1042    
1043     /* The callout item calls an external function, if one is provided, passing
1044     details of the match so far. This is mainly for debugging, though the
1045     function is able to force a failure. */
1046    
1047     case OP_CALLOUT:
1048     if (pcre_callout != NULL)
1049     {
1050     pcre_callout_block cb;
1051     cb.version = 1; /* Version 1 of the callout block */
1052     cb.callout_number = ecode[1];
1053     cb.offset_vector = md->offset_vector;
1054 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1055 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
1056 ph10 168 cb.start_match = mstart - md->start_subject;
1057 nigel 77 cb.current_position = eptr - md->start_subject;
1058     cb.pattern_position = GET(ecode, 2);
1059     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1060     cb.capture_top = offset_top/2;
1061     cb.capture_last = md->capture_last;
1062     cb.callout_data = md->callout_data;
1063     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1064     if (rrc < 0) RRETURN(rrc);
1065     }
1066     ecode += 2 + 2*LINK_SIZE;
1067     break;
1068    
1069     /* Recursion either matches the current regex, or some subexpression. The
1070     offset data is the offset to the starting bracket from the start of the
1071     whole pattern. (This is so that it works from duplicated subpatterns.)
1072    
1073     If there are any capturing brackets started but not finished, we have to
1074     save their starting points and reinstate them after the recursion. However,
1075     we don't know how many such there are (offset_top records the completed
1076     total) so we just have to save all the potential data. There may be up to
1077     65535 such values, which is too large to put on the stack, but using malloc
1078     for small numbers seems expensive. As a compromise, the stack is used when
1079     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1080     is used. If the malloc fails, there is no saved copy to restore from after
1081     the recursion, so matching is abandoned at this point and
1082     PCRE_ERROR_NOMEMORY is returned.
1083    
1084     There are also other values that have to be saved. We use a chained
1085     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1086     for the original version of this logic. */
1087    
1088     case OP_RECURSE:
1089     {
1090     callpat = md->start_code + GET(ecode, 1);
1091 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1092     GET2(callpat, 1 + LINK_SIZE);
1093 nigel 77
1094     /* Add to "recursing stack" */
1095    
1096     new_recursive.prevrec = md->recursive;
1097     md->recursive = &new_recursive;
1098    
1099     /* Find where to continue from afterwards */
1100    
1101     ecode += 1 + LINK_SIZE;
1102     new_recursive.after_call = ecode;
1103    
1104     /* Now save the offset data. */
1105    
1106     new_recursive.saved_max = md->offset_end;
1107     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1108     new_recursive.offset_save = stacksave;
1109     else
1110     {
1111     new_recursive.offset_save =
1112     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1113     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1114     }
1115    
1116     memcpy(new_recursive.offset_save, md->offset_vector,
1117     new_recursive.saved_max * sizeof(int));
1118 ph10 168 new_recursive.save_start = mstart;
1119 ph10 446 new_recursive.offset_top = offset_top;
1120 ph10 168 mstart = eptr;
1121 nigel 77
1122     /* OK, now we can do the recursion. For each top-level alternative we
1123     restore the offset and recursion data. */
1124    
1125     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1126 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1127 nigel 77 do
1128     {
1129 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1130     md, ims, eptrb, flags, RM6);
1131 nigel 77 if (rrc == MATCH_MATCH)
1132     {
1133 nigel 87 DPRINTF(("Recursion matched\n"));
1134 nigel 77 md->recursive = new_recursive.prevrec;
1135     if (new_recursive.offset_save != stacksave)
1136     (pcre_free)(new_recursive.offset_save);
1137     RRETURN(MATCH_MATCH);
1138     }
1139 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1140 nigel 87 {
1141     DPRINTF(("Recursion gave error %d\n", rrc));
1142 ph10 400 if (new_recursive.offset_save != stacksave)
1143     (pcre_free)(new_recursive.offset_save);
1144 nigel 87 RRETURN(rrc);
1145     }
1146 nigel 77
1147     md->recursive = &new_recursive;
1148     memcpy(md->offset_vector, new_recursive.offset_save,
1149     new_recursive.saved_max * sizeof(int));
1150     callpat += GET(callpat, 1);
1151     }
1152     while (*callpat == OP_ALT);
1153    
1154     DPRINTF(("Recursion didn't match\n"));
1155     md->recursive = new_recursive.prevrec;
1156     if (new_recursive.offset_save != stacksave)
1157     (pcre_free)(new_recursive.offset_save);
1158     RRETURN(MATCH_NOMATCH);
1159     }
1160     /* Control never reaches here */
1161    
1162     /* "Once" brackets are like assertion brackets except that after a match,
1163     the point in the subject string is not moved back. Thus there can never be
1164     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1165     Check the alternative branches in turn - the matching won't pass the KET
1166     for this kind of subpattern. If any one branch matches, we carry on as at
1167     the end of a normal bracket, leaving the subject pointer. */
1168    
1169     case OP_ONCE:
1170 nigel 91 prev = ecode;
1171     saved_eptr = eptr;
1172    
1173     do
1174 nigel 77 {
1175 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1176 nigel 91 if (rrc == MATCH_MATCH) break;
1177 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1178 nigel 91 ecode += GET(ecode,1);
1179     }
1180     while (*ecode == OP_ALT);
1181 nigel 77
1182 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1183 nigel 77
1184 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1185 nigel 77
1186 nigel 91 /* Continue from after the group, as for an assertion, updating the offsets high water
1187     mark, since extracts may have been taken. */
1188 nigel 77
1189 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1190 nigel 77
1191 nigel 91 offset_top = md->end_offset_top;
1192     eptr = md->end_match_ptr;
1193 nigel 77
1194 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1195     happens for a repeating ket if no characters were matched in the group.
1196     This is the forcible breaking of infinite loops as implemented in Perl
1197     5.005. If there is an options reset, it will get obeyed in the normal
1198     course of events. */
1199 nigel 77
1200 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1201     {
1202     ecode += 1+LINK_SIZE;
1203     break;
1204     }
1205 nigel 77
1206 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1207     preceding bracket, in the appropriate order. The second "call" of match()
1208     uses tail recursion, to avoid using another stack frame. We need to reset
1209     any options that changed within the bracket before re-running it, so
1210     check the next opcode. */
1211 nigel 77
1212 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1213     {
1214     ims = (ims & ~PCRE_IMS) | ecode[4];
1215     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1216     }
1217 nigel 77
1218 nigel 91 if (*ecode == OP_KETRMIN)
1219     {
1220 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1221 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1222     ecode = prev;
1223 ph10 197 flags = 0;
1224 nigel 91 goto TAIL_RECURSE;
1225 nigel 77 }
1226 nigel 91 else /* OP_KETRMAX */
1227     {
1228 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1229 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1230     ecode += 1 + LINK_SIZE;
1231 ph10 197 flags = 0;
1232 nigel 91 goto TAIL_RECURSE;
1233     }
1234     /* Control never gets here */
1235 nigel 77
1236     /* An alternation is the end of a branch; scan along to find the end of the
1237     bracketed group and go to there. */
1238    
1239     case OP_ALT:
1240     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1241     break;
1242    
1243 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1244     indicating that it may occur zero times. It may repeat infinitely, or not
1245     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1246     with fixed upper repeat limits are compiled as a number of copies, with the
1247     optional ones preceded by BRAZERO or BRAMINZERO. */
1248 nigel 77
1249     case OP_BRAZERO:
1250     {
1251     next = ecode+1;
1252 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1253 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1254     do next += GET(next,1); while (*next == OP_ALT);
1255 nigel 93 ecode = next + 1 + LINK_SIZE;
1256 nigel 77 }
1257     break;
1258    
1259     case OP_BRAMINZERO:
1260     {
1261     next = ecode+1;
1262 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1263 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1264 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1265     ecode++;
1266     }
1267     break;
1268    
1269 ph10 335 case OP_SKIPZERO:
1270     {
1271     next = ecode+1;
1272     do next += GET(next,1); while (*next == OP_ALT);
1273     ecode = next + 1 + LINK_SIZE;
1274     }
1275     break;
1276    
1277 nigel 93 /* End of a group, repeated or non-repeating. */
1278 nigel 77
1279     case OP_KET:
1280     case OP_KETRMIN:
1281     case OP_KETRMAX:
1282 nigel 91 prev = ecode - GET(ecode, 1);
1283 nigel 77
1284 nigel 93 /* If this was a group that remembered the subject start, in order to break
1285     infinite repeats of empty string matches, retrieve the subject start from
1286     the chain. Otherwise, set it NULL. */
1287 nigel 77
1288 nigel 93 if (*prev >= OP_SBRA)
1289     {
1290     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1291     eptrb = eptrb->epb_prev; /* Backup to previous group */
1292     }
1293     else saved_eptr = NULL;
1294 nigel 77
1295 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1296     MATCH_MATCH, but record the current high water mark for use by positive
1297     assertions. Do this also for the "once" (atomic) groups. */
1298    
1299 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1300     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1301     *prev == OP_ONCE)
1302     {
1303     md->end_match_ptr = eptr; /* For ONCE */
1304     md->end_offset_top = offset_top;
1305     RRETURN(MATCH_MATCH);
1306     }
1307 nigel 77
1308 nigel 93 /* For capturing groups we have to check the group number back at the start
1309     and if necessary complete handling an extraction by setting the offsets and
1310     bumping the high water mark. Note that whole-pattern recursion is coded as
1311     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1312     when the OP_END is reached. Other recursion is handled here. */
1313 nigel 77
1314 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1315 nigel 91 {
1316 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1317 nigel 91 offset = number << 1;
1318 ph10 446
1319 nigel 77 #ifdef DEBUG
1320 nigel 91 printf("end bracket %d", number);
1321     printf("\n");
1322 nigel 77 #endif
1323    
1324 nigel 93 md->capture_last = number;
1325     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1326 nigel 91 {
1327 nigel 93 md->offset_vector[offset] =
1328     md->offset_vector[md->offset_end - number];
1329     md->offset_vector[offset+1] = eptr - md->start_subject;
1330     if (offset_top <= offset) offset_top = offset + 2;
1331     }
1332 nigel 77
1333 nigel 93 /* Handle a recursively called group. Restore the offsets
1334     appropriately and continue from after the call. */
1335 nigel 77
1336 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1337     {
1338     recursion_info *rec = md->recursive;
1339     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1340     md->recursive = rec->prevrec;
1341 ph10 168 mstart = rec->save_start;
1342 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1343     rec->saved_max * sizeof(int));
1344 ph10 446 offset_top = rec->offset_top;
1345 nigel 93 ecode = rec->after_call;
1346     ims = original_ims;
1347     break;
1348 nigel 77 }
1349 nigel 91 }
1350 nigel 77
1351 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1352     flags, in case they got changed during the group. */
1353 nigel 77
1354 nigel 91 ims = original_ims;
1355     DPRINTF(("ims reset to %02lx\n", ims));
1356 nigel 77
1357 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1358     happens for a repeating ket if no characters were matched in the group.
1359     This is the forcible breaking of infinite loops as implemented in Perl
1360     5.005. If there is an options reset, it will get obeyed in the normal
1361     course of events. */
1362 nigel 77
1363 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1364     {
1365     ecode += 1 + LINK_SIZE;
1366     break;
1367     }
1368 nigel 77
1369 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1370     preceding bracket, in the appropriate order. In the second case, we can use
1371 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1372     unlimited repeat of a group that can match an empty string. */
1373 nigel 77
1374 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1375    
1376 nigel 91 if (*ecode == OP_KETRMIN)
1377     {
1378 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1379 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1380 ph10 197 if (flags != 0) /* Could match an empty string */
1381     {
1382     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1383     RRETURN(rrc);
1384     }
1385 nigel 91 ecode = prev;
1386     goto TAIL_RECURSE;
1387 nigel 77 }
1388 nigel 91 else /* OP_KETRMAX */
1389     {
1390 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1391 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1392     ecode += 1 + LINK_SIZE;
1393 ph10 197 flags = 0;
1394 nigel 91 goto TAIL_RECURSE;
1395     }
1396     /* Control never gets here */
1397 nigel 77
1398     /* Start of subject unless notbol, or after internal newline if multiline */
1399    
1400     case OP_CIRC:
1401     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1402     if ((ims & PCRE_MULTILINE) != 0)
1403     {
1404 nigel 91 if (eptr != md->start_subject &&
1405 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1406 nigel 77 RRETURN(MATCH_NOMATCH);
1407     ecode++;
1408     break;
1409     }
1410     /* ... else fall through */
1411    
1412     /* Start of subject assertion */
1413    
1414     case OP_SOD:
1415     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1416     ecode++;
1417     break;
1418    
1419     /* Start of match assertion */
1420    
1421     case OP_SOM:
1422     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1423     ecode++;
1424     break;
1425 ph10 172
1426 ph10 168 /* Reset the start of match point */
1427 ph10 172
1428 ph10 168 case OP_SET_SOM:
1429     mstart = eptr;
1430 ph10 172 ecode++;
1431     break;
1432 nigel 77
1433     /* Assert before internal newline if multiline, or before a terminating
1434     newline unless endonly is set, else end of subject unless noteol is set. */
1435    
1436     case OP_DOLL:
1437     if ((ims & PCRE_MULTILINE) != 0)
1438     {
1439     if (eptr < md->end_subject)
1440 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1441 nigel 77 else
1442     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1443     ecode++;
1444     break;
1445     }
1446     else
1447     {
1448     if (md->noteol) RRETURN(MATCH_NOMATCH);
1449     if (!md->endonly)
1450     {
1451 nigel 91 if (eptr != md->end_subject &&
1452 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1453 nigel 77 RRETURN(MATCH_NOMATCH);
1454     ecode++;
1455     break;
1456     }
1457     }
1458 nigel 91 /* ... else fall through for endonly */
1459 nigel 77
1460     /* End of subject assertion (\z) */
1461    
1462     case OP_EOD:
1463     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1464     ecode++;
1465     break;
1466    
1467     /* End of subject or ending \n assertion (\Z) */
1468    
1469     case OP_EODN:
1470 nigel 91 if (eptr != md->end_subject &&
1471 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1472 nigel 91 RRETURN(MATCH_NOMATCH);
1473 nigel 77 ecode++;
1474     break;
1475    
1476     /* Word boundary assertions */
1477    
1478     case OP_NOT_WORD_BOUNDARY:
1479     case OP_WORD_BOUNDARY:
1480     {
1481    
1482     /* Find out if the previous and current characters are "word" characters.
1483     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1484 ph10 443 be "non-word" characters. Remember the earliest consulted character for
1485 ph10 435 partial matching. */
1486 nigel 77
1487     #ifdef SUPPORT_UTF8
1488     if (utf8)
1489     {
1490     if (eptr == md->start_subject) prev_is_word = FALSE; else
1491     {
1492 ph10 409 USPTR lastptr = eptr - 1;
1493 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1494 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1495 nigel 77 GETCHAR(c, lastptr);
1496     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1497     }
1498 ph10 443 if (eptr >= md->end_subject)
1499 nigel 77 {
1500 ph10 443 SCHECK_PARTIAL();
1501     cur_is_word = FALSE;
1502 ph10 428 }
1503     else
1504     {
1505 nigel 77 GETCHAR(c, eptr);
1506     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1507     }
1508     }
1509     else
1510     #endif
1511    
1512 ph10 428 /* Not in UTF-8 mode */
1513 nigel 77
1514     {
1515 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
1516     {
1517 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1518 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1519     }
1520 ph10 443 if (eptr >= md->end_subject)
1521 ph10 428 {
1522 ph10 443 SCHECK_PARTIAL();
1523     cur_is_word = FALSE;
1524 ph10 428 }
1525     else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1526 nigel 77 }
1527    
1528     /* Now see if the situation is what we want */
1529    
1530     if ((*ecode++ == OP_WORD_BOUNDARY)?
1531     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1532     RRETURN(MATCH_NOMATCH);
1533     }
1534     break;
1535    
1536     /* Match a single character type; inline for speed */
1537    
1538     case OP_ANY:
1539 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1540 ph10 345 /* Fall through */
1541    
1542 ph10 341 case OP_ALLANY:
1543 ph10 443 if (eptr++ >= md->end_subject)
1544 ph10 428 {
1545 ph10 443 SCHECK_PARTIAL();
1546 ph10 428 RRETURN(MATCH_NOMATCH);
1547 ph10 443 }
1548 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1549 nigel 77 ecode++;
1550     break;
1551    
1552     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1553     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1554    
1555     case OP_ANYBYTE:
1556 ph10 443 if (eptr++ >= md->end_subject)
1557 ph10 428 {
1558 ph10 443 SCHECK_PARTIAL();
1559 ph10 428 RRETURN(MATCH_NOMATCH);
1560 ph10 443 }
1561 nigel 77 ecode++;
1562     break;
1563    
1564     case OP_NOT_DIGIT:
1565 ph10 443 if (eptr >= md->end_subject)
1566 ph10 428 {
1567 ph10 443 SCHECK_PARTIAL();
1568 ph10 428 RRETURN(MATCH_NOMATCH);
1569 ph10 443 }
1570 nigel 77 GETCHARINCTEST(c, eptr);
1571     if (
1572     #ifdef SUPPORT_UTF8
1573     c < 256 &&
1574     #endif
1575     (md->ctypes[c] & ctype_digit) != 0
1576     )
1577     RRETURN(MATCH_NOMATCH);
1578     ecode++;
1579     break;
1580    
1581     case OP_DIGIT:
1582 ph10 443 if (eptr >= md->end_subject)
1583 ph10 428 {
1584 ph10 443 SCHECK_PARTIAL();
1585 ph10 428 RRETURN(MATCH_NOMATCH);
1586 ph10 443 }
1587 nigel 77 GETCHARINCTEST(c, eptr);
1588     if (
1589     #ifdef SUPPORT_UTF8
1590     c >= 256 ||
1591     #endif
1592     (md->ctypes[c] & ctype_digit) == 0
1593     )
1594     RRETURN(MATCH_NOMATCH);
1595     ecode++;
1596     break;
1597    
1598     case OP_NOT_WHITESPACE:
1599 ph10 443 if (eptr >= md->end_subject)
1600 ph10 428 {
1601 ph10 443 SCHECK_PARTIAL();
1602 ph10 428 RRETURN(MATCH_NOMATCH);
1603 ph10 443 }
1604 nigel 77 GETCHARINCTEST(c, eptr);
1605     if (
1606     #ifdef SUPPORT_UTF8
1607     c < 256 &&
1608     #endif
1609     (md->ctypes[c] & ctype_space) != 0
1610     )
1611     RRETURN(MATCH_NOMATCH);
1612     ecode++;
1613     break;
1614    
1615     case OP_WHITESPACE:
1616 ph10 443 if (eptr >= md->end_subject)
1617 ph10 428 {
1618 ph10 443 SCHECK_PARTIAL();
1619 ph10 428 RRETURN(MATCH_NOMATCH);
1620 ph10 443 }
1621 nigel 77 GETCHARINCTEST(c, eptr);
1622     if (
1623     #ifdef SUPPORT_UTF8
1624     c >= 256 ||
1625     #endif
1626     (md->ctypes[c] & ctype_space) == 0
1627     )
1628     RRETURN(MATCH_NOMATCH);
1629     ecode++;
1630     break;
1631    
1632     case OP_NOT_WORDCHAR:
1633 ph10 443 if (eptr >= md->end_subject)
1634 ph10 428 {
1635 ph10 443 SCHECK_PARTIAL();
1636 ph10 428 RRETURN(MATCH_NOMATCH);
1637 ph10 443 }
1638 nigel 77 GETCHARINCTEST(c, eptr);
1639     if (
1640     #ifdef SUPPORT_UTF8
1641     c < 256 &&
1642     #endif
1643     (md->ctypes[c] & ctype_word) != 0
1644     )
1645     RRETURN(MATCH_NOMATCH);
1646     ecode++;
1647     break;
1648    
1649     case OP_WORDCHAR:
1650 ph10 443 if (eptr >= md->end_subject)
1651 ph10 428 {
1652 ph10 443 SCHECK_PARTIAL();
1653 ph10 428 RRETURN(MATCH_NOMATCH);
1654 ph10 443 }
1655 nigel 77 GETCHARINCTEST(c, eptr);
1656     if (
1657     #ifdef SUPPORT_UTF8
1658     c >= 256 ||
1659     #endif
1660     (md->ctypes[c] & ctype_word) == 0
1661     )
1662     RRETURN(MATCH_NOMATCH);
1663     ecode++;
1664     break;
1665    
1666 nigel 93 case OP_ANYNL:
1667 ph10 443 if (eptr >= md->end_subject)
1668 ph10 428 {
1669 ph10 443 SCHECK_PARTIAL();
1670 ph10 428 RRETURN(MATCH_NOMATCH);
1671 ph10 443 }
1672 nigel 93 GETCHARINCTEST(c, eptr);
1673     switch(c)
1674     {
1675     default: RRETURN(MATCH_NOMATCH);
1676     case 0x000d:
1677     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1678     break;
1679 ph10 231
1680 nigel 93 case 0x000a:
1681 ph10 231 break;
1682    
1683 nigel 93 case 0x000b:
1684     case 0x000c:
1685     case 0x0085:
1686     case 0x2028:
1687     case 0x2029:
1688 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1689 nigel 93 break;
1690     }
1691     ecode++;
1692     break;
1693    
1694 ph10 178 case OP_NOT_HSPACE:
1695 ph10 443 if (eptr >= md->end_subject)
1696 ph10 428 {
1697 ph10 443 SCHECK_PARTIAL();
1698 ph10 428 RRETURN(MATCH_NOMATCH);
1699 ph10 443 }
1700 ph10 178 GETCHARINCTEST(c, eptr);
1701     switch(c)
1702     {
1703     default: break;
1704     case 0x09: /* HT */
1705     case 0x20: /* SPACE */
1706     case 0xa0: /* NBSP */
1707     case 0x1680: /* OGHAM SPACE MARK */
1708     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1709     case 0x2000: /* EN QUAD */
1710     case 0x2001: /* EM QUAD */
1711     case 0x2002: /* EN SPACE */
1712     case 0x2003: /* EM SPACE */
1713     case 0x2004: /* THREE-PER-EM SPACE */
1714     case 0x2005: /* FOUR-PER-EM SPACE */
1715     case 0x2006: /* SIX-PER-EM SPACE */
1716     case 0x2007: /* FIGURE SPACE */
1717     case 0x2008: /* PUNCTUATION SPACE */
1718     case 0x2009: /* THIN SPACE */
1719     case 0x200A: /* HAIR SPACE */
1720     case 0x202f: /* NARROW NO-BREAK SPACE */
1721     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1722     case 0x3000: /* IDEOGRAPHIC SPACE */
1723     RRETURN(MATCH_NOMATCH);
1724     }
1725     ecode++;
1726     break;
1727    
1728     case OP_HSPACE:
1729 ph10 443 if (eptr >= md->end_subject)
1730 ph10 428 {
1731 ph10 443 SCHECK_PARTIAL();
1732 ph10 428 RRETURN(MATCH_NOMATCH);
1733 ph10 443 }
1734 ph10 178 GETCHARINCTEST(c, eptr);
1735     switch(c)
1736     {
1737     default: RRETURN(MATCH_NOMATCH);
1738     case 0x09: /* HT */
1739     case 0x20: /* SPACE */
1740     case 0xa0: /* NBSP */
1741     case 0x1680: /* OGHAM SPACE MARK */
1742     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1743     case 0x2000: /* EN QUAD */
1744     case 0x2001: /* EM QUAD */
1745     case 0x2002: /* EN SPACE */
1746     case 0x2003: /* EM SPACE */
1747     case 0x2004: /* THREE-PER-EM SPACE */
1748     case 0x2005: /* FOUR-PER-EM SPACE */
1749     case 0x2006: /* SIX-PER-EM SPACE */
1750     case 0x2007: /* FIGURE SPACE */
1751     case 0x2008: /* PUNCTUATION SPACE */
1752     case 0x2009: /* THIN SPACE */
1753     case 0x200A: /* HAIR SPACE */
1754     case 0x202f: /* NARROW NO-BREAK SPACE */
1755     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1756     case 0x3000: /* IDEOGRAPHIC SPACE */
1757     break;
1758     }
1759     ecode++;
1760     break;
1761    
1762     case OP_NOT_VSPACE:
1763 ph10 443 if (eptr >= md->end_subject)
1764 ph10 428 {
1765 ph10 443 SCHECK_PARTIAL();
1766 ph10 428 RRETURN(MATCH_NOMATCH);
1767 ph10 443 }
1768 ph10 178 GETCHARINCTEST(c, eptr);
1769     switch(c)
1770     {
1771     default: break;
1772     case 0x0a: /* LF */
1773     case 0x0b: /* VT */
1774     case 0x0c: /* FF */
1775     case 0x0d: /* CR */
1776     case 0x85: /* NEL */
1777     case 0x2028: /* LINE SEPARATOR */
1778     case 0x2029: /* PARAGRAPH SEPARATOR */
1779     RRETURN(MATCH_NOMATCH);
1780     }
1781     ecode++;
1782     break;
1783    
1784     case OP_VSPACE:
1785 ph10 443 if (eptr >= md->end_subject)
1786 ph10 428 {
1787 ph10 443 SCHECK_PARTIAL();
1788 ph10 428 RRETURN(MATCH_NOMATCH);
1789 ph10 443 }
1790 ph10 178 GETCHARINCTEST(c, eptr);
1791     switch(c)
1792     {
1793     default: RRETURN(MATCH_NOMATCH);
1794     case 0x0a: /* LF */
1795     case 0x0b: /* VT */
1796     case 0x0c: /* FF */
1797     case 0x0d: /* CR */
1798     case 0x85: /* NEL */
1799     case 0x2028: /* LINE SEPARATOR */
1800     case 0x2029: /* PARAGRAPH SEPARATOR */
1801     break;
1802     }
1803     ecode++;
1804     break;
1805    
1806 nigel 77 #ifdef SUPPORT_UCP
1807     /* Check the next character by Unicode property. We will get here only
1808     if the support is in the binary; otherwise a compile-time error occurs. */
1809    
1810     case OP_PROP:
1811     case OP_NOTPROP:
1812 ph10 443 if (eptr >= md->end_subject)
1813 ph10 428 {
1814 ph10 443 SCHECK_PARTIAL();
1815 ph10 428 RRETURN(MATCH_NOMATCH);
1816 ph10 443 }
1817 nigel 77 GETCHARINCTEST(c, eptr);
1818     {
1819 ph10 384 const ucd_record *prop = GET_UCD(c);
1820 nigel 77
1821 nigel 87 switch(ecode[1])
1822     {
1823     case PT_ANY:
1824     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1825     break;
1826 nigel 77
1827 nigel 87 case PT_LAMP:
1828 ph10 349 if ((prop->chartype == ucp_Lu ||
1829     prop->chartype == ucp_Ll ||
1830     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1831 nigel 77 RRETURN(MATCH_NOMATCH);
1832 nigel 87 break;
1833    
1834     case PT_GC:
1835 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1836 nigel 77 RRETURN(MATCH_NOMATCH);
1837 nigel 87 break;
1838    
1839     case PT_PC:
1840 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1841 nigel 87 RRETURN(MATCH_NOMATCH);
1842     break;
1843    
1844     case PT_SC:
1845 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
1846 nigel 87 RRETURN(MATCH_NOMATCH);
1847     break;
1848    
1849     default:
1850     RRETURN(PCRE_ERROR_INTERNAL);
1851 nigel 77 }
1852 nigel 87
1853     ecode += 3;
1854 nigel 77 }
1855     break;
1856    
1857     /* Match an extended Unicode sequence. We will get here only if the support
1858     is in the binary; otherwise a compile-time error occurs. */
1859    
1860     case OP_EXTUNI:
1861 ph10 443 if (eptr >= md->end_subject)
1862 ph10 428 {
1863 ph10 443 SCHECK_PARTIAL();
1864 ph10 428 RRETURN(MATCH_NOMATCH);
1865 ph10 443 }
1866 nigel 77 GETCHARINCTEST(c, eptr);
1867     {
1868 ph10 349 int category = UCD_CATEGORY(c);
1869 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1870     while (eptr < md->end_subject)
1871     {
1872     int len = 1;
1873     if (!utf8) c = *eptr; else
1874     {
1875     GETCHARLEN(c, eptr, len);
1876     }
1877 ph10 349 category = UCD_CATEGORY(c);
1878 nigel 77 if (category != ucp_M) break;
1879     eptr += len;
1880     }
1881     }
1882     ecode++;
1883     break;
1884     #endif
1885    
1886    
1887     /* Match a back reference, possibly repeatedly. Look past the end of the
1888     item to see if there is repeat information following. The code is similar
1889     to that for character classes, but repeated for efficiency. Then obey
1890     similar code to character type repeats - written out again for speed.
1891     However, if the referenced string is the empty string, always treat
1892     it as matched, any number of times (otherwise there could be infinite
1893     loops). */
1894    
1895     case OP_REF:
1896     {
1897     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1898 ph10 345 ecode += 3;
1899    
1900 ph10 336 /* If the reference is unset, there are two possibilities:
1901 ph10 345
1902 ph10 336 (a) In the default, Perl-compatible state, set the length to be longer
1903     than the amount of subject left; this ensures that every attempt at a
1904     match fails. We can't just fail here, because of the possibility of
1905     quantifiers with zero minima.
1906 ph10 345
1907     (b) If the JavaScript compatibility flag is set, set the length to zero
1908     so that the back reference matches an empty string.
1909    
1910     Otherwise, set the length to the length of what was matched by the
1911 ph10 336 referenced subpattern. */
1912 ph10 345
1913 ph10 336 if (offset >= offset_top || md->offset_vector[offset] < 0)
1914 ph10 345 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1915 ph10 336 else
1916     length = md->offset_vector[offset+1] - md->offset_vector[offset];
1917 nigel 77
1918     /* Set up for repetition, or handle the non-repeated case */
1919    
1920     switch (*ecode)
1921     {
1922     case OP_CRSTAR:
1923     case OP_CRMINSTAR:
1924     case OP_CRPLUS:
1925     case OP_CRMINPLUS:
1926     case OP_CRQUERY:
1927     case OP_CRMINQUERY:
1928     c = *ecode++ - OP_CRSTAR;
1929     minimize = (c & 1) != 0;
1930     min = rep_min[c]; /* Pick up values from tables; */
1931     max = rep_max[c]; /* zero for max => infinity */
1932     if (max == 0) max = INT_MAX;
1933     break;
1934    
1935     case OP_CRRANGE:
1936     case OP_CRMINRANGE:
1937     minimize = (*ecode == OP_CRMINRANGE);
1938     min = GET2(ecode, 1);
1939     max = GET2(ecode, 3);
1940     if (max == 0) max = INT_MAX;
1941     ecode += 5;
1942     break;
1943    
1944     default: /* No repeat follows */
1945 ph10 443 if (!match_ref(offset, eptr, length, md, ims))
1946 ph10 428 {
1947 ph10 443 CHECK_PARTIAL();
1948 ph10 428 RRETURN(MATCH_NOMATCH);
1949 ph10 443 }
1950 nigel 77 eptr += length;
1951     continue; /* With the main loop */
1952     }
1953    
1954     /* If the length of the reference is zero, just continue with the
1955     main loop. */
1956 ph10 443
1957 nigel 77 if (length == 0) continue;
1958    
1959     /* First, ensure the minimum number of matches are present. We get back
1960     the length of the reference string explicitly rather than passing the
1961     address of eptr, so that eptr can be a register variable. */
1962    
1963     for (i = 1; i <= min; i++)
1964     {
1965 ph10 427 if (!match_ref(offset, eptr, length, md, ims))
1966 ph10 426 {
1967 ph10 427 CHECK_PARTIAL();
1968 ph10 426 RRETURN(MATCH_NOMATCH);
1969 ph10 427 }
1970 nigel 77 eptr += length;
1971     }
1972    
1973     /* If min = max, continue at the same level without recursion.
1974     They are not both allowed to be zero. */
1975    
1976     if (min == max) continue;
1977    
1978     /* If minimizing, keep trying and advancing the pointer */
1979    
1980     if (minimize)
1981     {
1982     for (fi = min;; fi++)
1983     {
1984 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1985 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1986 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
1987     if (!match_ref(offset, eptr, length, md, ims))
1988 ph10 426 {
1989 ph10 427 CHECK_PARTIAL();
1990 nigel 77 RRETURN(MATCH_NOMATCH);
1991 ph10 427 }
1992 nigel 77 eptr += length;
1993     }
1994     /* Control never gets here */
1995     }
1996    
1997     /* If maximizing, find the longest string and work backwards */
1998    
1999     else
2000     {
2001     pp = eptr;
2002     for (i = min; i < max; i++)
2003     {
2004     if (!match_ref(offset, eptr, length, md, ims)) break;
2005     eptr += length;
2006     }
2007     while (eptr >= pp)
2008     {
2009 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2010 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2011     eptr -= length;
2012     }
2013     RRETURN(MATCH_NOMATCH);
2014     }
2015     }
2016     /* Control never gets here */
2017    
2018     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2019     used when all the characters in the class have values in the range 0-255,
2020     and either the matching is caseful, or the characters are in the range
2021     0-127 when UTF-8 processing is enabled. The only difference between
2022     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2023     encountered.
2024    
2025     First, look past the end of the item to see if there is repeat information
2026     following. Then obey similar code to character type repeats - written out
2027     again for speed. */
2028    
2029     case OP_NCLASS:
2030     case OP_CLASS:
2031     {
2032     data = ecode + 1; /* Save for matching */
2033     ecode += 33; /* Advance past the item */
2034    
2035     switch (*ecode)
2036     {
2037     case OP_CRSTAR:
2038     case OP_CRMINSTAR:
2039     case OP_CRPLUS:
2040     case OP_CRMINPLUS:
2041     case OP_CRQUERY:
2042     case OP_CRMINQUERY:
2043     c = *ecode++ - OP_CRSTAR;
2044     minimize = (c & 1) != 0;
2045     min = rep_min[c]; /* Pick up values from tables; */
2046     max = rep_max[c]; /* zero for max => infinity */
2047     if (max == 0) max = INT_MAX;
2048     break;
2049    
2050     case OP_CRRANGE:
2051     case OP_CRMINRANGE:
2052     minimize = (*ecode == OP_CRMINRANGE);
2053     min = GET2(ecode, 1);
2054     max = GET2(ecode, 3);
2055     if (max == 0) max = INT_MAX;
2056     ecode += 5;
2057     break;
2058    
2059     default: /* No repeat follows */
2060     min = max = 1;
2061     break;
2062     }
2063    
2064     /* First, ensure the minimum number of matches are present. */
2065    
2066     #ifdef SUPPORT_UTF8
2067     /* UTF-8 mode */
2068     if (utf8)
2069     {
2070     for (i = 1; i <= min; i++)
2071     {
2072 ph10 427 if (eptr >= md->end_subject)
2073 ph10 426 {
2074 ph10 428 SCHECK_PARTIAL();
2075 ph10 426 RRETURN(MATCH_NOMATCH);
2076 ph10 427 }
2077 nigel 77 GETCHARINC(c, eptr);
2078     if (c > 255)
2079     {
2080     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2081     }
2082     else
2083     {
2084     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2085     }
2086     }
2087     }
2088     else
2089     #endif
2090     /* Not UTF-8 mode */
2091     {
2092     for (i = 1; i <= min; i++)
2093     {
2094 ph10 427 if (eptr >= md->end_subject)
2095 ph10 426 {
2096 ph10 428 SCHECK_PARTIAL();
2097 ph10 426 RRETURN(MATCH_NOMATCH);
2098 ph10 427 }
2099 nigel 77 c = *eptr++;
2100     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2101     }
2102     }
2103    
2104     /* If max == min we can continue with the main loop without the
2105     need to recurse. */
2106    
2107     if (min == max) continue;
2108    
2109     /* If minimizing, keep testing the rest of the expression and advancing
2110     the pointer while it matches the class. */
2111    
2112     if (minimize)
2113     {
2114     #ifdef SUPPORT_UTF8
2115     /* UTF-8 mode */
2116     if (utf8)
2117     {
2118     for (fi = min;; fi++)
2119     {
2120 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2121 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2122 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2123 ph10 427 if (eptr >= md->end_subject)
2124 ph10 426 {
2125 ph10 427 SCHECK_PARTIAL();
2126 ph10 426 RRETURN(MATCH_NOMATCH);
2127 ph10 427 }
2128 nigel 77 GETCHARINC(c, eptr);
2129     if (c > 255)
2130     {
2131     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2132     }
2133     else
2134     {
2135     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2136     }
2137     }
2138     }
2139     else
2140     #endif
2141     /* Not UTF-8 mode */
2142     {
2143     for (fi = min;; fi++)
2144     {
2145 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2146 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2147 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2148 ph10 427 if (eptr >= md->end_subject)
2149 ph10 426 {
2150 ph10 427 SCHECK_PARTIAL();
2151 ph10 426 RRETURN(MATCH_NOMATCH);
2152 ph10 427 }
2153 nigel 77 c = *eptr++;
2154     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2155     }
2156     }
2157     /* Control never gets here */
2158     }
2159    
2160     /* If maximizing, find the longest possible run, then work backwards. */
2161    
2162     else
2163     {
2164     pp = eptr;
2165    
2166     #ifdef SUPPORT_UTF8
2167     /* UTF-8 mode */
2168     if (utf8)
2169     {
2170     for (i = min; i < max; i++)
2171     {
2172     int len = 1;
2173     if (eptr >= md->end_subject) break;
2174     GETCHARLEN(c, eptr, len);
2175     if (c > 255)
2176     {
2177     if (op == OP_CLASS) break;
2178     }
2179     else
2180     {
2181     if ((data[c/8] & (1 << (c&7))) == 0) break;
2182     }
2183     eptr += len;
2184     }
2185     for (;;)
2186     {
2187 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2188 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2189     if (eptr-- == pp) break; /* Stop if tried at original pos */
2190     BACKCHAR(eptr);
2191     }
2192     }
2193     else
2194     #endif
2195     /* Not UTF-8 mode */
2196     {
2197     for (i = min; i < max; i++)
2198     {
2199     if (eptr >= md->end_subject) break;
2200     c = *eptr;
2201     if ((data[c/8] & (1 << (c&7))) == 0) break;
2202     eptr++;
2203     }
2204     while (eptr >= pp)
2205     {
2206 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2207 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2208 nigel 77 eptr--;
2209     }
2210     }
2211    
2212     RRETURN(MATCH_NOMATCH);
2213     }
2214     }
2215     /* Control never gets here */
2216    
2217    
2218     /* Match an extended character class. This opcode is encountered only
2219 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2220     mode, because Unicode properties are supported in non-UTF-8 mode. */
2221 nigel 77
2222     #ifdef SUPPORT_UTF8
2223     case OP_XCLASS:
2224     {
2225     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2226     ecode += GET(ecode, 1); /* Advance past the item */
2227    
2228     switch (*ecode)
2229     {
2230     case OP_CRSTAR:
2231     case OP_CRMINSTAR:
2232     case OP_CRPLUS:
2233     case OP_CRMINPLUS:
2234     case OP_CRQUERY:
2235     case OP_CRMINQUERY:
2236     c = *ecode++ - OP_CRSTAR;
2237     minimize = (c & 1) != 0;
2238     min = rep_min[c]; /* Pick up values from tables; */
2239     max = rep_max[c]; /* zero for max => infinity */
2240     if (max == 0) max = INT_MAX;
2241     break;
2242    
2243     case OP_CRRANGE:
2244     case OP_CRMINRANGE:
2245     minimize = (*ecode == OP_CRMINRANGE);
2246     min = GET2(ecode, 1);
2247     max = GET2(ecode, 3);
2248     if (max == 0) max = INT_MAX;
2249     ecode += 5;
2250     break;
2251    
2252     default: /* No repeat follows */
2253     min = max = 1;
2254     break;
2255     }
2256    
2257     /* First, ensure the minimum number of matches are present. */
2258    
2259     for (i = 1; i <= min; i++)
2260     {
2261 ph10 427 if (eptr >= md->end_subject)
2262 ph10 426 {
2263     SCHECK_PARTIAL();
2264     RRETURN(MATCH_NOMATCH);
2265 ph10 427 }
2266 ph10 384 GETCHARINCTEST(c, eptr);
2267 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2268     }
2269    
2270     /* If max == min we can continue with the main loop without the
2271     need to recurse. */
2272    
2273     if (min == max) continue;
2274    
2275     /* If minimizing, keep testing the rest of the expression and advancing
2276     the pointer while it matches the class. */
2277    
2278     if (minimize)
2279     {
2280     for (fi = min;; fi++)
2281     {
2282 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2283 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2284 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2285 ph10 427 if (eptr >= md->end_subject)
2286 ph10 426 {
2287 ph10 427 SCHECK_PARTIAL();
2288 ph10 426 RRETURN(MATCH_NOMATCH);
2289 ph10 427 }
2290 ph10 384 GETCHARINCTEST(c, eptr);
2291 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2292     }
2293     /* Control never gets here */
2294     }
2295    
2296     /* If maximizing, find the longest possible run, then work backwards. */
2297    
2298     else
2299     {
2300     pp = eptr;
2301     for (i = min; i < max; i++)
2302     {
2303     int len = 1;
2304     if (eptr >= md->end_subject) break;
2305 ph10 384 GETCHARLENTEST(c, eptr, len);
2306 nigel 77 if (!_pcre_xclass(c, data)) break;
2307     eptr += len;
2308     }
2309     for(;;)
2310     {
2311 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2312 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2313     if (eptr-- == pp) break; /* Stop if tried at original pos */
2314 ph10 214 if (utf8) BACKCHAR(eptr);
2315 nigel 77 }
2316     RRETURN(MATCH_NOMATCH);
2317     }
2318    
2319     /* Control never gets here */
2320     }
2321     #endif /* End of XCLASS */
2322    
2323     /* Match a single character, casefully */
2324    
2325     case OP_CHAR:
2326     #ifdef SUPPORT_UTF8
2327     if (utf8)
2328     {
2329     length = 1;
2330     ecode++;
2331     GETCHARLEN(fc, ecode, length);
2332 ph10 443 if (length > md->end_subject - eptr)
2333 ph10 428 {
2334     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2335     RRETURN(MATCH_NOMATCH);
2336 ph10 443 }
2337 nigel 77 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2338     }
2339     else
2340     #endif
2341    
2342     /* Non-UTF-8 mode */
2343     {
2344 ph10 443 if (md->end_subject - eptr < 1)
2345 ph10 428 {
2346     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2347     RRETURN(MATCH_NOMATCH);
2348 ph10 443 }
2349 nigel 77 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2350     ecode += 2;
2351     }
2352     break;
2353    
2354     /* Match a single character, caselessly */
2355    
2356     case OP_CHARNC:
2357     #ifdef SUPPORT_UTF8
2358     if (utf8)
2359     {
2360     length = 1;
2361     ecode++;
2362     GETCHARLEN(fc, ecode, length);
2363    
2364 ph10 443 if (length > md->end_subject - eptr)
2365 ph10 428 {
2366     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2367     RRETURN(MATCH_NOMATCH);
2368 ph10 443 }
2369 nigel 77
2370     /* If the pattern character's value is < 128, we have only one byte, and
2371     can use the fast lookup table. */
2372    
2373     if (fc < 128)
2374     {
2375     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2376     }
2377    
2378     /* Otherwise we must pick up the subject character */
2379    
2380     else
2381     {
2382 nigel 93 unsigned int dc;
2383 nigel 77 GETCHARINC(dc, eptr);
2384     ecode += length;
2385    
2386     /* If we have Unicode property support, we can use it to test the other
2387 nigel 87 case of the character, if there is one. */
2388 nigel 77
2389     if (fc != dc)
2390     {
2391     #ifdef SUPPORT_UCP
2392 ph10 349 if (dc != UCD_OTHERCASE(fc))
2393 nigel 77 #endif
2394     RRETURN(MATCH_NOMATCH);
2395     }
2396     }
2397     }
2398     else
2399     #endif /* SUPPORT_UTF8 */
2400    
2401     /* Non-UTF-8 mode */
2402     {
2403 ph10 443 if (md->end_subject - eptr < 1)
2404 ph10 428 {
2405 ph10 443 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2406 ph10 428 RRETURN(MATCH_NOMATCH);
2407 ph10 443 }
2408 nigel 77 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2409     ecode += 2;
2410     }
2411     break;
2412    
2413 nigel 93 /* Match a single character repeatedly. */
2414 nigel 77
2415     case OP_EXACT:
2416     min = max = GET2(ecode, 1);
2417     ecode += 3;
2418     goto REPEATCHAR;
2419    
2420 nigel 93 case OP_POSUPTO:
2421     possessive = TRUE;
2422     /* Fall through */
2423    
2424 nigel 77 case OP_UPTO:
2425     case OP_MINUPTO:
2426     min = 0;
2427     max = GET2(ecode, 1);
2428     minimize = *ecode == OP_MINUPTO;
2429     ecode += 3;
2430     goto REPEATCHAR;
2431    
2432 nigel 93 case OP_POSSTAR:
2433     possessive = TRUE;
2434     min = 0;
2435     max = INT_MAX;
2436     ecode++;
2437     goto REPEATCHAR;
2438    
2439     case OP_POSPLUS:
2440     possessive = TRUE;
2441     min = 1;
2442     max = INT_MAX;
2443     ecode++;
2444     goto REPEATCHAR;
2445    
2446     case OP_POSQUERY:
2447     possessive = TRUE;
2448     min = 0;
2449     max = 1;
2450     ecode++;
2451     goto REPEATCHAR;
2452    
2453 nigel 77 case OP_STAR:
2454     case OP_MINSTAR:
2455     case OP_PLUS:
2456     case OP_MINPLUS:
2457     case OP_QUERY:
2458     case OP_MINQUERY:
2459     c = *ecode++ - OP_STAR;
2460     minimize = (c & 1) != 0;
2461 ph10 443
2462 nigel 77 min = rep_min[c]; /* Pick up values from tables; */
2463     max = rep_max[c]; /* zero for max => infinity */
2464     if (max == 0) max = INT_MAX;
2465    
2466 ph10 426 /* Common code for all repeated single-character matches. */
2467 nigel 77
2468     REPEATCHAR:
2469     #ifdef SUPPORT_UTF8
2470     if (utf8)
2471     {
2472     length = 1;
2473     charptr = ecode;
2474     GETCHARLEN(fc, ecode, length);
2475     ecode += length;
2476    
2477     /* Handle multibyte character matching specially here. There is
2478     support for caseless matching if UCP support is present. */
2479    
2480     if (length > 1)
2481     {
2482     #ifdef SUPPORT_UCP
2483 nigel 93 unsigned int othercase;
2484 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2485 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
2486 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2487 ph10 115 else oclength = 0;
2488 nigel 77 #endif /* SUPPORT_UCP */
2489    
2490     for (i = 1; i <= min; i++)
2491     {
2492 ph10 426 if (eptr <= md->end_subject - length &&
2493     memcmp(eptr, charptr, length) == 0) eptr += length;
2494 ph10 123 #ifdef SUPPORT_UCP
2495 ph10 426 else if (oclength > 0 &&
2496     eptr <= md->end_subject - oclength &&
2497     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2498     #endif /* SUPPORT_UCP */
2499 nigel 77 else
2500     {
2501 ph10 426 CHECK_PARTIAL();
2502     RRETURN(MATCH_NOMATCH);
2503 nigel 77 }
2504     }
2505    
2506     if (min == max) continue;
2507    
2508     if (minimize)
2509     {
2510     for (fi = min;; fi++)
2511     {
2512 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2513 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2514 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2515 ph10 426 if (eptr <= md->end_subject - length &&
2516     memcmp(eptr, charptr, length) == 0) eptr += length;
2517 ph10 123 #ifdef SUPPORT_UCP
2518 ph10 426 else if (oclength > 0 &&
2519     eptr <= md->end_subject - oclength &&
2520     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2521     #endif /* SUPPORT_UCP */
2522 nigel 77 else
2523     {
2524 ph10 426 CHECK_PARTIAL();
2525     RRETURN(MATCH_NOMATCH);
2526 nigel 77 }
2527     }
2528     /* Control never gets here */
2529     }
2530 nigel 93
2531     else /* Maximize */
2532 nigel 77 {
2533     pp = eptr;
2534     for (i = min; i < max; i++)
2535     {
2536 ph10 426 if (eptr <= md->end_subject - length &&
2537     memcmp(eptr, charptr, length) == 0) eptr += length;
2538 ph10 123 #ifdef SUPPORT_UCP
2539 ph10 426 else if (oclength > 0 &&
2540     eptr <= md->end_subject - oclength &&
2541     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2542     #endif /* SUPPORT_UCP */
2543 ph10 115 else break;
2544 nigel 77 }
2545 nigel 93
2546     if (possessive) continue;
2547 ph10 427
2548 ph10 120 for(;;)
2549 ph10 426 {
2550     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2551     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2552     if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
2553 ph10 115 #ifdef SUPPORT_UCP
2554 ph10 426 eptr--;
2555     BACKCHAR(eptr);
2556 ph10 123 #else /* without SUPPORT_UCP */
2557 ph10 426 eptr -= length;
2558 ph10 123 #endif /* SUPPORT_UCP */
2559 ph10 426 }
2560 nigel 77 }
2561     /* Control never gets here */
2562     }
2563    
2564     /* If the length of a UTF-8 character is 1, we fall through here, and
2565     obey the code as for non-UTF-8 characters below, though in this case the
2566     value of fc will always be < 128. */
2567     }
2568     else
2569     #endif /* SUPPORT_UTF8 */
2570    
2571     /* When not in UTF-8 mode, load a single-byte character. */
2572    
2573 ph10 426 fc = *ecode++;
2574 ph10 443
2575 nigel 77 /* The value of fc at this point is always less than 256, though we may or
2576     may not be in UTF-8 mode. The code is duplicated for the caseless and
2577     caseful cases, for speed, since matching characters is likely to be quite
2578     common. First, ensure the minimum number of matches are present. If min =
2579     max, continue at the same level without recursing. Otherwise, if
2580     minimizing, keep trying the rest of the expression and advancing one
2581     matching character if failing, up to the maximum. Alternatively, if
2582     maximizing, find the maximum number of characters and work backwards. */
2583    
2584     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2585     max, eptr));
2586    
2587     if ((ims & PCRE_CASELESS) != 0)
2588     {
2589     fc = md->lcc[fc];
2590     for (i = 1; i <= min; i++)
2591 ph10 426 {
2592     if (eptr >= md->end_subject)
2593     {
2594     SCHECK_PARTIAL();
2595     RRETURN(MATCH_NOMATCH);
2596     }
2597 nigel 77 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2598 ph10 426 }
2599 nigel 77 if (min == max) continue;
2600     if (minimize)
2601     {
2602     for (fi = min;; fi++)
2603     {
2604 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2605 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2606 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2607 ph10 426 if (eptr >= md->end_subject)
2608     {
2609 ph10 427 SCHECK_PARTIAL();
2610 ph10 426 RRETURN(MATCH_NOMATCH);
2611     }
2612     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2613 nigel 77 }
2614     /* Control never gets here */
2615     }
2616 nigel 93 else /* Maximize */
2617 nigel 77 {
2618     pp = eptr;
2619     for (i = min; i < max; i++)
2620     {
2621     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2622     eptr++;
2623     }
2624 ph10 427
2625 nigel 93 if (possessive) continue;
2626 ph10 427
2627 nigel 77 while (eptr >= pp)
2628     {
2629 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2630 nigel 77 eptr--;
2631     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2632     }
2633     RRETURN(MATCH_NOMATCH);
2634     }
2635     /* Control never gets here */
2636     }
2637    
2638     /* Caseful comparisons (includes all multi-byte characters) */
2639    
2640     else
2641     {
2642 ph10 427 for (i = 1; i <= min; i++)
2643 ph10 426 {
2644     if (eptr >= md->end_subject)
2645     {
2646     SCHECK_PARTIAL();
2647     RRETURN(MATCH_NOMATCH);
2648     }
2649     if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2650 ph10 427 }
2651 ph10 443
2652 nigel 77 if (min == max) continue;
2653 ph10 443
2654 nigel 77 if (minimize)
2655     {
2656     for (fi = min;; fi++)
2657     {
2658 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2659 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2660 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2661 ph10 426 if (eptr >= md->end_subject)
2662 ph10 427 {
2663 ph10 426 SCHECK_PARTIAL();
2664     RRETURN(MATCH_NOMATCH);
2665 ph10 427 }
2666 ph10 426 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2667 nigel 77 }
2668     /* Control never gets here */
2669     }
2670 nigel 93 else /* Maximize */
2671 nigel 77 {
2672     pp = eptr;
2673     for (i = min; i < max; i++)
2674     {
2675     if (eptr >= md->end_subject || fc != *eptr) break;
2676     eptr++;
2677     }
2678 nigel 93 if (possessive) continue;
2679 ph10 443
2680 nigel 77 while (eptr >= pp)
2681     {
2682 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2683 nigel 77 eptr--;
2684     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2685     }
2686     RRETURN(MATCH_NOMATCH);
2687     }
2688     }
2689     /* Control never gets here */
2690    
2691     /* Match a negated single one-byte character. The character we are
2692     checking can be multibyte. */
2693    
2694     case OP_NOT:
2695 ph10 443 if (eptr >= md->end_subject)
2696 ph10 428 {
2697 ph10 443 SCHECK_PARTIAL();
2698 ph10 428 RRETURN(MATCH_NOMATCH);
2699 ph10 443 }
2700 nigel 77 ecode++;
2701     GETCHARINCTEST(c, eptr);
2702     if ((ims & PCRE_CASELESS) != 0)
2703     {
2704     #ifdef SUPPORT_UTF8
2705     if (c < 256)
2706     #endif
2707     c = md->lcc[c];
2708     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2709     }
2710     else
2711     {
2712     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2713     }
2714     break;
2715    
2716     /* Match a negated single one-byte character repeatedly. This is almost a
2717     repeat of the code for a repeated single character, but I haven't found a
2718     nice way of commoning these up that doesn't require a test of the
2719     positive/negative option for each character match. Maybe that wouldn't add
2720     very much to the time taken, but character matching *is* what this is all
2721     about... */
2722    
2723     case OP_NOTEXACT:
2724     min = max = GET2(ecode, 1);
2725     ecode += 3;
2726     goto REPEATNOTCHAR;
2727    
2728     case OP_NOTUPTO:
2729     case OP_NOTMINUPTO:
2730     min = 0;
2731     max = GET2(ecode, 1);
2732     minimize = *ecode == OP_NOTMINUPTO;
2733     ecode += 3;
2734     goto REPEATNOTCHAR;
2735    
2736 nigel 93 case OP_NOTPOSSTAR:
2737     possessive = TRUE;
2738     min = 0;
2739     max = INT_MAX;
2740     ecode++;
2741     goto REPEATNOTCHAR;
2742    
2743     case OP_NOTPOSPLUS:
2744     possessive = TRUE;
2745     min = 1;
2746     max = INT_MAX;
2747     ecode++;
2748     goto REPEATNOTCHAR;
2749    
2750     case OP_NOTPOSQUERY:
2751     possessive = TRUE;
2752     min = 0;
2753     max = 1;
2754     ecode++;
2755     goto REPEATNOTCHAR;
2756    
2757     case OP_NOTPOSUPTO:
2758     possessive = TRUE;
2759     min = 0;
2760     max = GET2(ecode, 1);
2761     ecode += 3;
2762     goto REPEATNOTCHAR;
2763    
2764 nigel 77 case OP_NOTSTAR:
2765     case OP_NOTMINSTAR:
2766     case OP_NOTPLUS:
2767     case OP_NOTMINPLUS:
2768     case OP_NOTQUERY:
2769     case OP_NOTMINQUERY:
2770     c = *ecode++ - OP_NOTSTAR;
2771     minimize = (c & 1) != 0;
2772     min = rep_min[c]; /* Pick up values from tables; */
2773     max = rep_max[c]; /* zero for max => infinity */
2774     if (max == 0) max = INT_MAX;
2775    
2776 ph10 426 /* Common code for all repeated single-byte matches. */
2777 nigel 77
2778     REPEATNOTCHAR:
2779     fc = *ecode++;
2780    
2781     /* The code is duplicated for the caseless and caseful cases, for speed,
2782     since matching characters is likely to be quite common. First, ensure the
2783     minimum number of matches are present. If min = max, continue at the same
2784     level without recursing. Otherwise, if minimizing, keep trying the rest of
2785     the expression and advancing one matching character if failing, up to the
2786     maximum. Alternatively, if maximizing, find the maximum number of
2787     characters and work backwards. */
2788    
2789     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2790     max, eptr));
2791    
2792     if ((ims & PCRE_CASELESS) != 0)
2793     {
2794     fc = md->lcc[fc];
2795    
2796     #ifdef SUPPORT_UTF8
2797     /* UTF-8 mode */
2798     if (utf8)
2799     {
2800 nigel 93 register unsigned int d;
2801 nigel 77 for (i = 1; i <= min; i++)
2802     {
2803 ph10 426 if (eptr >= md->end_subject)
2804     {
2805     SCHECK_PARTIAL();
2806 ph10 427 RRETURN(MATCH_NOMATCH);
2807     }
2808 nigel 77 GETCHARINC(d, eptr);
2809     if (d < 256) d = md->lcc[d];
2810     if (fc == d) RRETURN(MATCH_NOMATCH);
2811     }
2812     }
2813     else
2814     #endif
2815    
2816     /* Not UTF-8 mode */
2817     {
2818     for (i = 1; i <= min; i++)
2819 ph10 426 {
2820     if (eptr >= md->end_subject)
2821     {
2822     SCHECK_PARTIAL();
2823 ph10 427 RRETURN(MATCH_NOMATCH);
2824     }
2825 nigel 77 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2826 ph10 427 }
2827 nigel 77 }
2828    
2829     if (min == max) continue;
2830    
2831     if (minimize)
2832     {
2833     #ifdef SUPPORT_UTF8
2834     /* UTF-8 mode */
2835     if (utf8)
2836     {
2837 nigel 93 register unsigned int d;
2838 nigel 77 for (fi = min;; fi++)
2839     {
2840 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2841 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2842 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2843 ph10 427 if (eptr >= md->end_subject)
2844 ph10 426 {
2845 ph10 427 SCHECK_PARTIAL();
2846 ph10 426 RRETURN(MATCH_NOMATCH);
2847 ph10 427 }
2848 nigel 77 GETCHARINC(d, eptr);
2849     if (d < 256) d = md->lcc[d];
2850 ph10 366 if (fc == d) RRETURN(MATCH_NOMATCH);
2851 nigel 77 }
2852     }
2853     else
2854     #endif
2855     /* Not UTF-8 mode */
2856     {
2857     for (fi = min;; fi++)
2858     {
2859 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2860 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2861 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2862 ph10 426 if (eptr >= md->end_subject)
2863     {
2864     SCHECK_PARTIAL();
2865     RRETURN(MATCH_NOMATCH);
2866     }
2867     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2868 nigel 77 }
2869     }
2870     /* Control never gets here */
2871     }
2872    
2873     /* Maximize case */
2874    
2875     else
2876     {
2877     pp = eptr;
2878    
2879     #ifdef SUPPORT_UTF8
2880     /* UTF-8 mode */
2881     if (utf8)
2882     {
2883 nigel 93 register unsigned int d;
2884 nigel 77 for (i = min; i < max; i++)
2885     {
2886     int len = 1;
2887     if (eptr >= md->end_subject) break;
2888     GETCHARLEN(d, eptr, len);
2889     if (d < 256) d = md->lcc[d];
2890     if (fc == d) break;
2891     eptr += len;
2892     }
2893 nigel 93 if (possessive) continue;
2894     for(;;)
2895 nigel 77 {
2896 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2897 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2898     if (eptr-- == pp) break; /* Stop if tried at original pos */
2899     BACKCHAR(eptr);
2900     }
2901     }
2902     else
2903     #endif
2904     /* Not UTF-8 mode */
2905     {
2906     for (i = min; i < max; i++)
2907     {
2908     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2909     eptr++;
2910     }
2911 nigel 93 if (possessive) continue;
2912 nigel 77 while (eptr >= pp)
2913     {
2914 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2915 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2916     eptr--;
2917     }
2918     }
2919    
2920     RRETURN(MATCH_NOMATCH);
2921     }
2922     /* Control never gets here */
2923     }
2924    
2925     /* Caseful comparisons */
2926    
2927     else
2928     {
2929     #ifdef SUPPORT_UTF8
2930     /* UTF-8 mode */
2931     if (utf8)
2932     {
2933 nigel 93 register unsigned int d;
2934 nigel 77 for (i = 1; i <= min; i++)
2935     {
2936 ph10 426 if (eptr >= md->end_subject)
2937     {
2938     SCHECK_PARTIAL();
2939 ph10 427 RRETURN(MATCH_NOMATCH);
2940     }
2941 nigel 77 GETCHARINC(d, eptr);
2942     if (fc == d) RRETURN(MATCH_NOMATCH);
2943     }
2944     }
2945     else
2946     #endif
2947     /* Not UTF-8 mode */
2948     {
2949     for (i = 1; i <= min; i++)
2950 ph10 426 {
2951     if (eptr >= md->end_subject)
2952     {
2953     SCHECK_PARTIAL();
2954 ph10 427 RRETURN(MATCH_NOMATCH);
2955     }
2956 nigel 77 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2957 ph10 427 }
2958 nigel 77 }
2959    
2960     if (min == max) continue;
2961    
2962     if (minimize)
2963     {
2964     #ifdef SUPPORT_UTF8
2965     /* UTF-8 mode */
2966     if (utf8)
2967     {
2968 nigel 93 register unsigned int d;
2969 nigel 77 for (fi = min;; fi++)
2970     {
2971 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2972 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2973 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2974 ph10 427 if (eptr >= md->end_subject)
2975 ph10 426 {
2976 ph10 427 SCHECK_PARTIAL();
2977 ph10 426 RRETURN(MATCH_NOMATCH);
2978 ph10 427 }
2979 nigel 77 GETCHARINC(d, eptr);
2980 ph10 371 if (fc == d) RRETURN(MATCH_NOMATCH);
2981 nigel 77 }
2982     }
2983     else
2984     #endif
2985     /* Not UTF-8 mode */
2986     {
2987     for (fi = min;; fi++)
2988     {
2989 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2990 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2991 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2992 ph10 426 if (eptr >= md->end_subject)
2993     {
2994     SCHECK_PARTIAL();
2995     RRETURN(MATCH_NOMATCH);
2996 ph10 427 }
2997 ph10 426 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2998 nigel 77 }
2999     }
3000     /* Control never gets here */
3001     }
3002    
3003     /* Maximize case */
3004    
3005     else
3006     {
3007     pp = eptr;
3008    
3009     #ifdef SUPPORT_UTF8
3010     /* UTF-8 mode */
3011     if (utf8)
3012     {
3013 nigel 93 register unsigned int d;
3014 nigel 77 for (i = min; i < max; i++)
3015     {
3016     int len = 1;
3017     if (eptr >= md->end_subject) break;
3018     GETCHARLEN(d, eptr, len);
3019     if (fc == d) break;
3020     eptr += len;
3021     }
3022 nigel 93 if (possessive) continue;
3023 nigel 77 for(;;)
3024     {
3025 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3026 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3027     if (eptr-- == pp) break; /* Stop if tried at original pos */
3028     BACKCHAR(eptr);
3029     }
3030     }
3031     else
3032     #endif
3033     /* Not UTF-8 mode */
3034     {
3035     for (i = min; i < max; i++)
3036     {
3037     if (eptr >= md->end_subject || fc == *eptr) break;
3038     eptr++;
3039     }
3040 nigel 93 if (possessive) continue;
3041 nigel 77 while (eptr >= pp)
3042     {
3043 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3044 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3045     eptr--;
3046     }
3047     }
3048    
3049     RRETURN(MATCH_NOMATCH);
3050     }
3051     }
3052     /* Control never gets here */
3053    
3054     /* Match a single character type repeatedly; several different opcodes
3055     share code. This is very similar to the code for single characters, but we
3056     repeat it in the interests of efficiency. */
3057    
3058     case OP_TYPEEXACT:
3059     min = max = GET2(ecode, 1);
3060     minimize = TRUE;
3061     ecode += 3;
3062     goto REPEATTYPE;
3063    
3064     case OP_TYPEUPTO:
3065     case OP_TYPEMINUPTO:
3066     min = 0;
3067     max = GET2(ecode, 1);
3068     minimize = *ecode == OP_TYPEMINUPTO;
3069     ecode += 3;
3070     goto REPEATTYPE;
3071    
3072 nigel 93 case OP_TYPEPOSSTAR:
3073     possessive = TRUE;
3074     min = 0;
3075     max = INT_MAX;
3076     ecode++;
3077     goto REPEATTYPE;
3078    
3079     case OP_TYPEPOSPLUS:
3080     possessive = TRUE;
3081     min = 1;
3082     max = INT_MAX;
3083     ecode++;
3084     goto REPEATTYPE;
3085    
3086     case OP_TYPEPOSQUERY:
3087     possessive = TRUE;
3088     min = 0;
3089     max = 1;
3090     ecode++;
3091     goto REPEATTYPE;
3092    
3093     case OP_TYPEPOSUPTO:
3094     possessive = TRUE;
3095     min = 0;
3096     max = GET2(ecode, 1);
3097     ecode += 3;
3098     goto REPEATTYPE;
3099    
3100 nigel 77 case OP_TYPESTAR:
3101     case OP_TYPEMINSTAR:
3102     case OP_TYPEPLUS:
3103     case OP_TYPEMINPLUS:
3104     case OP_TYPEQUERY:
3105     case OP_TYPEMINQUERY:
3106     c = *ecode++ - OP_TYPESTAR;
3107     minimize = (c & 1) != 0;
3108     min = rep_min[c]; /* Pick up values from tables; */
3109     max = rep_max[c]; /* zero for max => infinity */
3110     if (max == 0) max = INT_MAX;
3111    
3112     /* Common code for all repeated single character type matches. Note that
3113     in UTF-8 mode, '.' matches a character of any length, but for the other
3114     character types, the valid characters are all one-byte long. */
3115    
3116     REPEATTYPE:
3117     ctype = *ecode++; /* Code for the character type */
3118    
3119     #ifdef SUPPORT_UCP
3120     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3121     {
3122     prop_fail_result = ctype == OP_NOTPROP;
3123     prop_type = *ecode++;
3124 nigel 87 prop_value = *ecode++;
3125 nigel 77 }
3126     else prop_type = -1;
3127     #endif
3128    
3129     /* First, ensure the minimum number of matches are present. Use inline
3130     code for maximizing the speed, and do the type test once at the start
3131 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3132 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3133     and single-bytes. */
3134    
3135     if (min > 0)
3136     {
3137     #ifdef SUPPORT_UCP
3138 nigel 87 if (prop_type >= 0)
3139 nigel 77 {
3140 nigel 87 switch(prop_type)
3141 nigel 77 {
3142 nigel 87 case PT_ANY:
3143     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3144     for (i = 1; i <= min; i++)
3145     {
3146 ph10 427 if (eptr >= md->end_subject)
3147 ph10 426 {
3148 ph10 427 SCHECK_PARTIAL();
3149 ph10 426 RRETURN(MATCH_NOMATCH);
3150 ph10 427 }
3151 ph10 184 GETCHARINCTEST(c, eptr);
3152 nigel 87 }
3153     break;
3154    
3155     case PT_LAMP:
3156     for (i = 1; i <= min; i++)
3157     {
3158 ph10 427 if (eptr >= md->end_subject)
3159 ph10 426 {
3160 ph10 427 SCHECK_PARTIAL();
3161 ph10 426 RRETURN(MATCH_NOMATCH);
3162 ph10 427 }
3163 ph10 184 GETCHARINCTEST(c, eptr);
3164 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3165 nigel 87 if ((prop_chartype == ucp_Lu ||
3166     prop_chartype == ucp_Ll ||
3167     prop_chartype == ucp_Lt) == prop_fail_result)
3168     RRETURN(MATCH_NOMATCH);
3169     }
3170     break;
3171    
3172     case PT_GC:
3173     for (i = 1; i <= min; i++)
3174     {
3175 ph10 427 if (eptr >= md->end_subject)
3176 ph10 426 {
3177 ph10 427 SCHECK_PARTIAL();
3178 ph10 426 RRETURN(MATCH_NOMATCH);
3179 ph10 427 }
3180 ph10 184 GETCHARINCTEST(c, eptr);
3181 ph10 349 prop_category = UCD_CATEGORY(c);
3182 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3183     RRETURN(MATCH_NOMATCH);
3184     }
3185     break;
3186    
3187     case PT_PC:
3188     for (i = 1; i <= min; i++)
3189     {
3190 ph10 427 if (eptr >= md->end_subject)
3191 ph10 426 {
3192 ph10 427 SCHECK_PARTIAL();
3193 ph10 426 RRETURN(MATCH_NOMATCH);
3194 ph10 427 }
3195 ph10 184 GETCHARINCTEST(c, eptr);
3196 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3197 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3198     RRETURN(MATCH_NOMATCH);
3199     }
3200     break;
3201    
3202     case PT_SC:
3203     for (i = 1; i <= min; i++)
3204     {
3205 ph10 427 if (eptr >= md->end_subject)
3206 ph10 426 {
3207 ph10 427 SCHECK_PARTIAL();
3208 ph10 426 RRETURN(MATCH_NOMATCH);
3209 ph10 427 }
3210 ph10 184 GETCHARINCTEST(c, eptr);
3211 ph10 349 prop_script = UCD_SCRIPT(c);
3212 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3213     RRETURN(MATCH_NOMATCH);
3214     }
3215     break;
3216    
3217     default:
3218     RRETURN(PCRE_ERROR_INTERNAL);
3219 nigel 77 }
3220     }
3221    
3222     /* Match extended Unicode sequences. We will get here only if the
3223     support is in the binary; otherwise a compile-time error occurs. */
3224    
3225     else if (ctype == OP_EXTUNI)
3226     {
3227     for (i = 1; i <= min; i++)
3228     {
3229 ph10 427 if (eptr >= md->end_subject)
3230 ph10 426 {
3231 ph10 427 SCHECK_PARTIAL();
3232 ph10 426 RRETURN(MATCH_NOMATCH);
3233 ph10 427 }
3234 nigel 77 GETCHARINCTEST(c, eptr);
3235 ph10 349 prop_category = UCD_CATEGORY(c);
3236 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3237     while (eptr < md->end_subject)
3238     {
3239     int len = 1;
3240 ph10 426 if (!utf8) c = *eptr;
3241     else { GETCHARLEN(c, eptr, len); }
3242 ph10 349 prop_category = UCD_CATEGORY(c);
3243 nigel 77 if (prop_category != ucp_M) break;
3244     eptr += len;
3245     }
3246     }
3247     }
3248    
3249     else
3250     #endif /* SUPPORT_UCP */
3251    
3252     /* Handle all other cases when the coding is UTF-8 */
3253    
3254     #ifdef SUPPORT_UTF8
3255     if (utf8) switch(ctype)
3256     {
3257     case OP_ANY:
3258     for (i = 1; i <= min; i++)
3259     {
3260 ph10 426 if (eptr >= md->end_subject)
3261     {
3262 ph10 427 SCHECK_PARTIAL();
3263 nigel 77 RRETURN(MATCH_NOMATCH);
3264 ph10 427 }
3265 ph10 426 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3266 nigel 91 eptr++;
3267 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3268     }
3269     break;
3270    
3271 ph10 341 case OP_ALLANY:
3272     for (i = 1; i <= min; i++)
3273     {
3274 ph10 427 if (eptr >= md->end_subject)
3275 ph10 426 {
3276     SCHECK_PARTIAL();
3277     RRETURN(MATCH_NOMATCH);
3278 ph10 427 }
3279 ph10 341 eptr++;
3280     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3281     }
3282     break;
3283    
3284 nigel 77 case OP_ANYBYTE:
3285 ph10 427 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3286 nigel 77 eptr += min;
3287     break;
3288    
3289 nigel 93 case OP_ANYNL:
3290     for (i = 1; i <= min; i++)
3291     {
3292 ph10 427 if (eptr >= md->end_subject)
3293 ph10 426 {
3294     SCHECK_PARTIAL();
3295     RRETURN(MATCH_NOMATCH);
3296 ph10 427 }
3297 nigel 93 GETCHARINC(c, eptr);
3298     switch(c)
3299     {
3300     default: RRETURN(MATCH_NOMATCH);
3301     case 0x000d:
3302     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3303     break;
3304 ph10 231
3305 nigel 93 case 0x000a:
3306 ph10 231 break;
3307    
3308 nigel 93 case 0x000b:
3309     case 0x000c:
3310     case 0x0085:
3311     case 0x2028:
3312     case 0x2029:
3313 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3314 nigel 93 break;
3315     }
3316     }
3317     break;
3318    
3319 ph10 178 case OP_NOT_HSPACE:
3320     for (i = 1; i <= min; i++)
3321     {
3322 ph10 427 if (eptr >= md->end_subject)
3323 ph10 426 {
3324     SCHECK_PARTIAL();
3325     RRETURN(MATCH_NOMATCH);
3326 ph10 427 }
3327 ph10 178 GETCHARINC(c, eptr);
3328     switch(c)
3329     {
3330     default: break;
3331     case 0x09: /* HT */
3332     case 0x20: /* SPACE */
3333     case 0xa0: /* NBSP */
3334     case 0x1680: /* OGHAM SPACE MARK */
3335     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3336     case 0x2000: /* EN QUAD */
3337     case 0x2001: /* EM QUAD */
3338     case 0x2002: /* EN SPACE */
3339     case 0x2003: /* EM SPACE */
3340     case 0x2004: /* THREE-PER-EM SPACE */
3341     case 0x2005: /* FOUR-PER-EM SPACE */
3342     case 0x2006: /* SIX-PER-EM SPACE */
3343     case 0x2007: /* FIGURE SPACE */
3344     case 0x2008: /* PUNCTUATION SPACE */
3345     case 0x2009: /* THIN SPACE */
3346     case 0x200A: /* HAIR SPACE */
3347     case 0x202f: /* NARROW NO-BREAK SPACE */
3348     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3349     case 0x3000: /* IDEOGRAPHIC SPACE */
3350     RRETURN(MATCH_NOMATCH);
3351     }
3352     }
3353     break;
3354 ph10 182
3355 ph10 178 case OP_HSPACE:
3356     for (i = 1; i <= min; i++)
3357     {
3358 ph10 427 if (eptr >= md->end_subject)
3359 ph10 426 {
3360 ph10 427 SCHECK_PARTIAL();
3361 ph10 426 RRETURN(MATCH_NOMATCH);
3362 ph10 427 }
3363 ph10 178 GETCHARINC(c, eptr);
3364     switch(c)
3365     {
3366     default: RRETURN(MATCH_NOMATCH);
3367     case 0x09: /* HT */
3368     case 0x20: /* SPACE */
3369     case 0xa0: /* NBSP */
3370     case 0x1680: /* OGHAM SPACE MARK */
3371     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3372     case 0x2000: /* EN QUAD */
3373     case 0x2001: /* EM QUAD */
3374     case 0x2002: /* EN SPACE */
3375     case 0x2003: /* EM SPACE */
3376     case 0x2004: /* THREE-PER-EM SPACE */
3377     case 0x2005: /* FOUR-PER-EM SPACE */
3378     case 0x2006: /* SIX-PER-EM SPACE */
3379     case 0x2007: /* FIGURE SPACE */
3380     case 0x2008: /* PUNCTUATION SPACE */
3381     case 0x2009: /* THIN SPACE */
3382     case 0x200A: /* HAIR SPACE */
3383     case 0x202f: /* NARROW NO-BREAK SPACE */
3384     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3385     case 0x3000: /* IDEOGRAPHIC SPACE */
3386     break;
3387     }
3388     }
3389     break;
3390 ph10 182
3391 ph10 178 case OP_NOT_VSPACE:
3392     for (i = 1; i <= min; i++)
3393     {
3394 ph10 427 if (eptr >= md->end_subject)
3395 ph10 426 {
3396 ph10 427 SCHECK_PARTIAL();
3397 ph10 426 RRETURN(MATCH_NOMATCH);
3398 ph10 427 }
3399 ph10 178 GETCHARINC(c, eptr);
3400     switch(c)
3401     {
3402     default: break;
3403     case 0x0a: /* LF */
3404     case 0x0b: /* VT */
3405     case 0x0c: /* FF */
3406     case 0x0d: /* CR */
3407     case 0x85: /* NEL */
3408     case 0x2028: /* LINE SEPARATOR */
3409     case 0x2029: /* PARAGRAPH SEPARATOR */
3410     RRETURN(MATCH_NOMATCH);
3411     }
3412     }
3413     break;
3414 ph10 182
3415 ph10 178 case OP_VSPACE:
3416     for (i = 1; i <= min; i++)
3417     {
3418 ph10 427 if (eptr >= md->end_subject)
3419 ph10 426 {
3420 ph10 427 SCHECK_PARTIAL();
3421 ph10 426 RRETURN(MATCH_NOMATCH);
3422 ph10 427 }
3423 ph10 178 GETCHARINC(c, eptr);
3424     switch(c)
3425     {
3426     default: RRETURN(MATCH_NOMATCH);
3427     case 0x0a: /* LF */
3428     case 0x0b: /* VT */
3429     case 0x0c: /* FF */
3430     case 0x0d: /* CR */
3431     case 0x85: /* NEL */
3432     case 0x2028: /* LINE SEPARATOR */
3433     case 0x2029: /* PARAGRAPH SEPARATOR */
3434 ph10 182 break;
3435 ph10 178 }
3436     }
3437     break;
3438    
3439 nigel 77 case OP_NOT_DIGIT:
3440     for (i = 1; i <= min; i++)
3441     {
3442 ph10 427 if (eptr >= md->end_subject)
3443 ph10 426 {
3444 ph10 427 SCHECK_PARTIAL();
3445 ph10 426 RRETURN(MATCH_NOMATCH);
3446 ph10 427 }
3447 nigel 77 GETCHARINC(c, eptr);
3448     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3449     RRETURN(MATCH_NOMATCH);
3450     }
3451     break;
3452    
3453     case OP_DIGIT:
3454     for (i = 1; i <= min; i++)
3455     {
3456 ph10 427 if (eptr >= md->end_subject)
3457 ph10 426 {
3458 ph10 427 SCHECK_PARTIAL();
3459 nigel 77 RRETURN(MATCH_NOMATCH);
3460 ph10 427 }
3461 ph10 426 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3462     RRETURN(MATCH_NOMATCH);
3463 nigel 77 /* No need to skip more bytes - we know it's a 1-byte character */
3464     }
3465     break;
3466    
3467     case OP_NOT_WHITESPACE:
3468     for (i = 1; i <= min; i++)
3469     {
3470 ph10 427 if (eptr >= md->end_subject)
3471 ph10 426 {
3472 ph10 427 SCHECK_PARTIAL();
3473 nigel 77 RRETURN(MATCH_NOMATCH);
3474 ph10 427 }
3475 ph10 426 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3476     RRETURN(MATCH_NOMATCH);
3477 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3478 nigel 77 }
3479     break;
3480    
3481     case OP_WHITESPACE:
3482     for (i = 1; i <= min; i++)
3483     {
3484 ph10 427 if (eptr >= md->end_subject)
3485 ph10 426 {
3486 ph10 427 SCHECK_PARTIAL();
3487 nigel 77 RRETURN(MATCH_NOMATCH);
3488 ph10 427 }
3489 ph10 426 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3490     RRETURN(MATCH_NOMATCH);
3491 nigel 77 /* No need to skip more bytes - we know it's a 1-byte character */
3492     }
3493     break;
3494    
3495     case OP_NOT_WORDCHAR:
3496     for (i = 1; i <= min; i++)
3497     {
3498     if (eptr >= md->end_subject ||
3499 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3500 nigel 77 RRETURN(MATCH_NOMATCH);
3501 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3502 nigel 77 }
3503     break;
3504    
3505     case OP_WORDCHAR:
3506     for (i = 1; i <= min; i++)
3507     {
3508 ph10 427 if (eptr >= md->end_subject)
3509 ph10 426 {
3510 ph10 427 SCHECK_PARTIAL();
3511 nigel 77 RRETURN(MATCH_NOMATCH);
3512 ph10 427 }
3513 ph10 426 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3514     RRETURN(MATCH_NOMATCH);
3515 nigel 77 /* No need to skip more bytes - we know it's a 1-byte character */
3516     }
3517     break;
3518    
3519     default:
3520     RRETURN(PCRE_ERROR_INTERNAL);
3521     } /* End switch(ctype) */
3522    
3523     else
3524     #endif /* SUPPORT_UTF8 */
3525    
3526     /* Code for the non-UTF-8 case for minimum matching of operators other
3527 ph10 426 than OP_PROP and OP_NOTPROP. */
3528 nigel 77
3529     switch(ctype)
3530     {
3531     case OP_ANY:
3532 ph10 342 for (i = 1; i <= min; i++)
3533 nigel 77 {
3534 ph10 427 if (eptr >= md->end_subject)
3535 ph10 426 {
3536 ph10 427 SCHECK_PARTIAL();
3537 ph10 426 RRETURN(MATCH_NOMATCH);
3538 ph10 427 }
3539 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3540     eptr++;
3541 nigel 77 }
3542     break;
3543    
3544 ph10 341 case OP_ALLANY:
3545 ph10 443 if (eptr > md->end_subject - min)
3546 ph10 428 {
3547 ph10 443 SCHECK_PARTIAL();
3548 ph10 428 RRETURN(MATCH_NOMATCH);
3549 ph10 443 }
3550 ph10 341 eptr += min;
3551     break;
3552    
3553 nigel 77 case OP_ANYBYTE:
3554 ph10 443 if (eptr > md->end_subject - min)
3555 ph10 428 {
3556 ph10 443 SCHECK_PARTIAL();
3557 ph10 428 RRETURN(MATCH_NOMATCH);
3558 ph10 443 }
3559 nigel 77 eptr += min;
3560     break;
3561    
3562 nigel 93 case OP_ANYNL:
3563     for (i = 1; i <= min; i++)
3564     {
3565 ph10 427 if (eptr >= md->end_subject)
3566 ph10 426 {
3567 ph10 427 SCHECK_PARTIAL();
3568 ph10 426 RRETURN(MATCH_NOMATCH);
3569 ph10 427 }
3570 nigel 93 switch(*eptr++)
3571     {
3572     default: RRETURN(MATCH_NOMATCH);
3573     case 0x000d:
3574     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3575     break;
3576     case 0x000a:
3577 ph10 231 break;
3578    
3579 nigel 93 case 0x000b:
3580     case 0x000c:
3581     case 0x0085:
3582 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3583 nigel 93 break;
3584     }
3585     }
3586     break;
3587    
3588 ph10 178 case OP_NOT_HSPACE:
3589     for (i = 1; i <= min; i++)
3590     {
3591 ph10 427 if (eptr >= md->end_subject)
3592 ph10 426 {
3593 ph10 427 SCHECK_PARTIAL();
3594 ph10 426 RRETURN(MATCH_NOMATCH);
3595 ph10 427 }
3596 ph10 178 switch(*eptr++)
3597     {
3598     default: break;
3599     case 0x09: /* HT */
3600     case 0x20: /* SPACE */
3601     case 0xa0: /* NBSP */
3602     RRETURN(MATCH_NOMATCH);
3603     }
3604     }
3605     break;
3606    
3607     case OP_HSPACE:
3608     for (i = 1; i <= min; i++)
3609     {
3610 ph10 427 if (eptr >= md->end_subject)
3611 ph10 426 {
3612 ph10 427 SCHECK_PARTIAL();
3613 ph10 426 RRETURN(MATCH_NOMATCH);
3614 ph10 427 }
3615 ph10 178 switch(*eptr++)
3616     {
3617     default: RRETURN(MATCH_NOMATCH);
3618     case 0x09: /* HT */
3619     case 0x20: /* SPACE */
3620     case 0xa0: /* NBSP */
3621 ph10 182 break;
3622 ph10 178 }
3623     }
3624     break;
3625    
3626     case OP_NOT_VSPACE:
3627     for (i = 1; i <= min; i++)
3628     {
3629 ph10 427 if (eptr >= md->end_subject)
3630 ph10 426 {
3631 ph10 427 SCHECK_PARTIAL();
3632 ph10 426 RRETURN(MATCH_NOMATCH);
3633 ph10 427 }
3634 ph10 178 switch(*eptr++)
3635     {
3636     default: break;
3637     case 0x0a: /* LF */
3638     case 0x0b: /* VT */
3639     case 0x0c: /* FF */
3640     case 0x0d: /* CR */
3641     case 0x85: /* NEL */
3642     RRETURN(MATCH_NOMATCH);
3643     }
3644     }
3645     break;
3646    
3647     case OP_VSPACE:
3648     for (i = 1; i <= min; i++)
3649     {
3650 ph10 427 if (eptr >= md->end_subject)
3651 ph10 426 {
3652 ph10 427 SCHECK_PARTIAL();
3653 ph10 426 RRETURN(MATCH_NOMATCH);
3654 ph10 427 }
3655 ph10 178 switch(*eptr++)
3656     {
3657     default: RRETURN(MATCH_NOMATCH);
3658     case 0x0a: /* LF */
3659     case 0x0b: /* VT */
3660     case 0x0c: /* FF */
3661     case 0x0d: /* CR */
3662     case 0x85: /* NEL */
3663 ph10 182 break;
3664 ph10 178 }
3665     }
3666     break;
3667    
3668 nigel 77 case OP_NOT_DIGIT:
3669     for (i = 1; i <= min; i++)
3670 ph10 427 {
3671     if (eptr >= md->end_subject)
3672 ph10 426 {
3673 ph10 427 SCHECK_PARTIAL();
3674 ph10 426 RRETURN(MATCH_NOMATCH);
3675 ph10 427 }
3676 nigel 77 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3677 ph10 427 }
3678 nigel 77 break;
3679    
3680     case OP_DIGIT:
3681     for (i = 1; i <= min; i++)
3682 ph10 427 {
3683     if (eptr >= md->end_subject)
3684 ph10 426 {
3685 ph10 427 SCHECK_PARTIAL();
3686 ph10 426 RRETURN(MATCH_NOMATCH);
3687 ph10 427 }
3688 nigel 77 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3689 ph10 427 }
3690 nigel 77 break;
3691    
3692     case OP_NOT_WHITESPACE:
3693     for (i = 1; i <= min; i++)
3694 ph10 427 {
3695     if (eptr >= md->end_subject)
3696 ph10 426 {
3697 ph10 427 SCHECK_PARTIAL();
3698 ph10 426 RRETURN(MATCH_NOMATCH);
3699 ph10 427 }
3700 nigel 77 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3701 ph10 427 }
3702 nigel 77 break;
3703    
3704     case OP_WHITESPACE:
3705     for (i = 1; i <= min; i++)
3706 ph10 427 {
3707     if (eptr >= md->end_subject)
3708 ph10 426 {
3709 ph10 427 SCHECK_PARTIAL();
3710 ph10 426 RRETURN(MATCH_NOMATCH);
3711 ph10 427 }
3712 nigel 77 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3713 ph10 427 }
3714 nigel 77 break;
3715    
3716     case OP_NOT_WORDCHAR:
3717     for (i = 1; i <= min; i++)
3718 ph10 427 {
3719     if (eptr >= md->end_subject)
3720 ph10 426 {
3721 ph10 427 SCHECK_PARTIAL();
3722 ph10 426 RRETURN(MATCH_NOMATCH);
3723 ph10 427 }
3724 nigel 77 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3725     RRETURN(MATCH_NOMATCH);
3726 ph10 427 }
3727 nigel 77 break;
3728    
3729     case OP_WORDCHAR:
3730     for (i = 1; i <= min; i++)
3731 ph10 427 {
3732     if (eptr >= md->end_subject)
3733 ph10 426 {
3734 ph10 427 SCHECK_PARTIAL();
3735 ph10 426 RRETURN(MATCH_NOMATCH);
3736 ph10 427 }
3737 nigel 77 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3738     RRETURN(MATCH_NOMATCH);
3739 ph10 427 }
3740 nigel 77 break;
3741    
3742     default:
3743     RRETURN(PCRE_ERROR_INTERNAL);
3744     }
3745     }
3746    
3747     /* If min = max, continue at the same level without recursing */
3748    
3749     if (min == max) continue;
3750    
3751     /* If minimizing, we have to test the rest of the pattern before each
3752     subsequent match. Again, separate the UTF-8 case for speed, and also
3753     separate the UCP cases. */
3754    
3755     if (minimize)
3756     {
3757     #ifdef SUPPORT_UCP
3758 nigel 87 if (prop_type >= 0)
3759 nigel 77 {
3760 nigel 87 switch(prop_type)
3761 nigel 77 {
3762 nigel 87 case PT_ANY:
3763     for (fi = min;; fi++)
3764     {
3765 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3766 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3767 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3768 ph10 427 if (eptr >= md->end_subject)
3769 ph10 426 {
3770 ph10 427 SCHECK_PARTIAL();
3771 ph10 426 RRETURN(MATCH_NOMATCH);
3772 ph10 427 }
3773 nigel 87 GETCHARINC(c, eptr);
3774     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3775     }
3776 nigel 93 /* Control never gets here */
3777 nigel 87
3778     case PT_LAMP:
3779     for (fi = min;; fi++)
3780     {
3781 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3782 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3783 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3784 ph10 427 if (eptr >= md->end_subject)
3785 ph10 426 {
3786 ph10 427 SCHECK_PARTIAL();
3787 ph10 426 RRETURN(MATCH_NOMATCH);
3788 ph10 427 }
3789 nigel 87 GETCHARINC(c, eptr);
3790 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3791 nigel 87 if ((prop_chartype == ucp_Lu ||
3792     prop_chartype == ucp_Ll ||
3793     prop_chartype == ucp_Lt) == prop_fail_result)
3794     RRETURN(MATCH_NOMATCH);
3795     }
3796 nigel 93 /* Control never gets here */
3797 nigel 87
3798     case PT_GC:
3799     for (fi = min;; fi++)
3800     {
3801 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3802 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3803 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3804 ph10 427 if (eptr >= md->end_subject)
3805 ph10 426 {
3806 ph10 427 SCHECK_PARTIAL();
3807 ph10 426 RRETURN(MATCH_NOMATCH);
3808 ph10 427 }
3809 nigel 87 GETCHARINC(c, eptr);
3810 ph10 349 prop_category = UCD_CATEGORY(c);
3811 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3812     RRETURN(MATCH_NOMATCH);
3813     }
3814 nigel 93 /* Control never gets here */
3815 nigel 87
3816     case PT_PC:
3817     for (fi = min;; fi++)
3818     {
3819 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3820 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3821 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3822 ph10 427 if (eptr >= md->end_subject)
3823 ph10 426 {
3824 ph10 427 SCHECK_PARTIAL();
3825 ph10 426 RRETURN(MATCH_NOMATCH);
3826 ph10 427 }
3827 nigel 87 GETCHARINC(c, eptr);
3828 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3829 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3830     RRETURN(MATCH_NOMATCH);
3831     }
3832 nigel 93 /* Control never gets here */
3833 nigel 87
3834     case PT_SC:
3835     for (fi = min;; fi++)
3836     {
3837 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3838 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3839 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3840 ph10 427 if (eptr >= md->end_subject)
3841 ph10 426 {
3842 ph10 427 SCHECK_PARTIAL();
3843 ph10 426 RRETURN(MATCH_NOMATCH);
3844 ph10 427 }
3845 nigel 87 GETCHARINC(c, eptr);
3846 ph10 349 prop_script = UCD_SCRIPT(c);
3847 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3848     RRETURN(MATCH_NOMATCH);
3849     }
3850 nigel 93 /* Control never gets here */
3851 nigel 87
3852     default:
3853     RRETURN(PCRE_ERROR_INTERNAL);
3854 nigel 77 }
3855     }
3856    
3857     /* Match extended Unicode sequences. We will get here only if the
3858     support is in the binary; otherwise a compile-time error occurs. */
3859    
3860     else if (ctype == OP_EXTUNI)
3861     {
3862     for (fi = min;; fi++)
3863     {
3864 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3865 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3866 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3867 ph10 427 if (eptr >= md->end_subject)
3868 ph10 426 {
3869 ph10 427 SCHECK_PARTIAL();
3870 ph10 426 RRETURN(MATCH_NOMATCH);
3871 ph10 427 }
3872 nigel 77 GETCHARINCTEST(c, eptr);
3873 ph10 349 prop_category = UCD_CATEGORY(c);
3874 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3875     while (eptr < md->end_subject)
3876     {
3877     int len = 1;
3878 ph10 426 if (!utf8) c = *eptr;
3879     else { GETCHARLEN(c, eptr, len); }
3880 ph10 349 prop_category = UCD_CATEGORY(c);
3881 nigel 77 if (prop_category != ucp_M) break;
3882     eptr += len;
3883     }
3884     }
3885     }
3886    
3887     else
3888     #endif /* SUPPORT_UCP */
3889    
3890     #ifdef SUPPORT_UTF8
3891     /* UTF-8 mode */
3892     if (utf8)
3893     {
3894     for (fi = min;; fi++)
3895     {
3896 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3897 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3898 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3899 ph10 427 if (eptr >= md->end_subject)
3900 ph10 426 {
3901 ph10 427 SCHECK_PARTIAL();
3902 ph10 426 RRETURN(MATCH_NOMATCH);
3903 ph10 427 }
3904 ph10 426 if (ctype == OP_ANY && IS_NEWLINE(eptr))
3905     RRETURN(MATCH_NOMATCH);
3906 nigel 77 GETCHARINC(c, eptr);
3907     switch(ctype)
3908     {
3909 ph10 342 case OP_ANY: /* This is the non-NL case */
3910 ph10 345 case OP_ALLANY:
3911 nigel 77 case OP_ANYBYTE:
3912     break;
3913    
3914 nigel 93 case OP_ANYNL:
3915     switch(c)
3916     {
3917     default: RRETURN(MATCH_NOMATCH);
3918     case 0x000d:
3919     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3920     break;
3921     case 0x000a:
3922 ph10 231 break;
3923    
3924 nigel 93 case 0x000b:
3925     case 0x000c:
3926     case 0x0085:
3927     case 0x2028:
3928     case 0x2029:
3929 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3930 nigel 93 break;
3931     }
3932     break;
3933    
3934 ph10 178 case OP_NOT_HSPACE:
3935     switch(c)
3936     {
3937     default: break;
3938     case 0x09: /* HT */
3939     case 0x20: /* SPACE */
3940     case 0xa0: /* NBSP */
3941     case 0x1680: /* OGHAM SPACE MARK */
3942     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3943     case 0x2000: /* EN QUAD */
3944     case 0x2001: /* EM QUAD */
3945     case 0x2002: /* EN SPACE */
3946     case 0x2003: /* EM SPACE */
3947     case 0x2004: /* THREE-PER-EM SPACE */
3948     case 0x2005: /* FOUR-PER-EM SPACE */
3949     case 0x2006: /* SIX-PER-EM SPACE */
3950     case 0x2007: /* FIGURE SPACE */
3951     case 0x2008: /* PUNCTUATION SPACE */
3952     case 0x2009: /* THIN SPACE */
3953     case 0x200A: /* HAIR SPACE */
3954     case 0x202f: /* NARROW NO-BREAK SPACE */
3955     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3956     case 0x3000: /* IDEOGRAPHIC SPACE */
3957     RRETURN(MATCH_NOMATCH);
3958     }
3959     break;
3960    
3961     case OP_HSPACE:
3962     switch(c)
3963     {
3964     default: RRETURN(MATCH_NOMATCH);
3965     case 0x09: /* HT */
3966     case 0x20: /* SPACE */
3967     case 0xa0: /* NBSP */
3968     case 0x1680: /* OGHAM SPACE MARK */
3969     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3970     case 0x2000: /* EN QUAD */
3971     case 0x2001: /* EM QUAD */
3972     case 0x2002: /* EN SPACE */
3973     case 0x2003: /* EM SPACE */
3974     case 0x2004: /* THREE-PER-EM SPACE */
3975     case 0x2005: /* FOUR-PER-EM SPACE */
3976     case 0x2006: /* SIX-PER-EM SPACE */
3977     case 0x2007: /* FIGURE SPACE */
3978     case 0x2008: /* PUNCTUATION SPACE */
3979     case 0x2009: /* THIN SPACE */
3980     case 0x200A: /* HAIR SPACE */
3981     case 0x202f: /* NARROW NO-BREAK SPACE */
3982     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3983     case 0x3000: /* IDEOGRAPHIC SPACE */
3984     break;
3985     }
3986     break;
3987    
3988     case OP_NOT_VSPACE:
3989     switch(c)
3990     {
3991     default: break;
3992     case 0x0a: /* LF */
3993     case 0x0b: /* VT */