/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 190 - (hide annotations) (download)
Thu Jul 19 10:38:20 2007 UTC (6 years, 9 months ago) by ph10
File MIME type: text/plain
File size: 146326 byte(s)
Fix bug with .*$ when run in not-DOTALL UTF-8 mode; small performance 
improvement for .* in DOTALL UTF-8 mode.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 nigel 93 #define NLBLOCK md /* Block containing newline information */
46     #define PSSTART start_subject /* Field containing processed string start */
47     #define PSEND end_subject /* Field containing processed string end */
48    
49 nigel 77 #include "pcre_internal.h"
50    
51 ph10 137 /* Undefine some potentially clashing cpp symbols */
52    
53     #undef min
54     #undef max
55    
56 nigel 93 /* The chain of eptrblocks for tail recursions uses memory in stack workspace,
57     obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
58 nigel 77
59 nigel 93 #define EPTR_WORK_SIZE (1000)
60 nigel 77
61     /* Flag bits for the match() function */
62    
63 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
64     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
65     #define match_tail_recursed 0x04 /* Tail recursive call */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73     /* Maximum number of ints of offset to save on the stack for recursive calls.
74     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
75     because the offset vector is always a multiple of 3 long. */
76    
77     #define REC_STACK_SAVE_MAX 30
78    
/* Lower and upper repeat counts for the common repeats, kept as two parallel
six-entry tables. In rep_max, a value of 0 means "no upper limit".
NOTE(review): the tables are presumably indexed by an offset derived from the
repeat opcode - confirm against the code that reads them. */

static const char rep_min[] = {
  0, 0,      /* entries 0-1: minimum of 0 */
  1, 1,      /* entries 2-3: minimum of 1 */
  0, 0 };    /* entries 4-5: minimum of 0 */

static const char rep_max[] = {
  0, 0,      /* entries 0-1: unlimited */
  0, 0,      /* entries 2-3: unlimited */
  1, 1 };    /* entries 4-5: maximum of 1 */
83    
84    
85    
86     #ifdef DEBUG
87     /*************************************************
88     * Debugging function to print chars *
89     *************************************************/
90    
91     /* Print a sequence of chars in printable format, stopping at the end of the
92     subject if the requested.
93    
94     Arguments:
95     p points to characters
96     length number to print
97     is_subject TRUE if printing from within md->start_subject
98     md pointer to matching data block, if is_subject is TRUE
99    
100     Returns: nothing
101     */
102    
103     static void
104     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
105     {
106 nigel 93 unsigned int c;
107 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
108     while (length-- > 0)
109     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
110     }
111     #endif
112    
113    
114    
115     /*************************************************
116     * Match a back-reference *
117     *************************************************/
118    
119     /* If a back reference hasn't been set, the length that is passed is greater
120     than the number of characters left in the string, so the match fails.
121    
122     Arguments:
123     offset index into the offset vector
124     eptr points into the subject
125     length length to be matched
126     md points to match data block
127     ims the ims flags
128    
129     Returns: TRUE if matched
130     */
131    
132     static BOOL
133 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
134 nigel 77 unsigned long int ims)
135     {
136 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
137 nigel 77
138     #ifdef DEBUG
139     if (eptr >= md->end_subject)
140     printf("matching subject <null>");
141     else
142     {
143     printf("matching subject ");
144     pchars(eptr, length, TRUE, md);
145     }
146     printf(" against backref ");
147     pchars(p, length, FALSE, md);
148     printf("\n");
149     #endif
150    
151     /* Always fail if not enough characters left */
152    
153     if (length > md->end_subject - eptr) return FALSE;
154    
155     /* Separate the caselesss case for speed */
156    
157     if ((ims & PCRE_CASELESS) != 0)
158     {
159     while (length-- > 0)
160     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
161     }
162     else
163     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
164    
165     return TRUE;
166     }
167    
168    
169    
170     /***************************************************************************
171     ****************************************************************************
172     RECURSION IN THE match() FUNCTION
173    
174 nigel 87 The match() function is highly recursive, though not every recursive call
175     increases the recursive depth. Nevertheless, some regular expressions can cause
176     it to recurse to a great depth. I was writing for Unix, so I just let it call
177     itself recursively. This uses the stack for saving everything that has to be
178     saved for a recursive call. On Unix, the stack can be large, and this works
179     fine.
180 nigel 77
181 nigel 87 It turns out that on some non-Unix-like systems there are problems with
182     programs that use a lot of stack. (This despite the fact that every last chip
183     has oodles of memory these days, and techniques for extending the stack have
184     been known for decades.) So....
185 nigel 77
186     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
187     calls by keeping local variables that need to be preserved in blocks of memory
188 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
189 nigel 77 achieve this so that the actual code doesn't look very different to what it
190     always used to.
191 ph10 164
192 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
193 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
194     Switzer, the use of longjmp() has been abolished, at the cost of having to
195     provide a unique number for each call to RMATCH. There is no way of generating
196     a sequence of numbers at compile time in C. I have given them names, to make
197     them stand out more clearly.
198    
199     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
200     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
201 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
202     don't have indeterminate values; this has meant that the frame size can be
203 ph10 164 reduced because the result can be "passed back" by straight setting of the
204     variable instead of being passed in the frame.
205 nigel 77 ****************************************************************************
206     ***************************************************************************/
207    
208    
/* Numbers for RMATCH calls. Every RMATCH invocation is given its own RMn name
as the "rw" argument; the heap-frame version of RMATCH stores it in the frame
and pastes it into a label name. The values start at 1 and are consecutive. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,
  RM6,     RM7,  RM8,  RM9,  RM10,
  RM11,    RM12, RM13, RM14, RM15,
  RM16,    RM17, RM18, RM19, RM20,
  RM21,    RM22, RM23, RM24, RM25,
  RM26,    RM27, RM28, RM29, RM30,
  RM31,    RM32, RM33, RM34, RM35,
  RM36,    RM37, RM38, RM39, RM40,
  RM41,    RM42, RM43, RM44, RM45,
  RM46,    RM47 };
216    
217 ph10 165
218 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
219 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
220 ph10 164 actually used in this definition. */
221 nigel 77
222     #ifndef NO_RECURSE
223     #define REGISTER register
224 ph10 164
/* Stack-recursion forms of RMATCH and RRETURN. RMATCH is a nested call of
match() whose result is left in rrc; mstart is taken from the calling scope
and the depth passed down is rdepth+1. RRETURN is a plain return. When DEBUG is
defined, both also print the source line numbers involved. */

#ifndef DEBUG

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra

#else   /* DEBUG */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#endif  /* DEBUG */
242    
243 nigel 77 #else
244    
245    
246 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
247     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
248     argument of match(), which never changes. */
249 nigel 77
250     #define REGISTER
251    
/* Heap-frame "call". The resume point rw is noted in the current frame, a new
frame is obtained and filled in with the argument values (mstart is copied from
the current frame, and the depth is one more than the current depth), the new
frame is linked back to the current one, and control jumps to HEAP_RECURSE.
The label L_<rw> marks the point just after the call; NOTE(review): it is
presumably reached from HEAP_RETURN by dispatching on Xwhere - that code is
not in this part of the file.

If no memory can be obtained for the new frame, the current level gives up
with PCRE_ERROR_NOMEMORY rather than writing through a null pointer. RRETURN
releases the current frame and hands the error to the level above. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* Heap-frame "return". The current frame is released and its predecessor
becomes current. If there is a predecessor, the result is passed back in rrc
and control goes to HEAP_RETURN; if there is none, this was the top-level frame
and the function really returns. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
284    
285    
286     /* Structure for remembering the local variables in a private frame */
287    
288     typedef struct heapframe {
289     struct heapframe *Xprevframe;
290    
291     /* Function arguments that may change */
292    
293     const uschar *Xeptr;
294     const uschar *Xecode;
295 ph10 172 const uschar *Xmstart;
296 nigel 77 int Xoffset_top;
297     long int Xims;
298     eptrblock *Xeptrb;
299     int Xflags;
300 nigel 91 unsigned int Xrdepth;
301 nigel 77
302     /* Function local variables */
303    
304     const uschar *Xcallpat;
305     const uschar *Xcharptr;
306     const uschar *Xdata;
307     const uschar *Xnext;
308     const uschar *Xpp;
309     const uschar *Xprev;
310     const uschar *Xsaved_eptr;
311    
312     recursion_info Xnew_recursive;
313    
314     BOOL Xcur_is_word;
315     BOOL Xcondition;
316     BOOL Xprev_is_word;
317    
318     unsigned long int Xoriginal_ims;
319    
320     #ifdef SUPPORT_UCP
321     int Xprop_type;
322 nigel 87 int Xprop_value;
323 nigel 77 int Xprop_fail_result;
324     int Xprop_category;
325     int Xprop_chartype;
326 nigel 87 int Xprop_script;
327 ph10 123 int Xoclength;
328     uschar Xocchars[8];
329 nigel 77 #endif
330    
331     int Xctype;
332 nigel 93 unsigned int Xfc;
333 nigel 77 int Xfi;
334     int Xlength;
335     int Xmax;
336     int Xmin;
337     int Xnumber;
338     int Xoffset;
339     int Xop;
340     int Xsave_capture_last;
341     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
342     int Xstacksave[REC_STACK_SAVE_MAX];
343    
344     eptrblock Xnewptrb;
345    
346 ph10 164 /* Where to jump back to */
347 nigel 77
348 ph10 164 int Xwhere;
349 ph10 165
350 nigel 77 } heapframe;
351    
352     #endif
353    
354    
355     /***************************************************************************
356     ***************************************************************************/
357    
358    
359    
360     /*************************************************
361     * Match from current position *
362     *************************************************/
363    
364 nigel 93 /* This function is called recursively in many circumstances. Whenever it
365 nigel 77 returns a negative (error) response, the outer incarnation must also return the
366     same response.
367    
368     Performance note: It might be tempting to extract commonly used fields from the
369     md structure (e.g. utf8, end_subject) into individual variables to improve
370     performance. Tests using gcc on a SPARC disproved this; in the first case, it
371     made performance worse.
372    
373     Arguments:
374 nigel 93 eptr pointer to current character in subject
375     ecode pointer to current position in compiled code
376 ph10 168 mstart pointer to the current match start position (can be modified
377 ph10 172 by encountering \K)
378 nigel 77 offset_top current top pointer
379     md pointer to "static" info for the match
380     ims current /i, /m, and /s options
381     eptrb pointer to chain of blocks containing eptr at start of
382     brackets - for testing for empty matches
383     flags can contain
384     match_condassert - this is an assertion condition
385 nigel 93 match_cbegroup - this is the start of an unlimited repeat
386     group that can match an empty string
387     match_tail_recursed - this is a tail_recursed group
388 nigel 87 rdepth the recursion depth
389 nigel 77
390     Returns: MATCH_MATCH if matched ) these values are >= 0
391     MATCH_NOMATCH if failed to match )
392     a negative PCRE_ERROR_xxx value if aborted by an error condition
393 nigel 87 (e.g. stopped by repeated call or recursion limit)
394 nigel 77 */
395    
396     static int
397 ph10 172 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
398 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
399 nigel 91 int flags, unsigned int rdepth)
400 nigel 77 {
401     /* These variables do not need to be preserved over recursion in this function,
402 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
403     "register" because they are used a lot in loops. */
404 nigel 77
405 nigel 91 register int rrc; /* Returns from recursive calls */
406     register int i; /* Used for loops not involving calls to RMATCH() */
407 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
408 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
409 nigel 77
410 nigel 93 BOOL minimize, possessive; /* Quantifier options */
411    
412 nigel 77 /* When recursion is not being used, all "local" variables that have to be
413     preserved over calls to RMATCH() are part of a "frame" which is obtained from
414     heap storage. Set up the top-level frame here; others are obtained from the
415     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
416    
417     #ifdef NO_RECURSE
418     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
419     frame->Xprevframe = NULL; /* Marks the top level */
420    
421     /* Copy in the original argument variables */
422    
423     frame->Xeptr = eptr;
424     frame->Xecode = ecode;
425 ph10 168 frame->Xmstart = mstart;
426 nigel 77 frame->Xoffset_top = offset_top;
427     frame->Xims = ims;
428     frame->Xeptrb = eptrb;
429     frame->Xflags = flags;
430 nigel 87 frame->Xrdepth = rdepth;
431 nigel 77
432     /* This is where control jumps back to to effect "recursion" */
433    
434     HEAP_RECURSE:
435    
436     /* Macros make the argument variables come from the current frame */
437    
438     #define eptr frame->Xeptr
439     #define ecode frame->Xecode
440 ph10 168 #define mstart frame->Xmstart
441 nigel 77 #define offset_top frame->Xoffset_top
442     #define ims frame->Xims
443     #define eptrb frame->Xeptrb
444     #define flags frame->Xflags
445 nigel 87 #define rdepth frame->Xrdepth
446 nigel 77
447     /* Ditto for the local variables */
448    
449     #ifdef SUPPORT_UTF8
450     #define charptr frame->Xcharptr
451     #endif
452     #define callpat frame->Xcallpat
453     #define data frame->Xdata
454     #define next frame->Xnext
455     #define pp frame->Xpp
456     #define prev frame->Xprev
457     #define saved_eptr frame->Xsaved_eptr
458    
459     #define new_recursive frame->Xnew_recursive
460    
461     #define cur_is_word frame->Xcur_is_word
462     #define condition frame->Xcondition
463     #define prev_is_word frame->Xprev_is_word
464    
465     #define original_ims frame->Xoriginal_ims
466    
467     #ifdef SUPPORT_UCP
468     #define prop_type frame->Xprop_type
469 nigel 87 #define prop_value frame->Xprop_value
470 nigel 77 #define prop_fail_result frame->Xprop_fail_result
471     #define prop_category frame->Xprop_category
472     #define prop_chartype frame->Xprop_chartype
473 nigel 87 #define prop_script frame->Xprop_script
474 ph10 115 #define oclength frame->Xoclength
475     #define occhars frame->Xocchars
476 nigel 77 #endif
477    
478     #define ctype frame->Xctype
479     #define fc frame->Xfc
480     #define fi frame->Xfi
481     #define length frame->Xlength
482     #define max frame->Xmax
483     #define min frame->Xmin
484     #define number frame->Xnumber
485     #define offset frame->Xoffset
486     #define op frame->Xop
487     #define save_capture_last frame->Xsave_capture_last
488     #define save_offset1 frame->Xsave_offset1
489     #define save_offset2 frame->Xsave_offset2
490     #define save_offset3 frame->Xsave_offset3
491     #define stacksave frame->Xstacksave
492    
493     #define newptrb frame->Xnewptrb
494    
495     /* When recursion is being used, local variables are allocated on the stack and
496     get preserved during recursion in the normal way. In this environment, fi and
497     i, and fc and c, can be the same variables. */
498    
499 nigel 93 #else /* NO_RECURSE not defined */
500 nigel 77 #define fi i
501     #define fc c
502    
503    
504 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
505     const uschar *charptr; /* in small blocks of the code. My normal */
506     #endif /* style of coding would have declared */
507     const uschar *callpat; /* them within each of those blocks. */
508     const uschar *data; /* However, in order to accommodate the */
509     const uschar *next; /* version of this code that uses an */
510     USPTR pp; /* external "stack" implemented on the */
511     const uschar *prev; /* heap, it is easier to declare them all */
512     USPTR saved_eptr; /* here, so the declarations can be cut */
513     /* out in a block. The only declarations */
514     recursion_info new_recursive; /* within blocks below are for variables */
515     /* that do not have to be preserved over */
516     BOOL cur_is_word; /* a recursive call to RMATCH(). */
517     BOOL condition;
518 nigel 77 BOOL prev_is_word;
519    
520     unsigned long int original_ims;
521    
522     #ifdef SUPPORT_UCP
523     int prop_type;
524 nigel 87 int prop_value;
525 nigel 77 int prop_fail_result;
526     int prop_category;
527     int prop_chartype;
528 nigel 87 int prop_script;
529 ph10 115 int oclength;
530     uschar occhars[8];
531 nigel 77 #endif
532    
533     int ctype;
534     int length;
535     int max;
536     int min;
537     int number;
538     int offset;
539     int op;
540     int save_capture_last;
541     int save_offset1, save_offset2, save_offset3;
542     int stacksave[REC_STACK_SAVE_MAX];
543    
544     eptrblock newptrb;
545 nigel 93 #endif /* NO_RECURSE */
546 nigel 77
547     /* These statements are here to stop the compiler complaining about uninitialized
548     variables. */
549    
550     #ifdef SUPPORT_UCP
551 nigel 87 prop_value = 0;
552 nigel 77 prop_fail_result = 0;
553     #endif
554    
555 nigel 93
556 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
557     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
558     used. Thanks to Ian Taylor for noticing this possibility and sending the
559     original patch. */
560    
561     TAIL_RECURSE:
562    
563 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
564     are specified by the macro RMATCH and RRETURN is used to return. When
565     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
566     and a "return", respectively (possibly with some debugging if DEBUG is
567     defined). However, RMATCH isn't like a function call because it's quite a
568     complicated macro. It has to be used in one particular way. This shouldn't,
569     however, impact performance when true recursion is being used. */
570 nigel 77
571 ph10 164 #ifdef SUPPORT_UTF8
572     utf8 = md->utf8; /* Local copy of the flag */
573     #else
574     utf8 = FALSE;
575     #endif
576    
577 nigel 87 /* First check that we haven't called match() too many times, or that we
578     haven't exceeded the recursive call limit. */
579    
580 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
581 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
582 nigel 77
583     original_ims = ims; /* Save for resetting on ')' */
584 nigel 91
585 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
586     string, the match_cbegroup flag is set. When this is the case, add the current
587     subject pointer to the chain of such remembered pointers, to be checked when we
588     hit the closing ket, in order to break infinite loops that match no characters.
589     When match() is called in other circumstances, don't add to the chain. If this
590     is a tail recursion, use a block from the workspace, as the one on the stack is
591     already used. */
592 nigel 77
593 nigel 93 if ((flags & match_cbegroup) != 0)
594 nigel 77 {
595 nigel 93 eptrblock *p;
596     if ((flags & match_tail_recursed) != 0)
597     {
598     if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
599     p = md->eptrchain + md->eptrn++;
600     }
601     else p = &newptrb;
602     p->epb_saved_eptr = eptr;
603     p->epb_prev = eptrb;
604     eptrb = p;
605 nigel 77 }
606    
607 nigel 93 /* Now start processing the opcodes. */
608 nigel 77
609     for (;;)
610     {
611 nigel 93 minimize = possessive = FALSE;
612 nigel 77 op = *ecode;
613    
614     /* For partial matching, remember if we ever hit the end of the subject after
615     matching at least one subject character. */
616    
617     if (md->partial &&
618     eptr >= md->end_subject &&
619 ph10 168 eptr > mstart)
620 nigel 77 md->hitend = TRUE;
621    
622 nigel 93 switch(op)
623     {
624     /* Handle a capturing bracket. If there is space in the offset vector, save
625     the current subject position in the working slot at the top of the vector.
626     We mustn't change the current values of the data slot, because they may be
627     set from a previous iteration of this group, and be referred to by a
628     reference inside the group.
629 nigel 77
630 nigel 93 If the bracket fails to match, we need to restore this value and also the
631     values of the final offsets, in case they were set by a previous iteration
632     of the same bracket.
633 nigel 77
634 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
635     a non-capturing bracket. Don't worry about setting the flag for the error
636     case here; that is handled in the code for KET. */
637 nigel 77
638 nigel 93 case OP_CBRA:
639     case OP_SCBRA:
640     number = GET2(ecode, 1+LINK_SIZE);
641 nigel 77 offset = number << 1;
642    
643     #ifdef DEBUG
644 nigel 93 printf("start bracket %d\n", number);
645     printf("subject=");
646 nigel 77 pchars(eptr, 16, TRUE, md);
647     printf("\n");
648     #endif
649    
650     if (offset < md->offset_max)
651     {
652     save_offset1 = md->offset_vector[offset];
653     save_offset2 = md->offset_vector[offset+1];
654     save_offset3 = md->offset_vector[md->offset_end - number];
655     save_capture_last = md->capture_last;
656    
657     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
658     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
659    
660 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
661 nigel 77 do
662     {
663 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
664     ims, eptrb, flags, RM1);
665 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
666     md->capture_last = save_capture_last;
667     ecode += GET(ecode, 1);
668     }
669     while (*ecode == OP_ALT);
670    
671     DPRINTF(("bracket %d failed\n", number));
672    
673     md->offset_vector[offset] = save_offset1;
674     md->offset_vector[offset+1] = save_offset2;
675     md->offset_vector[md->offset_end - number] = save_offset3;
676    
677     RRETURN(MATCH_NOMATCH);
678     }
679    
680 nigel 93 /* Insufficient room for saving captured contents. Treat as a non-capturing
681     bracket. */
682 nigel 77
683 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
684 nigel 77
685 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
686     final alternative within the brackets, we would return the result of a
687     recursive call to match() whatever happened. We can reduce stack usage by
688     turning this into a tail recursion. */
689 nigel 77
690 nigel 93 case OP_BRA:
691     case OP_SBRA:
692     DPRINTF(("start non-capturing bracket\n"));
693     flags = (op >= OP_SBRA)? match_cbegroup : 0;
694 nigel 91 for (;;)
695 nigel 77 {
696 nigel 91 if (ecode[GET(ecode, 1)] != OP_ALT)
697 nigel 93 {
698     ecode += _pcre_OP_lengths[*ecode];
699     flags |= match_tail_recursed;
700     DPRINTF(("bracket 0 tail recursion\n"));
701     goto TAIL_RECURSE;
702     }
703 nigel 91
704     /* For non-final alternatives, continue the loop for a NOMATCH result;
705     otherwise return. */
706    
707 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
708     eptrb, flags, RM2);
709 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
710     ecode += GET(ecode, 1);
711     }
712 nigel 91 /* Control never reaches here. */
713 nigel 77
714     /* Conditional group: compilation checked that there are no more than
715     two branches. If the condition is false, skipping the first branch takes us
716     past the end if there is only one branch, but that's OK because that is
717 nigel 91 exactly what going to the ket would do. As there is only one branch to be
718     obeyed, we can use tail recursion to avoid using another stack frame. */
719 nigel 77
720     case OP_COND:
721 nigel 93 case OP_SCOND:
722     if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
723 nigel 77 {
724 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
725     condition = md->recursive != NULL &&
726     (offset == RREF_ANY || offset == md->recursive->group_num);
727     ecode += condition? 3 : GET(ecode, 1);
728     }
729    
730     else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
731     {
732 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
733 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
734     ecode += condition? 3 : GET(ecode, 1);
735 nigel 77 }
736    
737 nigel 93 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
738     {
739     condition = FALSE;
740     ecode += GET(ecode, 1);
741     }
742    
743 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
744 nigel 93 the final argument match_condassert causes it to stop at the end of an
745     assertion. */
746 nigel 77
747     else
748     {
749 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
750     match_condassert, RM3);
751 nigel 77 if (rrc == MATCH_MATCH)
752     {
753 nigel 93 condition = TRUE;
754     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
755 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
756     }
757     else if (rrc != MATCH_NOMATCH)
758     {
759     RRETURN(rrc); /* Need braces because of following else */
760     }
761 nigel 93 else
762     {
763     condition = FALSE;
764     ecode += GET(ecode, 1);
765     }
766     }
767 nigel 91
768 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
769     we can use tail recursion to avoid using another stack frame. If the second
770     alternative doesn't exist, we can just plough on. */
771 nigel 91
772 nigel 93 if (condition || *ecode == OP_ALT)
773     {
774 nigel 91 ecode += 1 + LINK_SIZE;
775 nigel 93 flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
776 nigel 91 goto TAIL_RECURSE;
777 nigel 77 }
778 nigel 93 else
779     {
780     ecode += 1 + LINK_SIZE;
781     }
782     break;
783 nigel 77
784    
785 nigel 93 /* End of the pattern. If we are in a top-level recursion, we should
786     restore the offsets appropriately and continue from after the call. */
787 nigel 77
788     case OP_END:
789     if (md->recursive != NULL && md->recursive->group_num == 0)
790     {
791     recursion_info *rec = md->recursive;
792 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
793 nigel 77 md->recursive = rec->prevrec;
794     memmove(md->offset_vector, rec->offset_save,
795     rec->saved_max * sizeof(int));
796 ph10 168 mstart = rec->save_start;
797 nigel 77 ims = original_ims;
798     ecode = rec->after_call;
799     break;
800     }
801    
802     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
803     string - backtracking will then try other alternatives, if any. */
804    
805 ph10 168 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
806     md->end_match_ptr = eptr; /* Record where we ended */
807     md->end_offset_top = offset_top; /* and how many extracts were taken */
808     md->start_match_ptr = mstart; /* and the start (\K can modify) */
809 nigel 77 RRETURN(MATCH_MATCH);
810    
811     /* Change option settings */
812    
813     case OP_OPT:
814     ims = ecode[1];
815     ecode += 2;
816     DPRINTF(("ims set to %02lx\n", ims));
817     break;
818    
819     /* Assertion brackets. Check the alternative branches in turn - the
820     matching won't pass the KET for an assertion. If any one branch matches,
821     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
822     start of each branch to move the current point backwards, so the code at
823     this level is identical to the lookahead case. */
824    
825     case OP_ASSERT:
826     case OP_ASSERTBACK:
827     do
828     {
829 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
830     RM4);
831 nigel 77 if (rrc == MATCH_MATCH) break;
832     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
833     ecode += GET(ecode, 1);
834     }
835     while (*ecode == OP_ALT);
836     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
837    
838     /* If checking an assertion for a condition, return MATCH_MATCH. */
839    
840     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
841    
842     /* Continue from after the assertion, updating the offsets high water
843     mark, since extracts may have been taken during the assertion. */
844    
845     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
846     ecode += 1 + LINK_SIZE;
847     offset_top = md->end_offset_top;
848     continue;
849    
850     /* Negative assertion: all branches must fail to match */
851    
852     case OP_ASSERT_NOT:
853     case OP_ASSERTBACK_NOT:
854     do
855     {
856 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
857     RM5);
858 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
859     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
860     ecode += GET(ecode,1);
861     }
862     while (*ecode == OP_ALT);
863    
864     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
865    
866     ecode += 1 + LINK_SIZE;
867     continue;
868    
869     /* Move the subject pointer back. This occurs only at the start of
870     each branch of a lookbehind assertion. If we are too close to the start to
871     move back, this match function fails. When working with UTF-8 we move
872     back a number of characters, not bytes. */
873    
874     case OP_REVERSE:
875     #ifdef SUPPORT_UTF8
876     if (utf8)
877     {
878 nigel 93 i = GET(ecode, 1);
879     while (i-- > 0)
880 nigel 77 {
881     eptr--;
882     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
883     BACKCHAR(eptr)
884     }
885     }
886     else
887     #endif
888    
889     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
890    
891     {
892 nigel 93 eptr -= GET(ecode, 1);
893 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
894     }
895    
896     /* Skip to next op code */
897    
898     ecode += 1 + LINK_SIZE;
899     break;
900    
901     /* The callout item calls an external function, if one is provided, passing
902     details of the match so far. This is mainly for debugging, though the
903     function is able to force a failure. */
904    
905     case OP_CALLOUT:
906     if (pcre_callout != NULL)
907     {
908     pcre_callout_block cb;
909     cb.version = 1; /* Version 1 of the callout block */
910     cb.callout_number = ecode[1];
911     cb.offset_vector = md->offset_vector;
912 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
913 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
914 ph10 168 cb.start_match = mstart - md->start_subject;
915 nigel 77 cb.current_position = eptr - md->start_subject;
916     cb.pattern_position = GET(ecode, 2);
917     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
918     cb.capture_top = offset_top/2;
919     cb.capture_last = md->capture_last;
920     cb.callout_data = md->callout_data;
921     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
922     if (rrc < 0) RRETURN(rrc);
923     }
924     ecode += 2 + 2*LINK_SIZE;
925     break;
926    
927     /* Recursion either matches the current regex, or some subexpression. The
928     offset data is the offset to the starting bracket from the start of the
929     whole pattern. (This is so that it works from duplicated subpatterns.)
930    
931     If there are any capturing brackets started but not finished, we have to
932     save their starting points and reinstate them after the recursion. However,
933     we don't know how many such there are (offset_top records the completed
934     total) so we just have to save all the potential data. There may be up to
935     65535 such values, which is too large to put on the stack, but using malloc
936     for small numbers seems expensive. As a compromise, the stack is used when
937     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
938     is used. If the malloc fails, the values cannot be saved, so the match is
939     abandoned at this point by returning PCRE_ERROR_NOMEMORY, which is passed
940     back up through the calling levels in the same way as any other error.
941    
942     There are also other values that have to be saved. We use a chained
943     sequence of blocks that actually live on the stack. Thanks to Robin Houston
944     for the original version of this logic. */
945    
946     case OP_RECURSE:
947     {
948     callpat = md->start_code + GET(ecode, 1);
949 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
950     GET2(callpat, 1 + LINK_SIZE);
951 nigel 77
952     /* Add to "recursing stack" */
953    
954     new_recursive.prevrec = md->recursive;
955     md->recursive = &new_recursive;
956    
957     /* Find where to continue from afterwards */
958    
959     ecode += 1 + LINK_SIZE;
960     new_recursive.after_call = ecode;
961    
962     /* Now save the offset data. */
963    
964     new_recursive.saved_max = md->offset_end;
965     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
966     new_recursive.offset_save = stacksave;
967     else
968     {
969     new_recursive.offset_save =
970     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
971     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
972     }
973    
974     memcpy(new_recursive.offset_save, md->offset_vector,
975     new_recursive.saved_max * sizeof(int));
976 ph10 168 new_recursive.save_start = mstart;
977     mstart = eptr;
978 nigel 77
979     /* OK, now we can do the recursion. For each top-level alternative we
980     restore the offset and recursion data. */
981    
982     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
983 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
984 nigel 77 do
985     {
986 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
987     md, ims, eptrb, flags, RM6);
988 nigel 77 if (rrc == MATCH_MATCH)
989     {
990 nigel 87 DPRINTF(("Recursion matched\n"));
991 nigel 77 md->recursive = new_recursive.prevrec;
992     if (new_recursive.offset_save != stacksave)
993     (pcre_free)(new_recursive.offset_save);
994     RRETURN(MATCH_MATCH);
995     }
996 nigel 87 else if (rrc != MATCH_NOMATCH)
997     {
998     DPRINTF(("Recursion gave error %d\n", rrc));
999     RRETURN(rrc);
1000     }
1001 nigel 77
1002     md->recursive = &new_recursive;
1003     memcpy(md->offset_vector, new_recursive.offset_save,
1004     new_recursive.saved_max * sizeof(int));
1005     callpat += GET(callpat, 1);
1006     }
1007     while (*callpat == OP_ALT);
1008    
1009     DPRINTF(("Recursion didn't match\n"));
1010     md->recursive = new_recursive.prevrec;
1011     if (new_recursive.offset_save != stacksave)
1012     (pcre_free)(new_recursive.offset_save);
1013     RRETURN(MATCH_NOMATCH);
1014     }
1015     /* Control never reaches here */
1016    
1017     /* "Once" brackets are like assertion brackets except that after a match,
1018     the point in the subject string is not moved back. Thus there can never be
1019     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1020     Check the alternative branches in turn - the matching won't pass the KET
1021     for this kind of subpattern. If any one branch matches, we carry on as at
1022     the end of a normal bracket, leaving the subject pointer. */
1023    
1024     case OP_ONCE:
1025 nigel 91 prev = ecode;
1026     saved_eptr = eptr;
1027    
1028     do
1029 nigel 77 {
1030 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
1031     eptrb, 0, RM7);
1032 nigel 91 if (rrc == MATCH_MATCH) break;
1033     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1034     ecode += GET(ecode,1);
1035     }
1036     while (*ecode == OP_ALT);
1037 nigel 77
1038 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1039 nigel 77
1040 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1041 nigel 77
1042 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1043     mark, since extracts may have been taken. */
1044 nigel 77
1045 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1046 nigel 77
1047 nigel 91 offset_top = md->end_offset_top;
1048     eptr = md->end_match_ptr;
1049 nigel 77
1050 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1051     happens for a repeating ket if no characters were matched in the group.
1052     This is the forcible breaking of infinite loops as implemented in Perl
1053     5.005. If there is an options reset, it will get obeyed in the normal
1054     course of events. */
1055 nigel 77
1056 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1057     {
1058     ecode += 1+LINK_SIZE;
1059     break;
1060     }
1061 nigel 77
1062 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1063     preceding bracket, in the appropriate order. The second "call" of match()
1064     uses tail recursion, to avoid using another stack frame. We need to reset
1065     any options that changed within the bracket before re-running it, so
1066     check the next opcode. */
1067 nigel 77
1068 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1069     {
1070     ims = (ims & ~PCRE_IMS) | ecode[4];
1071     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1072     }
1073 nigel 77
1074 nigel 91 if (*ecode == OP_KETRMIN)
1075     {
1076 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0,
1077     RM8);
1078 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1079     ecode = prev;
1080 nigel 93 flags = match_tail_recursed;
1081 nigel 91 goto TAIL_RECURSE;
1082 nigel 77 }
1083 nigel 91 else /* OP_KETRMAX */
1084     {
1085 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1086 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1087     ecode += 1 + LINK_SIZE;
1088 nigel 93 flags = match_tail_recursed;
1089 nigel 91 goto TAIL_RECURSE;
1090     }
1091     /* Control never gets here */
1092 nigel 77
1093     /* An alternation is the end of a branch; scan along to find the end of the
1094     bracketed group and go to there. */
1095    
1096     case OP_ALT:
1097     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1098     break;
1099    
1100     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1101     that it may occur zero times. It may repeat infinitely, or not at all -
1102     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1103     repeat limits are compiled as a number of copies, with the optional ones
1104     preceded by BRAZERO or BRAMINZERO. */
1105    
1106     case OP_BRAZERO:
1107     {
1108     next = ecode+1;
1109 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1110 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1111     do next += GET(next,1); while (*next == OP_ALT);
1112 nigel 93 ecode = next + 1 + LINK_SIZE;
1113 nigel 77 }
1114     break;
1115    
1116     case OP_BRAMINZERO:
1117     {
1118     next = ecode+1;
1119 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1120 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1121 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1122     ecode++;
1123     }
1124     break;
1125    
1126 nigel 93 /* End of a group, repeated or non-repeating. */
1127 nigel 77
1128     case OP_KET:
1129     case OP_KETRMIN:
1130     case OP_KETRMAX:
1131 nigel 91 prev = ecode - GET(ecode, 1);
1132 nigel 77
1133 nigel 93 /* If this was a group that remembered the subject start, in order to break
1134     infinite repeats of empty string matches, retrieve the subject start from
1135     the chain. Otherwise, set it NULL. */
1136 nigel 77
1137 nigel 93 if (*prev >= OP_SBRA)
1138     {
1139     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1140     eptrb = eptrb->epb_prev; /* Backup to previous group */
1141     }
1142     else saved_eptr = NULL;
1143 nigel 77
1144 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1145     MATCH_MATCH, but record the current high water mark for use by positive
1146     assertions. Do this also for the "once" (atomic) groups. */
1147    
1148 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1149     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1150     *prev == OP_ONCE)
1151     {
1152     md->end_match_ptr = eptr; /* For ONCE */
1153     md->end_offset_top = offset_top;
1154     RRETURN(MATCH_MATCH);
1155     }
1156 nigel 77
1157 nigel 93 /* For capturing groups we have to check the group number back at the start
1158     and if necessary complete handling an extraction by setting the offsets and
1159     bumping the high water mark. Note that whole-pattern recursion is coded as
1160     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1161     when the OP_END is reached. Other recursion is handled here. */
1162 nigel 77
1163 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1164 nigel 91 {
1165 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1166 nigel 91 offset = number << 1;
1167 nigel 77
1168     #ifdef DEBUG
1169 nigel 91 printf("end bracket %d", number);
1170     printf("\n");
1171 nigel 77 #endif
1172    
1173 nigel 93 md->capture_last = number;
1174     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1175 nigel 91 {
1176 nigel 93 md->offset_vector[offset] =
1177     md->offset_vector[md->offset_end - number];
1178     md->offset_vector[offset+1] = eptr - md->start_subject;
1179     if (offset_top <= offset) offset_top = offset + 2;
1180     }
1181 nigel 77
1182 nigel 93 /* Handle a recursively called group. Restore the offsets
1183     appropriately and continue from after the call. */
1184 nigel 77
1185 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1186     {
1187     recursion_info *rec = md->recursive;
1188     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1189     md->recursive = rec->prevrec;
1190 ph10 168 mstart = rec->save_start;
1191 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1192     rec->saved_max * sizeof(int));
1193     ecode = rec->after_call;
1194     ims = original_ims;
1195     break;
1196 nigel 77 }
1197 nigel 91 }
1198 nigel 77
1199 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1200     flags, in case they got changed during the group. */
1201 nigel 77
1202 nigel 91 ims = original_ims;
1203     DPRINTF(("ims reset to %02lx\n", ims));
1204 nigel 77
1205 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1206     happens for a repeating ket if no characters were matched in the group.
1207     This is the forcible breaking of infinite loops as implemented in Perl
1208     5.005. If there is an options reset, it will get obeyed in the normal
1209     course of events. */
1210 nigel 77
1211 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1212     {
1213     ecode += 1 + LINK_SIZE;
1214     break;
1215     }
1216 nigel 77
1217 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1218     preceding bracket, in the appropriate order. In the second case, we can use
1219     tail recursion to avoid using another stack frame. */
1220 nigel 77
1221 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1222    
1223 nigel 91 if (*ecode == OP_KETRMIN)
1224     {
1225 ph10 164 RMATCH(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0,
1226     RM12);
1227 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1228     ecode = prev;
1229 nigel 93 flags |= match_tail_recursed;
1230 nigel 91 goto TAIL_RECURSE;
1231 nigel 77 }
1232 nigel 91 else /* OP_KETRMAX */
1233     {
1234 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1235 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1236     ecode += 1 + LINK_SIZE;
1237 nigel 93 flags = match_tail_recursed;
1238 nigel 91 goto TAIL_RECURSE;
1239     }
1240     /* Control never gets here */
1241 nigel 77
1242     /* Start of subject unless notbol, or after internal newline if multiline */
1243    
1244     case OP_CIRC:
1245     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1246     if ((ims & PCRE_MULTILINE) != 0)
1247     {
1248 nigel 91 if (eptr != md->start_subject &&
1249 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1250 nigel 77 RRETURN(MATCH_NOMATCH);
1251     ecode++;
1252     break;
1253     }
1254     /* ... else fall through */
1255    
1256     /* Start of subject assertion */
1257    
1258     case OP_SOD:
1259     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1260     ecode++;
1261     break;
1262    
1263     /* Start of match assertion */
1264    
1265     case OP_SOM:
1266     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1267     ecode++;
1268     break;
1269 ph10 172
1270 ph10 168 /* Reset the start of match point */
1271 ph10 172
1272 ph10 168 case OP_SET_SOM:
1273     mstart = eptr;
1274 ph10 172 ecode++;
1275     break;
1276 nigel 77
1277     /* Assert before internal newline if multiline, or before a terminating
1278     newline unless endonly is set, else end of subject unless noteol is set. */
1279    
1280     case OP_DOLL:
1281     if ((ims & PCRE_MULTILINE) != 0)
1282     {
1283     if (eptr < md->end_subject)
1284 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1285 nigel 77 else
1286     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1287     ecode++;
1288     break;
1289     }
1290     else
1291     {
1292     if (md->noteol) RRETURN(MATCH_NOMATCH);
1293     if (!md->endonly)
1294     {
1295 nigel 91 if (eptr != md->end_subject &&
1296 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1297 nigel 77 RRETURN(MATCH_NOMATCH);
1298     ecode++;
1299     break;
1300     }
1301     }
1302 nigel 91 /* ... else fall through for endonly */
1303 nigel 77
1304     /* End of subject assertion (\z) */
1305    
1306     case OP_EOD:
1307     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1308     ecode++;
1309     break;
1310    
1311     /* End of subject or ending \n assertion (\Z) */
1312    
1313     case OP_EODN:
1314 nigel 91 if (eptr != md->end_subject &&
1315 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1316 nigel 91 RRETURN(MATCH_NOMATCH);
1317 nigel 77 ecode++;
1318     break;
1319    
1320     /* Word boundary assertions */
1321    
1322     case OP_NOT_WORD_BOUNDARY:
1323     case OP_WORD_BOUNDARY:
1324     {
1325    
1326     /* Find out if the previous and current characters are "word" characters.
1327     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1328     be "non-word" characters. */
1329    
1330     #ifdef SUPPORT_UTF8
1331     if (utf8)
1332     {
1333     if (eptr == md->start_subject) prev_is_word = FALSE; else
1334     {
1335     const uschar *lastptr = eptr - 1;
1336     while((*lastptr & 0xc0) == 0x80) lastptr--;
1337     GETCHAR(c, lastptr);
1338     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1339     }
1340     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1341     {
1342     GETCHAR(c, eptr);
1343     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1344     }
1345     }
1346     else
1347     #endif
1348    
1349     /* More streamlined when not in UTF-8 mode */
1350    
1351     {
1352     prev_is_word = (eptr != md->start_subject) &&
1353     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1354     cur_is_word = (eptr < md->end_subject) &&
1355     ((md->ctypes[*eptr] & ctype_word) != 0);
1356     }
1357    
1358     /* Now see if the situation is what we want */
1359    
1360     if ((*ecode++ == OP_WORD_BOUNDARY)?
1361     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1362     RRETURN(MATCH_NOMATCH);
1363     }
1364     break;
1365    
1366     /* Match a single character type; inline for speed */
1367    
1368     case OP_ANY:
1369 nigel 91 if ((ims & PCRE_DOTALL) == 0)
1370     {
1371 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1372 nigel 91 }
1373 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1374     if (utf8)
1375     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1376     ecode++;
1377     break;
1378    
1379     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1380     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1381    
1382     case OP_ANYBYTE:
1383     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1384     ecode++;
1385     break;
1386    
1387     case OP_NOT_DIGIT:
1388     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1389     GETCHARINCTEST(c, eptr);
1390     if (
1391     #ifdef SUPPORT_UTF8
1392     c < 256 &&
1393     #endif
1394     (md->ctypes[c] & ctype_digit) != 0
1395     )
1396     RRETURN(MATCH_NOMATCH);
1397     ecode++;
1398     break;
1399    
1400     case OP_DIGIT:
1401     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1402     GETCHARINCTEST(c, eptr);
1403     if (
1404     #ifdef SUPPORT_UTF8
1405     c >= 256 ||
1406     #endif
1407     (md->ctypes[c] & ctype_digit) == 0
1408     )
1409     RRETURN(MATCH_NOMATCH);
1410     ecode++;
1411     break;
1412    
1413     case OP_NOT_WHITESPACE:
1414     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1415     GETCHARINCTEST(c, eptr);
1416     if (
1417     #ifdef SUPPORT_UTF8
1418     c < 256 &&
1419     #endif
1420     (md->ctypes[c] & ctype_space) != 0
1421     )
1422     RRETURN(MATCH_NOMATCH);
1423     ecode++;
1424     break;
1425    
1426     case OP_WHITESPACE:
1427     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1428     GETCHARINCTEST(c, eptr);
1429     if (
1430     #ifdef SUPPORT_UTF8
1431     c >= 256 ||
1432     #endif
1433     (md->ctypes[c] & ctype_space) == 0
1434     )
1435     RRETURN(MATCH_NOMATCH);
1436     ecode++;
1437     break;
1438    
1439     case OP_NOT_WORDCHAR:
1440     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1441     GETCHARINCTEST(c, eptr);
1442     if (
1443     #ifdef SUPPORT_UTF8
1444     c < 256 &&
1445     #endif
1446     (md->ctypes[c] & ctype_word) != 0
1447     )
1448     RRETURN(MATCH_NOMATCH);
1449     ecode++;
1450     break;
1451    
1452     case OP_WORDCHAR:
1453     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1454     GETCHARINCTEST(c, eptr);
1455     if (
1456     #ifdef SUPPORT_UTF8
1457     c >= 256 ||
1458     #endif
1459     (md->ctypes[c] & ctype_word) == 0
1460     )
1461     RRETURN(MATCH_NOMATCH);
1462     ecode++;
1463     break;
1464    
1465 nigel 93 case OP_ANYNL:
1466     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1467     GETCHARINCTEST(c, eptr);
1468     switch(c)
1469     {
1470     default: RRETURN(MATCH_NOMATCH);
1471     case 0x000d:
1472     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1473     break;
1474     case 0x000a:
1475     case 0x000b:
1476     case 0x000c:
1477     case 0x0085:
1478     case 0x2028:
1479     case 0x2029:
1480     break;
1481     }
1482     ecode++;
1483     break;
1484    
1485 ph10 178 case OP_NOT_HSPACE:
1486     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1487     GETCHARINCTEST(c, eptr);
1488     switch(c)
1489     {
1490     default: break;
1491     case 0x09: /* HT */
1492     case 0x20: /* SPACE */
1493     case 0xa0: /* NBSP */
1494     case 0x1680: /* OGHAM SPACE MARK */
1495     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1496     case 0x2000: /* EN QUAD */
1497     case 0x2001: /* EM QUAD */
1498     case 0x2002: /* EN SPACE */
1499     case 0x2003: /* EM SPACE */
1500     case 0x2004: /* THREE-PER-EM SPACE */
1501     case 0x2005: /* FOUR-PER-EM SPACE */
1502     case 0x2006: /* SIX-PER-EM SPACE */
1503     case 0x2007: /* FIGURE SPACE */
1504     case 0x2008: /* PUNCTUATION SPACE */
1505     case 0x2009: /* THIN SPACE */
1506     case 0x200A: /* HAIR SPACE */
1507     case 0x202f: /* NARROW NO-BREAK SPACE */
1508     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1509     case 0x3000: /* IDEOGRAPHIC SPACE */
1510     RRETURN(MATCH_NOMATCH);
1511     }
1512     ecode++;
1513     break;
1514    
1515     case OP_HSPACE:
1516     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1517     GETCHARINCTEST(c, eptr);
1518     switch(c)
1519     {
1520     default: RRETURN(MATCH_NOMATCH);
1521     case 0x09: /* HT */
1522     case 0x20: /* SPACE */
1523     case 0xa0: /* NBSP */
1524     case 0x1680: /* OGHAM SPACE MARK */
1525     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1526     case 0x2000: /* EN QUAD */
1527     case 0x2001: /* EM QUAD */
1528     case 0x2002: /* EN SPACE */
1529     case 0x2003: /* EM SPACE */
1530     case 0x2004: /* THREE-PER-EM SPACE */
1531     case 0x2005: /* FOUR-PER-EM SPACE */
1532     case 0x2006: /* SIX-PER-EM SPACE */
1533     case 0x2007: /* FIGURE SPACE */
1534     case 0x2008: /* PUNCTUATION SPACE */
1535     case 0x2009: /* THIN SPACE */
1536     case 0x200A: /* HAIR SPACE */
1537     case 0x202f: /* NARROW NO-BREAK SPACE */
1538     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1539     case 0x3000: /* IDEOGRAPHIC SPACE */
1540     break;
1541     }
1542     ecode++;
1543     break;
1544    
1545     case OP_NOT_VSPACE:
1546     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1547     GETCHARINCTEST(c, eptr);
1548     switch(c)
1549     {
1550     default: break;
1551     case 0x0a: /* LF */
1552     case 0x0b: /* VT */
1553     case 0x0c: /* FF */
1554     case 0x0d: /* CR */
1555     case 0x85: /* NEL */
1556     case 0x2028: /* LINE SEPARATOR */
1557     case 0x2029: /* PARAGRAPH SEPARATOR */
1558     RRETURN(MATCH_NOMATCH);
1559     }
1560     ecode++;
1561     break;
1562    
1563     case OP_VSPACE:
1564     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1565     GETCHARINCTEST(c, eptr);
1566     switch(c)
1567     {
1568     default: RRETURN(MATCH_NOMATCH);
1569     case 0x0a: /* LF */
1570     case 0x0b: /* VT */
1571     case 0x0c: /* FF */
1572     case 0x0d: /* CR */
1573     case 0x85: /* NEL */
1574     case 0x2028: /* LINE SEPARATOR */
1575     case 0x2029: /* PARAGRAPH SEPARATOR */
1576     break;
1577     }
1578     ecode++;
1579     break;
1580    
1581 nigel 77 #ifdef SUPPORT_UCP
1582     /* Check the next character by Unicode property. We will get here only
1583     if the support is in the binary; otherwise a compile-time error occurs. */
1584    
1585     case OP_PROP:
1586     case OP_NOTPROP:
1587     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1588     GETCHARINCTEST(c, eptr);
1589     {
1590 nigel 87 int chartype, script;
1591     int category = _pcre_ucp_findprop(c, &chartype, &script);
1592 nigel 77
1593 nigel 87 switch(ecode[1])
1594     {
1595     case PT_ANY:
1596     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1597     break;
1598 nigel 77
1599 nigel 87 case PT_LAMP:
1600     if ((chartype == ucp_Lu ||
1601     chartype == ucp_Ll ||
1602     chartype == ucp_Lt) == (op == OP_NOTPROP))
1603 nigel 77 RRETURN(MATCH_NOMATCH);
1604 nigel 87 break;
1605    
1606     case PT_GC:
1607     if ((ecode[2] != category) == (op == OP_PROP))
1608 nigel 77 RRETURN(MATCH_NOMATCH);
1609 nigel 87 break;
1610    
1611     case PT_PC:
1612     if ((ecode[2] != chartype) == (op == OP_PROP))
1613     RRETURN(MATCH_NOMATCH);
1614     break;
1615    
1616     case PT_SC:
1617     if ((ecode[2] != script) == (op == OP_PROP))
1618     RRETURN(MATCH_NOMATCH);
1619     break;
1620    
1621     default:
1622     RRETURN(PCRE_ERROR_INTERNAL);
1623 nigel 77 }
1624 nigel 87
1625     ecode += 3;
1626 nigel 77 }
1627     break;
1628    
1629     /* Match an extended Unicode sequence. We will get here only if the support
1630     is in the binary; otherwise a compile-time error occurs. */
1631    
1632     case OP_EXTUNI:
1633     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1634     GETCHARINCTEST(c, eptr);
1635     {
1636 nigel 87 int chartype, script;
1637     int category = _pcre_ucp_findprop(c, &chartype, &script);
1638 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1639     while (eptr < md->end_subject)
1640     {
1641     int len = 1;
1642     if (!utf8) c = *eptr; else
1643     {
1644     GETCHARLEN(c, eptr, len);
1645     }
1646 nigel 87 category = _pcre_ucp_findprop(c, &chartype, &script);
1647 nigel 77 if (category != ucp_M) break;
1648     eptr += len;
1649     }
1650     }
1651     ecode++;
1652     break;
1653     #endif
1654    
1655    
1656     /* Match a back reference, possibly repeatedly. Look past the end of the
1657     item to see if there is repeat information following. The code is similar
1658     to that for character classes, but repeated for efficiency. Then obey
1659     similar code to character type repeats - written out again for speed.
1660     However, if the referenced string is the empty string, always treat
1661     it as matched, any number of times (otherwise there could be infinite
1662     loops). */
1663    
1664     case OP_REF:
1665     {
1666     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1667     ecode += 3; /* Advance past item */
1668    
1669     /* If the reference is unset, set the length to be longer than the amount
1670     of subject left; this ensures that every attempt at a match fails. We
1671     can't just fail here, because of the possibility of quantifiers with zero
1672     minima. */
1673    
1674     length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1675     md->end_subject - eptr + 1 :
1676     md->offset_vector[offset+1] - md->offset_vector[offset];
1677    
1678     /* Set up for repetition, or handle the non-repeated case */
1679    
1680     switch (*ecode)
1681     {
1682     case OP_CRSTAR:
1683     case OP_CRMINSTAR:
1684     case OP_CRPLUS:
1685     case OP_CRMINPLUS:
1686     case OP_CRQUERY:
1687     case OP_CRMINQUERY:
1688     c = *ecode++ - OP_CRSTAR;
1689     minimize = (c & 1) != 0;
1690     min = rep_min[c]; /* Pick up values from tables; */
1691     max = rep_max[c]; /* zero for max => infinity */
1692     if (max == 0) max = INT_MAX;
1693     break;
1694    
1695     case OP_CRRANGE:
1696     case OP_CRMINRANGE:
1697     minimize = (*ecode == OP_CRMINRANGE);
1698     min = GET2(ecode, 1);
1699     max = GET2(ecode, 3);
1700     if (max == 0) max = INT_MAX;
1701     ecode += 5;
1702     break;
1703    
1704     default: /* No repeat follows */
1705     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1706     eptr += length;
1707     continue; /* With the main loop */
1708     }
1709    
1710     /* If the length of the reference is zero, just continue with the
1711     main loop. */
1712    
1713     if (length == 0) continue;
1714    
1715     /* First, ensure the minimum number of matches are present. We get back
1716     the length of the reference string explicitly rather than passing the
1717     address of eptr, so that eptr can be a register variable. */
1718    
1719     for (i = 1; i <= min; i++)
1720     {
1721     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1722     eptr += length;
1723     }
1724    
1725     /* If min = max, continue at the same level without recursion.
1726     They are not both allowed to be zero. */
1727    
1728     if (min == max) continue;
1729    
1730     /* If minimizing, keep trying and advancing the pointer */
1731    
1732     if (minimize)
1733     {
1734     for (fi = min;; fi++)
1735     {
1736 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1737 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1738     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1739     RRETURN(MATCH_NOMATCH);
1740     eptr += length;
1741     }
1742     /* Control never gets here */
1743     }
1744    
1745     /* If maximizing, find the longest string and work backwards */
1746    
1747     else
1748     {
1749     pp = eptr;
1750     for (i = min; i < max; i++)
1751     {
1752     if (!match_ref(offset, eptr, length, md, ims)) break;
1753     eptr += length;
1754     }
1755     while (eptr >= pp)
1756     {
1757 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1758 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1759     eptr -= length;
1760     }
1761     RRETURN(MATCH_NOMATCH);
1762     }
1763     }
1764     /* Control never gets here */
1765    
1766    
1767    
1768     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1769     used when all the characters in the class have values in the range 0-255,
1770     and either the matching is caseful, or the characters are in the range
1771     0-127 when UTF-8 processing is enabled. The only difference between
1772     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1773     encountered.
1774    
1775     First, look past the end of the item to see if there is repeat information
1776     following. Then obey similar code to character type repeats - written out
1777     again for speed. */
1778    
1779     case OP_NCLASS:
1780     case OP_CLASS:
1781     {
1782     data = ecode + 1; /* Save for matching */
1783     ecode += 33; /* Advance past the item */
1784    
1785     switch (*ecode)
1786     {
1787     case OP_CRSTAR:
1788     case OP_CRMINSTAR:
1789     case OP_CRPLUS:
1790     case OP_CRMINPLUS:
1791     case OP_CRQUERY:
1792     case OP_CRMINQUERY:
1793     c = *ecode++ - OP_CRSTAR;
1794     minimize = (c & 1) != 0;
1795     min = rep_min[c]; /* Pick up values from tables; */
1796     max = rep_max[c]; /* zero for max => infinity */
1797     if (max == 0) max = INT_MAX;
1798     break;
1799    
1800     case OP_CRRANGE:
1801     case OP_CRMINRANGE:
1802     minimize = (*ecode == OP_CRMINRANGE);
1803     min = GET2(ecode, 1);
1804     max = GET2(ecode, 3);
1805     if (max == 0) max = INT_MAX;
1806     ecode += 5;
1807     break;
1808    
1809     default: /* No repeat follows */
1810     min = max = 1;
1811     break;
1812     }
1813    
1814     /* First, ensure the minimum number of matches are present. */
1815    
1816     #ifdef SUPPORT_UTF8
1817     /* UTF-8 mode */
1818     if (utf8)
1819     {
1820     for (i = 1; i <= min; i++)
1821     {
1822     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1823     GETCHARINC(c, eptr);
1824     if (c > 255)
1825     {
1826     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1827     }
1828     else
1829     {
1830     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1831     }
1832     }
1833     }
1834     else
1835     #endif
1836     /* Not UTF-8 mode */
1837     {
1838     for (i = 1; i <= min; i++)
1839     {
1840     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1841     c = *eptr++;
1842     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1843     }
1844     }
1845    
1846     /* If max == min we can continue with the main loop without the
1847     need to recurse. */
1848    
1849     if (min == max) continue;
1850    
1851     /* If minimizing, keep testing the rest of the expression and advancing
1852     the pointer while it matches the class. */
1853    
1854     if (minimize)
1855     {
1856     #ifdef SUPPORT_UTF8
1857     /* UTF-8 mode */
1858     if (utf8)
1859     {
1860     for (fi = min;; fi++)
1861     {
1862 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1863 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1864     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1865     GETCHARINC(c, eptr);
1866     if (c > 255)
1867     {
1868     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1869     }
1870     else
1871     {
1872     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1873     }
1874     }
1875     }
1876     else
1877     #endif
1878     /* Not UTF-8 mode */
1879     {
1880     for (fi = min;; fi++)
1881     {
1882 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1883 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1884     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1885     c = *eptr++;
1886     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1887     }
1888     }
1889     /* Control never gets here */
1890     }
1891    
1892     /* If maximizing, find the longest possible run, then work backwards. */
1893    
1894     else
1895     {
1896     pp = eptr;
1897    
1898     #ifdef SUPPORT_UTF8
1899     /* UTF-8 mode */
1900     if (utf8)
1901     {
1902     for (i = min; i < max; i++)
1903     {
1904     int len = 1;
1905     if (eptr >= md->end_subject) break;
1906     GETCHARLEN(c, eptr, len);
1907     if (c > 255)
1908     {
1909     if (op == OP_CLASS) break;
1910     }
1911     else
1912     {
1913     if ((data[c/8] & (1 << (c&7))) == 0) break;
1914     }
1915     eptr += len;
1916     }
1917     for (;;)
1918     {
1919 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1920 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1921     if (eptr-- == pp) break; /* Stop if tried at original pos */
1922     BACKCHAR(eptr);
1923     }
1924     }
1925     else
1926     #endif
1927     /* Not UTF-8 mode */
1928     {
1929     for (i = min; i < max; i++)
1930     {
1931     if (eptr >= md->end_subject) break;
1932     c = *eptr;
1933     if ((data[c/8] & (1 << (c&7))) == 0) break;
1934     eptr++;
1935     }
1936     while (eptr >= pp)
1937     {
1938 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
1939 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1940 nigel 77 eptr--;
1941     }
1942     }
1943    
1944     RRETURN(MATCH_NOMATCH);
1945     }
1946     }
1947     /* Control never gets here */
1948    
1949    
1950     /* Match an extended character class. This opcode is encountered only
1951     in UTF-8 mode, because that's the only time it is compiled. */
1952    
1953     #ifdef SUPPORT_UTF8
1954     case OP_XCLASS:
1955     {
1956     data = ecode + 1 + LINK_SIZE; /* Save for matching */
1957     ecode += GET(ecode, 1); /* Advance past the item */
1958    
1959     switch (*ecode)
1960     {
1961     case OP_CRSTAR:
1962     case OP_CRMINSTAR:
1963     case OP_CRPLUS:
1964     case OP_CRMINPLUS:
1965     case OP_CRQUERY:
1966     case OP_CRMINQUERY:
1967     c = *ecode++ - OP_CRSTAR;
1968     minimize = (c & 1) != 0;
1969     min = rep_min[c]; /* Pick up values from tables; */
1970     max = rep_max[c]; /* zero for max => infinity */
1971     if (max == 0) max = INT_MAX;
1972     break;
1973    
1974     case OP_CRRANGE:
1975     case OP_CRMINRANGE:
1976     minimize = (*ecode == OP_CRMINRANGE);
1977     min = GET2(ecode, 1);
1978     max = GET2(ecode, 3);
1979     if (max == 0) max = INT_MAX;
1980     ecode += 5;
1981     break;
1982    
1983     default: /* No repeat follows */
1984     min = max = 1;
1985     break;
1986     }
1987    
1988     /* First, ensure the minimum number of matches are present. */
1989    
1990     for (i = 1; i <= min; i++)
1991     {
1992     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1993     GETCHARINC(c, eptr);
1994     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1995     }
1996    
1997     /* If max == min we can continue with the main loop without the
1998     need to recurse. */
1999    
2000     if (min == max) continue;
2001    
2002     /* If minimizing, keep testing the rest of the expression and advancing
2003     the pointer while it matches the class. */
2004    
2005     if (minimize)
2006     {
2007     for (fi = min;; fi++)
2008     {
2009 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2010 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2011     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2012     GETCHARINC(c, eptr);
2013     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2014     }
2015     /* Control never gets here */
2016     }
2017    
2018     /* If maximizing, find the longest possible run, then work backwards. */
2019    
2020     else
2021     {
2022     pp = eptr;
2023     for (i = min; i < max; i++)
2024     {
2025     int len = 1;
2026     if (eptr >= md->end_subject) break;
2027     GETCHARLEN(c, eptr, len);
2028     if (!_pcre_xclass(c, data)) break;
2029     eptr += len;
2030     }
2031     for(;;)
2032     {
2033 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2034 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2035     if (eptr-- == pp) break; /* Stop if tried at original pos */
2036     BACKCHAR(eptr)
2037     }
2038     RRETURN(MATCH_NOMATCH);
2039     }
2040    
2041     /* Control never gets here */
2042     }
2043     #endif /* End of XCLASS */
2044    
2045     /* Match a single character, casefully */
2046    
2047     case OP_CHAR:
2048     #ifdef SUPPORT_UTF8
2049     if (utf8)
2050     {
2051     length = 1;
2052     ecode++;
2053     GETCHARLEN(fc, ecode, length);
2054     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2055     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2056     }
2057     else
2058     #endif
2059    
2060     /* Non-UTF-8 mode */
2061     {
2062     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2063     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2064     ecode += 2;
2065     }
2066     break;
2067    
2068     /* Match a single character, caselessly */
2069    
2070     case OP_CHARNC:
2071     #ifdef SUPPORT_UTF8
2072     if (utf8)
2073     {
2074     length = 1;
2075     ecode++;
2076     GETCHARLEN(fc, ecode, length);
2077    
2078     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2079    
2080     /* If the pattern character's value is < 128, we have only one byte, and
2081     can use the fast lookup table. */
2082    
2083     if (fc < 128)
2084     {
2085     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2086     }
2087    
2088     /* Otherwise we must pick up the subject character */
2089    
2090     else
2091     {
2092 nigel 93 unsigned int dc;
2093 nigel 77 GETCHARINC(dc, eptr);
2094     ecode += length;
2095    
2096     /* If we have Unicode property support, we can use it to test the other
2097 nigel 87 case of the character, if there is one. */
2098 nigel 77
2099     if (fc != dc)
2100     {
2101     #ifdef SUPPORT_UCP
2102 nigel 87 if (dc != _pcre_ucp_othercase(fc))
2103 nigel 77 #endif
2104     RRETURN(MATCH_NOMATCH);
2105     }
2106     }
2107     }
2108     else
2109     #endif /* SUPPORT_UTF8 */
2110    
2111     /* Non-UTF-8 mode */
2112     {
2113     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2114     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2115     ecode += 2;
2116     }
2117     break;
2118    
2119 nigel 93 /* Match a single character repeatedly. */
2120 nigel 77
2121     case OP_EXACT:
2122     min = max = GET2(ecode, 1);
2123     ecode += 3;
2124     goto REPEATCHAR;
2125    
2126 nigel 93 case OP_POSUPTO:
2127     possessive = TRUE;
2128     /* Fall through */
2129    
2130 nigel 77 case OP_UPTO:
2131     case OP_MINUPTO:
2132     min = 0;
2133     max = GET2(ecode, 1);
2134     minimize = *ecode == OP_MINUPTO;
2135     ecode += 3;
2136     goto REPEATCHAR;
2137    
2138 nigel 93 case OP_POSSTAR:
2139     possessive = TRUE;
2140     min = 0;
2141     max = INT_MAX;
2142     ecode++;
2143     goto REPEATCHAR;
2144    
2145     case OP_POSPLUS:
2146     possessive = TRUE;
2147     min = 1;
2148     max = INT_MAX;
2149     ecode++;
2150     goto REPEATCHAR;
2151    
2152     case OP_POSQUERY:
2153     possessive = TRUE;
2154     min = 0;
2155     max = 1;
2156     ecode++;
2157     goto REPEATCHAR;
2158    
2159 nigel 77 case OP_STAR:
2160     case OP_MINSTAR:
2161     case OP_PLUS:
2162     case OP_MINPLUS:
2163     case OP_QUERY:
2164     case OP_MINQUERY:
2165     c = *ecode++ - OP_STAR;
2166     minimize = (c & 1) != 0;
2167     min = rep_min[c]; /* Pick up values from tables; */
2168     max = rep_max[c]; /* zero for max => infinity */
2169     if (max == 0) max = INT_MAX;
2170    
2171     /* Common code for all repeated single-character matches. We can give
2172     up quickly if there are fewer than the minimum number of characters left in
2173     the subject. */
2174    
2175     REPEATCHAR:
2176     #ifdef SUPPORT_UTF8
2177     if (utf8)
2178     {
2179     length = 1;
2180     charptr = ecode;
2181     GETCHARLEN(fc, ecode, length);
2182     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2183     ecode += length;
2184    
2185     /* Handle multibyte character matching specially here. There is
2186     support for caseless matching if UCP support is present. */
2187    
2188     if (length > 1)
2189     {
2190     #ifdef SUPPORT_UCP
2191 nigel 93 unsigned int othercase;
2192 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2193 nigel 93 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2194 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2195 ph10 115 else oclength = 0;
2196 nigel 77 #endif /* SUPPORT_UCP */
2197    
2198     for (i = 1; i <= min; i++)
2199     {
2200     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2201 ph10 123 #ifdef SUPPORT_UCP
2202 nigel 77 /* Need braces because of following else */
2203     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2204     else
2205     {
2206     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2207     eptr += oclength;
2208     }
2209 ph10 115 #else /* without SUPPORT_UCP */
2210     else { RRETURN(MATCH_NOMATCH); }
2211 ph10 123 #endif /* SUPPORT_UCP */
2212 nigel 77 }
2213    
2214     if (min == max) continue;
2215    
2216     if (minimize)
2217     {
2218     for (fi = min;; fi++)
2219     {
2220 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2221 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2222     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2223     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2224 ph10 123 #ifdef SUPPORT_UCP
2225 nigel 77 /* Need braces because of following else */
2226     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2227     else
2228     {
2229     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2230     eptr += oclength;
2231     }
2232 ph10 115 #else /* without SUPPORT_UCP */
2233     else { RRETURN (MATCH_NOMATCH); }
2234     #endif /* SUPPORT_UCP */
2235 nigel 77 }
2236     /* Control never gets here */
2237     }
2238 nigel 93
2239     else /* Maximize */
2240 nigel 77 {
2241     pp = eptr;
2242     for (i = min; i < max; i++)
2243     {
2244     if (eptr > md->end_subject - length) break;
2245     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2246 ph10 123 #ifdef SUPPORT_UCP
2247 nigel 77 else if (oclength == 0) break;
2248     else
2249     {
2250     if (memcmp(eptr, occhars, oclength) != 0) break;
2251     eptr += oclength;
2252     }
2253 ph10 115 #else /* without SUPPORT_UCP */
2254     else break;
2255 ph10 123 #endif /* SUPPORT_UCP */
2256 nigel 77 }
2257 nigel 93
2258     if (possessive) continue;
2259 ph10 120 for(;;)
2260 nigel 77 {
2261 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2262 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2263 ph10 120 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2264 ph10 115 #ifdef SUPPORT_UCP
2265     eptr--;
2266     BACKCHAR(eptr);
2267 ph10 123 #else /* without SUPPORT_UCP */
2268 nigel 77 eptr -= length;
2269 ph10 123 #endif /* SUPPORT_UCP */
2270 nigel 77 }
2271     }
2272     /* Control never gets here */
2273     }
2274    
2275     /* If the length of a UTF-8 character is 1, we fall through here, and
2276     obey the code as for non-UTF-8 characters below, though in this case the
2277     value of fc will always be < 128. */
2278     }
2279     else
2280     #endif /* SUPPORT_UTF8 */
2281    
2282     /* When not in UTF-8 mode, load a single-byte character. */
2283     {
2284     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2285     fc = *ecode++;
2286     }
2287    
2288     /* The value of fc at this point is always less than 256, though we may or
2289     may not be in UTF-8 mode. The code is duplicated for the caseless and
2290     caseful cases, for speed, since matching characters is likely to be quite
2291     common. First, ensure the minimum number of matches are present. If min =
2292     max, continue at the same level without recursing. Otherwise, if
2293     minimizing, keep trying the rest of the expression and advancing one
2294     matching character if failing, up to the maximum. Alternatively, if
2295     maximizing, find the maximum number of characters and work backwards. */
2296    
2297     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2298     max, eptr));
2299    
2300     if ((ims & PCRE_CASELESS) != 0)
2301     {
2302     fc = md->lcc[fc];
2303     for (i = 1; i <= min; i++)
2304     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2305     if (min == max) continue;
2306     if (minimize)
2307     {
2308     for (fi = min;; fi++)
2309     {
2310 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2311 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2312     if (fi >= max || eptr >= md->end_subject ||
2313     fc != md->lcc[*eptr++])
2314     RRETURN(MATCH_NOMATCH);
2315     }
2316     /* Control never gets here */
2317     }
2318 nigel 93 else /* Maximize */
2319 nigel 77 {
2320     pp = eptr;
2321     for (i = min; i < max; i++)
2322     {
2323     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2324     eptr++;
2325     }
2326 nigel 93 if (possessive) continue;
2327 nigel 77 while (eptr >= pp)
2328     {
2329 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2330 nigel 77 eptr--;
2331     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2332     }
2333     RRETURN(MATCH_NOMATCH);
2334     }
2335     /* Control never gets here */
2336     }
2337    
2338     /* Caseful comparisons (multi-byte characters were fully handled above) */
2339    
2340     else
2341     {
2342     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2343     if (min == max) continue;
2344     if (minimize)
2345     {
2346     for (fi = min;; fi++)
2347     {
2348 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2349 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2350     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2351     RRETURN(MATCH_NOMATCH);
2352     }
2353     /* Control never gets here */
2354     }
2355 nigel 93 else /* Maximize */
2356 nigel 77 {
2357     pp = eptr;
2358     for (i = min; i < max; i++)
2359     {
2360     if (eptr >= md->end_subject || fc != *eptr) break;
2361     eptr++;
2362     }
2363 nigel 93 if (possessive) continue;
2364 nigel 77 while (eptr >= pp)
2365     {
2366 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2367 nigel 77 eptr--;
2368     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2369     }
2370     RRETURN(MATCH_NOMATCH);
2371     }
2372     }
2373     /* Control never gets here */
2374    
2375     /* Match a negated single one-byte character. The pattern character is one
2376     byte, but the subject character checked against it can be multibyte. */
2377    
2378     case OP_NOT:
2379     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2380     ecode++;
2381     GETCHARINCTEST(c, eptr);
2382     if ((ims & PCRE_CASELESS) != 0)
2383     {
2384     #ifdef SUPPORT_UTF8
2385     if (c < 256)
2386     #endif
2387     c = md->lcc[c];
2388     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2389     }
2390     else
2391     {
2392     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2393     }
2394     break;
2395    
2396     /* Match a negated single one-byte character repeatedly. This is almost a
2397     repeat of the code for a repeated single character, but I haven't found a
2398     nice way of commoning these up that doesn't require a test of the
2399     positive/negative option for each character match. Maybe that wouldn't add
2400     very much to the time taken, but character matching *is* what this is all
2401     about... */
2402    
2403     case OP_NOTEXACT:
2404     min = max = GET2(ecode, 1);
2405     ecode += 3;
2406     goto REPEATNOTCHAR;
2407    
2408     case OP_NOTUPTO:
2409     case OP_NOTMINUPTO:
2410     min = 0;
2411     max = GET2(ecode, 1);
2412     minimize = *ecode == OP_NOTMINUPTO;
2413     ecode += 3;
2414     goto REPEATNOTCHAR;
2415    
2416 nigel 93 case OP_NOTPOSSTAR:
2417     possessive = TRUE;
2418     min = 0;
2419     max = INT_MAX;
2420     ecode++;
2421     goto REPEATNOTCHAR;
2422    
2423     case OP_NOTPOSPLUS:
2424     possessive = TRUE;
2425     min = 1;
2426     max = INT_MAX;
2427     ecode++;
2428     goto REPEATNOTCHAR;
2429    
2430     case OP_NOTPOSQUERY:
2431     possessive = TRUE;
2432     min = 0;
2433     max = 1;
2434     ecode++;
2435     goto REPEATNOTCHAR;
2436    
2437     case OP_NOTPOSUPTO:
2438     possessive = TRUE;
2439     min = 0;
2440     max = GET2(ecode, 1);
2441     ecode += 3;
2442     goto REPEATNOTCHAR;
2443    
2444 nigel 77 case OP_NOTSTAR:
2445     case OP_NOTMINSTAR:
2446     case OP_NOTPLUS:
2447     case OP_NOTMINPLUS:
2448     case OP_NOTQUERY:
2449     case OP_NOTMINQUERY:
2450     c = *ecode++ - OP_NOTSTAR;
2451     minimize = (c & 1) != 0;
2452     min = rep_min[c]; /* Pick up values from tables; */
2453     max = rep_max[c]; /* zero for max => infinity */
2454     if (max == 0) max = INT_MAX;
2455    
2456     /* Common code for all repeated single-byte matches. We can give up quickly
2457     if there are fewer than the minimum number of bytes left in the
2458     subject. */
2459    
2460     REPEATNOTCHAR:
2461     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2462     fc = *ecode++;
2463    
2464     /* The code is duplicated for the caseless and caseful cases, for speed,
2465     since matching characters is likely to be quite common. First, ensure the
2466     minimum number of matches are present. If min = max, continue at the same
2467     level without recursing. Otherwise, if minimizing, keep trying the rest of
2468     the expression and advancing one matching character if failing, up to the
2469     maximum. Alternatively, if maximizing, find the maximum number of
2470     characters and work backwards. */
2471    
2472     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2473     max, eptr));
2474    
2475     if ((ims & PCRE_CASELESS) != 0)
2476     {
2477     fc = md->lcc[fc];
2478    
2479     #ifdef SUPPORT_UTF8
2480     /* UTF-8 mode */
2481     if (utf8)
2482     {
2483 nigel 93 register unsigned int d;
2484 nigel 77 for (i = 1; i <= min; i++)
2485     {
2486     GETCHARINC(d, eptr);
2487     if (d < 256) d = md->lcc[d];
2488     if (fc == d) RRETURN(MATCH_NOMATCH);
2489     }
2490     }
2491     else
2492     #endif
2493    
2494     /* Not UTF-8 mode */
2495     {
2496     for (i = 1; i <= min; i++)
2497     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2498     }
2499    
2500     if (min == max) continue;
2501    
2502     if (minimize)
2503     {
2504     #ifdef SUPPORT_UTF8
2505     /* UTF-8 mode */
2506     if (utf8)
2507     {
2508 nigel 93 register unsigned int d;
2509 nigel 77 for (fi = min;; fi++)
2510     {
2511 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2512 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2513     GETCHARINC(d, eptr);
2514     if (d < 256) d = md->lcc[d];
2515     if (fi >= max || eptr >= md->end_subject || fc == d)
2516     RRETURN(MATCH_NOMATCH);
2517     }
2518     }
2519     else
2520     #endif
2521     /* Not UTF-8 mode */
2522     {
2523     for (fi = min;; fi++)
2524     {
2525 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2526 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2527     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2528     RRETURN(MATCH_NOMATCH);
2529     }
2530     }
2531     /* Control never gets here */
2532     }
2533    
2534     /* Maximize case */
2535    
2536     else
2537     {
2538     pp = eptr;
2539    
2540     #ifdef SUPPORT_UTF8
2541     /* UTF-8 mode */
2542     if (utf8)
2543     {
2544 nigel 93 register unsigned int d;
2545 nigel 77 for (i = min; i < max; i++)
2546     {
2547     int len = 1;
2548     if (eptr >= md->end_subject) break;
2549     GETCHARLEN(d, eptr, len);
2550     if (d < 256) d = md->lcc[d];
2551     if (fc == d) break;
2552     eptr += len;
2553     }
2554 nigel 93 if (possessive) continue;
2555     for(;;)
2556 nigel 77 {
2557 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2558 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2559     if (eptr-- == pp) break; /* Stop if tried at original pos */
2560     BACKCHAR(eptr);
2561     }
2562     }
2563     else
2564     #endif
2565     /* Not UTF-8 mode */
2566     {
2567     for (i = min; i < max; i++)
2568     {
2569     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2570     eptr++;
2571     }
2572 nigel 93 if (possessive) continue;
2573 nigel 77 while (eptr >= pp)
2574     {
2575 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2576 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2577     eptr--;
2578     }
2579     }
2580    
2581     RRETURN(MATCH_NOMATCH);
2582     }
2583     /* Control never gets here */
2584     }
2585    
2586     /* Caseful comparisons */
2587    
2588     else
2589     {
2590     #ifdef SUPPORT_UTF8
2591     /* UTF-8 mode */
2592     if (utf8)
2593     {
2594 nigel 93 register unsigned int d;
2595 nigel 77 for (i = 1; i <= min; i++)
2596     {
2597     GETCHARINC(d, eptr);
2598     if (fc == d) RRETURN(MATCH_NOMATCH);
2599     }
2600     }
2601     else
2602     #endif
2603     /* Not UTF-8 mode */
2604     {
2605     for (i = 1; i <= min; i++)
2606     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2607     }
2608    
2609     if (min == max) continue;
2610    
2611     if (minimize)
2612     {
2613     #ifdef SUPPORT_UTF8
2614     /* UTF-8 mode */
2615     if (utf8)
2616     {
2617 nigel 93 register unsigned int d;
2618 nigel 77 for (fi = min;; fi++)
2619     {
2620 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2621 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2622     GETCHARINC(d, eptr);
2623     if (fi >= max || eptr >= md->end_subject || fc == d)
2624     RRETURN(MATCH_NOMATCH);
2625     }
2626     }
2627     else
2628     #endif
2629     /* Not UTF-8 mode */
2630     {
2631     for (fi = min;; fi++)
2632     {
2633 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2634 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2635     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2636     RRETURN(MATCH_NOMATCH);
2637     }
2638     }
2639     /* Control never gets here */
2640     }
2641    
2642     /* Maximize case */
2643    
2644     else
2645     {
2646     pp = eptr;
2647    
2648     #ifdef SUPPORT_UTF8
2649     /* UTF-8 mode */
2650     if (utf8)
2651     {
2652 nigel 93 register unsigned int d;
2653 nigel 77 for (i = min; i < max; i++)
2654     {
2655     int len = 1;
2656     if (eptr >= md->end_subject) break;
2657     GETCHARLEN(d, eptr, len);
2658     if (fc == d) break;
2659     eptr += len;
2660     }
2661 nigel 93 if (possessive) continue;
2662 nigel 77 for(;;)
2663     {
2664 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2665 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2666     if (eptr-- == pp) break; /* Stop if tried at original pos */
2667     BACKCHAR(eptr);
2668     }
2669     }
2670     else
2671     #endif
2672     /* Not UTF-8 mode */
2673     {
2674     for (i = min; i < max; i++)
2675     {
2676     if (eptr >= md->end_subject || fc == *eptr) break;
2677     eptr++;
2678     }
2679 nigel 93 if (possessive) continue;
2680 nigel 77 while (eptr >= pp)
2681     {
2682 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2683 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2684     eptr--;
2685     }
2686     }
2687    
2688     RRETURN(MATCH_NOMATCH);
2689     }
2690     }
2691     /* Control never gets here */
2692    
2693     /* Match a single character type repeatedly; several different opcodes
2694     share code. This is very similar to the code for single characters, but we
2695     repeat it in the interests of efficiency. */
2696    
2697     case OP_TYPEEXACT:
2698     min = max = GET2(ecode, 1);
2699     minimize = TRUE;
2700     ecode += 3;
2701     goto REPEATTYPE;
2702    
2703     case OP_TYPEUPTO:
2704     case OP_TYPEMINUPTO:
2705     min = 0;
2706     max = GET2(ecode, 1);
2707     minimize = *ecode == OP_TYPEMINUPTO;
2708     ecode += 3;
2709     goto REPEATTYPE;
2710    
2711 nigel 93 case OP_TYPEPOSSTAR:
2712     possessive = TRUE;
2713     min = 0;
2714     max = INT_MAX;
2715     ecode++;
2716     goto REPEATTYPE;
2717    
2718     case OP_TYPEPOSPLUS:
2719     possessive = TRUE;
2720     min = 1;
2721     max = INT_MAX;
2722     ecode++;
2723     goto REPEATTYPE;
2724    
2725     case OP_TYPEPOSQUERY:
2726     possessive = TRUE;
2727     min = 0;
2728     max = 1;
2729     ecode++;
2730     goto REPEATTYPE;
2731    
2732     case OP_TYPEPOSUPTO:
2733     possessive = TRUE;
2734     min = 0;
2735     max = GET2(ecode, 1);
2736     ecode += 3;
2737     goto REPEATTYPE;
2738    
2739 nigel 77 case OP_TYPESTAR:
2740     case OP_TYPEMINSTAR:
2741     case OP_TYPEPLUS:
2742     case OP_TYPEMINPLUS:
2743     case OP_TYPEQUERY:
2744     case OP_TYPEMINQUERY:
2745     c = *ecode++ - OP_TYPESTAR;
2746     minimize = (c & 1) != 0;
2747     min = rep_min[c]; /* Pick up values from tables; */
2748     max = rep_max[c]; /* zero for max => infinity */
2749     if (max == 0) max = INT_MAX;
2750    
2751     /* Common code for all repeated single character type matches. Note that
2752     in UTF-8 mode, '.' matches a character of any length, but for the other
2753     character types, the valid characters are all one-byte long. */
2754    
2755     REPEATTYPE:
2756     ctype = *ecode++; /* Code for the character type */
2757    
2758     #ifdef SUPPORT_UCP
2759     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2760     {
2761     prop_fail_result = ctype == OP_NOTPROP;
2762     prop_type = *ecode++;
2763 nigel 87 prop_value = *ecode++;
2764 nigel 77 }
2765     else prop_type = -1;
2766     #endif
2767    
2768     /* First, ensure the minimum number of matches are present. Use inline
2769     code for maximizing the speed, and do the type test once at the start
2770     (i.e. keep it out of the loop). Also we can test that there are at least
2771     the minimum number of bytes before we start. This isn't as effective in
2772     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2773     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2774     and single-bytes. */
2775    
2776     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2777     if (min > 0)
2778     {
2779     #ifdef SUPPORT_UCP
2780 nigel 87 if (prop_type >= 0)
2781 nigel 77 {
2782 nigel 87 switch(prop_type)
2783 nigel 77 {
2784 nigel 87 case PT_ANY:
2785     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2786     for (i = 1; i <= min; i++)
2787     {
2788     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2789 ph10 184 GETCHARINCTEST(c, eptr);
2790 nigel 87 }
2791     break;
2792    
2793     case PT_LAMP:
2794     for (i = 1; i <= min; i++)
2795     {
2796     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2797 ph10 184 GETCHARINCTEST(c, eptr);
2798 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2799     if ((prop_chartype == ucp_Lu ||
2800     prop_chartype == ucp_Ll ||
2801     prop_chartype == ucp_Lt) == prop_fail_result)
2802     RRETURN(MATCH_NOMATCH);
2803     }
2804     break;
2805    
2806     case PT_GC:
2807     for (i = 1; i <= min; i++)
2808     {
2809     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2810 ph10 184 GETCHARINCTEST(c, eptr);
2811 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2812     if ((prop_category == prop_value) == prop_fail_result)
2813     RRETURN(MATCH_NOMATCH);
2814     }
2815     break;
2816    
2817     case PT_PC:
2818     for (i = 1; i <= min; i++)
2819     {
2820     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2821 ph10 184 GETCHARINCTEST(c, eptr);
2822 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2823     if ((prop_chartype == prop_value) == prop_fail_result)
2824     RRETURN(MATCH_NOMATCH);
2825     }
2826     break;
2827    
2828     case PT_SC:
2829     for (i = 1; i <= min; i++)
2830     {
2831     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2832 ph10 184 GETCHARINCTEST(c, eptr);
2833 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2834     if ((prop_script == prop_value) == prop_fail_result)
2835     RRETURN(MATCH_NOMATCH);
2836     }
2837     break;
2838    
2839     default:
2840     RRETURN(PCRE_ERROR_INTERNAL);
2841 nigel 77 }
2842     }
2843    
2844     /* Match extended Unicode sequences. We will get here only if the
2845     support is in the binary; otherwise a compile-time error occurs. */
2846    
2847     else if (ctype == OP_EXTUNI)
2848     {
2849     for (i = 1; i <= min; i++)
2850     {
2851     GETCHARINCTEST(c, eptr);
2852 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2853 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2854     while (eptr < md->end_subject)
2855     {
2856     int len = 1;
2857     if (!utf8) c = *eptr; else
2858     {
2859     GETCHARLEN(c, eptr, len);
2860     }
2861 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2862 nigel 77 if (prop_category != ucp_M) break;
2863     eptr += len;
2864     }
2865     }
2866     }
2867    
2868     else
2869     #endif /* SUPPORT_UCP */
2870    
2871     /* Handle all other cases when the coding is UTF-8 */
2872    
2873     #ifdef SUPPORT_UTF8
2874     if (utf8) switch(ctype)
2875     {
2876     case OP_ANY:
2877     for (i = 1; i <= min; i++)
2878     {
2879     if (eptr >= md->end_subject ||
2880 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2881 nigel 77 RRETURN(MATCH_NOMATCH);
2882 nigel 91 eptr++;
2883 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2884     }
2885     break;
2886    
2887     case OP_ANYBYTE:
2888     eptr += min;
2889     break;
2890    
2891 nigel 93 case OP_ANYNL:
2892     for (i = 1; i <= min; i++)
2893     {
2894     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2895     GETCHARINC(c, eptr);
2896     switch(c)
2897     {
2898     default: RRETURN(MATCH_NOMATCH);
2899     case 0x000d:
2900     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2901     break;
2902     case 0x000a:
2903     case 0x000b:
2904     case 0x000c:
2905     case 0x0085:
2906     case 0x2028:
2907     case 0x2029:
2908     break;
2909     }
2910     }
2911     break;
2912    
2913 ph10 178 case OP_NOT_HSPACE:
2914     for (i = 1; i <= min; i++)
2915     {
2916     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2917     GETCHARINC(c, eptr);
2918     switch(c)
2919     {
2920     default: break;
2921     case 0x09: /* HT */
2922     case 0x20: /* SPACE */
2923     case 0xa0: /* NBSP */
2924     case 0x1680: /* OGHAM SPACE MARK */
2925     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2926     case 0x2000: /* EN QUAD */
2927     case 0x2001: /* EM QUAD */
2928     case 0x2002: /* EN SPACE */
2929     case 0x2003: /* EM SPACE */
2930     case 0x2004: /* THREE-PER-EM SPACE */
2931     case 0x2005: /* FOUR-PER-EM SPACE */
2932     case 0x2006: /* SIX-PER-EM SPACE */
2933     case 0x2007: /* FIGURE SPACE */
2934     case 0x2008: /* PUNCTUATION SPACE */
2935     case 0x2009: /* THIN SPACE */
2936     case 0x200A: /* HAIR SPACE */
2937     case 0x202f: /* NARROW NO-BREAK SPACE */
2938     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2939     case 0x3000: /* IDEOGRAPHIC SPACE */
2940     RRETURN(MATCH_NOMATCH);
2941     }
2942     }
2943     break;
2944 ph10 182
2945 ph10 178 case OP_HSPACE:
2946     for (i = 1; i <= min; i++)
2947     {
2948     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2949     GETCHARINC(c, eptr);
2950     switch(c)
2951     {
2952     default: RRETURN(MATCH_NOMATCH);
2953     case 0x09: /* HT */
2954     case 0x20: /* SPACE */
2955     case 0xa0: /* NBSP */
2956     case 0x1680: /* OGHAM SPACE MARK */
2957     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2958     case 0x2000: /* EN QUAD */
2959     case 0x2001: /* EM QUAD */
2960     case 0x2002: /* EN SPACE */
2961     case 0x2003: /* EM SPACE */
2962     case 0x2004: /* THREE-PER-EM SPACE */
2963     case 0x2005: /* FOUR-PER-EM SPACE */
2964     case 0x2006: /* SIX-PER-EM SPACE */
2965     case 0x2007: /* FIGURE SPACE */
2966     case 0x2008: /* PUNCTUATION SPACE */
2967     case 0x2009: /* THIN SPACE */
2968     case 0x200A: /* HAIR SPACE */
2969     case 0x202f: /* NARROW NO-BREAK SPACE */
2970     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2971     case 0x3000: /* IDEOGRAPHIC SPACE */
2972     break;
2973     }
2974     }
2975     break;
2976 ph10 182
2977 ph10 178 case OP_NOT_VSPACE:
2978     for (i = 1; i <= min; i++)
2979     {
2980     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2981     GETCHARINC(c, eptr);
2982     switch(c)
2983     {
2984     default: break;
2985     case 0x0a: /* LF */
2986     case 0x0b: /* VT */
2987     case 0x0c: /* FF */
2988     case 0x0d: /* CR */
2989     case 0x85: /* NEL */
2990     case 0x2028: /* LINE SEPARATOR */
2991     case 0x2029: /* PARAGRAPH SEPARATOR */
2992     RRETURN(MATCH_NOMATCH);
2993     }
2994     }
2995     break;
2996 ph10 182
2997 ph10 178 case OP_VSPACE:
2998     for (i = 1; i <= min; i++)
2999     {
3000     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3001     GETCHARINC(c, eptr);
3002     switch(c)
3003     {
3004     default: RRETURN(MATCH_NOMATCH);
3005     case 0x0a: /* LF */
3006     case 0x0b: /* VT */
3007     case 0x0c: /* FF */
3008     case 0x0d: /* CR */
3009     case 0x85: /* NEL */
3010     case 0x2028: /* LINE SEPARATOR */
3011     case 0x2029: /* PARAGRAPH SEPARATOR */
3012 ph10 182 break;
3013 ph10 178 }
3014     }
3015     break;
3016    
3017 nigel 77 case OP_NOT_DIGIT:
3018     for (i = 1; i <= min; i++)
3019     {
3020     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3021     GETCHARINC(c, eptr);
3022     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3023     RRETURN(MATCH_NOMATCH);
3024     }
3025     break;
3026    
3027     case OP_DIGIT:
3028     for (i = 1; i <= min; i++)
3029     {
3030     if (eptr >= md->end_subject ||
3031     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3032     RRETURN(MATCH_NOMATCH);
3033     /* No need to skip more bytes - we know it's a 1-byte character */
3034     }
3035     break;
3036    
3037     case OP_NOT_WHITESPACE:
3038     for (i = 1; i <= min; i++)
3039     {
3040     if (eptr >= md->end_subject ||
3041     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
3042     RRETURN(MATCH_NOMATCH);
3043     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3044     }
3045     break;
3046    
3047     case OP_WHITESPACE:
3048     for (i = 1; i <= min; i++)
3049     {
3050     if (eptr >= md->end_subject ||
3051     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3052     RRETURN(MATCH_NOMATCH);
3053     /* No need to skip more bytes - we know it's a 1-byte character */
3054     }
3055     break;
3056    
3057     case OP_NOT_WORDCHAR:
3058     for (i = 1; i <= min; i++)
3059     {
3060     if (eptr >= md->end_subject ||
3061     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
3062     RRETURN(MATCH_NOMATCH);
3063     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3064     }
3065     break;
3066    
3067     case OP_WORDCHAR:
3068     for (i = 1; i <= min; i++)
3069     {
3070     if (eptr >= md->end_subject ||
3071     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3072     RRETURN(MATCH_NOMATCH);
3073     /* No need to skip more bytes - we know it's a 1-byte character */
3074     }
3075     break;
3076    
3077     default:
3078     RRETURN(PCRE_ERROR_INTERNAL);
3079     } /* End switch(ctype) */
3080    
3081     else
3082     #endif /* SUPPORT_UTF8 */
3083    
3084     /* Code for the non-UTF-8 case for minimum matching of operators other
3085 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3086     number of bytes present, as this was tested above. */
3087 nigel 77
3088     switch(ctype)
3089     {
3090     case OP_ANY:
3091     if ((ims & PCRE_DOTALL) == 0)
3092     {
3093     for (i = 1; i <= min; i++)
3094 nigel 91 {
3095 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3096 nigel 91 eptr++;
3097     }
3098 nigel 77 }
3099     else eptr += min;
3100     break;
3101    
3102     case OP_ANYBYTE:
3103     eptr += min;
3104     break;
3105    
3106 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
3107     bytes are present in this case. */
3108    
3109     case OP_ANYNL:
3110     for (i = 1; i <= min; i++)
3111     {
3112     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3113     switch(*eptr++)
3114     {
3115     default: RRETURN(MATCH_NOMATCH);
3116     case 0x000d:
3117     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3118     break;
3119     case 0x000a:
3120     case 0x000b:
3121     case 0x000c:
3122     case 0x0085:
3123     break;
3124     }
3125     }
3126     break;
3127    
3128 ph10 178 case OP_NOT_HSPACE:
3129     for (i = 1; i <= min; i++)
3130     {
3131     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3132     switch(*eptr++)
3133     {
3134     default: break;
3135     case 0x09: /* HT */
3136     case 0x20: /* SPACE */
3137     case 0xa0: /* NBSP */
3138     RRETURN(MATCH_NOMATCH);
3139     }
3140     }
3141     break;
3142    
3143     case OP_HSPACE:
3144     for (i = 1; i <= min; i++)
3145     {
3146     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3147     switch(*eptr++)
3148     {
3149     default: RRETURN(MATCH_NOMATCH);
3150     case 0x09: /* HT */
3151     case 0x20: /* SPACE */
3152     case 0xa0: /* NBSP */
3153 ph10 182 break;
3154 ph10 178 }
3155     }
3156     break;
3157    
3158     case OP_NOT_VSPACE:
3159     for (i = 1; i <= min; i++)
3160     {
3161     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3162     switch(*eptr++)
3163     {
3164     default: break;
3165     case 0x0a: /* LF */
3166     case 0x0b: /* VT */
3167     case 0x0c: /* FF */
3168     case 0x0d: /* CR */
3169     case 0x85: /* NEL */
3170     RRETURN(MATCH_NOMATCH);
3171     }
3172     }
3173     break;
3174    
3175     case OP_VSPACE:
3176     for (i = 1; i <= min; i++)
3177     {
3178     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3179     switch(*eptr++)
3180     {
3181     default: RRETURN(MATCH_NOMATCH);
3182     case 0x0a: /* LF */
3183     case 0x0b: /* VT */
3184     case 0x0c: /* FF */
3185     case 0x0d: /* CR */
3186     case 0x85: /* NEL */
3187 ph10 182 break;
3188 ph10 178 }
3189     }
3190     break;
3191    
3192 nigel 77 case OP_NOT_DIGIT:
3193     for (i = 1; i <= min; i++)
3194     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3195     break;
3196    
3197     case OP_DIGIT:
3198     for (i = 1; i <= min; i++)
3199     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3200     break;
3201    
3202     case OP_NOT_WHITESPACE:
3203     for (i = 1; i <= min; i++)
3204     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3205     break;
3206    
3207     case OP_WHITESPACE:
3208     for (i = 1; i <= min; i++)
3209     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3210     break;
3211    
3212     case OP_NOT_WORDCHAR:
3213     for (i = 1; i <= min; i++)
3214     if ((md->ctypes[*eptr++] & ctype_word) != 0)
3215     RRETURN(MATCH_NOMATCH);
3216     break;
3217    
3218     case OP_WORDCHAR:
3219     for (i = 1; i <= min; i++)
3220     if ((md->ctypes[*eptr++] & ctype_word) == 0)
3221     RRETURN(MATCH_NOMATCH);
3222     break;
3223    
3224     default:
3225     RRETURN(PCRE_ERROR_INTERNAL);
3226     }
3227     }
3228    
3229     /* If min = max, continue at the same level without recursing */
3230    
3231     if (min == max) continue;
3232    
3233     /* If minimizing, we have to test the rest of the pattern before each
3234     subsequent match. Again, separate the UTF-8 case for speed, and also
3235     separate the UCP cases. */
3236    
3237     if (minimize)
3238     {
3239     #ifdef SUPPORT_UCP
3240 nigel 87 if (prop_type >= 0)
3241 nigel 77 {
3242 nigel 87 switch(prop_type)
3243 nigel 77 {
3244 nigel 87 case PT_ANY:
3245     for (fi = min;; fi++)
3246     {
3247 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3248 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3249     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3250     GETCHARINC(c, eptr);
3251     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3252     }
3253 nigel 93 /* Control never gets here */
3254 nigel 87
3255     case PT_LAMP:
3256     for (fi = min;; fi++)
3257     {
3258 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3259 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3260     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3261     GETCHARINC(c, eptr);
3262     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3263     if ((prop_chartype == ucp_Lu ||
3264     prop_chartype == ucp_Ll ||
3265     prop_chartype == ucp_Lt) == prop_fail_result)
3266     RRETURN(MATCH_NOMATCH);
3267     }
3268 nigel 93 /* Control never gets here */
3269 nigel 87
3270     case PT_GC:
3271     for (fi = min;; fi++)
3272     {
3273 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3274 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3275     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3276     GETCHARINC(c, eptr);
3277     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3278     if ((prop_category == prop_value) == prop_fail_result)
3279     RRETURN(MATCH_NOMATCH);
3280     }
3281 nigel 93 /* Control never gets here */
3282 nigel 87
3283     case PT_PC:
3284     for (fi = min;; fi++)
3285     {
3286 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3287 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3288     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3289     GETCHARINC(c, eptr);
3290     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3291     if ((prop_chartype == prop_value) == prop_fail_result)
3292     RRETURN(MATCH_NOMATCH);
3293     }
3294 nigel 93 /* Control never gets here */
3295 nigel 87
3296     case PT_SC:
3297     for (fi = min;; fi++)
3298     {
3299 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3300 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3301     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3302     GETCHARINC(c, eptr);
3303     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3304     if ((prop_script == prop_value) == prop_fail_result)
3305     RRETURN(MATCH_NOMATCH);
3306     }
3307 nigel 93 /* Control never gets here */
3308 nigel 87
3309     default:
3310     RRETURN(PCRE_ERROR_INTERNAL);
3311 nigel 77 }
3312     }
3313    
3314     /* Match extended Unicode sequences. We will get here only if the
3315     support is in the binary; otherwise a compile-time error occurs. */
3316    
3317     else if (ctype == OP_EXTUNI)
3318     {
3319     for (fi = min;; fi++)
3320     {
3321 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3322 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3323     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3324     GETCHARINCTEST(c, eptr);
3325 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3326 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3327     while (eptr < md->end_subject)
3328     {
3329     int len = 1;
3330     if (!utf8) c = *eptr; else
3331     {
3332     GETCHARLEN(c, eptr, len);
3333     }
3334 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3335 nigel 77 if (prop_category != ucp_M) break;
3336     eptr += len;
3337     }
3338     }
3339     }
3340    
3341     else
3342     #endif /* SUPPORT_UCP */
3343    
3344     #ifdef SUPPORT_UTF8
3345     /* UTF-8 mode */
3346     if (utf8)
3347     {
3348     for (fi = min;; fi++)
3349     {
3350 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3351 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3352 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3353     (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3354 nigel 93 IS_NEWLINE(eptr)))
3355 nigel 91 RRETURN(MATCH_NOMATCH);
3356 nigel 77
3357     GETCHARINC(c, eptr);
3358     switch(ctype)
3359     {
3360 nigel 91 case OP_ANY: /* This is the DOTALL case */
3361 nigel 77 break;
3362    
3363     case OP_ANYBYTE:
3364     break;
3365    
3366 nigel 93 case OP_ANYNL:
3367     switch(c)
3368     {
3369     default: RRETURN(MATCH_NOMATCH);
3370     case 0x000d:
3371     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3372     break;
3373     case 0x000a:
3374     case 0x000b:
3375     case 0x000c:
3376     case 0x0085:
3377     case 0x2028:
3378     case 0x2029:
3379     break;
3380     }
3381     break;
3382    
3383 ph10 178 case OP_NOT_HSPACE:
3384     switch(c)
3385     {
3386     default: break;
3387     case 0x09: /* HT */
3388     case 0x20: /* SPACE */
3389     case 0xa0: /* NBSP */
3390     case 0x1680: /* OGHAM SPACE MARK */
3391     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3392     case 0x2000: /* EN QUAD */
3393     case 0x2001: /* EM QUAD */
3394     case 0x2002: /* EN SPACE */
3395     case 0x2003: /* EM SPACE */
3396     case 0x2004: /* THREE-PER-EM SPACE */
3397     case 0x2005: /* FOUR-PER-EM SPACE */
3398     case 0x2006: /* SIX-PER-EM SPACE */
3399     case 0x2007: /* FIGURE SPACE */
3400     case 0x2008: /* PUNCTUATION SPACE */
3401     case 0x2009: /* THIN SPACE */
3402     case 0x200A: /* HAIR SPACE */
3403     case 0x202f: /* NARROW NO-BREAK SPACE */
3404     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3405     case 0x3000: /* IDEOGRAPHIC SPACE */
3406     RRETURN(MATCH_NOMATCH);
3407     }
3408     break;
3409    
3410     case OP_HSPACE:
3411     switch(c)
3412     {
3413     default: RRETURN(MATCH_NOMATCH);
3414     case 0x09: /* HT */
3415     case 0x20: /* SPACE */
3416     case 0xa0: /* NBSP */
3417     case 0x1680: /* OGHAM SPACE MARK */
3418     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3419     case 0x2000: /* EN QUAD */
3420     case 0x2001: /* EM QUAD */
3421     case 0x2002: /* EN SPACE */
3422     case 0x2003: /* EM SPACE */
3423     case 0x2004: /* THREE-PER-EM SPACE */
3424     case 0x2005: /* FOUR-PER-EM SPACE */
3425     case 0x2006: /* SIX-PER-EM SPACE */
3426     case 0x2007: /* FIGURE SPACE */
3427     case 0x2008: /* PUNCTUATION SPACE */
3428     case 0x2009: /* THIN SPACE */
3429     case 0x200A: /* HAIR SPACE */
3430     case 0x202f: /* NARROW NO-BREAK SPACE */
3431     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3432     case 0x3000: /* IDEOGRAPHIC SPACE */
3433     break;
3434     }
3435     break;
3436    
3437     case OP_NOT_VSPACE:
3438     switch(c)
3439     {
3440     default: break;
3441     case 0x0a: /* LF */
3442     case 0x0b: /* VT */
3443     case 0x0c: /* FF */
3444     case 0x0d: /* CR */
3445     case 0x85: /* NEL */
3446     case 0x2028: /* LINE SEPARATOR */
3447     case 0x2029: /* PARAGRAPH SEPARATOR */
3448     RRETURN(MATCH_NOMATCH);
3449     }
3450     break;
3451    
3452     case OP_VSPACE:
3453     switch(c)
3454     {
3455     default: RRETURN(MATCH_NOMATCH);
3456     case 0x0a: /* LF */
3457     case 0x0b: /* VT */
3458     case 0x0c: /* FF */
3459     case 0x0d: /* CR */
3460     case 0x85: /* NEL */
3461     case 0x2028: /* LINE SEPARATOR */
3462     case 0x2029: /* PARAGRAPH SEPARATOR */
3463     break;
3464     }
3465     break;
3466    
3467 nigel 77 case OP_NOT_DIGIT:
3468     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3469     RRETURN(MATCH_NOMATCH);
3470     break;
3471    
3472     case OP_DIGIT:
3473     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3474     RRETURN(MATCH_NOMATCH);
3475     break;
3476    
3477     case OP_NOT_WHITESPACE:
3478     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3479     RRETURN(MATCH_NOMATCH);
3480     break;
3481    
3482     case OP_WHITESPACE:
3483     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3484     RRETURN(MATCH_NOMATCH);
3485     break;
3486    
3487     case OP_NOT_WORDCHAR:
3488     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3489     RRETURN(MATCH_NOMATCH);
3490     break;
3491    
3492     case OP_WORDCHAR:
3493     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3494     RRETURN(MATCH_NOMATCH);
3495     break;
3496    
3497     default:
3498     RRETURN(PCRE_ERROR_INTERNAL);
3499     }
3500     }
3501     }
3502     else
3503     #endif
3504     /* Not UTF-8 mode */
3505     {
3506     for (fi = min;; fi++)
3507     {
3508 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3509 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3510 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3511 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3512 nigel 91 RRETURN(MATCH_NOMATCH);
3513    
3514 nigel 77 c = *eptr++;
3515     switch(ctype)
3516     {
3517 nigel 91 case OP_ANY: /* This is the DOTALL case */
3518 nigel 77 break;
3519    
3520     case OP_ANYBYTE:
3521     break;
3522    
3523 nigel 93 case OP_ANYNL:
3524     switch(c)
3525     {
3526     default: RRETURN(MATCH_NOMATCH);
3527     case 0x000d:
3528     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3529     break;
3530     case 0x000a:
3531     case 0x000b:
3532     case 0x000c:
3533     case 0x0085:
3534     break;
3535     }
3536     break;
3537    
3538 ph10 178 case OP_NOT_HSPACE:
3539     switch(c)
3540     {
3541     default: break;
3542     case 0x09: /* HT */
3543     case 0x20: /* SPACE */
3544     case 0xa0: /* NBSP */
3545     RRETURN(MATCH_NOMATCH);
3546     }
3547     break;
3548    
3549     case OP_HSPACE:
3550     switch(c)
3551     {
3552     default: RRETURN(MATCH_NOMATCH);
3553     case 0x09: /* HT */
3554     case 0x20: /* SPACE */
3555     case 0xa0: /* NBSP */
3556     break;
3557     }
3558     break;
3559    
3560     case OP_NOT_VSPACE:
3561     switch(c)
3562     {
3563     default: break;
3564     case 0x0a: /* LF */
3565     case 0x0b: /* VT */
3566     case 0x0c: /* FF */
3567     case 0x0d: /* CR */
3568     case 0x85: /* NEL */
3569     RRETURN(MATCH_NOMATCH);
3570     }
3571     break;
3572    
3573     case OP_VSPACE:
3574     switch(c)
3575     {
3576     default: RRETURN(MATCH_NOMATCH);
3577     case 0x0a: /* LF */
3578     case 0x0b: /* VT */
3579     case 0x0c: /* FF */
3580     case 0x0d: /* CR */
3581     case 0x85: /* NEL */
3582     break;
3583     }
3584     break;
3585    
3586 nigel 77 case OP_NOT_DIGIT:
3587     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3588     break;
3589    
3590     case OP_DIGIT:
3591     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3592     break;
3593    
3594     case OP_NOT_WHITESPACE:
3595     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3596     break;
3597    
3598     case OP_WHITESPACE:
3599     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3600     break;
3601    
3602     case OP_NOT_WORDCHAR:
3603     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3604     break;
3605    
3606     case OP_WORDCHAR:
3607     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3608     break;
3609    
3610     default:
3611     RRETURN(PCRE_ERROR_INTERNAL);
3612     }
3613     }
3614     }
3615     /* Control never gets here */
3616     }
3617    
3618 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3619 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3620     UTF-8 and UCP stuff separate. */
3621    
3622     else
3623     {
3624     pp = eptr; /* Remember where we started */
3625    
3626     #ifdef SUPPORT_UCP
3627 nigel 87 if (prop_type >= 0)
3628 nigel 77 {
3629 nigel 87 switch(prop_type)
3630 nigel 77 {
3631 nigel 87 case PT_ANY:
3632     for (i = min; i < max; i++)
3633     {
3634     int len = 1;
3635     if (eptr >= md->end_subject) break;
3636     GETCHARLEN(c, eptr, len);
3637     if (prop_fail_result) break;
3638     eptr+= len;
3639     }
3640     break;
3641    
3642     case PT_LAMP:
3643     for (i = min; i < max; i++)
3644     {
3645     int len = 1;
3646     if (eptr >= md->end_subject) break;
3647     GETCHARLEN(c, eptr, len);
3648     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3649     if ((prop_chartype == ucp_Lu ||
3650     prop_chartype == ucp_Ll ||
3651     prop_chartype == ucp_Lt) == prop_fail_result)
3652     break;
3653     eptr+= len;
3654     }
3655     break;
3656    
3657     case PT_GC:
3658     for (i = min; i < max; i++)
3659     {
3660     int len = 1;
3661     if (eptr >= md->end_subject) break;
3662     GETCHARLEN(c, eptr, len);
3663     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3664     if ((prop_category == prop_value) == prop_fail_result)
3665     break;
3666     eptr+= len;
3667     }
3668     break;
3669    
3670     case PT_PC:
3671     for (i = min; i < max; i++)
3672     {
3673     int len = 1;
3674     if (eptr >= md->end_subject) break;
3675     GETCHARLEN(c, eptr, len);
3676     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3677     if ((prop_chartype == prop_value) == prop_fail_result)
3678     break;
3679     eptr+= len;
3680     }
3681     break;
3682    
3683     case PT_SC:
3684     for (i = min; i < max; i++)
3685     {
3686     int len = 1;
3687     if (eptr >= md->end_subject) break;
3688     GETCHARLEN(c, eptr, len);
3689     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3690     if ((prop_script == prop_value) == prop_fail_result)
3691     break;
3692     eptr+= len;
3693     }
3694     break;
3695 nigel 77 }
3696    
3697     /* eptr is now past the end of the maximum run */
3698    
3699 nigel 93 if (possessive) continue;
3700 nigel 77 for(;;)
3701     {
3702 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3703 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3704     if (eptr-- == pp) break; /* Stop if tried at original pos */
3705     BACKCHAR(eptr);
3706     }
3707     }
3708    
3709     /* Match extended Unicode sequences. We will get here only if the
3710     support is in the binary; otherwise a compile-time error occurs. */
3711    
3712     else if (ctype == OP_EXTUNI)
3713     {
3714     for (i = min; i < max; i++)
3715     {
3716     if (eptr >= md->end_subject) break;
3717     GETCHARINCTEST(c, eptr);
3718 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3719 nigel 77 if (prop_category == ucp_M) break;
3720     while (eptr < md->end_subject)
3721     {
3722     int len = 1;
3723     if (!utf8) c = *eptr; else
3724     {
3725     GETCHARLEN(c, eptr, len);
3726     }
3727 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3728 nigel 77 if (prop_category != ucp_M) break;
3729     eptr += len;
3730     }
3731     }
3732    
3733     /* eptr is now past the end of the maximum run */
3734    
3735 nigel 93 if (possessive) continue;
3736 nigel 77 for(;;)
3737     {
3738 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3739 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3740     if (eptr-- == pp) break; /* Stop if tried at original pos */
3741     for (;;) /* Move back over one extended */
3742     {
3743     int len = 1;
3744     BACKCHAR(eptr);
3745     if (!utf8) c = *eptr; else
3746     {
3747     GETCHARLEN(c, eptr, len);
3748     }
3749 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3750 nigel 77 if (prop_category != ucp_M) break;
3751     eptr--;
3752     }
3753     }
3754     }
3755    
3756     else
3757     #endif /* SUPPORT_UCP */
3758    
3759     #ifdef SUPPORT_UTF8
3760     /* UTF-8 mode */
3761    
3762     if (utf8)
3763     {
3764     switch(ctype)
3765     {
3766     case OP_ANY:
3767     if (max < INT_MAX)
3768     {
3769     if ((ims & PCRE_DOTALL) == 0)
3770     {
3771     for (i = min; i < max; i++)
3772     {
3773 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3774 nigel 77 eptr++;
3775     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3776     }
3777     }
3778     else
3779     {
3780     for (i = min; i < max; i++)
3781     {
3782 nigel 91 if (eptr >= md->end_subject) break;
3783 nigel 77 eptr++;
3784     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3785     }
3786     }
3787     }
3788    
3789     /* Handle unlimited UTF-8 repeat */
3790    
3791     else
3792     {
3793     if ((ims & PCRE_DOTALL) == 0)
3794     {
3795     for (i = min; i < max; i++)
3796     {
3797 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3798 nigel 77 eptr++;
3799 ph10 190 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3800 nigel 77 }
3801     }
3802     else
3803     {
3804 ph10 190 eptr = md->end_subject;
3805 nigel 77 }
3806     }
3807     break;
3808    
3809     /* The byte case is the same as non-UTF8 */
3810    
3811     case OP_ANYBYTE:
3812     c = max - min;
3813 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3814     c = md->end_subject - eptr;
3815 nigel 77 eptr += c;
3816     break;
3817    
3818 nigel 93 case OP_ANYNL:
3819     for (i = min; i < max; i++)
3820     {
3821     int len = 1;
3822     if (eptr >= md->end_subject) break;
3823     GETCHARLEN(c, eptr, len);
3824     if (c == 0x000d)
3825     {
3826     if (++eptr >= md->end_subject) break;
3827     if (*eptr == 0x000a) eptr++;
3828     }
3829     else
3830     {
3831     if (c != 0x000a && c != 0x000b && c != 0x000c &&
3832     c != 0x0085 && c != 0x2028 && c != 0x2029)
3833     break;
3834     eptr += len;
3835     }
3836     }
3837     break;
3838    
3839 ph10 178 case OP_NOT_HSPACE:
3840 ph10 182 case OP_HSPACE:
3841 ph10 178 for (i = min; i < max; i++)
3842     {
3843 ph10 182 BOOL gotspace;
3844 ph10 178 int len = 1;
3845     if (eptr >= md->end_subject) break;
3846     GETCHARLEN(c, eptr, len);
3847     switch(c)
3848 ph10 182 {
3849     default: gotspace = FALSE; break;
3850 ph10 178 case 0x09: /* HT */
3851     case 0x20: /* SPACE */
3852     case 0xa0: /* NBSP */
3853     case 0x1680: /* OGHAM SPACE MARK */
3854     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3855     case 0x2000: /* EN QUAD */
3856     case 0x2001: /* EM QUAD */
3857     case 0x2002: /* EN SPACE */
3858     case 0x2003: /* EM SPACE */
3859     case 0x2004: /* THREE-PER-EM SPACE */
3860     case 0x2005: /* FOUR-PER-EM SPACE */
3861     case 0x2006: /* SIX-PER-EM SPACE */
3862     case 0x2007: /* FIGURE SPACE */
3863     case 0x2008: /* PUNCTUATION SPACE */
3864     case 0x2009: /* THIN SPACE */
3865     case 0x200A: /* HAIR SPACE */
3866     case 0x202f: /* NARROW NO-BREAK SPACE */
3867     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3868     case 0x3000: /* IDEOGRAPHIC SPACE */
3869     gotspace = TRUE;
3870 ph10 182 break;
3871 ph10 178 }
3872     if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3873     eptr += len;
3874     }
3875     break;
3876    
3877     case OP_NOT_VSPACE:
3878 ph10 182 case OP_VSPACE:
3879 ph10 178 for (i = min; i < max; i++)
3880     {
3881 ph10 182 BOOL gotspace;
3882 ph10 178 int len = 1;
3883     if (eptr >= md->end_subject) break;
3884     GETCHARLEN(c, eptr, len);
3885     switch(c)
3886     {
3887 ph10 182 default: gotspace = FALSE; break;
3888 ph10 178 case 0x0a: /* LF */
3889     case 0x0b: /* VT */
3890     case 0x0c: /* FF */
3891     case 0x0d: /* CR */
3892     case 0x85: /* NEL */
3893     case 0x2028: /* LINE SEPARATOR */
3894     case 0x2029: /* PARAGRAPH SEPARATOR */
3895     gotspace = TRUE;
3896     break;
3897     }
3898 ph10 182 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3899 ph10 178 eptr += len;
3900     }
3901     break;
3902    
3903 nigel 77 case OP_NOT_DIGIT:
3904     for (i = min; i < max; i++)
3905     {
3906     int len = 1;
3907     if (eptr >= md->end_subject) break;
3908     GETCHARLEN(c, eptr, len);
3909     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3910     eptr+= len;
3911     }
3912     break;
3913    
3914     case OP_DIGIT:
3915     for (i = min; i < max; i++)
3916     {
3917     int len = 1;
3918     if (eptr >= md->end_subject) break;
3919     GETCHARLEN(c, eptr, len);
3920     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3921     eptr+= len;
3922     }
3923     break;
3924    
3925     case OP_NOT_WHITESPACE:
3926     for (i = min; i < max; i++)
3927     {
3928     int len = 1;
3929     if (eptr >= md->end_subject) break;
3930     GETCHARLEN(c, eptr, len);
3931     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3932     eptr+= len;
3933     }
3934     break;
3935    
3936     case OP_WHITESPACE:
3937     for (i = min; i < max; i++)
3938     {
3939     int len = 1;
3940     if (eptr >= md->end_subject) break;
3941     GETCHARLEN(c, eptr, len);
3942     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3943     eptr+= len;
3944     }
3945     break;
3946    
3947     case OP_NOT_WORDCHAR:
3948     for (i = min; i < max; i++)
3949     {
3950     int len = 1;
3951     if (eptr >= md->end_subject) break;
3952     GETCHARLEN(c, eptr, len);
3953     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3954     eptr+= len;
3955     }
3956     break;
3957    
3958     case OP_WORDCHAR:
3959     for (i = min; i < max; i++)
3960     {
3961     int len = 1;
3962     if (eptr >= md->end_subject) break;
3963     GETCHARLEN(c, eptr, len);
3964     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3965     eptr+= len;
3966     }
3967     break;
3968    
3969     default:
3970     RRETURN(PCRE_ERROR_INTERNAL);
3971     }
3972    
3973     /* eptr is now past the end of the maximum run */
3974    
3975 nigel 93 if (possessive) continue;
3976 nigel 77 for(;;)
3977     {
3978 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
3979 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3980     if (eptr-- == pp) break; /* Stop if tried at original pos */
3981     BACKCHAR(eptr);
3982     }
3983     }
3984     else
3985     #endif
3986    
3987     /* Not UTF-8 mode */
3988     {
3989     switch(ctype)
3990     {
3991     case OP_ANY:
3992     if ((ims & PCRE_DOTALL) == 0)
3993     {
3994     for (i = min; i < max; i++)
3995     {
3996 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3997 nigel 77 eptr++;
3998     }
3999     break;
4000     }
4001     /* For DOTALL case, fall through and treat as \C */
4002    
4003     case OP_ANYBYTE:
4004     c = max - min;
4005 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
4006     c = md->end_subject - eptr;
4007 nigel 77 eptr += c;
4008     break;
4009    
4010 nigel 93 case OP_ANYNL:
4011     for (i = min; i < max; i++)
4012     {
4013     if (eptr >= md->end_subject) break;
4014     c = *eptr;
4015     if (c == 0x000d)
4016     {
4017     if (++eptr >= md->end_subject) break;
4018     if (*eptr == 0x000a) eptr++;
4019     }
4020     else
4021     {
4022     if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
4023     break;
4024     eptr++;
4025     }
4026     }
4027     break;
4028    
4029 ph10 178 case OP_NOT_HSPACE:
4030     for (i = min; i < max; i++)
4031     {
4032     if (eptr >= md->end_subject) break;
4033     c = *eptr;
4034     if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4035 ph10 182 eptr++;
4036 ph10 178 }
4037     break;
4038    
4039     case OP_HSPACE:
4040     for (i = min; i < max; i++)
4041     {
4042     if (eptr >= md->end_subject) break;
4043     c = *eptr;
4044     if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4045 ph10 182 eptr++;
4046 ph10 178 }
4047     break;
4048    
4049     case OP_NOT_VSPACE:
4050     for (i = min; i < max; i++)
4051     {
4052     if (eptr >= md->end_subject) break;
4053     c = *eptr;
4054     if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4055     break;
4056 ph10 182 eptr++;
4057 ph10 178 }
4058     break;
4059    
4060     case OP_VSPACE:
4061     for (i = min; i < max; i++)
4062     {
4063     if (eptr >= md->end_subject) break;
4064     c = *eptr;
4065     if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4066     break;
4067     eptr++;
4068     }
4069     break;
4070    
4071 nigel 77 case OP_NOT_DIGIT:
4072     for (i = min; i < max; i++)
4073     {
4074     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4075     break;
4076     eptr++;
4077     }
4078     break;
4079    
4080     case OP_DIGIT:
4081     for (i = min; i < max; i++)
4082     {
4083     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4084     break;
4085     eptr++;
4086     }
4087     break;
4088    
4089     case OP_NOT_WHITESPACE:
4090     for (i = min; i < max; i++)
4091     {
4092     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4093     break;
4094     eptr++;
4095     }
4096     break;
4097    
4098     case OP_WHITESPACE:
4099     for (i = min; i < max; i++)
4100     {
4101     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4102     break;
4103     eptr++;
4104     }
4105     break;
4106    
4107     case OP_NOT_WORDCHAR:
4108     for (i = min; i < max; i++)
4109     {
4110     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4111     break;
4112     eptr++;
4113     }
4114     break;
4115    
4116     case OP_WORDCHAR:
4117     for (i = min; i < max; i++)
4118     {
4119     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4120     break;
4121     eptr++;
4122     }
4123     break;
4124    
4125     default:
4126     RRETURN(PCRE_ERROR_INTERNAL);
4127     }
4128    
4129     /* eptr is now past the end of the maximum run */
4130    
4131 nigel 93 if (possessive) continue;
4132 nigel 77 while (eptr >= pp)
4133     {
4134 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4135 nigel 77 eptr--;
4136     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4137     }
4138     }
4139    
4140     /* Get here if we can't make it match with any permitted repetitions */
4141    
4142     RRETURN(MATCH_NOMATCH);
4143     }
4144     /* Control never gets here */
4145    
4146 nigel 93 /* There's been some horrible disaster. Arrival here can only mean there is
4147     something seriously wrong in the code above or the OP_xxx definitions. */
4148 nigel 77
4149     default:
4150     DPRINTF(("Unknown opcode %d\n", *ecode));
4151 nigel 93 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4152 nigel 77 }
4153    
4154     /* Do not stick any code in here without much thought; it is assumed
4155     that "continue" in the code above comes out to here to repeat the main
4156     loop. */
4157    
4158     } /* End of main loop */
4159     /* Control never reaches here */
4160 ph10 164
4161    
4162 ph10 165 /* When compiling to use the heap rather than the stack for recursive calls to
4163     match(), the RRETURN() macro jumps here. The number that is saved in
4164 ph10 164 frame->Xwhere indicates which label we actually want to return to. */
4165    
4166     #ifdef NO_RECURSE
4167     #define LBL(val) case val: goto L_RM##val;
4168     HEAP_RETURN:
4169     switch (frame->Xwhere)
4170     {
4171