/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 399 - (hide annotations) (download)
Sat Mar 21 12:34:15 2009 UTC (5 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 152925 byte(s)
Further fix to auto-callout with conditional groups whose condition is an 
assertion.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 381 Copyright (c) 1997-2009 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 nigel 77 /* Flag bits for the match() function */
61    
62 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
63     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64 nigel 77
65     /* Non-error returns from the match() function. Error returns are externally
66     defined PCRE_ERROR_xxx codes, which are all negative. */
67    
68     #define MATCH_MATCH 1
69     #define MATCH_NOMATCH 0
70    
71 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
72 ph10 210 negative to avoid the external error codes. */
73    
74     #define MATCH_COMMIT (-999)
75     #define MATCH_PRUNE (-998)
76     #define MATCH_SKIP (-997)
77     #define MATCH_THEN (-996)
78    
79 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
80     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81     because the offset vector is always a multiple of 3 long. */
82    
83     #define REC_STACK_SAVE_MAX 30
84    
/* Minimum and maximum repeat counts for the common repeats. An entry of 0 in
rep_max means that there is no upper limit. NOTE(review): the six entries look
like three greedy/minimizing pairs (min 0 unlimited, min 1 unlimited, min 0
max 1) - confirm the index order against the code that uses these tables. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0 };

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1 };
89    
90    
91    
92     #ifdef DEBUG
93     /*************************************************
94     * Debugging function to print chars *
95     *************************************************/
96    
97     /* Print a sequence of chars in printable format, stopping at the end of the
98     subject if the requested.
99    
100     Arguments:
101     p points to characters
102     length number to print
103     is_subject TRUE if printing from within md->start_subject
104     md pointer to matching data block, if is_subject is TRUE
105    
106     Returns: nothing
107     */
108    
109     static void
110     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111     {
112 nigel 93 unsigned int c;
113 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114     while (length-- > 0)
115     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116     }
117     #endif
118    
119    
120    
121     /*************************************************
122     * Match a back-reference *
123     *************************************************/
124    
125     /* If a back reference hasn't been set, the length that is passed is greater
126     than the number of characters left in the string, so the match fails.
127    
128     Arguments:
129     offset index into the offset vector
130     eptr points into the subject
131     length length to be matched
132     md points to match data block
133     ims the ims flags
134    
135     Returns: TRUE if matched
136     */
137    
138     static BOOL
139 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 nigel 77 unsigned long int ims)
141     {
142 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
143 nigel 77
144     #ifdef DEBUG
145     if (eptr >= md->end_subject)
146     printf("matching subject <null>");
147     else
148     {
149     printf("matching subject ");
150     pchars(eptr, length, TRUE, md);
151     }
152     printf(" against backref ");
153     pchars(p, length, FALSE, md);
154     printf("\n");
155     #endif
156    
157     /* Always fail if not enough characters left */
158    
159     if (length > md->end_subject - eptr) return FALSE;
160    
161 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162     properly if Unicode properties are supported. Otherwise, we can check only
163     ASCII characters. */
164 nigel 77
165     if ((ims & PCRE_CASELESS) != 0)
166     {
167 ph10 354 #ifdef SUPPORT_UTF8
168     #ifdef SUPPORT_UCP
169     if (md->utf8)
170     {
171 ph10 358 USPTR endptr = eptr + length;
172 ph10 354 while (eptr < endptr)
173     {
174 ph10 358 int c, d;
175 ph10 354 GETCHARINC(c, eptr);
176     GETCHARINC(d, p);
177     if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 ph10 358 }
179     }
180 ph10 354 else
181     #endif
182     #endif
183    
184     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185     is no UCP support. */
186 ph10 358
187 nigel 77 while (length-- > 0)
188 ph10 354 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 nigel 77 }
190 ph10 358
191 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
192     are in UTF-8 mode. */
193 ph10 358
194 nigel 77 else
195     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196    
197     return TRUE;
198     }
199    
200    
201    
202     /***************************************************************************
203     ****************************************************************************
204     RECURSION IN THE match() FUNCTION
205    
206 nigel 87 The match() function is highly recursive, though not every recursive call
207     increases the recursive depth. Nevertheless, some regular expressions can cause
208     it to recurse to a great depth. I was writing for Unix, so I just let it call
209     itself recursively. This uses the stack for saving everything that has to be
210     saved for a recursive call. On Unix, the stack can be large, and this works
211     fine.
212 nigel 77
213 nigel 87 It turns out that on some non-Unix-like systems there are problems with
214     programs that use a lot of stack. (This despite the fact that every last chip
215     has oodles of memory these days, and techniques for extending the stack have
216     been known for decades.) So....
217 nigel 77
218     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219     calls by keeping local variables that need to be preserved in blocks of memory
220 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
221 nigel 77 achieve this so that the actual code doesn't look very different to what it
222     always used to.
223 ph10 164
224 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
225 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
226     Switzer, the use of longjmp() has been abolished, at the cost of having to
227     provide a unique number for each call to RMATCH. There is no way of generating
228     a sequence of numbers at compile time in C. I have given them names, to make
229     them stand out more clearly.
230    
231     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
234     don't have indeterminate values; this has meant that the frame size can be
235 ph10 164 reduced because the result can be "passed back" by straight setting of the
236     variable instead of being passed in the frame.
237 nigel 77 ****************************************************************************
238     ***************************************************************************/
239    
/* Numbers for RMATCH calls. Each use of RMATCH is given one of these values
as its final argument. When this list is changed, the code at HEAP_RETURN
below must be updated in sync. */

enum {
  RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,
  RM7,   RM8,  RM9,  RM10, RM11, RM12,
  RM13,  RM14, RM15, RM16, RM17, RM18,
  RM19,  RM20, RM21, RM22, RM23, RM24,
  RM25,  RM26, RM27, RM28, RM29, RM30,
  RM31,  RM32, RM33, RM34, RM35, RM36,
  RM37,  RM38, RM39, RM40, RM41, RM42,
  RM43,  RM44, RM45, RM46, RM47, RM48,
  RM49,  RM50, RM51, RM52, RM53, RM54 };
249 ph10 164
250 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
251 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 ph10 164 actually used in this definition. */
253 nigel 77
254     #ifndef NO_RECURSE
#define REGISTER register

/* RMATCH makes a genuine recursive call to match() and leaves the result in
rrc; RRETURN is a plain return. The match start (mstart) is always passed on
unchanged and the recursion depth is increased by one. The final "r_where"
argument of RMATCH is accepted but not used here. The debugging versions
additionally report the source line of each call and return. */

#ifndef DEBUG

#define RMATCH(r_eptr,r_ecode,r_offset_top,r_md,r_ims,r_eptrb,r_flags,r_where) \
  rrc = match(r_eptr,r_ecode,mstart,r_offset_top,r_md,r_ims,r_eptrb,r_flags,\
    rdepth+1)

#define RRETURN(r_value) return r_value

#else  /* DEBUG */

#define RMATCH(r_eptr,r_ecode,r_offset_top,r_md,r_ims,r_eptrb,r_flags,r_where) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(r_eptr,r_ecode,mstart,r_offset_top,r_md,r_ims,r_eptrb,r_flags,\
    rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

#define RRETURN(r_value) \
  { \
  printf("match() returned %d from line %d ", r_value, __LINE__); \
  return r_value; \
  }

#endif  /* DEBUG */
274    
275 nigel 77 #else
276    
277    
278 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
279     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280     argument of match(), which never changes. */
281 nigel 77
282     #define REGISTER
283    
/* Simulate a recursive call to match() without using the system stack. A new
frame is obtained from pcre_stack_malloc(), the "arguments" are copied into
it, and control jumps to HEAP_RECURSE. The caller's frame remembers, in
Xwhere, which RMATCH this was (rw), and the label L_<rw> generated here marks
the point at which execution of the caller continues once the callee has
returned.

If the new frame cannot be allocated, the current frame gives up with
PCRE_ERROR_NOMEMORY by means of RRETURN, instead of writing through a null
pointer. The "rd" argument is not used. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
303    
/* Simulate a return from match(): release the current frame and make its
parent the current one. If there is no parent, this was the top-level frame,
so really return from the function. Otherwise pass the value back in rrc and
jump to HEAP_RETURN. Note that the argument is evaluated only after the
current frame has been freed and "frame" refers to the parent. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
316    
317    
318     /* Structure for remembering the local variables in a private frame */
319    
320     typedef struct heapframe {
321     struct heapframe *Xprevframe;
322    
323     /* Function arguments that may change */
324    
325     const uschar *Xeptr;
326     const uschar *Xecode;
327 ph10 172 const uschar *Xmstart;
328 nigel 77 int Xoffset_top;
329     long int Xims;
330     eptrblock *Xeptrb;
331     int Xflags;
332 nigel 91 unsigned int Xrdepth;
333 nigel 77
334     /* Function local variables */
335    
336     const uschar *Xcallpat;
337     const uschar *Xcharptr;
338     const uschar *Xdata;
339     const uschar *Xnext;
340     const uschar *Xpp;
341     const uschar *Xprev;
342     const uschar *Xsaved_eptr;
343    
344     recursion_info Xnew_recursive;
345    
346     BOOL Xcur_is_word;
347     BOOL Xcondition;
348     BOOL Xprev_is_word;
349    
350     unsigned long int Xoriginal_ims;
351    
352     #ifdef SUPPORT_UCP
353     int Xprop_type;
354 nigel 87 int Xprop_value;
355 nigel 77 int Xprop_fail_result;
356     int Xprop_category;
357     int Xprop_chartype;
358 nigel 87 int Xprop_script;
359 ph10 123 int Xoclength;
360     uschar Xocchars[8];
361 nigel 77 #endif
362    
363     int Xctype;
364 nigel 93 unsigned int Xfc;
365 nigel 77 int Xfi;
366     int Xlength;
367     int Xmax;
368     int Xmin;
369     int Xnumber;
370     int Xoffset;
371     int Xop;
372     int Xsave_capture_last;
373     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
374     int Xstacksave[REC_STACK_SAVE_MAX];
375    
376     eptrblock Xnewptrb;
377    
378 ph10 164 /* Where to jump back to */
379 nigel 77
380 ph10 164 int Xwhere;
381 ph10 165
382 nigel 77 } heapframe;
383    
384     #endif
385    
386    
387     /***************************************************************************
388     ***************************************************************************/
389    
390    
391    
392     /*************************************************
393     * Match from current position *
394     *************************************************/
395    
396 nigel 93 /* This function is called recursively in many circumstances. Whenever it
397 nigel 77 returns a negative (error) response, the outer incarnation must also return the
398     same response.
399    
400     Performance note: It might be tempting to extract commonly used fields from the
401     md structure (e.g. utf8, end_subject) into individual variables to improve
402     performance. Tests using gcc on a SPARC disproved this; in the first case, it
403     made performance worse.
404    
405     Arguments:
406 nigel 93 eptr pointer to current character in subject
407     ecode pointer to current position in compiled code
408 ph10 168 mstart pointer to the current match start position (can be modified
409 ph10 172 by encountering \K)
410 nigel 77 offset_top current top pointer
411     md pointer to "static" info for the match
412     ims current /i, /m, and /s options
413     eptrb pointer to chain of blocks containing eptr at start of
414     brackets - for testing for empty matches
415     flags can contain
416     match_condassert - this is an assertion condition
417 nigel 93 match_cbegroup - this is the start of an unlimited repeat
418     group that can match an empty string
419 nigel 87 rdepth the recursion depth
420 nigel 77
421     Returns: MATCH_MATCH if matched ) these values are >= 0
422     MATCH_NOMATCH if failed to match )
423     a negative PCRE_ERROR_xxx value if aborted by an error condition
424 nigel 87 (e.g. stopped by repeated call or recursion limit)
425 nigel 77 */
426    
427     static int
428 ph10 172 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
429 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
430 nigel 91 int flags, unsigned int rdepth)
431 nigel 77 {
432     /* These variables do not need to be preserved over recursion in this function,
433 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
434     "register" because they are used a lot in loops. */
435 nigel 77
436 nigel 91 register int rrc; /* Returns from recursive calls */
437     register int i; /* Used for loops not involving calls to RMATCH() */
438 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
439 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
440 nigel 77
441 nigel 93 BOOL minimize, possessive; /* Quantifier options */
442    
443 nigel 77 /* When recursion is not being used, all "local" variables that have to be
444     preserved over calls to RMATCH() are part of a "frame" which is obtained from
445     heap storage. Set up the top-level frame here; others are obtained from the
446     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
447    
448     #ifdef NO_RECURSE
449     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
450     frame->Xprevframe = NULL; /* Marks the top level */
451    
452     /* Copy in the original argument variables */
453    
454     frame->Xeptr = eptr;
455     frame->Xecode = ecode;
456 ph10 168 frame->Xmstart = mstart;
457 nigel 77 frame->Xoffset_top = offset_top;
458     frame->Xims = ims;
459     frame->Xeptrb = eptrb;
460     frame->Xflags = flags;
461 nigel 87 frame->Xrdepth = rdepth;
462 nigel 77
463     /* This is where control jumps back to to effect "recursion" */
464    
465     HEAP_RECURSE:
466    
467     /* Macros make the argument variables come from the current frame */
468    
469     #define eptr frame->Xeptr
470     #define ecode frame->Xecode
471 ph10 168 #define mstart frame->Xmstart
472 nigel 77 #define offset_top frame->Xoffset_top
473     #define ims frame->Xims
474     #define eptrb frame->Xeptrb
475     #define flags frame->Xflags
476 nigel 87 #define rdepth frame->Xrdepth
477 nigel 77
478     /* Ditto for the local variables */
479    
480     #ifdef SUPPORT_UTF8
481     #define charptr frame->Xcharptr
482     #endif
483     #define callpat frame->Xcallpat
484     #define data frame->Xdata
485     #define next frame->Xnext
486     #define pp frame->Xpp
487     #define prev frame->Xprev
488     #define saved_eptr frame->Xsaved_eptr
489    
490     #define new_recursive frame->Xnew_recursive
491    
492     #define cur_is_word frame->Xcur_is_word
493     #define condition frame->Xcondition
494     #define prev_is_word frame->Xprev_is_word
495    
496     #define original_ims frame->Xoriginal_ims
497    
498     #ifdef SUPPORT_UCP
499     #define prop_type frame->Xprop_type
500 nigel 87 #define prop_value frame->Xprop_value
501 nigel 77 #define prop_fail_result frame->Xprop_fail_result
502     #define prop_category frame->Xprop_category
503     #define prop_chartype frame->Xprop_chartype
504 nigel 87 #define prop_script frame->Xprop_script
505 ph10 115 #define oclength frame->Xoclength
506     #define occhars frame->Xocchars
507 nigel 77 #endif
508    
509     #define ctype frame->Xctype
510     #define fc frame->Xfc
511     #define fi frame->Xfi
512     #define length frame->Xlength
513     #define max frame->Xmax
514     #define min frame->Xmin
515     #define number frame->Xnumber
516     #define offset frame->Xoffset
517     #define op frame->Xop
518     #define save_capture_last frame->Xsave_capture_last
519     #define save_offset1 frame->Xsave_offset1
520     #define save_offset2 frame->Xsave_offset2
521     #define save_offset3 frame->Xsave_offset3
522     #define stacksave frame->Xstacksave
523    
524     #define newptrb frame->Xnewptrb
525    
526     /* When recursion is being used, local variables are allocated on the stack and
527     get preserved during recursion in the normal way. In this environment, fi and
528     i, and fc and c, can be the same variables. */
529    
530 nigel 93 #else /* NO_RECURSE not defined */
531 nigel 77 #define fi i
532     #define fc c
533    
534    
535 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
536     const uschar *charptr; /* in small blocks of the code. My normal */
537     #endif /* style of coding would have declared */
538     const uschar *callpat; /* them within each of those blocks. */
539     const uschar *data; /* However, in order to accommodate the */
540     const uschar *next; /* version of this code that uses an */
541     USPTR pp; /* external "stack" implemented on the */
542     const uschar *prev; /* heap, it is easier to declare them all */
543     USPTR saved_eptr; /* here, so the declarations can be cut */
544     /* out in a block. The only declarations */
545     recursion_info new_recursive; /* within blocks below are for variables */
546     /* that do not have to be preserved over */
547     BOOL cur_is_word; /* a recursive call to RMATCH(). */
548     BOOL condition;
549 nigel 77 BOOL prev_is_word;
550    
551     unsigned long int original_ims;
552    
553     #ifdef SUPPORT_UCP
554     int prop_type;
555 nigel 87 int prop_value;
556 nigel 77 int prop_fail_result;
557     int prop_category;
558     int prop_chartype;
559 nigel 87 int prop_script;
560 ph10 115 int oclength;
561     uschar occhars[8];
562 nigel 77 #endif
563    
564 ph10 399 int codelink;
565     int condcode;
566 nigel 77 int ctype;
567     int length;
568     int max;
569     int min;
570     int number;
571     int offset;
572     int op;
573     int save_capture_last;
574     int save_offset1, save_offset2, save_offset3;
575     int stacksave[REC_STACK_SAVE_MAX];
576    
577     eptrblock newptrb;
578 nigel 93 #endif /* NO_RECURSE */
579 nigel 77
580     /* These statements are here to stop the compiler complaining about uninitialized
581     variables. */
582    
583     #ifdef SUPPORT_UCP
584 nigel 87 prop_value = 0;
585 nigel 77 prop_fail_result = 0;
586     #endif
587    
588 nigel 93
589 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
590     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
591     used. Thanks to Ian Taylor for noticing this possibility and sending the
592     original patch. */
593    
594     TAIL_RECURSE:
595    
596 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
597     are specified by the macro RMATCH and RRETURN is used to return. When
598     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
599     and a "return", respectively (possibly with some debugging if DEBUG is
600     defined). However, RMATCH isn't like a function call because it's quite a
601     complicated macro. It has to be used in one particular way. This shouldn't,
602     however, impact performance when true recursion is being used. */
603 nigel 77
604 ph10 164 #ifdef SUPPORT_UTF8
605     utf8 = md->utf8; /* Local copy of the flag */
606     #else
607     utf8 = FALSE;
608     #endif
609    
610 nigel 87 /* First check that we haven't called match() too many times, or that we
611     haven't exceeded the recursive call limit. */
612    
613 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
614 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
615 nigel 77
616     original_ims = ims; /* Save for resetting on ')' */
617 nigel 91
618 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
619     string, the match_cbegroup flag is set. When this is the case, add the current
620     subject pointer to the chain of such remembered pointers, to be checked when we
621     hit the closing ket, in order to break infinite loops that match no characters.
622 ph10 197 When match() is called in other circumstances, don't add to the chain. The
623     match_cbegroup flag must NOT be used with tail recursion, because the memory
624     block that is used is on the stack, so a new one may be required for each
625     match(). */
626 nigel 77
627 nigel 93 if ((flags & match_cbegroup) != 0)
628 nigel 77 {
629 ph10 197 newptrb.epb_saved_eptr = eptr;
630     newptrb.epb_prev = eptrb;
631     eptrb = &newptrb;
632 nigel 77 }
633    
634 nigel 93 /* Now start processing the opcodes. */
635 nigel 77
636     for (;;)
637     {
638 nigel 93 minimize = possessive = FALSE;
639 nigel 77 op = *ecode;
640 ph10 395
641 nigel 77 /* For partial matching, remember if we ever hit the end of the subject after
642     matching at least one subject character. */
643    
644     if (md->partial &&
645     eptr >= md->end_subject &&
646 ph10 168 eptr > mstart)
647 nigel 77 md->hitend = TRUE;
648 ph10 208
649 nigel 93 switch(op)
650     {
651 ph10 210 case OP_FAIL:
652 ph10 212 RRETURN(MATCH_NOMATCH);
653 ph10 211
654 ph10 210 case OP_PRUNE:
655     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
656     ims, eptrb, flags, RM51);
657     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
658 ph10 212 RRETURN(MATCH_PRUNE);
659 ph10 211
660 ph10 210 case OP_COMMIT:
661     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
662     ims, eptrb, flags, RM52);
663     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
664 ph10 212 RRETURN(MATCH_COMMIT);
665 ph10 211
666 ph10 210 case OP_SKIP:
667     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
668     ims, eptrb, flags, RM53);
669     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
670 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
671 ph10 212 RRETURN(MATCH_SKIP);
672 ph10 211
673 ph10 210 case OP_THEN:
674     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
675 ph10 212 ims, eptrb, flags, RM54);
676 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
677 ph10 212 RRETURN(MATCH_THEN);
678 ph10 211
679 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
680     the current subject position in the working slot at the top of the vector.
681     We mustn't change the current values of the data slot, because they may be
682     set from a previous iteration of this group, and be referred to by a
683     reference inside the group.
684 nigel 77
685 nigel 93 If the bracket fails to match, we need to restore this value and also the
686     values of the final offsets, in case they were set by a previous iteration
687     of the same bracket.
688 nigel 77
689 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
690     a non-capturing bracket. Don't worry about setting the flag for the error
691     case here; that is handled in the code for KET. */
692 nigel 77
693 nigel 93 case OP_CBRA:
694     case OP_SCBRA:
695     number = GET2(ecode, 1+LINK_SIZE);
696 nigel 77 offset = number << 1;
697    
698     #ifdef DEBUG
699 nigel 93 printf("start bracket %d\n", number);
700     printf("subject=");
701 nigel 77 pchars(eptr, 16, TRUE, md);
702     printf("\n");
703     #endif
704    
705     if (offset < md->offset_max)
706     {
707     save_offset1 = md->offset_vector[offset];
708     save_offset2 = md->offset_vector[offset+1];
709     save_offset3 = md->offset_vector[md->offset_end - number];
710     save_capture_last = md->capture_last;
711    
712     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
713     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
714    
715 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
716 nigel 77 do
717     {
718 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
719     ims, eptrb, flags, RM1);
720 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
721 nigel 77 md->capture_last = save_capture_last;
722     ecode += GET(ecode, 1);
723     }
724     while (*ecode == OP_ALT);
725    
726     DPRINTF(("bracket %d failed\n", number));
727    
728     md->offset_vector[offset] = save_offset1;
729     md->offset_vector[offset+1] = save_offset2;
730     md->offset_vector[md->offset_end - number] = save_offset3;
731    
732     RRETURN(MATCH_NOMATCH);
733     }
734    
735 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
736     as a non-capturing bracket. */
737 nigel 77
738 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
739     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
740    
741 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
742 nigel 77
743 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
744     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
745    
746 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
747     final alternative within the brackets, we would return the result of a
748     recursive call to match() whatever happened. We can reduce stack usage by
749 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
750     is set.*/
751 nigel 77
752 nigel 93 case OP_BRA:
753     case OP_SBRA:
754     DPRINTF(("start non-capturing bracket\n"));
755     flags = (op >= OP_SBRA)? match_cbegroup : 0;
756 nigel 91 for (;;)
757 nigel 77 {
758 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
759 nigel 93 {
760 ph10 197 if (flags == 0) /* Not a possibly empty group */
761     {
762     ecode += _pcre_OP_lengths[*ecode];
763     DPRINTF(("bracket 0 tail recursion\n"));
764     goto TAIL_RECURSE;
765     }
766    
767     /* Possibly empty group; can't use tail recursion. */
768    
769     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
770     eptrb, flags, RM48);
771     RRETURN(rrc);
772 nigel 93 }
773 nigel 91
774     /* For non-final alternatives, continue the loop for a NOMATCH result;
775     otherwise return. */
776    
777 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
778     eptrb, flags, RM2);
779 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
780 nigel 77 ecode += GET(ecode, 1);
781     }
782 nigel 91 /* Control never reaches here. */
783 nigel 77
784     /* Conditional group: compilation checked that there are no more than
785     two branches. If the condition is false, skipping the first branch takes us
786     past the end if there is only one branch, but that's OK because that is
787 nigel 91 exactly what going to the ket would do. As there is only one branch to be
788     obeyed, we can use tail recursion to avoid using another stack frame. */
789 nigel 77
790     case OP_COND:
791 nigel 93 case OP_SCOND:
792 ph10 399 codelink= GET(ecode, 1);
793    
794 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
795     inserted between OP_COND and an assertion condition. */
796 ph10 392
797 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
798     {
799     if (pcre_callout != NULL)
800     {
801     pcre_callout_block cb;
802     cb.version = 1; /* Version 1 of the callout block */
803     cb.callout_number = ecode[LINK_SIZE+2];
804     cb.offset_vector = md->offset_vector;
805     cb.subject = (PCRE_SPTR)md->start_subject;
806     cb.subject_length = md->end_subject - md->start_subject;
807     cb.start_match = mstart - md->start_subject;
808     cb.current_position = eptr - md->start_subject;
809     cb.pattern_position = GET(ecode, LINK_SIZE + 3);
810     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
811     cb.capture_top = offset_top/2;
812     cb.capture_last = md->capture_last;
813     cb.callout_data = md->callout_data;
814     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
815     if (rrc < 0) RRETURN(rrc);
816     }
817     ecode += _pcre_OP_lengths[OP_CALLOUT];
818     }
819 ph10 392
820 ph10 399 condcode = ecode[LINK_SIZE+1];
821    
822 ph10 381 /* Now see what the actual condition is */
823 ph10 392
824 ph10 399 if (condcode == OP_RREF) /* Recursion test */
825 nigel 77 {
826 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
827     condition = md->recursive != NULL &&
828     (offset == RREF_ANY || offset == md->recursive->group_num);
829     ecode += condition? 3 : GET(ecode, 1);
830     }
831    
832 ph10 399 else if (condcode == OP_CREF) /* Group used test */
833 nigel 93 {
834 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
835 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
836     ecode += condition? 3 : GET(ecode, 1);
837 nigel 77 }
838    
839 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
840 nigel 93 {
841     condition = FALSE;
842     ecode += GET(ecode, 1);
843     }
844    
845 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
846 nigel 93 the final argument match_condassert causes it to stop at the end of an
847     assertion. */
848 nigel 77
849     else
850     {
851 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
852     match_condassert, RM3);
853 nigel 77 if (rrc == MATCH_MATCH)
854     {
855 nigel 93 condition = TRUE;
856     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
857 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
858     }
859 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
860 nigel 77 {
861     RRETURN(rrc); /* Need braces because of following else */
862     }
863 nigel 93 else
864     {
865     condition = FALSE;
866 ph10 399 ecode += codelink;
867 nigel 93 }
868     }
869 nigel 91
870 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
871 ph10 197 we can use tail recursion to avoid using another stack frame, except when
872     match_cbegroup is required for an unlimited repeat of a possibly empty
873     group. If the second alternative doesn't exist, we can just plough on. */
874 nigel 91
875 nigel 93 if (condition || *ecode == OP_ALT)
876     {
877 nigel 91 ecode += 1 + LINK_SIZE;
878 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
879     {
880     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
881     RRETURN(rrc);
882     }
883     else /* Group must match something */
884     {
885     flags = 0;
886     goto TAIL_RECURSE;
887     }
888 nigel 77 }
889 ph10 395 else /* Condition false & no alternative */
890 nigel 93 {
891     ecode += 1 + LINK_SIZE;
892     }
893     break;
894 nigel 77
895    
896 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
897     recursion, we should restore the offsets appropriately and continue from
898     after the call. */
899 nigel 77
900 ph10 210 case OP_ACCEPT:
901 nigel 77 case OP_END:
902     if (md->recursive != NULL && md->recursive->group_num == 0)
903     {
904     recursion_info *rec = md->recursive;
905 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
906 nigel 77 md->recursive = rec->prevrec;
907     memmove(md->offset_vector, rec->offset_save,
908     rec->saved_max * sizeof(int));
909 ph10 168 mstart = rec->save_start;
910 nigel 77 ims = original_ims;
911     ecode = rec->after_call;
912     break;
913     }
914    
915     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
916     string - backtracking will then try other alternatives, if any. */
917    
918 ph10 168 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
919     md->end_match_ptr = eptr; /* Record where we ended */
920     md->end_offset_top = offset_top; /* and how many extracts were taken */
921 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
922 nigel 77 RRETURN(MATCH_MATCH);
923    
924     /* Change option settings */
925    
926     case OP_OPT:
927     ims = ecode[1];
928     ecode += 2;
929     DPRINTF(("ims set to %02lx\n", ims));
930     break;
931    
932     /* Assertion brackets. Check the alternative branches in turn - the
933     matching won't pass the KET for an assertion. If any one branch matches,
934     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
935     start of each branch to move the current point backwards, so the code at
936     this level is identical to the lookahead case. */
937    
938     case OP_ASSERT:
939     case OP_ASSERTBACK:
940     do
941     {
942 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
943     RM4);
944 nigel 77 if (rrc == MATCH_MATCH) break;
945 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
946 nigel 77 ecode += GET(ecode, 1);
947     }
948     while (*ecode == OP_ALT);
949     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
950    
951     /* If checking an assertion for a condition, return MATCH_MATCH. */
952    
953     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
954    
955     /* Continue from after the assertion, updating the offsets high water
956     mark, since extracts may have been taken during the assertion. */
957    
958     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
959     ecode += 1 + LINK_SIZE;
960     offset_top = md->end_offset_top;
961     continue;
962    
963     /* Negative assertion: all branches must fail to match */
964    
965     case OP_ASSERT_NOT:
966     case OP_ASSERTBACK_NOT:
967     do
968     {
969 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
970     RM5);
971 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
972 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
973 nigel 77 ecode += GET(ecode,1);
974     }
975     while (*ecode == OP_ALT);
976    
977     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
978    
979     ecode += 1 + LINK_SIZE;
980     continue;
981    
982     /* Move the subject pointer back. This occurs only at the start of
983     each branch of a lookbehind assertion. If we are too close to the start to
984     move back, this match function fails. When working with UTF-8 we move
985     back a number of characters, not bytes. */
986    
987     case OP_REVERSE:
988     #ifdef SUPPORT_UTF8
989     if (utf8)
990     {
991 nigel 93 i = GET(ecode, 1);
992     while (i-- > 0)
993 nigel 77 {
994     eptr--;
995     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
996 ph10 207 BACKCHAR(eptr);
997 nigel 77 }
998     }
999     else
1000     #endif
1001    
1002     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1003    
1004     {
1005 nigel 93 eptr -= GET(ecode, 1);
1006 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1007     }
1008    
1009     /* Skip to next op code */
1010    
1011     ecode += 1 + LINK_SIZE;
1012     break;
1013    
1014     /* The callout item calls an external function, if one is provided, passing
1015     details of the match so far. This is mainly for debugging, though the
1016     function is able to force a failure. */
1017    
1018     case OP_CALLOUT:
1019     if (pcre_callout != NULL)
1020     {
1021     pcre_callout_block cb;
1022     cb.version = 1; /* Version 1 of the callout block */
1023     cb.callout_number = ecode[1];
1024     cb.offset_vector = md->offset_vector;
1025 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1026 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
1027 ph10 168 cb.start_match = mstart - md->start_subject;
1028 nigel 77 cb.current_position = eptr - md->start_subject;
1029     cb.pattern_position = GET(ecode, 2);
1030     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1031     cb.capture_top = offset_top/2;
1032     cb.capture_last = md->capture_last;
1033     cb.callout_data = md->callout_data;
1034     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1035     if (rrc < 0) RRETURN(rrc);
1036     }
1037     ecode += 2 + 2*LINK_SIZE;
1038     break;
1039    
1040     /* Recursion either matches the current regex, or some subexpression. The
1041     offset data is the offset to the starting bracket from the start of the
1042     whole pattern. (This is so that it works from duplicated subpatterns.)
1043    
1044     If there are any capturing brackets started but not finished, we have to
1045     save their starting points and reinstate them after the recursion. However,
1046     we don't know how many such there are (offset_top records the completed
1047     total) so we just have to save all the potential data. There may be up to
1048     65535 such values, which is too large to put on the stack, but using malloc
1049     for small numbers seems expensive. As a compromise, the stack is used when
1050     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1051     is used. If the malloc fails, the recursion is not attempted: the error
1052     PCRE_ERROR_NOMEMORY is returned from this level of match() instead of
1053     continuing with only part of the offset data saved.
1054    
1055     There are also other values that have to be saved. We use a chained
1056     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1057     for the original version of this logic. */
1058    
1059     case OP_RECURSE:
1060     {
1061     callpat = md->start_code + GET(ecode, 1);
1062 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1063     GET2(callpat, 1 + LINK_SIZE);
1064 nigel 77
1065     /* Add to "recursing stack" */
1066    
1067     new_recursive.prevrec = md->recursive;
1068     md->recursive = &new_recursive;
1069    
1070     /* Find where to continue from afterwards */
1071    
1072     ecode += 1 + LINK_SIZE;
1073     new_recursive.after_call = ecode;
1074    
1075     /* Now save the offset data. */
1076    
1077     new_recursive.saved_max = md->offset_end;
1078     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1079     new_recursive.offset_save = stacksave;
1080     else
1081     {
1082     new_recursive.offset_save =
1083     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1084     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1085     }
1086    
1087     memcpy(new_recursive.offset_save, md->offset_vector,
1088     new_recursive.saved_max * sizeof(int));
1089 ph10 168 new_recursive.save_start = mstart;
1090     mstart = eptr;
1091 nigel 77
1092     /* OK, now we can do the recursion. For each top-level alternative we
1093     restore the offset and recursion data. */
1094    
1095     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1096 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1097 nigel 77 do
1098     {
1099 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1100     md, ims, eptrb, flags, RM6);
1101 nigel 77 if (rrc == MATCH_MATCH)
1102     {
1103 nigel 87 DPRINTF(("Recursion matched\n"));
1104 nigel 77 md->recursive = new_recursive.prevrec;
1105     if (new_recursive.offset_save != stacksave)
1106     (pcre_free)(new_recursive.offset_save);
1107     RRETURN(MATCH_MATCH);
1108     }
1109 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1110 nigel 87 {
1111     DPRINTF(("Recursion gave error %d\n", rrc));
1112     RRETURN(rrc);
1113     }
1114 nigel 77
1115     md->recursive = &new_recursive;
1116     memcpy(md->offset_vector, new_recursive.offset_save,
1117     new_recursive.saved_max * sizeof(int));
1118     callpat += GET(callpat, 1);
1119     }
1120     while (*callpat == OP_ALT);
1121    
1122     DPRINTF(("Recursion didn't match\n"));
1123     md->recursive = new_recursive.prevrec;
1124     if (new_recursive.offset_save != stacksave)
1125     (pcre_free)(new_recursive.offset_save);
1126     RRETURN(MATCH_NOMATCH);
1127     }
1128     /* Control never reaches here */
1129    
1130     /* "Once" brackets are like assertion brackets except that after a match,
1131     the point in the subject string is not moved back. Thus there can never be
1132     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1133     Check the alternative branches in turn - the matching won't pass the KET
1134     for this kind of subpattern. If any one branch matches, we carry on as at
1135     the end of a normal bracket, leaving the subject pointer. */
1136    
1137     case OP_ONCE:
1138 nigel 91 prev = ecode;
1139     saved_eptr = eptr;
1140    
1141     do
1142 nigel 77 {
1143 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1144 nigel 91 if (rrc == MATCH_MATCH) break;
1145 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1146 nigel 91 ecode += GET(ecode,1);
1147     }
1148     while (*ecode == OP_ALT);
1149 nigel 77
1150 nigel 91 /* If hit the end of the group (which could be repeated), fail */
1151 nigel 77
1152 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1153 nigel 77
1154 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1155     mark, since extracts may have been taken. */
1156 nigel 77
1157 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1158 nigel 77
1159 nigel 91 offset_top = md->end_offset_top;
1160     eptr = md->end_match_ptr;
1161 nigel 77
1162 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1163     happens for a repeating ket if no characters were matched in the group.
1164     This is the forcible breaking of infinite loops as implemented in Perl
1165     5.005. If there is an options reset, it will get obeyed in the normal
1166     course of events. */
1167 nigel 77
1168 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1169     {
1170     ecode += 1+LINK_SIZE;
1171     break;
1172     }
1173 nigel 77
1174 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1175     preceding bracket, in the appropriate order. The second "call" of match()
1176     uses tail recursion, to avoid using another stack frame. We need to reset
1177     any options that changed within the bracket before re-running it, so
1178     check the next opcode. */
1179 nigel 77
1180 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1181     {
1182     ims = (ims & ~PCRE_IMS) | ecode[4];
1183     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1184     }
1185 nigel 77
1186 nigel 91 if (*ecode == OP_KETRMIN)
1187     {
1188 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1189 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1190     ecode = prev;
1191 ph10 197 flags = 0;
1192 nigel 91 goto TAIL_RECURSE;
1193 nigel 77 }
1194 nigel 91 else /* OP_KETRMAX */
1195     {
1196 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1197 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1198     ecode += 1 + LINK_SIZE;
1199 ph10 197 flags = 0;
1200 nigel 91 goto TAIL_RECURSE;
1201     }
1202     /* Control never gets here */
1203 nigel 77
1204     /* An alternation is the end of a branch; scan along to find the end of the
1205     bracketed group and go to there. */
1206    
1207     case OP_ALT:
1208     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1209     break;
1210    
1211 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1212     indicating that it may occur zero times. It may repeat infinitely, or not
1213     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1214     with fixed upper repeat limits are compiled as a number of copies, with the
1215     optional ones preceded by BRAZERO or BRAMINZERO. */
1216 nigel 77
1217     case OP_BRAZERO:
1218     {
1219     next = ecode+1;
1220 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1221 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1222     do next += GET(next,1); while (*next == OP_ALT);
1223 nigel 93 ecode = next + 1 + LINK_SIZE;
1224 nigel 77 }
1225     break;
1226    
1227     case OP_BRAMINZERO:
1228     {
1229     next = ecode+1;
1230 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1231 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1232 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1233     ecode++;
1234     }
1235     break;
1236    
1237 ph10 335 case OP_SKIPZERO:
1238     {
1239     next = ecode+1;
1240     do next += GET(next,1); while (*next == OP_ALT);
1241     ecode = next + 1 + LINK_SIZE;
1242     }
1243     break;
1244    
1245 nigel 93 /* End of a group, repeated or non-repeating. */
1246 nigel 77
1247     case OP_KET:
1248     case OP_KETRMIN:
1249     case OP_KETRMAX:
1250 nigel 91 prev = ecode - GET(ecode, 1);
1251 nigel 77
1252 nigel 93 /* If this was a group that remembered the subject start, in order to break
1253     infinite repeats of empty string matches, retrieve the subject start from
1254     the chain. Otherwise, set it NULL. */
1255 nigel 77
1256 nigel 93 if (*prev >= OP_SBRA)
1257     {
1258     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1259     eptrb = eptrb->epb_prev; /* Backup to previous group */
1260     }
1261     else saved_eptr = NULL;
1262 nigel 77
1263 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1264     MATCH_MATCH, but record the current high water mark for use by positive
1265     assertions. Do this also for the "once" (atomic) groups. */
1266    
1267 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1268     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1269     *prev == OP_ONCE)
1270     {
1271     md->end_match_ptr = eptr; /* For ONCE */
1272     md->end_offset_top = offset_top;
1273     RRETURN(MATCH_MATCH);
1274     }
1275 nigel 77
1276 nigel 93 /* For capturing groups we have to check the group number back at the start
1277     and if necessary complete handling an extraction by setting the offsets and
1278     bumping the high water mark. Note that whole-pattern recursion is coded as
1279     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1280     when the OP_END is reached. Other recursion is handled here. */
1281 nigel 77
1282 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1283 nigel 91 {
1284 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1285 nigel 91 offset = number << 1;
1286 nigel 77
1287     #ifdef DEBUG
1288 nigel 91 printf("end bracket %d", number);
1289     printf("\n");
1290 nigel 77 #endif
1291    
1292 nigel 93 md->capture_last = number;
1293     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1294 nigel 91 {
1295 nigel 93 md->offset_vector[offset] =
1296     md->offset_vector[md->offset_end - number];
1297     md->offset_vector[offset+1] = eptr - md->start_subject;
1298     if (offset_top <= offset) offset_top = offset + 2;
1299     }
1300 nigel 77
1301 nigel 93 /* Handle a recursively called group. Restore the offsets
1302     appropriately and continue from after the call. */
1303 nigel 77
1304 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1305     {
1306     recursion_info *rec = md->recursive;
1307     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1308     md->recursive = rec->prevrec;
1309 ph10 168 mstart = rec->save_start;
1310 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1311     rec->saved_max * sizeof(int));
1312     ecode = rec->after_call;
1313     ims = original_ims;
1314     break;
1315 nigel 77 }
1316 nigel 91 }
1317 nigel 77
1318 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1319     flags, in case they got changed during the group. */
1320 nigel 77
1321 nigel 91 ims = original_ims;
1322     DPRINTF(("ims reset to %02lx\n", ims));
1323 nigel 77
1324 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1325     happens for a repeating ket if no characters were matched in the group.
1326     This is the forcible breaking of infinite loops as implemented in Perl
1327     5.005. If there is an options reset, it will get obeyed in the normal
1328     course of events. */
1329 nigel 77
1330 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1331     {
1332     ecode += 1 + LINK_SIZE;
1333     break;
1334     }
1335 nigel 77
1336 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1337     preceding bracket, in the appropriate order. In the second case, we can use
1338 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1339     unlimited repeat of a group that can match an empty string. */
1340 nigel 77
1341 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1342    
1343 nigel 91 if (*ecode == OP_KETRMIN)
1344     {
1345 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1346 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1347 ph10 197 if (flags != 0) /* Could match an empty string */
1348     {
1349     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1350     RRETURN(rrc);
1351     }
1352 nigel 91 ecode = prev;
1353     goto TAIL_RECURSE;
1354 nigel 77 }
1355 nigel 91 else /* OP_KETRMAX */
1356     {
1357 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1358 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1359     ecode += 1 + LINK_SIZE;
1360 ph10 197 flags = 0;
1361 nigel 91 goto TAIL_RECURSE;
1362     }
1363     /* Control never gets here */
1364 nigel 77
1365     /* Start of subject unless notbol, or after internal newline if multiline */
1366    
1367     case OP_CIRC:
1368     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1369     if ((ims & PCRE_MULTILINE) != 0)
1370     {
1371 nigel 91 if (eptr != md->start_subject &&
1372 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1373 nigel 77 RRETURN(MATCH_NOMATCH);
1374     ecode++;
1375     break;
1376     }
1377     /* ... else fall through */
1378    
1379     /* Start of subject assertion */
1380    
1381     case OP_SOD:
1382     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1383     ecode++;
1384     break;
1385    
1386     /* Start of match assertion */
1387    
1388     case OP_SOM:
1389     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1390     ecode++;
1391     break;
1392 ph10 172
1393 ph10 168 /* Reset the start of match point */
1394 ph10 172
1395 ph10 168 case OP_SET_SOM:
1396     mstart = eptr;
1397 ph10 172 ecode++;
1398     break;
1399 nigel 77
1400     /* Assert before internal newline if multiline, or before a terminating
1401     newline unless endonly is set, else end of subject unless noteol is set. */
1402    
1403     case OP_DOLL:
1404     if ((ims & PCRE_MULTILINE) != 0)
1405     {
1406     if (eptr < md->end_subject)
1407 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1408 nigel 77 else
1409     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1410     ecode++;
1411     break;
1412     }
1413     else
1414     {
1415     if (md->noteol) RRETURN(MATCH_NOMATCH);
1416     if (!md->endonly)
1417     {
1418 nigel 91 if (eptr != md->end_subject &&
1419 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1420 nigel 77 RRETURN(MATCH_NOMATCH);
1421     ecode++;
1422     break;
1423     }
1424     }
1425 nigel 91 /* ... else fall through for endonly */
1426 nigel 77
1427     /* End of subject assertion (\z) */
1428    
1429     case OP_EOD:
1430     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1431     ecode++;
1432     break;
1433    
1434     /* End of subject or ending \n assertion (\Z) */
1435    
1436     case OP_EODN:
1437 nigel 91 if (eptr != md->end_subject &&
1438 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1439 nigel 91 RRETURN(MATCH_NOMATCH);
1440 nigel 77 ecode++;
1441     break;
1442    
1443     /* Word boundary assertions */
1444    
1445     case OP_NOT_WORD_BOUNDARY:
1446     case OP_WORD_BOUNDARY:
1447     {
1448    
1449     /* Find out if the previous and current characters are "word" characters.
1450     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1451     be "non-word" characters. */
1452    
1453     #ifdef SUPPORT_UTF8
1454     if (utf8)
1455     {
1456     if (eptr == md->start_subject) prev_is_word = FALSE; else
1457     {
1458     const uschar *lastptr = eptr - 1;
1459     while((*lastptr & 0xc0) == 0x80) lastptr--;
1460     GETCHAR(c, lastptr);
1461     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1462     }
1463     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1464     {
1465     GETCHAR(c, eptr);
1466     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1467     }
1468     }
1469     else
1470     #endif
1471    
1472     /* More streamlined when not in UTF-8 mode */
1473    
1474     {
1475     prev_is_word = (eptr != md->start_subject) &&
1476     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1477     cur_is_word = (eptr < md->end_subject) &&
1478     ((md->ctypes[*eptr] & ctype_word) != 0);
1479     }
1480    
1481     /* Now see if the situation is what we want */
1482    
1483     if ((*ecode++ == OP_WORD_BOUNDARY)?
1484     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1485     RRETURN(MATCH_NOMATCH);
1486     }
1487     break;
1488    
1489     /* Match a single character type; inline for speed */
1490    
1491     case OP_ANY:
1492 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1493 ph10 345 /* Fall through */
1494    
1495 ph10 341 case OP_ALLANY:
1496 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1497 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1498 nigel 77 ecode++;
1499     break;
1500    
1501     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1502     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1503    
1504     case OP_ANYBYTE:
1505     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1506     ecode++;
1507     break;
1508    
1509     case OP_NOT_DIGIT:
1510     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1511     GETCHARINCTEST(c, eptr);
1512     if (
1513     #ifdef SUPPORT_UTF8
1514     c < 256 &&
1515     #endif
1516     (md->ctypes[c] & ctype_digit) != 0
1517     )
1518     RRETURN(MATCH_NOMATCH);
1519     ecode++;
1520     break;
1521    
1522     case OP_DIGIT:
1523     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1524     GETCHARINCTEST(c, eptr);
1525     if (
1526     #ifdef SUPPORT_UTF8
1527     c >= 256 ||
1528     #endif
1529     (md->ctypes[c] & ctype_digit) == 0
1530     )
1531     RRETURN(MATCH_NOMATCH);
1532     ecode++;
1533     break;
1534    
1535     case OP_NOT_WHITESPACE:
1536     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1537     GETCHARINCTEST(c, eptr);
1538     if (
1539     #ifdef SUPPORT_UTF8
1540     c < 256 &&
1541     #endif
1542     (md->ctypes[c] & ctype_space) != 0
1543     )
1544     RRETURN(MATCH_NOMATCH);
1545     ecode++;
1546     break;
1547    
1548     case OP_WHITESPACE:
1549     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1550     GETCHARINCTEST(c, eptr);
1551     if (
1552     #ifdef SUPPORT_UTF8
1553     c >= 256 ||
1554     #endif
1555     (md->ctypes[c] & ctype_space) == 0
1556     )
1557     RRETURN(MATCH_NOMATCH);
1558     ecode++;
1559     break;
1560    
1561     case OP_NOT_WORDCHAR:
1562     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1563     GETCHARINCTEST(c, eptr);
1564     if (
1565     #ifdef SUPPORT_UTF8
1566     c < 256 &&
1567     #endif
1568     (md->ctypes[c] & ctype_word) != 0
1569     )
1570     RRETURN(MATCH_NOMATCH);
1571     ecode++;
1572     break;
1573    
1574     case OP_WORDCHAR:
1575     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1576     GETCHARINCTEST(c, eptr);
1577     if (
1578     #ifdef SUPPORT_UTF8
1579     c >= 256 ||
1580     #endif
1581     (md->ctypes[c] & ctype_word) == 0
1582     )
1583     RRETURN(MATCH_NOMATCH);
1584     ecode++;
1585     break;
1586    
1587 nigel 93 case OP_ANYNL:
1588     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1589     GETCHARINCTEST(c, eptr);
1590     switch(c)
1591     {
1592     default: RRETURN(MATCH_NOMATCH);
1593     case 0x000d:
1594     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1595     break;
1596 ph10 231
1597 nigel 93 case 0x000a:
1598 ph10 231 break;
1599    
1600 nigel 93 case 0x000b:
1601     case 0x000c:
1602     case 0x0085:
1603     case 0x2028:
1604     case 0x2029:
1605 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1606 nigel 93 break;
1607     }
1608     ecode++;
1609     break;
1610    
1611 ph10 178 case OP_NOT_HSPACE:
1612     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1613     GETCHARINCTEST(c, eptr);
1614     switch(c)
1615     {
1616     default: break;
1617     case 0x09: /* HT */
1618     case 0x20: /* SPACE */
1619     case 0xa0: /* NBSP */
1620     case 0x1680: /* OGHAM SPACE MARK */
1621     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1622     case 0x2000: /* EN QUAD */
1623     case 0x2001: /* EM QUAD */
1624     case 0x2002: /* EN SPACE */
1625     case 0x2003: /* EM SPACE */
1626     case 0x2004: /* THREE-PER-EM SPACE */
1627     case 0x2005: /* FOUR-PER-EM SPACE */
1628     case 0x2006: /* SIX-PER-EM SPACE */
1629     case 0x2007: /* FIGURE SPACE */
1630     case 0x2008: /* PUNCTUATION SPACE */
1631     case 0x2009: /* THIN SPACE */
1632     case 0x200A: /* HAIR SPACE */
1633     case 0x202f: /* NARROW NO-BREAK SPACE */
1634     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1635     case 0x3000: /* IDEOGRAPHIC SPACE */
1636     RRETURN(MATCH_NOMATCH);
1637     }
1638     ecode++;
1639     break;
1640    
1641     case OP_HSPACE:
1642     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1643     GETCHARINCTEST(c, eptr);
1644     switch(c)
1645     {
1646     default: RRETURN(MATCH_NOMATCH);
1647     case 0x09: /* HT */
1648     case 0x20: /* SPACE */
1649     case 0xa0: /* NBSP */
1650     case 0x1680: /* OGHAM SPACE MARK */
1651     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1652     case 0x2000: /* EN QUAD */
1653     case 0x2001: /* EM QUAD */
1654     case 0x2002: /* EN SPACE */
1655     case 0x2003: /* EM SPACE */
1656     case 0x2004: /* THREE-PER-EM SPACE */
1657     case 0x2005: /* FOUR-PER-EM SPACE */
1658     case 0x2006: /* SIX-PER-EM SPACE */
1659     case 0x2007: /* FIGURE SPACE */
1660     case 0x2008: /* PUNCTUATION SPACE */
1661     case 0x2009: /* THIN SPACE */
1662     case 0x200A: /* HAIR SPACE */
1663     case 0x202f: /* NARROW NO-BREAK SPACE */
1664     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1665     case 0x3000: /* IDEOGRAPHIC SPACE */
1666     break;
1667     }
1668     ecode++;
1669     break;
1670    
1671     case OP_NOT_VSPACE:
1672     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1673     GETCHARINCTEST(c, eptr);
1674     switch(c)
1675     {
1676     default: break;
1677     case 0x0a: /* LF */
1678     case 0x0b: /* VT */
1679     case 0x0c: /* FF */
1680     case 0x0d: /* CR */
1681     case 0x85: /* NEL */
1682     case 0x2028: /* LINE SEPARATOR */
1683     case 0x2029: /* PARAGRAPH SEPARATOR */
1684     RRETURN(MATCH_NOMATCH);
1685     }
1686     ecode++;
1687     break;
1688    
1689     case OP_VSPACE:
1690     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1691     GETCHARINCTEST(c, eptr);
1692     switch(c)
1693     {
1694     default: RRETURN(MATCH_NOMATCH);
1695     case 0x0a: /* LF */
1696     case 0x0b: /* VT */
1697     case 0x0c: /* FF */
1698     case 0x0d: /* CR */
1699     case 0x85: /* NEL */
1700     case 0x2028: /* LINE SEPARATOR */
1701     case 0x2029: /* PARAGRAPH SEPARATOR */
1702     break;
1703     }
1704     ecode++;
1705     break;
1706    
1707 nigel 77 #ifdef SUPPORT_UCP
1708     /* Check the next character by Unicode property. We will get here only
1709     if the support is in the binary; otherwise a compile-time error occurs. */
1710    
1711     case OP_PROP:
1712     case OP_NOTPROP:
1713     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1714     GETCHARINCTEST(c, eptr);
1715     {
1716 ph10 384 const ucd_record *prop = GET_UCD(c);
1717 nigel 77
1718 nigel 87 switch(ecode[1])
1719     {
1720     case PT_ANY:
1721     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1722     break;
1723 nigel 77
1724 nigel 87 case PT_LAMP:
1725 ph10 349 if ((prop->chartype == ucp_Lu ||
1726     prop->chartype == ucp_Ll ||
1727     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1728 nigel 77 RRETURN(MATCH_NOMATCH);
1729 nigel 87 break;
1730    
1731     case PT_GC:
1732 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1733 nigel 77 RRETURN(MATCH_NOMATCH);
1734 nigel 87 break;
1735    
1736     case PT_PC:
1737 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1738 nigel 87 RRETURN(MATCH_NOMATCH);
1739     break;
1740    
1741     case PT_SC:
1742 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
1743 nigel 87 RRETURN(MATCH_NOMATCH);
1744     break;
1745    
1746     default:
1747     RRETURN(PCRE_ERROR_INTERNAL);
1748 nigel 77 }
1749 nigel 87
1750     ecode += 3;
1751 nigel 77 }
1752     break;
1753    
1754     /* Match an extended Unicode sequence. We will get here only if the support
1755     is in the binary; otherwise a compile-time error occurs. */
1756    
1757     case OP_EXTUNI:
1758     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1759     GETCHARINCTEST(c, eptr);
1760     {
1761 ph10 349 int category = UCD_CATEGORY(c);
1762 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1763     while (eptr < md->end_subject)
1764     {
1765     int len = 1;
1766     if (!utf8) c = *eptr; else
1767     {
1768     GETCHARLEN(c, eptr, len);
1769     }
1770 ph10 349 category = UCD_CATEGORY(c);
1771 nigel 77 if (category != ucp_M) break;
1772     eptr += len;
1773     }
1774     }
1775     ecode++;
1776     break;
1777     #endif
1778    
1779    
1780     /* Match a back reference, possibly repeatedly. Look past the end of the
1781     item to see if there is repeat information following. The code is similar
1782     to that for character classes, but repeated for efficiency. Then obey
1783     similar code to character type repeats - written out again for speed.
1784     However, if the referenced string is the empty string, always treat
1785     it as matched, any number of times (otherwise there could be infinite
1786     loops). */
1787    
1788     case OP_REF:
1789     {
1790     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1791 ph10 345 ecode += 3;
1792    
1793 ph10 336 /* If the reference is unset, there are two possibilities:
1794 ph10 345
1795 ph10 336 (a) In the default, Perl-compatible state, set the length to be longer
1796     than the amount of subject left; this ensures that every attempt at a
1797     match fails. We can't just fail here, because of the possibility of
1798     quantifiers with zero minima.
1799 ph10 345
1800     (b) If the JavaScript compatibility flag is set, set the length to zero
1801     so that the back reference matches an empty string.
1802    
1803     Otherwise, set the length to the length of what was matched by the
1804 ph10 336 referenced subpattern. */
1805 ph10 345
1806 ph10 336 if (offset >= offset_top || md->offset_vector[offset] < 0)
1807 ph10 345 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1808 ph10 336 else
1809     length = md->offset_vector[offset+1] - md->offset_vector[offset];
1810 nigel 77
1811     /* Set up for repetition, or handle the non-repeated case */
1812    
1813     switch (*ecode)
1814     {
1815     case OP_CRSTAR:
1816     case OP_CRMINSTAR:
1817     case OP_CRPLUS:
1818     case OP_CRMINPLUS:
1819     case OP_CRQUERY:
1820     case OP_CRMINQUERY:
1821     c = *ecode++ - OP_CRSTAR;
1822     minimize = (c & 1) != 0;
1823     min = rep_min[c]; /* Pick up values from tables; */
1824     max = rep_max[c]; /* zero for max => infinity */
1825     if (max == 0) max = INT_MAX;
1826     break;
1827    
1828     case OP_CRRANGE:
1829     case OP_CRMINRANGE:
1830     minimize = (*ecode == OP_CRMINRANGE);
1831     min = GET2(ecode, 1);
1832     max = GET2(ecode, 3);
1833     if (max == 0) max = INT_MAX;
1834     ecode += 5;
1835     break;
1836    
1837     default: /* No repeat follows */
1838     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1839     eptr += length;
1840     continue; /* With the main loop */
1841     }
1842    
1843     /* If the length of the reference is zero, just continue with the
1844     main loop. */
1845    
1846     if (length == 0) continue;
1847    
1848     /* First, ensure the minimum number of matches are present. We get back
1849     the length of the reference string explicitly rather than passing the
1850     address of eptr, so that eptr can be a register variable. */
1851    
1852     for (i = 1; i <= min; i++)
1853     {
1854     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1855     eptr += length;
1856     }
1857    
1858     /* If min = max, continue at the same level without recursion.
1859     They are not both allowed to be zero. */
1860    
1861     if (min == max) continue;
1862    
1863     /* If minimizing, keep trying and advancing the pointer */
1864    
1865     if (minimize)
1866     {
1867     for (fi = min;; fi++)
1868     {
1869 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1870 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1871     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1872     RRETURN(MATCH_NOMATCH);
1873     eptr += length;
1874     }
1875     /* Control never gets here */
1876     }
1877    
1878     /* If maximizing, find the longest string and work backwards */
1879    
1880     else
1881     {
1882     pp = eptr;
1883     for (i = min; i < max; i++)
1884     {
1885     if (!match_ref(offset, eptr, length, md, ims)) break;
1886     eptr += length;
1887     }
1888     while (eptr >= pp)
1889     {
1890 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1891 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1892     eptr -= length;
1893     }
1894     RRETURN(MATCH_NOMATCH);
1895     }
1896     }
1897     /* Control never gets here */
1898    
1899    
1900    
1901     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1902     used when all the characters in the class have values in the range 0-255,
1903     and either the matching is caseful, or the characters are in the range
1904     0-127 when UTF-8 processing is enabled. The only difference between
1905     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1906     encountered.
1907    
1908     First, look past the end of the item to see if there is repeat information
1909     following. Then obey similar code to character type repeats - written out
1910     again for speed. */
1911    
1912     case OP_NCLASS:
1913     case OP_CLASS:
1914     {
1915     data = ecode + 1; /* Save for matching */
1916     ecode += 33; /* Advance past the item */
1917    
1918     switch (*ecode)
1919     {
1920     case OP_CRSTAR:
1921     case OP_CRMINSTAR:
1922     case OP_CRPLUS:
1923     case OP_CRMINPLUS:
1924     case OP_CRQUERY:
1925     case OP_CRMINQUERY:
1926     c = *ecode++ - OP_CRSTAR;
1927     minimize = (c & 1) != 0;
1928     min = rep_min[c]; /* Pick up values from tables; */
1929     max = rep_max[c]; /* zero for max => infinity */
1930     if (max == 0) max = INT_MAX;
1931     break;
1932    
1933     case OP_CRRANGE:
1934     case OP_CRMINRANGE:
1935     minimize = (*ecode == OP_CRMINRANGE);
1936     min = GET2(ecode, 1);
1937     max = GET2(ecode, 3);
1938     if (max == 0) max = INT_MAX;
1939     ecode += 5;
1940     break;
1941    
1942     default: /* No repeat follows */
1943     min = max = 1;
1944     break;
1945     }
1946    
1947     /* First, ensure the minimum number of matches are present. */
1948    
1949     #ifdef SUPPORT_UTF8
1950     /* UTF-8 mode */
1951     if (utf8)
1952     {
1953     for (i = 1; i <= min; i++)
1954     {
1955     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1956     GETCHARINC(c, eptr);
1957     if (c > 255)
1958     {
1959     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1960     }
1961     else
1962     {
1963     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1964     }
1965     }
1966     }
1967     else
1968     #endif
1969     /* Not UTF-8 mode */
1970     {
1971     for (i = 1; i <= min; i++)
1972     {
1973     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1974     c = *eptr++;
1975     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1976     }
1977     }
1978    
1979     /* If max == min we can continue with the main loop without the
1980     need to recurse. */
1981    
1982     if (min == max) continue;
1983    
1984     /* If minimizing, keep testing the rest of the expression and advancing
1985     the pointer while it matches the class. */
1986    
1987     if (minimize)
1988     {
1989     #ifdef SUPPORT_UTF8
1990     /* UTF-8 mode */
1991     if (utf8)
1992     {
1993     for (fi = min;; fi++)
1994     {
1995 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1996 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1997     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1998     GETCHARINC(c, eptr);
1999     if (c > 255)
2000     {
2001     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2002     }
2003     else
2004     {
2005     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2006     }
2007     }
2008     }
2009     else
2010     #endif
2011     /* Not UTF-8 mode */
2012     {
2013     for (fi = min;; fi++)
2014     {
2015 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2016 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2017     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2018     c = *eptr++;
2019     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2020     }
2021     }
2022     /* Control never gets here */
2023     }
2024    
2025     /* If maximizing, find the longest possible run, then work backwards. */
2026    
2027     else
2028     {
2029     pp = eptr;
2030    
2031     #ifdef SUPPORT_UTF8
2032     /* UTF-8 mode */
2033     if (utf8)
2034     {
2035     for (i = min; i < max; i++)
2036     {
2037     int len = 1;
2038     if (eptr >= md->end_subject) break;
2039     GETCHARLEN(c, eptr, len);
2040     if (c > 255)
2041     {
2042     if (op == OP_CLASS) break;
2043     }
2044     else
2045     {
2046     if ((data[c/8] & (1 << (c&7))) == 0) break;
2047     }
2048     eptr += len;
2049     }
2050     for (;;)
2051     {
2052 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2053 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2054     if (eptr-- == pp) break; /* Stop if tried at original pos */
2055     BACKCHAR(eptr);
2056     }
2057     }
2058     else
2059     #endif
2060     /* Not UTF-8 mode */
2061     {
2062     for (i = min; i < max; i++)
2063     {
2064     if (eptr >= md->end_subject) break;
2065     c = *eptr;
2066     if ((data[c/8] & (1 << (c&7))) == 0) break;
2067     eptr++;
2068     }
2069     while (eptr >= pp)
2070     {
2071 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2072 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2073 nigel 77 eptr--;
2074     }
2075     }
2076    
2077     RRETURN(MATCH_NOMATCH);
2078     }
2079     }
2080     /* Control never gets here */
2081    
2082    
2083     /* Match an extended character class. This opcode is encountered only
2084 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2085     mode, because Unicode properties are supported in non-UTF-8 mode. */
2086 nigel 77
2087     #ifdef SUPPORT_UTF8
2088     case OP_XCLASS:
2089     {
2090     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2091     ecode += GET(ecode, 1); /* Advance past the item */
2092    
2093     switch (*ecode)
2094     {
2095     case OP_CRSTAR:
2096     case OP_CRMINSTAR:
2097     case OP_CRPLUS:
2098     case OP_CRMINPLUS:
2099     case OP_CRQUERY:
2100     case OP_CRMINQUERY:
2101     c = *ecode++ - OP_CRSTAR;
2102     minimize = (c & 1) != 0;
2103     min = rep_min[c]; /* Pick up values from tables; */
2104     max = rep_max[c]; /* zero for max => infinity */
2105     if (max == 0) max = INT_MAX;
2106     break;
2107    
2108     case OP_CRRANGE:
2109     case OP_CRMINRANGE:
2110     minimize = (*ecode == OP_CRMINRANGE);
2111     min = GET2(ecode, 1);
2112     max = GET2(ecode, 3);
2113     if (max == 0) max = INT_MAX;
2114     ecode += 5;
2115     break;
2116    
2117     default: /* No repeat follows */
2118     min = max = 1;
2119     break;
2120     }
2121    
2122     /* First, ensure the minimum number of matches are present. */
2123    
2124     for (i = 1; i <= min; i++)
2125     {
2126     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2127 ph10 384 GETCHARINCTEST(c, eptr);
2128 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2129     }
2130    
2131     /* If max == min we can continue with the main loop without the
2132     need to recurse. */
2133    
2134     if (min == max) continue;
2135    
2136     /* If minimizing, keep testing the rest of the expression and advancing
2137     the pointer while it matches the class. */
2138    
2139     if (minimize)
2140     {
2141     for (fi = min;; fi++)
2142     {
2143 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2144 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2145     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2146 ph10 384 GETCHARINCTEST(c, eptr);
2147 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2148     }
2149     /* Control never gets here */
2150     }
2151    
2152     /* If maximizing, find the longest possible run, then work backwards. */
2153    
2154     else
2155     {
2156     pp = eptr;
2157     for (i = min; i < max; i++)
2158     {
2159     int len = 1;
2160     if (eptr >= md->end_subject) break;
2161 ph10 384 GETCHARLENTEST(c, eptr, len);
2162 nigel 77 if (!_pcre_xclass(c, data)) break;
2163     eptr += len;
2164     }
2165     for(;;)
2166     {
2167 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2168 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2169     if (eptr-- == pp) break; /* Stop if tried at original pos */
2170 ph10 214 if (utf8) BACKCHAR(eptr);
2171 nigel 77 }
2172     RRETURN(MATCH_NOMATCH);
2173     }
2174    
2175     /* Control never gets here */
2176     }
2177     #endif /* End of XCLASS */
2178    
2179     /* Match a single character, casefully */
2180    
2181     case OP_CHAR:
2182     #ifdef SUPPORT_UTF8
2183     if (utf8)
2184     {
2185     length = 1;
2186     ecode++;
2187     GETCHARLEN(fc, ecode, length);
2188     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2189     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2190     }
2191     else
2192     #endif
2193    
2194     /* Non-UTF-8 mode */
2195     {
2196     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2197     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2198     ecode += 2;
2199     }
2200     break;
2201    
2202     /* Match a single character, caselessly */
2203    
2204     case OP_CHARNC:
2205     #ifdef SUPPORT_UTF8
2206     if (utf8)
2207     {
2208     length = 1;
2209     ecode++;
2210     GETCHARLEN(fc, ecode, length);
2211    
2212     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2213    
2214     /* If the pattern character's value is < 128, we have only one byte, and
2215     can use the fast lookup table. */
2216    
2217     if (fc < 128)
2218     {
2219     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2220     }
2221    
2222     /* Otherwise we must pick up the subject character */
2223    
2224     else
2225     {
2226 nigel 93 unsigned int dc;
2227 nigel 77 GETCHARINC(dc, eptr);
2228     ecode += length;
2229    
2230     /* If we have Unicode property support, we can use it to test the other
2231 nigel 87 case of the character, if there is one. */
2232 nigel 77
2233     if (fc != dc)
2234     {
2235     #ifdef SUPPORT_UCP
2236 ph10 349 if (dc != UCD_OTHERCASE(fc))
2237 nigel 77 #endif
2238     RRETURN(MATCH_NOMATCH);
2239     }
2240     }
2241     }
2242     else
2243     #endif /* SUPPORT_UTF8 */
2244    
2245     /* Non-UTF-8 mode */
2246     {
2247     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2248     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2249     ecode += 2;
2250     }
2251     break;
2252    
2253 nigel 93 /* Match a single character repeatedly. */
2254 nigel 77
2255     case OP_EXACT:
2256     min = max = GET2(ecode, 1);
2257     ecode += 3;
2258     goto REPEATCHAR;
2259    
2260 nigel 93 case OP_POSUPTO:
2261     possessive = TRUE;
2262     /* Fall through */
2263    
2264 nigel 77 case OP_UPTO:
2265     case OP_MINUPTO:
2266     min = 0;
2267     max = GET2(ecode, 1);
2268     minimize = *ecode == OP_MINUPTO;
2269     ecode += 3;
2270     goto REPEATCHAR;
2271    
2272 nigel 93 case OP_POSSTAR:
2273     possessive = TRUE;
2274     min = 0;
2275     max = INT_MAX;
2276     ecode++;
2277     goto REPEATCHAR;
2278    
2279     case OP_POSPLUS:
2280     possessive = TRUE;
2281     min = 1;
2282     max = INT_MAX;
2283     ecode++;
2284     goto REPEATCHAR;
2285    
2286     case OP_POSQUERY:
2287     possessive = TRUE;
2288     min = 0;
2289     max = 1;
2290     ecode++;
2291     goto REPEATCHAR;
2292    
2293 nigel 77 case OP_STAR:
2294     case OP_MINSTAR:
2295     case OP_PLUS:
2296     case OP_MINPLUS:
2297     case OP_QUERY:
2298     case OP_MINQUERY:
2299     c = *ecode++ - OP_STAR;
2300     minimize = (c & 1) != 0;
2301     min = rep_min[c]; /* Pick up values from tables; */
2302     max = rep_max[c]; /* zero for max => infinity */
2303     if (max == 0) max = INT_MAX;
2304    
2305     /* Common code for all repeated single-character matches. We can give
2306     up quickly if there are fewer than the minimum number of characters left in
2307     the subject. */
2308    
2309     REPEATCHAR:
2310     #ifdef SUPPORT_UTF8
2311     if (utf8)
2312     {
2313     length = 1;
2314     charptr = ecode;
2315     GETCHARLEN(fc, ecode, length);
2316     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2317     ecode += length;
2318    
2319     /* Handle multibyte character matching specially here. There is
2320     support for caseless matching if UCP support is present. */
2321    
2322     if (length > 1)
2323     {
2324     #ifdef SUPPORT_UCP
2325 nigel 93 unsigned int othercase;
2326 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2327 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
2328 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2329 ph10 115 else oclength = 0;
2330 nigel 77 #endif /* SUPPORT_UCP */
2331    
2332     for (i = 1; i <= min; i++)
2333     {
2334     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2335 ph10 123 #ifdef SUPPORT_UCP
2336 nigel 77 /* Need braces because of following else */
2337     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2338     else
2339     {
2340     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2341     eptr += oclength;
2342     }
2343 ph10 115 #else /* without SUPPORT_UCP */
2344     else { RRETURN(MATCH_NOMATCH); }
2345 ph10 123 #endif /* SUPPORT_UCP */
2346 nigel 77 }
2347    
2348     if (min == max) continue;
2349    
2350     if (minimize)
2351     {
2352     for (fi = min;; fi++)
2353     {
2354 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2355 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2356     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2357     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2358 ph10 123 #ifdef SUPPORT_UCP
2359 nigel 77 /* Need braces because of following else */
2360     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2361     else
2362     {
2363     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2364     eptr += oclength;
2365     }
2366 ph10 115 #else /* without SUPPORT_UCP */
2367     else { RRETURN (MATCH_NOMATCH); }
2368     #endif /* SUPPORT_UCP */
2369 nigel 77 }
2370     /* Control never gets here */
2371     }
2372 nigel 93
2373     else /* Maximize */
2374 nigel 77 {
2375     pp = eptr;
2376     for (i = min; i < max; i++)
2377     {
2378     if (eptr > md->end_subject - length) break;
2379     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2380 ph10 123 #ifdef SUPPORT_UCP
2381 nigel 77 else if (oclength == 0) break;
2382     else
2383     {
2384     if (memcmp(eptr, occhars, oclength) != 0) break;
2385     eptr += oclength;
2386     }
2387 ph10 115 #else /* without SUPPORT_UCP */
2388     else break;
2389 ph10 123 #endif /* SUPPORT_UCP */
2390 nigel 77 }
2391 nigel 93
2392     if (possessive) continue;
2393 ph10 120 for(;;)
2394 nigel 77 {
2395 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2396 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2397 ph10 120 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2398 ph10 115 #ifdef SUPPORT_UCP
2399     eptr--;
2400     BACKCHAR(eptr);
2401 ph10 123 #else /* without SUPPORT_UCP */
2402 nigel 77 eptr -= length;
2403 ph10 123 #endif /* SUPPORT_UCP */
2404 nigel 77 }
2405     }
2406     /* Control never gets here */
2407     }
2408    
2409     /* If the length of a UTF-8 character is 1, we fall through here, and
2410     obey the code as for non-UTF-8 characters below, though in this case the
2411     value of fc will always be < 128. */
2412     }
2413     else
2414     #endif /* SUPPORT_UTF8 */
2415    
2416     /* When not in UTF-8 mode, load a single-byte character. */
2417     {
2418     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2419     fc = *ecode++;
2420     }
2421    
2422     /* The value of fc at this point is always less than 256, though we may or
2423     may not be in UTF-8 mode. The code is duplicated for the caseless and
2424     caseful cases, for speed, since matching characters is likely to be quite
2425     common. First, ensure the minimum number of matches are present. If min =
2426     max, continue at the same level without recursing. Otherwise, if
2427     minimizing, keep trying the rest of the expression and advancing one
2428     matching character if failing, up to the maximum. Alternatively, if
2429     maximizing, find the maximum number of characters and work backwards. */
2430    
2431     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2432     max, eptr));
2433    
2434     if ((ims & PCRE_CASELESS) != 0)
2435     {
2436     fc = md->lcc[fc];
2437     for (i = 1; i <= min; i++)
2438     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2439     if (min == max) continue;
2440     if (minimize)
2441     {
2442     for (fi = min;; fi++)
2443     {
2444 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2445 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2446     if (fi >= max || eptr >= md->end_subject ||
2447     fc != md->lcc[*eptr++])
2448     RRETURN(MATCH_NOMATCH);
2449     }
2450     /* Control never gets here */
2451     }
2452 nigel 93 else /* Maximize */
2453 nigel 77 {
2454     pp = eptr;
2455     for (i = min; i < max; i++)
2456     {
2457     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2458     eptr++;
2459     }
2460 nigel 93 if (possessive) continue;
2461 nigel 77 while (eptr >= pp)
2462     {
2463 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2464 nigel 77 eptr--;
2465     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2466     }
2467     RRETURN(MATCH_NOMATCH);
2468     }
2469     /* Control never gets here */
2470     }
2471    
2472     /* Caseful comparisons (includes all multi-byte characters) */
2473    
2474     else
2475     {
2476     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2477     if (min == max) continue;
2478     if (minimize)
2479     {
2480     for (fi = min;; fi++)
2481     {
2482 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2483 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2484     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2485     RRETURN(MATCH_NOMATCH);
2486     }
2487     /* Control never gets here */
2488     }
2489 nigel 93 else /* Maximize */
2490 nigel 77 {
2491     pp = eptr;
2492     for (i = min; i < max; i++)
2493     {
2494     if (eptr >= md->end_subject || fc != *eptr) break;
2495     eptr++;
2496     }
2497 nigel 93 if (possessive) continue;
2498 nigel 77 while (eptr >= pp)
2499     {
2500 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2501 nigel 77 eptr--;
2502     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2503     }
2504     RRETURN(MATCH_NOMATCH);
2505     }
2506     }
2507     /* Control never gets here */
2508    
2509     /* Match a negated single one-byte character. The character we are
2510     checking can be multibyte. */
2511    
2512     case OP_NOT:
2513     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2514     ecode++;
2515     GETCHARINCTEST(c, eptr);
2516     if ((ims & PCRE_CASELESS) != 0)
2517     {
2518     #ifdef SUPPORT_UTF8
2519     if (c < 256)
2520     #endif
2521     c = md->lcc[c];
2522     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2523     }
2524     else
2525     {
2526     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2527     }
2528     break;
2529    
2530     /* Match a negated single one-byte character repeatedly. This is almost a
2531     repeat of the code for a repeated single character, but I haven't found a
2532     nice way of commoning these up that doesn't require a test of the
2533     positive/negative option for each character match. Maybe that wouldn't add
2534     very much to the time taken, but character matching *is* what this is all
2535     about... */
2536    
2537     case OP_NOTEXACT:
2538     min = max = GET2(ecode, 1);
2539     ecode += 3;
2540     goto REPEATNOTCHAR;
2541    
2542     case OP_NOTUPTO:
2543     case OP_NOTMINUPTO:
2544     min = 0;
2545     max = GET2(ecode, 1);
2546     minimize = *ecode == OP_NOTMINUPTO;
2547     ecode += 3;
2548     goto REPEATNOTCHAR;
2549    
2550 nigel 93 case OP_NOTPOSSTAR:
2551     possessive = TRUE;
2552     min = 0;
2553     max = INT_MAX;
2554     ecode++;
2555     goto REPEATNOTCHAR;
2556    
2557     case OP_NOTPOSPLUS:
2558     possessive = TRUE;
2559     min = 1;
2560     max = INT_MAX;
2561     ecode++;
2562     goto REPEATNOTCHAR;
2563    
2564     case OP_NOTPOSQUERY:
2565     possessive = TRUE;
2566     min = 0;
2567     max = 1;
2568     ecode++;
2569     goto REPEATNOTCHAR;
2570    
2571     case OP_NOTPOSUPTO:
2572     possessive = TRUE;
2573     min = 0;
2574     max = GET2(ecode, 1);
2575     ecode += 3;
2576     goto REPEATNOTCHAR;
2577    
2578 nigel 77 case OP_NOTSTAR:
2579     case OP_NOTMINSTAR:
2580     case OP_NOTPLUS:
2581     case OP_NOTMINPLUS:
2582     case OP_NOTQUERY:
2583     case OP_NOTMINQUERY:
2584     c = *ecode++ - OP_NOTSTAR;
2585     minimize = (c & 1) != 0;
2586     min = rep_min[c]; /* Pick up values from tables; */
2587     max = rep_max[c]; /* zero for max => infinity */
2588     if (max == 0) max = INT_MAX;
2589    
2590     /* Common code for all repeated single-byte matches. We can give up quickly
2591     if there are fewer than the minimum number of bytes left in the
2592     subject. */
2593    
2594     REPEATNOTCHAR:
2595     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2596     fc = *ecode++;
2597    
2598     /* The code is duplicated for the caseless and caseful cases, for speed,
2599     since matching characters is likely to be quite common. First, ensure the
2600     minimum number of matches are present. If min = max, continue at the same
2601     level without recursing. Otherwise, if minimizing, keep trying the rest of
2602     the expression and advancing one matching character if failing, up to the
2603     maximum. Alternatively, if maximizing, find the maximum number of
2604     characters and work backwards. */
2605    
2606     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2607     max, eptr));
2608    
2609     if ((ims & PCRE_CASELESS) != 0)
2610     {
2611     fc = md->lcc[fc];
2612    
2613     #ifdef SUPPORT_UTF8
2614     /* UTF-8 mode */
2615     if (utf8)
2616     {
2617 nigel 93 register unsigned int d;
2618 nigel 77 for (i = 1; i <= min; i++)
2619     {
2620     GETCHARINC(d, eptr);
2621     if (d < 256) d = md->lcc[d];
2622     if (fc == d) RRETURN(MATCH_NOMATCH);
2623     }
2624     }
2625     else
2626     #endif
2627    
2628     /* Not UTF-8 mode */
2629     {
2630     for (i = 1; i <= min; i++)
2631     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2632     }
2633    
2634     if (min == max) continue;
2635    
2636     if (minimize)
2637     {
2638     #ifdef SUPPORT_UTF8
2639     /* UTF-8 mode */
2640     if (utf8)
2641     {
2642 nigel 93 register unsigned int d;
2643 nigel 77 for (fi = min;; fi++)
2644     {
2645 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2646 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2647 ph10 366 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2648 nigel 77 GETCHARINC(d, eptr);
2649     if (d < 256) d = md->lcc[d];
2650 ph10 366 if (fc == d) RRETURN(MATCH_NOMATCH);
2651 ph10 371
2652 nigel 77 }
2653     }
2654     else
2655     #endif
2656     /* Not UTF-8 mode */
2657     {
2658     for (fi = min;; fi++)
2659     {
2660 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2661 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2662     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2663     RRETURN(MATCH_NOMATCH);
2664     }
2665     }
2666     /* Control never gets here */
2667     }
2668    
2669     /* Maximize case */
2670    
2671     else
2672     {
2673     pp = eptr;
2674    
2675     #ifdef SUPPORT_UTF8
2676     /* UTF-8 mode */
2677     if (utf8)
2678     {
2679 nigel 93 register unsigned int d;
2680 nigel 77 for (i = min; i < max; i++)
2681     {
2682     int len = 1;
2683     if (eptr >= md->end_subject) break;
2684     GETCHARLEN(d, eptr, len);
2685     if (d < 256) d = md->lcc[d];
2686     if (fc == d) break;
2687     eptr += len;
2688     }
2689 nigel 93 if (possessive) continue;
2690     for(;;)
2691 nigel 77 {
2692 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2693 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2694     if (eptr-- == pp) break; /* Stop if tried at original pos */
2695     BACKCHAR(eptr);
2696     }
2697     }
2698     else
2699     #endif
2700     /* Not UTF-8 mode */
2701     {
2702     for (i = min; i < max; i++)
2703     {
2704     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2705     eptr++;
2706     }
2707 nigel 93 if (possessive) continue;
2708 nigel 77 while (eptr >= pp)
2709     {
2710 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2711 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2712     eptr--;
2713     }
2714     }
2715    
2716     RRETURN(MATCH_NOMATCH);
2717     }
2718     /* Control never gets here */
2719     }
2720    
2721     /* Caseful comparisons */
2722    
2723     else
2724     {
2725     #ifdef SUPPORT_UTF8
2726     /* UTF-8 mode */
2727     if (utf8)
2728     {
2729 nigel 93 register unsigned int d;
2730 nigel 77 for (i = 1; i <= min; i++)
2731     {
2732     GETCHARINC(d, eptr);
2733     if (fc == d) RRETURN(MATCH_NOMATCH);
2734     }
2735     }
2736     else
2737     #endif
2738     /* Not UTF-8 mode */
2739     {
2740     for (i = 1; i <= min; i++)
2741     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2742     }
2743    
2744     if (min == max) continue;
2745    
2746     if (minimize)
2747     {
2748     #ifdef SUPPORT_UTF8
2749     /* UTF-8 mode */
2750     if (utf8)
2751     {
2752 nigel 93 register unsigned int d;
2753 nigel 77 for (fi = min;; fi++)
2754     {
2755 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2756 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2757 ph10 366 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2758 nigel 77 GETCHARINC(d, eptr);
2759 ph10 371 if (fc == d) RRETURN(MATCH_NOMATCH);
2760 nigel 77 }
2761     }
2762     else
2763     #endif
2764     /* Not UTF-8 mode */
2765     {
2766     for (fi = min;; fi++)
2767     {
2768 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2769 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2770     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2771     RRETURN(MATCH_NOMATCH);
2772     }
2773     }
2774     /* Control never gets here */
2775     }
2776    
2777     /* Maximize case */
2778    
2779     else
2780     {
2781     pp = eptr;
2782    
2783     #ifdef SUPPORT_UTF8
2784     /* UTF-8 mode */
2785     if (utf8)
2786     {
2787 nigel 93 register unsigned int d;
2788 nigel 77 for (i = min; i < max; i++)
2789     {
2790     int len = 1;
2791     if (eptr >= md->end_subject) break;
2792     GETCHARLEN(d, eptr, len);
2793     if (fc == d) break;
2794     eptr += len;
2795     }
2796 nigel 93 if (possessive) continue;
2797 nigel 77 for(;;)
2798     {
2799 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2800 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2801     if (eptr-- == pp) break; /* Stop if tried at original pos */
2802     BACKCHAR(eptr);
2803     }
2804     }
2805     else
2806     #endif
2807     /* Not UTF-8 mode */
2808     {
2809     for (i = min; i < max; i++)
2810     {
2811     if (eptr >= md->end_subject || fc == *eptr) break;
2812     eptr++;
2813     }
2814 nigel 93 if (possessive) continue;
2815 nigel 77 while (eptr >= pp)
2816     {
2817 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2818 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2819     eptr--;
2820     }
2821     }
2822    
2823     RRETURN(MATCH_NOMATCH);
2824     }
2825     }
2826     /* Control never gets here */
2827    
2828     /* Match a single character type repeatedly; several different opcodes
2829     share code. This is very similar to the code for single characters, but we
2830     repeat it in the interests of efficiency. */
2831    
2832     case OP_TYPEEXACT:
2833     min = max = GET2(ecode, 1);
2834     minimize = TRUE;
2835     ecode += 3;
2836     goto REPEATTYPE;
2837    
2838     case OP_TYPEUPTO:
2839     case OP_TYPEMINUPTO:
2840     min = 0;
2841     max = GET2(ecode, 1);
2842     minimize = *ecode == OP_TYPEMINUPTO;
2843     ecode += 3;
2844     goto REPEATTYPE;
2845    
2846 nigel 93 case OP_TYPEPOSSTAR:
2847     possessive = TRUE;
2848     min = 0;
2849     max = INT_MAX;
2850     ecode++;
2851     goto REPEATTYPE;
2852    
2853     case OP_TYPEPOSPLUS:
2854     possessive = TRUE;
2855     min = 1;
2856     max = INT_MAX;
2857     ecode++;
2858     goto REPEATTYPE;
2859    
2860     case OP_TYPEPOSQUERY:
2861     possessive = TRUE;
2862     min = 0;
2863     max = 1;
2864     ecode++;
2865     goto REPEATTYPE;
2866    
2867     case OP_TYPEPOSUPTO:
2868     possessive = TRUE;
2869     min = 0;
2870     max = GET2(ecode, 1);
2871     ecode += 3;
2872     goto REPEATTYPE;
2873    
2874 nigel 77 case OP_TYPESTAR:
2875     case OP_TYPEMINSTAR:
2876     case OP_TYPEPLUS:
2877     case OP_TYPEMINPLUS:
2878     case OP_TYPEQUERY:
2879     case OP_TYPEMINQUERY:
2880     c = *ecode++ - OP_TYPESTAR;
2881     minimize = (c & 1) != 0;
2882     min = rep_min[c]; /* Pick up values from tables; */
2883     max = rep_max[c]; /* zero for max => infinity */
2884     if (max == 0) max = INT_MAX;
2885    
2886     /* Common code for all repeated single character type matches. Note that
2887     in UTF-8 mode, '.' matches a character of any length, and several other
2888     types (OP_ANYNL, OP_HSPACE, OP_VSPACE, the negated types) can also match
2889     multi-byte characters, so one byte per character must not be assumed. */
2889    
2890     REPEATTYPE:
2891     ctype = *ecode++; /* Code for the character type */
2892    
2893     #ifdef SUPPORT_UCP
2894     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2895     {
2896     prop_fail_result = ctype == OP_NOTPROP;
2897     prop_type = *ecode++;
2898 nigel 87 prop_value = *ecode++;
2899 nigel 77 }
2900     else prop_type = -1;
2901     #endif
2902    
2903     /* First, ensure the minimum number of matches are present. Use inline
2904     code for maximizing the speed, and do the type test once at the start
2905     (i.e. keep it out of the loop). Also we can test that there are at least
2906     the minimum number of bytes before we start. This isn't as effective in
2907     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2908     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2909     and single-bytes. */
2910    
2911     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2912     if (min > 0)
2913     {
2914     #ifdef SUPPORT_UCP
2915 nigel 87 if (prop_type >= 0)
2916 nigel 77 {
2917 nigel 87 switch(prop_type)
2918 nigel 77 {
2919 nigel 87 case PT_ANY:
2920     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2921     for (i = 1; i <= min; i++)
2922     {
2923     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2924 ph10 184 GETCHARINCTEST(c, eptr);
2925 nigel 87 }
2926     break;
2927    
2928     case PT_LAMP:
2929     for (i = 1; i <= min; i++)
2930     {
2931     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2932 ph10 184 GETCHARINCTEST(c, eptr);
2933 ph10 349 prop_chartype = UCD_CHARTYPE(c);
2934 nigel 87 if ((prop_chartype == ucp_Lu ||
2935     prop_chartype == ucp_Ll ||
2936     prop_chartype == ucp_Lt) == prop_fail_result)
2937     RRETURN(MATCH_NOMATCH);
2938     }
2939     break;
2940    
2941     case PT_GC:
2942     for (i = 1; i <= min; i++)
2943     {
2944     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2945 ph10 184 GETCHARINCTEST(c, eptr);
2946 ph10 349 prop_category = UCD_CATEGORY(c);
2947 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
2948     RRETURN(MATCH_NOMATCH);
2949     }
2950     break;
2951    
2952     case PT_PC:
2953     for (i = 1; i <= min; i++)
2954     {
2955     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2956 ph10 184 GETCHARINCTEST(c, eptr);
2957 ph10 349 prop_chartype = UCD_CHARTYPE(c);
2958 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
2959     RRETURN(MATCH_NOMATCH);
2960     }
2961     break;
2962    
2963     case PT_SC:
2964     for (i = 1; i <= min; i++)
2965     {
2966     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2967 ph10 184 GETCHARINCTEST(c, eptr);
2968 ph10 349 prop_script = UCD_SCRIPT(c);
2969 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
2970     RRETURN(MATCH_NOMATCH);
2971     }
2972     break;
2973    
2974     default:
2975     RRETURN(PCRE_ERROR_INTERNAL);
2976 nigel 77 }
2977     }
2978    
2979     /* Match extended Unicode sequences. We will get here only if the
2980     support is in the binary; otherwise a compile-time error occurs. */
2981    
2982     else if (ctype == OP_EXTUNI)
2983     {
2984     for (i = 1; i <= min; i++)
2985     {
2986     GETCHARINCTEST(c, eptr);
2987 ph10 349 prop_category = UCD_CATEGORY(c);
2988 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2989     while (eptr < md->end_subject)
2990     {
2991     int len = 1;
2992     if (!utf8) c = *eptr; else
2993     {
2994     GETCHARLEN(c, eptr, len);
2995     }
2996 ph10 349 prop_category = UCD_CATEGORY(c);
2997 nigel 77 if (prop_category != ucp_M) break;
2998     eptr += len;
2999     }
3000     }
3001     }
3002    
3003     else
3004     #endif /* SUPPORT_UCP */
3005    
3006     /* Handle all other cases when the coding is UTF-8 */
3007    
3008     #ifdef SUPPORT_UTF8
3009     if (utf8) switch(ctype)
3010     {
3011     case OP_ANY:
3012     for (i = 1; i <= min; i++)
3013     {
3014 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr))
3015 nigel 77 RRETURN(MATCH_NOMATCH);
3016 nigel 91 eptr++;
3017 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3018     }
3019     break;
3020    
3021 ph10 341 case OP_ALLANY:
3022     for (i = 1; i <= min; i++)
3023     {
3024     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3025     eptr++;
3026     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3027     }
3028     break;
3029    
3030 nigel 77 case OP_ANYBYTE:
3031     eptr += min;
3032     break;
3033    
3034 nigel 93 case OP_ANYNL:
3035     for (i = 1; i <= min; i++)
3036     {
3037     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3038     GETCHARINC(c, eptr);
3039     switch(c)
3040     {
3041     default: RRETURN(MATCH_NOMATCH);
3042     case 0x000d:
3043     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3044     break;
3045 ph10 231
3046 nigel 93 case 0x000a:
3047 ph10 231 break;
3048    
3049 nigel 93 case 0x000b:
3050     case 0x000c:
3051     case 0x0085:
3052     case 0x2028:
3053     case 0x2029:
3054 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3055 nigel 93 break;
3056     }
3057     }
3058     break;
3059    
3060 ph10 178 case OP_NOT_HSPACE:
3061     for (i = 1; i <= min; i++)
3062     {
3063     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3064     GETCHARINC(c, eptr);
3065     switch(c)
3066     {
3067     default: break;
3068     case 0x09: /* HT */
3069     case 0x20: /* SPACE */
3070     case 0xa0: /* NBSP */
3071     case 0x1680: /* OGHAM SPACE MARK */
3072     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3073     case 0x2000: /* EN QUAD */
3074     case 0x2001: /* EM QUAD */
3075     case 0x2002: /* EN SPACE */
3076     case 0x2003: /* EM SPACE */
3077     case 0x2004: /* THREE-PER-EM SPACE */
3078     case 0x2005: /* FOUR-PER-EM SPACE */
3079     case 0x2006: /* SIX-PER-EM SPACE */
3080     case 0x2007: /* FIGURE SPACE */
3081     case 0x2008: /* PUNCTUATION SPACE */
3082     case 0x2009: /* THIN SPACE */
3083     case 0x200A: /* HAIR SPACE */
3084     case 0x202f: /* NARROW NO-BREAK SPACE */
3085     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3086     case 0x3000: /* IDEOGRAPHIC SPACE */
3087     RRETURN(MATCH_NOMATCH);
3088     }
3089     }
3090     break;
3091 ph10 182
3092 ph10 178 case OP_HSPACE:
3093     for (i = 1; i <= min; i++)
3094     {
3095     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3096     GETCHARINC(c, eptr);
3097     switch(c)
3098     {
3099     default: RRETURN(MATCH_NOMATCH);
3100     case 0x09: /* HT */
3101     case 0x20: /* SPACE */
3102     case 0xa0: /* NBSP */
3103     case 0x1680: /* OGHAM SPACE MARK */
3104     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3105     case 0x2000: /* EN QUAD */
3106     case 0x2001: /* EM QUAD */
3107     case 0x2002: /* EN SPACE */
3108     case 0x2003: /* EM SPACE */
3109     case 0x2004: /* THREE-PER-EM SPACE */
3110     case 0x2005: /* FOUR-PER-EM SPACE */
3111     case 0x2006: /* SIX-PER-EM SPACE */
3112     case 0x2007: /* FIGURE SPACE */
3113     case 0x2008: /* PUNCTUATION SPACE */
3114     case 0x2009: /* THIN SPACE */
3115     case 0x200A: /* HAIR SPACE */
3116     case 0x202f: /* NARROW NO-BREAK SPACE */
3117     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3118     case 0x3000: /* IDEOGRAPHIC SPACE */
3119     break;
3120     }
3121     }
3122     break;
3123 ph10 182
3124 ph10 178 case OP_NOT_VSPACE:
3125     for (i = 1; i <= min; i++)
3126     {
3127     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3128     GETCHARINC(c, eptr);
3129     switch(c)
3130     {
3131     default: break;
3132     case 0x0a: /* LF */
3133     case 0x0b: /* VT */
3134     case 0x0c: /* FF */
3135     case 0x0d: /* CR */
3136     case 0x85: /* NEL */
3137     case 0x2028: /* LINE SEPARATOR */
3138     case 0x2029: /* PARAGRAPH SEPARATOR */
3139     RRETURN(MATCH_NOMATCH);
3140     }
3141     }
3142     break;
3143 ph10 182
3144 ph10 178 case OP_VSPACE:
3145     for (i = 1; i <= min; i++)
3146     {
3147     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3148     GETCHARINC(c, eptr);
3149     switch(c)
3150     {
3151     default: RRETURN(MATCH_NOMATCH);
3152     case 0x0a: /* LF */
3153     case 0x0b: /* VT */
3154     case 0x0c: /* FF */
3155     case 0x0d: /* CR */
3156     case 0x85: /* NEL */
3157     case 0x2028: /* LINE SEPARATOR */
3158     case 0x2029: /* PARAGRAPH SEPARATOR */
3159 ph10 182 break;
3160 ph10 178 }
3161     }
3162     break;
3163    
3164 nigel 77 case OP_NOT_DIGIT:
3165     for (i = 1; i <= min; i++)
3166     {
3167     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3168     GETCHARINC(c, eptr);
3169     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3170     RRETURN(MATCH_NOMATCH);
3171     }
3172     break;
3173    
3174     case OP_DIGIT:
3175     for (i = 1; i <= min; i++)
3176     {
3177     if (eptr >= md->end_subject ||
3178     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3179     RRETURN(MATCH_NOMATCH);
3180     /* No need to skip more bytes - we know it's a 1-byte character */
3181     }
3182     break;
3183    
3184     case OP_NOT_WHITESPACE:
3185     for (i = 1; i <= min; i++)
3186     {
3187     if (eptr >= md->end_subject ||
3188 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3189 nigel 77 RRETURN(MATCH_NOMATCH);
3190 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3191 nigel 77 }
3192     break;
3193    
3194     case OP_WHITESPACE:
3195     for (i = 1; i <= min; i++)
3196     {
3197     if (eptr >= md->end_subject ||
3198     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3199     RRETURN(MATCH_NOMATCH);
3200     /* No need to skip more bytes - we know it's a 1-byte character */
3201     }
3202     break;
3203    
3204     case OP_NOT_WORDCHAR:
3205     for (i = 1; i <= min; i++)
3206     {
3207     if (eptr >= md->end_subject ||
3208 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3209 nigel 77 RRETURN(MATCH_NOMATCH);
3210 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3211 nigel 77 }
3212     break;
3213    
3214     case OP_WORDCHAR:
3215     for (i = 1; i <= min; i++)
3216     {
3217     if (eptr >= md->end_subject ||
3218     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3219     RRETURN(MATCH_NOMATCH);
3220     /* No need to skip more bytes - we know it's a 1-byte character */
3221     }
3222     break;
3223    
3224     default:
3225     RRETURN(PCRE_ERROR_INTERNAL);
3226     } /* End switch(ctype) */
3227    
3228     else
3229     #endif /* SUPPORT_UTF8 */
3230    
3231     /* Code for the non-UTF-8 case for minimum matching of operators other
3232 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3233     number of bytes present, as this was tested above. */
3234 nigel 77
3235     switch(ctype)
3236     {
3237     case OP_ANY:
3238 ph10 342 for (i = 1; i <= min; i++)
3239 nigel 77 {
3240 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3241     eptr++;
3242 nigel 77 }
3243     break;
3244    
3245 ph10 341 case OP_ALLANY:
3246     eptr += min;
3247     break;
3248    
3249 nigel 77 case OP_ANYBYTE:
3250     eptr += min;
3251     break;
3252    
3253 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
3254     bytes are present in this case. */
3255    
3256     case OP_ANYNL:
3257     for (i = 1; i <= min; i++)
3258     {
3259     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3260     switch(*eptr++)
3261     {
3262     default: RRETURN(MATCH_NOMATCH);
3263     case 0x000d:
3264     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3265     break;
3266     case 0x000a:
3267 ph10 231 break;
3268    
3269 nigel 93 case 0x000b:
3270     case 0x000c:
3271     case 0x0085:
3272 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3273 nigel 93 break;
3274     }
3275     }
3276     break;
3277    
3278 ph10 178 case OP_NOT_HSPACE:
3279     for (i = 1; i <= min; i++)
3280     {
3281     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3282     switch(*eptr++)
3283     {
3284     default: break;
3285     case 0x09: /* HT */
3286     case 0x20: /* SPACE */
3287     case 0xa0: /* NBSP */
3288     RRETURN(MATCH_NOMATCH);
3289     }
3290     }
3291     break;
3292    
3293     case OP_HSPACE:
3294     for (i = 1; i <= min; i++)
3295     {
3296     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3297     switch(*eptr++)
3298     {
3299     default: RRETURN(MATCH_NOMATCH);
3300     case 0x09: /* HT */
3301     case 0x20: /* SPACE */
3302     case 0xa0: /* NBSP */
3303 ph10 182 break;
3304 ph10 178 }
3305     }
3306     break;
3307    
3308     case OP_NOT_VSPACE:
3309     for (i = 1; i <= min; i++)
3310     {
3311     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3312     switch(*eptr++)
3313     {
3314     default: break;
3315     case 0x0a: /* LF */
3316     case 0x0b: /* VT */
3317     case 0x0c: /* FF */
3318     case 0x0d: /* CR */
3319     case 0x85: /* NEL */
3320     RRETURN(MATCH_NOMATCH);
3321     }
3322     }
3323     break;
3324    
3325     case OP_VSPACE:
3326     for (i = 1; i <= min; i++)
3327     {
3328     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3329     switch(*eptr++)
3330     {
3331     default: RRETURN(MATCH_NOMATCH);
3332     case 0x0a: /* LF */
3333     case 0x0b: /* VT */
3334     case 0x0c: /* FF */
3335     case 0x0d: /* CR */
3336     case 0x85: /* NEL */
3337 ph10 182 break;
3338 ph10 178 }
3339     }
3340     break;
3341    
3342 nigel 77 case OP_NOT_DIGIT:
3343     for (i = 1; i <= min; i++)
3344     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3345     break;
3346    
3347     case OP_DIGIT:
3348     for (i = 1; i <= min; i++)
3349     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3350     break;
3351    
3352     case OP_NOT_WHITESPACE:
3353     for (i = 1; i <= min; i++)
3354     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3355     break;
3356    
3357     case OP_WHITESPACE:
3358     for (i = 1; i <= min; i++)
3359     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3360     break;
3361    
3362     case OP_NOT_WORDCHAR:
3363     for (i = 1; i <= min; i++)
3364     if ((md->ctypes[*eptr++] & ctype_word) != 0)
3365     RRETURN(MATCH_NOMATCH);
3366     break;
3367    
3368     case OP_WORDCHAR:
3369     for (i = 1; i <= min; i++)
3370     if ((md->ctypes[*eptr++] & ctype_word) == 0)
3371     RRETURN(MATCH_NOMATCH);
3372     break;
3373    
3374     default:
3375     RRETURN(PCRE_ERROR_INTERNAL);
3376     }
3377     }
3378    
3379     /* If min = max, continue at the same level without recursing */
3380    
3381     if (min == max) continue;
3382    
3383     /* If minimizing, we have to test the rest of the pattern before each
3384     subsequent match. Again, separate the UTF-8 case for speed, and also
3385     separate the UCP cases. */
3386    
3387     if (minimize)
3388     {
3389     #ifdef SUPPORT_UCP
3390 nigel 87 if (prop_type >= 0)
3391 nigel 77 {
3392 nigel 87 switch(prop_type)
3393 nigel 77 {
3394 nigel 87 case PT_ANY:
3395     for (fi = min;; fi++)
3396     {
3397 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3398 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3399     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3400     GETCHARINC(c, eptr);
3401     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3402     }
3403 nigel 93 /* Control never gets here */
3404 nigel 87
3405     case PT_LAMP:
3406     for (fi = min;; fi++)
3407     {
3408 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3409 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3410     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3411     GETCHARINC(c, eptr);
3412 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3413 nigel 87 if ((prop_chartype == ucp_Lu ||
3414     prop_chartype == ucp_Ll ||
3415     prop_chartype == ucp_Lt) == prop_fail_result)
3416     RRETURN(MATCH_NOMATCH);
3417     }
3418 nigel 93 /* Control never gets here */
3419 nigel 87
3420     case PT_GC:
3421     for (fi = min;; fi++)
3422     {
3423 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3424 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3425     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3426     GETCHARINC(c, eptr);
3427 ph10 349 prop_category = UCD_CATEGORY(c);
3428 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3429     RRETURN(MATCH_NOMATCH);
3430     }
3431 nigel 93 /* Control never gets here */
3432 nigel 87
3433     case PT_PC:
3434     for (fi = min;; fi++)
3435     {
3436 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3437 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3438     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3439     GETCHARINC(c, eptr);
3440 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3441 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3442     RRETURN(MATCH_NOMATCH);
3443     }
3444 nigel 93 /* Control never gets here */
3445 nigel 87
3446     case PT_SC:
3447     for (fi = min;; fi++)
3448     {
3449 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3450 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3451     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3452     GETCHARINC(c, eptr);
3453 ph10 349 prop_script = UCD_SCRIPT(c);
3454 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3455     RRETURN(MATCH_NOMATCH);
3456     }
3457 nigel 93 /* Control never gets here */
3458 nigel 87
3459     default:
3460     RRETURN(PCRE_ERROR_INTERNAL);
3461 nigel 77 }
3462     }
3463    
3464     /* Match extended Unicode sequences. We will get here only if the
3465     support is in the binary; otherwise a compile-time error occurs. */
3466    
3467     else if (ctype == OP_EXTUNI)
3468     {
3469     for (fi = min;; fi++)
3470     {
3471 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3472 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3473     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3474     GETCHARINCTEST(c, eptr);
3475 ph10 349 prop_category = UCD_CATEGORY(c);
3476 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3477     while (eptr < md->end_subject)
3478     {
3479     int len = 1;
3480     if (!utf8) c = *eptr; else
3481     {
3482     GETCHARLEN(c, eptr, len);
3483     }
3484 ph10 349 prop_category = UCD_CATEGORY(c);
3485 nigel 77 if (prop_category != ucp_M) break;
3486     eptr += len;
3487     }
3488     }
3489     }
3490    
3491     else
3492     #endif /* SUPPORT_UCP */
3493    
3494     #ifdef SUPPORT_UTF8
3495     /* UTF-8 mode */
3496     if (utf8)
3497     {
3498     for (fi = min;; fi++)
3499     {
3500 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3501 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3502 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3503 ph10 342 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3504 nigel 91 RRETURN(MATCH_NOMATCH);
3505 nigel 77
3506     GETCHARINC(c, eptr);
3507     switch(ctype)
3508     {
3509 ph10 342 case OP_ANY: /* This is the non-NL case */
3510 ph10 345 case OP_ALLANY:
3511 nigel 77 case OP_ANYBYTE:
3512     break;
3513    
3514 nigel 93 case OP_ANYNL:
3515     switch(c)
3516     {
3517     default: RRETURN(MATCH_NOMATCH);
3518     case 0x000d:
3519     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3520     break;
3521     case 0x000a:
3522 ph10 231 break;
3523    
3524 nigel 93 case 0x000b:
3525     case 0x000c:
3526     case 0x0085:
3527     case 0x2028:
3528     case 0x2029:
3529 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3530 nigel 93 break;
3531     }
3532     break;
3533    
3534 ph10 178 case OP_NOT_HSPACE:
3535     switch(c)
3536     {
3537     default: break;
3538     case 0x09: /* HT */
3539     case 0x20: /* SPACE */
3540     case 0xa0: /* NBSP */
3541     case 0x1680: /* OGHAM SPACE MARK */
3542     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3543     case 0x2000: /* EN QUAD */
3544     case 0x2001: /* EM QUAD */
3545     case 0x2002: /* EN SPACE */
3546     case 0x2003: /* EM SPACE */
3547     case 0x2004: /* THREE-PER-EM SPACE */
3548     case 0x2005: /* FOUR-PER-EM SPACE */
3549     case 0x2006: /* SIX-PER-EM SPACE */
3550     case 0x2007: /* FIGURE SPACE */
3551     case 0x2008: /* PUNCTUATION SPACE */
3552     case 0x2009: /* THIN SPACE */
3553     case 0x200A: /* HAIR SPACE */
3554     case 0x202f: /* NARROW NO-BREAK SPACE */
3555     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3556     case 0x3000: /* IDEOGRAPHIC SPACE */
3557     RRETURN(MATCH_NOMATCH);
3558     }
3559     break;
3560    
3561     case OP_HSPACE:
3562     switch(c)
3563     {
3564     default: RRETURN(MATCH_NOMATCH);
3565     case 0x09: /* HT */
3566     case 0x20: /* SPACE */
3567     case 0xa0: /* NBSP */
3568     case 0x1680: /* OGHAM SPACE MARK */
3569     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3570     case 0x2000: /* EN QUAD */
3571     case 0x2001: /* EM QUAD */
3572     case 0x2002: /* EN SPACE */
3573     case 0x2003: /* EM SPACE */
3574     case 0x2004: /* THREE-PER-EM SPACE */
3575     case 0x2005: /* FOUR-PER-EM SPACE */
3576     case 0x2006: /* SIX-PER-EM SPACE */
3577     case 0x2007: /* FIGURE SPACE */
3578     case 0x2008: /* PUNCTUATION SPACE */
3579     case 0x2009: /* THIN SPACE */
3580     case 0x200A: /* HAIR SPACE */
3581     case 0x202f: /* NARROW NO-BREAK SPACE */
3582     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3583     case 0x3000: /* IDEOGRAPHIC SPACE */
3584     break;
3585     }
3586     break;
3587    
3588     case OP_NOT_VSPACE:
3589     switch(c)
3590     {
3591     default: break;
3592     case 0x0a: /* LF */
3593     case 0x0b: /* VT */
3594     case 0x0c: /* FF */
3595     case 0x0d: /* CR */
3596     case 0x85: /* NEL */
3597     case 0x2028: /* LINE SEPARATOR */
3598     case 0x2029: /* PARAGRAPH SEPARATOR */
3599     RRETURN(MATCH_NOMATCH);
3600     }
3601     break;
3602    
3603     case OP_VSPACE:
3604     switch(c)
3605     {
3606     default: RRETURN(MATCH_NOMATCH);
3607     case 0x0a: /* LF */
3608     case 0x0b: /* VT */
3609     case 0x0c: /* FF */
3610     case 0x0d: /* CR */
3611     case 0x85: /* NEL */
3612     case 0x2028: /* LINE SEPARATOR */
3613     case 0x2029: /* PARAGRAPH SEPARATOR */
3614     break;
3615     }
3616     break;
3617    
3618 nigel 77 case OP_NOT_DIGIT:
3619     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3620     RRETURN(MATCH_NOMATCH);
3621     break;
3622    
3623     case OP_DIGIT:
3624     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3625     RRETURN(MATCH_NOMATCH);
3626     break;
3627    
3628     case OP_NOT_WHITESPACE:
3629     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3630     RRETURN(MATCH_NOMATCH);
3631     break;
3632    
3633     case OP_WHITESPACE:
3634     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3635     RRETURN(MATCH_NOMATCH);
3636     break;
3637    
3638     case OP_NOT_WORDCHAR:
3639     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3640     RRETURN(MATCH_NOMATCH);
3641     break;
3642    
3643     case OP_WORDCHAR:
3644     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3645     RRETURN(MATCH_NOMATCH);
3646     break;
3647    
3648     default:
3649     RRETURN(PCRE_ERROR_INTERNAL);
3650     }
3651     }
3652     }
3653     else
3654     #endif
3655     /* Not UTF-8 mode */
3656     {
3657     for (fi = min;; fi++)
3658     {
3659 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3660 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3661 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3662 ph10 342 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3663 nigel 91 RRETURN(MATCH_NOMATCH);
3664    
3665 nigel 77 c = *eptr++;
3666     switch(ctype)
3667     {
3668 ph10 342 case OP_ANY: /* This is the non-NL case */
3669 ph10 345 case OP_ALLANY:
3670 nigel 77 case OP_ANYBYTE:
3671     break;
3672    
3673 nigel 93 case OP_ANYNL:
3674     switch(c)
3675     {
3676     default: RRETURN(MATCH_NOMATCH);
3677     case 0x000d:
3678     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3679     break;
3680 ph10 231
3681 nigel 93 case 0x000a:
3682 ph10 231 break;
3683    
3684 nigel 93 case 0x000b:
3685     case 0x000c:
3686     case 0x0085:
3687 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3688 nigel 93 break;
3689     }
3690     break;
3691    
3692 ph10 178 case OP_NOT_HSPACE:
3693     switch(c)
3694     {
3695     default: break;
3696     case 0x09: /* HT */
3697     case 0x20: /* SPACE */
3698     case 0xa0: /* NBSP */
3699     RRETURN(MATCH_NOMATCH);
3700     }
3701     break;
3702    
3703     case OP_HSPACE:
3704     switch(c)
3705     {
3706     default: RRETURN(MATCH_NOMATCH);
3707     case 0x09: /* HT */
3708     case 0x20: /* SPACE */
3709     case 0xa0: /* NBSP */
3710     break;
3711     }
3712     break;
3713    
3714     case OP_NOT_VSPACE:
3715     switch(c)
3716     {
3717     default: break;
3718     case 0x0a: /* LF */
3719     case 0x0b: /* VT */
3720     case 0x0c: /* FF */
3721     case 0x0d: /* CR */
3722     case 0x85: /* NEL */
3723     RRETURN(MATCH_NOMATCH);
3724     }
3725     break;
3726    
3727     case OP_VSPACE:
3728     switch(c)
3729     {
3730     default: RRETURN(MATCH_NOMATCH);
3731     case 0x0a: /* LF */
3732     case 0x0b: /* VT */
3733     case 0x0c: /* FF */
3734     case 0x0d: /* CR */
3735     case 0x85: /* NEL */
3736     break;
3737     }
3738     break;
3739    
3740 nigel 77 case OP_NOT_DIGIT:
3741     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3742     break;
3743    
3744     case OP_DIGIT:
3745     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3746     break;
3747    
3748     case OP_NOT_WHITESPACE:
3749     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3750     break;
3751    
3752     case OP_WHITESPACE:
3753     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3754     break;
3755    
3756     case OP_NOT_WORDCHAR:
3757     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3758     break;
3759    
3760     case OP_WORDCHAR:
3761     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3762     break;
3763    
3764     default:
3765     RRETURN(PCRE_ERROR_INTERNAL);
3766     }
3767     }
3768     }
3769     /* Control never gets here */
3770     }
3771    
3772 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3773 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3774     UTF-8 and UCP stuff separate. */
3775    
3776     else
3777     {
3778     pp = eptr; /* Remember where we started */
3779    
3780     #ifdef SUPPORT_UCP
3781 nigel 87 if (prop_type >= 0)
3782 nigel 77 {
3783 nigel 87 switch(prop_type)
3784 nigel 77 {
3785 nigel 87 case PT_ANY:
3786     for (i = min; i < max; i++)
3787     {
3788     int len = 1;
3789     if (eptr >= md->end_subject) break;
3790     GETCHARLEN(c, eptr, len);
3791     if (prop_fail_result) break;
3792     eptr+= len;
3793     }
3794     break;
3795    
3796     case PT_LAMP:
3797     for (i = min; i < max; i++)
3798     {
3799     int len = 1;
3800     if (eptr >= md->end_subject) break;
3801     GETCHARLEN(c, eptr, len);
3802 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3803 nigel 87 if ((prop_chartype == ucp_Lu ||
3804     prop_chartype == ucp_Ll ||
3805     prop_chartype == ucp_Lt) == prop_fail_result)
3806     break;
3807     eptr+= len;
3808     }
3809     break;
3810    
3811     case PT_GC:
3812     for (i = min; i < max; i++)
3813     {
3814     int len = 1;
3815     if (eptr >= md->end_subject) break;
3816     GETCHARLEN(c, eptr, len);
3817 ph10 349 prop_category = UCD_CATEGORY(c);
3818 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3819     break;
3820     eptr+= len;
3821     }
3822     break;
3823    
3824     case PT_PC:
3825     for (i = min; i < max; i++)
3826     {
3827     int len = 1;
3828     if (eptr >= md->end_subject) break;
3829     GETCHARLEN(c, eptr, len);
3830 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3831 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3832     break;
3833     eptr+= len;
3834     }
3835     break;
3836    
3837     case PT_SC:
3838     for (i = min; i < max; i++)
3839     {
3840     int len = 1;
3841     if (eptr >= md->end_subject) break;
3842     GETCHARLEN(c, eptr, len);
3843 ph10 349 prop_script = UCD_SCRIPT(c);
3844 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3845     break;
3846     eptr+= len;
3847     }
3848     break;
3849 nigel 77 }
3850    
3851     /* eptr is now past the end of the maximum run */
3852    
3853 nigel 93 if (possessive) continue;
3854 nigel 77 for(;;)
3855     {
3856 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3857 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3858     if (eptr-- == pp) break; /* Stop if tried at original pos */
3859 ph10 207 if (utf8) BACKCHAR(eptr);
3860 nigel 77 }
3861     }
3862    
3863     /* Match extended Unicode sequences. We will get here only if the
3864     support is in the binary; otherwise a compile-time error occurs. */
3865    
3866     else if (ctype == OP_EXTUNI)
3867     {
3868     for (i = min; i < max; i++)
3869     {
3870     if (eptr >= md->end_subject) break;
3871     GETCHARINCTEST(c, eptr);
3872 ph10 349 prop_category = UCD_CATEGORY(c);
3873 nigel 77 if (prop_category == ucp_M) break;
3874     while (eptr < md->end_subject)
3875     {
3876     int len = 1;
3877     if (!utf8) c = *eptr; else
3878     {
3879     GETCHARLEN(c, eptr, len);
3880     }
3881 ph10 349 prop_category = UCD_CATEGORY(c);
3882 nigel 77 if (prop_category != ucp_M) break;
3883     eptr += len;
3884     }
3885     }
3886    
3887     /* eptr is now past the end of the maximum run */
3888    
3889 nigel 93 if (possessive) continue;
3890 nigel 77 for(;;)
3891     {
3892 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3893 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3894     if (eptr-- == pp) break; /* Stop if tried at original pos */
3895     for (;;) /* Move back over one extended */
3896     {
3897     int len = 1;
3898     if (!utf8) c = *eptr; else
3899     {
3900 ph10 207 BACKCHAR(eptr);
3901 nigel 77 GETCHARLEN(c, eptr, len);
3902     }
3903 ph10 349 prop_category = UCD_CATEGORY(c);
3904 nigel 77 if (prop_category != ucp_M) break;
3905     eptr--;
3906     }
3907     }
3908     }
3909    
3910     else
3911     #endif /* SUPPORT_UCP */
3912    
3913     #ifdef SUPPORT_UTF8
3914     /* UTF-8 mode */
3915    
3916     if (utf8)
3917     {
3918     switch(ctype)
3919     {
3920     case OP_ANY:
3921     if (max < INT_MAX)
3922     {
3923 ph10 342 for (i = min; i < max; i++)
3924 nigel 77 {
3925 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3926     eptr++;
3927     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3928 nigel 77 }
3929     }
3930    
3931     /* Handle unlimited UTF-8 repeat */
3932    
3933     else
3934     {
3935 ph10 342 for (i = min; i < max; i++)
3936 nigel 77 {
3937 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3938     eptr++;
3939     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3940 nigel 77 }
3941     }
3942     break;
3943    
3944 ph10 341 case OP_ALLANY:
3945     if (max < INT_MAX)
3946     {
3947     for (i = min; i < max; i++)
3948     {
3949     if (eptr >= md->end_subject) break;
3950     eptr++;
3951     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3952     }
3953     }
3954     else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3955     break;
3956    
3957 nigel 77 /* The byte case is the same as non-UTF8 */
3958    
3959     case OP_ANYBYTE:
3960     c = max - min;
3961 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3962     c = md->end_subject - eptr;
3963 nigel 77 eptr += c;
3964     break;
3965    
3966 nigel 93 case OP_ANYNL:
3967     for (i = min; i < max; i++)
3968     {
3969     int len = 1;
3970     if (eptr >= md->end_subject) break;
3971     GETCHARLEN(c, eptr, len);
3972     if (c == 0x000d)
3973     {
3974     if (++eptr >= md->end_subject) break;
3975     if (*eptr == 0x000a) eptr++;
3976     }
3977     else
3978     {
3979 ph10 231 if (c != 0x000a &&
3980     (md->bsr_anycrlf ||
3981     (c != 0x000b && c != 0x000c &&
3982     c != 0x0085 && c != 0x2028 && c != 0x2029)))
3983 nigel 93 break;
3984     eptr += len;
3985     }
3986     }
3987     break;
3988    
3989 ph10 178 case OP_NOT_HSPACE:
3990 ph10 182 case OP_HSPACE:
3991 ph10 178 for (i = min; i < max; i++)
3992     {
3993 ph10 182 BOOL gotspace;
3994 ph10 178 int len = 1;
3995     if (eptr >= md->end_subject) break;
3996     GETCHARLEN(c, eptr, len);
3997     switch(c)
3998 ph10 182 {
3999     default: gotspace = FALSE; break;
4000 ph10 178 case 0x09: /* HT */
4001     case 0x20: /* SPACE */
4002     case 0xa0: /* NBSP */
4003     case 0x1680: /* OGHAM SPACE MARK */
4004     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4005     case 0x2000: /* EN QUAD */
4006     case 0x2001: /* EM QUAD */
4007     case 0x2002: /* EN SPACE */
4008     case 0x2003: /* EM SPACE */
4009     case 0x2004: /* THREE-PER-EM SPACE */
4010     case 0x2005: /* FOUR-PER-EM SPACE */
4011     case 0x2006: /* SIX-PER-EM SPACE */
4012     case 0x2007: /* FIGURE SPACE */
4013     case 0x2008: /* PUNCTUATION SPACE */
4014     case 0x2009: /* THIN SPACE */
4015     case 0x200A: /* HAIR SPACE */
4016     case 0x202f: /* NARROW NO-BREAK SPACE */
4017     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4018     case 0x3000: /* IDEOGRAPHIC SPACE */
4019     gotspace = TRUE;
4020 ph10 182 break;
4021 ph10 178 }
4022     if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4023     eptr += len;
4024     }
4025     break;
4026    
4027     case OP_NOT_VSPACE:
4028 ph10 182 case OP_VSPACE:
4029 ph10 178 for (i = min; i < max; i++)
4030     {
4031 ph10 182 BOOL gotspace;
4032 ph10 178 int len = 1;
4033     if (eptr >= md->end_subject) break;
4034     GETCHARLEN(c, eptr, len);
4035     switch(c)
4036     {
4037 ph10 182 default: gotspace = FALSE; break;
4038 ph10 178 case 0x0a: /* LF */
4039     case 0x0b: /* VT */
4040     case 0x0c: /* FF */
4041     case 0x0d: /* CR */
4042     case 0x85: /* NEL */
4043     case 0x2028: /* LINE SEPARATOR */
4044     case 0x2029: /* PARAGRAPH SEPARATOR */
4045     gotspace = TRUE;
4046     break;
4047     }
4048 ph10 182 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4049 ph10 178 eptr += len;
4050     }
4051     break;
4052    
4053 nigel 77 case OP_NOT_DIGIT:
4054     for (i = min; i < max; i++)
4055     {
4056     int len = 1;
4057     if (eptr >= md->end_subject) break;
4058     GETCHARLEN(c, eptr, len);
4059     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4060     eptr+= len;
4061     }
4062     break;
4063    
4064     case OP_DIGIT:
4065     for (i = min; i < max; i++)
4066     {
4067     int len = 1;
4068     if (eptr >= md->end_subject) break;
4069     GETCHARLEN(c, eptr, len);
4070     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4071     eptr+= len;
4072     }
4073     break;
4074    
4075     case OP_NOT_WHITESPACE:
4076     for (i = min; i < max; i++)
4077     {
4078     int len = 1;
4079     if (eptr >= md->end_subject) break;
4080     GETCHARLEN(c, eptr, len);
4081     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4082     eptr+= len;
4083     }
4084     break;
4085    
4086     case OP_WHITESPACE:
4087     for (i = min; i < max; i++)
4088     {
4089     int len = 1;
4090     if (eptr >= md->end_subject) break;
4091     GETCHARLEN(c, eptr, len);
4092     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4093     eptr+= len;
4094     }
4095     break;
4096    
4097     case OP_NOT_WORDCHAR:
4098     for (i = min; i < max; i++)
4099     {
4100     int len = 1;
4101     if (eptr >= md->end_subject) break;
4102     GETCHARLEN(c, eptr, len);
4103     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4104     eptr+= len;
4105     }
4106     break;
4107    
4108     case OP_WORDCHAR:
4109     for (i = min; i < max; i++)
4110     {
4111     int len = 1;
4112     if (eptr >= md->end_subject) break;
4113     GETCHARLEN(c, eptr, len);
4114     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4115     eptr+= len;
4116     }
4117     break;
4118    
4119     default:
4120     RRETURN(PCRE_ERROR_INTERNAL);
4121     }
4122    
4123     /* eptr is now past the end of the maximum run */
4124    
4125 nigel 93 if (possessive) continue;
4126 nigel 77 for(;;)
4127     {
4128 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4129 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4130     if (eptr-- == pp) break; /* Stop if tried at original pos */
4131     BACKCHAR(eptr);
4132     }
4133     }
4134     else
4135 ph10 207 #endif /* SUPPORT_UTF8 */
4136 nigel 77
4137     /* Not UTF-8 mode */
4138     {
4139     switch(ctype)
4140     {
4141     case OP_ANY:
4142 ph10 342 for (i = min; i < max; i++)
4143 nigel 77 {
4144 ph10