/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 354 - (hide annotations) (download)
Mon Jul 7 16:30:33 2008 UTC (6 years ago) by ph10
File MIME type: text/plain
File size: 150585 byte(s)
Fix caseless backreferences for non-ASCII characters.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 305 Copyright (c) 1997-2008 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 nigel 77 /* Flag bits for the match() function */
61    
62 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
63     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64 nigel 77
65     /* Non-error returns from the match() function. Error returns are externally
66     defined PCRE_ERROR_xxx codes, which are all negative. */
67    
68     #define MATCH_MATCH 1
69     #define MATCH_NOMATCH 0
70    
71 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
72 ph10 210 negative to avoid the external error codes. */
73    
74     #define MATCH_COMMIT (-999)
75     #define MATCH_PRUNE (-998)
76     #define MATCH_SKIP (-997)
77     #define MATCH_THEN (-996)
78    
79 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
80     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81     because the offset vector is always a multiple of 3 long. */
82    
83     #define REC_STACK_SAVE_MAX 30
84    
85     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86    
87     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
88     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89    
90    
91    
92     #ifdef DEBUG
93     /*************************************************
94     * Debugging function to print chars *
95     *************************************************/
96    
97     /* Print a sequence of chars in printable format, stopping at the end of the
98     subject if requested.
99    
100     Arguments:
101     p points to characters
102     length number to print
103     is_subject TRUE if printing from within md->start_subject
104     md pointer to matching data block, if is_subject is TRUE
105    
106     Returns: nothing
107     */
108    
109     static void
110     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111     {
112 nigel 93 unsigned int c;
113 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114     while (length-- > 0)
115     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116     }
117     #endif
118    
119    
120    
121     /*************************************************
122     * Match a back-reference *
123     *************************************************/
124    
125     /* If a back reference hasn't been set, the length that is passed is greater
126     than the number of characters left in the string, so the match fails.
127    
128     Arguments:
129     offset index into the offset vector
130     eptr points into the subject
131     length length to be matched
132     md points to match data block
133     ims the ims flags
134    
135     Returns: TRUE if matched
136     */
137    
138     static BOOL
139 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 nigel 77 unsigned long int ims)
141     {
142 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
143 nigel 77
144     #ifdef DEBUG
145     if (eptr >= md->end_subject)
146     printf("matching subject <null>");
147     else
148     {
149     printf("matching subject ");
150     pchars(eptr, length, TRUE, md);
151     }
152     printf(" against backref ");
153     pchars(p, length, FALSE, md);
154     printf("\n");
155     #endif
156    
157     /* Always fail if not enough characters left */
158    
159     if (length > md->end_subject - eptr) return FALSE;
160    
161 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162     properly if Unicode properties are supported. Otherwise, we can check only
163     ASCII characters. */
164 nigel 77
165     if ((ims & PCRE_CASELESS) != 0)
166     {
167 ph10 354 #ifdef SUPPORT_UTF8
168     #ifdef SUPPORT_UCP
169     if (md->utf8)
170     {
171     USPTR endptr = eptr + length;
172     while (eptr < endptr)
173     {
174     int c, d;
175     GETCHARINC(c, eptr);
176     GETCHARINC(d, p);
177     if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178     }
179     }
180     else
181     #endif
182     #endif
183    
184     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185     is no UCP support. */
186    
187 nigel 77 while (length-- > 0)
188 ph10 354 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 nigel 77 }
190 ph10 354
191     /* In the caseful case, we can just compare the bytes, whether or not we
192     are in UTF-8 mode. */
193    
194 nigel 77 else
195     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196    
197     return TRUE;
198     }
199    
200    
201    
202     /***************************************************************************
203     ****************************************************************************
204     RECURSION IN THE match() FUNCTION
205    
206 nigel 87 The match() function is highly recursive, though not every recursive call
207     increases the recursive depth. Nevertheless, some regular expressions can cause
208     it to recurse to a great depth. I was writing for Unix, so I just let it call
209     itself recursively. This uses the stack for saving everything that has to be
210     saved for a recursive call. On Unix, the stack can be large, and this works
211     fine.
212 nigel 77
213 nigel 87 It turns out that on some non-Unix-like systems there are problems with
214     programs that use a lot of stack. (This despite the fact that every last chip
215     has oodles of memory these days, and techniques for extending the stack have
216     been known for decades.) So....
217 nigel 77
218     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219     calls by keeping local variables that need to be preserved in blocks of memory
220 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
221 nigel 77 achieve this so that the actual code doesn't look very different to what it
222     always used to.
223 ph10 164
224 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
225 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
226     Switzer, the use of longjmp() has been abolished, at the cost of having to
227     provide a unique number for each call to RMATCH. There is no way of generating
228     a sequence of numbers at compile time in C. I have given them names, to make
229     them stand out more clearly.
230    
231     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
234     don't have indeterminate values; this has meant that the frame size can be
235 ph10 164 reduced because the result can be "passed back" by straight setting of the
236     variable instead of being passed in the frame.
237 nigel 77 ****************************************************************************
238     ***************************************************************************/
239    
240 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241     below must be updated in sync. */
242 nigel 77
/* Each RMn value is passed as the final ("rw") argument of an RMATCH()
invocation. In the NO_RECURSE definition of RMATCH the value is stored in the
current frame's Xwhere field and is also pasted into the name of a label
(L_RMn) placed immediately after the jump to HEAP_RECURSE. Numbering starts at
1. */
243 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
244     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
245     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
246     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
247 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
248 ph10 212 RM51, RM52, RM53, RM54 };
249 ph10 164
250 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
251 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 ph10 164 actually used in this definition. */
253 nigel 77
/* Stack-based versions: REGISTER expands to the "register" keyword, RMATCH is
a genuine recursive call to match() whose result is left in rrc, and RRETURN is
an ordinary return. The current mstart and rdepth+1 are always passed on. */
254     #ifndef NO_RECURSE
255     #define REGISTER register
256 ph10 164
257 nigel 87 #ifdef DEBUG
/* Debugging variants: report on stdout the source line of each call to, and
each return from, match(). */
258 ph10 164 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
259 nigel 87 { \
260     printf("match() called in line %d\n", __LINE__); \
261 ph10 168 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
262 nigel 87 printf("to line %d\n", __LINE__); \
263     }
264     #define RRETURN(ra) \
265     { \
266     printf("match() returned %d from line %d ", ra, __LINE__); \
267     return ra; \
268     }
269     #else
/* Production variants: no tracing. */
270 ph10 164 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
271 ph10 168 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
272 nigel 77 #define RRETURN(ra) return ra
273 nigel 87 #endif
274    
275 nigel 77 #else
276    
277    
278 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
279     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280     argument of match(), which never changes. */
281 nigel 77
/* In this configuration REGISTER expands to nothing. */
282     #define REGISTER
283    
/* RMATCH: obtain a new heapframe from pcre_stack_malloc, record rw in the
current frame's Xwhere, fill in the new frame's argument fields (Xmstart is
copied from the current mstart; Xrdepth is the current frame's depth plus one),
link the new frame to the current one through Xprevframe, make it the current
frame, and jump to HEAP_RECURSE. The label L_<rw> marks the resume point; it is
presumably reached from the HEAP_RETURN dispatch code, which lies outside this
definition. NOTE(review): the result of pcre_stack_malloc is not checked for
NULL before use. */
284 ph10 164 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
285 nigel 77 {\
286     heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
287 ph10 164 frame->Xwhere = rw; \
288     newframe->Xeptr = ra;\
289     newframe->Xecode = rb;\
290 ph10 168 newframe->Xmstart = mstart;\
291 ph10 164 newframe->Xoffset_top = rc;\
292     newframe->Xims = re;\
293     newframe->Xeptrb = rf;\
294     newframe->Xflags = rg;\
295     newframe->Xrdepth = frame->Xrdepth + 1;\
296     newframe->Xprevframe = frame;\
297     frame = newframe;\
298     DPRINTF(("restarting from line %d\n", __LINE__));\
299     goto HEAP_RECURSE;\
300     L_##rw:\
301     DPRINTF(("jumped back to line %d\n", __LINE__));\
302 nigel 77 }
303    
/* RRETURN: step back to the previous frame and release the current one with
pcre_stack_free. If a previous frame exists, place the result in rrc and jump
to HEAP_RETURN; if not (Xprevframe was NULL), return the result from the
function. */
304     #define RRETURN(ra)\
305     {\
306     heapframe *newframe = frame;\
307     frame = newframe->Xprevframe;\
308     (pcre_stack_free)(newframe);\
309     if (frame != NULL)\
310     {\
311 ph10 164 rrc = ra;\
312     goto HEAP_RETURN;\
313 nigel 77 }\
314     return ra;\
315     }
316    
317    
318     /* Structure for remembering the local variables in a private frame */
319    
/* One frame of the heap-based substitute for the C call stack, used when
NO_RECURSE is defined. Inside match() the ordinary variable names (eptr, ecode,
length, ...) are #defined to the corresponding frame->Xname fields, so each
frame holds one invocation's arguments and preserved locals. */
320     typedef struct heapframe {
321     struct heapframe *Xprevframe;   /* Calling frame; NULL at the top level */
322    
323     /* Function arguments that may change */
324    
325     const uschar *Xeptr;
326     const uschar *Xecode;
327 ph10 172 const uschar *Xmstart;
328 nigel 77 int Xoffset_top;
329     long int Xims;   /* NOTE(review): match() declares ims as unsigned long int */
330     eptrblock *Xeptrb;
331     int Xflags;
332 nigel 91 unsigned int Xrdepth;
333 nigel 77
334     /* Function local variables */
335    
336     const uschar *Xcallpat;
337     const uschar *Xcharptr;
338     const uschar *Xdata;
339     const uschar *Xnext;
340     const uschar *Xpp;
341     const uschar *Xprev;
342     const uschar *Xsaved_eptr;
343    
344     recursion_info Xnew_recursive;
345    
346     BOOL Xcur_is_word;
347     BOOL Xcondition;
348     BOOL Xprev_is_word;
349    
350     unsigned long int Xoriginal_ims;
351    
/* These fields exist only when Unicode property support is compiled in. */
352     #ifdef SUPPORT_UCP
353     int Xprop_type;
354 nigel 87 int Xprop_value;
355 nigel 77 int Xprop_fail_result;
356     int Xprop_category;
357     int Xprop_chartype;
358 nigel 87 int Xprop_script;
359 ph10 123 int Xoclength;
360     uschar Xocchars[8];
361 nigel 77 #endif
362    
363     int Xctype;
364 nigel 93 unsigned int Xfc;
365 nigel 77 int Xfi;
366     int Xlength;
367     int Xmax;
368     int Xmin;
369     int Xnumber;
370     int Xoffset;
371     int Xop;
372     int Xsave_capture_last;
373     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
374     int Xstacksave[REC_STACK_SAVE_MAX];
375    
376     eptrblock Xnewptrb;
377    
378 ph10 164 /* Where to jump back to: the RMn value stored by RMATCH before "recursing" */
379 nigel 77
380 ph10 164 int Xwhere;
381 ph10 165
382 nigel 77 } heapframe;
383    
384     #endif
385    
386    
387     /***************************************************************************
388     ***************************************************************************/
389    
390    
391    
392     /*************************************************
393     * Match from current position *
394     *************************************************/
395    
396 nigel 93 /* This function is called recursively in many circumstances. Whenever it
397 nigel 77 returns a negative (error) response, the outer incarnation must also return the
398     same response.
399    
400     Performance note: It might be tempting to extract commonly used fields from the
401     md structure (e.g. utf8, end_subject) into individual variables to improve
402     performance. Tests using gcc on a SPARC disproved this; in the first case, it
403     made performance worse.
404    
405     Arguments:
406 nigel 93 eptr pointer to current character in subject
407     ecode pointer to current position in compiled code
408 ph10 168 mstart pointer to the current match start position (can be modified
409 ph10 172 by encountering \K)
410 nigel 77 offset_top current top pointer
411     md pointer to "static" info for the match
412     ims current /i, /m, and /s options
413     eptrb pointer to chain of blocks containing eptr at start of
414     brackets - for testing for empty matches
415     flags can contain
416     match_condassert - this is an assertion condition
417 nigel 93 match_cbegroup - this is the start of an unlimited repeat
418     group that can match an empty string
419 nigel 87 rdepth the recursion depth
420 nigel 77
421     Returns: MATCH_MATCH if matched ) these values are >= 0
422     MATCH_NOMATCH if failed to match )
423     a negative PCRE_ERROR_xxx value if aborted by an error condition
424 nigel 87 (e.g. stopped by repeated call or recursion limit)
425 nigel 77 */
426    
427     static int
428 ph10 172 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
429 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
430 nigel 91 int flags, unsigned int rdepth)
431 nigel 77 {
432     /* These variables do not need to be preserved over recursion in this function,
433 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
434     "register" because they are used a lot in loops. */
435 nigel 77
436 nigel 91 register int rrc; /* Returns from recursive calls */
437     register int i; /* Used for loops not involving calls to RMATCH() */
438 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
439 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
440 nigel 77
441 nigel 93 BOOL minimize, possessive; /* Quantifier options */
442    
443 nigel 77 /* When recursion is not being used, all "local" variables that have to be
444     preserved over calls to RMATCH() are part of a "frame" which is obtained from
445     heap storage. Set up the top-level frame here; others are obtained from the
446     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
447    
448     #ifdef NO_RECURSE
449     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
450     frame->Xprevframe = NULL; /* Marks the top level */
451    
452     /* Copy in the original argument variables */
453    
454     frame->Xeptr = eptr;
455     frame->Xecode = ecode;
456 ph10 168 frame->Xmstart = mstart;
457 nigel 77 frame->Xoffset_top = offset_top;
458     frame->Xims = ims;
459     frame->Xeptrb = eptrb;
460     frame->Xflags = flags;
461 nigel 87 frame->Xrdepth = rdepth;
462 nigel 77
463     /* This is where control jumps back to in order to effect "recursion" */
464    
465     HEAP_RECURSE:
466    
467     /* Macros make the argument variables come from the current frame */
468    
469     #define eptr frame->Xeptr
470     #define ecode frame->Xecode
471 ph10 168 #define mstart frame->Xmstart
472 nigel 77 #define offset_top frame->Xoffset_top
473     #define ims frame->Xims
474     #define eptrb frame->Xeptrb
475     #define flags frame->Xflags
476 nigel 87 #define rdepth frame->Xrdepth
477 nigel 77
478     /* Ditto for the local variables */
479    
480     #ifdef SUPPORT_UTF8
481     #define charptr frame->Xcharptr
482     #endif
483     #define callpat frame->Xcallpat
484     #define data frame->Xdata
485     #define next frame->Xnext
486     #define pp frame->Xpp
487     #define prev frame->Xprev
488     #define saved_eptr frame->Xsaved_eptr
489    
490     #define new_recursive frame->Xnew_recursive
491    
492     #define cur_is_word frame->Xcur_is_word
493     #define condition frame->Xcondition
494     #define prev_is_word frame->Xprev_is_word
495    
496     #define original_ims frame->Xoriginal_ims
497    
498     #ifdef SUPPORT_UCP
499     #define prop_type frame->Xprop_type
500 nigel 87 #define prop_value frame->Xprop_value
501 nigel 77 #define prop_fail_result frame->Xprop_fail_result
502     #define prop_category frame->Xprop_category
503     #define prop_chartype frame->Xprop_chartype
504 nigel 87 #define prop_script frame->Xprop_script
505 ph10 115 #define oclength frame->Xoclength
506     #define occhars frame->Xocchars
507 nigel 77 #endif
508    
509     #define ctype frame->Xctype
510     #define fc frame->Xfc
511     #define fi frame->Xfi
512     #define length frame->Xlength
513     #define max frame->Xmax
514     #define min frame->Xmin
515     #define number frame->Xnumber
516     #define offset frame->Xoffset
517     #define op frame->Xop
518     #define save_capture_last frame->Xsave_capture_last
519     #define save_offset1 frame->Xsave_offset1
520     #define save_offset2 frame->Xsave_offset2
521     #define save_offset3 frame->Xsave_offset3
522     #define stacksave frame->Xstacksave
523    
524     #define newptrb frame->Xnewptrb
525    
526     /* When recursion is being used, local variables are allocated on the stack and
527     get preserved during recursion in the normal way. In this environment, fi and
528     i, and fc and c, can be the same variables. */
529    
530 nigel 93 #else /* NO_RECURSE not defined */
531 nigel 77 #define fi i
532     #define fc c
533    
534    
535 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
536     const uschar *charptr; /* in small blocks of the code. My normal */
537     #endif /* style of coding would have declared */
538     const uschar *callpat; /* them within each of those blocks. */
539     const uschar *data; /* However, in order to accommodate the */
540     const uschar *next; /* version of this code that uses an */
541     USPTR pp; /* external "stack" implemented on the */
542     const uschar *prev; /* heap, it is easier to declare them all */
543     USPTR saved_eptr; /* here, so the declarations can be cut */
544     /* out in a block. The only declarations */
545     recursion_info new_recursive; /* within blocks below are for variables */
546     /* that do not have to be preserved over */
547     BOOL cur_is_word; /* a recursive call to RMATCH(). */
548     BOOL condition;
549 nigel 77 BOOL prev_is_word;
550    
551     unsigned long int original_ims;
552    
553     #ifdef SUPPORT_UCP
554     int prop_type;
555 nigel 87 int prop_value;
556 nigel 77 int prop_fail_result;
557     int prop_category;
558     int prop_chartype;
559 nigel 87 int prop_script;
560 ph10 115 int oclength;
561     uschar occhars[8];
562 nigel 77 #endif
563    
564     int ctype;
565     int length;
566     int max;
567     int min;
568     int number;
569     int offset;
570     int op;
571     int save_capture_last;
572     int save_offset1, save_offset2, save_offset3;
573     int stacksave[REC_STACK_SAVE_MAX];
574    
575     eptrblock newptrb;
576 nigel 93 #endif /* NO_RECURSE */
577 nigel 77
578     /* These statements are here to stop the compiler complaining about uninitialized
579     variables. */
580    
581     #ifdef SUPPORT_UCP
582 nigel 87 prop_value = 0;
583 nigel 77 prop_fail_result = 0;
584     #endif
585    
586 nigel 93
587 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
588     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
589     used. Thanks to Ian Taylor for noticing this possibility and sending the
590     original patch. */
591    
592     TAIL_RECURSE:
593    
594 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
595     are specified by the macro RMATCH and RRETURN is used to return. When
596     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
597     and a "return", respectively (possibly with some debugging if DEBUG is
598     defined). However, RMATCH isn't like a function call because it's quite a
599     complicated macro. It has to be used in one particular way. This shouldn't,
600     however, impact performance when true recursion is being used. */
601 nigel 77
602 ph10 164 #ifdef SUPPORT_UTF8
603     utf8 = md->utf8; /* Local copy of the flag */
604     #else
605     utf8 = FALSE;
606     #endif
607    
608 nigel 87 /* First check that we haven't called match() too many times, or that we
609     haven't exceeded the recursive call limit. */
610    
611 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
612 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
613 nigel 77
614     original_ims = ims; /* Save for resetting on ')' */
615 nigel 91
616 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
617     string, the match_cbegroup flag is set. When this is the case, add the current
618     subject pointer to the chain of such remembered pointers, to be checked when we
619     hit the closing ket, in order to break infinite loops that match no characters.
620 ph10 197 When match() is called in other circumstances, don't add to the chain. The
621     match_cbegroup flag must NOT be used with tail recursion, because the memory
622     block that is used is on the stack, so a new one may be required for each
623     match(). */
624 nigel 77
625 nigel 93 if ((flags & match_cbegroup) != 0)
626 nigel 77 {
627 ph10 197 newptrb.epb_saved_eptr = eptr;
628     newptrb.epb_prev = eptrb;
629     eptrb = &newptrb;
630 nigel 77 }
631    
632 nigel 93 /* Now start processing the opcodes. */
633 nigel 77
634     for (;;)
635     {
636 nigel 93 minimize = possessive = FALSE;
637 nigel 77 op = *ecode;
638    
639     /* For partial matching, remember if we ever hit the end of the subject after
640     matching at least one subject character. */
641    
642     if (md->partial &&
643     eptr >= md->end_subject &&
644 ph10 168 eptr > mstart)
645 nigel 77 md->hitend = TRUE;
646 ph10 208
647 nigel 93 switch(op)
648     {
649 ph10 210 case OP_FAIL:
650 ph10 212 RRETURN(MATCH_NOMATCH);
651 ph10 211
652 ph10 210 case OP_PRUNE:
653     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
654     ims, eptrb, flags, RM51);
655     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
656 ph10 212 RRETURN(MATCH_PRUNE);
657 ph10 211
658 ph10 210 case OP_COMMIT:
659     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
660     ims, eptrb, flags, RM52);
661     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
662 ph10 212 RRETURN(MATCH_COMMIT);
663 ph10 211
664 ph10 210 case OP_SKIP:
665     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
666     ims, eptrb, flags, RM53);
667     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
668 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
669 ph10 212 RRETURN(MATCH_SKIP);
670 ph10 211
671 ph10 210 case OP_THEN:
672     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
673 ph10 212 ims, eptrb, flags, RM54);
674 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
675 ph10 212 RRETURN(MATCH_THEN);
676 ph10 211
677 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
678     the current subject position in the working slot at the top of the vector.
679     We mustn't change the current values of the data slot, because they may be
680     set from a previous iteration of this group, and be referred to by a
681     reference inside the group.
682 nigel 77
683 nigel 93 If the bracket fails to match, we need to restore this value and also the
684     values of the final offsets, in case they were set by a previous iteration
685     of the same bracket.
686 nigel 77
687 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
688     a non-capturing bracket. Don't worry about setting the flag for the error
689     case here; that is handled in the code for KET. */
690 nigel 77
691 nigel 93 case OP_CBRA:
692     case OP_SCBRA:
693     number = GET2(ecode, 1+LINK_SIZE);
694 nigel 77 offset = number << 1;
695    
696     #ifdef DEBUG
697 nigel 93 printf("start bracket %d\n", number);
698     printf("subject=");
699 nigel 77 pchars(eptr, 16, TRUE, md);
700     printf("\n");
701     #endif
702    
703     if (offset < md->offset_max)
704     {
705     save_offset1 = md->offset_vector[offset];
706     save_offset2 = md->offset_vector[offset+1];
707     save_offset3 = md->offset_vector[md->offset_end - number];
708     save_capture_last = md->capture_last;
709    
710     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
711     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
712    
713 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
714 nigel 77 do
715     {
716 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
717     ims, eptrb, flags, RM1);
718 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
719 nigel 77 md->capture_last = save_capture_last;
720     ecode += GET(ecode, 1);
721     }
722     while (*ecode == OP_ALT);
723    
724     DPRINTF(("bracket %d failed\n", number));
725    
726     md->offset_vector[offset] = save_offset1;
727     md->offset_vector[offset+1] = save_offset2;
728     md->offset_vector[md->offset_end - number] = save_offset3;
729    
730     RRETURN(MATCH_NOMATCH);
731     }
732    
733 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
734     as a non-capturing bracket. */
735 nigel 77
736 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
737     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
738    
739 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
740 nigel 77
741 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
742     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
743    
744 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
745     final alternative within the brackets, we would return the result of a
746     recursive call to match() whatever happened. We can reduce stack usage by
747 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
748     is set.*/
749 nigel 77
750 nigel 93 case OP_BRA:
751     case OP_SBRA:
752     DPRINTF(("start non-capturing bracket\n"));
753     flags = (op >= OP_SBRA)? match_cbegroup : 0;
754 nigel 91 for (;;)
755 nigel 77 {
756 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
757 nigel 93 {
758 ph10 197 if (flags == 0) /* Not a possibly empty group */
759     {
760     ecode += _pcre_OP_lengths[*ecode];
761     DPRINTF(("bracket 0 tail recursion\n"));
762     goto TAIL_RECURSE;
763     }
764    
765     /* Possibly empty group; can't use tail recursion. */
766    
767     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
768     eptrb, flags, RM48);
769     RRETURN(rrc);
770 nigel 93 }
771 nigel 91
772     /* For non-final alternatives, continue the loop for a NOMATCH result;
773     otherwise return. */
774    
775 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
776     eptrb, flags, RM2);
777 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
778 nigel 77 ecode += GET(ecode, 1);
779     }
780 nigel 91 /* Control never reaches here. */
781 nigel 77
782     /* Conditional group: compilation checked that there are no more than
783     two branches. If the condition is false, skipping the first branch takes us
784     past the end if there is only one branch, but that's OK because that is
785 nigel 91 exactly what going to the ket would do. As there is only one branch to be
786     obeyed, we can use tail recursion to avoid using another stack frame. */
787 nigel 77
788     case OP_COND:
789 nigel 93 case OP_SCOND:
790     if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
791 nigel 77 {
792 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
793     condition = md->recursive != NULL &&
794     (offset == RREF_ANY || offset == md->recursive->group_num);
795     ecode += condition? 3 : GET(ecode, 1);
796     }
797    
798     else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
799     {
800 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
801 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
802     ecode += condition? 3 : GET(ecode, 1);
803 nigel 77 }
804    
805 nigel 93 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
806     {
807     condition = FALSE;
808     ecode += GET(ecode, 1);
809     }
810    
811 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
812 nigel 93 the final argument match_condassert causes it to stop at the end of an
813     assertion. */
814 nigel 77
815     else
816     {
817 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
818     match_condassert, RM3);
819 nigel 77 if (rrc == MATCH_MATCH)
820     {
821 nigel 93 condition = TRUE;
822     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
823 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
824     }
825 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
826 nigel 77 {
827     RRETURN(rrc); /* Need braces because of following else */
828     }
829 nigel 93 else
830     {
831     condition = FALSE;
832     ecode += GET(ecode, 1);
833     }
834     }
835 nigel 91
836 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
837 ph10 197 we can use tail recursion to avoid using another stack frame, except when
838     match_cbegroup is required for an unlimited repeat of a possibly empty
839     group. If the second alternative doesn't exist, we can just plough on. */
840 nigel 91
841 nigel 93 if (condition || *ecode == OP_ALT)
842     {
843 nigel 91 ecode += 1 + LINK_SIZE;
844 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
845     {
846     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
847     RRETURN(rrc);
848     }
849     else /* Group must match something */
850     {
851     flags = 0;
852     goto TAIL_RECURSE;
853     }
854 nigel 77 }
855 ph10 197 else /* Condition false & no 2nd alternative */
856 nigel 93 {
857     ecode += 1 + LINK_SIZE;
858     }
859     break;
860 nigel 77
861    
862 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
863     recursion, we should restore the offsets appropriately and continue from
864     after the call. */
865 nigel 77
866 ph10 210 case OP_ACCEPT:
867 nigel 77 case OP_END:
868     if (md->recursive != NULL && md->recursive->group_num == 0)
869     {
870     recursion_info *rec = md->recursive;
871 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
872 nigel 77 md->recursive = rec->prevrec;
873     memmove(md->offset_vector, rec->offset_save,
874     rec->saved_max * sizeof(int));
875 ph10 168 mstart = rec->save_start;
876 nigel 77 ims = original_ims;
877     ecode = rec->after_call;
878     break;
879     }
880    
881     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
882     string - backtracking will then try other alternatives, if any. */
883    
884 ph10 168 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
885     md->end_match_ptr = eptr; /* Record where we ended */
886     md->end_offset_top = offset_top; /* and how many extracts were taken */
887 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
888 nigel 77 RRETURN(MATCH_MATCH);
889    
890     /* Change option settings */
891    
892     case OP_OPT:
893     ims = ecode[1];
894     ecode += 2;
895     DPRINTF(("ims set to %02lx\n", ims));
896     break;
897    
898     /* Assertion brackets. Check the alternative branches in turn - the
899     matching won't pass the KET for an assertion. If any one branch matches,
900     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
901     start of each branch to move the current point backwards, so the code at
902     this level is identical to the lookahead case. */
903    
904     case OP_ASSERT:
905     case OP_ASSERTBACK:
906     do
907     {
908 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
909     RM4);
910 nigel 77 if (rrc == MATCH_MATCH) break;
911 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
912 nigel 77 ecode += GET(ecode, 1);
913     }
914     while (*ecode == OP_ALT);
915     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
916    
917     /* If checking an assertion for a condition, return MATCH_MATCH. */
918    
919     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
920    
921     /* Continue from after the assertion, updating the offsets high water
922     mark, since extracts may have been taken during the assertion. */
923    
924     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
925     ecode += 1 + LINK_SIZE;
926     offset_top = md->end_offset_top;
927     continue;
928    
929     /* Negative assertion: all branches must fail to match */
930    
931     case OP_ASSERT_NOT:
932     case OP_ASSERTBACK_NOT:
933     do
934     {
935 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
936     RM5);
937 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
938 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
939 nigel 77 ecode += GET(ecode,1);
940     }
941     while (*ecode == OP_ALT);
942    
943     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
944    
945     ecode += 1 + LINK_SIZE;
946     continue;
947    
948     /* Move the subject pointer back. This occurs only at the start of
949     each branch of a lookbehind assertion. If we are too close to the start to
950     move back, this match function fails. When working with UTF-8 we move
951     back a number of characters, not bytes. */
952    
953     case OP_REVERSE:
954     #ifdef SUPPORT_UTF8
955     if (utf8)
956     {
957 nigel 93 i = GET(ecode, 1);
958     while (i-- > 0)
959 nigel 77 {
960     eptr--;
961     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
962 ph10 207 BACKCHAR(eptr);
963 nigel 77 }
964     }
965     else
966     #endif
967    
968     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
969    
970     {
971 nigel 93 eptr -= GET(ecode, 1);
972 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
973     }
974    
975     /* Skip to next op code */
976    
977     ecode += 1 + LINK_SIZE;
978     break;
979    
980     /* The callout item calls an external function, if one is provided, passing
981     details of the match so far. This is mainly for debugging, though the
982     function is able to force a failure. */
983    
984     case OP_CALLOUT:
985     if (pcre_callout != NULL)
986     {
987     pcre_callout_block cb;
988     cb.version = 1; /* Version 1 of the callout block */
989     cb.callout_number = ecode[1];
990     cb.offset_vector = md->offset_vector;
991 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
992 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
993 ph10 168 cb.start_match = mstart - md->start_subject;
994 nigel 77 cb.current_position = eptr - md->start_subject;
995     cb.pattern_position = GET(ecode, 2);
996     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
997     cb.capture_top = offset_top/2;
998     cb.capture_last = md->capture_last;
999     cb.callout_data = md->callout_data;
1000     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1001     if (rrc < 0) RRETURN(rrc);
1002     }
1003     ecode += 2 + 2*LINK_SIZE;
1004     break;
1005    
1006     /* Recursion either matches the current regex, or some subexpression. The
1007     offset data is the offset to the starting bracket from the start of the
1008     whole pattern. (This is so that it works from duplicated subpatterns.)
1009    
1010     If there are any capturing brackets started but not finished, we have to
1011     save their starting points and reinstate them after the recursion. However,
1012     we don't know how many such there are (offset_top records the completed
1013     total) so we just have to save all the potential data. There may be up to
1014     65535 such values, which is too large to put on the stack, but using malloc
1015     for small numbers seems expensive. As a compromise, the stack is used when
1016     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1017     is used. If the malloc fails, the saved values cannot be kept, so the
1018     recursion is abandoned and PCRE_ERROR_NOMEMORY is returned instead of
1019     continuing with offsets that might be wrong.
1020    
1021     There are also other values that have to be saved. We use a chained
1022     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1023     for the original version of this logic. */
1024    
1025     case OP_RECURSE:
1026     {
1027     callpat = md->start_code + GET(ecode, 1);
1028 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1029     GET2(callpat, 1 + LINK_SIZE);
1030 nigel 77
1031     /* Add to "recursing stack" */
1032    
1033     new_recursive.prevrec = md->recursive;
1034     md->recursive = &new_recursive;
1035    
1036     /* Find where to continue from afterwards */
1037    
1038     ecode += 1 + LINK_SIZE;
1039     new_recursive.after_call = ecode;
1040    
1041     /* Now save the offset data. */
1042    
1043     new_recursive.saved_max = md->offset_end;
1044     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1045     new_recursive.offset_save = stacksave;
1046     else
1047     {
1048     new_recursive.offset_save =
1049     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1050     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1051     }
1052    
1053     memcpy(new_recursive.offset_save, md->offset_vector,
1054     new_recursive.saved_max * sizeof(int));
1055 ph10 168 new_recursive.save_start = mstart;
1056     mstart = eptr;
1057 nigel 77
1058     /* OK, now we can do the recursion. For each top-level alternative we
1059     restore the offset and recursion data. */
1060    
1061     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1062 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1063 nigel 77 do
1064     {
1065 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1066     md, ims, eptrb, flags, RM6);
1067 nigel 77 if (rrc == MATCH_MATCH)
1068     {
1069 nigel 87 DPRINTF(("Recursion matched\n"));
1070 nigel 77 md->recursive = new_recursive.prevrec;
1071     if (new_recursive.offset_save != stacksave)
1072     (pcre_free)(new_recursive.offset_save);
1073     RRETURN(MATCH_MATCH);
1074     }
1075 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1076 nigel 87 {
1077     DPRINTF(("Recursion gave error %d\n", rrc));
1078     RRETURN(rrc);
1079     }
1080 nigel 77
1081     md->recursive = &new_recursive;
1082     memcpy(md->offset_vector, new_recursive.offset_save,
1083     new_recursive.saved_max * sizeof(int));
1084     callpat += GET(callpat, 1);
1085     }
1086     while (*callpat == OP_ALT);
1087    
1088     DPRINTF(("Recursion didn't match\n"));
1089     md->recursive = new_recursive.prevrec;
1090     if (new_recursive.offset_save != stacksave)
1091     (pcre_free)(new_recursive.offset_save);
1092     RRETURN(MATCH_NOMATCH);
1093     }
1094     /* Control never reaches here */
1095    
1096     /* "Once" brackets are like assertion brackets except that after a match,
1097     the point in the subject string is not moved back. Thus there can never be
1098     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1099     Check the alternative branches in turn - the matching won't pass the KET
1100     for this kind of subpattern. If any one branch matches, we carry on as at
1101     the end of a normal bracket, leaving the subject pointer. */
1102    
1103     case OP_ONCE:
1104 nigel 91 prev = ecode;
1105     saved_eptr = eptr;
1106    
1107     do
1108 nigel 77 {
1109 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1110 nigel 91 if (rrc == MATCH_MATCH) break;
1111 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1112 nigel 91 ecode += GET(ecode,1);
1113     }
1114     while (*ecode == OP_ALT);
1115 nigel 77
1116 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1117 nigel 77
1118 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1119 nigel 77
1120 nigel 91 /* Continue from after the atomic group, updating the offsets high water
1121     mark, since extracts may have been taken. */
1122 nigel 77
1123 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1124 nigel 77
1125 nigel 91 offset_top = md->end_offset_top;
1126     eptr = md->end_match_ptr;
1127 nigel 77
1128 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1129     happens for a repeating ket if no characters were matched in the group.
1130     This is the forcible breaking of infinite loops as implemented in Perl
1131     5.005. If there is an options reset, it will get obeyed in the normal
1132     course of events. */
1133 nigel 77
1134 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1135     {
1136     ecode += 1+LINK_SIZE;
1137     break;
1138     }
1139 nigel 77
1140 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1141     preceding bracket, in the appropriate order. The second "call" of match()
1142     uses tail recursion, to avoid using another stack frame. We need to reset
1143     any options that changed within the bracket before re-running it, so
1144     check the next opcode. */
1145 nigel 77
1146 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1147     {
1148     ims = (ims & ~PCRE_IMS) | ecode[4];
1149     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1150     }
1151 nigel 77
1152 nigel 91 if (*ecode == OP_KETRMIN)
1153     {
1154 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1155 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1156     ecode = prev;
1157 ph10 197 flags = 0;
1158 nigel 91 goto TAIL_RECURSE;
1159 nigel 77 }
1160 nigel 91 else /* OP_KETRMAX */
1161     {
1162 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1163 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1164     ecode += 1 + LINK_SIZE;
1165 ph10 197 flags = 0;
1166 nigel 91 goto TAIL_RECURSE;
1167     }
1168     /* Control never gets here */
1169 nigel 77
1170     /* An alternation is the end of a branch; scan along to find the end of the
1171     bracketed group and go to there. */
1172    
1173     case OP_ALT:
1174     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1175     break;
1176    
1177 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1178     indicating that it may occur zero times. It may repeat infinitely, or not
1179     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1180     with fixed upper repeat limits are compiled as a number of copies, with the
1181     optional ones preceded by BRAZERO or BRAMINZERO. */
1182 nigel 77
1183     case OP_BRAZERO:
1184     {
1185     next = ecode+1;
1186 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1187 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1188     do next += GET(next,1); while (*next == OP_ALT);
1189 nigel 93 ecode = next + 1 + LINK_SIZE;
1190 nigel 77 }
1191     break;
1192    
1193     case OP_BRAMINZERO:
1194     {
1195     next = ecode+1;
1196 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1197 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1198 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1199     ecode++;
1200     }
1201     break;
1202    
1203 ph10 335 case OP_SKIPZERO:
1204     {
1205     next = ecode+1;
1206     do next += GET(next,1); while (*next == OP_ALT);
1207     ecode = next + 1 + LINK_SIZE;
1208     }
1209     break;
1210    
1211 nigel 93 /* End of a group, repeated or non-repeating. */
1212 nigel 77
1213     case OP_KET:
1214     case OP_KETRMIN:
1215     case OP_KETRMAX:
1216 nigel 91 prev = ecode - GET(ecode, 1);
1217 nigel 77
1218 nigel 93 /* If this was a group that remembered the subject start, in order to break
1219     infinite repeats of empty string matches, retrieve the subject start from
1220     the chain. Otherwise, set it NULL. */
1221 nigel 77
1222 nigel 93 if (*prev >= OP_SBRA)
1223     {
1224     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1225     eptrb = eptrb->epb_prev; /* Backup to previous group */
1226     }
1227     else saved_eptr = NULL;
1228 nigel 77
1229 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1230     MATCH_MATCH, but record the current high water mark for use by positive
1231     assertions. Do this also for the "once" (atomic) groups. */
1232    
1233 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1234     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1235     *prev == OP_ONCE)
1236     {
1237     md->end_match_ptr = eptr; /* For ONCE */
1238     md->end_offset_top = offset_top;
1239     RRETURN(MATCH_MATCH);
1240     }
1241 nigel 77
1242 nigel 93 /* For capturing groups we have to check the group number back at the start
1243     and if necessary complete handling an extraction by setting the offsets and
1244     bumping the high water mark. Note that whole-pattern recursion is coded as
1245     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1246     when the OP_END is reached. Other recursion is handled here. */
1247 nigel 77
1248 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1249 nigel 91 {
1250 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1251 nigel 91 offset = number << 1;
1252 nigel 77
1253     #ifdef DEBUG
1254 nigel 91 printf("end bracket %d", number);
1255     printf("\n");
1256 nigel 77 #endif
1257    
1258 nigel 93 md->capture_last = number;
1259     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1260 nigel 91 {
1261 nigel 93 md->offset_vector[offset] =
1262     md->offset_vector[md->offset_end - number];
1263     md->offset_vector[offset+1] = eptr - md->start_subject;
1264     if (offset_top <= offset) offset_top = offset + 2;
1265     }
1266 nigel 77
1267 nigel 93 /* Handle a recursively called group. Restore the offsets
1268     appropriately and continue from after the call. */
1269 nigel 77
1270 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1271     {
1272     recursion_info *rec = md->recursive;
1273     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1274     md->recursive = rec->prevrec;
1275 ph10 168 mstart = rec->save_start;
1276 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1277     rec->saved_max * sizeof(int));
1278     ecode = rec->after_call;
1279     ims = original_ims;
1280     break;
1281 nigel 77 }
1282 nigel 91 }
1283 nigel 77
1284 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1285     flags, in case they got changed during the group. */
1286 nigel 77
1287 nigel 91 ims = original_ims;
1288     DPRINTF(("ims reset to %02lx\n", ims));
1289 nigel 77
1290 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1291     happens for a repeating ket if no characters were matched in the group.
1292     This is the forcible breaking of infinite loops as implemented in Perl
1293     5.005. If there is an options reset, it will get obeyed in the normal
1294     course of events. */
1295 nigel 77
1296 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1297     {
1298     ecode += 1 + LINK_SIZE;
1299     break;
1300     }
1301 nigel 77
1302 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1303     preceding bracket, in the appropriate order. In the second case, we can use
1304 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1305     unlimited repeat of a group that can match an empty string. */
1306 nigel 77
1307 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1308    
1309 nigel 91 if (*ecode == OP_KETRMIN)
1310     {
1311 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1312 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1313 ph10 197 if (flags != 0) /* Could match an empty string */
1314     {
1315     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1316     RRETURN(rrc);
1317     }
1318 nigel 91 ecode = prev;
1319     goto TAIL_RECURSE;
1320 nigel 77 }
1321 nigel 91 else /* OP_KETRMAX */
1322     {
1323 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1324 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1325     ecode += 1 + LINK_SIZE;
1326 ph10 197 flags = 0;
1327 nigel 91 goto TAIL_RECURSE;
1328     }
1329     /* Control never gets here */
1330 nigel 77
1331     /* Start of subject unless notbol, or after internal newline if multiline */
1332    
1333     case OP_CIRC:
1334     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1335     if ((ims & PCRE_MULTILINE) != 0)
1336     {
1337 nigel 91 if (eptr != md->start_subject &&
1338 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1339 nigel 77 RRETURN(MATCH_NOMATCH);
1340     ecode++;
1341     break;
1342     }
1343     /* ... else fall through */
1344    
1345     /* Start of subject assertion */
1346    
1347     case OP_SOD:
1348     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1349     ecode++;
1350     break;
1351    
1352     /* Start of match assertion */
1353    
1354     case OP_SOM:
1355     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1356     ecode++;
1357     break;
1358 ph10 172
1359 ph10 168 /* Reset the start of match point */
1360 ph10 172
1361 ph10 168 case OP_SET_SOM:
1362     mstart = eptr;
1363 ph10 172 ecode++;
1364     break;
1365 nigel 77
1366     /* Assert before internal newline if multiline, or before a terminating
1367     newline unless endonly is set, else end of subject unless noteol is set. */
1368    
1369     case OP_DOLL:
1370     if ((ims & PCRE_MULTILINE) != 0)
1371     {
1372     if (eptr < md->end_subject)
1373 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1374 nigel 77 else
1375     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1376     ecode++;
1377     break;
1378     }
1379     else
1380     {
1381     if (md->noteol) RRETURN(MATCH_NOMATCH);
1382     if (!md->endonly)
1383     {
1384 nigel 91 if (eptr != md->end_subject &&
1385 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1386 nigel 77 RRETURN(MATCH_NOMATCH);
1387     ecode++;
1388     break;
1389     }
1390     }
1391 nigel 91 /* ... else fall through for endonly */
1392 nigel 77
1393     /* End of subject assertion (\z) */
1394    
1395     case OP_EOD:
1396     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1397     ecode++;
1398     break;
1399    
1400     /* End of subject or ending \n assertion (\Z) */
1401    
1402     case OP_EODN:
1403 nigel 91 if (eptr != md->end_subject &&
1404 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1405 nigel 91 RRETURN(MATCH_NOMATCH);
1406 nigel 77 ecode++;
1407     break;
1408    
1409     /* Word boundary assertions */
1410    
1411     case OP_NOT_WORD_BOUNDARY:
1412     case OP_WORD_BOUNDARY:
1413     {
1414    
1415     /* Find out if the previous and current characters are "word" characters.
1416     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1417     be "non-word" characters. */
1418    
1419     #ifdef SUPPORT_UTF8
1420     if (utf8)
1421     {
1422     if (eptr == md->start_subject) prev_is_word = FALSE; else
1423     {
1424     const uschar *lastptr = eptr - 1;
1425     while((*lastptr & 0xc0) == 0x80) lastptr--;
1426     GETCHAR(c, lastptr);
1427     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1428     }
1429     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1430     {
1431     GETCHAR(c, eptr);
1432     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1433     }
1434     }
1435     else
1436     #endif
1437    
1438     /* More streamlined when not in UTF-8 mode */
1439    
1440     {
1441     prev_is_word = (eptr != md->start_subject) &&
1442     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1443     cur_is_word = (eptr < md->end_subject) &&
1444     ((md->ctypes[*eptr] & ctype_word) != 0);
1445     }
1446    
1447     /* Now see if the situation is what we want */
1448    
1449     if ((*ecode++ == OP_WORD_BOUNDARY)?
1450     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1451     RRETURN(MATCH_NOMATCH);
1452     }
1453     break;
1454    
1455     /* Match a single character type; inline for speed */
1456    
1457     case OP_ANY:
1458 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1459 ph10 345 /* Fall through */
1460    
1461 ph10 341 case OP_ALLANY:
1462 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1463 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1464 nigel 77 ecode++;
1465     break;
1466    
1467     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1468     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1469    
1470     case OP_ANYBYTE:
1471     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1472     ecode++;
1473     break;
1474    
1475     case OP_NOT_DIGIT:
1476     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1477     GETCHARINCTEST(c, eptr);
1478     if (
1479     #ifdef SUPPORT_UTF8
1480     c < 256 &&
1481     #endif
1482     (md->ctypes[c] & ctype_digit) != 0
1483     )
1484     RRETURN(MATCH_NOMATCH);
1485     ecode++;
1486     break;
1487    
1488     case OP_DIGIT:
1489     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1490     GETCHARINCTEST(c, eptr);
1491     if (
1492     #ifdef SUPPORT_UTF8
1493     c >= 256 ||
1494     #endif
1495     (md->ctypes[c] & ctype_digit) == 0
1496     )
1497     RRETURN(MATCH_NOMATCH);
1498     ecode++;
1499     break;
1500    
1501     case OP_NOT_WHITESPACE:
1502     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1503     GETCHARINCTEST(c, eptr);
1504     if (
1505     #ifdef SUPPORT_UTF8
1506     c < 256 &&
1507     #endif
1508     (md->ctypes[c] & ctype_space) != 0
1509     )
1510     RRETURN(MATCH_NOMATCH);
1511     ecode++;
1512     break;
1513    
1514     case OP_WHITESPACE:
1515     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1516     GETCHARINCTEST(c, eptr);
1517     if (
1518     #ifdef SUPPORT_UTF8
1519     c >= 256 ||
1520     #endif
1521     (md->ctypes[c] & ctype_space) == 0
1522     )
1523     RRETURN(MATCH_NOMATCH);
1524     ecode++;
1525     break;
1526    
1527     case OP_NOT_WORDCHAR:
1528     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1529     GETCHARINCTEST(c, eptr);
1530     if (
1531     #ifdef SUPPORT_UTF8
1532     c < 256 &&
1533     #endif
1534     (md->ctypes[c] & ctype_word) != 0
1535     )
1536     RRETURN(MATCH_NOMATCH);
1537     ecode++;
1538     break;
1539    
1540     case OP_WORDCHAR:
1541     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1542     GETCHARINCTEST(c, eptr);
1543     if (
1544     #ifdef SUPPORT_UTF8
1545     c >= 256 ||
1546     #endif
1547     (md->ctypes[c] & ctype_word) == 0
1548     )
1549     RRETURN(MATCH_NOMATCH);
1550     ecode++;
1551     break;
1552    
1553 nigel 93 case OP_ANYNL:
1554     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1555     GETCHARINCTEST(c, eptr);
1556     switch(c)
1557     {
1558     default: RRETURN(MATCH_NOMATCH);
1559     case 0x000d:
1560     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1561     break;
1562 ph10 231
1563 nigel 93 case 0x000a:
1564 ph10 231 break;
1565    
1566 nigel 93 case 0x000b:
1567     case 0x000c:
1568     case 0x0085:
1569     case 0x2028:
1570     case 0x2029:
1571 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1572 nigel 93 break;
1573     }
1574     ecode++;
1575     break;
1576    
1577 ph10 178 case OP_NOT_HSPACE:
1578     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1579     GETCHARINCTEST(c, eptr);
1580     switch(c)
1581     {
1582     default: break;
1583     case 0x09: /* HT */
1584     case 0x20: /* SPACE */
1585     case 0xa0: /* NBSP */
1586     case 0x1680: /* OGHAM SPACE MARK */
1587     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1588     case 0x2000: /* EN QUAD */
1589     case 0x2001: /* EM QUAD */
1590     case 0x2002: /* EN SPACE */
1591     case 0x2003: /* EM SPACE */
1592     case 0x2004: /* THREE-PER-EM SPACE */
1593     case 0x2005: /* FOUR-PER-EM SPACE */
1594     case 0x2006: /* SIX-PER-EM SPACE */
1595     case 0x2007: /* FIGURE SPACE */
1596     case 0x2008: /* PUNCTUATION SPACE */
1597     case 0x2009: /* THIN SPACE */
1598     case 0x200A: /* HAIR SPACE */
1599     case 0x202f: /* NARROW NO-BREAK SPACE */
1600     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1601     case 0x3000: /* IDEOGRAPHIC SPACE */
1602     RRETURN(MATCH_NOMATCH);
1603     }
1604     ecode++;
1605     break;
1606    
1607     case OP_HSPACE:
1608     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1609     GETCHARINCTEST(c, eptr);
1610     switch(c)
1611     {
1612     default: RRETURN(MATCH_NOMATCH);
1613     case 0x09: /* HT */
1614     case 0x20: /* SPACE */
1615     case 0xa0: /* NBSP */
1616     case 0x1680: /* OGHAM SPACE MARK */
1617     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1618     case 0x2000: /* EN QUAD */
1619     case 0x2001: /* EM QUAD */
1620     case 0x2002: /* EN SPACE */
1621     case 0x2003: /* EM SPACE */
1622     case 0x2004: /* THREE-PER-EM SPACE */
1623     case 0x2005: /* FOUR-PER-EM SPACE */
1624     case 0x2006: /* SIX-PER-EM SPACE */
1625     case 0x2007: /* FIGURE SPACE */
1626     case 0x2008: /* PUNCTUATION SPACE */
1627     case 0x2009: /* THIN SPACE */
1628     case 0x200A: /* HAIR SPACE */
1629     case 0x202f: /* NARROW NO-BREAK SPACE */
1630     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1631     case 0x3000: /* IDEOGRAPHIC SPACE */
1632     break;
1633     }
1634     ecode++;
1635     break;
1636    
1637     case OP_NOT_VSPACE:
1638     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1639     GETCHARINCTEST(c, eptr);
1640     switch(c)
1641     {
1642     default: break;
1643     case 0x0a: /* LF */
1644     case 0x0b: /* VT */
1645     case 0x0c: /* FF */
1646     case 0x0d: /* CR */
1647     case 0x85: /* NEL */
1648     case 0x2028: /* LINE SEPARATOR */
1649     case 0x2029: /* PARAGRAPH SEPARATOR */
1650     RRETURN(MATCH_NOMATCH);
1651     }
1652     ecode++;
1653     break;
1654    
1655     case OP_VSPACE:
1656     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1657     GETCHARINCTEST(c, eptr);
1658     switch(c)
1659     {
1660     default: RRETURN(MATCH_NOMATCH);
1661     case 0x0a: /* LF */
1662     case 0x0b: /* VT */
1663     case 0x0c: /* FF */
1664     case 0x0d: /* CR */
1665     case 0x85: /* NEL */
1666     case 0x2028: /* LINE SEPARATOR */
1667     case 0x2029: /* PARAGRAPH SEPARATOR */
1668     break;
1669     }
1670     ecode++;
1671     break;
1672    
1673 nigel 77 #ifdef SUPPORT_UCP
1674     /* Check the next character by Unicode property. We will get here only
1675     if the support is in the binary; otherwise a compile-time error occurs. */
1676    
1677     case OP_PROP:
1678     case OP_NOTPROP:
1679     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1680     GETCHARINCTEST(c, eptr);
1681     {
1682 ph10 349 const ucd_record * prop = GET_UCD(c);
1683 nigel 77
1684 nigel 87 switch(ecode[1])
1685     {
1686     case PT_ANY:
1687     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1688     break;
1689 nigel 77
1690 nigel 87 case PT_LAMP:
1691 ph10 349 if ((prop->chartype == ucp_Lu ||
1692     prop->chartype == ucp_Ll ||
1693     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1694 nigel 77 RRETURN(MATCH_NOMATCH);
1695 nigel 87 break;
1696    
1697     case PT_GC:
1698 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1699 nigel 77 RRETURN(MATCH_NOMATCH);
1700 nigel 87 break;
1701    
1702     case PT_PC:
1703 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1704 nigel 87 RRETURN(MATCH_NOMATCH);
1705     break;
1706    
1707     case PT_SC:
1708 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
1709 nigel 87 RRETURN(MATCH_NOMATCH);
1710     break;
1711    
1712     default:
1713     RRETURN(PCRE_ERROR_INTERNAL);
1714 nigel 77 }
1715 nigel 87
1716     ecode += 3;
1717 nigel 77 }
1718     break;
1719    
1720     /* Match an extended Unicode sequence. We will get here only if the support
1721     is in the binary; otherwise a compile-time error occurs. */
1722    
1723     case OP_EXTUNI:
1724     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1725     GETCHARINCTEST(c, eptr);
1726     {
1727 ph10 349 int category = UCD_CATEGORY(c);
1728 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1729     while (eptr < md->end_subject)
1730     {
1731     int len = 1;
1732     if (!utf8) c = *eptr; else
1733     {
1734     GETCHARLEN(c, eptr, len);
1735     }
1736 ph10 349 category = UCD_CATEGORY(c);
1737 nigel 77 if (category != ucp_M) break;
1738     eptr += len;
1739     }
1740     }
1741     ecode++;
1742     break;
1743     #endif
1744    
1745    
1746     /* Match a back reference, possibly repeatedly. Look past the end of the
1747     item to see if there is repeat information following. The code is similar
1748     to that for character classes, but repeated for efficiency. Then obey
1749     similar code to character type repeats - written out again for speed.
1750     However, if the referenced string is the empty string, always treat
1751     it as matched, any number of times (otherwise there could be infinite
1752     loops). */
1753    
1754     case OP_REF:
/* Back reference, optionally followed by a repeat opcode. The code below
first works out `length`, the number of subject bytes the referenced group
captured (or a sentinel value when the group is unset), then decodes any
repeat that follows and uses match_ref() (defined outside this view) to test
for that text at eptr. */
1755     {
1756     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1757 ph10 345 ecode += 3;
1758    
1759 ph10 336 /* If the reference is unset, there are two possibilities:
1760 ph10 345
1761 ph10 336 (a) In the default, Perl-compatible state, set the length to be longer
1762     than the amount of subject left; this ensures that every attempt at a
1763     match fails. We can't just fail here, because of the possibility of
1764     quantifiers with zero minima.
1765 ph10 345
1766     (b) If the JavaScript compatibility flag is set, set the length to zero
1767     so that the back reference matches an empty string.
1768    
1769     Otherwise, set the length to the length of what was matched by the
1770 ph10 336 referenced subpattern. */
1771 ph10 345
1772 ph10 336 if (offset >= offset_top || md->offset_vector[offset] < 0)
1773 ph10 345 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1774 ph10 336 else
1775     length = md->offset_vector[offset+1] - md->offset_vector[offset];
1776 nigel 77
1777     /* Set up for repetition, or handle the non-repeated case */
1778    
1779     switch (*ecode)
1780     {
1781     case OP_CRSTAR:
1782     case OP_CRMINSTAR:
1783     case OP_CRPLUS:
1784     case OP_CRMINPLUS:
1785     case OP_CRQUERY:
1786     case OP_CRMINQUERY:
/* c becomes the distance of this opcode from OP_CRSTAR and indexes the
rep_min/rep_max tables; an odd value selects the minimizing form. */
1787     c = *ecode++ - OP_CRSTAR;
1788     minimize = (c & 1) != 0;
1789     min = rep_min[c]; /* Pick up values from tables; */
1790     max = rep_max[c]; /* zero for max => infinity */
1791     if (max == 0) max = INT_MAX;
1792     break;
1793    
1794     case OP_CRRANGE:
1795     case OP_CRMINRANGE:
/* Explicit {min,max}: two 2-byte counts follow the opcode, 5 bytes in all. */
1796     minimize = (*ecode == OP_CRMINRANGE);
1797     min = GET2(ecode, 1);
1798     max = GET2(ecode, 3);
1799     if (max == 0) max = INT_MAX;
1800     ecode += 5;
1801     break;
1802    
1803     default: /* No repeat follows */
1804     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1805     eptr += length;
1806     continue; /* With the main loop */
1807     }
1808    
1809     /* If the length of the reference is zero, just continue with the
1810     main loop. */
1811    
1812     if (length == 0) continue;
1813    
1814     /* First, ensure the minimum number of matches are present. We get back
1815     the length of the reference string explicitly rather than passing the
1816     address of eptr, so that eptr can be a register variable. */
1817    
1818     for (i = 1; i <= min; i++)
1819     {
1820     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1821     eptr += length;
1822     }
1823    
1824     /* If min = max, continue at the same level without recursion.
1825     They are not both allowed to be zero. */
1826    
1827     if (min == max) continue;
1828    
1829     /* If minimizing, keep trying and advancing the pointer */
1830    
1831     if (minimize)
1832     {
1833     for (fi = min;; fi++)
1834     {
/* Try the rest of the pattern first; only if that fails is one more copy
of the reference consumed (up to max copies). */
1835 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1836 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1837     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1838     RRETURN(MATCH_NOMATCH);
1839     eptr += length;
1840     }
1841     /* Control never gets here */
1842     }
1843    
1844     /* If maximizing, find the longest string and work backwards */
1845    
1846     else
1847     {
/* pp remembers where the optional copies start; the back-off loop below
gives up one whole copy (length bytes) per failed attempt and stops once
eptr has dropped below pp. */
1848     pp = eptr;
1849     for (i = min; i < max; i++)
1850     {
1851     if (!match_ref(offset, eptr, length, md, ims)) break;
1852     eptr += length;
1853     }
1854     while (eptr >= pp)
1855     {
1856 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1857 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1858     eptr -= length;
1859     }
1860     RRETURN(MATCH_NOMATCH);
1861     }
1862     }
1863     /* Control never gets here */
1864    
1865    
1866    
1867     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1868     used when all the characters in the class have values in the range 0-255,
1869     and either the matching is caseful, or the characters are in the range
1870     0-127 when UTF-8 processing is enabled. The only difference between
1871     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1872     encountered.
1873    
1874     First, look past the end of the item to see if there is repeat information
1875     following. Then obey similar code to character type repeats - written out
1876     again for speed. */
1877    
1878     case OP_NCLASS:
1879     case OP_CLASS:
/* Bit-mapped class, optionally followed by a repeat opcode. A character c
in the range 0-255 is in the class when bit (c&7) of data[c/8] is set. In
UTF-8 mode a character above 255 has no bit in the map: the code below
rejects it for OP_CLASS and accepts it for OP_NCLASS. */
1880     {
1881     data = ecode + 1; /* Save for matching */
1882     ecode += 33; /* Advance past the item */
1883    
1884     switch (*ecode)
1885     {
1886     case OP_CRSTAR:
1887     case OP_CRMINSTAR:
1888     case OP_CRPLUS:
1889     case OP_CRMINPLUS:
1890     case OP_CRQUERY:
1891     case OP_CRMINQUERY:
/* c indexes rep_min/rep_max; an odd value selects the minimizing form. */
1892     c = *ecode++ - OP_CRSTAR;
1893     minimize = (c & 1) != 0;
1894     min = rep_min[c]; /* Pick up values from tables; */
1895     max = rep_max[c]; /* zero for max => infinity */
1896     if (max == 0) max = INT_MAX;
1897     break;
1898    
1899     case OP_CRRANGE:
1900     case OP_CRMINRANGE:
1901     minimize = (*ecode == OP_CRMINRANGE);
1902     min = GET2(ecode, 1);
1903     max = GET2(ecode, 3);
1904     if (max == 0) max = INT_MAX;
1905     ecode += 5;
1906     break;
1907    
1908     default: /* No repeat follows */
1909     min = max = 1;
1910     break;
1911     }
1912    
1913     /* First, ensure the minimum number of matches are present. */
1914    
1915     #ifdef SUPPORT_UTF8
1916     /* UTF-8 mode */
1917     if (utf8)
1918     {
1919     for (i = 1; i <= min; i++)
1920     {
1921     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1922     GETCHARINC(c, eptr);
1923     if (c > 255)
1924     {
1925     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1926     }
1927     else
1928     {
1929     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1930     }
1931     }
1932     }
1933     else
1934     #endif
1935     /* Not UTF-8 mode */
1936     {
1937     for (i = 1; i <= min; i++)
1938     {
1939     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1940     c = *eptr++;
1941     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1942     }
1943     }
1944    
1945     /* If max == min we can continue with the main loop without the
1946     need to recurse. */
1947    
1948     if (min == max) continue;
1949    
1950     /* If minimizing, keep testing the rest of the expression and advancing
1951     the pointer while it matches the class. */
1952    
1953     if (minimize)
1954     {
1955     #ifdef SUPPORT_UTF8
1956     /* UTF-8 mode */
1957     if (utf8)
1958     {
1959     for (fi = min;; fi++)
1960     {
1961 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1962 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
/* The limit and end-of-subject tests come before the character is read. */
1963     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1964     GETCHARINC(c, eptr);
1965     if (c > 255)
1966     {
1967     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1968     }
1969     else
1970     {
1971     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1972     }
1973     }
1974     }
1975     else
1976     #endif
1977     /* Not UTF-8 mode */
1978     {
1979     for (fi = min;; fi++)
1980     {
1981 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1982 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1983     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1984     c = *eptr++;
1985     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1986     }
1987     }
1988     /* Control never gets here */
1989     }
1990    
1991     /* If maximizing, find the longest possible run, then work backwards. */
1992    
1993     else
1994     {
/* pp marks where the optional characters start, so the back-off loops know
when they have retried at the original position. */
1995     pp = eptr;
1996    
1997     #ifdef SUPPORT_UTF8
1998     /* UTF-8 mode */
1999     if (utf8)
2000     {
2001     for (i = min; i < max; i++)
2002     {
2003     int len = 1;
2004     if (eptr >= md->end_subject) break;
2005     GETCHARLEN(c, eptr, len);
2006     if (c > 255)
2007     {
2008     if (op == OP_CLASS) break;
2009     }
2010     else
2011     {
2012     if ((data[c/8] & (1 << (c&7))) == 0) break;
2013     }
2014     eptr += len;
2015     }
2016     for (;;)
2017     {
2018 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2019 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2020     if (eptr-- == pp) break; /* Stop if tried at original pos */
/* eptr has moved back one byte; BACKCHAR (defined outside this view)
presumably continues back to the first byte of that character. */
2021     BACKCHAR(eptr);
2022     }
2023     }
2024     else
2025     #endif
2026     /* Not UTF-8 mode */
2027     {
2028     for (i = min; i < max; i++)
2029     {
2030     if (eptr >= md->end_subject) break;
2031     c = *eptr;
2032     if ((data[c/8] & (1 << (c&7))) == 0) break;
2033     eptr++;
2034     }
2035     while (eptr >= pp)
2036     {
2037 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2038 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2039 nigel 77 eptr--;
2040     }
2041     }
2042    
2043     RRETURN(MATCH_NOMATCH);
2044     }
2045     }
2046     /* Control never gets here */
2047    
2048    
2049     /* Match an extended character class. This opcode is encountered only
2050     in UTF-8 mode, because that's the only time it is compiled. */
2051    
2052     #ifdef SUPPORT_UTF8
2053     case OP_XCLASS:
/* Extended class, optionally followed by a repeat opcode. The item carries
its own length: GET(ecode, 1) is used to step over it, and the class data
handed to _pcre_xclass() starts after the opcode byte and the LINK_SIZE-byte
length field. */
2054     {
2055     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2056     ecode += GET(ecode, 1); /* Advance past the item */
2057    
2058     switch (*ecode)
2059     {
2060     case OP_CRSTAR:
2061     case OP_CRMINSTAR:
2062     case OP_CRPLUS:
2063     case OP_CRMINPLUS:
2064     case OP_CRQUERY:
2065     case OP_CRMINQUERY:
/* c indexes rep_min/rep_max; an odd value selects the minimizing form. */
2066     c = *ecode++ - OP_CRSTAR;
2067     minimize = (c & 1) != 0;
2068     min = rep_min[c]; /* Pick up values from tables; */
2069     max = rep_max[c]; /* zero for max => infinity */
2070     if (max == 0) max = INT_MAX;
2071     break;
2072    
2073     case OP_CRRANGE:
2074     case OP_CRMINRANGE:
2075     minimize = (*ecode == OP_CRMINRANGE);
2076     min = GET2(ecode, 1);
2077     max = GET2(ecode, 3);
2078     if (max == 0) max = INT_MAX;
2079     ecode += 5;
2080     break;
2081    
2082     default: /* No repeat follows */
2083     min = max = 1;
2084     break;
2085     }
2086    
2087     /* First, ensure the minimum number of matches are present. */
2088    
2089     for (i = 1; i <= min; i++)
2090     {
2091     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2092     GETCHARINC(c, eptr);
2093     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2094     }
2095    
2096     /* If max == min we can continue with the main loop without the
2097     need to recurse. */
2098    
2099     if (min == max) continue;
2100    
2101     /* If minimizing, keep testing the rest of the expression and advancing
2102     the pointer while it matches the class. */
2103    
2104     if (minimize)
2105     {
2106     for (fi = min;; fi++)
2107     {
2108 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2109 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
/* The limit and end-of-subject tests come before the character is read. */
2110     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2111     GETCHARINC(c, eptr);
2112     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2113     }
2114     /* Control never gets here */
2115     }
2116    
2117     /* If maximizing, find the longest possible run, then work backwards. */
2118    
2119     else
2120     {
2121     pp = eptr;
2122     for (i = min; i < max; i++)
2123     {
2124     int len = 1;
2125     if (eptr >= md->end_subject) break;
2126     GETCHARLEN(c, eptr, len);
2127     if (!_pcre_xclass(c, data)) break;
2128     eptr += len;
2129     }
2130     for(;;)
2131     {
2132 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2133 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2134     if (eptr-- == pp) break; /* Stop if tried at original pos */
/* NOTE(review): the utf8 test looks redundant if this opcode is only ever
compiled in UTF-8 mode, as the comment ahead of this case says - confirm. */
2135 ph10 214 if (utf8) BACKCHAR(eptr);
2136 nigel 77 }
2137     RRETURN(MATCH_NOMATCH);
2138     }
2139    
2140     /* Control never gets here */
2141     }
2142     #endif /* End of XCLASS */
2143    
2144     /* Match a single character, casefully */
2145    
2146     case OP_CHAR:
2147     #ifdef SUPPORT_UTF8
2148     if (utf8)
2149     {
2150     length = 1;
2151     ecode++;
2152     GETCHARLEN(fc, ecode, length);
2153     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2154     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2155     }
2156     else
2157     #endif
2158    
2159     /* Non-UTF-8 mode */
2160     {
2161     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2162     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2163     ecode += 2;
2164     }
2165     break;
2166    
2167     /* Match a single character, caselessly */
2168    
2169     case OP_CHARNC:
2170     #ifdef SUPPORT_UTF8
2171     if (utf8)
2172     {
2173     length = 1;
2174     ecode++;
2175     GETCHARLEN(fc, ecode, length);
2176    
2177     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2178    
2179     /* If the pattern character's value is < 128, we have only one byte, and
2180     can use the fast lookup table. */
2181    
2182     if (fc < 128)
2183     {
2184     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2185     }
2186    
2187     /* Otherwise we must pick up the subject character */
2188    
2189     else
2190     {
2191 nigel 93 unsigned int dc;
2192 nigel 77 GETCHARINC(dc, eptr);
2193     ecode += length;
2194    
2195     /* If we have Unicode property support, we can use it to test the other
2196 nigel 87 case of the character, if there is one. */
2197 nigel 77
2198     if (fc != dc)
2199     {
2200     #ifdef SUPPORT_UCP
2201 ph10 349 if (dc != UCD_OTHERCASE(fc))
2202 nigel 77 #endif
2203     RRETURN(MATCH_NOMATCH);
2204     }
2205     }
2206     }
2207     else
2208     #endif /* SUPPORT_UTF8 */
2209    
2210     /* Non-UTF-8 mode */
2211     {
2212     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2213     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2214     ecode += 2;
2215     }
2216     break;
2217    
2218 nigel 93 /* Match a single character repeatedly. */
2219 nigel 77
2220     case OP_EXACT:
2221     min = max = GET2(ecode, 1);
2222     ecode += 3;
2223     goto REPEATCHAR;
2224    
2225 nigel 93 case OP_POSUPTO:
2226     possessive = TRUE;
2227     /* Fall through */
2228    
2229 nigel 77 case OP_UPTO:
2230     case OP_MINUPTO:
2231     min = 0;
2232     max = GET2(ecode, 1);
2233     minimize = *ecode == OP_MINUPTO;
2234     ecode += 3;
2235     goto REPEATCHAR;
2236    
2237 nigel 93 case OP_POSSTAR:
2238     possessive = TRUE;
2239     min = 0;
2240     max = INT_MAX;
2241     ecode++;
2242     goto REPEATCHAR;
2243    
2244     case OP_POSPLUS:
2245     possessive = TRUE;
2246     min = 1;
2247     max = INT_MAX;
2248     ecode++;
2249     goto REPEATCHAR;
2250    
2251     case OP_POSQUERY:
2252     possessive = TRUE;
2253     min = 0;
2254     max = 1;
2255     ecode++;
2256     goto REPEATCHAR;
2257    
2258 nigel 77 case OP_STAR:
2259     case OP_MINSTAR:
2260     case OP_PLUS:
2261     case OP_MINPLUS:
2262     case OP_QUERY:
2263     case OP_MINQUERY:
2264     c = *ecode++ - OP_STAR;
2265     minimize = (c & 1) != 0;
2266     min = rep_min[c]; /* Pick up values from tables; */
2267     max = rep_max[c]; /* zero for max => infinity */
2268     if (max == 0) max = INT_MAX;
2269    
2270     /* Common code for all repeated single-character matches. We can give
2271     up quickly if there are fewer than the minimum number of characters left in
2272     the subject. */
2273    
2274     REPEATCHAR:
2275     #ifdef SUPPORT_UTF8
2276     if (utf8)
2277     {
2278     length = 1;
2279     charptr = ecode;
2280     GETCHARLEN(fc, ecode, length);
2281     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2282     ecode += length;
2283    
2284     /* Handle multibyte character matching specially here. There is
2285     support for caseless matching if UCP support is present. */
2286    
2287     if (length > 1)
2288     {
2289     #ifdef SUPPORT_UCP
2290 nigel 93 unsigned int othercase;
2291 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2292 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
2293 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2294 ph10 115 else oclength = 0;
2295 nigel 77 #endif /* SUPPORT_UCP */
2296    
2297     for (i = 1; i <= min; i++)
2298     {
2299     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2300 ph10 123 #ifdef SUPPORT_UCP
2301 nigel 77 /* Need braces because of following else */
2302     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2303     else
2304     {
2305     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2306     eptr += oclength;
2307     }
2308 ph10 115 #else /* without SUPPORT_UCP */
2309     else { RRETURN(MATCH_NOMATCH); }
2310 ph10 123 #endif /* SUPPORT_UCP */
2311 nigel 77 }
2312    
2313     if (min == max) continue;
2314    
2315     if (minimize)
2316     {
2317     for (fi = min;; fi++)
2318     {
2319 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2320 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2321     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2322     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2323 ph10 123 #ifdef SUPPORT_UCP
2324 nigel 77 /* Need braces because of following else */
2325     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2326     else
2327     {
2328     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2329     eptr += oclength;
2330     }
2331 ph10 115 #else /* without SUPPORT_UCP */
2332     else { RRETURN (MATCH_NOMATCH); }
2333     #endif /* SUPPORT_UCP */
2334 nigel 77 }
2335     /* Control never gets here */
2336     }
2337 nigel 93
2338     else /* Maximize */
2339 nigel 77 {
2340     pp = eptr;
2341     for (i = min; i < max; i++)
2342     {
2343     if (eptr > md->end_subject - length) break;
2344     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2345 ph10 123 #ifdef SUPPORT_UCP
2346 nigel 77 else if (oclength == 0) break;
2347     else
2348     {
2349     if (memcmp(eptr, occhars, oclength) != 0) break;
2350     eptr += oclength;
2351     }
2352 ph10 115 #else /* without SUPPORT_UCP */
2353     else break;
2354 ph10 123 #endif /* SUPPORT_UCP */
2355 nigel 77 }
2356 nigel 93
2357     if (possessive) continue;
2358 ph10 120 for(;;)
2359 nigel 77 {
2360 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2361 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2362 ph10 120 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2363 ph10 115 #ifdef SUPPORT_UCP
2364     eptr--;
2365     BACKCHAR(eptr);
2366 ph10 123 #else /* without SUPPORT_UCP */
2367 nigel 77 eptr -= length;
2368 ph10 123 #endif /* SUPPORT_UCP */
2369 nigel 77 }
2370     }
2371     /* Control never gets here */
2372     }
2373    
2374     /* If the length of a UTF-8 character is 1, we fall through here, and
2375     obey the code as for non-UTF-8 characters below, though in this case the
2376     value of fc will always be < 128. */
2377     }
2378     else
2379     #endif /* SUPPORT_UTF8 */
2380    
2381     /* When not in UTF-8 mode, load a single-byte character. */
2382     {
2383     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2384     fc = *ecode++;
2385     }
2386    
2387     /* The value of fc at this point is always less than 256, though we may or
2388     may not be in UTF-8 mode. The code is duplicated for the caseless and
2389     caseful cases, for speed, since matching characters is likely to be quite
2390     common. First, ensure the minimum number of matches are present. If min =
2391     max, continue at the same level without recursing. Otherwise, if
2392     minimizing, keep trying the rest of the expression and advancing one
2393     matching character if failing, up to the maximum. Alternatively, if
2394     maximizing, find the maximum number of characters and work backwards. */
2395    
2396     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2397     max, eptr));
2398    
2399     if ((ims & PCRE_CASELESS) != 0)
2400     {
2401     fc = md->lcc[fc];
2402     for (i = 1; i <= min; i++)
2403     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2404     if (min == max) continue;
2405     if (minimize)
2406     {
2407     for (fi = min;; fi++)
2408     {
2409 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2410 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2411     if (fi >= max || eptr >= md->end_subject ||
2412     fc != md->lcc[*eptr++])
2413     RRETURN(MATCH_NOMATCH);
2414     }
2415     /* Control never gets here */
2416     }
2417 nigel 93 else /* Maximize */
2418 nigel 77 {
2419     pp = eptr;
2420     for (i = min; i < max; i++)
2421     {
2422     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2423     eptr++;
2424     }
2425 nigel 93 if (possessive) continue;
2426 nigel 77 while (eptr >= pp)
2427     {
2428 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2429 nigel 77 eptr--;
2430     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2431     }
2432     RRETURN(MATCH_NOMATCH);
2433     }
2434     /* Control never gets here */
2435     }
2436    
2437     /* Caseful comparisons (includes all multi-byte characters) */
2438    
2439     else
2440     {
2441     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2442     if (min == max) continue;
2443     if (minimize)
2444     {
2445     for (fi = min;; fi++)
2446     {
2447 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2448 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2449     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2450     RRETURN(MATCH_NOMATCH);
2451     }
2452     /* Control never gets here */
2453     }
2454 nigel 93 else /* Maximize */
2455 nigel 77 {
2456     pp = eptr;
2457     for (i = min; i < max; i++)
2458     {
2459     if (eptr >= md->end_subject || fc != *eptr) break;
2460     eptr++;
2461     }
2462 nigel 93 if (possessive) continue;
2463 nigel 77 while (eptr >= pp)
2464     {
2465 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2466 nigel 77 eptr--;
2467     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2468     }
2469     RRETURN(MATCH_NOMATCH);
2470     }
2471     }
2472     /* Control never gets here */
2473    
2474     /* Match a negated single one-byte character. The character we are
2475     checking can be multibyte. */
2476    
2477     case OP_NOT:
2478     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2479     ecode++;
2480     GETCHARINCTEST(c, eptr);
2481     if ((ims & PCRE_CASELESS) != 0)
2482     {
2483     #ifdef SUPPORT_UTF8
2484     if (c < 256)
2485     #endif
2486     c = md->lcc[c];
2487     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2488     }
2489     else
2490     {
2491     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2492     }
2493     break;
2494    
2495     /* Match a negated single one-byte character repeatedly. This is almost a
2496     repeat of the code for a repeated single character, but I haven't found a
2497     nice way of commoning these up that doesn't require a test of the
2498     positive/negative option for each character match. Maybe that wouldn't add
2499     very much to the time taken, but character matching *is* what this is all
2500     about... */
2501    
2502     case OP_NOTEXACT:
2503     min = max = GET2(ecode, 1);
2504     ecode += 3;
2505     goto REPEATNOTCHAR;
2506    
2507     case OP_NOTUPTO:
2508     case OP_NOTMINUPTO:
2509     min = 0;
2510     max = GET2(ecode, 1);
2511     minimize = *ecode == OP_NOTMINUPTO;
2512     ecode += 3;
2513     goto REPEATNOTCHAR;
2514    
2515 nigel 93 case OP_NOTPOSSTAR:
2516     possessive = TRUE;
2517     min = 0;
2518     max = INT_MAX;
2519     ecode++;
2520     goto REPEATNOTCHAR;
2521    
2522     case OP_NOTPOSPLUS:
2523     possessive = TRUE;
2524     min = 1;
2525     max = INT_MAX;
2526     ecode++;
2527     goto REPEATNOTCHAR;
2528    
2529     case OP_NOTPOSQUERY:
2530     possessive = TRUE;
2531     min = 0;
2532     max = 1;
2533     ecode++;
2534     goto REPEATNOTCHAR;
2535    
2536     case OP_NOTPOSUPTO:
2537     possessive = TRUE;
2538     min = 0;
2539     max = GET2(ecode, 1);
2540     ecode += 3;
2541     goto REPEATNOTCHAR;
2542    
2543 nigel 77 case OP_NOTSTAR:
2544     case OP_NOTMINSTAR:
2545     case OP_NOTPLUS:
2546     case OP_NOTMINPLUS:
2547     case OP_NOTQUERY:
2548     case OP_NOTMINQUERY:
2549     c = *ecode++ - OP_NOTSTAR;
2550     minimize = (c & 1) != 0;
2551     min = rep_min[c]; /* Pick up values from tables; */
2552     max = rep_max[c]; /* zero for max => infinity */
2553     if (max == 0) max = INT_MAX;
2554    
2555     /* Common code for all repeated single-byte matches. We can give up quickly
2556     if there are fewer than the minimum number of bytes left in the
2557     subject. */
2558    
2559     REPEATNOTCHAR:
2560     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2561     fc = *ecode++;
2562    
2563     /* The code is duplicated for the caseless and caseful cases, for speed,
2564     since matching characters is likely to be quite common. First, ensure the
2565     minimum number of matches are present. If min = max, continue at the same
2566     level without recursing. Otherwise, if minimizing, keep trying the rest of
2567     the expression and advancing one matching character if failing, up to the
2568     maximum. Alternatively, if maximizing, find the maximum number of
2569     characters and work backwards. */
2570    
2571     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2572     max, eptr));
2573    
2574     if ((ims & PCRE_CASELESS) != 0)
2575     {
2576     fc = md->lcc[fc];
2577    
2578     #ifdef SUPPORT_UTF8
2579     /* UTF-8 mode */
2580     if (utf8)
2581     {
2582 nigel 93 register unsigned int d;
2583 nigel 77 for (i = 1; i <= min; i++)
2584     {
2585     GETCHARINC(d, eptr);
2586     if (d < 256) d = md->lcc[d];
2587     if (fc == d) RRETURN(MATCH_NOMATCH);
2588     }
2589     }
2590     else
2591     #endif
2592    
2593     /* Not UTF-8 mode */
2594     {
2595     for (i = 1; i <= min; i++)
2596     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2597     }
2598    
2599     if (min == max) continue;
2600    
2601     if (minimize)
2602     {
2603     #ifdef SUPPORT_UTF8
2604     /* UTF-8 mode */
2605     if (utf8)
2606     {
2607 nigel 93 register unsigned int d;
2608 nigel 77 for (fi = min;; fi++)
2609     {
2610 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2611 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2612     GETCHARINC(d, eptr);
2613     if (d < 256) d = md->lcc[d];
2614     if (fi >= max || eptr >= md->end_subject || fc == d)
2615     RRETURN(MATCH_NOMATCH);
2616     }
2617     }
2618     else
2619     #endif
2620     /* Not UTF-8 mode */
2621     {
2622     for (fi = min;; fi++)
2623     {
2624 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2625 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2626     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2627     RRETURN(MATCH_NOMATCH);
2628     }
2629     }
2630     /* Control never gets here */
2631     }
2632    
2633     /* Maximize case */
2634    
2635     else
2636     {
2637     pp = eptr;
2638    
2639     #ifdef SUPPORT_UTF8
2640     /* UTF-8 mode */
2641     if (utf8)
2642     {
2643 nigel 93 register unsigned int d;
2644 nigel 77 for (i = min; i < max; i++)
2645     {
2646     int len = 1;
2647     if (eptr >= md->end_subject) break;
2648     GETCHARLEN(d, eptr, len);
2649     if (d < 256) d = md->lcc[d];
2650     if (fc == d) break;
2651     eptr += len;
2652     }
2653 nigel 93 if (possessive) continue;
2654     for(;;)
2655 nigel 77 {
2656 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2657 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2658     if (eptr-- == pp) break; /* Stop if tried at original pos */
2659     BACKCHAR(eptr);
2660     }
2661     }
2662     else
2663     #endif
2664     /* Not UTF-8 mode */
2665     {
2666     for (i = min; i < max; i++)
2667     {
2668     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2669     eptr++;
2670     }
2671 nigel 93 if (possessive) continue;
2672 nigel 77 while (eptr >= pp)
2673     {
2674 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2675 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2676     eptr--;
2677     }
2678     }
2679    
2680     RRETURN(MATCH_NOMATCH);
2681     }
2682     /* Control never gets here */
2683     }
2684    
2685     /* Caseful comparisons */
2686    
2687     else
2688     {
2689     #ifdef SUPPORT_UTF8
2690     /* UTF-8 mode */
2691     if (utf8)
2692     {
2693 nigel 93 register unsigned int d;
2694 nigel 77 for (i = 1; i <= min; i++)
2695     {
2696     GETCHARINC(d, eptr);
2697     if (fc == d) RRETURN(MATCH_NOMATCH);
2698     }
2699     }
2700     else
2701     #endif
2702     /* Not UTF-8 mode */
2703     {
2704     for (i = 1; i <= min; i++)
2705     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2706     }
2707    
2708     if (min == max) continue;
2709    
2710     if (minimize)
2711     {
2712     #ifdef SUPPORT_UTF8
2713     /* UTF-8 mode */
2714     if (utf8)
2715     {
2716 nigel 93 register unsigned int d;
2717 nigel 77 for (fi = min;; fi++)
2718     {
2719 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2720 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2721     GETCHARINC(d, eptr);
2722     if (fi >= max || eptr >= md->end_subject || fc == d)
2723     RRETURN(MATCH_NOMATCH);
2724     }
2725     }
2726     else
2727     #endif
2728     /* Not UTF-8 mode */
2729     {
2730     for (fi = min;; fi++)
2731     {
2732 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2733 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2734     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2735     RRETURN(MATCH_NOMATCH);
2736     }
2737     }
2738     /* Control never gets here */
2739     }
2740    
2741     /* Maximize case */
2742    
2743     else
2744     {
2745     pp = eptr;
2746    
2747     #ifdef SUPPORT_UTF8
2748     /* UTF-8 mode */
2749     if (utf8)
2750     {
2751 nigel 93 register unsigned int d;
2752 nigel 77 for (i = min; i < max; i++)
2753     {
2754     int len = 1;
2755     if (eptr >= md->end_subject) break;
2756     GETCHARLEN(d, eptr, len);
2757     if (fc == d) break;
2758     eptr += len;
2759     }
2760 nigel 93 if (possessive) continue;
2761 nigel 77 for(;;)
2762     {
2763 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2764 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2765     if (eptr-- == pp) break; /* Stop if tried at original pos */
2766     BACKCHAR(eptr);
2767     }
2768     }
2769     else
2770     #endif
2771     /* Not UTF-8 mode */
2772     {
2773     for (i = min; i < max; i++)
2774     {
2775     if (eptr >= md->end_subject || fc == *eptr) break;
2776     eptr++;
2777     }
2778 nigel 93 if (possessive) continue;
2779 nigel 77 while (eptr >= pp)
2780     {
2781 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2782 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2783     eptr--;
2784     }
2785     }
2786    
2787     RRETURN(MATCH_NOMATCH);
2788     }
2789     }
2790     /* Control never gets here */
2791    
2792     /* Match a single character type repeatedly; several different opcodes
2793     share code. This is very similar to the code for single characters, but we
2794     repeat it in the interests of efficiency. */
2795    
2796     case OP_TYPEEXACT:
2797     min = max = GET2(ecode, 1);
2798     minimize = TRUE;
2799     ecode += 3;
2800     goto REPEATTYPE;
2801    
2802     case OP_TYPEUPTO:
2803     case OP_TYPEMINUPTO:
2804     min = 0;
2805     max = GET2(ecode, 1);
2806     minimize = *ecode == OP_TYPEMINUPTO;
2807     ecode += 3;
2808     goto REPEATTYPE;
2809    
2810 nigel 93 case OP_TYPEPOSSTAR:
2811     possessive = TRUE;
2812     min = 0;
2813     max = INT_MAX;
2814     ecode++;
2815     goto REPEATTYPE;
2816    
2817     case OP_TYPEPOSPLUS:
2818     possessive = TRUE;
2819     min = 1;
2820     max = INT_MAX;
2821     ecode++;
2822     goto REPEATTYPE;
2823    
2824     case OP_TYPEPOSQUERY:
2825     possessive = TRUE;
2826     min = 0;
2827     max = 1;
2828     ecode++;
2829     goto REPEATTYPE;
2830    
2831     case OP_TYPEPOSUPTO:
2832     possessive = TRUE;
2833     min = 0;
2834     max = GET2(ecode, 1);
2835     ecode += 3;
2836     goto REPEATTYPE;
2837    
2838 nigel 77 case OP_TYPESTAR:
2839     case OP_TYPEMINSTAR:
2840     case OP_TYPEPLUS:
2841     case OP_TYPEMINPLUS:
2842     case OP_TYPEQUERY:
2843     case OP_TYPEMINQUERY:
2844     c = *ecode++ - OP_TYPESTAR;
2845     minimize = (c & 1) != 0;
2846     min = rep_min[c]; /* Pick up values from tables; */
2847     max = rep_max[c]; /* zero for max => infinity */
2848     if (max == 0) max = INT_MAX;
2849    
2850     /* Common code for all repeated single character type matches. Note that
2851     in UTF-8 mode, '.' and the negated types can match a character of any length,
2852     and \h, \v, and \R include multi-byte characters; only \d, \s, and \w are limited to one-byte characters. */
2853    
2854     REPEATTYPE:
2855     ctype = *ecode++; /* Code for the character type */
2856    
2857     #ifdef SUPPORT_UCP
2858     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2859     {
2860     prop_fail_result = ctype == OP_NOTPROP;
2861     prop_type = *ecode++;
2862 nigel 87 prop_value = *ecode++;
2863 nigel 77 }
2864     else prop_type = -1;
2865     #endif
2866    
2867     /* First, ensure the minimum number of matches are present. Use inline
2868     code for maximizing the speed, and do the type test once at the start
2869     (i.e. keep it out of the loop). Also we can test that there are at least
2870     the minimum number of bytes before we start. This isn't as effective in
2871     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2872     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2873     and single-bytes. */
2874    
2875     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2876     if (min > 0)
2877     {
2878     #ifdef SUPPORT_UCP
2879 nigel 87 if (prop_type >= 0)
2880 nigel 77 {
2881 nigel 87 switch(prop_type)
2882 nigel 77 {
2883 nigel 87 case PT_ANY:
2884     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2885     for (i = 1; i <= min; i++)
2886     {
2887     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2888 ph10 184 GETCHARINCTEST(c, eptr);
2889 nigel 87 }
2890     break;
2891    
2892     case PT_LAMP:
2893     for (i = 1; i <= min; i++)
2894     {
2895     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2896 ph10 184 GETCHARINCTEST(c, eptr);
2897 ph10 349 prop_chartype = UCD_CHARTYPE(c);
2898 nigel 87 if ((prop_chartype == ucp_Lu ||
2899     prop_chartype == ucp_Ll ||
2900     prop_chartype == ucp_Lt) == prop_fail_result)
2901     RRETURN(MATCH_NOMATCH);
2902     }
2903     break;
2904    
2905     case PT_GC:
2906     for (i = 1; i <= min; i++)
2907     {
2908     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2909 ph10 184 GETCHARINCTEST(c, eptr);
2910 ph10 349 prop_category = UCD_CATEGORY(c);
2911 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
2912     RRETURN(MATCH_NOMATCH);
2913     }
2914     break;
2915    
2916     case PT_PC:
2917     for (i = 1; i <= min; i++)
2918     {
2919     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2920 ph10 184 GETCHARINCTEST(c, eptr);
2921 ph10 349 prop_chartype = UCD_CHARTYPE(c);
2922 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
2923     RRETURN(MATCH_NOMATCH);
2924     }
2925     break;
2926    
2927     case PT_SC:
2928     for (i = 1; i <= min; i++)
2929     {
2930     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2931 ph10 184 GETCHARINCTEST(c, eptr);
2932 ph10 349 prop_script = UCD_SCRIPT(c);
2933 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
2934     RRETURN(MATCH_NOMATCH);
2935     }
2936     break;
2937    
2938     default:
2939     RRETURN(PCRE_ERROR_INTERNAL);
2940 nigel 77 }
2941     }
2942    
2943     /* Match extended Unicode sequences. We will get here only if the
2944     support is in the binary; otherwise a compile-time error occurs. */
2945    
2946     else if (ctype == OP_EXTUNI)
2947     {
2948     for (i = 1; i <= min; i++)
2949     {
2950     GETCHARINCTEST(c, eptr);
2951 ph10 349 prop_category = UCD_CATEGORY(c);
2952 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2953     while (eptr < md->end_subject)
2954     {
2955     int len = 1;
2956     if (!utf8) c = *eptr; else
2957     {
2958     GETCHARLEN(c, eptr, len);
2959     }
2960 ph10 349 prop_category = UCD_CATEGORY(c);
2961 nigel 77 if (prop_category != ucp_M) break;
2962     eptr += len;
2963     }
2964     }
2965     }
2966    
2967     else
2968     #endif /* SUPPORT_UCP */
2969    
2970     /* Handle all other cases when the coding is UTF-8 */
2971    
2972     #ifdef SUPPORT_UTF8
2973     if (utf8) switch(ctype)
2974     {
2975     case OP_ANY:
2976     for (i = 1; i <= min; i++)
2977     {
2978 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr))
2979 nigel 77 RRETURN(MATCH_NOMATCH);
2980 nigel 91 eptr++;
2981 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2982     }
2983     break;
2984    
2985 ph10 341 case OP_ALLANY:
2986     for (i = 1; i <= min; i++)
2987     {
2988     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2989     eptr++;
2990     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2991     }
2992     break;
2993    
2994 nigel 77 case OP_ANYBYTE:
2995     eptr += min;
2996     break;
2997    
2998 nigel 93 case OP_ANYNL:
2999     for (i = 1; i <= min; i++)
3000     {
3001     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3002     GETCHARINC(c, eptr);
3003     switch(c)
3004     {
3005     default: RRETURN(MATCH_NOMATCH);
3006     case 0x000d:
3007     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3008     break;
3009 ph10 231
3010 nigel 93 case 0x000a:
3011 ph10 231 break;
3012    
3013 nigel 93 case 0x000b:
3014     case 0x000c:
3015     case 0x0085:
3016     case 0x2028:
3017     case 0x2029:
3018 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3019 nigel 93 break;
3020     }
3021     }
3022     break;
3023    
3024 ph10 178 case OP_NOT_HSPACE:
3025     for (i = 1; i <= min; i++)
3026     {
3027     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3028     GETCHARINC(c, eptr);
3029     switch(c)
3030     {
3031     default: break;
3032     case 0x09: /* HT */
3033     case 0x20: /* SPACE */
3034     case 0xa0: /* NBSP */
3035     case 0x1680: /* OGHAM SPACE MARK */
3036     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3037     case 0x2000: /* EN QUAD */
3038     case 0x2001: /* EM QUAD */
3039     case 0x2002: /* EN SPACE */
3040     case 0x2003: /* EM SPACE */
3041     case 0x2004: /* THREE-PER-EM SPACE */
3042     case 0x2005: /* FOUR-PER-EM SPACE */
3043     case 0x2006: /* SIX-PER-EM SPACE */
3044     case 0x2007: /* FIGURE SPACE */
3045     case 0x2008: /* PUNCTUATION SPACE */
3046     case 0x2009: /* THIN SPACE */
3047     case 0x200A: /* HAIR SPACE */
3048     case 0x202f: /* NARROW NO-BREAK SPACE */
3049     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3050     case 0x3000: /* IDEOGRAPHIC SPACE */
3051     RRETURN(MATCH_NOMATCH);
3052     }
3053     }
3054     break;
3055 ph10 182
3056 ph10 178 case OP_HSPACE:
3057     for (i = 1; i <= min; i++)
3058     {
3059     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3060     GETCHARINC(c, eptr);
3061     switch(c)
3062     {
3063     default: RRETURN(MATCH_NOMATCH);
3064     case 0x09: /* HT */
3065     case 0x20: /* SPACE */
3066     case 0xa0: /* NBSP */
3067     case 0x1680: /* OGHAM SPACE MARK */
3068     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3069     case 0x2000: /* EN QUAD */
3070     case 0x2001: /* EM QUAD */
3071     case 0x2002: /* EN SPACE */
3072     case 0x2003: /* EM SPACE */
3073     case 0x2004: /* THREE-PER-EM SPACE */
3074     case 0x2005: /* FOUR-PER-EM SPACE */
3075     case 0x2006: /* SIX-PER-EM SPACE */
3076     case 0x2007: /* FIGURE SPACE */
3077     case 0x2008: /* PUNCTUATION SPACE */
3078     case 0x2009: /* THIN SPACE */
3079     case 0x200A: /* HAIR SPACE */
3080     case 0x202f: /* NARROW NO-BREAK SPACE */
3081     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3082     case 0x3000: /* IDEOGRAPHIC SPACE */
3083     break;
3084     }
3085     }
3086     break;
3087 ph10 182
3088 ph10 178 case OP_NOT_VSPACE:
3089     for (i = 1; i <= min; i++)
3090     {
3091     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3092     GETCHARINC(c, eptr);
3093     switch(c)
3094     {
3095     default: break;
3096     case 0x0a: /* LF */
3097     case 0x0b: /* VT */
3098     case 0x0c: /* FF */
3099     case 0x0d: /* CR */
3100     case 0x85: /* NEL */
3101     case 0x2028: /* LINE SEPARATOR */
3102     case 0x2029: /* PARAGRAPH SEPARATOR */
3103     RRETURN(MATCH_NOMATCH);
3104     }
3105     }
3106     break;
3107 ph10 182
3108 ph10 178 case OP_VSPACE:
3109     for (i = 1; i <= min; i++)
3110     {
3111     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3112     GETCHARINC(c, eptr);
3113     switch(c)
3114     {
3115     default: RRETURN(MATCH_NOMATCH);
3116     case 0x0a: /* LF */
3117     case 0x0b: /* VT */
3118     case 0x0c: /* FF */
3119     case 0x0d: /* CR */
3120     case 0x85: /* NEL */
3121     case 0x2028: /* LINE SEPARATOR */
3122     case 0x2029: /* PARAGRAPH SEPARATOR */
3123 ph10 182 break;
3124 ph10 178 }
3125     }
3126     break;
3127    
3128 nigel 77 case OP_NOT_DIGIT:
3129     for (i = 1; i <= min; i++)
3130     {
3131     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3132     GETCHARINC(c, eptr);
3133     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3134     RRETURN(MATCH_NOMATCH);
3135     }
3136     break;
3137    
3138     case OP_DIGIT:
3139     for (i = 1; i <= min; i++)
3140     {
3141     if (eptr >= md->end_subject ||
3142     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3143     RRETURN(MATCH_NOMATCH);
3144     /* No need to skip more bytes - we know it's a 1-byte character */
3145     }
3146     break;
3147    
3148     case OP_NOT_WHITESPACE:
3149     for (i = 1; i <= min; i++)
3150     {
3151     if (eptr >= md->end_subject ||
3152 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3153 nigel 77 RRETURN(MATCH_NOMATCH);
3154 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3155 nigel 77 }
3156     break;
3157    
3158     case OP_WHITESPACE:
3159     for (i = 1; i <= min; i++)
3160     {
3161     if (eptr >= md->end_subject ||
3162     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3163     RRETURN(MATCH_NOMATCH);
3164     /* No need to skip more bytes - we know it's a 1-byte character */
3165     }
3166     break;
3167    
3168     case OP_NOT_WORDCHAR:
3169     for (i = 1; i <= min; i++)
3170     {
3171     if (eptr >= md->end_subject ||
3172 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3173 nigel 77 RRETURN(MATCH_NOMATCH);
3174 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3175 nigel 77 }
3176     break;
3177    
3178     case OP_WORDCHAR:
3179     for (i = 1; i <= min; i++)
3180     {
3181     if (eptr >= md->end_subject ||
3182     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3183     RRETURN(MATCH_NOMATCH);
3184     /* No need to skip more bytes - we know it's a 1-byte character */
3185     }
3186     break;
3187    
3188     default:
3189     RRETURN(PCRE_ERROR_INTERNAL);
3190     } /* End switch(ctype) */
3191    
3192     else
3193     #endif /* SUPPORT_UTF8 */
3194    
3195     /* Code for the non-UTF-8 case for minimum matching of operators other
3196 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3197     number of bytes present, as this was tested above. */
3198 nigel 77
3199     switch(ctype)
3200     {
3201     case OP_ANY:
3202 ph10 342 for (i = 1; i <= min; i++)
3203 nigel 77 {
3204 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3205     eptr++;
3206 nigel 77 }
3207     break;
3208    
3209 ph10 341 case OP_ALLANY:
3210     eptr += min;
3211     break;
3212    
3213 nigel 77 case OP_ANYBYTE:
3214     eptr += min;
3215     break;
3216    
3217 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
3218     bytes are present in this case. */
3219    
3220     case OP_ANYNL:
3221     for (i = 1; i <= min; i++)
3222     {
3223     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3224     switch(*eptr++)
3225     {
3226     default: RRETURN(MATCH_NOMATCH);
3227     case 0x000d:
3228     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3229     break;
3230     case 0x000a:
3231 ph10 231 break;
3232    
3233 nigel 93 case 0x000b:
3234     case 0x000c:
3235     case 0x0085:
3236 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3237 nigel 93 break;
3238     }
3239     }
3240     break;
3241    
3242 ph10 178 case OP_NOT_HSPACE:
3243     for (i = 1; i <= min; i++)
3244     {
3245     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3246     switch(*eptr++)
3247     {
3248     default: break;
3249     case 0x09: /* HT */
3250     case 0x20: /* SPACE */
3251     case 0xa0: /* NBSP */
3252     RRETURN(MATCH_NOMATCH);
3253     }
3254     }
3255     break;
3256    
3257     case OP_HSPACE:
3258     for (i = 1; i <= min; i++)
3259     {
3260     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3261     switch(*eptr++)
3262     {
3263     default: RRETURN(MATCH_NOMATCH);
3264     case 0x09: /* HT */
3265     case 0x20: /* SPACE */
3266     case 0xa0: /* NBSP */
3267 ph10 182 break;
3268 ph10 178 }
3269     }
3270     break;
3271    
3272     case OP_NOT_VSPACE:
3273     for (i = 1; i <= min; i++)
3274     {
3275     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3276     switch(*eptr++)
3277     {
3278     default: break;
3279     case 0x0a: /* LF */
3280     case 0x0b: /* VT */
3281     case 0x0c: /* FF */
3282     case 0x0d: /* CR */
3283     case 0x85: /* NEL */
3284     RRETURN(MATCH_NOMATCH);
3285     }
3286     }
3287     break;
3288    
3289     case OP_VSPACE:
3290     for (i = 1; i <= min; i++)
3291     {
3292     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3293     switch(*eptr++)
3294     {
3295     default: RRETURN(MATCH_NOMATCH);
3296     case 0x0a: /* LF */
3297     case 0x0b: /* VT */
3298     case 0x0c: /* FF */
3299     case 0x0d: /* CR */
3300     case 0x85: /* NEL */
3301 ph10 182 break;
3302 ph10 178 }
3303     }
3304     break;
3305    
3306 nigel 77 case OP_NOT_DIGIT:
3307     for (i = 1; i <= min; i++)
3308     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3309     break;
3310    
3311     case OP_DIGIT:
3312     for (i = 1; i <= min; i++)
3313     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3314     break;
3315    
3316     case OP_NOT_WHITESPACE:
3317     for (i = 1; i <= min; i++)
3318     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3319     break;
3320    
3321     case OP_WHITESPACE:
3322     for (i = 1; i <= min; i++)
3323     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3324     break;
3325    
3326     case OP_NOT_WORDCHAR:
3327     for (i = 1; i <= min; i++)
3328     if ((md->ctypes[*eptr++] & ctype_word) != 0)
3329     RRETURN(MATCH_NOMATCH);
3330     break;
3331    
3332     case OP_WORDCHAR:
3333     for (i = 1; i <= min; i++)
3334     if ((md->ctypes[*eptr++] & ctype_word) == 0)
3335     RRETURN(MATCH_NOMATCH);
3336     break;
3337    
3338     default:
3339     RRETURN(PCRE_ERROR_INTERNAL);
3340     }
3341     }
3342    
3343     /* If min = max, continue at the same level without recursing */
3344    
3345     if (min == max) continue;
3346    
3347     /* If minimizing, we have to test the rest of the pattern before each
3348     subsequent match. Again, separate the UTF-8 case for speed, and also
3349     separate the UCP cases. */
3350    
3351     if (minimize)
3352     {
3353     #ifdef SUPPORT_UCP
3354 nigel 87 if (prop_type >= 0)
3355 nigel 77 {
3356 nigel 87 switch(prop_type)
3357 nigel 77 {
3358 nigel 87 case PT_ANY:
3359     for (fi = min;; fi++)
3360     {
3361 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3362 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3363     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3364     GETCHARINC(c, eptr);
3365     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3366     }
3367 nigel 93 /* Control never gets here */
3368 nigel 87
3369     case PT_LAMP:
3370     for (fi = min;; fi++)
3371     {
3372 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3373 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3374     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3375     GETCHARINC(c, eptr);
3376 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3377 nigel 87 if ((prop_chartype == ucp_Lu ||
3378     prop_chartype == ucp_Ll ||
3379     prop_chartype == ucp_Lt) == prop_fail_result)
3380     RRETURN(MATCH_NOMATCH);
3381     }
3382 nigel 93 /* Control never gets here */
3383 nigel 87
3384     case PT_GC:
3385     for (fi = min;; fi++)
3386     {
3387 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3388 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3389     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3390     GETCHARINC(c, eptr);
3391 ph10 349 prop_category = UCD_CATEGORY(c);
3392 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3393     RRETURN(MATCH_NOMATCH);
3394     }
3395 nigel 93 /* Control never gets here */
3396 nigel 87
3397     case PT_PC:
3398     for (fi = min;; fi++)
3399     {
3400 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3401 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3402     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3403     GETCHARINC(c, eptr);
3404 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3405 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3406     RRETURN(MATCH_NOMATCH);
3407     }
3408 nigel 93 /* Control never gets here */
3409 nigel 87
3410     case PT_SC:
3411     for (fi = min;; fi++)
3412     {
3413 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3414 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3415     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3416     GETCHARINC(c, eptr);
3417 ph10 349 prop_script = UCD_SCRIPT(c);
3418 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3419     RRETURN(MATCH_NOMATCH);
3420     }
3421 nigel 93 /* Control never gets here */
3422 nigel 87
3423     default:
3424     RRETURN(PCRE_ERROR_INTERNAL);
3425 nigel 77 }
3426     }
3427    
3428     /* Match extended Unicode sequences. We will get here only if the
3429     support is in the binary; otherwise a compile-time error occurs. */
3430    
3431     else if (ctype == OP_EXTUNI)
3432     {
3433     for (fi = min;; fi++)
3434     {
3435 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3436 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3437     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3438     GETCHARINCTEST(c, eptr);
3439 ph10 349 prop_category = UCD_CATEGORY(c);
3440 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3441     while (eptr < md->end_subject)
3442     {
3443     int len = 1;
3444     if (!utf8) c = *eptr; else
3445     {
3446     GETCHARLEN(c, eptr, len);
3447     }
3448 ph10 349 prop_category = UCD_CATEGORY(c);
3449 nigel 77 if (prop_category != ucp_M) break;
3450     eptr += len;
3451     }
3452     }
3453     }
3454    
3455     else
3456     #endif /* SUPPORT_UCP */
3457    
3458     #ifdef SUPPORT_UTF8
3459     /* UTF-8 mode */
3460     if (utf8)
3461     {
3462     for (fi = min;; fi++)
3463     {
3464 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3465 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3466 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3467 ph10 342 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3468 nigel 91 RRETURN(MATCH_NOMATCH);
3469 nigel 77
3470     GETCHARINC(c, eptr);
3471     switch(ctype)
3472     {
3473 ph10 342 case OP_ANY: /* This is the non-NL case */
3474 ph10 345 case OP_ALLANY:
3475 nigel 77 case OP_ANYBYTE:
3476     break;
3477    
3478 nigel 93 case OP_ANYNL:
3479     switch(c)
3480     {
3481     default: RRETURN(MATCH_NOMATCH);
3482     case 0x000d:
3483     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3484     break;
3485     case 0x000a:
3486 ph10 231 break;
3487    
3488 nigel 93 case 0x000b:
3489     case 0x000c:
3490     case 0x0085:
3491     case 0x2028:
3492     case 0x2029:
3493 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3494 nigel 93 break;
3495     }
3496     break;
3497    
3498 ph10 178 case OP_NOT_HSPACE:
3499     switch(c)
3500     {
3501     default: break;
3502     case 0x09: /* HT */
3503     case 0x20: /* SPACE */
3504     case 0xa0: /* NBSP */
3505     case 0x1680: /* OGHAM SPACE MARK */
3506     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3507     case 0x2000: /* EN QUAD */
3508     case 0x2001: /* EM QUAD */
3509     case 0x2002: /* EN SPACE */
3510     case 0x2003: /* EM SPACE */
3511     case 0x2004: /* THREE-PER-EM SPACE */
3512     case 0x2005: /* FOUR-PER-EM SPACE */
3513     case 0x2006: /* SIX-PER-EM SPACE */
3514     case 0x2007: /* FIGURE SPACE */
3515     case 0x2008: /* PUNCTUATION SPACE */
3516     case 0x2009: /* THIN SPACE */
3517     case 0x200A: /* HAIR SPACE */
3518     case 0x202f: /* NARROW NO-BREAK SPACE */
3519     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3520     case 0x3000: /* IDEOGRAPHIC SPACE */
3521     RRETURN(MATCH_NOMATCH);
3522     }
3523     break;
3524    
3525     case OP_HSPACE:
3526     switch(c)
3527     {
3528     default: RRETURN(MATCH_NOMATCH);
3529     case 0x09: /* HT */
3530     case 0x20: /* SPACE */
3531     case 0xa0: /* NBSP */
3532     case 0x1680: /* OGHAM SPACE MARK */
3533     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3534     case 0x2000: /* EN QUAD */
3535     case 0x2001: /* EM QUAD */
3536     case 0x2002: /* EN SPACE */
3537     case 0x2003: /* EM SPACE */
3538     case 0x2004: /* THREE-PER-EM SPACE */
3539     case 0x2005: /* FOUR-PER-EM SPACE */
3540     case 0x2006: /* SIX-PER-EM SPACE */
3541     case 0x2007: /* FIGURE SPACE */
3542     case 0x2008: /* PUNCTUATION SPACE */
3543     case 0x2009: /* THIN SPACE */
3544     case 0x200A: /* HAIR SPACE */
3545     case 0x202f: /* NARROW NO-BREAK SPACE */
3546     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3547     case 0x3000: /* IDEOGRAPHIC SPACE */
3548     break;
3549     }
3550     break;
3551    
3552     case OP_NOT_VSPACE:
3553     switch(c)
3554     {
3555     default: break;
3556     case 0x0a: /* LF */
3557     case 0x0b: /* VT */
3558     case 0x0c: /* FF */
3559     case 0x0d: /* CR */
3560     case 0x85: /* NEL */
3561     case 0x2028: /* LINE SEPARATOR */
3562     case 0x2029: /* PARAGRAPH SEPARATOR */
3563     RRETURN(MATCH_NOMATCH);
3564     }
3565     break;
3566    
3567     case OP_VSPACE:
3568     switch(c)
3569     {
3570     default: RRETURN(MATCH_NOMATCH);
3571     case 0x0a: /* LF */
3572     case 0x0b: /* VT */
3573     case 0x0c: /* FF */
3574     case 0x0d: /* CR */
3575     case 0x85: /* NEL */
3576     case 0x2028: /* LINE SEPARATOR */
3577     case 0x2029: /* PARAGRAPH SEPARATOR */
3578     break;
3579     }
3580     break;
3581    
3582 nigel 77 case OP_NOT_DIGIT:
3583     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3584     RRETURN(MATCH_NOMATCH);
3585     break;
3586    
3587     case OP_DIGIT:
3588     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3589     RRETURN(MATCH_NOMATCH);
3590     break;
3591    
3592     case OP_NOT_WHITESPACE:
3593     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3594     RRETURN(MATCH_NOMATCH);
3595     break;
3596    
3597     case OP_WHITESPACE:
3598     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3599     RRETURN(MATCH_NOMATCH);
3600     break;
3601    
3602     case OP_NOT_WORDCHAR:
3603     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3604     RRETURN(MATCH_NOMATCH);
3605     break;
3606    
3607     case OP_WORDCHAR:
3608     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3609     RRETURN(MATCH_NOMATCH);
3610     break;
3611    
3612     default:
3613     RRETURN(PCRE_ERROR_INTERNAL);
3614     }
3615     }
3616     }
3617     else
3618     #endif
3619     /* Not UTF-8 mode */
3620     {
3621     for (fi = min;; fi++)
3622     {
3623 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3624 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3625 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3626 ph10 342 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3627 nigel 91 RRETURN(MATCH_NOMATCH);
3628    
3629 nigel 77 c = *eptr++;
3630     switch(ctype)
3631     {
3632 ph10 342 case OP_ANY: /* This is the non-NL case */
3633 ph10 345 case OP_ALLANY:
3634 nigel 77 case OP_ANYBYTE:
3635     break;
3636    
3637 nigel 93 case OP_ANYNL:
3638     switch(c)
3639     {
3640     default: RRETURN(MATCH_NOMATCH);
3641     case 0x000d:
3642     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3643     break;
3644 ph10 231
3645 nigel 93 case 0x000a:
3646 ph10 231 break;
3647    
3648 nigel 93 case 0x000b:
3649     case 0x000c:
3650     case 0x0085:
3651 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3652 nigel 93 break;
3653     }
3654     break;
3655    
3656 ph10 178 case OP_NOT_HSPACE:
3657     switch(c)
3658     {
3659     default: break;
3660     case 0x09: /* HT */
3661     case 0x20: /* SPACE */
3662     case 0xa0: /* NBSP */
3663     RRETURN(MATCH_NOMATCH);
3664     }
3665     break;
3666    
3667     case OP_HSPACE:
3668     switch(c)
3669     {
3670     default: RRETURN(MATCH_NOMATCH);
3671     case 0x09: /* HT */
3672     case 0x20: /* SPACE */
3673     case 0xa0: /* NBSP */
3674     break;
3675     }
3676     break;
3677    
3678     case OP_NOT_VSPACE:
3679     switch(c)
3680     {
3681     default: break;
3682     case 0x0a: /* LF */
3683     case 0x0b: /* VT */
3684     case 0x0c: /* FF */
3685     case 0x0d: /* CR */
3686     case 0x85: /* NEL */
3687     RRETURN(MATCH_NOMATCH);
3688     }
3689     break;
3690    
3691     case OP_VSPACE:
3692     switch(c)
3693     {
3694     default: RRETURN(MATCH_NOMATCH);
3695     case 0x0a: /* LF */
3696     case 0x0b: /* VT */
3697     case 0x0c: /* FF */
3698     case 0x0d: /* CR */
3699     case 0x85: /* NEL */
3700     break;
3701     }
3702     break;
3703    
3704 nigel 77 case OP_NOT_DIGIT:
3705     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3706     break;
3707    
3708     case OP_DIGIT:
3709     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3710     break;
3711    
3712     case OP_NOT_WHITESPACE:
3713     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3714     break;
3715    
3716     case OP_WHITESPACE:
3717     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3718     break;
3719    
3720     case OP_NOT_WORDCHAR:
3721     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3722     break;
3723    
3724     case OP_WORDCHAR:
3725     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3726     break;
3727    
3728     default:
3729     RRETURN(PCRE_ERROR_INTERNAL);
3730     }
3731     }
3732     }
3733     /* Control never gets here */
3734     }
3735    
3736 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3737 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3738     UTF-8 and UCP stuff separate. */
3739    
3740     else
3741     {
3742     pp = eptr; /* Remember where we started */
3743    
3744     #ifdef SUPPORT_UCP
3745 nigel 87 if (prop_type >= 0)
3746 nigel 77 {
3747 nigel 87 switch(prop_type)
3748 nigel 77 {
3749 nigel 87 case PT_ANY:
3750     for (i = min; i < max; i++)
3751     {
3752     int len = 1;
3753     if (eptr >= md->end_subject) break;
3754     GETCHARLEN(c, eptr, len);
3755     if (prop_fail_result) break;
3756     eptr+= len;
3757     }
3758     break;
3759    
3760     case PT_LAMP:
3761     for (i = min; i < max; i++)
3762     {
3763     int len = 1;
3764     if (eptr >= md->end_subject) break;
3765     GETCHARLEN(c, eptr, len);
3766 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3767 nigel 87 if ((prop_chartype == ucp_Lu ||
3768     prop_chartype == ucp_Ll ||
3769     prop_chartype == ucp_Lt) == prop_fail_result)
3770     break;
3771     eptr+= len;
3772     }
3773     break;
3774    
3775     case PT_GC:
3776     for (i = min; i < max; i++)
3777     {
3778     int len = 1;
3779     if (eptr >= md->end_subject) break;
3780     GETCHARLEN(c, eptr, len);
3781 ph10 349 prop_category = UCD_CATEGORY(c);
3782 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3783     break;
3784     eptr+= len;
3785     }
3786     break;
3787    
3788     case PT_PC:
3789     for (i = min; i < max; i++)
3790     {
3791     int len = 1;
3792     if (eptr >= md->end_subject) break;
3793     GETCHARLEN(c, eptr, len);
3794 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3795 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3796     break;
3797     eptr+= len;
3798     }
3799     break;
3800    
3801     case PT_SC:
3802     for (i = min; i < max; i++)
3803     {
3804     int len = 1;
3805     if (eptr >= md->end_subject) break;
3806     GETCHARLEN(c, eptr, len);
3807 ph10 349 prop_script = UCD_SCRIPT(c);
3808 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3809     break;
3810     eptr+= len;
3811     }
3812     break;
3813 nigel 77 }
3814    
3815     /* eptr is now past the end of the maximum run */
3816    
3817 nigel 93 if (possessive) continue;
3818 nigel 77 for(;;)
3819     {
3820 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3821 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3822     if (eptr-- == pp) break; /* Stop if tried at original pos */
3823 ph10 207 if (utf8) BACKCHAR(eptr);
3824 nigel 77 }
3825     }
3826    
3827     /* Match extended Unicode sequences. We will get here only if the
3828     support is in the binary; otherwise a compile-time error occurs. */
3829    
3830     else if (ctype == OP_EXTUNI)
3831     {
3832     for (i = min; i < max; i++)
3833     {
3834     if (eptr >= md->end_subject) break;
3835     GETCHARINCTEST(c, eptr);
3836 ph10 349 prop_category = UCD_CATEGORY(c);
3837 nigel 77 if (prop_category == ucp_M) break;
3838     while (eptr < md->end_subject)
3839     {
3840     int len = 1;
3841     if (!utf8) c = *eptr; else
3842     {
3843     GETCHARLEN(c, eptr, len);
3844     }
3845 ph10 349 prop_category = UCD_CATEGORY(c);
3846 nigel 77 if (prop_category != ucp_M) break;
3847     eptr += len;
3848     }
3849     }
3850    
3851     /* eptr is now past the end of the maximum run */
3852    
3853 nigel 93 if (possessive) continue;
3854 nigel 77 for(;;)
3855     {
3856 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3857 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3858     if (eptr-- == pp) break; /* Stop if tried at original pos */
3859     for (;;) /* Move back over one extended */
3860     {
3861     int len = 1;
3862     if (!utf8) c = *eptr; else
3863     {
3864 ph10 207 BACKCHAR(eptr);
3865 nigel 77 GETCHARLEN(c, eptr, len);
3866     }
3867 ph10 349 prop_category = UCD_CATEGORY(c);
3868 nigel 77 if (prop_category != ucp_M) break;
3869     eptr--;
3870     }
3871     }
3872     }
3873    
3874     else
3875     #endif /* SUPPORT_UCP */
3876    
3877     #ifdef SUPPORT_UTF8
3878     /* UTF-8 mode */
3879    
3880     if (utf8)
3881     {
3882     switch(ctype)
3883     {
3884     case OP_ANY:
3885     if (max < INT_MAX)
3886     {
3887 ph10 342 for (i = min; i < max; i++)
3888 nigel 77 {
3889 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3890     eptr++;
3891     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3892 nigel 77 }
3893     }
3894    
3895     /* Handle unlimited UTF-8 repeat */
3896    
3897     else
3898     {
3899 ph10 342 for (i = min; i < max; i++)
3900 nigel 77 {
3901 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3902     eptr++;
3903     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3904 nigel 77 }
3905     }
3906     break;
3907    
3908 ph10 341 case OP_ALLANY:
3909     if (max < INT_MAX)
3910     {
3911     for (i = min; i < max; i++)
3912     {
3913     if (eptr >= md->end_subject) break;
3914     eptr++;
3915     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3916     }
3917     }
3918     else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3919     break;
3920    
3921 nigel 77 /* The byte case is the same as non-UTF8 */
3922    
3923     case OP_ANYBYTE:
3924     c = max - min;
3925 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3926     c = md->end_subject - eptr;
3927 nigel 77 eptr += c;
3928     break;
3929    
3930 nigel 93 case OP_ANYNL:
3931     for (i = min; i < max; i++)
3932     {
3933     int len = 1;
3934     if (eptr >= md->end_subject) break;
3935     GETCHARLEN(c, eptr, len);
3936     if (c == 0x000d)
3937     {
3938     if (++eptr >= md->end_subject) break;
3939     if (*eptr == 0x000a) eptr++;
3940     }
3941     else
3942     {
3943 ph10 231 if (c != 0x000a &&
3944     (md->bsr_anycrlf ||
3945     (c != 0x000b && c != 0x000c &&
3946     c != 0x0085 && c != 0x2028 && c != 0x2029)))
3947 nigel 93 break;
3948     eptr += len;
3949     }
3950     }
3951     break;
3952    
3953 ph10 178 case OP_NOT_HSPACE:
3954 ph10 182 case OP_HSPACE:
3955 ph10 178 for (i = min; i < max; i++)
3956     {
3957 ph10 182 BOOL gotspace;
3958 ph10 178 int len = 1;
3959     if (eptr >= md->end_subject) break;
3960     GETCHARLEN(c, eptr, len);
3961     switch(c)
3962 ph10 182 {
3963     default: gotspace = FALSE; break;
3964 ph10 178 case 0x09: /* HT */
3965     case 0x20: /* SPACE */
3966     case 0xa0: /* NBSP */
3967     case 0x1680: /* OGHAM SPACE MARK */
3968     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3969     case 0x2000: /* EN QUAD */
3970     case 0x2001: /* EM QUAD */
3971     case 0x2002: /* EN SPACE */
3972     case 0x2003: /* EM SPACE */
3973     case 0x2004: /* THREE-PER-EM SPACE */
3974     case 0x2005: /* FOUR-PER-EM SPACE */
3975     case 0x2006: /* SIX-PER-EM SPACE */
3976     case 0x2007: /* FIGURE SPACE */
3977     case 0x2008: /* PUNCTUATION SPACE */
3978     case 0x2009: /* THIN SPACE */
3979     case 0x200A: /* HAIR SPACE */
3980     case 0x202f: /* NARROW NO-BREAK SPACE */
3981     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3982     case 0x3000: /* IDEOGRAPHIC SPACE */
3983     gotspace = TRUE;
3984 ph10 182 break;
3985 ph10 178 }
3986     if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3987     eptr += len;
3988     }
3989     break;
3990    
3991     case OP_NOT_VSPACE:
3992 ph10 182 case OP_VSPACE:
3993 ph10 178 for (i = min; i < max; i++)
3994     {
3995 ph10 182 BOOL gotspace;
3996 ph10 178 int len = 1;
3997     if (eptr >= md->end_subject) break;
3998     GETCHARLEN(c, eptr, len);
3999     switch(c)
4000     {
4001 ph10 182 default: gotspace = FALSE; break;
4002 ph10 178 case 0x0a: /* LF */
4003     case 0x0b: /* VT */
4004     case 0x0c: /* FF */
4005     case 0x0d: /* CR */
4006     case 0x85: /* NEL */
4007     case 0x2028: /* LINE SEPARATOR */
4008     case 0x2029: /* PARAGRAPH SEPARATOR */
4009     gotspace = TRUE;
4010     break;
4011     }
4012 ph10 182 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4013 ph10 178 eptr += len;
4014     }
4015     break;
4016    
4017 nigel 77 case OP_NOT_DIGIT:
4018     for (i = min; i < max; i++)
4019     {
4020     int len = 1;
4021     if (eptr >= md->end_subject) break;
4022     GETCHARLEN(c, eptr, len);
4023     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4024     eptr+= len;
4025     }
4026     break;
4027    
4028     case OP_DIGIT:
4029     for (i = min; i < max; i++)
4030     {
4031     int len = 1;
4032     if (eptr >= md->end_subject) break;
4033     GETCHARLEN(c, eptr, len);
4034     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4035     eptr+= len;
4036     }
4037     break;
4038    
4039     case OP_NOT_WHITESPACE:
4040     for (i = min; i < max; i++)
4041     {
4042     int len = 1;
4043     if (eptr >= md->end_subject) break;
4044     GETCHARLEN(c, eptr, len);
4045     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4046     eptr+= len;
4047     }
4048     break;
4049    
4050     case OP_WHITESPACE:
4051     for (i = min; i < max; i++)
4052     {
4053     int len = 1;
4054     if (eptr >= md->end_subject) break;
4055     GETCHARLEN(c, eptr, len);
4056     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4057     eptr+= len;
4058     }
4059     break;
4060    
4061     case OP_NOT_WORDCHAR:
4062     for (i = min; i < max; i++)
4063     {
4064     int len = 1;
4065     if (eptr >= md->end_subject) break;
4066     GETCHARLEN(c, eptr, len);
4067     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4068     eptr+= len;
4069     }
4070     break;
4071    
4072     case OP_WORDCHAR:
4073     for (i = min; i < max; i++)
4074     {
4075     int len = 1;
4076     if (eptr >= md->end_subject) break;
4077     GETCHARLEN(c, eptr, len);
4078     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4079     eptr+= len;
4080     }
4081     break;
4082    
4083     default:
4084     RRETURN(PCRE_ERROR_INTERNAL);
4085     }
4086    
4087     /* eptr is now past the end of the maximum run */
4088    
4089 nigel 93 if (possessive) continue;
4090 nigel 77 for(;;)
4091     {
4092 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4093 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4094     if (eptr-- == pp) break; /* Stop if tried at original pos */
4095     BACKCHAR(eptr);
4096     }
4097     }
4098     else
4099 ph10 207 #endif /* SUPPORT_UTF8 */
4100 nigel 77
4101     /* Not UTF-8 mode */
4102     {
4103     switch(ctype)
4104     {
4105     case OP_ANY:
4106 ph10 342 for (i = min; i < max; i++)
4107 nigel 77 {
4108 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4109     eptr++;
4110 nigel 77 }
4111 ph10 342 break;
4112 nigel 77
4113 ph10 341 case OP_ALLANY:
4114 nigel 77 case OP_ANYBYTE:
4115     c = max - min;
4116 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
4117     c = md->end_subject - eptr;
4118 nigel 77 eptr += c;
4119     break;
4120    
4121 nigel 93 case OP_ANYNL:
4122     for (i = min; i < max; i++)
4123     {
4124     if (eptr >= md->end_subject) break;
4125     c = *eptr;
4126     if (c == 0x000d)
4127     {
4128     if (++eptr >= md->end_subject) break;
4129     if (*eptr == 0x000a) eptr++;
4130     }
4131     else
4132     {
4133 ph10 231 if (c != 0x000a &&
4134     (md->bsr_anycrlf ||
4135     (c != 0x000b && c != 0x000c && c != 0x0085)))
4136 nigel 93 break;
4137     eptr++;
4138     }
4139     }
4140     break;
4141    
4142 ph10 178 case OP_NOT_HSPACE:
4143     for (i = min; i < max; i++)
4144     {
4145     if (eptr >= md->end_subject) break;
4146     c = *eptr;
4147     if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4148 ph10 182 eptr++;
4149 ph10 178 }
4150     break;