/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 371 - (hide annotations) (download)
Mon Aug 25 18:28:05 2008 UTC (6 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 151090 byte(s)
Source tidies for 7.8-RC1 

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 305 Copyright (c) 1997-2008 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 nigel 77 /* Flag bits for the match() function */
61    
62 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
63     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64 nigel 77
65     /* Non-error returns from the match() function. Error returns are externally
66     defined PCRE_ERROR_xxx codes, which are all negative. */
67    
68     #define MATCH_MATCH 1
69     #define MATCH_NOMATCH 0
70    
71 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
72 ph10 210 negative to avoid the external error codes. */
73    
74     #define MATCH_COMMIT (-999)
75     #define MATCH_PRUNE (-998)
76     #define MATCH_SKIP (-997)
77     #define MATCH_THEN (-996)
78    
79 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
80     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81     because the offset vector is always a multiple of 3 long. */
82    
83     #define REC_STACK_SAVE_MAX 30
84    
85     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86    
87     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
88     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89    
90    
91    
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Write a run of bytes to stdout in a readable form: bytes for which isprint()
is true are emitted as themselves, and every other byte is emitted as a \xhh
escape. When the bytes come from the subject string, the run is clipped so that
it stops at md->end_subject.

Arguments:
  p           points to the first byte to print
  length      number of bytes requested
  is_subject  TRUE if p points into the subject, so the count must be clipped
  md          pointer to the match data block; it is consulted only when
                is_subject is TRUE

Returns:      nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
unsigned int ch;

/* Never run past the end of the subject when printing from within it. */

if (is_subject)
  {
  if (length > md->end_subject - p) length = md->end_subject - p;
  }

for (; length > 0; length--)
  {
  ch = *p++;
  if (isprint(ch))
    printf("%c", ch);
  else
    printf("\\x%02x", ch);
  }
}
#endif
118    
119    
120    
121     /*************************************************
122     * Match a back-reference *
123     *************************************************/
124    
125     /* If a back reference hasn't been set, the length that is passed is greater
126     than the number of characters left in the string, so the match fails.
127    
128     Arguments:
129     offset index into the offset vector
130     eptr points into the subject
131     length length to be matched
132     md points to match data block
133     ims the ims flags
134    
135     Returns: TRUE if matched
136     */
137    
138     static BOOL
139 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 nigel 77 unsigned long int ims)
141     {
142 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
143 nigel 77
144     #ifdef DEBUG
145     if (eptr >= md->end_subject)
146     printf("matching subject <null>");
147     else
148     {
149     printf("matching subject ");
150     pchars(eptr, length, TRUE, md);
151     }
152     printf(" against backref ");
153     pchars(p, length, FALSE, md);
154     printf("\n");
155     #endif
156    
157     /* Always fail if not enough characters left */
158    
159     if (length > md->end_subject - eptr) return FALSE;
160    
161 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162     properly if Unicode properties are supported. Otherwise, we can check only
163     ASCII characters. */
164 nigel 77
165     if ((ims & PCRE_CASELESS) != 0)
166     {
167 ph10 354 #ifdef SUPPORT_UTF8
168     #ifdef SUPPORT_UCP
169     if (md->utf8)
170     {
171 ph10 358 USPTR endptr = eptr + length;
172 ph10 354 while (eptr < endptr)
173     {
174 ph10 358 int c, d;
175 ph10 354 GETCHARINC(c, eptr);
176     GETCHARINC(d, p);
177     if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 ph10 358 }
179     }
180 ph10 354 else
181     #endif
182     #endif
183    
184     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185     is no UCP support. */
186 ph10 358
187 nigel 77 while (length-- > 0)
188 ph10 354 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 nigel 77 }
190 ph10 358
191 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
192     are in UTF-8 mode. */
193 ph10 358
194 nigel 77 else
195     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196    
197     return TRUE;
198     }
199    
200    
201    
202     /***************************************************************************
203     ****************************************************************************
204     RECURSION IN THE match() FUNCTION
205    
206 nigel 87 The match() function is highly recursive, though not every recursive call
207     increases the recursive depth. Nevertheless, some regular expressions can cause
208     it to recurse to a great depth. I was writing for Unix, so I just let it call
209     itself recursively. This uses the stack for saving everything that has to be
210     saved for a recursive call. On Unix, the stack can be large, and this works
211     fine.
212 nigel 77
213 nigel 87 It turns out that on some non-Unix-like systems there are problems with
214     programs that use a lot of stack. (This despite the fact that every last chip
215     has oodles of memory these days, and techniques for extending the stack have
216     been known for decades.) So....
217 nigel 77
218     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219     calls by keeping local variables that need to be preserved in blocks of memory
220 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
221 nigel 77 achieve this so that the actual code doesn't look very different to what it
222     always used to.
223 ph10 164
224 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
225 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
226     Switzer, the use of longjmp() has been abolished, at the cost of having to
227     provide a unique number for each call to RMATCH. There is no way of generating
228     a sequence of numbers at compile time in C. I have given them names, to make
229     them stand out more clearly.
230    
231     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
234     don't have indeterminate values; this has meant that the frame size can be
235 ph10 164 reduced because the result can be "passed back" by straight setting of the
236     variable instead of being passed in the frame.
237 nigel 77 ****************************************************************************
238     ***************************************************************************/
239    
240 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241     below must be updated in sync. */
242 nigel 77
243 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
244     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
245     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
246     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
247 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
248 ph10 212 RM51, RM52, RM53, RM54 };
249 ph10 164
250 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
251 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 ph10 164 actually used in this definition. */
253 nigel 77
254     #ifndef NO_RECURSE
255     #define REGISTER register
256 ph10 164
/* Debugging form. RMATCH makes a genuine recursive call to match() and stores
the result in the caller's "rrc" variable. The "mstart" and "rdepth" values
are not macro arguments: they are picked up from the scope in which the macro
is expanded, and the depth passed on is rdepth+1. The "rw" argument is ignored.
Both macros also print the source line number via __LINE__. */
257 nigel 87 #ifdef DEBUG
258 ph10 164 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
259 nigel 87 { \
260     printf("match() called in line %d\n", __LINE__); \
261 ph10 168 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
262 nigel 87 printf("to line %d\n", __LINE__); \
263     }
264     #define RRETURN(ra) \
265     { \
266     printf("match() returned %d from line %d ", ra, __LINE__); \
267     return ra; \
268     }
/* Production form: the same call and return, without the tracing output. */
269     #else
270 ph10 164 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
271 ph10 168 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
272 nigel 77 #define RRETURN(ra) return ra
273 nigel 87 #endif
274    
275 nigel 77 #else
276    
277    
278 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
279     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280     argument of match(), which never changes. */
281 nigel 77
282     #define REGISTER
283    
/* Heap-frame RMATCH: emulate a recursive call to match() without using the C
stack. A new heapframe is obtained from pcre_stack_malloc(), the resume point
"rw" (one of the RMn values) is recorded in the current frame, the "argument"
fields of the new frame are filled in, and control jumps to HEAP_RECURSE with
the new frame made current. The label L_<rw> that the macro plants is where
execution continues after the emulated call comes back.

Arguments:
  ra   new eptr               rb   new ecode
  rc   new offset_top         rd   md (not used here)
  re   new ims                rf   new eptrb
  rg   new flags              rw   resume-point number (RM1, RM2, ...)

The new frame's mstart is copied from the current frame and its rdepth is the
current rdepth plus one.

If the frame cannot be allocated, no new frame is entered: the current frame is
abandoned through RRETURN with PCRE_ERROR_NOMEMORY, so the failure travels back
to the caller in the same way as any other negative return instead of a NULL
pointer being written through. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* Heap-frame RRETURN: emulate "return ra". The current frame is released with
pcre_stack_free() and its predecessor becomes current. If there is a
predecessor, the value is handed over in "rrc" and control goes to HEAP_RETURN;
otherwise the frame just released was the top-level one (its Xprevframe was
NULL) and match() really returns. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }


316    
317    
318     /* Structure for remembering the local variables in a private frame */
319    
320     typedef struct heapframe {
321     struct heapframe *Xprevframe;
322    
323     /* Function arguments that may change */
324    
325     const uschar *Xeptr;
326     const uschar *Xecode;
327 ph10 172 const uschar *Xmstart;
328 nigel 77 int Xoffset_top;
329     long int Xims;
330     eptrblock *Xeptrb;
331     int Xflags;
332 nigel 91 unsigned int Xrdepth;
333 nigel 77
334     /* Function local variables */
335    
336     const uschar *Xcallpat;
337     const uschar *Xcharptr;
338     const uschar *Xdata;
339     const uschar *Xnext;
340     const uschar *Xpp;
341     const uschar *Xprev;
342     const uschar *Xsaved_eptr;
343    
344     recursion_info Xnew_recursive;
345    
346     BOOL Xcur_is_word;
347     BOOL Xcondition;
348     BOOL Xprev_is_word;
349    
350     unsigned long int Xoriginal_ims;
351    
352     #ifdef SUPPORT_UCP
353     int Xprop_type;
354 nigel 87 int Xprop_value;
355 nigel 77 int Xprop_fail_result;
356     int Xprop_category;
357     int Xprop_chartype;
358 nigel 87 int Xprop_script;
359 ph10 123 int Xoclength;
360     uschar Xocchars[8];
361 nigel 77 #endif
362    
363     int Xctype;
364 nigel 93 unsigned int Xfc;
365 nigel 77 int Xfi;
366     int Xlength;
367     int Xmax;
368     int Xmin;
369     int Xnumber;
370     int Xoffset;
371     int Xop;
372     int Xsave_capture_last;
373     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
374     int Xstacksave[REC_STACK_SAVE_MAX];
375    
376     eptrblock Xnewptrb;
377    
378 ph10 164 /* Where to jump back to */
379 nigel 77
380 ph10 164 int Xwhere;
381 ph10 165
382 nigel 77 } heapframe;
383    
384     #endif
385    
386    
387     /***************************************************************************
388     ***************************************************************************/
389    
390    
391    
392     /*************************************************
393     * Match from current position *
394     *************************************************/
395    
396 nigel 93 /* This function is called recursively in many circumstances. Whenever it
397 nigel 77 returns a negative (error) response, the outer incarnation must also return the
398     same response.
399    
400     Performance note: It might be tempting to extract commonly used fields from the
401     md structure (e.g. utf8, end_subject) into individual variables to improve
402     performance. Tests using gcc on a SPARC disproved this; in the first case, it
403     made performance worse.
404    
405     Arguments:
406 nigel 93 eptr pointer to current character in subject
407     ecode pointer to current position in compiled code
408 ph10 168 mstart pointer to the current match start position (can be modified
409 ph10 172 by encountering \K)
410 nigel 77 offset_top current top pointer
411     md pointer to "static" info for the match
412     ims current /i, /m, and /s options
413     eptrb pointer to chain of blocks containing eptr at start of
414     brackets - for testing for empty matches
415     flags can contain
416     match_condassert - this is an assertion condition
417 nigel 93 match_cbegroup - this is the start of an unlimited repeat
418     group that can match an empty string
419 nigel 87 rdepth the recursion depth
420 nigel 77
421     Returns: MATCH_MATCH if matched ) these values are >= 0
422     MATCH_NOMATCH if failed to match )
423     a negative PCRE_ERROR_xxx value if aborted by an error condition
424 nigel 87 (e.g. stopped by repeated call or recursion limit)
425 nigel 77 */
426    
427     static int
428 ph10 172 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
429 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
430 nigel 91 int flags, unsigned int rdepth)
431 nigel 77 {
432     /* These variables do not need to be preserved over recursion in this function,
433 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
434     "register" because they are used a lot in loops. */
435 nigel 77
436 nigel 91 register int rrc; /* Returns from recursive calls */
437     register int i; /* Used for loops not involving calls to RMATCH() */
438 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
439 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
440 nigel 77
441 nigel 93 BOOL minimize, possessive; /* Quantifier options */
442    
443 nigel 77 /* When recursion is not being used, all "local" variables that have to be
444     preserved over calls to RMATCH() are part of a "frame" which is obtained from
445     heap storage. Set up the top-level frame here; others are obtained from the
446     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
447    
448     #ifdef NO_RECURSE
449     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
450     frame->Xprevframe = NULL; /* Marks the top level */
451    
452     /* Copy in the original argument variables */
453    
454     frame->Xeptr = eptr;
455     frame->Xecode = ecode;
456 ph10 168 frame->Xmstart = mstart;
457 nigel 77 frame->Xoffset_top = offset_top;
458     frame->Xims = ims;
459     frame->Xeptrb = eptrb;
460     frame->Xflags = flags;
461 nigel 87 frame->Xrdepth = rdepth;
462 nigel 77
463     /* This is where control jumps back to to effect "recursion" */
464    
465     HEAP_RECURSE:
466    
467     /* Macros make the argument variables come from the current frame */
468    
469     #define eptr frame->Xeptr
470     #define ecode frame->Xecode
471 ph10 168 #define mstart frame->Xmstart
472 nigel 77 #define offset_top frame->Xoffset_top
473     #define ims frame->Xims
474     #define eptrb frame->Xeptrb
475     #define flags frame->Xflags
476 nigel 87 #define rdepth frame->Xrdepth
477 nigel 77
478     /* Ditto for the local variables */
479    
480     #ifdef SUPPORT_UTF8
481     #define charptr frame->Xcharptr
482     #endif
483     #define callpat frame->Xcallpat
484     #define data frame->Xdata
485     #define next frame->Xnext
486     #define pp frame->Xpp
487     #define prev frame->Xprev
488     #define saved_eptr frame->Xsaved_eptr
489    
490     #define new_recursive frame->Xnew_recursive
491    
492     #define cur_is_word frame->Xcur_is_word
493     #define condition frame->Xcondition
494     #define prev_is_word frame->Xprev_is_word
495    
496     #define original_ims frame->Xoriginal_ims
497    
498     #ifdef SUPPORT_UCP
499     #define prop_type frame->Xprop_type
500 nigel 87 #define prop_value frame->Xprop_value
501 nigel 77 #define prop_fail_result frame->Xprop_fail_result
502     #define prop_category frame->Xprop_category
503     #define prop_chartype frame->Xprop_chartype
504 nigel 87 #define prop_script frame->Xprop_script
505 ph10 115 #define oclength frame->Xoclength
506     #define occhars frame->Xocchars
507 nigel 77 #endif
508    
509     #define ctype frame->Xctype
510     #define fc frame->Xfc
511     #define fi frame->Xfi
512     #define length frame->Xlength
513     #define max frame->Xmax
514     #define min frame->Xmin
515     #define number frame->Xnumber
516     #define offset frame->Xoffset
517     #define op frame->Xop
518     #define save_capture_last frame->Xsave_capture_last
519     #define save_offset1 frame->Xsave_offset1
520     #define save_offset2 frame->Xsave_offset2
521     #define save_offset3 frame->Xsave_offset3
522     #define stacksave frame->Xstacksave
523    
524     #define newptrb frame->Xnewptrb
525    
526     /* When recursion is being used, local variables are allocated on the stack and
527     get preserved during recursion in the normal way. In this environment, fi and
528     i, and fc and c, can be the same variables. */
529    
530 nigel 93 #else /* NO_RECURSE not defined */
531 nigel 77 #define fi i
532     #define fc c
533    
534    
535 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
536     const uschar *charptr; /* in small blocks of the code. My normal */
537     #endif /* style of coding would have declared */
538     const uschar *callpat; /* them within each of those blocks. */
539     const uschar *data; /* However, in order to accommodate the */
540     const uschar *next; /* version of this code that uses an */
541     USPTR pp; /* external "stack" implemented on the */
542     const uschar *prev; /* heap, it is easier to declare them all */
543     USPTR saved_eptr; /* here, so the declarations can be cut */
544     /* out in a block. The only declarations */
545     recursion_info new_recursive; /* within blocks below are for variables */
546     /* that do not have to be preserved over */
547     BOOL cur_is_word; /* a recursive call to RMATCH(). */
548     BOOL condition;
549 nigel 77 BOOL prev_is_word;
550    
551     unsigned long int original_ims;
552    
553     #ifdef SUPPORT_UCP
554     int prop_type;
555 nigel 87 int prop_value;
556 nigel 77 int prop_fail_result;
557     int prop_category;
558     int prop_chartype;
559 nigel 87 int prop_script;
560 ph10 115 int oclength;
561     uschar occhars[8];
562 nigel 77 #endif
563    
564     int ctype;
565     int length;
566     int max;
567     int min;
568     int number;
569     int offset;
570     int op;
571     int save_capture_last;
572     int save_offset1, save_offset2, save_offset3;
573     int stacksave[REC_STACK_SAVE_MAX];
574    
575     eptrblock newptrb;
576 nigel 93 #endif /* NO_RECURSE */
577 nigel 77
578     /* These statements are here to stop the compiler complaining about unitialized
579     variables. */
580    
581     #ifdef SUPPORT_UCP
582 nigel 87 prop_value = 0;
583 nigel 77 prop_fail_result = 0;
584     #endif
585    
586 nigel 93
587 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
588     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
589     used. Thanks to Ian Taylor for noticing this possibility and sending the
590     original patch. */
591    
592     TAIL_RECURSE:
593    
594 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
595     are specified by the macro RMATCH and RRETURN is used to return. When
596     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
597     and a "return", respectively (possibly with some debugging if DEBUG is
598     defined). However, RMATCH isn't like a function call because it's quite a
599     complicated macro. It has to be used in one particular way. This shouldn't,
600     however, impact performance when true recursion is being used. */
601 nigel 77
602 ph10 164 #ifdef SUPPORT_UTF8
603     utf8 = md->utf8; /* Local copy of the flag */
604     #else
605     utf8 = FALSE;
606     #endif
607    
608 nigel 87 /* First check that we haven't called match() too many times, or that we
609     haven't exceeded the recursive call limit. */
610    
611 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
612 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
613 nigel 77
614     original_ims = ims; /* Save for resetting on ')' */
615 nigel 91
616 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
617     string, the match_cbegroup flag is set. When this is the case, add the current
618     subject pointer to the chain of such remembered pointers, to be checked when we
619     hit the closing ket, in order to break infinite loops that match no characters.
620 ph10 197 When match() is called in other circumstances, don't add to the chain. The
621     match_cbegroup flag must NOT be used with tail recursion, because the memory
622     block that is used is on the stack, so a new one may be required for each
623     match(). */
624 nigel 77
625 nigel 93 if ((flags & match_cbegroup) != 0)
626 nigel 77 {
627 ph10 197 newptrb.epb_saved_eptr = eptr;
628     newptrb.epb_prev = eptrb;
629     eptrb = &newptrb;
630 nigel 77 }
631    
632 nigel 93 /* Now start processing the opcodes. */
633 nigel 77
634     for (;;)
635     {
636 nigel 93 minimize = possessive = FALSE;
637 nigel 77 op = *ecode;
638    
639     /* For partial matching, remember if we ever hit the end of the subject after
640     matching at least one subject character. */
641    
642     if (md->partial &&
643     eptr >= md->end_subject &&
644 ph10 168 eptr > mstart)
645 nigel 77 md->hitend = TRUE;
646 ph10 208
647 nigel 93 switch(op)
648     {
649 ph10 210 case OP_FAIL:
650 ph10 212 RRETURN(MATCH_NOMATCH);
651 ph10 211
652 ph10 210 case OP_PRUNE:
653     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
654     ims, eptrb, flags, RM51);
655     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
656 ph10 212 RRETURN(MATCH_PRUNE);
657 ph10 211
658 ph10 210 case OP_COMMIT:
659     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
660     ims, eptrb, flags, RM52);
661     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
662 ph10 212 RRETURN(MATCH_COMMIT);
663 ph10 211
664 ph10 210 case OP_SKIP:
665     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
666     ims, eptrb, flags, RM53);
667     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
668 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
669 ph10 212 RRETURN(MATCH_SKIP);
670 ph10 211
671 ph10 210 case OP_THEN:
672     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
673 ph10 212 ims, eptrb, flags, RM54);
674 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
675 ph10 212 RRETURN(MATCH_THEN);
676 ph10 211
677 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
678     the current subject position in the working slot at the top of the vector.
679     We mustn't change the current values of the data slot, because they may be
680     set from a previous iteration of this group, and be referred to by a
681     reference inside the group.
682 nigel 77
683 nigel 93 If the bracket fails to match, we need to restore this value and also the
684     values of the final offsets, in case they were set by a previous iteration
685     of the same bracket.
686 nigel 77
687 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
688     a non-capturing bracket. Don't worry about setting the flag for the error
689     case here; that is handled in the code for KET. */
690 nigel 77
691 nigel 93 case OP_CBRA:
692     case OP_SCBRA:
693     number = GET2(ecode, 1+LINK_SIZE);
694 nigel 77 offset = number << 1;
695    
696     #ifdef DEBUG
697 nigel 93 printf("start bracket %d\n", number);
698     printf("subject=");
699 nigel 77 pchars(eptr, 16, TRUE, md);
700     printf("\n");
701     #endif
702    
703     if (offset < md->offset_max)
704     {
705     save_offset1 = md->offset_vector[offset];
706     save_offset2 = md->offset_vector[offset+1];
707     save_offset3 = md->offset_vector[md->offset_end - number];
708     save_capture_last = md->capture_last;
709    
710     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
711     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
712    
713 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
714 nigel 77 do
715     {
716 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
717     ims, eptrb, flags, RM1);
718 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
719 nigel 77 md->capture_last = save_capture_last;
720     ecode += GET(ecode, 1);
721     }
722     while (*ecode == OP_ALT);
723    
724     DPRINTF(("bracket %d failed\n", number));
725    
726     md->offset_vector[offset] = save_offset1;
727     md->offset_vector[offset+1] = save_offset2;
728     md->offset_vector[md->offset_end - number] = save_offset3;
729    
730     RRETURN(MATCH_NOMATCH);
731     }
732    
733 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
734     as a non-capturing bracket. */
735 nigel 77
736 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
737     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
738    
739 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
740 nigel 77
741 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
742     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
743    
744 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
745     final alternative within the brackets, we would return the result of a
746     recursive call to match() whatever happened. We can reduce stack usage by
747 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
748     is set.*/
749 nigel 77
750 nigel 93 case OP_BRA:
751     case OP_SBRA:
752     DPRINTF(("start non-capturing bracket\n"));
753     flags = (op >= OP_SBRA)? match_cbegroup : 0;
754 nigel 91 for (;;)
755 nigel 77 {
756 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
757 nigel 93 {
758 ph10 197 if (flags == 0) /* Not a possibly empty group */
759     {
760     ecode += _pcre_OP_lengths[*ecode];
761     DPRINTF(("bracket 0 tail recursion\n"));
762     goto TAIL_RECURSE;
763     }
764    
765     /* Possibly empty group; can't use tail recursion. */
766    
767     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
768     eptrb, flags, RM48);
769     RRETURN(rrc);
770 nigel 93 }
771 nigel 91
772     /* For non-final alternatives, continue the loop for a NOMATCH result;
773     otherwise return. */
774    
775 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
776     eptrb, flags, RM2);
777 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
778 nigel 77 ecode += GET(ecode, 1);
779     }
780 nigel 91 /* Control never reaches here. */
781 nigel 77
782     /* Conditional group: compilation checked that there are no more than
783     two branches. If the condition is false, skipping the first branch takes us
784     past the end if there is only one branch, but that's OK because that is
785 nigel 91 exactly what going to the ket would do. As there is only one branch to be
786     obeyed, we can use tail recursion to avoid using another stack frame. */
787 nigel 77
788     case OP_COND:
789 nigel 93 case OP_SCOND:
790     if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
791 nigel 77 {
792 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
793     condition = md->recursive != NULL &&
794     (offset == RREF_ANY || offset == md->recursive->group_num);
795     ecode += condition? 3 : GET(ecode, 1);
796     }
797    
798     else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
799     {
800 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
801 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
802     ecode += condition? 3 : GET(ecode, 1);
803 nigel 77 }
804    
805 nigel 93 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
806     {
807     condition = FALSE;
808     ecode += GET(ecode, 1);
809     }
810    
811 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
812 nigel 93 the flags argument to match_condassert causes it to stop at the end of an
813     assertion. */
814 nigel 77
815     else
816     {
817 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
818     match_condassert, RM3);
819 nigel 77 if (rrc == MATCH_MATCH)
820     {
821 nigel 93 condition = TRUE;
822     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
823 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
824     }
825 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
826 nigel 77 {
827     RRETURN(rrc); /* Need braces because of following else */
828     }
829 nigel 93 else
830     {
831     condition = FALSE;
832     ecode += GET(ecode, 1);
833     }
834     }
835 nigel 91
836 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
837 ph10 197 we can use tail recursion to avoid using another stack frame, except when
838     match_cbegroup is required for an unlimited repeat of a possibly empty
839     group. If the second alternative doesn't exist, we can just plough on. */
840 nigel 91
841 nigel 93 if (condition || *ecode == OP_ALT)
842     {
843 nigel 91 ecode += 1 + LINK_SIZE;
844 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
845     {
846     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
847     RRETURN(rrc);
848     }
849     else /* Group must match something */
850     {
851     flags = 0;
852     goto TAIL_RECURSE;
853     }
854 nigel 77 }
855 ph10 197 else /* Condition false & no 2nd alternative */
856 nigel 93 {
857     ecode += 1 + LINK_SIZE;
858     }
859     break;
860 nigel 77
861    
862 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
863     recursion, we should restore the offsets appropriately and continue from
864     after the call. */
865 nigel 77
866 ph10 210 case OP_ACCEPT:
867 nigel 77 case OP_END:
868     if (md->recursive != NULL && md->recursive->group_num == 0)
869     {
870     recursion_info *rec = md->recursive;
871 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
872 nigel 77 md->recursive = rec->prevrec;
873     memmove(md->offset_vector, rec->offset_save,
874     rec->saved_max * sizeof(int));
875 ph10 168 mstart = rec->save_start;
876 nigel 77 ims = original_ims;
877     ecode = rec->after_call;
878     break;
879     }
880    
881     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
882     string - backtracking will then try other alternatives, if any. */
883    
884 ph10 168 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
885     md->end_match_ptr = eptr; /* Record where we ended */
886     md->end_offset_top = offset_top; /* and how many extracts were taken */
887 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
888 nigel 77 RRETURN(MATCH_MATCH);
889    
890     /* Change option settings */
891    
892     case OP_OPT:
893     ims = ecode[1];
894     ecode += 2;
895     DPRINTF(("ims set to %02lx\n", ims));
896     break;
897    
898     /* Assertion brackets. Check the alternative branches in turn - the
899     matching won't pass the KET for an assertion. If any one branch matches,
900     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
901     start of each branch to move the current point backwards, so the code at
902     this level is identical to the lookahead case. */
903    
904     case OP_ASSERT:
905     case OP_ASSERTBACK:
906     do
907     {
908 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
909     RM4);
910 nigel 77 if (rrc == MATCH_MATCH) break;
911 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
912 nigel 77 ecode += GET(ecode, 1);
913     }
914     while (*ecode == OP_ALT);
915     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
916    
917     /* If checking an assertion for a condition, return MATCH_MATCH. */
918    
919     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
920    
921     /* Continue from after the assertion, updating the offsets high water
922     mark, since extracts may have been taken during the assertion. */
923    
924     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
925     ecode += 1 + LINK_SIZE;
926     offset_top = md->end_offset_top;
927     continue;
928    
929     /* Negative assertion: all branches must fail to match */
930    
931     case OP_ASSERT_NOT:
932     case OP_ASSERTBACK_NOT:
933     do
934     {
935 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
936     RM5);
937 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
938 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
939 nigel 77 ecode += GET(ecode,1);
940     }
941     while (*ecode == OP_ALT);
942    
943     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
944    
945     ecode += 1 + LINK_SIZE;
946     continue;
947    
948     /* Move the subject pointer back. This occurs only at the start of
949     each branch of a lookbehind assertion. If we are too close to the start to
950     move back, this match function fails. When working with UTF-8 we move
951     back a number of characters, not bytes. */
952    
953     case OP_REVERSE:
954     #ifdef SUPPORT_UTF8
955     if (utf8)
956     {
957 nigel 93 i = GET(ecode, 1);
958     while (i-- > 0)
959 nigel 77 {
960     eptr--;
961     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
962 ph10 207 BACKCHAR(eptr);
963 nigel 77 }
964     }
965     else
966     #endif
967    
968     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
969    
970     {
971 nigel 93 eptr -= GET(ecode, 1);
972 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
973     }
974    
975     /* Skip to next op code */
976    
977     ecode += 1 + LINK_SIZE;
978     break;
979    
980     /* The callout item calls an external function, if one is provided, passing
981     details of the match so far. This is mainly for debugging, though the
982     function is able to force a failure. */
983    
984     case OP_CALLOUT:
985     if (pcre_callout != NULL)
986     {
987     pcre_callout_block cb;
988     cb.version = 1; /* Version 1 of the callout block */
989     cb.callout_number = ecode[1];
990     cb.offset_vector = md->offset_vector;
991 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
992 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
993 ph10 168 cb.start_match = mstart - md->start_subject;
994 nigel 77 cb.current_position = eptr - md->start_subject;
995     cb.pattern_position = GET(ecode, 2);
996     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
997     cb.capture_top = offset_top/2;
998     cb.capture_last = md->capture_last;
999     cb.callout_data = md->callout_data;
1000     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1001     if (rrc < 0) RRETURN(rrc);
1002     }
1003     ecode += 2 + 2*LINK_SIZE;
1004     break;
1005    
1006     /* Recursion either matches the current regex, or some subexpression. The
1007     offset data is the offset to the starting bracket from the start of the
1008     whole pattern. (This is so that it works from duplicated subpatterns.)
1009    
1010     If there are any capturing brackets started but not finished, we have to
1011     save their starting points and reinstate them after the recursion. However,
1012     we don't know how many such there are (offset_top records the completed
1013     total) so we just have to save all the potential data. There may be up to
1014     65535 such values, which is too large to put on the stack, but using malloc
1015     for small numbers seems expensive. As a compromise, the stack is used when
1016     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1017     is used. If the malloc fails, the recursion is abandoned and
1018     PCRE_ERROR_NOMEMORY is returned from this level, so the failure is reported
1019     as an error rather than being silently ignored.
1020    
1021     There are also other values that have to be saved. We use a chained
1022     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1023     for the original version of this logic. */
1024    
1025     case OP_RECURSE:
1026     {
1027     callpat = md->start_code + GET(ecode, 1);
1028 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1029     GET2(callpat, 1 + LINK_SIZE);
1030 nigel 77
1031     /* Add to "recursing stack" */
1032    
1033     new_recursive.prevrec = md->recursive;
1034     md->recursive = &new_recursive;
1035    
1036     /* Find where to continue from afterwards */
1037    
1038     ecode += 1 + LINK_SIZE;
1039     new_recursive.after_call = ecode;
1040    
1041     /* Now save the offset data. */
1042    
1043     new_recursive.saved_max = md->offset_end;
1044     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1045     new_recursive.offset_save = stacksave;
1046     else
1047     {
1048     new_recursive.offset_save =
1049     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1050     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1051     }
1052    
1053     memcpy(new_recursive.offset_save, md->offset_vector,
1054     new_recursive.saved_max * sizeof(int));
1055 ph10 168 new_recursive.save_start = mstart;
1056     mstart = eptr;
1057 nigel 77
1058     /* OK, now we can do the recursion. For each top-level alternative we
1059     restore the offset and recursion data. */
1060    
1061     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1062 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1063 nigel 77 do
1064     {
1065 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1066     md, ims, eptrb, flags, RM6);
1067 nigel 77 if (rrc == MATCH_MATCH)
1068     {
1069 nigel 87 DPRINTF(("Recursion matched\n"));
1070 nigel 77 md->recursive = new_recursive.prevrec;
1071     if (new_recursive.offset_save != stacksave)
1072     (pcre_free)(new_recursive.offset_save);
1073     RRETURN(MATCH_MATCH);
1074     }
1075 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1076 nigel 87 {
1077     DPRINTF(("Recursion gave error %d\n", rrc));
1078     RRETURN(rrc);
1079     }
1080 nigel 77
1081     md->recursive = &new_recursive;
1082     memcpy(md->offset_vector, new_recursive.offset_save,
1083     new_recursive.saved_max * sizeof(int));
1084     callpat += GET(callpat, 1);
1085     }
1086     while (*callpat == OP_ALT);
1087    
1088     DPRINTF(("Recursion didn't match\n"));
1089     md->recursive = new_recursive.prevrec;
1090     if (new_recursive.offset_save != stacksave)
1091     (pcre_free)(new_recursive.offset_save);
1092     RRETURN(MATCH_NOMATCH);
1093     }
1094     /* Control never reaches here */
1095    
1096     /* "Once" brackets are like assertion brackets except that after a match,
1097     the point in the subject string is not moved back. Thus there can never be
1098     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1099     Check the alternative branches in turn - the matching won't pass the KET
1100     for this kind of subpattern. If any one branch matches, we carry on as at
1101     the end of a normal bracket, leaving the subject pointer. */
1102    
1103     case OP_ONCE:
1104 nigel 91 prev = ecode;
1105     saved_eptr = eptr;
1106    
1107     do
1108 nigel 77 {
1109 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1110 nigel 91 if (rrc == MATCH_MATCH) break;
1111 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1112 nigel 91 ecode += GET(ecode,1);
1113     }
1114     while (*ecode == OP_ALT);
1115 nigel 77
1116 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1117 nigel 77
1118 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1119 nigel 77
1120 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1121     mark, since extracts may have been taken. */
1122 nigel 77
1123 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1124 nigel 77
1125 nigel 91 offset_top = md->end_offset_top;
1126     eptr = md->end_match_ptr;
1127 nigel 77
1128 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1129     happens for a repeating ket if no characters were matched in the group.
1130     This is the forcible breaking of infinite loops as implemented in Perl
1131     5.005. If there is an options reset, it will get obeyed in the normal
1132     course of events. */
1133 nigel 77
1134 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1135     {
1136     ecode += 1+LINK_SIZE;
1137     break;
1138     }
1139 nigel 77
1140 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1141     preceding bracket, in the appropriate order. The second "call" of match()
1142     uses tail recursion, to avoid using another stack frame. We need to reset
1143     any options that changed within the bracket before re-running it, so
1144     check the next opcode. */
1145 nigel 77
1146 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1147     {
1148     ims = (ims & ~PCRE_IMS) | ecode[4];
1149     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1150     }
1151 nigel 77
1152 nigel 91 if (*ecode == OP_KETRMIN)
1153     {
1154 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1155 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1156     ecode = prev;
1157 ph10 197 flags = 0;
1158 nigel 91 goto TAIL_RECURSE;
1159 nigel 77 }
1160 nigel 91 else /* OP_KETRMAX */
1161     {
1162 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1163 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1164     ecode += 1 + LINK_SIZE;
1165 ph10 197 flags = 0;
1166 nigel 91 goto TAIL_RECURSE;
1167     }
1168     /* Control never gets here */
1169 nigel 77
1170     /* An alternation is the end of a branch; scan along to find the end of the
1171     bracketed group and go to there. */
1172    
1173     case OP_ALT:
1174     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1175     break;
1176    
1177 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1178     indicating that it may occur zero times. It may repeat infinitely, or not
1179     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1180     with fixed upper repeat limits are compiled as a number of copies, with the
1181     optional ones preceded by BRAZERO or BRAMINZERO. */
1182 nigel 77
1183     case OP_BRAZERO:
1184     {
1185     next = ecode+1;
1186 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1187 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1188     do next += GET(next,1); while (*next == OP_ALT);
1189 nigel 93 ecode = next + 1 + LINK_SIZE;
1190 nigel 77 }
1191     break;
1192    
1193     case OP_BRAMINZERO:
1194     {
1195     next = ecode+1;
1196 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1197 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1198 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1199     ecode++;
1200     }
1201     break;
1202    
1203 ph10 335 case OP_SKIPZERO:
1204     {
1205     next = ecode+1;
1206     do next += GET(next,1); while (*next == OP_ALT);
1207     ecode = next + 1 + LINK_SIZE;
1208     }
1209     break;
1210    
1211 nigel 93 /* End of a group, repeated or non-repeating. */
1212 nigel 77
1213     case OP_KET:
1214     case OP_KETRMIN:
1215     case OP_KETRMAX:
1216 nigel 91 prev = ecode - GET(ecode, 1);
1217 nigel 77
1218 nigel 93 /* If this was a group that remembered the subject start, in order to break
1219     infinite repeats of empty string matches, retrieve the subject start from
1220     the chain. Otherwise, set it NULL. */
1221 nigel 77
1222 nigel 93 if (*prev >= OP_SBRA)
1223     {
1224     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1225     eptrb = eptrb->epb_prev; /* Backup to previous group */
1226     }
1227     else saved_eptr = NULL;
1228 nigel 77
1229 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1230     MATCH_MATCH, but record the current high water mark for use by positive
1231     assertions. Do this also for the "once" (atomic) groups. */
1232    
1233 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1234     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1235     *prev == OP_ONCE)
1236     {
1237     md->end_match_ptr = eptr; /* For ONCE */
1238     md->end_offset_top = offset_top;
1239     RRETURN(MATCH_MATCH);
1240     }
1241 nigel 77
1242 nigel 93 /* For capturing groups we have to check the group number back at the start
1243     and if necessary complete handling an extraction by setting the offsets and
1244     bumping the high water mark. Note that whole-pattern recursion is coded as
1245     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1246     when the OP_END is reached. Other recursion is handled here. */
1247 nigel 77
1248 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1249 nigel 91 {
1250 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1251 nigel 91 offset = number << 1;
1252 nigel 77
1253     #ifdef DEBUG
1254 nigel 91 printf("end bracket %d", number);
1255     printf("\n");
1256 nigel 77 #endif
1257    
1258 nigel 93 md->capture_last = number;
1259     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1260 nigel 91 {
1261 nigel 93 md->offset_vector[offset] =
1262     md->offset_vector[md->offset_end - number];
1263     md->offset_vector[offset+1] = eptr - md->start_subject;
1264     if (offset_top <= offset) offset_top = offset + 2;
1265     }
1266 nigel 77
1267 nigel 93 /* Handle a recursively called group. Restore the offsets
1268     appropriately and continue from after the call. */
1269 nigel 77
1270 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1271     {
1272     recursion_info *rec = md->recursive;
1273     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1274     md->recursive = rec->prevrec;
1275 ph10 168 mstart = rec->save_start;
1276 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1277     rec->saved_max * sizeof(int));
1278     ecode = rec->after_call;
1279     ims = original_ims;
1280     break;
1281 nigel 77 }
1282 nigel 91 }
1283 nigel 77
1284 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1285     flags, in case they got changed during the group. */
1286 nigel 77
1287 nigel 91 ims = original_ims;
1288     DPRINTF(("ims reset to %02lx\n", ims));
1289 nigel 77
1290 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1291     happens for a repeating ket if no characters were matched in the group.
1292     This is the forcible breaking of infinite loops as implemented in Perl
1293     5.005. If there is an options reset, it will get obeyed in the normal
1294     course of events. */
1295 nigel 77
1296 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1297     {
1298     ecode += 1 + LINK_SIZE;
1299     break;
1300     }
1301 nigel 77
1302 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1303     preceding bracket, in the appropriate order. In the second case, we can use
1304 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1305     unlimited repeat of a group that can match an empty string. */
1306 nigel 77
1307 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1308    
1309 nigel 91 if (*ecode == OP_KETRMIN)
1310     {
1311 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1312 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1313 ph10 197 if (flags != 0) /* Could match an empty string */
1314     {
1315     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1316     RRETURN(rrc);
1317     }
1318 nigel 91 ecode = prev;
1319     goto TAIL_RECURSE;
1320 nigel 77 }
1321 nigel 91 else /* OP_KETRMAX */
1322     {
1323 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1324 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1325     ecode += 1 + LINK_SIZE;
1326 ph10 197 flags = 0;
1327 nigel 91 goto TAIL_RECURSE;
1328     }
1329     /* Control never gets here */
1330 nigel 77
1331     /* Start of subject unless notbol, or after internal newline if multiline */
1332    
1333     case OP_CIRC:
1334     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1335     if ((ims & PCRE_MULTILINE) != 0)
1336     {
1337 nigel 91 if (eptr != md->start_subject &&
1338 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1339 nigel 77 RRETURN(MATCH_NOMATCH);
1340     ecode++;
1341     break;
1342     }
1343     /* ... else fall through */
1344    
1345     /* Start of subject assertion */
1346    
1347     case OP_SOD:
1348     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1349     ecode++;
1350     break;
1351    
1352     /* Start of match assertion */
1353    
1354     case OP_SOM:
1355     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1356     ecode++;
1357     break;
1358 ph10 172
1359 ph10 168 /* Reset the start of match point */
1360 ph10 172
1361 ph10 168 case OP_SET_SOM:
1362     mstart = eptr;
1363 ph10 172 ecode++;
1364     break;
1365 nigel 77
1366     /* Assert before internal newline if multiline, or before a terminating
1367     newline unless endonly is set, else end of subject unless noteol is set. */
1368    
1369     case OP_DOLL:
1370     if ((ims & PCRE_MULTILINE) != 0)
1371     {
1372     if (eptr < md->end_subject)
1373 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1374 nigel 77 else
1375     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1376     ecode++;
1377     break;
1378     }
1379     else
1380     {
1381     if (md->noteol) RRETURN(MATCH_NOMATCH);
1382     if (!md->endonly)
1383     {
1384 nigel 91 if (eptr != md->end_subject &&
1385 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1386 nigel 77 RRETURN(MATCH_NOMATCH);
1387     ecode++;
1388     break;
1389     }
1390     }
1391 nigel 91 /* ... else fall through for endonly */
1392 nigel 77
1393     /* End of subject assertion (\z) */
1394    
1395     case OP_EOD:
1396     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1397     ecode++;
1398     break;
1399    
1400     /* End of subject or ending \n assertion (\Z) */
1401    
1402     case OP_EODN:
1403 nigel 91 if (eptr != md->end_subject &&
1404 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1405 nigel 91 RRETURN(MATCH_NOMATCH);
1406 nigel 77 ecode++;
1407     break;
1408    
1409     /* Word boundary assertions */
1410    
1411     case OP_NOT_WORD_BOUNDARY:
1412     case OP_WORD_BOUNDARY:
1413     {
1414    
1415     /* Find out if the previous and current characters are "word" characters.
1416     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1417     be "non-word" characters. */
1418    
1419     #ifdef SUPPORT_UTF8
1420     if (utf8)
1421     {
1422     if (eptr == md->start_subject) prev_is_word = FALSE; else
1423     {
1424     const uschar *lastptr = eptr - 1;
1425     while((*lastptr & 0xc0) == 0x80) lastptr--;
1426     GETCHAR(c, lastptr);
1427     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1428     }
1429     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1430     {
1431     GETCHAR(c, eptr);
1432     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1433     }
1434     }
1435     else
1436     #endif
1437    
1438     /* More streamlined when not in UTF-8 mode */
1439    
1440     {
1441     prev_is_word = (eptr != md->start_subject) &&
1442     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1443     cur_is_word = (eptr < md->end_subject) &&
1444     ((md->ctypes[*eptr] & ctype_word) != 0);
1445     }
1446    
1447     /* Now see if the situation is what we want */
1448    
1449     if ((*ecode++ == OP_WORD_BOUNDARY)?
1450     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1451     RRETURN(MATCH_NOMATCH);
1452     }
1453     break;
1454    
1455     /* Match a single character type; inline for speed */
1456    
1457     case OP_ANY:
1458 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1459 ph10 345 /* Fall through */
1460    
1461 ph10 341 case OP_ALLANY:
1462 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1463 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1464 nigel 77 ecode++;
1465     break;
1466    
1467     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1468     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1469    
1470     case OP_ANYBYTE:
1471     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1472     ecode++;
1473     break;
1474    
1475     case OP_NOT_DIGIT:
1476     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1477     GETCHARINCTEST(c, eptr);
1478     if (
1479     #ifdef SUPPORT_UTF8
1480     c < 256 &&
1481     #endif
1482     (md->ctypes[c] & ctype_digit) != 0
1483     )
1484     RRETURN(MATCH_NOMATCH);
1485     ecode++;
1486     break;
1487    
1488     case OP_DIGIT:
1489     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1490     GETCHARINCTEST(c, eptr);
1491     if (
1492     #ifdef SUPPORT_UTF8
1493     c >= 256 ||
1494     #endif
1495     (md->ctypes[c] & ctype_digit) == 0
1496     )
1497     RRETURN(MATCH_NOMATCH);
1498     ecode++;
1499     break;
1500    
1501     case OP_NOT_WHITESPACE:
1502     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1503     GETCHARINCTEST(c, eptr);
1504     if (
1505     #ifdef SUPPORT_UTF8
1506     c < 256 &&
1507     #endif
1508     (md->ctypes[c] & ctype_space) != 0
1509     )
1510     RRETURN(MATCH_NOMATCH);
1511     ecode++;
1512     break;
1513    
1514     case OP_WHITESPACE:
1515     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1516     GETCHARINCTEST(c, eptr);
1517     if (
1518     #ifdef SUPPORT_UTF8
1519     c >= 256 ||
1520     #endif
1521     (md->ctypes[c] & ctype_space) == 0
1522     )
1523     RRETURN(MATCH_NOMATCH);
1524     ecode++;
1525     break;
1526    
1527     case OP_NOT_WORDCHAR:
1528     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1529     GETCHARINCTEST(c, eptr);
1530     if (
1531     #ifdef SUPPORT_UTF8
1532     c < 256 &&
1533     #endif
1534     (md->ctypes[c] & ctype_word) != 0
1535     )
1536     RRETURN(MATCH_NOMATCH);
1537     ecode++;
1538     break;
1539    
1540     case OP_WORDCHAR:
1541     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1542     GETCHARINCTEST(c, eptr);
1543     if (
1544     #ifdef SUPPORT_UTF8
1545     c >= 256 ||
1546     #endif
1547     (md->ctypes[c] & ctype_word) == 0
1548     )
1549     RRETURN(MATCH_NOMATCH);
1550     ecode++;
1551     break;
1552    
1553 nigel 93 case OP_ANYNL:
1554     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1555     GETCHARINCTEST(c, eptr);
1556     switch(c)
1557     {
1558     default: RRETURN(MATCH_NOMATCH);
1559     case 0x000d:
1560     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1561     break;
1562 ph10 231
1563 nigel 93 case 0x000a:
1564 ph10 231 break;
1565    
1566 nigel 93 case 0x000b:
1567     case 0x000c:
1568     case 0x0085:
1569     case 0x2028:
1570     case 0x2029:
1571 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1572 nigel 93 break;
1573     }
1574     ecode++;
1575     break;
1576    
1577 ph10 178 case OP_NOT_HSPACE:
1578     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1579     GETCHARINCTEST(c, eptr);
1580     switch(c)
1581     {
1582     default: break;
1583     case 0x09: /* HT */
1584     case 0x20: /* SPACE */
1585     case 0xa0: /* NBSP */
1586     case 0x1680: /* OGHAM SPACE MARK */
1587     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1588     case 0x2000: /* EN QUAD */
1589     case 0x2001: /* EM QUAD */
1590     case 0x2002: /* EN SPACE */
1591     case 0x2003: /* EM SPACE */
1592     case 0x2004: /* THREE-PER-EM SPACE */
1593     case 0x2005: /* FOUR-PER-EM SPACE */
1594     case 0x2006: /* SIX-PER-EM SPACE */
1595     case 0x2007: /* FIGURE SPACE */
1596     case 0x2008: /* PUNCTUATION SPACE */
1597     case 0x2009: /* THIN SPACE */
1598     case 0x200A: /* HAIR SPACE */
1599     case 0x202f: /* NARROW NO-BREAK SPACE */
1600     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1601     case 0x3000: /* IDEOGRAPHIC SPACE */
1602     RRETURN(MATCH_NOMATCH);
1603     }
1604     ecode++;
1605     break;
1606    
1607     case OP_HSPACE:
1608     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1609     GETCHARINCTEST(c, eptr);
1610     switch(c)
1611     {
1612     default: RRETURN(MATCH_NOMATCH);
1613     case 0x09: /* HT */
1614     case 0x20: /* SPACE */
1615     case 0xa0: /* NBSP */
1616     case 0x1680: /* OGHAM SPACE MARK */
1617     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1618     case 0x2000: /* EN QUAD */
1619     case 0x2001: /* EM QUAD */
1620     case 0x2002: /* EN SPACE */
1621     case 0x2003: /* EM SPACE */
1622     case 0x2004: /* THREE-PER-EM SPACE */
1623     case 0x2005: /* FOUR-PER-EM SPACE */
1624     case 0x2006: /* SIX-PER-EM SPACE */
1625     case 0x2007: /* FIGURE SPACE */
1626     case 0x2008: /* PUNCTUATION SPACE */
1627     case 0x2009: /* THIN SPACE */
1628     case 0x200A: /* HAIR SPACE */
1629     case 0x202f: /* NARROW NO-BREAK SPACE */
1630     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1631     case 0x3000: /* IDEOGRAPHIC SPACE */
1632     break;
1633     }
1634     ecode++;
1635     break;
1636    
1637     case OP_NOT_VSPACE:
1638     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1639     GETCHARINCTEST(c, eptr);
1640     switch(c)
1641     {
1642     default: break;
1643     case 0x0a: /* LF */
1644     case 0x0b: /* VT */
1645     case 0x0c: /* FF */
1646     case 0x0d: /* CR */
1647     case 0x85: /* NEL */
1648     case 0x2028: /* LINE SEPARATOR */
1649     case 0x2029: /* PARAGRAPH SEPARATOR */
1650     RRETURN(MATCH_NOMATCH);
1651     }
1652     ecode++;
1653     break;
1654    
1655     case OP_VSPACE:
1656     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1657     GETCHARINCTEST(c, eptr);
1658     switch(c)
1659     {
1660     default: RRETURN(MATCH_NOMATCH);
1661     case 0x0a: /* LF */
1662     case 0x0b: /* VT */
1663     case 0x0c: /* FF */
1664     case 0x0d: /* CR */
1665     case 0x85: /* NEL */
1666     case 0x2028: /* LINE SEPARATOR */
1667     case 0x2029: /* PARAGRAPH SEPARATOR */
1668     break;
1669     }
1670     ecode++;
1671     break;
1672    
1673 nigel 77 #ifdef SUPPORT_UCP
1674     /* Check the next character by Unicode property. We will get here only
1675     if the support is in the binary; otherwise a compile-time error occurs. */
1676    
1677     case OP_PROP:
1678     case OP_NOTPROP:
1679     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1680     GETCHARINCTEST(c, eptr);
1681     {
1682 ph10 349 const ucd_record * prop = GET_UCD(c);
1683 nigel 77
1684 nigel 87 switch(ecode[1])
1685     {
1686     case PT_ANY:
1687     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1688     break;
1689 nigel 77
1690 nigel 87 case PT_LAMP:
1691 ph10 349 if ((prop->chartype == ucp_Lu ||
1692     prop->chartype == ucp_Ll ||
1693     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1694 nigel 77 RRETURN(MATCH_NOMATCH);
1695 nigel 87 break;
1696    
1697     case PT_GC:
1698 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1699 nigel 77 RRETURN(MATCH_NOMATCH);
1700 nigel 87 break;
1701    
1702     case PT_PC:
1703 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1704 nigel 87 RRETURN(MATCH_NOMATCH);
1705     break;
1706    
1707     case PT_SC:
1708 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
1709 nigel 87 RRETURN(MATCH_NOMATCH);
1710     break;
1711    
1712     default:
1713     RRETURN(PCRE_ERROR_INTERNAL);
1714 nigel 77 }
1715 nigel 87
1716     ecode += 3;
1717 nigel 77 }
1718     break;
1719    
1720     /* Match an extended Unicode sequence. We will get here only if the support
1721     is in the binary; otherwise a compile-time error occurs. */
1722    
1723     case OP_EXTUNI:
1724     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1725     GETCHARINCTEST(c, eptr);
1726     {
1727 ph10 349 int category = UCD_CATEGORY(c);
1728 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1729     while (eptr < md->end_subject)
1730     {
1731     int len = 1;
1732     if (!utf8) c = *eptr; else
1733     {
1734     GETCHARLEN(c, eptr, len);
1735     }
1736 ph10 349 category = UCD_CATEGORY(c);
1737 nigel 77 if (category != ucp_M) break;
1738     eptr += len;
1739     }
1740     }
1741     ecode++;
1742     break;
1743     #endif
1744    
1745    
1746     /* Match a back reference, possibly repeatedly. Look past the end of the
1747     item to see if there is repeat information following. The code is similar
1748     to that for character classes, but repeated for efficiency. Then obey
1749     similar code to character type repeats - written out again for speed.
1750     However, if the referenced string is the empty string, always treat
1751     it as matched, any number of times (otherwise there could be infinite
1752     loops). */
1753    
1754     case OP_REF:
1755     {
1756     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1757 ph10 345 ecode += 3;
1758    
1759 ph10 336 /* If the reference is unset, there are two possibilities:
1760 ph10 345
1761 ph10 336 (a) In the default, Perl-compatible state, set the length to be longer
1762     than the amount of subject left; this ensures that every attempt at a
1763     match fails. We can't just fail here, because of the possibility of
1764     quantifiers with zero minima.
1765 ph10 345
1766     (b) If the JavaScript compatibility flag is set, set the length to zero
1767     so that the back reference matches an empty string.
1768    
1769     Otherwise, set the length to the length of what was matched by the
1770 ph10 336 referenced subpattern. */
1771 ph10 345
1772 ph10 336 if (offset >= offset_top || md->offset_vector[offset] < 0)
1773 ph10 345 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1774 ph10 336 else
1775     length = md->offset_vector[offset+1] - md->offset_vector[offset];
1776 nigel 77
1777     /* Set up for repetition, or handle the non-repeated case */
1778    
1779     switch (*ecode)
1780     {
1781     case OP_CRSTAR:
1782     case OP_CRMINSTAR:
1783     case OP_CRPLUS:
1784     case OP_CRMINPLUS:
1785     case OP_CRQUERY:
1786     case OP_CRMINQUERY:
1787     c = *ecode++ - OP_CRSTAR;
1788     minimize = (c & 1) != 0;
1789     min = rep_min[c]; /* Pick up values from tables; */
1790     max = rep_max[c]; /* zero for max => infinity */
1791     if (max == 0) max = INT_MAX;
1792     break;
1793    
1794     case OP_CRRANGE:
1795     case OP_CRMINRANGE:
1796     minimize = (*ecode == OP_CRMINRANGE);
1797     min = GET2(ecode, 1);
1798     max = GET2(ecode, 3);
1799     if (max == 0) max = INT_MAX;
1800     ecode += 5;
1801     break;
1802    
1803     default: /* No repeat follows */
1804     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1805     eptr += length;
1806     continue; /* With the main loop */
1807     }
1808    
1809     /* If the length of the reference is zero, just continue with the
1810     main loop. */
1811    
1812     if (length == 0) continue;
1813    
1814     /* First, ensure the minimum number of matches are present. We get back
1815     the length of the reference string explicitly rather than passing the
1816     address of eptr, so that eptr can be a register variable. */
1817    
1818     for (i = 1; i <= min; i++)
1819     {
1820     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1821     eptr += length;
1822     }
1823    
1824     /* If min = max, continue at the same level without recursion.
1825     They are not both allowed to be zero. */
1826    
1827     if (min == max) continue;
1828    
1829     /* If minimizing, keep trying and advancing the pointer */
1830    
1831     if (minimize)
1832     {
1833     for (fi = min;; fi++)
1834     {
1835 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1836 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1837     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1838     RRETURN(MATCH_NOMATCH);
1839     eptr += length;
1840     }
1841     /* Control never gets here */
1842     }
1843    
1844     /* If maximizing, find the longest string and work backwards */
1845    
1846     else
1847     {
1848     pp = eptr;
1849     for (i = min; i < max; i++)
1850     {
1851     if (!match_ref(offset, eptr, length, md, ims)) break;
1852     eptr += length;
1853     }
1854     while (eptr >= pp)
1855     {
1856 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1857 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1858     eptr -= length;
1859     }
1860     RRETURN(MATCH_NOMATCH);
1861     }
1862     }
1863     /* Control never gets here */
1864    
1865    
1866    
1867     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1868     used when all the characters in the class have values in the range 0-255,
1869     and either the matching is caseful, or the characters are in the range
1870     0-127 when UTF-8 processing is enabled. The only difference between
1871     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1872     encountered.
1873    
1874     First, look past the end of the item to see if there is repeat information
1875     following. Then obey similar code to character type repeats - written out
1876     again for speed. */
1877    
1878     case OP_NCLASS:
1879     case OP_CLASS:
1880     {
1881     data = ecode + 1; /* Save for matching */
1882     ecode += 33; /* Advance past the item */
1883    
1884     switch (*ecode)
1885     {
1886     case OP_CRSTAR:
1887     case OP_CRMINSTAR:
1888     case OP_CRPLUS:
1889     case OP_CRMINPLUS:
1890     case OP_CRQUERY:
1891     case OP_CRMINQUERY:
1892     c = *ecode++ - OP_CRSTAR;
1893     minimize = (c & 1) != 0;
1894     min = rep_min[c]; /* Pick up values from tables; */
1895     max = rep_max[c]; /* zero for max => infinity */
1896     if (max == 0) max = INT_MAX;
1897     break;
1898    
1899     case OP_CRRANGE:
1900     case OP_CRMINRANGE:
1901     minimize = (*ecode == OP_CRMINRANGE);
1902     min = GET2(ecode, 1);
1903     max = GET2(ecode, 3);
1904     if (max == 0) max = INT_MAX;
1905     ecode += 5;
1906     break;
1907    
1908     default: /* No repeat follows */
1909     min = max = 1;
1910     break;
1911     }
1912    
1913     /* First, ensure the minimum number of matches are present. */
1914    
1915     #ifdef SUPPORT_UTF8
1916     /* UTF-8 mode */
1917     if (utf8)
1918     {
1919     for (i = 1; i <= min; i++)
1920     {
1921     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1922     GETCHARINC(c, eptr);
1923     if (c > 255)
1924     {
1925     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1926     }
1927     else
1928     {
1929     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1930     }
1931     }
1932     }
1933     else
1934     #endif
1935     /* Not UTF-8 mode */
1936     {
1937     for (i = 1; i <= min; i++)
1938     {
1939     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1940     c = *eptr++;
1941     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1942     }
1943     }
1944    
1945     /* If max == min we can continue with the main loop without the
1946     need to recurse. */
1947    
1948     if (min == max) continue;
1949    
1950     /* If minimizing, keep testing the rest of the expression and advancing
1951     the pointer while it matches the class. */
1952    
1953     if (minimize)
1954     {
1955     #ifdef SUPPORT_UTF8
1956     /* UTF-8 mode */
1957     if (utf8)
1958     {
1959     for (fi = min;; fi++)
1960     {
1961 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1962 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1963     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1964     GETCHARINC(c, eptr);
1965     if (c > 255)
1966     {
1967     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1968     }
1969     else
1970     {
1971     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1972     }
1973     }
1974     }
1975     else
1976     #endif
1977     /* Not UTF-8 mode */
1978     {
1979     for (fi = min;; fi++)
1980     {
1981 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1982 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1983     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1984     c = *eptr++;
1985     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1986     }
1987     }
1988     /* Control never gets here */
1989     }
1990    
1991     /* If maximizing, find the longest possible run, then work backwards. */
1992    
1993     else
1994     {
1995     pp = eptr;
1996    
1997     #ifdef SUPPORT_UTF8
1998     /* UTF-8 mode */
1999     if (utf8)
2000     {
2001     for (i = min; i < max; i++)
2002     {
2003     int len = 1;
2004     if (eptr >= md->end_subject) break;
2005     GETCHARLEN(c, eptr, len);
2006     if (c > 255)
2007     {
2008     if (op == OP_CLASS) break;
2009     }
2010     else
2011     {
2012     if ((data[c/8] & (1 << (c&7))) == 0) break;
2013     }
2014     eptr += len;
2015     }
2016     for (;;)
2017     {
2018 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2019 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2020     if (eptr-- == pp) break; /* Stop if tried at original pos */
2021     BACKCHAR(eptr);
2022     }
2023     }
2024     else
2025     #endif
2026     /* Not UTF-8 mode */
2027     {
2028     for (i = min; i < max; i++)
2029     {
2030     if (eptr >= md->end_subject) break;
2031     c = *eptr;
2032     if ((data[c/8] & (1 << (c&7))) == 0) break;
2033     eptr++;
2034     }
2035     while (eptr >= pp)
2036     {
2037 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2038 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2039 nigel 77 eptr--;
2040     }
2041     }
2042    
2043     RRETURN(MATCH_NOMATCH);
2044     }
2045     }
2046     /* Control never gets here */
2047    
2048    
2049     /* Match an extended character class. This opcode is encountered only
2050     in UTF-8 mode, because that's the only time it is compiled. */
2051    
2052     #ifdef SUPPORT_UTF8
2053     case OP_XCLASS:
2054     {
2055     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2056     ecode += GET(ecode, 1); /* Advance past the item */
2057    
2058     switch (*ecode)
2059     {
2060     case OP_CRSTAR:
2061     case OP_CRMINSTAR:
2062     case OP_CRPLUS:
2063     case OP_CRMINPLUS:
2064     case OP_CRQUERY:
2065     case OP_CRMINQUERY:
2066     c = *ecode++ - OP_CRSTAR;
2067     minimize = (c & 1) != 0;
2068     min = rep_min[c]; /* Pick up values from tables; */
2069     max = rep_max[c]; /* zero for max => infinity */
2070     if (max == 0) max = INT_MAX;
2071     break;
2072    
2073     case OP_CRRANGE:
2074     case OP_CRMINRANGE:
2075     minimize = (*ecode == OP_CRMINRANGE);
2076     min = GET2(ecode, 1);
2077     max = GET2(ecode, 3);
2078     if (max == 0) max = INT_MAX;
2079     ecode += 5;
2080     break;
2081    
2082     default: /* No repeat follows */
2083     min = max = 1;
2084     break;
2085     }
2086    
2087     /* First, ensure the minimum number of matches are present. */
2088    
2089     for (i = 1; i <= min; i++)
2090     {
2091     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2092     GETCHARINC(c, eptr);
2093     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2094     }
2095    
2096     /* If max == min we can continue with the main loop without the
2097     need to recurse. */
2098    
2099     if (min == max) continue;
2100    
2101     /* If minimizing, keep testing the rest of the expression and advancing
2102     the pointer while it matches the class. */
2103    
2104     if (minimize)
2105     {
2106     for (fi = min;; fi++)
2107     {
2108 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2109 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2110     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2111     GETCHARINC(c, eptr);
2112     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2113     }
2114     /* Control never gets here */
2115     }
2116    
2117     /* If maximizing, find the longest possible run, then work backwards. */
2118    
2119     else
2120     {
2121     pp = eptr;
2122     for (i = min; i < max; i++)
2123     {
2124     int len = 1;
2125     if (eptr >= md->end_subject) break;
2126     GETCHARLEN(c, eptr, len);
2127     if (!_pcre_xclass(c, data)) break;
2128     eptr += len;
2129     }
2130     for(;;)
2131     {
2132 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2133 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2134     if (eptr-- == pp) break; /* Stop if tried at original pos */
2135 ph10 214 if (utf8) BACKCHAR(eptr);
2136 nigel 77 }
2137     RRETURN(MATCH_NOMATCH);
2138     }
2139    
2140     /* Control never gets here */
2141     }
2142     #endif /* End of XCLASS */
2143    
2144     /* Match a single character, casefully */
2145    
2146     case OP_CHAR:
2147     #ifdef SUPPORT_UTF8
2148     if (utf8)
2149     {
2150     length = 1;
2151     ecode++;
2152     GETCHARLEN(fc, ecode, length);
2153     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2154     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2155     }
2156     else
2157     #endif
2158    
2159     /* Non-UTF-8 mode */
2160     {
2161     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2162     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2163     ecode += 2;
2164     }
2165     break;
2166    
2167     /* Match a single character, caselessly */
2168    
2169     case OP_CHARNC:
2170     #ifdef SUPPORT_UTF8
2171     if (utf8)
2172     {
2173     length = 1;
2174     ecode++;
2175     GETCHARLEN(fc, ecode, length);
2176    
2177     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2178    
2179     /* If the pattern character's value is < 128, we have only one byte, and
2180     can use the fast lookup table. */
2181    
2182     if (fc < 128)
2183     {
2184     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2185     }
2186    
2187     /* Otherwise we must pick up the subject character */
2188    
2189     else
2190     {
2191 nigel 93 unsigned int dc;
2192 nigel 77 GETCHARINC(dc, eptr);
2193     ecode += length;
2194    
2195     /* If we have Unicode property support, we can use it to test the other
2196 nigel 87 case of the character, if there is one. */
2197 nigel 77
2198     if (fc != dc)
2199     {
2200     #ifdef SUPPORT_UCP
2201 ph10 349 if (dc != UCD_OTHERCASE(fc))
2202 nigel 77 #endif
2203     RRETURN(MATCH_NOMATCH);
2204     }
2205     }
2206     }
2207     else
2208     #endif /* SUPPORT_UTF8 */
2209    
2210     /* Non-UTF-8 mode */
2211     {
2212     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2213     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2214     ecode += 2;
2215     }
2216     break;
2217    
2218 nigel 93 /* Match a single character repeatedly. */
2219 nigel 77
2220     case OP_EXACT:
2221     min = max = GET2(ecode, 1);
2222     ecode += 3;
2223     goto REPEATCHAR;
2224    
2225 nigel 93 case OP_POSUPTO:
2226     possessive = TRUE;
2227     /* Fall through */
2228    
2229 nigel 77 case OP_UPTO:
2230     case OP_MINUPTO:
2231     min = 0;
2232     max = GET2(ecode, 1);
2233     minimize = *ecode == OP_MINUPTO;
2234     ecode += 3;
2235     goto REPEATCHAR;
2236    
2237 nigel 93 case OP_POSSTAR:
2238     possessive = TRUE;
2239     min = 0;
2240     max = INT_MAX;
2241     ecode++;
2242     goto REPEATCHAR;
2243    
2244     case OP_POSPLUS:
2245     possessive = TRUE;
2246     min = 1;
2247     max = INT_MAX;
2248     ecode++;
2249     goto REPEATCHAR;
2250    
2251     case OP_POSQUERY:
2252     possessive = TRUE;
2253     min = 0;
2254     max = 1;
2255     ecode++;
2256     goto REPEATCHAR;
2257    
2258 nigel 77 case OP_STAR:
2259     case OP_MINSTAR:
2260     case OP_PLUS:
2261     case OP_MINPLUS:
2262     case OP_QUERY:
2263     case OP_MINQUERY:
2264     c = *ecode++ - OP_STAR;
2265     minimize = (c & 1) != 0;
2266     min = rep_min[c]; /* Pick up values from tables; */
2267     max = rep_max[c]; /* zero for max => infinity */
2268     if (max == 0) max = INT_MAX;
2269    
2270     /* Common code for all repeated single-character matches. We can give
2271     up quickly if there are fewer than the minimum number of characters left in
2272     the subject. */
2273    
2274     REPEATCHAR:
2275     #ifdef SUPPORT_UTF8
2276     if (utf8)
2277     {
2278     length = 1;
2279     charptr = ecode;
2280     GETCHARLEN(fc, ecode, length);
2281     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2282     ecode += length;
2283    
2284     /* Handle multibyte character matching specially here. There is
2285     support for caseless matching if UCP support is present. */
2286    
2287     if (length > 1)
2288     {
2289     #ifdef SUPPORT_UCP
2290 nigel 93 unsigned int othercase;
2291 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2292 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
2293 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2294 ph10 115 else oclength = 0;
2295 nigel 77 #endif /* SUPPORT_UCP */
2296    
2297     for (i = 1; i <= min; i++)
2298     {
2299     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2300 ph10 123 #ifdef SUPPORT_UCP
2301 nigel 77 /* Need braces because of following else */
2302     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2303     else
2304     {
2305     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2306     eptr += oclength;
2307     }
2308 ph10 115 #else /* without SUPPORT_UCP */
2309     else { RRETURN(MATCH_NOMATCH); }
2310 ph10 123 #endif /* SUPPORT_UCP */
2311 nigel 77 }
2312    
2313     if (min == max) continue;
2314    
2315     if (minimize)
2316     {
2317     for (fi = min;; fi++)
2318     {
2319 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2320 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2321     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2322     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2323 ph10 123 #ifdef SUPPORT_UCP
2324 nigel 77 /* Need braces because of following else */
2325     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2326     else
2327     {
2328     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2329     eptr += oclength;
2330     }
2331 ph10 115 #else /* without SUPPORT_UCP */
2332     else { RRETURN (MATCH_NOMATCH); }
2333     #endif /* SUPPORT_UCP */
2334 nigel 77 }
2335     /* Control never gets here */
2336     }
2337 nigel 93
2338     else /* Maximize */
2339 nigel 77 {
2340     pp = eptr;
2341     for (i = min; i < max; i++)
2342     {
2343     if (eptr > md->end_subject - length) break;
2344     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2345 ph10 123 #ifdef SUPPORT_UCP
2346 nigel 77 else if (oclength == 0) break;
2347     else
2348     {
2349     if (memcmp(eptr, occhars, oclength) != 0) break;
2350     eptr += oclength;
2351     }
2352 ph10 115 #else /* without SUPPORT_UCP */
2353     else break;
2354 ph10 123 #endif /* SUPPORT_UCP */
2355 nigel 77 }
2356 nigel 93
2357     if (possessive) continue;
2358 ph10 120 for(;;)
2359 nigel 77 {
2360 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2361 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2362 ph10 120 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2363 ph10 115 #ifdef SUPPORT_UCP
2364     eptr--;
2365     BACKCHAR(eptr);
2366 ph10 123 #else /* without SUPPORT_UCP */
2367 nigel 77 eptr -= length;
2368 ph10 123 #endif /* SUPPORT_UCP */
2369 nigel 77 }
2370     }
2371     /* Control never gets here */
2372     }
2373    
2374     /* If the length of a UTF-8 character is 1, we fall through here, and
2375     obey the code as for non-UTF-8 characters below, though in this case the
2376     value of fc will always be < 128. */
2377     }
2378     else
2379     #endif /* SUPPORT_UTF8 */
2380    
2381     /* When not in UTF-8 mode, load a single-byte character. */
2382     {
2383     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2384     fc = *ecode++;
2385     }
2386    
2387     /* The value of fc at this point is always less than 256, though we may or
2388     may not be in UTF-8 mode. The code is duplicated for the caseless and
2389     caseful cases, for speed, since matching characters is likely to be quite
2390     common. First, ensure the minimum number of matches are present. If min =
2391     max, continue at the same level without recursing. Otherwise, if
2392     minimizing, keep trying the rest of the expression and advancing one
2393     matching character if failing, up to the maximum. Alternatively, if
2394     maximizing, find the maximum number of characters and work backwards. */
2395    
2396     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2397     max, eptr));
2398    
2399     if ((ims & PCRE_CASELESS) != 0)
2400     {
2401     fc = md->lcc[fc];
2402     for (i = 1; i <= min; i++)
2403     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2404     if (min == max) continue;
2405     if (minimize)
2406     {
2407     for (fi = min;; fi++)
2408     {
2409 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2410 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2411     if (fi >= max || eptr >= md->end_subject ||
2412     fc != md->lcc[*eptr++])
2413     RRETURN(MATCH_NOMATCH);
2414     }
2415     /* Control never gets here */
2416     }
2417 nigel 93 else /* Maximize */
2418 nigel 77 {
2419     pp = eptr;
2420     for (i = min; i < max; i++)
2421     {
2422     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2423     eptr++;
2424     }
2425 nigel 93 if (possessive) continue;
2426 nigel 77 while (eptr >= pp)
2427     {
2428 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2429 nigel 77 eptr--;
2430     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2431     }
2432     RRETURN(MATCH_NOMATCH);
2433     }
2434     /* Control never gets here */
2435     }
2436    
2437     /* Caseful comparisons (includes all multi-byte characters) */
2438    
2439     else
2440     {
2441     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2442     if (min == max) continue;
2443     if (minimize)
2444     {
2445     for (fi = min;; fi++)
2446     {
2447 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2448 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2449     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2450     RRETURN(MATCH_NOMATCH);
2451     }
2452     /* Control never gets here */
2453     }
2454 nigel 93 else /* Maximize */
2455 nigel 77 {
2456     pp = eptr;
2457     for (i = min; i < max; i++)
2458     {
2459     if (eptr >= md->end_subject || fc != *eptr) break;
2460     eptr++;
2461     }
2462 nigel 93 if (possessive) continue;
2463 nigel 77 while (eptr >= pp)
2464     {
2465 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2466 nigel 77 eptr--;
2467     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2468     }
2469     RRETURN(MATCH_NOMATCH);
2470     }
2471     }
2472     /* Control never gets here */
2473    
2474     /* Match a negated single one-byte character. The character we are
2475     checking can be multibyte. */
2476    
2477     case OP_NOT:
2478     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2479     ecode++;
2480     GETCHARINCTEST(c, eptr);
2481     if ((ims & PCRE_CASELESS) != 0)
2482     {
2483     #ifdef SUPPORT_UTF8
2484     if (c < 256)
2485     #endif
2486     c = md->lcc[c];
2487     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2488     }
2489     else
2490     {
2491     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2492     }
2493     break;
2494    
2495     /* Match a negated single one-byte character repeatedly. This is almost a
2496     repeat of the code for a repeated single character, but I haven't found a
2497     nice way of commoning these up that doesn't require a test of the
2498     positive/negative option for each character match. Maybe that wouldn't add
2499     very much to the time taken, but character matching *is* what this is all
2500     about... */
2501    
2502     case OP_NOTEXACT:
2503     min = max = GET2(ecode, 1);
2504     ecode += 3;
2505     goto REPEATNOTCHAR;
2506    
2507     case OP_NOTUPTO:
2508     case OP_NOTMINUPTO:
2509     min = 0;
2510     max = GET2(ecode, 1);
2511     minimize = *ecode == OP_NOTMINUPTO;
2512     ecode += 3;
2513     goto REPEATNOTCHAR;
2514    
2515 nigel 93 case OP_NOTPOSSTAR:
2516     possessive = TRUE;
2517     min = 0;
2518     max = INT_MAX;
2519     ecode++;
2520     goto REPEATNOTCHAR;
2521    
2522     case OP_NOTPOSPLUS:
2523     possessive = TRUE;
2524     min = 1;
2525     max = INT_MAX;
2526     ecode++;
2527     goto REPEATNOTCHAR;
2528    
2529     case OP_NOTPOSQUERY:
2530     possessive = TRUE;
2531     min = 0;
2532     max = 1;
2533     ecode++;
2534     goto REPEATNOTCHAR;
2535    
2536     case OP_NOTPOSUPTO:
2537     possessive = TRUE;
2538     min = 0;
2539     max = GET2(ecode, 1);
2540     ecode += 3;
2541     goto REPEATNOTCHAR;
2542    
2543 nigel 77 case OP_NOTSTAR:
2544     case OP_NOTMINSTAR:
2545     case OP_NOTPLUS:
2546     case OP_NOTMINPLUS:
2547     case OP_NOTQUERY:
2548     case OP_NOTMINQUERY:
2549     c = *ecode++ - OP_NOTSTAR;
2550     minimize = (c & 1) != 0;
2551     min = rep_min[c]; /* Pick up values from tables; */
2552     max = rep_max[c]; /* zero for max => infinity */
2553     if (max == 0) max = INT_MAX;
2554    
2555     /* Common code for all repeated single-byte matches. We can give up quickly
2556     if there are fewer than the minimum number of bytes left in the
2557     subject. */
2558    
2559     REPEATNOTCHAR:
2560     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2561     fc = *ecode++;
2562    
2563     /* The code is duplicated for the caseless and caseful cases, for speed,
2564     since matching characters is likely to be quite common. First, ensure the
2565     minimum number of matches are present. If min = max, continue at the same
2566     level without recursing. Otherwise, if minimizing, keep trying the rest of
2567     the expression and advancing one matching character if failing, up to the
2568     maximum. Alternatively, if maximizing, find the maximum number of
2569     characters and work backwards. */
2570    
2571     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2572     max, eptr));
2573    
2574     if ((ims & PCRE_CASELESS) != 0)
2575     {
2576     fc = md->lcc[fc];
2577    
2578     #ifdef SUPPORT_UTF8
2579     /* UTF-8 mode */
2580     if (utf8)
2581     {
2582 nigel 93 register unsigned int d;
2583 nigel 77 for (i = 1; i <= min; i++)
2584     {
2585     GETCHARINC(d, eptr);
2586     if (d < 256) d = md->lcc[d];
2587     if (fc == d) RRETURN(MATCH_NOMATCH);
2588     }
2589     }
2590     else
2591     #endif
2592    
2593     /* Not UTF-8 mode */
2594     {
2595     for (i = 1; i <= min; i++)
2596     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2597     }
2598    
2599     if (min == max) continue;
2600    
2601     if (minimize)
2602     {
2603     #ifdef SUPPORT_UTF8
2604     /* UTF-8 mode */
2605     if (utf8)
2606     {
2607 nigel 93 register unsigned int d;
2608 nigel 77 for (fi = min;; fi++)
2609     {
2610 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2611 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2612 ph10 366 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2613 nigel 77 GETCHARINC(d, eptr);
2614     if (d < 256) d = md->lcc[d];
2615 ph10 366 if (fc == d) RRETURN(MATCH_NOMATCH);
2616 ph10 371
2617 nigel 77 }
2618     }
2619     else
2620     #endif
2621     /* Not UTF-8 mode */
2622     {
2623     for (fi = min;; fi++)
2624     {
2625 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2626 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2627     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2628     RRETURN(MATCH_NOMATCH);
2629     }
2630     }
2631     /* Control never gets here */
2632     }
2633    
2634     /* Maximize case */
2635    
2636     else
2637     {
2638     pp = eptr;
2639    
2640     #ifdef SUPPORT_UTF8
2641     /* UTF-8 mode */
2642     if (utf8)
2643     {
2644 nigel 93 register unsigned int d;
2645 nigel 77 for (i = min; i < max; i++)
2646     {
2647     int len = 1;
2648     if (eptr >= md->end_subject) break;
2649     GETCHARLEN(d, eptr, len);
2650     if (d < 256) d = md->lcc[d];
2651     if (fc == d) break;
2652     eptr += len;
2653     }
2654 nigel 93 if (possessive) continue;
2655     for(;;)
2656 nigel 77 {
2657 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2658 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2659     if (eptr-- == pp) break; /* Stop if tried at original pos */
2660     BACKCHAR(eptr);
2661     }
2662     }
2663     else
2664     #endif
2665     /* Not UTF-8 mode */
2666     {
2667     for (i = min; i < max; i++)
2668     {
2669     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2670     eptr++;
2671     }
2672 nigel 93 if (possessive) continue;
2673 nigel 77 while (eptr >= pp)
2674     {
2675 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2676 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2677     eptr--;
2678     }
2679     }
2680    
2681     RRETURN(MATCH_NOMATCH);
2682     }
2683     /* Control never gets here */
2684     }
2685    
2686     /* Caseful comparisons */
2687    
2688     else
2689     {
2690     #ifdef SUPPORT_UTF8
2691     /* UTF-8 mode */
2692     if (utf8)
2693     {
2694 nigel 93 register unsigned int d;
2695 nigel 77 for (i = 1; i <= min; i++)
2696     {
2697     GETCHARINC(d, eptr);
2698     if (fc == d) RRETURN(MATCH_NOMATCH);
2699     }
2700     }
2701     else
2702     #endif
2703     /* Not UTF-8 mode */
2704     {
2705     for (i = 1; i <= min; i++)
2706     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2707     }
2708    
2709     if (min == max) continue;
2710    
2711     if (minimize)
2712     {
2713     #ifdef SUPPORT_UTF8
2714     /* UTF-8 mode */
2715     if (utf8)
2716     {
2717 nigel 93 register unsigned int d;
2718 nigel 77 for (fi = min;; fi++)
2719     {
2720 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2721 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2722 ph10 366 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2723 nigel 77 GETCHARINC(d, eptr);
2724 ph10 371 if (fc == d) RRETURN(MATCH_NOMATCH);
2725 nigel 77 }
2726     }
2727     else
2728     #endif
2729     /* Not UTF-8 mode */
2730     {
2731     for (fi = min;; fi++)
2732     {
2733 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2734 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2735     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2736     RRETURN(MATCH_NOMATCH);
2737     }
2738     }
2739     /* Control never gets here */
2740     }
2741    
2742     /* Maximize case */
2743    
2744     else
2745     {
2746     pp = eptr;
2747    
2748     #ifdef SUPPORT_UTF8
2749     /* UTF-8 mode */
2750     if (utf8)
2751     {
2752 nigel 93 register unsigned int d;
2753 nigel 77 for (i = min; i < max; i++)
2754     {
2755     int len = 1;
2756     if (eptr >= md->end_subject) break;
2757     GETCHARLEN(d, eptr, len);
2758     if (fc == d) break;
2759     eptr += len;
2760     }
2761 nigel 93 if (possessive) continue;
2762 nigel 77 for(;;)
2763     {
2764 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2765 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2766     if (eptr-- == pp) break; /* Stop if tried at original pos */
2767     BACKCHAR(eptr);
2768     }
2769     }
2770     else
2771     #endif
2772     /* Not UTF-8 mode */
2773     {
2774     for (i = min; i < max; i++)
2775     {
2776     if (eptr >= md->end_subject || fc == *eptr) break;
2777     eptr++;
2778     }
2779 nigel 93 if (possessive) continue;
2780 nigel 77 while (eptr >= pp)
2781     {
2782 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2783 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2784     eptr--;
2785     }
2786     }
2787    
2788     RRETURN(MATCH_NOMATCH);
2789     }
2790     }
2791     /* Control never gets here */
2792    
2793     /* Match a single character type repeatedly; several different opcodes
2794     share code. This is very similar to the code for single characters, but we
2795     repeat it in the interests of efficiency. */
2796    
/* Exact-count repeat of a character type: min and max both take the value
that GET2 yields for the operand at ecode + 1, then control joins the shared
REPEATTYPE code. */
2797     case OP_TYPEEXACT:
2798     min = max = GET2(ecode, 1);
2799     minimize = TRUE; /* Not consulted on this path: the shared code does "if (min == max) continue;" before it tests minimize */
2800     ecode += 3; /* Presumably the opcode byte plus a 2-byte count - confirm against GET2 */
2801     goto REPEATTYPE;
2802    
/* Bounded repeat of a character type with a lower bound of zero. The upper
bound comes from the operand at ecode + 1, and the two opcodes differ only in
whether the repeat is minimizing. */
2803     case OP_TYPEUPTO:
2804     case OP_TYPEMINUPTO:
2805     min = 0;
2806     max = GET2(ecode, 1);
2807     minimize = *ecode == OP_TYPEMINUPTO; /* Must be read before ecode is advanced below */
2808     ecode += 3; /* Presumably the opcode byte plus a 2-byte count - confirm against GET2 */
2809     goto REPEATTYPE;
2810    
/* Possessive zero-or-more repeat of a character type. With possessive set,
the shared maximizing code executes "if (possessive) continue;" after
consuming the longest run, so no shorter runs are retried. */
2811 nigel 93 case OP_TYPEPOSSTAR:
2812     possessive = TRUE;
2813     min = 0;
2814     max = INT_MAX; /* Stands for "no upper limit" */
2815     ecode++; /* Step past the opcode; REPEATTYPE reads the type code next with *ecode++ */
2816     goto REPEATTYPE;
2817    
/* Possessive one-or-more repeat of a character type: at least one match is
required and there is no upper limit. The possessive flag makes the shared
maximizing code continue without retrying shorter runs. */
2818     case OP_TYPEPOSPLUS:
2819     possessive = TRUE;
2820     min = 1;
2821     max = INT_MAX; /* Stands for "no upper limit" */
2822     ecode++; /* Step past the opcode; REPEATTYPE reads the type code next with *ecode++ */
2823     goto REPEATTYPE;
2824    
/* Possessive optional match of a character type: zero or one occurrence.
The possessive flag makes the shared maximizing code continue without
retrying the shorter alternative. */
2825     case OP_TYPEPOSQUERY:
2826     possessive = TRUE;
2827     min = 0;
2828     max = 1;
2829     ecode++; /* Step past the opcode; REPEATTYPE reads the type code next with *ecode++ */
2830     goto REPEATTYPE;
2831    
/* Possessive bounded repeat of a character type: the lower bound is zero and
the upper bound comes from the operand at ecode + 1. The possessive flag makes
the shared maximizing code continue without retrying shorter runs. */
2832     case OP_TYPEPOSUPTO:
2833     possessive = TRUE;
2834     min = 0;
2835     max = GET2(ecode, 1);
2836     ecode += 3; /* Presumably the opcode byte plus a 2-byte count - confirm against GET2 */
2837     goto REPEATTYPE;
2838    
2839 nigel 77 case OP_TYPESTAR:
2840     case OP_TYPEMINSTAR:
2841     case OP_TYPEPLUS:
2842     case OP_TYPEMINPLUS:
2843     case OP_TYPEQUERY:
2844     case OP_TYPEMINQUERY:
2845     c = *ecode++ - OP_TYPESTAR;
2846     minimize = (c & 1) != 0;
2847     min = rep_min[c]; /* Pick up values from tables; */
2848     max = rep_max[c]; /* zero for max => infinity */
2849     if (max == 0) max = INT_MAX;
2850    
2851     /* Common code for all repeated single character type matches. Note that
2852     in UTF-8 mode, '.' can match a multibyte character, as can the negated types,
2853     OP_ANYNL, OP_HSPACE and OP_VSPACE; OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR match only one-byte characters in the minimum-count loops. */
2854    
2855     REPEATTYPE:
2856     ctype = *ecode++; /* Code for the character type */
2857    
2858     #ifdef SUPPORT_UCP
2859     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2860     {
2861     prop_fail_result = ctype == OP_NOTPROP;
2862     prop_type = *ecode++;
2863 nigel 87 prop_value = *ecode++;
2864 nigel 77 }
2865     else prop_type = -1;
2866     #endif
2867    
2868     /* First, ensure the minimum number of matches are present. Use inline
2869     code for maximizing the speed, and do the type test once at the start
2870     (i.e. keep it out of the loop). Also we can test that there are at least
2871     the minimum number of bytes before we start. This isn't as effective in
2872     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2873     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2874     and single-bytes. */
2875    
2876     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2877     if (min > 0)
2878     {
2879     #ifdef SUPPORT_UCP
2880 nigel 87 if (prop_type >= 0)
2881 nigel 77 {
2882 nigel 87 switch(prop_type)
2883 nigel 77 {
2884 nigel 87 case PT_ANY:
2885     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2886     for (i = 1; i <= min; i++)
2887     {
2888     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2889 ph10 184 GETCHARINCTEST(c, eptr);
2890 nigel 87 }
2891     break;
2892    
2893     case PT_LAMP:
2894     for (i = 1; i <= min; i++)
2895     {
2896     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2897 ph10 184 GETCHARINCTEST(c, eptr);
2898 ph10 349 prop_chartype = UCD_CHARTYPE(c);
2899 nigel 87 if ((prop_chartype == ucp_Lu ||
2900     prop_chartype == ucp_Ll ||
2901     prop_chartype == ucp_Lt) == prop_fail_result)
2902     RRETURN(MATCH_NOMATCH);
2903     }
2904     break;
2905    
2906     case PT_GC:
2907     for (i = 1; i <= min; i++)
2908     {
2909     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2910 ph10 184 GETCHARINCTEST(c, eptr);
2911 ph10 349 prop_category = UCD_CATEGORY(c);
2912 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
2913     RRETURN(MATCH_NOMATCH);
2914     }
2915     break;
2916    
2917     case PT_PC:
2918     for (i = 1; i <= min; i++)
2919     {
2920     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2921 ph10 184 GETCHARINCTEST(c, eptr);
2922 ph10 349 prop_chartype = UCD_CHARTYPE(c);
2923 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
2924     RRETURN(MATCH_NOMATCH);
2925     }
2926     break;
2927    
2928     case PT_SC:
2929     for (i = 1; i <= min; i++)
2930     {
2931     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2932 ph10 184 GETCHARINCTEST(c, eptr);
2933 ph10 349 prop_script = UCD_SCRIPT(c);
2934 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
2935     RRETURN(MATCH_NOMATCH);
2936     }
2937     break;
2938    
2939     default:
2940     RRETURN(PCRE_ERROR_INTERNAL);
2941 nigel 77 }
2942     }
2943    
2944     /* Match extended Unicode sequences. We will get here only if the
2945     support is in the binary; otherwise a compile-time error occurs. */
2946    
2947     else if (ctype == OP_EXTUNI)
2948     {
2949     for (i = 1; i <= min; i++)
2950     {
2951     GETCHARINCTEST(c, eptr);
2952 ph10 349 prop_category = UCD_CATEGORY(c);
2953 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2954     while (eptr < md->end_subject)
2955     {
2956     int len = 1;
2957     if (!utf8) c = *eptr; else
2958     {
2959     GETCHARLEN(c, eptr, len);
2960     }
2961 ph10 349 prop_category = UCD_CATEGORY(c);
2962 nigel 77 if (prop_category != ucp_M) break;
2963     eptr += len;
2964     }
2965     }
2966     }
2967    
2968     else
2969     #endif /* SUPPORT_UCP */
2970    
2971     /* Handle all other cases when the coding is UTF-8 */
2972    
2973     #ifdef SUPPORT_UTF8
2974     if (utf8) switch(ctype)
2975     {
2976     case OP_ANY:
2977     for (i = 1; i <= min; i++)
2978     {
2979 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr))
2980 nigel 77 RRETURN(MATCH_NOMATCH);
2981 nigel 91 eptr++;
2982 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2983     }
2984     break;
2985    
2986 ph10 341 case OP_ALLANY:
2987     for (i = 1; i <= min; i++)
2988     {
2989     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2990     eptr++;
2991     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2992     }
2993     break;
2994    
2995 nigel 77 case OP_ANYBYTE:
2996     eptr += min;
2997     break;
2998    
2999 nigel 93 case OP_ANYNL:
3000     for (i = 1; i <= min; i++)
3001     {
3002     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3003     GETCHARINC(c, eptr);
3004     switch(c)
3005     {
3006     default: RRETURN(MATCH_NOMATCH);
3007     case 0x000d:
3008     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3009     break;
3010 ph10 231
3011 nigel 93 case 0x000a:
3012 ph10 231 break;
3013    
3014 nigel 93 case 0x000b:
3015     case 0x000c:
3016     case 0x0085:
3017     case 0x2028:
3018     case 0x2029:
3019 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3020 nigel 93 break;
3021     }
3022     }
3023     break;
3024    
3025 ph10 178 case OP_NOT_HSPACE:
3026     for (i = 1; i <= min; i++)
3027     {
3028     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3029     GETCHARINC(c, eptr);
3030     switch(c)
3031     {
3032     default: break;
3033     case 0x09: /* HT */
3034     case 0x20: /* SPACE */
3035     case 0xa0: /* NBSP */
3036     case 0x1680: /* OGHAM SPACE MARK */
3037     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3038     case 0x2000: /* EN QUAD */
3039     case 0x2001: /* EM QUAD */
3040     case 0x2002: /* EN SPACE */
3041     case 0x2003: /* EM SPACE */
3042     case 0x2004: /* THREE-PER-EM SPACE */
3043     case 0x2005: /* FOUR-PER-EM SPACE */
3044     case 0x2006: /* SIX-PER-EM SPACE */
3045     case 0x2007: /* FIGURE SPACE */
3046     case 0x2008: /* PUNCTUATION SPACE */
3047     case 0x2009: /* THIN SPACE */
3048     case 0x200A: /* HAIR SPACE */
3049     case 0x202f: /* NARROW NO-BREAK SPACE */
3050     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3051     case 0x3000: /* IDEOGRAPHIC SPACE */
3052     RRETURN(MATCH_NOMATCH);
3053     }
3054     }
3055     break;
3056 ph10 182
3057 ph10 178 case OP_HSPACE:
3058     for (i = 1; i <= min; i++)
3059     {
3060     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3061     GETCHARINC(c, eptr);
3062     switch(c)
3063     {
3064     default: RRETURN(MATCH_NOMATCH);
3065     case 0x09: /* HT */
3066     case 0x20: /* SPACE */
3067     case 0xa0: /* NBSP */
3068     case 0x1680: /* OGHAM SPACE MARK */
3069     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3070     case 0x2000: /* EN QUAD */
3071     case 0x2001: /* EM QUAD */
3072     case 0x2002: /* EN SPACE */
3073     case 0x2003: /* EM SPACE */
3074     case 0x2004: /* THREE-PER-EM SPACE */
3075     case 0x2005: /* FOUR-PER-EM SPACE */
3076     case 0x2006: /* SIX-PER-EM SPACE */
3077     case 0x2007: /* FIGURE SPACE */
3078     case 0x2008: /* PUNCTUATION SPACE */
3079     case 0x2009: /* THIN SPACE */
3080     case 0x200A: /* HAIR SPACE */
3081     case 0x202f: /* NARROW NO-BREAK SPACE */
3082     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3083     case 0x3000: /* IDEOGRAPHIC SPACE */
3084     break;
3085     }
3086     }
3087     break;
3088 ph10 182
3089 ph10 178 case OP_NOT_VSPACE:
3090     for (i = 1; i <= min; i++)
3091     {
3092     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3093     GETCHARINC(c, eptr);
3094     switch(c)
3095     {
3096     default: break;
3097     case 0x0a: /* LF */
3098     case 0x0b: /* VT */
3099     case 0x0c: /* FF */
3100     case 0x0d: /* CR */
3101     case 0x85: /* NEL */
3102     case 0x2028: /* LINE SEPARATOR */
3103     case 0x2029: /* PARAGRAPH SEPARATOR */
3104     RRETURN(MATCH_NOMATCH);
3105     }
3106     }
3107     break;
3108 ph10 182
3109 ph10 178 case OP_VSPACE:
3110     for (i = 1; i <= min; i++)
3111     {
3112     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3113     GETCHARINC(c, eptr);
3114     switch(c)
3115     {
3116     default: RRETURN(MATCH_NOMATCH);
3117     case 0x0a: /* LF */
3118     case 0x0b: /* VT */
3119     case 0x0c: /* FF */
3120     case 0x0d: /* CR */
3121     case 0x85: /* NEL */
3122     case 0x2028: /* LINE SEPARATOR */
3123     case 0x2029: /* PARAGRAPH SEPARATOR */
3124 ph10 182 break;
3125 ph10 178 }
3126     }
3127     break;
3128    
3129 nigel 77 case OP_NOT_DIGIT:
3130     for (i = 1; i <= min; i++)
3131     {
3132     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3133     GETCHARINC(c, eptr);
3134     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3135     RRETURN(MATCH_NOMATCH);
3136     }
3137     break;
3138    
3139     case OP_DIGIT:
3140     for (i = 1; i <= min; i++)
3141     {
3142     if (eptr >= md->end_subject ||
3143     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3144     RRETURN(MATCH_NOMATCH);
3145     /* No need to skip more bytes - we know it's a 1-byte character */
3146     }
3147     break;
3148    
3149     case OP_NOT_WHITESPACE:
3150     for (i = 1; i <= min; i++)
3151     {
3152     if (eptr >= md->end_subject ||
3153 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3154 nigel 77 RRETURN(MATCH_NOMATCH);
3155 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3156 nigel 77 }
3157     break;
3158    
3159     case OP_WHITESPACE:
3160     for (i = 1; i <= min; i++)
3161     {
3162     if (eptr >= md->end_subject ||
3163     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3164     RRETURN(MATCH_NOMATCH);
3165     /* No need to skip more bytes - we know it's a 1-byte character */
3166     }
3167     break;
3168    
3169     case OP_NOT_WORDCHAR:
3170     for (i = 1; i <= min; i++)
3171     {
3172     if (eptr >= md->end_subject ||
3173 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3174 nigel 77 RRETURN(MATCH_NOMATCH);
3175 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3176 nigel 77 }
3177     break;
3178    
3179     case OP_WORDCHAR:
3180     for (i = 1; i <= min; i++)
3181     {
3182     if (eptr >= md->end_subject ||
3183     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3184     RRETURN(MATCH_NOMATCH);
3185     /* No need to skip more bytes - we know it's a 1-byte character */
3186     }
3187     break;
3188    
3189     default:
3190     RRETURN(PCRE_ERROR_INTERNAL);
3191     } /* End switch(ctype) */
3192    
3193     else
3194     #endif /* SUPPORT_UTF8 */
3195    
3196     /* Code for the non-UTF-8 case for minimum matching of operators other
3197 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3198     number of bytes present, as this was tested above. */
3199 nigel 77
3200     switch(ctype)
3201     {
3202     case OP_ANY:
3203 ph10 342 for (i = 1; i <= min; i++)
3204 nigel 77 {
3205 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3206     eptr++;
3207 nigel 77 }
3208     break;
3209    
3210 ph10 341 case OP_ALLANY:
3211     eptr += min;
3212     break;
3213    
3214 nigel 77 case OP_ANYBYTE:
3215     eptr += min;
3216     break;
3217    
3218 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
3219     bytes are present in this case. */
3220    
3221     case OP_ANYNL:
3222     for (i = 1; i <= min; i++)
3223     {
3224     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3225     switch(*eptr++)
3226     {
3227     default: RRETURN(MATCH_NOMATCH);
3228     case 0x000d:
3229     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3230     break;
3231     case 0x000a:
3232 ph10 231 break;
3233    
3234 nigel 93 case 0x000b:
3235     case 0x000c:
3236     case 0x0085:
3237 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3238 nigel 93 break;
3239     }
3240     }
3241     break;
3242    
3243 ph10 178 case OP_NOT_HSPACE:
3244     for (i = 1; i <= min; i++)
3245     {
3246     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3247     switch(*eptr++)
3248     {
3249     default: break;
3250     case 0x09: /* HT */
3251     case 0x20: /* SPACE */
3252     case 0xa0: /* NBSP */
3253     RRETURN(MATCH_NOMATCH);
3254     }
3255     }
3256     break;
3257    
3258     case OP_HSPACE:
3259     for (i = 1; i <= min; i++)
3260     {
3261     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3262     switch(*eptr++)
3263     {
3264     default: RRETURN(MATCH_NOMATCH);
3265     case 0x09: /* HT */
3266     case 0x20: /* SPACE */
3267     case 0xa0: /* NBSP */
3268 ph10 182 break;
3269 ph10 178 }
3270     }
3271     break;
3272    
3273     case OP_NOT_VSPACE:
3274     for (i = 1; i <= min; i++)
3275     {
3276     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3277     switch(*eptr++)
3278     {
3279     default: break;
3280     case 0x0a: /* LF */
3281     case 0x0b: /* VT */
3282     case 0x0c: /* FF */
3283     case 0x0d: /* CR */
3284     case 0x85: /* NEL */
3285     RRETURN(MATCH_NOMATCH);
3286     }
3287     }
3288     break;
3289    
3290     case OP_VSPACE:
3291     for (i = 1; i <= min; i++)
3292     {
3293     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3294     switch(*eptr++)
3295     {
3296     default: RRETURN(MATCH_NOMATCH);
3297     case 0x0a: /* LF */
3298     case 0x0b: /* VT */
3299     case 0x0c: /* FF */
3300     case 0x0d: /* CR */
3301     case 0x85: /* NEL */
3302 ph10 182 break;
3303 ph10 178 }
3304     }
3305     break;
3306    
3307 nigel 77 case OP_NOT_DIGIT:
3308     for (i = 1; i <= min; i++)
3309     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3310     break;
3311    
3312     case OP_DIGIT:
3313     for (i = 1; i <= min; i++)
3314     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3315     break;
3316    
3317     case OP_NOT_WHITESPACE:
3318     for (i = 1; i <= min; i++)
3319     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3320     break;
3321    
3322     case OP_WHITESPACE:
3323     for (i = 1; i <= min; i++)
3324     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3325     break;
3326    
3327     case OP_NOT_WORDCHAR:
3328     for (i = 1; i <= min; i++)
3329     if ((md->ctypes[*eptr++] & ctype_word) != 0)
3330     RRETURN(MATCH_NOMATCH);
3331     break;
3332    
3333     case OP_WORDCHAR:
3334     for (i = 1; i <= min; i++)
3335     if ((md->ctypes[*eptr++] & ctype_word) == 0)
3336     RRETURN(MATCH_NOMATCH);
3337     break;
3338    
3339     default:
3340     RRETURN(PCRE_ERROR_INTERNAL);
3341     }
3342     }
3343    
3344     /* If min = max, continue at the same level without recursing */
3345    
3346     if (min == max) continue;
3347    
3348     /* If minimizing, we have to test the rest of the pattern before each
3349     subsequent match. Again, separate the UTF-8 case for speed, and also
3350     separate the UCP cases. */
3351    
3352     if (minimize)
3353     {
3354     #ifdef SUPPORT_UCP
3355 nigel 87 if (prop_type >= 0)
3356 nigel 77 {
3357 nigel 87 switch(prop_type)
3358 nigel 77 {
3359 nigel 87 case PT_ANY:
3360     for (fi = min;; fi++)
3361     {
3362 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3363 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3364     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3365     GETCHARINC(c, eptr);
3366     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3367     }
3368 nigel 93 /* Control never gets here */
3369 nigel 87
3370     case PT_LAMP:
3371     for (fi = min;; fi++)
3372     {
3373 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3374 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3375     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3376     GETCHARINC(c, eptr);
3377 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3378 nigel 87 if ((prop_chartype == ucp_Lu ||
3379     prop_chartype == ucp_Ll ||
3380     prop_chartype == ucp_Lt) == prop_fail_result)
3381     RRETURN(MATCH_NOMATCH);
3382     }
3383 nigel 93 /* Control never gets here */
3384 nigel 87
3385     case PT_GC:
3386     for (fi = min;; fi++)
3387     {
3388 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3389 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3390     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3391     GETCHARINC(c, eptr);
3392 ph10 349 prop_category = UCD_CATEGORY(c);
3393 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3394     RRETURN(MATCH_NOMATCH);
3395     }
3396 nigel 93 /* Control never gets here */
3397 nigel 87
3398     case PT_PC:
3399     for (fi = min;; fi++)
3400     {
3401 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3402 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3403     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3404     GETCHARINC(c, eptr);
3405 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3406 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3407     RRETURN(MATCH_NOMATCH);
3408     }
3409 nigel 93 /* Control never gets here */
3410 nigel 87
3411     case PT_SC:
3412     for (fi = min;; fi++)
3413     {
3414 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3415 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3416     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3417     GETCHARINC(c, eptr);
3418 ph10 349 prop_script = UCD_SCRIPT(c);
3419 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3420     RRETURN(MATCH_NOMATCH);
3421     }
3422 nigel 93 /* Control never gets here */
3423 nigel 87
3424     default:
3425     RRETURN(PCRE_ERROR_INTERNAL);
3426 nigel 77 }
3427     }
3428    
3429     /* Match extended Unicode sequences. We will get here only if the
3430     support is in the binary; otherwise a compile-time error occurs. */
3431    
3432     else if (ctype == OP_EXTUNI)
3433     {
3434     for (fi = min;; fi++)
3435     {
3436 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3437 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3438     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3439     GETCHARINCTEST(c, eptr);
3440 ph10 349 prop_category = UCD_CATEGORY(c);
3441 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3442     while (eptr < md->end_subject)
3443     {
3444     int len = 1;
3445     if (!utf8) c = *eptr; else
3446     {
3447     GETCHARLEN(c, eptr, len);
3448     }
3449 ph10 349 prop_category = UCD_CATEGORY(c);
3450 nigel 77 if (prop_category != ucp_M) break;
3451     eptr += len;
3452     }
3453     }
3454     }
3455    
3456     else
3457     #endif /* SUPPORT_UCP */
3458    
3459     #ifdef SUPPORT_UTF8
3460     /* UTF-8 mode */
3461     if (utf8)
3462     {
3463     for (fi = min;; fi++)
3464     {
3465 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3466 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3467 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3468 ph10 342 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3469 nigel 91 RRETURN(MATCH_NOMATCH);
3470 nigel 77
3471     GETCHARINC(c, eptr);
3472     switch(ctype)
3473     {
3474 ph10 342 case OP_ANY: /* This is the non-NL case */
3475 ph10 345 case OP_ALLANY:
3476 nigel 77 case OP_ANYBYTE:
3477     break;
3478    
3479 nigel 93 case OP_ANYNL:
3480     switch(c)
3481     {
3482     default: RRETURN(MATCH_NOMATCH);
3483     case 0x000d:
3484     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3485     break;
3486     case 0x000a:
3487 ph10 231 break;
3488    
3489 nigel 93 case 0x000b:
3490     case 0x000c:
3491     case 0x0085:
3492     case 0x2028:
3493     case 0x2029:
3494 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3495 nigel 93 break;
3496     }
3497     break;
3498    
3499 ph10 178 case OP_NOT_HSPACE:
3500     switch(c)
3501     {
3502     default: break;
3503     case 0x09: /* HT */
3504     case 0x20: /* SPACE */
3505     case 0xa0: /* NBSP */
3506     case 0x1680: /* OGHAM SPACE MARK */
3507     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3508     case 0x2000: /* EN QUAD */
3509     case 0x2001: /* EM QUAD */
3510     case 0x2002: /* EN SPACE */
3511     case 0x2003: /* EM SPACE */
3512     case 0x2004: /* THREE-PER-EM SPACE */
3513     case 0x2005: /* FOUR-PER-EM SPACE */
3514     case 0x2006: /* SIX-PER-EM SPACE */
3515     case 0x2007: /* FIGURE SPACE */
3516     case 0x2008: /* PUNCTUATION SPACE */
3517     case 0x2009: /* THIN SPACE */
3518     case 0x200A: /* HAIR SPACE */
3519     case 0x202f: /* NARROW NO-BREAK SPACE */
3520     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3521     case 0x3000: /* IDEOGRAPHIC SPACE */
3522     RRETURN(MATCH_NOMATCH);
3523     }
3524     break;
3525    
3526     case OP_HSPACE:
3527     switch(c)
3528     {
3529     default: RRETURN(MATCH_NOMATCH);
3530     case 0x09: /* HT */
3531     case 0x20: /* SPACE */
3532     case 0xa0: /* NBSP */
3533     case 0x1680: /* OGHAM SPACE MARK */
3534     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3535     case 0x2000: /* EN QUAD */
3536     case 0x2001: /* EM QUAD */
3537     case 0x2002: /* EN SPACE */
3538     case 0x2003: /* EM SPACE */
3539     case 0x2004: /* THREE-PER-EM SPACE */
3540     case 0x2005: /* FOUR-PER-EM SPACE */
3541     case 0x2006: /* SIX-PER-EM SPACE */
3542     case 0x2007: /* FIGURE SPACE */
3543     case 0x2008: /* PUNCTUATION SPACE */
3544     case 0x2009: /* THIN SPACE */
3545     case 0x200A: /* HAIR SPACE */
3546     case 0x202f: /* NARROW NO-BREAK SPACE */
3547     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3548     case 0x3000: /* IDEOGRAPHIC SPACE */
3549     break;
3550     }
3551     break;
3552    
3553     case OP_NOT_VSPACE:
3554     switch(c)
3555     {
3556     default: break;
3557     case 0x0a: /* LF */
3558     case 0x0b: /* VT */
3559     case 0x0c: /* FF */
3560     case 0x0d: /* CR */
3561     case 0x85: /* NEL */
3562     case 0x2028: /* LINE SEPARATOR */
3563     case 0x2029: /* PARAGRAPH SEPARATOR */
3564     RRETURN(MATCH_NOMATCH);
3565     }
3566     break;
3567    
3568     case OP_VSPACE:
3569     switch(c)
3570     {
3571     default: RRETURN(MATCH_NOMATCH);
3572     case 0x0a: /* LF */
3573     case 0x0b: /* VT */
3574     case 0x0c: /* FF */
3575     case 0x0d: /* CR */
3576     case 0x85: /* NEL */
3577     case 0x2028: /* LINE SEPARATOR */
3578     case 0x2029: /* PARAGRAPH SEPARATOR */
3579     break;
3580     }
3581     break;
3582    
3583 nigel 77 case OP_NOT_DIGIT:
3584     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3585     RRETURN(MATCH_NOMATCH);
3586     break;
3587    
3588     case OP_DIGIT:
3589     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3590     RRETURN(MATCH_NOMATCH);
3591     break;
3592    
3593     case OP_NOT_WHITESPACE:
3594     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3595     RRETURN(MATCH_NOMATCH);
3596     break;
3597    
3598     case OP_WHITESPACE:
3599     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3600     RRETURN(MATCH_NOMATCH);
3601     break;
3602    
3603     case OP_NOT_WORDCHAR:
3604     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3605     RRETURN(MATCH_NOMATCH);
3606     break;
3607    
3608     case OP_WORDCHAR:
3609     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3610     RRETURN(MATCH_NOMATCH);
3611     break;
3612    
3613     default:
3614     RRETURN(PCRE_ERROR_INTERNAL);
3615     }
3616     }
3617     }
3618     else
3619     #endif
3620     /* Not UTF-8 mode */
3621     {
3622     for (fi = min;; fi++)
3623     {
3624 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3625 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3626 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3627 ph10 342 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3628 nigel 91 RRETURN(MATCH_NOMATCH);
3629    
3630 nigel 77 c = *eptr++;
3631     switch(ctype)
3632     {
3633 ph10 342 case OP_ANY: /* This is the non-NL case */
3634 ph10 345 case OP_ALLANY:
3635 nigel 77 case OP_ANYBYTE:
3636     break;
3637    
3638 nigel 93 case OP_ANYNL:
3639     switch(c)
3640     {
3641     default: RRETURN(MATCH_NOMATCH);
3642     case 0x000d:
3643     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3644     break;
3645 ph10 231
3646 nigel 93 case 0x000a:
3647 ph10 231 break;
3648    
3649 nigel 93 case 0x000b:
3650     case 0x000c:
3651     case 0x0085:
3652 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3653 nigel 93 break;
3654     }
3655     break;
3656    
3657 ph10 178 case OP_NOT_HSPACE:
3658     switch(c)
3659     {
3660     default: break;
3661     case 0x09: /* HT */
3662     case 0x20: /* SPACE */
3663     case 0xa0: /* NBSP */
3664     RRETURN(MATCH_NOMATCH);
3665     }
3666     break;
3667    
3668     case OP_HSPACE:
3669     switch(c)
3670     {
3671     default: RRETURN(MATCH_NOMATCH);
3672     case 0x09: /* HT */
3673     case 0x20: /* SPACE */
3674     case 0xa0: /* NBSP */
3675     break;
3676     }
3677     break;
3678    
3679     case OP_NOT_VSPACE:
3680     switch(c)
3681     {
3682     default: break;
3683     case 0x0a: /* LF */
3684     case 0x0b: /* VT */
3685     case 0x0c: /* FF */
3686     case 0x0d: /* CR */
3687     case 0x85: /* NEL */
3688     RRETURN(MATCH_NOMATCH);
3689     }
3690     break;
3691    
3692     case OP_VSPACE:
3693     switch(c)
3694     {
3695     default: RRETURN(MATCH_NOMATCH);
3696     case 0x0a: /* LF */
3697     case 0x0b: /* VT */
3698     case 0x0c: /* FF */
3699     case 0x0d: /* CR */
3700     case 0x85: /* NEL */
3701     break;
3702     }
3703     break;
3704    
3705 nigel 77 case OP_NOT_DIGIT:
3706     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3707     break;
3708    
3709     case OP_DIGIT:
3710     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3711     break;
3712    
3713     case OP_NOT_WHITESPACE:
3714     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3715     break;
3716    
3717     case OP_WHITESPACE:
3718     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3719     break;
3720    
3721     case OP_NOT_WORDCHAR:
3722     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3723     break;
3724    
3725     case OP_WORDCHAR:
3726     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3727     break;
3728    
3729     default:
3730     RRETURN(PCRE_ERROR_INTERNAL);
3731     }
3732     }
3733     }
3734     /* Control never gets here */
3735     }
3736    
3737 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3738 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3739     UTF-8 and UCP stuff separate. */
3740    
3741     else
3742     {
3743     pp = eptr; /* Remember where we started */
3744    
3745     #ifdef SUPPORT_UCP
3746 nigel 87 if (prop_type >= 0)
3747 nigel 77 {
3748 nigel 87 switch(prop_type)
3749 nigel 77 {
3750 nigel 87 case PT_ANY:
3751     for (i = min; i < max; i++)
3752     {
3753     int len = 1;
3754     if (eptr >= md->end_subject) break;
3755     GETCHARLEN(c, eptr, len);
3756     if (prop_fail_result) break;
3757     eptr+= len;
3758     }
3759     break;
3760    
3761     case PT_LAMP:
3762     for (i = min; i < max; i++)
3763     {
3764     int len = 1;
3765     if (eptr >= md->end_subject) break;
3766     GETCHARLEN(c, eptr, len);
3767 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3768 nigel 87 if ((prop_chartype == ucp_Lu ||
3769     prop_chartype == ucp_Ll ||
3770     prop_chartype == ucp_Lt) == prop_fail_result)
3771     break;
3772     eptr+= len;
3773     }
3774     break;
3775    
3776     case PT_GC:
3777     for (i = min; i < max; i++)
3778     {
3779     int len = 1;
3780     if (eptr >= md->end_subject) break;
3781     GETCHARLEN(c, eptr, len);
3782 ph10 349 prop_category = UCD_CATEGORY(c);
3783 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3784     break;
3785     eptr+= len;
3786     }
3787     break;
3788    
3789     case PT_PC:
3790     for (i = min; i < max; i++)
3791     {
3792     int len = 1;
3793     if (eptr >= md->end_subject) break;
3794     GETCHARLEN(c, eptr, len);
3795 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3796 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3797     break;
3798     eptr+= len;
3799     }
3800     break;
3801    
3802     case PT_SC:
3803     for (i = min; i < max; i++)
3804     {
3805     int len = 1;
3806     if (eptr >= md->end_subject) break;
3807     GETCHARLEN(c, eptr, len);
3808 ph10 349 prop_script = UCD_SCRIPT(c);
3809 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3810     break;
3811     eptr+= len;
3812     }
3813     break;
3814 nigel 77 }
3815    
3816     /* eptr is now past the end of the maximum run */
3817    
3818 nigel 93 if (possessive) continue;
3819 nigel 77 for(;;)
3820     {
3821 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3822 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3823     if (eptr-- == pp) break; /* Stop if tried at original pos */
3824 ph10 207 if (utf8) BACKCHAR(eptr);
3825 nigel 77 }
3826     }
3827    
3828     /* Match extended Unicode sequences. We will get here only if the
3829     support is in the binary; otherwise a compile-time error occurs. */
3830    
3831     else if (ctype == OP_EXTUNI)
3832     {
3833     for (i = min; i < max; i++)
3834     {
3835     if (eptr >= md->end_subject) break;
3836     GETCHARINCTEST(c, eptr);
3837 ph10 349 prop_category = UCD_CATEGORY(c);
3838 nigel 77 if (prop_category == ucp_M) break;
3839     while (eptr < md->end_subject)
3840     {
3841     int len = 1;
3842     if (!utf8) c = *eptr; else
3843     {
3844     GETCHARLEN(c, eptr, len);
3845     }
3846 ph10 349 prop_category = UCD_CATEGORY(c);
3847 nigel 77 if (prop_category != ucp_M) break;
3848     eptr += len;
3849     }
3850     }
3851    
3852     /* eptr is now past the end of the maximum run */
3853    
3854 nigel 93 if (possessive) continue;
3855 nigel 77 for(;;)
3856     {
3857 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3858 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3859     if (eptr-- == pp) break; /* Stop if tried at original pos */
3860     for (;;) /* Move back over one extended */
3861     {
3862     int len = 1;
3863     if (!utf8) c = *eptr; else
3864     {
3865 ph10 207 BACKCHAR(eptr);
3866 nigel 77 GETCHARLEN(c, eptr, len);
3867     }
3868 ph10 349 prop_category = UCD_CATEGORY(c);
3869 nigel 77 if (prop_category != ucp_M) break;
3870     eptr--;
3871     }
3872     }
3873     }
3874    
3875     else
3876     #endif /* SUPPORT_UCP */
3877    
3878     #ifdef SUPPORT_UTF8
3879     /* UTF-8 mode */
3880    
3881     if (utf8)
3882     {
3883     switch(ctype)
3884     {
3885     case OP_ANY:
3886     if (max < INT_MAX)
3887     {
3888 ph10 342 for (i = min; i < max; i++)
3889 nigel 77 {
3890 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3891     eptr++;
3892     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3893 nigel 77 }
3894     }
3895    
3896     /* Handle unlimited UTF-8 repeat */
3897    
3898     else
3899     {
3900 ph10 342 for (i = min; i < max; i++)
3901 nigel 77 {
3902 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3903     eptr++;
3904     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3905 nigel 77 }
3906     }
3907     break;
3908    
3909 ph10 341 case OP_ALLANY:
3910     if (max < INT_MAX)
3911     {
3912     for (i = min; i < max; i++)
3913     {
3914     if (eptr >= md->end_subject) break;
3915     eptr++;
3916     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3917     }
3918     }
3919     else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3920     break;
3921    
3922 nigel 77 /* The byte case is the same as non-UTF8 */
3923    
3924     case OP_ANYBYTE:
3925     c = max - min;
3926 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3927     c = md->end_subject - eptr;
3928 nigel 77 eptr += c;
3929     break;
3930    
3931 nigel 93 case OP_ANYNL:
3932     for (i = min; i < max; i++)
3933     {
3934     int len = 1;
3935     if (eptr >= md->end_subject) break;
3936     GETCHARLEN(c, eptr, len);
3937     if (c == 0x000d)
3938     {
3939     if (++eptr >= md->end_subject) break;
3940     if (*eptr == 0x000a) eptr++;
3941     }
3942     else
3943     {
3944 ph10 231 if (c != 0x000a &&
3945     (md->bsr_anycrlf ||
3946     (c != 0x000b && c != 0x000c &&
3947     c != 0x0085 && c != 0x2028 && c != 0x2029)))
3948 nigel 93 break;
3949     eptr += len;
3950     }
3951     }
3952     break;
3953    
3954 ph10 178 case OP_NOT_HSPACE:
3955 ph10 182 case OP_HSPACE:
3956 ph10 178 for (i = min; i < max; i++)
3957     {
3958 ph10 182 BOOL gotspace;
3959 ph10 178 int len = 1;
3960     if (eptr >= md->end_subject) break;
3961     GETCHARLEN(c, eptr, len);
3962     switch(c)
3963 ph10 182 {
3964     default: gotspace = FALSE; break;
3965 ph10 178 case 0x09: /* HT */
3966     case 0x20: /* SPACE */
3967     case 0xa0: /* NBSP */
3968     case 0x1680: /* OGHAM SPACE MARK */
3969     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3970     case 0x2000: /* EN QUAD */
3971     case 0x2001: /* EM QUAD */
3972     case 0x2002: /* EN SPACE */
3973     case 0x2003: /* EM SPACE */
3974     case 0x2004: /* THREE-PER-EM SPACE */
3975     case 0x2005: /* FOUR-PER-EM SPACE */
3976     case 0x2006: /* SIX-PER-EM SPACE */
3977     case 0x2007: /* FIGURE SPACE */
3978     case 0x2008: /* PUNCTUATION SPACE */
3979     case 0x2009: /* THIN SPACE */
3980     case 0x200A: /* HAIR SPACE */
3981     case 0x202f: /* NARROW NO-BREAK SPACE */
3982     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3983     case 0x3000: /* IDEOGRAPHIC SPACE */
3984     gotspace = TRUE;
3985 ph10 182 break;
3986 ph10 178 }
3987     if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3988     eptr += len;
3989     }
3990     break;
3991    
3992     case OP_NOT_VSPACE:
3993 ph10 182 case OP_VSPACE:
3994 ph10 178 for (i = min; i < max; i++)
3995     {
3996 ph10 182 BOOL gotspace;
3997 ph10 178 int len = 1;
3998     if (eptr >= md->end_subject) break;
3999     GETCHARLEN(c, eptr, len);
4000     switch(c)
4001     {
4002 ph10 182 default: gotspace = FALSE; break;
4003 ph10 178 case 0x0a: /* LF */
4004     case 0x0b: /* VT */
4005     case 0x0c: /* FF */
4006     case 0x0d: /* CR */
4007     case 0x85: /* NEL */
4008     case 0x2028: /* LINE SEPARATOR */
4009     case 0x2029: /* PARAGRAPH SEPARATOR */
4010     gotspace = TRUE;
4011     break;
4012     }
4013 ph10 182 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4014 ph10 178 eptr += len;
4015     }
4016     break;
4017    
4018 nigel 77 case OP_NOT_DIGIT:
4019     for (i = min; i < max; i++)
4020     {
4021     int len = 1;
4022     if (eptr >= md->end_subject) break;
4023     GETCHARLEN(c, eptr, len);
4024     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4025     eptr+= len;
4026     }
4027     break;
4028    
4029     case OP_DIGIT:
4030     for (i = min; i < max; i++)
4031     {
4032     int len = 1;
4033     if (eptr >= md->end_subject) break;
4034     GETCHARLEN(c, eptr, len);
4035     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4036     eptr+= len;
4037     }
4038     break;
4039    
4040     case OP_NOT_WHITESPACE:
4041     for (i = min; i < max; i++)
4042     {
4043     int len = 1;
4044     if (eptr >= md->end_subject) break;
4045     GETCHARLEN(c, eptr, len);
4046     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4047     eptr+= len;
4048     }
4049     break;
4050    
4051     case OP_WHITESPACE:
4052     for (i = min; i < max; i++)
4053     {
4054     int len = 1;
4055     if (eptr >= md->end_subject) break;
4056     GETCHARLEN(c, eptr, len);
4057     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4058     eptr+= len;
4059     }
4060     break;
4061    
4062     case OP_NOT_WORDCHAR:
4063     for (i = min; i < max; i++)
4064     {
4065     int len = 1;
4066     if (eptr >= md->end_subject) break;
4067     GETCHARLEN(c, eptr, len);
4068     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4069     eptr+= len;
4070     }
4071     break;
4072    
4073     case OP_WORDCHAR:
4074     for (i = min; i < max; i++)
4075     {
4076     int len = 1;
4077     if (eptr >= md->end_subject) break;
4078     GETCHARLEN(c, eptr, len);
4079     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4080     eptr+= len;
4081     }
4082     break;
4083    
4084     default:
4085     RRETURN(PCRE_ERROR_INTERNAL);
4086     }
4087    
4088     /* eptr is now past the end of the maximum run */
4089    
4090 nigel 93 if (possessive) continue;
4091 nigel 77 for(;;)
4092     {
4093 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4094 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4095     if (eptr-- == pp) break; /* Stop if tried at original pos */
4096     BACKCHAR(eptr);
4097     }
4098     }
4099     else
4100 ph10 207 #endif /* SUPPORT_UTF8 */
4101 nigel 77
4102     /* Not UTF-8 mode */
4103     {
4104     switch(ctype)
4105     {
4106     case OP_ANY:
4107 ph10 342 for (i = min; i < max; i++)
4108 nigel 77 {
4109 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4110     eptr++;
4111 nigel 77 }
4112 ph10 342 break;
4113 nigel 77
4114 ph10 341 case OP_ALLANY:
4115 nigel 77 case OP_ANYBYTE:
4116     c = max - min;
4117 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
4118     c = md->end_subject - eptr;
4119 nigel 77 eptr += c;
4120     break;
4121    
4122 nigel 93 case OP_ANYNL:
4123     for (i = min; i < max; i++)
4124     {
4125     if (eptr >= md->end_subject) break;
4126     c = *eptr;
4127     if (c == 0x000d)
4128     {
4129     if (++eptr >= md->end_subject) break;
4130     if (*eptr == 0x000a) eptr++;
4131     }
4132     else
4133     {
4134 ph10 231 if (c != 0x000a &&
4135     (md->bsr_anycrlf ||
4136     (c != 0x000b && c != 0x000c && c != 0x0085)))