/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 402 - (hide annotations) (download)
Sat Mar 21 17:26:03 2009 UTC (5 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 153064 byte(s)
Add missing #ifdef SUPPORT_UTF8 round heapframe::Xcharptr.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 381 Copyright (c) 1997-2009 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 nigel 77 /* Flag bits for the match() function */
61    
62 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
63     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64 nigel 77
65     /* Non-error returns from the match() function. Error returns are externally
66     defined PCRE_ERROR_xxx codes, which are all negative. */
67    
68     #define MATCH_MATCH 1
69     #define MATCH_NOMATCH 0
70    
71 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
72 ph10 210 negative to avoid the external error codes. */
73    
/* How the opcode handlers in match() produce and consume these values:
  MATCH_COMMIT  returned by the OP_COMMIT case when the rest of the pattern
                gives MATCH_NOMATCH
  MATCH_PRUNE   returned by the OP_PRUNE case in the same situation
  MATCH_SKIP    returned by the OP_SKIP case in the same situation, after it
                has stored the current subject position in
                md->start_match_ptr
  MATCH_THEN    returned by the OP_THEN case in the same situation; the
                bracket alternation loops treat it like MATCH_NOMATCH and
                move on to the next alternative
Any other result from the rest of the pattern is passed back unchanged. */
74     #define MATCH_COMMIT (-999)
75     #define MATCH_PRUNE (-998)
76     #define MATCH_SKIP (-997)
77     #define MATCH_THEN (-996)
78    
79 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
80     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81     because the offset vector is always a multiple of 3 long. */
82    
83     #define REC_STACK_SAVE_MAX 30
84    
85     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86    
/* Parallel six-entry tables: entry i of rep_min belongs with entry i of
rep_max. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
89    
90    
91    
92     #ifdef DEBUG
93     /*************************************************
94     * Debugging function to print chars *
95     *************************************************/
96    
97     /* Print a sequence of chars in printable format, stopping at the end of the
98     subject if requested.
99    
100     Arguments:
101     p points to characters
102     length number to print
103     is_subject TRUE if printing from within md->start_subject
104     md pointer to matching data block, if is_subject is TRUE
105    
106     Returns: nothing
107     */
108    
109     static void
110     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111     {
112 nigel 93 unsigned int c;
113 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114     while (length-- > 0)
115     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116     }
117     #endif
118    
119    
120    
121     /*************************************************
122     * Match a back-reference *
123     *************************************************/
124    
125     /* If a back reference hasn't been set, the length that is passed is greater
126     than the number of characters left in the string, so the match fails.
127    
128     Arguments:
129     offset index into the offset vector
130     eptr points into the subject
131     length length to be matched
132     md points to match data block
133     ims the ims flags
134    
135     Returns: TRUE if matched
136     */
137    
138     static BOOL
139 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 nigel 77 unsigned long int ims)
141     {
142 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
143 nigel 77
144     #ifdef DEBUG
145     if (eptr >= md->end_subject)
146     printf("matching subject <null>");
147     else
148     {
149     printf("matching subject ");
150     pchars(eptr, length, TRUE, md);
151     }
152     printf(" against backref ");
153     pchars(p, length, FALSE, md);
154     printf("\n");
155     #endif
156    
157     /* Always fail if not enough characters left */
158    
159     if (length > md->end_subject - eptr) return FALSE;
160    
161 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162     properly if Unicode properties are supported. Otherwise, we can check only
163     ASCII characters. */
164 nigel 77
165     if ((ims & PCRE_CASELESS) != 0)
166     {
167 ph10 354 #ifdef SUPPORT_UTF8
168     #ifdef SUPPORT_UCP
169     if (md->utf8)
170     {
171 ph10 358 USPTR endptr = eptr + length;
172 ph10 354 while (eptr < endptr)
173     {
174 ph10 358 int c, d;
175 ph10 354 GETCHARINC(c, eptr);
176     GETCHARINC(d, p);
177     if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 ph10 358 }
179     }
180 ph10 354 else
181     #endif
182     #endif
183    
184     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185     is no UCP support. */
186 ph10 358
187 nigel 77 while (length-- > 0)
188 ph10 354 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 nigel 77 }
190 ph10 358
191 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
192     are in UTF-8 mode. */
193 ph10 358
194 nigel 77 else
195     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196    
197     return TRUE;
198     }
199    
200    
201    
202     /***************************************************************************
203     ****************************************************************************
204     RECURSION IN THE match() FUNCTION
205    
206 nigel 87 The match() function is highly recursive, though not every recursive call
207     increases the recursive depth. Nevertheless, some regular expressions can cause
208     it to recurse to a great depth. I was writing for Unix, so I just let it call
209     itself recursively. This uses the stack for saving everything that has to be
210     saved for a recursive call. On Unix, the stack can be large, and this works
211     fine.
212 nigel 77
213 nigel 87 It turns out that on some non-Unix-like systems there are problems with
214     programs that use a lot of stack. (This despite the fact that every last chip
215     has oodles of memory these days, and techniques for extending the stack have
216     been known for decades.) So....
217 nigel 77
218     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219     calls by keeping local variables that need to be preserved in blocks of memory
220 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
221 nigel 77 achieve this so that the actual code doesn't look very different to what it
222     always used to.
223 ph10 164
224 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
225 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
226     Switzer, the use of longjmp() has been abolished, at the cost of having to
227     provide a unique number for each call to RMATCH. There is no way of generating
228     a sequence of numbers at compile time in C. I have given them names, to make
229     them stand out more clearly.
230    
231     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
234     don't have indeterminate values; this has meant that the frame size can be
235 ph10 164 reduced because the result can be "passed back" by straight setting of the
236     variable instead of being passed in the frame.
237 nigel 77 ****************************************************************************
238     ***************************************************************************/
239    
240 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241     below must be updated in sync. */
242 nigel 77
/* One identifier per RMATCH call site, numbered consecutively from 1. */

enum {
  RM1 = 1,
  RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19,
  RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29,
  RM30, RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39,
  RM40, RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49,
  RM50, RM51, RM52, RM53, RM54
};
249 ph10 164
250 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
251 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 ph10 164 actually used in this definition. */
253 nigel 77
254     #ifndef NO_RECURSE
/* Stack-recursion build. RMATCH is a real call to match() one level deeper
(rdepth+1), with the result left in rrc; the caller's current mstart is passed
through unchanged. RRETURN is a plain return. The last RMATCH argument (the
RMn number) is accepted but not used by these definitions. The DEBUG variants
additionally trace each call and return with the source line number. */

#define REGISTER register

#ifdef DEBUG
#define RMATCH(r_eptr,r_ecode,r_top,r_md,r_ims,r_eptrb,r_flags,r_where) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(r_eptr,r_ecode,mstart,r_top,r_md,r_ims,r_eptrb,r_flags,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(r_value) \
  { \
  printf("match() returned %d from line %d ", r_value, __LINE__); \
  return r_value; \
  }
#else
#define RMATCH(r_eptr,r_ecode,r_top,r_md,r_ims,r_eptrb,r_flags,r_where) \
  rrc = match(r_eptr,r_ecode,mstart,r_top,r_md,r_ims,r_eptrb,r_flags,rdepth+1)
#define RRETURN(r_value) return r_value
#endif
274    
275 nigel 77 #else
276    
277    
278 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
279     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280     argument of match(), which never changes. */
281 nigel 77
282     #define REGISTER
283    
/* Heap-based "recursive call" to match().

Arguments (same order as the stack-recursion RMATCH):
  ra   new eptr          rb   new ecode        rc   new offset_top
  rd   md (not used here; it never changes)
  re   new ims           rf   new eptrb        rg   new flags
  rw   the RMn number of this call site

A new frame is obtained from pcre_stack_malloc, filled in with the new
argument values, linked to the current frame, and made current; control then
jumps to HEAP_RECURSE. The current frame remembers rw in Xwhere, and the label
L_<rw> generated here is the point to which control comes back afterwards.

If the allocation fails, the current invocation gives up at once with
PCRE_ERROR_NOMEMORY via RRETURN instead of writing through a NULL pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
303    
/* Heap-based "return" from match(). The current frame is unlinked and handed
back to pcre_stack_free. If that was the top-level frame (no previous frame),
the function really returns the value; otherwise the value is left in rrc and
control jumps to HEAP_RETURN. Note that the value expression is evaluated
only after the frame has been released. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
316    
317    
318     /* Structure for remembering the local variables in a private frame */
319    
320     typedef struct heapframe {
321     struct heapframe *Xprevframe;
322    
323     /* Function arguments that may change */
324    
325     const uschar *Xeptr;
326     const uschar *Xecode;
327 ph10 172 const uschar *Xmstart;
328 nigel 77 int Xoffset_top;
329     long int Xims;
330     eptrblock *Xeptrb;
331     int Xflags;
332 nigel 91 unsigned int Xrdepth;
333 nigel 77
334     /* Function local variables */
335    
336     const uschar *Xcallpat;
337 ph10 402 #ifdef SUPPORT_UTF8
338 nigel 77 const uschar *Xcharptr;
339 ph10 402 #endif
340 nigel 77 const uschar *Xdata;
341     const uschar *Xnext;
342     const uschar *Xpp;
343     const uschar *Xprev;
344     const uschar *Xsaved_eptr;
345    
346     recursion_info Xnew_recursive;
347    
348     BOOL Xcur_is_word;
349     BOOL Xcondition;
350     BOOL Xprev_is_word;
351    
352     unsigned long int Xoriginal_ims;
353    
354     #ifdef SUPPORT_UCP
355     int Xprop_type;
356 nigel 87 int Xprop_value;
357 nigel 77 int Xprop_fail_result;
358     int Xprop_category;
359     int Xprop_chartype;
360 nigel 87 int Xprop_script;
361 ph10 123 int Xoclength;
362     uschar Xocchars[8];
363 nigel 77 #endif
364    
365     int Xctype;
366 nigel 93 unsigned int Xfc;
367 nigel 77 int Xfi;
368     int Xlength;
369     int Xmax;
370     int Xmin;
371     int Xnumber;
372     int Xoffset;
373     int Xop;
374     int Xsave_capture_last;
375     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
376     int Xstacksave[REC_STACK_SAVE_MAX];
377    
378     eptrblock Xnewptrb;
379    
380 ph10 164 /* Where to jump back to */
381 nigel 77
382 ph10 164 int Xwhere;
383 ph10 165
384 nigel 77 } heapframe;
385    
386     #endif
387    
388    
389     /***************************************************************************
390     ***************************************************************************/
391    
392    
393    
394     /*************************************************
395     * Match from current position *
396     *************************************************/
397    
398 nigel 93 /* This function is called recursively in many circumstances. Whenever it
399 nigel 77 returns a negative (error) response, the outer incarnation must also return the
400     same response.
401    
402     Performance note: It might be tempting to extract commonly used fields from the
403     md structure (e.g. utf8, end_subject) into individual variables to improve
404     performance. Tests using gcc on a SPARC disproved this; in the first case, it
405     made performance worse.
406    
407     Arguments:
408 nigel 93 eptr pointer to current character in subject
409     ecode pointer to current position in compiled code
410 ph10 168 mstart pointer to the current match start position (can be modified
411 ph10 172 by encountering \K)
412 nigel 77 offset_top current top pointer
413     md pointer to "static" info for the match
414     ims current /i, /m, and /s options
415     eptrb pointer to chain of blocks containing eptr at start of
416     brackets - for testing for empty matches
417     flags can contain
418     match_condassert - this is an assertion condition
419 nigel 93 match_cbegroup - this is the start of an unlimited repeat
420     group that can match an empty string
421 nigel 87 rdepth the recursion depth
422 nigel 77
423     Returns: MATCH_MATCH if matched ) these values are >= 0
424     MATCH_NOMATCH if failed to match )
425     a negative PCRE_ERROR_xxx value if aborted by an error condition
426 nigel 87 (e.g. stopped by repeated call or recursion limit)
427 nigel 77 */
428    
429     static int
430 ph10 172 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
431 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
432 nigel 91 int flags, unsigned int rdepth)
433 nigel 77 {
434     /* These variables do not need to be preserved over recursion in this function,
435 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
436     "register" because they are used a lot in loops. */
437 nigel 77
438 nigel 91 register int rrc; /* Returns from recursive calls */
439     register int i; /* Used for loops not involving calls to RMATCH() */
440 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
441 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
442 nigel 77
443 nigel 93 BOOL minimize, possessive; /* Quantifier options */
444    
445 nigel 77 /* When recursion is not being used, all "local" variables that have to be
446     preserved over calls to RMATCH() are part of a "frame" which is obtained from
447     heap storage. Set up the top-level frame here; others are obtained from the
448     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
449    
450     #ifdef NO_RECURSE
451     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
452     frame->Xprevframe = NULL; /* Marks the top level */
453    
454     /* Copy in the original argument variables */
455    
456     frame->Xeptr = eptr;
457     frame->Xecode = ecode;
458 ph10 168 frame->Xmstart = mstart;
459 nigel 77 frame->Xoffset_top = offset_top;
460     frame->Xims = ims;
461     frame->Xeptrb = eptrb;
462     frame->Xflags = flags;
463 nigel 87 frame->Xrdepth = rdepth;
464 nigel 77
465     /* This is where control jumps back to to effect "recursion" */
466    
467     HEAP_RECURSE:
468    
469     /* Macros make the argument variables come from the current frame */
470    
471     #define eptr frame->Xeptr
472     #define ecode frame->Xecode
473 ph10 168 #define mstart frame->Xmstart
474 nigel 77 #define offset_top frame->Xoffset_top
475     #define ims frame->Xims
476     #define eptrb frame->Xeptrb
477     #define flags frame->Xflags
478 nigel 87 #define rdepth frame->Xrdepth
479 nigel 77
480     /* Ditto for the local variables */
481    
482     #ifdef SUPPORT_UTF8
483     #define charptr frame->Xcharptr
484     #endif
485     #define callpat frame->Xcallpat
486     #define data frame->Xdata
487     #define next frame->Xnext
488     #define pp frame->Xpp
489     #define prev frame->Xprev
490     #define saved_eptr frame->Xsaved_eptr
491    
492     #define new_recursive frame->Xnew_recursive
493    
494     #define cur_is_word frame->Xcur_is_word
495     #define condition frame->Xcondition
496     #define prev_is_word frame->Xprev_is_word
497    
498     #define original_ims frame->Xoriginal_ims
499    
500     #ifdef SUPPORT_UCP
501     #define prop_type frame->Xprop_type
502 nigel 87 #define prop_value frame->Xprop_value
503 nigel 77 #define prop_fail_result frame->Xprop_fail_result
504     #define prop_category frame->Xprop_category
505     #define prop_chartype frame->Xprop_chartype
506 nigel 87 #define prop_script frame->Xprop_script
507 ph10 115 #define oclength frame->Xoclength
508     #define occhars frame->Xocchars
509 nigel 77 #endif
510    
511     #define ctype frame->Xctype
512     #define fc frame->Xfc
513     #define fi frame->Xfi
514     #define length frame->Xlength
515     #define max frame->Xmax
516     #define min frame->Xmin
517     #define number frame->Xnumber
518     #define offset frame->Xoffset
519     #define op frame->Xop
520     #define save_capture_last frame->Xsave_capture_last
521     #define save_offset1 frame->Xsave_offset1
522     #define save_offset2 frame->Xsave_offset2
523     #define save_offset3 frame->Xsave_offset3
524     #define stacksave frame->Xstacksave
525    
526     #define newptrb frame->Xnewptrb
527    
528     /* When recursion is being used, local variables are allocated on the stack and
529     get preserved during recursion in the normal way. In this environment, fi and
530     i, and fc and c, can be the same variables. */
531    
532 nigel 93 #else /* NO_RECURSE not defined */
533 nigel 77 #define fi i
534     #define fc c
535    
536    
537 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
538     const uschar *charptr; /* in small blocks of the code. My normal */
539     #endif /* style of coding would have declared */
540     const uschar *callpat; /* them within each of those blocks. */
541     const uschar *data; /* However, in order to accommodate the */
542     const uschar *next; /* version of this code that uses an */
543     USPTR pp; /* external "stack" implemented on the */
544     const uschar *prev; /* heap, it is easier to declare them all */
545     USPTR saved_eptr; /* here, so the declarations can be cut */
546     /* out in a block. The only declarations */
547     recursion_info new_recursive; /* within blocks below are for variables */
548     /* that do not have to be preserved over */
549     BOOL cur_is_word; /* a recursive call to RMATCH(). */
550     BOOL condition;
551 nigel 77 BOOL prev_is_word;
552    
553     unsigned long int original_ims;
554    
555     #ifdef SUPPORT_UCP
556     int prop_type;
557 nigel 87 int prop_value;
558 nigel 77 int prop_fail_result;
559     int prop_category;
560     int prop_chartype;
561 nigel 87 int prop_script;
562 ph10 115 int oclength;
563     uschar occhars[8];
564 nigel 77 #endif
565    
566 ph10 399 int codelink;
567     int condcode;
568 nigel 77 int ctype;
569     int length;
570     int max;
571     int min;
572     int number;
573     int offset;
574     int op;
575     int save_capture_last;
576     int save_offset1, save_offset2, save_offset3;
577     int stacksave[REC_STACK_SAVE_MAX];
578    
579     eptrblock newptrb;
580 nigel 93 #endif /* NO_RECURSE */
581 nigel 77
582     /* These statements are here to stop the compiler complaining about uninitialized
583     variables. */
584    
585     #ifdef SUPPORT_UCP
586 nigel 87 prop_value = 0;
587 nigel 77 prop_fail_result = 0;
588     #endif
589    
590 nigel 93
591 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
592     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
593     used. Thanks to Ian Taylor for noticing this possibility and sending the
594     original patch. */
595    
596     TAIL_RECURSE:
597    
598 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
599     are specified by the macro RMATCH and RRETURN is used to return. When
600     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
601     and a "return", respectively (possibly with some debugging if DEBUG is
602     defined). However, RMATCH isn't like a function call because it's quite a
603     complicated macro. It has to be used in one particular way. This shouldn't,
604     however, impact performance when true recursion is being used. */
605 nigel 77
606 ph10 164 #ifdef SUPPORT_UTF8
607     utf8 = md->utf8; /* Local copy of the flag */
608     #else
609     utf8 = FALSE;
610     #endif
611    
612 nigel 87 /* First check that we haven't called match() too many times, or that we
613     haven't exceeded the recursive call limit. */
614    
615 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
616 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
617 nigel 77
618     original_ims = ims; /* Save for resetting on ')' */
619 nigel 91
620 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
621     string, the match_cbegroup flag is set. When this is the case, add the current
622     subject pointer to the chain of such remembered pointers, to be checked when we
623     hit the closing ket, in order to break infinite loops that match no characters.
624 ph10 197 When match() is called in other circumstances, don't add to the chain. The
625     match_cbegroup flag must NOT be used with tail recursion, because the memory
626     block that is used is on the stack, so a new one may be required for each
627     match(). */
628 nigel 77
629 nigel 93 if ((flags & match_cbegroup) != 0)
630 nigel 77 {
631 ph10 197 newptrb.epb_saved_eptr = eptr;
632     newptrb.epb_prev = eptrb;
633     eptrb = &newptrb;
634 nigel 77 }
635    
636 nigel 93 /* Now start processing the opcodes. */
637 nigel 77
638     for (;;)
639     {
640 nigel 93 minimize = possessive = FALSE;
641 nigel 77 op = *ecode;
642 ph10 395
643 nigel 77 /* For partial matching, remember if we ever hit the end of the subject after
644     matching at least one subject character. */
645    
646     if (md->partial &&
647     eptr >= md->end_subject &&
648 ph10 168 eptr > mstart)
649 nigel 77 md->hitend = TRUE;
650 ph10 208
651 nigel 93 switch(op)
652     {
653 ph10 210 case OP_FAIL:
654 ph10 212 RRETURN(MATCH_NOMATCH);
655 ph10 211
656 ph10 210 case OP_PRUNE:
657     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
658     ims, eptrb, flags, RM51);
659     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
660 ph10 212 RRETURN(MATCH_PRUNE);
661 ph10 211
662 ph10 210 case OP_COMMIT:
663     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
664     ims, eptrb, flags, RM52);
665     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
666 ph10 212 RRETURN(MATCH_COMMIT);
667 ph10 211
668 ph10 210 case OP_SKIP:
669     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
670     ims, eptrb, flags, RM53);
671     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
672 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
673 ph10 212 RRETURN(MATCH_SKIP);
674 ph10 211
675 ph10 210 case OP_THEN:
676     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
677 ph10 212 ims, eptrb, flags, RM54);
678 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
679 ph10 212 RRETURN(MATCH_THEN);
680 ph10 211
681 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
682     the current subject position in the working slot at the top of the vector.
683     We mustn't change the current values of the data slot, because they may be
684     set from a previous iteration of this group, and be referred to by a
685     reference inside the group.
686 nigel 77
687 nigel 93 If the bracket fails to match, we need to restore this value and also the
688     values of the final offsets, in case they were set by a previous iteration
689     of the same bracket.
690 nigel 77
691 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
692     a non-capturing bracket. Don't worry about setting the flag for the error
693     case here; that is handled in the code for KET. */
694 nigel 77
695 nigel 93 case OP_CBRA:
696     case OP_SCBRA:
697     number = GET2(ecode, 1+LINK_SIZE);
698 nigel 77 offset = number << 1;
699    
700     #ifdef DEBUG
701 nigel 93 printf("start bracket %d\n", number);
702     printf("subject=");
703 nigel 77 pchars(eptr, 16, TRUE, md);
704     printf("\n");
705     #endif
706    
707     if (offset < md->offset_max)
708     {
709     save_offset1 = md->offset_vector[offset];
710     save_offset2 = md->offset_vector[offset+1];
711     save_offset3 = md->offset_vector[md->offset_end - number];
712     save_capture_last = md->capture_last;
713    
714     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
715     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
716    
717 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
718 nigel 77 do
719     {
720 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
721     ims, eptrb, flags, RM1);
722 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
723 nigel 77 md->capture_last = save_capture_last;
724     ecode += GET(ecode, 1);
725     }
726     while (*ecode == OP_ALT);
727    
728     DPRINTF(("bracket %d failed\n", number));
729    
730     md->offset_vector[offset] = save_offset1;
731     md->offset_vector[offset+1] = save_offset2;
732     md->offset_vector[md->offset_end - number] = save_offset3;
733    
734     RRETURN(MATCH_NOMATCH);
735     }
736    
737 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
738     as a non-capturing bracket. */
739 nigel 77
740 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
741     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
742    
743 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
744 nigel 77
745 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
746     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
747    
748 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
749     final alternative within the brackets, we would return the result of a
750     recursive call to match() whatever happened. We can reduce stack usage by
751 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
752     is set.*/
753 nigel 77
754 nigel 93 case OP_BRA:
755     case OP_SBRA:
756     DPRINTF(("start non-capturing bracket\n"));
757     flags = (op >= OP_SBRA)? match_cbegroup : 0;
758 nigel 91 for (;;)
759 nigel 77 {
760 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
761 nigel 93 {
762 ph10 197 if (flags == 0) /* Not a possibly empty group */
763     {
764     ecode += _pcre_OP_lengths[*ecode];
765     DPRINTF(("bracket 0 tail recursion\n"));
766     goto TAIL_RECURSE;
767     }
768    
769     /* Possibly empty group; can't use tail recursion. */
770    
771     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
772     eptrb, flags, RM48);
773     RRETURN(rrc);
774 nigel 93 }
775 nigel 91
776     /* For non-final alternatives, continue the loop for a NOMATCH result;
777     otherwise return. */
778    
779 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
780     eptrb, flags, RM2);
781 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
782 nigel 77 ecode += GET(ecode, 1);
783     }
784 nigel 91 /* Control never reaches here. */
785 nigel 77
786     /* Conditional group: compilation checked that there are no more than
787     two branches. If the condition is false, skipping the first branch takes us
788     past the end if there is only one branch, but that's OK because that is
789 nigel 91 exactly what going to the ket would do. As there is only one branch to be
790     obeyed, we can use tail recursion to avoid using another stack frame. */
791 nigel 77
792     case OP_COND:
793 nigel 93 case OP_SCOND:
794 ph10 399 codelink= GET(ecode, 1);
795    
796 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
797     inserted between OP_COND and an assertion condition. */
798 ph10 392
799 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
800     {
801     if (pcre_callout != NULL)
802     {
803     pcre_callout_block cb;
804     cb.version = 1; /* Version 1 of the callout block */
805     cb.callout_number = ecode[LINK_SIZE+2];
806     cb.offset_vector = md->offset_vector;
807     cb.subject = (PCRE_SPTR)md->start_subject;
808     cb.subject_length = md->end_subject - md->start_subject;
809     cb.start_match = mstart - md->start_subject;
810     cb.current_position = eptr - md->start_subject;
811     cb.pattern_position = GET(ecode, LINK_SIZE + 3);
812     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
813     cb.capture_top = offset_top/2;
814     cb.capture_last = md->capture_last;
815     cb.callout_data = md->callout_data;
816     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
817     if (rrc < 0) RRETURN(rrc);
818     }
819     ecode += _pcre_OP_lengths[OP_CALLOUT];
820     }
821 ph10 392
822 ph10 399 condcode = ecode[LINK_SIZE+1];
823    
824 ph10 381 /* Now see what the actual condition is */
825 ph10 392
826 ph10 399 if (condcode == OP_RREF) /* Recursion test */
827 nigel 77 {
828 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
829     condition = md->recursive != NULL &&
830     (offset == RREF_ANY || offset == md->recursive->group_num);
831     ecode += condition? 3 : GET(ecode, 1);
832     }
833    
834 ph10 399 else if (condcode == OP_CREF) /* Group used test */
835 nigel 93 {
836 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
837 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
838     ecode += condition? 3 : GET(ecode, 1);
839 nigel 77 }
840    
841 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
842 nigel 93 {
843     condition = FALSE;
844     ecode += GET(ecode, 1);
845     }
846    
847 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
848 nigel 93 the final argument match_condassert causes it to stop at the end of an
849     assertion. */
850 nigel 77
851     else
852     {
853 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
854     match_condassert, RM3);
855 nigel 77 if (rrc == MATCH_MATCH)
856     {
857 nigel 93 condition = TRUE;
858     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
859 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
860     }
861 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
862 nigel 77 {
863     RRETURN(rrc); /* Need braces because of following else */
864     }
865 nigel 93 else
866     {
867     condition = FALSE;
868 ph10 399 ecode += codelink;
869 nigel 93 }
870     }
871 nigel 91
872 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
873 ph10 197 we can use tail recursion to avoid using another stack frame, except when
874     match_cbegroup is required for an unlimited repeat of a possibly empty
875     group. If the second alternative doesn't exist, we can just plough on. */
876 nigel 91
877 nigel 93 if (condition || *ecode == OP_ALT)
878     {
879 nigel 91 ecode += 1 + LINK_SIZE;
880 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
881     {
882     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
883     RRETURN(rrc);
884     }
885     else /* Group must match something */
886     {
887     flags = 0;
888     goto TAIL_RECURSE;
889     }
890 nigel 77 }
891 ph10 395 else /* Condition false & no alternative */
892 nigel 93 {
893     ecode += 1 + LINK_SIZE;
894     }
895     break;
896 nigel 77
897    
898 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
899     recursion, we should restore the offsets appropriately and continue from
900     after the call. */
901 nigel 77
902 ph10 210 case OP_ACCEPT:
903 nigel 77 case OP_END:
904     if (md->recursive != NULL && md->recursive->group_num == 0)
905     {
906     recursion_info *rec = md->recursive;
907 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
908 nigel 77 md->recursive = rec->prevrec;
909     memmove(md->offset_vector, rec->offset_save,
910     rec->saved_max * sizeof(int));
911 ph10 168 mstart = rec->save_start;
912 nigel 77 ims = original_ims;
913     ecode = rec->after_call;
914     break;
915     }
916    
917     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
918     string - backtracking will then try other alternatives, if any. */
919    
920 ph10 168 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
921     md->end_match_ptr = eptr; /* Record where we ended */
922     md->end_offset_top = offset_top; /* and how many extracts were taken */
923 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
924 nigel 77 RRETURN(MATCH_MATCH);
925    
926     /* Change option settings */
927    
928     case OP_OPT:
929     ims = ecode[1];
930     ecode += 2;
931     DPRINTF(("ims set to %02lx\n", ims));
932     break;
933    
934     /* Assertion brackets. Check the alternative branches in turn - the
935     matching won't pass the KET for an assertion. If any one branch matches,
936     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
937     start of each branch to move the current point backwards, so the code at
938     this level is identical to the lookahead case. */
939    
940     case OP_ASSERT:
941     case OP_ASSERTBACK:
942     do
943     {
944 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
945     RM4);
946 nigel 77 if (rrc == MATCH_MATCH) break;
947 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
948 nigel 77 ecode += GET(ecode, 1);
949     }
950     while (*ecode == OP_ALT);
951     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
952    
953     /* If checking an assertion for a condition, return MATCH_MATCH. */
954    
955     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
956    
957     /* Continue from after the assertion, updating the offsets high water
958     mark, since extracts may have been taken during the assertion. */
959    
960     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
961     ecode += 1 + LINK_SIZE;
962     offset_top = md->end_offset_top;
963     continue;
964    
965     /* Negative assertion: all branches must fail to match */
966    
967     case OP_ASSERT_NOT:
968     case OP_ASSERTBACK_NOT:
969     do
970     {
971 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
972     RM5);
973 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
974 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
975 nigel 77 ecode += GET(ecode,1);
976     }
977     while (*ecode == OP_ALT);
978    
979     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
980    
981     ecode += 1 + LINK_SIZE;
982     continue;
983    
984     /* Move the subject pointer back. This occurs only at the start of
985     each branch of a lookbehind assertion. If we are too close to the start to
986     move back, this match function fails. When working with UTF-8 we move
987     back a number of characters, not bytes. */
988    
989     case OP_REVERSE:
990     #ifdef SUPPORT_UTF8
991     if (utf8)
992     {
993 nigel 93 i = GET(ecode, 1);
994     while (i-- > 0)
995 nigel 77 {
996     eptr--;
997     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
998 ph10 207 BACKCHAR(eptr);
999 nigel 77 }
1000     }
1001     else
1002     #endif
1003    
1004     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1005    
1006     {
1007 nigel 93 eptr -= GET(ecode, 1);
1008 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1009     }
1010    
1011     /* Skip to next op code */
1012    
1013     ecode += 1 + LINK_SIZE;
1014     break;
1015    
1016     /* The callout item calls an external function, if one is provided, passing
1017     details of the match so far. This is mainly for debugging, though the
1018     function is able to force a failure. */
1019    
1020     case OP_CALLOUT:
1021     if (pcre_callout != NULL)
1022     {
1023     pcre_callout_block cb;
1024     cb.version = 1; /* Version 1 of the callout block */
1025     cb.callout_number = ecode[1];
1026     cb.offset_vector = md->offset_vector;
1027 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1028 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
1029 ph10 168 cb.start_match = mstart - md->start_subject;
1030 nigel 77 cb.current_position = eptr - md->start_subject;
1031     cb.pattern_position = GET(ecode, 2);
1032     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1033     cb.capture_top = offset_top/2;
1034     cb.capture_last = md->capture_last;
1035     cb.callout_data = md->callout_data;
1036     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1037     if (rrc < 0) RRETURN(rrc);
1038     }
1039     ecode += 2 + 2*LINK_SIZE;
1040     break;
1041    
1042     /* Recursion either matches the current regex, or some subexpression. The
1043     offset data is the offset to the starting bracket from the start of the
1044     whole pattern. (This is so that it works from duplicated subpatterns.)
1045    
1046     If there are any capturing brackets started but not finished, we have to
1047     save their starting points and reinstate them after the recursion. However,
1048     we don't know how many such there are (offset_top records the completed
1049     total) so we just have to save all the potential data. There may be up to
1050     65535 such values, which is too large to put on the stack, but using malloc
1051     for small numbers seems expensive. As a compromise, the stack is used when
1052     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1053     is used. If that malloc fails, the recursion is not attempted at all: the
1054     code below gives up immediately by returning PCRE_ERROR_NOMEMORY via
1055     RRETURN, so no partial copy of the offsets is ever used.
1056    
1057     There are also other values that have to be saved. We use a chained
1058     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1059     for the original version of this logic. */
1060    
1061     case OP_RECURSE:
1062     {
1063     callpat = md->start_code + GET(ecode, 1);
1064 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1065     GET2(callpat, 1 + LINK_SIZE);
1066 nigel 77
1067     /* Add to "recursing stack" */
1068    
1069     new_recursive.prevrec = md->recursive;
1070     md->recursive = &new_recursive;
1071    
1072     /* Find where to continue from afterwards */
1073    
1074     ecode += 1 + LINK_SIZE;
1075     new_recursive.after_call = ecode;
1076    
1077     /* Now save the offset data. */
1078    
1079     new_recursive.saved_max = md->offset_end;
1080     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1081     new_recursive.offset_save = stacksave;
1082     else
1083     {
1084     new_recursive.offset_save =
1085     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1086     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1087     }
1088    
1089     memcpy(new_recursive.offset_save, md->offset_vector,
1090     new_recursive.saved_max * sizeof(int));
1091 ph10 168 new_recursive.save_start = mstart;
1092     mstart = eptr;
1093 nigel 77
1094     /* OK, now we can do the recursion. For each top-level alternative we
1095     restore the offset and recursion data. */
1096    
1097     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1098 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1099 nigel 77 do
1100     {
1101 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1102     md, ims, eptrb, flags, RM6);
1103 nigel 77 if (rrc == MATCH_MATCH)
1104     {
1105 nigel 87 DPRINTF(("Recursion matched\n"));
1106 nigel 77 md->recursive = new_recursive.prevrec;
1107     if (new_recursive.offset_save != stacksave)
1108     (pcre_free)(new_recursive.offset_save);
1109     RRETURN(MATCH_MATCH);
1110     }
1111 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1112 nigel 87 {
1113     DPRINTF(("Recursion gave error %d\n", rrc));
1114 ph10 400 if (new_recursive.offset_save != stacksave)
1115     (pcre_free)(new_recursive.offset_save);
1116 nigel 87 RRETURN(rrc);
1117     }
1118 nigel 77
1119     md->recursive = &new_recursive;
1120     memcpy(md->offset_vector, new_recursive.offset_save,
1121     new_recursive.saved_max * sizeof(int));
1122     callpat += GET(callpat, 1);
1123     }
1124     while (*callpat == OP_ALT);
1125    
1126     DPRINTF(("Recursion didn't match\n"));
1127     md->recursive = new_recursive.prevrec;
1128     if (new_recursive.offset_save != stacksave)
1129     (pcre_free)(new_recursive.offset_save);
1130     RRETURN(MATCH_NOMATCH);
1131     }
1132     /* Control never reaches here */
1133    
1134     /* "Once" brackets are like assertion brackets except that after a match,
1135     the point in the subject string is not moved back. Thus there can never be
1136     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1137     Check the alternative branches in turn - the matching won't pass the KET
1138     for this kind of subpattern. If any one branch matches, we carry on as at
1139     the end of a normal bracket, leaving the subject pointer. */
1140    
1141     case OP_ONCE:
1142 nigel 91 prev = ecode;
1143     saved_eptr = eptr;
1144    
1145     do
1146 nigel 77 {
1147 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1148 nigel 91 if (rrc == MATCH_MATCH) break;
1149 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1150 nigel 91 ecode += GET(ecode,1);
1151     }
1152     while (*ecode == OP_ALT);
1153 nigel 77
1154 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1155 nigel 77
1156 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1157 nigel 77
1158 nigel 91 /* Continue from after the atomic group (as for an assertion), updating the
1159     offsets high water mark, since extracts may have been taken. */
1160 nigel 77
1161 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1162 nigel 77
1163 nigel 91 offset_top = md->end_offset_top;
1164     eptr = md->end_match_ptr;
1165 nigel 77
1166 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1167     happens for a repeating ket if no characters were matched in the group.
1168     This is the forcible breaking of infinite loops as implemented in Perl
1169     5.005. If there is an options reset, it will get obeyed in the normal
1170     course of events. */
1171 nigel 77
1172 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1173     {
1174     ecode += 1+LINK_SIZE;
1175     break;
1176     }
1177 nigel 77
1178 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1179     preceding bracket, in the appropriate order. The second "call" of match()
1180     uses tail recursion, to avoid using another stack frame. We need to reset
1181     any options that changed within the bracket before re-running it, so
1182     check the next opcode. */
1183 nigel 77
1184 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1185     {
1186     ims = (ims & ~PCRE_IMS) | ecode[4];
1187     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1188     }
1189 nigel 77
1190 nigel 91 if (*ecode == OP_KETRMIN)
1191     {
1192 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1193 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1194     ecode = prev;
1195 ph10 197 flags = 0;
1196 nigel 91 goto TAIL_RECURSE;
1197 nigel 77 }
1198 nigel 91 else /* OP_KETRMAX */
1199     {
1200 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1201 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1202     ecode += 1 + LINK_SIZE;
1203 ph10 197 flags = 0;
1204 nigel 91 goto TAIL_RECURSE;
1205     }
1206     /* Control never gets here */
1207 nigel 77
1208     /* An alternation is the end of a branch; scan along to find the end of the
1209     bracketed group and go to there. */
1210    
1211     case OP_ALT:
1212     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1213     break;
1214    
1215 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1216     indicating that it may occur zero times. It may repeat infinitely, or not
1217     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1218     with fixed upper repeat limits are compiled as a number of copies, with the
1219     optional ones preceded by BRAZERO or BRAMINZERO. */
1220 nigel 77
1221     case OP_BRAZERO:
1222     {
1223     next = ecode+1;
1224 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1225 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1226     do next += GET(next,1); while (*next == OP_ALT);
1227 nigel 93 ecode = next + 1 + LINK_SIZE;
1228 nigel 77 }
1229     break;
1230    
1231     case OP_BRAMINZERO:
1232     {
1233     next = ecode+1;
1234 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1235 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1236 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1237     ecode++;
1238     }
1239     break;
1240    
1241 ph10 335 case OP_SKIPZERO:
1242     {
1243     next = ecode+1;
1244     do next += GET(next,1); while (*next == OP_ALT);
1245     ecode = next + 1 + LINK_SIZE;
1246     }
1247     break;
1248    
1249 nigel 93 /* End of a group, repeated or non-repeating. */
1250 nigel 77
1251     case OP_KET:
1252     case OP_KETRMIN:
1253     case OP_KETRMAX:
1254 nigel 91 prev = ecode - GET(ecode, 1);
1255 nigel 77
1256 nigel 93 /* If this was a group that remembered the subject start, in order to break
1257     infinite repeats of empty string matches, retrieve the subject start from
1258     the chain. Otherwise, set it NULL. */
1259 nigel 77
1260 nigel 93 if (*prev >= OP_SBRA)
1261     {
1262     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1263     eptrb = eptrb->epb_prev; /* Backup to previous group */
1264     }
1265     else saved_eptr = NULL;
1266 nigel 77
1267 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1268     MATCH_MATCH, but record the current high water mark for use by positive
1269     assertions. Do this also for the "once" (atomic) groups. */
1270    
1271 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1272     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1273     *prev == OP_ONCE)
1274     {
1275     md->end_match_ptr = eptr; /* For ONCE */
1276     md->end_offset_top = offset_top;
1277     RRETURN(MATCH_MATCH);
1278     }
1279 nigel 77
1280 nigel 93 /* For capturing groups we have to check the group number back at the start
1281     and if necessary complete handling an extraction by setting the offsets and
1282     bumping the high water mark. Note that whole-pattern recursion is coded as
1283     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1284     when the OP_END is reached. Other recursion is handled here. */
1285 nigel 77
1286 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1287 nigel 91 {
1288 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1289 nigel 91 offset = number << 1;
1290 nigel 77
1291     #ifdef DEBUG
1292 nigel 91 printf("end bracket %d", number);
1293     printf("\n");
1294 nigel 77 #endif
1295    
1296 nigel 93 md->capture_last = number;
1297     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1298 nigel 91 {
1299 nigel 93 md->offset_vector[offset] =
1300     md->offset_vector[md->offset_end - number];
1301     md->offset_vector[offset+1] = eptr - md->start_subject;
1302     if (offset_top <= offset) offset_top = offset + 2;
1303     }
1304 nigel 77
1305 nigel 93 /* Handle a recursively called group. Restore the offsets
1306     appropriately and continue from after the call. */
1307 nigel 77
1308 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1309     {
1310     recursion_info *rec = md->recursive;
1311     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1312     md->recursive = rec->prevrec;
1313 ph10 168 mstart = rec->save_start;
1314 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1315     rec->saved_max * sizeof(int));
1316     ecode = rec->after_call;
1317     ims = original_ims;
1318     break;
1319 nigel 77 }
1320 nigel 91 }
1321 nigel 77
1322 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1323     flags, in case they got changed during the group. */
1324 nigel 77
1325 nigel 91 ims = original_ims;
1326     DPRINTF(("ims reset to %02lx\n", ims));
1327 nigel 77
1328 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1329     happens for a repeating ket if no characters were matched in the group.
1330     This is the forcible breaking of infinite loops as implemented in Perl
1331     5.005. If there is an options reset, it will get obeyed in the normal
1332     course of events. */
1333 nigel 77
1334 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1335     {
1336     ecode += 1 + LINK_SIZE;
1337     break;
1338     }
1339 nigel 77
1340 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1341     preceding bracket, in the appropriate order. In the second case, we can use
1342 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1343     unlimited repeat of a group that can match an empty string. */
1344 nigel 77
1345 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1346    
1347 nigel 91 if (*ecode == OP_KETRMIN)
1348     {
1349 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1350 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1351 ph10 197 if (flags != 0) /* Could match an empty string */
1352     {
1353     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1354     RRETURN(rrc);
1355     }
1356 nigel 91 ecode = prev;
1357     goto TAIL_RECURSE;
1358 nigel 77 }
1359 nigel 91 else /* OP_KETRMAX */
1360     {
1361 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1362 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1363     ecode += 1 + LINK_SIZE;
1364 ph10 197 flags = 0;
1365 nigel 91 goto TAIL_RECURSE;
1366     }
1367     /* Control never gets here */
1368 nigel 77
1369     /* Start of subject unless notbol, or after internal newline if multiline */
1370    
1371     case OP_CIRC:
1372     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1373     if ((ims & PCRE_MULTILINE) != 0)
1374     {
1375 nigel 91 if (eptr != md->start_subject &&
1376 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1377 nigel 77 RRETURN(MATCH_NOMATCH);
1378     ecode++;
1379     break;
1380     }
1381     /* ... else fall through */
1382    
1383     /* Start of subject assertion */
1384    
1385     case OP_SOD:
1386     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1387     ecode++;
1388     break;
1389    
1390     /* Start of match assertion */
1391    
1392     case OP_SOM:
1393     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1394     ecode++;
1395     break;
1396 ph10 172
1397 ph10 168 /* Reset the start of match point */
1398 ph10 172
1399 ph10 168 case OP_SET_SOM:
1400     mstart = eptr;
1401 ph10 172 ecode++;
1402     break;
1403 nigel 77
1404     /* Assert before internal newline if multiline, or before a terminating
1405     newline unless endonly is set, else end of subject unless noteol is set. */
1406    
1407     case OP_DOLL:
1408     if ((ims & PCRE_MULTILINE) != 0)
1409     {
1410     if (eptr < md->end_subject)
1411 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1412 nigel 77 else
1413     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1414     ecode++;
1415     break;
1416     }
1417     else
1418     {
1419     if (md->noteol) RRETURN(MATCH_NOMATCH);
1420     if (!md->endonly)
1421     {
1422 nigel 91 if (eptr != md->end_subject &&
1423 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1424 nigel 77 RRETURN(MATCH_NOMATCH);
1425     ecode++;
1426     break;
1427     }
1428     }
1429 nigel 91 /* ... else fall through for endonly */
1430 nigel 77
1431     /* End of subject assertion (\z) */
1432    
1433     case OP_EOD:
1434     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1435     ecode++;
1436     break;
1437    
1438     /* End of subject or ending \n assertion (\Z) */
1439    
1440     case OP_EODN:
1441 nigel 91 if (eptr != md->end_subject &&
1442 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1443 nigel 91 RRETURN(MATCH_NOMATCH);
1444 nigel 77 ecode++;
1445     break;
1446    
1447     /* Word boundary assertions */
1448    
1449     case OP_NOT_WORD_BOUNDARY:
1450     case OP_WORD_BOUNDARY:
1451     {
1452    
1453     /* Find out if the previous and current characters are "word" characters.
1454     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1455     be "non-word" characters. */
1456    
1457     #ifdef SUPPORT_UTF8
1458     if (utf8)
1459     {
1460     if (eptr == md->start_subject) prev_is_word = FALSE; else
1461     {
1462     const uschar *lastptr = eptr - 1;
1463     while((*lastptr & 0xc0) == 0x80) lastptr--;
1464     GETCHAR(c, lastptr);
1465     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1466     }
1467     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1468     {
1469     GETCHAR(c, eptr);
1470     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1471     }
1472     }
1473     else
1474     #endif
1475    
1476     /* More streamlined when not in UTF-8 mode */
1477    
1478     {
1479     prev_is_word = (eptr != md->start_subject) &&
1480     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1481     cur_is_word = (eptr < md->end_subject) &&
1482     ((md->ctypes[*eptr] & ctype_word) != 0);
1483     }
1484    
1485     /* Now see if the situation is what we want */
1486    
1487     if ((*ecode++ == OP_WORD_BOUNDARY)?
1488     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1489     RRETURN(MATCH_NOMATCH);
1490     }
1491     break;
1492    
1493     /* Match a single character type; inline for speed */
1494    
1495     case OP_ANY:
1496 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1497 ph10 345 /* Fall through */
1498    
1499 ph10 341 case OP_ALLANY:
1500 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1501 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1502 nigel 77 ecode++;
1503     break;
1504    
1505     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1506     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1507    
1508     case OP_ANYBYTE:
1509     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1510     ecode++;
1511     break;
1512    
1513     case OP_NOT_DIGIT:
1514     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1515     GETCHARINCTEST(c, eptr);
1516     if (
1517     #ifdef SUPPORT_UTF8
1518     c < 256 &&
1519     #endif
1520     (md->ctypes[c] & ctype_digit) != 0
1521     )
1522     RRETURN(MATCH_NOMATCH);
1523     ecode++;
1524     break;
1525    
1526     case OP_DIGIT:
1527     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1528     GETCHARINCTEST(c, eptr);
1529     if (
1530     #ifdef SUPPORT_UTF8
1531     c >= 256 ||
1532     #endif
1533     (md->ctypes[c] & ctype_digit) == 0
1534     )
1535     RRETURN(MATCH_NOMATCH);
1536     ecode++;
1537     break;
1538    
1539     case OP_NOT_WHITESPACE:
1540     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1541     GETCHARINCTEST(c, eptr);
1542     if (
1543     #ifdef SUPPORT_UTF8
1544     c < 256 &&
1545     #endif
1546     (md->ctypes[c] & ctype_space) != 0
1547     )
1548     RRETURN(MATCH_NOMATCH);
1549     ecode++;
1550     break;
1551    
1552     case OP_WHITESPACE:
1553     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1554     GETCHARINCTEST(c, eptr);
1555     if (
1556     #ifdef SUPPORT_UTF8
1557     c >= 256 ||
1558     #endif
1559     (md->ctypes[c] & ctype_space) == 0
1560     )
1561     RRETURN(MATCH_NOMATCH);
1562     ecode++;
1563     break;
1564    
1565     case OP_NOT_WORDCHAR:
1566     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1567     GETCHARINCTEST(c, eptr);
1568     if (
1569     #ifdef SUPPORT_UTF8
1570     c < 256 &&
1571     #endif
1572     (md->ctypes[c] & ctype_word) != 0
1573     )
1574     RRETURN(MATCH_NOMATCH);
1575     ecode++;
1576     break;
1577    
1578     case OP_WORDCHAR:
1579     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1580     GETCHARINCTEST(c, eptr);
1581     if (
1582     #ifdef SUPPORT_UTF8
1583     c >= 256 ||
1584     #endif
1585     (md->ctypes[c] & ctype_word) == 0
1586     )
1587     RRETURN(MATCH_NOMATCH);
1588     ecode++;
1589     break;
1590    
1591 nigel 93 case OP_ANYNL:
1592     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1593     GETCHARINCTEST(c, eptr);
1594     switch(c)
1595     {
1596     default: RRETURN(MATCH_NOMATCH);
1597     case 0x000d:
1598     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1599     break;
1600 ph10 231
1601 nigel 93 case 0x000a:
1602 ph10 231 break;
1603    
1604 nigel 93 case 0x000b:
1605     case 0x000c:
1606     case 0x0085:
1607     case 0x2028:
1608     case 0x2029:
1609 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1610 nigel 93 break;
1611     }
1612     ecode++;
1613     break;
1614    
1615 ph10 178 case OP_NOT_HSPACE:
1616     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1617     GETCHARINCTEST(c, eptr);
1618     switch(c)
1619     {
1620     default: break;
1621     case 0x09: /* HT */
1622     case 0x20: /* SPACE */
1623     case 0xa0: /* NBSP */
1624     case 0x1680: /* OGHAM SPACE MARK */
1625     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1626     case 0x2000: /* EN QUAD */
1627     case 0x2001: /* EM QUAD */
1628     case 0x2002: /* EN SPACE */
1629     case 0x2003: /* EM SPACE */
1630     case 0x2004: /* THREE-PER-EM SPACE */
1631     case 0x2005: /* FOUR-PER-EM SPACE */
1632     case 0x2006: /* SIX-PER-EM SPACE */
1633     case 0x2007: /* FIGURE SPACE */
1634     case 0x2008: /* PUNCTUATION SPACE */
1635     case 0x2009: /* THIN SPACE */
1636     case 0x200A: /* HAIR SPACE */
1637     case 0x202f: /* NARROW NO-BREAK SPACE */
1638     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1639     case 0x3000: /* IDEOGRAPHIC SPACE */
1640     RRETURN(MATCH_NOMATCH);
1641     }
1642     ecode++;
1643     break;
1644    
1645     case OP_HSPACE:
1646     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1647     GETCHARINCTEST(c, eptr);
1648     switch(c)
1649     {
1650     default: RRETURN(MATCH_NOMATCH);
1651     case 0x09: /* HT */
1652     case 0x20: /* SPACE */
1653     case 0xa0: /* NBSP */
1654     case 0x1680: /* OGHAM SPACE MARK */
1655     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1656     case 0x2000: /* EN QUAD */
1657     case 0x2001: /* EM QUAD */
1658     case 0x2002: /* EN SPACE */
1659     case 0x2003: /* EM SPACE */
1660     case 0x2004: /* THREE-PER-EM SPACE */
1661     case 0x2005: /* FOUR-PER-EM SPACE */
1662     case 0x2006: /* SIX-PER-EM SPACE */
1663     case 0x2007: /* FIGURE SPACE */
1664     case 0x2008: /* PUNCTUATION SPACE */
1665     case 0x2009: /* THIN SPACE */
1666     case 0x200A: /* HAIR SPACE */
1667     case 0x202f: /* NARROW NO-BREAK SPACE */
1668     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1669     case 0x3000: /* IDEOGRAPHIC SPACE */
1670     break;
1671     }
1672     ecode++;
1673     break;
1674    
1675     case OP_NOT_VSPACE:
1676     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1677     GETCHARINCTEST(c, eptr);
1678     switch(c)
1679     {
1680     default: break;
1681     case 0x0a: /* LF */
1682     case 0x0b: /* VT */
1683     case 0x0c: /* FF */
1684     case 0x0d: /* CR */
1685     case 0x85: /* NEL */
1686     case 0x2028: /* LINE SEPARATOR */
1687     case 0x2029: /* PARAGRAPH SEPARATOR */
1688     RRETURN(MATCH_NOMATCH);
1689     }
1690     ecode++;
1691     break;
1692    
1693     case OP_VSPACE:
1694     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1695     GETCHARINCTEST(c, eptr);
1696     switch(c)
1697     {
1698     default: RRETURN(MATCH_NOMATCH);
1699     case 0x0a: /* LF */
1700     case 0x0b: /* VT */
1701     case 0x0c: /* FF */
1702     case 0x0d: /* CR */
1703     case 0x85: /* NEL */
1704     case 0x2028: /* LINE SEPARATOR */
1705     case 0x2029: /* PARAGRAPH SEPARATOR */
1706     break;
1707     }
1708     ecode++;
1709     break;
1710    
1711 nigel 77 #ifdef SUPPORT_UCP
1712     /* Check the next character by Unicode property. We will get here only
1713     if the support is in the binary; otherwise a compile-time error occurs. */
1714    
1715     case OP_PROP:
1716     case OP_NOTPROP:
1717     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1718     GETCHARINCTEST(c, eptr);
1719     {
1720 ph10 384 const ucd_record *prop = GET_UCD(c);
1721 nigel 77
1722 nigel 87 switch(ecode[1])
1723     {
1724     case PT_ANY:
1725     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1726     break;
1727 nigel 77
1728 nigel 87 case PT_LAMP:
1729 ph10 349 if ((prop->chartype == ucp_Lu ||
1730     prop->chartype == ucp_Ll ||
1731     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1732 nigel 77 RRETURN(MATCH_NOMATCH);
1733 nigel 87 break;
1734    
1735     case PT_GC:
1736 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1737 nigel 77 RRETURN(MATCH_NOMATCH);
1738 nigel 87 break;
1739    
1740     case PT_PC:
1741 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1742 nigel 87 RRETURN(MATCH_NOMATCH);
1743     break;
1744    
1745     case PT_SC:
1746 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
1747 nigel 87 RRETURN(MATCH_NOMATCH);
1748     break;
1749    
1750     default:
1751     RRETURN(PCRE_ERROR_INTERNAL);
1752 nigel 77 }
1753 nigel 87
1754     ecode += 3;
1755 nigel 77 }
1756     break;
1757    
1758     /* Match an extended Unicode sequence. We will get here only if the support
1759     is in the binary; otherwise a compile-time error occurs. */
1760    
1761     case OP_EXTUNI:
1762     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1763     GETCHARINCTEST(c, eptr);
1764     {
1765 ph10 349 int category = UCD_CATEGORY(c);
1766 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1767     while (eptr < md->end_subject)
1768     {
1769     int len = 1;
1770     if (!utf8) c = *eptr; else
1771     {
1772     GETCHARLEN(c, eptr, len);
1773     }
1774 ph10 349 category = UCD_CATEGORY(c);
1775 nigel 77 if (category != ucp_M) break;
1776     eptr += len;
1777     }
1778     }
1779     ecode++;
1780     break;
1781     #endif
1782    
1783    
1784     /* Match a back reference, possibly repeatedly. Look past the end of the
1785     item to see if there is repeat information following. The code is similar
1786     to that for character classes, but repeated for efficiency. Then obey
1787     similar code to character type repeats - written out again for speed.
1788     However, if the referenced string is the empty string, always treat
1789     it as matched, any number of times (otherwise there could be infinite
1790     loops). */
1791    
1792     case OP_REF:
1793     {
1794     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1795 ph10 345 ecode += 3;
1796    
1797 ph10 336 /* If the reference is unset, there are two possibilities:
1798 ph10 345
1799 ph10 336 (a) In the default, Perl-compatible state, set the length to be longer
1800     than the amount of subject left; this ensures that every attempt at a
1801     match fails. We can't just fail here, because of the possibility of
1802     quantifiers with zero minima.
1803 ph10 345
1804     (b) If the JavaScript compatibility flag is set, set the length to zero
1805     so that the back reference matches an empty string.
1806    
1807     Otherwise, set the length to the length of what was matched by the
1808 ph10 336 referenced subpattern. */
1809 ph10 345
1810 ph10 336 if (offset >= offset_top || md->offset_vector[offset] < 0)
1811 ph10 345 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1812 ph10 336 else
1813     length = md->offset_vector[offset+1] - md->offset_vector[offset];
1814 nigel 77
1815     /* Set up for repetition, or handle the non-repeated case */
1816    
1817     switch (*ecode)
1818     {
1819     case OP_CRSTAR:
1820     case OP_CRMINSTAR:
1821     case OP_CRPLUS:
1822     case OP_CRMINPLUS:
1823     case OP_CRQUERY:
1824     case OP_CRMINQUERY:
1825     c = *ecode++ - OP_CRSTAR;
1826     minimize = (c & 1) != 0;
1827     min = rep_min[c]; /* Pick up values from tables; */
1828     max = rep_max[c]; /* zero for max => infinity */
1829     if (max == 0) max = INT_MAX;
1830     break;
1831    
1832     case OP_CRRANGE:
1833     case OP_CRMINRANGE:
1834     minimize = (*ecode == OP_CRMINRANGE);
1835     min = GET2(ecode, 1);
1836     max = GET2(ecode, 3);
1837     if (max == 0) max = INT_MAX;
1838     ecode += 5;
1839     break;
1840    
1841     default: /* No repeat follows */
1842     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1843     eptr += length;
1844     continue; /* With the main loop */
1845     }
1846    
1847     /* If the length of the reference is zero, just continue with the
1848     main loop. */
1849    
1850     if (length == 0) continue;
1851    
1852     /* First, ensure the minimum number of matches are present. We get back
1853     the length of the reference string explicitly rather than passing the
1854     address of eptr, so that eptr can be a register variable. */
1855    
1856     for (i = 1; i <= min; i++)
1857     {
1858     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1859     eptr += length;
1860     }
1861    
1862     /* If min = max, continue at the same level without recursion.
1863     They are not both allowed to be zero. */
1864    
1865     if (min == max) continue;
1866    
1867     /* If minimizing, keep trying and advancing the pointer */
1868    
1869     if (minimize)
1870     {
1871     for (fi = min;; fi++)
1872     {
1873 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1874 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1875     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1876     RRETURN(MATCH_NOMATCH);
1877     eptr += length;
1878     }
1879     /* Control never gets here */
1880     }
1881    
1882     /* If maximizing, find the longest string and work backwards */
1883    
1884     else
1885     {
1886     pp = eptr;
1887     for (i = min; i < max; i++)
1888     {
1889     if (!match_ref(offset, eptr, length, md, ims)) break;
1890     eptr += length;
1891     }
1892     while (eptr >= pp)
1893     {
1894 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1895 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1896     eptr -= length;
1897     }
1898     RRETURN(MATCH_NOMATCH);
1899     }
1900     }
1901     /* Control never gets here */
1902    
1903    
1904    
1905     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1906     used when all the characters in the class have values in the range 0-255,
1907     and either the matching is caseful, or the characters are in the range
1908     0-127 when UTF-8 processing is enabled. The only difference between
1909     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1910     encountered.
1911    
1912     First, look past the end of the item to see if there is repeat information
1913     following. Then obey similar code to character type repeats - written out
1914     again for speed. */
1915    
1916     case OP_NCLASS:
1917     case OP_CLASS:
1918     {
1919     data = ecode + 1; /* Save for matching */
1920     ecode += 33; /* Advance past the item */
1921    
1922     switch (*ecode)
1923     {
1924     case OP_CRSTAR:
1925     case OP_CRMINSTAR:
1926     case OP_CRPLUS:
1927     case OP_CRMINPLUS:
1928     case OP_CRQUERY:
1929     case OP_CRMINQUERY:
1930     c = *ecode++ - OP_CRSTAR;
1931     minimize = (c & 1) != 0;
1932     min = rep_min[c]; /* Pick up values from tables; */
1933     max = rep_max[c]; /* zero for max => infinity */
1934     if (max == 0) max = INT_MAX;
1935     break;
1936    
1937     case OP_CRRANGE:
1938     case OP_CRMINRANGE:
1939     minimize = (*ecode == OP_CRMINRANGE);
1940     min = GET2(ecode, 1);
1941     max = GET2(ecode, 3);
1942     if (max == 0) max = INT_MAX;
1943     ecode += 5;
1944     break;
1945    
1946     default: /* No repeat follows */
1947     min = max = 1;
1948     break;
1949     }
1950    
1951     /* First, ensure the minimum number of matches are present. */
1952    
1953     #ifdef SUPPORT_UTF8
1954     /* UTF-8 mode */
1955     if (utf8)
1956     {
1957     for (i = 1; i <= min; i++)
1958     {
1959     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1960     GETCHARINC(c, eptr);
1961     if (c > 255)
1962     {
1963     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1964     }
1965     else
1966     {
1967     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1968     }
1969     }
1970     }
1971     else
1972     #endif
1973     /* Not UTF-8 mode */
1974     {
1975     for (i = 1; i <= min; i++)
1976     {
1977     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1978     c = *eptr++;
1979     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1980     }
1981     }
1982    
1983     /* If max == min we can continue with the main loop without the
1984     need to recurse. */
1985    
1986     if (min == max) continue;
1987    
1988     /* If minimizing, keep testing the rest of the expression and advancing
1989     the pointer while it matches the class. */
1990    
1991     if (minimize)
1992     {
1993     #ifdef SUPPORT_UTF8
1994     /* UTF-8 mode */
1995     if (utf8)
1996     {
1997     for (fi = min;; fi++)
1998     {
1999 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2000 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2001     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2002     GETCHARINC(c, eptr);
2003     if (c > 255)
2004     {
2005     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2006     }
2007     else
2008     {
2009     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2010     }
2011     }
2012     }
2013     else
2014     #endif
2015     /* Not UTF-8 mode */
2016     {
2017     for (fi = min;; fi++)
2018     {
2019 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2020 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2021     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2022     c = *eptr++;
2023     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2024     }
2025     }
2026     /* Control never gets here */
2027     }
2028    
2029     /* If maximizing, find the longest possible run, then work backwards. */
2030    
2031     else
2032     {
2033     pp = eptr;
2034    
2035     #ifdef SUPPORT_UTF8
2036     /* UTF-8 mode */
2037     if (utf8)
2038     {
2039     for (i = min; i < max; i++)
2040     {
2041     int len = 1;
2042     if (eptr >= md->end_subject) break;
2043     GETCHARLEN(c, eptr, len);
2044     if (c > 255)
2045     {
2046     if (op == OP_CLASS) break;
2047     }
2048     else
2049     {
2050     if ((data[c/8] & (1 << (c&7))) == 0) break;
2051     }
2052     eptr += len;
2053     }
2054     for (;;)
2055     {
2056 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2057 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2058     if (eptr-- == pp) break; /* Stop if tried at original pos */
2059     BACKCHAR(eptr);
2060     }
2061     }
2062     else
2063     #endif
2064     /* Not UTF-8 mode */
2065     {
2066     for (i = min; i < max; i++)
2067     {
2068     if (eptr >= md->end_subject) break;
2069     c = *eptr;
2070     if ((data[c/8] & (1 << (c&7))) == 0) break;
2071     eptr++;
2072     }
2073     while (eptr >= pp)
2074     {
2075 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2076 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2077 nigel 77 eptr--;
2078     }
2079     }
2080    
2081     RRETURN(MATCH_NOMATCH);
2082     }
2083     }
2084     /* Control never gets here */
2085    
2086    
2087     /* Match an extended character class. This opcode is encountered only
2088 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2089     mode, because Unicode properties are supported in non-UTF-8 mode. */
2090 nigel 77
2091     #ifdef SUPPORT_UTF8
2092     case OP_XCLASS:
2093     {
2094     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2095     ecode += GET(ecode, 1); /* Advance past the item */
2096    
2097     switch (*ecode)
2098     {
2099     case OP_CRSTAR:
2100     case OP_CRMINSTAR:
2101     case OP_CRPLUS:
2102     case OP_CRMINPLUS:
2103     case OP_CRQUERY:
2104     case OP_CRMINQUERY:
2105     c = *ecode++ - OP_CRSTAR;
2106     minimize = (c & 1) != 0;
2107     min = rep_min[c]; /* Pick up values from tables; */
2108     max = rep_max[c]; /* zero for max => infinity */
2109     if (max == 0) max = INT_MAX;
2110     break;
2111    
2112     case OP_CRRANGE:
2113     case OP_CRMINRANGE:
2114     minimize = (*ecode == OP_CRMINRANGE);
2115     min = GET2(ecode, 1);
2116     max = GET2(ecode, 3);
2117     if (max == 0) max = INT_MAX;
2118     ecode += 5;
2119     break;
2120    
2121     default: /* No repeat follows */
2122     min = max = 1;
2123     break;
2124     }
2125    
2126     /* First, ensure the minimum number of matches are present. */
2127    
2128     for (i = 1; i <= min; i++)
2129     {
2130     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2131 ph10 384 GETCHARINCTEST(c, eptr);
2132 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2133     }
2134    
2135     /* If max == min we can continue with the main loop without the
2136     need to recurse. */
2137    
2138     if (min == max) continue;
2139    
2140     /* If minimizing, keep testing the rest of the expression and advancing
2141     the pointer while it matches the class. */
2142    
2143     if (minimize)
2144     {
2145     for (fi = min;; fi++)
2146     {
2147 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2148 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2149     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2150 ph10 384 GETCHARINCTEST(c, eptr);
2151 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2152     }
2153     /* Control never gets here */
2154     }
2155    
2156     /* If maximizing, find the longest possible run, then work backwards. */
2157    
2158     else
2159     {
2160     pp = eptr;
2161     for (i = min; i < max; i++)
2162     {
2163     int len = 1;
2164     if (eptr >= md->end_subject) break;
2165 ph10 384 GETCHARLENTEST(c, eptr, len);
2166 nigel 77 if (!_pcre_xclass(c, data)) break;
2167     eptr += len;
2168     }
2169     for(;;)
2170     {
2171 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2172 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2173     if (eptr-- == pp) break; /* Stop if tried at original pos */
2174 ph10 214 if (utf8) BACKCHAR(eptr);
2175 nigel 77 }
2176     RRETURN(MATCH_NOMATCH);
2177     }
2178    
2179     /* Control never gets here */
2180     }
2181     #endif /* End of XCLASS */
2182    
2183     /* Match a single character, casefully */
2184    
2185     case OP_CHAR:
2186     #ifdef SUPPORT_UTF8
2187     if (utf8)
2188     {
2189     length = 1;
2190     ecode++;
2191     GETCHARLEN(fc, ecode, length);
2192     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2193     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2194     }
2195     else
2196     #endif
2197    
2198     /* Non-UTF-8 mode */
2199     {
2200     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2201     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2202     ecode += 2;
2203     }
2204     break;
2205    
2206     /* Match a single character, caselessly */
2207    
2208     case OP_CHARNC:
2209     #ifdef SUPPORT_UTF8
2210     if (utf8)
2211     {
2212     length = 1;
2213     ecode++;
2214     GETCHARLEN(fc, ecode, length);
2215    
2216     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2217    
2218     /* If the pattern character's value is < 128, we have only one byte, and
2219     can use the fast lookup table. */
2220    
2221     if (fc < 128)
2222     {
2223     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2224     }
2225    
2226     /* Otherwise we must pick up the subject character */
2227    
2228     else
2229     {
2230 nigel 93 unsigned int dc;
2231 nigel 77 GETCHARINC(dc, eptr);
2232     ecode += length;
2233    
2234     /* If we have Unicode property support, we can use it to test the other
2235 nigel 87 case of the character, if there is one. */
2236 nigel 77
2237     if (fc != dc)
2238     {
2239     #ifdef SUPPORT_UCP
2240 ph10 349 if (dc != UCD_OTHERCASE(fc))
2241 nigel 77 #endif
2242     RRETURN(MATCH_NOMATCH);
2243     }
2244     }
2245     }
2246     else
2247     #endif /* SUPPORT_UTF8 */
2248    
2249     /* Non-UTF-8 mode */
2250     {
2251     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2252     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2253     ecode += 2;
2254     }
2255     break;
2256    
2257 nigel 93 /* Match a single character repeatedly. */
2258 nigel 77
2259     case OP_EXACT:
2260     min = max = GET2(ecode, 1);
2261     ecode += 3;
2262     goto REPEATCHAR;
2263    
2264 nigel 93 case OP_POSUPTO:
2265     possessive = TRUE;
2266     /* Fall through */
2267    
2268 nigel 77 case OP_UPTO:
2269     case OP_MINUPTO:
2270     min = 0;
2271     max = GET2(ecode, 1);
2272     minimize = *ecode == OP_MINUPTO;
2273     ecode += 3;
2274     goto REPEATCHAR;
2275    
2276 nigel 93 case OP_POSSTAR:
2277     possessive = TRUE;
2278     min = 0;
2279     max = INT_MAX;
2280     ecode++;
2281     goto REPEATCHAR;
2282    
2283     case OP_POSPLUS:
2284     possessive = TRUE;
2285     min = 1;
2286     max = INT_MAX;
2287     ecode++;
2288     goto REPEATCHAR;
2289    
2290     case OP_POSQUERY:
2291     possessive = TRUE;
2292     min = 0;
2293     max = 1;
2294     ecode++;
2295     goto REPEATCHAR;
2296    
2297 nigel 77 case OP_STAR:
2298     case OP_MINSTAR:
2299     case OP_PLUS:
2300     case OP_MINPLUS:
2301     case OP_QUERY:
2302     case OP_MINQUERY:
2303     c = *ecode++ - OP_STAR;
2304     minimize = (c & 1) != 0;
2305     min = rep_min[c]; /* Pick up values from tables; */
2306     max = rep_max[c]; /* zero for max => infinity */
2307     if (max == 0) max = INT_MAX;
2308    
2309     /* Common code for all repeated single-character matches. We can give
2310     up quickly if there are fewer than the minimum number of characters left in
2311     the subject. */
2312    
2313     REPEATCHAR:
2314     #ifdef SUPPORT_UTF8
2315     if (utf8)
2316     {
2317     length = 1;
2318     charptr = ecode;
2319     GETCHARLEN(fc, ecode, length);
2320     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2321     ecode += length;
2322    
2323     /* Handle multibyte character matching specially here. There is
2324     support for caseless matching if UCP support is present. */
2325    
2326     if (length > 1)
2327     {
2328     #ifdef SUPPORT_UCP
2329 nigel 93 unsigned int othercase;
2330 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2331 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
2332 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2333 ph10 115 else oclength = 0;
2334 nigel 77 #endif /* SUPPORT_UCP */
2335    
2336     for (i = 1; i <= min; i++)
2337     {
2338     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2339 ph10 123 #ifdef SUPPORT_UCP
2340 nigel 77 /* Need braces because of following else */
2341     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2342     else
2343     {
2344     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2345     eptr += oclength;
2346     }
2347 ph10 115 #else /* without SUPPORT_UCP */
2348     else { RRETURN(MATCH_NOMATCH); }
2349 ph10 123 #endif /* SUPPORT_UCP */
2350 nigel 77 }
2351    
2352     if (min == max) continue;
2353    
2354     if (minimize)
2355     {
2356     for (fi = min;; fi++)
2357     {
2358 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2359 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2360     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2361     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2362 ph10 123 #ifdef SUPPORT_UCP
2363 nigel 77 /* Need braces because of following else */
2364     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2365     else
2366     {
2367     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2368     eptr += oclength;
2369     }
2370 ph10 115 #else /* without SUPPORT_UCP */
2371     else { RRETURN (MATCH_NOMATCH); }
2372     #endif /* SUPPORT_UCP */
2373 nigel 77 }
2374     /* Control never gets here */
2375     }
2376 nigel 93
2377     else /* Maximize */
2378 nigel 77 {
2379     pp = eptr;
2380     for (i = min; i < max; i++)
2381     {
2382     if (eptr > md->end_subject - length) break;
2383     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2384 ph10 123 #ifdef SUPPORT_UCP
2385 nigel 77 else if (oclength == 0) break;
2386     else
2387     {
2388     if (memcmp(eptr, occhars, oclength) != 0) break;
2389     eptr += oclength;
2390     }
2391 ph10 115 #else /* without SUPPORT_UCP */
2392     else break;
2393 ph10 123 #endif /* SUPPORT_UCP */
2394 nigel 77 }
2395 nigel 93
2396     if (possessive) continue;
2397 ph10 120 for(;;)
2398 nigel 77 {
2399 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2400 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2401 ph10 120 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2402 ph10 115 #ifdef SUPPORT_UCP
2403     eptr--;
2404     BACKCHAR(eptr);
2405 ph10 123 #else /* without SUPPORT_UCP */
2406 nigel 77 eptr -= length;
2407 ph10 123 #endif /* SUPPORT_UCP */
2408 nigel 77 }
2409     }
2410     /* Control never gets here */
2411     }
2412    
2413     /* If the length of a UTF-8 character is 1, we fall through here, and
2414     obey the code as for non-UTF-8 characters below, though in this case the
2415     value of fc will always be < 128. */
2416     }
2417     else
2418     #endif /* SUPPORT_UTF8 */
2419    
2420     /* When not in UTF-8 mode, load a single-byte character. */
2421     {
2422     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2423     fc = *ecode++;
2424     }
2425    
2426     /* The value of fc at this point is always less than 256, though we may or
2427     may not be in UTF-8 mode. The code is duplicated for the caseless and
2428     caseful cases, for speed, since matching characters is likely to be quite
2429     common. First, ensure the minimum number of matches are present. If min =
2430     max, continue at the same level without recursing. Otherwise, if
2431     minimizing, keep trying the rest of the expression and advancing one
2432     matching character if failing, up to the maximum. Alternatively, if
2433     maximizing, find the maximum number of characters and work backwards. */
2434    
2435     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2436     max, eptr));
2437    
2438     if ((ims & PCRE_CASELESS) != 0)
2439     {
2440     fc = md->lcc[fc];
2441     for (i = 1; i <= min; i++)
2442     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2443     if (min == max) continue;
2444     if (minimize)
2445     {
2446     for (fi = min;; fi++)
2447     {
2448 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2449 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2450     if (fi >= max || eptr >= md->end_subject ||
2451     fc != md->lcc[*eptr++])
2452     RRETURN(MATCH_NOMATCH);
2453     }
2454     /* Control never gets here */
2455     }
2456 nigel 93 else /* Maximize */
2457 nigel 77 {
2458     pp = eptr;
2459     for (i = min; i < max; i++)
2460     {
2461     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2462     eptr++;
2463     }
2464 nigel 93 if (possessive) continue;
2465 nigel 77 while (eptr >= pp)
2466     {
2467 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2468 nigel 77 eptr--;
2469     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2470     }
2471     RRETURN(MATCH_NOMATCH);
2472     }
2473     /* Control never gets here */
2474     }
2475    
2476     /* Caseful comparisons (includes all multi-byte characters) */
2477    
2478     else
2479     {
2480     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2481     if (min == max) continue;
2482     if (minimize)
2483     {
2484     for (fi = min;; fi++)
2485     {
2486 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2487 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2488     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2489     RRETURN(MATCH_NOMATCH);
2490     }
2491     /* Control never gets here */
2492     }
2493 nigel 93 else /* Maximize */
2494 nigel 77 {
2495     pp = eptr;
2496     for (i = min; i < max; i++)
2497     {
2498     if (eptr >= md->end_subject || fc != *eptr) break;
2499     eptr++;
2500     }
2501 nigel 93 if (possessive) continue;
2502 nigel 77 while (eptr >= pp)
2503     {
2504 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2505 nigel 77 eptr--;
2506     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2507     }
2508     RRETURN(MATCH_NOMATCH);
2509     }
2510     }
2511     /* Control never gets here */
2512    
2513     /* Match a negated single one-byte character. The character we are
2514     checking can be multibyte. */
2515    
2516     case OP_NOT:
2517     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2518     ecode++;
2519     GETCHARINCTEST(c, eptr);
2520     if ((ims & PCRE_CASELESS) != 0)
2521     {
2522     #ifdef SUPPORT_UTF8
2523     if (c < 256)
2524     #endif
2525     c = md->lcc[c];
2526     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2527     }
2528     else
2529     {
2530     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2531     }
2532     break;
2533    
2534     /* Match a negated single one-byte character repeatedly. This is almost a
2535     repeat of the code for a repeated single character, but I haven't found a
2536     nice way of commoning these up that doesn't require a test of the
2537     positive/negative option for each character match. Maybe that wouldn't add
2538     very much to the time taken, but character matching *is* what this is all
2539     about... */
2540    
2541     case OP_NOTEXACT:
2542     min = max = GET2(ecode, 1);
2543     ecode += 3;
2544     goto REPEATNOTCHAR;
2545    
2546     case OP_NOTUPTO:
2547     case OP_NOTMINUPTO:
2548     min = 0;
2549     max = GET2(ecode, 1);
2550     minimize = *ecode == OP_NOTMINUPTO;
2551     ecode += 3;
2552     goto REPEATNOTCHAR;
2553    
2554 nigel 93 case OP_NOTPOSSTAR:
2555     possessive = TRUE;
2556     min = 0;
2557     max = INT_MAX;
2558     ecode++;
2559     goto REPEATNOTCHAR;
2560    
2561     case OP_NOTPOSPLUS:
2562     possessive = TRUE;
2563     min = 1;
2564     max = INT_MAX;
2565     ecode++;
2566     goto REPEATNOTCHAR;
2567    
2568     case OP_NOTPOSQUERY:
2569     possessive = TRUE;
2570     min = 0;
2571     max = 1;
2572     ecode++;
2573     goto REPEATNOTCHAR;
2574    
2575     case OP_NOTPOSUPTO:
2576     possessive = TRUE;
2577     min = 0;
2578     max = GET2(ecode, 1);
2579     ecode += 3;
2580     goto REPEATNOTCHAR;
2581    
2582 nigel 77 case OP_NOTSTAR:
2583     case OP_NOTMINSTAR:
2584     case OP_NOTPLUS:
2585     case OP_NOTMINPLUS:
2586     case OP_NOTQUERY:
2587     case OP_NOTMINQUERY:
2588     c = *ecode++ - OP_NOTSTAR;
2589     minimize = (c & 1) != 0;
2590     min = rep_min[c]; /* Pick up values from tables; */
2591     max = rep_max[c]; /* zero for max => infinity */
2592     if (max == 0) max = INT_MAX;
2593    
2594     /* Common code for all repeated single-byte matches. We can give up quickly
2595     if there are fewer than the minimum number of bytes left in the
2596     subject. */
2597    
2598     REPEATNOTCHAR:
2599     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2600     fc = *ecode++;
2601    
2602     /* The code is duplicated for the caseless and caseful cases, for speed,
2603     since matching characters is likely to be quite common. First, ensure the
2604     minimum number of matches are present. If min = max, continue at the same
2605     level without recursing. Otherwise, if minimizing, keep trying the rest of
2606     the expression and advancing one matching character if failing, up to the
2607     maximum. Alternatively, if maximizing, find the maximum number of
2608     characters and work backwards. */
2609    
2610     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2611     max, eptr));
2612    
2613     if ((ims & PCRE_CASELESS) != 0)
2614     {
2615     fc = md->lcc[fc];
2616    
2617     #ifdef SUPPORT_UTF8
2618     /* UTF-8 mode */
2619     if (utf8)
2620     {
2621 nigel 93 register unsigned int d;
2622 nigel 77 for (i = 1; i <= min; i++)
2623     {
2624     GETCHARINC(d, eptr);
2625     if (d < 256) d = md->lcc[d];
2626     if (fc == d) RRETURN(MATCH_NOMATCH);
2627     }
2628     }
2629     else
2630     #endif
2631    
2632     /* Not UTF-8 mode */
2633     {
2634     for (i = 1; i <= min; i++)
2635     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2636     }
2637    
2638     if (min == max) continue;
2639    
2640     if (minimize)
2641     {
2642     #ifdef SUPPORT_UTF8
2643     /* UTF-8 mode */
2644     if (utf8)
2645     {
2646 nigel 93 register unsigned int d;
2647 nigel 77 for (fi = min;; fi++)
2648     {
2649 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2650 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2651 ph10 366 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2652 nigel 77 GETCHARINC(d, eptr);
2653     if (d < 256) d = md->lcc[d];
2654 ph10 366 if (fc == d) RRETURN(MATCH_NOMATCH);
2655 ph10 371
2656 nigel 77 }
2657     }
2658     else
2659     #endif
2660     /* Not UTF-8 mode */
2661     {
2662     for (fi = min;; fi++)
2663     {
2664 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2665 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2666     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2667     RRETURN(MATCH_NOMATCH);
2668     }
2669     }
2670     /* Control never gets here */
2671     }
2672    
2673     /* Maximize case */
2674    
2675     else
2676     {
2677     pp = eptr;
2678    
2679     #ifdef SUPPORT_UTF8
2680     /* UTF-8 mode */
2681     if (utf8)
2682     {
2683 nigel 93 register unsigned int d;
2684 nigel 77 for (i = min; i < max; i++)
2685     {
2686     int len = 1;
2687     if (eptr >= md->end_subject) break;
2688     GETCHARLEN(d, eptr, len);
2689     if (d < 256) d = md->lcc[d];
2690     if (fc == d) break;
2691     eptr += len;
2692     }
2693 nigel 93 if (possessive) continue;
2694     for(;;)
2695 nigel 77 {
2696 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2697 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2698     if (eptr-- == pp) break; /* Stop if tried at original pos */
2699     BACKCHAR(eptr);
2700     }
2701     }
2702     else
2703     #endif
2704     /* Not UTF-8 mode */
2705     {
2706     for (i = min; i < max; i++)
2707     {
2708     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2709     eptr++;
2710     }
2711 nigel 93 if (possessive) continue;
2712 nigel 77 while (eptr >= pp)
2713     {
2714 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2715 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2716     eptr--;
2717     }
2718     }
2719    
2720     RRETURN(MATCH_NOMATCH);
2721     }
2722     /* Control never gets here */
2723     }
2724    
2725     /* Caseful comparisons */
2726    
2727     else
2728     {
2729     #ifdef SUPPORT_UTF8
2730     /* UTF-8 mode */
2731     if (utf8)
2732     {
2733 nigel 93 register unsigned int d;
2734 nigel 77 for (i = 1; i <= min; i++)
2735     {
2736     GETCHARINC(d, eptr);
2737     if (fc == d) RRETURN(MATCH_NOMATCH);
2738     }
2739     }
2740     else
2741     #endif
2742     /* Not UTF-8 mode */
2743     {
2744     for (i = 1; i <= min; i++)
2745     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2746     }
2747    
2748     if (min == max) continue;
2749    
2750     if (minimize)
2751     {
2752     #ifdef SUPPORT_UTF8
2753     /* UTF-8 mode */
2754     if (utf8)
2755     {
2756 nigel 93 register unsigned int d;
2757 nigel 77 for (fi = min;; fi++)
2758     {
2759 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2760 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2761 ph10 366 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2762 nigel 77 GETCHARINC(d, eptr);
2763 ph10 371 if (fc == d) RRETURN(MATCH_NOMATCH);
2764 nigel 77 }
2765     }
2766     else
2767     #endif
2768     /* Not UTF-8 mode */
2769     {
2770     for (fi = min;; fi++)
2771     {
2772 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2773 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2774     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2775     RRETURN(MATCH_NOMATCH);
2776     }
2777     }
2778     /* Control never gets here */
2779     }
2780    
2781     /* Maximize case */
2782    
2783     else
2784     {
2785     pp = eptr;
2786    
2787     #ifdef SUPPORT_UTF8
2788     /* UTF-8 mode */
2789     if (utf8)
2790     {
2791 nigel 93 register unsigned int d;
2792 nigel 77 for (i = min; i < max; i++)
2793     {
2794     int len = 1;
2795     if (eptr >= md->end_subject) break;
2796     GETCHARLEN(d, eptr, len);
2797     if (fc == d) break;
2798     eptr += len;
2799     }
2800 nigel 93 if (possessive) continue;
2801 nigel 77 for(;;)
2802     {
2803 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2804 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2805     if (eptr-- == pp) break; /* Stop if tried at original pos */
2806     BACKCHAR(eptr);
2807     }
2808     }
2809     else
2810     #endif
2811     /* Not UTF-8 mode */
2812     {
2813     for (i = min; i < max; i++)
2814     {
2815     if (eptr >= md->end_subject || fc == *eptr) break;
2816     eptr++;
2817     }
2818 nigel 93 if (possessive) continue;
2819 nigel 77 while (eptr >= pp)
2820     {
2821 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2822 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2823     eptr--;
2824     }
2825     }
2826    
2827     RRETURN(MATCH_NOMATCH);
2828     }
2829     }
2830     /* Control never gets here */
2831    
2832     /* Match a single character type repeatedly; several different opcodes
2833     share code. This is very similar to the code for single characters, but we
2834     repeat it in the interests of efficiency. */
2835    
2836     case OP_TYPEEXACT:
2837     min = max = GET2(ecode, 1);
2838     minimize = TRUE;
2839     ecode += 3;
2840     goto REPEATTYPE;
2841    
2842     case OP_TYPEUPTO:
2843     case OP_TYPEMINUPTO:
2844     min = 0;
2845     max = GET2(ecode, 1);
2846     minimize = *ecode == OP_TYPEMINUPTO;
2847     ecode += 3;
2848     goto REPEATTYPE;
2849    
2850 nigel 93 case OP_TYPEPOSSTAR:
2851     possessive = TRUE;
2852     min = 0;
2853     max = INT_MAX;
2854     ecode++;
2855     goto REPEATTYPE;
2856    
2857     case OP_TYPEPOSPLUS:
2858     possessive = TRUE;
2859     min = 1;
2860     max = INT_MAX;
2861     ecode++;
2862     goto REPEATTYPE;
2863    
2864     case OP_TYPEPOSQUERY:
2865     possessive = TRUE;
2866     min = 0;
2867     max = 1;
2868     ecode++;
2869     goto REPEATTYPE;
2870    
2871     case OP_TYPEPOSUPTO:
2872     possessive = TRUE;
2873     min = 0;
2874     max = GET2(ecode, 1);
2875     ecode += 3;
2876     goto REPEATTYPE;
2877    
2878 nigel 77 case OP_TYPESTAR:
2879     case OP_TYPEMINSTAR:
2880     case OP_TYPEPLUS:
2881     case OP_TYPEMINPLUS:
2882     case OP_TYPEQUERY:
2883     case OP_TYPEMINQUERY:
2884     c = *ecode++ - OP_TYPESTAR;
2885     minimize = (c & 1) != 0;
2886     min = rep_min[c]; /* Pick up values from tables; */
2887     max = rep_max[c]; /* zero for max => infinity */
2888     if (max == 0) max = INT_MAX;
2889    
2890     /* Common code for all repeated single character type matches. Note that
2891     in UTF-8 mode, '.' matches a character of any length, but for the other
2892     character types, the valid characters are all one-byte long. */
2893    
2894     REPEATTYPE:
2895     ctype = *ecode++; /* Code for the character type */
2896    
2897     #ifdef SUPPORT_UCP
2898     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2899     {
2900     prop_fail_result = ctype == OP_NOTPROP;
2901     prop_type = *ecode++;
2902 nigel 87 prop_value = *ecode++;
2903 nigel 77 }
2904     else prop_type = -1;
2905     #endif
2906    
2907     /* First, ensure the minimum number of matches are present. Use inline
2908     code for maximizing the speed, and do the type test once at the start
2909     (i.e. keep it out of the loop). Also we can test that there are at least
2910     the minimum number of bytes before we start. This isn't as effective in
2911     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2912     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2913     and single-bytes. */
2914    
2915     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2916     if (min > 0)
2917     {
2918     #ifdef SUPPORT_UCP
2919 nigel 87 if (prop_type >= 0)
2920 nigel 77 {
2921 nigel 87 switch(prop_type)
2922 nigel 77 {
2923 nigel 87 case PT_ANY:
2924     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2925     for (i = 1; i <= min; i++)
2926     {
2927     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2928 ph10 184 GETCHARINCTEST(c, eptr);
2929 nigel 87 }
2930     break;
2931    
2932     case PT_LAMP:
2933     for (i = 1; i <= min; i++)
2934     {
2935     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2936 ph10 184 GETCHARINCTEST(c, eptr);
2937 ph10 349 prop_chartype = UCD_CHARTYPE(c);
2938 nigel 87 if ((prop_chartype == ucp_Lu ||
2939     prop_chartype == ucp_Ll ||
2940     prop_chartype == ucp_Lt) == prop_fail_result)
2941     RRETURN(MATCH_NOMATCH);
2942     }
2943     break;
2944    
2945     case PT_GC:
2946     for (i = 1; i <= min; i++)
2947     {
2948     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2949 ph10 184 GETCHARINCTEST(c, eptr);
2950 ph10 349 prop_category = UCD_CATEGORY(c);
2951 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
2952     RRETURN(MATCH_NOMATCH);
2953     }
2954     break;
2955    
2956     case PT_PC:
2957     for (i = 1; i <= min; i++)
2958     {
2959     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2960 ph10 184 GETCHARINCTEST(c, eptr);
2961 ph10 349 prop_chartype = UCD_CHARTYPE(c);
2962 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
2963     RRETURN(MATCH_NOMATCH);
2964     }
2965     break;
2966    
2967     case PT_SC:
2968     for (i = 1; i <= min; i++)
2969     {
2970     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2971 ph10 184 GETCHARINCTEST(c, eptr);
2972 ph10 349 prop_script = UCD_SCRIPT(c);
2973 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
2974     RRETURN(MATCH_NOMATCH);
2975     }
2976     break;
2977    
2978     default:
2979     RRETURN(PCRE_ERROR_INTERNAL);
2980 nigel 77 }
2981     }
2982    
2983     /* Match extended Unicode sequences. We will get here only if the
2984     support is in the binary; otherwise a compile-time error occurs. */
2985    
2986     else if (ctype == OP_EXTUNI)
2987     {
2988     for (i = 1; i <= min; i++)
2989     {
2990     GETCHARINCTEST(c, eptr);
2991 ph10 349 prop_category = UCD_CATEGORY(c);
2992 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2993     while (eptr < md->end_subject)
2994     {
2995     int len = 1;
2996     if (!utf8) c = *eptr; else
2997     {
2998     GETCHARLEN(c, eptr, len);
2999     }
3000 ph10 349 prop_category = UCD_CATEGORY(c);
3001 nigel 77 if (prop_category != ucp_M) break;
3002     eptr += len;
3003     }
3004     }
3005     }
3006    
3007     else
3008     #endif /* SUPPORT_UCP */
3009    
3010     /* Handle all other cases when the coding is UTF-8 */
3011    
3012     #ifdef SUPPORT_UTF8
3013     if (utf8) switch(ctype)
3014     {
3015     case OP_ANY:
3016     for (i = 1; i <= min; i++)
3017     {
3018 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr))
3019 nigel 77 RRETURN(MATCH_NOMATCH);
3020 nigel 91 eptr++;
3021 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3022     }
3023     break;
3024    
3025 ph10 341 case OP_ALLANY:
3026     for (i = 1; i <= min; i++)
3027     {
3028     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3029     eptr++;
3030     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3031     }
3032     break;
3033    
3034 nigel 77 case OP_ANYBYTE:
3035     eptr += min;
3036     break;
3037    
3038 nigel 93 case OP_ANYNL:
3039     for (i = 1; i <= min; i++)
3040     {
3041     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3042     GETCHARINC(c, eptr);
3043     switch(c)
3044     {
3045     default: RRETURN(MATCH_NOMATCH);
3046     case 0x000d:
3047     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3048     break;
3049 ph10 231
3050 nigel 93 case 0x000a:
3051 ph10 231 break;
3052    
3053 nigel 93 case 0x000b:
3054     case 0x000c:
3055     case 0x0085:
3056     case 0x2028:
3057     case 0x2029:
3058 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3059 nigel 93 break;
3060     }
3061     }
3062     break;
3063    
3064 ph10 178 case OP_NOT_HSPACE:
3065     for (i = 1; i <= min; i++)
3066     {
3067     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3068     GETCHARINC(c, eptr);
3069     switch(c)
3070     {
3071     default: break;
3072     case 0x09: /* HT */
3073     case 0x20: /* SPACE */
3074     case 0xa0: /* NBSP */
3075     case 0x1680: /* OGHAM SPACE MARK */
3076     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3077     case 0x2000: /* EN QUAD */
3078     case 0x2001: /* EM QUAD */
3079     case 0x2002: /* EN SPACE */
3080     case 0x2003: /* EM SPACE */
3081     case 0x2004: /* THREE-PER-EM SPACE */
3082     case 0x2005: /* FOUR-PER-EM SPACE */
3083     case 0x2006: /* SIX-PER-EM SPACE */
3084     case 0x2007: /* FIGURE SPACE */
3085     case 0x2008: /* PUNCTUATION SPACE */
3086     case 0x2009: /* THIN SPACE */
3087     case 0x200A: /* HAIR SPACE */
3088     case 0x202f: /* NARROW NO-BREAK SPACE */
3089     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3090     case 0x3000: /* IDEOGRAPHIC SPACE */
3091     RRETURN(MATCH_NOMATCH);
3092     }
3093     }
3094     break;
3095 ph10 182
3096 ph10 178 case OP_HSPACE:
3097     for (i = 1; i <= min; i++)
3098     {
3099     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3100     GETCHARINC(c, eptr);
3101     switch(c)
3102     {
3103     default: RRETURN(MATCH_NOMATCH);
3104     case 0x09: /* HT */
3105     case 0x20: /* SPACE */
3106     case 0xa0: /* NBSP */
3107     case 0x1680: /* OGHAM SPACE MARK */
3108     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3109     case 0x2000: /* EN QUAD */
3110     case 0x2001: /* EM QUAD */
3111     case 0x2002: /* EN SPACE */
3112     case 0x2003: /* EM SPACE */
3113     case 0x2004: /* THREE-PER-EM SPACE */
3114     case 0x2005: /* FOUR-PER-EM SPACE */
3115     case 0x2006: /* SIX-PER-EM SPACE */
3116     case 0x2007: /* FIGURE SPACE */
3117     case 0x2008: /* PUNCTUATION SPACE */
3118     case 0x2009: /* THIN SPACE */
3119     case 0x200A: /* HAIR SPACE */
3120     case 0x202f: /* NARROW NO-BREAK SPACE */
3121     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3122     case 0x3000: /* IDEOGRAPHIC SPACE */
3123     break;
3124     }
3125     }
3126     break;
3127 ph10 182
3128 ph10 178 case OP_NOT_VSPACE:
3129     for (i = 1; i <= min; i++)
3130     {
3131     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3132     GETCHARINC(c, eptr);
3133     switch(c)
3134     {
3135     default: break;
3136     case 0x0a: /* LF */
3137     case 0x0b: /* VT */
3138     case 0x0c: /* FF */
3139     case 0x0d: /* CR */
3140     case 0x85: /* NEL */
3141     case 0x2028: /* LINE SEPARATOR */
3142     case 0x2029: /* PARAGRAPH SEPARATOR */
3143     RRETURN(MATCH_NOMATCH);
3144     }
3145     }
3146     break;
3147 ph10 182
3148 ph10 178 case OP_VSPACE:
3149     for (i = 1; i <= min; i++)
3150     {
3151     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3152     GETCHARINC(c, eptr);
3153     switch(c)
3154     {
3155     default: RRETURN(MATCH_NOMATCH);
3156     case 0x0a: /* LF */
3157     case 0x0b: /* VT */
3158     case 0x0c: /* FF */
3159     case 0x0d: /* CR */
3160     case 0x85: /* NEL */
3161     case 0x2028: /* LINE SEPARATOR */
3162     case 0x2029: /* PARAGRAPH SEPARATOR */
3163 ph10 182 break;
3164 ph10 178 }
3165     }
3166     break;
3167    
3168 nigel 77 case OP_NOT_DIGIT:
3169     for (i = 1; i <= min; i++)
3170     {
3171     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3172     GETCHARINC(c, eptr);
3173     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3174     RRETURN(MATCH_NOMATCH);
3175     }
3176     break;
3177    
3178     case OP_DIGIT:
3179     for (i = 1; i <= min; i++)
3180     {
3181     if (eptr >= md->end_subject ||
3182     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3183     RRETURN(MATCH_NOMATCH);
3184     /* No need to skip more bytes - we know it's a 1-byte character */
3185     }
3186     break;
3187    
3188     case OP_NOT_WHITESPACE:
3189     for (i = 1; i <= min; i++)
3190     {
3191     if (eptr >= md->end_subject ||
3192 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3193 nigel 77 RRETURN(MATCH_NOMATCH);
3194 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3195 nigel 77 }
3196     break;
3197    
3198     case OP_WHITESPACE:
3199     for (i = 1; i <= min; i++)
3200     {
3201     if (eptr >= md->end_subject ||
3202     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3203     RRETURN(MATCH_NOMATCH);
3204     /* No need to skip more bytes - we know it's a 1-byte character */
3205     }
3206     break;
3207    
3208     case OP_NOT_WORDCHAR:
3209     for (i = 1; i <= min; i++)
3210     {
3211     if (eptr >= md->end_subject ||
3212 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3213 nigel 77 RRETURN(MATCH_NOMATCH);
3214 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3215 nigel 77 }
3216     break;
3217    
3218     case OP_WORDCHAR:
3219     for (i = 1; i <= min; i++)
3220     {
3221     if (eptr >= md->end_subject ||
3222     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3223     RRETURN(MATCH_NOMATCH);
3224     /* No need to skip more bytes - we know it's a 1-byte character */
3225     }
3226     break;
3227    
3228     default:
3229     RRETURN(PCRE_ERROR_INTERNAL);
3230     } /* End switch(ctype) */
3231    
3232     else
3233     #endif /* SUPPORT_UTF8 */
3234    
3235     /* Code for the non-UTF-8 case for minimum matching of operators other
3236 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3237     number of bytes present, as this was tested above. */
3238 nigel 77
3239     switch(ctype)
3240     {
3241     case OP_ANY:
3242 ph10 342 for (i = 1; i <= min; i++)
3243 nigel 77 {
3244 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3245     eptr++;
3246 nigel 77 }
3247     break;
3248    
3249 ph10 341 case OP_ALLANY:
3250     eptr += min;
3251     break;
3252    
3253 nigel 77 case OP_ANYBYTE:
3254     eptr += min;
3255     break;
3256    
3257 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
3258     bytes are present in this case. */
3259    
3260     case OP_ANYNL:
3261     for (i = 1; i <= min; i++)
3262     {
3263     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3264     switch(*eptr++)
3265     {
3266     default: RRETURN(MATCH_NOMATCH);
3267     case 0x000d:
3268     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3269     break;
3270     case 0x000a:
3271 ph10 231 break;
3272    
3273 nigel 93 case 0x000b:
3274     case 0x000c:
3275     case 0x0085:
3276 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3277 nigel 93 break;
3278     }
3279     }
3280     break;
3281    
3282 ph10 178 case OP_NOT_HSPACE:
3283     for (i = 1; i <= min; i++)
3284     {
3285     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3286     switch(*eptr++)
3287     {
3288     default: break;
3289     case 0x09: /* HT */
3290     case 0x20: /* SPACE */
3291     case 0xa0: /* NBSP */
3292     RRETURN(MATCH_NOMATCH);
3293     }
3294     }
3295     break;
3296    
3297     case OP_HSPACE:
3298     for (i = 1; i <= min; i++)
3299     {
3300     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3301     switch(*eptr++)
3302     {
3303     default: RRETURN(MATCH_NOMATCH);
3304     case 0x09: /* HT */
3305     case 0x20: /* SPACE */
3306     case 0xa0: /* NBSP */
3307 ph10 182 break;
3308 ph10 178 }
3309     }
3310     break;
3311    
3312     case OP_NOT_VSPACE:
3313     for (i = 1; i <= min; i++)
3314     {
3315     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3316     switch(*eptr++)
3317     {
3318     default: break;
3319     case 0x0a: /* LF */
3320     case 0x0b: /* VT */
3321     case 0x0c: /* FF */
3322     case 0x0d: /* CR */
3323     case 0x85: /* NEL */
3324     RRETURN(MATCH_NOMATCH);
3325     }
3326     }
3327     break;
3328    
3329     case OP_VSPACE:
3330     for (i = 1; i <= min; i++)
3331     {
3332     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3333     switch(*eptr++)
3334     {
3335     default: RRETURN(MATCH_NOMATCH);
3336     case 0x0a: /* LF */
3337     case 0x0b: /* VT */
3338     case 0x0c: /* FF */
3339     case 0x0d: /* CR */
3340     case 0x85: /* NEL */
3341 ph10 182 break;
3342 ph10 178 }
3343     }
3344     break;
3345    
3346 nigel 77 case OP_NOT_DIGIT:
3347     for (i = 1; i <= min; i++)
3348     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3349     break;
3350    
3351     case OP_DIGIT:
3352     for (i = 1; i <= min; i++)
3353     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3354     break;
3355    
3356     case OP_NOT_WHITESPACE:
3357     for (i = 1; i <= min; i++)
3358     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3359     break;
3360    
3361     case OP_WHITESPACE:
3362     for (i = 1; i <= min; i++)
3363     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3364     break;
3365    
3366     case OP_NOT_WORDCHAR:
3367     for (i = 1; i <= min; i++)
3368     if ((md->ctypes[*eptr++] & ctype_word) != 0)
3369     RRETURN(MATCH_NOMATCH);
3370     break;
3371    
3372     case OP_WORDCHAR:
3373     for (i = 1; i <= min; i++)
3374     if ((md->ctypes[*eptr++] & ctype_word) == 0)
3375     RRETURN(MATCH_NOMATCH);
3376     break;
3377    
3378     default:
3379     RRETURN(PCRE_ERROR_INTERNAL);
3380     }
3381     }
3382    
3383     /* If min = max, continue at the same level without recursing */
3384    
3385     if (min == max) continue;
3386    
3387     /* If minimizing, we have to test the rest of the pattern before each
3388     subsequent match. Again, separate the UTF-8 case for speed, and also
3389     separate the UCP cases. */
3390    
3391     if (minimize)
3392     {
3393     #ifdef SUPPORT_UCP
3394 nigel 87 if (prop_type >= 0)
3395 nigel 77 {
3396 nigel 87 switch(prop_type)
3397 nigel 77 {
3398 nigel 87 case PT_ANY:
3399     for (fi = min;; fi++)
3400     {
3401 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3402 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3403     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3404     GETCHARINC(c, eptr);
3405     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3406     }
3407 nigel 93 /* Control never gets here */
3408 nigel 87
3409     case PT_LAMP:
3410     for (fi = min;; fi++)
3411     {
3412 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3413 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3414     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3415     GETCHARINC(c, eptr);
3416 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3417 nigel 87 if ((prop_chartype == ucp_Lu ||
3418     prop_chartype == ucp_Ll ||
3419     prop_chartype == ucp_Lt) == prop_fail_result)
3420     RRETURN(MATCH_NOMATCH);
3421     }
3422 nigel 93 /* Control never gets here */
3423 nigel 87
3424     case PT_GC:
3425     for (fi = min;; fi++)
3426     {
3427 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3428 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3429     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3430     GETCHARINC(c, eptr);
3431 ph10 349 prop_category = UCD_CATEGORY(c);
3432 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3433     RRETURN(MATCH_NOMATCH);
3434     }
3435 nigel 93 /* Control never gets here */
3436 nigel 87
3437     case PT_PC:
3438     for (fi = min;; fi++)
3439     {
3440 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3441 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3442     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3443     GETCHARINC(c, eptr);
3444 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3445 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3446     RRETURN(MATCH_NOMATCH);
3447     }
3448 nigel 93 /* Control never gets here */
3449 nigel 87
3450     case PT_SC:
3451     for (fi = min;; fi++)
3452     {
3453 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3454 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3455     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3456     GETCHARINC(c, eptr);
3457 ph10 349 prop_script = UCD_SCRIPT(c);
3458 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3459     RRETURN(MATCH_NOMATCH);
3460     }
3461 nigel 93 /* Control never gets here */
3462 nigel 87
3463     default:
3464     RRETURN(PCRE_ERROR_INTERNAL);
3465 nigel 77 }
3466     }
3467    
3468     /* Match extended Unicode sequences. We will get here only if the
3469     support is in the binary; otherwise a compile-time error occurs. */
3470    
3471     else if (ctype == OP_EXTUNI)
3472     {
3473     for (fi = min;; fi++)
3474     {
3475 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3476 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3477     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3478     GETCHARINCTEST(c, eptr);
3479 ph10 349 prop_category = UCD_CATEGORY(c);
3480 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3481     while (eptr < md->end_subject)
3482     {
3483     int len = 1;
3484     if (!utf8) c = *eptr; else
3485     {
3486     GETCHARLEN(c, eptr, len);
3487     }
3488 ph10 349 prop_category = UCD_CATEGORY(c);
3489 nigel 77 if (prop_category != ucp_M) break;
3490     eptr += len;
3491     }
3492     }
3493     }
3494    
3495     else
3496     #endif /* SUPPORT_UCP */
3497    
3498     #ifdef SUPPORT_UTF8
3499     /* UTF-8 mode */
3500     if (utf8)
3501     {
3502     for (fi = min;; fi++)
3503     {
3504 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3505 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3506 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3507 ph10 342 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3508 nigel 91 RRETURN(MATCH_NOMATCH);
3509 nigel 77
3510     GETCHARINC(c, eptr);
3511     switch(ctype)
3512     {
3513 ph10 342 case OP_ANY: /* This is the non-NL case */
3514 ph10 345 case OP_ALLANY:
3515 nigel 77 case OP_ANYBYTE:
3516     break;
3517    
3518 nigel 93 case OP_ANYNL:
3519     switch(c)
3520     {
3521     default: RRETURN(MATCH_NOMATCH);
3522     case 0x000d:
3523     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3524     break;
3525     case 0x000a:
3526 ph10 231 break;
3527    
3528 nigel 93 case 0x000b:
3529     case 0x000c:
3530     case 0x0085:
3531     case 0x2028:
3532     case 0x2029:
3533 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3534 nigel 93 break;
3535     }
3536     break;
3537    
3538 ph10 178 case OP_NOT_HSPACE:
3539     switch(c)
3540     {
3541     default: break;
3542     case 0x09: /* HT */
3543     case 0x20: /* SPACE */
3544     case 0xa0: /* NBSP */
3545     case 0x1680: /* OGHAM SPACE MARK */
3546     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3547     case 0x2000: /* EN QUAD */
3548     case 0x2001: /* EM QUAD */
3549     case 0x2002: /* EN SPACE */
3550     case 0x2003: /* EM SPACE */
3551     case 0x2004: /* THREE-PER-EM SPACE */
3552     case 0x2005: /* FOUR-PER-EM SPACE */
3553     case 0x2006: /* SIX-PER-EM SPACE */
3554     case 0x2007: /* FIGURE SPACE */
3555     case 0x2008: /* PUNCTUATION SPACE */
3556     case 0x2009: /* THIN SPACE */
3557     case 0x200A: /* HAIR SPACE */
3558     case 0x202f: /* NARROW NO-BREAK SPACE */
3559     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3560     case 0x3000: /* IDEOGRAPHIC SPACE */
3561     RRETURN(MATCH_NOMATCH);
3562     }
3563     break;
3564    
3565     case OP_HSPACE:
3566     switch(c)
3567     {
3568     default: RRETURN(MATCH_NOMATCH);
3569     case 0x09: /* HT */
3570     case 0x20: /* SPACE */
3571     case 0xa0: /* NBSP */
3572     case 0x1680: /* OGHAM SPACE MARK */
3573     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3574     case 0x2000: /* EN QUAD */
3575     case 0x2001: /* EM QUAD */
3576     case 0x2002: /* EN SPACE */
3577     case 0x2003: /* EM SPACE */
3578     case 0x2004: /* THREE-PER-EM SPACE */
3579     case 0x2005: /* FOUR-PER-EM SPACE */
3580     case 0x2006: /* SIX-PER-EM SPACE */
3581     case 0x2007: /* FIGURE SPACE */
3582     case 0x2008: /* PUNCTUATION SPACE */
3583     case 0x2009: /* THIN SPACE */
3584     case 0x200A: /* HAIR SPACE */
3585     case 0x202f: /* NARROW NO-BREAK SPACE */
3586     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3587     case 0x3000: /* IDEOGRAPHIC SPACE */
3588     break;
3589     }
3590     break;
3591    
3592     case OP_NOT_VSPACE:
3593     switch(c)
3594     {
3595     default: break;
3596     case 0x0a: /* LF */
3597     case 0x0b: /* VT */
3598     case 0x0c: /* FF */
3599     case 0x0d: /* CR */
3600     case 0x85: /* NEL */
3601     case 0x2028: /* LINE SEPARATOR */
3602     case 0x2029: /* PARAGRAPH SEPARATOR */
3603     RRETURN(MATCH_NOMATCH);
3604     }
3605     break;
3606    
3607     case OP_VSPACE:
3608     switch(c)
3609     {
3610     default: RRETURN(MATCH_NOMATCH);
3611     case 0x0a: /* LF */
3612     case 0x0b: /* VT */
3613     case 0x0c: /* FF */
3614     case 0x0d: /* CR */
3615     case 0x85: /* NEL */
3616     case 0x2028: /* LINE SEPARATOR */
3617     case 0x2029: /* PARAGRAPH SEPARATOR */
3618     break;
3619     }
3620     break;
3621    
3622 nigel 77 case OP_NOT_DIGIT:
3623     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3624     RRETURN(MATCH_NOMATCH);
3625     break;
3626    
3627     case OP_DIGIT:
3628     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3629     RRETURN(MATCH_NOMATCH);
3630     break;
3631    
3632     case OP_NOT_WHITESPACE:
3633     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3634     RRETURN(MATCH_NOMATCH);
3635     break;
3636    
3637     case OP_WHITESPACE:
3638     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3639     RRETURN(MATCH_NOMATCH);
3640     break;
3641    
3642     case OP_NOT_WORDCHAR:
3643     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3644     RRETURN(MATCH_NOMATCH);
3645     break;
3646    
3647     case OP_WORDCHAR:
3648     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3649     RRETURN(MATCH_NOMATCH);
3650     break;
3651    
3652     default:
3653     RRETURN(PCRE_ERROR_INTERNAL);
3654     }
3655     }
3656     }
3657     else
3658     #endif
3659     /* Not UTF-8 mode */
3660     {
3661     for (fi = min;; fi++)
3662     {
3663 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3664 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3665 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3666 ph10 342 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3667 nigel 91 RRETURN(MATCH_NOMATCH);
3668    
3669 nigel 77 c = *eptr++;
3670     switch(ctype)
3671     {
3672 ph10 342 case OP_ANY: /* This is the non-NL case */
3673 ph10 345 case OP_ALLANY:
3674 nigel 77 case OP_ANYBYTE:
3675     break;
3676    
3677 nigel 93 case OP_ANYNL:
3678     switch(c)
3679     {
3680     default: RRETURN(MATCH_NOMATCH);
3681     case 0x000d:
3682     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3683     break;
3684 ph10 231
3685 nigel 93 case 0x000a:
3686 ph10 231 break;
3687    
3688 nigel 93 case 0x000b:
3689     case 0x000c:
3690     case 0x0085:
3691 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3692 nigel 93 break;
3693     }
3694     break;
3695    
3696 ph10 178 case OP_NOT_HSPACE:
3697     switch(c)
3698     {
3699     default: break;
3700     case 0x09: /* HT */
3701     case 0x20: /* SPACE */
3702     case 0xa0: /* NBSP */
3703     RRETURN(MATCH_NOMATCH);
3704     }
3705     break;
3706    
3707     case OP_HSPACE:
3708     switch(c)
3709     {
3710     default: RRETURN(MATCH_NOMATCH);
3711     case 0x09: /* HT */
3712     case 0x20: /* SPACE */
3713     case 0xa0: /* NBSP */
3714     break;
3715     }
3716     break;
3717    
3718     case OP_NOT_VSPACE:
3719     switch(c)
3720     {
3721     default: break;
3722     case 0x0a: /* LF */
3723     case 0x0b: /* VT */
3724     case 0x0c: /* FF */
3725     case 0x0d: /* CR */
3726     case 0x85: /* NEL */
3727     RRETURN(MATCH_NOMATCH);
3728     }
3729     break;
3730    
3731     case OP_VSPACE:
3732     switch(c)
3733     {
3734     default: RRETURN(MATCH_NOMATCH);
3735     case 0x0a: /* LF */
3736     case 0x0b: /* VT */
3737     case 0x0c: /* FF */
3738     case 0x0d: /* CR */
3739     case 0x85: /* NEL */
3740     break;
3741     }
3742     break;
3743    
3744 nigel 77 case OP_NOT_DIGIT:
3745     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3746     break;
3747    
3748     case OP_DIGIT:
3749     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3750     break;
3751    
3752     case OP_NOT_WHITESPACE:
3753     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3754     break;
3755    
3756     case OP_WHITESPACE:
3757     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3758     break;
3759    
3760     case OP_NOT_WORDCHAR:
3761     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3762     break;
3763    
3764     case OP_WORDCHAR:
3765     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3766     break;
3767    
3768     default:
3769     RRETURN(PCRE_ERROR_INTERNAL);
3770     }
3771     }
3772     }
3773     /* Control never gets here */
3774     }
3775    
3776 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3777 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3778     UTF-8 and UCP stuff separate. */
3779    
3780     else
3781     {
3782     pp = eptr; /* Remember where we started */
3783    
3784     #ifdef SUPPORT_UCP
3785 nigel 87 if (prop_type >= 0)
3786 nigel 77 {
3787 nigel 87 switch(prop_type)
3788 nigel 77 {
3789 nigel 87 case PT_ANY:
3790     for (i = min; i < max; i++)
3791     {
3792     int len = 1;
3793     if (eptr >= md->end_subject) break;
3794     GETCHARLEN(c, eptr, len);
3795     if (prop_fail_result) break;
3796     eptr+= len;
3797     }
3798     break;
3799    
3800     case PT_LAMP:
3801     for (i = min; i < max; i++)
3802     {
3803     int len = 1;
3804     if (eptr >= md->end_subject) break;
3805     GETCHARLEN(c, eptr, len);
3806 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3807 nigel 87 if ((prop_chartype == ucp_Lu ||
3808     prop_chartype == ucp_Ll ||
3809     prop_chartype == ucp_Lt) == prop_fail_result)
3810     break;
3811     eptr+= len;
3812     }
3813     break;
3814    
3815     case PT_GC:
3816     for (i = min; i < max; i++)
3817     {
3818     int len = 1;
3819     if (eptr >= md->end_subject) break;
3820     GETCHARLEN(c, eptr, len);
3821 ph10 349 prop_category = UCD_CATEGORY(c);
3822 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3823     break;
3824     eptr+= len;
3825     }
3826     break;
3827    
3828     case PT_PC:
3829     for (i = min; i < max; i++)
3830     {
3831     int len = 1;
3832     if (eptr >= md->end_subject) break;
3833     GETCHARLEN(c, eptr, len);
3834 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3835 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3836     break;
3837     eptr+= len;
3838     }
3839     break;
3840    
3841     case PT_SC:
3842     for (i = min; i < max; i++)
3843     {
3844     int len = 1;
3845     if (eptr >= md->end_subject) break;
3846     GETCHARLEN(c, eptr, len);
3847 ph10 349 prop_script = UCD_SCRIPT(c);
3848 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3849     break;
3850     eptr+= len;
3851     }
3852     break;
3853 nigel 77 }
3854    
3855     /* eptr is now past the end of the maximum run */
3856    
3857 nigel 93 if (possessive) continue;
3858 nigel 77 for(;;)
3859     {
3860 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3861 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3862     if (eptr-- == pp) break; /* Stop if tried at original pos */
3863 ph10 207 if (utf8) BACKCHAR(eptr);
3864 nigel 77 }
3865     }
3866    
3867     /* Match extended Unicode sequences. We will get here only if the
3868     support is in the binary; otherwise a compile-time error occurs. */
3869    
3870     else if (ctype == OP_EXTUNI)
3871     {
3872     for (i = min; i < max; i++)
3873     {
3874     if (eptr >= md->end_subject) break;
3875     GETCHARINCTEST(c, eptr);
3876 ph10 349 prop_category = UCD_CATEGORY(c);
3877 nigel 77 if (prop_category == ucp_M) break;
3878     while (eptr < md->end_subject)
3879     {
3880     int len = 1;
3881     if (!utf8) c = *eptr; else
3882     {
3883     GETCHARLEN(c, eptr, len);
3884     }
3885 ph10 349 prop_category = UCD_CATEGORY(c);
3886 nigel 77 if (prop_category != ucp_M) break;
3887     eptr += len;
3888     }
3889     }
3890    
3891     /* eptr is now past the end of the maximum run */
3892    
3893 nigel 93 if (possessive) continue;
3894 nigel 77 for(;;)
3895     {
3896 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3897 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3898     if (eptr-- == pp) break; /* Stop if tried at original pos */
3899     for (;;) /* Move back over one extended */
3900     {
3901     int len = 1;
3902     if (!utf8) c = *eptr; else
3903     {
3904 ph10 207 BACKCHAR(eptr);
3905 nigel 77 GETCHARLEN(c, eptr, len);
3906     }
3907 ph10 349 prop_category = UCD_CATEGORY(c);
3908 nigel 77 if (prop_category != ucp_M) break;
3909     eptr--;
3910     }
3911     }
3912     }
3913    
3914     else
3915     #endif /* SUPPORT_UCP */
3916    
3917     #ifdef SUPPORT_UTF8
3918     /* UTF-8 mode */
3919    
3920     if (utf8)
3921     {
3922     switch(ctype)
3923     {
3924     case OP_ANY:
3925     if (max < INT_MAX)
3926     {
3927 ph10 342 for (i = min; i < max; i++)
3928 nigel 77 {
3929 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3930     eptr++;
3931     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3932 nigel 77 }
3933     }
3934    
3935     /* Handle unlimited UTF-8 repeat */
3936    
3937     else
3938     {
3939 ph10 342 for (i = min; i < max; i++)
3940 nigel 77 {
3941 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3942     eptr++;
3943     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3944 nigel 77 }
3945     }
3946     break;
3947    
3948 ph10 341 case OP_ALLANY:
3949     if (max < INT_MAX)
3950     {
3951     for (i = min; i < max; i++)
3952     {
3953     if (eptr >= md->end_subject) break;
3954     eptr++;
3955     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3956     }
3957     }
3958     else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3959     break;
3960    
3961 nigel 77 /* The byte case is the same as non-UTF8 */
3962    
3963     case OP_ANYBYTE:
3964     c = max - min;
3965 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3966     c = md->end_subject - eptr;
3967 nigel 77 eptr += c;
3968     break;
3969    
3970 nigel 93 case OP_ANYNL:
3971     for (i = min; i < max; i++)
3972     {
3973     int len = 1;
3974     if (eptr >= md->end_subject) break;
3975     GETCHARLEN(c, eptr, len);
3976     if (c == 0x000d)
3977     {
3978     if (++eptr >= md->end_subject) break;
3979     if (*eptr == 0x000a) eptr++;
3980     }
3981     else
3982     {
3983 ph10 231 if (c != 0x000a &&
3984     (md->bsr_anycrlf ||
3985     (c != 0x000b && c != 0x000c &&
3986     c != 0x0085 && c != 0x2028 && c != 0x2029)))
3987 nigel 93 break;
3988     eptr += len;
3989     }
3990     }
3991     break;
3992    
3993 ph10 178 case OP_NOT_HSPACE:
3994 ph10 182 case OP_HSPACE:
3995 ph10 178 for (i = min; i < max; i++)
3996     {
3997 ph10 182 BOOL gotspace;
3998 ph10 178 int len = 1;
3999     if (eptr >= md->end_subject) break;
4000     GETCHARLEN(c, eptr, len);
4001     switch(c)
4002 ph10 182 {
4003     default: gotspace = FALSE; break;
4004 ph10 178 case 0x09: /* HT */
4005     case 0x20: /* SPACE */
4006     case 0xa0: /* NBSP */
4007     case 0x1680: /* OGHAM SPACE MARK */
4008     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4009     case 0x2000: /* EN QUAD */
4010     case 0x2001: /* EM QUAD */
4011     case 0x2002: /* EN SPACE */
4012     case 0x2003: /* EM SPACE */
4013     case 0x2004: /* THREE-PER-EM SPACE */
4014     case 0x2005: /* FOUR-PER-EM SPACE */
4015     case 0x2006: /* SIX-PER-EM SPACE */
4016     case 0x2007: /* FIGURE SPACE */
4017     case 0x2008: /* PUNCTUATION SPACE */
4018     case 0x2009: /* THIN SPACE */
4019     case 0x200A: /* HAIR SPACE */
4020     case 0x202f: /* NARROW NO-BREAK SPACE */
4021     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4022     case 0x3000: /* IDEOGRAPHIC SPACE */
4023     gotspace = TRUE;
4024 ph10 182 break;
4025 ph10 178 }
4026     if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4027     eptr += len;
4028     }
4029     break;
4030    
4031     case OP_NOT_VSPACE:
4032 ph10 182 case OP_VSPACE:
4033 ph10 178 for (i = min; i < max; i++)
4034     {
4035 ph10 182 BOOL gotspace;
4036 ph10 178 int len = 1;
4037     if (eptr >= md->end_subject) break;
4038     GETCHARLEN(c, eptr, len);
4039     switch(c)
4040     {
4041 ph10 182 default: gotspace = FALSE; break;
4042 ph10 178 case 0x0a: /* LF */
4043     case 0x0b: /* VT */
4044     case 0x0c: /* FF */
4045     case 0x0d: /* CR */
4046     case 0x85: /* NEL */
4047     case 0x2028: /* LINE SEPARATOR */
4048     case 0x2029: /* PARAGRAPH SEPARATOR */
4049     gotspace = TRUE;
4050     break;
4051     }
4052 ph10 182 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4053 ph10 178 eptr += len;
4054     }
4055     break;
4056    
4057 nigel 77 case OP_NOT_DIGIT:
4058     for (i = min; i < max; i++)
4059     {
4060     int len = 1;
4061     if (eptr >= md->end_subject) break;
4062     GETCHARLEN(c, eptr, len);
4063     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4064     eptr+= len;
4065     }
4066     break;
4067    
4068     case OP_DIGIT:
4069     for (i = min; i < max; i++)
4070     {
4071     int len = 1;
4072     if (eptr >= md->end_subject) break;
4073     GETCHARLEN(c, eptr, len);
4074