/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 406 - (hide annotations) (download)
Mon Mar 23 12:05:43 2009 UTC (5 years, 9 months ago) by ph10
File MIME type: text/plain
File size: 153106 byte(s)
Trailing space tidies

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 381 Copyright (c) 1997-2009 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 nigel 77 /* Flag bits for the match() function */
61    
#define match_condassert     0x01  /* Called to check a condition assertion */
#define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns from the match() function. They are made
sufficiently negative that they cannot collide with the external error
codes. */

#define MATCH_COMMIT       (-999)
#define MATCH_PRUNE        (-998)
#define MATCH_SKIP         (-997)
#define MATCH_THEN         (-996)

/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX 30
84    
85     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86    
/* The two tables are parallel: entry k of rep_min and entry k of rep_max give
the minimum and maximum repeat counts for the same repeat kind. A maximum of 0
stands for "no upper limit". */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89    
90    
91    
92     #ifdef DEBUG
93     /*************************************************
94     * Debugging function to print chars *
95     *************************************************/
96    
97     /* Print a sequence of chars in printable format, stopping at the end of the
98     subject if requested.
99    
100     Arguments:
101     p points to characters
102     length number to print
103     is_subject TRUE if printing from within md->start_subject
104     md pointer to matching data block, if is_subject is TRUE
105    
106     Returns: nothing
107     */
108    
109     static void
110     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111     {
112 nigel 93 unsigned int c;
113 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114     while (length-- > 0)
115     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116     }
117     #endif
118    
119    
120    
121     /*************************************************
122     * Match a back-reference *
123     *************************************************/
124    
125     /* If a back reference hasn't been set, the length that is passed is greater
126     than the number of characters left in the string, so the match fails.
127    
128     Arguments:
129     offset index into the offset vector
130     eptr points into the subject
131     length length to be matched
132     md points to match data block
133     ims the ims flags
134    
135     Returns: TRUE if matched
136     */
137    
138     static BOOL
139 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 nigel 77 unsigned long int ims)
141     {
142 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
143 nigel 77
144     #ifdef DEBUG
145     if (eptr >= md->end_subject)
146     printf("matching subject <null>");
147     else
148     {
149     printf("matching subject ");
150     pchars(eptr, length, TRUE, md);
151     }
152     printf(" against backref ");
153     pchars(p, length, FALSE, md);
154     printf("\n");
155     #endif
156    
157     /* Always fail if not enough characters left */
158    
159     if (length > md->end_subject - eptr) return FALSE;
160    
161 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162     properly if Unicode properties are supported. Otherwise, we can check only
163     ASCII characters. */
164 nigel 77
165     if ((ims & PCRE_CASELESS) != 0)
166     {
167 ph10 354 #ifdef SUPPORT_UTF8
168     #ifdef SUPPORT_UCP
169     if (md->utf8)
170     {
171 ph10 358 USPTR endptr = eptr + length;
172 ph10 354 while (eptr < endptr)
173     {
174 ph10 358 int c, d;
175 ph10 354 GETCHARINC(c, eptr);
176     GETCHARINC(d, p);
177     if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 ph10 358 }
179     }
180 ph10 354 else
181     #endif
182     #endif
183    
184     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185     is no UCP support. */
186 ph10 358
187 nigel 77 while (length-- > 0)
188 ph10 354 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 nigel 77 }
190 ph10 358
191 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
192     are in UTF-8 mode. */
193 ph10 358
194 nigel 77 else
195     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196    
197     return TRUE;
198     }
199    
200    
201    
202     /***************************************************************************
203     ****************************************************************************
204     RECURSION IN THE match() FUNCTION
205    
206 nigel 87 The match() function is highly recursive, though not every recursive call
207     increases the recursive depth. Nevertheless, some regular expressions can cause
208     it to recurse to a great depth. I was writing for Unix, so I just let it call
209     itself recursively. This uses the stack for saving everything that has to be
210     saved for a recursive call. On Unix, the stack can be large, and this works
211     fine.
212 nigel 77
213 nigel 87 It turns out that on some non-Unix-like systems there are problems with
214     programs that use a lot of stack. (This despite the fact that every last chip
215     has oodles of memory these days, and techniques for extending the stack have
216     been known for decades.) So....
217 nigel 77
218     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219     calls by keeping local variables that need to be preserved in blocks of memory
220 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
221 nigel 77 achieve this so that the actual code doesn't look very different to what it
222     always used to.
223 ph10 164
224 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
225 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
226     Switzer, the use of longjmp() has been abolished, at the cost of having to
227     provide a unique number for each call to RMATCH. There is no way of generating
228     a sequence of numbers at compile time in C. I have given them names, to make
229     them stand out more clearly.
230    
231     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
234     don't have indeterminate values; this has meant that the frame size can be
235 ph10 164 reduced because the result can be "passed back" by straight setting of the
236     variable instead of being passed in the frame.
237 nigel 77 ****************************************************************************
238     ***************************************************************************/
239    
240 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241     below must be updated in sync. */
242 nigel 77
/* One distinct identifier per RMATCH call site, numbered consecutively from 1
(RMn has the value n). */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10,    RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18,
  RM19,    RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27,
  RM28,    RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36,
  RM37,    RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45,
  RM46,    RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54
};
249 ph10 164
250 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
251 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 ph10 164 actually used in this definition. */
253 nigel 77
254     #ifndef NO_RECURSE
255     #define REGISTER register
256 ph10 164
/* RMATCH stores the result of a genuinely recursive call to match() in rrc,
passing the caller's mstart unchanged and rdepth+1 as the depth. RRETURN
returns its argument from match(). The DEBUG variants additionally trace the
call and the return with printf(). The multi-statement DEBUG forms are wrapped
in do { } while (0) so that, like the single-statement production forms, they
behave as one statement when followed by a semicolon (for example as the body
of an "if" that has an "else"). */

#ifdef DEBUG
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  do \
    { \
    printf("match() called in line %d\n", __LINE__); \
    rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
    printf("to line %d\n", __LINE__); \
    } \
  while (0)
#define RRETURN(ra) \
  do \
    { \
    printf("match() returned %d from line %d ", ra, __LINE__); \
    return ra; \
    } \
  while (0)
#else
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra
#endif
274    
275 nigel 77 #else
276    
277    
278 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
279     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280     argument of match(), which never changes. */
281 nigel 77
282     #define REGISTER
283    
/* RMATCH: simulate a recursive call. A new frame is obtained from
pcre_stack_malloc; the resume point rw is recorded in the current frame; the
"arguments" are copied into the new frame, which is linked back to the current
one and made current; control then jumps to HEAP_RECURSE. The label L_<rw> is
where execution resumes when the simulated call returns. If the allocation
fails, the current incarnation returns PCRE_ERROR_NOMEMORY instead of writing
through a null pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN: simulate a return. The current frame is released with
pcre_stack_free and its predecessor becomes current. If there is a
predecessor, the result is left in rrc and control jumps to HEAP_RETURN;
otherwise this was the top-level frame and match() really returns. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
316    
317    
318     /* Structure for remembering the local variables in a private frame */
319    
320     typedef struct heapframe {
321     struct heapframe *Xprevframe;
322    
323     /* Function arguments that may change */
324    
325     const uschar *Xeptr;
326     const uschar *Xecode;
327 ph10 172 const uschar *Xmstart;
328 nigel 77 int Xoffset_top;
329     long int Xims;
330     eptrblock *Xeptrb;
331     int Xflags;
332 nigel 91 unsigned int Xrdepth;
333 nigel 77
334     /* Function local variables */
335    
336     const uschar *Xcallpat;
337 ph10 406 #ifdef SUPPORT_UTF8
338 nigel 77 const uschar *Xcharptr;
339 ph10 406 #endif
340 nigel 77 const uschar *Xdata;
341     const uschar *Xnext;
342     const uschar *Xpp;
343     const uschar *Xprev;
344     const uschar *Xsaved_eptr;
345    
346     recursion_info Xnew_recursive;
347    
348     BOOL Xcur_is_word;
349     BOOL Xcondition;
350     BOOL Xprev_is_word;
351    
352     unsigned long int Xoriginal_ims;
353    
354     #ifdef SUPPORT_UCP
355     int Xprop_type;
356 nigel 87 int Xprop_value;
357 nigel 77 int Xprop_fail_result;
358     int Xprop_category;
359     int Xprop_chartype;
360 nigel 87 int Xprop_script;
361 ph10 123 int Xoclength;
362     uschar Xocchars[8];
363 nigel 77 #endif
364    
365 ph10 403 int Xcodelink;
366 nigel 77 int Xctype;
367 nigel 93 unsigned int Xfc;
368 nigel 77 int Xfi;
369     int Xlength;
370     int Xmax;
371     int Xmin;
372     int Xnumber;
373     int Xoffset;
374     int Xop;
375     int Xsave_capture_last;
376     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
377     int Xstacksave[REC_STACK_SAVE_MAX];
378    
379     eptrblock Xnewptrb;
380    
381 ph10 164 /* Where to jump back to */
382 nigel 77
383 ph10 164 int Xwhere;
384 ph10 165
385 nigel 77 } heapframe;
386    
387     #endif
388    
389    
390     /***************************************************************************
391     ***************************************************************************/
392    
393    
394    
395     /*************************************************
396     * Match from current position *
397     *************************************************/
398    
399 nigel 93 /* This function is called recursively in many circumstances. Whenever it
400 nigel 77 returns a negative (error) response, the outer incarnation must also return the
401     same response.
402    
403     Performance note: It might be tempting to extract commonly used fields from the
404     md structure (e.g. utf8, end_subject) into individual variables to improve
405     performance. Tests using gcc on a SPARC disproved this; in the first case, it
406     made performance worse.
407    
408     Arguments:
409 nigel 93 eptr pointer to current character in subject
410     ecode pointer to current position in compiled code
411 ph10 168 mstart pointer to the current match start position (can be modified
412 ph10 172 by encountering \K)
413 nigel 77 offset_top current top pointer
414     md pointer to "static" info for the match
415     ims current /i, /m, and /s options
416     eptrb pointer to chain of blocks containing eptr at start of
417     brackets - for testing for empty matches
418     flags can contain
419     match_condassert - this is an assertion condition
420 nigel 93 match_cbegroup - this is the start of an unlimited repeat
421     group that can match an empty string
422 nigel 87 rdepth the recursion depth
423 nigel 77
424     Returns: MATCH_MATCH if matched ) these values are >= 0
425     MATCH_NOMATCH if failed to match )
426     a negative PCRE_ERROR_xxx value if aborted by an error condition
427 nigel 87 (e.g. stopped by repeated call or recursion limit)
428 nigel 77 */
429    
430     static int
431 ph10 172 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
432 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
433 nigel 91 int flags, unsigned int rdepth)
434 nigel 77 {
435     /* These variables do not need to be preserved over recursion in this function,
436 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
437     "register" because they are used a lot in loops. */
438 nigel 77
439 nigel 91 register int rrc; /* Returns from recursive calls */
440     register int i; /* Used for loops not involving calls to RMATCH() */
441 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
442 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
443 nigel 77
444 nigel 93 BOOL minimize, possessive; /* Quantifier options */
445 ph10 403 int condcode;
446 nigel 93
447 nigel 77 /* When recursion is not being used, all "local" variables that have to be
448     preserved over calls to RMATCH() are part of a "frame" which is obtained from
449     heap storage. Set up the top-level frame here; others are obtained from the
450     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
451    
452     #ifdef NO_RECURSE
453     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
454     frame->Xprevframe = NULL; /* Marks the top level */
455    
456     /* Copy in the original argument variables */
457    
458     frame->Xeptr = eptr;
459     frame->Xecode = ecode;
460 ph10 168 frame->Xmstart = mstart;
461 nigel 77 frame->Xoffset_top = offset_top;
462     frame->Xims = ims;
463     frame->Xeptrb = eptrb;
464     frame->Xflags = flags;
465 nigel 87 frame->Xrdepth = rdepth;
466 nigel 77
467     /* This is where control jumps back to in order to effect "recursion" */
468    
469     HEAP_RECURSE:
470    
471     /* Macros make the argument variables come from the current frame */
472    
473     #define eptr frame->Xeptr
474     #define ecode frame->Xecode
475 ph10 168 #define mstart frame->Xmstart
476 nigel 77 #define offset_top frame->Xoffset_top
477     #define ims frame->Xims
478     #define eptrb frame->Xeptrb
479     #define flags frame->Xflags
480 nigel 87 #define rdepth frame->Xrdepth
481 nigel 77
482     /* Ditto for the local variables */
483    
484     #ifdef SUPPORT_UTF8
485     #define charptr frame->Xcharptr
486     #endif
487     #define callpat frame->Xcallpat
488 ph10 403 #define codelink frame->Xcodelink
489 nigel 77 #define data frame->Xdata
490     #define next frame->Xnext
491     #define pp frame->Xpp
492     #define prev frame->Xprev
493     #define saved_eptr frame->Xsaved_eptr
494    
495     #define new_recursive frame->Xnew_recursive
496    
497     #define cur_is_word frame->Xcur_is_word
498     #define condition frame->Xcondition
499     #define prev_is_word frame->Xprev_is_word
500    
501     #define original_ims frame->Xoriginal_ims
502    
503     #ifdef SUPPORT_UCP
504     #define prop_type frame->Xprop_type
505 nigel 87 #define prop_value frame->Xprop_value
506 nigel 77 #define prop_fail_result frame->Xprop_fail_result
507     #define prop_category frame->Xprop_category
508     #define prop_chartype frame->Xprop_chartype
509 nigel 87 #define prop_script frame->Xprop_script
510 ph10 115 #define oclength frame->Xoclength
511     #define occhars frame->Xocchars
512 nigel 77 #endif
513    
514     #define ctype frame->Xctype
515     #define fc frame->Xfc
516     #define fi frame->Xfi
517     #define length frame->Xlength
518     #define max frame->Xmax
519     #define min frame->Xmin
520     #define number frame->Xnumber
521     #define offset frame->Xoffset
522     #define op frame->Xop
523     #define save_capture_last frame->Xsave_capture_last
524     #define save_offset1 frame->Xsave_offset1
525     #define save_offset2 frame->Xsave_offset2
526     #define save_offset3 frame->Xsave_offset3
527     #define stacksave frame->Xstacksave
528    
529     #define newptrb frame->Xnewptrb
530    
531     /* When recursion is being used, local variables are allocated on the stack and
532     get preserved during recursion in the normal way. In this environment, fi and
533     i, and fc and c, can be the same variables. */
534    
535 nigel 93 #else /* NO_RECURSE not defined */
536 nigel 77 #define fi i
537     #define fc c
538    
539    
540 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
541     const uschar *charptr; /* in small blocks of the code. My normal */
542     #endif /* style of coding would have declared */
543     const uschar *callpat; /* them within each of those blocks. */
544     const uschar *data; /* However, in order to accommodate the */
545     const uschar *next; /* version of this code that uses an */
546     USPTR pp; /* external "stack" implemented on the */
547     const uschar *prev; /* heap, it is easier to declare them all */
548     USPTR saved_eptr; /* here, so the declarations can be cut */
549     /* out in a block. The only declarations */
550     recursion_info new_recursive; /* within blocks below are for variables */
551     /* that do not have to be preserved over */
552     BOOL cur_is_word; /* a recursive call to RMATCH(). */
553     BOOL condition;
554 nigel 77 BOOL prev_is_word;
555    
556     unsigned long int original_ims;
557    
558     #ifdef SUPPORT_UCP
559     int prop_type;
560 nigel 87 int prop_value;
561 nigel 77 int prop_fail_result;
562     int prop_category;
563     int prop_chartype;
564 nigel 87 int prop_script;
565 ph10 115 int oclength;
566     uschar occhars[8];
567 nigel 77 #endif
568    
569 ph10 399 int codelink;
570 nigel 77 int ctype;
571     int length;
572     int max;
573     int min;
574     int number;
575     int offset;
576     int op;
577     int save_capture_last;
578     int save_offset1, save_offset2, save_offset3;
579     int stacksave[REC_STACK_SAVE_MAX];
580    
581     eptrblock newptrb;
582 nigel 93 #endif /* NO_RECURSE */
583 nigel 77
584     /* These statements are here to stop the compiler complaining about uninitialized
585     variables. */
586    
587     #ifdef SUPPORT_UCP
588 nigel 87 prop_value = 0;
589 nigel 77 prop_fail_result = 0;
590     #endif
591    
592 nigel 93
593 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
594     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
595     used. Thanks to Ian Taylor for noticing this possibility and sending the
596     original patch. */
597    
598     TAIL_RECURSE:
599    
600 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
601     are specified by the macro RMATCH and RRETURN is used to return. When
602     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
603     and a "return", respectively (possibly with some debugging if DEBUG is
604     defined). However, RMATCH isn't like a function call because it's quite a
605     complicated macro. It has to be used in one particular way. This shouldn't,
606     however, impact performance when true recursion is being used. */
607 nigel 77
608 ph10 164 #ifdef SUPPORT_UTF8
609     utf8 = md->utf8; /* Local copy of the flag */
610     #else
611     utf8 = FALSE;
612     #endif
613    
614 nigel 87 /* First check that we haven't called match() too many times, or that we
615     haven't exceeded the recursive call limit. */
616    
617 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
618 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
619 nigel 77
620     original_ims = ims; /* Save for resetting on ')' */
621 nigel 91
622 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
623     string, the match_cbegroup flag is set. When this is the case, add the current
624     subject pointer to the chain of such remembered pointers, to be checked when we
625     hit the closing ket, in order to break infinite loops that match no characters.
626 ph10 197 When match() is called in other circumstances, don't add to the chain. The
627     match_cbegroup flag must NOT be used with tail recursion, because the memory
628     block that is used is on the stack, so a new one may be required for each
629     match(). */
630 nigel 77
631 nigel 93 if ((flags & match_cbegroup) != 0)
632 nigel 77 {
633 ph10 197 newptrb.epb_saved_eptr = eptr;
634     newptrb.epb_prev = eptrb;
635     eptrb = &newptrb;
636 nigel 77 }
637    
638 nigel 93 /* Now start processing the opcodes. */
639 nigel 77
640     for (;;)
641     {
642 nigel 93 minimize = possessive = FALSE;
643 nigel 77 op = *ecode;
644 ph10 406
645 nigel 77 /* For partial matching, remember if we ever hit the end of the subject after
646     matching at least one subject character. */
647    
648     if (md->partial &&
649     eptr >= md->end_subject &&
650 ph10 168 eptr > mstart)
651 nigel 77 md->hitend = TRUE;
652 ph10 208
653 nigel 93 switch(op)
654     {
655 ph10 210 case OP_FAIL:
656 ph10 212 RRETURN(MATCH_NOMATCH);
657 ph10 211
658 ph10 210 case OP_PRUNE:
659     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
660     ims, eptrb, flags, RM51);
661     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
662 ph10 212 RRETURN(MATCH_PRUNE);
663 ph10 211
664 ph10 210 case OP_COMMIT:
665     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
666     ims, eptrb, flags, RM52);
667     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
668 ph10 212 RRETURN(MATCH_COMMIT);
669 ph10 211
670 ph10 210 case OP_SKIP:
671     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
672     ims, eptrb, flags, RM53);
673     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
674 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
675 ph10 212 RRETURN(MATCH_SKIP);
676 ph10 211
677 ph10 210 case OP_THEN:
678     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
679 ph10 212 ims, eptrb, flags, RM54);
680 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
681 ph10 212 RRETURN(MATCH_THEN);
682 ph10 211
683 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
684     the current subject position in the working slot at the top of the vector.
685     We mustn't change the current values of the data slot, because they may be
686     set from a previous iteration of this group, and be referred to by a
687     reference inside the group.
688 nigel 77
689 nigel 93 If the bracket fails to match, we need to restore this value and also the
690     values of the final offsets, in case they were set by a previous iteration
691     of the same bracket.
692 nigel 77
693 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
694     a non-capturing bracket. Don't worry about setting the flag for the error
695     case here; that is handled in the code for KET. */
696 nigel 77
697 nigel 93 case OP_CBRA:
698     case OP_SCBRA:
699     number = GET2(ecode, 1+LINK_SIZE);
700 nigel 77 offset = number << 1;
701    
702     #ifdef DEBUG
703 nigel 93 printf("start bracket %d\n", number);
704     printf("subject=");
705 nigel 77 pchars(eptr, 16, TRUE, md);
706     printf("\n");
707     #endif
708    
709     if (offset < md->offset_max)
710     {
711     save_offset1 = md->offset_vector[offset];
712     save_offset2 = md->offset_vector[offset+1];
713     save_offset3 = md->offset_vector[md->offset_end - number];
714     save_capture_last = md->capture_last;
715    
716     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
717     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
718    
719 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
720 nigel 77 do
721     {
722 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
723     ims, eptrb, flags, RM1);
724 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
725 nigel 77 md->capture_last = save_capture_last;
726     ecode += GET(ecode, 1);
727     }
728     while (*ecode == OP_ALT);
729    
730     DPRINTF(("bracket %d failed\n", number));
731    
732     md->offset_vector[offset] = save_offset1;
733     md->offset_vector[offset+1] = save_offset2;
734     md->offset_vector[md->offset_end - number] = save_offset3;
735    
736     RRETURN(MATCH_NOMATCH);
737     }
738    
739 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
740     as a non-capturing bracket. */
741 nigel 77
742 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
743     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
744    
745 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
746 nigel 77
747 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
748     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
749    
750 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
751     final alternative within the brackets, we would return the result of a
752     recursive call to match() whatever happened. We can reduce stack usage by
753 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
754     is set.*/
755 nigel 77
756 nigel 93 case OP_BRA:
757     case OP_SBRA:
758     DPRINTF(("start non-capturing bracket\n"));
759     flags = (op >= OP_SBRA)? match_cbegroup : 0;
760 nigel 91 for (;;)
761 nigel 77 {
762 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
763 nigel 93 {
764 ph10 197 if (flags == 0) /* Not a possibly empty group */
765     {
766     ecode += _pcre_OP_lengths[*ecode];
767     DPRINTF(("bracket 0 tail recursion\n"));
768     goto TAIL_RECURSE;
769     }
770    
771     /* Possibly empty group; can't use tail recursion. */
772    
773     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
774     eptrb, flags, RM48);
775     RRETURN(rrc);
776 nigel 93 }
777 nigel 91
778     /* For non-final alternatives, continue the loop for a NOMATCH result;
779     otherwise return. */
780    
781 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
782     eptrb, flags, RM2);
783 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
784 nigel 77 ecode += GET(ecode, 1);
785     }
786 nigel 91 /* Control never reaches here. */
787 nigel 77
788     /* Conditional group: compilation checked that there are no more than
789     two branches. If the condition is false, skipping the first branch takes us
790     past the end if there is only one branch, but that's OK because that is
791 nigel 91 exactly what going to the ket would do. As there is only one branch to be
792     obeyed, we can use tail recursion to avoid using another stack frame. */
793 nigel 77
794     case OP_COND:
795 nigel 93 case OP_SCOND:
796 ph10 399 codelink= GET(ecode, 1);
797 ph10 406
798 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
799     inserted between OP_COND and an assertion condition. */
800 ph10 392
801 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
802     {
803     if (pcre_callout != NULL)
804     {
805     pcre_callout_block cb;
806     cb.version = 1; /* Version 1 of the callout block */
807     cb.callout_number = ecode[LINK_SIZE+2];
808     cb.offset_vector = md->offset_vector;
809     cb.subject = (PCRE_SPTR)md->start_subject;
810     cb.subject_length = md->end_subject - md->start_subject;
811     cb.start_match = mstart - md->start_subject;
812     cb.current_position = eptr - md->start_subject;
813     cb.pattern_position = GET(ecode, LINK_SIZE + 3);
814     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
815     cb.capture_top = offset_top/2;
816     cb.capture_last = md->capture_last;
817     cb.callout_data = md->callout_data;
818     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
819     if (rrc < 0) RRETURN(rrc);
820     }
821     ecode += _pcre_OP_lengths[OP_CALLOUT];
822     }
823 ph10 392
824 ph10 399 condcode = ecode[LINK_SIZE+1];
825 ph10 406
826 ph10 381 /* Now see what the actual condition is */
827 ph10 392
828 ph10 399 if (condcode == OP_RREF) /* Recursion test */
829 nigel 77 {
830 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
831     condition = md->recursive != NULL &&
832     (offset == RREF_ANY || offset == md->recursive->group_num);
833     ecode += condition? 3 : GET(ecode, 1);
834     }
835    
836 ph10 399 else if (condcode == OP_CREF) /* Group used test */
837 nigel 93 {
838 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
839 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
840     ecode += condition? 3 : GET(ecode, 1);
841 nigel 77 }
842    
843 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
844 nigel 93 {
845     condition = FALSE;
846     ecode += GET(ecode, 1);
847     }
848    
849 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
850 nigel 93 the flags argument to match_condassert causes it to stop at the end of an
851     assertion. */
852 nigel 77
853     else
854     {
855 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
856     match_condassert, RM3);
857 nigel 77 if (rrc == MATCH_MATCH)
858     {
859 nigel 93 condition = TRUE;
860     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
861 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
862     }
863 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
864 nigel 77 {
865     RRETURN(rrc); /* Need braces because of following else */
866     }
867 nigel 93 else
868     {
869     condition = FALSE;
870 ph10 399 ecode += codelink;
871 nigel 93 }
872     }
873 nigel 91
874 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
875 ph10 197 we can use tail recursion to avoid using another stack frame, except when
876     match_cbegroup is required for an unlimited repeat of a possibly empty
877     group. If the second alternative doesn't exist, we can just plough on. */
878 nigel 91
879 nigel 93 if (condition || *ecode == OP_ALT)
880     {
881 nigel 91 ecode += 1 + LINK_SIZE;
882 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
883     {
884     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
885     RRETURN(rrc);
886     }
887     else /* Group must match something */
888     {
889     flags = 0;
890     goto TAIL_RECURSE;
891     }
892 nigel 77 }
893 ph10 395 else /* Condition false & no alternative */
894 nigel 93 {
895     ecode += 1 + LINK_SIZE;
896     }
897     break;
898 nigel 77
899    
900 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
901     recursion, we should restore the offsets appropriately and continue from
902     after the call. */
903 nigel 77
904 ph10 210 case OP_ACCEPT:
905 nigel 77 case OP_END:
906     if (md->recursive != NULL && md->recursive->group_num == 0)
907     {
908     recursion_info *rec = md->recursive;
909 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
910 nigel 77 md->recursive = rec->prevrec;
911     memmove(md->offset_vector, rec->offset_save,
912     rec->saved_max * sizeof(int));
913 ph10 168 mstart = rec->save_start;
914 nigel 77 ims = original_ims;
915     ecode = rec->after_call;
916     break;
917     }
918    
919     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
920     string - backtracking will then try other alternatives, if any. */
921    
922 ph10 168 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
923     md->end_match_ptr = eptr; /* Record where we ended */
924     md->end_offset_top = offset_top; /* and how many extracts were taken */
925 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
926 nigel 77 RRETURN(MATCH_MATCH);
927    
928     /* Change option settings */
929    
930     case OP_OPT:
931     ims = ecode[1];
932     ecode += 2;
933     DPRINTF(("ims set to %02lx\n", ims));
934     break;
935    
936     /* Assertion brackets. Check the alternative branches in turn - the
937     matching won't pass the KET for an assertion. If any one branch matches,
938     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
939     start of each branch to move the current point backwards, so the code at
940     this level is identical to the lookahead case. */
941    
942     case OP_ASSERT:
943     case OP_ASSERTBACK:
944     do
945     {
946 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
947     RM4);
948 nigel 77 if (rrc == MATCH_MATCH) break;
949 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
950 nigel 77 ecode += GET(ecode, 1);
951     }
952     while (*ecode == OP_ALT);
953     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
954    
955     /* If checking an assertion for a condition, return MATCH_MATCH. */
956    
957     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
958    
959     /* Continue from after the assertion, updating the offsets high water
960     mark, since extracts may have been taken during the assertion. */
961    
962     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
963     ecode += 1 + LINK_SIZE;
964     offset_top = md->end_offset_top;
965     continue;
966    
967     /* Negative assertion: all branches must fail to match */
968    
969     case OP_ASSERT_NOT:
970     case OP_ASSERTBACK_NOT:
971     do
972     {
973 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
974     RM5);
975 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
976 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
977 nigel 77 ecode += GET(ecode,1);
978     }
979     while (*ecode == OP_ALT);
980    
981     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
982    
983     ecode += 1 + LINK_SIZE;
984     continue;
985    
986     /* Move the subject pointer back. This occurs only at the start of
987     each branch of a lookbehind assertion. If we are too close to the start to
988     move back, this match function fails. When working with UTF-8 we move
989     back a number of characters, not bytes. */
990    
991     case OP_REVERSE:
992     #ifdef SUPPORT_UTF8
993     if (utf8)
994     {
995 nigel 93 i = GET(ecode, 1);
996     while (i-- > 0)
997 nigel 77 {
998     eptr--;
999     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1000 ph10 207 BACKCHAR(eptr);
1001 nigel 77 }
1002     }
1003     else
1004     #endif
1005    
1006     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1007    
1008     {
1009 nigel 93 eptr -= GET(ecode, 1);
1010 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1011     }
1012    
1013     /* Skip to next op code */
1014    
1015     ecode += 1 + LINK_SIZE;
1016     break;
1017    
1018     /* The callout item calls an external function, if one is provided, passing
1019     details of the match so far. This is mainly for debugging, though the
1020     function is able to force a failure. */
1021    
1022     case OP_CALLOUT:
1023     if (pcre_callout != NULL)
1024     {
1025     pcre_callout_block cb;
1026     cb.version = 1; /* Version 1 of the callout block */
1027     cb.callout_number = ecode[1];
1028     cb.offset_vector = md->offset_vector;
1029 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1030 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
1031 ph10 168 cb.start_match = mstart - md->start_subject;
1032 nigel 77 cb.current_position = eptr - md->start_subject;
1033     cb.pattern_position = GET(ecode, 2);
1034     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1035     cb.capture_top = offset_top/2;
1036     cb.capture_last = md->capture_last;
1037     cb.callout_data = md->callout_data;
1038     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1039     if (rrc < 0) RRETURN(rrc);
1040     }
1041     ecode += 2 + 2*LINK_SIZE;
1042     break;
1043    
1044     /* Recursion either matches the current regex, or some subexpression. The
1045     offset data is the offset to the starting bracket from the start of the
1046     whole pattern. (This is so that it works from duplicated subpatterns.)
1047    
1048     If there are any capturing brackets started but not finished, we have to
1049     save their starting points and reinstate them after the recursion. However,
1050     we don't know how many such there are (offset_top records the completed
1051     total) so we just have to save all the potential data. There may be up to
1052     65535 such values, which is too large to put on the stack, but using malloc
1053     for small numbers seems expensive. As a compromise, the stack is used when
1054     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1055     is used. If the malloc fails, the saved offsets cannot be kept, so the
1056     recursion is not attempted: the error code PCRE_ERROR_NOMEMORY is returned
1057     from this level straight away.
1058    
1059     There are also other values that have to be saved. We use a chained
1060     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1061     for the original version of this logic. */
1062    
1063     case OP_RECURSE:
1064     {
1065     callpat = md->start_code + GET(ecode, 1);
1066 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1067     GET2(callpat, 1 + LINK_SIZE);
1068 nigel 77
1069     /* Add to "recursing stack" */
1070    
1071     new_recursive.prevrec = md->recursive;
1072     md->recursive = &new_recursive;
1073    
1074     /* Find where to continue from afterwards */
1075    
1076     ecode += 1 + LINK_SIZE;
1077     new_recursive.after_call = ecode;
1078    
1079     /* Now save the offset data. */
1080    
1081     new_recursive.saved_max = md->offset_end;
1082     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1083     new_recursive.offset_save = stacksave;
1084     else
1085     {
1086     new_recursive.offset_save =
1087     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1088     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1089     }
1090    
1091     memcpy(new_recursive.offset_save, md->offset_vector,
1092     new_recursive.saved_max * sizeof(int));
1093 ph10 168 new_recursive.save_start = mstart;
1094     mstart = eptr;
1095 nigel 77
1096     /* OK, now we can do the recursion. For each top-level alternative we
1097     restore the offset and recursion data. */
1098    
1099     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1100 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1101 nigel 77 do
1102     {
1103 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1104     md, ims, eptrb, flags, RM6);
1105 nigel 77 if (rrc == MATCH_MATCH)
1106     {
1107 nigel 87 DPRINTF(("Recursion matched\n"));
1108 nigel 77 md->recursive = new_recursive.prevrec;
1109     if (new_recursive.offset_save != stacksave)
1110     (pcre_free)(new_recursive.offset_save);
1111     RRETURN(MATCH_MATCH);
1112     }
1113 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1114 nigel 87 {
1115     DPRINTF(("Recursion gave error %d\n", rrc));
1116 ph10 400 if (new_recursive.offset_save != stacksave)
1117     (pcre_free)(new_recursive.offset_save);
1118 nigel 87 RRETURN(rrc);
1119     }
1120 nigel 77
1121     md->recursive = &new_recursive;
1122     memcpy(md->offset_vector, new_recursive.offset_save,
1123     new_recursive.saved_max * sizeof(int));
1124     callpat += GET(callpat, 1);
1125     }
1126     while (*callpat == OP_ALT);
1127    
1128     DPRINTF(("Recursion didn't match\n"));
1129     md->recursive = new_recursive.prevrec;
1130     if (new_recursive.offset_save != stacksave)
1131     (pcre_free)(new_recursive.offset_save);
1132     RRETURN(MATCH_NOMATCH);
1133     }
1134     /* Control never reaches here */
1135    
1136     /* "Once" brackets are like assertion brackets except that after a match,
1137     the point in the subject string is not moved back. Thus there can never be
1138     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1139     Check the alternative branches in turn - the matching won't pass the KET
1140     for this kind of subpattern. If any one branch matches, we carry on as at
1141     the end of a normal bracket, leaving the subject pointer. */
1142    
1143     case OP_ONCE:
1144 nigel 91 prev = ecode;
1145     saved_eptr = eptr;
1146    
1147     do
1148 nigel 77 {
1149 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1150 nigel 91 if (rrc == MATCH_MATCH) break;
1151 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1152 nigel 91 ecode += GET(ecode,1);
1153     }
1154     while (*ecode == OP_ALT);
1155 nigel 77
1156 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1157 nigel 77
1158 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1159 nigel 77
1160 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1161     mark, since extracts may have been taken. */
1162 nigel 77
1163 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1164 nigel 77
1165 nigel 91 offset_top = md->end_offset_top;
1166     eptr = md->end_match_ptr;
1167 nigel 77
1168 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1169     happens for a repeating ket if no characters were matched in the group.
1170     This is the forcible breaking of infinite loops as implemented in Perl
1171     5.005. If there is an options reset, it will get obeyed in the normal
1172     course of events. */
1173 nigel 77
1174 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1175     {
1176     ecode += 1+LINK_SIZE;
1177     break;
1178     }
1179 nigel 77
1180 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1181     preceding bracket, in the appropriate order. The second "call" of match()
1182     uses tail recursion, to avoid using another stack frame. We need to reset
1183     any options that changed within the bracket before re-running it, so
1184     check the next opcode. */
1185 nigel 77
1186 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1187     {
1188     ims = (ims & ~PCRE_IMS) | ecode[4];
1189     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1190     }
1191 nigel 77
1192 nigel 91 if (*ecode == OP_KETRMIN)
1193     {
1194 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1195 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1196     ecode = prev;
1197 ph10 197 flags = 0;
1198 nigel 91 goto TAIL_RECURSE;
1199 nigel 77 }
1200 nigel 91 else /* OP_KETRMAX */
1201     {
1202 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1203 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1204     ecode += 1 + LINK_SIZE;
1205 ph10 197 flags = 0;
1206 nigel 91 goto TAIL_RECURSE;
1207     }
1208     /* Control never gets here */
1209 nigel 77
1210     /* An alternation is the end of a branch; scan along to find the end of the
1211     bracketed group and go to there. */
1212    
1213     case OP_ALT:
1214     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1215     break;
1216    
1217 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1218     indicating that it may occur zero times. It may repeat infinitely, or not
1219     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1220     with fixed upper repeat limits are compiled as a number of copies, with the
1221     optional ones preceded by BRAZERO or BRAMINZERO. */
1222 nigel 77
1223     case OP_BRAZERO:
1224     {
1225     next = ecode+1;
1226 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1227 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1228     do next += GET(next,1); while (*next == OP_ALT);
1229 nigel 93 ecode = next + 1 + LINK_SIZE;
1230 nigel 77 }
1231     break;
1232    
1233     case OP_BRAMINZERO:
1234     {
1235     next = ecode+1;
1236 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1237 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1238 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1239     ecode++;
1240     }
1241     break;
1242    
1243 ph10 335 case OP_SKIPZERO:
1244     {
1245     next = ecode+1;
1246     do next += GET(next,1); while (*next == OP_ALT);
1247     ecode = next + 1 + LINK_SIZE;
1248     }
1249     break;
1250    
1251 nigel 93 /* End of a group, repeated or non-repeating. */
1252 nigel 77
1253     case OP_KET:
1254     case OP_KETRMIN:
1255     case OP_KETRMAX:
1256 nigel 91 prev = ecode - GET(ecode, 1);
1257 nigel 77
1258 nigel 93 /* If this was a group that remembered the subject start, in order to break
1259     infinite repeats of empty string matches, retrieve the subject start from
1260     the chain. Otherwise, set it NULL. */
1261 nigel 77
1262 nigel 93 if (*prev >= OP_SBRA)
1263     {
1264     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1265     eptrb = eptrb->epb_prev; /* Backup to previous group */
1266     }
1267     else saved_eptr = NULL;
1268 nigel 77
1269 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1270     MATCH_MATCH, but record the current high water mark for use by positive
1271     assertions. Do this also for the "once" (atomic) groups. */
1272    
1273 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1274     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1275     *prev == OP_ONCE)
1276     {
1277     md->end_match_ptr = eptr; /* For ONCE */
1278     md->end_offset_top = offset_top;
1279     RRETURN(MATCH_MATCH);
1280     }
1281 nigel 77
1282 nigel 93 /* For capturing groups we have to check the group number back at the start
1283     and if necessary complete handling an extraction by setting the offsets and
1284     bumping the high water mark. Note that whole-pattern recursion is coded as
1285     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1286     when the OP_END is reached. Other recursion is handled here. */
1287 nigel 77
1288 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1289 nigel 91 {
1290 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1291 nigel 91 offset = number << 1;
1292 nigel 77
1293     #ifdef DEBUG
1294 nigel 91 printf("end bracket %d", number);
1295     printf("\n");
1296 nigel 77 #endif
1297    
1298 nigel 93 md->capture_last = number;
1299     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1300 nigel 91 {
1301 nigel 93 md->offset_vector[offset] =
1302     md->offset_vector[md->offset_end - number];
1303     md->offset_vector[offset+1] = eptr - md->start_subject;
1304     if (offset_top <= offset) offset_top = offset + 2;
1305     }
1306 nigel 77
1307 nigel 93 /* Handle a recursively called group. Restore the offsets
1308     appropriately and continue from after the call. */
1309 nigel 77
1310 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1311     {
1312     recursion_info *rec = md->recursive;
1313     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1314     md->recursive = rec->prevrec;
1315 ph10 168 mstart = rec->save_start;
1316 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1317     rec->saved_max * sizeof(int));
1318     ecode = rec->after_call;
1319     ims = original_ims;
1320     break;
1321 nigel 77 }
1322 nigel 91 }
1323 nigel 77
1324 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1325     flags, in case they got changed during the group. */
1326 nigel 77
1327 nigel 91 ims = original_ims;
1328     DPRINTF(("ims reset to %02lx\n", ims));
1329 nigel 77
1330 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1331     happens for a repeating ket if no characters were matched in the group.
1332     This is the forcible breaking of infinite loops as implemented in Perl
1333     5.005. If there is an options reset, it will get obeyed in the normal
1334     course of events. */
1335 nigel 77
1336 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1337     {
1338     ecode += 1 + LINK_SIZE;
1339     break;
1340     }
1341 nigel 77
1342 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1343     preceding bracket, in the appropriate order. In the second case, we can use
1344 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1345     unlimited repeat of a group that can match an empty string. */
1346 nigel 77
1347 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1348    
1349 nigel 91 if (*ecode == OP_KETRMIN)
1350     {
1351 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1352 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1353 ph10 197 if (flags != 0) /* Could match an empty string */
1354     {
1355     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1356     RRETURN(rrc);
1357     }
1358 nigel 91 ecode = prev;
1359     goto TAIL_RECURSE;
1360 nigel 77 }
1361 nigel 91 else /* OP_KETRMAX */
1362     {
1363 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1364 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1365     ecode += 1 + LINK_SIZE;
1366 ph10 197 flags = 0;
1367 nigel 91 goto TAIL_RECURSE;
1368     }
1369     /* Control never gets here */
1370 nigel 77
1371     /* Start of subject unless notbol, or after internal newline if multiline */
1372    
1373     case OP_CIRC:
1374     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1375     if ((ims & PCRE_MULTILINE) != 0)
1376     {
1377 nigel 91 if (eptr != md->start_subject &&
1378 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1379 nigel 77 RRETURN(MATCH_NOMATCH);
1380     ecode++;
1381     break;
1382     }
1383     /* ... else fall through */
1384    
1385     /* Start of subject assertion */
1386    
1387     case OP_SOD:
1388     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1389     ecode++;
1390     break;
1391    
1392     /* Start of match assertion */
1393    
1394     case OP_SOM:
1395     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1396     ecode++;
1397     break;
1398 ph10 172
1399 ph10 168 /* Reset the start of match point */
1400 ph10 172
1401 ph10 168 case OP_SET_SOM:
1402     mstart = eptr;
1403 ph10 172 ecode++;
1404     break;
1405 nigel 77
1406     /* Assert before internal newline if multiline, or before a terminating
1407     newline unless endonly is set, else end of subject unless noteol is set. */
1408    
1409     case OP_DOLL:
1410     if ((ims & PCRE_MULTILINE) != 0)
1411     {
1412     if (eptr < md->end_subject)
1413 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1414 nigel 77 else
1415     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1416     ecode++;
1417     break;
1418     }
1419     else
1420     {
1421     if (md->noteol) RRETURN(MATCH_NOMATCH);
1422     if (!md->endonly)
1423     {
1424 nigel 91 if (eptr != md->end_subject &&
1425 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1426 nigel 77 RRETURN(MATCH_NOMATCH);
1427     ecode++;
1428     break;
1429     }
1430     }
1431 nigel 91 /* ... else fall through for endonly */
1432 nigel 77
1433     /* End of subject assertion (\z) */
1434    
1435     case OP_EOD:
1436     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1437     ecode++;
1438     break;
1439    
1440     /* End of subject or ending \n assertion (\Z) */
1441    
1442     case OP_EODN:
1443 nigel 91 if (eptr != md->end_subject &&
1444 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1445 nigel 91 RRETURN(MATCH_NOMATCH);
1446 nigel 77 ecode++;
1447     break;
1448    
1449     /* Word boundary assertions */
1450    
1451     case OP_NOT_WORD_BOUNDARY:
1452     case OP_WORD_BOUNDARY:
1453     {
1454    
1455     /* Find out if the previous and current characters are "word" characters.
1456     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1457     be "non-word" characters. */
1458    
1459     #ifdef SUPPORT_UTF8
1460     if (utf8)
1461     {
1462     if (eptr == md->start_subject) prev_is_word = FALSE; else
1463     {
1464     const uschar *lastptr = eptr - 1;
1465     while((*lastptr & 0xc0) == 0x80) lastptr--;
1466     GETCHAR(c, lastptr);
1467     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1468     }
1469     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1470     {
1471     GETCHAR(c, eptr);
1472     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1473     }
1474     }
1475     else
1476     #endif
1477    
1478     /* More streamlined when not in UTF-8 mode */
1479    
1480     {
1481     prev_is_word = (eptr != md->start_subject) &&
1482     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1483     cur_is_word = (eptr < md->end_subject) &&
1484     ((md->ctypes[*eptr] & ctype_word) != 0);
1485     }
1486    
1487     /* Now see if the situation is what we want */
1488    
1489     if ((*ecode++ == OP_WORD_BOUNDARY)?
1490     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1491     RRETURN(MATCH_NOMATCH);
1492     }
1493     break;
1494    
1495     /* Match a single character type; inline for speed */
1496    
1497     case OP_ANY:
1498 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1499 ph10 345 /* Fall through */
1500    
1501 ph10 341 case OP_ALLANY:
1502 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1503 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1504 nigel 77 ecode++;
1505     break;
1506    
1507     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1508     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1509    
1510     case OP_ANYBYTE:
1511     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1512     ecode++;
1513     break;
1514    
1515     case OP_NOT_DIGIT:
1516     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1517     GETCHARINCTEST(c, eptr);
1518     if (
1519     #ifdef SUPPORT_UTF8
1520     c < 256 &&
1521     #endif
1522     (md->ctypes[c] & ctype_digit) != 0
1523     )
1524     RRETURN(MATCH_NOMATCH);
1525     ecode++;
1526     break;
1527    
1528     case OP_DIGIT:
1529     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1530     GETCHARINCTEST(c, eptr);
1531     if (
1532     #ifdef SUPPORT_UTF8
1533     c >= 256 ||
1534     #endif
1535     (md->ctypes[c] & ctype_digit) == 0
1536     )
1537     RRETURN(MATCH_NOMATCH);
1538     ecode++;
1539     break;
1540    
1541     case OP_NOT_WHITESPACE:
1542     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1543     GETCHARINCTEST(c, eptr);
1544     if (
1545     #ifdef SUPPORT_UTF8
1546     c < 256 &&
1547     #endif
1548     (md->ctypes[c] & ctype_space) != 0
1549     )
1550     RRETURN(MATCH_NOMATCH);
1551     ecode++;
1552     break;
1553    
1554     case OP_WHITESPACE:
1555     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1556     GETCHARINCTEST(c, eptr);
1557     if (
1558     #ifdef SUPPORT_UTF8
1559     c >= 256 ||
1560     #endif
1561     (md->ctypes[c] & ctype_space) == 0
1562     )
1563     RRETURN(MATCH_NOMATCH);
1564     ecode++;
1565     break;
1566    
1567     case OP_NOT_WORDCHAR:
1568     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1569     GETCHARINCTEST(c, eptr);
1570     if (
1571     #ifdef SUPPORT_UTF8
1572     c < 256 &&
1573     #endif
1574     (md->ctypes[c] & ctype_word) != 0
1575     )
1576     RRETURN(MATCH_NOMATCH);
1577     ecode++;
1578     break;
1579    
1580     case OP_WORDCHAR:
1581     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1582     GETCHARINCTEST(c, eptr);
1583     if (
1584     #ifdef SUPPORT_UTF8
1585     c >= 256 ||
1586     #endif
1587     (md->ctypes[c] & ctype_word) == 0
1588     )
1589     RRETURN(MATCH_NOMATCH);
1590     ecode++;
1591     break;
1592    
1593 nigel 93 case OP_ANYNL:
1594     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1595     GETCHARINCTEST(c, eptr);
1596     switch(c)
1597     {
1598     default: RRETURN(MATCH_NOMATCH);
1599     case 0x000d:
1600     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1601     break;
1602 ph10 231
1603 nigel 93 case 0x000a:
1604 ph10 231 break;
1605    
1606 nigel 93 case 0x000b:
1607     case 0x000c:
1608     case 0x0085:
1609     case 0x2028:
1610     case 0x2029:
1611 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1612 nigel 93 break;
1613     }
1614     ecode++;
1615     break;
1616    
1617 ph10 178 case OP_NOT_HSPACE:
1618     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1619     GETCHARINCTEST(c, eptr);
1620     switch(c)
1621     {
1622     default: break;
1623     case 0x09: /* HT */
1624     case 0x20: /* SPACE */
1625     case 0xa0: /* NBSP */
1626     case 0x1680: /* OGHAM SPACE MARK */
1627     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1628     case 0x2000: /* EN QUAD */
1629     case 0x2001: /* EM QUAD */
1630     case 0x2002: /* EN SPACE */
1631     case 0x2003: /* EM SPACE */
1632     case 0x2004: /* THREE-PER-EM SPACE */
1633     case 0x2005: /* FOUR-PER-EM SPACE */
1634     case 0x2006: /* SIX-PER-EM SPACE */
1635     case 0x2007: /* FIGURE SPACE */
1636     case 0x2008: /* PUNCTUATION SPACE */
1637     case 0x2009: /* THIN SPACE */
1638     case 0x200A: /* HAIR SPACE */
1639     case 0x202f: /* NARROW NO-BREAK SPACE */
1640     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1641     case 0x3000: /* IDEOGRAPHIC SPACE */
1642     RRETURN(MATCH_NOMATCH);
1643     }
1644     ecode++;
1645     break;
1646    
1647     case OP_HSPACE:
1648     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1649     GETCHARINCTEST(c, eptr);
1650     switch(c)
1651     {
1652     default: RRETURN(MATCH_NOMATCH);
1653     case 0x09: /* HT */
1654     case 0x20: /* SPACE */
1655     case 0xa0: /* NBSP */
1656     case 0x1680: /* OGHAM SPACE MARK */
1657     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1658     case 0x2000: /* EN QUAD */
1659     case 0x2001: /* EM QUAD */
1660     case 0x2002: /* EN SPACE */
1661     case 0x2003: /* EM SPACE */
1662     case 0x2004: /* THREE-PER-EM SPACE */
1663     case 0x2005: /* FOUR-PER-EM SPACE */
1664     case 0x2006: /* SIX-PER-EM SPACE */
1665     case 0x2007: /* FIGURE SPACE */
1666     case 0x2008: /* PUNCTUATION SPACE */
1667     case 0x2009: /* THIN SPACE */
1668     case 0x200A: /* HAIR SPACE */
1669     case 0x202f: /* NARROW NO-BREAK SPACE */
1670     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1671     case 0x3000: /* IDEOGRAPHIC SPACE */
1672     break;
1673     }
1674     ecode++;
1675     break;
1676    
1677     case OP_NOT_VSPACE:
1678     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1679     GETCHARINCTEST(c, eptr);
1680     switch(c)
1681     {
1682     default: break;
1683     case 0x0a: /* LF */
1684     case 0x0b: /* VT */
1685     case 0x0c: /* FF */
1686     case 0x0d: /* CR */
1687     case 0x85: /* NEL */
1688     case 0x2028: /* LINE SEPARATOR */
1689     case 0x2029: /* PARAGRAPH SEPARATOR */
1690     RRETURN(MATCH_NOMATCH);
1691     }
1692     ecode++;
1693     break;
1694    
1695     case OP_VSPACE:
1696     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1697     GETCHARINCTEST(c, eptr);
1698     switch(c)
1699     {
1700     default: RRETURN(MATCH_NOMATCH);
1701     case 0x0a: /* LF */
1702     case 0x0b: /* VT */
1703     case 0x0c: /* FF */
1704     case 0x0d: /* CR */
1705     case 0x85: /* NEL */
1706     case 0x2028: /* LINE SEPARATOR */
1707     case 0x2029: /* PARAGRAPH SEPARATOR */
1708     break;
1709     }
1710     ecode++;
1711     break;
1712    
1713 nigel 77 #ifdef SUPPORT_UCP
1714     /* Check the next character by Unicode property. We will get here only
1715     if the support is in the binary; otherwise a compile-time error occurs. */
1716    
1717     case OP_PROP:
1718     case OP_NOTPROP:
1719     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1720     GETCHARINCTEST(c, eptr);
1721     {
1722 ph10 384 const ucd_record *prop = GET_UCD(c);
1723 nigel 77
1724 nigel 87 switch(ecode[1])
1725     {
1726     case PT_ANY:
1727     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1728     break;
1729 nigel 77
1730 nigel 87 case PT_LAMP:
1731 ph10 349 if ((prop->chartype == ucp_Lu ||
1732     prop->chartype == ucp_Ll ||
1733     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1734 nigel 77 RRETURN(MATCH_NOMATCH);
1735 nigel 87 break;
1736    
1737     case PT_GC:
1738 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1739 nigel 77 RRETURN(MATCH_NOMATCH);
1740 nigel 87 break;
1741    
1742     case PT_PC:
1743 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1744 nigel 87 RRETURN(MATCH_NOMATCH);
1745     break;
1746    
1747     case PT_SC:
1748 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
1749 nigel 87 RRETURN(MATCH_NOMATCH);
1750     break;
1751    
1752     default:
1753     RRETURN(PCRE_ERROR_INTERNAL);
1754 nigel 77 }
1755 nigel 87
1756     ecode += 3;
1757 nigel 77 }
1758     break;
1759    
1760     /* Match an extended Unicode sequence. We will get here only if the support
1761     is in the binary; otherwise a compile-time error occurs. */
1762    
1763     case OP_EXTUNI:
1764     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1765     GETCHARINCTEST(c, eptr);
1766     {
1767 ph10 349 int category = UCD_CATEGORY(c);
1768 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1769     while (eptr < md->end_subject)
1770     {
1771     int len = 1;
1772     if (!utf8) c = *eptr; else
1773     {
1774     GETCHARLEN(c, eptr, len);
1775     }
1776 ph10 349 category = UCD_CATEGORY(c);
1777 nigel 77 if (category != ucp_M) break;
1778     eptr += len;
1779     }
1780     }
1781     ecode++;
1782     break;
1783     #endif
1784    
1785    
1786     /* Match a back reference, possibly repeatedly. Look past the end of the
1787     item to see if there is repeat information following. The code is similar
1788     to that for character classes, but repeated for efficiency. Then obey
1789     similar code to character type repeats - written out again for speed.
1790     However, if the referenced string is the empty string, always treat
1791     it as matched, any number of times (otherwise there could be infinite
1792     loops). */
1793    
1794     case OP_REF:
1795     {
1796     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1797 ph10 345 ecode += 3;
1798    
1799 ph10 336 /* If the reference is unset, there are two possibilities:
1800 ph10 345
1801 ph10 336 (a) In the default, Perl-compatible state, set the length to be longer
1802     than the amount of subject left; this ensures that every attempt at a
1803     match fails. We can't just fail here, because of the possibility of
1804     quantifiers with zero minima.
1805 ph10 345
1806     (b) If the JavaScript compatibility flag is set, set the length to zero
1807     so that the back reference matches an empty string.
1808    
1809     Otherwise, set the length to the length of what was matched by the
1810 ph10 336 referenced subpattern. */
1811 ph10 345
1812 ph10 336 if (offset >= offset_top || md->offset_vector[offset] < 0)
1813 ph10 345 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1814 ph10 336 else
1815     length = md->offset_vector[offset+1] - md->offset_vector[offset];
1816 nigel 77
1817     /* Set up for repetition, or handle the non-repeated case */
1818    
1819     switch (*ecode)
1820     {
1821     case OP_CRSTAR:
1822     case OP_CRMINSTAR:
1823     case OP_CRPLUS:
1824     case OP_CRMINPLUS:
1825     case OP_CRQUERY:
1826     case OP_CRMINQUERY:
1827     c = *ecode++ - OP_CRSTAR;
1828     minimize = (c & 1) != 0;
1829     min = rep_min[c]; /* Pick up values from tables; */
1830     max = rep_max[c]; /* zero for max => infinity */
1831     if (max == 0) max = INT_MAX;
1832     break;
1833    
1834     case OP_CRRANGE:
1835     case OP_CRMINRANGE:
1836     minimize = (*ecode == OP_CRMINRANGE);
1837     min = GET2(ecode, 1);
1838     max = GET2(ecode, 3);
1839     if (max == 0) max = INT_MAX;
1840     ecode += 5;
1841     break;
1842    
1843     default: /* No repeat follows */
1844     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1845     eptr += length;
1846     continue; /* With the main loop */
1847     }
1848    
1849     /* If the length of the reference is zero, just continue with the
1850     main loop. */
1851    
1852     if (length == 0) continue;
1853    
1854     /* First, ensure the minimum number of matches are present. We get back
1855     the length of the reference string explicitly rather than passing the
1856     address of eptr, so that eptr can be a register variable. */
1857    
1858     for (i = 1; i <= min; i++)
1859     {
1860     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1861     eptr += length;
1862     }
1863    
1864     /* If min = max, continue at the same level without recursion.
1865     They are not both allowed to be zero. */
1866    
1867     if (min == max) continue;
1868    
1869     /* If minimizing, keep trying and advancing the pointer */
1870    
1871     if (minimize)
1872     {
1873     for (fi = min;; fi++)
1874     {
1875 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1876 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1877     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1878     RRETURN(MATCH_NOMATCH);
1879     eptr += length;
1880     }
1881     /* Control never gets here */
1882     }
1883    
1884     /* If maximizing, find the longest string and work backwards */
1885    
1886     else
1887     {
1888     pp = eptr;
1889     for (i = min; i < max; i++)
1890     {
1891     if (!match_ref(offset, eptr, length, md, ims)) break;
1892     eptr += length;
1893     }
1894     while (eptr >= pp)
1895     {
1896 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1897 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1898     eptr -= length;
1899     }
1900     RRETURN(MATCH_NOMATCH);
1901     }
1902     }
1903     /* Control never gets here */
1904    
1905    
1906    
1907     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1908     used when all the characters in the class have values in the range 0-255,
1909     and either the matching is caseful, or the characters are in the range
1910     0-127 when UTF-8 processing is enabled. The only difference between
1911     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1912     encountered.
1913    
1914     First, look past the end of the item to see if there is repeat information
1915     following. Then obey similar code to character type repeats - written out
1916     again for speed. */
1917    
1918     case OP_NCLASS:
1919     case OP_CLASS:
1920     {
1921     data = ecode + 1; /* Save for matching */
1922     ecode += 33; /* Advance past the item */
1923    
1924     switch (*ecode)
1925     {
1926     case OP_CRSTAR:
1927     case OP_CRMINSTAR:
1928     case OP_CRPLUS:
1929     case OP_CRMINPLUS:
1930     case OP_CRQUERY:
1931     case OP_CRMINQUERY:
1932     c = *ecode++ - OP_CRSTAR;
1933     minimize = (c & 1) != 0;
1934     min = rep_min[c]; /* Pick up values from tables; */
1935     max = rep_max[c]; /* zero for max => infinity */
1936     if (max == 0) max = INT_MAX;
1937     break;
1938    
1939     case OP_CRRANGE:
1940     case OP_CRMINRANGE:
1941     minimize = (*ecode == OP_CRMINRANGE);
1942     min = GET2(ecode, 1);
1943     max = GET2(ecode, 3);
1944     if (max == 0) max = INT_MAX;
1945     ecode += 5;
1946     break;
1947    
1948     default: /* No repeat follows */
1949     min = max = 1;
1950     break;
1951     }
1952    
1953     /* First, ensure the minimum number of matches are present. */
1954    
1955     #ifdef SUPPORT_UTF8
1956     /* UTF-8 mode */
1957     if (utf8)
1958     {
1959     for (i = 1; i <= min; i++)
1960     {
1961     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1962     GETCHARINC(c, eptr);
1963     if (c > 255)
1964     {
1965     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1966     }
1967     else
1968     {
1969     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1970     }
1971     }
1972     }
1973     else
1974     #endif
1975     /* Not UTF-8 mode */
1976     {
1977     for (i = 1; i <= min; i++)
1978     {
1979     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1980     c = *eptr++;
1981     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1982     }
1983     }
1984    
1985     /* If max == min we can continue with the main loop without the
1986     need to recurse. */
1987    
1988     if (min == max) continue;
1989    
1990     /* If minimizing, keep testing the rest of the expression and advancing
1991     the pointer while it matches the class. */
1992    
1993     if (minimize)
1994     {
1995     #ifdef SUPPORT_UTF8
1996     /* UTF-8 mode */
1997     if (utf8)
1998     {
1999     for (fi = min;; fi++)
2000     {
2001 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2002 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2003     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2004     GETCHARINC(c, eptr);
2005     if (c > 255)
2006     {
2007     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2008     }
2009     else
2010     {
2011     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2012     }
2013     }
2014     }
2015     else
2016     #endif
2017     /* Not UTF-8 mode */
2018     {
2019     for (fi = min;; fi++)
2020     {
2021 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2022 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2023     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2024     c = *eptr++;
2025     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2026     }
2027     }
2028     /* Control never gets here */
2029     }
2030    
2031     /* If maximizing, find the longest possible run, then work backwards. */
2032    
2033     else
2034     {
2035     pp = eptr;
2036    
2037     #ifdef SUPPORT_UTF8
2038     /* UTF-8 mode */
2039     if (utf8)
2040     {
2041     for (i = min; i < max; i++)
2042     {
2043     int len = 1;
2044     if (eptr >= md->end_subject) break;
2045     GETCHARLEN(c, eptr, len);
2046     if (c > 255)
2047     {
2048     if (op == OP_CLASS) break;
2049     }
2050     else
2051     {
2052     if ((data[c/8] & (1 << (c&7))) == 0) break;
2053     }
2054     eptr += len;
2055     }
2056     for (;;)
2057     {
2058 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2059 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2060     if (eptr-- == pp) break; /* Stop if tried at original pos */
2061     BACKCHAR(eptr);
2062     }
2063     }
2064     else
2065     #endif
2066     /* Not UTF-8 mode */
2067     {
2068     for (i = min; i < max; i++)
2069     {
2070     if (eptr >= md->end_subject) break;
2071     c = *eptr;
2072     if ((data[c/8] & (1 << (c&7))) == 0) break;
2073     eptr++;
2074     }
2075     while (eptr >= pp)
2076     {
2077 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2078 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2079 nigel 77 eptr--;
2080     }
2081     }
2082    
2083     RRETURN(MATCH_NOMATCH);
2084     }
2085     }
2086     /* Control never gets here */
2087    
2088    
2089     /* Match an extended character class. This opcode is encountered only
2090 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2091     mode, because Unicode properties are supported in non-UTF-8 mode. */
2092 nigel 77
2093     #ifdef SUPPORT_UTF8
2094     case OP_XCLASS:
2095     {
2096     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2097     ecode += GET(ecode, 1); /* Advance past the item */
2098    
2099     switch (*ecode)
2100     {
2101     case OP_CRSTAR:
2102     case OP_CRMINSTAR:
2103     case OP_CRPLUS:
2104     case OP_CRMINPLUS:
2105     case OP_CRQUERY:
2106     case OP_CRMINQUERY:
2107     c = *ecode++ - OP_CRSTAR;
2108     minimize = (c & 1) != 0;
2109     min = rep_min[c]; /* Pick up values from tables; */
2110     max = rep_max[c]; /* zero for max => infinity */
2111     if (max == 0) max = INT_MAX;
2112     break;
2113    
2114     case OP_CRRANGE:
2115     case OP_CRMINRANGE:
2116     minimize = (*ecode == OP_CRMINRANGE);
2117     min = GET2(ecode, 1);
2118     max = GET2(ecode, 3);
2119     if (max == 0) max = INT_MAX;
2120     ecode += 5;
2121     break;
2122    
2123     default: /* No repeat follows */
2124     min = max = 1;
2125     break;
2126     }
2127    
2128     /* First, ensure the minimum number of matches are present. */
2129    
2130     for (i = 1; i <= min; i++)
2131     {
2132     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2133 ph10 384 GETCHARINCTEST(c, eptr);
2134 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2135     }
2136    
2137     /* If max == min we can continue with the main loop without the
2138     need to recurse. */
2139    
2140     if (min == max) continue;
2141    
2142     /* If minimizing, keep testing the rest of the expression and advancing
2143     the pointer while it matches the class. */
2144    
2145     if (minimize)
2146     {
2147     for (fi = min;; fi++)
2148     {
2149 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2150 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2151     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2152 ph10 384 GETCHARINCTEST(c, eptr);
2153 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2154     }
2155     /* Control never gets here */
2156     }
2157    
2158     /* If maximizing, find the longest possible run, then work backwards. */
2159    
2160     else
2161     {
2162     pp = eptr;
2163     for (i = min; i < max; i++)
2164     {
2165     int len = 1;
2166     if (eptr >= md->end_subject) break;
2167 ph10 384 GETCHARLENTEST(c, eptr, len);
2168 nigel 77 if (!_pcre_xclass(c, data)) break;
2169     eptr += len;
2170     }
2171     for(;;)
2172     {
2173 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2174 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2175     if (eptr-- == pp) break; /* Stop if tried at original pos */
2176 ph10 214 if (utf8) BACKCHAR(eptr);
2177 nigel 77 }
2178     RRETURN(MATCH_NOMATCH);
2179     }
2180    
2181     /* Control never gets here */
2182     }
2183     #endif /* End of XCLASS */
2184    
2185     /* Match a single character, casefully */
2186    
2187     case OP_CHAR:
2188     #ifdef SUPPORT_UTF8
2189     if (utf8)
2190     {
2191     length = 1;
2192     ecode++;
2193     GETCHARLEN(fc, ecode, length);
2194     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2195     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2196     }
2197     else
2198     #endif
2199    
2200     /* Non-UTF-8 mode */
2201     {
2202     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2203     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2204     ecode += 2;
2205     }
2206     break;
2207    
2208     /* Match a single character, caselessly */
2209    
2210     case OP_CHARNC:
2211     #ifdef SUPPORT_UTF8
2212     if (utf8)
2213     {
2214     length = 1;
2215     ecode++;
2216     GETCHARLEN(fc, ecode, length);
2217    
2218     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2219    
2220     /* If the pattern character's value is < 128, we have only one byte, and
2221     can use the fast lookup table. */
2222    
2223     if (fc < 128)
2224     {
2225     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2226     }
2227    
2228     /* Otherwise we must pick up the subject character */
2229    
2230     else
2231     {
2232 nigel 93 unsigned int dc;
2233 nigel 77 GETCHARINC(dc, eptr);
2234     ecode += length;
2235    
2236     /* If we have Unicode property support, we can use it to test the other
2237 nigel 87 case of the character, if there is one. */
2238 nigel 77
2239     if (fc != dc)
2240     {
2241     #ifdef SUPPORT_UCP
2242 ph10 349 if (dc != UCD_OTHERCASE(fc))
2243 nigel 77 #endif
2244     RRETURN(MATCH_NOMATCH);
2245     }
2246     }
2247     }
2248     else
2249     #endif /* SUPPORT_UTF8 */
2250    
2251     /* Non-UTF-8 mode */
2252     {
2253     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2254     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2255     ecode += 2;
2256     }
2257     break;
2258    
2259 nigel 93 /* Match a single character repeatedly. */
2260 nigel 77
2261     case OP_EXACT:
2262     min = max = GET2(ecode, 1);
2263     ecode += 3;
2264     goto REPEATCHAR;
2265    
2266 nigel 93 case OP_POSUPTO:
2267     possessive = TRUE;
2268     /* Fall through */
2269    
2270 nigel 77 case OP_UPTO:
2271     case OP_MINUPTO:
2272     min = 0;
2273     max = GET2(ecode, 1);
2274     minimize = *ecode == OP_MINUPTO;
2275     ecode += 3;
2276     goto REPEATCHAR;
2277    
2278 nigel 93 case OP_POSSTAR:
2279     possessive = TRUE;
2280     min = 0;
2281     max = INT_MAX;
2282     ecode++;
2283     goto REPEATCHAR;
2284    
2285     case OP_POSPLUS:
2286     possessive = TRUE;
2287     min = 1;
2288     max = INT_MAX;
2289     ecode++;
2290     goto REPEATCHAR;
2291    
2292     case OP_POSQUERY:
2293     possessive = TRUE;
2294     min = 0;
2295     max = 1;
2296     ecode++;
2297     goto REPEATCHAR;
2298    
2299 nigel 77 case OP_STAR:
2300     case OP_MINSTAR:
2301     case OP_PLUS:
2302     case OP_MINPLUS:
2303     case OP_QUERY:
2304     case OP_MINQUERY:
2305     c = *ecode++ - OP_STAR;
2306     minimize = (c & 1) != 0;
2307     min = rep_min[c]; /* Pick up values from tables; */
2308     max = rep_max[c]; /* zero for max => infinity */
2309     if (max == 0) max = INT_MAX;
2310    
2311     /* Common code for all repeated single-character matches. We can give
2312     up quickly if there are fewer than the minimum number of characters left in
2313     the subject. */
2314    
2315     REPEATCHAR:
2316     #ifdef SUPPORT_UTF8
2317     if (utf8)
2318     {
2319     length = 1;
2320     charptr = ecode;
2321     GETCHARLEN(fc, ecode, length);
2322     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2323     ecode += length;
2324    
2325     /* Handle multibyte character matching specially here. There is
2326     support for caseless matching if UCP support is present. */
2327    
2328     if (length > 1)
2329     {
2330     #ifdef SUPPORT_UCP
2331 nigel 93 unsigned int othercase;
2332 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2333 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
2334 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2335 ph10 115 else oclength = 0;
2336 nigel 77 #endif /* SUPPORT_UCP */
2337    
2338     for (i = 1; i <= min; i++)
2339     {
2340     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2341 ph10 123 #ifdef SUPPORT_UCP
2342 nigel 77 /* Need braces because of following else */
2343     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2344     else
2345     {
2346     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2347     eptr += oclength;
2348     }
2349 ph10 115 #else /* without SUPPORT_UCP */
2350     else { RRETURN(MATCH_NOMATCH); }
2351 ph10 123 #endif /* SUPPORT_UCP */
2352 nigel 77 }
2353    
2354     if (min == max) continue;
2355    
2356     if (minimize)
2357     {
2358     for (fi = min;; fi++)
2359     {
2360 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2361 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2362     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2363     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2364 ph10 123 #ifdef SUPPORT_UCP
2365 nigel 77 /* Need braces because of following else */
2366     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2367     else
2368     {
2369     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2370     eptr += oclength;
2371     }
2372 ph10 115 #else /* without SUPPORT_UCP */
2373     else { RRETURN (MATCH_NOMATCH); }
2374     #endif /* SUPPORT_UCP */
2375 nigel 77 }
2376     /* Control never gets here */
2377     }
2378 nigel 93
2379     else /* Maximize */
2380 nigel 77 {
2381     pp = eptr;
2382     for (i = min; i < max; i++)
2383     {
2384     if (eptr > md->end_subject - length) break;
2385     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2386 ph10 123 #ifdef SUPPORT_UCP
2387 nigel 77 else if (oclength == 0) break;
2388     else
2389     {
2390     if (memcmp(eptr, occhars, oclength) != 0) break;
2391     eptr += oclength;
2392     }
2393 ph10 115 #else /* without SUPPORT_UCP */
2394     else break;
2395 ph10 123 #endif /* SUPPORT_UCP */
2396 nigel 77 }
2397 nigel 93
2398     if (possessive) continue;
2399 ph10 120 for(;;)
2400 nigel 77 {
2401 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2402 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2403 ph10 120 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2404 ph10 115 #ifdef SUPPORT_UCP
2405     eptr--;
2406     BACKCHAR(eptr);
2407 ph10 123 #else /* without SUPPORT_UCP */
2408 nigel 77 eptr -= length;
2409 ph10 123 #endif /* SUPPORT_UCP */
2410 nigel 77 }
2411     }
2412     /* Control never gets here */
2413     }
2414    
2415     /* If the length of a UTF-8 character is 1, we fall through here, and
2416     obey the code as for non-UTF-8 characters below, though in this case the
2417     value of fc will always be < 128. */
2418     }
2419     else
2420     #endif /* SUPPORT_UTF8 */
2421    
2422     /* When not in UTF-8 mode, load a single-byte character. */
2423     {
2424     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2425     fc = *ecode++;
2426     }
2427    
2428     /* The value of fc at this point is always less than 256, though we may or
2429     may not be in UTF-8 mode. The code is duplicated for the caseless and
2430     caseful cases, for speed, since matching characters is likely to be quite
2431     common. First, ensure the minimum number of matches are present. If min =
2432     max, continue at the same level without recursing. Otherwise, if
2433     minimizing, keep trying the rest of the expression and advancing one
2434     matching character if failing, up to the maximum. Alternatively, if
2435     maximizing, find the maximum number of characters and work backwards. */
2436    
2437     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2438     max, eptr));
2439    
2440     if ((ims & PCRE_CASELESS) != 0)
2441     {
2442     fc = md->lcc[fc];
2443     for (i = 1; i <= min; i++)
2444     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2445     if (min == max) continue;
2446     if (minimize)
2447     {
2448     for (fi = min;; fi++)
2449     {
2450 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2451 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2452     if (fi >= max || eptr >= md->end_subject ||
2453     fc != md->lcc[*eptr++])
2454     RRETURN(MATCH_NOMATCH);
2455     }
2456     /* Control never gets here */
2457     }
2458 nigel 93 else /* Maximize */
2459 nigel 77 {
2460     pp = eptr;
2461     for (i = min; i < max; i++)
2462     {
2463     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2464     eptr++;
2465     }
2466 nigel 93 if (possessive) continue;
2467 nigel 77 while (eptr >= pp)
2468     {
2469 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2470 nigel 77 eptr--;
2471     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2472     }
2473     RRETURN(MATCH_NOMATCH);
2474     }
2475     /* Control never gets here */
2476     }
2477    
2478     /* Caseful comparisons (includes all multi-byte characters) */
2479    
2480     else
2481     {
2482     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2483     if (min == max) continue;
2484     if (minimize)
2485     {
2486     for (fi = min;; fi++)
2487     {
2488 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2489 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2490     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2491     RRETURN(MATCH_NOMATCH);
2492     }
2493     /* Control never gets here */
2494     }
2495 nigel 93 else /* Maximize */
2496 nigel 77 {
2497     pp = eptr;
2498     for (i = min; i < max; i++)
2499     {
2500     if (eptr >= md->end_subject || fc != *eptr) break;
2501     eptr++;
2502     }
2503 nigel 93 if (possessive) continue;
2504 nigel 77 while (eptr >= pp)
2505     {
2506 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2507 nigel 77 eptr--;
2508     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2509     }
2510     RRETURN(MATCH_NOMATCH);
2511     }
2512     }
2513     /* Control never gets here */
2514    
2515     /* Match a negated single one-byte character. The character we are
2516     checking can be multibyte. */
2517    
2518     case OP_NOT:
2519     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2520     ecode++;
2521     GETCHARINCTEST(c, eptr);
2522     if ((ims & PCRE_CASELESS) != 0)
2523     {
2524     #ifdef SUPPORT_UTF8
2525     if (c < 256)
2526     #endif
2527     c = md->lcc[c];
2528     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2529     }
2530     else
2531     {
2532     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2533     }
2534     break;
2535    
2536     /* Match a negated single one-byte character repeatedly. This is almost a
2537     repeat of the code for a repeated single character, but I haven't found a
2538     nice way of commoning these up that doesn't require a test of the
2539     positive/negative option for each character match. Maybe that wouldn't add
2540     very much to the time taken, but character matching *is* what this is all
2541     about... */
2542    
2543     case OP_NOTEXACT:
2544     min = max = GET2(ecode, 1);
2545     ecode += 3;
2546     goto REPEATNOTCHAR;
2547    
2548     case OP_NOTUPTO:
2549     case OP_NOTMINUPTO:
2550     min = 0;
2551     max = GET2(ecode, 1);
2552     minimize = *ecode == OP_NOTMINUPTO;
2553     ecode += 3;
2554     goto REPEATNOTCHAR;
2555    
2556 nigel 93 case OP_NOTPOSSTAR:
2557     possessive = TRUE;
2558     min = 0;
2559     max = INT_MAX;
2560     ecode++;
2561     goto REPEATNOTCHAR;
2562    
2563     case OP_NOTPOSPLUS:
2564     possessive = TRUE;
2565     min = 1;
2566     max = INT_MAX;
2567     ecode++;
2568     goto REPEATNOTCHAR;
2569    
2570     case OP_NOTPOSQUERY:
2571     possessive = TRUE;
2572     min = 0;
2573     max = 1;
2574     ecode++;
2575     goto REPEATNOTCHAR;
2576    
2577     case OP_NOTPOSUPTO:
2578     possessive = TRUE;
2579     min = 0;
2580     max = GET2(ecode, 1);
2581     ecode += 3;
2582     goto REPEATNOTCHAR;
2583    
2584 nigel 77 case OP_NOTSTAR:
2585     case OP_NOTMINSTAR:
2586     case OP_NOTPLUS:
2587     case OP_NOTMINPLUS:
2588     case OP_NOTQUERY:
2589     case OP_NOTMINQUERY:
2590     c = *ecode++ - OP_NOTSTAR;
2591     minimize = (c & 1) != 0;
2592     min = rep_min[c]; /* Pick up values from tables; */
2593     max = rep_max[c]; /* zero for max => infinity */
2594     if (max == 0) max = INT_MAX;
2595    
2596     /* Common code for all repeated single-byte matches. We can give up quickly
2597     if there are fewer than the minimum number of bytes left in the
2598     subject. */
2599    
2600     REPEATNOTCHAR:
2601     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2602     fc = *ecode++;
2603    
2604     /* The code is duplicated for the caseless and caseful cases, for speed,
2605     since matching characters is likely to be quite common. First, ensure the
2606     minimum number of matches are present. If min = max, continue at the same
2607     level without recursing. Otherwise, if minimizing, keep trying the rest of
2608     the expression and advancing one matching character if failing, up to the
2609     maximum. Alternatively, if maximizing, find the maximum number of
2610     characters and work backwards. */
2611    
2612     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2613     max, eptr));
2614    
2615     if ((ims & PCRE_CASELESS) != 0)
2616     {
2617     fc = md->lcc[fc];
2618    
2619     #ifdef SUPPORT_UTF8
2620     /* UTF-8 mode */
2621     if (utf8)
2622     {
2623 nigel 93 register unsigned int d;
2624 nigel 77 for (i = 1; i <= min; i++)
2625     {
2626     GETCHARINC(d, eptr);
2627     if (d < 256) d = md->lcc[d];
2628     if (fc == d) RRETURN(MATCH_NOMATCH);
2629     }
2630     }
2631     else
2632     #endif
2633    
2634     /* Not UTF-8 mode */
2635     {
2636     for (i = 1; i <= min; i++)
2637     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2638     }
2639    
2640     if (min == max) continue;
2641    
2642     if (minimize)
2643     {
2644     #ifdef SUPPORT_UTF8
2645     /* UTF-8 mode */
2646     if (utf8)
2647     {
2648 nigel 93 register unsigned int d;
2649 nigel 77 for (fi = min;; fi++)
2650     {
2651 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2652 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2653 ph10 366 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2654 nigel 77 GETCHARINC(d, eptr);
2655     if (d < 256) d = md->lcc[d];
2656 ph10 366 if (fc == d) RRETURN(MATCH_NOMATCH);
2657 ph10 371
2658 nigel 77 }
2659     }
2660     else
2661     #endif
2662     /* Not UTF-8 mode */
2663     {
2664     for (fi = min;; fi++)
2665     {
2666 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2667 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2668     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2669     RRETURN(MATCH_NOMATCH);
2670     }
2671     }
2672     /* Control never gets here */
2673     }
2674    
2675     /* Maximize case */
2676    
2677     else
2678     {
2679     pp = eptr;
2680    
2681     #ifdef SUPPORT_UTF8
2682     /* UTF-8 mode */
2683     if (utf8)
2684     {
2685 nigel 93 register unsigned int d;
2686 nigel 77 for (i = min; i < max; i++)
2687     {
2688     int len = 1;
2689     if (eptr >= md->end_subject) break;
2690     GETCHARLEN(d, eptr, len);
2691     if (d < 256) d = md->lcc[d];
2692     if (fc == d) break;
2693     eptr += len;
2694     }
2695 nigel 93 if (possessive) continue;
2696     for(;;)
2697 nigel 77 {
2698 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2699 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2700     if (eptr-- == pp) break; /* Stop if tried at original pos */
2701     BACKCHAR(eptr);
2702     }
2703     }
2704     else
2705     #endif
2706     /* Not UTF-8 mode */
2707     {
2708     for (i = min; i < max; i++)
2709     {
2710     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2711     eptr++;
2712     }
2713 nigel 93 if (possessive) continue;
2714 nigel 77 while (eptr >= pp)
2715     {
2716 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2717 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2718     eptr--;
2719     }
2720     }
2721    
2722     RRETURN(MATCH_NOMATCH);
2723     }
2724     /* Control never gets here */
2725     }
2726    
2727     /* Caseful comparisons */
2728    
2729     else
2730     {
2731     #ifdef SUPPORT_UTF8
2732     /* UTF-8 mode */
2733     if (utf8)
2734     {
2735 nigel 93 register unsigned int d;
2736 nigel 77 for (i = 1; i <= min; i++)
2737     {
2738     GETCHARINC(d, eptr);
2739     if (fc == d) RRETURN(MATCH_NOMATCH);
2740     }
2741     }
2742     else
2743     #endif
2744     /* Not UTF-8 mode */
2745     {
2746     for (i = 1; i <= min; i++)
2747     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2748     }
2749    
2750     if (min == max) continue;
2751    
2752     if (minimize)
2753     {
2754     #ifdef SUPPORT_UTF8
2755     /* UTF-8 mode */
2756     if (utf8)
2757     {
2758 nigel 93 register unsigned int d;
2759 nigel 77 for (fi = min;; fi++)
2760     {
2761 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2762 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2763 ph10 366 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2764 nigel 77 GETCHARINC(d, eptr);
2765 ph10 371 if (fc == d) RRETURN(MATCH_NOMATCH);
2766 nigel 77 }
2767     }
2768     else
2769     #endif
2770     /* Not UTF-8 mode */
2771     {
2772     for (fi = min;; fi++)
2773     {
2774 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2775 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2776     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2777     RRETURN(MATCH_NOMATCH);
2778     }
2779     }
2780     /* Control never gets here */
2781     }
2782    
2783     /* Maximize case */
2784    
2785     else
2786     {
2787     pp = eptr;
2788    
2789     #ifdef SUPPORT_UTF8
2790     /* UTF-8 mode */
2791     if (utf8)
2792     {
2793 nigel 93 register unsigned int d;
2794 nigel 77 for (i = min; i < max; i++)
2795     {
2796     int len = 1;
2797     if (eptr >= md->end_subject) break;
2798     GETCHARLEN(d, eptr, len);
2799     if (fc == d) break;
2800     eptr += len;
2801     }
2802 nigel 93 if (possessive) continue;
2803 nigel 77 for(;;)
2804     {
2805 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2806 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2807     if (eptr-- == pp) break; /* Stop if tried at original pos */
2808     BACKCHAR(eptr);
2809     }
2810     }
2811     else
2812     #endif
2813     /* Not UTF-8 mode */
2814     {
2815     for (i = min; i < max; i++)
2816     {
2817     if (eptr >= md->end_subject || fc == *eptr) break;
2818     eptr++;
2819     }
2820 nigel 93 if (possessive) continue;
2821 nigel 77 while (eptr >= pp)
2822     {
2823 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2824 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2825     eptr--;
2826     }
2827     }
2828    
2829     RRETURN(MATCH_NOMATCH);
2830     }
2831     }
2832     /* Control never gets here */
2833    
2834     /* Match a single character type repeatedly; several different opcodes
2835     share code. This is very similar to the code for single characters, but we
2836     repeat it in the interests of efficiency. */
2837    
2838     case OP_TYPEEXACT:
2839     min = max = GET2(ecode, 1);
2840     minimize = TRUE;
2841     ecode += 3;
2842     goto REPEATTYPE;
2843    
2844     case OP_TYPEUPTO:
2845     case OP_TYPEMINUPTO:
2846     min = 0;
2847     max = GET2(ecode, 1);
2848     minimize = *ecode == OP_TYPEMINUPTO;
2849     ecode += 3;
2850     goto REPEATTYPE;
2851    
2852 nigel 93 case OP_TYPEPOSSTAR:
2853     possessive = TRUE;
2854     min = 0;
2855     max = INT_MAX;
2856     ecode++;
2857     goto REPEATTYPE;
2858    
2859     case OP_TYPEPOSPLUS:
2860     possessive = TRUE;
2861     min = 1;
2862     max = INT_MAX;
2863     ecode++;
2864     goto REPEATTYPE;
2865    
2866     case OP_TYPEPOSQUERY:
2867     possessive = TRUE;
2868     min = 0;
2869     max = 1;
2870     ecode++;
2871     goto REPEATTYPE;
2872    
2873     case OP_TYPEPOSUPTO:
2874     possessive = TRUE;
2875     min = 0;
2876     max = GET2(ecode, 1);
2877     ecode += 3;
2878     goto REPEATTYPE;
2879    
2880 nigel 77 case OP_TYPESTAR:
2881     case OP_TYPEMINSTAR:
2882     case OP_TYPEPLUS:
2883     case OP_TYPEMINPLUS:
2884     case OP_TYPEQUERY:
2885     case OP_TYPEMINQUERY:
2886     c = *ecode++ - OP_TYPESTAR;
2887     minimize = (c & 1) != 0;
2888     min = rep_min[c]; /* Pick up values from tables; */
2889     max = rep_max[c]; /* zero for max => infinity */
2890     if (max == 0) max = INT_MAX;
2891    
2892     /* Common code for all repeated single character type matches. Note that
2893     in UTF-8 mode, '.' matches a character of any length, but for the other
2894     character types, the valid characters are all one-byte long. */
2895    
2896     REPEATTYPE:
2897     ctype = *ecode++; /* Code for the character type */
2898    
2899     #ifdef SUPPORT_UCP
2900     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2901     {
2902     prop_fail_result = ctype == OP_NOTPROP;
2903     prop_type = *ecode++;
2904 nigel 87 prop_value = *ecode++;
2905 nigel 77 }
2906     else prop_type = -1;
2907     #endif
2908    
2909     /* First, ensure the minimum number of matches are present. Use inline
2910     code for maximizing the speed, and do the type test once at the start
2911     (i.e. keep it out of the loop). Also we can test that there are at least
2912     the minimum number of bytes before we start. This isn't as effective in
2913     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2914     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2915     and single-bytes. */
2916    
2917     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2918     if (min > 0)
2919     {
2920     #ifdef SUPPORT_UCP
2921 nigel 87 if (prop_type >= 0)
2922 nigel 77 {
2923 nigel 87 switch(prop_type)
2924 nigel 77 {
2925 nigel 87 case PT_ANY:
2926     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2927     for (i = 1; i <= min; i++)
2928     {
2929     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2930 ph10 184 GETCHARINCTEST(c, eptr);
2931 nigel 87 }
2932     break;
2933    
2934     case PT_LAMP:
2935     for (i = 1; i <= min; i++)
2936     {
2937     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2938 ph10 184 GETCHARINCTEST(c, eptr);
2939 ph10 349 prop_chartype = UCD_CHARTYPE(c);
2940 nigel 87 if ((prop_chartype == ucp_Lu ||
2941     prop_chartype == ucp_Ll ||
2942     prop_chartype == ucp_Lt) == prop_fail_result)
2943     RRETURN(MATCH_NOMATCH);
2944     }
2945     break;
2946    
2947     case PT_GC:
2948     for (i = 1; i <= min; i++)
2949     {
2950     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2951 ph10 184 GETCHARINCTEST(c, eptr);
2952 ph10 349 prop_category = UCD_CATEGORY(c);
2953 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
2954     RRETURN(MATCH_NOMATCH);
2955     }
2956     break;
2957    
2958     case PT_PC:
2959     for (i = 1; i <= min; i++)
2960     {
2961     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2962 ph10 184 GETCHARINCTEST(c, eptr);
2963 ph10 349 prop_chartype = UCD_CHARTYPE(c);
2964 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
2965     RRETURN(MATCH_NOMATCH);
2966     }
2967     break;
2968    
2969     case PT_SC:
2970     for (i = 1; i <= min; i++)
2971     {
2972     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2973 ph10 184 GETCHARINCTEST(c, eptr);
2974 ph10 349 prop_script = UCD_SCRIPT(c);
2975 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
2976     RRETURN(MATCH_NOMATCH);
2977     }
2978     break;
2979    
2980     default:
2981     RRETURN(PCRE_ERROR_INTERNAL);
2982 nigel 77 }
2983     }
2984    
2985     /* Match extended Unicode sequences. We will get here only if the
2986     support is in the binary; otherwise a compile-time error occurs. */
2987    
2988     else if (ctype == OP_EXTUNI)
2989     {
2990     for (i = 1; i <= min; i++)
2991     {
2992     GETCHARINCTEST(c, eptr);
2993 ph10 349 prop_category = UCD_CATEGORY(c);
2994 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2995     while (eptr < md->end_subject)
2996     {
2997     int len = 1;
2998     if (!utf8) c = *eptr; else
2999     {
3000     GETCHARLEN(c, eptr, len);
3001     }
3002 ph10 349 prop_category = UCD_CATEGORY(c);
3003 nigel 77 if (prop_category != ucp_M) break;
3004     eptr += len;
3005     }
3006     }
3007     }
3008    
3009     else
3010     #endif /* SUPPORT_UCP */
3011    
3012     /* Handle all other cases when the coding is UTF-8 */
3013    
3014     #ifdef SUPPORT_UTF8
3015     if (utf8) switch(ctype)
3016     {
3017     case OP_ANY:
3018     for (i = 1; i <= min; i++)
3019     {
3020 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr))
3021 nigel 77 RRETURN(MATCH_NOMATCH);
3022 nigel 91 eptr++;
3023 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3024     }
3025     break;
3026    
3027 ph10 341 case OP_ALLANY:
3028     for (i = 1; i <= min; i++)
3029     {
3030     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3031     eptr++;
3032     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3033     }
3034     break;
3035    
3036 nigel 77 case OP_ANYBYTE:
3037     eptr += min;
3038     break;
3039    
3040 nigel 93 case OP_ANYNL:
3041     for (i = 1; i <= min; i++)
3042     {
3043     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3044     GETCHARINC(c, eptr);
3045     switch(c)
3046     {
3047     default: RRETURN(MATCH_NOMATCH);
3048     case 0x000d:
3049     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3050     break;
3051 ph10 231
3052 nigel 93 case 0x000a:
3053 ph10 231 break;
3054    
3055 nigel 93 case 0x000b:
3056     case 0x000c:
3057     case 0x0085:
3058     case 0x2028:
3059     case 0x2029:
3060 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3061 nigel 93 break;
3062     }
3063     }
3064     break;
3065    
3066 ph10 178 case OP_NOT_HSPACE:
3067     for (i = 1; i <= min; i++)
3068     {
3069     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3070     GETCHARINC(c, eptr);
3071     switch(c)
3072     {
3073     default: break;
3074     case 0x09: /* HT */
3075     case 0x20: /* SPACE */
3076     case 0xa0: /* NBSP */
3077     case 0x1680: /* OGHAM SPACE MARK */
3078     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3079     case 0x2000: /* EN QUAD */
3080     case 0x2001: /* EM QUAD */
3081     case 0x2002: /* EN SPACE */
3082     case 0x2003: /* EM SPACE */
3083     case 0x2004: /* THREE-PER-EM SPACE */
3084     case 0x2005: /* FOUR-PER-EM SPACE */
3085     case 0x2006: /* SIX-PER-EM SPACE */
3086     case 0x2007: /* FIGURE SPACE */
3087     case 0x2008: /* PUNCTUATION SPACE */
3088     case 0x2009: /* THIN SPACE */
3089     case 0x200A: /* HAIR SPACE */
3090     case 0x202f: /* NARROW NO-BREAK SPACE */
3091     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3092     case 0x3000: /* IDEOGRAPHIC SPACE */
3093     RRETURN(MATCH_NOMATCH);
3094     }
3095     }
3096     break;
3097 ph10 182
3098 ph10 178 case OP_HSPACE:
3099     for (i = 1; i <= min; i++)
3100     {
3101     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3102     GETCHARINC(c, eptr);
3103     switch(c)
3104     {
3105     default: RRETURN(MATCH_NOMATCH);
3106     case 0x09: /* HT */
3107     case 0x20: /* SPACE */
3108     case 0xa0: /* NBSP */
3109     case 0x1680: /* OGHAM SPACE MARK */
3110     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3111     case 0x2000: /* EN QUAD */
3112     case 0x2001: /* EM QUAD */
3113     case 0x2002: /* EN SPACE */
3114     case 0x2003: /* EM SPACE */
3115     case 0x2004: /* THREE-PER-EM SPACE */
3116     case 0x2005: /* FOUR-PER-EM SPACE */
3117     case 0x2006: /* SIX-PER-EM SPACE */
3118     case 0x2007: /* FIGURE SPACE */
3119     case 0x2008: /* PUNCTUATION SPACE */
3120     case 0x2009: /* THIN SPACE */
3121     case 0x200A: /* HAIR SPACE */
3122     case 0x202f: /* NARROW NO-BREAK SPACE */
3123     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3124     case 0x3000: /* IDEOGRAPHIC SPACE */
3125     break;
3126     }
3127     }
3128     break;
3129 ph10 182
3130 ph10 178 case OP_NOT_VSPACE:
3131     for (i = 1; i <= min; i++)
3132     {
3133     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3134     GETCHARINC(c, eptr);
3135     switch(c)
3136     {
3137     default: break;
3138     case 0x0a: /* LF */
3139     case 0x0b: /* VT */
3140     case 0x0c: /* FF */
3141     case 0x0d: /* CR */
3142     case 0x85: /* NEL */
3143     case 0x2028: /* LINE SEPARATOR */
3144     case 0x2029: /* PARAGRAPH SEPARATOR */
3145     RRETURN(MATCH_NOMATCH);
3146     }
3147     }
3148     break;
3149 ph10 182
3150 ph10 178 case OP_VSPACE:
3151     for (i = 1; i <= min; i++)
3152     {
3153     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3154     GETCHARINC(c, eptr);
3155     switch(c)
3156     {
3157     default: RRETURN(MATCH_NOMATCH);
3158     case 0x0a: /* LF */
3159     case 0x0b: /* VT */
3160     case 0x0c: /* FF */
3161     case 0x0d: /* CR */
3162     case 0x85: /* NEL */
3163     case 0x2028: /* LINE SEPARATOR */
3164     case 0x2029: /* PARAGRAPH SEPARATOR */
3165 ph10 182 break;
3166 ph10 178 }
3167     }
3168     break;
3169    
3170 nigel 77 case OP_NOT_DIGIT:
3171     for (i = 1; i <= min; i++)
3172     {
3173     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3174     GETCHARINC(c, eptr);
3175     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3176     RRETURN(MATCH_NOMATCH);
3177     }
3178     break;
3179    
3180     case OP_DIGIT:
3181     for (i = 1; i <= min; i++)
3182     {
3183     if (eptr >= md->end_subject ||
3184     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3185     RRETURN(MATCH_NOMATCH);
3186     /* No need to skip more bytes - we know it's a 1-byte character */
3187     }
3188     break;
3189    
3190     case OP_NOT_WHITESPACE:
3191     for (i = 1; i <= min; i++)
3192     {
3193     if (eptr >= md->end_subject ||
3194 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3195 nigel 77 RRETURN(MATCH_NOMATCH);
3196 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3197 nigel 77 }
3198     break;
3199    
3200     case OP_WHITESPACE:
3201     for (i = 1; i <= min; i++)
3202     {
3203     if (eptr >= md->end_subject ||
3204     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3205     RRETURN(MATCH_NOMATCH);
3206     /* No need to skip more bytes - we know it's a 1-byte character */
3207     }
3208     break;
3209    
3210     case OP_NOT_WORDCHAR:
3211     for (i = 1; i <= min; i++)
3212     {
3213     if (eptr >= md->end_subject ||
3214 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3215 nigel 77 RRETURN(MATCH_NOMATCH);
3216 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3217 nigel 77 }
3218     break;
3219    
3220     case OP_WORDCHAR:
3221     for (i = 1; i <= min; i++)
3222     {
3223     if (eptr >= md->end_subject ||
3224     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3225     RRETURN(MATCH_NOMATCH);
3226     /* No need to skip more bytes - we know it's a 1-byte character */
3227     }
3228     break;
3229    
3230     default:
3231     RRETURN(PCRE_ERROR_INTERNAL);
3232     } /* End switch(ctype) */
3233    
3234     else
3235     #endif /* SUPPORT_UTF8 */
3236    
3237     /* Code for the non-UTF-8 case for minimum matching of operators other
3238 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3239     number of bytes present, as this was tested above. */
3240 nigel 77
3241     switch(ctype)
3242     {
3243     case OP_ANY:
3244 ph10 342 for (i = 1; i <= min; i++)
3245 nigel 77 {
3246 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3247     eptr++;
3248 nigel 77 }
3249     break;
3250    
3251 ph10 341 case OP_ALLANY:
3252     eptr += min;
3253     break;
3254    
3255 nigel 77 case OP_ANYBYTE:
3256     eptr += min;
3257     break;
3258    
3259 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
3260     bytes are present in this case. */
3261    
3262     case OP_ANYNL:
3263     for (i = 1; i <= min; i++)
3264     {
3265     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3266     switch(*eptr++)
3267     {
3268     default: RRETURN(MATCH_NOMATCH);
3269     case 0x000d:
3270     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3271     break;
3272     case 0x000a:
3273 ph10 231 break;
3274    
3275 nigel 93 case 0x000b:
3276     case 0x000c:
3277     case 0x0085:
3278 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3279 nigel 93 break;
3280     }
3281     }
3282     break;
3283    
3284 ph10 178 case OP_NOT_HSPACE:
3285     for (i = 1; i <= min; i++)
3286     {
3287     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3288     switch(*eptr++)
3289     {
3290     default: break;
3291     case 0x09: /* HT */
3292     case 0x20: /* SPACE */
3293     case 0xa0: /* NBSP */
3294     RRETURN(MATCH_NOMATCH);
3295     }
3296     }
3297     break;
3298    
3299     case OP_HSPACE:
3300     for (i = 1; i <= min; i++)
3301     {
3302     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3303     switch(*eptr++)
3304     {
3305     default: RRETURN(MATCH_NOMATCH);
3306     case 0x09: /* HT */
3307     case 0x20: /* SPACE */
3308     case 0xa0: /* NBSP */
3309 ph10 182 break;
3310 ph10 178 }
3311     }
3312     break;
3313    
3314     case OP_NOT_VSPACE:
3315     for (i = 1; i <= min; i++)
3316     {
3317     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3318     switch(*eptr++)
3319     {
3320     default: break;
3321     case 0x0a: /* LF */
3322     case 0x0b: /* VT */
3323     case 0x0c: /* FF */
3324     case 0x0d: /* CR */
3325     case 0x85: /* NEL */
3326     RRETURN(MATCH_NOMATCH);
3327     }
3328     }
3329     break;
3330    
3331     case OP_VSPACE:
3332     for (i = 1; i <= min; i++)
3333     {
3334     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3335     switch(*eptr++)
3336     {
3337     default: RRETURN(MATCH_NOMATCH);
3338     case 0x0a: /* LF */
3339     case 0x0b: /* VT */
3340     case 0x0c: /* FF */
3341     case 0x0d: /* CR */
3342     case 0x85: /* NEL */
3343 ph10 182 break;
3344 ph10 178 }
3345     }
3346     break;
3347    
3348 nigel 77 case OP_NOT_DIGIT:
3349     for (i = 1; i <= min; i++)
3350     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3351     break;
3352    
3353     case OP_DIGIT:
3354     for (i = 1; i <= min; i++)
3355     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3356     break;
3357    
3358     case OP_NOT_WHITESPACE:
3359     for (i = 1; i <= min; i++)
3360     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3361     break;
3362    
3363     case OP_WHITESPACE:
3364     for (i = 1; i <= min; i++)
3365     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3366     break;
3367    
3368     case OP_NOT_WORDCHAR:
3369     for (i = 1; i <= min; i++)
3370     if ((md->ctypes[*eptr++] & ctype_word) != 0)
3371     RRETURN(MATCH_NOMATCH);
3372     break;
3373    
3374     case OP_WORDCHAR:
3375     for (i = 1; i <= min; i++)
3376     if ((md->ctypes[*eptr++] & ctype_word) == 0)
3377     RRETURN(MATCH_NOMATCH);
3378     break;
3379    
3380     default:
3381     RRETURN(PCRE_ERROR_INTERNAL);
3382     }
3383     }
3384    
3385     /* If min = max, continue at the same level without recursing */
3386    
3387     if (min == max) continue;
3388    
3389     /* If minimizing, we have to test the rest of the pattern before each
3390     subsequent match. Again, separate the UTF-8 case for speed, and also
3391     separate the UCP cases. */
3392    
3393     if (minimize)
3394     {
3395     #ifdef SUPPORT_UCP
3396 nigel 87 if (prop_type >= 0)
3397 nigel 77 {
3398 nigel 87 switch(prop_type)
3399 nigel 77 {
3400 nigel 87 case PT_ANY:
3401     for (fi = min;; fi++)
3402     {
3403 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3404 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3405     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3406     GETCHARINC(c, eptr);
3407     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3408     }
3409 nigel 93 /* Control never gets here */
3410 nigel 87
3411     case PT_LAMP:
3412     for (fi = min;; fi++)
3413     {
3414 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3415 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3416     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3417     GETCHARINC(c, eptr);
3418 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3419 nigel 87 if ((prop_chartype == ucp_Lu ||
3420     prop_chartype == ucp_Ll ||
3421     prop_chartype == ucp_Lt) == prop_fail_result)
3422     RRETURN(MATCH_NOMATCH);
3423     }
3424 nigel 93 /* Control never gets here */
3425 nigel 87
3426     case PT_GC:
3427     for (fi = min;; fi++)
3428     {
3429 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3430 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3431     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3432     GETCHARINC(c, eptr);
3433 ph10 349 prop_category = UCD_CATEGORY(c);
3434 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3435     RRETURN(MATCH_NOMATCH);
3436     }
3437 nigel 93 /* Control never gets here */
3438 nigel 87
3439     case PT_PC:
3440     for (fi = min;; fi++)
3441     {
3442 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3443 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3444     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3445     GETCHARINC(c, eptr);
3446 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3447 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3448     RRETURN(MATCH_NOMATCH);
3449     }
3450 nigel 93 /* Control never gets here */
3451 nigel 87
3452     case PT_SC:
3453     for (fi = min;; fi++)
3454     {
3455 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3456 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3457     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3458     GETCHARINC(c, eptr);
3459 ph10 349 prop_script = UCD_SCRIPT(c);
3460 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3461     RRETURN(MATCH_NOMATCH);
3462     }
3463 nigel 93 /* Control never gets here */
3464 nigel 87
3465     default:
3466     RRETURN(PCRE_ERROR_INTERNAL);
3467 nigel 77 }
3468     }
3469    
3470     /* Match extended Unicode sequences. We will get here only if the
3471     support is in the binary; otherwise a compile-time error occurs. */
3472    
3473     else if (ctype == OP_EXTUNI)
3474     {
3475     for (fi = min;; fi++)
3476     {
3477 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3478 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3479     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3480     GETCHARINCTEST(c, eptr);
3481 ph10 349 prop_category = UCD_CATEGORY(c);
3482 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3483     while (eptr < md->end_subject)
3484     {
3485     int len = 1;
3486     if (!utf8) c = *eptr; else
3487     {
3488     GETCHARLEN(c, eptr, len);
3489     }
3490 ph10 349 prop_category = UCD_CATEGORY(c);
3491 nigel 77 if (prop_category != ucp_M) break;
3492     eptr += len;
3493     }
3494     }
3495     }
3496    
3497     else
3498     #endif /* SUPPORT_UCP */
3499    
3500     #ifdef SUPPORT_UTF8
3501     /* UTF-8 mode */
3502     if (utf8)
3503     {
3504     for (fi = min;; fi++)
3505     {
3506 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3507 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3508 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3509 ph10 342 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3510 nigel 91 RRETURN(MATCH_NOMATCH);
3511 nigel 77
3512     GETCHARINC(c, eptr);
3513     switch(ctype)
3514     {
3515 ph10 342 case OP_ANY: /* This is the non-NL case */
3516 ph10 345 case OP_ALLANY:
3517 nigel 77 case OP_ANYBYTE:
3518     break;
3519    
3520 nigel 93 case OP_ANYNL:
3521     switch(c)
3522     {
3523     default: RRETURN(MATCH_NOMATCH);
3524     case 0x000d:
3525     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3526     break;
3527     case 0x000a:
3528 ph10 231 break;
3529    
3530 nigel 93 case 0x000b:
3531     case 0x000c:
3532     case 0x0085:
3533     case 0x2028:
3534     case 0x2029:
3535 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3536 nigel 93 break;
3537     }
3538     break;
3539    
3540 ph10 178 case OP_NOT_HSPACE:
3541     switch(c)
3542     {
3543     default: break;
3544     case 0x09: /* HT */
3545     case 0x20: /* SPACE */
3546     case 0xa0: /* NBSP */
3547     case 0x1680: /* OGHAM SPACE MARK */
3548     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3549     case 0x2000: /* EN QUAD */
3550     case 0x2001: /* EM QUAD */
3551     case 0x2002: /* EN SPACE */
3552     case 0x2003: /* EM SPACE */
3553     case 0x2004: /* THREE-PER-EM SPACE */
3554     case 0x2005: /* FOUR-PER-EM SPACE */
3555     case 0x2006: /* SIX-PER-EM SPACE */
3556     case 0x2007: /* FIGURE SPACE */
3557     case 0x2008: /* PUNCTUATION SPACE */
3558     case 0x2009: /* THIN SPACE */
3559     case 0x200A: /* HAIR SPACE */
3560     case 0x202f: /* NARROW NO-BREAK SPACE */
3561     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3562     case 0x3000: /* IDEOGRAPHIC SPACE */
3563     RRETURN(MATCH_NOMATCH);
3564     }
3565     break;
3566    
3567     case OP_HSPACE:
3568     switch(c)
3569     {
3570     default: RRETURN(MATCH_NOMATCH);
3571     case 0x09: /* HT */
3572     case 0x20: /* SPACE */
3573     case 0xa0: /* NBSP */
3574     case 0x1680: /* OGHAM SPACE MARK */
3575     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3576     case 0x2000: /* EN QUAD */
3577     case 0x2001: /* EM QUAD */
3578     case 0x2002: /* EN SPACE */
3579     case 0x2003: /* EM SPACE */
3580     case 0x2004: /* THREE-PER-EM SPACE */
3581     case 0x2005: /* FOUR-PER-EM SPACE */
3582     case 0x2006: /* SIX-PER-EM SPACE */
3583     case 0x2007: /* FIGURE SPACE */
3584     case 0x2008: /* PUNCTUATION SPACE */
3585     case 0x2009: /* THIN SPACE */
3586     case 0x200A: /* HAIR SPACE */
3587     case 0x202f: /* NARROW NO-BREAK SPACE */
3588     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3589     case 0x3000: /* IDEOGRAPHIC SPACE */
3590     break;
3591     }
3592     break;
3593    
3594     case OP_NOT_VSPACE:
3595     switch(c)
3596     {
3597     default: break;
3598     case 0x0a: /* LF */
3599     case 0x0b: /* VT */
3600     case 0x0c: /* FF */
3601     case 0x0d: /* CR */
3602     case 0x85: /* NEL */
3603     case 0x2028: /* LINE SEPARATOR */
3604     case 0x2029: /* PARAGRAPH SEPARATOR */
3605     RRETURN(MATCH_NOMATCH);
3606     }
3607     break;
3608    
3609     case OP_VSPACE:
3610     switch(c)
3611     {
3612     default: RRETURN(MATCH_NOMATCH);
3613     case 0x0a: /* LF */
3614     case 0x0b: /* VT */
3615     case 0x0c: /* FF */
3616     case 0x0d: /* CR */
3617     case 0x85: /* NEL */
3618     case 0x2028: /* LINE SEPARATOR */
3619     case 0x2029: /* PARAGRAPH SEPARATOR */
3620     break;
3621     }
3622     break;
3623    
3624 nigel 77 case OP_NOT_DIGIT:
3625     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3626     RRETURN(MATCH_NOMATCH);
3627     break;
3628    
3629     case OP_DIGIT:
3630     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3631     RRETURN(MATCH_NOMATCH);
3632     break;
3633    
3634     case OP_NOT_WHITESPACE:
3635     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3636     RRETURN(MATCH_NOMATCH);
3637     break;
3638    
3639     case OP_WHITESPACE:
3640     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3641     RRETURN(MATCH_NOMATCH);
3642     break;
3643    
3644     case OP_NOT_WORDCHAR:
3645     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3646     RRETURN(MATCH_NOMATCH);
3647     break;
3648    
3649     case OP_WORDCHAR:
3650     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3651     RRETURN(MATCH_NOMATCH);
3652     break;
3653    
3654     default:
3655     RRETURN(PCRE_ERROR_INTERNAL);
3656     }
3657     }
3658     }
3659     else
3660     #endif
3661     /* Not UTF-8 mode */
3662     {
3663     for (fi = min;; fi++)
3664     {
3665 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3666 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3667 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3668 ph10 342 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3669 nigel 91 RRETURN(MATCH_NOMATCH);
3670    
3671 nigel 77 c = *eptr++;
3672     switch(ctype)
3673     {
3674 ph10 342 case OP_ANY: /* This is the non-NL case */
3675 ph10 345 case OP_ALLANY:
3676 nigel 77 case OP_ANYBYTE:
3677     break;
3678    
3679 nigel 93 case OP_ANYNL:
3680     switch(c)
3681     {
3682     default: RRETURN(MATCH_NOMATCH);
3683     case 0x000d:
3684     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3685     break;
3686 ph10 231
3687 nigel 93 case 0x000a:
3688 ph10 231 break;
3689    
3690 nigel 93 case 0x000b:
3691     case 0x000c:
3692     case 0x0085:
3693 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3694 nigel 93 break;
3695     }
3696     break;
3697    
3698 ph10 178 case OP_NOT_HSPACE:
3699     switch(c)
3700     {
3701     default: break;
3702     case 0x09: /* HT */
3703     case 0x20: /* SPACE */
3704     case 0xa0: /* NBSP */
3705     RRETURN(MATCH_NOMATCH);
3706     }
3707     break;
3708    
3709     case OP_HSPACE:
3710     switch(c)
3711     {
3712     default: RRETURN(MATCH_NOMATCH);
3713     case 0x09: /* HT */
3714     case 0x20: /* SPACE */
3715     case 0xa0: /* NBSP */
3716     break;
3717     }
3718     break;
3719    
3720     case OP_NOT_VSPACE:
3721     switch(c)
3722     {
3723     default: break;
3724     case 0x0a: /* LF */
3725     case 0x0b: /* VT */
3726     case 0x0c: /* FF */
3727     case 0x0d: /* CR */
3728     case 0x85: /* NEL */
3729     RRETURN(MATCH_NOMATCH);
3730     }
3731     break;
3732    
3733     case OP_VSPACE:
3734     switch(c)
3735     {
3736     default: RRETURN(MATCH_NOMATCH);
3737     case 0x0a: /* LF */
3738     case 0x0b: /* VT */
3739     case 0x0c: /* FF */
3740     case 0x0d: /* CR */
3741     case 0x85: /* NEL */
3742     break;
3743     }
3744     break;
3745    
3746 nigel 77 case OP_NOT_DIGIT:
3747     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3748     break;
3749    
3750     case OP_DIGIT:
3751     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3752     break;
3753    
3754     case OP_NOT_WHITESPACE:
3755     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3756     break;
3757    
3758     case OP_WHITESPACE:
3759     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3760     break;
3761    
3762     case OP_NOT_WORDCHAR:
3763     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3764     break;
3765    
3766     case OP_WORDCHAR:
3767     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3768     break;
3769    
3770     default:
3771     RRETURN(PCRE_ERROR_INTERNAL);
3772     }
3773     }
3774     }
3775     /* Control never gets here */
3776     }
3777    
3778 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3779 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3780     UTF-8 and UCP stuff separate. */
3781    
3782     else
3783     {
3784     pp = eptr; /* Remember where we started */
3785    
3786     #ifdef SUPPORT_UCP
3787 nigel 87 if (prop_type >= 0)
3788 nigel 77 {
3789 nigel 87 switch(prop_type)
3790 nigel 77 {
3791 nigel 87 case PT_ANY:
3792     for (i = min; i < max; i++)
3793     {
3794     int len = 1;
3795     if (eptr >= md->end_subject) break;
3796     GETCHARLEN(c, eptr, len);
3797     if (prop_fail_result) break;
3798     eptr+= len;
3799     }
3800     break;
3801    
3802     case PT_LAMP:
3803     for (i = min; i < max; i++)
3804     {
3805     int len = 1;
3806     if (eptr >= md->end_subject) break;
3807     GETCHARLEN(c, eptr, len);
3808 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3809 nigel 87 if ((prop_chartype == ucp_Lu ||
3810     prop_chartype == ucp_Ll ||
3811     prop_chartype == ucp_Lt) == prop_fail_result)
3812     break;
3813     eptr+= len;
3814     }
3815     break;
3816    
3817     case PT_GC:
3818     for (i = min; i < max; i++)
3819     {
3820     int len = 1;
3821     if (eptr >= md->end_subject) break;
3822     GETCHARLEN(c, eptr, len);
3823 ph10 349 prop_category = UCD_CATEGORY(c);
3824 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3825     break;
3826     eptr+= len;
3827     }
3828     break;
3829    
3830     case PT_PC:
3831     for (i = min; i < max; i++)
3832     {
3833     int len = 1;
3834     if (eptr >= md->end_subject) break;
3835     GETCHARLEN(c, eptr, len);
3836 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3837 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3838     break;
3839     eptr+= len;
3840     }
3841     break;
3842    
3843     case PT_SC:
3844     for (i = min; i < max; i++)
3845     {
3846     int len = 1;
3847     if (eptr >= md->end_subject) break;
3848     GETCHARLEN(c, eptr, len);
3849 ph10 349 prop_script = UCD_SCRIPT(c);
3850 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3851     break;
3852     eptr+= len;
3853     }
3854     break;
3855 nigel 77 }
3856    
3857     /* eptr is now past the end of the maximum run */
3858    
3859 nigel 93 if (possessive) continue;
3860 nigel 77 for(;;)
3861     {
3862 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3863 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3864     if (eptr-- == pp) break; /* Stop if tried at original pos */
3865 ph10 207 if (utf8) BACKCHAR(eptr);
3866 nigel 77 }
3867     }
3868    
3869     /* Match extended Unicode sequences. We will get here only if the
3870     support is in the binary; otherwise a compile-time error occurs. */
3871    
3872     else if (ctype == OP_EXTUNI)
3873     {
3874     for (i = min; i < max; i++)
3875     {
3876     if (eptr >= md->end_subject) break;
3877     GETCHARINCTEST(c, eptr);
3878 ph10 349 prop_category = UCD_CATEGORY(c);
3879 nigel 77 if (prop_category == ucp_M) break;
3880     while (eptr < md->end_subject)
3881     {
3882     int len = 1;
3883     if (!utf8) c = *eptr; else
3884     {
3885     GETCHARLEN(c, eptr, len);
3886     }
3887 ph10 349 prop_category = UCD_CATEGORY(c);
3888 nigel 77 if (prop_category != ucp_M) break;
3889     eptr += len;
3890     }
3891     }
3892    
3893     /* eptr is now past the end of the maximum run */
3894    
3895 nigel 93 if (possessive) continue;
3896 nigel 77 for(;;)
3897     {
3898 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3899 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3900     if (eptr-- == pp) break; /* Stop if tried at original pos */
3901     for (;;) /* Move back over one extended */
3902     {
3903     int len = 1;
3904     if (!utf8) c = *eptr; else
3905     {
3906 ph10 207 BACKCHAR(eptr);
3907 nigel 77 GETCHARLEN(c, eptr, len);
3908     }
3909 ph10 349 prop_category = UCD_CATEGORY(c);
3910 nigel 77 if (prop_category != ucp_M) break;
3911     eptr--;
3912     }
3913     }
3914     }
3915    
3916     else
3917     #endif /* SUPPORT_UCP */
3918    
3919     #ifdef SUPPORT_UTF8
3920     /* UTF-8 mode */
3921    
3922     if (utf8)
3923     {
3924     switch(ctype)
3925     {
3926     case OP_ANY:
3927     if (max < INT_MAX)
3928     {
3929 ph10 342 for (i = min; i < max; i++)
3930 nigel 77 {
3931 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3932     eptr++;
3933     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3934 nigel 77 }
3935     }
3936    
3937     /* Handle unlimited UTF-8 repeat (currently the same loop as the limited case above) */
3938    
3939     else
3940     {
3941 ph10 342 for (i = min; i < max; i++)
3942 nigel 77 {
3943 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3944     eptr++;
3945     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3946 nigel 77 }
3947     }
3948     break;
3949    
3950 ph10 341 case OP_ALLANY:
3951     if (max < INT_MAX)
3952     {
3953     for (i = min; i < max; i++)
3954     {
3955     if (eptr >= md->end_subject) break;
3956     eptr++;
3957     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3958     }
3959     }
3960     else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3961     break;
3962    
3963 nigel 77 /* The byte case is the same as non-UTF8 */
3964    
3965     case OP_ANYBYTE:
3966     c = max - min;
3967 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3968     c = md->end_subject - eptr;
3969 nigel 77 eptr += c;
3970     break;
3971    
3972 nigel 93 case OP_ANYNL:
3973     for (i = min; i < max; i++)
3974     {
3975     int len = 1;
3976     if (eptr >= md->end_subject) break;
3977     GETCHARLEN(c, eptr, len);
3978     if (c == 0x000d)
3979     {
3980     if (++eptr >= md->end_subject) break;
3981     if (*eptr == 0x000a) eptr++;
3982     }
3983     else
3984     {
3985 ph10 231 if (c != 0x000a &&
3986     (md->bsr_anycrlf ||
3987     (c != 0x000b && c != 0x000c &&
3988     c != 0x0085 && c != 0x2028 && c != 0x2029)))
3989 nigel 93 break;
3990     eptr += len;
3991     }
3992     }
3993     break;
3994    
3995 ph10 178 case OP_NOT_HSPACE:
3996 ph10 182 case OP_HSPACE:
3997 ph10 178 for (i = min; i < max; i++)
3998     {
3999 ph10 182 BOOL gotspace;
4000 ph10 178 int len = 1;
4001     if (eptr >= md->end_subject) break;
4002     GETCHARLEN(c, eptr, len);
4003     switch(c)
4004 ph10 182 {
4005     default: gotspace = FALSE; break;
4006 ph10 178 case 0x09: /* HT */
4007     case 0x20: /* SPACE */
4008     case 0xa0: /* NBSP */
4009     case 0x1680: /* OGHAM SPACE MARK */
4010     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4011     case 0x2000: /* EN QUAD */
4012     case 0x2001: /* EM QUAD */
4013     case 0x2002: /* EN SPACE */
4014     case 0x2003: /* EM SPACE */
4015     case 0x2004: /* THREE-PER-EM SPACE */
4016     case 0x2005: /* FOUR-PER-EM SPACE */
4017     case 0x2006: /* SIX-PER-EM SPACE */
4018     case 0x2007: /* FIGURE SPACE */
4019     case 0x2008: /* PUNCTUATION SPACE */
4020     case 0x2009: /* THIN SPACE */
4021     case 0x200A: /* HAIR SPACE */
4022     case 0x202f: /* NARROW NO-BREAK SPACE */
4023     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4024     case 0x3000: /* IDEOGRAPHIC SPACE */
4025     gotspace = TRUE;
4026 ph10 182 break;
4027 ph10 178 }
4028     if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4029     eptr += len;
4030     }
4031     break;
4032    
4033     case OP_NOT_VSPACE:
4034 ph10 182 case OP_VSPACE:
4035 ph10 178 for (i = min; i < max; i++)
4036     {
4037 ph10 182 BOOL gotspace;
4038 ph10 178 int len = 1;
4039     if (eptr >= md->end_subject) break;
4040     GETCHARLEN(c, eptr, len);
4041     switch(c)
4042     {
4043 ph10 182 default: gotspace = FALSE; break;
4044 ph10 178 case 0x0a: /* LF */
4045     case 0x0b: /* VT */
4046     case 0x0c: /* FF */
4047     case 0x0d: /* CR */
4048     case 0x85: /* NEL */
4049     case 0x2028: /* LINE SEPARATOR */
4050     case 0x2029: /* PARAGRAPH SEPARATOR */
4051     gotspace = TRUE;
4052     break;
4053     }
4054 ph10 182 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4055 ph10 178 eptr += len;
4056     }
4057     break;
4058    
4059 nigel 77 case OP_NOT_DIGIT:
4060     for (i = min; i < max; i++)
4061     {
4062     int len = 1;
4063     if (eptr >= md->end_subject) break;
4064     GETCHARLEN(c, eptr, len);
4065     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4066     eptr+= len;
4067     }
4068     break;
4069    
4070     case OP_DIGIT:
4071     for (i = min; i < max; i++)
4072     {
4073     int len = 1;
4074     if (eptr >= md->end_subject) break;
4075