/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 392 - (hide annotations) (download)
Tue Mar 17 21:30:30 2009 UTC (5 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 152857 byte(s)
Update after detrailing for a test release.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 381 Copyright (c) 1997-2009 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 nigel 77 /* Flag bits for the match() function */
61    
62 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
63     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64 nigel 77
65     /* Non-error returns from the match() function. Error returns are externally
66     defined PCRE_ERROR_xxx codes, which are all negative. */
67    
68     #define MATCH_MATCH 1
69     #define MATCH_NOMATCH 0
70    
71 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
72 ph10 210 negative to avoid the external error codes. */
73    
74     #define MATCH_COMMIT (-999)
75     #define MATCH_PRUNE (-998)
76     #define MATCH_SKIP (-997)
77     #define MATCH_THEN (-996)
78    
79 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
80     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81     because the offset vector is always a multiple of 3 long. */
82    
83     #define REC_STACK_SAVE_MAX 30
84    
/* Minimum and maximum repeat counts for the common repeats, held as two
parallel six-entry tables. In rep_max, an entry of 0 means "no upper limit".
NOTE(review): presumably indexed by the offset of a repeat opcode from the
first of the six simple repeat opcodes - confirm at the use sites. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,     /* unlimited */
  0, 0,     /* unlimited */
  1, 1
};
89    
90    
91    
92     #ifdef DEBUG
93     /*************************************************
94     * Debugging function to print chars *
95     *************************************************/
96    
97     /* Print a sequence of chars in printable format, stopping at the end of the
98     subject if the requested.
99    
100     Arguments:
101     p points to characters
102     length number to print
103     is_subject TRUE if printing from within md->start_subject
104     md pointer to matching data block, if is_subject is TRUE
105    
106     Returns: nothing
107     */
108    
109     static void
110     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111     {
112 nigel 93 unsigned int c;
113 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114     while (length-- > 0)
115     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116     }
117     #endif
118    
119    
120    
121     /*************************************************
122     * Match a back-reference *
123     *************************************************/
124    
125     /* If a back reference hasn't been set, the length that is passed is greater
126     than the number of characters left in the string, so the match fails.
127    
128     Arguments:
129     offset index into the offset vector
130     eptr points into the subject
131     length length to be matched
132     md points to match data block
133     ims the ims flags
134    
135     Returns: TRUE if matched
136     */
137    
138     static BOOL
139 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 nigel 77 unsigned long int ims)
141     {
142 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
143 nigel 77
144     #ifdef DEBUG
145     if (eptr >= md->end_subject)
146     printf("matching subject <null>");
147     else
148     {
149     printf("matching subject ");
150     pchars(eptr, length, TRUE, md);
151     }
152     printf(" against backref ");
153     pchars(p, length, FALSE, md);
154     printf("\n");
155     #endif
156    
157     /* Always fail if not enough characters left */
158    
159     if (length > md->end_subject - eptr) return FALSE;
160    
161 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162     properly if Unicode properties are supported. Otherwise, we can check only
163     ASCII characters. */
164 nigel 77
165     if ((ims & PCRE_CASELESS) != 0)
166     {
167 ph10 354 #ifdef SUPPORT_UTF8
168     #ifdef SUPPORT_UCP
169     if (md->utf8)
170     {
171 ph10 358 USPTR endptr = eptr + length;
172 ph10 354 while (eptr < endptr)
173     {
174 ph10 358 int c, d;
175 ph10 354 GETCHARINC(c, eptr);
176     GETCHARINC(d, p);
177     if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 ph10 358 }
179     }
180 ph10 354 else
181     #endif
182     #endif
183    
184     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185     is no UCP support. */
186 ph10 358
187 nigel 77 while (length-- > 0)
188 ph10 354 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 nigel 77 }
190 ph10 358
191 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
192     are in UTF-8 mode. */
193 ph10 358
194 nigel 77 else
195     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196    
197     return TRUE;
198     }
199    
200    
201    
202     /***************************************************************************
203     ****************************************************************************
204     RECURSION IN THE match() FUNCTION
205    
206 nigel 87 The match() function is highly recursive, though not every recursive call
207     increases the recursive depth. Nevertheless, some regular expressions can cause
208     it to recurse to a great depth. I was writing for Unix, so I just let it call
209     itself recursively. This uses the stack for saving everything that has to be
210     saved for a recursive call. On Unix, the stack can be large, and this works
211     fine.
212 nigel 77
213 nigel 87 It turns out that on some non-Unix-like systems there are problems with
214     programs that use a lot of stack. (This despite the fact that every last chip
215     has oodles of memory these days, and techniques for extending the stack have
216     been known for decades.) So....
217 nigel 77
218     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219     calls by keeping local variables that need to be preserved in blocks of memory
220 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
221 nigel 77 achieve this so that the actual code doesn't look very different to what it
222     always used to.
223 ph10 164
224 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
225 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
226     Switzer, the use of longjmp() has been abolished, at the cost of having to
227     provide a unique number for each call to RMATCH. There is no way of generating
228     a sequence of numbers at compile time in C. I have given them names, to make
229     them stand out more clearly.
230    
231     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
234     don't have indeterminate values; this has meant that the frame size can be
235 ph10 164 reduced because the result can be "passed back" by straight setting of the
236     variable instead of being passed in the frame.
237 nigel 77 ****************************************************************************
238     ***************************************************************************/
239    
/* Identifying numbers for the individual RMATCH call sites, running
consecutively from 1. Whenever an entry is added or removed here, the code at
HEAP_RETURN (further down) has to be changed to match. */

enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18,
  RM19, RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27,
  RM28, RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36,
  RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45,
  RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54
};
249 ph10 164
250 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
251 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 ph10 164 actually used in this definition. */
253 nigel 77
254     #ifndef NO_RECURSE
255     #define REGISTER register
256 ph10 164
#ifdef DEBUG

/* Tracing flavour: report the source line before and after each recursive
call of match(), and the value and line of every return. The last macro
argument (the RMn call-site number) is accepted but not used. */

#define RMATCH(m_eptr,m_ecode,m_top,m_md,m_ims,m_eptrb,m_flags,m_where) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(m_eptr,m_ecode,mstart,m_top,m_md,m_ims,m_eptrb,m_flags,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

#define RRETURN(m_value) \
  { \
  printf("match() returned %d from line %d ", m_value, __LINE__); \
  return m_value; \
  }

#else  /* DEBUG not defined */

/* Production flavour: a plain recursive call that leaves its result in rrc,
and a plain return. The RMn call-site number is again unused. */

#define RMATCH(m_eptr,m_ecode,m_top,m_md,m_ims,m_eptrb,m_flags,m_where) \
  rrc = match(m_eptr,m_ecode,mstart,m_top,m_md,m_ims,m_eptrb,m_flags,rdepth+1)

#define RRETURN(m_value) return m_value

#endif  /* DEBUG */
274    
275 nigel 77 #else
276    
277    
278 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
279     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280     argument of match(), which never changes. */
281 nigel 77
282     #define REGISTER
283    
/* Heap-based replacement for a recursive call of match(). A new frame is
obtained from pcre_stack_malloc(), loaded with the values that would have been
the arguments of the call, and chained to the current frame; control then jumps
to HEAP_RECURSE. The call-site number rw is stored in the calling frame and is
also used to build the label L_<rw>, which marks the point where execution
resumes after the "call". The rd argument (md) is not used.

If the frame cannot be allocated, the current incarnation of match() gives up
with PCRE_ERROR_NOMEMORY via RRETURN, instead of writing through a NULL
pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
303    
/* Heap-based replacement for a return from match(). The current frame is
unchained and handed back to pcre_stack_free(). If that was the top-level
frame (no previous frame), this is a genuine return from the function;
otherwise the value is left in rrc and control goes to HEAP_RETURN. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
316    
317    
318     /* Structure for remembering the local variables in a private frame */
319    
320     typedef struct heapframe {
321     struct heapframe *Xprevframe;
322    
323     /* Function arguments that may change */
324    
325     const uschar *Xeptr;
326     const uschar *Xecode;
327 ph10 172 const uschar *Xmstart;
328 nigel 77 int Xoffset_top;
329     long int Xims;
330     eptrblock *Xeptrb;
331     int Xflags;
332 nigel 91 unsigned int Xrdepth;
333 nigel 77
334     /* Function local variables */
335    
336     const uschar *Xcallpat;
337     const uschar *Xcharptr;
338     const uschar *Xdata;
339     const uschar *Xnext;
340     const uschar *Xpp;
341     const uschar *Xprev;
342     const uschar *Xsaved_eptr;
343    
344     recursion_info Xnew_recursive;
345    
346     BOOL Xcur_is_word;
347     BOOL Xcondition;
348     BOOL Xprev_is_word;
349    
350     unsigned long int Xoriginal_ims;
351    
352     #ifdef SUPPORT_UCP
353     int Xprop_type;
354 nigel 87 int Xprop_value;
355 nigel 77 int Xprop_fail_result;
356     int Xprop_category;
357     int Xprop_chartype;
358 nigel 87 int Xprop_script;
359 ph10 123 int Xoclength;
360     uschar Xocchars[8];
361 nigel 77 #endif
362    
363     int Xctype;
364 nigel 93 unsigned int Xfc;
365 nigel 77 int Xfi;
366     int Xlength;
367     int Xmax;
368     int Xmin;
369     int Xnumber;
370     int Xoffset;
371     int Xop;
372     int Xsave_capture_last;
373     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
374     int Xstacksave[REC_STACK_SAVE_MAX];
375    
376     eptrblock Xnewptrb;
377    
378 ph10 164 /* Where to jump back to */
379 nigel 77
380 ph10 164 int Xwhere;
381 ph10 165
382 nigel 77 } heapframe;
383    
384     #endif
385    
386    
387     /***************************************************************************
388     ***************************************************************************/
389    
390    
391    
392     /*************************************************
393     * Match from current position *
394     *************************************************/
395    
396 nigel 93 /* This function is called recursively in many circumstances. Whenever it
397 nigel 77 returns a negative (error) response, the outer incarnation must also return the
398     same response.
399    
400     Performance note: It might be tempting to extract commonly used fields from the
401     md structure (e.g. utf8, end_subject) into individual variables to improve
402     performance. Tests using gcc on a SPARC disproved this; in the first case, it
403     made performance worse.
404    
405     Arguments:
406 nigel 93 eptr pointer to current character in subject
407     ecode pointer to current position in compiled code
408 ph10 168 mstart pointer to the current match start position (can be modified
409 ph10 172 by encountering \K)
410 nigel 77 offset_top current top pointer
411     md pointer to "static" info for the match
412     ims current /i, /m, and /s options
413     eptrb pointer to chain of blocks containing eptr at start of
414     brackets - for testing for empty matches
415     flags can contain
416     match_condassert - this is an assertion condition
417 nigel 93 match_cbegroup - this is the start of an unlimited repeat
418     group that can match an empty string
419 nigel 87 rdepth the recursion depth
420 nigel 77
421     Returns: MATCH_MATCH if matched ) these values are >= 0
422     MATCH_NOMATCH if failed to match )
423     a negative PCRE_ERROR_xxx value if aborted by an error condition
424 nigel 87 (e.g. stopped by repeated call or recursion limit)
425 nigel 77 */
426    
427     static int
428 ph10 172 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
429 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
430 nigel 91 int flags, unsigned int rdepth)
431 nigel 77 {
432     /* These variables do not need to be preserved over recursion in this function,
433 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
434     "register" because they are used a lot in loops. */
435 nigel 77
436 nigel 91 register int rrc; /* Returns from recursive calls */
437     register int i; /* Used for loops not involving calls to RMATCH() */
438 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
439 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
440 nigel 77
441 nigel 93 BOOL minimize, possessive; /* Quantifier options */
442    
443 nigel 77 /* When recursion is not being used, all "local" variables that have to be
444     preserved over calls to RMATCH() are part of a "frame" which is obtained from
445     heap storage. Set up the top-level frame here; others are obtained from the
446     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
447    
448     #ifdef NO_RECURSE
449     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
450     frame->Xprevframe = NULL; /* Marks the top level */
451    
452     /* Copy in the original argument variables */
453    
454     frame->Xeptr = eptr;
455     frame->Xecode = ecode;
456 ph10 168 frame->Xmstart = mstart;
457 nigel 77 frame->Xoffset_top = offset_top;
458     frame->Xims = ims;
459     frame->Xeptrb = eptrb;
460     frame->Xflags = flags;
461 nigel 87 frame->Xrdepth = rdepth;
462 nigel 77
463     /* This is where control jumps back to to effect "recursion" */
464    
465     HEAP_RECURSE:
466    
467     /* Macros make the argument variables come from the current frame */
468    
469     #define eptr frame->Xeptr
470     #define ecode frame->Xecode
471 ph10 168 #define mstart frame->Xmstart
472 nigel 77 #define offset_top frame->Xoffset_top
473     #define ims frame->Xims
474     #define eptrb frame->Xeptrb
475     #define flags frame->Xflags
476 nigel 87 #define rdepth frame->Xrdepth
477 nigel 77
478     /* Ditto for the local variables */
479    
480     #ifdef SUPPORT_UTF8
481     #define charptr frame->Xcharptr
482     #endif
483     #define callpat frame->Xcallpat
484     #define data frame->Xdata
485     #define next frame->Xnext
486     #define pp frame->Xpp
487     #define prev frame->Xprev
488     #define saved_eptr frame->Xsaved_eptr
489    
490     #define new_recursive frame->Xnew_recursive
491    
492     #define cur_is_word frame->Xcur_is_word
493     #define condition frame->Xcondition
494     #define prev_is_word frame->Xprev_is_word
495    
496     #define original_ims frame->Xoriginal_ims
497    
498     #ifdef SUPPORT_UCP
499     #define prop_type frame->Xprop_type
500 nigel 87 #define prop_value frame->Xprop_value
501 nigel 77 #define prop_fail_result frame->Xprop_fail_result
502     #define prop_category frame->Xprop_category
503     #define prop_chartype frame->Xprop_chartype
504 nigel 87 #define prop_script frame->Xprop_script
505 ph10 115 #define oclength frame->Xoclength
506     #define occhars frame->Xocchars
507 nigel 77 #endif
508    
509     #define ctype frame->Xctype
510     #define fc frame->Xfc
511     #define fi frame->Xfi
512     #define length frame->Xlength
513     #define max frame->Xmax
514     #define min frame->Xmin
515     #define number frame->Xnumber
516     #define offset frame->Xoffset
517     #define op frame->Xop
518     #define save_capture_last frame->Xsave_capture_last
519     #define save_offset1 frame->Xsave_offset1
520     #define save_offset2 frame->Xsave_offset2
521     #define save_offset3 frame->Xsave_offset3
522     #define stacksave frame->Xstacksave
523    
524     #define newptrb frame->Xnewptrb
525    
526     /* When recursion is being used, local variables are allocated on the stack and
527     get preserved during recursion in the normal way. In this environment, fi and
528     i, and fc and c, can be the same variables. */
529    
530 nigel 93 #else /* NO_RECURSE not defined */
531 nigel 77 #define fi i
532     #define fc c
533    
534    
535 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
536     const uschar *charptr; /* in small blocks of the code. My normal */
537     #endif /* style of coding would have declared */
538     const uschar *callpat; /* them within each of those blocks. */
539     const uschar *data; /* However, in order to accommodate the */
540     const uschar *next; /* version of this code that uses an */
541     USPTR pp; /* external "stack" implemented on the */
542     const uschar *prev; /* heap, it is easier to declare them all */
543     USPTR saved_eptr; /* here, so the declarations can be cut */
544     /* out in a block. The only declarations */
545     recursion_info new_recursive; /* within blocks below are for variables */
546     /* that do not have to be preserved over */
547     BOOL cur_is_word; /* a recursive call to RMATCH(). */
548     BOOL condition;
549 nigel 77 BOOL prev_is_word;
550    
551     unsigned long int original_ims;
552    
553     #ifdef SUPPORT_UCP
554     int prop_type;
555 nigel 87 int prop_value;
556 nigel 77 int prop_fail_result;
557     int prop_category;
558     int prop_chartype;
559 nigel 87 int prop_script;
560 ph10 115 int oclength;
561     uschar occhars[8];
562 nigel 77 #endif
563    
564     int ctype;
565     int length;
566     int max;
567     int min;
568     int number;
569     int offset;
570     int op;
571     int save_capture_last;
572     int save_offset1, save_offset2, save_offset3;
573     int stacksave[REC_STACK_SAVE_MAX];
574    
575     eptrblock newptrb;
576 nigel 93 #endif /* NO_RECURSE */
577 nigel 77
578     /* These statements are here to stop the compiler complaining about uninitialized
579     variables. */
580    
581     #ifdef SUPPORT_UCP
582 nigel 87 prop_value = 0;
583 nigel 77 prop_fail_result = 0;
584     #endif
585    
586 nigel 93
587 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
588     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
589     used. Thanks to Ian Taylor for noticing this possibility and sending the
590     original patch. */
591    
592     TAIL_RECURSE:
593    
594 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
595     are specified by the macro RMATCH and RRETURN is used to return. When
596     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
597     and a "return", respectively (possibly with some debugging if DEBUG is
598     defined). However, RMATCH isn't like a function call because it's quite a
599     complicated macro. It has to be used in one particular way. This shouldn't,
600     however, impact performance when true recursion is being used. */
601 nigel 77
602 ph10 164 #ifdef SUPPORT_UTF8
603     utf8 = md->utf8; /* Local copy of the flag */
604     #else
605     utf8 = FALSE;
606     #endif
607    
608 nigel 87 /* First check that we haven't called match() too many times, or that we
609     haven't exceeded the recursive call limit. */
610    
611 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
612 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
613 nigel 77
614     original_ims = ims; /* Save for resetting on ')' */
615 nigel 91
616 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
617     string, the match_cbegroup flag is set. When this is the case, add the current
618     subject pointer to the chain of such remembered pointers, to be checked when we
619     hit the closing ket, in order to break infinite loops that match no characters.
620 ph10 197 When match() is called in other circumstances, don't add to the chain. The
621     match_cbegroup flag must NOT be used with tail recursion, because the memory
622     block that is used is on the stack, so a new one may be required for each
623     match(). */
624 nigel 77
625 nigel 93 if ((flags & match_cbegroup) != 0)
626 nigel 77 {
627 ph10 197 newptrb.epb_saved_eptr = eptr;
628     newptrb.epb_prev = eptrb;
629     eptrb = &newptrb;
630 nigel 77 }
631    
632 nigel 93 /* Now start processing the opcodes. */
633 nigel 77
634     for (;;)
635     {
636 nigel 93 minimize = possessive = FALSE;
637 nigel 77 op = *ecode;
638    
639     /* For partial matching, remember if we ever hit the end of the subject after
640     matching at least one subject character. */
641    
642     if (md->partial &&
643     eptr >= md->end_subject &&
644 ph10 168 eptr > mstart)
645 nigel 77 md->hitend = TRUE;
646 ph10 208
647 nigel 93 switch(op)
648     {
649 ph10 210 case OP_FAIL:
650 ph10 212 RRETURN(MATCH_NOMATCH);
651 ph10 211
652 ph10 210 case OP_PRUNE:
653     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
654     ims, eptrb, flags, RM51);
655     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
656 ph10 212 RRETURN(MATCH_PRUNE);
657 ph10 211
658 ph10 210 case OP_COMMIT:
659     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
660     ims, eptrb, flags, RM52);
661     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
662 ph10 212 RRETURN(MATCH_COMMIT);
663 ph10 211
664 ph10 210 case OP_SKIP:
665     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
666     ims, eptrb, flags, RM53);
667     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
668 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
669 ph10 212 RRETURN(MATCH_SKIP);
670 ph10 211
671 ph10 210 case OP_THEN:
672     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
673 ph10 212 ims, eptrb, flags, RM54);
674 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
675 ph10 212 RRETURN(MATCH_THEN);
676 ph10 211
677 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
678     the current subject position in the working slot at the top of the vector.
679     We mustn't change the current values of the data slot, because they may be
680     set from a previous iteration of this group, and be referred to by a
681     reference inside the group.
682 nigel 77
683 nigel 93 If the bracket fails to match, we need to restore this value and also the
684     values of the final offsets, in case they were set by a previous iteration
685     of the same bracket.
686 nigel 77
687 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
688     a non-capturing bracket. Don't worry about setting the flag for the error
689     case here; that is handled in the code for KET. */
690 nigel 77
691 nigel 93 case OP_CBRA:
692     case OP_SCBRA:
693     number = GET2(ecode, 1+LINK_SIZE);
694 nigel 77 offset = number << 1;
695    
696     #ifdef DEBUG
697 nigel 93 printf("start bracket %d\n", number);
698     printf("subject=");
699 nigel 77 pchars(eptr, 16, TRUE, md);
700     printf("\n");
701     #endif
702    
703     if (offset < md->offset_max)
704     {
705     save_offset1 = md->offset_vector[offset];
706     save_offset2 = md->offset_vector[offset+1];
707     save_offset3 = md->offset_vector[md->offset_end - number];
708     save_capture_last = md->capture_last;
709    
710     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
711     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
712    
713 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
714 nigel 77 do
715     {
716 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
717     ims, eptrb, flags, RM1);
718 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
719 nigel 77 md->capture_last = save_capture_last;
720     ecode += GET(ecode, 1);
721     }
722     while (*ecode == OP_ALT);
723    
724     DPRINTF(("bracket %d failed\n", number));
725    
726     md->offset_vector[offset] = save_offset1;
727     md->offset_vector[offset+1] = save_offset2;
728     md->offset_vector[md->offset_end - number] = save_offset3;
729    
730     RRETURN(MATCH_NOMATCH);
731     }
732    
733 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
734     as a non-capturing bracket. */
735 nigel 77
736 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
737     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
738    
739 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
740 nigel 77
741 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
742     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
743    
744 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
745     final alternative within the brackets, we would return the result of a
746     recursive call to match() whatever happened. We can reduce stack usage by
747 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
748     is set.*/
749 nigel 77
750 nigel 93 case OP_BRA:
751     case OP_SBRA:
752     DPRINTF(("start non-capturing bracket\n"));
753     flags = (op >= OP_SBRA)? match_cbegroup : 0;
754 nigel 91 for (;;)
755 nigel 77 {
756 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
757 nigel 93 {
758 ph10 197 if (flags == 0) /* Not a possibly empty group */
759     {
760     ecode += _pcre_OP_lengths[*ecode];
761     DPRINTF(("bracket 0 tail recursion\n"));
762     goto TAIL_RECURSE;
763     }
764    
765     /* Possibly empty group; can't use tail recursion. */
766    
767     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
768     eptrb, flags, RM48);
769     RRETURN(rrc);
770 nigel 93 }
771 nigel 91
772     /* For non-final alternatives, continue the loop for a NOMATCH result;
773     otherwise return. */
774    
775 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
776     eptrb, flags, RM2);
777 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
778 nigel 77 ecode += GET(ecode, 1);
779     }
780 nigel 91 /* Control never reaches here. */
781 nigel 77
782     /* Conditional group: compilation checked that there are no more than
783     two branches. If the condition is false, skipping the first branch takes us
784     past the end if there is only one branch, but that's OK because that is
785 nigel 91 exactly what going to the ket would do. As there is only one branch to be
786     obeyed, we can use tail recursion to avoid using another stack frame. */
787 nigel 77
788     case OP_COND:
789 nigel 93 case OP_SCOND:
790 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
791     inserted between OP_COND and an assertion condition. */
792 ph10 392
793 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
794     {
795     if (pcre_callout != NULL)
796     {
797     pcre_callout_block cb;
798     cb.version = 1; /* Version 1 of the callout block */
799     cb.callout_number = ecode[LINK_SIZE+2];
800     cb.offset_vector = md->offset_vector;
801     cb.subject = (PCRE_SPTR)md->start_subject;
802     cb.subject_length = md->end_subject - md->start_subject;
803     cb.start_match = mstart - md->start_subject;
804     cb.current_position = eptr - md->start_subject;
805     cb.pattern_position = GET(ecode, LINK_SIZE + 3);
806     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
807     cb.capture_top = offset_top/2;
808     cb.capture_last = md->capture_last;
809     cb.callout_data = md->callout_data;
810     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
811     if (rrc < 0) RRETURN(rrc);
812     }
813     ecode += _pcre_OP_lengths[OP_CALLOUT];
814     }
815 ph10 392
816 ph10 381 /* Now see what the actual condition is */
817 ph10 392
818 nigel 93 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
819 nigel 77 {
820 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
821     condition = md->recursive != NULL &&
822     (offset == RREF_ANY || offset == md->recursive->group_num);
823     ecode += condition? 3 : GET(ecode, 1);
824     }
825    
826     else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
827     {
828 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
829 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
830     ecode += condition? 3 : GET(ecode, 1);
831 nigel 77 }
832    
833 nigel 93 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
834     {
835     condition = FALSE;
836     ecode += GET(ecode, 1);
837     }
838    
839 nigel 77 /* The condition is an assertion. Call match() to evaluate it - passing
840 nigel 93 match_condassert as the flags argument causes it to stop at the end of the
841     assertion. */
842 nigel 77
843     else
844     {
845 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
846     match_condassert, RM3);
847 nigel 77 if (rrc == MATCH_MATCH)
848     {
849 nigel 93 condition = TRUE;
850     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
851 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
852     }
853 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
854 nigel 77 {
855     RRETURN(rrc); /* Need braces because of following else */
856     }
857 nigel 93 else
858     {
859     condition = FALSE;
860     ecode += GET(ecode, 1);
861     }
862     }
863 nigel 91
864 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
865 ph10 197 we can use tail recursion to avoid using another stack frame, except when
866     match_cbegroup is required for an unlimited repeat of a possibly empty
867     group. If the second alternative doesn't exist, we can just plough on. */
868 nigel 91
869 nigel 93 if (condition || *ecode == OP_ALT)
870     {
871 nigel 91 ecode += 1 + LINK_SIZE;
872 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
873     {
874     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
875     RRETURN(rrc);
876     }
877     else /* Group must match something */
878     {
879     flags = 0;
880     goto TAIL_RECURSE;
881     }
882 nigel 77 }
883 ph10 197 else /* Condition false & no 2nd alternative */
884 nigel 93 {
885     ecode += 1 + LINK_SIZE;
886     }
887     break;
888 nigel 77
889    
890 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
891     recursion, we should restore the offsets appropriately and continue from
892     after the call. */
893 nigel 77
894 ph10 210 case OP_ACCEPT:
895 nigel 77 case OP_END:
896     if (md->recursive != NULL && md->recursive->group_num == 0)
897     {
898     recursion_info *rec = md->recursive;
899 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
900 nigel 77 md->recursive = rec->prevrec;
901     memmove(md->offset_vector, rec->offset_save,
902     rec->saved_max * sizeof(int));
903 ph10 168 mstart = rec->save_start;
904 nigel 77 ims = original_ims;
905     ecode = rec->after_call;
906     break;
907     }
908    
909     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
910     string - backtracking will then try other alternatives, if any. */
911    
912 ph10 168 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
913     md->end_match_ptr = eptr; /* Record where we ended */
914     md->end_offset_top = offset_top; /* and how many extracts were taken */
915 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
916 nigel 77 RRETURN(MATCH_MATCH);
917    
918     /* Change option settings */
919    
920     case OP_OPT:
921     ims = ecode[1];
922     ecode += 2;
923     DPRINTF(("ims set to %02lx\n", ims));
924     break;
925    
926     /* Assertion brackets. Check the alternative branches in turn - the
927     matching won't pass the KET for an assertion. If any one branch matches,
928     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
929     start of each branch to move the current point backwards, so the code at
930     this level is identical to the lookahead case. */
931    
932     case OP_ASSERT:
933     case OP_ASSERTBACK:
934     do
935     {
936 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
937     RM4);
938 nigel 77 if (rrc == MATCH_MATCH) break;
939 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
940 nigel 77 ecode += GET(ecode, 1);
941     }
942     while (*ecode == OP_ALT);
943     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
944    
945     /* If checking an assertion for a condition, return MATCH_MATCH. */
946    
947     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
948    
949     /* Continue from after the assertion, updating the offsets high water
950     mark, since extracts may have been taken during the assertion. */
951    
952     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
953     ecode += 1 + LINK_SIZE;
954     offset_top = md->end_offset_top;
955     continue;
956    
957     /* Negative assertion: all branches must fail to match */
958    
959     case OP_ASSERT_NOT:
960     case OP_ASSERTBACK_NOT:
961     do
962     {
963 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
964     RM5);
965 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
966 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
967 nigel 77 ecode += GET(ecode,1);
968     }
969     while (*ecode == OP_ALT);
970    
971     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
972    
973     ecode += 1 + LINK_SIZE;
974     continue;
975    
976     /* Move the subject pointer back. This occurs only at the start of
977     each branch of a lookbehind assertion. If we are too close to the start to
978     move back, this match function fails. When working with UTF-8 we move
979     back a number of characters, not bytes. */
980    
981     case OP_REVERSE:
982     #ifdef SUPPORT_UTF8
983     if (utf8)
984     {
985 nigel 93 i = GET(ecode, 1);
986     while (i-- > 0)
987 nigel 77 {
988     eptr--;
989     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
990 ph10 207 BACKCHAR(eptr);
991 nigel 77 }
992     }
993     else
994     #endif
995    
996     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
997    
998     {
999 nigel 93 eptr -= GET(ecode, 1);
1000 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1001     }
1002    
1003     /* Skip to next op code */
1004    
1005     ecode += 1 + LINK_SIZE;
1006     break;
1007    
1008     /* The callout item calls an external function, if one is provided, passing
1009     details of the match so far. This is mainly for debugging, though the
1010     function is able to force a failure. */
1011    
1012     case OP_CALLOUT:
1013     if (pcre_callout != NULL)
1014     {
1015     pcre_callout_block cb;
1016     cb.version = 1; /* Version 1 of the callout block */
1017     cb.callout_number = ecode[1];
1018     cb.offset_vector = md->offset_vector;
1019 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1020 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
1021 ph10 168 cb.start_match = mstart - md->start_subject;
1022 nigel 77 cb.current_position = eptr - md->start_subject;
1023     cb.pattern_position = GET(ecode, 2);
1024     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1025     cb.capture_top = offset_top/2;
1026     cb.capture_last = md->capture_last;
1027     cb.callout_data = md->callout_data;
1028     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1029     if (rrc < 0) RRETURN(rrc);
1030     }
1031     ecode += 2 + 2*LINK_SIZE;
1032     break;
1033    
1034     /* Recursion either matches the current regex, or some subexpression. The
1035     offset data is the offset to the starting bracket from the start of the
1036     whole pattern. (This is so that it works from duplicated subpatterns.)
1037    
1038     If there are any capturing brackets started but not finished, we have to
1039     save their starting points and reinstate them after the recursion. However,
1040     we don't know how many such there are (offset_top records the completed
1041     total) so we just have to save all the potential data. There may be up to
1042     65535 such values, which is too large to put on the stack, but using malloc
1043     for small numbers seems expensive. As a compromise, the stack is used when
1044     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1045     is used. If the malloc fails, the saved values cannot be stored, so
1046     matching is abandoned at once and PCRE_ERROR_NOMEMORY is returned to the
1047     caller.
1048    
1049     There are also other values that have to be saved. We use a chained
1050     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1051     for the original version of this logic. */
1052    
1053     case OP_RECURSE:
1054     {
1055     callpat = md->start_code + GET(ecode, 1);
1056 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1057     GET2(callpat, 1 + LINK_SIZE);
1058 nigel 77
1059     /* Add to "recursing stack" */
1060    
1061     new_recursive.prevrec = md->recursive;
1062     md->recursive = &new_recursive;
1063    
1064     /* Find where to continue from afterwards */
1065    
1066     ecode += 1 + LINK_SIZE;
1067     new_recursive.after_call = ecode;
1068    
1069     /* Now save the offset data. */
1070    
1071     new_recursive.saved_max = md->offset_end;
1072     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1073     new_recursive.offset_save = stacksave;
1074     else
1075     {
1076     new_recursive.offset_save =
1077     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1078     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1079     }
1080    
1081     memcpy(new_recursive.offset_save, md->offset_vector,
1082     new_recursive.saved_max * sizeof(int));
1083 ph10 168 new_recursive.save_start = mstart;
1084     mstart = eptr;
1085 nigel 77
1086     /* OK, now we can do the recursion. For each top-level alternative we
1087     restore the offset and recursion data. */
1088    
1089     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1090 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1091 nigel 77 do
1092     {
1093 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1094     md, ims, eptrb, flags, RM6);
1095 nigel 77 if (rrc == MATCH_MATCH)
1096     {
1097 nigel 87 DPRINTF(("Recursion matched\n"));
1098 nigel 77 md->recursive = new_recursive.prevrec;
1099     if (new_recursive.offset_save != stacksave)
1100     (pcre_free)(new_recursive.offset_save);
1101     RRETURN(MATCH_MATCH);
1102     }
1103 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1104 nigel 87 {
1105     DPRINTF(("Recursion gave error %d\n", rrc));
1106     RRETURN(rrc);
1107     }
1108 nigel 77
1109     md->recursive = &new_recursive;
1110     memcpy(md->offset_vector, new_recursive.offset_save,
1111     new_recursive.saved_max * sizeof(int));
1112     callpat += GET(callpat, 1);
1113     }
1114     while (*callpat == OP_ALT);
1115    
1116     DPRINTF(("Recursion didn't match\n"));
1117     md->recursive = new_recursive.prevrec;
1118     if (new_recursive.offset_save != stacksave)
1119     (pcre_free)(new_recursive.offset_save);
1120     RRETURN(MATCH_NOMATCH);
1121     }
1122     /* Control never reaches here */
1123    
1124     /* "Once" brackets are like assertion brackets except that after a match,
1125     the point in the subject string is not moved back. Thus there can never be
1126     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1127     Check the alternative branches in turn - the matching won't pass the KET
1128     for this kind of subpattern. If any one branch matches, we carry on as at
1129     the end of a normal bracket, leaving the subject pointer. */
1130    
1131     case OP_ONCE:
1132 nigel 91 prev = ecode;
1133     saved_eptr = eptr;
1134    
1135     do
1136 nigel 77 {
1137 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1138 nigel 91 if (rrc == MATCH_MATCH) break;
1139 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1140 nigel 91 ecode += GET(ecode,1);
1141     }
1142     while (*ecode == OP_ALT);
1143 nigel 77
1144 nigel 91 /* If we have hit the end of the group (which could be repeated), fail */
1145 nigel 77
1146 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1147 nigel 77
1148 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1149     mark, since extracts may have been taken. */
1150 nigel 77
1151 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1152 nigel 77
1153 nigel 91 offset_top = md->end_offset_top;
1154     eptr = md->end_match_ptr;
1155 nigel 77
1156 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1157     happens for a repeating ket if no characters were matched in the group.
1158     This is the forcible breaking of infinite loops as implemented in Perl
1159     5.005. If there is an options reset, it will get obeyed in the normal
1160     course of events. */
1161 nigel 77
1162 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1163     {
1164     ecode += 1+LINK_SIZE;
1165     break;
1166     }
1167 nigel 77
1168 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1169     preceding bracket, in the appropriate order. The second "call" of match()
1170     uses tail recursion, to avoid using another stack frame. We need to reset
1171     any options that changed within the bracket before re-running it, so
1172     check the next opcode. */
1173 nigel 77
1174 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1175     {
1176     ims = (ims & ~PCRE_IMS) | ecode[4];
1177     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1178     }
1179 nigel 77
1180 nigel 91 if (*ecode == OP_KETRMIN)
1181     {
1182 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1183 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1184     ecode = prev;
1185 ph10 197 flags = 0;
1186 nigel 91 goto TAIL_RECURSE;
1187 nigel 77 }
1188 nigel 91 else /* OP_KETRMAX */
1189     {
1190 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1191 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1192     ecode += 1 + LINK_SIZE;
1193 ph10 197 flags = 0;
1194 nigel 91 goto TAIL_RECURSE;
1195     }
1196     /* Control never gets here */
1197 nigel 77
1198     /* An alternation is the end of a branch; scan along to find the end of the
1199     bracketed group and go to there. */
1200    
1201     case OP_ALT:
1202     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1203     break;
1204    
1205 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1206     indicating that it may occur zero times. It may repeat infinitely, or not
1207     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1208     with fixed upper repeat limits are compiled as a number of copies, with the
1209     optional ones preceded by BRAZERO or BRAMINZERO. */
1210 nigel 77
1211     case OP_BRAZERO:
1212     {
1213     next = ecode+1;
1214 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1215 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1216     do next += GET(next,1); while (*next == OP_ALT);
1217 nigel 93 ecode = next + 1 + LINK_SIZE;
1218 nigel 77 }
1219     break;
1220    
1221     case OP_BRAMINZERO:
1222     {
1223     next = ecode+1;
1224 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1225 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1226 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1227     ecode++;
1228     }
1229     break;
1230    
1231 ph10 335 case OP_SKIPZERO:
1232     {
1233     next = ecode+1;
1234     do next += GET(next,1); while (*next == OP_ALT);
1235     ecode = next + 1 + LINK_SIZE;
1236     }
1237     break;
1238    
1239 nigel 93 /* End of a group, repeated or non-repeating. */
1240 nigel 77
1241     case OP_KET:
1242     case OP_KETRMIN:
1243     case OP_KETRMAX:
1244 nigel 91 prev = ecode - GET(ecode, 1);
1245 nigel 77
1246 nigel 93 /* If this was a group that remembered the subject start, in order to break
1247     infinite repeats of empty string matches, retrieve the subject start from
1248     the chain. Otherwise, set it NULL. */
1249 nigel 77
1250 nigel 93 if (*prev >= OP_SBRA)
1251     {
1252     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1253     eptrb = eptrb->epb_prev; /* Backup to previous group */
1254     }
1255     else saved_eptr = NULL;
1256 nigel 77
1257 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1258     MATCH_MATCH, but record the current high water mark for use by positive
1259     assertions. Do this also for the "once" (atomic) groups. */
1260    
1261 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1262     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1263     *prev == OP_ONCE)
1264     {
1265     md->end_match_ptr = eptr; /* For ONCE */
1266     md->end_offset_top = offset_top;
1267     RRETURN(MATCH_MATCH);
1268     }
1269 nigel 77
1270 nigel 93 /* For capturing groups we have to check the group number back at the start
1271     and if necessary complete handling an extraction by setting the offsets and
1272     bumping the high water mark. Note that whole-pattern recursion is coded as
1273     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1274     when the OP_END is reached. Other recursion is handled here. */
1275 nigel 77
1276 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1277 nigel 91 {
1278 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1279 nigel 91 offset = number << 1;
1280 nigel 77
1281     #ifdef DEBUG
1282 nigel 91 printf("end bracket %d", number);
1283     printf("\n");
1284 nigel 77 #endif
1285    
1286 nigel 93 md->capture_last = number;
1287     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1288 nigel 91 {
1289 nigel 93 md->offset_vector[offset] =
1290     md->offset_vector[md->offset_end - number];
1291     md->offset_vector[offset+1] = eptr - md->start_subject;
1292     if (offset_top <= offset) offset_top = offset + 2;
1293     }
1294 nigel 77
1295 nigel 93 /* Handle a recursively called group. Restore the offsets
1296     appropriately and continue from after the call. */
1297 nigel 77
1298 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1299     {
1300     recursion_info *rec = md->recursive;
1301     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1302     md->recursive = rec->prevrec;
1303 ph10 168 mstart = rec->save_start;
1304 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1305     rec->saved_max * sizeof(int));
1306     ecode = rec->after_call;
1307     ims = original_ims;
1308     break;
1309 nigel 77 }
1310 nigel 91 }
1311 nigel 77
1312 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1313     flags, in case they got changed during the group. */
1314 nigel 77
1315 nigel 91 ims = original_ims;
1316     DPRINTF(("ims reset to %02lx\n", ims));
1317 nigel 77
1318 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1319     happens for a repeating ket if no characters were matched in the group.
1320     This is the forcible breaking of infinite loops as implemented in Perl
1321     5.005. If there is an options reset, it will get obeyed in the normal
1322     course of events. */
1323 nigel 77
1324 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1325     {
1326     ecode += 1 + LINK_SIZE;
1327     break;
1328     }
1329 nigel 77
1330 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1331     preceding bracket, in the appropriate order. In the second case, we can use
1332 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1333     unlimited repeat of a group that can match an empty string. */
1334 nigel 77
1335 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1336    
1337 nigel 91 if (*ecode == OP_KETRMIN)
1338     {
1339 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1340 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1341 ph10 197 if (flags != 0) /* Could match an empty string */
1342     {
1343     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1344     RRETURN(rrc);
1345     }
1346 nigel 91 ecode = prev;
1347     goto TAIL_RECURSE;
1348 nigel 77 }
1349 nigel 91 else /* OP_KETRMAX */
1350     {
1351 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1352 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1353     ecode += 1 + LINK_SIZE;
1354 ph10 197 flags = 0;
1355 nigel 91 goto TAIL_RECURSE;
1356     }
1357     /* Control never gets here */
1358 nigel 77
1359     /* Start of subject unless notbol, or after internal newline if multiline */
1360    
1361     case OP_CIRC:
1362     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1363     if ((ims & PCRE_MULTILINE) != 0)
1364     {
1365 nigel 91 if (eptr != md->start_subject &&
1366 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1367 nigel 77 RRETURN(MATCH_NOMATCH);
1368     ecode++;
1369     break;
1370     }
1371     /* ... else fall through */
1372    
1373     /* Start of subject assertion */
1374    
1375     case OP_SOD:
1376     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1377     ecode++;
1378     break;
1379    
1380     /* Start of match assertion */
1381    
1382     case OP_SOM:
1383     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1384     ecode++;
1385     break;
1386 ph10 172
1387 ph10 168 /* Reset the start of match point */
1388 ph10 172
1389 ph10 168 case OP_SET_SOM:
1390     mstart = eptr;
1391 ph10 172 ecode++;
1392     break;
1393 nigel 77
1394     /* Assert before internal newline if multiline, or before a terminating
1395     newline unless endonly is set, else end of subject unless noteol is set. */
1396    
1397     case OP_DOLL:
1398     if ((ims & PCRE_MULTILINE) != 0)
1399     {
1400     if (eptr < md->end_subject)
1401 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1402 nigel 77 else
1403     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1404     ecode++;
1405     break;
1406     }
1407     else
1408     {
1409     if (md->noteol) RRETURN(MATCH_NOMATCH);
1410     if (!md->endonly)
1411     {
1412 nigel 91 if (eptr != md->end_subject &&
1413 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1414 nigel 77 RRETURN(MATCH_NOMATCH);
1415     ecode++;
1416     break;
1417     }
1418     }
1419 nigel 91 /* ... else fall through for endonly */
1420 nigel 77
1421     /* End of subject assertion (\z) */
1422    
1423     case OP_EOD:
1424     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1425     ecode++;
1426     break;
1427    
1428     /* End of subject or ending \n assertion (\Z) */
1429    
1430     case OP_EODN:
1431 nigel 91 if (eptr != md->end_subject &&
1432 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1433 nigel 91 RRETURN(MATCH_NOMATCH);
1434 nigel 77 ecode++;
1435     break;
1436    
1437     /* Word boundary assertions */
1438    
1439     case OP_NOT_WORD_BOUNDARY:
1440     case OP_WORD_BOUNDARY:
1441     {
1442    
1443     /* Find out if the previous and current characters are "word" characters.
1444     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1445     be "non-word" characters. */
1446    
1447     #ifdef SUPPORT_UTF8
1448     if (utf8)
1449     {
1450     if (eptr == md->start_subject) prev_is_word = FALSE; else
1451     {
1452     const uschar *lastptr = eptr - 1;
1453     while((*lastptr & 0xc0) == 0x80) lastptr--;
1454     GETCHAR(c, lastptr);
1455     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1456     }
1457     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1458     {
1459     GETCHAR(c, eptr);
1460     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1461     }
1462     }
1463     else
1464     #endif
1465    
1466     /* More streamlined when not in UTF-8 mode */
1467    
1468     {
1469     prev_is_word = (eptr != md->start_subject) &&
1470     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1471     cur_is_word = (eptr < md->end_subject) &&
1472     ((md->ctypes[*eptr] & ctype_word) != 0);
1473     }
1474    
1475     /* Now see if the situation is what we want */
1476    
1477     if ((*ecode++ == OP_WORD_BOUNDARY)?
1478     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1479     RRETURN(MATCH_NOMATCH);
1480     }
1481     break;
1482    
1483     /* Match a single character type; inline for speed */
1484    
1485     case OP_ANY:
1486 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1487 ph10 345 /* Fall through */
1488    
1489 ph10 341 case OP_ALLANY:
1490 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1491 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1492 nigel 77 ecode++;
1493     break;
1494    
1495     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1496     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1497    
1498     case OP_ANYBYTE:
1499     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1500     ecode++;
1501     break;
1502    
1503     case OP_NOT_DIGIT:
1504     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1505     GETCHARINCTEST(c, eptr);
1506     if (
1507     #ifdef SUPPORT_UTF8
1508     c < 256 &&
1509     #endif
1510     (md->ctypes[c] & ctype_digit) != 0
1511     )
1512     RRETURN(MATCH_NOMATCH);
1513     ecode++;
1514     break;
1515    
1516     case OP_DIGIT:
1517     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1518     GETCHARINCTEST(c, eptr);
1519     if (
1520     #ifdef SUPPORT_UTF8
1521     c >= 256 ||
1522     #endif
1523     (md->ctypes[c] & ctype_digit) == 0
1524     )
1525     RRETURN(MATCH_NOMATCH);
1526     ecode++;
1527     break;
1528    
1529     case OP_NOT_WHITESPACE:
1530     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1531     GETCHARINCTEST(c, eptr);
1532     if (
1533     #ifdef SUPPORT_UTF8
1534     c < 256 &&
1535     #endif
1536     (md->ctypes[c] & ctype_space) != 0
1537     )
1538     RRETURN(MATCH_NOMATCH);
1539     ecode++;
1540     break;
1541    
1542     case OP_WHITESPACE:
1543     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1544     GETCHARINCTEST(c, eptr);
1545     if (
1546     #ifdef SUPPORT_UTF8
1547     c >= 256 ||
1548     #endif
1549     (md->ctypes[c] & ctype_space) == 0
1550     )
1551     RRETURN(MATCH_NOMATCH);
1552     ecode++;
1553     break;
1554    
1555     case OP_NOT_WORDCHAR:
1556     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1557     GETCHARINCTEST(c, eptr);
1558     if (
1559     #ifdef SUPPORT_UTF8
1560     c < 256 &&
1561     #endif
1562     (md->ctypes[c] & ctype_word) != 0
1563     )
1564     RRETURN(MATCH_NOMATCH);
1565     ecode++;
1566     break;
1567    
1568     case OP_WORDCHAR:
1569     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1570     GETCHARINCTEST(c, eptr);
1571     if (
1572     #ifdef SUPPORT_UTF8
1573     c >= 256 ||
1574     #endif
1575     (md->ctypes[c] & ctype_word) == 0
1576     )
1577     RRETURN(MATCH_NOMATCH);
1578     ecode++;
1579     break;
1580    
1581 nigel 93 case OP_ANYNL:
1582     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1583     GETCHARINCTEST(c, eptr);
1584     switch(c)
1585     {
1586     default: RRETURN(MATCH_NOMATCH);
1587     case 0x000d:
1588     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1589     break;
1590 ph10 231
1591 nigel 93 case 0x000a:
1592 ph10 231 break;
1593    
1594 nigel 93 case 0x000b:
1595     case 0x000c:
1596     case 0x0085:
1597     case 0x2028:
1598     case 0x2029:
1599 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1600 nigel 93 break;
1601     }
1602     ecode++;
1603     break;
1604    
1605 ph10 178 case OP_NOT_HSPACE:
1606     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1607     GETCHARINCTEST(c, eptr);
1608     switch(c)
1609     {
1610     default: break;
1611     case 0x09: /* HT */
1612     case 0x20: /* SPACE */
1613     case 0xa0: /* NBSP */
1614     case 0x1680: /* OGHAM SPACE MARK */
1615     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1616     case 0x2000: /* EN QUAD */
1617     case 0x2001: /* EM QUAD */
1618     case 0x2002: /* EN SPACE */
1619     case 0x2003: /* EM SPACE */
1620     case 0x2004: /* THREE-PER-EM SPACE */
1621     case 0x2005: /* FOUR-PER-EM SPACE */
1622     case 0x2006: /* SIX-PER-EM SPACE */
1623     case 0x2007: /* FIGURE SPACE */
1624     case 0x2008: /* PUNCTUATION SPACE */
1625     case 0x2009: /* THIN SPACE */
1626     case 0x200A: /* HAIR SPACE */
1627     case 0x202f: /* NARROW NO-BREAK SPACE */
1628     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1629     case 0x3000: /* IDEOGRAPHIC SPACE */
1630     RRETURN(MATCH_NOMATCH);
1631     }
1632     ecode++;
1633     break;
1634    
1635     case OP_HSPACE:
1636     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1637     GETCHARINCTEST(c, eptr);
1638     switch(c)
1639     {
1640     default: RRETURN(MATCH_NOMATCH);
1641     case 0x09: /* HT */
1642     case 0x20: /* SPACE */
1643     case 0xa0: /* NBSP */
1644     case 0x1680: /* OGHAM SPACE MARK */
1645     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1646     case 0x2000: /* EN QUAD */
1647     case 0x2001: /* EM QUAD */
1648     case 0x2002: /* EN SPACE */
1649     case 0x2003: /* EM SPACE */
1650     case 0x2004: /* THREE-PER-EM SPACE */
1651     case 0x2005: /* FOUR-PER-EM SPACE */
1652     case 0x2006: /* SIX-PER-EM SPACE */
1653     case 0x2007: /* FIGURE SPACE */
1654     case 0x2008: /* PUNCTUATION SPACE */
1655     case 0x2009: /* THIN SPACE */
1656     case 0x200A: /* HAIR SPACE */
1657     case 0x202f: /* NARROW NO-BREAK SPACE */
1658     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1659     case 0x3000: /* IDEOGRAPHIC SPACE */
1660     break;
1661     }
1662     ecode++;
1663     break;
1664    
1665     case OP_NOT_VSPACE:
1666     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1667     GETCHARINCTEST(c, eptr);
1668     switch(c)
1669     {
1670     default: break;
1671     case 0x0a: /* LF */
1672     case 0x0b: /* VT */
1673     case 0x0c: /* FF */
1674     case 0x0d: /* CR */
1675     case 0x85: /* NEL */
1676     case 0x2028: /* LINE SEPARATOR */
1677     case 0x2029: /* PARAGRAPH SEPARATOR */
1678     RRETURN(MATCH_NOMATCH);
1679     }
1680     ecode++;
1681     break;
1682    
1683     case OP_VSPACE:
1684     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1685     GETCHARINCTEST(c, eptr);
1686     switch(c)
1687     {
1688     default: RRETURN(MATCH_NOMATCH);
1689     case 0x0a: /* LF */
1690     case 0x0b: /* VT */
1691     case 0x0c: /* FF */
1692     case 0x0d: /* CR */
1693     case 0x85: /* NEL */
1694     case 0x2028: /* LINE SEPARATOR */
1695     case 0x2029: /* PARAGRAPH SEPARATOR */
1696     break;
1697     }
1698     ecode++;
1699     break;
1700    
1701 nigel 77 #ifdef SUPPORT_UCP
1702     /* Check the next character by Unicode property. We will get here only
1703     if the support is in the binary; otherwise a compile-time error occurs. */
1704    
1705     case OP_PROP:
1706     case OP_NOTPROP:
1707     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1708     GETCHARINCTEST(c, eptr);
1709     {
1710 ph10 384 const ucd_record *prop = GET_UCD(c);
1711 nigel 77
1712 nigel 87 switch(ecode[1])
1713     {
1714     case PT_ANY:
1715     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1716     break;
1717 nigel 77
1718 nigel 87 case PT_LAMP:
1719 ph10 349 if ((prop->chartype == ucp_Lu ||
1720     prop->chartype == ucp_Ll ||
1721     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1722 nigel 77 RRETURN(MATCH_NOMATCH);
1723 nigel 87 break;
1724    
1725     case PT_GC:
1726 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1727 nigel 77 RRETURN(MATCH_NOMATCH);
1728 nigel 87 break;
1729    
1730     case PT_PC:
1731 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1732 nigel 87 RRETURN(MATCH_NOMATCH);
1733     break;
1734    
1735     case PT_SC:
1736 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
1737 nigel 87 RRETURN(MATCH_NOMATCH);
1738     break;
1739    
1740     default:
1741     RRETURN(PCRE_ERROR_INTERNAL);
1742 nigel 77 }
1743 nigel 87
1744     ecode += 3;
1745 nigel 77 }
1746     break;
1747    
1748     /* Match an extended Unicode sequence. We will get here only if the support
1749     is in the binary; otherwise a compile-time error occurs. */
1750    
1751     case OP_EXTUNI:
1752     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1753     GETCHARINCTEST(c, eptr);
1754     {
1755 ph10 349 int category = UCD_CATEGORY(c);
1756 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1757     while (eptr < md->end_subject)
1758     {
1759     int len = 1;
1760     if (!utf8) c = *eptr; else
1761     {
1762     GETCHARLEN(c, eptr, len);
1763     }
1764 ph10 349 category = UCD_CATEGORY(c);
1765 nigel 77 if (category != ucp_M) break;
1766     eptr += len;
1767     }
1768     }
1769     ecode++;
1770     break;
1771     #endif
1772    
1773    
1774     /* Match a back reference, possibly repeatedly. Look past the end of the
1775     item to see if there is repeat information following. The code is similar
1776     to that for character classes, but repeated for efficiency. Then obey
1777     similar code to character type repeats - written out again for speed.
1778     However, if the referenced string is the empty string, always treat
1779     it as matched, any number of times (otherwise there could be infinite
1780     loops). */
1781    
1782     case OP_REF:
1783     {
1784     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1785 ph10 345 ecode += 3;
1786    
1787 ph10 336 /* If the reference is unset, there are two possibilities:
1788 ph10 345
1789 ph10 336 (a) In the default, Perl-compatible state, set the length to be longer
1790     than the amount of subject left; this ensures that every attempt at a
1791     match fails. We can't just fail here, because of the possibility of
1792     quantifiers with zero minima.
1793 ph10 345
1794     (b) If the JavaScript compatibility flag is set, set the length to zero
1795     so that the back reference matches an empty string.
1796    
1797     Otherwise, set the length to the length of what was matched by the
1798 ph10 336 referenced subpattern. */
1799 ph10 345
1800 ph10 336 if (offset >= offset_top || md->offset_vector[offset] < 0)
1801 ph10 345 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1802 ph10 336 else
1803     length = md->offset_vector[offset+1] - md->offset_vector[offset];
1804 nigel 77
1805     /* Set up for repetition, or handle the non-repeated case */
1806    
1807     switch (*ecode)
1808     {
1809     case OP_CRSTAR:
1810     case OP_CRMINSTAR:
1811     case OP_CRPLUS:
1812     case OP_CRMINPLUS:
1813     case OP_CRQUERY:
1814     case OP_CRMINQUERY:
1815     c = *ecode++ - OP_CRSTAR;
1816     minimize = (c & 1) != 0;
1817     min = rep_min[c]; /* Pick up values from tables; */
1818     max = rep_max[c]; /* zero for max => infinity */
1819     if (max == 0) max = INT_MAX;
1820     break;
1821    
1822     case OP_CRRANGE:
1823     case OP_CRMINRANGE:
1824     minimize = (*ecode == OP_CRMINRANGE);
1825     min = GET2(ecode, 1);
1826     max = GET2(ecode, 3);
1827     if (max == 0) max = INT_MAX;
1828     ecode += 5;
1829     break;
1830    
1831     default: /* No repeat follows */
1832     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1833     eptr += length;
1834     continue; /* With the main loop */
1835     }
1836    
1837     /* If the length of the reference is zero, just continue with the
1838     main loop. */
1839    
1840     if (length == 0) continue;
1841    
1842     /* First, ensure the minimum number of matches are present. We get back
1843     the length of the reference string explicitly rather than passing the
1844     address of eptr, so that eptr can be a register variable. */
1845    
1846     for (i = 1; i <= min; i++)
1847     {
1848     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1849     eptr += length;
1850     }
1851    
1852     /* If min = max, continue at the same level without recursion.
1853     They are not both allowed to be zero. */
1854    
1855     if (min == max) continue;
1856    
1857     /* If minimizing, keep trying and advancing the pointer */
1858    
1859     if (minimize)
1860     {
1861     for (fi = min;; fi++)
1862     {
1863 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1864 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1865     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1866     RRETURN(MATCH_NOMATCH);
1867     eptr += length;
1868     }
1869     /* Control never gets here */
1870     }
1871    
1872     /* If maximizing, find the longest string and work backwards */
1873    
1874     else
1875     {
1876     pp = eptr;
1877     for (i = min; i < max; i++)
1878     {
1879     if (!match_ref(offset, eptr, length, md, ims)) break;
1880     eptr += length;
1881     }
1882     while (eptr >= pp)
1883     {
1884 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1885 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1886     eptr -= length;
1887     }
1888     RRETURN(MATCH_NOMATCH);
1889     }
1890     }
1891     /* Control never gets here */
1892    
1893    
1894    
1895     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1896     used when all the characters in the class have values in the range 0-255,
1897     and either the matching is caseful, or the characters are in the range
1898     0-127 when UTF-8 processing is enabled. The only difference between
1899     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1900     encountered.
1901    
1902     First, look past the end of the item to see if there is repeat information
1903     following. Then obey similar code to character type repeats - written out
1904     again for speed. */
1905    
1906     case OP_NCLASS:
1907     case OP_CLASS:
1908     {
1909     data = ecode + 1; /* Save for matching */
1910     ecode += 33; /* Advance past the item */
1911    
1912     switch (*ecode)
1913     {
1914     case OP_CRSTAR:
1915     case OP_CRMINSTAR:
1916     case OP_CRPLUS:
1917     case OP_CRMINPLUS:
1918     case OP_CRQUERY:
1919     case OP_CRMINQUERY:
1920     c = *ecode++ - OP_CRSTAR;
1921     minimize = (c & 1) != 0;
1922     min = rep_min[c]; /* Pick up values from tables; */
1923     max = rep_max[c]; /* zero for max => infinity */
1924     if (max == 0) max = INT_MAX;
1925     break;
1926    
1927     case OP_CRRANGE:
1928     case OP_CRMINRANGE:
1929     minimize = (*ecode == OP_CRMINRANGE);
1930     min = GET2(ecode, 1);
1931     max = GET2(ecode, 3);
1932     if (max == 0) max = INT_MAX;
1933     ecode += 5;
1934     break;
1935    
1936     default: /* No repeat follows */
1937     min = max = 1;
1938     break;
1939     }
1940    
1941     /* First, ensure the minimum number of matches are present. */
1942    
1943     #ifdef SUPPORT_UTF8
1944     /* UTF-8 mode */
1945     if (utf8)
1946     {
1947     for (i = 1; i <= min; i++)
1948     {
1949     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1950     GETCHARINC(c, eptr);
1951     if (c > 255)
1952     {
1953     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1954     }
1955     else
1956     {
1957     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1958     }
1959     }
1960     }
1961     else
1962     #endif
1963     /* Not UTF-8 mode */
1964     {
1965     for (i = 1; i <= min; i++)
1966     {
1967     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1968     c = *eptr++;
1969     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1970     }
1971     }
1972    
1973     /* If max == min we can continue with the main loop without the
1974     need to recurse. */
1975    
1976     if (min == max) continue;
1977    
1978     /* If minimizing, keep testing the rest of the expression and advancing
1979     the pointer while it matches the class. */
1980    
1981     if (minimize)
1982     {
1983     #ifdef SUPPORT_UTF8
1984     /* UTF-8 mode */
1985     if (utf8)
1986     {
1987     for (fi = min;; fi++)
1988     {
1989 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1990 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1991     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1992     GETCHARINC(c, eptr);
1993     if (c > 255)
1994     {
1995     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1996     }
1997     else
1998     {
1999     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2000     }
2001     }
2002     }
2003     else
2004     #endif
2005     /* Not UTF-8 mode */
2006     {
2007     for (fi = min;; fi++)
2008     {
2009 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2010 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2011     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2012     c = *eptr++;
2013     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2014     }
2015     }
2016     /* Control never gets here */
2017     }
2018    
2019     /* If maximizing, find the longest possible run, then work backwards. */
2020    
2021     else
2022     {
2023     pp = eptr;
2024    
2025     #ifdef SUPPORT_UTF8
2026     /* UTF-8 mode */
2027     if (utf8)
2028     {
2029     for (i = min; i < max; i++)
2030     {
2031     int len = 1;
2032     if (eptr >= md->end_subject) break;
2033     GETCHARLEN(c, eptr, len);
2034     if (c > 255)
2035     {
2036     if (op == OP_CLASS) break;
2037     }
2038     else
2039     {
2040     if ((data[c/8] & (1 << (c&7))) == 0) break;
2041     }
2042     eptr += len;
2043     }
2044     for (;;)
2045     {
2046 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2047 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2048     if (eptr-- == pp) break; /* Stop if tried at original pos */
2049     BACKCHAR(eptr);
2050     }
2051     }
2052     else
2053     #endif
2054     /* Not UTF-8 mode */
2055     {
2056     for (i = min; i < max; i++)
2057     {
2058     if (eptr >= md->end_subject) break;
2059     c = *eptr;
2060     if ((data[c/8] & (1 << (c&7))) == 0) break;
2061     eptr++;
2062     }
2063     while (eptr >= pp)
2064     {
2065 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2066 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2067 nigel 77 eptr--;
2068     }
2069     }
2070    
2071     RRETURN(MATCH_NOMATCH);
2072     }
2073     }
2074     /* Control never gets here */
2075    
2076    
2077     /* Match an extended character class. This opcode is encountered only
2078 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2079     mode, because Unicode properties are supported in non-UTF-8 mode. */
2080 nigel 77
2081     #ifdef SUPPORT_UTF8
2082     case OP_XCLASS:
2083     {
2084     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2085     ecode += GET(ecode, 1); /* Advance past the item */
2086    
2087     switch (*ecode)
2088     {
2089     case OP_CRSTAR:
2090     case OP_CRMINSTAR:
2091     case OP_CRPLUS:
2092     case OP_CRMINPLUS:
2093     case OP_CRQUERY:
2094     case OP_CRMINQUERY:
2095     c = *ecode++ - OP_CRSTAR;
2096     minimize = (c & 1) != 0;
2097     min = rep_min[c]; /* Pick up values from tables; */
2098     max = rep_max[c]; /* zero for max => infinity */
2099     if (max == 0) max = INT_MAX;
2100     break;
2101    
2102     case OP_CRRANGE:
2103     case OP_CRMINRANGE:
2104     minimize = (*ecode == OP_CRMINRANGE);
2105     min = GET2(ecode, 1);
2106     max = GET2(ecode, 3);
2107     if (max == 0) max = INT_MAX;
2108     ecode += 5;
2109     break;
2110    
2111     default: /* No repeat follows */
2112     min = max = 1;
2113     break;
2114     }
2115    
2116     /* First, ensure the minimum number of matches are present. */
2117    
2118     for (i = 1; i <= min; i++)
2119     {
2120     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2121 ph10 384 GETCHARINCTEST(c, eptr);
2122 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2123     }
2124    
2125     /* If max == min we can continue with the main loop without the
2126     need to recurse. */
2127    
2128     if (min == max) continue;
2129    
2130     /* If minimizing, keep testing the rest of the expression and advancing
2131     the pointer while it matches the class. */
2132    
2133     if (minimize)
2134     {
2135     for (fi = min;; fi++)
2136     {
2137 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2138 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2139     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2140 ph10 384 GETCHARINCTEST(c, eptr);
2141 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2142     }
2143     /* Control never gets here */
2144     }
2145    
2146     /* If maximizing, find the longest possible run, then work backwards. */
2147    
2148     else
2149     {
2150     pp = eptr;
2151     for (i = min; i < max; i++)
2152     {
2153     int len = 1;
2154     if (eptr >= md->end_subject) break;
2155 ph10 384 GETCHARLENTEST(c, eptr, len);
2156 nigel 77 if (!_pcre_xclass(c, data)) break;
2157     eptr += len;
2158     }
2159     for(;;)
2160     {
2161 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2162 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2163     if (eptr-- == pp) break; /* Stop if tried at original pos */
2164 ph10 214 if (utf8) BACKCHAR(eptr);
2165 nigel 77 }
2166     RRETURN(MATCH_NOMATCH);
2167     }
2168    
2169     /* Control never gets here */
2170     }
2171     #endif /* End of XCLASS */
2172    
2173     /* Match a single character, casefully */
2174    
2175     case OP_CHAR:
2176     #ifdef SUPPORT_UTF8
2177     if (utf8)
2178     {
2179     length = 1;
2180     ecode++;
2181     GETCHARLEN(fc, ecode, length);
2182     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2183     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2184     }
2185     else
2186     #endif
2187    
2188     /* Non-UTF-8 mode */
2189     {
2190     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2191     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2192     ecode += 2;
2193     }
2194     break;
2195    
2196     /* Match a single character, caselessly */
2197    
2198     case OP_CHARNC:
2199     #ifdef SUPPORT_UTF8
2200     if (utf8)
2201     {
2202     length = 1;
2203     ecode++;
2204     GETCHARLEN(fc, ecode, length);
2205    
2206     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2207    
2208     /* If the pattern character's value is < 128, we have only one byte, and
2209     can use the fast lookup table. */
2210    
2211     if (fc < 128)
2212     {
2213     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2214     }
2215    
2216     /* Otherwise we must pick up the subject character */
2217    
2218     else
2219     {
2220 nigel 93 unsigned int dc;
2221 nigel 77 GETCHARINC(dc, eptr);
2222     ecode += length;
2223    
2224     /* If we have Unicode property support, we can use it to test the other
2225 nigel 87 case of the character, if there is one. */
2226 nigel 77
2227     if (fc != dc)
2228     {
2229     #ifdef SUPPORT_UCP
2230 ph10 349 if (dc != UCD_OTHERCASE(fc))
2231 nigel 77 #endif
2232     RRETURN(MATCH_NOMATCH);
2233     }
2234     }
2235     }
2236     else
2237     #endif /* SUPPORT_UTF8 */
2238    
2239     /* Non-UTF-8 mode */
2240     {
2241     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2242     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2243     ecode += 2;
2244     }
2245     break;
2246    
2247 nigel 93 /* Match a single character repeatedly. */
2248 nigel 77
2249     case OP_EXACT:
2250     min = max = GET2(ecode, 1);
2251     ecode += 3;
2252     goto REPEATCHAR;
2253    
2254 nigel 93 case OP_POSUPTO:
2255     possessive = TRUE;
2256     /* Fall through */
2257    
2258 nigel 77 case OP_UPTO:
2259     case OP_MINUPTO:
2260     min = 0;
2261     max = GET2(ecode, 1);
2262     minimize = *ecode == OP_MINUPTO;
2263     ecode += 3;
2264     goto REPEATCHAR;
2265    
2266 nigel 93 case OP_POSSTAR:
2267     possessive = TRUE;
2268     min = 0;
2269     max = INT_MAX;
2270     ecode++;
2271     goto REPEATCHAR;
2272    
2273     case OP_POSPLUS:
2274     possessive = TRUE;
2275     min = 1;
2276     max = INT_MAX;
2277     ecode++;
2278     goto REPEATCHAR;
2279    
2280     case OP_POSQUERY:
2281     possessive = TRUE;
2282     min = 0;
2283     max = 1;
2284     ecode++;
2285     goto REPEATCHAR;
2286    
2287 nigel 77 case OP_STAR:
2288     case OP_MINSTAR:
2289     case OP_PLUS:
2290     case OP_MINPLUS:
2291     case OP_QUERY:
2292     case OP_MINQUERY:
2293     c = *ecode++ - OP_STAR;
2294     minimize = (c & 1) != 0;
2295     min = rep_min[c]; /* Pick up values from tables; */
2296     max = rep_max[c]; /* zero for max => infinity */
2297     if (max == 0) max = INT_MAX;
2298    
2299     /* Common code for all repeated single-character matches. We can give
2300     up quickly if there are fewer than the minimum number of characters left in
2301     the subject. */
2302    
2303     REPEATCHAR:
2304     #ifdef SUPPORT_UTF8
2305     if (utf8)
2306     {
2307     length = 1;
2308     charptr = ecode;
2309     GETCHARLEN(fc, ecode, length);
2310     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2311     ecode += length;
2312    
2313     /* Handle multibyte character matching specially here. There is
2314     support for caseless matching if UCP support is present. */
2315    
2316     if (length > 1)
2317     {
2318     #ifdef SUPPORT_UCP
2319 nigel 93 unsigned int othercase;
2320 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2321 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
2322 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2323 ph10 115 else oclength = 0;
2324 nigel 77 #endif /* SUPPORT_UCP */
2325    
2326     for (i = 1; i <= min; i++)
2327     {
2328     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2329 ph10 123 #ifdef SUPPORT_UCP
2330 nigel 77 /* Need braces because of following else */
2331     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2332     else
2333     {
2334     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2335     eptr += oclength;
2336     }
2337 ph10 115 #else /* without SUPPORT_UCP */
2338     else { RRETURN(MATCH_NOMATCH); }
2339 ph10 123 #endif /* SUPPORT_UCP */
2340 nigel 77 }
2341    
2342     if (min == max) continue;
2343    
2344     if (minimize)
2345     {
2346     for (fi = min;; fi++)
2347     {
2348 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2349 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2350     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2351     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2352 ph10 123 #ifdef SUPPORT_UCP
2353 nigel 77 /* Need braces because of following else */
2354     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2355     else
2356     {
2357     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2358     eptr += oclength;
2359     }
2360 ph10 115 #else /* without SUPPORT_UCP */
2361     else { RRETURN (MATCH_NOMATCH); }
2362     #endif /* SUPPORT_UCP */
2363 nigel 77 }
2364     /* Control never gets here */
2365     }
2366 nigel 93
2367     else /* Maximize */
2368 nigel 77 {
2369     pp = eptr;
2370     for (i = min; i < max; i++)
2371     {
2372     if (eptr > md->end_subject - length) break;
2373     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2374 ph10 123 #ifdef SUPPORT_UCP
2375 nigel 77 else if (oclength == 0) break;
2376     else
2377     {
2378     if (memcmp(eptr, occhars, oclength) != 0) break;
2379     eptr += oclength;
2380     }
2381 ph10 115 #else /* without SUPPORT_UCP */
2382     else break;
2383 ph10 123 #endif /* SUPPORT_UCP */
2384 nigel 77 }
2385 nigel 93
2386     if (possessive) continue;
2387 ph10 120 for(;;)
2388 nigel 77 {
2389 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2390 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2391 ph10 120 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2392 ph10 115 #ifdef SUPPORT_UCP
2393     eptr--;
2394     BACKCHAR(eptr);
2395 ph10 123 #else /* without SUPPORT_UCP */
2396 nigel 77 eptr -= length;
2397 ph10 123 #endif /* SUPPORT_UCP */
2398 nigel 77 }
2399     }
2400     /* Control never gets here */
2401     }
2402    
2403     /* If the length of a UTF-8 character is 1, we fall through here, and
2404     obey the code as for non-UTF-8 characters below, though in this case the
2405     value of fc will always be < 128. */
2406     }
2407     else
2408     #endif /* SUPPORT_UTF8 */
2409    
2410     /* When not in UTF-8 mode, load a single-byte character. */
2411     {
2412     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2413     fc = *ecode++;
2414     }
2415    
2416     /* The value of fc at this point is always less than 256, though we may or
2417     may not be in UTF-8 mode. The code is duplicated for the caseless and
2418     caseful cases, for speed, since matching characters is likely to be quite
2419     common. First, ensure the minimum number of matches are present. If min =
2420     max, continue at the same level without recursing. Otherwise, if
2421     minimizing, keep trying the rest of the expression and advancing one
2422     matching character if failing, up to the maximum. Alternatively, if
2423     maximizing, find the maximum number of characters and work backwards. */
2424    
2425     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2426     max, eptr));
2427    
2428     if ((ims & PCRE_CASELESS) != 0)
2429     {
2430     fc = md->lcc[fc];
2431     for (i = 1; i <= min; i++)
2432     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2433     if (min == max) continue;
2434     if (minimize)
2435     {
2436     for (fi = min;; fi++)
2437     {
2438 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2439 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2440     if (fi >= max || eptr >= md->end_subject ||
2441     fc != md->lcc[*eptr++])
2442     RRETURN(MATCH_NOMATCH);
2443     }
2444     /* Control never gets here */
2445     }
2446 nigel 93 else /* Maximize */
2447 nigel 77 {
2448     pp = eptr;
2449     for (i = min; i < max; i++)
2450     {
2451     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2452     eptr++;
2453     }
2454 nigel 93 if (possessive) continue;
2455 nigel 77 while (eptr >= pp)
2456     {
2457 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2458 nigel 77 eptr--;
2459     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2460     }
2461     RRETURN(MATCH_NOMATCH);
2462     }
2463     /* Control never gets here */
2464     }
2465    
2466     /* Caseful comparisons (includes all multi-byte characters) */
2467    
2468     else
2469     {
2470     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2471     if (min == max) continue;
2472     if (minimize)
2473     {
2474     for (fi = min;; fi++)
2475     {
2476 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2477 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2478     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2479     RRETURN(MATCH_NOMATCH);
2480     }
2481     /* Control never gets here */
2482     }
2483 nigel 93 else /* Maximize */
2484 nigel 77 {
2485     pp = eptr;
2486     for (i = min; i < max; i++)
2487     {
2488     if (eptr >= md->end_subject || fc != *eptr) break;
2489     eptr++;
2490     }
2491 nigel 93 if (possessive) continue;
2492 nigel 77 while (eptr >= pp)
2493     {
2494 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2495 nigel 77 eptr--;
2496     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2497     }
2498     RRETURN(MATCH_NOMATCH);
2499     }
2500     }
2501     /* Control never gets here */
2502    
2503     /* Match a negated single one-byte character. The character we are
2504     checking can be multibyte. */
2505    
2506     case OP_NOT:
2507     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2508     ecode++;
2509     GETCHARINCTEST(c, eptr);
2510     if ((ims & PCRE_CASELESS) != 0)
2511     {
2512     #ifdef SUPPORT_UTF8
2513     if (c < 256)
2514     #endif
2515     c = md->lcc[c];
2516     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2517     }
2518     else
2519     {
2520     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2521     }
2522     break;
2523    
2524     /* Match a negated single one-byte character repeatedly. This is almost a
2525     repeat of the code for a repeated single character, but I haven't found a
2526     nice way of commoning these up that doesn't require a test of the
2527     positive/negative option for each character match. Maybe that wouldn't add
2528     very much to the time taken, but character matching *is* what this is all
2529     about... */
2530    
2531     case OP_NOTEXACT:
2532     min = max = GET2(ecode, 1);
2533     ecode += 3;
2534     goto REPEATNOTCHAR;
2535    
2536     case OP_NOTUPTO:
2537     case OP_NOTMINUPTO:
2538     min = 0;
2539     max = GET2(ecode, 1);
2540     minimize = *ecode == OP_NOTMINUPTO;
2541     ecode += 3;
2542     goto REPEATNOTCHAR;
2543    
2544 nigel 93 case OP_NOTPOSSTAR:
2545     possessive = TRUE;
2546     min = 0;
2547     max = INT_MAX;
2548     ecode++;
2549     goto REPEATNOTCHAR;
2550    
2551     case OP_NOTPOSPLUS:
2552     possessive = TRUE;
2553     min = 1;
2554     max = INT_MAX;
2555     ecode++;
2556     goto REPEATNOTCHAR;
2557    
2558     case OP_NOTPOSQUERY:
2559     possessive = TRUE;
2560     min = 0;
2561     max = 1;
2562     ecode++;
2563     goto REPEATNOTCHAR;
2564    
2565     case OP_NOTPOSUPTO:
2566     possessive = TRUE;
2567     min = 0;
2568     max = GET2(ecode, 1);
2569     ecode += 3;
2570     goto REPEATNOTCHAR;
2571    
2572 nigel 77 case OP_NOTSTAR:
2573     case OP_NOTMINSTAR:
2574     case OP_NOTPLUS:
2575     case OP_NOTMINPLUS:
2576     case OP_NOTQUERY:
2577     case OP_NOTMINQUERY:
2578     c = *ecode++ - OP_NOTSTAR;
2579     minimize = (c & 1) != 0;
2580     min = rep_min[c]; /* Pick up values from tables; */
2581     max = rep_max[c]; /* zero for max => infinity */
2582     if (max == 0) max = INT_MAX;
2583    
2584     /* Common code for all repeated single-byte matches. We can give up quickly
2585     if there are fewer than the minimum number of bytes left in the
2586     subject. */
2587    
2588     REPEATNOTCHAR:
2589     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2590     fc = *ecode++;
2591    
2592     /* The code is duplicated for the caseless and caseful cases, for speed,
2593     since matching characters is likely to be quite common. First, ensure the
2594     minimum number of matches are present. If min = max, continue at the same
2595     level without recursing. Otherwise, if minimizing, keep trying the rest of
2596     the expression and advancing one matching character if failing, up to the
2597     maximum. Alternatively, if maximizing, find the maximum number of
2598     characters and work backwards. */
2599    
2600     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2601     max, eptr));
2602    
2603     if ((ims & PCRE_CASELESS) != 0)
2604     {
2605     fc = md->lcc[fc];
2606    
2607     #ifdef SUPPORT_UTF8
2608     /* UTF-8 mode */
2609     if (utf8)
2610     {
2611 nigel 93 register unsigned int d;
2612 nigel 77 for (i = 1; i <= min; i++)
2613     {
2614     GETCHARINC(d, eptr);
2615     if (d < 256) d = md->lcc[d];
2616     if (fc == d) RRETURN(MATCH_NOMATCH);
2617     }
2618     }
2619     else
2620     #endif
2621    
2622     /* Not UTF-8 mode */
2623     {
2624     for (i = 1; i <= min; i++)
2625     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2626     }
2627    
2628     if (min == max) continue;
2629    
2630     if (minimize)
2631     {
2632     #ifdef SUPPORT_UTF8
2633     /* UTF-8 mode */
2634     if (utf8)
2635     {
2636 nigel 93 register unsigned int d;
2637 nigel 77 for (fi = min;; fi++)
2638     {
2639 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2640 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2641 ph10 366 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2642 nigel 77 GETCHARINC(d, eptr);
2643     if (d < 256) d = md->lcc[d];
2644 ph10 366 if (fc == d) RRETURN(MATCH_NOMATCH);
2645 ph10 371
2646 nigel 77 }
2647     }
2648     else
2649     #endif
2650     /* Not UTF-8 mode */
2651     {
2652     for (fi = min;; fi++)
2653     {
2654 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2655 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2656     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2657     RRETURN(MATCH_NOMATCH);
2658     }
2659     }
2660     /* Control never gets here */
2661     }
2662    
2663     /* Maximize case */
2664    
2665     else
2666     {
2667     pp = eptr;
2668    
2669     #ifdef SUPPORT_UTF8
2670     /* UTF-8 mode */
2671     if (utf8)
2672     {
2673 nigel 93 register unsigned int d;
2674 nigel 77 for (i = min; i < max; i++)
2675     {
2676     int len = 1;
2677     if (eptr >= md->end_subject) break;
2678     GETCHARLEN(d, eptr, len);
2679     if (d < 256) d = md->lcc[d];
2680     if (fc == d) break;
2681     eptr += len;
2682     }
2683 nigel 93 if (possessive) continue;
2684     for(;;)
2685 nigel 77 {
2686 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2687 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2688     if (eptr-- == pp) break; /* Stop if tried at original pos */
2689     BACKCHAR(eptr);
2690     }
2691     }
2692     else
2693     #endif
2694     /* Not UTF-8 mode */
2695     {
2696     for (i = min; i < max; i++)
2697     {
2698     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2699     eptr++;
2700     }
2701 nigel 93 if (possessive) continue;
2702 nigel 77 while (eptr >= pp)
2703     {
2704 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2705 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2706     eptr--;
2707     }
2708     }
2709    
2710     RRETURN(MATCH_NOMATCH);
2711     }
2712     /* Control never gets here */
2713     }
2714    
2715     /* Caseful comparisons */
2716    
2717     else
2718     {
2719     #ifdef SUPPORT_UTF8
2720     /* UTF-8 mode */
2721     if (utf8)
2722     {
2723 nigel 93 register unsigned int d;
2724 nigel 77 for (i = 1; i <= min; i++)
2725     {
2726     GETCHARINC(d, eptr);
2727     if (fc == d) RRETURN(MATCH_NOMATCH);
2728     }
2729     }
2730     else
2731     #endif
2732     /* Not UTF-8 mode */
2733     {
2734     for (i = 1; i <= min; i++)
2735     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2736     }
2737    
2738     if (min == max) continue;
2739    
2740     if (minimize)
2741     {
2742     #ifdef SUPPORT_UTF8
2743     /* UTF-8 mode */
2744     if (utf8)
2745     {
2746 nigel 93 register unsigned int d;
2747 nigel 77 for (fi = min;; fi++)
2748     {
2749 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2750 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2751 ph10 366 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2752 nigel 77 GETCHARINC(d, eptr);
2753 ph10 371 if (fc == d) RRETURN(MATCH_NOMATCH);
2754 nigel 77 }
2755     }
2756     else
2757     #endif
2758     /* Not UTF-8 mode */
2759     {
2760     for (fi = min;; fi++)
2761     {
2762 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2763 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2764     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2765     RRETURN(MATCH_NOMATCH);
2766     }
2767     }
2768     /* Control never gets here */
2769     }
2770    
2771     /* Maximize case */
2772    
2773     else
2774     {
2775     pp = eptr;
2776    
2777     #ifdef SUPPORT_UTF8
2778     /* UTF-8 mode */
2779     if (utf8)
2780     {
2781 nigel 93 register unsigned int d;
2782 nigel 77 for (i = min; i < max; i++)
2783     {
2784     int len = 1;
2785     if (eptr >= md->end_subject) break;
2786     GETCHARLEN(d, eptr, len);
2787     if (fc == d) break;
2788     eptr += len;
2789     }
2790 nigel 93 if (possessive) continue;
2791 nigel 77 for(;;)
2792     {
2793 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2794 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2795     if (eptr-- == pp) break; /* Stop if tried at original pos */
2796     BACKCHAR(eptr);
2797     }
2798     }
2799     else
2800     #endif
2801     /* Not UTF-8 mode */
2802     {
2803     for (i = min; i < max; i++)
2804     {
2805     if (eptr >= md->end_subject || fc == *eptr) break;
2806     eptr++;
2807     }
2808 nigel 93 if (possessive) continue;
2809 nigel 77 while (eptr >= pp)
2810     {
2811 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2812 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2813     eptr--;
2814     }
2815     }
2816    
2817     RRETURN(MATCH_NOMATCH);
2818     }
2819     }
2820     /* Control never gets here */
2821    
2822     /* Match a single character type repeatedly; several different opcodes
2823     share code. This is very similar to the code for single characters, but we
2824     repeat it in the interests of efficiency. */
2825    
2826     case OP_TYPEEXACT:
2827     min = max = GET2(ecode, 1);
2828     minimize = TRUE;
2829     ecode += 3;
2830     goto REPEATTYPE;
2831    
2832     case OP_TYPEUPTO:
2833     case OP_TYPEMINUPTO:
2834     min = 0;
2835     max = GET2(ecode, 1);
2836     minimize = *ecode == OP_TYPEMINUPTO;
2837     ecode += 3;
2838     goto REPEATTYPE;
2839    
2840 nigel 93 case OP_TYPEPOSSTAR:
2841     possessive = TRUE;
2842     min = 0;
2843     max = INT_MAX;
2844     ecode++;
2845     goto REPEATTYPE;
2846    
2847     case OP_TYPEPOSPLUS:
2848     possessive = TRUE;
2849     min = 1;
2850     max = INT_MAX;
2851     ecode++;
2852     goto REPEATTYPE;
2853    
2854     case OP_TYPEPOSQUERY:
2855     possessive = TRUE;
2856     min = 0;
2857     max = 1;
2858     ecode++;
2859     goto REPEATTYPE;
2860    
2861     case OP_TYPEPOSUPTO:
2862     possessive = TRUE;
2863     min = 0;
2864     max = GET2(ecode, 1);
2865     ecode += 3;
2866     goto REPEATTYPE;
2867    
2868 nigel 77 case OP_TYPESTAR:
2869     case OP_TYPEMINSTAR:
2870     case OP_TYPEPLUS:
2871     case OP_TYPEMINPLUS:
2872     case OP_TYPEQUERY:
2873     case OP_TYPEMINQUERY:
2874     c = *ecode++ - OP_TYPESTAR;
2875     minimize = (c & 1) != 0;
2876     min = rep_min[c]; /* Pick up values from tables; */
2877     max = rep_max[c]; /* zero for max => infinity */
2878     if (max == 0) max = INT_MAX;
2879    
2880     /* Common code for all repeated single character type matches. Note that
2881     in UTF-8 mode, '.' matches a character of any length, but for the other
2882     character types, the valid characters are all one-byte long. */
2883    
2884     REPEATTYPE:
2885     ctype = *ecode++; /* Code for the character type */
2886    
2887     #ifdef SUPPORT_UCP
2888     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2889     {
2890     prop_fail_result = ctype == OP_NOTPROP;
2891     prop_type = *ecode++;
2892 nigel 87 prop_value = *ecode++;
2893 nigel 77 }
2894     else prop_type = -1;
2895     #endif
2896    
2897     /* First, ensure the minimum number of matches are present. Use inline
2898     code for maximizing the speed, and do the type test once at the start
2899     (i.e. keep it out of the loop). Also we can test that there are at least
2900     the minimum number of bytes before we start. This isn't as effective in
2901     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2902     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2903     and single-bytes. */
2904    
2905     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2906     if (min > 0)
2907     {
2908     #ifdef SUPPORT_UCP
2909 nigel 87 if (prop_type >= 0)
2910 nigel 77 {
2911 nigel 87 switch(prop_type)
2912 nigel 77 {
2913 nigel 87 case PT_ANY:
2914     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2915     for (i = 1; i <= min; i++)
2916     {
2917     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2918 ph10 184 GETCHARINCTEST(c, eptr);
2919 nigel 87 }
2920     break;
2921    
2922     case PT_LAMP:
2923     for (i = 1; i <= min; i++)
2924     {
2925     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2926 ph10 184 GETCHARINCTEST(c, eptr);
2927 ph10 349 prop_chartype = UCD_CHARTYPE(c);
2928 nigel 87 if ((prop_chartype == ucp_Lu ||
2929     prop_chartype == ucp_Ll ||
2930     prop_chartype == ucp_Lt) == prop_fail_result)
2931     RRETURN(MATCH_NOMATCH);
2932     }
2933     break;
2934    
2935     case PT_GC:
2936     for (i = 1; i <= min; i++)
2937     {
2938     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2939 ph10 184 GETCHARINCTEST(c, eptr);
2940 ph10 349 prop_category = UCD_CATEGORY(c);
2941 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
2942     RRETURN(MATCH_NOMATCH);
2943     }
2944     break;
2945    
2946     case PT_PC:
2947     for (i = 1; i <= min; i++)
2948     {
2949     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2950 ph10 184 GETCHARINCTEST(c, eptr);
2951 ph10 349 prop_chartype = UCD_CHARTYPE(c);
2952 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
2953     RRETURN(MATCH_NOMATCH);
2954     }
2955     break;
2956    
2957     case PT_SC:
2958     for (i = 1; i <= min; i++)
2959     {
2960     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2961 ph10 184 GETCHARINCTEST(c, eptr);
2962 ph10 349 prop_script = UCD_SCRIPT(c);
2963 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
2964     RRETURN(MATCH_NOMATCH);
2965     }
2966     break;
2967    
2968     default:
2969     RRETURN(PCRE_ERROR_INTERNAL);
2970 nigel 77 }
2971     }
2972    
2973     /* Match extended Unicode sequences. We will get here only if the
2974     support is in the binary; otherwise a compile-time error occurs. */
2975    
2976     else if (ctype == OP_EXTUNI)
2977     {
2978     for (i = 1; i <= min; i++)
2979     {
2980     GETCHARINCTEST(c, eptr);
2981 ph10 349 prop_category = UCD_CATEGORY(c);
2982 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2983     while (eptr < md->end_subject)
2984     {
2985     int len = 1;
2986     if (!utf8) c = *eptr; else
2987     {
2988     GETCHARLEN(c, eptr, len);
2989     }
2990 ph10 349 prop_category = UCD_CATEGORY(c);
2991 nigel 77 if (prop_category != ucp_M) break;
2992     eptr += len;
2993     }
2994     }
2995     }
2996    
2997     else
2998     #endif /* SUPPORT_UCP */
2999    
3000     /* Handle all other cases when the coding is UTF-8 */
3001    
3002     #ifdef SUPPORT_UTF8
3003     if (utf8) switch(ctype)
3004     {
3005     case OP_ANY:
3006     for (i = 1; i <= min; i++)
3007     {
3008 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr))
3009 nigel 77 RRETURN(MATCH_NOMATCH);
3010 nigel 91 eptr++;
3011 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3012     }
3013     break;
3014    
3015 ph10 341 case OP_ALLANY:
3016     for (i = 1; i <= min; i++)
3017     {
3018     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3019     eptr++;
3020     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3021     }
3022     break;
3023    
3024 nigel 77 case OP_ANYBYTE:
3025     eptr += min;
3026     break;
3027    
3028 nigel 93 case OP_ANYNL:
3029     for (i = 1; i <= min; i++)
3030     {
3031     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3032     GETCHARINC(c, eptr);
3033     switch(c)
3034     {
3035     default: RRETURN(MATCH_NOMATCH);
3036     case 0x000d:
3037     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3038     break;
3039 ph10 231
3040 nigel 93 case 0x000a:
3041 ph10 231 break;
3042    
3043 nigel 93 case 0x000b:
3044     case 0x000c:
3045     case 0x0085:
3046     case 0x2028:
3047     case 0x2029:
3048 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3049 nigel 93 break;
3050     }
3051     }
3052     break;
3053    
3054 ph10 178 case OP_NOT_HSPACE:
3055     for (i = 1; i <= min; i++)
3056     {
3057     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3058     GETCHARINC(c, eptr);
3059     switch(c)
3060     {
3061     default: break;
3062     case 0x09: /* HT */
3063     case 0x20: /* SPACE */
3064     case 0xa0: /* NBSP */
3065     case 0x1680: /* OGHAM SPACE MARK */
3066     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3067     case 0x2000: /* EN QUAD */
3068     case 0x2001: /* EM QUAD */
3069     case 0x2002: /* EN SPACE */
3070     case 0x2003: /* EM SPACE */
3071     case 0x2004: /* THREE-PER-EM SPACE */
3072     case 0x2005: /* FOUR-PER-EM SPACE */
3073     case 0x2006: /* SIX-PER-EM SPACE */
3074     case 0x2007: /* FIGURE SPACE */
3075     case 0x2008: /* PUNCTUATION SPACE */
3076     case 0x2009: /* THIN SPACE */
3077     case 0x200A: /* HAIR SPACE */
3078     case 0x202f: /* NARROW NO-BREAK SPACE */
3079     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3080     case 0x3000: /* IDEOGRAPHIC SPACE */
3081     RRETURN(MATCH_NOMATCH);
3082     }
3083     }
3084     break;
3085 ph10 182
3086 ph10 178 case OP_HSPACE:
3087     for (i = 1; i <= min; i++)
3088     {
3089     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3090     GETCHARINC(c, eptr);
3091     switch(c)
3092     {
3093     default: RRETURN(MATCH_NOMATCH);
3094     case 0x09: /* HT */
3095     case 0x20: /* SPACE */
3096     case 0xa0: /* NBSP */
3097     case 0x1680: /* OGHAM SPACE MARK */
3098     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3099     case 0x2000: /* EN QUAD */
3100     case 0x2001: /* EM QUAD */
3101     case 0x2002: /* EN SPACE */
3102     case 0x2003: /* EM SPACE */
3103     case 0x2004: /* THREE-PER-EM SPACE */
3104     case 0x2005: /* FOUR-PER-EM SPACE */
3105     case 0x2006: /* SIX-PER-EM SPACE */
3106     case 0x2007: /* FIGURE SPACE */
3107     case 0x2008: /* PUNCTUATION SPACE */
3108     case 0x2009: /* THIN SPACE */
3109     case 0x200A: /* HAIR SPACE */
3110     case 0x202f: /* NARROW NO-BREAK SPACE */
3111     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3112     case 0x3000: /* IDEOGRAPHIC SPACE */
3113     break;
3114     }
3115     }
3116     break;
3117 ph10 182
3118 ph10 178 case OP_NOT_VSPACE:
3119     for (i = 1; i <= min; i++)
3120     {
3121     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3122     GETCHARINC(c, eptr);
3123     switch(c)
3124     {
3125     default: break;
3126     case 0x0a: /* LF */
3127     case 0x0b: /* VT */
3128     case 0x0c: /* FF */
3129     case 0x0d: /* CR */
3130     case 0x85: /* NEL */
3131     case 0x2028: /* LINE SEPARATOR */
3132     case 0x2029: /* PARAGRAPH SEPARATOR */
3133     RRETURN(MATCH_NOMATCH);
3134     }
3135     }
3136     break;
3137 ph10 182
3138 ph10 178 case OP_VSPACE:
3139     for (i = 1; i <= min; i++)
3140     {
3141     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3142     GETCHARINC(c, eptr);
3143     switch(c)
3144     {
3145     default: RRETURN(MATCH_NOMATCH);
3146     case 0x0a: /* LF */
3147     case 0x0b: /* VT */
3148     case 0x0c: /* FF */
3149     case 0x0d: /* CR */
3150     case 0x85: /* NEL */
3151     case 0x2028: /* LINE SEPARATOR */
3152     case 0x2029: /* PARAGRAPH SEPARATOR */
3153 ph10 182 break;
3154 ph10 178 }
3155     }
3156     break;
3157    
3158 nigel 77 case OP_NOT_DIGIT:
3159     for (i = 1; i <= min; i++)
3160     {
3161     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3162     GETCHARINC(c, eptr);
3163     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3164     RRETURN(MATCH_NOMATCH);
3165     }
3166     break;
3167    
3168     case OP_DIGIT:
3169     for (i = 1; i <= min; i++)
3170     {
3171     if (eptr >= md->end_subject ||
3172     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3173     RRETURN(MATCH_NOMATCH);
3174     /* No need to skip more bytes - we know it's a 1-byte character */
3175     }
3176     break;
3177    
3178     case OP_NOT_WHITESPACE:
3179     for (i = 1; i <= min; i++)
3180     {
3181     if (eptr >= md->end_subject ||
3182 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3183 nigel 77 RRETURN(MATCH_NOMATCH);
3184 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3185 nigel 77 }
3186     break;
3187    
3188     case OP_WHITESPACE:
3189     for (i = 1; i <= min; i++)
3190     {
3191     if (eptr >= md->end_subject ||
3192     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3193     RRETURN(MATCH_NOMATCH);
3194     /* No need to skip more bytes - we know it's a 1-byte character */
3195     }
3196     break;
3197    
3198     case OP_NOT_WORDCHAR:
3199     for (i = 1; i <= min; i++)
3200     {
3201     if (eptr >= md->end_subject ||
3202 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3203 nigel 77 RRETURN(MATCH_NOMATCH);
3204 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3205 nigel 77 }
3206     break;
3207    
3208     case OP_WORDCHAR:
3209     for (i = 1; i <= min; i++)
3210     {
3211     if (eptr >= md->end_subject ||
3212     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3213     RRETURN(MATCH_NOMATCH);
3214     /* No need to skip more bytes - we know it's a 1-byte character */
3215     }
3216     break;
3217    
3218     default:
3219     RRETURN(PCRE_ERROR_INTERNAL);
3220     } /* End switch(ctype) */
3221    
3222     else
3223     #endif /* SUPPORT_UTF8 */
3224    
3225     /* Code for the non-UTF-8 case for minimum matching of operators other
3226 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3227     number of bytes present, as this was tested above. */
3228 nigel 77
3229     switch(ctype)
3230     {
3231     case OP_ANY:
3232 ph10 342 for (i = 1; i <= min; i++)
3233 nigel 77 {
3234 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3235     eptr++;
3236 nigel 77 }
3237     break;
3238    
3239 ph10 341 case OP_ALLANY:
3240     eptr += min;
3241     break;
3242    
3243 nigel 77 case OP_ANYBYTE:
3244     eptr += min;
3245     break;
3246    
3247 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
3248     bytes are present in this case. */
3249    
3250     case OP_ANYNL:
3251     for (i = 1; i <= min; i++)
3252     {
3253     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3254     switch(*eptr++)
3255     {
3256     default: RRETURN(MATCH_NOMATCH);
3257     case 0x000d:
3258     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3259     break;
3260     case 0x000a:
3261 ph10 231 break;
3262    
3263 nigel 93 case 0x000b:
3264     case 0x000c:
3265     case 0x0085:
3266 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3267 nigel 93 break;
3268     }
3269     }
3270     break;
3271    
3272 ph10 178 case OP_NOT_HSPACE:
3273     for (i = 1; i <= min; i++)
3274     {
3275     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3276     switch(*eptr++)
3277     {
3278     default: break;
3279     case 0x09: /* HT */
3280     case 0x20: /* SPACE */
3281     case 0xa0: /* NBSP */
3282     RRETURN(MATCH_NOMATCH);
3283     }
3284     }
3285     break;
3286    
3287     case OP_HSPACE:
3288     for (i = 1; i <= min; i++)
3289     {
3290     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3291     switch(*eptr++)
3292     {
3293     default: RRETURN(MATCH_NOMATCH);
3294     case 0x09: /* HT */
3295     case 0x20: /* SPACE */
3296     case 0xa0: /* NBSP */
3297 ph10 182 break;
3298 ph10 178 }
3299     }
3300     break;
3301    
3302     case OP_NOT_VSPACE:
3303     for (i = 1; i <= min; i++)
3304     {
3305     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3306     switch(*eptr++)
3307     {
3308     default: break;
3309     case 0x0a: /* LF */
3310     case 0x0b: /* VT */
3311     case 0x0c: /* FF */
3312     case 0x0d: /* CR */
3313     case 0x85: /* NEL */
3314     RRETURN(MATCH_NOMATCH);
3315     }
3316     }
3317     break;
3318    
3319     case OP_VSPACE:
3320     for (i = 1; i <= min; i++)
3321     {
3322     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3323     switch(*eptr++)
3324     {
3325     default: RRETURN(MATCH_NOMATCH);
3326     case 0x0a: /* LF */
3327     case 0x0b: /* VT */
3328     case 0x0c: /* FF */
3329     case 0x0d: /* CR */
3330     case 0x85: /* NEL */
3331 ph10 182 break;
3332 ph10 178 }
3333     }
3334     break;
3335    
3336 nigel 77 case OP_NOT_DIGIT:
3337     for (i = 1; i <= min; i++)
3338     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3339     break;
3340    
3341     case OP_DIGIT:
3342     for (i = 1; i <= min; i++)
3343     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3344     break;
3345    
3346     case OP_NOT_WHITESPACE:
3347     for (i = 1; i <= min; i++)
3348     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3349     break;
3350    
3351     case OP_WHITESPACE:
3352     for (i = 1; i <= min; i++)
3353     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3354     break;
3355    
3356     case OP_NOT_WORDCHAR:
3357     for (i = 1; i <= min; i++)
3358     if ((md->ctypes[*eptr++] & ctype_word) != 0)
3359     RRETURN(MATCH_NOMATCH);
3360     break;
3361    
3362     case OP_WORDCHAR:
3363     for (i = 1; i <= min; i++)
3364     if ((md->ctypes[*eptr++] & ctype_word) == 0)
3365     RRETURN(MATCH_NOMATCH);
3366     break;
3367    
3368     default:
3369     RRETURN(PCRE_ERROR_INTERNAL);
3370     }
3371     }
3372    
3373     /* If min = max, continue at the same level without recursing */
3374    
3375     if (min == max) continue;
3376    
3377     /* If minimizing, we have to test the rest of the pattern before each
3378     subsequent match. Again, separate the UTF-8 case for speed, and also
3379     separate the UCP cases. */
3380    
3381     if (minimize)
3382     {
3383     #ifdef SUPPORT_UCP
3384 nigel 87 if (prop_type >= 0)
3385 nigel 77 {
3386 nigel 87 switch(prop_type)
3387 nigel 77 {
3388 nigel 87 case PT_ANY:
3389     for (fi = min;; fi++)
3390     {
3391 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3392 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3393     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3394     GETCHARINC(c, eptr);
3395     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3396     }
3397 nigel 93 /* Control never gets here */
3398 nigel 87
3399     case PT_LAMP:
3400     for (fi = min;; fi++)
3401     {
3402 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3403 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3404     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3405     GETCHARINC(c, eptr);
3406 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3407 nigel 87 if ((prop_chartype == ucp_Lu ||
3408     prop_chartype == ucp_Ll ||
3409     prop_chartype == ucp_Lt) == prop_fail_result)
3410     RRETURN(MATCH_NOMATCH);
3411     }
3412 nigel 93 /* Control never gets here */
3413 nigel 87
3414     case PT_GC:
3415     for (fi = min;; fi++)
3416     {
3417 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3418 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3419     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3420     GETCHARINC(c, eptr);
3421 ph10 349 prop_category = UCD_CATEGORY(c);
3422 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3423     RRETURN(MATCH_NOMATCH);
3424     }
3425 nigel 93 /* Control never gets here */
3426 nigel 87
3427     case PT_PC:
3428     for (fi = min;; fi++)
3429     {
3430 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3431 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3432     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3433     GETCHARINC(c, eptr);
3434 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3435 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3436     RRETURN(MATCH_NOMATCH);
3437     }
3438 nigel 93 /* Control never gets here */
3439 nigel 87
3440     case PT_SC:
3441     for (fi = min;; fi++)
3442     {
3443 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3444 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3445     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3446     GETCHARINC(c, eptr);
3447 ph10 349 prop_script = UCD_SCRIPT(c);
3448 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3449     RRETURN(MATCH_NOMATCH);
3450     }
3451 nigel 93 /* Control never gets here */
3452 nigel 87
3453     default:
3454     RRETURN(PCRE_ERROR_INTERNAL);
3455 nigel 77 }
3456     }
3457    
3458     /* Match extended Unicode sequences. We will get here only if the
3459     support is in the binary; otherwise a compile-time error occurs. */
3460    
3461     else if (ctype == OP_EXTUNI)
3462     {
3463     for (fi = min;; fi++)
3464     {
3465 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3466 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3467     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3468     GETCHARINCTEST(c, eptr);
3469 ph10 349 prop_category = UCD_CATEGORY(c);
3470 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3471     while (eptr < md->end_subject)
3472     {
3473     int len = 1;
3474     if (!utf8) c = *eptr; else
3475     {
3476     GETCHARLEN(c, eptr, len);
3477     }
3478 ph10 349 prop_category = UCD_CATEGORY(c);
3479 nigel 77 if (prop_category != ucp_M) break;
3480     eptr += len;
3481     }
3482     }
3483     }
3484    
3485     else
3486     #endif /* SUPPORT_UCP */
3487    
3488     #ifdef SUPPORT_UTF8
3489     /* UTF-8 mode */
3490     if (utf8)
3491     {
3492     for (fi = min;; fi++)
3493     {
3494 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3495 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3496 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3497 ph10 342 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3498 nigel 91 RRETURN(MATCH_NOMATCH);
3499 nigel 77
3500     GETCHARINC(c, eptr);
3501     switch(ctype)
3502     {
3503 ph10 342 case OP_ANY: /* This is the non-NL case */
3504 ph10 345 case OP_ALLANY:
3505 nigel 77 case OP_ANYBYTE:
3506     break;
3507    
3508 nigel 93 case OP_ANYNL:
3509     switch(c)
3510     {
3511     default: RRETURN(MATCH_NOMATCH);
3512     case 0x000d:
3513     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3514     break;
3515     case 0x000a:
3516 ph10 231 break;
3517    
3518 nigel 93 case 0x000b:
3519     case 0x000c:
3520     case 0x0085:
3521     case 0x2028:
3522     case 0x2029:
3523 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3524 nigel 93 break;
3525     }
3526     break;
3527    
3528 ph10 178 case OP_NOT_HSPACE:
3529     switch(c)
3530     {
3531     default: break;
3532     case 0x09: /* HT */
3533     case 0x20: /* SPACE */
3534     case 0xa0: /* NBSP */
3535     case 0x1680: /* OGHAM SPACE MARK */
3536     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3537     case 0x2000: /* EN QUAD */
3538     case 0x2001: /* EM QUAD */
3539     case 0x2002: /* EN SPACE */
3540     case 0x2003: /* EM SPACE */
3541     case 0x2004: /* THREE-PER-EM SPACE */
3542     case 0x2005: /* FOUR-PER-EM SPACE */
3543     case 0x2006: /* SIX-PER-EM SPACE */
3544     case 0x2007: /* FIGURE SPACE */
3545     case 0x2008: /* PUNCTUATION SPACE */
3546     case 0x2009: /* THIN SPACE */
3547     case 0x200A: /* HAIR SPACE */
3548     case 0x202f: /* NARROW NO-BREAK SPACE */
3549     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3550     case 0x3000: /* IDEOGRAPHIC SPACE */
3551     RRETURN(MATCH_NOMATCH);
3552     }
3553     break;
3554    
3555     case OP_HSPACE:
3556     switch(c)
3557     {
3558     default: RRETURN(MATCH_NOMATCH);
3559     case 0x09: /* HT */
3560     case 0x20: /* SPACE */
3561     case 0xa0: /* NBSP */
3562     case 0x1680: /* OGHAM SPACE MARK */
3563     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3564     case 0x2000: /* EN QUAD */
3565     case 0x2001: /* EM QUAD */
3566     case 0x2002: /* EN SPACE */
3567     case 0x2003: /* EM SPACE */
3568     case 0x2004: /* THREE-PER-EM SPACE */
3569     case 0x2005: /* FOUR-PER-EM SPACE */
3570     case 0x2006: /* SIX-PER-EM SPACE */
3571     case 0x2007: /* FIGURE SPACE */
3572     case 0x2008: /* PUNCTUATION SPACE */
3573     case 0x2009: /* THIN SPACE */
3574     case 0x200A: /* HAIR SPACE */
3575     case 0x202f: /* NARROW NO-BREAK SPACE */
3576     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3577     case 0x3000: /* IDEOGRAPHIC SPACE */
3578     break;
3579     }
3580     break;
3581    
3582     case OP_NOT_VSPACE:
3583     switch(c)
3584     {
3585     default: break;
3586     case 0x0a: /* LF */
3587     case 0x0b: /* VT */
3588     case 0x0c: /* FF */
3589     case 0x0d: /* CR */
3590     case 0x85: /* NEL */
3591     case 0x2028: /* LINE SEPARATOR */
3592     case 0x2029: /* PARAGRAPH SEPARATOR */
3593     RRETURN(MATCH_NOMATCH);
3594     }
3595     break;
3596    
3597     case OP_VSPACE:
3598     switch(c)
3599     {
3600     default: RRETURN(MATCH_NOMATCH);
3601     case 0x0a: /* LF */
3602     case 0x0b: /* VT */
3603     case 0x0c: /* FF */
3604     case 0x0d: /* CR */
3605     case 0x85: /* NEL */
3606     case 0x2028: /* LINE SEPARATOR */
3607     case 0x2029: /* PARAGRAPH SEPARATOR */
3608     break;
3609     }
3610     break;
3611    
3612 nigel 77 case OP_NOT_DIGIT:
3613     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3614     RRETURN(MATCH_NOMATCH);
3615     break;
3616    
3617     case OP_DIGIT:
3618     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3619     RRETURN(MATCH_NOMATCH);
3620     break;
3621    
3622     case OP_NOT_WHITESPACE:
3623     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3624     RRETURN(MATCH_NOMATCH);
3625     break;
3626    
3627     case OP_WHITESPACE:
3628     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3629     RRETURN(MATCH_NOMATCH);
3630     break;
3631    
3632     case OP_NOT_WORDCHAR:
3633     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3634     RRETURN(MATCH_NOMATCH);
3635     break;
3636    
3637     case OP_WORDCHAR:
3638     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3639     RRETURN(MATCH_NOMATCH);
3640     break;
3641    
3642     default:
3643     RRETURN(PCRE_ERROR_INTERNAL);
3644     }
3645     }
3646     }
3647     else
3648     #endif
3649     /* Not UTF-8 mode */
3650     {
3651     for (fi = min;; fi++)
3652     {
3653 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3654 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3655 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3656 ph10 342 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3657 nigel 91 RRETURN(MATCH_NOMATCH);
3658    
3659 nigel 77 c = *eptr++;
3660     switch(ctype)
3661     {
3662 ph10 342 case OP_ANY: /* This is the non-NL case */
3663 ph10 345 case OP_ALLANY:
3664 nigel 77 case OP_ANYBYTE:
3665     break;
3666    
3667 nigel 93 case OP_ANYNL:
3668     switch(c)
3669     {
3670     default: RRETURN(MATCH_NOMATCH);
3671     case 0x000d:
3672     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3673     break;
3674 ph10 231
3675 nigel 93 case 0x000a:
3676 ph10 231 break;
3677    
3678 nigel 93 case 0x000b:
3679     case 0x000c:
3680     case 0x0085:
3681 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3682 nigel 93 break;
3683     }
3684     break;
3685    
3686 ph10 178 case OP_NOT_HSPACE:
3687     switch(c)
3688     {
3689     default: break;
3690     case 0x09: /* HT */
3691     case 0x20: /* SPACE */
3692     case 0xa0: /* NBSP */
3693     RRETURN(MATCH_NOMATCH);
3694     }
3695     break;
3696    
3697     case OP_HSPACE:
3698     switch(c)
3699     {
3700     default: RRETURN(MATCH_NOMATCH);
3701     case 0x09: /* HT */
3702     case 0x20: /* SPACE */
3703     case 0xa0: /* NBSP */
3704     break;
3705     }
3706     break;
3707    
3708     case OP_NOT_VSPACE:
3709     switch(c)
3710     {
3711     default: break;
3712     case 0x0a: /* LF */
3713     case 0x0b: /* VT */
3714     case 0x0c: /* FF */
3715     case 0x0d: /* CR */
3716     case 0x85: /* NEL */
3717     RRETURN(MATCH_NOMATCH);
3718     }
3719     break;
3720    
3721     case OP_VSPACE:
3722     switch(c)
3723     {
3724     default: RRETURN(MATCH_NOMATCH);
3725     case 0x0a: /* LF */
3726     case 0x0b: /* VT */
3727     case 0x0c: /* FF */
3728     case 0x0d: /* CR */
3729     case 0x85: /* NEL */
3730     break;
3731     }
3732     break;
3733    
3734 nigel 77 case OP_NOT_DIGIT:
3735     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3736     break;
3737    
3738     case OP_DIGIT:
3739     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3740     break;
3741    
3742     case OP_NOT_WHITESPACE:
3743     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3744     break;
3745    
3746     case OP_WHITESPACE:
3747     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3748     break;
3749    
3750     case OP_NOT_WORDCHAR:
3751     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3752     break;
3753    
3754     case OP_WORDCHAR:
3755     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3756     break;
3757    
3758     default:
3759     RRETURN(PCRE_ERROR_INTERNAL);
3760     }
3761     }
3762     }
3763     /* Control never gets here */
3764     }
3765    
3766 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3767 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3768     UTF-8 and UCP stuff separate. */
3769    
3770     else
3771     {
3772     pp = eptr; /* Remember where we started */
3773    
3774     #ifdef SUPPORT_UCP
3775 nigel 87 if (prop_type >= 0)
3776 nigel 77 {
3777 nigel 87 switch(prop_type)
3778 nigel 77 {
3779 nigel 87 case PT_ANY:
3780     for (i = min; i < max; i++)
3781     {
3782     int len = 1;
3783     if (eptr >= md->end_subject) break;
3784     GETCHARLEN(c, eptr, len);
3785     if (prop_fail_result) break;
3786     eptr+= len;
3787     }
3788     break;
3789    
3790     case PT_LAMP:
3791     for (i = min; i < max; i++)
3792     {
3793     int len = 1;
3794     if (eptr >= md->end_subject) break;
3795     GETCHARLEN(c, eptr, len);
3796 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3797 nigel 87 if ((prop_chartype == ucp_Lu ||
3798     prop_chartype == ucp_Ll ||
3799     prop_chartype == ucp_Lt) == prop_fail_result)
3800     break;
3801     eptr+= len;
3802     }
3803     break;
3804    
3805     case PT_GC:
3806     for (i = min; i < max; i++)
3807     {
3808     int len = 1;
3809     if (eptr >= md->end_subject) break;
3810     GETCHARLEN(c, eptr, len);
3811 ph10 349 prop_category = UCD_CATEGORY(c);
3812 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3813     break;
3814     eptr+= len;
3815     }
3816     break;
3817    
3818     case PT_PC:
3819     for (i = min; i < max; i++)
3820     {
3821     int len = 1;
3822     if (eptr >= md->end_subject) break;
3823     GETCHARLEN(c, eptr, len);
3824 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3825 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3826     break;
3827     eptr+= len;
3828     }
3829     break;
3830    
3831     case PT_SC:
3832     for (i = min; i < max; i++)
3833     {
3834     int len = 1;
3835     if (eptr >= md->end_subject) break;
3836     GETCHARLEN(c, eptr, len);
3837 ph10 349 prop_script = UCD_SCRIPT(c);
3838 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3839     break;
3840     eptr+= len;
3841     }
3842     break;
3843 nigel 77 }
3844    
3845     /* eptr is now past the end of the maximum run */
3846    
3847 nigel 93 if (possessive) continue;
3848 nigel 77 for(;;)
3849     {
3850 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3851 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3852     if (eptr-- == pp) break; /* Stop if tried at original pos */
3853 ph10 207 if (utf8) BACKCHAR(eptr);
3854 nigel 77 }
3855     }
3856    
3857     /* Match extended Unicode sequences. We will get here only if the
3858     support is in the binary; otherwise a compile-time error occurs. */
3859    
3860     else if (ctype == OP_EXTUNI)
3861     {
3862     for (i = min; i < max; i++)
3863     {
3864     if (eptr >= md->end_subject) break;
3865     GETCHARINCTEST(c, eptr);
3866 ph10 349 prop_category = UCD_CATEGORY(c);
3867 nigel 77 if (prop_category == ucp_M) break;
3868     while (eptr < md->end_subject)
3869     {
3870     int len = 1;
3871     if (!utf8) c = *eptr; else
3872     {
3873     GETCHARLEN(c, eptr, len);
3874     }
3875 ph10 349 prop_category = UCD_CATEGORY(c);
3876 nigel 77 if (prop_category != ucp_M) break;
3877     eptr += len;
3878     }
3879     }
3880    
3881     /* eptr is now past the end of the maximum run */
3882    
3883 nigel 93 if (possessive) continue;
3884 nigel 77 for(;;)
3885     {
3886 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3887 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3888     if (eptr-- == pp) break; /* Stop if tried at original pos */
3889     for (;;) /* Move back over one extended */
3890     {
3891     int len = 1;
3892     if (!utf8) c = *eptr; else
3893     {
3894 ph10 207 BACKCHAR(eptr);
3895 nigel 77 GETCHARLEN(c, eptr, len);
3896     }
3897 ph10 349 prop_category = UCD_CATEGORY(c);
3898 nigel 77 if (prop_category != ucp_M) break;
3899     eptr--;
3900     }
3901     }
3902     }
3903    
3904     else
3905     #endif /* SUPPORT_UCP */
3906    
3907     #ifdef SUPPORT_UTF8
3908     /* UTF-8 mode */
3909    
3910     if (utf8)
3911     {
3912     switch(ctype)
3913     {
3914     case OP_ANY:
3915     if (max < INT_MAX)
3916     {
3917 ph10 342 for (i = min; i < max; i++)
3918 nigel 77 {
3919 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3920     eptr++;
3921     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3922 nigel 77 }
3923     }
3924    
3925     /* Handle unlimited UTF-8 repeat */
3926    
3927     else
3928     {
3929 ph10 342 for (i = min; i < max; i++)
3930 nigel 77 {
3931 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3932     eptr++;
3933     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3934 nigel 77 }
3935     }
3936     break;
3937    
3938 ph10 341 case OP_ALLANY:
3939     if (max < INT_MAX)
3940     {
3941     for (i = min; i < max; i++)
3942     {
3943     if (eptr >= md->end_subject) break;
3944     eptr++;
3945     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3946     }
3947     }
3948     else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3949     break;
3950    
3951 nigel 77 /* The byte case is the same as non-UTF8 */
3952    
3953     case OP_ANYBYTE:
3954     c = max - min;
3955 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3956     c = md->end_subject - eptr;
3957 nigel 77 eptr += c;
3958     break;
3959    
3960 nigel 93 case OP_ANYNL:
3961     for (i = min; i < max; i++)
3962     {
3963     int len = 1;
3964     if (eptr >= md->end_subject) break;
3965     GETCHARLEN(c, eptr, len);
3966     if (c == 0x000d)
3967     {
3968     if (++eptr >= md->end_subject) break;
3969     if (*eptr == 0x000a) eptr++;
3970     }
3971     else
3972     {
3973 ph10 231 if (c != 0x000a &&
3974     (md->bsr_anycrlf ||
3975     (c != 0x000b && c != 0x000c &&
3976     c != 0x0085 && c != 0x2028 && c != 0x2029)))
3977 nigel 93 break;
3978     eptr += len;
3979     }
3980     }
3981     break;
3982    
3983 ph10 178 case OP_NOT_HSPACE:
3984 ph10 182 case OP_HSPACE:
3985 ph10 178 for (i = min; i < max; i++)
3986     {
3987 ph10 182 BOOL gotspace;
3988 ph10 178 int len = 1;
3989     if (eptr >= md->end_subject) break;
3990     GETCHARLEN(c, eptr, len);
3991     switch(c)
3992 ph10 182 {
3993     default: gotspace = FALSE; break;
3994 ph10 178 case 0x09: /* HT */
3995     case 0x20: /* SPACE */
3996     case 0xa0: /* NBSP */
3997     case 0x1680: /* OGHAM SPACE MARK */
3998     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3999     case 0x2000: /* EN QUAD */
4000     case 0x2001: /* EM QUAD */
4001     case 0x2002: /* EN SPACE */
4002     case 0x2003: /* EM SPACE */
4003     case 0x2004: /* THREE-PER-EM SPACE */
4004     case 0x2005: /* FOUR-PER-EM SPACE */
4005     case 0x2006: /* SIX-PER-EM SPACE */
4006     case 0x2007: /* FIGURE SPACE */
4007     case 0x2008: /* PUNCTUATION SPACE */
4008     case 0x2009: /* THIN SPACE */
4009     case 0x200A: /* HAIR SPACE */
4010     case 0x202f: /* NARROW NO-BREAK SPACE */
4011     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4012     case 0x3000: /* IDEOGRAPHIC SPACE */
4013     gotspace = TRUE;
4014 ph10 182 break;
4015 ph10 178 }
4016     if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4017     eptr += len;
4018     }
4019     break;
4020    
4021     case OP_NOT_VSPACE:
4022 ph10 182 case OP_VSPACE:
4023 ph10 178 for (i = min; i < max; i++)
4024     {
4025 ph10 182 BOOL gotspace;
4026 ph10 178 int len = 1;
4027     if (eptr >= md->end_subject) break;
4028     GETCHARLEN(c, eptr, len);
4029     switch(c)
4030     {
4031 ph10 182 default: gotspace = FALSE; break;
4032 ph10 178 case 0x0a: /* LF */
4033     case 0x0b: /* VT */
4034     case 0x0c: /* FF */
4035     case 0x0d: /* CR */
4036     case 0x85: /* NEL */
4037     case 0x2028: /* LINE SEPARATOR */
4038     case 0x2029: /* PARAGRAPH SEPARATOR */
4039     gotspace = TRUE;
4040     break;
4041     }
4042 ph10 182 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4043 ph10 178 eptr += len;
4044     }
4045     break;
4046    
4047 nigel 77 case OP_NOT_DIGIT:
4048     for (i = min; i < max; i++)
4049     {
4050     int len = 1;
4051     if (eptr >= md->end_subject) break;
4052     GETCHARLEN(c, eptr, len);
4053     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4054     eptr+= len;
4055     }
4056     break;
4057    
4058     case OP_DIGIT:
4059     for (i = min; i < max; i++)
4060     {
4061     int len = 1;
4062     if (eptr >= md->end_subject) break;
4063     GETCHARLEN(c, eptr, len);
4064     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4065     eptr+= len;
4066     }
4067     break;
4068    
4069     case OP_NOT_WHITESPACE:
4070     for (i = min; i < max; i++)
4071     {
4072     int len = 1;
4073     if (eptr >= md->end_subject) break;
4074     GETCHARLEN(c, eptr, len);
4075     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4076     eptr+= len;
4077     }
4078     break;
4079    
4080     case OP_WHITESPACE:
4081     for (i = min; i < max; i++)
4082     {
4083     int len = 1;
4084     if (eptr >= md->end_subject) break;
4085     GETCHARLEN(c, eptr, len);
4086     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4087     eptr+= len;
4088     }
4089     break;
4090    
4091     case OP_NOT_WORDCHAR:
4092     for (i = min; i < max; i++)