/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 435 - (hide annotations) (download)
Sat Sep 5 10:20:44 2009 UTC (3 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 161431 byte(s)
Further updates to partial matching.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 381 Copyright (c) 1997-2009 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
/* Bits that may be set in the "flags" argument of match(). */

#define match_condassert     0x01  /* Called to check a condition assertion */
#define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */

/* Results handed back by match() when no error occurred. Errors are reported
with the externally defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_MATCH          1
#define MATCH_NOMATCH        0

/* Internal-only results from match(). They are chosen to be far enough below
zero that they cannot collide with the external error codes. */

#define MATCH_COMMIT         (-999)
#define MATCH_PRUNE          (-998)
#define MATCH_SKIP           (-997)
#define MATCH_THEN           (-996)

/* Largest number of offset-vector ints that are saved on the stack for a
recursive call; bigger offset vectors are saved with malloc instead. Keep this
a multiple of 3, because the offset vector's length always is. */

#define REC_STACK_SAVE_MAX   30

/* Lower and upper repeat counts for the common repeats. An upper count of 0
stands for "no limit". (The six entries presumably correspond to the repeat
opcodes in the order *, *?, +, +?, ?, ?? - confirm against the opcode list.) */

static const char rep_min[] =
  {
  0, 0,     /* minimum 0, no maximum */
  1, 1,     /* minimum 1, no maximum */
  0, 0      /* minimum 0, maximum 1 */
  };

static const char rep_max[] =
  {
  0, 0,     /* 0 => infinity */
  0, 0,     /* 0 => infinity */
  1, 1
  };
89    
90    
91    
92     #ifdef DEBUG
93     /*************************************************
94     * Debugging function to print chars *
95     *************************************************/
96    
97     /* Print a sequence of chars in printable format, stopping at the end of the
98     subject if requested.
99    
100     Arguments:
101     p points to characters
102     length number to print
103     is_subject TRUE if printing from within md->start_subject
104     md pointer to matching data block, if is_subject is TRUE
105    
106     Returns: nothing
107     */
108    
109     static void
110     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111     {
112 nigel 93 unsigned int c;
113 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114     while (length-- > 0)
115     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116     }
117     #endif
118    
119    
120    
121     /*************************************************
122     * Match a back-reference *
123     *************************************************/
124    
125     /* If a back reference hasn't been set, the length that is passed is greater
126     than the number of characters left in the string, so the match fails.
127    
128     Arguments:
129     offset index into the offset vector
130     eptr points into the subject
131     length length to be matched
132     md points to match data block
133     ims the ims flags
134    
135     Returns: TRUE if matched
136     */
137    
138     static BOOL
139 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 nigel 77 unsigned long int ims)
141     {
142 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
143 nigel 77
144     #ifdef DEBUG
145     if (eptr >= md->end_subject)
146     printf("matching subject <null>");
147     else
148     {
149     printf("matching subject ");
150     pchars(eptr, length, TRUE, md);
151     }
152     printf(" against backref ");
153     pchars(p, length, FALSE, md);
154     printf("\n");
155     #endif
156    
157     /* Always fail if not enough characters left */
158    
159     if (length > md->end_subject - eptr) return FALSE;
160    
161 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162     properly if Unicode properties are supported. Otherwise, we can check only
163     ASCII characters. */
164 nigel 77
165     if ((ims & PCRE_CASELESS) != 0)
166     {
167 ph10 354 #ifdef SUPPORT_UTF8
168     #ifdef SUPPORT_UCP
169     if (md->utf8)
170     {
171 ph10 358 USPTR endptr = eptr + length;
172 ph10 354 while (eptr < endptr)
173     {
174 ph10 358 int c, d;
175 ph10 354 GETCHARINC(c, eptr);
176     GETCHARINC(d, p);
177     if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 ph10 358 }
179     }
180 ph10 354 else
181     #endif
182     #endif
183    
184     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185     is no UCP support. */
186 ph10 358
187 nigel 77 while (length-- > 0)
188 ph10 354 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 nigel 77 }
190 ph10 358
191 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
192     are in UTF-8 mode. */
193 ph10 358
194 nigel 77 else
195     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196    
197     return TRUE;
198     }
199    
200    
201    
202     /***************************************************************************
203     ****************************************************************************
204     RECURSION IN THE match() FUNCTION
205    
206 nigel 87 The match() function is highly recursive, though not every recursive call
207     increases the recursive depth. Nevertheless, some regular expressions can cause
208     it to recurse to a great depth. I was writing for Unix, so I just let it call
209     itself recursively. This uses the stack for saving everything that has to be
210     saved for a recursive call. On Unix, the stack can be large, and this works
211     fine.
212 nigel 77
213 nigel 87 It turns out that on some non-Unix-like systems there are problems with
214     programs that use a lot of stack. (This despite the fact that every last chip
215     has oodles of memory these days, and techniques for extending the stack have
216     been known for decades.) So....
217 nigel 77
218     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219     calls by keeping local variables that need to be preserved in blocks of memory
220 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
221 nigel 77 achieve this so that the actual code doesn't look very different to what it
222     always used to.
223 ph10 164
224 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
225 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
226     Switzer, the use of longjmp() has been abolished, at the cost of having to
227     provide a unique number for each call to RMATCH. There is no way of generating
228     a sequence of numbers at compile time in C. I have given them names, to make
229     them stand out more clearly.
230    
231     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
234     don't have indeterminate values; this has meant that the frame size can be
235 ph10 164 reduced because the result can be "passed back" by straight setting of the
236     variable instead of being passed in the frame.
237 nigel 77 ****************************************************************************
238     ***************************************************************************/
239    
240 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241     below must be updated in sync. */
242 nigel 77
/* One identifier per RMATCH() call site, numbered consecutively from 1. */

enum {
  RM1 = 1,
         RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18,
  RM19, RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27,
  RM28, RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36,
  RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45,
  RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54
};
249 ph10 164
250 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
251 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 ph10 164 actually used in this definition. */
253 nigel 77
254     #ifndef NO_RECURSE
255     #define REGISTER register
256 ph10 164
#ifdef DEBUG

/* Debugging form. Each recursive call to match(), and each return from it, is
traced on stdout together with the source line. The result of the call is left
in rrc. The current mstart is always passed on, and the recursion depth is one
more than the caller's. The rw argument is accepted but not used. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match((ra),(rb),mstart,(rc),(rd),(re),(rf),(rg),rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", (ra), __LINE__); \
  return (ra); \
  }

#else   /* DEBUG not defined */

/* Production form: a plain recursive call whose result is left in rrc, and a
plain return. The rw argument is accepted but not used. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match((ra),(rb),mstart,(rc),(rd),(re),(rf),(rg),rdepth+1)

#define RRETURN(ra) \
  return (ra)

#endif  /* DEBUG */
274    
275 nigel 77 #else
276    
277    
278 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
279     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280     argument of match(), which never changes. */
281 nigel 77
282     #define REGISTER
283    
/* "Call" match() without recursing. A new frame is obtained from
pcre_stack_malloc(), loaded with the argument values, linked to the current
frame, and made current; control then jumps to HEAP_RECURSE. The label L_<rw>
generated here is the point at which execution resumes when the "callee"
returns, so rw must be different for every use of the macro. The rd argument
is accepted but not used.

If the new frame cannot be allocated, the current incarnation gives up with
PCRE_ERROR_NOMEMORY instead of writing through a NULL pointer. This happens
before anything in the current frame has been changed, so RRETURN unwinds
exactly as it does for any other error return. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* "Return" from match() without recursing. The current frame is released and
its predecessor becomes current. If there is a predecessor, the result is
handed over in rrc and control jumps to HEAP_RETURN; if the released frame was
the top-level one (no predecessor), this is a genuine return from match(). */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
316    
317    
318     /* Structure for remembering the local variables in a private frame */
319    
320     typedef struct heapframe {
321     struct heapframe *Xprevframe;
322    
323     /* Function arguments that may change */
324    
325 ph10 409 USPTR Xeptr;
326 nigel 77 const uschar *Xecode;
327 ph10 409 USPTR Xmstart;
328 nigel 77 int Xoffset_top;
329     long int Xims;
330     eptrblock *Xeptrb;
331     int Xflags;
332 nigel 91 unsigned int Xrdepth;
333 nigel 77
334     /* Function local variables */
335    
336 ph10 409 USPTR Xcallpat;
337 ph10 406 #ifdef SUPPORT_UTF8
338 ph10 409 USPTR Xcharptr;
339 ph10 406 #endif
340 ph10 409 USPTR Xdata;
341     USPTR Xnext;
342     USPTR Xpp;
343     USPTR Xprev;
344     USPTR Xsaved_eptr;
345 nigel 77
346     recursion_info Xnew_recursive;
347    
348     BOOL Xcur_is_word;
349     BOOL Xcondition;
350     BOOL Xprev_is_word;
351    
352     unsigned long int Xoriginal_ims;
353    
354     #ifdef SUPPORT_UCP
355     int Xprop_type;
356 nigel 87 int Xprop_value;
357 nigel 77 int Xprop_fail_result;
358     int Xprop_category;
359     int Xprop_chartype;
360 nigel 87 int Xprop_script;
361 ph10 123 int Xoclength;
362     uschar Xocchars[8];
363 nigel 77 #endif
364    
365 ph10 403 int Xcodelink;
366 nigel 77 int Xctype;
367 nigel 93 unsigned int Xfc;
368 nigel 77 int Xfi;
369     int Xlength;
370     int Xmax;
371     int Xmin;
372     int Xnumber;
373     int Xoffset;
374     int Xop;
375     int Xsave_capture_last;
376     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
377     int Xstacksave[REC_STACK_SAVE_MAX];
378    
379     eptrblock Xnewptrb;
380    
381 ph10 164 /* Where to jump back to */
382 nigel 77
383 ph10 164 int Xwhere;
384 ph10 165
385 nigel 77 } heapframe;
386    
387     #endif
388    
389    
390     /***************************************************************************
391     ***************************************************************************/
392    
393    
394    
395     /*************************************************
396     * Match from current position *
397     *************************************************/
398    
399 nigel 93 /* This function is called recursively in many circumstances. Whenever it
400 nigel 77 returns a negative (error) response, the outer incarnation must also return the
401 ph10 426 same response. */
402 nigel 77
403 ph10 426 /* These macros pack up tests that are used for partial matching, and which
404     appear several times in the code. We set the "hit end" flag if the pointer is
405     at the end of the subject and also past the start of the current match (i.e.
406 ph10 427 something has been matched). For hard partial matching, we then return
407     immediately. The second one is used when we already know we are past the end of
408     the subject. */
409 ph10 426
/* Full test: partial matching is enabled, eptr has reached the end of the
subject, and eptr is beyond mstart. */

#define CHECK_PARTIAL() \
  if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

/* Short test: as above, but without the comparison against md->end_subject. */

#define SCHECK_PARTIAL() \
  if (md->partial != 0 && eptr > mstart) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
423 ph10 426
424 ph10 427
425 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
426     the md structure (e.g. utf8, end_subject) into individual variables to improve
427 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
428     made performance worse.
429    
430     Arguments:
431 nigel 93 eptr pointer to current character in subject
432     ecode pointer to current position in compiled code
433 ph10 168 mstart pointer to the current match start position (can be modified
434 ph10 172 by encountering \K)
435 nigel 77 offset_top current top pointer
436     md pointer to "static" info for the match
437     ims current /i, /m, and /s options
438     eptrb pointer to chain of blocks containing eptr at start of
439     brackets - for testing for empty matches
440     flags can contain
441     match_condassert - this is an assertion condition
442 nigel 93 match_cbegroup - this is the start of an unlimited repeat
443     group that can match an empty string
444 nigel 87 rdepth the recursion depth
445 nigel 77
446     Returns: MATCH_MATCH if matched ) these values are >= 0
447     MATCH_NOMATCH if failed to match )
448     a negative PCRE_ERROR_xxx value if aborted by an error condition
449 nigel 87 (e.g. stopped by repeated call or recursion limit)
450 nigel 77 */
451    
452     static int
453 ph10 409 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
454 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
455 nigel 91 int flags, unsigned int rdepth)
456 nigel 77 {
457     /* These variables do not need to be preserved over recursion in this function,
458 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
459     "register" because they are used a lot in loops. */
460 nigel 77
461 nigel 91 register int rrc; /* Returns from recursive calls */
462     register int i; /* Used for loops not involving calls to RMATCH() */
463 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
464 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
465 nigel 77
466 nigel 93 BOOL minimize, possessive; /* Quantifier options */
467 ph10 403 int condcode;
468 nigel 93
469 nigel 77 /* When recursion is not being used, all "local" variables that have to be
470     preserved over calls to RMATCH() are part of a "frame" which is obtained from
471     heap storage. Set up the top-level frame here; others are obtained from the
472     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
473    
474     #ifdef NO_RECURSE
475     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
476     frame->Xprevframe = NULL; /* Marks the top level */
477    
478     /* Copy in the original argument variables */
479    
480     frame->Xeptr = eptr;
481     frame->Xecode = ecode;
482 ph10 168 frame->Xmstart = mstart;
483 nigel 77 frame->Xoffset_top = offset_top;
484     frame->Xims = ims;
485     frame->Xeptrb = eptrb;
486     frame->Xflags = flags;
487 nigel 87 frame->Xrdepth = rdepth;
488 nigel 77
489     /* This is where control jumps back to in order to effect "recursion" */
490    
491     HEAP_RECURSE:
492    
493     /* Macros make the argument variables come from the current frame */
494    
495     #define eptr frame->Xeptr
496     #define ecode frame->Xecode
497 ph10 168 #define mstart frame->Xmstart
498 nigel 77 #define offset_top frame->Xoffset_top
499     #define ims frame->Xims
500     #define eptrb frame->Xeptrb
501     #define flags frame->Xflags
502 nigel 87 #define rdepth frame->Xrdepth
503 nigel 77
504     /* Ditto for the local variables */
505    
506     #ifdef SUPPORT_UTF8
507     #define charptr frame->Xcharptr
508     #endif
509     #define callpat frame->Xcallpat
510 ph10 403 #define codelink frame->Xcodelink
511 nigel 77 #define data frame->Xdata
512     #define next frame->Xnext
513     #define pp frame->Xpp
514     #define prev frame->Xprev
515     #define saved_eptr frame->Xsaved_eptr
516    
517     #define new_recursive frame->Xnew_recursive
518    
519     #define cur_is_word frame->Xcur_is_word
520     #define condition frame->Xcondition
521     #define prev_is_word frame->Xprev_is_word
522    
523     #define original_ims frame->Xoriginal_ims
524    
525     #ifdef SUPPORT_UCP
526     #define prop_type frame->Xprop_type
527 nigel 87 #define prop_value frame->Xprop_value
528 nigel 77 #define prop_fail_result frame->Xprop_fail_result
529     #define prop_category frame->Xprop_category
530     #define prop_chartype frame->Xprop_chartype
531 nigel 87 #define prop_script frame->Xprop_script
532 ph10 115 #define oclength frame->Xoclength
533     #define occhars frame->Xocchars
534 nigel 77 #endif
535    
536     #define ctype frame->Xctype
537     #define fc frame->Xfc
538     #define fi frame->Xfi
539     #define length frame->Xlength
540     #define max frame->Xmax
541     #define min frame->Xmin
542     #define number frame->Xnumber
543     #define offset frame->Xoffset
544     #define op frame->Xop
545     #define save_capture_last frame->Xsave_capture_last
546     #define save_offset1 frame->Xsave_offset1
547     #define save_offset2 frame->Xsave_offset2
548     #define save_offset3 frame->Xsave_offset3
549     #define stacksave frame->Xstacksave
550    
551     #define newptrb frame->Xnewptrb
552    
553     /* When recursion is being used, local variables are allocated on the stack and
554     get preserved during recursion in the normal way. In this environment, fi and
555     i, and fc and c, can be the same variables. */
556    
557 nigel 93 #else /* NO_RECURSE not defined */
558 nigel 77 #define fi i
559     #define fc c
560    
561    
562 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
563     const uschar *charptr; /* in small blocks of the code. My normal */
564     #endif /* style of coding would have declared */
565     const uschar *callpat; /* them within each of those blocks. */
566     const uschar *data; /* However, in order to accommodate the */
567     const uschar *next; /* version of this code that uses an */
568     USPTR pp; /* external "stack" implemented on the */
569     const uschar *prev; /* heap, it is easier to declare them all */
570     USPTR saved_eptr; /* here, so the declarations can be cut */
571     /* out in a block. The only declarations */
572     recursion_info new_recursive; /* within blocks below are for variables */
573     /* that do not have to be preserved over */
574     BOOL cur_is_word; /* a recursive call to RMATCH(). */
575     BOOL condition;
576 nigel 77 BOOL prev_is_word;
577    
578     unsigned long int original_ims;
579    
580     #ifdef SUPPORT_UCP
581     int prop_type;
582 nigel 87 int prop_value;
583 nigel 77 int prop_fail_result;
584     int prop_category;
585     int prop_chartype;
586 nigel 87 int prop_script;
587 ph10 115 int oclength;
588     uschar occhars[8];
589 nigel 77 #endif
590    
591 ph10 399 int codelink;
592 nigel 77 int ctype;
593     int length;
594     int max;
595     int min;
596     int number;
597     int offset;
598     int op;
599     int save_capture_last;
600     int save_offset1, save_offset2, save_offset3;
601     int stacksave[REC_STACK_SAVE_MAX];
602    
603     eptrblock newptrb;
604 nigel 93 #endif /* NO_RECURSE */
605 nigel 77
606     /* These statements are here to stop the compiler complaining about uninitialized
607     variables. */
608    
609     #ifdef SUPPORT_UCP
610 nigel 87 prop_value = 0;
611 nigel 77 prop_fail_result = 0;
612     #endif
613    
614 nigel 93
615 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
616     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
617     used. Thanks to Ian Taylor for noticing this possibility and sending the
618     original patch. */
619    
620     TAIL_RECURSE:
621    
622 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
623     are specified by the macro RMATCH and RRETURN is used to return. When
624     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
625     and a "return", respectively (possibly with some debugging if DEBUG is
626     defined). However, RMATCH isn't like a function call because it's quite a
627     complicated macro. It has to be used in one particular way. This shouldn't,
628     however, impact performance when true recursion is being used. */
629 nigel 77
630 ph10 164 #ifdef SUPPORT_UTF8
631     utf8 = md->utf8; /* Local copy of the flag */
632     #else
633     utf8 = FALSE;
634     #endif
635    
636 nigel 87 /* First check that we haven't called match() too many times, or that we
637     haven't exceeded the recursive call limit. */
638    
639 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
640 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
641 nigel 77
642     original_ims = ims; /* Save for resetting on ')' */
643 nigel 91
644 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
645     string, the match_cbegroup flag is set. When this is the case, add the current
646     subject pointer to the chain of such remembered pointers, to be checked when we
647     hit the closing ket, in order to break infinite loops that match no characters.
648 ph10 197 When match() is called in other circumstances, don't add to the chain. The
649     match_cbegroup flag must NOT be used with tail recursion, because the memory
650     block that is used is on the stack, so a new one may be required for each
651     match(). */
652 nigel 77
653 nigel 93 if ((flags & match_cbegroup) != 0)
654 nigel 77 {
655 ph10 197 newptrb.epb_saved_eptr = eptr;
656     newptrb.epb_prev = eptrb;
657     eptrb = &newptrb;
658 nigel 77 }
659    
660 nigel 93 /* Now start processing the opcodes. */
661 nigel 77
662     for (;;)
663     {
664 nigel 93 minimize = possessive = FALSE;
665 nigel 77 op = *ecode;
666 ph10 428
667 nigel 93 switch(op)
668     {
669 ph10 210 case OP_FAIL:
670 ph10 212 RRETURN(MATCH_NOMATCH);
671 ph10 211
672 ph10 210 case OP_PRUNE:
673     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
674     ims, eptrb, flags, RM51);
675     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
676 ph10 212 RRETURN(MATCH_PRUNE);
677 ph10 211
678 ph10 210 case OP_COMMIT:
679     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
680     ims, eptrb, flags, RM52);
681     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
682 ph10 212 RRETURN(MATCH_COMMIT);
683 ph10 211
684 ph10 210 case OP_SKIP:
685     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
686     ims, eptrb, flags, RM53);
687     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
688 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
689 ph10 212 RRETURN(MATCH_SKIP);
690 ph10 211
691 ph10 210 case OP_THEN:
692     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
693 ph10 212 ims, eptrb, flags, RM54);
694 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
695 ph10 212 RRETURN(MATCH_THEN);
696 ph10 211
697 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
698     the current subject position in the working slot at the top of the vector.
699     We mustn't change the current values of the data slot, because they may be
700     set from a previous iteration of this group, and be referred to by a
701     reference inside the group.
702 nigel 77
703 nigel 93 If the bracket fails to match, we need to restore this value and also the
704     values of the final offsets, in case they were set by a previous iteration
705     of the same bracket.
706 nigel 77
707 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
708     a non-capturing bracket. Don't worry about setting the flag for the error
709     case here; that is handled in the code for KET. */
710 nigel 77
711 nigel 93 case OP_CBRA:
712     case OP_SCBRA:
713     number = GET2(ecode, 1+LINK_SIZE);
714 nigel 77 offset = number << 1;
715    
716     #ifdef DEBUG
717 nigel 93 printf("start bracket %d\n", number);
718     printf("subject=");
719 nigel 77 pchars(eptr, 16, TRUE, md);
720     printf("\n");
721     #endif
722    
723     if (offset < md->offset_max)
724     {
725     save_offset1 = md->offset_vector[offset];
726     save_offset2 = md->offset_vector[offset+1];
727     save_offset3 = md->offset_vector[md->offset_end - number];
728     save_capture_last = md->capture_last;
729    
730     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
731     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
732    
733 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
734 nigel 77 do
735     {
736 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
737     ims, eptrb, flags, RM1);
738 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
739 nigel 77 md->capture_last = save_capture_last;
740     ecode += GET(ecode, 1);
741     }
742     while (*ecode == OP_ALT);
743    
744     DPRINTF(("bracket %d failed\n", number));
745    
746     md->offset_vector[offset] = save_offset1;
747     md->offset_vector[offset+1] = save_offset2;
748     md->offset_vector[md->offset_end - number] = save_offset3;
749    
750     RRETURN(MATCH_NOMATCH);
751     }
752    
753 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
754     as a non-capturing bracket. */
755 nigel 77
756 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
757     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
758    
759 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
760 nigel 77
761 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
762     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
763    
764 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
765     final alternative within the brackets, we would return the result of a
766     recursive call to match() whatever happened. We can reduce stack usage by
767 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
768     is set.*/
769 nigel 77
770 nigel 93 case OP_BRA:
771     case OP_SBRA:
772     DPRINTF(("start non-capturing bracket\n"));
773     flags = (op >= OP_SBRA)? match_cbegroup : 0;
774 nigel 91 for (;;)
775 nigel 77 {
776 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
777 nigel 93 {
778 ph10 197 if (flags == 0) /* Not a possibly empty group */
779     {
780     ecode += _pcre_OP_lengths[*ecode];
781     DPRINTF(("bracket 0 tail recursion\n"));
782     goto TAIL_RECURSE;
783     }
784    
785     /* Possibly empty group; can't use tail recursion. */
786    
787     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
788     eptrb, flags, RM48);
789     RRETURN(rrc);
790 nigel 93 }
791 nigel 91
792     /* For non-final alternatives, continue the loop for a NOMATCH result;
793     otherwise return. */
794    
795 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
796     eptrb, flags, RM2);
797 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
798 nigel 77 ecode += GET(ecode, 1);
799     }
800 nigel 91 /* Control never reaches here. */
801 nigel 77
802     /* Conditional group: compilation checked that there are no more than
803     two branches. If the condition is false, skipping the first branch takes us
804     past the end if there is only one branch, but that's OK because that is
805 nigel 91 exactly what going to the ket would do. As there is only one branch to be
806     obeyed, we can use tail recursion to avoid using another stack frame. */
807 nigel 77
808     case OP_COND:
809 nigel 93 case OP_SCOND:
810 ph10 399 codelink= GET(ecode, 1);
811 ph10 406
812 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
813     inserted between OP_COND and an assertion condition. */
814 ph10 392
815 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
816     {
817     if (pcre_callout != NULL)
818     {
819     pcre_callout_block cb;
820     cb.version = 1; /* Version 1 of the callout block */
821     cb.callout_number = ecode[LINK_SIZE+2];
822     cb.offset_vector = md->offset_vector;
823     cb.subject = (PCRE_SPTR)md->start_subject;
824     cb.subject_length = md->end_subject - md->start_subject;
825     cb.start_match = mstart - md->start_subject;
826     cb.current_position = eptr - md->start_subject;
827     cb.pattern_position = GET(ecode, LINK_SIZE + 3);
828     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
829     cb.capture_top = offset_top/2;
830     cb.capture_last = md->capture_last;
831     cb.callout_data = md->callout_data;
832     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
833     if (rrc < 0) RRETURN(rrc);
834     }
835     ecode += _pcre_OP_lengths[OP_CALLOUT];
836     }
837 ph10 392
838 ph10 399 condcode = ecode[LINK_SIZE+1];
839 ph10 406
840 ph10 381 /* Now see what the actual condition is */
841 ph10 392
842 ph10 399 if (condcode == OP_RREF) /* Recursion test */
843 nigel 77 {
844 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
845     condition = md->recursive != NULL &&
846     (offset == RREF_ANY || offset == md->recursive->group_num);
847     ecode += condition? 3 : GET(ecode, 1);
848     }
849    
850 ph10 399 else if (condcode == OP_CREF) /* Group used test */
851 nigel 93 {
852 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
853 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
854     ecode += condition? 3 : GET(ecode, 1);
855 nigel 77 }
856    
857 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
858 nigel 93 {
859     condition = FALSE;
860     ecode += GET(ecode, 1);
861     }
862    
863 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
864 nigel 93 the flags argument to match_condassert causes it to stop at the end of an
865     assertion. */
866 nigel 77
867     else
868     {
869 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
870     match_condassert, RM3);
871 nigel 77 if (rrc == MATCH_MATCH)
872     {
873 nigel 93 condition = TRUE;
874     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
875 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
876     }
877 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
878 nigel 77 {
879     RRETURN(rrc); /* Need braces because of following else */
880     }
881 nigel 93 else
882     {
883     condition = FALSE;
884 ph10 399 ecode += codelink;
885 nigel 93 }
886     }
887 nigel 91
888 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
889 ph10 197 we can use tail recursion to avoid using another stack frame, except when
890     match_cbegroup is required for an unlimited repeat of a possibly empty
891     group. If the second alternative doesn't exist, we can just plough on. */
892 nigel 91
893 nigel 93 if (condition || *ecode == OP_ALT)
894     {
895 nigel 91 ecode += 1 + LINK_SIZE;
896 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
897     {
898     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
899     RRETURN(rrc);
900     }
901     else /* Group must match something */
902     {
903     flags = 0;
904     goto TAIL_RECURSE;
905     }
906 nigel 77 }
907 ph10 395 else /* Condition false & no alternative */
908 nigel 93 {
909     ecode += 1 + LINK_SIZE;
910     }
911     break;
912 nigel 77
913    
914 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
915     recursion, we should restore the offsets appropriately and continue from
916     after the call. */
917 nigel 77
918 ph10 210 case OP_ACCEPT:
919 nigel 77 case OP_END:
920     if (md->recursive != NULL && md->recursive->group_num == 0)
921     {
922     recursion_info *rec = md->recursive;
923 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
924 nigel 77 md->recursive = rec->prevrec;
925     memmove(md->offset_vector, rec->offset_save,
926     rec->saved_max * sizeof(int));
927 ph10 168 mstart = rec->save_start;
928 nigel 77 ims = original_ims;
929     ecode = rec->after_call;
930     break;
931     }
932    
933     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
934     string - backtracking will then try other alternatives, if any. */
935    
936 ph10 168 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
937     md->end_match_ptr = eptr; /* Record where we ended */
938     md->end_offset_top = offset_top; /* and how many extracts were taken */
939 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
940 nigel 77 RRETURN(MATCH_MATCH);
941    
942     /* Change option settings */
943    
944     case OP_OPT:
945     ims = ecode[1];
946     ecode += 2;
947     DPRINTF(("ims set to %02lx\n", ims));
948     break;
949    
950     /* Assertion brackets. Check the alternative branches in turn - the
951     matching won't pass the KET for an assertion. If any one branch matches,
952     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
953     start of each branch to move the current point backwards, so the code at
954     this level is identical to the lookahead case. */
955    
956     case OP_ASSERT:
957     case OP_ASSERTBACK:
958     do
959     {
960 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
961     RM4);
962 nigel 77 if (rrc == MATCH_MATCH) break;
963 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
964 nigel 77 ecode += GET(ecode, 1);
965     }
966     while (*ecode == OP_ALT);
967     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
968    
969     /* If checking an assertion for a condition, return MATCH_MATCH. */
970    
971     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
972    
973     /* Continue from after the assertion, updating the offsets high water
974     mark, since extracts may have been taken during the assertion. */
975    
976     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
977     ecode += 1 + LINK_SIZE;
978     offset_top = md->end_offset_top;
979     continue;
980    
981     /* Negative assertion: all branches must fail to match */
982    
983     case OP_ASSERT_NOT:
984     case OP_ASSERTBACK_NOT:
985     do
986     {
987 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
988     RM5);
989 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
990 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
991 nigel 77 ecode += GET(ecode,1);
992     }
993     while (*ecode == OP_ALT);
994    
995     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
996    
997     ecode += 1 + LINK_SIZE;
998     continue;
999    
1000     /* Move the subject pointer back. This occurs only at the start of
1001     each branch of a lookbehind assertion. If we are too close to the start to
1002     move back, this match function fails. When working with UTF-8 we move
1003     back a number of characters, not bytes. */
1004    
1005     case OP_REVERSE:
1006     #ifdef SUPPORT_UTF8
1007     if (utf8)
1008     {
1009 nigel 93 i = GET(ecode, 1);
1010     while (i-- > 0)
1011 nigel 77 {
1012     eptr--;
1013     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1014 ph10 207 BACKCHAR(eptr);
1015 nigel 77 }
1016     }
1017     else
1018     #endif
1019    
1020     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1021    
1022     {
1023 nigel 93 eptr -= GET(ecode, 1);
1024 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1025     }
1026    
1027 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1028 nigel 77
1029 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1030 nigel 77 ecode += 1 + LINK_SIZE;
1031     break;
1032    
1033     /* The callout item calls an external function, if one is provided, passing
1034     details of the match so far. This is mainly for debugging, though the
1035     function is able to force a failure. */
1036    
1037     case OP_CALLOUT:
1038     if (pcre_callout != NULL)
1039     {
1040     pcre_callout_block cb;
1041     cb.version = 1; /* Version 1 of the callout block */
1042     cb.callout_number = ecode[1];
1043     cb.offset_vector = md->offset_vector;
1044 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1045 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
1046 ph10 168 cb.start_match = mstart - md->start_subject;
1047 nigel 77 cb.current_position = eptr - md->start_subject;
1048     cb.pattern_position = GET(ecode, 2);
1049     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1050     cb.capture_top = offset_top/2;
1051     cb.capture_last = md->capture_last;
1052     cb.callout_data = md->callout_data;
1053     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1054     if (rrc < 0) RRETURN(rrc);
1055     }
1056     ecode += 2 + 2*LINK_SIZE;
1057     break;
1058    
1059     /* Recursion either matches the current regex, or some subexpression. The
1060     offset data is the offset to the starting bracket from the start of the
1061     whole pattern. (This is so that it works from duplicated subpatterns.)
1062    
1063     If there are any capturing brackets started but not finished, we have to
1064     save their starting points and reinstate them after the recursion. However,
1065     we don't know how many such there are (offset_top records the completed
1066     total) so we just have to save all the potential data. There may be up to
1067     65535 such values, which is too large to put on the stack, but using malloc
1068     for small numbers seems expensive. As a compromise, the stack is used when
1069     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1070     is used. If the malloc fails, the match cannot continue, so the error
1071     PCRE_ERROR_NOMEMORY is returned instead of attempting the recursion with
1072     incompletely saved offsets.
1073    
1074     There are also other values that have to be saved. We use a chained
1075     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1076     for the original version of this logic. */
1077    
1078     case OP_RECURSE:
1079     {
1080     callpat = md->start_code + GET(ecode, 1);
1081 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1082     GET2(callpat, 1 + LINK_SIZE);
1083 nigel 77
1084     /* Add to "recursing stack" */
1085    
1086     new_recursive.prevrec = md->recursive;
1087     md->recursive = &new_recursive;
1088    
1089     /* Find where to continue from afterwards */
1090    
1091     ecode += 1 + LINK_SIZE;
1092     new_recursive.after_call = ecode;
1093    
1094     /* Now save the offset data. */
1095    
1096     new_recursive.saved_max = md->offset_end;
1097     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1098     new_recursive.offset_save = stacksave;
1099     else
1100     {
1101     new_recursive.offset_save =
1102     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1103     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1104     }
1105    
1106     memcpy(new_recursive.offset_save, md->offset_vector,
1107     new_recursive.saved_max * sizeof(int));
1108 ph10 168 new_recursive.save_start = mstart;
1109     mstart = eptr;
1110 nigel 77
1111     /* OK, now we can do the recursion. For each top-level alternative we
1112     restore the offset and recursion data. */
1113    
1114     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1115 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1116 nigel 77 do
1117     {
1118 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1119     md, ims, eptrb, flags, RM6);
1120 nigel 77 if (rrc == MATCH_MATCH)
1121     {
1122 nigel 87 DPRINTF(("Recursion matched\n"));
1123 nigel 77 md->recursive = new_recursive.prevrec;
1124     if (new_recursive.offset_save != stacksave)
1125     (pcre_free)(new_recursive.offset_save);
1126     RRETURN(MATCH_MATCH);
1127     }
1128 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1129 nigel 87 {
1130     DPRINTF(("Recursion gave error %d\n", rrc));
1131 ph10 400 if (new_recursive.offset_save != stacksave)
1132     (pcre_free)(new_recursive.offset_save);
1133 nigel 87 RRETURN(rrc);
1134     }
1135 nigel 77
1136     md->recursive = &new_recursive;
1137     memcpy(md->offset_vector, new_recursive.offset_save,
1138     new_recursive.saved_max * sizeof(int));
1139     callpat += GET(callpat, 1);
1140     }
1141     while (*callpat == OP_ALT);
1142    
1143     DPRINTF(("Recursion didn't match\n"));
1144     md->recursive = new_recursive.prevrec;
1145     if (new_recursive.offset_save != stacksave)
1146     (pcre_free)(new_recursive.offset_save);
1147     RRETURN(MATCH_NOMATCH);
1148     }
1149     /* Control never reaches here */
1150    
1151     /* "Once" brackets are like assertion brackets except that after a match,
1152     the point in the subject string is not moved back. Thus there can never be
1153     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1154     Check the alternative branches in turn - the matching won't pass the KET
1155     for this kind of subpattern. If any one branch matches, we carry on as at
1156     the end of a normal bracket, leaving the subject pointer. */
1157    
1158     case OP_ONCE:
1159 nigel 91 prev = ecode;
1160     saved_eptr = eptr;
1161    
1162     do
1163 nigel 77 {
1164 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1165 nigel 91 if (rrc == MATCH_MATCH) break;
1166 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1167 nigel 91 ecode += GET(ecode,1);
1168     }
1169     while (*ecode == OP_ALT);
1170 nigel 77
1171 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1172 nigel 77
1173 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1174 nigel 77
1175 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1176     mark, since extracts may have been taken. */
1177 nigel 77
1178 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1179 nigel 77
1180 nigel 91 offset_top = md->end_offset_top;
1181     eptr = md->end_match_ptr;
1182 nigel 77
1183 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1184     happens for a repeating ket if no characters were matched in the group.
1185     This is the forcible breaking of infinite loops as implemented in Perl
1186     5.005. If there is an options reset, it will get obeyed in the normal
1187     course of events. */
1188 nigel 77
1189 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1190     {
1191     ecode += 1+LINK_SIZE;
1192     break;
1193     }
1194 nigel 77
1195 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1196     preceding bracket, in the appropriate order. The second "call" of match()
1197     uses tail recursion, to avoid using another stack frame. We need to reset
1198     any options that changed within the bracket before re-running it, so
1199     check the next opcode. */
1200 nigel 77
1201 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1202     {
1203     ims = (ims & ~PCRE_IMS) | ecode[4];
1204     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1205     }
1206 nigel 77
1207 nigel 91 if (*ecode == OP_KETRMIN)
1208     {
1209 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1210 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1211     ecode = prev;
1212 ph10 197 flags = 0;
1213 nigel 91 goto TAIL_RECURSE;
1214 nigel 77 }
1215 nigel 91 else /* OP_KETRMAX */
1216     {
1217 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1218 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1219     ecode += 1 + LINK_SIZE;
1220 ph10 197 flags = 0;
1221 nigel 91 goto TAIL_RECURSE;
1222     }
1223     /* Control never gets here */
1224 nigel 77
1225     /* An alternation is the end of a branch; scan along to find the end of the
1226     bracketed group and go to there. */
1227    
1228     case OP_ALT:
1229     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1230     break;
1231    
1232 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1233     indicating that it may occur zero times. It may repeat infinitely, or not
1234     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1235     with fixed upper repeat limits are compiled as a number of copies, with the
1236     optional ones preceded by BRAZERO or BRAMINZERO. */
1237 nigel 77
1238     case OP_BRAZERO:
1239     {
1240     next = ecode+1;
1241 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1242 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1243     do next += GET(next,1); while (*next == OP_ALT);
1244 nigel 93 ecode = next + 1 + LINK_SIZE;
1245 nigel 77 }
1246     break;
1247    
1248     case OP_BRAMINZERO:
1249     {
1250     next = ecode+1;
1251 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1252 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1253 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1254     ecode++;
1255     }
1256     break;
1257    
1258 ph10 335 case OP_SKIPZERO:
1259     {
1260     next = ecode+1;
1261     do next += GET(next,1); while (*next == OP_ALT);
1262     ecode = next + 1 + LINK_SIZE;
1263     }
1264     break;
1265    
1266 nigel 93 /* End of a group, repeated or non-repeating. */
1267 nigel 77
1268     case OP_KET:
1269     case OP_KETRMIN:
1270     case OP_KETRMAX:
1271 nigel 91 prev = ecode - GET(ecode, 1);
1272 nigel 77
1273 nigel 93 /* If this was a group that remembered the subject start, in order to break
1274     infinite repeats of empty string matches, retrieve the subject start from
1275     the chain. Otherwise, set it NULL. */
1276 nigel 77
1277 nigel 93 if (*prev >= OP_SBRA)
1278     {
1279     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1280     eptrb = eptrb->epb_prev; /* Backup to previous group */
1281     }
1282     else saved_eptr = NULL;
1283 nigel 77
1284 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1285     MATCH_MATCH, but record the current high water mark for use by positive
1286     assertions. Do this also for the "once" (atomic) groups. */
1287    
1288 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1289     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1290     *prev == OP_ONCE)
1291     {
1292     md->end_match_ptr = eptr; /* For ONCE */
1293     md->end_offset_top = offset_top;
1294     RRETURN(MATCH_MATCH);
1295     }
1296 nigel 77
1297 nigel 93 /* For capturing groups we have to check the group number back at the start
1298     and if necessary complete handling an extraction by setting the offsets and
1299     bumping the high water mark. Note that whole-pattern recursion is coded as
1300     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1301     when the OP_END is reached. Other recursion is handled here. */
1302 nigel 77
1303 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1304 nigel 91 {
1305 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1306 nigel 91 offset = number << 1;
1307 nigel 77
1308     #ifdef DEBUG
1309 nigel 91 printf("end bracket %d", number);
1310     printf("\n");
1311 nigel 77 #endif
1312    
1313 nigel 93 md->capture_last = number;
1314     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1315 nigel 91 {
1316 nigel 93 md->offset_vector[offset] =
1317     md->offset_vector[md->offset_end - number];
1318     md->offset_vector[offset+1] = eptr - md->start_subject;
1319     if (offset_top <= offset) offset_top = offset + 2;
1320     }
1321 nigel 77
1322 nigel 93 /* Handle a recursively called group. Restore the offsets
1323     appropriately and continue from after the call. */
1324 nigel 77
1325 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1326     {
1327     recursion_info *rec = md->recursive;
1328     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1329     md->recursive = rec->prevrec;
1330 ph10 168 mstart = rec->save_start;
1331 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1332     rec->saved_max * sizeof(int));
1333     ecode = rec->after_call;
1334     ims = original_ims;
1335     break;
1336 nigel 77 }
1337 nigel 91 }
1338 nigel 77
1339 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1340     flags, in case they got changed during the group. */
1341 nigel 77
1342 nigel 91 ims = original_ims;
1343     DPRINTF(("ims reset to %02lx\n", ims));
1344 nigel 77
1345 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1346     happens for a repeating ket if no characters were matched in the group.
1347     This is the forcible breaking of infinite loops as implemented in Perl
1348     5.005. If there is an options reset, it will get obeyed in the normal
1349     course of events. */
1350 nigel 77
1351 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1352     {
1353     ecode += 1 + LINK_SIZE;
1354     break;
1355     }
1356 nigel 77
1357 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1358     preceding bracket, in the appropriate order. In the second case, we can use
1359 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1360     unlimited repeat of a group that can match an empty string. */
1361 nigel 77
1362 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1363    
1364 nigel 91 if (*ecode == OP_KETRMIN)
1365     {
1366 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1367 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1368 ph10 197 if (flags != 0) /* Could match an empty string */
1369     {
1370     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1371     RRETURN(rrc);
1372     }
1373 nigel 91 ecode = prev;
1374     goto TAIL_RECURSE;
1375 nigel 77 }
1376 nigel 91 else /* OP_KETRMAX */
1377     {
1378 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1379 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1380     ecode += 1 + LINK_SIZE;
1381 ph10 197 flags = 0;
1382 nigel 91 goto TAIL_RECURSE;
1383     }
1384     /* Control never gets here */
1385 nigel 77
1386     /* Start of subject unless notbol, or after internal newline if multiline */
1387    
1388     case OP_CIRC:
1389     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1390     if ((ims & PCRE_MULTILINE) != 0)
1391     {
1392 nigel 91 if (eptr != md->start_subject &&
1393 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1394 nigel 77 RRETURN(MATCH_NOMATCH);
1395     ecode++;
1396     break;
1397     }
1398     /* ... else fall through */
1399    
1400     /* Start of subject assertion */
1401    
1402     case OP_SOD:
1403     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1404     ecode++;
1405     break;
1406    
1407     /* Start of match assertion */
1408    
1409     case OP_SOM:
1410     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1411     ecode++;
1412     break;
1413 ph10 172
1414 ph10 168 /* Reset the start of match point */
1415 ph10 172
1416 ph10 168 case OP_SET_SOM:
1417     mstart = eptr;
1418 ph10 172 ecode++;
1419     break;
1420 nigel 77
1421     /* Assert before internal newline if multiline, or before a terminating
1422     newline unless endonly is set, else end of subject unless noteol is set. */
1423    
1424     case OP_DOLL:
1425     if ((ims & PCRE_MULTILINE) != 0)
1426     {
1427     if (eptr < md->end_subject)
1428 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1429 nigel 77 else
1430     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1431     ecode++;
1432     break;
1433     }
1434     else
1435     {
1436     if (md->noteol) RRETURN(MATCH_NOMATCH);
1437     if (!md->endonly)
1438     {
1439 nigel 91 if (eptr != md->end_subject &&
1440 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1441 nigel 77 RRETURN(MATCH_NOMATCH);
1442     ecode++;
1443     break;
1444     }
1445     }
1446 nigel 91 /* ... else fall through for endonly */
1447 nigel 77
1448     /* End of subject assertion (\z) */
1449    
1450     case OP_EOD:
1451     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1452     ecode++;
1453     break;
1454    
1455     /* End of subject or ending \n assertion (\Z) */
1456    
1457     case OP_EODN:
1458 nigel 91 if (eptr != md->end_subject &&
1459 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1460 nigel 91 RRETURN(MATCH_NOMATCH);
1461 nigel 77 ecode++;
1462     break;
1463    
1464     /* Word boundary assertions */
1465    
1466     case OP_NOT_WORD_BOUNDARY:
1467     case OP_WORD_BOUNDARY:
1468     {
1469    
1470     /* Find out if the previous and current characters are "word" characters.
1471     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1472 ph10 435 be "non-word" characters. Remember the earliest consulted character for
1473     partial matching. */
1474 nigel 77
1475     #ifdef SUPPORT_UTF8
1476     if (utf8)
1477     {
1478     if (eptr == md->start_subject) prev_is_word = FALSE; else
1479     {
1480 ph10 409 USPTR lastptr = eptr - 1;
1481 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1482 ph10 435 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1483 nigel 77 GETCHAR(c, lastptr);
1484     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1485     }
1486 ph10 428 if (eptr >= md->end_subject)
1487 nigel 77 {
1488 ph10 428 SCHECK_PARTIAL();
1489     cur_is_word = FALSE;
1490     }
1491     else
1492     {
1493 nigel 77 GETCHAR(c, eptr);
1494     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1495     }
1496     }
1497     else
1498     #endif
1499    
1500 ph10 428 /* Not in UTF-8 mode */
1501 nigel 77
1502     {
1503 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
1504     {
1505     if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1506     prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1507     }
1508 ph10 428 if (eptr >= md->end_subject)
1509     {
1510     SCHECK_PARTIAL();
1511     cur_is_word = FALSE;
1512     }
1513     else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1514 nigel 77 }
1515    
1516     /* Now see if the situation is what we want */
1517    
1518     if ((*ecode++ == OP_WORD_BOUNDARY)?
1519     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1520     RRETURN(MATCH_NOMATCH);
1521     }
1522     break;
1523    
1524     /* Match a single character type; inline for speed */
1525    
1526     case OP_ANY:
1527 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1528 ph10 345 /* Fall through */
1529    
1530 ph10 341 case OP_ALLANY:
1531 ph10 428 if (eptr++ >= md->end_subject)
1532     {
1533     SCHECK_PARTIAL();
1534     RRETURN(MATCH_NOMATCH);
1535     }
1536 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1537 nigel 77 ecode++;
1538     break;
1539    
1540     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1541     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1542    
1543     case OP_ANYBYTE:
1544 ph10 428 if (eptr++ >= md->end_subject)
1545     {
1546     SCHECK_PARTIAL();
1547     RRETURN(MATCH_NOMATCH);
1548     }
1549 nigel 77 ecode++;
1550     break;
1551    
1552     case OP_NOT_DIGIT:
1553 ph10 428 if (eptr >= md->end_subject)
1554     {
1555     SCHECK_PARTIAL();
1556     RRETURN(MATCH_NOMATCH);
1557     }
1558 nigel 77 GETCHARINCTEST(c, eptr);
1559     if (
1560     #ifdef SUPPORT_UTF8
1561     c < 256 &&
1562     #endif
1563     (md->ctypes[c] & ctype_digit) != 0
1564     )
1565     RRETURN(MATCH_NOMATCH);
1566     ecode++;
1567     break;
1568    
1569     case OP_DIGIT:
1570 ph10 428 if (eptr >= md->end_subject)
1571     {
1572     SCHECK_PARTIAL();
1573     RRETURN(MATCH_NOMATCH);
1574     }
1575 nigel 77 GETCHARINCTEST(c, eptr);
1576     if (
1577     #ifdef SUPPORT_UTF8
1578     c >= 256 ||
1579     #endif
1580     (md->ctypes[c] & ctype_digit) == 0
1581     )
1582     RRETURN(MATCH_NOMATCH);
1583     ecode++;
1584     break;
1585    
1586     case OP_NOT_WHITESPACE:
1587 ph10 428 if (eptr >= md->end_subject)
1588     {
1589     SCHECK_PARTIAL();
1590     RRETURN(MATCH_NOMATCH);
1591     }
1592 nigel 77 GETCHARINCTEST(c, eptr);
1593     if (
1594     #ifdef SUPPORT_UTF8
1595     c < 256 &&
1596     #endif
1597     (md->ctypes[c] & ctype_space) != 0
1598     )
1599     RRETURN(MATCH_NOMATCH);
1600     ecode++;
1601     break;
1602    
1603     case OP_WHITESPACE:
1604 ph10 428 if (eptr >= md->end_subject)
1605     {
1606     SCHECK_PARTIAL();
1607     RRETURN(MATCH_NOMATCH);
1608     }
1609 nigel 77 GETCHARINCTEST(c, eptr);
1610     if (
1611     #ifdef SUPPORT_UTF8
1612     c >= 256 ||
1613     #endif
1614     (md->ctypes[c] & ctype_space) == 0
1615     )
1616     RRETURN(MATCH_NOMATCH);
1617     ecode++;
1618     break;
1619    
1620     case OP_NOT_WORDCHAR:
1621 ph10 428 if (eptr >= md->end_subject)
1622     {
1623     SCHECK_PARTIAL();
1624     RRETURN(MATCH_NOMATCH);
1625     }
1626 nigel 77 GETCHARINCTEST(c, eptr);
1627     if (
1628     #ifdef SUPPORT_UTF8
1629     c < 256 &&
1630     #endif
1631     (md->ctypes[c] & ctype_word) != 0
1632     )
1633     RRETURN(MATCH_NOMATCH);
1634     ecode++;
1635     break;
1636    
1637     case OP_WORDCHAR:
1638 ph10 428 if (eptr >= md->end_subject)
1639     {
1640     SCHECK_PARTIAL();
1641     RRETURN(MATCH_NOMATCH);
1642     }
1643 nigel 77 GETCHARINCTEST(c, eptr);
1644     if (
1645     #ifdef SUPPORT_UTF8
1646     c >= 256 ||
1647     #endif
1648     (md->ctypes[c] & ctype_word) == 0
1649     )
1650     RRETURN(MATCH_NOMATCH);
1651     ecode++;
1652     break;
1653    
1654 nigel 93 case OP_ANYNL:
1655 ph10 428 if (eptr >= md->end_subject)
1656     {
1657     SCHECK_PARTIAL();
1658     RRETURN(MATCH_NOMATCH);
1659     }
1660 nigel 93 GETCHARINCTEST(c, eptr);
1661     switch(c)
1662     {
1663     default: RRETURN(MATCH_NOMATCH);
1664     case 0x000d:
1665     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1666     break;
1667 ph10 231
1668 nigel 93 case 0x000a:
1669 ph10 231 break;
1670    
1671 nigel 93 case 0x000b:
1672     case 0x000c:
1673     case 0x0085:
1674     case 0x2028:
1675     case 0x2029:
1676 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1677 nigel 93 break;
1678     }
1679     ecode++;
1680     break;
1681    
1682 ph10 178 case OP_NOT_HSPACE:
1683 ph10 428 if (eptr >= md->end_subject)
1684     {
1685     SCHECK_PARTIAL();
1686     RRETURN(MATCH_NOMATCH);
1687     }
1688 ph10 178 GETCHARINCTEST(c, eptr);
1689     switch(c)
1690     {
1691     default: break;
1692     case 0x09: /* HT */
1693     case 0x20: /* SPACE */
1694     case 0xa0: /* NBSP */
1695     case 0x1680: /* OGHAM SPACE MARK */
1696     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1697     case 0x2000: /* EN QUAD */
1698     case 0x2001: /* EM QUAD */
1699     case 0x2002: /* EN SPACE */
1700     case 0x2003: /* EM SPACE */
1701     case 0x2004: /* THREE-PER-EM SPACE */
1702     case 0x2005: /* FOUR-PER-EM SPACE */
1703     case 0x2006: /* SIX-PER-EM SPACE */
1704     case 0x2007: /* FIGURE SPACE */
1705     case 0x2008: /* PUNCTUATION SPACE */
1706     case 0x2009: /* THIN SPACE */
1707     case 0x200A: /* HAIR SPACE */
1708     case 0x202f: /* NARROW NO-BREAK SPACE */
1709     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1710     case 0x3000: /* IDEOGRAPHIC SPACE */
1711     RRETURN(MATCH_NOMATCH);
1712     }
1713     ecode++;
1714     break;
1715    
1716     case OP_HSPACE:
1717 ph10 428 if (eptr >= md->end_subject)
1718     {
1719     SCHECK_PARTIAL();
1720     RRETURN(MATCH_NOMATCH);
1721     }
1722 ph10 178 GETCHARINCTEST(c, eptr);
1723     switch(c)
1724     {
1725     default: RRETURN(MATCH_NOMATCH);
1726     case 0x09: /* HT */
1727     case 0x20: /* SPACE */
1728     case 0xa0: /* NBSP */
1729     case 0x1680: /* OGHAM SPACE MARK */
1730     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1731     case 0x2000: /* EN QUAD */
1732     case 0x2001: /* EM QUAD */
1733     case 0x2002: /* EN SPACE */
1734     case 0x2003: /* EM SPACE */
1735     case 0x2004: /* THREE-PER-EM SPACE */
1736     case 0x2005: /* FOUR-PER-EM SPACE */
1737     case 0x2006: /* SIX-PER-EM SPACE */
1738     case 0x2007: /* FIGURE SPACE */
1739     case 0x2008: /* PUNCTUATION SPACE */
1740     case 0x2009: /* THIN SPACE */
1741     case 0x200A: /* HAIR SPACE */
1742     case 0x202f: /* NARROW NO-BREAK SPACE */
1743     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1744     case 0x3000: /* IDEOGRAPHIC SPACE */
1745     break;
1746     }
1747     ecode++;
1748     break;
1749    
1750     case OP_NOT_VSPACE:
1751 ph10 428 if (eptr >= md->end_subject)
1752     {
1753     SCHECK_PARTIAL();
1754     RRETURN(MATCH_NOMATCH);
1755     }
1756 ph10 178 GETCHARINCTEST(c, eptr);
1757     switch(c)
1758     {
1759     default: break;
1760     case 0x0a: /* LF */
1761     case 0x0b: /* VT */
1762     case 0x0c: /* FF */
1763     case 0x0d: /* CR */
1764     case 0x85: /* NEL */
1765     case 0x2028: /* LINE SEPARATOR */
1766     case 0x2029: /* PARAGRAPH SEPARATOR */
1767     RRETURN(MATCH_NOMATCH);
1768     }
1769     ecode++;
1770     break;
1771    
1772     case OP_VSPACE:
1773 ph10 428 if (eptr >= md->end_subject)
1774     {
1775     SCHECK_PARTIAL();
1776     RRETURN(MATCH_NOMATCH);
1777     }
1778 ph10 178 GETCHARINCTEST(c, eptr);
1779     switch(c)
1780     {
1781     default: RRETURN(MATCH_NOMATCH);
1782     case 0x0a: /* LF */
1783     case 0x0b: /* VT */
1784     case 0x0c: /* FF */
1785     case 0x0d: /* CR */
1786     case 0x85: /* NEL */
1787     case 0x2028: /* LINE SEPARATOR */
1788     case 0x2029: /* PARAGRAPH SEPARATOR */
1789     break;
1790     }
1791     ecode++;
1792     break;
1793    
1794 nigel 77 #ifdef SUPPORT_UCP
1795     /* Check the next character by Unicode property. We will get here only
1796     if the support is in the binary; otherwise a compile-time error occurs. */
1797    
1798     case OP_PROP:
1799     case OP_NOTPROP:
1800 ph10 428 if (eptr >= md->end_subject)
1801     {
1802     SCHECK_PARTIAL();
1803     RRETURN(MATCH_NOMATCH);
1804     }
1805 nigel 77 GETCHARINCTEST(c, eptr);
1806     {
1807 ph10 384 const ucd_record *prop = GET_UCD(c);
1808 nigel 77
1809 nigel 87 switch(ecode[1])
1810     {
1811     case PT_ANY:
1812     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1813     break;
1814 nigel 77
1815 nigel 87 case PT_LAMP:
1816 ph10 349 if ((prop->chartype == ucp_Lu ||
1817     prop->chartype == ucp_Ll ||
1818     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1819 nigel 77 RRETURN(MATCH_NOMATCH);
1820 nigel 87 break;
1821    
1822     case PT_GC:
1823 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1824 nigel 77 RRETURN(MATCH_NOMATCH);
1825 nigel 87 break;
1826    
1827     case PT_PC:
1828 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1829 nigel 87 RRETURN(MATCH_NOMATCH);
1830     break;
1831    
1832     case PT_SC:
1833 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
1834 nigel 87 RRETURN(MATCH_NOMATCH);
1835     break;
1836    
1837     default:
1838     RRETURN(PCRE_ERROR_INTERNAL);
1839 nigel 77 }
1840 nigel 87
1841     ecode += 3;
1842 nigel 77 }
1843     break;
1844    
1845     /* Match an extended Unicode sequence. We will get here only if the support
1846     is in the binary; otherwise a compile-time error occurs. */
1847    
1848     case OP_EXTUNI:
1849 ph10 428 if (eptr >= md->end_subject)
1850     {
1851     SCHECK_PARTIAL();
1852     RRETURN(MATCH_NOMATCH);
1853     }
1854 nigel 77 GETCHARINCTEST(c, eptr);
1855     {
1856 ph10 349 int category = UCD_CATEGORY(c);
1857 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1858     while (eptr < md->end_subject)
1859     {
1860     int len = 1;
1861     if (!utf8) c = *eptr; else
1862     {
1863     GETCHARLEN(c, eptr, len);
1864     }
1865 ph10 349 category = UCD_CATEGORY(c);
1866 nigel 77 if (category != ucp_M) break;
1867     eptr += len;
1868     }
1869     }
1870     ecode++;
1871     break;
1872     #endif
1873    
1874    
1875     /* Match a back reference, possibly repeatedly. Look past the end of the
1876     item to see if there is repeat information following. The code is similar
1877     to that for character classes, but repeated for efficiency. Then obey
1878     similar code to character type repeats - written out again for speed.
1879     However, if the referenced string is the empty string, always treat
1880     it as matched, any number of times (otherwise there could be infinite
1881     loops). */
1882    
1883     case OP_REF:
1884     {
1885     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1886 ph10 345 ecode += 3;
1887    
1888 ph10 336 /* If the reference is unset, there are two possibilities:
1889 ph10 345
1890 ph10 336 (a) In the default, Perl-compatible state, set the length to be longer
1891     than the amount of subject left; this ensures that every attempt at a
1892     match fails. We can't just fail here, because of the possibility of
1893     quantifiers with zero minima.
1894 ph10 345
1895     (b) If the JavaScript compatibility flag is set, set the length to zero
1896     so that the back reference matches an empty string.
1897    
1898     Otherwise, set the length to the length of what was matched by the
1899 ph10 336 referenced subpattern. */
1900 ph10 345
1901 ph10 336 if (offset >= offset_top || md->offset_vector[offset] < 0)
1902 ph10 345 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1903 ph10 336 else
1904     length = md->offset_vector[offset+1] - md->offset_vector[offset];
1905 nigel 77
1906     /* Set up for repetition, or handle the non-repeated case */
1907    
1908     switch (*ecode)
1909     {
1910     case OP_CRSTAR:
1911     case OP_CRMINSTAR:
1912     case OP_CRPLUS:
1913     case OP_CRMINPLUS:
1914     case OP_CRQUERY:
1915     case OP_CRMINQUERY:
1916     c = *ecode++ - OP_CRSTAR;
1917     minimize = (c & 1) != 0;
1918     min = rep_min[c]; /* Pick up values from tables; */
1919     max = rep_max[c]; /* zero for max => infinity */
1920     if (max == 0) max = INT_MAX;
1921     break;
1922    
1923     case OP_CRRANGE:
1924     case OP_CRMINRANGE:
1925     minimize = (*ecode == OP_CRMINRANGE);
1926     min = GET2(ecode, 1);
1927     max = GET2(ecode, 3);
1928     if (max == 0) max = INT_MAX;
1929     ecode += 5;
1930     break;
1931    
1932     default: /* No repeat follows */
1933 ph10 428 if (!match_ref(offset, eptr, length, md, ims))
1934     {
1935     CHECK_PARTIAL();
1936     RRETURN(MATCH_NOMATCH);
1937     }
1938 nigel 77 eptr += length;
1939     continue; /* With the main loop */
1940     }
1941    
1942     /* If the length of the reference is zero, just continue with the
1943     main loop. */
1944 ph10 428
1945 nigel 77 if (length == 0) continue;
1946    
1947     /* First, ensure the minimum number of matches are present. We get back
1948     the length of the reference string explicitly rather than passing the
1949     address of eptr, so that eptr can be a register variable. */
1950    
1951     for (i = 1; i <= min; i++)
1952     {
1953 ph10 427 if (!match_ref(offset, eptr, length, md, ims))
1954 ph10 426 {
1955 ph10 427 CHECK_PARTIAL();
1956 ph10 426 RRETURN(MATCH_NOMATCH);
1957 ph10 427 }
1958 nigel 77 eptr += length;
1959     }
1960    
1961     /* If min = max, continue at the same level without recursion.
1962     They are not both allowed to be zero. */
1963    
1964     if (min == max) continue;
1965    
1966     /* If minimizing, keep trying and advancing the pointer */
1967    
1968     if (minimize)
1969     {
1970     for (fi = min;; fi++)
1971     {
1972 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1973 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1974 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
1975     if (!match_ref(offset, eptr, length, md, ims))
1976 ph10 426 {
1977 ph10 427 CHECK_PARTIAL();
1978 nigel 77 RRETURN(MATCH_NOMATCH);
1979 ph10 427 }
1980 nigel 77 eptr += length;
1981     }
1982     /* Control never gets here */
1983     }
1984    
1985     /* If maximizing, find the longest string and work backwards */
1986    
1987     else
1988     {
1989     pp = eptr;
1990     for (i = min; i < max; i++)
1991     {
1992     if (!match_ref(offset, eptr, length, md, ims)) break;
1993     eptr += length;
1994     }
1995     while (eptr >= pp)
1996     {
1997 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1998 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1999     eptr -= length;
2000     }
2001     RRETURN(MATCH_NOMATCH);
2002     }
2003     }
2004     /* Control never gets here */
2005    
2006     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2007     used when all the characters in the class have values in the range 0-255,
2008     and either the matching is caseful, or the characters are in the range
2009     0-127 when UTF-8 processing is enabled. The only difference between
2010     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2011     encountered.
2012    
2013     First, look past the end of the item to see if there is repeat information
2014     following. Then obey similar code to character type repeats - written out
2015     again for speed. */
2016    
2017     case OP_NCLASS:
2018     case OP_CLASS:
2019     {
2020     data = ecode + 1; /* Save for matching */
2021     ecode += 33; /* Advance past the item */
2022    
2023     switch (*ecode)
2024     {
2025     case OP_CRSTAR:
2026     case OP_CRMINSTAR:
2027     case OP_CRPLUS:
2028     case OP_CRMINPLUS:
2029     case OP_CRQUERY:
2030     case OP_CRMINQUERY:
2031     c = *ecode++ - OP_CRSTAR;
2032     minimize = (c & 1) != 0;
2033     min = rep_min[c]; /* Pick up values from tables; */
2034     max = rep_max[c]; /* zero for max => infinity */
2035     if (max == 0) max = INT_MAX;
2036     break;
2037    
2038     case OP_CRRANGE:
2039     case OP_CRMINRANGE:
2040     minimize = (*ecode == OP_CRMINRANGE);
2041     min = GET2(ecode, 1);
2042     max = GET2(ecode, 3);
2043     if (max == 0) max = INT_MAX;
2044     ecode += 5;
2045     break;
2046    
2047     default: /* No repeat follows */
2048     min = max = 1;
2049     break;
2050     }
2051    
2052     /* First, ensure the minimum number of matches are present. */
2053    
2054     #ifdef SUPPORT_UTF8
2055     /* UTF-8 mode */
2056     if (utf8)
2057     {
2058     for (i = 1; i <= min; i++)
2059     {
2060 ph10 427 if (eptr >= md->end_subject)
2061 ph10 426 {
2062 ph10 428 SCHECK_PARTIAL();
2063 ph10 426 RRETURN(MATCH_NOMATCH);
2064 ph10 427 }
2065 nigel 77 GETCHARINC(c, eptr);
2066     if (c > 255)
2067     {
2068     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2069     }
2070     else
2071     {
2072     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2073     }
2074     }
2075     }
2076     else
2077     #endif
2078     /* Not UTF-8 mode */
2079     {
2080     for (i = 1; i <= min; i++)
2081     {
2082 ph10 427 if (eptr >= md->end_subject)
2083 ph10 426 {
2084 ph10 428 SCHECK_PARTIAL();
2085 ph10 426 RRETURN(MATCH_NOMATCH);
2086 ph10 427 }
2087 nigel 77 c = *eptr++;
2088     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2089     }
2090     }
2091    
2092     /* If max == min we can continue with the main loop without the
2093     need to recurse. */
2094    
2095     if (min == max) continue;
2096    
2097     /* If minimizing, keep testing the rest of the expression and advancing
2098     the pointer while it matches the class. */
2099    
2100     if (minimize)
2101     {
2102     #ifdef SUPPORT_UTF8
2103     /* UTF-8 mode */
2104     if (utf8)
2105     {
2106     for (fi = min;; fi++)
2107     {
2108 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2109 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2110 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2111 ph10 427 if (eptr >= md->end_subject)
2112 ph10 426 {
2113 ph10 427 SCHECK_PARTIAL();
2114 ph10 426 RRETURN(MATCH_NOMATCH);
2115 ph10 427 }
2116 nigel 77 GETCHARINC(c, eptr);
2117     if (c > 255)
2118     {
2119     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2120     }
2121     else
2122     {
2123     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2124     }
2125     }
2126     }
2127     else
2128     #endif
2129     /* Not UTF-8 mode */
2130     {
2131     for (fi = min;; fi++)
2132     {
2133 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2134 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2135 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2136 ph10 427 if (eptr >= md->end_subject)
2137 ph10 426 {
2138 ph10 427 SCHECK_PARTIAL();
2139 ph10 426 RRETURN(MATCH_NOMATCH);
2140 ph10 427 }
2141 nigel 77 c = *eptr++;
2142     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2143     }
2144     }
2145     /* Control never gets here */
2146     }
2147    
2148     /* If maximizing, find the longest possible run, then work backwards. */
2149    
2150     else
2151     {
2152     pp = eptr;
2153    
2154     #ifdef SUPPORT_UTF8
2155     /* UTF-8 mode */
2156     if (utf8)
2157     {
2158     for (i = min; i < max; i++)
2159     {
2160     int len = 1;
2161     if (eptr >= md->end_subject) break;
2162     GETCHARLEN(c, eptr, len);
2163     if (c > 255)
2164     {
2165     if (op == OP_CLASS) break;
2166     }
2167     else
2168     {
2169     if ((data[c/8] & (1 << (c&7))) == 0) break;
2170     }
2171     eptr += len;
2172     }
2173     for (;;)
2174     {
2175 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2176 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2177     if (eptr-- == pp) break; /* Stop if tried at original pos */
2178     BACKCHAR(eptr);
2179     }
2180     }
2181     else
2182     #endif
2183     /* Not UTF-8 mode */
2184     {
2185     for (i = min; i < max; i++)
2186     {
2187     if (eptr >= md->end_subject) break;
2188     c = *eptr;
2189     if ((data[c/8] & (1 << (c&7))) == 0) break;
2190     eptr++;
2191     }
2192     while (eptr >= pp)
2193     {
2194 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2195 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2196 nigel 77 eptr--;
2197     }
2198     }
2199    
2200     RRETURN(MATCH_NOMATCH);
2201     }
2202     }
2203     /* Control never gets here */
2204    
2205    
2206     /* Match an extended character class. This opcode is encountered only
2207 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2208     mode, because Unicode properties are supported in non-UTF-8 mode. */
2209 nigel 77
2210     #ifdef SUPPORT_UTF8
2211     case OP_XCLASS:
2212     {
2213     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2214     ecode += GET(ecode, 1); /* Advance past the item */
2215    
2216     switch (*ecode)
2217     {
2218     case OP_CRSTAR:
2219     case OP_CRMINSTAR:
2220     case OP_CRPLUS:
2221     case OP_CRMINPLUS:
2222     case OP_CRQUERY:
2223     case OP_CRMINQUERY:
2224     c = *ecode++ - OP_CRSTAR;
2225     minimize = (c & 1) != 0;
2226     min = rep_min[c]; /* Pick up values from tables; */
2227     max = rep_max[c]; /* zero for max => infinity */
2228     if (max == 0) max = INT_MAX;
2229     break;
2230    
2231     case OP_CRRANGE:
2232     case OP_CRMINRANGE:
2233     minimize = (*ecode == OP_CRMINRANGE);
2234     min = GET2(ecode, 1);
2235     max = GET2(ecode, 3);
2236     if (max == 0) max = INT_MAX;
2237     ecode += 5;
2238     break;
2239    
2240     default: /* No repeat follows */
2241     min = max = 1;
2242     break;
2243     }
2244    
2245     /* First, ensure the minimum number of matches are present. */
2246    
2247     for (i = 1; i <= min; i++)
2248     {
2249 ph10 427 if (eptr >= md->end_subject)
2250 ph10 426 {
2251     SCHECK_PARTIAL();
2252     RRETURN(MATCH_NOMATCH);
2253 ph10 427 }
2254 ph10 384 GETCHARINCTEST(c, eptr);
2255 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2256     }
2257    
2258     /* If max == min we can continue with the main loop without the
2259     need to recurse. */
2260    
2261     if (min == max) continue;
2262    
2263     /* If minimizing, keep testing the rest of the expression and advancing
2264     the pointer while it matches the class. */
2265    
2266     if (minimize)
2267     {
2268     for (fi = min;; fi++)
2269     {
2270 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2271 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2272 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2273 ph10 427 if (eptr >= md->end_subject)
2274 ph10 426 {
2275 ph10 427 SCHECK_PARTIAL();
2276 ph10 426 RRETURN(MATCH_NOMATCH);
2277 ph10 427 }
2278 ph10 384 GETCHARINCTEST(c, eptr);
2279 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2280     }
2281     /* Control never gets here */
2282     }
2283    
2284     /* If maximizing, find the longest possible run, then work backwards. */
2285    
2286     else
2287     {
2288     pp = eptr;
2289     for (i = min; i < max; i++)
2290     {
2291     int len = 1;
2292     if (eptr >= md->end_subject) break;
2293 ph10 384 GETCHARLENTEST(c, eptr, len);
2294 nigel 77 if (!_pcre_xclass(c, data)) break;
2295     eptr += len;
2296     }
2297     for(;;)
2298     {
2299 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2300 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2301     if (eptr-- == pp) break; /* Stop if tried at original pos */
2302 ph10 214 if (utf8) BACKCHAR(eptr);
2303 nigel 77 }
2304     RRETURN(MATCH_NOMATCH);
2305     }
2306    
2307     /* Control never gets here */
2308     }
2309     #endif /* End of XCLASS */
2310    
2311     /* Match a single character, casefully */
2312    
2313     case OP_CHAR:
2314     #ifdef SUPPORT_UTF8
2315     if (utf8)
2316     {
2317     length = 1;
2318     ecode++;
2319     GETCHARLEN(fc, ecode, length);
2320 ph10 428 if (length > md->end_subject - eptr)
2321     {
2322     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2323     RRETURN(MATCH_NOMATCH);
2324     }
2325 nigel 77 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2326     }
2327     else
2328     #endif
2329    
2330     /* Non-UTF-8 mode */
2331     {
2332 ph10 428 if (md->end_subject - eptr < 1)
2333     {
2334     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2335     RRETURN(MATCH_NOMATCH);
2336     }
2337 nigel 77 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2338     ecode += 2;
2339     }
2340     break;
2341    
2342     /* Match a single character, caselessly */
2343    
2344     case OP_CHARNC:
2345     #ifdef SUPPORT_UTF8
2346     if (utf8)
2347     {
2348     length = 1;
2349     ecode++;
2350     GETCHARLEN(fc, ecode, length);
2351    
2352 ph10 428 if (length > md->end_subject - eptr)
2353     {
2354     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2355     RRETURN(MATCH_NOMATCH);
2356     }
2357 nigel 77
2358     /* If the pattern character's value is < 128, we have only one byte, and
2359     can use the fast lookup table. */
2360    
2361     if (fc < 128)
2362     {
2363     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2364     }
2365    
2366     /* Otherwise we must pick up the subject character */
2367    
2368     else
2369     {
2370 nigel 93 unsigned int dc;
2371 nigel 77 GETCHARINC(dc, eptr);
2372     ecode += length;
2373    
2374     /* If we have Unicode property support, we can use it to test the other
2375 nigel 87 case of the character, if there is one. */
2376 nigel 77
2377     if (fc != dc)
2378     {
2379     #ifdef SUPPORT_UCP
2380 ph10 349 if (dc != UCD_OTHERCASE(fc))
2381 nigel 77 #endif
2382     RRETURN(MATCH_NOMATCH);
2383     }
2384     }
2385     }
2386     else
2387     #endif /* SUPPORT_UTF8 */
2388    
2389     /* Non-UTF-8 mode */
2390     {
2391 ph10 428 if (md->end_subject - eptr < 1)
2392     {
2393     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2394     RRETURN(MATCH_NOMATCH);
2395     }
2396 nigel 77 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2397     ecode += 2;
2398     }
2399     break;
2400    
2401 nigel 93 /* Match a single character repeatedly. */
2402 nigel 77
2403     case OP_EXACT:
2404     min = max = GET2(ecode, 1);
2405     ecode += 3;
2406     goto REPEATCHAR;
2407    
2408 nigel 93 case OP_POSUPTO:
2409     possessive = TRUE;
2410     /* Fall through */
2411    
2412 nigel 77 case OP_UPTO:
2413     case OP_MINUPTO:
2414     min = 0;
2415     max = GET2(ecode, 1);
2416     minimize = *ecode == OP_MINUPTO;
2417     ecode += 3;
2418     goto REPEATCHAR;
2419    
2420 nigel 93 case OP_POSSTAR:
2421     possessive = TRUE;
2422     min = 0;
2423     max = INT_MAX;
2424     ecode++;
2425     goto REPEATCHAR;
2426    
2427     case OP_POSPLUS:
2428     possessive = TRUE;
2429     min = 1;
2430     max = INT_MAX;
2431     ecode++;
2432     goto REPEATCHAR;
2433    
2434     case OP_POSQUERY:
2435     possessive = TRUE;
2436     min = 0;
2437     max = 1;
2438     ecode++;
2439     goto REPEATCHAR;
2440    
2441 nigel 77 case OP_STAR:
2442     case OP_MINSTAR:
2443     case OP_PLUS:
2444     case OP_MINPLUS:
2445     case OP_QUERY:
2446     case OP_MINQUERY:
2447     c = *ecode++ - OP_STAR;
2448     minimize = (c & 1) != 0;
2449 ph10 428
2450 nigel 77 min = rep_min[c]; /* Pick up values from tables; */
2451     max = rep_max[c]; /* zero for max => infinity */
2452     if (max == 0) max = INT_MAX;
2453    
2454 ph10 426 /* Common code for all repeated single-character matches. */
2455 nigel 77
2456     REPEATCHAR:
2457     #ifdef SUPPORT_UTF8
2458     if (utf8)
2459     {
2460     length = 1;
2461     charptr = ecode;
2462     GETCHARLEN(fc, ecode, length);
2463     ecode += length;
2464    
2465     /* Handle multibyte character matching specially here. There is
2466     support for caseless matching if UCP support is present. */
2467    
2468     if (length > 1)
2469     {
2470     #ifdef SUPPORT_UCP
2471 nigel 93 unsigned int othercase;
2472 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2473 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
2474 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2475 ph10 115 else oclength = 0;
2476 nigel 77 #endif /* SUPPORT_UCP */
2477    
2478     for (i = 1; i <= min; i++)
2479     {
2480 ph10 426 if (eptr <= md->end_subject - length &&
2481     memcmp(eptr, charptr, length) == 0) eptr += length;
2482 ph10 123 #ifdef SUPPORT_UCP
2483 ph10 426 else if (oclength > 0 &&
2484     eptr <= md->end_subject - oclength &&
2485     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2486     #endif /* SUPPORT_UCP */
2487 nigel 77 else
2488     {
2489 ph10 426 CHECK_PARTIAL();
2490     RRETURN(MATCH_NOMATCH);
2491 nigel 77 }
2492     }
2493    
2494     if (min == max) continue;
2495    
2496     if (minimize)
2497     {
2498     for (fi = min;; fi++)
2499     {
2500 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2501 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2502 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2503 ph10 426 if (eptr <= md->end_subject - length &&
2504     memcmp(eptr, charptr, length) == 0) eptr += length;
2505 ph10 123 #ifdef SUPPORT_UCP
2506 ph10 426 else if (oclength > 0 &&
2507     eptr <= md->end_subject - oclength &&
2508     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2509     #endif /* SUPPORT_UCP */
2510 nigel 77 else
2511     {
2512 ph10 426 CHECK_PARTIAL();
2513     RRETURN(MATCH_NOMATCH);
2514 nigel 77 }
2515     }
2516     /* Control never gets here */
2517     }
2518 nigel 93
2519     else /* Maximize */
2520 nigel 77 {
2521     pp = eptr;
2522     for (i = min; i < max; i++)
2523     {
2524 ph10 426 if (eptr <= md->end_subject - length &&
2525     memcmp(eptr, charptr, length) == 0) eptr += length;
2526 ph10 123 #ifdef SUPPORT_UCP
2527 ph10 426 else if (oclength > 0 &&
2528     eptr <= md->end_subject - oclength &&
2529     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2530     #endif /* SUPPORT_UCP */
2531 ph10 115 else break;
2532 nigel 77 }
2533 nigel 93
2534     if (possessive) continue;
2535 ph10 427
2536 ph10 120 for(;;)
2537 ph10 426 {
2538     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2539     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2540     if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
2541 ph10 115 #ifdef SUPPORT_UCP
2542 ph10 426 eptr--;
2543     BACKCHAR(eptr);
2544 ph10 123 #else /* without SUPPORT_UCP */
2545 ph10 426 eptr -= length;
2546 ph10 123 #endif /* SUPPORT_UCP */
2547 ph10 426 }
2548 nigel 77 }
2549     /* Control never gets here */
2550     }
2551    
2552     /* If the length of a UTF-8 character is 1, we fall through here, and
2553     obey the code as for non-UTF-8 characters below, though in this case the
2554     value of fc will always be < 128. */
2555     }
2556     else
2557     #endif /* SUPPORT_UTF8 */
2558    
2559     /* When not in UTF-8 mode, load a single-byte character. */
2560    
2561 ph10 426 fc = *ecode++;
2562 ph10 428
2563 nigel 77 /* The value of fc at this point is always less than 256, though we may or
2564     may not be in UTF-8 mode. The code is duplicated for the caseless and
2565     caseful cases, for speed, since matching characters is likely to be quite
2566     common. First, ensure the minimum number of matches are present. If min =
2567     max, continue at the same level without recursing. Otherwise, if
2568     minimizing, keep trying the rest of the expression and advancing one
2569     matching character if failing, up to the maximum. Alternatively, if
2570     maximizing, find the maximum number of characters and work backwards. */
2571    
2572     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2573     max, eptr));
2574    
2575     if ((ims & PCRE_CASELESS) != 0)
2576     {
2577     fc = md->lcc[fc];
2578     for (i = 1; i <= min; i++)
2579 ph10 426 {
2580     if (eptr >= md->end_subject)
2581     {
2582     SCHECK_PARTIAL();
2583     RRETURN(MATCH_NOMATCH);
2584     }
2585 nigel 77 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2586 ph10 426 }
2587 nigel 77 if (min == max) continue;
2588     if (minimize)
2589     {
2590     for (fi = min;; fi++)
2591     {
2592 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2593 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2594 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2595 ph10 426 if (eptr >= md->end_subject)
2596     {
2597 ph10 427 SCHECK_PARTIAL();
2598 ph10 426 RRETURN(MATCH_NOMATCH);
2599     }
2600     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2601 nigel 77 }
2602     /* Control never gets here */
2603     }
2604 nigel 93 else /* Maximize */
2605 nigel 77 {
2606     pp = eptr;
2607     for (i = min; i < max; i++)
2608     {
2609     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2610     eptr++;
2611     }
2612 ph10 427
2613 nigel 93 if (possessive) continue;
2614 ph10 427
2615 nigel 77 while (eptr >= pp)
2616     {
2617 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2618 nigel 77 eptr--;
2619     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2620     }
2621     RRETURN(MATCH_NOMATCH);
2622     }
2623     /* Control never gets here */
2624     }
2625    
2626     /* Caseful comparisons (includes all multi-byte characters) */
2627    
2628     else
2629     {
2630 ph10 427 for (i = 1; i <= min; i++)
2631 ph10 426 {
2632     if (eptr >= md->end_subject)
2633     {
2634     SCHECK_PARTIAL();
2635     RRETURN(MATCH_NOMATCH);
2636     }
2637     if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2638 ph10 427 }
2639 ph10 428
2640 nigel 77 if (min == max) continue;
2641 ph10 428
2642 nigel 77 if (minimize)
2643     {
2644     for (fi = min;; fi++)
2645     {
2646 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2647 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2648 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2649 ph10 426 if (eptr >= md->end_subject)
2650 ph10 427 {
2651 ph10 426 SCHECK_PARTIAL();
2652     RRETURN(MATCH_NOMATCH);
2653 ph10 427 }
2654 ph10 426 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2655 nigel 77 }
2656     /* Control never gets here */
2657     }
2658 nigel 93 else /* Maximize */
2659 nigel 77 {
2660     pp = eptr;
2661     for (i = min; i < max; i++)
2662     {
2663     if (eptr >= md->end_subject || fc != *eptr) break;
2664     eptr++;
2665     }
2666 nigel 93 if (possessive) continue;
2667 ph10 428
2668 nigel 77 while (eptr >= pp)
2669     {
2670 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2671 nigel 77 eptr--;
2672     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2673     }
2674     RRETURN(MATCH_NOMATCH);
2675     }
2676     }
2677     /* Control never gets here */
2678    
2679     /* Match a negated single one-byte character. The character we are
2680     checking can be multibyte. */
2681    
2682     case OP_NOT:
2683 ph10 428 if (eptr >= md->end_subject)
2684     {
2685     SCHECK_PARTIAL();
2686     RRETURN(MATCH_NOMATCH);
2687     }
2688 nigel 77 ecode++;
2689     GETCHARINCTEST(c, eptr);
2690     if ((ims & PCRE_CASELESS) != 0)
2691     {
2692     #ifdef SUPPORT_UTF8
2693     if (c < 256)
2694     #endif
2695     c = md->lcc[c];
2696     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2697     }
2698     else
2699     {
2700     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2701     }
2702     break;
2703    
2704     /* Match a negated single one-byte character repeatedly. This is almost a
2705     repeat of the code for a repeated single character, but I haven't found a
2706     nice way of commoning these up that doesn't require a test of the
2707     positive/negative option for each character match. Maybe that wouldn't add
2708     very much to the time taken, but character matching *is* what this is all
2709     about... */
2710    
2711     case OP_NOTEXACT:
2712     min = max = GET2(ecode, 1);
2713     ecode += 3;
2714     goto REPEATNOTCHAR;
2715    
2716     case OP_NOTUPTO:
2717     case OP_NOTMINUPTO:
2718     min = 0;
2719     max = GET2(ecode, 1);
2720     minimize = *ecode == OP_NOTMINUPTO;
2721     ecode += 3;
2722     goto REPEATNOTCHAR;
2723    
2724 nigel 93 case OP_NOTPOSSTAR:
2725     possessive = TRUE;
2726     min = 0;
2727     max = INT_MAX;
2728     ecode++;
2729     goto REPEATNOTCHAR;
2730    
2731     case OP_NOTPOSPLUS:
2732     possessive = TRUE;
2733     min = 1;
2734     max = INT_MAX;
2735     ecode++;
2736     goto REPEATNOTCHAR;
2737    
2738     case OP_NOTPOSQUERY:
2739     possessive = TRUE;
2740     min = 0;
2741     max = 1;
2742     ecode++;
2743     goto REPEATNOTCHAR;
2744    
2745     case OP_NOTPOSUPTO:
2746     possessive = TRUE;
2747     min = 0;
2748     max = GET2(ecode, 1);
2749     ecode += 3;
2750     goto REPEATNOTCHAR;
2751    
2752 nigel 77 case OP_NOTSTAR:
2753     case OP_NOTMINSTAR:
2754     case OP_NOTPLUS:
2755     case OP_NOTMINPLUS:
2756     case OP_NOTQUERY:
2757     case OP_NOTMINQUERY:
2758     c = *ecode++ - OP_NOTSTAR;
2759     minimize = (c & 1) != 0;
2760     min = rep_min[c]; /* Pick up values from tables; */
2761     max = rep_max[c]; /* zero for max => infinity */
2762     if (max == 0) max = INT_MAX;
2763    
2764 ph10 426 /* Common code for all repeated negated single-byte matches. */
2765 nigel 77
2766     REPEATNOTCHAR:
2767     fc = *ecode++;
2768    
2769     /* The code is duplicated for the caseless and caseful cases, for speed,
2770     since matching characters is likely to be quite common. First, ensure the
2771     minimum number of matches are present. If min = max, continue at the same
2772     level without recursing. Otherwise, if minimizing, keep trying the rest of
2773     the expression and advancing one matching character if failing, up to the
2774     maximum. Alternatively, if maximizing, find the maximum number of
2775     characters and work backwards. */
2776    
2777     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2778     max, eptr));
2779    
2780     if ((ims & PCRE_CASELESS) != 0)
2781     {
2782     fc = md->lcc[fc];
2783    
2784     #ifdef SUPPORT_UTF8
2785     /* UTF-8 mode */
2786     if (utf8)
2787     {
2788 nigel 93 register unsigned int d;
2789 nigel 77 for (i = 1; i <= min; i++)
2790     {
2791 ph10 426 if (eptr >= md->end_subject)
2792     {
2793     SCHECK_PARTIAL();
2794 ph10 427 RRETURN(MATCH_NOMATCH);
2795     }
2796 nigel 77 GETCHARINC(d, eptr);
2797     if (d < 256) d = md->lcc[d];
2798     if (fc == d) RRETURN(MATCH_NOMATCH);
2799     }
2800     }
2801     else
2802     #endif
2803    
2804     /* Not UTF-8 mode */
2805     {
2806     for (i = 1; i <= min; i++)
2807 ph10 426 {
2808     if (eptr >= md->end_subject)
2809     {
2810     SCHECK_PARTIAL();
2811 ph10 427 RRETURN(MATCH_NOMATCH);
2812     }
2813 nigel 77 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2814 ph10 427 }
2815 nigel 77 }
2816    
2817     if (min == max) continue;
2818    
2819     if (minimize)
2820     {
2821     #ifdef SUPPORT_UTF8
2822     /* UTF-8 mode */
2823     if (utf8)
2824     {
2825 nigel 93 register unsigned int d;
2826 nigel 77 for (fi = min;; fi++)
2827     {
2828 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2829 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2830 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2831 ph10 427 if (eptr >= md->end_subject)
2832 ph10 426 {
2833 ph10 427 SCHECK_PARTIAL();
2834 ph10 426 RRETURN(MATCH_NOMATCH);
2835 ph10 427 }
2836 nigel 77 GETCHARINC(d, eptr);
2837     if (d < 256) d = md->lcc[d];
2838 ph10 366 if (fc == d) RRETURN(MATCH_NOMATCH);
2839 nigel 77 }
2840     }
2841     else
2842     #endif
2843     /* Not UTF-8 mode */
2844     {
2845     for (fi = min;; fi++)
2846     {
2847 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2848 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2849 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2850 ph10 426 if (eptr >= md->end_subject)
2851     {
2852     SCHECK_PARTIAL();
2853     RRETURN(MATCH_NOMATCH);
2854     }
2855     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2856 nigel 77 }
2857     }
2858     /* Control never gets here */
2859     }
2860    
2861     /* Maximize case */
2862    
2863     else
2864     {
2865     pp = eptr;
2866    
2867     #ifdef SUPPORT_UTF8
2868     /* UTF-8 mode */
2869     if (utf8)
2870     {
2871 nigel 93 register unsigned int d;
2872 nigel 77 for (i = min; i < max; i++)
2873     {
2874     int len = 1;
2875     if (eptr >= md->end_subject) break;
2876     GETCHARLEN(d, eptr, len);
2877     if (d < 256) d = md->lcc[d];
2878     if (fc == d) break;
2879     eptr += len;
2880     }
2881 nigel 93 if (possessive) continue;
2882     for(;;)
2883 nigel 77 {
2884 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2885 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2886     if (eptr-- == pp) break; /* Stop if tried at original pos */
2887     BACKCHAR(eptr);
2888     }
2889     }
2890     else
2891     #endif
2892     /* Not UTF-8 mode */
2893     {
2894     for (i = min; i < max; i++)
2895     {
2896     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2897     eptr++;
2898     }
2899 nigel 93 if (possessive) continue;
2900 nigel 77 while (eptr >= pp)
2901     {
2902 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2903 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2904     eptr--;
2905     }
2906     }
2907    
2908     RRETURN(MATCH_NOMATCH);
2909     }
2910     /* Control never gets here */
2911     }
2912    
2913     /* Caseful comparisons */
2914    
2915     else
2916     {
2917     #ifdef SUPPORT_UTF8
2918     /* UTF-8 mode */
2919     if (utf8)
2920     {
2921 nigel 93 register unsigned int d;
2922 nigel 77 for (i = 1; i <= min; i++)
2923     {
2924 ph10 426 if (eptr >= md->end_subject)
2925     {
2926     SCHECK_PARTIAL();
2927 ph10 427 RRETURN(MATCH_NOMATCH);
2928     }
2929 nigel 77 GETCHARINC(d, eptr);
2930     if (fc == d) RRETURN(MATCH_NOMATCH);
2931     }
2932     }
2933     else
2934     #endif
2935     /* Not UTF-8 mode */
2936     {
2937     for (i = 1; i <= min; i++)
2938 ph10 426 {
2939     if (eptr >= md->end_subject)
2940     {
2941     SCHECK_PARTIAL();
2942 ph10 427 RRETURN(MATCH_NOMATCH);
2943     }
2944 nigel 77 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2945 ph10 427 }
2946 nigel 77 }
2947    
2948     if (min == max) continue;
2949    
2950     if (minimize)
2951     {
2952     #ifdef SUPPORT_UTF8
2953     /* UTF-8 mode */
2954     if (utf8)
2955     {
2956 nigel 93 register unsigned int d;
2957 nigel 77 for (fi = min;; fi++)
2958     {
2959 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2960 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2961 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2962 ph10 427 if (eptr >= md->end_subject)
2963 ph10 426 {
2964 ph10 427 SCHECK_PARTIAL();
2965 ph10 426 RRETURN(MATCH_NOMATCH);
2966 ph10 427 }
2967 nigel 77 GETCHARINC(d, eptr);
2968 ph10 371 if (fc == d) RRETURN(MATCH_NOMATCH);
2969 nigel 77 }
2970     }
2971     else
2972     #endif
2973     /* Not UTF-8 mode */
2974     {
2975     for (fi = min;; fi++)
2976     {
2977 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2978 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2979 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2980 ph10 426 if (eptr >= md->end_subject)
2981     {
2982     SCHECK_PARTIAL();
2983     RRETURN(MATCH_NOMATCH);
2984 ph10 427 }
2985 ph10 426 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2986 nigel 77 }
2987     }
2988     /* Control never gets here */
2989     }
2990    
2991     /* Maximize case */
2992    
2993     else
2994     {
2995     pp = eptr;
2996    
2997     #ifdef SUPPORT_UTF8
2998     /* UTF-8 mode */
2999     if (utf8)
3000     {
3001 nigel 93 register unsigned int d;
3002 nigel 77 for (i = min; i < max; i++)
3003     {
3004     int len = 1;
3005     if (eptr >= md->end_subject) break;
3006     GETCHARLEN(d, eptr, len);
3007     if (fc == d) break;
3008     eptr += len;
3009     }
3010 nigel 93 if (possessive) continue;
3011 nigel 77 for(;;)
3012     {
3013 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3014 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3015     if (eptr-- == pp) break; /* Stop if tried at original pos */
3016     BACKCHAR(eptr);
3017     }
3018     }
3019     else
3020     #endif
3021     /* Not UTF-8 mode */
3022     {
3023     for (i = min; i < max; i++)
3024     {
3025     if (eptr >= md->end_subject || fc == *eptr) break;
3026     eptr++;
3027     }
3028 nigel 93 if (possessive) continue;
3029 nigel 77 while (eptr >= pp)
3030     {
3031 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3032 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3033     eptr--;
3034     }
3035     }
3036    
3037     RRETURN(MATCH_NOMATCH);
3038     }
3039     }
3040     /* Control never gets here */
3041    
3042     /* Match a single character type repeatedly; several different opcodes
3043     share code. This is very similar to the code for single characters, but we
3044     repeat it in the interests of efficiency. */
3045    
3046     case OP_TYPEEXACT:
3047     min = max = GET2(ecode, 1);
3048     minimize = TRUE;
3049     ecode += 3;
3050     goto REPEATTYPE;
3051    
3052     case OP_TYPEUPTO:
3053     case OP_TYPEMINUPTO:
3054     min = 0;
3055     max = GET2(ecode, 1);
3056     minimize = *ecode == OP_TYPEMINUPTO;
3057     ecode += 3;
3058     goto REPEATTYPE;
3059    
3060 nigel 93 case OP_TYPEPOSSTAR:
3061     possessive = TRUE;
3062     min = 0;
3063     max = INT_MAX;
3064     ecode++;
3065     goto REPEATTYPE;
3066    
3067     case OP_TYPEPOSPLUS:
3068     possessive = TRUE;
3069     min = 1;
3070     max = INT_MAX;
3071     ecode++;
3072     goto REPEATTYPE;
3073    
3074     case OP_TYPEPOSQUERY:
3075     possessive = TRUE;
3076     min = 0;
3077     max = 1;
3078     ecode++;
3079     goto REPEATTYPE;
3080    
3081     case OP_TYPEPOSUPTO:
3082     possessive = TRUE;
3083     min = 0;
3084     max = GET2(ecode, 1);
3085     ecode += 3;
3086     goto REPEATTYPE;
3087    
3088 nigel 77 case OP_TYPESTAR:
3089     case OP_TYPEMINSTAR:
3090     case OP_TYPEPLUS:
3091     case OP_TYPEMINPLUS:
3092     case OP_TYPEQUERY:
3093     case OP_TYPEMINQUERY:
3094     c = *ecode++ - OP_TYPESTAR;
3095     minimize = (c & 1) != 0;
3096     min = rep_min[c]; /* Pick up values from tables; */
3097     max = rep_max[c]; /* zero for max => infinity */
3098     if (max == 0) max = INT_MAX;
3099    
3100     /* Common code for all repeated single character type matches. Note that
3101     in UTF-8 mode, '.' matches a character of any length, as can the negated
3102     types and the newline and horizontal/vertical space types; for the positive digit, whitespace, and word types, the valid characters are all one-byte long. */
3103    
3104     REPEATTYPE:
3105     ctype = *ecode++; /* Code for the character type */
3106    
3107     #ifdef SUPPORT_UCP
3108     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3109     {
3110     prop_fail_result = ctype == OP_NOTPROP;
3111     prop_type = *ecode++;
3112 nigel 87 prop_value = *ecode++;
3113 nigel 77 }
3114     else prop_type = -1;
3115     #endif
3116    
3117     /* First, ensure the minimum number of matches are present. Use inline
3118     code for maximizing the speed, and do the type test once at the start
3119 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3120 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3121     and single-bytes. */
3122    
3123     if (min > 0)
3124     {
3125     #ifdef SUPPORT_UCP
3126 nigel 87 if (prop_type >= 0)
3127 nigel 77 {
3128 nigel 87 switch(prop_type)
3129 nigel 77 {
3130 nigel 87 case PT_ANY:
3131     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3132     for (i = 1; i <= min; i++)
3133     {
3134 ph10 427 if (eptr >= md->end_subject)
3135 ph10 426 {
3136 ph10 427 SCHECK_PARTIAL();
3137 ph10 426 RRETURN(MATCH_NOMATCH);
3138 ph10 427 }
3139 ph10 184 GETCHARINCTEST(c, eptr);
3140 nigel 87 }
3141     break;
3142    
3143     case PT_LAMP:
3144     for (i = 1; i <= min; i++)
3145     {
3146 ph10 427 if (eptr >= md->end_subject)
3147 ph10 426 {
3148 ph10 427 SCHECK_PARTIAL();
3149 ph10 426 RRETURN(MATCH_NOMATCH);
3150 ph10 427 }
3151 ph10 184 GETCHARINCTEST(c, eptr);
3152 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3153 nigel 87 if ((prop_chartype == ucp_Lu ||
3154     prop_chartype == ucp_Ll ||
3155     prop_chartype == ucp_Lt) == prop_fail_result)
3156     RRETURN(MATCH_NOMATCH);
3157     }
3158     break;
3159    
3160     case PT_GC:
3161     for (i = 1; i <= min; i++)
3162     {
3163 ph10 427 if (eptr >= md->end_subject)
3164 ph10 426 {
3165 ph10 427 SCHECK_PARTIAL();
3166 ph10 426 RRETURN(MATCH_NOMATCH);
3167 ph10 427 }
3168 ph10 184 GETCHARINCTEST(c, eptr);
3169 ph10 349 prop_category = UCD_CATEGORY(c);
3170 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3171     RRETURN(MATCH_NOMATCH);
3172     }
3173     break;
3174    
3175     case PT_PC:
3176     for (i = 1; i <= min; i++)
3177     {
3178 ph10 427 if (eptr >= md->end_subject)
3179 ph10 426 {
3180 ph10 427 SCHECK_PARTIAL();
3181 ph10 426 RRETURN(MATCH_NOMATCH);
3182 ph10 427 }
3183 ph10 184 GETCHARINCTEST(c, eptr);
3184 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3185 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3186     RRETURN(MATCH_NOMATCH);
3187     }
3188     break;
3189    
3190     case PT_SC:
3191     for (i = 1; i <= min; i++)
3192     {
3193 ph10 427 if (eptr >= md->end_subject)
3194 ph10 426 {
3195 ph10 427 SCHECK_PARTIAL();
3196 ph10 426 RRETURN(MATCH_NOMATCH);
3197 ph10 427 }
3198 ph10 184 GETCHARINCTEST(c, eptr);
3199 ph10 349 prop_script = UCD_SCRIPT(c);
3200 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3201     RRETURN(MATCH_NOMATCH);
3202     }
3203     break;
3204    
3205     default:
3206     RRETURN(PCRE_ERROR_INTERNAL);
3207 nigel 77 }
3208     }
3209    
3210     /* Match extended Unicode sequences. We will get here only if the
3211     support is in the binary; otherwise a compile-time error occurs. */
3212    
3213     else if (ctype == OP_EXTUNI)
3214     {
3215     for (i = 1; i <= min; i++)
3216     {
3217 ph10 427 if (eptr >= md->end_subject)
3218 ph10 426 {
3219 ph10 427 SCHECK_PARTIAL();
3220 ph10 426 RRETURN(MATCH_NOMATCH);
3221 ph10 427 }
3222 nigel 77 GETCHARINCTEST(c, eptr);
3223 ph10 349 prop_category = UCD_CATEGORY(c);
3224 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3225     while (eptr < md->end_subject)
3226     {
3227     int len = 1;
3228 ph10 426 if (!utf8) c = *eptr;
3229     else { GETCHARLEN(c, eptr, len); }
3230 ph10 349 prop_category = UCD_CATEGORY(c);
3231 nigel 77 if (prop_category != ucp_M) break;
3232     eptr += len;
3233     }
3234     }
3235     }
3236    
3237     else
3238     #endif /* SUPPORT_UCP */
3239    
3240     /* Handle all other cases when the coding is UTF-8 */
3241    
3242     #ifdef SUPPORT_UTF8
3243     if (utf8) switch(ctype)
3244     {
3245     case OP_ANY:
3246     for (i = 1; i <= min; i++)
3247     {
3248 ph10 426 if (eptr >= md->end_subject)
3249     {
3250 ph10 427 SCHECK_PARTIAL();
3251 nigel 77 RRETURN(MATCH_NOMATCH);
3252 ph10 427 }
3253 ph10 426 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3254 nigel 91 eptr++;
3255 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3256     }
3257     break;
3258    
3259 ph10 341 case OP_ALLANY:
3260     for (i = 1; i <= min; i++)
3261     {
3262 ph10 427 if (eptr >= md->end_subject)
3263 ph10 426 {
3264     SCHECK_PARTIAL();
3265     RRETURN(MATCH_NOMATCH);
3266 ph10 427 }
3267 ph10 341 eptr++;
3268     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3269     }
3270     break;
3271    
3272 nigel 77 case OP_ANYBYTE:
3273 ph10 427 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3274 nigel 77 eptr += min;
3275     break;
3276    
3277 nigel 93 case OP_ANYNL:
3278     for (i = 1; i <= min; i++)
3279     {
3280 ph10 427 if (eptr >= md->end_subject)
3281 ph10 426 {
3282     SCHECK_PARTIAL();
3283     RRETURN(MATCH_NOMATCH);
3284 ph10 427 }
3285 nigel 93 GETCHARINC(c, eptr);
3286     switch(c)
3287     {
3288     default: RRETURN(MATCH_NOMATCH);
3289     case 0x000d:
3290     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3291     break;
3292 ph10 231
3293 nigel 93 case 0x000a:
3294 ph10 231 break;
3295    
3296 nigel 93 case 0x000b:
3297     case 0x000c:
3298     case 0x0085:
3299     case 0x2028:
3300     case 0x2029:
3301 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3302 nigel 93 break;
3303     }
3304     }
3305     break;
3306    
3307 ph10 178 case OP_NOT_HSPACE:
3308     for (i = 1; i <= min; i++)
3309     {
3310 ph10 427 if (eptr >= md->end_subject)
3311 ph10 426 {
3312     SCHECK_PARTIAL();
3313     RRETURN(MATCH_NOMATCH);
3314 ph10 427 }
3315 ph10 178 GETCHARINC(c, eptr);
3316     switch(c)
3317     {
3318     default: break;
3319     case 0x09: /* HT */
3320     case 0x20: /* SPACE */
3321     case 0xa0: /* NBSP */
3322     case 0x1680: /* OGHAM SPACE MARK */
3323     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3324     case 0x2000: /* EN QUAD */
3325     case 0x2001: /* EM QUAD */
3326     case 0x2002: /* EN SPACE */
3327     case 0x2003: /* EM SPACE */
3328     case 0x2004: /* THREE-PER-EM SPACE */
3329     case 0x2005: /* FOUR-PER-EM SPACE */
3330     case 0x2006: /* SIX-PER-EM SPACE */
3331     case 0x2007: /* FIGURE SPACE */
3332     case 0x2008: /* PUNCTUATION SPACE */
3333     case 0x2009: /* THIN SPACE */
3334     case 0x200A: /* HAIR SPACE */
3335     case 0x202f: /* NARROW NO-BREAK SPACE */
3336     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3337     case 0x3000: /* IDEOGRAPHIC SPACE */
3338     RRETURN(MATCH_NOMATCH);
3339     }
3340     }
3341     break;
3342 ph10 182
3343 ph10 178 case OP_HSPACE:
3344     for (i = 1; i <= min; i++)
3345     {
3346 ph10 427 if (eptr >= md->end_subject)
3347 ph10 426 {
3348 ph10 427 SCHECK_PARTIAL();
3349 ph10 426 RRETURN(MATCH_NOMATCH);
3350 ph10 427 }
3351 ph10 178 GETCHARINC(c, eptr);
3352     switch(c)
3353     {
3354     default: RRETURN(MATCH_NOMATCH);
3355     case 0x09: /* HT */
3356     case 0x20: /* SPACE */
3357     case 0xa0: /* NBSP */
3358     case 0x1680: /* OGHAM SPACE MARK */
3359     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3360     case 0x2000: /* EN QUAD */
3361     case 0x2001: /* EM QUAD */
3362     case 0x2002: /* EN SPACE */
3363     case 0x2003: /* EM SPACE */
3364     case 0x2004: /* THREE-PER-EM SPACE */
3365     case 0x2005: /* FOUR-PER-EM SPACE */
3366     case 0x2006: /* SIX-PER-EM SPACE */
3367     case 0x2007: /* FIGURE SPACE */
3368     case 0x2008: /* PUNCTUATION SPACE */
3369     case 0x2009: /* THIN SPACE */
3370     case 0x200A: /* HAIR SPACE */
3371     case 0x202f: /* NARROW NO-BREAK SPACE */
3372     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3373     case 0x3000: /* IDEOGRAPHIC SPACE */
3374     break;
3375     }
3376     }
3377     break;
3378 ph10 182
3379 ph10 178 case OP_NOT_VSPACE:
3380     for (i = 1; i <= min; i++)
3381     {
3382 ph10 427 if (eptr >= md->end_subject)
3383 ph10 426 {
3384 ph10 427 SCHECK_PARTIAL();
3385 ph10 426 RRETURN(MATCH_NOMATCH);
3386 ph10 427 }
3387 ph10 178 GETCHARINC(c, eptr);
3388     switch(c)
3389     {
3390     default: break;
3391     case 0x0a: /* LF */
3392     case 0x0b: /* VT */
3393     case 0x0c: /* FF */
3394     case 0x0d: /* CR */
3395     case 0x85: /* NEL */
3396     case 0x2028: /* LINE SEPARATOR */
3397     case 0x2029: /* PARAGRAPH SEPARATOR */
3398     RRETURN(MATCH_NOMATCH);
3399     }
3400     }
3401     break;
3402 ph10 182
3403 ph10 178 case OP_VSPACE:
3404     for (i = 1; i <= min; i++)
3405     {
3406 ph10 427 if (eptr >= md->end_subject)
3407 ph10 426 {
3408 ph10 427 SCHECK_PARTIAL();
3409 ph10 426 RRETURN(MATCH_NOMATCH);
3410 ph10 427 }
3411 ph10 178 GETCHARINC(c, eptr);
3412     switch(c)
3413     {
3414     default: RRETURN(MATCH_NOMATCH);
3415     case 0x0a: /* LF */
3416     case 0x0b: /* VT */
3417     case 0x0c: /* FF */
3418     case 0x0d: /* CR */
3419     case 0x85: /* NEL */
3420     case 0x2028: /* LINE SEPARATOR */
3421     case 0x2029: /* PARAGRAPH SEPARATOR */
3422 ph10 182 break;
3423 ph10 178 }
3424     }
3425     break;
3426    
3427 nigel 77 case OP_NOT_DIGIT:
3428     for (i = 1; i <= min; i++)
3429     {
3430 ph10 427 if (eptr >= md->end_subject)
3431 ph10 426 {
3432 ph10 427 SCHECK_PARTIAL();
3433 ph10 426 RRETURN(MATCH_NOMATCH);
3434 ph10 427 }
3435 nigel 77 GETCHARINC(c, eptr);
3436     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3437     RRETURN(MATCH_NOMATCH);
3438     }
3439     break;
3440    
3441     case OP_DIGIT:
3442     for (i = 1; i <= min; i++)
3443     {
3444 ph10 427 if (eptr >= md->end_subject)
3445 ph10 426 {
3446 ph10 427 SCHECK_PARTIAL();
3447 nigel 77 RRETURN(MATCH_NOMATCH);
3448 ph10 427 }
3449 ph10 426 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3450     RRETURN(MATCH_NOMATCH);
3451 nigel 77 /* No need to skip more bytes - we know it's a 1-byte character */
3452     }
3453     break;
3454    
3455     case OP_NOT_WHITESPACE:
3456     for (i = 1; i <= min; i++)
3457     {
3458 ph10 427 if (eptr >= md->end_subject)
3459 ph10 426 {
3460 ph10 427 SCHECK_PARTIAL();
3461 nigel 77 RRETURN(MATCH_NOMATCH);
3462 ph10 427 }
3463 ph10 426 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3464     RRETURN(MATCH_NOMATCH);
3465 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3466 nigel 77 }
3467     break;
3468    
3469     case OP_WHITESPACE:
3470     for (i = 1; i <= min; i++)
3471     {
3472 ph10 427 if (eptr >= md->end_subject)
3473 ph10 426 {
3474 ph10 427 SCHECK_PARTIAL();
3475 nigel 77 RRETURN(MATCH_NOMATCH);
3476 ph10 427 }
3477 ph10 426 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3478     RRETURN(MATCH_NOMATCH);
3479 nigel 77 /* No need to skip more bytes - we know it's a 1-byte character */
3480     }
3481     break;
3482    
3483     case OP_NOT_WORDCHAR:
3484     for (i = 1; i <= min; i++)
3485     {
3486     if (eptr >= md->end_subject ||
3487 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3488 nigel 77 RRETURN(MATCH_NOMATCH);
3489 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3490 nigel 77 }
3491     break;
3492    
3493     case OP_WORDCHAR:
3494     for (i = 1; i <= min; i++)
3495     {
3496 ph10 427 if (eptr >= md->end_subject)
3497 ph10 426 {
3498 ph10 427 SCHECK_PARTIAL();
3499 nigel 77 RRETURN(MATCH_NOMATCH);
3500 ph10 427 }
3501 ph10 426 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3502     RRETURN(MATCH_NOMATCH);
3503 nigel 77 /* No need to skip more bytes - we know it's a 1-byte character */
3504     }
3505     break;
3506    
3507     default:
3508     RRETURN(PCRE_ERROR_INTERNAL);
3509     } /* End switch(ctype) */
3510    
3511     else
3512     #endif /* SUPPORT_UTF8 */
3513    
3514     /* Code for the non-UTF-8 case for minimum matching of operators other
3515 ph10 426 than OP_PROP and OP_NOTPROP. */
3516 nigel 77
3517     switch(ctype)
3518     {
3519     case OP_ANY:
3520 ph10 342 for (i = 1; i <= min; i++)
3521 nigel 77 {
3522 ph10 427 if (eptr >= md->end_subject)
3523 ph10 426 {
3524 ph10 427 SCHECK_PARTIAL();
3525 ph10 426 RRETURN(MATCH_NOMATCH);
3526 ph10 427 }
3527 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3528     eptr++;
3529 nigel 77 }
3530     break;
3531    
3532 ph10 341 case OP_ALLANY:
3533 ph10 428 if (eptr > md->end_subject - min)
3534     {
3535     SCHECK_PARTIAL();
3536     RRETURN(MATCH_NOMATCH);
3537     }
3538 ph10 341 eptr += min;
3539     break;
3540    
3541 nigel 77 case OP_ANYBYTE:
3542 ph10 428 if (eptr > md->end_subject - min)
3543     {
3544     SCHECK_PARTIAL();
3545     RRETURN(MATCH_NOMATCH);
3546     }
3547 nigel 77 eptr += min;
3548     break;
3549    
3550 nigel 93 case OP_ANYNL:
3551     for (i = 1; i <= min; i++)
3552     {
3553 ph10 427 if (eptr >= md->end_subject)
3554 ph10 426 {
3555 ph10 427 SCHECK_PARTIAL();
3556 ph10 426 RRETURN(MATCH_NOMATCH);
3557 ph10 427 }
3558 nigel 93 switch(*eptr++)
3559     {
3560     default: RRETURN(MATCH_NOMATCH);
3561     case 0x000d:
3562     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3563     break;
3564     case 0x000a:
3565 ph10 231 break;
3566    
3567 nigel 93 case 0x000b:
3568     case 0x000c:
3569     case 0x0085:
3570 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3571 nigel 93 break;
3572     }
3573     }
3574     break;
3575    
3576 ph10 178 case OP_NOT_HSPACE:
3577     for (i = 1; i <= min; i++)
3578     {
3579 ph10 427 if (eptr >= md->end_subject)
3580 ph10 426 {
3581 ph10 427 SCHECK_PARTIAL();
3582 ph10 426 RRETURN(MATCH_NOMATCH);
3583 ph10 427 }
3584 ph10 178 switch(*eptr++)
3585     {
3586     default: break;
3587     case 0x09: /* HT */
3588     case 0x20: /* SPACE */
3589     case 0xa0: /* NBSP */
3590     RRETURN(MATCH_NOMATCH);
3591     }
3592     }
3593     break;
3594    
3595     case OP_HSPACE:
3596     for (i = 1; i <= min; i++)
3597     {
3598 ph10 427 if (eptr >= md->end_subject)
3599 ph10 426 {
3600 ph10 427 SCHECK_PARTIAL();
3601 ph10 426 RRETURN(MATCH_NOMATCH);
3602 ph10 427 }
3603 ph10 178 switch(*eptr++)
3604     {
3605     default: RRETURN(MATCH_NOMATCH);
3606     case 0x09: /* HT */
3607     case 0x20: /* SPACE */
3608     case 0xa0: /* NBSP */
3609 ph10 182 break;
3610 ph10 178 }
3611     }
3612     break;
3613    
3614     case OP_NOT_VSPACE:
3615     for (i = 1; i <= min; i++)
3616     {
3617 ph10 427 if (eptr >= md->end_subject)
3618 ph10 426 {
3619 ph10 427 SCHECK_PARTIAL();
3620 ph10 426 RRETURN(MATCH_NOMATCH);
3621 ph10 427 }
3622 ph10 178 switch(*eptr++)
3623     {
3624     default: break;
3625     case 0x0a: /* LF */
3626     case 0x0b: /* VT */
3627     case 0x0c: /* FF */
3628     case 0x0d: /* CR */
3629     case 0x85: /* NEL */
3630     RRETURN(MATCH_NOMATCH);
3631     }
3632     }
3633     break;
3634    
3635     case OP_VSPACE:
3636     for (i = 1; i <= min; i++)
3637     {
3638 ph10 427 if (eptr >= md->end_subject)
3639 ph10 426 {
3640 ph10 427 SCHECK_PARTIAL();
3641 ph10 426 RRETURN(MATCH_NOMATCH);
3642 ph10 427 }
3643 ph10 178 switch(*eptr++)
3644     {
3645     default: RRETURN(MATCH_NOMATCH);
3646     case 0x0a: /* LF */
3647     case 0x0b: /* VT */
3648     case 0x0c: /* FF */
3649     case 0x0d: /* CR */
3650     case 0x85: /* NEL */
3651 ph10 182 break;
3652 ph10 178 }
3653     }
3654     break;
3655    
3656 nigel 77 case OP_NOT_DIGIT:
3657     for (i = 1; i <= min; i++)
3658 ph10 427 {
3659     if (eptr >= md->end_subject)
3660 ph10 426 {
3661 ph10 427 SCHECK_PARTIAL();
3662 ph10 426 RRETURN(MATCH_NOMATCH);
3663 ph10 427 }
3664 nigel 77 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3665 ph10 427 }
3666 nigel 77 break;
3667    
3668     case OP_DIGIT:
3669     for (i = 1; i <= min; i++)
3670 ph10 427 {
3671     if (eptr >= md->end_subject)
3672 ph10 426 {
3673 ph10 427 SCHECK_PARTIAL();
3674 ph10 426 RRETURN(MATCH_NOMATCH);
3675 ph10 427 }
3676 nigel 77 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3677 ph10 427 }
3678 nigel 77 break;
3679    
3680     case OP_NOT_WHITESPACE:
3681     for (i = 1; i <= min; i++)
3682 ph10 427 {
3683     if (eptr >= md->end_subject)
3684 ph10 426 {
3685 ph10 427 SCHECK_PARTIAL();
3686 ph10 426 RRETURN(MATCH_NOMATCH);
3687 ph10 427 }
3688 nigel 77 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3689 ph10 427 }
3690 nigel 77 break;
3691    
3692     case OP_WHITESPACE:
3693     for (i = 1; i <= min; i++)
3694 ph10 427 {
3695     if (eptr >= md->end_subject)
3696 ph10 426 {
3697 ph10 427 SCHECK_PARTIAL();
3698 ph10 426 RRETURN(MATCH_NOMATCH);
3699 ph10 427 }
3700 nigel 77 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3701 ph10 427 }
3702 nigel 77 break;
3703    
3704     case OP_NOT_WORDCHAR:
3705     for (i = 1; i <= min; i++)
3706 ph10 427 {
3707     if (eptr >= md->end_subject)
3708 ph10 426 {
3709 ph10 427 SCHECK_PARTIAL();
3710 ph10 426 RRETURN(MATCH_NOMATCH);
3711 ph10 427 }
3712 nigel 77 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3713     RRETURN(MATCH_NOMATCH);
3714 ph10 427 }
3715 nigel 77 break;
3716    
3717     case OP_WORDCHAR:
3718     for (i = 1; i <= min; i++)
3719 ph10 427 {
3720     if (eptr >= md->end_subject)
3721 ph10 426 {
3722 ph10 427 SCHECK_PARTIAL();
3723 ph10 426 RRETURN(MATCH_NOMATCH);
3724 ph10 427 }
3725 nigel 77 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3726     RRETURN(MATCH_NOMATCH);
3727 ph10 427 }
3728 nigel 77 break;
3729    
3730     default:
3731     RRETURN(PCRE_ERROR_INTERNAL);
3732     }
3733     }
3734    
3735     /* If min = max, continue at the same level without recursing */
3736    
3737     if (min == max) continue;
3738    
3739     /* If minimizing, we have to test the rest of the pattern before each
3740     subsequent match. Again, separate the UTF-8 case for speed, and also
3741     separate the UCP cases. */
3742    
3743     if (minimize)
3744     {
3745     #ifdef SUPPORT_UCP
3746 nigel 87 if (prop_type >= 0)
3747 nigel 77 {
3748 nigel 87 switch(prop_type)
3749 nigel 77 {
3750 nigel 87 case PT_ANY:
3751     for (fi = min;; fi++)
3752     {
3753 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3754 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3755 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3756 ph10 427 if (eptr >= md->end_subject)
3757 ph10 426 {
3758 ph10 427 SCHECK_PARTIAL();
3759 ph10 426 RRETURN(MATCH_NOMATCH);
3760 ph10 427 }
3761 nigel 87 GETCHARINC(c, eptr);
3762     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3763     }
3764 nigel 93 /* Control never gets here */
3765 nigel 87
3766     case PT_LAMP:
3767     for (fi = min;; fi++)
3768     {
3769 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3770 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3771 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3772 ph10 427 if (eptr >= md->end_subject)
3773 ph10 426 {
3774 ph10 427 SCHECK_PARTIAL();
3775 ph10 426 RRETURN(MATCH_NOMATCH);
3776 ph10 427 }
3777 nigel 87 GETCHARINC(c, eptr);
3778 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3779 nigel 87 if ((prop_chartype == ucp_Lu ||
3780     prop_chartype == ucp_Ll ||
3781     prop_chartype == ucp_Lt) == prop_fail_result)
3782     RRETURN(MATCH_NOMATCH);
3783     }
3784 nigel 93 /* Control never gets here */
3785 nigel 87
3786     case PT_GC:
3787     for (fi = min;; fi++)
3788     {
3789 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3790 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3791 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3792 ph10 427 if (eptr >= md->end_subject)
3793 ph10 426 {
3794 ph10 427 SCHECK_PARTIAL();
3795 ph10 426 RRETURN(MATCH_NOMATCH);
3796 ph10 427 }
3797 nigel 87 GETCHARINC(c, eptr);
3798 ph10 349 prop_category = UCD_CATEGORY(c);
3799 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3800     RRETURN(MATCH_NOMATCH);
3801     }
3802 nigel 93 /* Control never gets here */
3803 nigel 87
3804     case PT_PC:
3805     for (fi = min;; fi++)
3806     {
3807 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3808 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3809 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3810 ph10 427 if (eptr >= md->end_subject)
3811 ph10 426 {
3812 ph10 427 SCHECK_PARTIAL();
3813 ph10 426 RRETURN(MATCH_NOMATCH);
3814 ph10 427 }
3815 nigel 87 GETCHARINC(c, eptr);
3816 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3817 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3818     RRETURN(MATCH_NOMATCH);
3819     }
3820 nigel 93 /* Control never gets here */
3821 nigel 87
3822     case PT_SC:
3823     for (fi = min;; fi++)
3824     {
3825 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3826 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3827 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3828 ph10 427 if (eptr >= md->end_subject)
3829 ph10 426 {
3830 ph10 427 SCHECK_PARTIAL();
3831 ph10 426 RRETURN(MATCH_NOMATCH);
3832 ph10 427 }
3833 nigel 87 GETCHARINC(c, eptr);
3834 ph10 349 prop_script = UCD_SCRIPT(c);
3835 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3836     RRETURN(MATCH_NOMATCH);
3837     }
3838 nigel 93 /* Control never gets here */
3839 nigel 87
3840     default:
3841     RRETURN(PCRE_ERROR_INTERNAL);
3842 nigel 77 }
3843     }
3844    
3845     /* Match extended Unicode sequences. We will get here only if the
3846     support is in the binary; otherwise a compile-time error occurs. */
3847    
3848     else if (ctype == OP_EXTUNI)
3849     {
3850     for (fi = min;; fi++)
3851     {
3852 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3853 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3854 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3855 ph10 427 if (eptr >= md->end_subject)
3856 ph10 426 {
3857 ph10 427 SCHECK_PARTIAL();
3858 ph10 426 RRETURN(MATCH_NOMATCH);
3859 ph10 427 }
3860 nigel 77 GETCHARINCTEST(c, eptr);
3861 ph10 349 prop_category = UCD_CATEGORY(c);
3862 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3863     while (eptr < md->end_subject)
3864     {
3865     int len = 1;
3866 ph10 426 if (!utf8) c = *eptr;
3867     else { GETCHARLEN(c, eptr, len); }
3868 ph10 349 prop_category = UCD_CATEGORY(c);
3869 nigel 77 if (prop_category != ucp_M) break;
3870     eptr += len;
3871     }
3872     }
3873     }
3874    
3875     else
3876     #endif /* SUPPORT_UCP */
3877    
3878     #ifdef SUPPORT_UTF8
3879     /* UTF-8 mode */
3880     if (utf8)
3881     {
3882     for (fi = min;; fi++)
3883     {
3884 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3885 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3886 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3887 ph10 427 if (eptr >= md->end_subject)
3888 ph10 426 {
3889 ph10 427 SCHECK_PARTIAL();
3890 ph10 426 RRETURN(MATCH_NOMATCH);
3891 ph10 427 }
3892 ph10 426 if (ctype == OP_ANY && IS_NEWLINE(eptr))
3893     RRETURN(MATCH_NOMATCH);
3894 nigel 77 GETCHARINC(c, eptr);
3895     switch(ctype)
3896     {
3897 ph10 342 case OP_ANY: /* This is the non-NL case */
3898 ph10 345 case OP_ALLANY:
3899 nigel 77 case OP_ANYBYTE:
3900     break;
3901    
3902 nigel 93 case OP_ANYNL:
3903     switch(c)
3904     {
3905     default: RRETURN(MATCH_NOMATCH);
3906     case 0x000d:
3907     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3908     break;
3909     case 0x000a:
3910 ph10 231 break;
3911    
3912 nigel 93 case 0x000b:
3913     case 0x000c:
3914     case 0x0085:
3915     case 0x2028:
3916     case 0x2029:
3917 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3918 nigel 93 break;
3919     }
3920     break;
3921    
3922 ph10 178 case OP_NOT_HSPACE:
3923     switch(c)
3924     {
3925     default: break;
3926     case 0x09: /* HT */
3927     case 0x20: /* SPACE */
3928     case 0xa0: /* NBSP */
3929     case 0x1680: /* OGHAM SPACE MARK */
3930     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3931     case 0x2000: /* EN QUAD */
3932     case 0x2001: /* EM QUAD */
3933     case 0x2002: /* EN SPACE */
3934     case 0x2003: /* EM SPACE */
3935     case 0x2004: /* THREE-PER-EM SPACE */
3936     case 0x2005: /* FOUR-PER-EM SPACE */
3937     case 0x2006: /* SIX-PER-EM SPACE */
3938     case 0x2007: /* FIGURE SPACE */
3939     case 0x2008: /* PUNCTUATION SPACE */
3940     case 0x2009: /* THIN SPACE */
3941     case 0x200A: /* HAIR SPACE */
3942     case 0x202f: /* NARROW NO-BREAK SPACE */
3943     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3944     case 0x3000: /* IDEOGRAPHIC SPACE */
3945     RRETURN(MATCH_NOMATCH);
3946     }
3947     break;
3948    
3949     case OP_HSPACE:
3950     switch(c)
3951     {
3952     default: RRETURN(MATCH_NOMATCH);
3953     case 0x09: /* HT */
3954     case 0x20: /* SPACE */
3955     case 0xa0: /* NBSP */
3956     case 0x1680: /* OGHAM SPACE MARK */
3957     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3958     case 0x2000: /* EN QUAD */
3959     case 0x2001: /* EM QUAD */
3960     case 0x2002: /* EN SPACE */
3961     case 0x2003: /* EM SPACE */
3962     case 0x2004: /* THREE-PER-EM SPACE */
3963     case 0x2005: /* FOUR-PER-EM SPACE */
3964     case 0x2006: /* SIX-PER-EM SPACE */
3965     case 0x2007: /* FIGURE SPACE */
3966     case 0x2008: /* PUNCTUATION SPACE */
3967     case 0x2009: /* THIN SPACE */
3968     case 0x200A: /* HAIR SPACE */
3969     case 0x202f: /* NARROW NO-BREAK SPACE */
3970     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3971     case 0x3000: /* IDEOGRAPHIC SPACE */
3972     break;
3973     }
3974     break;
3975    
3976     case OP_NOT_VSPACE:
3977     switch(c)
3978     {
3979     default: break;
3980     case 0x0a: /* LF */
3981     case 0x0b: /* VT */
3982     case 0x0c: /* FF */
3983     case 0x0d: /* CR */
3984     case 0x85: /* NEL */
3985     case 0x2028: /* LINE SEPARATOR */
3986     case 0x2029: /* PARAGRAPH SEPARATOR */
3987     RRETURN(MATCH_NOMATCH);
3988     }
3989     break;
3990    
3991     case OP_VSPACE:
3992     switch(c)
3993     {
3994