/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 428 - (hide annotations) (download)
Mon Aug 31 17:10:26 2009 UTC (3 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 161004 byte(s)
Further partial match change: add PCRE_PARTIAL_HARD and make more intuitive.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 381 Copyright (c) 1997-2009 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 nigel 77 /* Flag bits for the match() function */
61    
62 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
63     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64 nigel 77
65     /* Non-error returns from the match() function. Error returns are externally
66     defined PCRE_ERROR_xxx codes, which are all negative. */
67    
68     #define MATCH_MATCH 1
69     #define MATCH_NOMATCH 0
70    
71 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
72 ph10 210 negative to avoid the external error codes. */
73    
74     #define MATCH_COMMIT (-999)
75     #define MATCH_PRUNE (-998)
76     #define MATCH_SKIP (-997)
77     #define MATCH_THEN (-996)
78    
79 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
80     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81     because the offset vector is always a multiple of 3 long. */
82    
83     #define REC_STACK_SAVE_MAX 30
84    
85     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86    
87     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
88     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89    
90    
91    
92     #ifdef DEBUG
93     /*************************************************
94     * Debugging function to print chars *
95     *************************************************/
96    
97     /* Print a sequence of chars in printable format, stopping at the end of the
98     subject if requested.
99    
100     Arguments:
101     p points to characters
102     length number to print
103     is_subject TRUE if printing from within md->start_subject
104     md pointer to matching data block, if is_subject is TRUE
105    
106     Returns: nothing
107     */
108    
109     static void
110     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111     {
112 nigel 93 unsigned int c;
113 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114     while (length-- > 0)
115     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116     }
117     #endif
118    
119    
120    
121     /*************************************************
122     * Match a back-reference *
123     *************************************************/
124    
125     /* If a back reference hasn't been set, the length that is passed is greater
126     than the number of characters left in the string, so the match fails.
127    
128     Arguments:
129     offset index into the offset vector
130     eptr points into the subject
131     length length to be matched
132     md points to match data block
133     ims the ims flags
134    
135     Returns: TRUE if matched
136     */
137    
138     static BOOL
139 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 nigel 77 unsigned long int ims)
141     {
142 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
143 nigel 77
144     #ifdef DEBUG
145     if (eptr >= md->end_subject)
146     printf("matching subject <null>");
147     else
148     {
149     printf("matching subject ");
150     pchars(eptr, length, TRUE, md);
151     }
152     printf(" against backref ");
153     pchars(p, length, FALSE, md);
154     printf("\n");
155     #endif
156    
157     /* Always fail if not enough characters left */
158    
159     if (length > md->end_subject - eptr) return FALSE;
160    
161 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162     properly if Unicode properties are supported. Otherwise, we can check only
163     ASCII characters. */
164 nigel 77
165     if ((ims & PCRE_CASELESS) != 0)
166     {
167 ph10 354 #ifdef SUPPORT_UTF8
168     #ifdef SUPPORT_UCP
169     if (md->utf8)
170     {
171 ph10 358 USPTR endptr = eptr + length;
172 ph10 354 while (eptr < endptr)
173     {
174 ph10 358 int c, d;
175 ph10 354 GETCHARINC(c, eptr);
176     GETCHARINC(d, p);
177     if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 ph10 358 }
179     }
180 ph10 354 else
181     #endif
182     #endif
183    
184     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185     is no UCP support. */
186 ph10 358
187 nigel 77 while (length-- > 0)
188 ph10 354 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 nigel 77 }
190 ph10 358
191 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
192     are in UTF-8 mode. */
193 ph10 358
194 nigel 77 else
195     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196    
197     return TRUE;
198     }
199    
200    
201    
202     /***************************************************************************
203     ****************************************************************************
204     RECURSION IN THE match() FUNCTION
205    
206 nigel 87 The match() function is highly recursive, though not every recursive call
207     increases the recursive depth. Nevertheless, some regular expressions can cause
208     it to recurse to a great depth. I was writing for Unix, so I just let it call
209     itself recursively. This uses the stack for saving everything that has to be
210     saved for a recursive call. On Unix, the stack can be large, and this works
211     fine.
212 nigel 77
213 nigel 87 It turns out that on some non-Unix-like systems there are problems with
214     programs that use a lot of stack. (This despite the fact that every last chip
215     has oodles of memory these days, and techniques for extending the stack have
216     been known for decades.) So....
217 nigel 77
218     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219     calls by keeping local variables that need to be preserved in blocks of memory
220 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
221 nigel 77 achieve this so that the actual code doesn't look very different to what it
222     always used to.
223 ph10 164
224 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
225 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
226     Switzer, the use of longjmp() has been abolished, at the cost of having to
227     provide a unique number for each call to RMATCH. There is no way of generating
228     a sequence of numbers at compile time in C. I have given them names, to make
229     them stand out more clearly.
230    
231     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
234     don't have indeterminate values; this has meant that the frame size can be
235 ph10 164 reduced because the result can be "passed back" by straight setting of the
236     variable instead of being passed in the frame.
237 nigel 77 ****************************************************************************
238     ***************************************************************************/
239    
240 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241     below must be updated in sync. */
242 nigel 77
243 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
244     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
245     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
246     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
247 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
248 ph10 212 RM51, RM52, RM53, RM54 };
249 ph10 164
250 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
251 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 ph10 164 actually used in this definition. */
253 nigel 77
254     #ifndef NO_RECURSE
255     #define REGISTER register
256 ph10 164
#ifdef DEBUG

/* Debugging form of a recursive call: trace the line making the call and the
line it comes back to. The result of match() is left in rrc. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

/* Debugging form of a return. The argument is copied into a local first so
that it is evaluated exactly once, as it is in the production form below. */

#define RRETURN(ra) \
  { \
  int rreturn_value = (ra); \
  printf("match() returned %d from line %d ", rreturn_value, __LINE__); \
  return rreturn_value; \
  }

#else

/* Production forms: an ordinary recursive call whose result goes into rrc,
and an ordinary return. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra

#endif
274    
275 nigel 77 #else
276    
277    
278 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
279     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280     argument of match(), which never changes. */
281 nigel 77
282     #define REGISTER
283    
/* Heap-based "recursive call". A new frame is obtained, the arguments for the
inner "call" are stored in it, the resume point (rw) is remembered in the
current frame, and control jumps to HEAP_RECURSE. The label L_<rw> is where
execution resumes after the inner "call" has returned.

If the frame cannot be allocated, the current frame is given up by means of
RRETURN with PCRE_ERROR_NOMEMORY instead of writing through a NULL pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
303    
/* Heap-based "return". The current frame is released and its predecessor
becomes current again. At the top level (no predecessor) this is a genuine
return from match(); otherwise the value is handed back in rrc and control
goes to HEAP_RETURN. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
316    
317    
318     /* Structure for remembering the local variables in a private frame */
319    
320     typedef struct heapframe {
321     struct heapframe *Xprevframe;
322    
323     /* Function arguments that may change */
324    
325 ph10 409 USPTR Xeptr;
326 nigel 77 const uschar *Xecode;
327 ph10 409 USPTR Xmstart;
328 nigel 77 int Xoffset_top;
329     long int Xims;
330     eptrblock *Xeptrb;
331     int Xflags;
332 nigel 91 unsigned int Xrdepth;
333 nigel 77
334     /* Function local variables */
335    
336 ph10 409 USPTR Xcallpat;
337 ph10 406 #ifdef SUPPORT_UTF8
338 ph10 409 USPTR Xcharptr;
339 ph10 406 #endif
340 ph10 409 USPTR Xdata;
341     USPTR Xnext;
342     USPTR Xpp;
343     USPTR Xprev;
344     USPTR Xsaved_eptr;
345 nigel 77
346     recursion_info Xnew_recursive;
347    
348     BOOL Xcur_is_word;
349     BOOL Xcondition;
350     BOOL Xprev_is_word;
351    
352     unsigned long int Xoriginal_ims;
353    
354     #ifdef SUPPORT_UCP
355     int Xprop_type;
356 nigel 87 int Xprop_value;
357 nigel 77 int Xprop_fail_result;
358     int Xprop_category;
359     int Xprop_chartype;
360 nigel 87 int Xprop_script;
361 ph10 123 int Xoclength;
362     uschar Xocchars[8];
363 nigel 77 #endif
364    
365 ph10 403 int Xcodelink;
366 nigel 77 int Xctype;
367 nigel 93 unsigned int Xfc;
368 nigel 77 int Xfi;
369     int Xlength;
370     int Xmax;
371     int Xmin;
372     int Xnumber;
373     int Xoffset;
374     int Xop;
375     int Xsave_capture_last;
376     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
377     int Xstacksave[REC_STACK_SAVE_MAX];
378    
379     eptrblock Xnewptrb;
380    
381 ph10 164 /* Where to jump back to */
382 nigel 77
383 ph10 164 int Xwhere;
384 ph10 165
385 nigel 77 } heapframe;
386    
387     #endif
388    
389    
390     /***************************************************************************
391     ***************************************************************************/
392    
393    
394    
395     /*************************************************
396     * Match from current position *
397     *************************************************/
398    
399 nigel 93 /* This function is called recursively in many circumstances. Whenever it
400 nigel 77 returns a negative (error) response, the outer incarnation must also return the
401 ph10 426 same response. */
402 nigel 77
403 ph10 426 /* These macros pack up tests that are used for partial matching, and which
404     appear several times in the code. We set the "hit end" flag if the pointer is
405     at the end of the subject and also past the start of the subject (i.e.
406 ph10 427 something has been matched). For hard partial matching, we then return
407     immediately. The second one is used when we already know we are past the end of
408     the subject. */
409 ph10 426
/* Partial-match test for use where the end of the subject has not yet been
checked: note that the end was hit once something has been consumed, and give
up at once for hard partial matching (md->partial > 1). */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr > mstart && eptr >= md->end_subject)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
    }
416 ph10 426
/* As CHECK_PARTIAL, but without the end-of-subject comparison, for use where
that is already known to hold. */

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > mstart)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
    }
423 ph10 426
424 ph10 427
425 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
426     the md structure (e.g. utf8, end_subject) into individual variables to improve
427 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
428     made performance worse.
429    
430     Arguments:
431 nigel 93 eptr pointer to current character in subject
432     ecode pointer to current position in compiled code
433 ph10 168 mstart pointer to the current match start position (can be modified
434 ph10 172 by encountering \K)
435 nigel 77 offset_top current top pointer
436     md pointer to "static" info for the match
437     ims current /i, /m, and /s options
438     eptrb pointer to chain of blocks containing eptr at start of
439     brackets - for testing for empty matches
440     flags can contain
441     match_condassert - this is an assertion condition
442 nigel 93 match_cbegroup - this is the start of an unlimited repeat
443     group that can match an empty string
444 nigel 87 rdepth the recursion depth
445 nigel 77
446     Returns: MATCH_MATCH if matched ) these values are >= 0
447     MATCH_NOMATCH if failed to match )
448     a negative PCRE_ERROR_xxx value if aborted by an error condition
449 nigel 87 (e.g. stopped by repeated call or recursion limit)
450 nigel 77 */
451    
452     static int
453 ph10 409 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
454 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
455 nigel 91 int flags, unsigned int rdepth)
456 nigel 77 {
457     /* These variables do not need to be preserved over recursion in this function,
458 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
459     "register" because they are used a lot in loops. */
460 nigel 77
461 nigel 91 register int rrc; /* Returns from recursive calls */
462     register int i; /* Used for loops not involving calls to RMATCH() */
463 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
464 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
465 nigel 77
466 nigel 93 BOOL minimize, possessive; /* Quantifier options */
467 ph10 403 int condcode;
468 nigel 93
469 nigel 77 /* When recursion is not being used, all "local" variables that have to be
470     preserved over calls to RMATCH() are part of a "frame" which is obtained from
471     heap storage. Set up the top-level frame here; others are obtained from the
472     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
473    
474     #ifdef NO_RECURSE
475     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
476     frame->Xprevframe = NULL; /* Marks the top level */
477    
478     /* Copy in the original argument variables */
479    
480     frame->Xeptr = eptr;
481     frame->Xecode = ecode;
482 ph10 168 frame->Xmstart = mstart;
483 nigel 77 frame->Xoffset_top = offset_top;
484     frame->Xims = ims;
485     frame->Xeptrb = eptrb;
486     frame->Xflags = flags;
487 nigel 87 frame->Xrdepth = rdepth;
488 nigel 77
489     /* This is where control jumps back to in order to effect "recursion" */
490    
491     HEAP_RECURSE:
492    
493     /* Macros make the argument variables come from the current frame */
494    
495     #define eptr frame->Xeptr
496     #define ecode frame->Xecode
497 ph10 168 #define mstart frame->Xmstart
498 nigel 77 #define offset_top frame->Xoffset_top
499     #define ims frame->Xims
500     #define eptrb frame->Xeptrb
501     #define flags frame->Xflags
502 nigel 87 #define rdepth frame->Xrdepth
503 nigel 77
504     /* Ditto for the local variables */
505    
506     #ifdef SUPPORT_UTF8
507     #define charptr frame->Xcharptr
508     #endif
509     #define callpat frame->Xcallpat
510 ph10 403 #define codelink frame->Xcodelink
511 nigel 77 #define data frame->Xdata
512     #define next frame->Xnext
513     #define pp frame->Xpp
514     #define prev frame->Xprev
515     #define saved_eptr frame->Xsaved_eptr
516    
517     #define new_recursive frame->Xnew_recursive
518    
519     #define cur_is_word frame->Xcur_is_word
520     #define condition frame->Xcondition
521     #define prev_is_word frame->Xprev_is_word
522    
523     #define original_ims frame->Xoriginal_ims
524    
525     #ifdef SUPPORT_UCP
526     #define prop_type frame->Xprop_type
527 nigel 87 #define prop_value frame->Xprop_value
528 nigel 77 #define prop_fail_result frame->Xprop_fail_result
529     #define prop_category frame->Xprop_category
530     #define prop_chartype frame->Xprop_chartype
531 nigel 87 #define prop_script frame->Xprop_script
532 ph10 115 #define oclength frame->Xoclength
533     #define occhars frame->Xocchars
534 nigel 77 #endif
535    
536     #define ctype frame->Xctype
537     #define fc frame->Xfc
538     #define fi frame->Xfi
539     #define length frame->Xlength
540     #define max frame->Xmax
541     #define min frame->Xmin
542     #define number frame->Xnumber
543     #define offset frame->Xoffset
544     #define op frame->Xop
545     #define save_capture_last frame->Xsave_capture_last
546     #define save_offset1 frame->Xsave_offset1
547     #define save_offset2 frame->Xsave_offset2
548     #define save_offset3 frame->Xsave_offset3
549     #define stacksave frame->Xstacksave
550    
551     #define newptrb frame->Xnewptrb
552    
553     /* When recursion is being used, local variables are allocated on the stack and
554     get preserved during recursion in the normal way. In this environment, fi and
555     i, and fc and c, can be the same variables. */
556    
557 nigel 93 #else /* NO_RECURSE not defined */
558 nigel 77 #define fi i
559     #define fc c
560    
561    
562 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
563     const uschar *charptr; /* in small blocks of the code. My normal */
564     #endif /* style of coding would have declared */
565     const uschar *callpat; /* them within each of those blocks. */
566     const uschar *data; /* However, in order to accommodate the */
567     const uschar *next; /* version of this code that uses an */
568     USPTR pp; /* external "stack" implemented on the */
569     const uschar *prev; /* heap, it is easier to declare them all */
570     USPTR saved_eptr; /* here, so the declarations can be cut */
571     /* out in a block. The only declarations */
572     recursion_info new_recursive; /* within blocks below are for variables */
573     /* that do not have to be preserved over */
574     BOOL cur_is_word; /* a recursive call to RMATCH(). */
575     BOOL condition;
576 nigel 77 BOOL prev_is_word;
577    
578     unsigned long int original_ims;
579    
580     #ifdef SUPPORT_UCP
581     int prop_type;
582 nigel 87 int prop_value;
583 nigel 77 int prop_fail_result;
584     int prop_category;
585     int prop_chartype;
586 nigel 87 int prop_script;
587 ph10 115 int oclength;
588     uschar occhars[8];
589 nigel 77 #endif
590    
591 ph10 399 int codelink;
592 nigel 77 int ctype;
593     int length;
594     int max;
595     int min;
596     int number;
597     int offset;
598     int op;
599     int save_capture_last;
600     int save_offset1, save_offset2, save_offset3;
601     int stacksave[REC_STACK_SAVE_MAX];
602    
603     eptrblock newptrb;
604 nigel 93 #endif /* NO_RECURSE */
605 nigel 77
606     /* These statements are here to stop the compiler complaining about uninitialized
607     variables. */
608    
609     #ifdef SUPPORT_UCP
610 nigel 87 prop_value = 0;
611 nigel 77 prop_fail_result = 0;
612     #endif
613    
614 nigel 93
615 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
616     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
617     used. Thanks to Ian Taylor for noticing this possibility and sending the
618     original patch. */
619    
620     TAIL_RECURSE:
621    
622 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
623     are specified by the macro RMATCH and RRETURN is used to return. When
624     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
625     and a "return", respectively (possibly with some debugging if DEBUG is
626     defined). However, RMATCH isn't like a function call because it's quite a
627     complicated macro. It has to be used in one particular way. This shouldn't,
628     however, impact performance when true recursion is being used. */
629 nigel 77
630 ph10 164 #ifdef SUPPORT_UTF8
631     utf8 = md->utf8; /* Local copy of the flag */
632     #else
633     utf8 = FALSE;
634     #endif
635    
636 nigel 87 /* First check that we haven't called match() too many times, or that we
637     haven't exceeded the recursive call limit. */
638    
639 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
640 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
641 nigel 77
642     original_ims = ims; /* Save for resetting on ')' */
643 nigel 91
644 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
645     string, the match_cbegroup flag is set. When this is the case, add the current
646     subject pointer to the chain of such remembered pointers, to be checked when we
647     hit the closing ket, in order to break infinite loops that match no characters.
648 ph10 197 When match() is called in other circumstances, don't add to the chain. The
649     match_cbegroup flag must NOT be used with tail recursion, because the memory
650     block that is used is on the stack, so a new one may be required for each
651     match(). */
652 nigel 77
653 nigel 93 if ((flags & match_cbegroup) != 0)
654 nigel 77 {
655 ph10 197 newptrb.epb_saved_eptr = eptr;
656     newptrb.epb_prev = eptrb;
657     eptrb = &newptrb;
658 nigel 77 }
659    
660 nigel 93 /* Now start processing the opcodes. */
661 nigel 77
662     for (;;)
663     {
664 nigel 93 minimize = possessive = FALSE;
665 nigel 77 op = *ecode;
666 ph10 428
667 nigel 93 switch(op)
668     {
669 ph10 210 case OP_FAIL:
670 ph10 212 RRETURN(MATCH_NOMATCH);
671 ph10 211
672 ph10 210 case OP_PRUNE:
673     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
674     ims, eptrb, flags, RM51);
675     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
676 ph10 212 RRETURN(MATCH_PRUNE);
677 ph10 211
678 ph10 210 case OP_COMMIT:
679     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
680     ims, eptrb, flags, RM52);
681     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
682 ph10 212 RRETURN(MATCH_COMMIT);
683 ph10 211
684 ph10 210 case OP_SKIP:
685     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
686     ims, eptrb, flags, RM53);
687     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
688 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
689 ph10 212 RRETURN(MATCH_SKIP);
690 ph10 211
691 ph10 210 case OP_THEN:
692     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
693 ph10 212 ims, eptrb, flags, RM54);
694 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
695 ph10 212 RRETURN(MATCH_THEN);
696 ph10 211
697 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
698     the current subject position in the working slot at the top of the vector.
699     We mustn't change the current values of the data slot, because they may be
700     set from a previous iteration of this group, and be referred to by a
701     reference inside the group.
702 nigel 77
703 nigel 93 If the bracket fails to match, we need to restore this value and also the
704     values of the final offsets, in case they were set by a previous iteration
705     of the same bracket.
706 nigel 77
707 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
708     a non-capturing bracket. Don't worry about setting the flag for the error
709     case here; that is handled in the code for KET. */
710 nigel 77
711 nigel 93 case OP_CBRA:
712     case OP_SCBRA:
713     number = GET2(ecode, 1+LINK_SIZE);
714 nigel 77 offset = number << 1;
715    
716     #ifdef DEBUG
717 nigel 93 printf("start bracket %d\n", number);
718     printf("subject=");
719 nigel 77 pchars(eptr, 16, TRUE, md);
720     printf("\n");
721     #endif
722    
723     if (offset < md->offset_max)
724     {
725     save_offset1 = md->offset_vector[offset];
726     save_offset2 = md->offset_vector[offset+1];
727     save_offset3 = md->offset_vector[md->offset_end - number];
728     save_capture_last = md->capture_last;
729    
730     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
731     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
732    
733 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
734 nigel 77 do
735     {
736 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
737     ims, eptrb, flags, RM1);
738 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
739 nigel 77 md->capture_last = save_capture_last;
740     ecode += GET(ecode, 1);
741     }
742     while (*ecode == OP_ALT);
743    
744     DPRINTF(("bracket %d failed\n", number));
745    
746     md->offset_vector[offset] = save_offset1;
747     md->offset_vector[offset+1] = save_offset2;
748     md->offset_vector[md->offset_end - number] = save_offset3;
749    
750     RRETURN(MATCH_NOMATCH);
751     }
752    
753 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
754     as a non-capturing bracket. */
755 nigel 77
756 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
757     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
758    
759 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
760 nigel 77
761 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
762     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
763    
764 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
765     final alternative within the brackets, we would return the result of a
766     recursive call to match() whatever happened. We can reduce stack usage by
767 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
768     is set.*/
769 nigel 77
770 nigel 93 case OP_BRA:
771     case OP_SBRA:
772     DPRINTF(("start non-capturing bracket\n"));
773     flags = (op >= OP_SBRA)? match_cbegroup : 0;
774 nigel 91 for (;;)
775 nigel 77 {
776 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
777 nigel 93 {
778 ph10 197 if (flags == 0) /* Not a possibly empty group */
779     {
780     ecode += _pcre_OP_lengths[*ecode];
781     DPRINTF(("bracket 0 tail recursion\n"));
782     goto TAIL_RECURSE;
783     }
784    
785     /* Possibly empty group; can't use tail recursion. */
786    
787     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
788     eptrb, flags, RM48);
789     RRETURN(rrc);
790 nigel 93 }
791 nigel 91
792     /* For non-final alternatives, continue the loop for a NOMATCH result;
793     otherwise return. */
794    
795 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
796     eptrb, flags, RM2);
797 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
798 nigel 77 ecode += GET(ecode, 1);
799     }
800 nigel 91 /* Control never reaches here. */
801 nigel 77
802     /* Conditional group: compilation checked that there are no more than
803     two branches. If the condition is false, skipping the first branch takes us
804     past the end if there is only one branch, but that's OK because that is
805 nigel 91 exactly what going to the ket would do. As there is only one branch to be
806     obeyed, we can use tail recursion to avoid using another stack frame. */
807 nigel 77
808     case OP_COND:
809 nigel 93 case OP_SCOND:
810 ph10 399 codelink= GET(ecode, 1);
811 ph10 406
812 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
813     inserted between OP_COND and an assertion condition. */
814 ph10 392
815 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
816     {
817     if (pcre_callout != NULL)
818     {
819     pcre_callout_block cb;
820     cb.version = 1; /* Version 1 of the callout block */
821     cb.callout_number = ecode[LINK_SIZE+2];
822     cb.offset_vector = md->offset_vector;
823     cb.subject = (PCRE_SPTR)md->start_subject;
824     cb.subject_length = md->end_subject - md->start_subject;
825     cb.start_match = mstart - md->start_subject;
826     cb.current_position = eptr - md->start_subject;
827     cb.pattern_position = GET(ecode, LINK_SIZE + 3);
828     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
829     cb.capture_top = offset_top/2;
830     cb.capture_last = md->capture_last;
831     cb.callout_data = md->callout_data;
832     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
833     if (rrc < 0) RRETURN(rrc);
834     }
835     ecode += _pcre_OP_lengths[OP_CALLOUT];
836     }
837 ph10 392
838 ph10 399 condcode = ecode[LINK_SIZE+1];
839 ph10 406
840 ph10 381 /* Now see what the actual condition is */
841 ph10 392
842 ph10 399 if (condcode == OP_RREF) /* Recursion test */
843 nigel 77 {
844 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
845     condition = md->recursive != NULL &&
846     (offset == RREF_ANY || offset == md->recursive->group_num);
847     ecode += condition? 3 : GET(ecode, 1);
848     }
849    
850 ph10 399 else if (condcode == OP_CREF) /* Group used test */
851 nigel 93 {
852 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
853 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
854     ecode += condition? 3 : GET(ecode, 1);
855 nigel 77 }
856    
857 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
858 nigel 93 {
859     condition = FALSE;
860     ecode += GET(ecode, 1);
861     }
862    
863 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
864 nigel 93 the final argument match_condassert causes it to stop at the end of an
865     assertion. */
866 nigel 77
867     else
868     {
869 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
870     match_condassert, RM3);
871 nigel 77 if (rrc == MATCH_MATCH)
872     {
873 nigel 93 condition = TRUE;
874     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
875 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
876     }
877 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
878 nigel 77 {
879     RRETURN(rrc); /* Need braces because of following else */
880     }
881 nigel 93 else
882     {
883     condition = FALSE;
884 ph10 399 ecode += codelink;
885 nigel 93 }
886     }
887 nigel 91
888 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
889 ph10 197 we can use tail recursion to avoid using another stack frame, except when
890     match_cbegroup is required for an unlimited repeat of a possibly empty
891     group. If the second alternative doesn't exist, we can just plough on. */
892 nigel 91
893 nigel 93 if (condition || *ecode == OP_ALT)
894     {
895 nigel 91 ecode += 1 + LINK_SIZE;
896 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
897     {
898     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
899     RRETURN(rrc);
900     }
901     else /* Group must match something */
902     {
903     flags = 0;
904     goto TAIL_RECURSE;
905     }
906 nigel 77 }
907 ph10 395 else /* Condition false & no alternative */
908 nigel 93 {
909     ecode += 1 + LINK_SIZE;
910     }
911     break;
912 nigel 77
913    
914 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
915     recursion, we should restore the offsets appropriately and continue from
916     after the call. */
917 nigel 77
918 ph10 210 case OP_ACCEPT:
919 nigel 77 case OP_END:
920     if (md->recursive != NULL && md->recursive->group_num == 0)
921     {
922     recursion_info *rec = md->recursive;
923 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
924 nigel 77 md->recursive = rec->prevrec;
925     memmove(md->offset_vector, rec->offset_save,
926     rec->saved_max * sizeof(int));
927 ph10 168 mstart = rec->save_start;
928 nigel 77 ims = original_ims;
929     ecode = rec->after_call;
930     break;
931     }
932    
933     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
934     string - backtracking will then try other alternatives, if any. */
935    
936 ph10 168 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
937     md->end_match_ptr = eptr; /* Record where we ended */
938     md->end_offset_top = offset_top; /* and how many extracts were taken */
939 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
940 nigel 77 RRETURN(MATCH_MATCH);
941    
942     /* Change option settings */
943    
944     case OP_OPT:
945     ims = ecode[1];
946     ecode += 2;
947     DPRINTF(("ims set to %02lx\n", ims));
948     break;
949    
950     /* Assertion brackets. Check the alternative branches in turn - the
951     matching won't pass the KET for an assertion. If any one branch matches,
952     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
953     start of each branch to move the current point backwards, so the code at
954     this level is identical to the lookahead case. */
955    
956     case OP_ASSERT:
957     case OP_ASSERTBACK:
958     do
959     {
960 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
961     RM4);
962 nigel 77 if (rrc == MATCH_MATCH) break;
963 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
964 nigel 77 ecode += GET(ecode, 1);
965     }
966     while (*ecode == OP_ALT);
967     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
968    
969     /* If checking an assertion for a condition, return MATCH_MATCH. */
970    
971     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
972    
973     /* Continue from after the assertion, updating the offsets high water
974     mark, since extracts may have been taken during the assertion. */
975    
976     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
977     ecode += 1 + LINK_SIZE;
978     offset_top = md->end_offset_top;
979     continue;
980    
981     /* Negative assertion: all branches must fail to match */
982    
983     case OP_ASSERT_NOT:
984     case OP_ASSERTBACK_NOT:
985     do
986     {
987 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
988     RM5);
989 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
990 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
991 nigel 77 ecode += GET(ecode,1);
992     }
993     while (*ecode == OP_ALT);
994    
995     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
996    
997     ecode += 1 + LINK_SIZE;
998     continue;
999    
1000     /* Move the subject pointer back. This occurs only at the start of
1001     each branch of a lookbehind assertion. If we are too close to the start to
1002     move back, this match function fails. When working with UTF-8 we move
1003     back a number of characters, not bytes. */
1004    
1005     case OP_REVERSE:
1006     #ifdef SUPPORT_UTF8
1007     if (utf8)
1008     {
1009 nigel 93 i = GET(ecode, 1);
1010     while (i-- > 0)
1011 nigel 77 {
1012     eptr--;
1013     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1014 ph10 207 BACKCHAR(eptr);
1015 nigel 77 }
1016     }
1017     else
1018     #endif
1019    
1020     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1021    
1022     {
1023 nigel 93 eptr -= GET(ecode, 1);
1024 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1025     }
1026    
1027     /* Skip to next op code */
1028    
1029     ecode += 1 + LINK_SIZE;
1030     break;
1031    
1032     /* The callout item calls an external function, if one is provided, passing
1033     details of the match so far. This is mainly for debugging, though the
1034     function is able to force a failure. */
1035    
1036     case OP_CALLOUT:
1037     if (pcre_callout != NULL)
1038     {
1039     pcre_callout_block cb;
1040     cb.version = 1; /* Version 1 of the callout block */
1041     cb.callout_number = ecode[1];
1042     cb.offset_vector = md->offset_vector;
1043 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1044 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
1045 ph10 168 cb.start_match = mstart - md->start_subject;
1046 nigel 77 cb.current_position = eptr - md->start_subject;
1047     cb.pattern_position = GET(ecode, 2);
1048     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1049     cb.capture_top = offset_top/2;
1050     cb.capture_last = md->capture_last;
1051     cb.callout_data = md->callout_data;
1052     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1053     if (rrc < 0) RRETURN(rrc);
1054     }
1055     ecode += 2 + 2*LINK_SIZE;
1056     break;
1057    
1058     /* Recursion either matches the current regex, or some subexpression. The
1059     offset data is the offset to the starting bracket from the start of the
1060     whole pattern. (This is so that it works from duplicated subpatterns.)
1061    
1062     If there are any capturing brackets started but not finished, we have to
1063     save their starting points and reinstate them after the recursion. However,
1064     we don't know how many such there are (offset_top records the completed
1065     total) so we just have to save all the potential data. There may be up to
1066     65535 such values, which is too large to put on the stack, but using malloc
1067     for small numbers seems expensive. As a compromise, the stack is used when
1068     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1069     is used. A problem is what to do if the malloc fails ... there is no way of
1070     returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
1071     values on the stack, and accept that the rest may be wrong.
1072    
1073     There are also other values that have to be saved. We use a chained
1074     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1075     for the original version of this logic. */
1076    
1077     case OP_RECURSE:
1078     {
1079     callpat = md->start_code + GET(ecode, 1);
1080 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1081     GET2(callpat, 1 + LINK_SIZE);
1082 nigel 77
1083     /* Add to "recursing stack" */
1084    
1085     new_recursive.prevrec = md->recursive;
1086     md->recursive = &new_recursive;
1087    
1088     /* Find where to continue from afterwards */
1089    
1090     ecode += 1 + LINK_SIZE;
1091     new_recursive.after_call = ecode;
1092    
1093     /* Now save the offset data. */
1094    
1095     new_recursive.saved_max = md->offset_end;
1096     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1097     new_recursive.offset_save = stacksave;
1098     else
1099     {
1100     new_recursive.offset_save =
1101     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1102     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1103     }
1104    
1105     memcpy(new_recursive.offset_save, md->offset_vector,
1106     new_recursive.saved_max * sizeof(int));
1107 ph10 168 new_recursive.save_start = mstart;
1108     mstart = eptr;
1109 nigel 77
1110     /* OK, now we can do the recursion. For each top-level alternative we
1111     restore the offset and recursion data. */
1112    
1113     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1114 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1115 nigel 77 do
1116     {
1117 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1118     md, ims, eptrb, flags, RM6);
1119 nigel 77 if (rrc == MATCH_MATCH)
1120     {
1121 nigel 87 DPRINTF(("Recursion matched\n"));
1122 nigel 77 md->recursive = new_recursive.prevrec;
1123     if (new_recursive.offset_save != stacksave)
1124     (pcre_free)(new_recursive.offset_save);
1125     RRETURN(MATCH_MATCH);
1126     }
1127 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1128 nigel 87 {
1129     DPRINTF(("Recursion gave error %d\n", rrc));
1130 ph10 400 if (new_recursive.offset_save != stacksave)
1131     (pcre_free)(new_recursive.offset_save);
1132 nigel 87 RRETURN(rrc);
1133     }
1134 nigel 77
1135     md->recursive = &new_recursive;
1136     memcpy(md->offset_vector, new_recursive.offset_save,
1137     new_recursive.saved_max * sizeof(int));
1138     callpat += GET(callpat, 1);
1139     }
1140     while (*callpat == OP_ALT);
1141    
1142     DPRINTF(("Recursion didn't match\n"));
1143     md->recursive = new_recursive.prevrec;
1144     if (new_recursive.offset_save != stacksave)
1145     (pcre_free)(new_recursive.offset_save);
1146     RRETURN(MATCH_NOMATCH);
1147     }
1148     /* Control never reaches here */
1149    
1150     /* "Once" brackets are like assertion brackets except that after a match,
1151     the point in the subject string is not moved back. Thus there can never be
1152     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1153     Check the alternative branches in turn - the matching won't pass the KET
1154     for this kind of subpattern. If any one branch matches, we carry on as at
1155     the end of a normal bracket, leaving the subject pointer. */
1156    
1157     case OP_ONCE:
1158 nigel 91 prev = ecode;
1159     saved_eptr = eptr;
1160    
1161     do
1162 nigel 77 {
1163 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1164 nigel 91 if (rrc == MATCH_MATCH) break;
1165 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1166 nigel 91 ecode += GET(ecode,1);
1167     }
1168     while (*ecode == OP_ALT);
1169 nigel 77
1170 nigel 91 /* If hit the end of the group (which could be repeated), fail */
1171 nigel 77
1172 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1173 nigel 77
1174 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1175     mark, since extracts may have been taken. */
1176 nigel 77
1177 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1178 nigel 77
1179 nigel 91 offset_top = md->end_offset_top;
1180     eptr = md->end_match_ptr;
1181 nigel 77
1182 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1183     happens for a repeating ket if no characters were matched in the group.
1184     This is the forcible breaking of infinite loops as implemented in Perl
1185     5.005. If there is an options reset, it will get obeyed in the normal
1186     course of events. */
1187 nigel 77
1188 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1189     {
1190     ecode += 1+LINK_SIZE;
1191     break;
1192     }
1193 nigel 77
1194 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1195     preceding bracket, in the appropriate order. The second "call" of match()
1196     uses tail recursion, to avoid using another stack frame. We need to reset
1197     any options that changed within the bracket before re-running it, so
1198     check the next opcode. */
1199 nigel 77
1200 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1201     {
1202     ims = (ims & ~PCRE_IMS) | ecode[4];
1203     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1204     }
1205 nigel 77
1206 nigel 91 if (*ecode == OP_KETRMIN)
1207     {
1208 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1209 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1210     ecode = prev;
1211 ph10 197 flags = 0;
1212 nigel 91 goto TAIL_RECURSE;
1213 nigel 77 }
1214 nigel 91 else /* OP_KETRMAX */
1215     {
1216 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1217 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1218     ecode += 1 + LINK_SIZE;
1219 ph10 197 flags = 0;
1220 nigel 91 goto TAIL_RECURSE;
1221     }
1222     /* Control never gets here */
1223 nigel 77
1224     /* An alternation is the end of a branch; scan along to find the end of the
1225     bracketed group and go to there. */
1226    
1227     case OP_ALT:
1228     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1229     break;
1230    
1231 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1232     indicating that it may occur zero times. It may repeat infinitely, or not
1233     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1234     with fixed upper repeat limits are compiled as a number of copies, with the
1235     optional ones preceded by BRAZERO or BRAMINZERO. */
1236 nigel 77
1237     case OP_BRAZERO:
1238     {
1239     next = ecode+1;
1240 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1241 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1242     do next += GET(next,1); while (*next == OP_ALT);
1243 nigel 93 ecode = next + 1 + LINK_SIZE;
1244 nigel 77 }
1245     break;
1246    
1247     case OP_BRAMINZERO:
1248     {
1249     next = ecode+1;
1250 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1251 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1252 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1253     ecode++;
1254     }
1255     break;
1256    
1257 ph10 335 case OP_SKIPZERO:
1258     {
1259     next = ecode+1;
1260     do next += GET(next,1); while (*next == OP_ALT);
1261     ecode = next + 1 + LINK_SIZE;
1262     }
1263     break;
1264    
1265 nigel 93 /* End of a group, repeated or non-repeating. */
1266 nigel 77
1267     case OP_KET:
1268     case OP_KETRMIN:
1269     case OP_KETRMAX:
1270 nigel 91 prev = ecode - GET(ecode, 1);
1271 nigel 77
1272 nigel 93 /* If this was a group that remembered the subject start, in order to break
1273     infinite repeats of empty string matches, retrieve the subject start from
1274     the chain. Otherwise, set it NULL. */
1275 nigel 77
1276 nigel 93 if (*prev >= OP_SBRA)
1277     {
1278     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1279     eptrb = eptrb->epb_prev; /* Backup to previous group */
1280     }
1281     else saved_eptr = NULL;
1282 nigel 77
1283 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1284     MATCH_MATCH, but record the current high water mark for use by positive
1285     assertions. Do this also for the "once" (atomic) groups. */
1286    
1287 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1288     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1289     *prev == OP_ONCE)
1290     {
1291     md->end_match_ptr = eptr; /* For ONCE */
1292     md->end_offset_top = offset_top;
1293     RRETURN(MATCH_MATCH);
1294     }
1295 nigel 77
1296 nigel 93 /* For capturing groups we have to check the group number back at the start
1297     and if necessary complete handling an extraction by setting the offsets and
1298     bumping the high water mark. Note that whole-pattern recursion is coded as
1299     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1300     when the OP_END is reached. Other recursion is handled here. */
1301 nigel 77
1302 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1303 nigel 91 {
1304 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1305 nigel 91 offset = number << 1;
1306 nigel 77
1307     #ifdef DEBUG
1308 nigel 91 printf("end bracket %d", number);
1309     printf("\n");
1310 nigel 77 #endif
1311    
1312 nigel 93 md->capture_last = number;
1313     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1314 nigel 91 {
1315 nigel 93 md->offset_vector[offset] =
1316     md->offset_vector[md->offset_end - number];
1317     md->offset_vector[offset+1] = eptr - md->start_subject;
1318     if (offset_top <= offset) offset_top = offset + 2;
1319     }
1320 nigel 77
1321 nigel 93 /* Handle a recursively called group. Restore the offsets
1322     appropriately and continue from after the call. */
1323 nigel 77
1324 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1325     {
1326     recursion_info *rec = md->recursive;
1327     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1328     md->recursive = rec->prevrec;
1329 ph10 168 mstart = rec->save_start;
1330 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1331     rec->saved_max * sizeof(int));
1332     ecode = rec->after_call;
1333     ims = original_ims;
1334     break;
1335 nigel 77 }
1336 nigel 91 }
1337 nigel 77
1338 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1339     flags, in case they got changed during the group. */
1340 nigel 77
1341 nigel 91 ims = original_ims;
1342     DPRINTF(("ims reset to %02lx\n", ims));
1343 nigel 77
1344 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1345     happens for a repeating ket if no characters were matched in the group.
1346     This is the forcible breaking of infinite loops as implemented in Perl
1347     5.005. If there is an options reset, it will get obeyed in the normal
1348     course of events. */
1349 nigel 77
1350 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1351     {
1352     ecode += 1 + LINK_SIZE;
1353     break;
1354     }
1355 nigel 77
1356 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1357     preceding bracket, in the appropriate order. In the second case, we can use
1358 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1359     unlimited repeat of a group that can match an empty string. */
1360 nigel 77
1361 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1362    
1363 nigel 91 if (*ecode == OP_KETRMIN)
1364     {
1365 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1366 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1367 ph10 197 if (flags != 0) /* Could match an empty string */
1368     {
1369     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1370     RRETURN(rrc);
1371     }
1372 nigel 91 ecode = prev;
1373     goto TAIL_RECURSE;
1374 nigel 77 }
1375 nigel 91 else /* OP_KETRMAX */
1376     {
1377 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1378 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1379     ecode += 1 + LINK_SIZE;
1380 ph10 197 flags = 0;
1381 nigel 91 goto TAIL_RECURSE;
1382     }
1383     /* Control never gets here */
1384 nigel 77
1385     /* Start of subject unless notbol, or after internal newline if multiline */
1386    
1387     case OP_CIRC:
1388     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1389     if ((ims & PCRE_MULTILINE) != 0)
1390     {
1391 nigel 91 if (eptr != md->start_subject &&
1392 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1393 nigel 77 RRETURN(MATCH_NOMATCH);
1394     ecode++;
1395     break;
1396     }
1397     /* ... else fall through */
1398    
1399     /* Start of subject assertion */
1400    
1401     case OP_SOD:
1402     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1403     ecode++;
1404     break;
1405    
1406     /* Start of match assertion */
1407    
1408     case OP_SOM:
1409     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1410     ecode++;
1411     break;
1412 ph10 172
1413 ph10 168 /* Reset the start of match point */
1414 ph10 172
1415 ph10 168 case OP_SET_SOM:
1416     mstart = eptr;
1417 ph10 172 ecode++;
1418     break;
1419 nigel 77
1420     /* Assert before internal newline if multiline, or before a terminating
1421     newline unless endonly is set, else end of subject unless noteol is set. */
1422    
1423     case OP_DOLL:
1424     if ((ims & PCRE_MULTILINE) != 0)
1425     {
1426     if (eptr < md->end_subject)
1427 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1428 nigel 77 else
1429     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1430     ecode++;
1431     break;
1432     }
1433     else
1434     {
1435     if (md->noteol) RRETURN(MATCH_NOMATCH);
1436     if (!md->endonly)
1437     {
1438 nigel 91 if (eptr != md->end_subject &&
1439 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1440 nigel 77 RRETURN(MATCH_NOMATCH);
1441     ecode++;
1442     break;
1443     }
1444     }
1445 nigel 91 /* ... else fall through for endonly */
1446 nigel 77
1447     /* End of subject assertion (\z) */
1448    
1449     case OP_EOD:
1450     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1451     ecode++;
1452     break;
1453    
1454     /* End of subject or ending \n assertion (\Z) */
1455    
1456     case OP_EODN:
1457 nigel 91 if (eptr != md->end_subject &&
1458 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1459 nigel 91 RRETURN(MATCH_NOMATCH);
1460 nigel 77 ecode++;
1461     break;
1462    
1463     /* Word boundary assertions */
1464    
1465     case OP_NOT_WORD_BOUNDARY:
1466     case OP_WORD_BOUNDARY:
1467     {
1468    
1469     /* Find out if the previous and current characters are "word" characters.
1470     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1471     be "non-word" characters. */
1472    
1473     #ifdef SUPPORT_UTF8
1474     if (utf8)
1475     {
1476     if (eptr == md->start_subject) prev_is_word = FALSE; else
1477     {
1478 ph10 409 USPTR lastptr = eptr - 1;
1479 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1480     GETCHAR(c, lastptr);
1481     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1482     }
1483 ph10 428 if (eptr >= md->end_subject)
1484 nigel 77 {
1485 ph10 428 SCHECK_PARTIAL();
1486     cur_is_word = FALSE;
1487     }
1488     else
1489     {
1490 nigel 77 GETCHAR(c, eptr);
1491     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1492     }
1493     }
1494     else
1495     #endif
1496    
1497 ph10 428 /* Not in UTF-8 mode */
1498 nigel 77
1499     {
1500     prev_is_word = (eptr != md->start_subject) &&
1501     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1502 ph10 428 if (eptr >= md->end_subject)
1503     {
1504     SCHECK_PARTIAL();
1505     cur_is_word = FALSE;
1506     }
1507     else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1508 nigel 77 }
1509    
1510     /* Now see if the situation is what we want */
1511    
1512     if ((*ecode++ == OP_WORD_BOUNDARY)?
1513     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1514     RRETURN(MATCH_NOMATCH);
1515     }
1516     break;
1517    
1518     /* Match a single character type; inline for speed */
1519    
1520     case OP_ANY:
1521 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1522 ph10 345 /* Fall through */
1523    
1524 ph10 341 case OP_ALLANY:
1525 ph10 428 if (eptr++ >= md->end_subject)
1526     {
1527     SCHECK_PARTIAL();
1528     RRETURN(MATCH_NOMATCH);
1529     }
1530 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1531 nigel 77 ecode++;
1532     break;
1533    
1534     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1535     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1536    
1537     case OP_ANYBYTE:
1538 ph10 428 if (eptr++ >= md->end_subject)
1539     {
1540     SCHECK_PARTIAL();
1541     RRETURN(MATCH_NOMATCH);
1542     }
1543 nigel 77 ecode++;
1544     break;
1545    
1546     case OP_NOT_DIGIT:
1547 ph10 428 if (eptr >= md->end_subject)
1548     {
1549     SCHECK_PARTIAL();
1550     RRETURN(MATCH_NOMATCH);
1551     }
1552 nigel 77 GETCHARINCTEST(c, eptr);
1553     if (
1554     #ifdef SUPPORT_UTF8
1555     c < 256 &&
1556     #endif
1557     (md->ctypes[c] & ctype_digit) != 0
1558     )
1559     RRETURN(MATCH_NOMATCH);
1560     ecode++;
1561     break;
1562    
1563     case OP_DIGIT:
1564 ph10 428 if (eptr >= md->end_subject)
1565     {
1566     SCHECK_PARTIAL();
1567     RRETURN(MATCH_NOMATCH);
1568     }
1569 nigel 77 GETCHARINCTEST(c, eptr);
1570     if (
1571     #ifdef SUPPORT_UTF8
1572     c >= 256 ||
1573     #endif
1574     (md->ctypes[c] & ctype_digit) == 0
1575     )
1576     RRETURN(MATCH_NOMATCH);
1577     ecode++;
1578     break;
1579    
1580     case OP_NOT_WHITESPACE:
1581 ph10 428 if (eptr >= md->end_subject)
1582     {
1583     SCHECK_PARTIAL();
1584     RRETURN(MATCH_NOMATCH);
1585     }
1586 nigel 77 GETCHARINCTEST(c, eptr);
1587     if (
1588     #ifdef SUPPORT_UTF8
1589     c < 256 &&
1590     #endif
1591     (md->ctypes[c] & ctype_space) != 0
1592     )
1593     RRETURN(MATCH_NOMATCH);
1594     ecode++;
1595     break;
1596    
1597     case OP_WHITESPACE:
1598 ph10 428 if (eptr >= md->end_subject)
1599     {
1600     SCHECK_PARTIAL();
1601     RRETURN(MATCH_NOMATCH);
1602     }
1603 nigel 77 GETCHARINCTEST(c, eptr);
1604     if (
1605     #ifdef SUPPORT_UTF8
1606     c >= 256 ||
1607     #endif
1608     (md->ctypes[c] & ctype_space) == 0
1609     )
1610     RRETURN(MATCH_NOMATCH);
1611     ecode++;
1612     break;
1613    
1614     case OP_NOT_WORDCHAR:
1615 ph10 428 if (eptr >= md->end_subject)
1616     {
1617     SCHECK_PARTIAL();
1618     RRETURN(MATCH_NOMATCH);
1619     }
1620 nigel 77 GETCHARINCTEST(c, eptr);
1621     if (
1622     #ifdef SUPPORT_UTF8
1623     c < 256 &&
1624     #endif
1625     (md->ctypes[c] & ctype_word) != 0
1626     )
1627     RRETURN(MATCH_NOMATCH);
1628     ecode++;
1629     break;
1630    
1631     case OP_WORDCHAR:
1632 ph10 428 if (eptr >= md->end_subject)
1633     {
1634     SCHECK_PARTIAL();
1635     RRETURN(MATCH_NOMATCH);
1636     }
1637 nigel 77 GETCHARINCTEST(c, eptr);
1638     if (
1639     #ifdef SUPPORT_UTF8
1640     c >= 256 ||
1641     #endif
1642     (md->ctypes[c] & ctype_word) == 0
1643     )
1644     RRETURN(MATCH_NOMATCH);
1645     ecode++;
1646     break;
1647    
1648 nigel 93 case OP_ANYNL:
1649 ph10 428 if (eptr >= md->end_subject)
1650     {
1651     SCHECK_PARTIAL();
1652     RRETURN(MATCH_NOMATCH);
1653     }
1654 nigel 93 GETCHARINCTEST(c, eptr);
1655     switch(c)
1656     {
1657     default: RRETURN(MATCH_NOMATCH);
1658     case 0x000d:
1659     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1660     break;
1661 ph10 231
1662 nigel 93 case 0x000a:
1663 ph10 231 break;
1664    
1665 nigel 93 case 0x000b:
1666     case 0x000c:
1667     case 0x0085:
1668     case 0x2028:
1669     case 0x2029:
1670 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1671 nigel 93 break;
1672     }
1673     ecode++;
1674     break;
1675    
1676 ph10 178 case OP_NOT_HSPACE:
1677 ph10 428 if (eptr >= md->end_subject)
1678     {
1679     SCHECK_PARTIAL();
1680     RRETURN(MATCH_NOMATCH);
1681     }
1682 ph10 178 GETCHARINCTEST(c, eptr);
1683     switch(c)
1684     {
1685     default: break;
1686     case 0x09: /* HT */
1687     case 0x20: /* SPACE */
1688     case 0xa0: /* NBSP */
1689     case 0x1680: /* OGHAM SPACE MARK */
1690     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1691     case 0x2000: /* EN QUAD */
1692     case 0x2001: /* EM QUAD */
1693     case 0x2002: /* EN SPACE */
1694     case 0x2003: /* EM SPACE */
1695     case 0x2004: /* THREE-PER-EM SPACE */
1696     case 0x2005: /* FOUR-PER-EM SPACE */
1697     case 0x2006: /* SIX-PER-EM SPACE */
1698     case 0x2007: /* FIGURE SPACE */
1699     case 0x2008: /* PUNCTUATION SPACE */
1700     case 0x2009: /* THIN SPACE */
1701     case 0x200A: /* HAIR SPACE */
1702     case 0x202f: /* NARROW NO-BREAK SPACE */
1703     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1704     case 0x3000: /* IDEOGRAPHIC SPACE */
1705     RRETURN(MATCH_NOMATCH);
1706     }
1707     ecode++;
1708     break;
1709    
1710     case OP_HSPACE:
1711 ph10 428 if (eptr >= md->end_subject)
1712     {
1713     SCHECK_PARTIAL();
1714     RRETURN(MATCH_NOMATCH);
1715     }
1716 ph10 178 GETCHARINCTEST(c, eptr);
1717     switch(c)
1718     {
1719     default: RRETURN(MATCH_NOMATCH);
1720     case 0x09: /* HT */
1721     case 0x20: /* SPACE */
1722     case 0xa0: /* NBSP */
1723     case 0x1680: /* OGHAM SPACE MARK */
1724     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1725     case 0x2000: /* EN QUAD */
1726     case 0x2001: /* EM QUAD */
1727     case 0x2002: /* EN SPACE */
1728     case 0x2003: /* EM SPACE */
1729     case 0x2004: /* THREE-PER-EM SPACE */
1730     case 0x2005: /* FOUR-PER-EM SPACE */
1731     case 0x2006: /* SIX-PER-EM SPACE */
1732     case 0x2007: /* FIGURE SPACE */
1733     case 0x2008: /* PUNCTUATION SPACE */
1734     case 0x2009: /* THIN SPACE */
1735     case 0x200A: /* HAIR SPACE */
1736     case 0x202f: /* NARROW NO-BREAK SPACE */
1737     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1738     case 0x3000: /* IDEOGRAPHIC SPACE */
1739     break;
1740     }
1741     ecode++;
1742     break;
1743    
1744     case OP_NOT_VSPACE:
1745 ph10 428 if (eptr >= md->end_subject)
1746     {
1747     SCHECK_PARTIAL();
1748     RRETURN(MATCH_NOMATCH);
1749     }
1750 ph10 178 GETCHARINCTEST(c, eptr);
1751     switch(c)
1752     {
1753     default: break;
1754     case 0x0a: /* LF */
1755     case 0x0b: /* VT */
1756     case 0x0c: /* FF */
1757     case 0x0d: /* CR */
1758     case 0x85: /* NEL */
1759     case 0x2028: /* LINE SEPARATOR */
1760     case 0x2029: /* PARAGRAPH SEPARATOR */
1761     RRETURN(MATCH_NOMATCH);
1762     }
1763     ecode++;
1764     break;
1765    
1766     case OP_VSPACE:
1767 ph10 428 if (eptr >= md->end_subject)
1768     {
1769     SCHECK_PARTIAL();
1770     RRETURN(MATCH_NOMATCH);
1771     }
1772 ph10 178 GETCHARINCTEST(c, eptr);
1773     switch(c)
1774     {
1775     default: RRETURN(MATCH_NOMATCH);
1776     case 0x0a: /* LF */
1777     case 0x0b: /* VT */
1778     case 0x0c: /* FF */
1779     case 0x0d: /* CR */
1780     case 0x85: /* NEL */
1781     case 0x2028: /* LINE SEPARATOR */
1782     case 0x2029: /* PARAGRAPH SEPARATOR */
1783     break;
1784     }
1785     ecode++;
1786     break;
1787    
1788 nigel 77 #ifdef SUPPORT_UCP
1789     /* Check the next character by Unicode property. We will get here only
1790     if the support is in the binary; otherwise a compile-time error occurs. */
1791    
1792     case OP_PROP:
1793     case OP_NOTPROP:
1794 ph10 428 if (eptr >= md->end_subject)
1795     {
1796     SCHECK_PARTIAL();
1797     RRETURN(MATCH_NOMATCH);
1798     }
1799 nigel 77 GETCHARINCTEST(c, eptr);
1800     {
1801 ph10 384 const ucd_record *prop = GET_UCD(c);
1802 nigel 77
1803 nigel 87 switch(ecode[1])
1804     {
1805     case PT_ANY:
1806     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1807     break;
1808 nigel 77
1809 nigel 87 case PT_LAMP:
1810 ph10 349 if ((prop->chartype == ucp_Lu ||
1811     prop->chartype == ucp_Ll ||
1812     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1813 nigel 77 RRETURN(MATCH_NOMATCH);
1814 nigel 87 break;
1815    
1816     case PT_GC:
1817 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1818 nigel 77 RRETURN(MATCH_NOMATCH);
1819 nigel 87 break;
1820    
1821     case PT_PC:
1822 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1823 nigel 87 RRETURN(MATCH_NOMATCH);
1824     break;
1825    
1826     case PT_SC:
1827 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
1828 nigel 87 RRETURN(MATCH_NOMATCH);
1829     break;
1830    
1831     default:
1832     RRETURN(PCRE_ERROR_INTERNAL);
1833 nigel 77 }
1834 nigel 87
1835     ecode += 3;
1836 nigel 77 }
1837     break;
1838    
1839     /* Match an extended Unicode sequence. We will get here only if the support
1840     is in the binary; otherwise a compile-time error occurs. */
1841    
1842     case OP_EXTUNI:
1843 ph10 428 if (eptr >= md->end_subject)
1844     {
1845     SCHECK_PARTIAL();
1846     RRETURN(MATCH_NOMATCH);
1847     }
1848 nigel 77 GETCHARINCTEST(c, eptr);
1849     {
1850 ph10 349 int category = UCD_CATEGORY(c);
1851 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1852     while (eptr < md->end_subject)
1853     {
1854     int len = 1;
1855     if (!utf8) c = *eptr; else
1856     {
1857     GETCHARLEN(c, eptr, len);
1858     }
1859 ph10 349 category = UCD_CATEGORY(c);
1860 nigel 77 if (category != ucp_M) break;
1861     eptr += len;
1862     }
1863     }
1864     ecode++;
1865     break;
1866     #endif
1867    
1868    
1869     /* Match a back reference, possibly repeatedly. Look past the end of the
1870     item to see if there is repeat information following. The code is similar
1871     to that for character classes, but repeated for efficiency. Then obey
1872     similar code to character type repeats - written out again for speed.
1873     However, if the referenced string is the empty string, always treat
1874     it as matched, any number of times (otherwise there could be infinite
1875     loops). */
1876    
1877     case OP_REF:
1878     {
1879     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1880 ph10 345 ecode += 3;
1881    
1882 ph10 336 /* If the reference is unset, there are two possibilities:
1883 ph10 345
1884 ph10 336 (a) In the default, Perl-compatible state, set the length to be longer
1885     than the amount of subject left; this ensures that every attempt at a
1886     match fails. We can't just fail here, because of the possibility of
1887     quantifiers with zero minima.
1888 ph10 345
1889     (b) If the JavaScript compatibility flag is set, set the length to zero
1890     so that the back reference matches an empty string.
1891    
1892     Otherwise, set the length to the length of what was matched by the
1893 ph10 336 referenced subpattern. */
1894 ph10 345
1895 ph10 336 if (offset >= offset_top || md->offset_vector[offset] < 0)
1896 ph10 345 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1897 ph10 336 else
1898     length = md->offset_vector[offset+1] - md->offset_vector[offset];
1899 nigel 77
1900     /* Set up for repetition, or handle the non-repeated case */
1901    
1902     switch (*ecode)
1903     {
1904     case OP_CRSTAR:
1905     case OP_CRMINSTAR:
1906     case OP_CRPLUS:
1907     case OP_CRMINPLUS:
1908     case OP_CRQUERY:
1909     case OP_CRMINQUERY:
1910     c = *ecode++ - OP_CRSTAR;
1911     minimize = (c & 1) != 0;
1912     min = rep_min[c]; /* Pick up values from tables; */
1913     max = rep_max[c]; /* zero for max => infinity */
1914     if (max == 0) max = INT_MAX;
1915     break;
1916    
1917     case OP_CRRANGE:
1918     case OP_CRMINRANGE:
1919     minimize = (*ecode == OP_CRMINRANGE);
1920     min = GET2(ecode, 1);
1921     max = GET2(ecode, 3);
1922     if (max == 0) max = INT_MAX;
1923     ecode += 5;
1924     break;
1925    
1926     default: /* No repeat follows */
1927 ph10 428 if (!match_ref(offset, eptr, length, md, ims))
1928     {
1929     CHECK_PARTIAL();
1930     RRETURN(MATCH_NOMATCH);
1931     }
1932 nigel 77 eptr += length;
1933     continue; /* With the main loop */
1934     }
1935    
1936     /* If the length of the reference is zero, just continue with the
1937     main loop. */
1938 ph10 428
1939 nigel 77 if (length == 0) continue;
1940    
1941     /* First, ensure the minimum number of matches are present. We get back
1942     the length of the reference string explicitly rather than passing the
1943     address of eptr, so that eptr can be a register variable. */
1944    
1945     for (i = 1; i <= min; i++)
1946     {
1947 ph10 427 if (!match_ref(offset, eptr, length, md, ims))
1948 ph10 426 {
1949 ph10 427 CHECK_PARTIAL();
1950 ph10 426 RRETURN(MATCH_NOMATCH);
1951 ph10 427 }
1952 nigel 77 eptr += length;
1953     }
1954    
1955     /* If min = max, continue at the same level without recursion.
1956     They are not both allowed to be zero. */
1957    
1958     if (min == max) continue;
1959    
1960     /* If minimizing, keep trying and advancing the pointer */
1961    
1962     if (minimize)
1963     {
1964     for (fi = min;; fi++)
1965     {
1966 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1967 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1968 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
1969     if (!match_ref(offset, eptr, length, md, ims))
1970 ph10 426 {
1971 ph10 427 CHECK_PARTIAL();
1972 nigel 77 RRETURN(MATCH_NOMATCH);
1973 ph10 427 }
1974 nigel 77 eptr += length;
1975     }
1976     /* Control never gets here */
1977     }
1978    
1979     /* If maximizing, find the longest string and work backwards */
1980    
1981     else
1982     {
1983     pp = eptr;
1984     for (i = min; i < max; i++)
1985     {
1986     if (!match_ref(offset, eptr, length, md, ims)) break;
1987     eptr += length;
1988     }
1989     while (eptr >= pp)
1990     {
1991 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1992 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1993     eptr -= length;
1994     }
1995     RRETURN(MATCH_NOMATCH);
1996     }
1997     }
1998     /* Control never gets here */
1999    
2000     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2001     used when all the characters in the class have values in the range 0-255,
2002     and either the matching is caseful, or the characters are in the range
2003     0-127 when UTF-8 processing is enabled. The only difference between
2004     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2005     encountered.
2006    
2007     First, look past the end of the item to see if there is repeat information
2008     following. Then obey similar code to character type repeats - written out
2009     again for speed. */
2010    
2011     case OP_NCLASS:
2012     case OP_CLASS:
2013     {
2014     data = ecode + 1; /* Save for matching */
2015     ecode += 33; /* Advance past the item */
2016    
2017     switch (*ecode)
2018     {
2019     case OP_CRSTAR:
2020     case OP_CRMINSTAR:
2021     case OP_CRPLUS:
2022     case OP_CRMINPLUS:
2023     case OP_CRQUERY:
2024     case OP_CRMINQUERY:
2025     c = *ecode++ - OP_CRSTAR;
2026     minimize = (c & 1) != 0;
2027     min = rep_min[c]; /* Pick up values from tables; */
2028     max = rep_max[c]; /* zero for max => infinity */
2029     if (max == 0) max = INT_MAX;
2030     break;
2031    
2032     case OP_CRRANGE:
2033     case OP_CRMINRANGE:
2034     minimize = (*ecode == OP_CRMINRANGE);
2035     min = GET2(ecode, 1);
2036     max = GET2(ecode, 3);
2037     if (max == 0) max = INT_MAX;
2038     ecode += 5;
2039     break;
2040    
2041     default: /* No repeat follows */
2042     min = max = 1;
2043     break;
2044     }
2045    
2046     /* First, ensure the minimum number of matches are present. */
2047    
2048     #ifdef SUPPORT_UTF8
2049     /* UTF-8 mode */
2050     if (utf8)
2051     {
2052     for (i = 1; i <= min; i++)
2053     {
2054 ph10 427 if (eptr >= md->end_subject)
2055 ph10 426 {
2056 ph10 428 SCHECK_PARTIAL();
2057 ph10 426 RRETURN(MATCH_NOMATCH);
2058 ph10 427 }
2059 nigel 77 GETCHARINC(c, eptr);
2060     if (c > 255)
2061     {
2062     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2063     }
2064     else
2065     {
2066     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2067     }
2068     }
2069     }
2070     else
2071     #endif
2072     /* Not UTF-8 mode */
2073     {
2074     for (i = 1; i <= min; i++)
2075     {
2076 ph10 427 if (eptr >= md->end_subject)
2077 ph10 426 {
2078 ph10 428 SCHECK_PARTIAL();
2079 ph10 426 RRETURN(MATCH_NOMATCH);
2080 ph10 427 }
2081 nigel 77 c = *eptr++;
2082     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2083     }
2084     }
2085    
2086     /* If max == min we can continue with the main loop without the
2087     need to recurse. */
2088    
2089     if (min == max) continue;
2090    
2091     /* If minimizing, keep testing the rest of the expression and advancing
2092     the pointer while it matches the class. */
2093    
2094     if (minimize)
2095     {
2096     #ifdef SUPPORT_UTF8
2097     /* UTF-8 mode */
2098     if (utf8)
2099     {
2100     for (fi = min;; fi++)
2101     {
2102 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2103 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2104 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2105 ph10 427 if (eptr >= md->end_subject)
2106 ph10 426 {
2107 ph10 427 SCHECK_PARTIAL();
2108 ph10 426 RRETURN(MATCH_NOMATCH);
2109 ph10 427 }
2110 nigel 77 GETCHARINC(c, eptr);
2111     if (c > 255)
2112     {
2113     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2114     }
2115     else
2116     {
2117     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2118     }
2119     }
2120     }
2121     else
2122     #endif
2123     /* Not UTF-8 mode */
2124     {
2125     for (fi = min;; fi++)
2126     {
2127 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2128 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2129 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2130 ph10 427 if (eptr >= md->end_subject)
2131 ph10 426 {
2132 ph10 427 SCHECK_PARTIAL();
2133 ph10 426 RRETURN(MATCH_NOMATCH);
2134 ph10 427 }
2135 nigel 77 c = *eptr++;
2136     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2137     }
2138     }
2139     /* Control never gets here */
2140     }
2141    
2142     /* If maximizing, find the longest possible run, then work backwards. */
2143    
2144     else
2145     {
2146     pp = eptr;
2147    
2148     #ifdef SUPPORT_UTF8
2149     /* UTF-8 mode */
2150     if (utf8)
2151     {
2152     for (i = min; i < max; i++)
2153     {
2154     int len = 1;
2155     if (eptr >= md->end_subject) break;
2156     GETCHARLEN(c, eptr, len);
2157     if (c > 255)
2158     {
2159     if (op == OP_CLASS) break;
2160     }
2161     else
2162     {
2163     if ((data[c/8] & (1 << (c&7))) == 0) break;
2164     }
2165     eptr += len;
2166     }
2167     for (;;)
2168     {
2169 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2170 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2171     if (eptr-- == pp) break; /* Stop if tried at original pos */
2172     BACKCHAR(eptr);
2173     }
2174     }
2175     else
2176     #endif
2177     /* Not UTF-8 mode */
2178     {
2179     for (i = min; i < max; i++)
2180     {
2181     if (eptr >= md->end_subject) break;
2182     c = *eptr;
2183     if ((data[c/8] & (1 << (c&7))) == 0) break;
2184     eptr++;
2185     }
2186     while (eptr >= pp)
2187     {
2188 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2189 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2190 nigel 77 eptr--;
2191     }
2192     }
2193    
2194     RRETURN(MATCH_NOMATCH);
2195     }
2196     }
2197     /* Control never gets here */
2198    
2199    
2200     /* Match an extended character class. This opcode is encountered only
2201 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2202     mode, because Unicode properties are supported in non-UTF-8 mode. */
2203 nigel 77
2204     #ifdef SUPPORT_UTF8
2205     case OP_XCLASS:
2206     {
2207     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2208     ecode += GET(ecode, 1); /* Advance past the item */
2209    
2210     switch (*ecode)
2211     {
2212     case OP_CRSTAR:
2213     case OP_CRMINSTAR:
2214     case OP_CRPLUS:
2215     case OP_CRMINPLUS:
2216     case OP_CRQUERY:
2217     case OP_CRMINQUERY:
2218     c = *ecode++ - OP_CRSTAR;
2219     minimize = (c & 1) != 0;
2220     min = rep_min[c]; /* Pick up values from tables; */
2221     max = rep_max[c]; /* zero for max => infinity */
2222     if (max == 0) max = INT_MAX;
2223     break;
2224    
2225     case OP_CRRANGE:
2226     case OP_CRMINRANGE:
2227     minimize = (*ecode == OP_CRMINRANGE);
2228     min = GET2(ecode, 1);
2229     max = GET2(ecode, 3);
2230     if (max == 0) max = INT_MAX;
2231     ecode += 5;
2232     break;
2233    
2234     default: /* No repeat follows */
2235     min = max = 1;
2236     break;
2237     }
2238    
2239     /* First, ensure the minimum number of matches are present. */
2240    
2241     for (i = 1; i <= min; i++)
2242     {
2243 ph10 427 if (eptr >= md->end_subject)
2244 ph10 426 {
2245     SCHECK_PARTIAL();
2246     RRETURN(MATCH_NOMATCH);
2247 ph10 427 }
2248 ph10 384 GETCHARINCTEST(c, eptr);
2249 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2250     }
2251    
2252     /* If max == min we can continue with the main loop without the
2253     need to recurse. */
2254    
2255     if (min == max) continue;
2256    
2257     /* If minimizing, keep testing the rest of the expression and advancing
2258     the pointer while it matches the class. */
2259    
2260     if (minimize)
2261     {
2262     for (fi = min;; fi++)
2263     {
2264 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2265 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2266 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2267 ph10 427 if (eptr >= md->end_subject)
2268 ph10 426 {
2269 ph10 427 SCHECK_PARTIAL();
2270 ph10 426 RRETURN(MATCH_NOMATCH);
2271 ph10 427 }
2272 ph10 384 GETCHARINCTEST(c, eptr);
2273 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2274     }
2275     /* Control never gets here */
2276     }
2277    
2278     /* If maximizing, find the longest possible run, then work backwards. */
2279    
2280     else
2281     {
2282     pp = eptr;
2283     for (i = min; i < max; i++)
2284     {
2285     int len = 1;
2286     if (eptr >= md->end_subject) break;
2287 ph10 384 GETCHARLENTEST(c, eptr, len);
2288 nigel 77 if (!_pcre_xclass(c, data)) break;
2289     eptr += len;
2290     }
2291     for(;;)
2292     {
2293 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2294 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2295     if (eptr-- == pp) break; /* Stop if tried at original pos */
2296 ph10 214 if (utf8) BACKCHAR(eptr);
2297 nigel 77 }
2298     RRETURN(MATCH_NOMATCH);
2299     }
2300    
2301     /* Control never gets here */
2302     }
2303     #endif /* End of XCLASS */
2304    
2305     /* Match a single character, casefully */
2306    
2307     case OP_CHAR:
2308     #ifdef SUPPORT_UTF8
2309     if (utf8)
2310     {
2311     length = 1;
2312     ecode++;
2313     GETCHARLEN(fc, ecode, length);
2314 ph10 428 if (length > md->end_subject - eptr)
2315     {
2316     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2317     RRETURN(MATCH_NOMATCH);
2318     }
2319 nigel 77 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2320     }
2321     else
2322     #endif
2323    
2324     /* Non-UTF-8 mode */
2325     {
2326 ph10 428 if (md->end_subject - eptr < 1)
2327     {
2328     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2329     RRETURN(MATCH_NOMATCH);
2330     }
2331 nigel 77 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2332     ecode += 2;
2333     }
2334     break;
2335    
2336     /* Match a single character, caselessly */
2337    
2338     case OP_CHARNC:
2339     #ifdef SUPPORT_UTF8
2340     if (utf8)
2341     {
2342     length = 1;
2343     ecode++;
2344     GETCHARLEN(fc, ecode, length);
2345    
2346 ph10 428 if (length > md->end_subject - eptr)
2347     {
2348     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2349     RRETURN(MATCH_NOMATCH);
2350     }
2351 nigel 77
2352     /* If the pattern character's value is < 128, we have only one byte, and
2353     can use the fast lookup table. */
2354    
2355     if (fc < 128)
2356     {
2357     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2358     }
2359    
2360     /* Otherwise we must pick up the subject character */
2361    
2362     else
2363     {
2364 nigel 93 unsigned int dc;
2365 nigel 77 GETCHARINC(dc, eptr);
2366     ecode += length;
2367    
2368     /* If we have Unicode property support, we can use it to test the other
2369 nigel 87 case of the character, if there is one. */
2370 nigel 77
2371     if (fc != dc)
2372     {
2373     #ifdef SUPPORT_UCP
2374 ph10 349 if (dc != UCD_OTHERCASE(fc))
2375 nigel 77 #endif
2376     RRETURN(MATCH_NOMATCH);
2377     }
2378     }
2379     }
2380     else
2381     #endif /* SUPPORT_UTF8 */
2382    
2383     /* Non-UTF-8 mode */
2384     {
2385 ph10 428 if (md->end_subject - eptr < 1)
2386     {
2387     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2388     RRETURN(MATCH_NOMATCH);
2389     }
2390 nigel 77 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2391     ecode += 2;
2392     }
2393     break;
2394    
2395 nigel 93 /* Match a single character repeatedly. */
2396 nigel 77
2397     case OP_EXACT:
2398     min = max = GET2(ecode, 1);
2399     ecode += 3;
2400     goto REPEATCHAR;
2401    
2402 nigel 93 case OP_POSUPTO:
2403     possessive = TRUE;
2404     /* Fall through */
2405    
2406 nigel 77 case OP_UPTO:
2407     case OP_MINUPTO:
2408     min = 0;
2409     max = GET2(ecode, 1);
2410     minimize = *ecode == OP_MINUPTO;
2411     ecode += 3;
2412     goto REPEATCHAR;
2413    
2414 nigel 93 case OP_POSSTAR:
2415     possessive = TRUE;
2416     min = 0;
2417     max = INT_MAX;
2418     ecode++;
2419     goto REPEATCHAR;
2420    
2421     case OP_POSPLUS:
2422     possessive = TRUE;
2423     min = 1;
2424     max = INT_MAX;
2425     ecode++;
2426     goto REPEATCHAR;
2427    
2428     case OP_POSQUERY:
2429     possessive = TRUE;
2430     min = 0;
2431     max = 1;
2432     ecode++;
2433     goto REPEATCHAR;
2434    
2435 nigel 77 case OP_STAR:
2436     case OP_MINSTAR:
2437     case OP_PLUS:
2438     case OP_MINPLUS:
2439     case OP_QUERY:
2440     case OP_MINQUERY:
2441     c = *ecode++ - OP_STAR;
2442     minimize = (c & 1) != 0;
2443 ph10 428
2444 nigel 77 min = rep_min[c]; /* Pick up values from tables; */
2445     max = rep_max[c]; /* zero for max => infinity */
2446     if (max == 0) max = INT_MAX;
2447    
2448 ph10 426 /* Common code for all repeated single-character matches. */
2449 nigel 77
2450     REPEATCHAR:
2451     #ifdef SUPPORT_UTF8
2452     if (utf8)
2453     {
2454     length = 1;
2455     charptr = ecode;
2456     GETCHARLEN(fc, ecode, length);
2457     ecode += length;
2458    
2459     /* Handle multibyte character matching specially here. There is
2460     support for caseless matching if UCP support is present. */
2461    
2462     if (length > 1)
2463     {
2464     #ifdef SUPPORT_UCP
2465 nigel 93 unsigned int othercase;
2466 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2467 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
2468 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2469 ph10 115 else oclength = 0;
2470 nigel 77 #endif /* SUPPORT_UCP */
2471    
2472     for (i = 1; i <= min; i++)
2473     {
2474 ph10 426 if (eptr <= md->end_subject - length &&
2475     memcmp(eptr, charptr, length) == 0) eptr += length;
2476 ph10 123 #ifdef SUPPORT_UCP
2477 ph10 426 else if (oclength > 0 &&
2478     eptr <= md->end_subject - oclength &&
2479     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2480     #endif /* SUPPORT_UCP */
2481 nigel 77 else
2482     {
2483 ph10 426 CHECK_PARTIAL();
2484     RRETURN(MATCH_NOMATCH);
2485 nigel 77 }
2486     }
2487    
2488     if (min == max) continue;
2489    
2490     if (minimize)
2491     {
2492     for (fi = min;; fi++)
2493     {
2494 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2495 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2496 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2497 ph10 426 if (eptr <= md->end_subject - length &&
2498     memcmp(eptr, charptr, length) == 0) eptr += length;
2499 ph10 123 #ifdef SUPPORT_UCP
2500 ph10 426 else if (oclength > 0 &&
2501     eptr <= md->end_subject - oclength &&
2502     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2503     #endif /* SUPPORT_UCP */
2504 nigel 77 else
2505     {
2506 ph10 426 CHECK_PARTIAL();
2507     RRETURN(MATCH_NOMATCH);
2508 nigel 77 }
2509     }
2510     /* Control never gets here */
2511     }
2512 nigel 93
2513     else /* Maximize */
2514 nigel 77 {
2515     pp = eptr;
2516     for (i = min; i < max; i++)
2517     {
2518 ph10 426 if (eptr <= md->end_subject - length &&
2519     memcmp(eptr, charptr, length) == 0) eptr += length;
2520 ph10 123 #ifdef SUPPORT_UCP
2521 ph10 426 else if (oclength > 0 &&
2522     eptr <= md->end_subject - oclength &&
2523     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2524     #endif /* SUPPORT_UCP */
2525 ph10 115 else break;
2526 nigel 77 }
2527 nigel 93
2528     if (possessive) continue;
2529 ph10 427
2530 ph10 120 for(;;)
2531 ph10 426 {
2532     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2533     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2534     if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
2535 ph10 115 #ifdef SUPPORT_UCP
2536 ph10 426 eptr--;
2537     BACKCHAR(eptr);
2538 ph10 123 #else /* without SUPPORT_UCP */
2539 ph10 426 eptr -= length;
2540 ph10 123 #endif /* SUPPORT_UCP */
2541 ph10 426 }
2542 nigel 77 }
2543     /* Control never gets here */
2544     }
2545    
2546     /* If the length of a UTF-8 character is 1, we fall through here, and
2547     obey the code as for non-UTF-8 characters below, though in this case the
2548     value of fc will always be < 128. */
2549     }
2550     else
2551     #endif /* SUPPORT_UTF8 */
2552    
2553     /* When not in UTF-8 mode, load a single-byte character. */
2554    
2555 ph10 426 fc = *ecode++;
2556 ph10 428
2557 nigel 77 /* The value of fc at this point is always less than 256, though we may or
2558     may not be in UTF-8 mode. The code is duplicated for the caseless and
2559     caseful cases, for speed, since matching characters is likely to be quite
2560     common. First, ensure the minimum number of matches are present. If min =
2561     max, continue at the same level without recursing. Otherwise, if
2562     minimizing, keep trying the rest of the expression and advancing one
2563     matching character if failing, up to the maximum. Alternatively, if
2564     maximizing, find the maximum number of characters and work backwards. */
2565    
2566     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2567     max, eptr));
2568    
2569     if ((ims & PCRE_CASELESS) != 0)
2570     {
2571     fc = md->lcc[fc];
2572     for (i = 1; i <= min; i++)
2573 ph10 426 {
2574     if (eptr >= md->end_subject)
2575     {
2576     SCHECK_PARTIAL();
2577     RRETURN(MATCH_NOMATCH);
2578     }
2579 nigel 77 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2580 ph10 426 }
2581 nigel 77 if (min == max) continue;
2582     if (minimize)
2583     {
2584     for (fi = min;; fi++)
2585     {
2586 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2587 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2588 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2589 ph10 426 if (eptr >= md->end_subject)
2590     {
2591 ph10 427 SCHECK_PARTIAL();
2592 ph10 426 RRETURN(MATCH_NOMATCH);
2593     }
2594     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2595 nigel 77 }
2596     /* Control never gets here */
2597     }
2598 nigel 93 else /* Maximize */
2599 nigel 77 {
2600     pp = eptr;
2601     for (i = min; i < max; i++)
2602     {
2603     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2604     eptr++;
2605     }
2606 ph10 427
2607 nigel 93 if (possessive) continue;
2608 ph10 427
2609 nigel 77 while (eptr >= pp)
2610     {
2611 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2612 nigel 77 eptr--;
2613     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2614     }
2615     RRETURN(MATCH_NOMATCH);
2616     }
2617     /* Control never gets here */
2618     }
2619    
2620     /* Caseful comparisons (includes all multi-byte characters) */
2621    
2622     else
2623     {
2624 ph10 427 for (i = 1; i <= min; i++)
2625 ph10 426 {
2626     if (eptr >= md->end_subject)
2627     {
2628     SCHECK_PARTIAL();
2629     RRETURN(MATCH_NOMATCH);
2630     }
2631     if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2632 ph10 427 }
2633 ph10 428
2634 nigel 77 if (min == max) continue;
2635 ph10 428
2636 nigel 77 if (minimize)
2637     {
2638     for (fi = min;; fi++)
2639     {
2640 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2641 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2642 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2643 ph10 426 if (eptr >= md->end_subject)
2644 ph10 427 {
2645 ph10 426 SCHECK_PARTIAL();
2646     RRETURN(MATCH_NOMATCH);
2647 ph10 427 }
2648 ph10 426 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2649 nigel 77 }
2650     /* Control never gets here */
2651     }
2652 nigel 93 else /* Maximize */
2653 nigel 77 {
2654     pp = eptr;
2655     for (i = min; i < max; i++)
2656     {
2657     if (eptr >= md->end_subject || fc != *eptr) break;
2658     eptr++;
2659     }
2660 nigel 93 if (possessive) continue;
2661 ph10 428
2662 nigel 77 while (eptr >= pp)
2663     {
2664 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2665 nigel 77 eptr--;
2666     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2667     }
2668     RRETURN(MATCH_NOMATCH);
2669     }
2670     }
2671     /* Control never gets here */
2672    
2673     /* Match a negated single one-byte character. The character we are
2674     checking can be multibyte. */
2675    
2676     case OP_NOT:
2677 ph10 428 if (eptr >= md->end_subject)
2678     {
2679     SCHECK_PARTIAL();
2680     RRETURN(MATCH_NOMATCH);
2681     }
2682 nigel 77 ecode++;
2683     GETCHARINCTEST(c, eptr);
2684     if ((ims & PCRE_CASELESS) != 0)
2685     {
2686     #ifdef SUPPORT_UTF8
2687     if (c < 256)
2688     #endif
2689     c = md->lcc[c];
2690     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2691     }
2692     else
2693     {
2694     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2695     }
2696     break;
2697    
2698     /* Match a negated single one-byte character repeatedly. This is almost a
2699     repeat of the code for a repeated single character, but I haven't found a
2700     nice way of commoning these up that doesn't require a test of the
2701     positive/negative option for each character match. Maybe that wouldn't add
2702     very much to the time taken, but character matching *is* what this is all
2703     about... */
2704    
2705     case OP_NOTEXACT:
2706     min = max = GET2(ecode, 1);
2707     ecode += 3;
2708     goto REPEATNOTCHAR;
2709    
2710     case OP_NOTUPTO:
2711     case OP_NOTMINUPTO:
2712     min = 0;
2713     max = GET2(ecode, 1);
2714     minimize = *ecode == OP_NOTMINUPTO;
2715     ecode += 3;
2716     goto REPEATNOTCHAR;
2717    
2718 nigel 93 case OP_NOTPOSSTAR:
2719     possessive = TRUE;
2720     min = 0;
2721     max = INT_MAX;
2722     ecode++;
2723     goto REPEATNOTCHAR;
2724    
2725     case OP_NOTPOSPLUS:
2726     possessive = TRUE;
2727     min = 1;
2728     max = INT_MAX;
2729     ecode++;
2730     goto REPEATNOTCHAR;
2731    
2732     case OP_NOTPOSQUERY:
2733     possessive = TRUE;
2734     min = 0;
2735     max = 1;
2736     ecode++;
2737     goto REPEATNOTCHAR;
2738    
2739     case OP_NOTPOSUPTO:
2740     possessive = TRUE;
2741     min = 0;
2742     max = GET2(ecode, 1);
2743     ecode += 3;
2744     goto REPEATNOTCHAR;
2745    
2746 nigel 77 case OP_NOTSTAR:
2747     case OP_NOTMINSTAR:
2748     case OP_NOTPLUS:
2749     case OP_NOTMINPLUS:
2750     case OP_NOTQUERY:
2751     case OP_NOTMINQUERY:
2752     c = *ecode++ - OP_NOTSTAR;
2753     minimize = (c & 1) != 0;
2754     min = rep_min[c]; /* Pick up values from tables; */
2755     max = rep_max[c]; /* zero for max => infinity */
2756     if (max == 0) max = INT_MAX;
2757    
2758 ph10 426 /* Common code for all repeated single-byte matches. */
2759 nigel 77
2760     REPEATNOTCHAR:
2761     fc = *ecode++;
2762    
2763     /* The code is duplicated for the caseless and caseful cases, for speed,
2764     since matching characters is likely to be quite common. First, ensure the
2765     minimum number of matches are present. If min = max, continue at the same
2766     level without recursing. Otherwise, if minimizing, keep trying the rest of
2767     the expression and advancing one matching character if failing, up to the
2768     maximum. Alternatively, if maximizing, find the maximum number of
2769     characters and work backwards. */
2770    
2771     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2772     max, eptr));
2773    
2774     if ((ims & PCRE_CASELESS) != 0)
2775     {
2776     fc = md->lcc[fc];
2777    
2778     #ifdef SUPPORT_UTF8
2779     /* UTF-8 mode */
2780     if (utf8)
2781     {
2782 nigel 93 register unsigned int d;
2783 nigel 77 for (i = 1; i <= min; i++)
2784     {
2785 ph10 426 if (eptr >= md->end_subject)
2786     {
2787     SCHECK_PARTIAL();
2788 ph10 427 RRETURN(MATCH_NOMATCH);
2789     }
2790 nigel 77 GETCHARINC(d, eptr);
2791     if (d < 256) d = md->lcc[d];
2792     if (fc == d) RRETURN(MATCH_NOMATCH);
2793     }
2794     }
2795     else
2796     #endif
2797    
2798     /* Not UTF-8 mode */
2799     {
2800     for (i = 1; i <= min; i++)
2801 ph10 426 {
2802     if (eptr >= md->end_subject)
2803     {
2804     SCHECK_PARTIAL();
2805 ph10 427 RRETURN(MATCH_NOMATCH);
2806     }
2807 nigel 77 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2808 ph10 427 }
2809 nigel 77 }
2810    
2811     if (min == max) continue;
2812    
2813     if (minimize)
2814     {
2815     #ifdef SUPPORT_UTF8
2816     /* UTF-8 mode */
2817     if (utf8)
2818     {
2819 nigel 93 register unsigned int d;
2820 nigel 77 for (fi = min;; fi++)
2821     {
2822 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2823 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2824 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2825 ph10 427 if (eptr >= md->end_subject)
2826 ph10 426 {
2827 ph10 427 SCHECK_PARTIAL();
2828 ph10 426 RRETURN(MATCH_NOMATCH);
2829 ph10 427 }
2830 nigel 77 GETCHARINC(d, eptr);
2831     if (d < 256) d = md->lcc[d];
2832 ph10 366 if (fc == d) RRETURN(MATCH_NOMATCH);
2833 nigel 77 }
2834     }
2835     else
2836     #endif
2837     /* Not UTF-8 mode */
2838     {
2839     for (fi = min;; fi++)
2840     {
2841 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2842 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2843 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2844 ph10 426 if (eptr >= md->end_subject)
2845     {
2846     SCHECK_PARTIAL();
2847     RRETURN(MATCH_NOMATCH);
2848     }
2849     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2850 nigel 77 }
2851     }
2852     /* Control never gets here */
2853     }
2854    
2855     /* Maximize case */
2856    
2857     else
2858     {
2859     pp = eptr;
2860    
2861     #ifdef SUPPORT_UTF8
2862     /* UTF-8 mode */
2863     if (utf8)
2864     {
2865 nigel 93 register unsigned int d;
2866 nigel 77 for (i = min; i < max; i++)
2867     {
2868     int len = 1;
2869     if (eptr >= md->end_subject) break;
2870     GETCHARLEN(d, eptr, len);
2871     if (d < 256) d = md->lcc[d];
2872     if (fc == d) break;
2873     eptr += len;
2874     }
2875 nigel 93 if (possessive) continue;
2876     for(;;)
2877 nigel 77 {
2878 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2879 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2880     if (eptr-- == pp) break; /* Stop if tried at original pos */
2881     BACKCHAR(eptr);
2882     }
2883     }
2884     else
2885     #endif
2886     /* Not UTF-8 mode */
2887     {
2888     for (i = min; i < max; i++)
2889     {
2890     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2891     eptr++;
2892     }
2893 nigel 93 if (possessive) continue;
2894 nigel 77 while (eptr >= pp)
2895     {
2896 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2897 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2898     eptr--;
2899     }
2900     }
2901    
2902     RRETURN(MATCH_NOMATCH);
2903     }
2904     /* Control never gets here */
2905     }
2906    
2907     /* Caseful comparisons */
2908    
2909     else
2910     {
2911     #ifdef SUPPORT_UTF8
2912     /* UTF-8 mode */
2913     if (utf8)
2914     {
2915 nigel 93 register unsigned int d;
2916 nigel 77 for (i = 1; i <= min; i++)
2917     {
2918 ph10 426 if (eptr >= md->end_subject)
2919     {
2920     SCHECK_PARTIAL();
2921 ph10 427 RRETURN(MATCH_NOMATCH);
2922     }
2923 nigel 77 GETCHARINC(d, eptr);
2924     if (fc == d) RRETURN(MATCH_NOMATCH);
2925     }
2926     }
2927     else
2928     #endif
2929     /* Not UTF-8 mode */
2930     {
2931     for (i = 1; i <= min; i++)
2932 ph10 426 {
2933     if (eptr >= md->end_subject)
2934     {
2935     SCHECK_PARTIAL();
2936 ph10 427 RRETURN(MATCH_NOMATCH);
2937     }
2938 nigel 77 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2939 ph10 427 }
2940 nigel 77 }
2941    
2942     if (min == max) continue;
2943    
2944     if (minimize)
2945     {
2946     #ifdef SUPPORT_UTF8
2947     /* UTF-8 mode */
2948     if (utf8)
2949     {
2950 nigel 93 register unsigned int d;
2951 nigel 77 for (fi = min;; fi++)
2952     {
2953 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2954 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2955 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2956 ph10 427 if (eptr >= md->end_subject)
2957 ph10 426 {
2958 ph10 427 SCHECK_PARTIAL();
2959 ph10 426 RRETURN(MATCH_NOMATCH);
2960 ph10 427 }
2961 nigel 77 GETCHARINC(d, eptr);
2962 ph10 371 if (fc == d) RRETURN(MATCH_NOMATCH);
2963 nigel 77 }
2964     }
2965     else
2966     #endif
2967     /* Not UTF-8 mode */
2968     {
2969     for (fi = min;; fi++)
2970     {
2971 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2972 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2973 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2974 ph10 426 if (eptr >= md->end_subject)
2975     {
2976     SCHECK_PARTIAL();
2977     RRETURN(MATCH_NOMATCH);
2978 ph10 427 }
2979 ph10 426 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2980 nigel 77 }
2981     }
2982     /* Control never gets here */
2983     }
2984    
2985     /* Maximize case */
2986    
2987     else
2988     {
2989     pp = eptr;
2990    
2991     #ifdef SUPPORT_UTF8
2992     /* UTF-8 mode */
2993     if (utf8)
2994     {
2995 nigel 93 register unsigned int d;
2996 nigel 77 for (i = min; i < max; i++)
2997     {
2998     int len = 1;
2999     if (eptr >= md->end_subject) break;
3000     GETCHARLEN(d, eptr, len);
3001     if (fc == d) break;
3002     eptr += len;
3003     }
3004 nigel 93 if (possessive) continue;
3005 nigel 77 for(;;)
3006     {
3007 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3008 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3009     if (eptr-- == pp) break; /* Stop if tried at original pos */
3010     BACKCHAR(eptr);
3011     }
3012     }
3013     else
3014     #endif
3015     /* Not UTF-8 mode */
3016     {
3017     for (i = min; i < max; i++)
3018     {
3019     if (eptr >= md->end_subject || fc == *eptr) break;
3020     eptr++;
3021     }
3022 nigel 93 if (possessive) continue;
3023 nigel 77 while (eptr >= pp)
3024     {
3025 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3026 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3027     eptr--;
3028     }
3029     }
3030    
3031     RRETURN(MATCH_NOMATCH);
3032     }
3033     }
3034     /* Control never gets here */
3035    
3036     /* Match a single character type repeatedly; several different opcodes
3037     share code. This is very similar to the code for single characters, but we
3038     repeat it in the interests of efficiency. */
3039    
3040     case OP_TYPEEXACT:
3041     min = max = GET2(ecode, 1);
3042     minimize = TRUE;
3043     ecode += 3;
3044     goto REPEATTYPE;
3045    
3046     case OP_TYPEUPTO:
3047     case OP_TYPEMINUPTO:
3048     min = 0;
3049     max = GET2(ecode, 1);
3050     minimize = *ecode == OP_TYPEMINUPTO;
3051     ecode += 3;
3052     goto REPEATTYPE;
3053    
3054 nigel 93 case OP_TYPEPOSSTAR:
3055     possessive = TRUE;
3056     min = 0;
3057     max = INT_MAX;
3058     ecode++;
3059     goto REPEATTYPE;
3060    
3061     case OP_TYPEPOSPLUS:
3062     possessive = TRUE;
3063     min = 1;
3064     max = INT_MAX;
3065     ecode++;
3066     goto REPEATTYPE;
3067    
3068     case OP_TYPEPOSQUERY:
3069     possessive = TRUE;
3070     min = 0;
3071     max = 1;
3072     ecode++;
3073     goto REPEATTYPE;
3074    
3075     case OP_TYPEPOSUPTO:
3076     possessive = TRUE;
3077     min = 0;
3078     max = GET2(ecode, 1);
3079     ecode += 3;
3080     goto REPEATTYPE;
3081    
3082 nigel 77 case OP_TYPESTAR:
3083     case OP_TYPEMINSTAR:
3084     case OP_TYPEPLUS:
3085     case OP_TYPEMINPLUS:
3086     case OP_TYPEQUERY:
3087     case OP_TYPEMINQUERY:
3088     c = *ecode++ - OP_TYPESTAR;
3089     minimize = (c & 1) != 0;
3090     min = rep_min[c]; /* Pick up values from tables; */
3091     max = rep_max[c]; /* zero for max => infinity */
3092     if (max == 0) max = INT_MAX;
3093    
3094     /* Common code for all repeated single character type matches. Note that
3095     in UTF-8 mode, '.' matches a character of any length, but for the other
3096     character types, the valid characters are all one-byte long. */
3097    
3098     REPEATTYPE:
3099     ctype = *ecode++; /* Code for the character type */
3100    
3101     #ifdef SUPPORT_UCP
3102     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3103     {
3104     prop_fail_result = ctype == OP_NOTPROP;
3105     prop_type = *ecode++;
3106 nigel 87 prop_value = *ecode++;
3107 nigel 77 }
3108     else prop_type = -1;
3109     #endif
3110    
3111     /* First, ensure the minimum number of matches are present. Use inline
3112     code for maximizing the speed, and do the type test once at the start
3113 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3114 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3115     and single-bytes. */
3116    
3117     if (min > 0)
3118     {
3119     #ifdef SUPPORT_UCP
3120 nigel 87 if (prop_type >= 0)
3121 nigel 77 {
3122 nigel 87 switch(prop_type)
3123 nigel 77 {
3124 nigel 87 case PT_ANY:
3125     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3126     for (i = 1; i <= min; i++)
3127     {
3128 ph10 427 if (eptr >= md->end_subject)
3129 ph10 426 {
3130 ph10 427 SCHECK_PARTIAL();
3131 ph10 426 RRETURN(MATCH_NOMATCH);
3132 ph10 427 }
3133 ph10 184 GETCHARINCTEST(c, eptr);
3134 nigel 87 }
3135     break;
3136    
3137     case PT_LAMP:
3138     for (i = 1; i <= min; i++)
3139     {
3140 ph10 427 if (eptr >= md->end_subject)
3141 ph10 426 {
3142 ph10 427 SCHECK_PARTIAL();
3143 ph10 426 RRETURN(MATCH_NOMATCH);
3144 ph10 427 }
3145 ph10 184 GETCHARINCTEST(c, eptr);
3146 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3147 nigel 87 if ((prop_chartype == ucp_Lu ||
3148     prop_chartype == ucp_Ll ||
3149     prop_chartype == ucp_Lt) == prop_fail_result)
3150     RRETURN(MATCH_NOMATCH);
3151     }
3152     break;
3153    
3154     case PT_GC:
3155     for (i = 1; i <= min; i++)
3156     {
3157 ph10 427 if (eptr >= md->end_subject)
3158 ph10 426 {
3159 ph10 427 SCHECK_PARTIAL();
3160 ph10 426 RRETURN(MATCH_NOMATCH);
3161 ph10 427 }
3162 ph10 184 GETCHARINCTEST(c, eptr);
3163 ph10 349 prop_category = UCD_CATEGORY(c);
3164 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3165     RRETURN(MATCH_NOMATCH);
3166     }
3167     break;
3168    
3169     case PT_PC:
3170     for (i = 1; i <= min; i++)
3171     {
3172 ph10 427 if (eptr >= md->end_subject)
3173 ph10 426 {
3174 ph10 427 SCHECK_PARTIAL();
3175 ph10 426 RRETURN(MATCH_NOMATCH);
3176 ph10 427 }
3177 ph10 184 GETCHARINCTEST(c, eptr);
3178 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3179 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3180     RRETURN(MATCH_NOMATCH);
3181     }
3182     break;
3183    
3184     case PT_SC:
3185     for (i = 1; i <= min; i++)
3186     {
3187 ph10 427 if (eptr >= md->end_subject)
3188 ph10 426 {
3189 ph10 427 SCHECK_PARTIAL();
3190 ph10 426 RRETURN(MATCH_NOMATCH);
3191 ph10 427 }
3192 ph10 184 GETCHARINCTEST(c, eptr);
3193 ph10 349 prop_script = UCD_SCRIPT(c);
3194 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3195     RRETURN(MATCH_NOMATCH);
3196     }
3197     break;
3198    
3199     default:
3200     RRETURN(PCRE_ERROR_INTERNAL);
3201 nigel 77 }
3202     }
3203    
3204     /* Match extended Unicode sequences. We will get here only if the
3205     support is in the binary; otherwise a compile-time error occurs. */
3206    
3207     else if (ctype == OP_EXTUNI)
3208     {
3209     for (i = 1; i <= min; i++)
3210     {
3211 ph10 427 if (eptr >= md->end_subject)
3212 ph10 426 {
3213 ph10 427 SCHECK_PARTIAL();
3214 ph10 426 RRETURN(MATCH_NOMATCH);
3215 ph10 427 }
3216 nigel 77 GETCHARINCTEST(c, eptr);
3217 ph10 349 prop_category = UCD_CATEGORY(c);
3218 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3219     while (eptr < md->end_subject)
3220     {
3221     int len = 1;
3222 ph10 426 if (!utf8) c = *eptr;
3223     else { GETCHARLEN(c, eptr, len); }
3224 ph10 349 prop_category = UCD_CATEGORY(c);
3225 nigel 77 if (prop_category != ucp_M) break;
3226     eptr += len;
3227     }
3228     }
3229     }
3230    
3231     else
3232     #endif /* SUPPORT_UCP */
3233    
3234     /* Handle all other cases when the coding is UTF-8 */
3235    
3236     #ifdef SUPPORT_UTF8
3237     if (utf8) switch(ctype)
3238     {
3239     case OP_ANY:
3240     for (i = 1; i <= min; i++)
3241     {
3242 ph10 426 if (eptr >= md->end_subject)
3243     {
3244 ph10 427 SCHECK_PARTIAL();
3245 nigel 77 RRETURN(MATCH_NOMATCH);
3246 ph10 427 }
3247 ph10 426 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3248 nigel 91 eptr++;
3249 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3250     }
3251     break;
3252    
3253 ph10 341 case OP_ALLANY:
3254     for (i = 1; i <= min; i++)
3255     {
3256 ph10 427 if (eptr >= md->end_subject)
3257 ph10 426 {
3258     SCHECK_PARTIAL();
3259     RRETURN(MATCH_NOMATCH);
3260 ph10 427 }
3261 ph10 341 eptr++;
3262     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3263     }
3264     break;
3265    
3266 nigel 77 case OP_ANYBYTE:
3267 ph10 427 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3268 nigel 77 eptr += min;
3269     break;
3270    
3271 nigel 93 case OP_ANYNL:
3272     for (i = 1; i <= min; i++)
3273     {
3274 ph10 427 if (eptr >= md->end_subject)
3275 ph10 426 {
3276     SCHECK_PARTIAL();
3277     RRETURN(MATCH_NOMATCH);
3278 ph10 427 }
3279 nigel 93 GETCHARINC(c, eptr);
3280     switch(c)
3281     {
3282     default: RRETURN(MATCH_NOMATCH);
3283     case 0x000d:
3284     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3285     break;
3286 ph10 231
3287 nigel 93 case 0x000a:
3288 ph10 231 break;
3289    
3290 nigel 93 case 0x000b:
3291     case 0x000c:
3292     case 0x0085:
3293     case 0x2028:
3294     case 0x2029:
3295 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3296 nigel 93 break;
3297     }
3298     }
3299     break;
3300    
3301 ph10 178 case OP_NOT_HSPACE:
3302     for (i = 1; i <= min; i++)
3303     {
3304 ph10 427 if (eptr >= md->end_subject)
3305 ph10 426 {
3306     SCHECK_PARTIAL();
3307     RRETURN(MATCH_NOMATCH);
3308 ph10 427 }
3309 ph10 178 GETCHARINC(c, eptr);
3310     switch(c)
3311     {
3312     default: break;
3313     case 0x09: /* HT */
3314     case 0x20: /* SPACE */
3315     case 0xa0: /* NBSP */
3316     case 0x1680: /* OGHAM SPACE MARK */
3317     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3318     case 0x2000: /* EN QUAD */
3319     case 0x2001: /* EM QUAD */
3320     case 0x2002: /* EN SPACE */
3321     case 0x2003: /* EM SPACE */
3322     case 0x2004: /* THREE-PER-EM SPACE */
3323     case 0x2005: /* FOUR-PER-EM SPACE */
3324     case 0x2006: /* SIX-PER-EM SPACE */
3325     case 0x2007: /* FIGURE SPACE */
3326     case 0x2008: /* PUNCTUATION SPACE */
3327     case 0x2009: /* THIN SPACE */
3328     case 0x200A: /* HAIR SPACE */
3329     case 0x202f: /* NARROW NO-BREAK SPACE */
3330     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3331     case 0x3000: /* IDEOGRAPHIC SPACE */
3332     RRETURN(MATCH_NOMATCH);
3333     }
3334     }
3335     break;
3336 ph10 182
3337 ph10 178 case OP_HSPACE:
3338     for (i = 1; i <= min; i++)
3339     {
3340 ph10 427 if (eptr >= md->end_subject)
3341 ph10 426 {
3342 ph10 427 SCHECK_PARTIAL();
3343 ph10 426 RRETURN(MATCH_NOMATCH);
3344 ph10 427 }
3345 ph10 178 GETCHARINC(c, eptr);
3346     switch(c)
3347     {
3348     default: RRETURN(MATCH_NOMATCH);
3349     case 0x09: /* HT */
3350     case 0x20: /* SPACE */
3351     case 0xa0: /* NBSP */
3352     case 0x1680: /* OGHAM SPACE MARK */
3353     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3354     case 0x2000: /* EN QUAD */
3355     case 0x2001: /* EM QUAD */
3356     case 0x2002: /* EN SPACE */
3357     case 0x2003: /* EM SPACE */
3358     case 0x2004: /* THREE-PER-EM SPACE */
3359     case 0x2005: /* FOUR-PER-EM SPACE */
3360     case 0x2006: /* SIX-PER-EM SPACE */
3361     case 0x2007: /* FIGURE SPACE */
3362     case 0x2008: /* PUNCTUATION SPACE */
3363     case 0x2009: /* THIN SPACE */
3364     case 0x200A: /* HAIR SPACE */
3365     case 0x202f: /* NARROW NO-BREAK SPACE */
3366     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3367     case 0x3000: /* IDEOGRAPHIC SPACE */
3368     break;
3369     }
3370     }
3371     break;
3372 ph10 182
3373 ph10 178 case OP_NOT_VSPACE:
3374     for (i = 1; i <= min; i++)
3375     {
3376 ph10 427 if (eptr >= md->end_subject)
3377 ph10 426 {
3378 ph10 427 SCHECK_PARTIAL();
3379 ph10 426 RRETURN(MATCH_NOMATCH);
3380 ph10 427 }
3381 ph10 178 GETCHARINC(c, eptr);
3382     switch(c)
3383     {
3384     default: break;
3385     case 0x0a: /* LF */
3386     case 0x0b: /* VT */
3387     case 0x0c: /* FF */
3388     case 0x0d: /* CR */
3389     case 0x85: /* NEL */
3390     case 0x2028: /* LINE SEPARATOR */
3391     case 0x2029: /* PARAGRAPH SEPARATOR */
3392     RRETURN(MATCH_NOMATCH);
3393     }
3394     }
3395     break;
3396 ph10 182
3397 ph10 178 case OP_VSPACE:
3398     for (i = 1; i <= min; i++)
3399     {
3400 ph10 427 if (eptr >= md->end_subject)
3401 ph10 426 {
3402 ph10 427 SCHECK_PARTIAL();
3403 ph10 426 RRETURN(MATCH_NOMATCH);
3404 ph10 427 }
3405 ph10 178 GETCHARINC(c, eptr);
3406     switch(c)
3407     {
3408     default: RRETURN(MATCH_NOMATCH);
3409     case 0x0a: /* LF */
3410     case 0x0b: /* VT */
3411     case 0x0c: /* FF */
3412     case 0x0d: /* CR */
3413     case 0x85: /* NEL */
3414     case 0x2028: /* LINE SEPARATOR */
3415     case 0x2029: /* PARAGRAPH SEPARATOR */
3416 ph10 182 break;
3417 ph10 178 }
3418     }
3419     break;
3420    
3421 nigel 77 case OP_NOT_DIGIT:
3422     for (i = 1; i <= min; i++)
3423     {
3424 ph10 427 if (eptr >= md->end_subject)
3425 ph10 426 {
3426 ph10 427 SCHECK_PARTIAL();
3427 ph10 426 RRETURN(MATCH_NOMATCH);
3428 ph10 427 }
3429 nigel 77 GETCHARINC(c, eptr);
3430     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3431     RRETURN(MATCH_NOMATCH);
3432     }
3433     break;
3434    
3435     case OP_DIGIT:
3436     for (i = 1; i <= min; i++)
3437     {
3438 ph10 427 if (eptr >= md->end_subject)
3439 ph10 426 {
3440 ph10 427 SCHECK_PARTIAL();
3441 nigel 77 RRETURN(MATCH_NOMATCH);
3442 ph10 427 }
3443 ph10 426 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3444     RRETURN(MATCH_NOMATCH);
3445 nigel 77 /* No need to skip more bytes - we know it's a 1-byte character */
3446     }
3447     break;
3448    
3449     case OP_NOT_WHITESPACE:
3450     for (i = 1; i <= min; i++)
3451     {
3452 ph10 427 if (eptr >= md->end_subject)
3453 ph10 426 {
3454 ph10 427 SCHECK_PARTIAL();
3455 nigel 77 RRETURN(MATCH_NOMATCH);
3456 ph10 427 }
3457 ph10 426 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3458     RRETURN(MATCH_NOMATCH);
3459 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3460 nigel 77 }
3461     break;
3462    
3463     case OP_WHITESPACE:
3464     for (i = 1; i <= min; i++)
3465     {
3466 ph10 427 if (eptr >= md->end_subject)
3467 ph10 426 {
3468 ph10 427 SCHECK_PARTIAL();
3469 nigel 77 RRETURN(MATCH_NOMATCH);
3470 ph10 427 }
3471 ph10 426 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3472     RRETURN(MATCH_NOMATCH);
3473 nigel 77 /* No need to skip more bytes - we know it's a 1-byte character */
3474     }
3475     break;
3476    
3477     case OP_NOT_WORDCHAR:
3478     for (i = 1; i <= min; i++)
3479     {
3480     if (eptr >= md->end_subject ||
3481 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3482 nigel 77 RRETURN(MATCH_NOMATCH);
3483 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3484 nigel 77 }
3485     break;
3486    
3487     case OP_WORDCHAR:
3488     for (i = 1; i <= min; i++)
3489     {
3490 ph10 427 if (eptr >= md->end_subject)
3491 ph10 426 {
3492 ph10 427 SCHECK_PARTIAL();
3493 nigel 77 RRETURN(MATCH_NOMATCH);
3494 ph10 427 }
3495 ph10 426 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3496     RRETURN(MATCH_NOMATCH);
3497 nigel 77 /* No need to skip more bytes - we know it's a 1-byte character */
3498     }
3499     break;
3500    
3501     default:
3502     RRETURN(PCRE_ERROR_INTERNAL);
3503     } /* End switch(ctype) */
3504    
3505     else
3506     #endif /* SUPPORT_UTF8 */
3507    
3508     /* Code for the non-UTF-8 case for minimum matching of operators other
3509 ph10 426 than OP_PROP and OP_NOTPROP. */
3510 nigel 77
3511     switch(ctype)
3512     {
3513     case OP_ANY:
3514 ph10 342 for (i = 1; i <= min; i++)
3515 nigel 77 {
3516 ph10 427 if (eptr >= md->end_subject)
3517 ph10 426 {
3518 ph10 427 SCHECK_PARTIAL();
3519 ph10 426 RRETURN(MATCH_NOMATCH);
3520 ph10 427 }
3521 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3522     eptr++;
3523 nigel 77 }
3524     break;
3525    
3526 ph10 341 case OP_ALLANY:
3527 ph10 428 if (eptr > md->end_subject - min)
3528     {
3529     SCHECK_PARTIAL();
3530     RRETURN(MATCH_NOMATCH);
3531     }
3532 ph10 341 eptr += min;
3533     break;
3534    
3535 nigel 77 case OP_ANYBYTE:
3536 ph10 428 if (eptr > md->end_subject - min)
3537     {
3538     SCHECK_PARTIAL();
3539     RRETURN(MATCH_NOMATCH);
3540     }
3541 nigel 77 eptr += min;
3542     break;
3543    
3544 nigel 93 case OP_ANYNL:
3545     for (i = 1; i <= min; i++)
3546     {
3547 ph10 427 if (eptr >= md->end_subject)
3548 ph10 426 {
3549 ph10 427 SCHECK_PARTIAL();
3550 ph10 426 RRETURN(MATCH_NOMATCH);
3551 ph10 427 }
3552 nigel 93 switch(*eptr++)
3553     {
3554     default: RRETURN(MATCH_NOMATCH);
3555     case 0x000d:
3556     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3557     break;
3558     case 0x000a:
3559 ph10 231 break;
3560    
3561 nigel 93 case 0x000b:
3562     case 0x000c:
3563     case 0x0085:
3564 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3565 nigel 93 break;
3566     }
3567     }
3568     break;
3569    
3570 ph10 178 case OP_NOT_HSPACE:
3571     for (i = 1; i <= min; i++)
3572     {
3573 ph10 427 if (eptr >= md->end_subject)
3574 ph10 426 {
3575 ph10 427 SCHECK_PARTIAL();
3576 ph10 426 RRETURN(MATCH_NOMATCH);
3577 ph10 427 }
3578 ph10 178 switch(*eptr++)
3579     {
3580     default: break;
3581     case 0x09: /* HT */
3582     case 0x20: /* SPACE */
3583     case 0xa0: /* NBSP */
3584     RRETURN(MATCH_NOMATCH);
3585     }
3586     }
3587     break;
3588    
3589     case OP_HSPACE:
3590     for (i = 1; i <= min; i++)
3591     {
3592 ph10 427 if (eptr >= md->end_subject)
3593 ph10 426 {
3594 ph10 427 SCHECK_PARTIAL();
3595 ph10 426 RRETURN(MATCH_NOMATCH);
3596 ph10 427 }
3597 ph10 178 switch(*eptr++)
3598     {
3599     default: RRETURN(MATCH_NOMATCH);
3600     case 0x09: /* HT */
3601     case 0x20: /* SPACE */
3602     case 0xa0: /* NBSP */
3603 ph10 182 break;
3604 ph10 178 }
3605     }
3606     break;
3607    
3608     case OP_NOT_VSPACE:
3609     for (i = 1; i <= min; i++)
3610     {
3611 ph10 427 if (eptr >= md->end_subject)
3612 ph10 426 {
3613 ph10 427 SCHECK_PARTIAL();
3614 ph10 426 RRETURN(MATCH_NOMATCH);
3615 ph10 427 }
3616 ph10 178 switch(*eptr++)
3617     {
3618     default: break;
3619     case 0x0a: /* LF */
3620     case 0x0b: /* VT */
3621     case 0x0c: /* FF */
3622     case 0x0d: /* CR */
3623     case 0x85: /* NEL */
3624     RRETURN(MATCH_NOMATCH);
3625     }
3626     }
3627     break;
3628    
3629     case OP_VSPACE:
3630     for (i = 1; i <= min; i++)
3631     {
3632 ph10 427 if (eptr >= md->end_subject)
3633 ph10 426 {
3634 ph10 427 SCHECK_PARTIAL();
3635 ph10 426 RRETURN(MATCH_NOMATCH);
3636 ph10 427 }
3637 ph10 178 switch(*eptr++)
3638     {
3639     default: RRETURN(MATCH_NOMATCH);
3640     case 0x0a: /* LF */
3641     case 0x0b: /* VT */
3642     case 0x0c: /* FF */
3643     case 0x0d: /* CR */
3644     case 0x85: /* NEL */
3645 ph10 182 break;
3646 ph10 178 }
3647     }
3648     break;
3649    
3650 nigel 77 case OP_NOT_DIGIT:
3651     for (i = 1; i <= min; i++)
3652 ph10 427 {
3653     if (eptr >= md->end_subject)
3654 ph10 426 {
3655 ph10 427 SCHECK_PARTIAL();
3656 ph10 426 RRETURN(MATCH_NOMATCH);
3657 ph10 427 }
3658 nigel 77 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3659 ph10 427 }
3660 nigel 77 break;
3661    
3662     case OP_DIGIT:
3663     for (i = 1; i <= min; i++)
3664 ph10 427 {
3665     if (eptr >= md->end_subject)
3666 ph10 426 {
3667 ph10 427 SCHECK_PARTIAL();
3668 ph10 426 RRETURN(MATCH_NOMATCH);
3669 ph10 427 }
3670 nigel 77 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3671 ph10 427 }
3672 nigel 77 break;
3673    
3674     case OP_NOT_WHITESPACE:
3675     for (i = 1; i <= min; i++)
3676 ph10 427 {
3677     if (eptr >= md->end_subject)
3678 ph10 426 {
3679 ph10 427 SCHECK_PARTIAL();
3680 ph10 426 RRETURN(MATCH_NOMATCH);
3681 ph10 427 }
3682 nigel 77 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3683 ph10 427 }
3684 nigel 77 break;
3685    
3686     case OP_WHITESPACE:
3687     for (i = 1; i <= min; i++)
3688 ph10 427 {
3689     if (eptr >= md->end_subject)
3690 ph10 426 {
3691 ph10 427 SCHECK_PARTIAL();
3692 ph10 426 RRETURN(MATCH_NOMATCH);
3693 ph10 427 }
3694 nigel 77 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3695 ph10 427 }
3696 nigel 77 break;
3697    
3698     case OP_NOT_WORDCHAR:
3699     for (i = 1; i <= min; i++)
3700 ph10 427 {
3701     if (eptr >= md->end_subject)
3702 ph10 426 {
3703 ph10 427 SCHECK_PARTIAL();
3704 ph10 426 RRETURN(MATCH_NOMATCH);
3705 ph10 427 }
3706 nigel 77 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3707     RRETURN(MATCH_NOMATCH);
3708 ph10 427 }
3709 nigel 77 break;
3710    
3711     case OP_WORDCHAR:
3712     for (i = 1; i <= min; i++)
3713 ph10 427 {
3714     if (eptr >= md->end_subject)
3715 ph10 426 {
3716 ph10 427 SCHECK_PARTIAL();
3717 ph10 426 RRETURN(MATCH_NOMATCH);
3718 ph10 427 }
3719 nigel 77 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3720     RRETURN(MATCH_NOMATCH);
3721 ph10 427 }
3722 nigel 77 break;
3723    
3724     default:
3725     RRETURN(PCRE_ERROR_INTERNAL);
3726     }
3727     }
3728    
3729     /* If min = max, continue at the same level without recursing */
3730    
3731     if (min == max) continue;
3732    
3733     /* If minimizing, we have to test the rest of the pattern before each
3734     subsequent match. Again, separate the UTF-8 case for speed, and also
3735     separate the UCP cases. */
3736    
3737     if (minimize)
3738     {
3739     #ifdef SUPPORT_UCP
3740 nigel 87 if (prop_type >= 0)
3741 nigel 77 {
3742 nigel 87 switch(prop_type)
3743 nigel 77 {
3744 nigel 87 case PT_ANY:
3745     for (fi = min;; fi++)
3746     {
3747 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3748 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3749 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3750 ph10 427 if (eptr >= md->end_subject)
3751 ph10 426 {
3752 ph10 427 SCHECK_PARTIAL();
3753 ph10 426 RRETURN(MATCH_NOMATCH);
3754 ph10 427 }
3755 nigel 87 GETCHARINC(c, eptr);
3756     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3757     }
3758 nigel 93 /* Control never gets here */
3759 nigel 87
3760     case PT_LAMP:
3761     for (fi = min;; fi++)
3762     {
3763 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3764 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3765 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3766 ph10 427 if (eptr >= md->end_subject)
3767 ph10 426 {
3768 ph10 427 SCHECK_PARTIAL();
3769 ph10 426 RRETURN(MATCH_NOMATCH);
3770 ph10 427 }
3771 nigel 87 GETCHARINC(c, eptr);
3772 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3773 nigel 87 if ((prop_chartype == ucp_Lu ||
3774     prop_chartype == ucp_Ll ||
3775     prop_chartype == ucp_Lt) == prop_fail_result)
3776     RRETURN(MATCH_NOMATCH);
3777     }
3778 nigel 93 /* Control never gets here */
3779 nigel 87
3780     case PT_GC:
3781     for (fi = min;; fi++)
3782     {
3783 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3784 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3785 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3786 ph10 427 if (eptr >= md->end_subject)
3787 ph10 426 {
3788 ph10 427 SCHECK_PARTIAL();
3789 ph10 426 RRETURN(MATCH_NOMATCH);
3790 ph10 427 }
3791 nigel 87 GETCHARINC(c, eptr);
3792 ph10 349 prop_category = UCD_CATEGORY(c);
3793 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3794     RRETURN(MATCH_NOMATCH);
3795     }
3796 nigel 93 /* Control never gets here */
3797 nigel 87
3798     case PT_PC:
3799     for (fi = min;; fi++)
3800     {
3801 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3802 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3803 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3804 ph10 427 if (eptr >= md->end_subject)
3805 ph10 426 {
3806 ph10 427 SCHECK_PARTIAL();
3807 ph10 426 RRETURN(MATCH_NOMATCH);
3808 ph10 427 }
3809 nigel 87 GETCHARINC(c, eptr);
3810 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3811 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3812     RRETURN(MATCH_NOMATCH);
3813     }
3814 nigel 93 /* Control never gets here */
3815 nigel 87
3816     case PT_SC:
3817     for (fi = min;; fi++)
3818     {
3819 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3820 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3821 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3822 ph10 427 if (eptr >= md->end_subject)
3823 ph10 426 {
3824 ph10 427 SCHECK_PARTIAL();
3825 ph10 426 RRETURN(MATCH_NOMATCH);
3826 ph10 427 }
3827 nigel 87 GETCHARINC(c, eptr);
3828 ph10 349 prop_script = UCD_SCRIPT(c);
3829 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3830     RRETURN(MATCH_NOMATCH);
3831     }
3832 nigel 93 /* Control never gets here */
3833 nigel 87
3834     default:
3835     RRETURN(PCRE_ERROR_INTERNAL);
3836 nigel 77 }
3837     }
3838    
3839     /* Match extended Unicode sequences. We will get here only if the
3840     support is in the binary; otherwise a compile-time error occurs. */
3841    
3842     else if (ctype == OP_EXTUNI)
3843     {
3844     for (fi = min;; fi++)
3845     {
3846 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3847 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3848 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3849 ph10 427 if (eptr >= md->end_subject)
3850 ph10 426 {
3851 ph10 427 SCHECK_PARTIAL();
3852 ph10 426 RRETURN(MATCH_NOMATCH);
3853 ph10 427 }
3854 nigel 77 GETCHARINCTEST(c, eptr);
3855 ph10 349 prop_category = UCD_CATEGORY(c);
3856 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3857     while (eptr < md->end_subject)
3858     {
3859     int len = 1;
3860 ph10 426 if (!utf8) c = *eptr;
3861     else { GETCHARLEN(c, eptr, len); }
3862 ph10 349 prop_category = UCD_CATEGORY(c);
3863 nigel 77 if (prop_category != ucp_M) break;
3864     eptr += len;
3865     }
3866     }
3867     }
3868    
3869     else
3870     #endif /* SUPPORT_UCP */
3871    
3872     #ifdef SUPPORT_UTF8
3873     /* UTF-8 mode */
3874     if (utf8)
3875     {
3876     for (fi = min;; fi++)
3877     {
3878 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3879 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3880 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3881 ph10 427 if (eptr >= md->end_subject)
3882 ph10 426 {
3883 ph10 427 SCHECK_PARTIAL();
3884 ph10 426 RRETURN(MATCH_NOMATCH);
3885 ph10 427 }
3886 ph10 426 if (ctype == OP_ANY && IS_NEWLINE(eptr))
3887     RRETURN(MATCH_NOMATCH);
3888 nigel 77 GETCHARINC(c, eptr);
3889     switch(ctype)
3890     {
3891 ph10 342 case OP_ANY: /* This is the non-NL case */
3892 ph10 345 case OP_ALLANY:
3893 nigel 77 case OP_ANYBYTE:
3894     break;
3895    
3896 nigel 93 case OP_ANYNL:
3897     switch(c)
3898     {
3899     default: RRETURN(MATCH_NOMATCH);
3900     case 0x000d:
3901     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3902     break;
3903     case 0x000a:
3904 ph10 231 break;
3905    
3906 nigel 93 case 0x000b:
3907     case 0x000c:
3908     case 0x0085:
3909     case 0x2028:
3910     case 0x2029:
3911 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3912 nigel 93 break;
3913     }
3914     break;
3915    
3916 ph10 178 case OP_NOT_HSPACE:
3917     switch(c)
3918     {
3919     default: break;
3920     case 0x09: /* HT */
3921     case 0x20: /* SPACE */
3922     case 0xa0: /* NBSP */
3923     case 0x1680: /* OGHAM SPACE MARK */
3924     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3925     case 0x2000: /* EN QUAD */
3926     case 0x2001: /* EM QUAD */
3927     case 0x2002: /* EN SPACE */
3928     case 0x2003: /* EM SPACE */
3929     case 0x2004: /* THREE-PER-EM SPACE */
3930     case 0x2005: /* FOUR-PER-EM SPACE */
3931     case 0x2006: /* SIX-PER-EM SPACE */
3932     case 0x2007: /* FIGURE SPACE */
3933     case 0x2008: /* PUNCTUATION SPACE */
3934     case 0x2009: /* THIN SPACE */
3935     case 0x200A: /* HAIR SPACE */
3936     case 0x202f: /* NARROW NO-BREAK SPACE */
3937     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3938     case 0x3000: /* IDEOGRAPHIC SPACE */
3939     RRETURN(MATCH_NOMATCH);
3940     }
3941     break;
3942    
3943     case OP_HSPACE:
3944     switch(c)
3945     {
3946     default: RRETURN(MATCH_NOMATCH);
3947     case 0x09: /* HT */
3948     case 0x20: /* SPACE */
3949     case 0xa0: /* NBSP */
3950     case 0x1680: /* OGHAM SPACE MARK */
3951     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3952     case 0x2000: /* EN QUAD */
3953     case 0x2001: /* EM QUAD */
3954     case 0x2002: /* EN SPACE */
3955     case 0x2003: /* EM SPACE */
3956     case 0x2004: /* THREE-PER-EM SPACE */
3957     case 0x2005: /* FOUR-PER-EM SPACE */
3958     case 0x2006: /* SIX-PER-EM SPACE */
3959     case 0x2007: /* FIGURE SPACE */
3960     case 0x2008: /* PUNCTUATION SPACE */
3961     case 0x2009: /* THIN SPACE */
3962     case 0x200A: /* HAIR SPACE */
3963     case 0x202f: /* NARROW NO-BREAK SPACE */
3964     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3965     case 0x3000: /* IDEOGRAPHIC SPACE */
3966     break;
3967     }
3968     break;
3969    
3970     case OP_NOT_VSPACE:
3971     switch(c)
3972     {
3973     default: break;
3974     case 0x0a: /* LF */
3975     case 0x0b: /* VT */
3976     case 0x0c: /* FF */
3977     case 0x0d: /* CR */
3978     case 0x85: /* NEL */
3979     case 0x2028: /* LINE SEPARATOR */
3980     case 0x2029: /* PARAGRAPH SEPARATOR */
3981     RRETURN(MATCH_NOMATCH);
3982     }
3983     break;
3984    
3985     case OP_VSPACE:
3986     switch(c)
3987     {
3988     default: RRETURN(MATCH_NOMATCH);
3989     case 0x0a: /* LF */
3990     case 0x0b: /* VT */
3991     case 0x0c: /* FF */
3992     case 0x0d: /* CR */
3993     case 0x85: /* NEL */
3994