/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 444 - (hide annotations) (download)
Sun Sep 13 16:26:39 2009 UTC (4 years, 11 months ago) by ph10
File MIME type: text/plain
File size: 161641 byte(s)
Fix comment in code.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 381 Copyright (c) 1997-2009 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 nigel 77 /* Flag bits for the match() function */
61    
62 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
63     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64 nigel 77
65     /* Non-error returns from the match() function. Error returns are externally
66     defined PCRE_ERROR_xxx codes, which are all negative. */
67    
68     #define MATCH_MATCH 1
69     #define MATCH_NOMATCH 0
70    
71 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
72 ph10 210 negative to avoid the external error codes. */
73    
74     #define MATCH_COMMIT (-999)
75     #define MATCH_PRUNE (-998)
76     #define MATCH_SKIP (-997)
77     #define MATCH_THEN (-996)
78    
79 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
80     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81     because the offset vector is always a multiple of 3 long. */
82    
83     #define REC_STACK_SAVE_MAX 30
84    
85     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86    
/* Lower and upper repeat bounds, six entries in each table. A zero in rep_max
stands for "no upper limit". NOTE(review): presumably both tables are indexed
by the same repeat-opcode offset; the code that uses them is not in view here,
so confirm before relying on the entry order. */
87     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
88     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89    
90    
91    
92     #ifdef DEBUG
93     /*************************************************
94     * Debugging function to print chars *
95     *************************************************/
96    
97     /* Print a sequence of chars in printable format, stopping at the end of the
98     subject if requested.
99    
100     Arguments:
101     p points to characters
102     length number to print
103     is_subject TRUE if printing from within md->start_subject
104     md pointer to matching data block, if is_subject is TRUE
105    
106     Returns: nothing
107     */
108    
109     static void
110     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111     {
112 nigel 93 unsigned int c;
113 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114     while (length-- > 0)
115     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116     }
117     #endif
118    
119    
120    
121     /*************************************************
122     * Match a back-reference *
123     *************************************************/
124    
125     /* If a back reference hasn't been set, the length that is passed is greater
126     than the number of characters left in the string, so the match fails.
127    
128     Arguments:
129     offset index into the offset vector
130     eptr points into the subject
131     length length to be matched
132     md points to match data block
133     ims the ims flags
134    
135     Returns: TRUE if matched
136     */
137    
138     static BOOL
139 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 nigel 77 unsigned long int ims)
141     {
142 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
143 nigel 77
144     #ifdef DEBUG
145     if (eptr >= md->end_subject)
146     printf("matching subject <null>");
147     else
148     {
149     printf("matching subject ");
150     pchars(eptr, length, TRUE, md);
151     }
152     printf(" against backref ");
153     pchars(p, length, FALSE, md);
154     printf("\n");
155     #endif
156    
157     /* Always fail if not enough characters left */
158    
159     if (length > md->end_subject - eptr) return FALSE;
160    
161 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162     properly if Unicode properties are supported. Otherwise, we can check only
163     ASCII characters. */
164 nigel 77
165     if ((ims & PCRE_CASELESS) != 0)
166     {
167 ph10 354 #ifdef SUPPORT_UTF8
168     #ifdef SUPPORT_UCP
169     if (md->utf8)
170     {
171 ph10 358 USPTR endptr = eptr + length;
172 ph10 354 while (eptr < endptr)
173     {
174 ph10 358 int c, d;
175 ph10 354 GETCHARINC(c, eptr);
176     GETCHARINC(d, p);
177     if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 ph10 358 }
179     }
180 ph10 354 else
181     #endif
182     #endif
183    
184     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185     is no UCP support. */
186 ph10 358
187 nigel 77 while (length-- > 0)
188 ph10 354 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 nigel 77 }
190 ph10 358
191 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
192     are in UTF-8 mode. */
193 ph10 358
194 nigel 77 else
195     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196    
197     return TRUE;
198     }
199    
200    
201    
202     /***************************************************************************
203     ****************************************************************************
204     RECURSION IN THE match() FUNCTION
205    
206 nigel 87 The match() function is highly recursive, though not every recursive call
207     increases the recursive depth. Nevertheless, some regular expressions can cause
208     it to recurse to a great depth. I was writing for Unix, so I just let it call
209     itself recursively. This uses the stack for saving everything that has to be
210     saved for a recursive call. On Unix, the stack can be large, and this works
211     fine.
212 nigel 77
213 nigel 87 It turns out that on some non-Unix-like systems there are problems with
214     programs that use a lot of stack. (This despite the fact that every last chip
215     has oodles of memory these days, and techniques for extending the stack have
216     been known for decades.) So....
217 nigel 77
218     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219     calls by keeping local variables that need to be preserved in blocks of memory
220 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
221 nigel 77 achieve this so that the actual code doesn't look very different to what it
222     always used to.
223 ph10 164
224 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
225 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
226     Switzer, the use of longjmp() has been abolished, at the cost of having to
227     provide a unique number for each call to RMATCH. There is no way of generating
228     a sequence of numbers at compile time in C. I have given them names, to make
229     them stand out more clearly.
230    
231     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
234     don't have indeterminate values; this has meant that the frame size can be
235 ph10 164 reduced because the result can be "passed back" by straight setting of the
236     variable instead of being passed in the frame.
237 nigel 77 ****************************************************************************
238     ***************************************************************************/
239    
240 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241     below must be updated in sync. */
242 nigel 77
/* One identifier per RMATCH call site, numbered from 1. The identifier is
passed as the last (rw) argument of RMATCH; the heap-frame version of RMATCH
stores it in the current frame's Xwhere field and pastes it into the label
name L_<identifier> that marks where execution resumes. */
243 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
244     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
245     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
246     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
247 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
248 ph10 212 RM51, RM52, RM53, RM54 };
249 ph10 164
250 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
251 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 ph10 164 actually used in this definition. */
253 nigel 77
254     #ifndef NO_RECURSE
/* With true recursion, REGISTER (used on the eptr and ecode parameters of
match()) expands to the "register" storage class. */
255     #define REGISTER register
256 ph10 164
/* Debugging versions. RMATCH makes a genuine recursive call to match(),
taking mstart and rdepth from the caller's scope and leaving the result in
rrc; it prints the source line before and after the call. RRETURN prints the
value being returned (with no trailing newline) and then returns it. */
257 nigel 87 #ifdef DEBUG
258 ph10 164 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
259 nigel 87 { \
260     printf("match() called in line %d\n", __LINE__); \
261 ph10 168 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
262 nigel 87 printf("to line %d\n", __LINE__); \
263     }
264     #define RRETURN(ra) \
265     { \
266     printf("match() returned %d from line %d ", ra, __LINE__); \
267     return ra; \
268     }
/* Production versions: a plain recursive call whose result is stored in rrc,
and a plain return. */
269     #else
270 ph10 164 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
271 ph10 168 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
272 nigel 77 #define RRETURN(ra) return ra
273 nigel 87 #endif
274    
275 nigel 77 #else
276    
277    
278 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
279     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280     argument of match(), which never changes. */
281 nigel 77
282     #define REGISTER
283    
/* Heap-frame replacement for a recursive call to match(). A new frame is
obtained from pcre_stack_malloc(); the resume point (rw) is recorded in the
current frame's Xwhere field; the "arguments" are copied into the new frame,
which is linked back to the current one through Xprevframe; and control jumps
to HEAP_RECURSE. The label L_##rw marks where execution continues when control
comes back to this call site. The rd argument is accepted so that both
versions of the macro take the same arguments, but it is not used here.

If pcre_stack_malloc() fails and returns NULL, the current level is abandoned
with PCRE_ERROR_NOMEMORY by way of RRETURN, instead of writing through a null
pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
303    
/* Heap-frame replacement for "return". The current frame is unlinked and
handed to pcre_stack_free(). If a previous frame exists, the result is left in
rrc and control jumps to HEAP_RETURN; if there is none (Xprevframe was NULL,
i.e. this was the top-level frame) the value is returned from match() for
real. The order matters: Xprevframe is read before the frame is freed. */
304     #define RRETURN(ra)\
305     {\
306     heapframe *newframe = frame;\
307     frame = newframe->Xprevframe;\
308     (pcre_stack_free)(newframe);\
309     if (frame != NULL)\
310     {\
311 ph10 164 rrc = ra;\
312     goto HEAP_RETURN;\
313 nigel 77 }\
314     return ra;\
315     }
316    
317    
318     /* Structure for remembering the local variables in a private frame */
319    
/* One frame of the private, heap-based "stack" used when NO_RECURSE is
defined. Frames are chained through Xprevframe; the top-level frame has
Xprevframe set to NULL. Each X-prefixed field stands in for the argument or
local variable of match() with the same name minus the X (match() #defines
those names to frame->X...). */
320     typedef struct heapframe {
321     struct heapframe *Xprevframe;
322    
323     /* Function arguments that may change */
324    
325 ph10 409 USPTR Xeptr;
326 nigel 77 const uschar *Xecode;
327 ph10 409 USPTR Xmstart;
328 nigel 77 int Xoffset_top;
/* NOTE(review): Xims is a signed long int, whereas the ims parameter of
match() and the Xoriginal_ims field below are unsigned long int. Confirm
whether the signedness difference is intentional. */
329     long int Xims;
330     eptrblock *Xeptrb;
331     int Xflags;
332 nigel 91 unsigned int Xrdepth;
333 nigel 77
334     /* Function local variables */
335    
336 ph10 409 USPTR Xcallpat;
337 ph10 406 #ifdef SUPPORT_UTF8
338 ph10 409 USPTR Xcharptr;
339 ph10 406 #endif
340 ph10 409 USPTR Xdata;
341     USPTR Xnext;
342     USPTR Xpp;
343     USPTR Xprev;
344     USPTR Xsaved_eptr;
345 nigel 77
346     recursion_info Xnew_recursive;
347    
348     BOOL Xcur_is_word;
349     BOOL Xcondition;
350     BOOL Xprev_is_word;
351    
352     unsigned long int Xoriginal_ims;
353    
/* Present only when Unicode property support is compiled in */
354     #ifdef SUPPORT_UCP
355     int Xprop_type;
356 nigel 87 int Xprop_value;
357 nigel 77 int Xprop_fail_result;
358     int Xprop_category;
359     int Xprop_chartype;
360 nigel 87 int Xprop_script;
361 ph10 123 int Xoclength;
362     uschar Xocchars[8];
363 nigel 77 #endif
364    
365 ph10 403 int Xcodelink;
366 nigel 77 int Xctype;
367 nigel 93 unsigned int Xfc;
368 nigel 77 int Xfi;
369     int Xlength;
370     int Xmax;
371     int Xmin;
372     int Xnumber;
373     int Xoffset;
374     int Xop;
375     int Xsave_capture_last;
376     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
377     int Xstacksave[REC_STACK_SAVE_MAX];
378    
379     eptrblock Xnewptrb;
380    
381 ph10 164 /* Where to jump back to */
/* Set by RMATCH to its rw argument (one of the RMnn values) just before the
jump to HEAP_RECURSE. */
382 nigel 77
383 ph10 164 int Xwhere;
384 ph10 165
385 nigel 77 } heapframe;
386    
387     #endif
388    
389    
390     /***************************************************************************
391     ***************************************************************************/
392    
393    
394    
395     /*************************************************
396     * Match from current position *
397     *************************************************/
398    
399 nigel 93 /* This function is called recursively in many circumstances. Whenever it
400 nigel 77 returns a negative (error) response, the outer incarnation must also return the
401 ph10 426 same response. */
402 nigel 77
403 ph10 426 /* These macros pack up tests that are used for partial matching, and which
404     appear several times in the code. We set the "hit end" flag if the pointer is
405     at the end of the subject and also past the start of the subject (i.e.
406 ph10 427 something has been matched). For hard partial matching, we then return
407     immediately. The second one is used when we already know we are past the end of
408     the subject. */
409 ph10 426
/* CHECK_PARTIAL: when partial matching is enabled (md->partial non-zero), the
subject pointer has reached md->end_subject, and it is past mstart, record the
fact in md->hitend. If md->partial is greater than 1, additionally leave
match() at once with PCRE_ERROR_PARTIAL. Both macros expand to an unbraced
"if" statement and use RRETURN, so they are only usable inside match(). */
410     #define CHECK_PARTIAL()\
411 ph10 435 if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
412 ph10 427 {\
413     md->hitend = TRUE;\
414     if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
415     }
416 ph10 426
/* SCHECK_PARTIAL: the same, but without the end-of-subject comparison, for
call sites that have already established that the end has been reached. */
417     #define SCHECK_PARTIAL()\
418 ph10 427 if (md->partial && eptr > mstart)\
419     {\
420     md->hitend = TRUE;\
421     if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
422     }
423 ph10 426
424 ph10 427
425 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
426     the md structure (e.g. utf8, end_subject) into individual variables to improve
427 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
428     made performance worse.
429    
430     Arguments:
431 nigel 93 eptr pointer to current character in subject
432     ecode pointer to current position in compiled code
433 ph10 168 mstart pointer to the current match start position (can be modified
434 ph10 172 by encountering \K)
435 nigel 77 offset_top current top pointer
436     md pointer to "static" info for the match
437     ims current /i, /m, and /s options
438     eptrb pointer to chain of blocks containing eptr at start of
439     brackets - for testing for empty matches
440     flags can contain
441     match_condassert - this is an assertion condition
442 nigel 93 match_cbegroup - this is the start of an unlimited repeat
443     group that can match an empty string
444 nigel 87 rdepth the recursion depth
445 nigel 77
446     Returns: MATCH_MATCH if matched ) these values are >= 0
447     MATCH_NOMATCH if failed to match )
448     a negative PCRE_ERROR_xxx value if aborted by an error condition
449 nigel 87 (e.g. stopped by repeated call or recursion limit)
450 nigel 77 */
451    
452     static int
453 ph10 409 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
454 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
455 nigel 91 int flags, unsigned int rdepth)
456 nigel 77 {
457     /* These variables do not need to be preserved over recursion in this function,
458 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
459     "register" because they are used a lot in loops. */
460 nigel 77
461 nigel 91 register int rrc; /* Returns from recursive calls */
462     register int i; /* Used for loops not involving calls to RMATCH() */
463 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
464 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
465 nigel 77
466 nigel 93 BOOL minimize, possessive; /* Quantifier options */
467 ph10 403 int condcode;
468 nigel 93
469 nigel 77 /* When recursion is not being used, all "local" variables that have to be
470     preserved over calls to RMATCH() are part of a "frame" which is obtained from
471     heap storage. Set up the top-level frame here; others are obtained from the
472     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
473    
474     #ifdef NO_RECURSE
475     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
476     frame->Xprevframe = NULL; /* Marks the top level */
477    
478     /* Copy in the original argument variables */
479    
480     frame->Xeptr = eptr;
481     frame->Xecode = ecode;
482 ph10 168 frame->Xmstart = mstart;
483 nigel 77 frame->Xoffset_top = offset_top;
484     frame->Xims = ims;
485     frame->Xeptrb = eptrb;
486     frame->Xflags = flags;
487 nigel 87 frame->Xrdepth = rdepth;
488 nigel 77
489     /* This is where control jumps back to to effect "recursion" */
490    
491     HEAP_RECURSE:
492    
493     /* Macros make the argument variables come from the current frame */
494    
495     #define eptr frame->Xeptr
496     #define ecode frame->Xecode
497 ph10 168 #define mstart frame->Xmstart
498 nigel 77 #define offset_top frame->Xoffset_top
499     #define ims frame->Xims
500     #define eptrb frame->Xeptrb
501     #define flags frame->Xflags
502 nigel 87 #define rdepth frame->Xrdepth
503 nigel 77
504     /* Ditto for the local variables */
505    
506     #ifdef SUPPORT_UTF8
507     #define charptr frame->Xcharptr
508     #endif
509     #define callpat frame->Xcallpat
510 ph10 403 #define codelink frame->Xcodelink
511 nigel 77 #define data frame->Xdata
512     #define next frame->Xnext
513     #define pp frame->Xpp
514     #define prev frame->Xprev
515     #define saved_eptr frame->Xsaved_eptr
516    
517     #define new_recursive frame->Xnew_recursive
518    
519     #define cur_is_word frame->Xcur_is_word
520     #define condition frame->Xcondition
521     #define prev_is_word frame->Xprev_is_word
522    
523     #define original_ims frame->Xoriginal_ims
524    
525     #ifdef SUPPORT_UCP
526     #define prop_type frame->Xprop_type
527 nigel 87 #define prop_value frame->Xprop_value
528 nigel 77 #define prop_fail_result frame->Xprop_fail_result
529     #define prop_category frame->Xprop_category
530     #define prop_chartype frame->Xprop_chartype
531 nigel 87 #define prop_script frame->Xprop_script
532 ph10 115 #define oclength frame->Xoclength
533     #define occhars frame->Xocchars
534 nigel 77 #endif
535    
536     #define ctype frame->Xctype
537     #define fc frame->Xfc
538     #define fi frame->Xfi
539     #define length frame->Xlength
540     #define max frame->Xmax
541     #define min frame->Xmin
542     #define number frame->Xnumber
543     #define offset frame->Xoffset
544     #define op frame->Xop
545     #define save_capture_last frame->Xsave_capture_last
546     #define save_offset1 frame->Xsave_offset1
547     #define save_offset2 frame->Xsave_offset2
548     #define save_offset3 frame->Xsave_offset3
549     #define stacksave frame->Xstacksave
550    
551     #define newptrb frame->Xnewptrb
552    
553     /* When recursion is being used, local variables are allocated on the stack and
554     get preserved during recursion in the normal way. In this environment, fi and
555     i, and fc and c, can be the same variables. */
556    
557 nigel 93 #else /* NO_RECURSE not defined */
558 nigel 77 #define fi i
559     #define fc c
560    
561    
562 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
563     const uschar *charptr; /* in small blocks of the code. My normal */
564     #endif /* style of coding would have declared */
565     const uschar *callpat; /* them within each of those blocks. */
566     const uschar *data; /* However, in order to accommodate the */
567     const uschar *next; /* version of this code that uses an */
568     USPTR pp; /* external "stack" implemented on the */
569     const uschar *prev; /* heap, it is easier to declare them all */
570     USPTR saved_eptr; /* here, so the declarations can be cut */
571     /* out in a block. The only declarations */
572     recursion_info new_recursive; /* within blocks below are for variables */
573     /* that do not have to be preserved over */
574     BOOL cur_is_word; /* a recursive call to RMATCH(). */
575     BOOL condition;
576 nigel 77 BOOL prev_is_word;
577    
578     unsigned long int original_ims;
579    
580     #ifdef SUPPORT_UCP
581     int prop_type;
582 nigel 87 int prop_value;
583 nigel 77 int prop_fail_result;
584     int prop_category;
585     int prop_chartype;
586 nigel 87 int prop_script;
587 ph10 115 int oclength;
588     uschar occhars[8];
589 nigel 77 #endif
590    
591 ph10 399 int codelink;
592 nigel 77 int ctype;
593     int length;
594     int max;
595     int min;
596     int number;
597     int offset;
598     int op;
599     int save_capture_last;
600     int save_offset1, save_offset2, save_offset3;
601     int stacksave[REC_STACK_SAVE_MAX];
602    
603     eptrblock newptrb;
604 nigel 93 #endif /* NO_RECURSE */
605 nigel 77
606     /* These statements are here to stop the compiler complaining about uninitialized
607     variables. */
608    
609     #ifdef SUPPORT_UCP
610 nigel 87 prop_value = 0;
611 nigel 77 prop_fail_result = 0;
612     #endif
613    
614 nigel 93
615 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
616     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
617     used. Thanks to Ian Taylor for noticing this possibility and sending the
618     original patch. */
619    
620     TAIL_RECURSE:
621    
622 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
623     are specified by the macro RMATCH and RRETURN is used to return. When
624     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
625     and a "return", respectively (possibly with some debugging if DEBUG is
626     defined). However, RMATCH isn't like a function call because it's quite a
627     complicated macro. It has to be used in one particular way. This shouldn't,
628     however, impact performance when true recursion is being used. */
629 nigel 77
630 ph10 164 #ifdef SUPPORT_UTF8
631     utf8 = md->utf8; /* Local copy of the flag */
632     #else
633     utf8 = FALSE;
634     #endif
635    
636 nigel 87 /* First check that we haven't called match() too many times, or that we
637     haven't exceeded the recursive call limit. */
638    
639 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
640 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
641 nigel 77
642     original_ims = ims; /* Save for resetting on ')' */
643 nigel 91
644 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
645     string, the match_cbegroup flag is set. When this is the case, add the current
646     subject pointer to the chain of such remembered pointers, to be checked when we
647     hit the closing ket, in order to break infinite loops that match no characters.
648 ph10 197 When match() is called in other circumstances, don't add to the chain. The
649     match_cbegroup flag must NOT be used with tail recursion, because the memory
650     block that is used is on the stack, so a new one may be required for each
651     match(). */
652 nigel 77
653 nigel 93 if ((flags & match_cbegroup) != 0)
654 nigel 77 {
655 ph10 197 newptrb.epb_saved_eptr = eptr;
656     newptrb.epb_prev = eptrb;
657     eptrb = &newptrb;
658 nigel 77 }
659    
660 nigel 93 /* Now start processing the opcodes. */
661 nigel 77
662     for (;;)
663     {
664 nigel 93 minimize = possessive = FALSE;
665 nigel 77 op = *ecode;
666 ph10 443
667 nigel 93 switch(op)
668     {
669 ph10 210 case OP_FAIL:
670 ph10 212 RRETURN(MATCH_NOMATCH);
671 ph10 211
672 ph10 210 case OP_PRUNE:
673     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
674     ims, eptrb, flags, RM51);
675     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
676 ph10 212 RRETURN(MATCH_PRUNE);
677 ph10 211
678 ph10 210 case OP_COMMIT:
679     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
680     ims, eptrb, flags, RM52);
681     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
682 ph10 212 RRETURN(MATCH_COMMIT);
683 ph10 211
684 ph10 210 case OP_SKIP:
685     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
686     ims, eptrb, flags, RM53);
687     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
688 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
689 ph10 212 RRETURN(MATCH_SKIP);
690 ph10 211
691 ph10 210 case OP_THEN:
692     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
693 ph10 212 ims, eptrb, flags, RM54);
694 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
695 ph10 212 RRETURN(MATCH_THEN);
696 ph10 211
697 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
698     the current subject position in the working slot at the top of the vector.
699     We mustn't change the current values of the data slot, because they may be
700     set from a previous iteration of this group, and be referred to by a
701     reference inside the group.
702 nigel 77
703 nigel 93 If the bracket fails to match, we need to restore this value and also the
704     values of the final offsets, in case they were set by a previous iteration
705     of the same bracket.
706 nigel 77
707 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
708     a non-capturing bracket. Don't worry about setting the flag for the error
709     case here; that is handled in the code for KET. */
710 nigel 77
711 nigel 93 case OP_CBRA:
712     case OP_SCBRA:
713     number = GET2(ecode, 1+LINK_SIZE);
714 nigel 77 offset = number << 1;
715    
716     #ifdef DEBUG
717 nigel 93 printf("start bracket %d\n", number);
718     printf("subject=");
719 nigel 77 pchars(eptr, 16, TRUE, md);
720     printf("\n");
721     #endif
722    
723     if (offset < md->offset_max)
724     {
725     save_offset1 = md->offset_vector[offset];
726     save_offset2 = md->offset_vector[offset+1];
727     save_offset3 = md->offset_vector[md->offset_end - number];
728     save_capture_last = md->capture_last;
729    
730     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
731     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
732    
733 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
734 nigel 77 do
735     {
736 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
737     ims, eptrb, flags, RM1);
738 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
739 nigel 77 md->capture_last = save_capture_last;
740     ecode += GET(ecode, 1);
741     }
742     while (*ecode == OP_ALT);
743    
744     DPRINTF(("bracket %d failed\n", number));
745    
746     md->offset_vector[offset] = save_offset1;
747     md->offset_vector[offset+1] = save_offset2;
748     md->offset_vector[md->offset_end - number] = save_offset3;
749    
750     RRETURN(MATCH_NOMATCH);
751     }
752    
753 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
754     as a non-capturing bracket. */
755 nigel 77
756 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
757     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
758    
759 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
760 nigel 77
761 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
762     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
763    
764 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
765     final alternative within the brackets, we would return the result of a
766     recursive call to match() whatever happened. We can reduce stack usage by
767 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
768     is set.*/
769 nigel 77
770 nigel 93 case OP_BRA:
771     case OP_SBRA:
772     DPRINTF(("start non-capturing bracket\n"));
773     flags = (op >= OP_SBRA)? match_cbegroup : 0;
774 nigel 91 for (;;)
775 nigel 77 {
776 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
777 nigel 93 {
778 ph10 197 if (flags == 0) /* Not a possibly empty group */
779     {
780     ecode += _pcre_OP_lengths[*ecode];
781     DPRINTF(("bracket 0 tail recursion\n"));
782     goto TAIL_RECURSE;
783     }
784    
785     /* Possibly empty group; can't use tail recursion. */
786    
787     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
788     eptrb, flags, RM48);
789     RRETURN(rrc);
790 nigel 93 }
791 nigel 91
792     /* For non-final alternatives, continue the loop for a NOMATCH result;
793     otherwise return. */
794    
795 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
796     eptrb, flags, RM2);
797 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
798 nigel 77 ecode += GET(ecode, 1);
799     }
800 nigel 91 /* Control never reaches here. */
801 nigel 77
802     /* Conditional group: compilation checked that there are no more than
803     two branches. If the condition is false, skipping the first branch takes us
804     past the end if there is only one branch, but that's OK because that is
805 nigel 91 exactly what going to the ket would do. As there is only one branch to be
806     obeyed, we can use tail recursion to avoid using another stack frame. */
807 nigel 77
808     case OP_COND:
809 nigel 93 case OP_SCOND:
810 ph10 399 codelink= GET(ecode, 1);
811 ph10 406
812 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
813     inserted between OP_COND and an assertion condition. */
814 ph10 392
815 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
816     {
817     if (pcre_callout != NULL)
818     {
819     pcre_callout_block cb;
820     cb.version = 1; /* Version 1 of the callout block */
821     cb.callout_number = ecode[LINK_SIZE+2];
822     cb.offset_vector = md->offset_vector;
823     cb.subject = (PCRE_SPTR)md->start_subject;
824     cb.subject_length = md->end_subject - md->start_subject;
825     cb.start_match = mstart - md->start_subject;
826     cb.current_position = eptr - md->start_subject;
827     cb.pattern_position = GET(ecode, LINK_SIZE + 3);
828     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
829     cb.capture_top = offset_top/2;
830     cb.capture_last = md->capture_last;
831     cb.callout_data = md->callout_data;
832     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
833     if (rrc < 0) RRETURN(rrc);
834     }
835     ecode += _pcre_OP_lengths[OP_CALLOUT];
836     }
837 ph10 392
838 ph10 399 condcode = ecode[LINK_SIZE+1];
839 ph10 406
840 ph10 381 /* Now see what the actual condition is */
841 ph10 392
842 ph10 399 if (condcode == OP_RREF) /* Recursion test */
843 nigel 77 {
844 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
845     condition = md->recursive != NULL &&
846     (offset == RREF_ANY || offset == md->recursive->group_num);
847     ecode += condition? 3 : GET(ecode, 1);
848     }
849    
850 ph10 399 else if (condcode == OP_CREF) /* Group used test */
851 nigel 93 {
852 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
853 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
854     ecode += condition? 3 : GET(ecode, 1);
855 nigel 77 }
856    
857 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
858 nigel 93 {
859     condition = FALSE;
860     ecode += GET(ecode, 1);
861     }
862    
863 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
864 nigel 93 the flags argument to match_condassert causes it to stop at the end of an
865     assertion. */
866 nigel 77
867     else
868     {
869 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
870     match_condassert, RM3);
871 nigel 77 if (rrc == MATCH_MATCH)
872     {
873 nigel 93 condition = TRUE;
874     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
875 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
876     }
877 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
878 nigel 77 {
879     RRETURN(rrc); /* Need braces because of following else */
880     }
881 nigel 93 else
882     {
883     condition = FALSE;
884 ph10 399 ecode += codelink;
885 nigel 93 }
886     }
887 nigel 91
888 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
889 ph10 197 we can use tail recursion to avoid using another stack frame, except when
890     match_cbegroup is required for an unlimited repeat of a possibly empty
891     group. If the second alternative doesn't exist, we can just plough on. */
892 nigel 91
893 nigel 93 if (condition || *ecode == OP_ALT)
894     {
895 nigel 91 ecode += 1 + LINK_SIZE;
896 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
897     {
898     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
899     RRETURN(rrc);
900     }
901     else /* Group must match something */
902     {
903     flags = 0;
904     goto TAIL_RECURSE;
905     }
906 nigel 77 }
907 ph10 395 else /* Condition false & no alternative */
908 nigel 93 {
909     ecode += 1 + LINK_SIZE;
910     }
911     break;
912 nigel 77
913    
914 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
915     recursion, we should restore the offsets appropriately and continue from
916     after the call. */
917 nigel 77
918 ph10 210 case OP_ACCEPT:
919 nigel 77 case OP_END:
920     if (md->recursive != NULL && md->recursive->group_num == 0)
921     {
922     recursion_info *rec = md->recursive;
923 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
924 nigel 77 md->recursive = rec->prevrec;
925     memmove(md->offset_vector, rec->offset_save,
926     rec->saved_max * sizeof(int));
927 ph10 168 mstart = rec->save_start;
928 nigel 77 ims = original_ims;
929     ecode = rec->after_call;
930     break;
931     }
932    
933 ph10 442 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
934     set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
935     the subject. In both cases, backtracking will then try other alternatives,
936     if any. */
937 ph10 443
938 ph10 442 if (eptr == mstart &&
939     (md->notempty ||
940 ph10 443 (md->notempty_atstart &&
941 ph10 442 mstart == md->start_subject + md->start_offset)))
942 ph10 443 RRETURN(MATCH_NOMATCH);
943    
944 ph10 442 /* Otherwise, we have a match. */
945 nigel 77
946 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
947     md->end_offset_top = offset_top; /* and how many extracts were taken */
948 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
949 nigel 77 RRETURN(MATCH_MATCH);
950    
951     /* Change option settings */
952    
953     case OP_OPT:
954     ims = ecode[1];
955     ecode += 2;
956     DPRINTF(("ims set to %02lx\n", ims));
957     break;
958    
959     /* Assertion brackets. Check the alternative branches in turn - the
960     matching won't pass the KET for an assertion. If any one branch matches,
961     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
962     start of each branch to move the current point backwards, so the code at
963     this level is identical to the lookahead case. */
964    
965     case OP_ASSERT:
966     case OP_ASSERTBACK:
967     do
968     {
969 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
970     RM4);
971 nigel 77 if (rrc == MATCH_MATCH) break;
972 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
973 nigel 77 ecode += GET(ecode, 1);
974     }
975     while (*ecode == OP_ALT);
976     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
977    
978     /* If checking an assertion for a condition, return MATCH_MATCH. */
979    
980     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
981    
982     /* Continue from after the assertion, updating the offsets high water
983     mark, since extracts may have been taken during the assertion. */
984    
985     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
986     ecode += 1 + LINK_SIZE;
987     offset_top = md->end_offset_top;
988     continue;
989    
990     /* Negative assertion: all branches must fail to match */
991    
992     case OP_ASSERT_NOT:
993     case OP_ASSERTBACK_NOT:
994     do
995     {
996 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
997     RM5);
998 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
999 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1000 nigel 77 ecode += GET(ecode,1);
1001     }
1002     while (*ecode == OP_ALT);
1003    
1004     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1005    
1006     ecode += 1 + LINK_SIZE;
1007     continue;
1008    
1009     /* Move the subject pointer back. This occurs only at the start of
1010     each branch of a lookbehind assertion. If we are too close to the start to
1011     move back, this match function fails. When working with UTF-8 we move
1012     back a number of characters, not bytes. */
1013    
1014     case OP_REVERSE:
1015     #ifdef SUPPORT_UTF8
1016     if (utf8)
1017     {
1018 nigel 93 i = GET(ecode, 1);
1019     while (i-- > 0)
1020 nigel 77 {
1021     eptr--;
1022     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1023 ph10 207 BACKCHAR(eptr);
1024 nigel 77 }
1025     }
1026     else
1027     #endif
1028    
1029     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1030    
1031     {
1032 nigel 93 eptr -= GET(ecode, 1);
1033 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1034     }
1035    
1036 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1037 nigel 77
1038 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1039 nigel 77 ecode += 1 + LINK_SIZE;
1040     break;
1041    
1042     /* The callout item calls an external function, if one is provided, passing
1043     details of the match so far. This is mainly for debugging, though the
1044     function is able to force a failure. */
1045    
1046     case OP_CALLOUT:
1047     if (pcre_callout != NULL)
1048     {
1049     pcre_callout_block cb;
1050     cb.version = 1; /* Version 1 of the callout block */
1051     cb.callout_number = ecode[1];
1052     cb.offset_vector = md->offset_vector;
1053 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1054 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
1055 ph10 168 cb.start_match = mstart - md->start_subject;
1056 nigel 77 cb.current_position = eptr - md->start_subject;
1057     cb.pattern_position = GET(ecode, 2);
1058     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1059     cb.capture_top = offset_top/2;
1060     cb.capture_last = md->capture_last;
1061     cb.callout_data = md->callout_data;
1062     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1063     if (rrc < 0) RRETURN(rrc);
1064     }
1065     ecode += 2 + 2*LINK_SIZE;
1066     break;
1067    
1068     /* Recursion either matches the current regex, or some subexpression. The
1069     offset data is the offset to the starting bracket from the start of the
1070     whole pattern. (This is so that it works from duplicated subpatterns.)
1071    
1072     If there are any capturing brackets started but not finished, we have to
1073     save their starting points and reinstate them after the recursion. However,
1074     we don't know how many such there are (offset_top records the completed
1075     total) so we just have to save all the potential data. There may be up to
1076     65535 such values, which is too large to put on the stack, but using malloc
1077     for small numbers seems expensive. As a compromise, the stack is used when
1078     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1079     is used. If the malloc fails, the recursion cannot proceed, and
1080     PCRE_ERROR_NOMEMORY is returned from this call of match() instead of
1081     attempting the recursive match.
1082    
1083     There are also other values that have to be saved. We use a chained
1084     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1085     for the original version of this logic. */
1086    
1087     case OP_RECURSE:
1088     {
1089     callpat = md->start_code + GET(ecode, 1);
1090 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1091     GET2(callpat, 1 + LINK_SIZE);
1092 nigel 77
1093     /* Add to "recursing stack" */
1094    
1095     new_recursive.prevrec = md->recursive;
1096     md->recursive = &new_recursive;
1097    
1098     /* Find where to continue from afterwards */
1099    
1100     ecode += 1 + LINK_SIZE;
1101     new_recursive.after_call = ecode;
1102    
1103     /* Now save the offset data. */
1104    
1105     new_recursive.saved_max = md->offset_end;
1106     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1107     new_recursive.offset_save = stacksave;
1108     else
1109     {
1110     new_recursive.offset_save =
1111     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1112     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1113     }
1114    
1115     memcpy(new_recursive.offset_save, md->offset_vector,
1116     new_recursive.saved_max * sizeof(int));
1117 ph10 168 new_recursive.save_start = mstart;
1118     mstart = eptr;
1119 nigel 77
1120     /* OK, now we can do the recursion. For each top-level alternative we
1121     restore the offset and recursion data. */
1122    
1123     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1124 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1125 nigel 77 do
1126     {
1127 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1128     md, ims, eptrb, flags, RM6);
1129 nigel 77 if (rrc == MATCH_MATCH)
1130     {
1131 nigel 87 DPRINTF(("Recursion matched\n"));
1132 nigel 77 md->recursive = new_recursive.prevrec;
1133     if (new_recursive.offset_save != stacksave)
1134     (pcre_free)(new_recursive.offset_save);
1135     RRETURN(MATCH_MATCH);
1136     }
1137 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1138 nigel 87 {
1139     DPRINTF(("Recursion gave error %d\n", rrc));
1140 ph10 400 if (new_recursive.offset_save != stacksave)
1141     (pcre_free)(new_recursive.offset_save);
1142 nigel 87 RRETURN(rrc);
1143     }
1144 nigel 77
1145     md->recursive = &new_recursive;
1146     memcpy(md->offset_vector, new_recursive.offset_save,
1147     new_recursive.saved_max * sizeof(int));
1148     callpat += GET(callpat, 1);
1149     }
1150     while (*callpat == OP_ALT);
1151    
1152     DPRINTF(("Recursion didn't match\n"));
1153     md->recursive = new_recursive.prevrec;
1154     if (new_recursive.offset_save != stacksave)
1155     (pcre_free)(new_recursive.offset_save);
1156     RRETURN(MATCH_NOMATCH);
1157     }
1158     /* Control never reaches here */
1159    
1160     /* "Once" brackets are like assertion brackets except that after a match,
1161     the point in the subject string is not moved back. Thus there can never be
1162     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1163     Check the alternative branches in turn - the matching won't pass the KET
1164     for this kind of subpattern. If any one branch matches, we carry on as at
1165     the end of a normal bracket, leaving the subject pointer. */
1166    
1167     case OP_ONCE:
1168 nigel 91 prev = ecode;
1169     saved_eptr = eptr;
1170    
1171     do
1172 nigel 77 {
1173 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1174 nigel 91 if (rrc == MATCH_MATCH) break;
1175 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1176 nigel 91 ecode += GET(ecode,1);
1177     }
1178     while (*ecode == OP_ALT);
1179 nigel 77
1180 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1181 nigel 77
1182 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1183 nigel 77
1184 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1185     mark, since extracts may have been taken. */
1186 nigel 77
1187 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1188 nigel 77
1189 nigel 91 offset_top = md->end_offset_top;
1190     eptr = md->end_match_ptr;
1191 nigel 77
1192 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1193     happens for a repeating ket if no characters were matched in the group.
1194     This is the forcible breaking of infinite loops as implemented in Perl
1195     5.005. If there is an options reset, it will get obeyed in the normal
1196     course of events. */
1197 nigel 77
1198 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1199     {
1200     ecode += 1+LINK_SIZE;
1201     break;
1202     }
1203 nigel 77
1204 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1205     preceding bracket, in the appropriate order. The second "call" of match()
1206     uses tail recursion, to avoid using another stack frame. We need to reset
1207     any options that changed within the bracket before re-running it, so
1208     check the next opcode. */
1209 nigel 77
1210 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1211     {
1212     ims = (ims & ~PCRE_IMS) | ecode[4];
1213     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1214     }
1215 nigel 77
1216 nigel 91 if (*ecode == OP_KETRMIN)
1217     {
1218 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1219 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1220     ecode = prev;
1221 ph10 197 flags = 0;
1222 nigel 91 goto TAIL_RECURSE;
1223 nigel 77 }
1224 nigel 91 else /* OP_KETRMAX */
1225     {
1226 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1227 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1228     ecode += 1 + LINK_SIZE;
1229 ph10 197 flags = 0;
1230 nigel 91 goto TAIL_RECURSE;
1231     }
1232     /* Control never gets here */
1233 nigel 77
1234     /* An alternation is the end of a branch; scan along to find the end of the
1235     bracketed group and go to there. */
1236    
1237     case OP_ALT:
1238     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1239     break;
1240    
1241 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1242     indicating that it may occur zero times. It may repeat infinitely, or not
1243     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1244     with fixed upper repeat limits are compiled as a number of copies, with the
1245     optional ones preceded by BRAZERO or BRAMINZERO. */
1246 nigel 77
1247     case OP_BRAZERO:
1248     {
1249     next = ecode+1;
1250 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1251 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1252     do next += GET(next,1); while (*next == OP_ALT);
1253 nigel 93 ecode = next + 1 + LINK_SIZE;
1254 nigel 77 }
1255     break;
1256    
1257     case OP_BRAMINZERO:
1258     {
1259     next = ecode+1;
1260 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1261 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1262 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1263     ecode++;
1264     }
1265     break;
1266    
1267 ph10 335 case OP_SKIPZERO:
1268     {
1269     next = ecode+1;
1270     do next += GET(next,1); while (*next == OP_ALT);
1271     ecode = next + 1 + LINK_SIZE;
1272     }
1273     break;
1274    
1275 nigel 93 /* End of a group, repeated or non-repeating. */
1276 nigel 77
1277     case OP_KET:
1278     case OP_KETRMIN:
1279     case OP_KETRMAX:
1280 nigel 91 prev = ecode - GET(ecode, 1);
1281 nigel 77
1282 nigel 93 /* If this was a group that remembered the subject start, in order to break
1283     infinite repeats of empty string matches, retrieve the subject start from
1284     the chain. Otherwise, set it to NULL. */
1285 nigel 77
1286 nigel 93 if (*prev >= OP_SBRA)
1287     {
1288     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1289     eptrb = eptrb->epb_prev; /* Backup to previous group */
1290     }
1291     else saved_eptr = NULL;
1292 nigel 77
1293 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1294     MATCH_MATCH, but record the current high water mark for use by positive
1295     assertions. Do this also for the "once" (atomic) groups. */
1296    
1297 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1298     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1299     *prev == OP_ONCE)
1300     {
1301     md->end_match_ptr = eptr; /* For ONCE */
1302     md->end_offset_top = offset_top;
1303     RRETURN(MATCH_MATCH);
1304     }
1305 nigel 77
1306 nigel 93 /* For capturing groups we have to check the group number back at the start
1307     and if necessary complete handling an extraction by setting the offsets and
1308     bumping the high water mark. Note that whole-pattern recursion is coded as
1309     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1310     when the OP_END is reached. Other recursion is handled here. */
1311 nigel 77
1312 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1313 nigel 91 {
1314 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1315 nigel 91 offset = number << 1;
1316 nigel 77
1317     #ifdef DEBUG
1318 nigel 91 printf("end bracket %d", number);
1319     printf("\n");
1320 nigel 77 #endif
1321    
1322 nigel 93 md->capture_last = number;
1323     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1324 nigel 91 {
1325 nigel 93 md->offset_vector[offset] =
1326     md->offset_vector[md->offset_end - number];
1327     md->offset_vector[offset+1] = eptr - md->start_subject;
1328     if (offset_top <= offset) offset_top = offset + 2;
1329     }
1330 nigel 77
1331 nigel 93 /* Handle a recursively called group. Restore the offsets
1332     appropriately and continue from after the call. */
1333 nigel 77
1334 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1335     {
1336     recursion_info *rec = md->recursive;
1337     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1338     md->recursive = rec->prevrec;
1339 ph10 168 mstart = rec->save_start;
1340 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1341     rec->saved_max * sizeof(int));
1342     ecode = rec->after_call;
1343     ims = original_ims;
1344     break;
1345 nigel 77 }
1346 nigel 91 }
1347 nigel 77
1348 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1349     flags, in case they got changed during the group. */
1350 nigel 77
1351 nigel 91 ims = original_ims;
1352     DPRINTF(("ims reset to %02lx\n", ims));
1353 nigel 77
1354 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1355     happens for a repeating ket if no characters were matched in the group.
1356     This is the forcible breaking of infinite loops as implemented in Perl
1357     5.005. If there is an options reset, it will get obeyed in the normal
1358     course of events. */
1359 nigel 77
1360 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1361     {
1362     ecode += 1 + LINK_SIZE;
1363     break;
1364     }
1365 nigel 77
1366 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1367     preceding bracket, in the appropriate order. In the second case, we can use
1368 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1369     unlimited repeat of a group that can match an empty string. */
1370 nigel 77
1371 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1372    
1373 nigel 91 if (*ecode == OP_KETRMIN)
1374     {
1375 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1376 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1377 ph10 197 if (flags != 0) /* Could match an empty string */
1378     {
1379     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1380     RRETURN(rrc);
1381     }
1382 nigel 91 ecode = prev;
1383     goto TAIL_RECURSE;
1384 nigel 77 }
1385 nigel 91 else /* OP_KETRMAX */
1386     {
1387 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1388 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1389     ecode += 1 + LINK_SIZE;
1390 ph10 197 flags = 0;
1391 nigel 91 goto TAIL_RECURSE;
1392     }
1393     /* Control never gets here */
1394 nigel 77
1395     /* Start of subject unless notbol, or after internal newline if multiline */
1396    
1397     case OP_CIRC:
1398     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1399     if ((ims & PCRE_MULTILINE) != 0)
1400     {
1401 nigel 91 if (eptr != md->start_subject &&
1402 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1403 nigel 77 RRETURN(MATCH_NOMATCH);
1404     ecode++;
1405     break;
1406     }
1407     /* ... else fall through */
1408    
1409     /* Start of subject assertion */
1410    
1411     case OP_SOD:
1412     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1413     ecode++;
1414     break;
1415    
1416     /* Start of match assertion */
1417    
1418     case OP_SOM:
1419     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1420     ecode++;
1421     break;
1422 ph10 172
1423 ph10 168 /* Reset the start of match point */
1424 ph10 172
1425 ph10 168 case OP_SET_SOM:
1426     mstart = eptr;
1427 ph10 172 ecode++;
1428     break;
1429 nigel 77
1430     /* Assert before internal newline if multiline, or before a terminating
1431     newline unless endonly is set, else end of subject unless noteol is set. */
1432    
1433     case OP_DOLL:
1434     if ((ims & PCRE_MULTILINE) != 0)
1435     {
1436     if (eptr < md->end_subject)
1437 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1438 nigel 77 else
1439     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1440     ecode++;
1441     break;
1442     }
1443     else
1444     {
1445     if (md->noteol) RRETURN(MATCH_NOMATCH);
1446     if (!md->endonly)
1447     {
1448 nigel 91 if (eptr != md->end_subject &&
1449 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1450 nigel 77 RRETURN(MATCH_NOMATCH);
1451     ecode++;
1452     break;
1453     }
1454     }
1455 nigel 91 /* ... else fall through for endonly */
1456 nigel 77
1457     /* End of subject assertion (\z) */
1458    
1459     case OP_EOD:
1460     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1461     ecode++;
1462     break;
1463    
1464     /* End of subject or ending \n assertion (\Z) */
1465    
1466     case OP_EODN:
1467 nigel 91 if (eptr != md->end_subject &&
1468 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1469 nigel 91 RRETURN(MATCH_NOMATCH);
1470 nigel 77 ecode++;
1471     break;
1472    
1473     /* Word boundary assertions */
1474    
1475     case OP_NOT_WORD_BOUNDARY:
1476     case OP_WORD_BOUNDARY:
1477     {
1478    
1479     /* Find out if the previous and current characters are "word" characters.
1480     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1481 ph10 443 be "non-word" characters. Remember the earliest consulted character for
1482 ph10 435 partial matching. */
1483 nigel 77
1484     #ifdef SUPPORT_UTF8
1485     if (utf8)
1486     {
1487     if (eptr == md->start_subject) prev_is_word = FALSE; else
1488     {
1489 ph10 409 USPTR lastptr = eptr - 1;
1490 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1491 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1492 nigel 77 GETCHAR(c, lastptr);
1493     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1494     }
1495 ph10 443 if (eptr >= md->end_subject)
1496 nigel 77 {
1497 ph10 443 SCHECK_PARTIAL();
1498     cur_is_word = FALSE;
1499 ph10 428 }
1500     else
1501     {
1502 nigel 77 GETCHAR(c, eptr);
1503     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1504     }
1505     }
1506     else
1507     #endif
1508    
1509 ph10 428 /* Not in UTF-8 mode */
1510 nigel 77
1511     {
1512 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
1513     {
1514 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1515 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1516     }
1517 ph10 443 if (eptr >= md->end_subject)
1518 ph10 428 {
1519 ph10 443 SCHECK_PARTIAL();
1520     cur_is_word = FALSE;
1521 ph10 428 }
1522     else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1523 nigel 77 }
1524    
1525     /* Now see if the situation is what we want */
1526    
1527     if ((*ecode++ == OP_WORD_BOUNDARY)?
1528     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1529     RRETURN(MATCH_NOMATCH);
1530     }
1531     break;
1532    
1533     /* Match a single character type; inline for speed */
1534    
1535     case OP_ANY:
1536 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1537 ph10 345 /* Fall through */
1538    
1539 ph10 341 case OP_ALLANY:
1540 ph10 443 if (eptr++ >= md->end_subject)
1541 ph10 428 {
1542 ph10 443 SCHECK_PARTIAL();
1543 ph10 428 RRETURN(MATCH_NOMATCH);
1544 ph10 443 }
1545 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1546 nigel 77 ecode++;
1547     break;
1548    
1549     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1550     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1551    
1552     case OP_ANYBYTE:
1553 ph10 443 if (eptr++ >= md->end_subject)
1554 ph10 428 {
1555 ph10 443 SCHECK_PARTIAL();
1556 ph10 428 RRETURN(MATCH_NOMATCH);
1557 ph10 443 }
1558 nigel 77 ecode++;
1559     break;
1560    
1561     case OP_NOT_DIGIT:
1562 ph10 443 if (eptr >= md->end_subject)
1563 ph10 428 {
1564 ph10 443 SCHECK_PARTIAL();
1565 ph10 428 RRETURN(MATCH_NOMATCH);
1566 ph10 443 }
1567 nigel 77 GETCHARINCTEST(c, eptr);
1568     if (
1569     #ifdef SUPPORT_UTF8
1570     c < 256 &&
1571     #endif
1572     (md->ctypes[c] & ctype_digit) != 0
1573     )
1574     RRETURN(MATCH_NOMATCH);
1575     ecode++;
1576     break;
1577    
1578     case OP_DIGIT:
1579 ph10 443 if (eptr >= md->end_subject)
1580 ph10 428 {
1581 ph10 443 SCHECK_PARTIAL();
1582 ph10 428 RRETURN(MATCH_NOMATCH);
1583 ph10 443 }
1584 nigel 77 GETCHARINCTEST(c, eptr);
1585     if (
1586     #ifdef SUPPORT_UTF8
1587     c >= 256 ||
1588     #endif
1589     (md->ctypes[c] & ctype_digit) == 0
1590     )
1591     RRETURN(MATCH_NOMATCH);
1592     ecode++;
1593     break;
1594    
1595     case OP_NOT_WHITESPACE:
1596 ph10 443 if (eptr >= md->end_subject)
1597 ph10 428 {
1598 ph10 443 SCHECK_PARTIAL();
1599 ph10 428 RRETURN(MATCH_NOMATCH);
1600 ph10 443 }
1601 nigel 77 GETCHARINCTEST(c, eptr);
1602     if (
1603     #ifdef SUPPORT_UTF8
1604     c < 256 &&
1605     #endif
1606     (md->ctypes[c] & ctype_space) != 0
1607     )
1608     RRETURN(MATCH_NOMATCH);
1609     ecode++;
1610     break;
1611    
1612     case OP_WHITESPACE:
1613 ph10 443 if (eptr >= md->end_subject)
1614 ph10 428 {
1615 ph10 443 SCHECK_PARTIAL();
1616 ph10 428 RRETURN(MATCH_NOMATCH);
1617 ph10 443 }
1618 nigel 77 GETCHARINCTEST(c, eptr);
1619     if (
1620     #ifdef SUPPORT_UTF8
1621     c >= 256 ||
1622     #endif
1623     (md->ctypes[c] & ctype_space) == 0
1624     )
1625     RRETURN(MATCH_NOMATCH);
1626     ecode++;
1627     break;
1628    
1629     case OP_NOT_WORDCHAR:
1630 ph10 443 if (eptr >= md->end_subject)
1631 ph10 428 {
1632 ph10 443 SCHECK_PARTIAL();
1633 ph10 428 RRETURN(MATCH_NOMATCH);
1634 ph10 443 }
1635 nigel 77 GETCHARINCTEST(c, eptr);
1636     if (
1637     #ifdef SUPPORT_UTF8
1638     c < 256 &&
1639     #endif
1640     (md->ctypes[c] & ctype_word) != 0
1641     )
1642     RRETURN(MATCH_NOMATCH);
1643     ecode++;
1644     break;
1645    
1646     case OP_WORDCHAR:
1647 ph10 443 if (eptr >= md->end_subject)
1648 ph10 428 {
1649 ph10 443 SCHECK_PARTIAL();
1650 ph10 428 RRETURN(MATCH_NOMATCH);
1651 ph10 443 }
1652 nigel 77 GETCHARINCTEST(c, eptr);
1653     if (
1654     #ifdef SUPPORT_UTF8
1655     c >= 256 ||
1656     #endif
1657     (md->ctypes[c] & ctype_word) == 0
1658     )
1659     RRETURN(MATCH_NOMATCH);
1660     ecode++;
1661     break;
1662    
1663 nigel 93 case OP_ANYNL:
1664 ph10 443 if (eptr >= md->end_subject)
1665 ph10 428 {
1666 ph10 443 SCHECK_PARTIAL();
1667 ph10 428 RRETURN(MATCH_NOMATCH);
1668 ph10 443 }
1669 nigel 93 GETCHARINCTEST(c, eptr);
1670     switch(c)
1671     {
1672     default: RRETURN(MATCH_NOMATCH);
1673     case 0x000d:
1674     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1675     break;
1676 ph10 231
1677 nigel 93 case 0x000a:
1678 ph10 231 break;
1679    
1680 nigel 93 case 0x000b:
1681     case 0x000c:
1682     case 0x0085:
1683     case 0x2028:
1684     case 0x2029:
1685 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1686 nigel 93 break;
1687     }
1688     ecode++;
1689     break;
1690    
1691 ph10 178 case OP_NOT_HSPACE:
1692 ph10 443 if (eptr >= md->end_subject)
1693 ph10 428 {
1694 ph10 443 SCHECK_PARTIAL();
1695 ph10 428 RRETURN(MATCH_NOMATCH);
1696 ph10 443 }
1697 ph10 178 GETCHARINCTEST(c, eptr);
1698     switch(c)
1699     {
1700     default: break;
1701     case 0x09: /* HT */
1702     case 0x20: /* SPACE */
1703     case 0xa0: /* NBSP */
1704     case 0x1680: /* OGHAM SPACE MARK */
1705     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1706     case 0x2000: /* EN QUAD */
1707     case 0x2001: /* EM QUAD */
1708     case 0x2002: /* EN SPACE */
1709     case 0x2003: /* EM SPACE */
1710     case 0x2004: /* THREE-PER-EM SPACE */
1711     case 0x2005: /* FOUR-PER-EM SPACE */
1712     case 0x2006: /* SIX-PER-EM SPACE */
1713     case 0x2007: /* FIGURE SPACE */
1714     case 0x2008: /* PUNCTUATION SPACE */
1715     case 0x2009: /* THIN SPACE */
1716     case 0x200A: /* HAIR SPACE */
1717     case 0x202f: /* NARROW NO-BREAK SPACE */
1718     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1719     case 0x3000: /* IDEOGRAPHIC SPACE */
1720     RRETURN(MATCH_NOMATCH);
1721     }
1722     ecode++;
1723     break;
1724    
1725     case OP_HSPACE:
1726 ph10 443 if (eptr >= md->end_subject)
1727 ph10 428 {
1728 ph10 443 SCHECK_PARTIAL();
1729 ph10 428 RRETURN(MATCH_NOMATCH);
1730 ph10 443 }
1731 ph10 178 GETCHARINCTEST(c, eptr);
1732     switch(c)
1733     {
1734     default: RRETURN(MATCH_NOMATCH);
1735     case 0x09: /* HT */
1736     case 0x20: /* SPACE */
1737     case 0xa0: /* NBSP */
1738     case 0x1680: /* OGHAM SPACE MARK */
1739     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1740     case 0x2000: /* EN QUAD */
1741     case 0x2001: /* EM QUAD */
1742     case 0x2002: /* EN SPACE */
1743     case 0x2003: /* EM SPACE */
1744     case 0x2004: /* THREE-PER-EM SPACE */
1745     case 0x2005: /* FOUR-PER-EM SPACE */
1746     case 0x2006: /* SIX-PER-EM SPACE */
1747     case 0x2007: /* FIGURE SPACE */
1748     case 0x2008: /* PUNCTUATION SPACE */
1749     case 0x2009: /* THIN SPACE */
1750     case 0x200A: /* HAIR SPACE */
1751     case 0x202f: /* NARROW NO-BREAK SPACE */
1752     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1753     case 0x3000: /* IDEOGRAPHIC SPACE */
1754     break;
1755     }
1756     ecode++;
1757     break;
1758    
1759     case OP_NOT_VSPACE:
1760 ph10 443 if (eptr >= md->end_subject)
1761 ph10 428 {
1762 ph10 443 SCHECK_PARTIAL();
1763 ph10 428 RRETURN(MATCH_NOMATCH);
1764 ph10 443 }
1765 ph10 178 GETCHARINCTEST(c, eptr);
1766     switch(c)
1767     {
1768     default: break;
1769     case 0x0a: /* LF */
1770     case 0x0b: /* VT */
1771     case 0x0c: /* FF */
1772     case 0x0d: /* CR */
1773     case 0x85: /* NEL */
1774     case 0x2028: /* LINE SEPARATOR */
1775     case 0x2029: /* PARAGRAPH SEPARATOR */
1776     RRETURN(MATCH_NOMATCH);
1777     }
1778     ecode++;
1779     break;
1780    
1781     case OP_VSPACE:
1782 ph10 443 if (eptr >= md->end_subject)
1783 ph10 428 {
1784 ph10 443 SCHECK_PARTIAL();
1785 ph10 428 RRETURN(MATCH_NOMATCH);
1786 ph10 443 }
1787 ph10 178 GETCHARINCTEST(c, eptr);
1788     switch(c)
1789     {
1790     default: RRETURN(MATCH_NOMATCH);
1791     case 0x0a: /* LF */
1792     case 0x0b: /* VT */
1793     case 0x0c: /* FF */
1794     case 0x0d: /* CR */
1795     case 0x85: /* NEL */
1796     case 0x2028: /* LINE SEPARATOR */
1797     case 0x2029: /* PARAGRAPH SEPARATOR */
1798     break;
1799     }
1800     ecode++;
1801     break;
1802    
1803 nigel 77 #ifdef SUPPORT_UCP
1804     /* Check the next character by Unicode property. We will get here only
1805     if the support is in the binary; otherwise a compile-time error occurs. */
1806    
1807     case OP_PROP:
1808     case OP_NOTPROP:
1809 ph10 443 if (eptr >= md->end_subject)
1810 ph10 428 {
1811 ph10 443 SCHECK_PARTIAL();
1812 ph10 428 RRETURN(MATCH_NOMATCH);
1813 ph10 443 }
1814 nigel 77 GETCHARINCTEST(c, eptr);
1815     {
1816 ph10 384 const ucd_record *prop = GET_UCD(c);
1817 nigel 77
1818 nigel 87 switch(ecode[1])
1819     {
1820     case PT_ANY:
1821     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1822     break;
1823 nigel 77
1824 nigel 87 case PT_LAMP:
1825 ph10 349 if ((prop->chartype == ucp_Lu ||
1826     prop->chartype == ucp_Ll ||
1827     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1828 nigel 77 RRETURN(MATCH_NOMATCH);
1829 nigel 87 break;
1830    
1831     case PT_GC:
1832 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1833 nigel 77 RRETURN(MATCH_NOMATCH);
1834 nigel 87 break;
1835    
1836     case PT_PC:
1837 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1838 nigel 87 RRETURN(MATCH_NOMATCH);
1839     break;
1840    
1841     case PT_SC:
1842 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
1843 nigel 87 RRETURN(MATCH_NOMATCH);
1844     break;
1845    
1846     default:
1847     RRETURN(PCRE_ERROR_INTERNAL);
1848 nigel 77 }
1849 nigel 87
1850     ecode += 3;
1851 nigel 77 }
1852     break;
1853    
1854     /* Match an extended Unicode sequence. We will get here only if the support
1855     is in the binary; otherwise a compile-time error occurs. */
1856    
1857     case OP_EXTUNI:
1858 ph10 443 if (eptr >= md->end_subject)
1859 ph10 428 {
1860 ph10 443 SCHECK_PARTIAL();
1861 ph10 428 RRETURN(MATCH_NOMATCH);
1862 ph10 443 }
1863 nigel 77 GETCHARINCTEST(c, eptr);
1864     {
1865 ph10 349 int category = UCD_CATEGORY(c);
1866 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1867     while (eptr < md->end_subject)
1868     {
1869     int len = 1;
1870     if (!utf8) c = *eptr; else
1871     {
1872     GETCHARLEN(c, eptr, len);
1873     }
1874 ph10 349 category = UCD_CATEGORY(c);
1875 nigel 77 if (category != ucp_M) break;
1876     eptr += len;
1877     }
1878     }
1879     ecode++;
1880     break;
1881     #endif
1882    
1883    
1884     /* Match a back reference, possibly repeatedly. Look past the end of the
1885     item to see if there is repeat information following. The code is similar
1886     to that for character classes, but repeated for efficiency. Then obey
1887     similar code to character type repeats - written out again for speed.
1888     However, if the referenced string is the empty string, always treat
1889     it as matched, any number of times (otherwise there could be infinite
1890     loops). */
1891    
1892     case OP_REF:
1893     {
1894     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1895 ph10 345 ecode += 3;
1896    
1897 ph10 336 /* If the reference is unset, there are two possibilities:
1898 ph10 345
1899 ph10 336 (a) In the default, Perl-compatible state, set the length to be longer
1900     than the amount of subject left; this ensures that every attempt at a
1901     match fails. We can't just fail here, because of the possibility of
1902     quantifiers with zero minima.
1903 ph10 345
1904     (b) If the JavaScript compatibility flag is set, set the length to zero
1905     so that the back reference matches an empty string.
1906    
1907     Otherwise, set the length to the length of what was matched by the
1908 ph10 336 referenced subpattern. */
1909 ph10 345
1910 ph10 336 if (offset >= offset_top || md->offset_vector[offset] < 0)
1911 ph10 345 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1912 ph10 336 else
1913     length = md->offset_vector[offset+1] - md->offset_vector[offset];
1914 nigel 77
1915     /* Set up for repetition, or handle the non-repeated case */
1916    
1917     switch (*ecode)
1918     {
1919     case OP_CRSTAR:
1920     case OP_CRMINSTAR:
1921     case OP_CRPLUS:
1922     case OP_CRMINPLUS:
1923     case OP_CRQUERY:
1924     case OP_CRMINQUERY:
1925     c = *ecode++ - OP_CRSTAR;
1926     minimize = (c & 1) != 0;
1927     min = rep_min[c]; /* Pick up values from tables; */
1928     max = rep_max[c]; /* zero for max => infinity */
1929     if (max == 0) max = INT_MAX;
1930     break;
1931    
1932     case OP_CRRANGE:
1933     case OP_CRMINRANGE:
1934     minimize = (*ecode == OP_CRMINRANGE);
1935     min = GET2(ecode, 1);
1936     max = GET2(ecode, 3);
1937     if (max == 0) max = INT_MAX;
1938     ecode += 5;
1939     break;
1940    
1941     default: /* No repeat follows */
1942 ph10 443 if (!match_ref(offset, eptr, length, md, ims))
1943 ph10 428 {
1944 ph10 443 CHECK_PARTIAL();
1945 ph10 428 RRETURN(MATCH_NOMATCH);
1946 ph10 443 }
1947 nigel 77 eptr += length;
1948     continue; /* With the main loop */
1949     }
1950    
1951     /* If the length of the reference is zero, just continue with the
1952     main loop. */
1953 ph10 443
1954 nigel 77 if (length == 0) continue;
1955    
1956     /* First, ensure the minimum number of matches are present. We get back
1957     the length of the reference string explicitly rather than passing the
1958     address of eptr, so that eptr can be a register variable. */
1959    
1960     for (i = 1; i <= min; i++)
1961     {
1962 ph10 427 if (!match_ref(offset, eptr, length, md, ims))
1963 ph10 426 {
1964 ph10 427 CHECK_PARTIAL();
1965 ph10 426 RRETURN(MATCH_NOMATCH);
1966 ph10 427 }
1967 nigel 77 eptr += length;
1968     }
1969    
1970     /* If min = max, continue at the same level without recursion.
1971     They are not both allowed to be zero. */
1972    
1973     if (min == max) continue;
1974    
1975     /* If minimizing, keep trying and advancing the pointer */
1976    
1977     if (minimize)
1978     {
1979     for (fi = min;; fi++)
1980     {
1981 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1982 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1983 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
1984     if (!match_ref(offset, eptr, length, md, ims))
1985 ph10 426 {
1986 ph10 427 CHECK_PARTIAL();
1987 nigel 77 RRETURN(MATCH_NOMATCH);
1988 ph10 427 }
1989 nigel 77 eptr += length;
1990     }
1991     /* Control never gets here */
1992     }
1993    
1994     /* If maximizing, find the longest string and work backwards */
1995    
1996     else
1997     {
1998     pp = eptr;
1999     for (i = min; i < max; i++)
2000     {
2001     if (!match_ref(offset, eptr, length, md, ims)) break;
2002     eptr += length;
2003     }
2004     while (eptr >= pp)
2005     {
2006 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2007 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2008     eptr -= length;
2009     }
2010     RRETURN(MATCH_NOMATCH);
2011     }
2012     }
2013     /* Control never gets here */
2014    
2015     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2016     used when all the characters in the class have values in the range 0-255,
2017     and either the matching is caseful, or the characters are in the range
2018     0-127 when UTF-8 processing is enabled. The only difference between
2019     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2020     encountered.
2021    
2022     First, look past the end of the item to see if there is repeat information
2023     following. Then obey similar code to character type repeats - written out
2024     again for speed. */
2025    
2026     case OP_NCLASS:
2027     case OP_CLASS:
2028     {
2029     data = ecode + 1; /* Save for matching */
2030     ecode += 33; /* Advance past the item */
2031    
2032     switch (*ecode)
2033     {
2034     case OP_CRSTAR:
2035     case OP_CRMINSTAR:
2036     case OP_CRPLUS:
2037     case OP_CRMINPLUS:
2038     case OP_CRQUERY:
2039     case OP_CRMINQUERY:
2040     c = *ecode++ - OP_CRSTAR;
2041     minimize = (c & 1) != 0;
2042     min = rep_min[c]; /* Pick up values from tables; */
2043     max = rep_max[c]; /* zero for max => infinity */
2044     if (max == 0) max = INT_MAX;
2045     break;
2046    
2047     case OP_CRRANGE:
2048     case OP_CRMINRANGE:
2049     minimize = (*ecode == OP_CRMINRANGE);
2050     min = GET2(ecode, 1);
2051     max = GET2(ecode, 3);
2052     if (max == 0) max = INT_MAX;
2053     ecode += 5;
2054     break;
2055    
2056     default: /* No repeat follows */
2057     min = max = 1;
2058     break;
2059     }
2060    
2061     /* First, ensure the minimum number of matches are present. */
2062    
2063     #ifdef SUPPORT_UTF8
2064     /* UTF-8 mode */
2065     if (utf8)
2066     {
2067     for (i = 1; i <= min; i++)
2068     {
2069 ph10 427 if (eptr >= md->end_subject)
2070 ph10 426 {
2071 ph10 428 SCHECK_PARTIAL();
2072 ph10 426 RRETURN(MATCH_NOMATCH);
2073 ph10 427 }
2074 nigel 77 GETCHARINC(c, eptr);
2075     if (c > 255)
2076     {
2077     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2078     }
2079     else
2080     {
2081     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2082     }
2083     }
2084     }
2085     else
2086     #endif
2087     /* Not UTF-8 mode */
2088     {
2089     for (i = 1; i <= min; i++)
2090     {
2091 ph10 427 if (eptr >= md->end_subject)
2092 ph10 426 {
2093 ph10 428 SCHECK_PARTIAL();
2094 ph10 426 RRETURN(MATCH_NOMATCH);
2095 ph10 427 }
2096 nigel 77 c = *eptr++;
2097     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2098     }
2099     }
2100    
2101     /* If max == min we can continue with the main loop without the
2102     need to recurse. */
2103    
2104     if (min == max) continue;
2105    
2106     /* If minimizing, keep testing the rest of the expression and advancing
2107     the pointer while it matches the class. */
2108    
2109     if (minimize)
2110     {
2111     #ifdef SUPPORT_UTF8
2112     /* UTF-8 mode */
2113     if (utf8)
2114     {
2115     for (fi = min;; fi++)
2116     {
2117 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2118 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2119 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2120 ph10 427 if (eptr >= md->end_subject)
2121 ph10 426 {
2122 ph10 427 SCHECK_PARTIAL();
2123 ph10 426 RRETURN(MATCH_NOMATCH);
2124 ph10 427 }
2125 nigel 77 GETCHARINC(c, eptr);
2126     if (c > 255)
2127     {
2128     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2129     }
2130     else
2131     {
2132     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2133     }
2134     }
2135     }
2136     else
2137     #endif
2138     /* Not UTF-8 mode */
2139     {
2140     for (fi = min;; fi++)
2141     {
2142 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2143 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2144 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2145 ph10 427 if (eptr >= md->end_subject)
2146 ph10 426 {
2147 ph10 427 SCHECK_PARTIAL();
2148 ph10 426 RRETURN(MATCH_NOMATCH);
2149 ph10 427 }
2150 nigel 77 c = *eptr++;
2151     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2152     }
2153     }
2154     /* Control never gets here */
2155     }
2156    
2157     /* If maximizing, find the longest possible run, then work backwards. */
2158    
2159     else
2160     {
2161     pp = eptr;
2162    
2163     #ifdef SUPPORT_UTF8
2164     /* UTF-8 mode */
2165     if (utf8)
2166     {
2167     for (i = min; i < max; i++)
2168     {
2169     int len = 1;
2170     if (eptr >= md->end_subject) break;
2171     GETCHARLEN(c, eptr, len);
2172     if (c > 255)
2173     {
2174     if (op == OP_CLASS) break;
2175     }
2176     else
2177     {
2178     if ((data[c/8] & (1 << (c&7))) == 0) break;
2179     }
2180     eptr += len;
2181     }
2182     for (;;)
2183     {
2184 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2185 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2186     if (eptr-- == pp) break; /* Stop if tried at original pos */
2187     BACKCHAR(eptr);
2188     }
2189     }
2190     else
2191     #endif
2192     /* Not UTF-8 mode */
2193     {
2194     for (i = min; i < max; i++)
2195     {
2196     if (eptr >= md->end_subject) break;
2197     c = *eptr;
2198     if ((data[c/8] & (1 << (c&7))) == 0) break;
2199     eptr++;
2200     }
2201     while (eptr >= pp)
2202     {
2203 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2204 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2205 nigel 77 eptr--;
2206     }
2207     }
2208    
2209     RRETURN(MATCH_NOMATCH);
2210     }
2211     }
2212     /* Control never gets here */
2213    
2214    
2215     /* Match an extended character class. This opcode is encountered only
2216 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2217     mode, because Unicode properties are supported in non-UTF-8 mode. */
2218 nigel 77
2219     #ifdef SUPPORT_UTF8
2220     case OP_XCLASS:
2221     {
2222     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2223     ecode += GET(ecode, 1); /* Advance past the item */
2224    
2225     switch (*ecode)
2226     {
2227     case OP_CRSTAR:
2228     case OP_CRMINSTAR:
2229     case OP_CRPLUS:
2230     case OP_CRMINPLUS:
2231     case OP_CRQUERY:
2232     case OP_CRMINQUERY:
2233     c = *ecode++ - OP_CRSTAR;
2234     minimize = (c & 1) != 0;
2235     min = rep_min[c]; /* Pick up values from tables; */
2236     max = rep_max[c]; /* zero for max => infinity */
2237     if (max == 0) max = INT_MAX;
2238     break;
2239    
2240     case OP_CRRANGE:
2241     case OP_CRMINRANGE:
2242     minimize = (*ecode == OP_CRMINRANGE);
2243     min = GET2(ecode, 1);
2244     max = GET2(ecode, 3);
2245     if (max == 0) max = INT_MAX;
2246     ecode += 5;
2247     break;
2248    
2249     default: /* No repeat follows */
2250     min = max = 1;
2251     break;
2252     }
2253    
2254     /* First, ensure the minimum number of matches are present. */
2255    
2256     for (i = 1; i <= min; i++)
2257     {
2258 ph10 427 if (eptr >= md->end_subject)
2259 ph10 426 {
2260     SCHECK_PARTIAL();
2261     RRETURN(MATCH_NOMATCH);
2262 ph10 427 }
2263 ph10 384 GETCHARINCTEST(c, eptr);
2264 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2265     }
2266    
2267     /* If max == min we can continue with the main loop without the
2268     need to recurse. */
2269    
2270     if (min == max) continue;
2271    
2272     /* If minimizing, keep testing the rest of the expression and advancing
2273     the pointer while it matches the class. */
2274    
2275     if (minimize)
2276     {
2277     for (fi = min;; fi++)
2278     {
2279 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2280 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2281 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2282 ph10 427 if (eptr >= md->end_subject)
2283 ph10 426 {
2284 ph10 427 SCHECK_PARTIAL();
2285 ph10 426 RRETURN(MATCH_NOMATCH);
2286 ph10 427 }
2287 ph10 384 GETCHARINCTEST(c, eptr);
2288 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2289     }
2290     /* Control never gets here */
2291     }
2292    
2293     /* If maximizing, find the longest possible run, then work backwards. */
2294    
2295     else
2296     {
2297     pp = eptr;
2298     for (i = min; i < max; i++)
2299     {
2300     int len = 1;
2301     if (eptr >= md->end_subject) break;
2302 ph10 384 GETCHARLENTEST(c, eptr, len);
2303 nigel 77 if (!_pcre_xclass(c, data)) break;
2304     eptr += len;
2305     }
2306     for(;;)
2307     {
2308 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2309 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2310     if (eptr-- == pp) break; /* Stop if tried at original pos */
2311 ph10 214 if (utf8) BACKCHAR(eptr);
2312 nigel 77 }
2313     RRETURN(MATCH_NOMATCH);
2314     }
2315    
2316     /* Control never gets here */
2317     }
2318     #endif /* End of XCLASS */
2319    
2320     /* Match a single character, casefully */
2321    
2322     case OP_CHAR:
2323     #ifdef SUPPORT_UTF8
2324     if (utf8)
2325     {
2326     length = 1;
2327     ecode++;
2328     GETCHARLEN(fc, ecode, length);
2329 ph10 443 if (length > md->end_subject - eptr)
2330 ph10 428 {
2331     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2332     RRETURN(MATCH_NOMATCH);
2333 ph10 443 }
2334 nigel 77 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2335     }
2336     else
2337     #endif
2338    
2339     /* Non-UTF-8 mode */
2340     {
2341 ph10 443 if (md->end_subject - eptr < 1)
2342 ph10 428 {
2343     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2344     RRETURN(MATCH_NOMATCH);
2345 ph10 443 }
2346 nigel 77 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2347     ecode += 2;
2348     }
2349     break;
2350    
2351     /* Match a single character, caselessly */
2352    
2353     case OP_CHARNC:
2354     #ifdef SUPPORT_UTF8
2355     if (utf8)
2356     {
2357     length = 1;
2358     ecode++;
2359     GETCHARLEN(fc, ecode, length);
2360    
2361 ph10 443 if (length > md->end_subject - eptr)
2362 ph10 428 {
2363     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2364     RRETURN(MATCH_NOMATCH);
2365 ph10 443 }
2366 nigel 77
2367     /* If the pattern character's value is < 128, we have only one byte, and
2368     can use the fast lookup table. */
2369    
2370     if (fc < 128)
2371     {
2372     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2373     }
2374    
2375     /* Otherwise we must pick up the subject character */
2376    
2377     else
2378     {
2379 nigel 93 unsigned int dc;
2380 nigel 77 GETCHARINC(dc, eptr);
2381     ecode += length;
2382    
2383     /* If we have Unicode property support, we can use it to test the other
2384 nigel 87 case of the character, if there is one. */
2385 nigel 77
2386     if (fc != dc)
2387     {
2388     #ifdef SUPPORT_UCP
2389 ph10 349 if (dc != UCD_OTHERCASE(fc))
2390 nigel 77 #endif
2391     RRETURN(MATCH_NOMATCH);
2392     }
2393     }
2394     }
2395     else
2396     #endif /* SUPPORT_UTF8 */
2397    
2398     /* Non-UTF-8 mode */
2399     {
2400 ph10 443 if (md->end_subject - eptr < 1)
2401 ph10 428 {
2402 ph10 443 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2403 ph10 428 RRETURN(MATCH_NOMATCH);
2404 ph10 443 }
2405 nigel 77 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2406     ecode += 2;
2407     }
2408     break;
2409    
2410 nigel 93 /* Match a single character repeatedly. */
2411 nigel 77
2412     case OP_EXACT:
2413     min = max = GET2(ecode, 1);
2414     ecode += 3;
2415     goto REPEATCHAR;
2416    
2417 nigel 93 case OP_POSUPTO:
2418     possessive = TRUE;
2419     /* Fall through */
2420    
2421 nigel 77 case OP_UPTO:
2422     case OP_MINUPTO:
2423     min = 0;
2424     max = GET2(ecode, 1);
2425     minimize = *ecode == OP_MINUPTO;
2426     ecode += 3;
2427     goto REPEATCHAR;
2428    
2429 nigel 93 case OP_POSSTAR:
2430     possessive = TRUE;
2431     min = 0;
2432     max = INT_MAX;
2433     ecode++;
2434     goto REPEATCHAR;
2435    
2436     case OP_POSPLUS:
2437     possessive = TRUE;
2438     min = 1;
2439     max = INT_MAX;
2440     ecode++;
2441     goto REPEATCHAR;
2442    
2443     case OP_POSQUERY:
2444     possessive = TRUE;
2445     min = 0;
2446     max = 1;
2447     ecode++;
2448     goto REPEATCHAR;
2449    
2450 nigel 77 case OP_STAR:
2451     case OP_MINSTAR:
2452     case OP_PLUS:
2453     case OP_MINPLUS:
2454     case OP_QUERY:
2455     case OP_MINQUERY:
2456     c = *ecode++ - OP_STAR;
2457     minimize = (c & 1) != 0;
2458 ph10 443
2459 nigel 77 min = rep_min[c]; /* Pick up values from tables; */
2460     max = rep_max[c]; /* zero for max => infinity */
2461     if (max == 0) max = INT_MAX;
2462    
2463 ph10 426 /* Common code for all repeated single-character matches. */
2464 nigel 77
2465     REPEATCHAR:
2466     #ifdef SUPPORT_UTF8
2467     if (utf8)
2468     {
2469     length = 1;
2470     charptr = ecode;
2471     GETCHARLEN(fc, ecode, length);
2472     ecode += length;
2473    
2474     /* Handle multibyte character matching specially here. There is
2475     support for caseless matching if UCP support is present. */
2476    
2477     if (length > 1)
2478     {
2479     #ifdef SUPPORT_UCP
2480 nigel 93 unsigned int othercase;
2481 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2482 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
2483 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2484 ph10 115 else oclength = 0;
2485 nigel 77 #endif /* SUPPORT_UCP */
2486    
2487     for (i = 1; i <= min; i++)
2488     {
2489 ph10 426 if (eptr <= md->end_subject - length &&
2490     memcmp(eptr, charptr, length) == 0) eptr += length;
2491 ph10 123 #ifdef SUPPORT_UCP
2492 ph10 426 else if (oclength > 0 &&
2493     eptr <= md->end_subject - oclength &&
2494     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2495     #endif /* SUPPORT_UCP */
2496 nigel 77 else
2497     {
2498 ph10 426 CHECK_PARTIAL();
2499     RRETURN(MATCH_NOMATCH);
2500 nigel 77 }
2501     }
2502    
2503     if (min == max) continue;
2504    
2505     if (minimize)
2506     {
2507     for (fi = min;; fi++)
2508     {
2509 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2510 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2511 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2512 ph10 426 if (eptr <= md->end_subject - length &&
2513     memcmp(eptr, charptr, length) == 0) eptr += length;
2514 ph10 123 #ifdef SUPPORT_UCP
2515 ph10 426 else if (oclength > 0 &&
2516     eptr <= md->end_subject - oclength &&
2517     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2518     #endif /* SUPPORT_UCP */
2519 nigel 77 else
2520     {
2521 ph10 426 CHECK_PARTIAL();
2522     RRETURN(MATCH_NOMATCH);
2523 nigel 77 }
2524     }
2525     /* Control never gets here */
2526     }
2527 nigel 93
2528     else /* Maximize */
2529 nigel 77 {
2530     pp = eptr;
2531     for (i = min; i < max; i++)
2532     {
2533 ph10 426 if (eptr <= md->end_subject - length &&
2534     memcmp(eptr, charptr, length) == 0) eptr += length;
2535 ph10 123 #ifdef SUPPORT_UCP
2536 ph10 426 else if (oclength > 0 &&
2537     eptr <= md->end_subject - oclength &&
2538     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2539     #endif /* SUPPORT_UCP */
2540 ph10 115 else break;
2541 nigel 77 }
2542 nigel 93
2543     if (possessive) continue;
2544 ph10 427
2545 ph10 120 for(;;)
2546 ph10 426 {
2547     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2548     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2549     if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
2550 ph10 115 #ifdef SUPPORT_UCP
2551 ph10 426 eptr--;
2552     BACKCHAR(eptr);
2553 ph10 123 #else /* without SUPPORT_UCP */
2554 ph10 426 eptr -= length;
2555 ph10 123 #endif /* SUPPORT_UCP */
2556 ph10 426 }
2557 nigel 77 }
2558     /* Control never gets here */
2559     }
2560    
2561     /* If the length of a UTF-8 character is 1, we fall through here, and
2562     obey the code as for non-UTF-8 characters below, though in this case the
2563     value of fc will always be < 128. */
2564     }
2565     else
2566     #endif /* SUPPORT_UTF8 */
2567    
2568     /* When not in UTF-8 mode, load a single-byte character. */
2569    
2570 ph10 426 fc = *ecode++;
2571 ph10 443
2572 nigel 77 /* The value of fc at this point is always less than 256, though we may or
2573     may not be in UTF-8 mode. The code is duplicated for the caseless and
2574     caseful cases, for speed, since matching characters is likely to be quite
2575     common. First, ensure the minimum number of matches are present. If min =
2576     max, continue at the same level without recursing. Otherwise, if
2577     minimizing, keep trying the rest of the expression and advancing one
2578     matching character if failing, up to the maximum. Alternatively, if
2579     maximizing, find the maximum number of characters and work backwards. */
2580    
2581     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2582     max, eptr));
2583    
2584     if ((ims & PCRE_CASELESS) != 0)
2585     {
2586     fc = md->lcc[fc];
2587     for (i = 1; i <= min; i++)
2588 ph10 426 {
2589     if (eptr >= md->end_subject)
2590     {
2591     SCHECK_PARTIAL();
2592     RRETURN(MATCH_NOMATCH);
2593     }
2594 nigel 77 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2595 ph10 426 }
2596 nigel 77 if (min == max) continue;
2597     if (minimize)
2598     {
2599     for (fi = min;; fi++)
2600     {
2601 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2602 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2603 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2604 ph10 426 if (eptr >= md->end_subject)
2605     {
2606 ph10 427 SCHECK_PARTIAL();
2607 ph10 426 RRETURN(MATCH_NOMATCH);
2608     }
2609     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2610 nigel 77 }
2611     /* Control never gets here */
2612     }
2613 nigel 93 else /* Maximize */
2614 nigel 77 {
2615     pp = eptr;
2616     for (i = min; i < max; i++)
2617     {
2618     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2619     eptr++;
2620     }
2621 ph10 427
2622 nigel 93 if (possessive) continue;
2623 ph10 427
2624 nigel 77 while (eptr >= pp)
2625     {
2626 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2627 nigel 77 eptr--;
2628     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2629     }
2630     RRETURN(MATCH_NOMATCH);
2631     }
2632     /* Control never gets here */
2633     }
2634    
2635     /* Caseful comparisons (includes all multi-byte characters) */
2636    
2637     else
2638     {
2639 ph10 427 for (i = 1; i <= min; i++)
2640 ph10 426 {
2641     if (eptr >= md->end_subject)
2642     {
2643     SCHECK_PARTIAL();
2644     RRETURN(MATCH_NOMATCH);
2645     }
2646     if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2647 ph10 427 }
2648 ph10 443
2649 nigel 77 if (min == max) continue;
2650 ph10 443
2651 nigel 77 if (minimize)
2652     {
2653     for (fi = min;; fi++)
2654     {
2655 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2656 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2657 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2658 ph10 426 if (eptr >= md->end_subject)
2659 ph10 427 {
2660 ph10 426 SCHECK_PARTIAL();
2661     RRETURN(MATCH_NOMATCH);
2662 ph10 427 }
2663 ph10 426 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2664 nigel 77 }
2665     /* Control never gets here */
2666     }
2667 nigel 93 else /* Maximize */
2668 nigel 77 {
2669     pp = eptr;
2670     for (i = min; i < max; i++)
2671     {
2672     if (eptr >= md->end_subject || fc != *eptr) break;
2673     eptr++;
2674     }
2675 nigel 93 if (possessive) continue;
2676 ph10 443
2677 nigel 77 while (eptr >= pp)
2678     {
2679 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2680 nigel 77 eptr--;
2681     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2682     }
2683     RRETURN(MATCH_NOMATCH);
2684     }
2685     }
2686     /* Control never gets here */
2687    
2688     /* Match a negated single one-byte character. The character we are
2689     checking can be multibyte. */
2690    
2691     case OP_NOT:
2692 ph10 443 if (eptr >= md->end_subject)
2693 ph10 428 {
2694 ph10 443 SCHECK_PARTIAL();
2695 ph10 428 RRETURN(MATCH_NOMATCH);
2696 ph10 443 }
2697 nigel 77 ecode++;
2698     GETCHARINCTEST(c, eptr);
2699     if ((ims & PCRE_CASELESS) != 0)
2700     {
2701     #ifdef SUPPORT_UTF8
2702     if (c < 256)
2703     #endif
2704     c = md->lcc[c];
2705     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2706     }
2707     else
2708     {
2709     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2710     }
2711     break;
2712    
2713     /* Match a negated single one-byte character repeatedly. This is almost a
2714     repeat of the code for a repeated single character, but I haven't found a
2715     nice way of commoning these up that doesn't require a test of the
2716     positive/negative option for each character match. Maybe that wouldn't add
2717     very much to the time taken, but character matching *is* what this is all
2718     about... */
2719    
2720     case OP_NOTEXACT:
2721     min = max = GET2(ecode, 1);
2722     ecode += 3;
2723     goto REPEATNOTCHAR;
2724    
2725     case OP_NOTUPTO:
2726     case OP_NOTMINUPTO:
2727     min = 0;
2728     max = GET2(ecode, 1);
2729     minimize = *ecode == OP_NOTMINUPTO;
2730     ecode += 3;
2731     goto REPEATNOTCHAR;
2732    
2733 nigel 93 case OP_NOTPOSSTAR:
2734     possessive = TRUE;
2735     min = 0;
2736     max = INT_MAX;
2737     ecode++;
2738     goto REPEATNOTCHAR;
2739    
2740     case OP_NOTPOSPLUS:
2741     possessive = TRUE;
2742     min = 1;
2743     max = INT_MAX;
2744     ecode++;
2745     goto REPEATNOTCHAR;
2746    
2747     case OP_NOTPOSQUERY:
2748     possessive = TRUE;
2749     min = 0;
2750     max = 1;
2751     ecode++;
2752     goto REPEATNOTCHAR;
2753    
2754     case OP_NOTPOSUPTO:
2755     possessive = TRUE;
2756     min = 0;
2757     max = GET2(ecode, 1);
2758     ecode += 3;
2759     goto REPEATNOTCHAR;
2760    
2761 nigel 77 case OP_NOTSTAR:
2762     case OP_NOTMINSTAR:
2763     case OP_NOTPLUS:
2764     case OP_NOTMINPLUS:
2765     case OP_NOTQUERY:
2766     case OP_NOTMINQUERY:
2767     c = *ecode++ - OP_NOTSTAR;
2768     minimize = (c & 1) != 0;
2769     min = rep_min[c]; /* Pick up values from tables; */
2770     max = rep_max[c]; /* zero for max => infinity */
2771     if (max == 0) max = INT_MAX;
2772    
2773 ph10 426 /* Common code for all repeated single-byte matches. */
2774 nigel 77
2775     REPEATNOTCHAR:
2776     fc = *ecode++;
2777    
2778     /* The code is duplicated for the caseless and caseful cases, for speed,
2779     since matching characters is likely to be quite common. First, ensure the
2780     minimum number of matches are present. If min = max, continue at the same
2781     level without recursing. Otherwise, if minimizing, keep trying the rest of
2782     the expression and advancing one matching character if failing, up to the
2783     maximum. Alternatively, if maximizing, find the maximum number of
2784     characters and work backwards. */
2785    
2786     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2787     max, eptr));
2788    
2789     if ((ims & PCRE_CASELESS) != 0)
2790     {
2791     fc = md->lcc[fc];
2792    
2793     #ifdef SUPPORT_UTF8
2794     /* UTF-8 mode */
2795     if (utf8)
2796     {
2797 nigel 93 register unsigned int d;
2798 nigel 77 for (i = 1; i <= min; i++)
2799     {
2800 ph10 426 if (eptr >= md->end_subject)
2801     {
2802     SCHECK_PARTIAL();
2803 ph10 427 RRETURN(MATCH_NOMATCH);
2804     }
2805 nigel 77 GETCHARINC(d, eptr);
2806     if (d < 256) d = md->lcc[d];
2807     if (fc == d) RRETURN(MATCH_NOMATCH);
2808     }
2809     }
2810     else
2811     #endif
2812    
2813     /* Not UTF-8 mode */
2814     {
2815     for (i = 1; i <= min; i++)
2816 ph10 426 {
2817     if (eptr >= md->end_subject)
2818     {
2819     SCHECK_PARTIAL();
2820 ph10 427 RRETURN(MATCH_NOMATCH);
2821     }
2822 nigel 77 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2823 ph10 427 }
2824 nigel 77 }
2825    
2826     if (min == max) continue;
2827    
2828     if (minimize)
2829     {
2830     #ifdef SUPPORT_UTF8
2831     /* UTF-8 mode */
2832     if (utf8)
2833     {
2834 nigel 93 register unsigned int d;
2835 nigel 77 for (fi = min;; fi++)
2836     {
2837 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2838 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2839 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2840 ph10 427 if (eptr >= md->end_subject)
2841 ph10 426 {
2842 ph10 427 SCHECK_PARTIAL();
2843 ph10 426 RRETURN(MATCH_NOMATCH);
2844 ph10 427 }
2845 nigel 77 GETCHARINC(d, eptr);
2846     if (d < 256) d = md->lcc[d];
2847 ph10 366 if (fc == d) RRETURN(MATCH_NOMATCH);
2848 nigel 77 }
2849     }
2850     else
2851     #endif
2852     /* Not UTF-8 mode */
2853     {
2854     for (fi = min;; fi++)
2855     {
2856 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2857 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2858 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2859 ph10 426 if (eptr >= md->end_subject)
2860     {
2861     SCHECK_PARTIAL();
2862     RRETURN(MATCH_NOMATCH);
2863     }
2864     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2865 nigel 77 }
2866     }
2867     /* Control never gets here */
2868     }
2869    
2870     /* Maximize case */
2871    
2872     else
2873     {
2874     pp = eptr;
2875    
2876     #ifdef SUPPORT_UTF8
2877     /* UTF-8 mode */
2878     if (utf8)
2879     {
2880 nigel 93 register unsigned int d;
2881 nigel 77 for (i = min; i < max; i++)
2882     {
2883     int len = 1;
2884     if (eptr >= md->end_subject) break;
2885     GETCHARLEN(d, eptr, len);
2886     if (d < 256) d = md->lcc[d];
2887     if (fc == d) break;
2888     eptr += len;
2889     }
2890 nigel 93 if (possessive) continue;
2891     for(;;)
2892 nigel 77 {
2893 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2894 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2895     if (eptr-- == pp) break; /* Stop if tried at original pos */
2896     BACKCHAR(eptr);
2897     }
2898     }
2899     else
2900     #endif
2901     /* Not UTF-8 mode */
2902     {
2903     for (i = min; i < max; i++)
2904     {
2905     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2906     eptr++;
2907     }
2908 nigel 93 if (possessive) continue;
2909 nigel 77 while (eptr >= pp)
2910     {
2911 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2912 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2913     eptr--;
2914     }
2915     }
2916    
2917     RRETURN(MATCH_NOMATCH);
2918     }
2919     /* Control never gets here */
2920     }
2921    
2922     /* Caseful comparisons */
2923    
2924     else
2925     {
2926     #ifdef SUPPORT_UTF8
2927     /* UTF-8 mode */
2928     if (utf8)
2929     {
2930 nigel 93 register unsigned int d;
2931 nigel 77 for (i = 1; i <= min; i++)
2932     {
2933 ph10 426 if (eptr >= md->end_subject)
2934     {
2935     SCHECK_PARTIAL();
2936 ph10 427 RRETURN(MATCH_NOMATCH);
2937     }
2938 nigel 77 GETCHARINC(d, eptr);
2939     if (fc == d) RRETURN(MATCH_NOMATCH);
2940     }
2941     }
2942     else
2943     #endif
2944     /* Not UTF-8 mode */
2945     {
2946     for (i = 1; i <= min; i++)
2947 ph10 426 {
2948     if (eptr >= md->end_subject)
2949     {
2950     SCHECK_PARTIAL();
2951 ph10 427 RRETURN(MATCH_NOMATCH);
2952     }
2953 nigel 77 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2954 ph10 427 }
2955 nigel 77 }
2956    
2957     if (min == max) continue;
2958    
2959     if (minimize)
2960     {
2961     #ifdef SUPPORT_UTF8
2962     /* UTF-8 mode */
2963     if (utf8)
2964     {
2965 nigel 93 register unsigned int d;
2966 nigel 77 for (fi = min;; fi++)
2967     {
2968 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2969 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2970 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2971 ph10 427 if (eptr >= md->end_subject)
2972 ph10 426 {
2973 ph10 427 SCHECK_PARTIAL();
2974 ph10 426 RRETURN(MATCH_NOMATCH);
2975 ph10 427 }
2976 nigel 77 GETCHARINC(d, eptr);
2977 ph10 371 if (fc == d) RRETURN(MATCH_NOMATCH);
2978 nigel 77 }
2979     }
2980     else
2981     #endif
2982     /* Not UTF-8 mode */
2983     {
2984     for (fi = min;; fi++)
2985     {
2986 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2987 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2988 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2989 ph10 426 if (eptr >= md->end_subject)
2990     {
2991     SCHECK_PARTIAL();
2992     RRETURN(MATCH_NOMATCH);
2993 ph10 427 }
2994 ph10 426 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2995 nigel 77 }
2996     }
2997     /* Control never gets here */
2998     }
2999    
3000     /* Maximize case */
3001    
3002     else
3003     {
3004     pp = eptr;
3005    
3006     #ifdef SUPPORT_UTF8
3007     /* UTF-8 mode */
3008     if (utf8)
3009     {
3010 nigel 93 register unsigned int d;
3011 nigel 77 for (i = min; i < max; i++)
3012     {
3013     int len = 1;
3014     if (eptr >= md->end_subject) break;
3015     GETCHARLEN(d, eptr, len);
3016     if (fc == d) break;
3017     eptr += len;
3018     }
3019 nigel 93 if (possessive) continue;
3020 nigel 77 for(;;)
3021     {
3022 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3023 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3024     if (eptr-- == pp) break; /* Stop if tried at original pos */
3025     BACKCHAR(eptr);
3026     }
3027     }
3028     else
3029     #endif
3030     /* Not UTF-8 mode */
3031     {
3032     for (i = min; i < max; i++)
3033     {
3034     if (eptr >= md->end_subject || fc == *eptr) break;
3035     eptr++;
3036     }
3037 nigel 93 if (possessive) continue;
3038 nigel 77 while (eptr >= pp)
3039     {
3040 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3041 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3042     eptr--;
3043     }
3044     }
3045    
3046     RRETURN(MATCH_NOMATCH);
3047     }
3048     }
3049     /* Control never gets here */
3050    
3051     /* Match a single character type repeatedly; several different opcodes
3052     share code. This is very similar to the code for single characters, but we
3053     repeat it in the interests of efficiency. */
3054    
3055     case OP_TYPEEXACT:
3056     min = max = GET2(ecode, 1);
3057     minimize = TRUE;
3058     ecode += 3;
3059     goto REPEATTYPE;
3060    
3061     case OP_TYPEUPTO:
3062     case OP_TYPEMINUPTO:
3063     min = 0;
3064     max = GET2(ecode, 1);
3065     minimize = *ecode == OP_TYPEMINUPTO;
3066     ecode += 3;
3067     goto REPEATTYPE;
3068    
3069 nigel 93 case OP_TYPEPOSSTAR:
3070     possessive = TRUE;
3071     min = 0;
3072     max = INT_MAX;
3073     ecode++;
3074     goto REPEATTYPE;
3075    
3076     case OP_TYPEPOSPLUS:
3077     possessive = TRUE;
3078     min = 1;
3079     max = INT_MAX;
3080     ecode++;
3081     goto REPEATTYPE;
3082    
3083     case OP_TYPEPOSQUERY:
3084     possessive = TRUE;
3085     min = 0;
3086     max = 1;
3087     ecode++;
3088     goto REPEATTYPE;
3089    
3090     case OP_TYPEPOSUPTO:
3091     possessive = TRUE;
3092     min = 0;
3093     max = GET2(ecode, 1);
3094     ecode += 3;
3095     goto REPEATTYPE;
3096    
3097 nigel 77 case OP_TYPESTAR:
3098     case OP_TYPEMINSTAR:
3099     case OP_TYPEPLUS:
3100     case OP_TYPEMINPLUS:
3101     case OP_TYPEQUERY:
3102     case OP_TYPEMINQUERY:
3103     c = *ecode++ - OP_TYPESTAR;
3104     minimize = (c & 1) != 0;
3105     min = rep_min[c]; /* Pick up values from tables; */
3106     max = rep_max[c]; /* zero for max => infinity */
3107     if (max == 0) max = INT_MAX;
3108    
3109     /* Common code for all repeated single character type matches. Note that
3110     in UTF-8 mode, '.' matches a character of any length, but for the other
3111     character types, the valid characters are all one-byte long. */
3112    
3113     REPEATTYPE:
3114     ctype = *ecode++; /* Code for the character type */
3115    
3116     #ifdef SUPPORT_UCP
3117     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3118     {
3119     prop_fail_result = ctype == OP_NOTPROP;
3120     prop_type = *ecode++;
3121 nigel 87 prop_value = *ecode++;
3122 nigel 77 }
3123     else prop_type = -1;
3124     #endif
3125    
3126     /* First, ensure the minimum number of matches are present. Use inline
3127     code for maximizing the speed, and do the type test once at the start
3128 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3129 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3130     and single-bytes. */
3131    
3132     if (min > 0)
3133     {
3134     #ifdef SUPPORT_UCP
3135 nigel 87 if (prop_type >= 0)
3136 nigel 77 {
3137 nigel 87 switch(prop_type)
3138 nigel 77 {
3139 nigel 87 case PT_ANY:
3140     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3141     for (i = 1; i <= min; i++)
3142     {
3143 ph10 427 if (eptr >= md->end_subject)
3144 ph10 426 {
3145 ph10 427 SCHECK_PARTIAL();
3146 ph10 426 RRETURN(MATCH_NOMATCH);
3147 ph10 427 }
3148 ph10 184 GETCHARINCTEST(c, eptr);
3149 nigel 87 }
3150     break;
3151    
3152     case PT_LAMP:
3153     for (i = 1; i <= min; i++)
3154     {
3155 ph10 427 if (eptr >= md->end_subject)
3156 ph10 426 {
3157 ph10 427 SCHECK_PARTIAL();
3158 ph10 426 RRETURN(MATCH_NOMATCH);
3159 ph10 427 }
3160 ph10 184 GETCHARINCTEST(c, eptr);
3161 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3162 nigel 87 if ((prop_chartype == ucp_Lu ||
3163     prop_chartype == ucp_Ll ||
3164     prop_chartype == ucp_Lt) == prop_fail_result)
3165     RRETURN(MATCH_NOMATCH);
3166     }
3167     break;
3168    
3169     case PT_GC:
3170     for (i = 1; i <= min; i++)
3171     {
3172 ph10 427 if (eptr >= md->end_subject)
3173 ph10 426 {
3174 ph10 427 SCHECK_PARTIAL();
3175 ph10 426 RRETURN(MATCH_NOMATCH);
3176 ph10 427 }
3177 ph10 184 GETCHARINCTEST(c, eptr);
3178 ph10 349 prop_category = UCD_CATEGORY(c);
3179 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3180     RRETURN(MATCH_NOMATCH);
3181     }
3182     break;
3183    
3184     case PT_PC:
3185     for (i = 1; i <= min; i++)
3186     {
3187 ph10 427 if (eptr >= md->end_subject)
3188 ph10 426 {
3189 ph10 427 SCHECK_PARTIAL();
3190 ph10 426 RRETURN(MATCH_NOMATCH);
3191 ph10 427 }
3192 ph10 184 GETCHARINCTEST(c, eptr);
3193 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3194 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3195     RRETURN(MATCH_NOMATCH);
3196     }
3197     break;
3198    
3199     case PT_SC:
3200     for (i = 1; i <= min; i++)
3201     {
3202 ph10 427 if (eptr >= md->end_subject)
3203 ph10 426 {
3204 ph10 427 SCHECK_PARTIAL();
3205 ph10 426 RRETURN(MATCH_NOMATCH);
3206 ph10 427 }
3207 ph10 184 GETCHARINCTEST(c, eptr);
3208 ph10 349 prop_script = UCD_SCRIPT(c);
3209 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3210     RRETURN(MATCH_NOMATCH);
3211     }
3212     break;
3213    
3214     default:
3215     RRETURN(PCRE_ERROR_INTERNAL);
3216 nigel 77 }
3217     }
3218    
3219     /* Match extended Unicode sequences. We will get here only if the
3220     support is in the binary; otherwise a compile-time error occurs. */
3221    
3222     else if (ctype == OP_EXTUNI)
3223     {
3224     for (i = 1; i <= min; i++)
3225     {
3226 ph10 427 if (eptr >= md->end_subject)
3227 ph10 426 {
3228 ph10 427 SCHECK_PARTIAL();
3229 ph10 426 RRETURN(MATCH_NOMATCH);
3230 ph10 427 }
3231 nigel 77 GETCHARINCTEST(c, eptr);
3232 ph10 349 prop_category = UCD_CATEGORY(c);
3233 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3234     while (eptr < md->end_subject)
3235     {
3236     int len = 1;
3237 ph10 426 if (!utf8) c = *eptr;
3238     else { GETCHARLEN(c, eptr, len); }
3239 ph10 349 prop_category = UCD_CATEGORY(c);
3240 nigel 77 if (prop_category != ucp_M) break;
3241     eptr += len;
3242     }
3243     }
3244     }
3245    
3246     else
3247     #endif /* SUPPORT_UCP */
3248    
3249     /* Handle all other cases when the coding is UTF-8 */
3250    
3251     #ifdef SUPPORT_UTF8
3252     if (utf8) switch(ctype)
3253     {
3254     case OP_ANY:
3255     for (i = 1; i <= min; i++)
3256     {
3257 ph10 426 if (eptr >= md->end_subject)
3258     {
3259 ph10 427 SCHECK_PARTIAL();
3260 nigel 77 RRETURN(MATCH_NOMATCH);
3261 ph10 427 }
3262 ph10 426 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3263 nigel 91 eptr++;
3264 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3265     }
3266     break;
3267    
3268 ph10 341 case OP_ALLANY:
3269     for (i = 1; i <= min; i++)
3270     {
3271 ph10 427 if (eptr >= md->end_subject)
3272 ph10 426 {
3273     SCHECK_PARTIAL();
3274     RRETURN(MATCH_NOMATCH);
3275 ph10 427 }
3276 ph10 341 eptr++;
3277     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3278     }
3279     break;
3280    
3281 nigel 77 case OP_ANYBYTE:
3282 ph10 427 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3283 nigel 77 eptr += min;
3284     break;
3285    
3286 nigel 93 case OP_ANYNL:
3287     for (i = 1; i <= min; i++)
3288     {
3289 ph10 427 if (eptr >= md->end_subject)
3290 ph10 426 {
3291     SCHECK_PARTIAL();
3292     RRETURN(MATCH_NOMATCH);
3293 ph10 427 }
3294 nigel 93 GETCHARINC(c, eptr);
3295     switch(c)
3296     {
3297     default: RRETURN(MATCH_NOMATCH);
3298     case 0x000d:
3299     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3300     break;
3301 ph10 231
3302 nigel 93 case 0x000a:
3303 ph10 231 break;
3304    
3305 nigel 93 case 0x000b:
3306     case 0x000c:
3307     case 0x0085:
3308     case 0x2028:
3309     case 0x2029:
3310 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3311 nigel 93 break;
3312     }
3313     }
3314     break;
3315    
3316 ph10 178 case OP_NOT_HSPACE:
3317     for (i = 1; i <= min; i++)
3318     {
3319 ph10 427 if (eptr >= md->end_subject)
3320 ph10 426 {
3321     SCHECK_PARTIAL();
3322     RRETURN(MATCH_NOMATCH);
3323 ph10 427 }
3324 ph10 178 GETCHARINC(c, eptr);
3325     switch(c)
3326     {
3327     default: break;
3328     case 0x09: /* HT */
3329     case 0x20: /* SPACE */
3330     case 0xa0: /* NBSP */
3331     case 0x1680: /* OGHAM SPACE MARK */
3332     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3333     case 0x2000: /* EN QUAD */
3334     case 0x2001: /* EM QUAD */
3335     case 0x2002: /* EN SPACE */
3336     case 0x2003: /* EM SPACE */
3337     case 0x2004: /* THREE-PER-EM SPACE */
3338     case 0x2005: /* FOUR-PER-EM SPACE */
3339     case 0x2006: /* SIX-PER-EM SPACE */
3340     case 0x2007: /* FIGURE SPACE */
3341     case 0x2008: /* PUNCTUATION SPACE */
3342     case 0x2009: /* THIN SPACE */
3343     case 0x200A: /* HAIR SPACE */
3344     case 0x202f: /* NARROW NO-BREAK SPACE */
3345     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3346     case 0x3000: /* IDEOGRAPHIC SPACE */
3347     RRETURN(MATCH_NOMATCH);
3348     }
3349     }
3350     break;
3351 ph10 182
3352 ph10 178 case OP_HSPACE:
3353     for (i = 1; i <= min; i++)
3354     {
3355 ph10 427 if (eptr >= md->end_subject)
3356 ph10 426 {
3357 ph10 427 SCHECK_PARTIAL();
3358 ph10 426 RRETURN(MATCH_NOMATCH);
3359 ph10 427 }
3360 ph10 178 GETCHARINC(c, eptr);
3361     switch(c)
3362     {
3363     default: RRETURN(MATCH_NOMATCH);
3364     case 0x09: /* HT */
3365     case 0x20: /* SPACE */
3366     case 0xa0: /* NBSP */
3367     case 0x1680: /* OGHAM SPACE MARK */
3368     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3369     case 0x2000: /* EN QUAD */
3370     case 0x2001: /* EM QUAD */
3371     case 0x2002: /* EN SPACE */
3372     case 0x2003: /* EM SPACE */
3373     case 0x2004: /* THREE-PER-EM SPACE */
3374     case 0x2005: /* FOUR-PER-EM SPACE */
3375     case 0x2006: /* SIX-PER-EM SPACE */
3376     case 0x2007: /* FIGURE SPACE */
3377     case 0x2008: /* PUNCTUATION SPACE */
3378     case 0x2009: /* THIN SPACE */
3379     case 0x200A: /* HAIR SPACE */
3380     case 0x202f: /* NARROW NO-BREAK SPACE */
3381     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3382     case 0x3000: /* IDEOGRAPHIC SPACE */
3383     break;
3384     }
3385     }
3386     break;
3387 ph10 182
3388 ph10 178 case OP_NOT_VSPACE:
3389     for (i = 1; i <= min; i++)
3390     {
3391 ph10 427 if (eptr >= md->end_subject)
3392 ph10 426 {
3393 ph10 427 SCHECK_PARTIAL();
3394 ph10 426 RRETURN(MATCH_NOMATCH);
3395 ph10 427 }
3396 ph10 178 GETCHARINC(c, eptr);
3397     switch(c)
3398     {
3399     default: break;
3400     case 0x0a: /* LF */
3401     case 0x0b: /* VT */
3402     case 0x0c: /* FF */
3403     case 0x0d: /* CR */
3404     case 0x85: /* NEL */
3405     case 0x2028: /* LINE SEPARATOR */
3406     case 0x2029: /* PARAGRAPH SEPARATOR */
3407     RRETURN(MATCH_NOMATCH);
3408     }
3409     }
3410     break;
3411 ph10 182
3412 ph10 178 case OP_VSPACE:
3413     for (i = 1; i <= min; i++)
3414     {
3415 ph10 427 if (eptr >= md->end_subject)
3416 ph10 426 {
3417 ph10 427 SCHECK_PARTIAL();
3418 ph10 426 RRETURN(MATCH_NOMATCH);
3419 ph10 427 }
3420 ph10 178 GETCHARINC(c, eptr);
3421     switch(c)
3422     {
3423     default: RRETURN(MATCH_NOMATCH);
3424     case 0x0a: /* LF */
3425     case 0x0b: /* VT */
3426     case 0x0c: /* FF */
3427     case 0x0d: /* CR */
3428     case 0x85: /* NEL */
3429     case 0x2028: /* LINE SEPARATOR */
3430     case 0x2029: /* PARAGRAPH SEPARATOR */
3431 ph10 182 break;
3432 ph10 178 }
3433     }
3434     break;
3435    
3436 nigel 77 case OP_NOT_DIGIT:
3437     for (i = 1; i <= min; i++)
3438     {
3439 ph10 427 if (eptr >= md->end_subject)
3440 ph10 426 {
3441 ph10 427 SCHECK_PARTIAL();
3442 ph10 426 RRETURN(MATCH_NOMATCH);
3443 ph10 427 }
3444 nigel 77 GETCHARINC(c, eptr);
3445     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3446     RRETURN(MATCH_NOMATCH);
3447     }
3448     break;
3449    
3450     case OP_DIGIT:
3451     for (i = 1; i <= min; i++)
3452     {
3453 ph10 427 if (eptr >= md->end_subject)
3454 ph10 426 {
3455 ph10 427 SCHECK_PARTIAL();
3456 nigel 77 RRETURN(MATCH_NOMATCH);
3457 ph10 427 }
3458 ph10 426 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3459     RRETURN(MATCH_NOMATCH);
3460 nigel 77 /* No need to skip more bytes - we know it's a 1-byte character */
3461     }
3462     break;
3463    
3464     case OP_NOT_WHITESPACE:
3465     for (i = 1; i <= min; i++)
3466     {
3467 ph10 427 if (eptr >= md->end_subject)
3468 ph10 426 {
3469 ph10 427 SCHECK_PARTIAL();
3470 nigel 77 RRETURN(MATCH_NOMATCH);
3471 ph10 427 }
3472 ph10 426 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3473     RRETURN(MATCH_NOMATCH);
3474 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3475 nigel 77 }
3476     break;
3477    
3478     case OP_WHITESPACE:
3479     for (i = 1; i <= min; i++)
3480     {
3481 ph10 427 if (eptr >= md->end_subject)
3482 ph10 426 {
3483 ph10 427 SCHECK_PARTIAL();
3484 nigel 77 RRETURN(MATCH_NOMATCH);
3485 ph10 427 }
3486 ph10 426 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3487     RRETURN(MATCH_NOMATCH);
3488 nigel 77 /* No need to skip more bytes - we know it's a 1-byte character */
3489     }
3490     break;
3491    
3492     case OP_NOT_WORDCHAR:
3493     for (i = 1; i <= min; i++)
3494     {
3495     if (eptr >= md->end_subject ||
3496 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3497 nigel 77 RRETURN(MATCH_NOMATCH);
3498 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3499 nigel 77 }
3500     break;
3501    
3502     case OP_WORDCHAR:
3503     for (i = 1; i <= min; i++)
3504     {
3505 ph10 427 if (eptr >= md->end_subject)
3506 ph10 426 {
3507 ph10 427 SCHECK_PARTIAL();
3508 nigel 77 RRETURN(MATCH_NOMATCH);
3509 ph10 427 }
3510 ph10 426 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3511     RRETURN(MATCH_NOMATCH);
3512 nigel 77 /* No need to skip more bytes - we know it's a 1-byte character */
3513     }
3514     break;
3515    
3516     default:
3517     RRETURN(PCRE_ERROR_INTERNAL);
3518     } /* End switch(ctype) */
3519    
3520     else
3521     #endif /* SUPPORT_UTF8 */
3522    
3523     /* Code for the non-UTF-8 case for minimum matching of operators other
3524 ph10 426 than OP_PROP and OP_NOTPROP. */
3525 nigel 77
3526     switch(ctype)
3527     {
3528     case OP_ANY:
3529 ph10 342 for (i = 1; i <= min; i++)
3530 nigel 77 {
3531 ph10 427 if (eptr >= md->end_subject)
3532 ph10 426 {
3533 ph10 427 SCHECK_PARTIAL();
3534 ph10 426 RRETURN(MATCH_NOMATCH);
3535 ph10 427 }
3536 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3537     eptr++;
3538 nigel 77 }
3539     break;
3540    
3541 ph10 341 case OP_ALLANY:
3542 ph10 443 if (eptr > md->end_subject - min)
3543 ph10 428 {
3544 ph10 443 SCHECK_PARTIAL();
3545 ph10 428 RRETURN(MATCH_NOMATCH);
3546 ph10 443 }
3547 ph10 341 eptr += min;
3548     break;
3549    
3550 nigel 77 case OP_ANYBYTE:
3551 ph10 443 if (eptr > md->end_subject - min)
3552 ph10 428 {
3553 ph10 443 SCHECK_PARTIAL();
3554 ph10 428 RRETURN(MATCH_NOMATCH);
3555 ph10 443 }
3556 nigel 77 eptr += min;
3557     break;
3558    
3559 nigel 93 case OP_ANYNL:
3560     for (i = 1; i <= min; i++)
3561     {
3562 ph10 427 if (eptr >= md->end_subject)
3563 ph10 426 {
3564 ph10 427 SCHECK_PARTIAL();
3565 ph10 426 RRETURN(MATCH_NOMATCH);
3566 ph10 427 }
3567 nigel 93 switch(*eptr++)
3568     {
3569     default: RRETURN(MATCH_NOMATCH);
3570     case 0x000d:
3571     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3572     break;
3573     case 0x000a:
3574 ph10 231 break;
3575    
3576 nigel 93 case 0x000b:
3577     case 0x000c:
3578     case 0x0085:
3579 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3580 nigel 93 break;
3581     }
3582     }
3583     break;
3584    
3585 ph10 178 case OP_NOT_HSPACE:
3586     for (i = 1; i <= min; i++)
3587     {
3588 ph10 427 if (eptr >= md->end_subject)
3589 ph10 426 {
3590 ph10 427 SCHECK_PARTIAL();
3591 ph10 426 RRETURN(MATCH_NOMATCH);
3592 ph10 427 }
3593 ph10 178 switch(*eptr++)
3594     {
3595     default: break;
3596     case 0x09: /* HT */
3597     case 0x20: /* SPACE */
3598     case 0xa0: /* NBSP */
3599     RRETURN(MATCH_NOMATCH);
3600     }
3601     }
3602     break;
3603    
3604     case OP_HSPACE:
3605     for (i = 1; i <= min; i++)
3606     {
3607 ph10 427 if (eptr >= md->end_subject)
3608 ph10 426 {
3609 ph10 427 SCHECK_PARTIAL();
3610 ph10 426 RRETURN(MATCH_NOMATCH);
3611 ph10 427 }
3612 ph10 178 switch(*eptr++)
3613     {
3614     default: RRETURN(MATCH_NOMATCH);
3615     case 0x09: /* HT */
3616     case 0x20: /* SPACE */
3617     case 0xa0: /* NBSP */
3618 ph10 182 break;
3619 ph10 178 }
3620     }
3621     break;
3622    
3623     case OP_NOT_VSPACE:
3624     for (i = 1; i <= min; i++)
3625     {
3626 ph10 427 if (eptr >= md->end_subject)
3627 ph10 426 {
3628 ph10 427 SCHECK_PARTIAL();
3629 ph10 426 RRETURN(MATCH_NOMATCH);
3630 ph10 427 }
3631 ph10 178 switch(*eptr++)
3632     {
3633     default: break;
3634     case 0x0a: /* LF */
3635     case 0x0b: /* VT */
3636     case 0x0c: /* FF */
3637     case 0x0d: /* CR */
3638     case 0x85: /* NEL */
3639     RRETURN(MATCH_NOMATCH);
3640     }
3641     }
3642     break;
3643    
3644     case OP_VSPACE:
3645     for (i = 1; i <= min; i++)
3646     {
3647 ph10 427 if (eptr >= md->end_subject)
3648 ph10 426 {
3649 ph10 427 SCHECK_PARTIAL();
3650 ph10 426 RRETURN(MATCH_NOMATCH);
3651 ph10 427 }
3652 ph10 178 switch(*eptr++)
3653     {
3654     default: RRETURN(MATCH_NOMATCH);
3655     case 0x0a: /* LF */
3656     case 0x0b: /* VT */
3657     case 0x0c: /* FF */
3658     case 0x0d: /* CR */
3659     case 0x85: /* NEL */
3660 ph10 182 break;
3661 ph10 178 }
3662     }
3663     break;
3664    
3665 nigel 77 case OP_NOT_DIGIT:
3666     for (i = 1; i <= min; i++)
3667 ph10 427 {
3668     if (eptr >= md->end_subject)
3669 ph10 426 {
3670 ph10 427 SCHECK_PARTIAL();
3671 ph10 426 RRETURN(MATCH_NOMATCH);
3672 ph10 427 }
3673 nigel 77 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3674 ph10 427 }
3675 nigel 77 break;
3676    
3677     case OP_DIGIT:
3678     for (i = 1; i <= min; i++)
3679 ph10 427 {
3680     if (eptr >= md->end_subject)
3681 ph10 426 {
3682 ph10 427 SCHECK_PARTIAL();
3683 ph10 426 RRETURN(MATCH_NOMATCH);
3684 ph10 427 }
3685 nigel 77 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3686 ph10 427 }
3687 nigel 77 break;
3688    
3689     case OP_NOT_WHITESPACE:
3690     for (i = 1; i <= min; i++)
3691 ph10 427 {
3692     if (eptr >= md->end_subject)
3693 ph10 426 {
3694 ph10 427 SCHECK_PARTIAL();
3695 ph10 426 RRETURN(MATCH_NOMATCH);
3696 ph10 427 }
3697 nigel 77 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3698 ph10 427 }
3699 nigel 77 break;
3700    
3701     case OP_WHITESPACE:
3702     for (i = 1; i <= min; i++)
3703 ph10 427 {
3704     if (eptr >= md->end_subject)
3705 ph10 426 {
3706 ph10 427 SCHECK_PARTIAL();
3707 ph10 426 RRETURN(MATCH_NOMATCH);
3708 ph10 427 }
3709 nigel 77 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3710 ph10 427 }
3711 nigel 77 break;
3712    
3713     case OP_NOT_WORDCHAR:
3714     for (i = 1; i <= min; i++)
3715 ph10 427 {
3716     if (eptr >= md->end_subject)
3717 ph10 426 {
3718 ph10 427 SCHECK_PARTIAL();
3719 ph10 426 RRETURN(MATCH_NOMATCH);
3720 ph10 427 }
3721 nigel 77 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3722     RRETURN(MATCH_NOMATCH);
3723 ph10 427 }
3724 nigel 77 break;
3725    
3726     case OP_WORDCHAR:
3727     for (i = 1; i <= min; i++)
3728 ph10 427 {
3729     if (eptr >= md->end_subject)
3730 ph10 426 {
3731 ph10 427 SCHECK_PARTIAL();
3732 ph10 426 RRETURN(MATCH_NOMATCH);
3733 ph10 427 }
3734 nigel 77 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3735     RRETURN(MATCH_NOMATCH);
3736 ph10 427 }
3737 nigel 77 break;
3738    
3739     default:
3740     RRETURN(PCRE_ERROR_INTERNAL);
3741     }
3742     }
3743    
3744     /* If min = max, continue at the same level without recursing */
3745    
3746     if (min == max) continue;
3747    
3748     /* If minimizing, we have to test the rest of the pattern before each
3749     subsequent match. Again, separate the UTF-8 case for speed, and also
3750     separate the UCP cases. */
3751    
3752     if (minimize)
3753     {
3754     #ifdef SUPPORT_UCP
3755 nigel 87 if (prop_type >= 0)
3756 nigel 77 {
3757 nigel 87 switch(prop_type)
3758 nigel 77 {
3759 nigel 87 case PT_ANY:
3760     for (fi = min;; fi++)
3761     {
3762 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3763 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3764 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3765 ph10 427 if (eptr >= md->end_subject)
3766 ph10 426 {
3767 ph10 427 SCHECK_PARTIAL();
3768 ph10 426 RRETURN(MATCH_NOMATCH);
3769 ph10 427 }
3770 nigel 87 GETCHARINC(c, eptr);
3771     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3772     }
3773 nigel 93 /* Control never gets here */
3774 nigel 87
3775     case PT_LAMP:
3776     for (fi = min;; fi++)
3777     {
3778 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3779 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3780 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3781 ph10 427 if (eptr >= md->end_subject)
3782 ph10 426 {
3783 ph10 427 SCHECK_PARTIAL();
3784 ph10 426 RRETURN(MATCH_NOMATCH);
3785 ph10 427 }
3786 nigel 87 GETCHARINC(c, eptr);
3787 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3788 nigel 87 if ((prop_chartype == ucp_Lu ||
3789     prop_chartype == ucp_Ll ||
3790     prop_chartype == ucp_Lt) == prop_fail_result)
3791     RRETURN(MATCH_NOMATCH);
3792     }
3793 nigel 93 /* Control never gets here */
3794 nigel 87
3795     case PT_GC:
3796     for (fi = min;; fi++)
3797     {
3798 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3799 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3800 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3801 ph10 427 if (eptr >= md->end_subject)
3802 ph10 426 {
3803 ph10 427 SCHECK_PARTIAL();
3804 ph10 426 RRETURN(MATCH_NOMATCH);
3805 ph10 427 }
3806 nigel 87 GETCHARINC(c, eptr);
3807 ph10 349 prop_category = UCD_CATEGORY(c);
3808 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3809     RRETURN(MATCH_NOMATCH);
3810     }
3811 nigel 93 /* Control never gets here */
3812 nigel 87
3813     case PT_PC:
3814     for (fi = min;; fi++)
3815     {
3816 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3817 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3818 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3819 ph10 427 if (eptr >= md->end_subject)
3820 ph10 426 {
3821 ph10 427 SCHECK_PARTIAL();
3822 ph10 426 RRETURN(MATCH_NOMATCH);
3823 ph10 427 }
3824 nigel 87 GETCHARINC(c, eptr);
3825 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3826 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3827     RRETURN(MATCH_NOMATCH);
3828     }
3829 nigel 93 /* Control never gets here */
3830 nigel 87
3831     case PT_SC:
3832     for (fi = min;; fi++)
3833     {
3834 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3835 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3836 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3837 ph10 427 if (eptr >= md->end_subject)
3838 ph10 426 {
3839 ph10 427 SCHECK_PARTIAL();
3840 ph10 426 RRETURN(MATCH_NOMATCH);
3841 ph10 427 }
3842 nigel 87 GETCHARINC(c, eptr);
3843 ph10 349 prop_script = UCD_SCRIPT(c);
3844 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3845     RRETURN(MATCH_NOMATCH);
3846     }
3847 nigel 93 /* Control never gets here */
3848 nigel 87
3849     default:
3850     RRETURN(PCRE_ERROR_INTERNAL);
3851 nigel 77 }
3852     }
3853    
3854     /* Match extended Unicode sequences. We will get here only if the
3855     support is in the binary; otherwise a compile-time error occurs. */
3856    
3857     else if (ctype == OP_EXTUNI)
3858     {
3859     for (fi = min;; fi++)
3860     {
3861 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3862 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3863 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3864 ph10 427 if (eptr >= md->end_subject)
3865 ph10 426 {
3866 ph10 427 SCHECK_PARTIAL();
3867 ph10 426 RRETURN(MATCH_NOMATCH);
3868 ph10 427 }
3869 nigel 77 GETCHARINCTEST(c, eptr);
3870 ph10 349 prop_category = UCD_CATEGORY(c);
3871 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3872     while (eptr < md->end_subject)
3873     {
3874     int len = 1;
3875 ph10 426 if (!utf8) c = *eptr;
3876     else { GETCHARLEN(c, eptr, len); }
3877 ph10 349 prop_category = UCD_CATEGORY(c);
3878 nigel 77 if (prop_category != ucp_M) break;
3879     eptr += len;
3880     }
3881     }
3882     }
3883    
3884     else
3885     #endif /* SUPPORT_UCP */
3886    
3887     #ifdef SUPPORT_UTF8
3888     /* UTF-8 mode */
3889     if (utf8)
3890     {
3891     for (fi = min;; fi++)
3892     {
3893 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3894 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3895 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3896 ph10 427 if (eptr >= md->end_subject)
3897 ph10 426 {
3898 ph10 427 SCHECK_PARTIAL();
3899 ph10 426 RRETURN(MATCH_NOMATCH);
3900 ph10 427 }
3901 ph10 426 if (ctype == OP_ANY && IS_NEWLINE(eptr))
3902     RRETURN(MATCH_NOMATCH);
3903 nigel 77 GETCHARINC(c, eptr);
3904     switch(ctype)
3905     {
3906 ph10 342 case OP_ANY: /* This is the non-NL case */
3907 ph10 345 case OP_ALLANY:
3908 nigel 77 case OP_ANYBYTE:
3909     break;
3910    
3911 nigel 93 case OP_ANYNL:
3912     switch(c)
3913     {
3914     default: RRETURN(MATCH_NOMATCH);
3915     case 0x000d:
3916     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3917     break;
3918     case 0x000a:
3919 ph10 231 break;
3920    
3921 nigel 93 case 0x000b:
3922     case 0x000c:
3923     case 0x0085:
3924     case 0x2028:
3925     case 0x2029:
3926 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3927 nigel 93 break;
3928     }
3929     break;
3930    
3931 ph10 178 case OP_NOT_HSPACE:
3932     switch(c)
3933     {
3934     default: break;
3935     case 0x09: /* HT */
3936     case 0x20: /* SPACE */
3937     case 0xa0: /* NBSP */
3938     case 0x1680: /* OGHAM SPACE MARK */
3939     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3940     case 0x2000: /* EN QUAD */
3941     case 0x2001: /* EM QUAD */
3942     case 0x2002: /* EN SPACE */
3943     case 0x2003: /* EM SPACE */
3944     case 0x2004: /* THREE-PER-EM SPACE */
3945     case 0x2005: /* FOUR-PER-EM SPACE */
3946     case 0x2006: /* SIX-PER-EM SPACE */
3947     case 0x2007: /* FIGURE SPACE */
3948     case 0x2008: /* PUNCTUATION SPACE */
3949     case 0x2009: /* THIN SPACE */
3950     case 0x200A: /* HAIR SPACE */
3951     case 0x202f: /* NARROW NO-BREAK SPACE */
3952     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3953     case 0x3000: /* IDEOGRAPHIC SPACE */
3954     RRETURN(MATCH_NOMATCH);
3955     }
3956     break;
3957    
3958     case OP_HSPACE:
3959     switch(c)
3960     {
3961     default: RRETURN(MATCH_NOMATCH);
3962     case 0x09: /* HT */
3963     case 0x20: /* SPACE */
3964     case 0xa0: /* NBSP */
3965     case 0x1680: /* OGHAM SPACE MARK */
3966     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3967     case 0x2000: /* EN QUAD */
3968     case 0x2001: /* EM QUAD */
3969     case 0x2002: /* EN SPACE */
3970     case 0x2003: /* EM SPACE */
3971     case 0x2004: /* THREE-PER-EM SPACE */
3972     case 0x2005: /* FOUR-PER-EM SPACE */
3973     case 0x2006: /* SIX-PER-EM SPACE */
3974     case 0x2007: /* FIGURE SPACE */
3975     case 0x2008: /* PUNCTUATION SPACE */
3976     case 0x2009: /* THIN SPACE */
3977     case 0x200A: /* HAIR SPACE */
3978     case 0x202f: /* NARROW NO-BREAK SPACE */
3979     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3980     case 0x3000: /* IDEOGRAPHIC SPACE */
3981     break;
3982     }
3983     break;
3984    
3985     case OP_NOT_VSPACE:
3986     switch(c)
3987     {
3988     default: break;
3989     case 0x0a: /* LF */
3990     case 0x0b: /* VT */
3991     case 0x0c: /* FF */
3992     case 0x0d: /* CR */
3993     case 0x85: /* NEL */
3994     case 0x2028: /* LINE SEPARATOR */
3995     case 0x2029: /* PARAGRAPH SEPARATOR */
3996     RRETURN(MATCH_NOMATCH);
3997     }
3998     break;
3999    
4000     case OP_VSPACE:
4001     switch(c)
4002     {
4003     default: RRETURN(MATCH_NOMATCH);
4004     case 0x0a: /* LF */
4005     case 0x0b: /* VT */
4006     case 0x0c: /* FF */
4007     case 0x0d: /* CR */