/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 482 - (hide annotations) (download)
Mon Jan 4 15:55:46 2010 UTC (4 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 171035 byte(s)
Fix partial match bug (code omitted) for \W.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 473 Copyright (c) 1997-2010 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 nigel 77 /* Flag bits for the match() function */
61    
62 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
63     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64 nigel 77
65     /* Non-error returns from the match() function. Error returns are externally
66     defined PCRE_ERROR_xxx codes, which are all negative. */
67    
68     #define MATCH_MATCH 1 /* The pattern matched */
69     #define MATCH_NOMATCH 0 /* The pattern failed to match */
70    
71 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
72 ph10 210 negative to avoid the external error codes. */
73    
74     #define MATCH_COMMIT (-999) /* Returned by the OP_COMMIT case when what follows it yields MATCH_NOMATCH */
75     #define MATCH_PRUNE (-998) /* Returned by the OP_PRUNE case when what follows it yields MATCH_NOMATCH */
76     #define MATCH_SKIP (-997) /* Ditto for OP_SKIP; md->start_match_ptr is set to eptr first */
77     #define MATCH_THEN (-996) /* Ditto for OP_THEN; bracket alternation treats it like MATCH_NOMATCH */
78    
79 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
80     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81     because the offset vector is always a multiple of 3 long. */
82    
83     #define REC_STACK_SAVE_MAX 30
84    
85     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86    
/* NOTE(review): the six entries presumably correspond, in order, to the
quantifiers *, *?, +, +?, ? and ?? - the code that indexes these tables is not
in view here, so confirm against the users of rep_min/rep_max. */
87     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
88     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89    
90    
91    
92 ph10 475 #ifdef PCRE_DEBUG
93 nigel 77 /*************************************************
94     * Debugging function to print chars *
95     *************************************************/
96    
97     /* Print a sequence of chars in printable format, stopping at the end of the
98     subject if requested.
99    
100     Arguments:
101     p points to characters
102     length number to print
103     is_subject TRUE if printing from within md->start_subject
104     md pointer to matching data block, if is_subject is TRUE
105    
106     Returns: nothing
107     */
108    
109     static void
110     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111     {
112 nigel 93 unsigned int c;
113 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114     while (length-- > 0)
115     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116     }
117     #endif
118    
119    
120    
121     /*************************************************
122     * Match a back-reference *
123     *************************************************/
124    
125     /* If a back reference hasn't been set, the length that is passed is greater
126     than the number of characters left in the string, so the match fails.
127    
128     Arguments:
129     offset index into the offset vector
130     eptr points into the subject
131     length length to be matched
132     md points to match data block
133     ims the ims flags
134    
135     Returns: TRUE if matched
136     */
137    
138     static BOOL
139 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 nigel 77 unsigned long int ims)
141     {
142 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
143 nigel 77
144 ph10 475 #ifdef PCRE_DEBUG
145 nigel 77 if (eptr >= md->end_subject)
146     printf("matching subject <null>");
147     else
148     {
149     printf("matching subject ");
150     pchars(eptr, length, TRUE, md);
151     }
152     printf(" against backref ");
153     pchars(p, length, FALSE, md);
154     printf("\n");
155     #endif
156    
157     /* Always fail if not enough characters left */
158    
159     if (length > md->end_subject - eptr) return FALSE;
160    
161 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162     properly if Unicode properties are supported. Otherwise, we can check only
163     ASCII characters. */
164 nigel 77
165     if ((ims & PCRE_CASELESS) != 0)
166     {
167 ph10 354 #ifdef SUPPORT_UTF8
168     #ifdef SUPPORT_UCP
169     if (md->utf8)
170     {
171 ph10 358 USPTR endptr = eptr + length;
172 ph10 354 while (eptr < endptr)
173     {
174 ph10 358 int c, d;
175 ph10 354 GETCHARINC(c, eptr);
176     GETCHARINC(d, p);
177     if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 ph10 358 }
179     }
180 ph10 354 else
181     #endif
182     #endif
183    
184     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185     is no UCP support. */
186 ph10 358
187 nigel 77 while (length-- > 0)
188 ph10 354 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 nigel 77 }
190 ph10 358
191 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
192     are in UTF-8 mode. */
193 ph10 358
194 nigel 77 else
195     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196    
197     return TRUE;
198     }
199    
200    
201    
202     /***************************************************************************
203     ****************************************************************************
204     RECURSION IN THE match() FUNCTION
205    
206 nigel 87 The match() function is highly recursive, though not every recursive call
207     increases the recursive depth. Nevertheless, some regular expressions can cause
208     it to recurse to a great depth. I was writing for Unix, so I just let it call
209     itself recursively. This uses the stack for saving everything that has to be
210     saved for a recursive call. On Unix, the stack can be large, and this works
211     fine.
212 nigel 77
213 nigel 87 It turns out that on some non-Unix-like systems there are problems with
214     programs that use a lot of stack. (This despite the fact that every last chip
215     has oodles of memory these days, and techniques for extending the stack have
216     been known for decades.) So....
217 nigel 77
218     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219     calls by keeping local variables that need to be preserved in blocks of memory
220 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
221 nigel 77 achieve this so that the actual code doesn't look very different to what it
222     always used to.
223 ph10 164
224 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
225 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
226     Switzer, the use of longjmp() has been abolished, at the cost of having to
227     provide a unique number for each call to RMATCH. There is no way of generating
228     a sequence of numbers at compile time in C. I have given them names, to make
229     them stand out more clearly.
230    
231     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
234     don't have indeterminate values; this has meant that the frame size can be
235 ph10 164 reduced because the result can be "passed back" by straight setting of the
236     variable instead of being passed in the frame.
237 nigel 77 ****************************************************************************
238     ***************************************************************************/
239    
240 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241     below must be updated in sync. */
242 nigel 77
/* RM1 is explicitly 1, so the identifiers take the consecutive values 1 to 54.
An identifier is passed as the last ("rw") argument of RMATCH(); the heap-based
RMATCH stores it in frame->Xwhere and also pastes it into a local label name
(L_RMn) that marks where execution resumes. */
243 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
244     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
245     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
246     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
247 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
248 ph10 212 RM51, RM52, RM53, RM54 };
249 ph10 164
250 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
251 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 ph10 164 actually used in this definition. */
253 nigel 77
254     #ifndef NO_RECURSE
/* With true recursion, variables declared with REGISTER really are given the
"register" storage class. */
255     #define REGISTER register
256 ph10 164
257 ph10 475 #ifdef PCRE_DEBUG
/* Debugging RMATCH: a real recursive call to match() whose result is left in
rrc. "mstart" and "rdepth" are not macro arguments; they are picked up from the
scope in which the macro is expanded, and the depth is passed on as rdepth+1.
The call is bracketed by trace output giving the source line. */
258 ph10 164 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
259 nigel 87 { \
260     printf("match() called in line %d\n", __LINE__); \
261 ph10 168 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
262 nigel 87 printf("to line %d\n", __LINE__); \
263     }
/* Debugging RRETURN: traces the value and source line, then returns. The
trace text deliberately has no newline; the "to line" message printed by the
calling RMATCH completes the output line. */
264     #define RRETURN(ra) \
265     { \
266     printf("match() returned %d from line %d ", ra, __LINE__); \
267     return ra; \
268     }
269     #else
/* Production versions: a plain recursive call that sets rrc, and a plain
return. Note that RMATCH expands to an expression statement without a
terminating semicolon, so the invocation has to supply one. */
270 ph10 164 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
271 ph10 168 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
272 nigel 77 #define RRETURN(ra) return ra
273 nigel 87 #endif
274    
275 nigel 77 #else
276    
277    
278 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
279     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280     argument of match(), which never changes. */
281 nigel 77
282     #define REGISTER
283    
/* Heap-based RMATCH: simulate a recursive call of match() without using the
system stack.

  ra  subject pointer for the new invocation      (becomes Xeptr)
  rb  code pointer for the new invocation         (becomes Xecode)
  rc  offset_top for the new invocation           (becomes Xoffset_top)
  rd  not used here
  re  ims option bits for the new invocation      (becomes Xims)
  rf  eptrb chain for the new invocation          (becomes Xeptrb)
  rg  flags for the new invocation                (becomes Xflags)
  rw  one of the RMn identifiers; it is remembered in the current frame's
      Xwhere field and is also pasted into the label L_<rw> that is planted
      immediately after the jump to HEAP_RECURSE

A new frame is obtained from pcre_stack_malloc(), filled in from the arguments
(mstart is copied from the current frame and the depth is the current depth
plus one), chained to the current frame, and made current before jumping to
HEAP_RECURSE.

If the frame cannot be allocated, the current invocation gives up by way of
RRETURN(PCRE_ERROR_NOMEMORY), so that the error is passed back through the
existing frames instead of a NULL pointer being dereferenced. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
303    
/* Heap-based RRETURN: simulate a return from match(). The current frame is
unchained and handed to pcre_stack_free(). If there is a previous frame, the
value is left in rrc and control jumps to HEAP_RETURN; otherwise this was the
top-level frame and the value is returned from the real function call.

NOTE(review): "ra" is evaluated only after "frame" has been switched back to
the previous frame and the old frame has been freed. That is harmless for
constants and for rrc, but an argument that names a frame-held variable would
be read from the previous frame - confirm that no call site does this. */
304     #define RRETURN(ra)\
305     {\
306     heapframe *newframe = frame;\
307     frame = newframe->Xprevframe;\
308     (pcre_stack_free)(newframe);\
309     if (frame != NULL)\
310     {\
311 ph10 164 rrc = ra;\
312     goto HEAP_RETURN;\
313 nigel 77 }\
314     return ra;\
315     }
316    
317    
318     /* Structure for remembering the local variables in a private frame */
319    
320     typedef struct heapframe {
321     struct heapframe *Xprevframe;
322    
323     /* Function arguments that may change */
324    
325 ph10 409 USPTR Xeptr;
326 nigel 77 const uschar *Xecode;
327 ph10 409 USPTR Xmstart;
328 nigel 77 int Xoffset_top;
329     long int Xims;
330     eptrblock *Xeptrb;
331     int Xflags;
332 nigel 91 unsigned int Xrdepth;
333 nigel 77
334     /* Function local variables */
335    
336 ph10 409 USPTR Xcallpat;
337 ph10 406 #ifdef SUPPORT_UTF8
338 ph10 409 USPTR Xcharptr;
339 ph10 406 #endif
340 ph10 409 USPTR Xdata;
341     USPTR Xnext;
342     USPTR Xpp;
343     USPTR Xprev;
344     USPTR Xsaved_eptr;
345 nigel 77
346     recursion_info Xnew_recursive;
347    
348     BOOL Xcur_is_word;
349     BOOL Xcondition;
350     BOOL Xprev_is_word;
351    
352     unsigned long int Xoriginal_ims;
353    
354     #ifdef SUPPORT_UCP
355     int Xprop_type;
356 nigel 87 int Xprop_value;
357 nigel 77 int Xprop_fail_result;
358     int Xprop_category;
359     int Xprop_chartype;
360 nigel 87 int Xprop_script;
361 ph10 123 int Xoclength;
362     uschar Xocchars[8];
363 nigel 77 #endif
364    
365 ph10 403 int Xcodelink;
366 nigel 77 int Xctype;
367 nigel 93 unsigned int Xfc;
368 nigel 77 int Xfi;
369     int Xlength;
370     int Xmax;
371     int Xmin;
372     int Xnumber;
373     int Xoffset;
374     int Xop;
375     int Xsave_capture_last;
376     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
377     int Xstacksave[REC_STACK_SAVE_MAX];
378    
379     eptrblock Xnewptrb;
380    
381 ph10 164 /* Where to jump back to */
382 nigel 77
383 ph10 164 int Xwhere;
384 ph10 165
385 nigel 77 } heapframe;
386    
387     #endif
388    
389    
390     /***************************************************************************
391     ***************************************************************************/
392    
393    
394    
395     /*************************************************
396     * Match from current position *
397     *************************************************/
398    
399 nigel 93 /* This function is called recursively in many circumstances. Whenever it
400 nigel 77 returns a negative (error) response, the outer incarnation must also return the
401 ph10 426 same response. */
402 nigel 77
403 ph10 426 /* These macros pack up tests that are used for partial matching, and which
404     appear several times in the code. We set the "hit end" flag if the pointer is
405     at the end of the subject and also past the start of the subject (i.e.
406 ph10 427 something has been matched). For hard partial matching, we then return
407     immediately. The second one is used when we already know we are past the end of
408     the subject. */
409 ph10 426
/* CHECK_PARTIAL: when partial matching is enabled (md->partial != 0) and eptr
is at or beyond md->end_subject and also beyond mstart, set md->hitend; if
md->partial is greater than 1, additionally leave match() with
PCRE_ERROR_PARTIAL. The names md, eptr and mstart are taken from the scope in
which the macro is expanded. The expansion is a bare "if" statement, so take
care when using it as the body of an if that has an else. */
410     #define CHECK_PARTIAL()\
411 ph10 435 if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
412 ph10 427 {\
413     md->hitend = TRUE;\
414     if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
415     }
416 ph10 426
/* SCHECK_PARTIAL: the same as CHECK_PARTIAL but without the comparison of
eptr against md->end_subject. */
417     #define SCHECK_PARTIAL()\
418 ph10 462 if (md->partial != 0 && eptr > mstart)\
419 ph10 427 {\
420     md->hitend = TRUE;\
421     if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
422     }
423 ph10 426
424 ph10 427
425 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
426     the md structure (e.g. utf8, end_subject) into individual variables to improve
427 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
428     made performance worse.
429    
430     Arguments:
431 nigel 93 eptr pointer to current character in subject
432     ecode pointer to current position in compiled code
433 ph10 168 mstart pointer to the current match start position (can be modified
434 ph10 172 by encountering \K)
435 nigel 77 offset_top current top pointer
436     md pointer to "static" info for the match
437     ims current /i, /m, and /s options
438     eptrb pointer to chain of blocks containing eptr at start of
439     brackets - for testing for empty matches
440     flags can contain
441     match_condassert - this is an assertion condition
442 nigel 93 match_cbegroup - this is the start of an unlimited repeat
443     group that can match an empty string
444 nigel 87 rdepth the recursion depth
445 nigel 77
446     Returns: MATCH_MATCH if matched ) these values are >= 0
447     MATCH_NOMATCH if failed to match )
448     a negative PCRE_ERROR_xxx value if aborted by an error condition
449 nigel 87 (e.g. stopped by repeated call or recursion limit)
450 nigel 77 */
451    
452     static int
453 ph10 409 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
454 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
455 nigel 91 int flags, unsigned int rdepth)
456 nigel 77 {
457     /* These variables do not need to be preserved over recursion in this function,
458 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
459     "register" because they are used a lot in loops. */
460 nigel 77
461 nigel 91 register int rrc; /* Returns from recursive calls */
462     register int i; /* Used for loops not involving calls to RMATCH() */
463 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
464 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
465 nigel 77
466 nigel 93 BOOL minimize, possessive; /* Quantifier options */
467 ph10 403 int condcode;
468 nigel 93
469 nigel 77 /* When recursion is not being used, all "local" variables that have to be
470     preserved over calls to RMATCH() are part of a "frame" which is obtained from
471     heap storage. Set up the top-level frame here; others are obtained from the
472     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
473    
474     #ifdef NO_RECURSE
475     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
476     frame->Xprevframe = NULL; /* Marks the top level */
477    
478     /* Copy in the original argument variables */
479    
480     frame->Xeptr = eptr;
481     frame->Xecode = ecode;
482 ph10 168 frame->Xmstart = mstart;
483 nigel 77 frame->Xoffset_top = offset_top;
484     frame->Xims = ims;
485     frame->Xeptrb = eptrb;
486     frame->Xflags = flags;
487 nigel 87 frame->Xrdepth = rdepth;
488 nigel 77
489     /* This is where control jumps back to to effect "recursion" */
490    
491     HEAP_RECURSE:
492    
493     /* Macros make the argument variables come from the current frame */
494    
495     #define eptr frame->Xeptr
496     #define ecode frame->Xecode
497 ph10 168 #define mstart frame->Xmstart
498 nigel 77 #define offset_top frame->Xoffset_top
499     #define ims frame->Xims
500     #define eptrb frame->Xeptrb
501     #define flags frame->Xflags
502 nigel 87 #define rdepth frame->Xrdepth
503 nigel 77
504     /* Ditto for the local variables */
505    
506     #ifdef SUPPORT_UTF8
507     #define charptr frame->Xcharptr
508     #endif
509     #define callpat frame->Xcallpat
510 ph10 403 #define codelink frame->Xcodelink
511 nigel 77 #define data frame->Xdata
512     #define next frame->Xnext
513     #define pp frame->Xpp
514     #define prev frame->Xprev
515     #define saved_eptr frame->Xsaved_eptr
516    
517     #define new_recursive frame->Xnew_recursive
518    
519     #define cur_is_word frame->Xcur_is_word
520     #define condition frame->Xcondition
521     #define prev_is_word frame->Xprev_is_word
522    
523     #define original_ims frame->Xoriginal_ims
524    
525     #ifdef SUPPORT_UCP
526     #define prop_type frame->Xprop_type
527 nigel 87 #define prop_value frame->Xprop_value
528 nigel 77 #define prop_fail_result frame->Xprop_fail_result
529     #define prop_category frame->Xprop_category
530     #define prop_chartype frame->Xprop_chartype
531 nigel 87 #define prop_script frame->Xprop_script
532 ph10 115 #define oclength frame->Xoclength
533     #define occhars frame->Xocchars
534 nigel 77 #endif
535    
536     #define ctype frame->Xctype
537     #define fc frame->Xfc
538     #define fi frame->Xfi
539     #define length frame->Xlength
540     #define max frame->Xmax
541     #define min frame->Xmin
542     #define number frame->Xnumber
543     #define offset frame->Xoffset
544     #define op frame->Xop
545     #define save_capture_last frame->Xsave_capture_last
546     #define save_offset1 frame->Xsave_offset1
547     #define save_offset2 frame->Xsave_offset2
548     #define save_offset3 frame->Xsave_offset3
549     #define stacksave frame->Xstacksave
550    
551     #define newptrb frame->Xnewptrb
552    
553     /* When recursion is being used, local variables are allocated on the stack and
554     get preserved during recursion in the normal way. In this environment, fi and
555     i, and fc and c, can be the same variables. */
556    
557 nigel 93 #else /* NO_RECURSE not defined */
558 nigel 77 #define fi i
559     #define fc c
560    
561    
562 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
563     const uschar *charptr; /* in small blocks of the code. My normal */
564     #endif /* style of coding would have declared */
565     const uschar *callpat; /* them within each of those blocks. */
566     const uschar *data; /* However, in order to accommodate the */
567     const uschar *next; /* version of this code that uses an */
568     USPTR pp; /* external "stack" implemented on the */
569     const uschar *prev; /* heap, it is easier to declare them all */
570     USPTR saved_eptr; /* here, so the declarations can be cut */
571     /* out in a block. The only declarations */
572     recursion_info new_recursive; /* within blocks below are for variables */
573     /* that do not have to be preserved over */
574     BOOL cur_is_word; /* a recursive call to RMATCH(). */
575     BOOL condition;
576 nigel 77 BOOL prev_is_word;
577    
578     unsigned long int original_ims;
579    
580     #ifdef SUPPORT_UCP
581     int prop_type;
582 nigel 87 int prop_value;
583 nigel 77 int prop_fail_result;
584     int prop_category;
585     int prop_chartype;
586 nigel 87 int prop_script;
587 ph10 115 int oclength;
588     uschar occhars[8];
589 nigel 77 #endif
590    
591 ph10 399 int codelink;
592 nigel 77 int ctype;
593     int length;
594     int max;
595     int min;
596     int number;
597     int offset;
598     int op;
599     int save_capture_last;
600     int save_offset1, save_offset2, save_offset3;
601     int stacksave[REC_STACK_SAVE_MAX];
602    
603     eptrblock newptrb;
604 nigel 93 #endif /* NO_RECURSE */
605 nigel 77
606     /* These statements are here to stop the compiler complaining about uninitialized
607     variables. */
608    
609     #ifdef SUPPORT_UCP
610 nigel 87 prop_value = 0;
611 nigel 77 prop_fail_result = 0;
612     #endif
613    
614 nigel 93
615 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
616     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
617     used. Thanks to Ian Taylor for noticing this possibility and sending the
618     original patch. */
619    
620     TAIL_RECURSE:
621    
622 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
623     are specified by the macro RMATCH and RRETURN is used to return. When
624     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
625 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
626 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
627     complicated macro. It has to be used in one particular way. This shouldn't,
628     however, impact performance when true recursion is being used. */
629 nigel 77
630 ph10 164 #ifdef SUPPORT_UTF8
631     utf8 = md->utf8; /* Local copy of the flag */
632     #else
633     utf8 = FALSE;
634     #endif
635    
636 nigel 87 /* First check that we haven't called match() too many times, or that we
637     haven't exceeded the recursive call limit. */
638    
639 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
640 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
641 nigel 77
642     original_ims = ims; /* Save for resetting on ')' */
643 nigel 91
644 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
645     string, the match_cbegroup flag is set. When this is the case, add the current
646     subject pointer to the chain of such remembered pointers, to be checked when we
647     hit the closing ket, in order to break infinite loops that match no characters.
648 ph10 197 When match() is called in other circumstances, don't add to the chain. The
649     match_cbegroup flag must NOT be used with tail recursion, because the memory
650     block that is used is on the stack, so a new one may be required for each
651     match(). */
652 nigel 77
653 nigel 93 if ((flags & match_cbegroup) != 0)
654 nigel 77 {
655 ph10 197 newptrb.epb_saved_eptr = eptr;
656     newptrb.epb_prev = eptrb;
657     eptrb = &newptrb;
658 nigel 77 }
659    
660 nigel 93 /* Now start processing the opcodes. */
661 nigel 77
662     for (;;)
663     {
664 nigel 93 minimize = possessive = FALSE;
665 nigel 77 op = *ecode;
666 ph10 443
667 nigel 93 switch(op)
668     {
669 ph10 210 case OP_FAIL:
670 ph10 212 RRETURN(MATCH_NOMATCH);
671 ph10 211
672 ph10 210 case OP_PRUNE:
673     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
674     ims, eptrb, flags, RM51);
675     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
676 ph10 212 RRETURN(MATCH_PRUNE);
677 ph10 211
678 ph10 210 case OP_COMMIT:
679     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
680     ims, eptrb, flags, RM52);
681     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
682 ph10 212 RRETURN(MATCH_COMMIT);
683 ph10 211
684 ph10 210 case OP_SKIP:
685     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
686     ims, eptrb, flags, RM53);
687     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
688 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
689 ph10 212 RRETURN(MATCH_SKIP);
690 ph10 211
691 ph10 210 case OP_THEN:
692     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
693 ph10 212 ims, eptrb, flags, RM54);
694 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
695 ph10 212 RRETURN(MATCH_THEN);
696 ph10 211
697 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
698     the current subject position in the working slot at the top of the vector.
699     We mustn't change the current values of the data slot, because they may be
700     set from a previous iteration of this group, and be referred to by a
701     reference inside the group.
702 nigel 77
703 nigel 93 If the bracket fails to match, we need to restore this value and also the
704     values of the final offsets, in case they were set by a previous iteration
705     of the same bracket.
706 nigel 77
707 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
708     a non-capturing bracket. Don't worry about setting the flag for the error
709     case here; that is handled in the code for KET. */
710 nigel 77
711 nigel 93 case OP_CBRA:
712     case OP_SCBRA:
713     number = GET2(ecode, 1+LINK_SIZE);
714 nigel 77 offset = number << 1;
715    
716 ph10 475 #ifdef PCRE_DEBUG
717 nigel 93 printf("start bracket %d\n", number);
718     printf("subject=");
719 nigel 77 pchars(eptr, 16, TRUE, md);
720     printf("\n");
721     #endif
722    
723     if (offset < md->offset_max)
724     {
725     save_offset1 = md->offset_vector[offset];
726     save_offset2 = md->offset_vector[offset+1];
727     save_offset3 = md->offset_vector[md->offset_end - number];
728     save_capture_last = md->capture_last;
729    
730     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
731     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
732    
733 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
734 nigel 77 do
735     {
736 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
737     ims, eptrb, flags, RM1);
738 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
739 nigel 77 md->capture_last = save_capture_last;
740     ecode += GET(ecode, 1);
741     }
742     while (*ecode == OP_ALT);
743    
744     DPRINTF(("bracket %d failed\n", number));
745    
746     md->offset_vector[offset] = save_offset1;
747     md->offset_vector[offset+1] = save_offset2;
748     md->offset_vector[md->offset_end - number] = save_offset3;
749    
750     RRETURN(MATCH_NOMATCH);
751     }
752    
753 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
754     as a non-capturing bracket. */
755 nigel 77
756 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
757     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
758    
759 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
760 nigel 77
761 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
762     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
763    
764 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
765     final alternative within the brackets, we would return the result of a
766     recursive call to match() whatever happened. We can reduce stack usage by
767 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
768     is set.*/
769 nigel 77
770 nigel 93 case OP_BRA:
771     case OP_SBRA:
772     DPRINTF(("start non-capturing bracket\n"));
773     flags = (op >= OP_SBRA)? match_cbegroup : 0;
774 nigel 91 for (;;)
775 nigel 77 {
776 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
777 nigel 93 {
778 ph10 197 if (flags == 0) /* Not a possibly empty group */
779     {
780     ecode += _pcre_OP_lengths[*ecode];
781     DPRINTF(("bracket 0 tail recursion\n"));
782     goto TAIL_RECURSE;
783     }
784    
785     /* Possibly empty group; can't use tail recursion. */
786    
787     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
788     eptrb, flags, RM48);
789     RRETURN(rrc);
790 nigel 93 }
791 nigel 91
792     /* For non-final alternatives, continue the loop for a NOMATCH result;
793     otherwise return. */
794    
795 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
796     eptrb, flags, RM2);
797 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
798 nigel 77 ecode += GET(ecode, 1);
799     }
800 nigel 91 /* Control never reaches here. */
801 nigel 77
802     /* Conditional group: compilation checked that there are no more than
803     two branches. If the condition is false, skipping the first branch takes us
804     past the end if there is only one branch, but that's OK because that is
805 nigel 91 exactly what going to the ket would do. As there is only one branch to be
806     obeyed, we can use tail recursion to avoid using another stack frame. */
807 nigel 77
808     case OP_COND:
809 nigel 93 case OP_SCOND:
810 ph10 399 codelink= GET(ecode, 1);
811 ph10 406
812 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
813     inserted between OP_COND and an assertion condition. */
814 ph10 392
815 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
816     {
817     if (pcre_callout != NULL)
818     {
819     pcre_callout_block cb;
820     cb.version = 1; /* Version 1 of the callout block */
821     cb.callout_number = ecode[LINK_SIZE+2];
822     cb.offset_vector = md->offset_vector;
823     cb.subject = (PCRE_SPTR)md->start_subject;
824     cb.subject_length = md->end_subject - md->start_subject;
825     cb.start_match = mstart - md->start_subject;
826     cb.current_position = eptr - md->start_subject;
827     cb.pattern_position = GET(ecode, LINK_SIZE + 3);
828     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
829     cb.capture_top = offset_top/2;
830     cb.capture_last = md->capture_last;
831     cb.callout_data = md->callout_data;
832     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
833     if (rrc < 0) RRETURN(rrc);
834     }
835     ecode += _pcre_OP_lengths[OP_CALLOUT];
836     }
837 ph10 392
838 ph10 399 condcode = ecode[LINK_SIZE+1];
839 ph10 406
840 ph10 381 /* Now see what the actual condition is */
841 ph10 392
842 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
843 nigel 77 {
844 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
845     {
846 ph10 461 condition = FALSE;
847     ecode += GET(ecode, 1);
848     }
849 ph10 459 else
850 ph10 461 {
851 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
852     condition = (recno == RREF_ANY || recno == md->recursive->group_num);
853 ph10 461
854 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
855     false, but the test was set up by name, scan the table to see if the
856     name refers to any other numbers, and test them. The condition is true
857     if any one is set. */
858 ph10 461
859 ph10 459 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
860     {
861     uschar *slotA = md->name_table;
862     for (i = 0; i < md->name_count; i++)
863 ph10 461 {
864     if (GET2(slotA, 0) == recno) break;
865 ph10 459 slotA += md->name_entry_size;
866     }
867 ph10 461
868 ph10 459 /* Found a name for the number - there can be only one; duplicate
869     names for different numbers are allowed, but not vice versa. First
870     scan down for duplicates. */
871 ph10 461
872 ph10 459 if (i < md->name_count)
873 ph10 461 {
874 ph10 459 uschar *slotB = slotA;
875     while (slotB > md->name_table)
876     {
877     slotB -= md->name_entry_size;
878     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
879     {
880     condition = GET2(slotB, 0) == md->recursive->group_num;
881 ph10 461 if (condition) break;
882     }
883 ph10 459 else break;
884 ph10 461 }
885    
886 ph10 459 /* Scan up for duplicates */
887 ph10 461
888 ph10 459 if (!condition)
889 ph10 461 {
890 ph10 459 slotB = slotA;
891     for (i++; i < md->name_count; i++)
892     {
893     slotB += md->name_entry_size;
894     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
895     {
896     condition = GET2(slotB, 0) == md->recursive->group_num;
897     if (condition) break;
898 ph10 461 }
899 ph10 459 else break;
900 ph10 461 }
901     }
902 ph10 459 }
903 ph10 461 }
904    
905 ph10 459 /* Choose branch according to the condition */
906 ph10 461
907 ph10 459 ecode += condition? 3 : GET(ecode, 1);
908     }
909 ph10 461 }
910 nigel 93
911 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
912 nigel 93 {
913 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
914 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
915 ph10 461
916 ph10 459 /* If the numbered capture is unset, but the reference was by name,
917 ph10 461 scan the table to see if the name refers to any other numbers, and test
918     them. The condition is true if any one is set. This is tediously similar
919     to the code above, but not close enough to try to amalgamate. */
920    
921 ph10 459 if (!condition && condcode == OP_NCREF)
922     {
923 ph10 461 int refno = offset >> 1;
924 ph10 459 uschar *slotA = md->name_table;
925 ph10 461
926 ph10 459 for (i = 0; i < md->name_count; i++)
927 ph10 461 {
928     if (GET2(slotA, 0) == refno) break;
929 ph10 459 slotA += md->name_entry_size;
930     }
931 ph10 461
932     /* Found a name for the number - there can be only one; duplicate names
933     for different numbers are allowed, but not vice versa. First scan down
934 ph10 459 for duplicates. */
935 ph10 461
936 ph10 459 if (i < md->name_count)
937 ph10 461 {
938 ph10 459 uschar *slotB = slotA;
939     while (slotB > md->name_table)
940     {
941     slotB -= md->name_entry_size;
942     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
943     {
944     offset = GET2(slotB, 0) << 1;
945 ph10 461 condition = offset < offset_top &&
946 ph10 459 md->offset_vector[offset] >= 0;
947 ph10 461 if (condition) break;
948     }
949 ph10 459 else break;
950 ph10 461 }
951    
952 ph10 459 /* Scan up for duplicates */
953 ph10 461
954 ph10 459 if (!condition)
955 ph10 461 {
956 ph10 459 slotB = slotA;
957     for (i++; i < md->name_count; i++)
958     {
959     slotB += md->name_entry_size;
960     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
961     {
962     offset = GET2(slotB, 0) << 1;
963 ph10 461 condition = offset < offset_top &&
964 ph10 459 md->offset_vector[offset] >= 0;
965 ph10 461 if (condition) break;
966     }
967 ph10 459 else break;
968 ph10 461 }
969     }
970 ph10 459 }
971 ph10 461 }
972    
973 ph10 459 /* Choose branch according to the condition */
974    
975 nigel 93 ecode += condition? 3 : GET(ecode, 1);
976 nigel 77 }
977    
978 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
979 nigel 93 {
980     condition = FALSE;
981     ecode += GET(ecode, 1);
982     }
983    
984 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
985 nigel 93 the final argument match_condassert causes it to stop at the end of an
986     assertion. */
987 nigel 77
988     else
989     {
990 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
991     match_condassert, RM3);
992 nigel 77 if (rrc == MATCH_MATCH)
993     {
994 nigel 93 condition = TRUE;
995     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
996 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
997     }
998 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
999 nigel 77 {
1000     RRETURN(rrc); /* Need braces because of following else */
1001     }
1002 nigel 93 else
1003     {
1004     condition = FALSE;
1005 ph10 399 ecode += codelink;
1006 nigel 93 }
1007     }
1008 nigel 91
1009 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
1010 ph10 197 we can use tail recursion to avoid using another stack frame, except when
1011     match_cbegroup is required for an unlimited repeat of a possibly empty
1012     group. If the second alternative doesn't exist, we can just plough on. */
1013 nigel 91
1014 nigel 93 if (condition || *ecode == OP_ALT)
1015     {
1016 nigel 91 ecode += 1 + LINK_SIZE;
1017 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
1018     {
1019     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1020     RRETURN(rrc);
1021     }
1022     else /* Group must match something */
1023     {
1024     flags = 0;
1025     goto TAIL_RECURSE;
1026     }
1027 nigel 77 }
1028 ph10 395 else /* Condition false & no alternative */
1029 nigel 93 {
1030     ecode += 1 + LINK_SIZE;
1031     }
1032     break;
1033 nigel 77
1034 ph10 461
1035 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1036     to close any currently open capturing brackets. */
1037 ph10 461
1038 ph10 447 case OP_CLOSE:
1039 ph10 461 number = GET2(ecode, 1);
1040 ph10 447 offset = number << 1;
1041 ph10 461
1042 ph10 475 #ifdef PCRE_DEBUG
1043 ph10 447 printf("end bracket %d at *ACCEPT", number);
1044     printf("\n");
1045     #endif
1046 nigel 77
1047 ph10 447 md->capture_last = number;
1048     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1049     {
1050     md->offset_vector[offset] =
1051     md->offset_vector[md->offset_end - number];
1052     md->offset_vector[offset+1] = eptr - md->start_subject;
1053     if (offset_top <= offset) offset_top = offset + 2;
1054     }
1055     ecode += 3;
1056 ph10 461 break;
1057 ph10 447
1058    
1059 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
1060     recursion, we should restore the offsets appropriately and continue from
1061     after the call. */
1062 nigel 77
1063 ph10 210 case OP_ACCEPT:
1064 nigel 77 case OP_END:
1065     if (md->recursive != NULL && md->recursive->group_num == 0)
1066     {
1067     recursion_info *rec = md->recursive;
1068 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
1069 nigel 77 md->recursive = rec->prevrec;
1070     memmove(md->offset_vector, rec->offset_save,
1071     rec->saved_max * sizeof(int));
1072 ph10 461 offset_top = rec->save_offset_top;
1073 ph10 168 mstart = rec->save_start;
1074 nigel 77 ims = original_ims;
1075     ecode = rec->after_call;
1076     break;
1077     }
1078    
1079 ph10 442 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1080     set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1081     the subject. In both cases, backtracking will then try other alternatives,
1082     if any. */
1083 ph10 443
1084 ph10 442 if (eptr == mstart &&
1085     (md->notempty ||
1086 ph10 443 (md->notempty_atstart &&
1087 ph10 442 mstart == md->start_subject + md->start_offset)))
1088 ph10 443 RRETURN(MATCH_NOMATCH);
1089    
1090 ph10 442 /* Otherwise, we have a match. */
1091 nigel 77
1092 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1093     md->end_offset_top = offset_top; /* and how many extracts were taken */
1094 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1095 nigel 77 RRETURN(MATCH_MATCH);
1096    
1097     /* Change option settings */
1098    
1099     case OP_OPT:
1100     ims = ecode[1];
1101     ecode += 2;
1102     DPRINTF(("ims set to %02lx\n", ims));
1103     break;
1104    
1105     /* Assertion brackets. Check the alternative branches in turn - the
1106     matching won't pass the KET for an assertion. If any one branch matches,
1107     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1108     start of each branch to move the current point backwards, so the code at
1109     this level is identical to the lookahead case. */
1110    
1111     case OP_ASSERT:
1112     case OP_ASSERTBACK:
1113     do
1114     {
1115 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1116     RM4);
1117 nigel 77 if (rrc == MATCH_MATCH) break;
1118 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1119 nigel 77 ecode += GET(ecode, 1);
1120     }
1121     while (*ecode == OP_ALT);
1122     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1123    
1124     /* If checking an assertion for a condition, return MATCH_MATCH. */
1125    
1126     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1127    
1128     /* Continue from after the assertion, updating the offsets high water
1129     mark, since extracts may have been taken during the assertion. */
1130    
1131     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1132     ecode += 1 + LINK_SIZE;
1133     offset_top = md->end_offset_top;
1134     continue;
1135    
1136 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1137 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1138 ph10 473 branches. */
1139 nigel 77
1140     case OP_ASSERT_NOT:
1141     case OP_ASSERTBACK_NOT:
1142     do
1143     {
1144 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1145     RM5);
1146 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
1147 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1148     {
1149     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1150 ph10 482 break;
1151     }
1152 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1153 nigel 77 ecode += GET(ecode,1);
1154     }
1155     while (*ecode == OP_ALT);
1156    
1157     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1158    
1159     ecode += 1 + LINK_SIZE;
1160     continue;
1161    
1162     /* Move the subject pointer back. This occurs only at the start of
1163     each branch of a lookbehind assertion. If we are too close to the start to
1164     move back, this match function fails. When working with UTF-8 we move
1165     back a number of characters, not bytes. */
1166    
1167     case OP_REVERSE:
1168     #ifdef SUPPORT_UTF8
1169     if (utf8)
1170     {
1171 nigel 93 i = GET(ecode, 1);
1172     while (i-- > 0)
1173 nigel 77 {
1174     eptr--;
1175     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1176 ph10 207 BACKCHAR(eptr);
1177 nigel 77 }
1178     }
1179     else
1180     #endif
1181    
1182     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1183    
1184     {
1185 nigel 93 eptr -= GET(ecode, 1);
1186 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1187     }
1188    
1189 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1190 nigel 77
1191 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1192 nigel 77 ecode += 1 + LINK_SIZE;
1193     break;
1194    
1195     /* The callout item calls an external function, if one is provided, passing
1196     details of the match so far. This is mainly for debugging, though the
1197     function is able to force a failure. */
1198    
1199     case OP_CALLOUT:
1200     if (pcre_callout != NULL)
1201     {
1202     pcre_callout_block cb;
1203     cb.version = 1; /* Version 1 of the callout block */
1204     cb.callout_number = ecode[1];
1205     cb.offset_vector = md->offset_vector;
1206 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1207 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
1208 ph10 168 cb.start_match = mstart - md->start_subject;
1209 nigel 77 cb.current_position = eptr - md->start_subject;
1210     cb.pattern_position = GET(ecode, 2);
1211     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1212     cb.capture_top = offset_top/2;
1213     cb.capture_last = md->capture_last;
1214     cb.callout_data = md->callout_data;
1215     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1216     if (rrc < 0) RRETURN(rrc);
1217     }
1218     ecode += 2 + 2*LINK_SIZE;
1219     break;
1220    
1221     /* Recursion either matches the current regex, or some subexpression. The
1222     offset data is the offset to the starting bracket from the start of the
1223     whole pattern. (This is so that it works from duplicated subpatterns.)
1224    
1225     If there are any capturing brackets started but not finished, we have to
1226     save their starting points and reinstate them after the recursion. However,
1227     we don't know how many such there are (offset_top records the completed
1228     total) so we just have to save all the potential data. There may be up to
1229     65535 such values, which is too large to put on the stack, but using malloc
1230     for small numbers seems expensive. As a compromise, the stack is used when
1231     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1232     is used. If the malloc fails, the offsets cannot be saved, so the recursion
1233     is not attempted: the code below gives up at that point with
1234     PCRE_ERROR_NOMEMORY.
1235    
1236     There are also other values that have to be saved. We use a chained
1237     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1238     for the original version of this logic. */
1239    
1240     case OP_RECURSE:
1241     {
1242     callpat = md->start_code + GET(ecode, 1);
1243 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1244     GET2(callpat, 1 + LINK_SIZE);
1245 nigel 77
1246     /* Add to "recursing stack" */
1247    
1248     new_recursive.prevrec = md->recursive;
1249     md->recursive = &new_recursive;
1250    
1251     /* Find where to continue from afterwards */
1252    
1253     ecode += 1 + LINK_SIZE;
1254     new_recursive.after_call = ecode;
1255    
1256     /* Now save the offset data. */
1257    
1258     new_recursive.saved_max = md->offset_end;
1259     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1260     new_recursive.offset_save = stacksave;
1261     else
1262     {
1263     new_recursive.offset_save =
1264     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1265     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1266     }
1267    
1268     memcpy(new_recursive.offset_save, md->offset_vector,
1269     new_recursive.saved_max * sizeof(int));
1270 ph10 168 new_recursive.save_start = mstart;
1271 ph10 461 new_recursive.save_offset_top = offset_top;
1272 ph10 168 mstart = eptr;
1273 nigel 77
1274     /* OK, now we can do the recursion. For each top-level alternative we
1275     restore the offset and recursion data. */
1276    
1277     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1278 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1279 nigel 77 do
1280     {
1281 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1282     md, ims, eptrb, flags, RM6);
1283 nigel 77 if (rrc == MATCH_MATCH)
1284     {
1285 nigel 87 DPRINTF(("Recursion matched\n"));
1286 nigel 77 md->recursive = new_recursive.prevrec;
1287     if (new_recursive.offset_save != stacksave)
1288     (pcre_free)(new_recursive.offset_save);
1289     RRETURN(MATCH_MATCH);
1290     }
1291 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1292 nigel 87 {
1293     DPRINTF(("Recursion gave error %d\n", rrc));
1294 ph10 400 if (new_recursive.offset_save != stacksave)
1295     (pcre_free)(new_recursive.offset_save);
1296 nigel 87 RRETURN(rrc);
1297     }
1298 nigel 77
1299     md->recursive = &new_recursive;
1300     memcpy(md->offset_vector, new_recursive.offset_save,
1301     new_recursive.saved_max * sizeof(int));
1302     callpat += GET(callpat, 1);
1303     }
1304     while (*callpat == OP_ALT);
1305    
1306     DPRINTF(("Recursion didn't match\n"));
1307     md->recursive = new_recursive.prevrec;
1308     if (new_recursive.offset_save != stacksave)
1309     (pcre_free)(new_recursive.offset_save);
1310     RRETURN(MATCH_NOMATCH);
1311     }
1312     /* Control never reaches here */
1313    
1314     /* "Once" brackets are like assertion brackets except that after a match,
1315     the point in the subject string is not moved back. Thus there can never be
1316     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1317     Check the alternative branches in turn - the matching won't pass the KET
1318     for this kind of subpattern. If any one branch matches, we carry on as at
1319     the end of a normal bracket, leaving the subject pointer. */
1320    
1321     case OP_ONCE:
1322 nigel 91 prev = ecode;
1323     saved_eptr = eptr;
1324    
1325     do
1326 nigel 77 {
1327 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1328 nigel 91 if (rrc == MATCH_MATCH) break;
1329 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1330 nigel 91 ecode += GET(ecode,1);
1331     }
1332     while (*ecode == OP_ALT);
1333 nigel 77
1334 nigel 91 /* If hit the end of the group (which could be repeated), fail */
1335 nigel 77
1336 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1337 nigel 77
1338 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1339     mark, since extracts may have been taken. */
1340 nigel 77
1341 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1342 nigel 77
1343 nigel 91 offset_top = md->end_offset_top;
1344     eptr = md->end_match_ptr;
1345 nigel 77
1346 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1347     happens for a repeating ket if no characters were matched in the group.
1348     This is the forcible breaking of infinite loops as implemented in Perl
1349     5.005. If there is an options reset, it will get obeyed in the normal
1350     course of events. */
1351 nigel 77
1352 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1353     {
1354     ecode += 1+LINK_SIZE;
1355     break;
1356     }
1357 nigel 77
1358 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1359     preceding bracket, in the appropriate order. The second "call" of match()
1360     uses tail recursion, to avoid using another stack frame. We need to reset
1361     any options that changed within the bracket before re-running it, so
1362     check the next opcode. */
1363 nigel 77
1364 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1365     {
1366     ims = (ims & ~PCRE_IMS) | ecode[4];
1367     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1368     }
1369 nigel 77
1370 nigel 91 if (*ecode == OP_KETRMIN)
1371     {
1372 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1373 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1374     ecode = prev;
1375 ph10 197 flags = 0;
1376 nigel 91 goto TAIL_RECURSE;
1377 nigel 77 }
1378 nigel 91 else /* OP_KETRMAX */
1379     {
1380 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1381 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1382     ecode += 1 + LINK_SIZE;
1383 ph10 197 flags = 0;
1384 nigel 91 goto TAIL_RECURSE;
1385     }
1386     /* Control never gets here */
1387 nigel 77
1388     /* An alternation is the end of a branch; scan along to find the end of the
1389     bracketed group and go to there. */
1390    
1391     case OP_ALT:
1392     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1393     break;
1394    
1395 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1396     indicating that it may occur zero times. It may repeat infinitely, or not
1397     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1398     with fixed upper repeat limits are compiled as a number of copies, with the
1399     optional ones preceded by BRAZERO or BRAMINZERO. */
1400 nigel 77
1401     case OP_BRAZERO:
1402     {
1403     next = ecode+1;
1404 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1405 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1406     do next += GET(next,1); while (*next == OP_ALT);
1407 nigel 93 ecode = next + 1 + LINK_SIZE;
1408 nigel 77 }
1409     break;
1410    
1411     case OP_BRAMINZERO:
1412     {
1413     next = ecode+1;
1414 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1415 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1416 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1417     ecode++;
1418     }
1419     break;
1420    
1421 ph10 335 case OP_SKIPZERO:
1422     {
1423     next = ecode+1;
1424     do next += GET(next,1); while (*next == OP_ALT);
1425     ecode = next + 1 + LINK_SIZE;
1426     }
1427     break;
1428    
1429 nigel 93 /* End of a group, repeated or non-repeating. */
1430 nigel 77
1431     case OP_KET:
1432     case OP_KETRMIN:
1433     case OP_KETRMAX:
1434 nigel 91 prev = ecode - GET(ecode, 1);
1435 nigel 77
1436 nigel 93 /* If this was a group that remembered the subject start, in order to break
1437     infinite repeats of empty string matches, retrieve the subject start from
1438     the chain. Otherwise, set it NULL. */
1439 nigel 77
1440 nigel 93 if (*prev >= OP_SBRA)
1441     {
1442     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1443     eptrb = eptrb->epb_prev; /* Backup to previous group */
1444     }
1445     else saved_eptr = NULL;
1446 nigel 77
1447 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1448     MATCH_MATCH, but record the current high water mark for use by positive
1449     assertions. Do this also for the "once" (atomic) groups. */
1450    
1451 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1452     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1453     *prev == OP_ONCE)
1454     {
1455     md->end_match_ptr = eptr; /* For ONCE */
1456     md->end_offset_top = offset_top;
1457     RRETURN(MATCH_MATCH);
1458     }
1459 nigel 77
1460 nigel 93 /* For capturing groups we have to check the group number back at the start
1461     and if necessary complete handling an extraction by setting the offsets and
1462     bumping the high water mark. Note that whole-pattern recursion is coded as
1463     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1464     when the OP_END is reached. Other recursion is handled here. */
1465 nigel 77
1466 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1467 nigel 91 {
1468 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1469 nigel 91 offset = number << 1;
1470 ph10 461
1471 ph10 475 #ifdef PCRE_DEBUG
1472 nigel 91 printf("end bracket %d", number);
1473     printf("\n");
1474 nigel 77 #endif
1475    
1476 nigel 93 md->capture_last = number;
1477     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1478 nigel 91 {
1479 nigel 93 md->offset_vector[offset] =
1480     md->offset_vector[md->offset_end - number];
1481     md->offset_vector[offset+1] = eptr - md->start_subject;
1482     if (offset_top <= offset) offset_top = offset + 2;
1483     }
1484 nigel 77
1485 nigel 93 /* Handle a recursively called group. Restore the offsets
1486     appropriately and continue from after the call. */
1487 nigel 77
1488 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1489     {
1490     recursion_info *rec = md->recursive;
1491     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1492     md->recursive = rec->prevrec;
1493 ph10 168 mstart = rec->save_start;
1494 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1495     rec->saved_max * sizeof(int));
1496 ph10 461 offset_top = rec->save_offset_top;
1497 nigel 93 ecode = rec->after_call;
1498     ims = original_ims;
1499     break;
1500 nigel 77 }
1501 nigel 91 }
1502 nigel 77
1503 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1504     flags, in case they got changed during the group. */
1505 nigel 77
1506 nigel 91 ims = original_ims;
1507     DPRINTF(("ims reset to %02lx\n", ims));
1508 nigel 77
1509 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1510     happens for a repeating ket if no characters were matched in the group.
1511     This is the forcible breaking of infinite loops as implemented in Perl
1512     5.005. If there is an options reset, it will get obeyed in the normal
1513     course of events. */
1514 nigel 77
1515 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1516     {
1517     ecode += 1 + LINK_SIZE;
1518     break;
1519     }
1520 nigel 77
1521 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1522     preceding bracket, in the appropriate order. In the second case, we can use
1523 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1524     unlimited repeat of a group that can match an empty string. */
1525 nigel 77
1526 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1527    
1528 nigel 91 if (*ecode == OP_KETRMIN)
1529     {
1530 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1531 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1532 ph10 197 if (flags != 0) /* Could match an empty string */
1533     {
1534     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1535     RRETURN(rrc);
1536     }
1537 nigel 91 ecode = prev;
1538     goto TAIL_RECURSE;
1539 nigel 77 }
1540 nigel 91 else /* OP_KETRMAX */
1541     {
1542 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1543 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1544     ecode += 1 + LINK_SIZE;
1545 ph10 197 flags = 0;
1546 nigel 91 goto TAIL_RECURSE;
1547     }
1548     /* Control never gets here */
1549 nigel 77
1550     /* Start of subject unless notbol, or after internal newline if multiline */
1551    
1552     case OP_CIRC:
1553     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1554     if ((ims & PCRE_MULTILINE) != 0)
1555     {
1556 nigel 91 if (eptr != md->start_subject &&
1557 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1558 nigel 77 RRETURN(MATCH_NOMATCH);
1559     ecode++;
1560     break;
1561     }
1562     /* ... else fall through */
1563    
1564     /* Start of subject assertion */
1565    
1566     case OP_SOD:
1567     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1568     ecode++;
1569     break;
1570    
1571     /* Start of match assertion */
1572    
1573     case OP_SOM:
1574     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1575     ecode++;
1576     break;
1577 ph10 172
1578 ph10 168 /* Reset the start of match point */
1579 ph10 172
1580 ph10 168 case OP_SET_SOM:
1581     mstart = eptr;
1582 ph10 172 ecode++;
1583     break;
1584 nigel 77
1585     /* Assert before internal newline if multiline, or before a terminating
1586     newline unless endonly is set, else end of subject unless noteol is set. */
1587    
1588     case OP_DOLL:
1589     if ((ims & PCRE_MULTILINE) != 0)
1590     {
1591     if (eptr < md->end_subject)
1592 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1593 nigel 77 else
1594     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1595     ecode++;
1596     break;
1597     }
1598     else
1599     {
1600     if (md->noteol) RRETURN(MATCH_NOMATCH);
1601     if (!md->endonly)
1602     {
1603 nigel 91 if (eptr != md->end_subject &&
1604 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1605 nigel 77 RRETURN(MATCH_NOMATCH);
1606     ecode++;
1607     break;
1608     }
1609     }
1610 nigel 91 /* ... else fall through for endonly */
1611 nigel 77
1612     /* End of subject assertion (\z) */
1613    
1614     case OP_EOD:
1615     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1616     ecode++;
1617     break;
1618    
1619     /* End of subject or ending \n assertion (\Z) */
1620    
1621     case OP_EODN:
1622 nigel 91 if (eptr != md->end_subject &&
1623 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1624 nigel 91 RRETURN(MATCH_NOMATCH);
1625 nigel 77 ecode++;
1626     break;
1627    
1628     /* Word boundary assertions */
1629    
1630     case OP_NOT_WORD_BOUNDARY:
1631     case OP_WORD_BOUNDARY:
1632     {
1633    
1634     /* Find out if the previous and current characters are "word" characters.
1635     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1636 ph10 443 be "non-word" characters. Remember the earliest consulted character for
1637 ph10 435 partial matching. */
1638 nigel 77
1639     #ifdef SUPPORT_UTF8
1640     if (utf8)
1641     {
1642     if (eptr == md->start_subject) prev_is_word = FALSE; else
1643     {
1644 ph10 409 USPTR lastptr = eptr - 1;
1645 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1646 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1647 nigel 77 GETCHAR(c, lastptr);
1648     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1649     }
1650 ph10 443 if (eptr >= md->end_subject)
1651 nigel 77 {
1652 ph10 443 SCHECK_PARTIAL();
1653     cur_is_word = FALSE;
1654 ph10 428 }
1655     else
1656     {
1657 nigel 77 GETCHAR(c, eptr);
1658     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1659     }
1660     }
1661     else
1662     #endif
1663    
1664 ph10 428 /* Not in UTF-8 mode */
1665 nigel 77
1666     {
1667 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
1668     {
1669 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1670 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1671     }
1672 ph10 443 if (eptr >= md->end_subject)
1673 ph10 428 {
1674 ph10 443 SCHECK_PARTIAL();
1675     cur_is_word = FALSE;
1676 ph10 428 }
1677     else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1678 nigel 77 }
1679    
1680     /* Now see if the situation is what we want */
1681    
1682     if ((*ecode++ == OP_WORD_BOUNDARY)?
1683     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1684     RRETURN(MATCH_NOMATCH);
1685     }
1686     break;
1687    
1688     /* Match a single character type; inline for speed */
1689    
1690     case OP_ANY:
1691 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1692 ph10 345 /* Fall through */
1693    
1694 ph10 341 case OP_ALLANY:
1695 ph10 443 if (eptr++ >= md->end_subject)
1696 ph10 428 {
1697 ph10 443 SCHECK_PARTIAL();
1698 ph10 428 RRETURN(MATCH_NOMATCH);
1699 ph10 443 }
1700 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1701 nigel 77 ecode++;
1702     break;
1703    
1704     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1705     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1706    
1707     case OP_ANYBYTE:
1708 ph10 443 if (eptr++ >= md->end_subject)
1709 ph10 428 {
1710 ph10 443 SCHECK_PARTIAL();
1711 ph10 428 RRETURN(MATCH_NOMATCH);
1712 ph10 443 }
1713 nigel 77 ecode++;
1714     break;
1715    
1716     case OP_NOT_DIGIT:
1717 ph10 443 if (eptr >= md->end_subject)
1718 ph10 428 {
1719 ph10 443 SCHECK_PARTIAL();
1720 ph10 428 RRETURN(MATCH_NOMATCH);
1721 ph10 443 }
1722 nigel 77 GETCHARINCTEST(c, eptr);
1723     if (
1724     #ifdef SUPPORT_UTF8
1725     c < 256 &&
1726     #endif
1727     (md->ctypes[c] & ctype_digit) != 0
1728     )
1729     RRETURN(MATCH_NOMATCH);
1730     ecode++;
1731     break;
1732    
1733     case OP_DIGIT:
1734 ph10 443 if (eptr >= md->end_subject)
1735 ph10 428 {
1736 ph10 443 SCHECK_PARTIAL();
1737 ph10 428 RRETURN(MATCH_NOMATCH);
1738 ph10 443 }
1739 nigel 77 GETCHARINCTEST(c, eptr);
1740     if (
1741     #ifdef SUPPORT_UTF8
1742     c >= 256 ||
1743     #endif
1744     (md->ctypes[c] & ctype_digit) == 0
1745     )
1746     RRETURN(MATCH_NOMATCH);
1747     ecode++;
1748     break;
1749    
1750     case OP_NOT_WHITESPACE:
1751 ph10 443 if (eptr >= md->end_subject)
1752 ph10 428 {
1753 ph10 443 SCHECK_PARTIAL();
1754 ph10 428 RRETURN(MATCH_NOMATCH);
1755 ph10 443 }
1756 nigel 77 GETCHARINCTEST(c, eptr);
1757     if (
1758     #ifdef SUPPORT_UTF8
1759     c < 256 &&
1760     #endif
1761     (md->ctypes[c] & ctype_space) != 0
1762     )
1763     RRETURN(MATCH_NOMATCH);
1764     ecode++;
1765     break;
1766    
1767     case OP_WHITESPACE:
1768 ph10 443 if (eptr >= md->end_subject)
1769 ph10 428 {
1770 ph10 443 SCHECK_PARTIAL();
1771 ph10 428 RRETURN(MATCH_NOMATCH);
1772 ph10 443 }
1773 nigel 77 GETCHARINCTEST(c, eptr);
1774     if (
1775     #ifdef SUPPORT_UTF8
1776     c >= 256 ||
1777     #endif
1778     (md->ctypes[c] & ctype_space) == 0
1779     )
1780     RRETURN(MATCH_NOMATCH);
1781     ecode++;
1782     break;
1783    
1784     case OP_NOT_WORDCHAR:
1785 ph10 443 if (eptr >= md->end_subject)
1786 ph10 428 {
1787 ph10 443 SCHECK_PARTIAL();
1788 ph10 428 RRETURN(MATCH_NOMATCH);
1789 ph10 443 }
1790 nigel 77 GETCHARINCTEST(c, eptr);
1791     if (
1792     #ifdef SUPPORT_UTF8
1793     c < 256 &&
1794     #endif
1795     (md->ctypes[c] & ctype_word) != 0
1796     )
1797     RRETURN(MATCH_NOMATCH);
1798     ecode++;
1799     break;
1800    
1801     case OP_WORDCHAR:
1802 ph10 443 if (eptr >= md->end_subject)
1803 ph10 428 {
1804 ph10 443 SCHECK_PARTIAL();
1805 ph10 428 RRETURN(MATCH_NOMATCH);
1806 ph10 443 }
1807 nigel 77 GETCHARINCTEST(c, eptr);
1808     if (
1809     #ifdef SUPPORT_UTF8
1810     c >= 256 ||
1811     #endif
1812     (md->ctypes[c] & ctype_word) == 0
1813     )
1814     RRETURN(MATCH_NOMATCH);
1815     ecode++;
1816     break;
1817    
1818 nigel 93 case OP_ANYNL:
1819 ph10 443 if (eptr >= md->end_subject)
1820 ph10 428 {
1821 ph10 443 SCHECK_PARTIAL();
1822 ph10 428 RRETURN(MATCH_NOMATCH);
1823 ph10 443 }
1824 nigel 93 GETCHARINCTEST(c, eptr);
1825     switch(c)
1826     {
1827     default: RRETURN(MATCH_NOMATCH);
1828     case 0x000d:
1829     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1830     break;
1831 ph10 231
1832 nigel 93 case 0x000a:
1833 ph10 231 break;
1834    
1835 nigel 93 case 0x000b:
1836     case 0x000c:
1837     case 0x0085:
1838     case 0x2028:
1839     case 0x2029:
1840 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1841 nigel 93 break;
1842     }
1843     ecode++;
1844     break;
1845    
1846 ph10 178 case OP_NOT_HSPACE:
1847 ph10 443 if (eptr >= md->end_subject)
1848 ph10 428 {
1849 ph10 443 SCHECK_PARTIAL();
1850 ph10 428 RRETURN(MATCH_NOMATCH);
1851 ph10 443 }
1852 ph10 178 GETCHARINCTEST(c, eptr);
1853     switch(c)
1854     {
1855     default: break;
1856     case 0x09: /* HT */
1857     case 0x20: /* SPACE */
1858     case 0xa0: /* NBSP */
1859     case 0x1680: /* OGHAM SPACE MARK */
1860     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1861     case 0x2000: /* EN QUAD */
1862     case 0x2001: /* EM QUAD */
1863     case 0x2002: /* EN SPACE */
1864     case 0x2003: /* EM SPACE */
1865     case 0x2004: /* THREE-PER-EM SPACE */
1866     case 0x2005: /* FOUR-PER-EM SPACE */
1867     case 0x2006: /* SIX-PER-EM SPACE */
1868     case 0x2007: /* FIGURE SPACE */
1869     case 0x2008: /* PUNCTUATION SPACE */
1870     case 0x2009: /* THIN SPACE */
1871     case 0x200A: /* HAIR SPACE */
1872     case 0x202f: /* NARROW NO-BREAK SPACE */
1873     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1874     case 0x3000: /* IDEOGRAPHIC SPACE */
1875     RRETURN(MATCH_NOMATCH);
1876     }
1877     ecode++;
1878     break;
1879    
1880     case OP_HSPACE:
1881 ph10 443 if (eptr >= md->end_subject)
1882 ph10 428 {
1883 ph10 443 SCHECK_PARTIAL();
1884 ph10 428 RRETURN(MATCH_NOMATCH);
1885 ph10 443 }
1886 ph10 178 GETCHARINCTEST(c, eptr);
1887     switch(c)
1888     {
1889     default: RRETURN(MATCH_NOMATCH);
1890     case 0x09: /* HT */
1891     case 0x20: /* SPACE */
1892     case 0xa0: /* NBSP */
1893     case 0x1680: /* OGHAM SPACE MARK */
1894     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1895     case 0x2000: /* EN QUAD */
1896     case 0x2001: /* EM QUAD */
1897     case 0x2002: /* EN SPACE */
1898     case 0x2003: /* EM SPACE */
1899     case 0x2004: /* THREE-PER-EM SPACE */
1900     case 0x2005: /* FOUR-PER-EM SPACE */
1901     case 0x2006: /* SIX-PER-EM SPACE */
1902     case 0x2007: /* FIGURE SPACE */
1903     case 0x2008: /* PUNCTUATION SPACE */
1904     case 0x2009: /* THIN SPACE */
1905     case 0x200A: /* HAIR SPACE */
1906     case 0x202f: /* NARROW NO-BREAK SPACE */
1907     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1908     case 0x3000: /* IDEOGRAPHIC SPACE */
1909     break;
1910     }
1911     ecode++;
1912     break;
1913    
1914     case OP_NOT_VSPACE:
1915 ph10 443 if (eptr >= md->end_subject)
1916 ph10 428 {
1917 ph10 443 SCHECK_PARTIAL();
1918 ph10 428 RRETURN(MATCH_NOMATCH);
1919 ph10 443 }
1920 ph10 178 GETCHARINCTEST(c, eptr);
1921     switch(c)
1922     {
1923     default: break;
1924     case 0x0a: /* LF */
1925     case 0x0b: /* VT */
1926     case 0x0c: /* FF */
1927     case 0x0d: /* CR */
1928     case 0x85: /* NEL */
1929     case 0x2028: /* LINE SEPARATOR */
1930     case 0x2029: /* PARAGRAPH SEPARATOR */
1931     RRETURN(MATCH_NOMATCH);
1932     }
1933     ecode++;
1934     break;
1935    
1936     case OP_VSPACE:
1937 ph10 443 if (eptr >= md->end_subject)
1938 ph10 428 {
1939 ph10 443 SCHECK_PARTIAL();
1940 ph10 428 RRETURN(MATCH_NOMATCH);
1941 ph10 443 }
1942 ph10 178 GETCHARINCTEST(c, eptr);
1943     switch(c)
1944     {
1945     default: RRETURN(MATCH_NOMATCH);
1946     case 0x0a: /* LF */
1947     case 0x0b: /* VT */
1948     case 0x0c: /* FF */
1949     case 0x0d: /* CR */
1950     case 0x85: /* NEL */
1951     case 0x2028: /* LINE SEPARATOR */
1952     case 0x2029: /* PARAGRAPH SEPARATOR */
1953     break;
1954     }
1955     ecode++;
1956     break;
1957    
1958 nigel 77 #ifdef SUPPORT_UCP
1959     /* Check the next character by Unicode property. We will get here only
1960     if the support is in the binary; otherwise a compile-time error occurs. */
1961    
1962     case OP_PROP:
1963     case OP_NOTPROP:
1964 ph10 443 if (eptr >= md->end_subject)
1965 ph10 428 {
1966 ph10 443 SCHECK_PARTIAL();
1967 ph10 428 RRETURN(MATCH_NOMATCH);
1968 ph10 443 }
1969 nigel 77 GETCHARINCTEST(c, eptr);
1970     {
1971 ph10 384 const ucd_record *prop = GET_UCD(c);
1972 nigel 77
1973 nigel 87 switch(ecode[1])
1974     {
1975     case PT_ANY:
1976     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1977     break;
1978 nigel 77
1979 nigel 87 case PT_LAMP:
1980 ph10 349 if ((prop->chartype == ucp_Lu ||
1981     prop->chartype == ucp_Ll ||
1982     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1983 nigel 77 RRETURN(MATCH_NOMATCH);
1984 nigel 87 break;
1985    
1986     case PT_GC:
1987 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1988 nigel 77 RRETURN(MATCH_NOMATCH);
1989 nigel 87 break;
1990    
1991     case PT_PC:
1992 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1993 nigel 87 RRETURN(MATCH_NOMATCH);
1994     break;
1995    
1996     case PT_SC:
1997 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
1998 nigel 87 RRETURN(MATCH_NOMATCH);
1999     break;
2000    
2001     default:
2002     RRETURN(PCRE_ERROR_INTERNAL);
2003 nigel 77 }
2004 nigel 87
2005     ecode += 3;
2006 nigel 77 }
2007     break;
2008    
2009     /* Match an extended Unicode sequence. We will get here only if the support
2010     is in the binary; otherwise a compile-time error occurs. */
2011    
2012     case OP_EXTUNI:
2013 ph10 443 if (eptr >= md->end_subject)
2014 ph10 428 {
2015 ph10 443 SCHECK_PARTIAL();
2016 ph10 428 RRETURN(MATCH_NOMATCH);
2017 ph10 443 }
2018 nigel 77 GETCHARINCTEST(c, eptr);
2019     {
2020 ph10 349 int category = UCD_CATEGORY(c);
2021 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
2022     while (eptr < md->end_subject)
2023     {
2024     int len = 1;
2025     if (!utf8) c = *eptr; else
2026     {
2027     GETCHARLEN(c, eptr, len);
2028     }
2029 ph10 349 category = UCD_CATEGORY(c);
2030 nigel 77 if (category != ucp_M) break;
2031     eptr += len;
2032     }
2033     }
2034     ecode++;
2035     break;
2036     #endif
2037    
2038    
2039     /* Match a back reference, possibly repeatedly. Look past the end of the
2040     item to see if there is repeat information following. The code is similar
2041     to that for character classes, but repeated for efficiency. Then obey
2042     similar code to character type repeats - written out again for speed.
2043     However, if the referenced string is the empty string, always treat
2044     it as matched, any number of times (otherwise there could be infinite
2045     loops). */
2046    
2047     case OP_REF:
2048     {
2049     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2050 ph10 345 ecode += 3;
2051    
2052 ph10 336 /* If the reference is unset, there are two possibilities:
2053 ph10 345
2054 ph10 336 (a) In the default, Perl-compatible state, set the length to be longer
2055     than the amount of subject left; this ensures that every attempt at a
2056     match fails. We can't just fail here, because of the possibility of
2057     quantifiers with zero minima.
2058 ph10 345
2059     (b) If the JavaScript compatibility flag is set, set the length to zero
2060     so that the back reference matches an empty string.
2061    
2062     Otherwise, set the length to the length of what was matched by the
2063 ph10 336 referenced subpattern. */
2064 ph10 345
2065 ph10 336 if (offset >= offset_top || md->offset_vector[offset] < 0)
2066 ph10 345 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
2067 ph10 336 else
2068     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2069 nigel 77
2070     /* Set up for repetition, or handle the non-repeated case */
2071    
2072     switch (*ecode)
2073     {
2074     case OP_CRSTAR:
2075     case OP_CRMINSTAR:
2076     case OP_CRPLUS:
2077     case OP_CRMINPLUS:
2078     case OP_CRQUERY:
2079     case OP_CRMINQUERY:
2080     c = *ecode++ - OP_CRSTAR;
2081     minimize = (c & 1) != 0;
2082     min = rep_min[c]; /* Pick up values from tables; */
2083     max = rep_max[c]; /* zero for max => infinity */
2084     if (max == 0) max = INT_MAX;
2085     break;
2086    
2087     case OP_CRRANGE:
2088     case OP_CRMINRANGE:
2089     minimize = (*ecode == OP_CRMINRANGE);
2090     min = GET2(ecode, 1);
2091     max = GET2(ecode, 3);
2092     if (max == 0) max = INT_MAX;
2093     ecode += 5;
2094     break;
2095    
2096     default: /* No repeat follows */
2097 ph10 443 if (!match_ref(offset, eptr, length, md, ims))
2098 ph10 428 {
2099 ph10 443 CHECK_PARTIAL();
2100 ph10 428 RRETURN(MATCH_NOMATCH);
2101 ph10 443 }
2102 nigel 77 eptr += length;
2103     continue; /* With the main loop */
2104     }
2105    
2106     /* If the length of the reference is zero, just continue with the
2107     main loop. */
2108 ph10 443
2109 nigel 77 if (length == 0) continue;
2110    
2111     /* First, ensure the minimum number of matches are present. We get back
2112     the length of the reference string explicitly rather than passing the
2113     address of eptr, so that eptr can be a register variable. */
2114    
2115     for (i = 1; i <= min; i++)
2116     {
2117 ph10 427 if (!match_ref(offset, eptr, length, md, ims))
2118 ph10 426 {
2119 ph10 427 CHECK_PARTIAL();
2120 ph10 426 RRETURN(MATCH_NOMATCH);
2121 ph10 427 }
2122 nigel 77 eptr += length;
2123     }
2124    
2125     /* If min = max, continue at the same level without recursion.
2126     They are not both allowed to be zero. */
2127    
2128     if (min == max) continue;
2129    
2130     /* If minimizing, keep trying and advancing the pointer */
2131    
2132     if (minimize)
2133     {
2134     for (fi = min;; fi++)
2135     {
2136 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2137 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2138 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2139     if (!match_ref(offset, eptr, length, md, ims))
2140 ph10 426 {
2141 ph10 427 CHECK_PARTIAL();
2142 nigel 77 RRETURN(MATCH_NOMATCH);
2143 ph10 427 }
2144 nigel 77 eptr += length;
2145     }
2146     /* Control never gets here */
2147     }
2148    
2149     /* If maximizing, find the longest string and work backwards */
2150    
2151     else
2152     {
2153     pp = eptr;
2154     for (i = min; i < max; i++)
2155     {
2156 ph10 463 if (!match_ref(offset, eptr, length, md, ims))
2157 ph10 462 {
2158 ph10 463 CHECK_PARTIAL();
2159 ph10 462 break;
2160 ph10 463 }
2161 nigel 77 eptr += length;
2162     }
2163     while (eptr >= pp)
2164     {
2165 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2166 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2167     eptr -= length;
2168     }
2169     RRETURN(MATCH_NOMATCH);
2170     }
2171     }
2172     /* Control never gets here */
2173    
2174     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2175     used when all the characters in the class have values in the range 0-255,
2176     and either the matching is caseful, or the characters are in the range
2177     0-127 when UTF-8 processing is enabled. The only difference between
2178     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2179     encountered.
2180    
2181     First, look past the end of the item to see if there is repeat information
2182     following. Then obey similar code to character type repeats - written out
2183     again for speed. */
2184    
2185     case OP_NCLASS:
2186     case OP_CLASS:
2187     {
2188     data = ecode + 1; /* Save for matching */
2189     ecode += 33; /* Advance past the item */
2190    
2191     switch (*ecode)
2192     {
2193     case OP_CRSTAR:
2194     case OP_CRMINSTAR:
2195     case OP_CRPLUS:
2196     case OP_CRMINPLUS:
2197     case OP_CRQUERY:
2198     case OP_CRMINQUERY:
2199     c = *ecode++ - OP_CRSTAR;
2200     minimize = (c & 1) != 0;
2201     min = rep_min[c]; /* Pick up values from tables; */
2202     max = rep_max[c]; /* zero for max => infinity */
2203     if (max == 0) max = INT_MAX;
2204     break;
2205    
2206     case OP_CRRANGE:
2207     case OP_CRMINRANGE:
2208     minimize = (*ecode == OP_CRMINRANGE);
2209     min = GET2(ecode, 1);
2210     max = GET2(ecode, 3);
2211     if (max == 0) max = INT_MAX;
2212     ecode += 5;
2213     break;
2214    
2215     default: /* No repeat follows */
2216     min = max = 1;
2217     break;
2218     }
2219    
2220     /* First, ensure the minimum number of matches are present. */
2221    
2222     #ifdef SUPPORT_UTF8
2223     /* UTF-8 mode */
2224     if (utf8)
2225     {
2226     for (i = 1; i <= min; i++)
2227     {
2228 ph10 427 if (eptr >= md->end_subject)
2229 ph10 426 {
2230 ph10 428 SCHECK_PARTIAL();
2231 ph10 426 RRETURN(MATCH_NOMATCH);
2232 ph10 427 }
2233 nigel 77 GETCHARINC(c, eptr);
2234     if (c > 255)
2235     {
2236     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2237     }
2238     else
2239     {
2240     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2241     }
2242     }
2243     }
2244     else
2245     #endif
2246     /* Not UTF-8 mode */
2247     {
2248     for (i = 1; i <= min; i++)
2249     {
2250 ph10 427 if (eptr >= md->end_subject)
2251 ph10 426 {
2252 ph10 428 SCHECK_PARTIAL();
2253 ph10 426 RRETURN(MATCH_NOMATCH);
2254 ph10 427 }
2255 nigel 77 c = *eptr++;
2256     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2257     }
2258     }
2259    
2260     /* If max == min we can continue with the main loop without the
2261     need to recurse. */
2262    
2263     if (min == max) continue;
2264    
2265     /* If minimizing, keep testing the rest of the expression and advancing
2266     the pointer while it matches the class. */
2267    
2268     if (minimize)
2269     {
2270     #ifdef SUPPORT_UTF8
2271     /* UTF-8 mode */
2272     if (utf8)
2273     {
2274     for (fi = min;; fi++)
2275     {
2276 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2277 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2278 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2279 ph10 427 if (eptr >= md->end_subject)
2280 ph10 426 {
2281 ph10 427 SCHECK_PARTIAL();
2282 ph10 426 RRETURN(MATCH_NOMATCH);
2283 ph10 427 }
2284 nigel 77 GETCHARINC(c, eptr);
2285     if (c > 255)
2286     {
2287     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2288     }
2289     else
2290     {
2291     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2292     }
2293     }
2294     }
2295     else
2296     #endif
2297     /* Not UTF-8 mode */
2298     {
2299     for (fi = min;; fi++)
2300     {
2301 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2302 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2303 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2304 ph10 427 if (eptr >= md->end_subject)
2305 ph10 426 {
2306 ph10 427 SCHECK_PARTIAL();
2307 ph10 426 RRETURN(MATCH_NOMATCH);
2308 ph10 427 }
2309 nigel 77 c = *eptr++;
2310     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2311     }
2312     }
2313     /* Control never gets here */
2314     }
2315    
2316     /* If maximizing, find the longest possible run, then work backwards. */
2317    
2318     else
2319     {
2320     pp = eptr;
2321    
2322     #ifdef SUPPORT_UTF8
2323     /* UTF-8 mode */
2324     if (utf8)
2325     {
2326     for (i = min; i < max; i++)
2327     {
2328     int len = 1;
2329 ph10 463 if (eptr >= md->end_subject)
2330 ph10 462 {
2331 ph10 463 SCHECK_PARTIAL();
2332 ph10 462 break;
2333 ph10 463 }
2334 nigel 77 GETCHARLEN(c, eptr, len);
2335     if (c > 255)
2336     {
2337     if (op == OP_CLASS) break;
2338     }
2339     else
2340     {
2341     if ((data[c/8] & (1 << (c&7))) == 0) break;
2342     }
2343     eptr += len;
2344     }
2345     for (;;)
2346     {
2347 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2348 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2349     if (eptr-- == pp) break; /* Stop if tried at original pos */
2350     BACKCHAR(eptr);
2351     }
2352     }
2353     else
2354     #endif
2355     /* Not UTF-8 mode */
2356     {
2357     for (i = min; i < max; i++)
2358     {
2359 ph10 463 if (eptr >= md->end_subject)
2360 ph10 462 {
2361 ph10 463 SCHECK_PARTIAL();
2362 ph10 462 break;
2363 ph10 463 }
2364 nigel 77 c = *eptr;
2365     if ((data[c/8] & (1 << (c&7))) == 0) break;
2366     eptr++;
2367     }
2368     while (eptr >= pp)
2369     {
2370 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2371 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2372 nigel 77 eptr--;
2373     }
2374     }
2375    
2376     RRETURN(MATCH_NOMATCH);
2377     }
2378     }
2379     /* Control never gets here */
2380    
2381    
2382     /* Match an extended character class. This opcode is encountered only
2383 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2384     mode, because Unicode properties are supported in non-UTF-8 mode. */
2385 nigel 77
2386     #ifdef SUPPORT_UTF8
2387     case OP_XCLASS:
2388     {
2389     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2390     ecode += GET(ecode, 1); /* Advance past the item */
2391    
2392     switch (*ecode)
2393     {
2394     case OP_CRSTAR:
2395     case OP_CRMINSTAR:
2396     case OP_CRPLUS:
2397     case OP_CRMINPLUS:
2398     case OP_CRQUERY:
2399     case OP_CRMINQUERY:
2400     c = *ecode++ - OP_CRSTAR;
2401     minimize = (c & 1) != 0;
2402     min = rep_min[c]; /* Pick up values from tables; */
2403     max = rep_max[c]; /* zero for max => infinity */
2404     if (max == 0) max = INT_MAX;
2405     break;
2406    
2407     case OP_CRRANGE:
2408     case OP_CRMINRANGE:
2409     minimize = (*ecode == OP_CRMINRANGE);
2410     min = GET2(ecode, 1);
2411     max = GET2(ecode, 3);
2412     if (max == 0) max = INT_MAX;
2413     ecode += 5;
2414     break;
2415    
2416     default: /* No repeat follows */
2417     min = max = 1;
2418     break;
2419     }
2420    
2421     /* First, ensure the minimum number of matches are present. */
2422    
2423     for (i = 1; i <= min; i++)
2424     {
2425 ph10 427 if (eptr >= md->end_subject)
2426 ph10 426 {
2427     SCHECK_PARTIAL();
2428     RRETURN(MATCH_NOMATCH);
2429 ph10 427 }
2430 ph10 384 GETCHARINCTEST(c, eptr);
2431 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2432     }
2433    
2434     /* If max == min we can continue with the main loop without the
2435     need to recurse. */
2436    
2437     if (min == max) continue;
2438    
2439     /* If minimizing, keep testing the rest of the expression and advancing
2440     the pointer while it matches the class. */
2441    
2442     if (minimize)
2443     {
2444     for (fi = min;; fi++)
2445     {
2446 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2447 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2448 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2449 ph10 427 if (eptr >= md->end_subject)
2450 ph10 426 {
2451 ph10 427 SCHECK_PARTIAL();
2452 ph10 426 RRETURN(MATCH_NOMATCH);
2453 ph10 427 }
2454 ph10 384 GETCHARINCTEST(c, eptr);
2455 nigel 77 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2456     }
2457     /* Control never gets here */
2458     }
2459    
2460     /* If maximizing, find the longest possible run, then work backwards. */
2461    
2462     else
2463     {
2464     pp = eptr;
2465     for (i = min; i < max; i++)
2466     {
2467     int len = 1;
2468 ph10 463 if (eptr >= md->end_subject)
2469 ph10 462 {
2470 ph10 463 SCHECK_PARTIAL();
2471 ph10 462 break;
2472 ph10 463 }
2473 ph10 384 GETCHARLENTEST(c, eptr, len);
2474 nigel 77 if (!_pcre_xclass(c, data)) break;
2475     eptr += len;
2476     }
2477     for(;;)
2478     {
2479 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2480 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2481     if (eptr-- == pp) break; /* Stop if tried at original pos */
2482 ph10 214 if (utf8) BACKCHAR(eptr);
2483 nigel 77 }
2484     RRETURN(MATCH_NOMATCH);
2485     }
2486    
2487     /* Control never gets here */
2488     }
2489     #endif /* End of XCLASS */
2490    
2491     /* Match a single character, casefully */
2492    
2493     case OP_CHAR:
2494     #ifdef SUPPORT_UTF8
2495     if (utf8)
2496     {
2497     length = 1;
2498     ecode++;
2499     GETCHARLEN(fc, ecode, length);
2500 ph10 443 if (length > md->end_subject - eptr)
2501 ph10 428 {
2502     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2503     RRETURN(MATCH_NOMATCH);
2504 ph10 443 }
2505 nigel 77 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2506     }
2507     else
2508     #endif
2509    
2510     /* Non-UTF-8 mode */
2511     {
2512 ph10 443 if (md->end_subject - eptr < 1)
2513 ph10 428 {
2514     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2515     RRETURN(MATCH_NOMATCH);
2516 ph10 443 }
2517 nigel 77 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2518     ecode += 2;
2519     }
2520     break;
2521    
2522     /* Match a single character, caselessly */
2523    
2524     case OP_CHARNC:
2525     #ifdef SUPPORT_UTF8
2526     if (utf8)
2527     {
2528     length = 1;
2529     ecode++;
2530     GETCHARLEN(fc, ecode, length);
2531    
2532 ph10 443 if (length > md->end_subject - eptr)
2533 ph10 428 {
2534     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2535     RRETURN(MATCH_NOMATCH);
2536 ph10 443 }
2537 nigel 77
2538     /* If the pattern character's value is < 128, we have only one byte, and
2539     can use the fast lookup table. */
2540    
2541     if (fc < 128)
2542     {
2543     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2544     }
2545    
2546     /* Otherwise we must pick up the subject character */
2547    
2548     else
2549     {
2550 nigel 93 unsigned int dc;
2551 nigel 77 GETCHARINC(dc, eptr);
2552     ecode += length;
2553    
2554     /* If we have Unicode property support, we can use it to test the other
2555 nigel 87 case of the character, if there is one. */
2556 nigel 77
2557     if (fc != dc)
2558     {
2559     #ifdef SUPPORT_UCP
2560 ph10 349 if (dc != UCD_OTHERCASE(fc))
2561 nigel 77 #endif
2562     RRETURN(MATCH_NOMATCH);
2563     }
2564     }
2565     }
2566     else
2567     #endif /* SUPPORT_UTF8 */
2568    
2569     /* Non-UTF-8 mode */
2570     {
2571 ph10 443 if (md->end_subject - eptr < 1)
2572 ph10 428 {
2573 ph10 443 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2574 ph10 428 RRETURN(MATCH_NOMATCH);
2575 ph10 443 }
2576 nigel 77 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2577     ecode += 2;
2578     }
2579     break;
2580    
2581 nigel 93 /* Match a single character repeatedly. */
2582 nigel 77
2583     case OP_EXACT:
2584     min = max = GET2(ecode, 1);
2585     ecode += 3;
2586     goto REPEATCHAR;
2587    
2588 nigel 93 case OP_POSUPTO:
2589     possessive = TRUE;
2590     /* Fall through */
2591    
2592 nigel 77 case OP_UPTO:
2593     case OP_MINUPTO:
2594     min = 0;
2595     max = GET2(ecode, 1);
2596     minimize = *ecode == OP_MINUPTO;
2597     ecode += 3;
2598     goto REPEATCHAR;
2599    
2600 nigel 93 case OP_POSSTAR:
2601     possessive = TRUE;
2602     min = 0;
2603     max = INT_MAX;
2604     ecode++;
2605     goto REPEATCHAR;
2606    
2607     case OP_POSPLUS:
2608     possessive = TRUE;
2609     min = 1;
2610     max = INT_MAX;
2611     ecode++;
2612     goto REPEATCHAR;
2613    
2614     case OP_POSQUERY:
2615     possessive = TRUE;
2616     min = 0;
2617     max = 1;
2618     ecode++;
2619     goto REPEATCHAR;
2620    
2621 nigel 77 case OP_STAR:
2622     case OP_MINSTAR:
2623     case OP_PLUS:
2624     case OP_MINPLUS:
2625     case OP_QUERY:
2626     case OP_MINQUERY:
2627     c = *ecode++ - OP_STAR;
2628     minimize = (c & 1) != 0;
2629 ph10 443
2630 nigel 77 min = rep_min[c]; /* Pick up values from tables; */
2631     max = rep_max[c]; /* zero for max => infinity */
2632     if (max == 0) max = INT_MAX;
2633    
2634 ph10 426 /* Common code for all repeated single-character matches. */
2635 nigel 77
2636     REPEATCHAR:
2637     #ifdef SUPPORT_UTF8
2638     if (utf8)
2639     {
2640     length = 1;
2641     charptr = ecode;
2642     GETCHARLEN(fc, ecode, length);
2643     ecode += length;
2644    
2645     /* Handle multibyte character matching specially here. There is
2646     support for caseless matching if UCP support is present. */
2647    
2648     if (length > 1)
2649     {
2650     #ifdef SUPPORT_UCP
2651 nigel 93 unsigned int othercase;
2652 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2653 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
2654 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2655 ph10 115 else oclength = 0;
2656 nigel 77 #endif /* SUPPORT_UCP */
2657    
2658     for (i = 1; i <= min; i++)
2659     {
2660 ph10 426 if (eptr <= md->end_subject - length &&
2661     memcmp(eptr, charptr, length) == 0) eptr += length;
2662 ph10 123 #ifdef SUPPORT_UCP
2663 ph10 426 else if (oclength > 0 &&
2664     eptr <= md->end_subject - oclength &&
2665     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2666     #endif /* SUPPORT_UCP */
2667 nigel 77 else
2668     {
2669 ph10 426 CHECK_PARTIAL();
2670     RRETURN(MATCH_NOMATCH);
2671 nigel 77 }
2672     }
2673    
2674     if (min == max) continue;
2675    
2676     if (minimize)
2677     {
2678     for (fi = min;; fi++)
2679     {
2680 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2681 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2682 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2683 ph10 426 if (eptr <= md->end_subject - length &&
2684     memcmp(eptr, charptr, length) == 0) eptr += length;
2685 ph10 123 #ifdef SUPPORT_UCP
2686 ph10 426 else if (oclength > 0 &&
2687     eptr <= md->end_subject - oclength &&
2688     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2689     #endif /* SUPPORT_UCP */
2690 nigel 77 else
2691     {
2692 ph10 426 CHECK_PARTIAL();
2693     RRETURN(MATCH_NOMATCH);
2694 nigel 77 }
2695     }
2696     /* Control never gets here */
2697     }
2698 nigel 93
2699     else /* Maximize */
2700 nigel 77 {
2701     pp = eptr;
2702     for (i = min; i < max; i++)
2703     {
2704 ph10 426 if (eptr <= md->end_subject - length &&
2705     memcmp(eptr, charptr, length) == 0) eptr += length;
2706 ph10 123 #ifdef SUPPORT_UCP
2707 ph10 426 else if (oclength > 0 &&
2708     eptr <= md->end_subject - oclength &&
2709     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2710     #endif /* SUPPORT_UCP */
2711 ph10 463 else
2712 ph10 462 {
2713 ph10 463 CHECK_PARTIAL();
2714 ph10 462 break;
2715 ph10 463 }
2716 nigel 77 }
2717 nigel 93
2718     if (possessive) continue;
2719 ph10 427
2720 ph10 120 for(;;)
2721 ph10 426 {
2722     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2723     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2724     if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
2725 ph10 115 #ifdef SUPPORT_UCP
2726 ph10 426 eptr--;
2727     BACKCHAR(eptr);
2728 ph10 123 #else /* without SUPPORT_UCP */
2729 ph10 426 eptr -= length;
2730 ph10 123 #endif /* SUPPORT_UCP */
2731 ph10 426 }
2732 nigel 77 }
2733     /* Control never gets here */
2734     }
2735    
2736     /* If the length of a UTF-8 character is 1, we fall through here, and
2737     obey the code as for non-UTF-8 characters below, though in this case the
2738     value of fc will always be < 128. */
2739     }
2740     else
2741     #endif /* SUPPORT_UTF8 */
2742    
2743     /* When not in UTF-8 mode, load a single-byte character. */
2744    
2745 ph10 426 fc = *ecode++;
2746 ph10 443
2747 nigel 77 /* The value of fc at this point is always less than 256, though we may or
2748     may not be in UTF-8 mode. The code is duplicated for the caseless and
2749     caseful cases, for speed, since matching characters is likely to be quite
2750     common. First, ensure the minimum number of matches are present. If min =
2751     max, continue at the same level without recursing. Otherwise, if
2752     minimizing, keep trying the rest of the expression and advancing one
2753     matching character if failing, up to the maximum. Alternatively, if
2754     maximizing, find the maximum number of characters and work backwards. */
2755    
2756     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2757     max, eptr));
2758    
2759     if ((ims & PCRE_CASELESS) != 0)
2760     {
2761     fc = md->lcc[fc];
2762     for (i = 1; i <= min; i++)
2763 ph10 426 {
2764     if (eptr >= md->end_subject)
2765     {
2766     SCHECK_PARTIAL();
2767     RRETURN(MATCH_NOMATCH);
2768     }
2769 nigel 77 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2770 ph10 426 }
2771 nigel 77 if (min == max) continue;
2772     if (minimize)
2773     {
2774     for (fi = min;; fi++)
2775     {
2776 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2777 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2778 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2779 ph10 426 if (eptr >= md->end_subject)
2780     {
2781 ph10 427 SCHECK_PARTIAL();
2782 ph10 426 RRETURN(MATCH_NOMATCH);
2783     }
2784     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2785 nigel 77 }
2786     /* Control never gets here */
2787     }
2788 nigel 93 else /* Maximize */
2789 nigel 77 {
2790     pp = eptr;
2791     for (i = min; i < max; i++)
2792     {
2793 ph10 463 if (eptr >= md->end_subject)
2794 ph10 462 {
2795     SCHECK_PARTIAL();
2796     break;
2797 ph10 463 }
2798 ph10 462 if (fc != md->lcc[*eptr]) break;
2799 nigel 77 eptr++;
2800     }
2801 ph10 427
2802 nigel 93 if (possessive) continue;
2803 ph10 427
2804 nigel 77 while (eptr >= pp)
2805     {
2806 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2807 nigel 77 eptr--;
2808     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2809     }
2810     RRETURN(MATCH_NOMATCH);
2811     }
2812     /* Control never gets here */
2813     }
2814    
2815     /* Caseful comparisons (includes all multi-byte characters) */
2816    
2817     else
2818     {
2819 ph10 427 for (i = 1; i <= min; i++)
2820 ph10 426 {
2821     if (eptr >= md->end_subject)
2822     {
2823     SCHECK_PARTIAL();
2824     RRETURN(MATCH_NOMATCH);
2825     }
2826     if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2827 ph10 427 }
2828 ph10 443
2829 nigel 77 if (min == max) continue;
2830 ph10 443
2831 nigel 77 if (minimize)
2832     {
2833     for (fi = min;; fi++)
2834     {
2835 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2836 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2837 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
2838 ph10 426 if (eptr >= md->end_subject)
2839 ph10 427 {
2840 ph10 426 SCHECK_PARTIAL();
2841     RRETURN(MATCH_NOMATCH);
2842 ph10 427 }
2843 ph10 426 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2844 nigel 77 }
2845     /* Control never gets here */
2846     }
2847 nigel 93 else /* Maximize */
2848 nigel 77 {
2849     pp = eptr;
2850     for (i = min; i < max; i++)
2851     {
2852 ph10 463 if (eptr >= md->end_subject)
2853 ph10 462 {
2854 ph10 463 SCHECK_PARTIAL();
2855 ph10 462 break;
2856 ph10 463 }
2857 ph10 462 if (fc != *eptr) break;
2858 nigel 77 eptr++;
2859     }
2860 nigel 93 if (possessive) continue;
2861 ph10 443
2862 nigel 77 while (eptr >= pp)
2863     {
2864 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2865 nigel 77 eptr--;
2866     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2867     }
2868     RRETURN(MATCH_NOMATCH);
2869     }
2870     }
2871     /* Control never gets here */
2872    
2873     /* Match a negated single one-byte character. The character we are
2874     checking can be multibyte. */
2875    
2876     case OP_NOT:
2877 ph10 443 if (eptr >= md->end_subject)
2878 ph10 428 {
2879 ph10 443 SCHECK_PARTIAL();
2880 ph10 428 RRETURN(MATCH_NOMATCH);
2881 ph10 443 }
2882 nigel 77 ecode++;
2883     GETCHARINCTEST(c, eptr);
2884     if ((ims & PCRE_CASELESS) != 0)
2885     {
2886     #ifdef SUPPORT_UTF8
2887     if (c < 256)
2888     #endif
2889     c = md->lcc[c];
2890     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2891     }
2892     else
2893     {
2894     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2895     }
2896     break;
2897    
2898     /* Match a negated single one-byte character repeatedly. This is almost a
2899     repeat of the code for a repeated single character, but I haven't found a
2900     nice way of commoning these up that doesn't require a test of the
2901     positive/negative option for each character match. Maybe that wouldn't add
2902     very much to the time taken, but character matching *is* what this is all
2903     about... */
2904    
2905     case OP_NOTEXACT:
2906     min = max = GET2(ecode, 1);
2907     ecode += 3;
2908     goto REPEATNOTCHAR;
2909    
2910     case OP_NOTUPTO:
2911     case OP_NOTMINUPTO:
2912     min = 0;
2913     max = GET2(ecode, 1);
2914     minimize = *ecode == OP_NOTMINUPTO;
2915     ecode += 3;
2916     goto REPEATNOTCHAR;
2917    
2918 nigel 93 case OP_NOTPOSSTAR:
2919     possessive = TRUE;
2920     min = 0;
2921     max = INT_MAX;
2922     ecode++;
2923     goto REPEATNOTCHAR;
2924    
2925     case OP_NOTPOSPLUS:
2926     possessive = TRUE;
2927     min = 1;
2928     max = INT_MAX;
2929     ecode++;
2930     goto REPEATNOTCHAR;
2931    
2932     case OP_NOTPOSQUERY:
2933     possessive = TRUE;
2934     min = 0;
2935     max = 1;
2936     ecode++;
2937     goto REPEATNOTCHAR;
2938    
2939     case OP_NOTPOSUPTO:
2940     possessive = TRUE;
2941     min = 0;
2942     max = GET2(ecode, 1);
2943     ecode += 3;
2944     goto REPEATNOTCHAR;
2945    
2946 nigel 77 case OP_NOTSTAR:
2947     case OP_NOTMINSTAR:
2948     case OP_NOTPLUS:
2949     case OP_NOTMINPLUS:
2950     case OP_NOTQUERY:
2951     case OP_NOTMINQUERY:
2952     c = *ecode++ - OP_NOTSTAR;
2953     minimize = (c & 1) != 0;
2954     min = rep_min[c]; /* Pick up values from tables; */
2955     max = rep_max[c]; /* zero for max => infinity */
2956     if (max == 0) max = INT_MAX;
2957    
2958 ph10 426 /* Common code for all repeated single-byte matches. */
2959 nigel 77
2960     REPEATNOTCHAR:
2961     fc = *ecode++;
2962    
2963     /* The code is duplicated for the caseless and caseful cases, for speed,
2964     since matching characters is likely to be quite common. First, ensure the
2965     minimum number of matches are present. If min = max, continue at the same
2966     level without recursing. Otherwise, if minimizing, keep trying the rest of
2967     the expression and advancing one matching character if failing, up to the
2968     maximum. Alternatively, if maximizing, find the maximum number of
2969     characters and work backwards. */
2970    
2971     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2972     max, eptr));
2973    
2974     if ((ims & PCRE_CASELESS) != 0)
2975     {
2976     fc = md->lcc[fc];
2977    
2978     #ifdef SUPPORT_UTF8
2979     /* UTF-8 mode */
2980     if (utf8)
2981     {
2982 nigel 93 register unsigned int d;
2983 nigel 77 for (i = 1; i <= min; i++)
2984     {
2985 ph10 426 if (eptr >= md->end_subject)
2986     {
2987     SCHECK_PARTIAL();
2988 ph10 427 RRETURN(MATCH_NOMATCH);
2989     }
2990 nigel 77 GETCHARINC(d, eptr);
2991     if (d < 256) d = md->lcc[d];
2992     if (fc == d) RRETURN(MATCH_NOMATCH);
2993     }
2994     }
2995     else
2996     #endif
2997    
2998     /* Not UTF-8 mode */
2999     {
3000     for (i = 1; i <= min; i++)
3001 ph10 426 {
3002     if (eptr >= md->end_subject)
3003     {
3004     SCHECK_PARTIAL();
3005 ph10 427 RRETURN(MATCH_NOMATCH);
3006     }
3007 nigel 77 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3008 ph10 427 }
3009 nigel 77 }
3010    
3011     if (min == max) continue;
3012    
3013     if (minimize)
3014     {
3015     #ifdef SUPPORT_UTF8
3016     /* UTF-8 mode */
3017     if (utf8)
3018     {
3019 nigel 93 register unsigned int d;
3020 nigel 77 for (fi = min;; fi++)
3021     {
3022 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3023 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3024 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3025 ph10 427 if (eptr >= md->end_subject)
3026 ph10 426 {
3027 ph10 427 SCHECK_PARTIAL();
3028 ph10 426 RRETURN(MATCH_NOMATCH);
3029 ph10 427 }
3030 nigel 77 GETCHARINC(d, eptr);
3031     if (d < 256) d = md->lcc[d];
3032 ph10 366 if (fc == d) RRETURN(MATCH_NOMATCH);
3033 nigel 77 }
3034     }
3035     else
3036     #endif
3037     /* Not UTF-8 mode */
3038     {
3039     for (fi = min;; fi++)
3040     {
3041 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3042 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3043 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3044 ph10 426 if (eptr >= md->end_subject)
3045     {
3046     SCHECK_PARTIAL();
3047     RRETURN(MATCH_NOMATCH);
3048     }
3049     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3050 nigel 77 }
3051     }
3052     /* Control never gets here */
3053     }
3054    
3055     /* Maximize case */
3056    
3057     else
3058     {
3059     pp = eptr;
3060    
3061     #ifdef SUPPORT_UTF8
3062     /* UTF-8 mode */
3063     if (utf8)
3064     {
3065 nigel 93 register unsigned int d;
3066 nigel 77 for (i = min; i < max; i++)
3067     {
3068     int len = 1;
3069 ph10 463 if (eptr >= md->end_subject)
3070 ph10 462 {
3071 ph10 463 SCHECK_PARTIAL();
3072 ph10 462 break;
3073 ph10 463 }
3074 nigel 77 GETCHARLEN(d, eptr, len);
3075     if (d < 256) d = md->lcc[d];
3076     if (fc == d) break;
3077     eptr += len;
3078     }
3079 nigel 93 if (possessive) continue;
3080     for(;;)
3081 nigel 77 {
3082 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3083 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3084     if (eptr-- == pp) break; /* Stop if tried at original pos */
3085     BACKCHAR(eptr);
3086     }
3087     }
3088     else
3089     #endif
3090     /* Not UTF-8 mode */
3091     {
3092     for (i = min; i < max; i++)
3093     {
3094 ph10 463 if (eptr >= md->end_subject)
3095 ph10 462 {
3096     SCHECK_PARTIAL();
3097     break;
3098 ph10 463 }
3099 ph10 462 if (fc == md->lcc[*eptr]) break;
3100 nigel 77 eptr++;
3101     }
3102 nigel 93 if (possessive) continue;
3103 nigel 77 while (eptr >= pp)
3104     {
3105 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3106 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3107     eptr--;
3108     }
3109     }
3110    
3111     RRETURN(MATCH_NOMATCH);
3112     }
3113     /* Control never gets here */
3114     }
3115    
3116     /* Caseful comparisons */
3117    
3118     else
3119     {
3120     #ifdef SUPPORT_UTF8
3121     /* UTF-8 mode */
3122     if (utf8)
3123     {
3124 nigel 93 register unsigned int d;
3125 nigel 77 for (i = 1; i <= min; i++)
3126     {
3127 ph10 426 if (eptr >= md->end_subject)
3128     {
3129     SCHECK_PARTIAL();
3130 ph10 427 RRETURN(MATCH_NOMATCH);
3131     }
3132 nigel 77 GETCHARINC(d, eptr);
3133     if (fc == d) RRETURN(MATCH_NOMATCH);
3134     }
3135     }
3136     else
3137     #endif
3138     /* Not UTF-8 mode */
3139     {
3140     for (i = 1; i <= min; i++)
3141 ph10 426 {
3142     if (eptr >= md->end_subject)
3143     {
3144     SCHECK_PARTIAL();
3145 ph10 427 RRETURN(MATCH_NOMATCH);
3146     }
3147 nigel 77 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3148 ph10 427 }
3149 nigel 77 }
3150    
3151     if (min == max) continue;
3152    
3153     if (minimize)
3154     {
3155     #ifdef SUPPORT_UTF8
3156     /* UTF-8 mode */
3157     if (utf8)
3158     {
3159 nigel 93 register unsigned int d;
3160 nigel 77 for (fi = min;; fi++)
3161     {
3162 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3163 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3164 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3165 ph10 427 if (eptr >= md->end_subject)
3166 ph10 426 {
3167 ph10 427 SCHECK_PARTIAL();
3168 ph10 426 RRETURN(MATCH_NOMATCH);
3169 ph10 427 }
3170 nigel 77 GETCHARINC(d, eptr);
3171 ph10 371 if (fc == d) RRETURN(MATCH_NOMATCH);
3172 nigel 77 }
3173     }
3174     else
3175     #endif
3176     /* Not UTF-8 mode */
3177     {
3178     for (fi = min;; fi++)
3179     {
3180 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3181 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3182 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3183 ph10 426 if (eptr >= md->end_subject)
3184     {
3185     SCHECK_PARTIAL();
3186     RRETURN(MATCH_NOMATCH);
3187 ph10 427 }
3188 ph10 426 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3189 nigel 77 }
3190     }
3191     /* Control never gets here */
3192     }
3193    
3194     /* Maximize case */
3195    
3196     else
3197     {
3198     pp = eptr;
3199    
3200     #ifdef SUPPORT_UTF8
3201     /* UTF-8 mode */
3202     if (utf8)
3203     {
3204 nigel 93 register unsigned int d;
3205 nigel 77 for (i = min; i < max; i++)
3206     {
3207     int len = 1;
3208 ph10 463 if (eptr >= md->end_subject)
3209 ph10 462 {
3210 ph10 463 SCHECK_PARTIAL();
3211 ph10 462 break;
3212 ph10 463 }
3213 nigel 77 GETCHARLEN(d, eptr, len);
3214     if (fc == d) break;
3215     eptr += len;
3216     }
3217 nigel 93 if (possessive) continue;
3218 nigel 77 for(;;)
3219     {
3220 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3221 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3222     if (eptr-- == pp) break; /* Stop if tried at original pos */
3223     BACKCHAR(eptr);
3224     }
3225     }
3226     else
3227     #endif
3228     /* Not UTF-8 mode */
3229     {
3230     for (i = min; i < max; i++)
3231     {
3232 ph10 463 if (eptr >= md->end_subject)
3233 ph10 462 {
3234 ph10 463 SCHECK_PARTIAL();
3235 ph10 462 break;
3236 ph10 463 }
3237 ph10 462 if (fc == *eptr) break;
3238 nigel 77 eptr++;
3239     }
3240 nigel 93 if (possessive) continue;
3241 nigel 77 while (eptr >= pp)
3242     {
3243 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3244 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3245     eptr--;
3246     }
3247     }
3248    
3249     RRETURN(MATCH_NOMATCH);
3250     }
3251     }
3252     /* Control never gets here */
3253    
3254     /* Match a single character type repeatedly; several different opcodes
3255     share code. This is very similar to the code for single characters, but we
3256     repeat it in the interests of efficiency. */
3257    
3258     case OP_TYPEEXACT:
3259     min = max = GET2(ecode, 1);
3260     minimize = TRUE;
3261     ecode += 3;
3262     goto REPEATTYPE;
3263    
3264     case OP_TYPEUPTO:
3265     case OP_TYPEMINUPTO:
3266     min = 0;
3267     max = GET2(ecode, 1);
3268     minimize = *ecode == OP_TYPEMINUPTO;
3269     ecode += 3;
3270     goto REPEATTYPE;
3271    
3272 nigel 93 case OP_TYPEPOSSTAR:
3273     possessive = TRUE;
3274     min = 0;
3275     max = INT_MAX;
3276     ecode++;
3277     goto REPEATTYPE;
3278    
3279     case OP_TYPEPOSPLUS:
3280     possessive = TRUE;
3281     min = 1;
3282     max = INT_MAX;
3283     ecode++;
3284     goto REPEATTYPE;
3285    
3286     case OP_TYPEPOSQUERY:
3287     possessive = TRUE;
3288     min = 0;
3289     max = 1;
3290     ecode++;
3291     goto REPEATTYPE;
3292    
3293     case OP_TYPEPOSUPTO:
3294     possessive = TRUE;
3295     min = 0;
3296     max = GET2(ecode, 1);
3297     ecode += 3;
3298     goto REPEATTYPE;
3299    
3300 nigel 77 case OP_TYPESTAR:
3301     case OP_TYPEMINSTAR:
3302     case OP_TYPEPLUS:
3303     case OP_TYPEMINPLUS:
3304     case OP_TYPEQUERY:
3305     case OP_TYPEMINQUERY:
3306     c = *ecode++ - OP_TYPESTAR;
3307     minimize = (c & 1) != 0;
3308     min = rep_min[c]; /* Pick up values from tables; */
3309     max = rep_max[c]; /* zero for max => infinity */
3310     if (max == 0) max = INT_MAX;
3311    
3312     /* Common code for all repeated single character type matches. Note that
3313     in UTF-8 mode, '.' matches a character of any length, but for the other
3314     character types, the valid characters are all one-byte long. */
3315    
3316     REPEATTYPE:
3317     ctype = *ecode++; /* Code for the character type */
3318    
3319     #ifdef SUPPORT_UCP
3320     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3321     {
3322     prop_fail_result = ctype == OP_NOTPROP;
3323     prop_type = *ecode++;
3324 nigel 87 prop_value = *ecode++;
3325 nigel 77 }
3326     else prop_type = -1;
3327     #endif
3328    
3329     /* First, ensure the minimum number of matches are present. Use inline
3330     code for maximizing the speed, and do the type test once at the start
3331 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3332 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3333     and single-bytes. */
3334    
3335     if (min > 0)
3336     {
3337     #ifdef SUPPORT_UCP
3338 nigel 87 if (prop_type >= 0)
3339 nigel 77 {
3340 nigel 87 switch(prop_type)
3341 nigel 77 {
3342 nigel 87 case PT_ANY:
3343     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3344     for (i = 1; i <= min; i++)
3345     {
3346 ph10 427 if (eptr >= md->end_subject)
3347 ph10 426 {
3348 ph10 427 SCHECK_PARTIAL();
3349 ph10 426 RRETURN(MATCH_NOMATCH);
3350 ph10 427 }
3351 ph10 184 GETCHARINCTEST(c, eptr);
3352 nigel 87 }
3353     break;
3354    
3355     case PT_LAMP:
3356     for (i = 1; i <= min; i++)
3357     {
3358 ph10 427 if (eptr >= md->end_subject)
3359 ph10 426 {
3360 ph10 427 SCHECK_PARTIAL();
3361 ph10 426 RRETURN(MATCH_NOMATCH);
3362 ph10 427 }
3363 ph10 184 GETCHARINCTEST(c, eptr);
3364 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3365 nigel 87 if ((prop_chartype == ucp_Lu ||
3366     prop_chartype == ucp_Ll ||
3367     prop_chartype == ucp_Lt) == prop_fail_result)
3368     RRETURN(MATCH_NOMATCH);
3369     }
3370     break;
3371    
3372     case PT_GC:
3373     for (i = 1; i <= min; i++)
3374     {
3375 ph10 427 if (eptr >= md->end_subject)
3376 ph10 426 {
3377 ph10 427 SCHECK_PARTIAL();
3378 ph10 426 RRETURN(MATCH_NOMATCH);
3379 ph10 427 }
3380 ph10 184 GETCHARINCTEST(c, eptr);
3381 ph10 349 prop_category = UCD_CATEGORY(c);
3382 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3383     RRETURN(MATCH_NOMATCH);
3384     }
3385     break;
3386    
3387     case PT_PC:
3388     for (i = 1; i <= min; i++)
3389     {
3390 ph10 427 if (eptr >= md->end_subject)
3391 ph10 426 {
3392 ph10 427 SCHECK_PARTIAL();
3393 ph10 426 RRETURN(MATCH_NOMATCH);
3394 ph10 427 }
3395 ph10 184 GETCHARINCTEST(c, eptr);
3396 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3397 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3398     RRETURN(MATCH_NOMATCH);
3399     }
3400     break;
3401    
3402     case PT_SC:
3403     for (i = 1; i <= min; i++)
3404     {
3405 ph10 427 if (eptr >= md->end_subject)
3406 ph10 426 {
3407 ph10 427 SCHECK_PARTIAL();
3408 ph10 426 RRETURN(MATCH_NOMATCH);
3409 ph10 427 }
3410 ph10 184 GETCHARINCTEST(c, eptr);
3411 ph10 349 prop_script = UCD_SCRIPT(c);
3412 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3413     RRETURN(MATCH_NOMATCH);
3414     }
3415     break;
3416    
3417     default:
3418     RRETURN(PCRE_ERROR_INTERNAL);
3419 nigel 77 }
3420     }
3421    
3422     /* Match extended Unicode sequences. We will get here only if the
3423     support is in the binary; otherwise a compile-time error occurs. */
3424    
3425     else if (ctype == OP_EXTUNI)
3426     {
3427     for (i = 1; i <= min; i++)
3428     {
3429 ph10 427 if (eptr >= md->end_subject)
3430 ph10 426 {
3431 ph10 427 SCHECK_PARTIAL();
3432 ph10 426 RRETURN(MATCH_NOMATCH);
3433 ph10 427 }
3434 nigel 77 GETCHARINCTEST(c, eptr);
3435 ph10 349 prop_category = UCD_CATEGORY(c);
3436 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3437     while (eptr < md->end_subject)
3438     {
3439     int len = 1;
3440 ph10 426 if (!utf8) c = *eptr;
3441     else { GETCHARLEN(c, eptr, len); }
3442 ph10 349 prop_category = UCD_CATEGORY(c);
3443 nigel 77 if (prop_category != ucp_M) break;
3444     eptr += len;
3445     }
3446     }
3447     }
3448    
3449     else
3450     #endif /* SUPPORT_UCP */
3451    
3452     /* Handle all other cases when the coding is UTF-8 */
3453    
3454     #ifdef SUPPORT_UTF8
3455     if (utf8) switch(ctype)
3456     {
3457     case OP_ANY:
3458     for (i = 1; i <= min; i++)
3459     {
3460 ph10 426 if (eptr >= md->end_subject)
3461     {
3462 ph10 427 SCHECK_PARTIAL();
3463 nigel 77 RRETURN(MATCH_NOMATCH);
3464 ph10 427 }
3465 ph10 426 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3466 nigel 91 eptr++;
3467 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3468     }
3469     break;
3470    
3471 ph10 341 case OP_ALLANY:
3472     for (i = 1; i <= min; i++)
3473     {
3474 ph10 427 if (eptr >= md->end_subject)
3475 ph10 426 {
3476     SCHECK_PARTIAL();
3477     RRETURN(MATCH_NOMATCH);
3478 ph10 427 }
3479 ph10 341 eptr++;
3480     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3481     }
3482     break;
3483    
3484 nigel 77 case OP_ANYBYTE:
3485 ph10 427 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3486 nigel 77 eptr += min;
3487     break;
3488    
3489 nigel 93 case OP_ANYNL:
3490     for (i = 1; i <= min; i++)
3491     {
3492 ph10 427 if (eptr >= md->end_subject)
3493 ph10 426 {
3494     SCHECK_PARTIAL();
3495     RRETURN(MATCH_NOMATCH);
3496 ph10 427 }
3497 nigel 93 GETCHARINC(c, eptr);
3498     switch(c)
3499     {
3500     default: RRETURN(MATCH_NOMATCH);
3501     case 0x000d:
3502     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3503     break;
3504 ph10 231
3505 nigel 93 case 0x000a:
3506 ph10 231 break;
3507    
3508 nigel 93 case 0x000b:
3509     case 0x000c:
3510     case 0x0085:
3511     case 0x2028:
3512     case 0x2029:
3513 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3514 nigel 93 break;
3515     }
3516     }
3517     break;
3518    
3519 ph10 178 case OP_NOT_HSPACE:
3520     for (i = 1; i <= min; i++)
3521     {
3522 ph10 427 if (eptr >= md->end_subject)
3523 ph10 426 {
3524     SCHECK_PARTIAL();
3525     RRETURN(MATCH_NOMATCH);
3526 ph10 427 }
3527 ph10 178 GETCHARINC(c, eptr);
3528     switch(c)
3529     {
3530     default: break;
3531     case 0x09: /* HT */
3532     case 0x20: /* SPACE */
3533     case 0xa0: /* NBSP */
3534     case 0x1680: /* OGHAM SPACE MARK */
3535     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3536     case 0x2000: /* EN QUAD */
3537     case 0x2001: /* EM QUAD */
3538     case 0x2002: /* EN SPACE */
3539     case 0x2003: /* EM SPACE */
3540     case 0x2004: /* THREE-PER-EM SPACE */
3541     case 0x2005: /* FOUR-PER-EM SPACE */
3542     case 0x2006: /* SIX-PER-EM SPACE */
3543     case 0x2007: /* FIGURE SPACE */
3544     case 0x2008: /* PUNCTUATION SPACE */
3545     case 0x2009: /* THIN SPACE */
3546     case 0x200A: /* HAIR SPACE */
3547     case 0x202f: /* NARROW NO-BREAK SPACE */
3548     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3549     case 0x3000: /* IDEOGRAPHIC SPACE */
3550     RRETURN(MATCH_NOMATCH);
3551     }
3552     }
3553     break;
3554 ph10 182
3555 ph10 178 case OP_HSPACE:
3556     for (i = 1; i <= min; i++)
3557     {
3558 ph10 427 if (eptr >= md->end_subject)
3559 ph10 426 {
3560 ph10 427 SCHECK_PARTIAL();
3561 ph10 426 RRETURN(MATCH_NOMATCH);
3562 ph10 427 }
3563 ph10 178 GETCHARINC(c, eptr);
3564     switch(c)
3565     {
3566     default: RRETURN(MATCH_NOMATCH);
3567     case 0x09: /* HT */
3568     case 0x20: /* SPACE */
3569     case 0xa0: /* NBSP */
3570     case 0x1680: /* OGHAM SPACE MARK */
3571     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3572     case 0x2000: /* EN QUAD */
3573     case 0x2001: /* EM QUAD */
3574     case 0x2002: /* EN SPACE */
3575     case 0x2003: /* EM SPACE */
3576     case 0x2004: /* THREE-PER-EM SPACE */
3577     case 0x2005: /* FOUR-PER-EM SPACE */
3578     case 0x2006: /* SIX-PER-EM SPACE */
3579     case 0x2007: /* FIGURE SPACE */
3580     case 0x2008: /* PUNCTUATION SPACE */
3581     case 0x2009: /* THIN SPACE */
3582     case 0x200A: /* HAIR SPACE */
3583     case 0x202f: /* NARROW NO-BREAK SPACE */
3584     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3585     case 0x3000: /* IDEOGRAPHIC SPACE */
3586     break;
3587     }
3588     }
3589     break;
3590 ph10 182
3591 ph10 178 case OP_NOT_VSPACE:
3592     for (i = 1; i <= min; i++)
3593     {
3594 ph10 427 if (eptr >= md->end_subject)
3595 ph10 426 {
3596 ph10 427 SCHECK_PARTIAL();
3597 ph10 426 RRETURN(MATCH_NOMATCH);
3598 ph10 427 }
3599 ph10 178 GETCHARINC(c, eptr);
3600     switch(c)
3601     {
3602     default: break;
3603     case 0x0a: /* LF */
3604     case 0x0b: /* VT */
3605     case 0x0c: /* FF */
3606     case 0x0d: /* CR */
3607     case 0x85: /* NEL */
3608     case 0x2028: /* LINE SEPARATOR */
3609     case 0x2029: /* PARAGRAPH SEPARATOR */
3610     RRETURN(MATCH_NOMATCH);
3611     }
3612     }
3613     break;
3614 ph10 182
3615 ph10 178 case OP_VSPACE:
3616     for (i = 1; i <= min; i++)
3617     {
3618 ph10 427 if (eptr >= md->end_subject)
3619 ph10 426 {
3620 ph10 427 SCHECK_PARTIAL();
3621 ph10 426 RRETURN(MATCH_NOMATCH);
3622 ph10 427 }
3623 ph10 178 GETCHARINC(c, eptr);
3624     switch(c)
3625     {
3626     default: RRETURN(MATCH_NOMATCH);
3627     case 0x0a: /* LF */
3628     case 0x0b: /* VT */
3629     case 0x0c: /* FF */
3630     case 0x0d: /* CR */
3631     case 0x85: /* NEL */
3632     case 0x2028: /* LINE SEPARATOR */
3633     case 0x2029: /* PARAGRAPH SEPARATOR */
3634 ph10 182 break;
3635 ph10 178 }
3636     }
3637     break;
3638    
3639 nigel 77 case OP_NOT_DIGIT:
3640     for (i = 1; i <= min; i++)
3641     {
3642 ph10 427 if (eptr >= md->end_subject)
3643 ph10 426 {
3644 ph10 427 SCHECK_PARTIAL();
3645 ph10 426 RRETURN(MATCH_NOMATCH);
3646 ph10 427 }
3647 nigel 77 GETCHARINC(c, eptr);
3648     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3649     RRETURN(MATCH_NOMATCH);
3650     }
3651     break;
3652    
3653     case OP_DIGIT:
3654     for (i = 1; i <= min; i++)
3655     {
3656 ph10 427 if (eptr >= md->end_subject)
3657 ph10 426 {
3658 ph10 427 SCHECK_PARTIAL();
3659 nigel 77 RRETURN(MATCH_NOMATCH);
3660 ph10 427 }
3661 ph10 426 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3662     RRETURN(MATCH_NOMATCH);
3663 nigel 77 /* No need to skip more bytes - we know it's a 1-byte character */
3664     }
3665     break;
3666    
3667     case OP_NOT_WHITESPACE:
3668     for (i = 1; i <= min; i++)
3669     {
3670 ph10 427 if (eptr >= md->end_subject)
3671 ph10 426 {
3672 ph10 427 SCHECK_PARTIAL();
3673 nigel 77 RRETURN(MATCH_NOMATCH);
3674 ph10 427 }
3675 ph10 426 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3676     RRETURN(MATCH_NOMATCH);
3677 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3678 nigel 77 }
3679     break;
3680    
3681     case OP_WHITESPACE:
3682     for (i = 1; i <= min; i++)
3683     {
3684 ph10 427 if (eptr >= md->end_subject)
3685 ph10 426 {
3686 ph10 427 SCHECK_PARTIAL();
3687 nigel 77 RRETURN(MATCH_NOMATCH);
3688 ph10 427 }
3689 ph10 426 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3690     RRETURN(MATCH_NOMATCH);
3691 nigel 77 /* No need to skip more bytes - we know it's a 1-byte character */
3692     }
3693     break;
3694    
3695     case OP_NOT_WORDCHAR:
3696     for (i = 1; i <= min; i++)
3697     {
3698 ph10 482 if (eptr >= md->end_subject)
3699     {
3700     SCHECK_PARTIAL();
3701 nigel 77 RRETURN(MATCH_NOMATCH);
3702 ph10 482 }
3703     if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3704     RRETURN(MATCH_NOMATCH);
3705 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3706 nigel 77 }
3707     break;
3708    
3709     case OP_WORDCHAR:
3710     for (i = 1; i <= min; i++)
3711     {
3712 ph10 427 if (eptr >= md->end_subject)
3713 ph10 426 {
3714 ph10 427 SCHECK_PARTIAL();
3715 nigel 77 RRETURN(MATCH_NOMATCH);
3716 ph10 427 }
3717 ph10 426 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3718     RRETURN(MATCH_NOMATCH);
3719 nigel 77 /* No need to skip more bytes - we know it's a 1-byte character */
3720     }
3721     break;
3722    
3723     default:
3724     RRETURN(PCRE_ERROR_INTERNAL);
3725     } /* End switch(ctype) */
3726    
3727     else
3728     #endif /* SUPPORT_UTF8 */
3729    
3730     /* Code for the non-UTF-8 case for minimum matching of operators other
3731 ph10 426 than OP_PROP and OP_NOTPROP. */
3732 nigel 77
3733     switch(ctype)
3734     {
3735     case OP_ANY:
3736 ph10 342 for (i = 1; i <= min; i++)
3737 nigel 77 {
3738 ph10 427 if (eptr >= md->end_subject)
3739 ph10 426 {
3740 ph10 427 SCHECK_PARTIAL();
3741 ph10 426 RRETURN(MATCH_NOMATCH);
3742 ph10 427 }
3743 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3744     eptr++;
3745 nigel 77 }
3746     break;
3747    
3748 ph10 341 case OP_ALLANY:
3749 ph10 443 if (eptr > md->end_subject - min)
3750 ph10 428 {
3751 ph10 443 SCHECK_PARTIAL();
3752 ph10 428 RRETURN(MATCH_NOMATCH);
3753 ph10 443 }
3754 ph10 341 eptr += min;
3755     break;
3756    
3757 nigel 77 case OP_ANYBYTE:
3758 ph10 443 if (eptr > md->end_subject - min)
3759 ph10 428 {
3760 ph10 443 SCHECK_PARTIAL();
3761 ph10 428 RRETURN(MATCH_NOMATCH);
3762 ph10 443 }
3763 nigel 77 eptr += min;
3764     break;
3765    
3766 nigel 93 case OP_ANYNL:
3767     for (i = 1; i <= min; i++)
3768     {
3769 ph10 427 if (eptr >= md->end_subject)
3770 ph10 426 {
3771 ph10 427 SCHECK_PARTIAL();
3772 ph10 426 RRETURN(MATCH_NOMATCH);
3773 ph10 427 }
3774 nigel 93 switch(*eptr++)
3775     {
3776     default: RRETURN(MATCH_NOMATCH);
3777     case 0x000d:
3778     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3779     break;
3780     case 0x000a:
3781 ph10 231 break;
3782    
3783 nigel 93 case 0x000b:
3784     case 0x000c:
3785     case 0x0085:
3786 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3787 nigel 93 break;
3788     }
3789     }
3790     break;
3791    
3792 ph10 178 case OP_NOT_HSPACE:
3793     for (i = 1; i <= min; i++)
3794     {
3795 ph10 427 if (eptr >= md->end_subject)
3796 ph10 426 {
3797 ph10 427 SCHECK_PARTIAL();
3798 ph10 426 RRETURN(MATCH_NOMATCH);
3799 ph10 427 }
3800 ph10 178 switch(*eptr++)
3801     {
3802     default: break;
3803     case 0x09: /* HT */
3804     case 0x20: /* SPACE */
3805     case 0xa0: /* NBSP */
3806     RRETURN(MATCH_NOMATCH);
3807     }
3808     }
3809     break;
3810    
3811     case OP_HSPACE:
3812     for (i = 1; i <= min; i++)
3813     {
3814 ph10 427 if (eptr >= md->end_subject)
3815 ph10 426 {
3816 ph10 427 SCHECK_PARTIAL();
3817 ph10 426 RRETURN(MATCH_NOMATCH);
3818 ph10 427 }
3819 ph10 178 switch(*eptr++)
3820     {
3821     default: RRETURN(MATCH_NOMATCH);
3822     case 0x09: /* HT */
3823     case 0x20: /* SPACE */
3824     case 0xa0: /* NBSP */
3825 ph10 182 break;
3826 ph10 178 }
3827     }
3828     break;
3829    
3830     case OP_NOT_VSPACE:
3831     for (i = 1; i <= min; i++)
3832     {
3833 ph10 427 if (eptr >= md->end_subject)
3834 ph10 426 {
3835 ph10 427 SCHECK_PARTIAL();
3836 ph10 426 RRETURN(MATCH_NOMATCH);
3837 ph10 427 }
3838 ph10 178 switch(*eptr++)
3839     {
3840     default: break;
3841     case 0x0a: /* LF */
3842     case 0x0b: /* VT */
3843     case 0x0c: /* FF */
3844     case 0x0d: /* CR */
3845     case 0x85: /* NEL */
3846     RRETURN(MATCH_NOMATCH);
3847     }
3848     }
3849     break;
3850    
3851     case OP_VSPACE:
3852     for (i = 1; i <= min; i++)
3853     {
3854 ph10 427 if (eptr >= md->end_subject)
3855 ph10 426 {
3856 ph10 427 SCHECK_PARTIAL();
3857 ph10 426 RRETURN(MATCH_NOMATCH);
3858 ph10 427 }
3859 ph10 178 switch(*eptr++)
3860     {
3861     default: RRETURN(MATCH_NOMATCH);
3862     case 0x0a: /* LF */
3863     case 0x0b: /* VT */
3864     case 0x0c: /* FF */
3865     case 0x0d: /* CR */
3866     case 0x85: /* NEL */
3867 ph10 182 break;
3868 ph10 178 }
3869     }
3870     break;
3871    
3872 nigel 77 case OP_NOT_DIGIT:
3873     for (i = 1; i <= min; i++)
3874 ph10 427 {
3875     if (eptr >= md->end_subject)
3876 ph10 426 {
3877 ph10 427 SCHECK_PARTIAL();
3878 ph10 426 RRETURN(MATCH_NOMATCH);
3879 ph10 427 }
3880 nigel 77 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3881 ph10 427 }
3882 nigel 77 break;
3883    
3884     case OP_DIGIT:
3885     for (i = 1; i <= min; i++)
3886 ph10 427 {
3887     if (eptr >= md->end_subject)
3888 ph10 426 {
3889 ph10 427 SCHECK_PARTIAL();
3890 ph10 426 RRETURN(MATCH_NOMATCH);
3891 ph10 427 }
3892 nigel 77 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3893 ph10 427 }
3894 nigel 77 break;
3895    
3896     case OP_NOT_WHITESPACE:
3897     for (i = 1; i <= min; i++)
3898 ph10 427 {
3899     if (eptr >= md->end_subject)
3900 ph10 426 {
3901 ph10 427 SCHECK_PARTIAL();
3902 ph10 426 RRETURN(MATCH_NOMATCH);
3903 ph10 427 }
3904 nigel 77 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3905 ph10 427 }
3906 nigel 77 break;
3907    
3908     case OP_WHITESPACE:
3909     for (i = 1; i <= min; i++)
3910 ph10 427 {
3911     if (eptr >= md->end_subject)
3912 ph10 426 {
3913 ph10 427 SCHECK_PARTIAL();
3914 ph10 426 RRETURN(MATCH_NOMATCH);
3915 ph10 427 }
3916 nigel 77 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3917 ph10 427 }
3918 nigel 77 break;
3919    
3920     case OP_NOT_WORDCHAR:
3921     for (i = 1; i <= min; i++)
3922 ph10 427 {
3923     if (eptr >= md->end_subject)
3924 ph10 426 {
3925 ph10 427 SCHECK_PARTIAL();
3926 ph10 426 RRETURN(MATCH_NOMATCH);
3927 ph10 427 }
3928 nigel 77 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3929     RRETURN(MATCH_NOMATCH);
3930 ph10 427 }
3931 nigel 77 break;
3932    
3933     case OP_WORDCHAR:
3934     for (i = 1; i <= min; i++)
3935 ph10 427 {
3936     if (eptr >= md->end_subject)
3937 ph10 426 {
3938 ph10 427 SCHECK_PARTIAL();
3939 ph10 426 RRETURN(MATCH_NOMATCH);
3940 ph10 427 }
3941 nigel 77 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3942     RRETURN(MATCH_NOMATCH);
3943 ph10 427 }
3944 nigel 77 break;
3945    
3946     default:
3947     RRETURN(PCRE_ERROR_INTERNAL);
3948     }
3949     }
3950    
3951     /* If min = max, continue at the same level without recursing */
3952    
3953     if (min == max) continue;
3954    
3955     /* If minimizing, we have to test the rest of the pattern before each
3956     subsequent match. Again, separate the UTF-8 case for speed, and also
3957     separate the UCP cases. */
3958    
3959     if (minimize)
3960     {
3961     #ifdef SUPPORT_UCP
3962 nigel 87 if (prop_type >= 0)
3963 nigel 77 {
3964 nigel 87 switch(prop_type)
3965 nigel 77 {
3966 nigel 87 case PT_ANY:
3967     for (fi = min;; fi++)
3968     {
3969 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3970 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3971 ph10 428 if (fi >= max) RRETURN(MATCH_NOMATCH);
3972 ph10 427 if (eptr >= md->end_subject)
3973 ph10 426 {
3974 ph10 427 SCHECK_PARTIAL();
3975 ph10 426 RRETURN(MATCH_NOMATCH);
3976 ph10 427 }