/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 538 - (show annotations) (download)
Wed Jun 9 19:30:57 2010 UTC (3 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 185373 byte(s)
Fix pcre_study() problem with non-C-locale chartables in UTF-8 mode.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73
74 #define MATCH_ACCEPT (-999) /* not produced in the code visible here; NOTE(review): presumably tied to the ACCEPT verb - confirm */
75 #define MATCH_COMMIT (-998) /* returned by OP_COMMIT when the rest of the pattern gave MATCH_NOMATCH */
76 #define MATCH_PRUNE (-997) /* returned by OP_PRUNE and OP_PRUNE_ARG when the rest of the pattern gave MATCH_NOMATCH */
77 #define MATCH_SKIP (-996) /* returned by OP_SKIP, and by OP_MARK when it catches a MATCH_SKIP_ARG naming its own mark */
78 #define MATCH_SKIP_ARG (-995) /* returned by OP_SKIP_ARG; the skip name is passed back in md->start_match_ptr */
79 #define MATCH_THEN (-994) /* returned by OP_THEN and OP_THEN_ARG; bracket alternation moves on to the next branch on it, as for MATCH_NOMATCH */
80
81 /* This is a convenience macro for code that occurs many times. */
82
83 #define MRRETURN(ra) \
84 { \
85 md->mark = markptr; /* copy the current mark pointer into the match data; needs md and markptr in scope at the expansion site */ \
86 RRETURN(ra); /* then leave through whichever RRETURN variant is configured */ \
87 }
88
89 /* Maximum number of ints of offset to save on the stack for recursive calls.
90 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
91 because the offset vector is always a multiple of 3 long. */
92
93 #define REC_STACK_SAVE_MAX 30
94
95 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
96
97 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; /* NOTE(review): the six entries fit the star, plus and query quantifiers, each in greedy then minimizing form - confirm against the indexing at the use sites */
98 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; /* a 0 entry here stands for "no upper limit" */
99
100
101
102 #ifdef PCRE_DEBUG
103 /*************************************************
104 * Debugging function to print chars *
105 *************************************************/
106
107 /* Print a sequence of chars in printable format, stopping at the end of the
108 subject if requested.
109
110 Arguments:
111 p points to characters
112 length number to print
113 is_subject TRUE if printing from within md->start_subject
114 md pointer to matching data block, if is_subject is TRUE
115
116 Returns: nothing
117 */
118
119 static void
120 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
121 {
122 unsigned int c; /* the byte currently being printed */
123 if (is_subject && length > md->end_subject - p) length = md->end_subject - p; /* clamp only for subject text; md is not dereferenced when is_subject is FALSE */
124 while (length-- > 0)
125 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); /* printable bytes are echoed, all others appear as a \xhh escape */
126 }
127 #endif
128
129
130
131 /*************************************************
132 * Match a back-reference *
133 *************************************************/
134
135 /* If a back reference hasn't been set, the length that is passed is greater
136 than the number of characters left in the string, so the match fails.
137
138 Arguments:
139 offset index into the offset vector
140 eptr points into the subject
141 length length to be matched
142 md points to match data block
143 ims the ims flags
144
145 Returns: TRUE if matched
146 */
147
148 static BOOL
149 match_ref(int offset, register USPTR eptr, int length, match_data *md,
150 unsigned long int ims)
151 {
152 USPTR p = md->start_subject + md->offset_vector[offset]; /* start of the previously captured text being referenced */
153
154 #ifdef PCRE_DEBUG
155 if (eptr >= md->end_subject)
156 printf("matching subject <null>");
157 else
158 {
159 printf("matching subject ");
160 pchars(eptr, length, TRUE, md);
161 }
162 printf(" against backref ");
163 pchars(p, length, FALSE, md); /* is_subject is FALSE, so pchars() does not clamp length for the reference text */
164 printf("\n");
165 #endif
166
167 /* Always fail if not enough characters left */
168
169 if (length > md->end_subject - eptr) return FALSE;
170
171 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
172 properly if Unicode properties are supported. Otherwise, we can check only
173 ASCII characters. */
174
175 if ((ims & PCRE_CASELESS) != 0)
176 {
177 #ifdef SUPPORT_UTF8
178 #ifdef SUPPORT_UCP
179 if (md->utf8)
180 {
181 USPTR endptr = eptr + length; /* the loop below is bounded on the subject side only */
182 while (eptr < endptr)
183 {
184 int c, d;
185 GETCHARINC(c, eptr); /* next subject character; advances eptr */
186 GETCHARINC(d, p); /* next reference character; advances p */
187 if (c != d && c != UCD_OTHERCASE(d)) return FALSE; /* NOTE(review): p is never bounds-checked here; this presumes both sides consume the same number of bytes - confirm for case pairs whose encodings differ in length */
188 }
189 }
190 else /* when both SUPPORT_UTF8 and SUPPORT_UCP are defined, this else governs the while loop that follows the #endif lines */
191 #endif
192 #endif
193
194 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
195 is no UCP support. */
196
197 while (length-- > 0)
198 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; } /* compare byte by byte after mapping both sides through md->lcc */
199 }
200
201 /* In the caseful case, we can just compare the bytes, whether or not we
202 are in UTF-8 mode. */
203
204 else
205 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
206
207 return TRUE; /* every compared position agreed */
208 }
209
210
211
212 /***************************************************************************
213 ****************************************************************************
214 RECURSION IN THE match() FUNCTION
215
216 The match() function is highly recursive, though not every recursive call
217 increases the recursive depth. Nevertheless, some regular expressions can cause
218 it to recurse to a great depth. I was writing for Unix, so I just let it call
219 itself recursively. This uses the stack for saving everything that has to be
220 saved for a recursive call. On Unix, the stack can be large, and this works
221 fine.
222
223 It turns out that on some non-Unix-like systems there are problems with
224 programs that use a lot of stack. (This despite the fact that every last chip
225 has oodles of memory these days, and techniques for extending the stack have
226 been known for decades.) So....
227
228 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
229 calls by keeping local variables that need to be preserved in blocks of memory
230 obtained from malloc() instead of on the stack. Macros are used to
231 achieve this so that the actual code doesn't look very different to what it
232 always used to.
233
234 The original heap-recursive code used longjmp(). However, it seems that this
235 can be very slow on some operating systems. Following a suggestion from Stan
236 Switzer, the use of longjmp() has been abolished, at the cost of having to
237 provide a unique number for each call to RMATCH. There is no way of generating
238 a sequence of numbers at compile time in C. I have given them names, to make
239 them stand out more clearly.
240
241 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
242 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
243 tests. Furthermore, not using longjmp() means that local dynamic variables
244 don't have indeterminate values; this has meant that the frame size can be
245 reduced because the result can be "passed back" by straight setting of the
246 variable instead of being passed in the frame.
247 ****************************************************************************
248 ***************************************************************************/
249
250 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
251 below must be updated in sync. */
252
253 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
254 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
255 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
256 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
257 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
258 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
259 RM61, RM62 }; /* numbering starts at 1; each name is passed as the rw argument of an RMATCH call */
260
261 /* These versions of the macros use the stack, as normal. There are debugging
262 versions and production versions. Note that the "rw" argument of RMATCH isn't
263 actually used in this definition. */
264
265 #ifndef NO_RECURSE
266 #define REGISTER register
267
268 #ifdef PCRE_DEBUG
269 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
270 { \
271 printf("match() called in line %d\n", __LINE__); \
272 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); /* genuine recursive call; mstart, markptr and rdepth come from the enclosing scope and the result lands in rrc */ \
273 printf("to line %d\n", __LINE__); \
274 }
275 #define RRETURN(ra) \
276 { \
277 printf("match() returned %d from line %d ", ra, __LINE__); /* ra is expanded here and again in the return below */ \
278 return ra; \
279 }
280 #else
281 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
282 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1) /* same call as the debug variant, without tracing; rw is ignored */
283 #define RRETURN(ra) return ra /* an ordinary function return */
284 #endif
285
286 #else
287
288
289 /* These versions of the macros manage a private stack on the heap. Note that
290 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
291 argument of match(), which never changes. */
292
293 #define REGISTER
294
295 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
296 {\
297 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe)); /* storage for the callee's frame */\
298 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY); /* allocation failure is reported through the normal RRETURN path */\
299 frame->Xwhere = rw; /* remember in the caller's frame which call site this is */ \
300 newframe->Xeptr = ra;\
301 newframe->Xecode = rb;\
302 newframe->Xmstart = mstart; /* mstart and markptr are read from the caller, as in the stack-based variant */\
303 newframe->Xmarkptr = markptr;\
304 newframe->Xoffset_top = rc;\
305 newframe->Xims = re;\
306 newframe->Xeptrb = rf;\
307 newframe->Xflags = rg;\
308 newframe->Xrdepth = frame->Xrdepth + 1; /* one level deeper than the caller */\
309 newframe->Xprevframe = frame; /* link back to the caller's frame */\
310 frame = newframe; /* the new frame becomes the current one */\
311 DPRINTF(("restarting from line %d\n", __LINE__));\
312 goto HEAP_RECURSE; /* re-enter the body of match() with the new frame */\
313 L_##rw: /* per-call-site label; NOTE(review): presumably reached from the HEAP_RETURN dispatch on Xwhere, which is not visible here */\
314 DPRINTF(("jumped back to line %d\n", __LINE__));\
315 }
316
317 #define RRETURN(ra)\
318 {\
319 heapframe *oldframe = frame;\
320 frame = oldframe->Xprevframe; /* step back to the caller's frame; frame is dereferenced unconditionally, so it must be non-NULL here */\
321 (pcre_stack_free)(oldframe); /* release the frame that has just finished */\
322 if (frame != NULL) /* a caller frame exists: simulate the return */\
323 {\
324 rrc = ra; /* ra is evaluated after the pop, so frame-resident names in it would refer to the caller's frame */\
325 goto HEAP_RETURN;\
326 }\
327 return ra; /* Xprevframe was NULL: that was the top-level frame, so return from match() for real */\
328 }
329
330
331 /* Structure for remembering the local variables in a private frame */
332
333 typedef struct heapframe { /* one heap-allocated activation record of match(), used when NO_RECURSE is defined */
334 struct heapframe *Xprevframe; /* frame of the caller; NULL marks the top level */
335
336 /* Function arguments that may change */
337
338 USPTR Xeptr;
339 const uschar *Xecode;
340 USPTR Xmstart;
341 USPTR Xmarkptr;
342 int Xoffset_top;
343 long int Xims; /* NOTE(review): signed here, whereas match() declares ims as unsigned long int and Xoriginal_ims below is unsigned */
344 eptrblock *Xeptrb;
345 int Xflags;
346 unsigned int Xrdepth; /* RMATCH stores the caller's Xrdepth + 1 here */
347
348 /* Function local variables */
349
350 USPTR Xcallpat;
351 #ifdef SUPPORT_UTF8
352 USPTR Xcharptr;
353 #endif
354 USPTR Xdata;
355 USPTR Xnext;
356 USPTR Xpp;
357 USPTR Xprev;
358 USPTR Xsaved_eptr;
359
360 recursion_info Xnew_recursive;
361
362 BOOL Xcur_is_word;
363 BOOL Xcondition;
364 BOOL Xprev_is_word;
365
366 unsigned long int Xoriginal_ims; /* match() saves ims here on entry */
367
368 #ifdef SUPPORT_UCP
369 int Xprop_type;
370 int Xprop_value;
371 int Xprop_fail_result;
372 int Xprop_category;
373 int Xprop_chartype;
374 int Xprop_script;
375 int Xoclength;
376 uschar Xocchars[8];
377 #endif
378
379 int Xcodelink;
380 int Xctype;
381 unsigned int Xfc; /* reached as fc inside match(); in the recursive build fc is simply an alias of c */
382 int Xfi; /* reached as fi inside match(); in the recursive build fi is simply an alias of i */
383 int Xlength;
384 int Xmax;
385 int Xmin;
386 int Xnumber;
387 int Xoffset;
388 int Xop;
389 int Xsave_capture_last;
390 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
391 int Xstacksave[REC_STACK_SAVE_MAX];
392
393 eptrblock Xnewptrb; /* block that match() links into the eptrb chain when match_cbegroup is set */
394
395 /* Where to jump back to */
396
397 int Xwhere; /* the rw value stored by RMATCH just before it jumps to HEAP_RECURSE */
398
399 } heapframe;
400
401 #endif
402
403
404 /***************************************************************************
405 ***************************************************************************/
406
407
408
409 /*************************************************
410 * Match from current position *
411 *************************************************/
412
413 /* This function is called recursively in many circumstances. Whenever it
414 returns a negative (error) response, the outer incarnation must also return the
415 same response. */
416
417 /* These macros pack up tests that are used for partial matching, and which
418 appear several times in the code. We set the "hit end" flag if the pointer is
419 at the end of the subject and also past the start of the subject (i.e.
420 something has been matched). For hard partial matching, we then return
421 immediately. The second one is used when we already know we are past the end of
422 the subject. */
423
424 #define CHECK_PARTIAL()\
425 if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart) /* partial matching requested, subject exhausted, and eptr is past the match start */\
426 {\
427 md->hitend = TRUE; /* record that the end of the subject was reached */\
428 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); /* values above 1 give up at once with a partial-match result */\
429 }
430
431 #define SCHECK_PARTIAL()\
432 if (md->partial != 0 && eptr > mstart) /* same test as CHECK_PARTIAL but without the end-of-subject comparison */\
433 {\
434 md->hitend = TRUE; /* record that the end of the subject was reached */\
435 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); /* values above 1 give up at once with a partial-match result */\
436 }
437
438
439 /* Performance note: It might be tempting to extract commonly used fields from
440 the md structure (e.g. utf8, end_subject) into individual variables to improve
441 performance. Tests using gcc on a SPARC disproved this; in the first case, it
442 made performance worse.
443
444 Arguments:
445 eptr pointer to current character in subject
446 ecode pointer to current position in compiled code
447 mstart pointer to the current match start position (can be modified
448 by encountering \K)
449 markptr pointer to the most recent MARK name, or NULL
450 offset_top current top pointer
451 md pointer to "static" info for the match
452 ims current /i, /m, and /s options
453 eptrb pointer to chain of blocks containing eptr at start of
454 brackets - for testing for empty matches
455 flags can contain
456 match_condassert - this is an assertion condition
457 match_cbegroup - this is the start of an unlimited repeat
458 group that can match an empty string
459 rdepth the recursion depth
460
461 Returns: MATCH_MATCH if matched ) these values are >= 0
462 MATCH_NOMATCH if failed to match )
463 a negative MATCH_xxx value for PRUNE, SKIP, etc
464 a negative PCRE_ERROR_xxx value if aborted by an error condition
465 (e.g. stopped by repeated call or recursion limit)
466 */
467
468 static int
469 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
470 const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
471 eptrblock *eptrb, int flags, unsigned int rdepth)
472 {
473 /* These variables do not need to be preserved over recursion in this function,
474 so they can be ordinary variables in all cases. Mark some of them with
475 "register" because they are used a lot in loops. */
476
477 register int rrc; /* Returns from recursive calls */
478 register int i; /* Used for loops not involving calls to RMATCH() */
479 register unsigned int c; /* Character values not kept over RMATCH() calls */
480 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
481
482 BOOL minimize, possessive; /* Quantifier options */
483 int condcode;
484
485 /* When recursion is not being used, all "local" variables that have to be
486 preserved over calls to RMATCH() are part of a "frame" which is obtained from
487 heap storage. Set up the top-level frame here; others are obtained from the
488 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
489
490 #ifdef NO_RECURSE
491 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
492 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
493 frame->Xprevframe = NULL; /* Marks the top level */
494
495 /* Copy in the original argument variables */
496
497 frame->Xeptr = eptr;
498 frame->Xecode = ecode;
499 frame->Xmstart = mstart;
500 frame->Xmarkptr = markptr;
501 frame->Xoffset_top = offset_top;
502 frame->Xims = ims;
503 frame->Xeptrb = eptrb;
504 frame->Xflags = flags;
505 frame->Xrdepth = rdepth;
506
507 /* This is where control jumps back to to effect "recursion" */
508
509 HEAP_RECURSE:
510
511 /* Macros make the argument variables come from the current frame */
512
513 #define eptr frame->Xeptr
514 #define ecode frame->Xecode
515 #define mstart frame->Xmstart
516 #define markptr frame->Xmarkptr
517 #define offset_top frame->Xoffset_top
518 #define ims frame->Xims
519 #define eptrb frame->Xeptrb
520 #define flags frame->Xflags
521 #define rdepth frame->Xrdepth
522
523 /* Ditto for the local variables */
524
525 #ifdef SUPPORT_UTF8
526 #define charptr frame->Xcharptr
527 #endif
528 #define callpat frame->Xcallpat
529 #define codelink frame->Xcodelink
530 #define data frame->Xdata
531 #define next frame->Xnext
532 #define pp frame->Xpp
533 #define prev frame->Xprev
534 #define saved_eptr frame->Xsaved_eptr
535
536 #define new_recursive frame->Xnew_recursive
537
538 #define cur_is_word frame->Xcur_is_word
539 #define condition frame->Xcondition
540 #define prev_is_word frame->Xprev_is_word
541
542 #define original_ims frame->Xoriginal_ims
543
544 #ifdef SUPPORT_UCP
545 #define prop_type frame->Xprop_type
546 #define prop_value frame->Xprop_value
547 #define prop_fail_result frame->Xprop_fail_result
548 #define prop_category frame->Xprop_category
549 #define prop_chartype frame->Xprop_chartype
550 #define prop_script frame->Xprop_script
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580
581 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
582 const uschar *charptr; /* in small blocks of the code. My normal */
583 #endif /* style of coding would have declared */
584 const uschar *callpat; /* them within each of those blocks. */
585 const uschar *data; /* However, in order to accommodate the */
586 const uschar *next; /* version of this code that uses an */
587 USPTR pp; /* external "stack" implemented on the */
588 const uschar *prev; /* heap, it is easier to declare them all */
589 USPTR saved_eptr; /* here, so the declarations can be cut */
590 /* out in a block. The only declarations */
591 recursion_info new_recursive; /* within blocks below are for variables */
592 /* that do not have to be preserved over */
593 BOOL cur_is_word; /* a recursive call to RMATCH(). */
594 BOOL condition;
595 BOOL prev_is_word;
596
597 unsigned long int original_ims;
598
599 #ifdef SUPPORT_UCP
600 int prop_type;
601 int prop_value;
602 int prop_fail_result;
603 int prop_category;
604 int prop_chartype;
605 int prop_script;
606 int oclength;
607 uschar occhars[8];
608 #endif
609
610 int codelink;
611 int ctype;
612 int length;
613 int max;
614 int min;
615 int number;
616 int offset;
617 int op;
618 int save_capture_last;
619 int save_offset1, save_offset2, save_offset3;
620 int stacksave[REC_STACK_SAVE_MAX];
621
622 eptrblock newptrb;
623 #endif /* NO_RECURSE */
624
625 /* These statements are here to stop the compiler complaining about uninitialized
626 variables. */
627
628 #ifdef SUPPORT_UCP
629 prop_value = 0;
630 prop_fail_result = 0;
631 #endif
632
633
634 /* This label is used for tail recursion, which is used in a few cases even
635 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
636 used. Thanks to Ian Taylor for noticing this possibility and sending the
637 original patch. */
638
639 TAIL_RECURSE:
640
641 /* OK, now we can get on with the real code of the function. Recursive calls
642 are specified by the macro RMATCH and RRETURN is used to return. When
643 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
644 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
645 defined). However, RMATCH isn't like a function call because it's quite a
646 complicated macro. It has to be used in one particular way. This shouldn't,
647 however, impact performance when true recursion is being used. */
648
649 #ifdef SUPPORT_UTF8
650 utf8 = md->utf8; /* Local copy of the flag */
651 #else
652 utf8 = FALSE;
653 #endif
654
655 /* First check that we haven't called match() too many times, or that we
656 haven't exceeded the recursive call limit. */
657
658 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
659 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
660
661 original_ims = ims; /* Save for resetting on ')' */
662
663 /* At the start of a group with an unlimited repeat that may match an empty
664 string, the match_cbegroup flag is set. When this is the case, add the current
665 subject pointer to the chain of such remembered pointers, to be checked when we
666 hit the closing ket, in order to break infinite loops that match no characters.
667 When match() is called in other circumstances, don't add to the chain. The
668 match_cbegroup flag must NOT be used with tail recursion, because the memory
669 block that is used is on the stack, so a new one may be required for each
670 match(). */
671
672 if ((flags & match_cbegroup) != 0)
673 {
674 newptrb.epb_saved_eptr = eptr;
675 newptrb.epb_prev = eptrb;
676 eptrb = &newptrb;
677 }
678
679 /* Now start processing the opcodes. */
680
681 for (;;)
682 {
683 minimize = possessive = FALSE;
684 op = *ecode;
685
686 switch(op)
687 {
688 case OP_MARK:
689 markptr = ecode + 2;
690 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
691 ims, eptrb, flags, RM55);
692
693 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
694 argument, and we must check whether that argument matches this MARK's
695 argument. It is passed back in md->start_match_ptr (an overloading of that
696 variable). If it does match, we reset that variable to the current subject
697 position and return MATCH_SKIP. Otherwise, pass back the return code
698 unaltered. */
699
700 if (rrc == MATCH_SKIP_ARG &&
701 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
702 {
703 md->start_match_ptr = eptr;
704 RRETURN(MATCH_SKIP);
705 }
706
707 if (md->mark == NULL) md->mark = markptr;
708 RRETURN(rrc);
709
710 case OP_FAIL:
711 MRRETURN(MATCH_NOMATCH);
712
713 case OP_COMMIT:
714 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
715 ims, eptrb, flags, RM52);
716 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
717 MRRETURN(MATCH_COMMIT);
718
719 case OP_PRUNE:
720 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
721 ims, eptrb, flags, RM51);
722 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
723 MRRETURN(MATCH_PRUNE);
724
725 case OP_PRUNE_ARG:
726 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
727 ims, eptrb, flags, RM56);
728 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
729 md->mark = ecode + 2;
730 RRETURN(MATCH_PRUNE);
731
732 case OP_SKIP:
733 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
734 ims, eptrb, flags, RM53);
735 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
736 md->start_match_ptr = eptr; /* Pass back current position */
737 MRRETURN(MATCH_SKIP);
738
739 case OP_SKIP_ARG:
740 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
741 ims, eptrb, flags, RM57);
742 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
743
744 /* Pass back the current skip name by overloading md->start_match_ptr and
745 returning the special MATCH_SKIP_ARG return code. This will either be
746 caught by a matching MARK, or get to the top, where it is treated the same
747 as PRUNE. */
748
749 md->start_match_ptr = ecode + 2;
750 RRETURN(MATCH_SKIP_ARG);
751
752 case OP_THEN:
753 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
754 ims, eptrb, flags, RM54);
755 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
756 MRRETURN(MATCH_THEN);
757
758 case OP_THEN_ARG:
759 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
760 ims, eptrb, flags, RM58);
761 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
762 md->mark = ecode + 2;
763 RRETURN(MATCH_THEN);
764
765 /* Handle a capturing bracket. If there is space in the offset vector, save
766 the current subject position in the working slot at the top of the vector.
767 We mustn't change the current values of the data slot, because they may be
768 set from a previous iteration of this group, and be referred to by a
769 reference inside the group.
770
771 If the bracket fails to match, we need to restore this value and also the
772 values of the final offsets, in case they were set by a previous iteration
773 of the same bracket.
774
775 If there isn't enough space in the offset vector, treat this as if it were
776 a non-capturing bracket. Don't worry about setting the flag for the error
777 case here; that is handled in the code for KET. */
778
779 case OP_CBRA:
780 case OP_SCBRA:
781 number = GET2(ecode, 1+LINK_SIZE);
782 offset = number << 1;
783
784 #ifdef PCRE_DEBUG
785 printf("start bracket %d\n", number);
786 printf("subject=");
787 pchars(eptr, 16, TRUE, md);
788 printf("\n");
789 #endif
790
791 if (offset < md->offset_max)
792 {
793 save_offset1 = md->offset_vector[offset];
794 save_offset2 = md->offset_vector[offset+1];
795 save_offset3 = md->offset_vector[md->offset_end - number];
796 save_capture_last = md->capture_last;
797
798 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
799 md->offset_vector[md->offset_end - number] =
800 (int)(eptr - md->start_subject);
801
802 flags = (op == OP_SCBRA)? match_cbegroup : 0;
803 do
804 {
805 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
806 ims, eptrb, flags, RM1);
807 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
808 md->capture_last = save_capture_last;
809 ecode += GET(ecode, 1);
810 }
811 while (*ecode == OP_ALT);
812
813 DPRINTF(("bracket %d failed\n", number));
814
815 md->offset_vector[offset] = save_offset1;
816 md->offset_vector[offset+1] = save_offset2;
817 md->offset_vector[md->offset_end - number] = save_offset3;
818
819 if (rrc != MATCH_THEN) md->mark = markptr;
820 RRETURN(MATCH_NOMATCH);
821 }
822
823 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
824 as a non-capturing bracket. */
825
826 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
827 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
828
829 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
830
831 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
832 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
833
834 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
835 final alternative within the brackets, we would return the result of a
836 recursive call to match() whatever happened. We can reduce stack usage by
837 turning this into a tail recursion, except in the case when match_cbegroup
838 is set.*/
839
840 case OP_BRA:
841 case OP_SBRA:
842 DPRINTF(("start non-capturing bracket\n"));
843 flags = (op >= OP_SBRA)? match_cbegroup : 0;
844 for (;;)
845 {
846 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
847 {
848 if (flags == 0) /* Not a possibly empty group */
849 {
850 ecode += _pcre_OP_lengths[*ecode];
851 DPRINTF(("bracket 0 tail recursion\n"));
852 goto TAIL_RECURSE;
853 }
854
855 /* Possibly empty group; can't use tail recursion. */
856
857 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
858 eptrb, flags, RM48);
859 if (rrc == MATCH_NOMATCH) md->mark = markptr;
860 RRETURN(rrc);
861 }
862
863 /* For non-final alternatives, continue the loop for a NOMATCH result;
864 otherwise return. */
865
866 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
867 eptrb, flags, RM2);
868 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
869 ecode += GET(ecode, 1);
870 }
871 /* Control never reaches here. */
872
873 /* Conditional group: compilation checked that there are no more than
874 two branches. If the condition is false, skipping the first branch takes us
875 past the end if there is only one branch, but that's OK because that is
876 exactly what going to the ket would do. As there is only one branch to be
877 obeyed, we can use tail recursion to avoid using another stack frame. */
878
879 case OP_COND:
880 case OP_SCOND:
881 codelink= GET(ecode, 1);
882
883 /* Because of the way auto-callout works during compile, a callout item is
884 inserted between OP_COND and an assertion condition. */
885
886 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
887 {
888 if (pcre_callout != NULL)
889 {
890 pcre_callout_block cb;
891 cb.version = 1; /* Version 1 of the callout block */
892 cb.callout_number = ecode[LINK_SIZE+2];
893 cb.offset_vector = md->offset_vector;
894 cb.subject = (PCRE_SPTR)md->start_subject;
895 cb.subject_length = (int)(md->end_subject - md->start_subject);
896 cb.start_match = (int)(mstart - md->start_subject);
897 cb.current_position = (int)(eptr - md->start_subject);
898 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
899 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
900 cb.capture_top = offset_top/2;
901 cb.capture_last = md->capture_last;
902 cb.callout_data = md->callout_data;
903 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
904 if (rrc < 0) RRETURN(rrc);
905 }
906 ecode += _pcre_OP_lengths[OP_CALLOUT];
907 }
908
909 condcode = ecode[LINK_SIZE+1];
910
911 /* Now see what the actual condition is */
912
913 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
914 {
915 if (md->recursive == NULL) /* Not recursing => FALSE */
916 {
917 condition = FALSE;
918 ecode += GET(ecode, 1);
919 }
920 else
921 {
922 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
923 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
924
925 /* If the test is for recursion into a specific subpattern, and it is
926 false, but the test was set up by name, scan the table to see if the
927 name refers to any other numbers, and test them. The condition is true
928 if any one is set. */
929
930 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
931 {
932 uschar *slotA = md->name_table;
933 for (i = 0; i < md->name_count; i++)
934 {
935 if (GET2(slotA, 0) == recno) break;
936 slotA += md->name_entry_size;
937 }
938
939 /* Found a name for the number - there can be only one; duplicate
940 names for different numbers are allowed, but not vice versa. First
941 scan down for duplicates. */
942
943 if (i < md->name_count)
944 {
945 uschar *slotB = slotA;
946 while (slotB > md->name_table)
947 {
948 slotB -= md->name_entry_size;
949 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
950 {
951 condition = GET2(slotB, 0) == md->recursive->group_num;
952 if (condition) break;
953 }
954 else break;
955 }
956
957 /* Scan up for duplicates */
958
959 if (!condition)
960 {
961 slotB = slotA;
962 for (i++; i < md->name_count; i++)
963 {
964 slotB += md->name_entry_size;
965 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
966 {
967 condition = GET2(slotB, 0) == md->recursive->group_num;
968 if (condition) break;
969 }
970 else break;
971 }
972 }
973 }
974 }
975
976 /* Choose branch according to the condition */
977
978 ecode += condition? 3 : GET(ecode, 1);
979 }
980 }
981
982 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
983 {
984 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
985 condition = offset < offset_top && md->offset_vector[offset] >= 0;
986
987 /* If the numbered capture is unset, but the reference was by name,
988 scan the table to see if the name refers to any other numbers, and test
989 them. The condition is true if any one is set. This is tediously similar
990 to the code above, but not close enough to try to amalgamate. */
991
992 if (!condition && condcode == OP_NCREF)
993 {
994 int refno = offset >> 1;
995 uschar *slotA = md->name_table;
996
997 for (i = 0; i < md->name_count; i++)
998 {
999 if (GET2(slotA, 0) == refno) break;
1000 slotA += md->name_entry_size;
1001 }
1002
1003 /* Found a name for the number - there can be only one; duplicate names
1004 for different numbers are allowed, but not vice versa. First scan down
1005 for duplicates. */
1006
1007 if (i < md->name_count)
1008 {
1009 uschar *slotB = slotA;
1010 while (slotB > md->name_table)
1011 {
1012 slotB -= md->name_entry_size;
1013 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1014 {
1015 offset = GET2(slotB, 0) << 1;
1016 condition = offset < offset_top &&
1017 md->offset_vector[offset] >= 0;
1018 if (condition) break;
1019 }
1020 else break;
1021 }
1022
1023 /* Scan up for duplicates */
1024
1025 if (!condition)
1026 {
1027 slotB = slotA;
1028 for (i++; i < md->name_count; i++)
1029 {
1030 slotB += md->name_entry_size;
1031 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1032 {
1033 offset = GET2(slotB, 0) << 1;
1034 condition = offset < offset_top &&
1035 md->offset_vector[offset] >= 0;
1036 if (condition) break;
1037 }
1038 else break;
1039 }
1040 }
1041 }
1042 }
1043
1044 /* Choose branch according to the condition */
1045
1046 ecode += condition? 3 : GET(ecode, 1);
1047 }
1048
1049 else if (condcode == OP_DEF) /* DEFINE - always false */
1050 {
1051 condition = FALSE;
1052 ecode += GET(ecode, 1);
1053 }
1054
1055 /* The condition is an assertion. Call match() to evaluate it - setting
1056 the final argument match_condassert causes it to stop at the end of an
1057 assertion. */
1058
1059 else
1060 {
1061 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
1062 match_condassert, RM3);
1063 if (rrc == MATCH_MATCH)
1064 {
1065 condition = TRUE;
1066 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1067 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1068 }
1069 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1070 {
1071 RRETURN(rrc); /* Need braces because of following else */
1072 }
1073 else
1074 {
1075 condition = FALSE;
1076 ecode += codelink;
1077 }
1078 }
1079
1080 /* We are now at the branch that is to be obeyed. As there is only one,
1081 we can use tail recursion to avoid using another stack frame, except when
1082 match_cbegroup is required for an unlimited repeat of a possibly empty
1083 group. If the second alternative doesn't exist, we can just plough on. */
1084
1085 if (condition || *ecode == OP_ALT)
1086 {
1087 ecode += 1 + LINK_SIZE;
1088 if (op == OP_SCOND) /* Possibly empty group */
1089 {
1090 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1091 RRETURN(rrc);
1092 }
1093 else /* Group must match something */
1094 {
1095 flags = 0;
1096 goto TAIL_RECURSE;
1097 }
1098 }
1099 else /* Condition false & no alternative */
1100 {
1101 ecode += 1 + LINK_SIZE;
1102 }
1103 break;
1104
1105
1106 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1107 to close any currently open capturing brackets. */
1108
1109 case OP_CLOSE:
1110 number = GET2(ecode, 1);
1111 offset = number << 1;
1112
1113 #ifdef PCRE_DEBUG
1114 printf("end bracket %d at *ACCEPT", number);
1115 printf("\n");
1116 #endif
1117
1118 md->capture_last = number;
1119 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1120 {
1121 md->offset_vector[offset] =
1122 md->offset_vector[md->offset_end - number];
1123 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1124 if (offset_top <= offset) offset_top = offset + 2;
1125 }
1126 ecode += 3;
1127 break;
1128
1129
1130 /* End of the pattern, either real or forced. If we are in a top-level
1131 recursion, we should restore the offsets appropriately and continue from
1132 after the call. */
1133
1134 case OP_ACCEPT:
1135 case OP_END:
1136 if (md->recursive != NULL && md->recursive->group_num == 0)
1137 {
1138 recursion_info *rec = md->recursive;
1139 DPRINTF(("End of pattern in a (?0) recursion\n"));
1140 md->recursive = rec->prevrec;
1141 memmove(md->offset_vector, rec->offset_save,
1142 rec->saved_max * sizeof(int));
1143 offset_top = rec->save_offset_top;
1144 ims = original_ims;
1145 ecode = rec->after_call;
1146 break;
1147 }
1148
1149 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1150 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1151 the subject. In both cases, backtracking will then try other alternatives,
1152 if any. */
1153
1154 if (eptr == mstart &&
1155 (md->notempty ||
1156 (md->notempty_atstart &&
1157 mstart == md->start_subject + md->start_offset)))
1158 MRRETURN(MATCH_NOMATCH);
1159
1160 /* Otherwise, we have a match. */
1161
1162 md->end_match_ptr = eptr; /* Record where we ended */
1163 md->end_offset_top = offset_top; /* and how many extracts were taken */
1164 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1165
1166 /* For some reason, the macros don't work properly if an expression is
1167 given as the argument to MRRETURN when the heap is in use. */
1168
1169 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1170 MRRETURN(rrc);
1171
1172 /* Change option settings */
1173
1174 case OP_OPT:
1175 ims = ecode[1];
1176 ecode += 2;
1177 DPRINTF(("ims set to %02lx\n", ims));
1178 break;
1179
1180 /* Assertion brackets. Check the alternative branches in turn - the
1181 matching won't pass the KET for an assertion. If any one branch matches,
1182 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1183 start of each branch to move the current point backwards, so the code at
1184 this level is identical to the lookahead case. */
1185
1186 case OP_ASSERT:
1187 case OP_ASSERTBACK:
1188 do
1189 {
1190 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1191 RM4);
1192 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1193 {
1194 mstart = md->start_match_ptr; /* In case \K reset it */
1195 break;
1196 }
1197 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1198 ecode += GET(ecode, 1);
1199 }
1200 while (*ecode == OP_ALT);
1201 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1202
1203 /* If checking an assertion for a condition, return MATCH_MATCH. */
1204
1205 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1206
1207 /* Continue from after the assertion, updating the offsets high water
1208 mark, since extracts may have been taken during the assertion. */
1209
1210 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1211 ecode += 1 + LINK_SIZE;
1212 offset_top = md->end_offset_top;
1213 continue;
1214
1215 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1216 PRUNE, or COMMIT means we must assume failure without checking subsequent
1217 branches. */
1218
1219 case OP_ASSERT_NOT:
1220 case OP_ASSERTBACK_NOT:
1221 do
1222 {
1223 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1224 RM5);
1225 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1226 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1227 {
1228 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1229 break;
1230 }
1231 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1232 ecode += GET(ecode,1);
1233 }
1234 while (*ecode == OP_ALT);
1235
1236 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1237
1238 ecode += 1 + LINK_SIZE;
1239 continue;
1240
1241 /* Move the subject pointer back. This occurs only at the start of
1242 each branch of a lookbehind assertion. If we are too close to the start to
1243 move back, this match function fails. When working with UTF-8 we move
1244 back a number of characters, not bytes. */
1245
1246 case OP_REVERSE:
1247 #ifdef SUPPORT_UTF8
1248 if (utf8)
1249 {
1250 i = GET(ecode, 1);
1251 while (i-- > 0)
1252 {
1253 eptr--;
1254 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1255 BACKCHAR(eptr);
1256 }
1257 }
1258 else
1259 #endif
1260
1261 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1262
1263 {
1264 eptr -= GET(ecode, 1);
1265 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1266 }
1267
1268 /* Save the earliest consulted character, then skip to next op code */
1269
1270 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1271 ecode += 1 + LINK_SIZE;
1272 break;
1273
1274 /* The callout item calls an external function, if one is provided, passing
1275 details of the match so far. This is mainly for debugging, though the
1276 function is able to force a failure. */
1277
1278 case OP_CALLOUT:
1279 if (pcre_callout != NULL)
1280 {
1281 pcre_callout_block cb;
1282 cb.version = 1; /* Version 1 of the callout block */
1283 cb.callout_number = ecode[1];
1284 cb.offset_vector = md->offset_vector;
1285 cb.subject = (PCRE_SPTR)md->start_subject;
1286 cb.subject_length = (int)(md->end_subject - md->start_subject);
1287 cb.start_match = (int)(mstart - md->start_subject);
1288 cb.current_position = (int)(eptr - md->start_subject);
1289 cb.pattern_position = GET(ecode, 2);
1290 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1291 cb.capture_top = offset_top/2;
1292 cb.capture_last = md->capture_last;
1293 cb.callout_data = md->callout_data;
1294 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1295 if (rrc < 0) RRETURN(rrc);
1296 }
1297 ecode += 2 + 2*LINK_SIZE;
1298 break;
1299
1300 /* Recursion either matches the current regex, or some subexpression. The
1301 offset data is the offset to the starting bracket from the start of the
1302 whole pattern. (This is so that it works from duplicated subpatterns.)
1303
1304 If there are any capturing brackets started but not finished, we have to
1305 save their starting points and reinstate them after the recursion. However,
1306 we don't know how many such there are (offset_top records the completed
1307 total) so we just have to save all the potential data. There may be up to
1308 65535 such values, which is too large to put on the stack, but using malloc
1309 for small numbers seems expensive. As a compromise, the stack is used when
1310 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1311 is used. A problem is what to do if the malloc fails ... there is no way of
1312 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
1313 values on the stack, and accept that the rest may be wrong.
1314
1315 There are also other values that have to be saved. We use a chained
1316 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1317 for the original version of this logic. */
1318
1319 case OP_RECURSE:
1320 {
1321 callpat = md->start_code + GET(ecode, 1);
1322 new_recursive.group_num = (callpat == md->start_code)? 0 :
1323 GET2(callpat, 1 + LINK_SIZE);
1324
1325 /* Add to "recursing stack" */
1326
1327 new_recursive.prevrec = md->recursive;
1328 md->recursive = &new_recursive;
1329
1330 /* Find where to continue from afterwards */
1331
1332 ecode += 1 + LINK_SIZE;
1333 new_recursive.after_call = ecode;
1334
1335 /* Now save the offset data. */
1336
1337 new_recursive.saved_max = md->offset_end;
1338 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1339 new_recursive.offset_save = stacksave;
1340 else
1341 {
1342 new_recursive.offset_save =
1343 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1344 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1345 }
1346
1347 memcpy(new_recursive.offset_save, md->offset_vector,
1348 new_recursive.saved_max * sizeof(int));
1349 new_recursive.save_offset_top = offset_top;
1350
1351 /* OK, now we can do the recursion. For each top-level alternative we
1352 restore the offset and recursion data. */
1353
1354 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1355 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1356 do
1357 {
1358 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1359 md, ims, eptrb, flags, RM6);
1360 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1361 {
1362 DPRINTF(("Recursion matched\n"));
1363 md->recursive = new_recursive.prevrec;
1364 if (new_recursive.offset_save != stacksave)
1365 (pcre_free)(new_recursive.offset_save);
1366 MRRETURN(MATCH_MATCH);
1367 }
1368 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1369 {
1370 DPRINTF(("Recursion gave error %d\n", rrc));
1371 if (new_recursive.offset_save != stacksave)
1372 (pcre_free)(new_recursive.offset_save);
1373 RRETURN(rrc);
1374 }
1375
1376 md->recursive = &new_recursive;
1377 memcpy(md->offset_vector, new_recursive.offset_save,
1378 new_recursive.saved_max * sizeof(int));
1379 callpat += GET(callpat, 1);
1380 }
1381 while (*callpat == OP_ALT);
1382
1383 DPRINTF(("Recursion didn't match\n"));
1384 md->recursive = new_recursive.prevrec;
1385 if (new_recursive.offset_save != stacksave)
1386 (pcre_free)(new_recursive.offset_save);
1387 MRRETURN(MATCH_NOMATCH);
1388 }
1389 /* Control never reaches here */
1390
1391 /* "Once" brackets are like assertion brackets except that after a match,
1392 the point in the subject string is not moved back. Thus there can never be
1393 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1394 Check the alternative branches in turn - the matching won't pass the KET
1395 for this kind of subpattern. If any one branch matches, we carry on as at
1396 the end of a normal bracket, leaving the subject pointer, but resetting
1397 the start-of-match value in case it was changed by \K. */
1398
1399 case OP_ONCE:
1400 prev = ecode;
1401 saved_eptr = eptr;
1402
1403 do
1404 {
1405 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1406 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1407 {
1408 mstart = md->start_match_ptr;
1409 break;
1410 }
1411 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1412 ecode += GET(ecode,1);
1413 }
1414 while (*ecode == OP_ALT);
1415
1416 /* If we hit the end of the group (which could be repeated), fail */
1417
1418 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1419
1420 /* Continue as from after the assertion, updating the offsets high water
1421 mark, since extracts may have been taken. */
1422
1423 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1424
1425 offset_top = md->end_offset_top;
1426 eptr = md->end_match_ptr;
1427
1428 /* For a non-repeating ket, just continue at this level. This also
1429 happens for a repeating ket if no characters were matched in the group.
1430 This is the forcible breaking of infinite loops as implemented in Perl
1431 5.005. If there is an options reset, it will get obeyed in the normal
1432 course of events. */
1433
1434 if (*ecode == OP_KET || eptr == saved_eptr)
1435 {
1436 ecode += 1+LINK_SIZE;
1437 break;
1438 }
1439
1440 /* The repeating kets try the rest of the pattern or restart from the
1441 preceding bracket, in the appropriate order. The second "call" of match()
1442 uses tail recursion, to avoid using another stack frame. We need to reset
1443 any options that changed within the bracket before re-running it, so
1444 check the next opcode. */
1445
1446 if (ecode[1+LINK_SIZE] == OP_OPT)
1447 {
1448 ims = (ims & ~PCRE_IMS) | ecode[4];
1449 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1450 }
1451
1452 if (*ecode == OP_KETRMIN)
1453 {
1454 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1455 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1456 ecode = prev;
1457 flags = 0;
1458 goto TAIL_RECURSE;
1459 }
1460 else /* OP_KETRMAX */
1461 {
1462 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1463 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1464 ecode += 1 + LINK_SIZE;
1465 flags = 0;
1466 goto TAIL_RECURSE;
1467 }
1468 /* Control never gets here */
1469
1470 /* An alternation is the end of a branch; scan along to find the end of the
1471 bracketed group and go to there. */
1472
1473 case OP_ALT:
1474 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1475 break;
1476
1477 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1478 indicating that it may occur zero times. It may repeat infinitely, or not
1479 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1480 with fixed upper repeat limits are compiled as a number of copies, with the
1481 optional ones preceded by BRAZERO or BRAMINZERO. */
1482
1483 case OP_BRAZERO:
1484 {
1485 next = ecode+1;
1486 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1487 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1488 do next += GET(next,1); while (*next == OP_ALT);
1489 ecode = next + 1 + LINK_SIZE;
1490 }
1491 break;
1492
1493 case OP_BRAMINZERO:
1494 {
1495 next = ecode+1;
1496 do next += GET(next, 1); while (*next == OP_ALT);
1497 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1498 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1499 ecode++;
1500 }
1501 break;
1502
1503 case OP_SKIPZERO:
1504 {
1505 next = ecode+1;
1506 do next += GET(next,1); while (*next == OP_ALT);
1507 ecode = next + 1 + LINK_SIZE;
1508 }
1509 break;
1510
1511 /* End of a group, repeated or non-repeating. */
1512
1513 case OP_KET:
1514 case OP_KETRMIN:
1515 case OP_KETRMAX:
1516 prev = ecode - GET(ecode, 1);
1517
1518 /* If this was a group that remembered the subject start, in order to break
1519 infinite repeats of empty string matches, retrieve the subject start from
1520 the chain. Otherwise, set it NULL. */
1521
1522 if (*prev >= OP_SBRA)
1523 {
1524 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1525 eptrb = eptrb->epb_prev; /* Backup to previous group */
1526 }
1527 else saved_eptr = NULL;
1528
1529 /* If we are at the end of an assertion group or an atomic group, stop
1530 matching and return MATCH_MATCH, but record the current high water mark for
1531 use by positive assertions. We also need to record the match start in case
1532 it was changed by \K. */
1533
1534 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1535 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1536 *prev == OP_ONCE)
1537 {
1538 md->end_match_ptr = eptr; /* For ONCE */
1539 md->end_offset_top = offset_top;
1540 md->start_match_ptr = mstart;
1541 MRRETURN(MATCH_MATCH);
1542 }
1543
1544 /* For capturing groups we have to check the group number back at the start
1545 and if necessary complete handling an extraction by setting the offsets and
1546 bumping the high water mark. Note that whole-pattern recursion is coded as
1547 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1548 when the OP_END is reached. Other recursion is handled here. */
1549
1550 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1551 {
1552 number = GET2(prev, 1+LINK_SIZE);
1553 offset = number << 1;
1554
1555 #ifdef PCRE_DEBUG
1556 printf("end bracket %d", number);
1557 printf("\n");
1558 #endif
1559
1560 md->capture_last = number;
1561 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1562 {
1563 md->offset_vector[offset] =
1564 md->offset_vector[md->offset_end - number];
1565 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1566 if (offset_top <= offset) offset_top = offset + 2;
1567 }
1568
1569 /* Handle a recursively called group. Restore the offsets
1570 appropriately and continue from after the call. */
1571
1572 if (md->recursive != NULL && md->recursive->group_num == number)
1573 {
1574 recursion_info *rec = md->recursive;
1575 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1576 md->recursive = rec->prevrec;
1577 memcpy(md->offset_vector, rec->offset_save,
1578 rec->saved_max * sizeof(int));
1579 offset_top = rec->save_offset_top;
1580 ecode = rec->after_call;
1581 ims = original_ims;
1582 break;
1583 }
1584 }
1585
1586 /* For both capturing and non-capturing groups, reset the value of the ims
1587 flags, in case they got changed during the group. */
1588
1589 ims = original_ims;
1590 DPRINTF(("ims reset to %02lx\n", ims));
1591
1592 /* For a non-repeating ket, just continue at this level. This also
1593 happens for a repeating ket if no characters were matched in the group.
1594 This is the forcible breaking of infinite loops as implemented in Perl
1595 5.005. If there is an options reset, it will get obeyed in the normal
1596 course of events. */
1597
1598 if (*ecode == OP_KET || eptr == saved_eptr)
1599 {
1600 ecode += 1 + LINK_SIZE;
1601 break;
1602 }
1603
1604 /* The repeating kets try the rest of the pattern or restart from the
1605 preceding bracket, in the appropriate order. In the second case, we can use
1606 tail recursion to avoid using another stack frame, unless we have an
1607 unlimited repeat of a group that can match an empty string. */
1608
1609 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1610
1611 if (*ecode == OP_KETRMIN)
1612 {
1613 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1614 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1615 if (flags != 0) /* Could match an empty string */
1616 {
1617 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1618 RRETURN(rrc);
1619 }
1620 ecode = prev;
1621 goto TAIL_RECURSE;
1622 }
1623 else /* OP_KETRMAX */
1624 {
1625 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1626 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1627 ecode += 1 + LINK_SIZE;
1628 flags = 0;
1629 goto TAIL_RECURSE;
1630 }
1631 /* Control never gets here */
1632
1633 /* Start of subject unless notbol, or after internal newline if multiline */
1634
1635 case OP_CIRC:
1636 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1637 if ((ims & PCRE_MULTILINE) != 0)
1638 {
1639 if (eptr != md->start_subject &&
1640 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1641 MRRETURN(MATCH_NOMATCH);
1642 ecode++;
1643 break;
1644 }
1645 /* ... else fall through */
1646
1647 /* Start of subject assertion */
1648
1649 case OP_SOD:
1650 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1651 ecode++;
1652 break;
1653
1654 /* Start of match assertion */
1655
1656 case OP_SOM:
1657 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1658 ecode++;
1659 break;
1660
1661 /* Reset the start of match point */
1662
1663 case OP_SET_SOM:
1664 mstart = eptr;
1665 ecode++;
1666 break;
1667
1668 /* Assert before internal newline if multiline, or before a terminating
1669 newline unless endonly is set, else end of subject unless noteol is set. */
1670
1671 case OP_DOLL:
1672 if ((ims & PCRE_MULTILINE) != 0)
1673 {
1674 if (eptr < md->end_subject)
1675 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1676 else
1677 { if (md->noteol) MRRETURN(MATCH_NOMATCH); }
1678 ecode++;
1679 break;
1680 }
1681 else
1682 {
1683 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1684 if (!md->endonly)
1685 {
1686 if (eptr != md->end_subject &&
1687 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1688 MRRETURN(MATCH_NOMATCH);
1689 ecode++;
1690 break;
1691 }
1692 }
1693 /* ... else fall through for endonly */
1694
1695 /* End of subject assertion (\z) */
1696
1697 case OP_EOD:
1698 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1699 ecode++;
1700 break;
1701
1702 /* End of subject or ending \n assertion (\Z) */
1703
1704 case OP_EODN:
1705 if (eptr != md->end_subject &&
1706 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1707 MRRETURN(MATCH_NOMATCH);
1708 ecode++;
1709 break;
1710
1711 /* Word boundary assertions */
1712
1713 case OP_NOT_WORD_BOUNDARY:
1714 case OP_WORD_BOUNDARY:
1715 {
1716
1717 /* Find out if the previous and current characters are "word" characters.
1718 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1719 be "non-word" characters. Remember the earliest consulted character for
1720 partial matching. */
1721
1722 #ifdef SUPPORT_UTF8
1723 if (utf8)
1724 {
1725 /* Get status of previous character */
1726
1727 if (eptr == md->start_subject) prev_is_word = FALSE; else
1728 {
1729 USPTR lastptr = eptr - 1;
1730 while((*lastptr & 0xc0) == 0x80) lastptr--;
1731 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1732 GETCHAR(c, lastptr);
1733 #ifdef SUPPORT_UCP
1734 if (md->use_ucp)
1735 {
1736 if (c == '_') prev_is_word = TRUE; else
1737 {
1738 int cat = UCD_CATEGORY(c);
1739 prev_is_word = (cat == ucp_L || cat == ucp_N);
1740 }
1741 }
1742 else
1743 #endif
1744 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1745 }
1746
1747 /* Get status of next character */
1748
1749 if (eptr >= md->end_subject)
1750 {
1751 SCHECK_PARTIAL();
1752 cur_is_word = FALSE;
1753 }
1754 else
1755 {
1756 GETCHAR(c, eptr);
1757 #ifdef SUPPORT_UCP
1758 if (md->use_ucp)
1759 {
1760 if (c == '_') cur_is_word = TRUE; else
1761 {
1762 int cat = UCD_CATEGORY(c);
1763 cur_is_word = (cat == ucp_L || cat == ucp_N);
1764 }
1765 }
1766 else
1767 #endif
1768 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1769 }
1770 }
1771 else
1772 #endif
1773
1774 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1775 consistency with the behaviour of \w we do use it in this case. */
1776
1777 {
1778 /* Get status of previous character */
1779
1780 if (eptr == md->start_subject) prev_is_word = FALSE; else
1781 {
1782 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1783 #ifdef SUPPORT_UCP
1784 if (md->use_ucp)
1785 {
1786 c = eptr[-1];
1787 if (c == '_') prev_is_word = TRUE; else
1788 {
1789 int cat = UCD_CATEGORY(c);
1790 prev_is_word = (cat == ucp_L || cat == ucp_N);
1791 }
1792 }
1793 else
1794 #endif
1795 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1796 }
1797
1798 /* Get status of next character */
1799
1800 if (eptr >= md->end_subject)
1801 {
1802 SCHECK_PARTIAL();
1803 cur_is_word = FALSE;
1804 }
1805 else
1806 #ifdef SUPPORT_UCP
1807 if (md->use_ucp)
1808 {
1809 c = *eptr;
1810 if (c == '_') cur_is_word = TRUE; else
1811 {
1812 int cat = UCD_CATEGORY(c);
1813 cur_is_word = (cat == ucp_L || cat == ucp_N);
1814 }
1815 }
1816 else
1817 #endif
1818 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1819 }
1820
1821 /* Now see if the situation is what we want */
1822
1823 if ((*ecode++ == OP_WORD_BOUNDARY)?
1824 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1825 MRRETURN(MATCH_NOMATCH);
1826 }
1827 break;
1828
1829 /* Match a single character type; inline for speed */
1830
1831 case OP_ANY:
1832 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1833 /* Fall through */
1834
1835 case OP_ALLANY:
1836 if (eptr++ >= md->end_subject)
1837 {
1838 SCHECK_PARTIAL();
1839 MRRETURN(MATCH_NOMATCH);
1840 }
1841 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1842 ecode++;
1843 break;
1844
1845 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1846 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1847
1848 case OP_ANYBYTE:
1849 if (eptr++ >= md->end_subject)
1850 {
1851 SCHECK_PARTIAL();
1852 MRRETURN(MATCH_NOMATCH);
1853 }
1854 ecode++;
1855 break;
1856
1857 case OP_NOT_DIGIT:
1858 if (eptr >= md->end_subject)
1859 {
1860 SCHECK_PARTIAL();
1861 MRRETURN(MATCH_NOMATCH);
1862 }
1863 GETCHARINCTEST(c, eptr);
1864 if (
1865 #ifdef SUPPORT_UTF8
1866 c < 256 &&
1867 #endif
1868 (md->ctypes[c] & ctype_digit) != 0
1869 )
1870 MRRETURN(MATCH_NOMATCH);
1871 ecode++;
1872 break;
1873
1874 case OP_DIGIT:
1875 if (eptr >= md->end_subject)
1876 {
1877 SCHECK_PARTIAL();
1878 MRRETURN(MATCH_NOMATCH);
1879 }
1880 GETCHARINCTEST(c, eptr);
1881 if (
1882 #ifdef SUPPORT_UTF8
1883 c >= 256 ||
1884 #endif
1885 (md->ctypes[c] & ctype_digit) == 0
1886 )
1887 MRRETURN(MATCH_NOMATCH);
1888 ecode++;
1889 break;
1890
1891 case OP_NOT_WHITESPACE:
1892 if (eptr >= md->end_subject)
1893 {
1894 SCHECK_PARTIAL();
1895 MRRETURN(MATCH_NOMATCH);
1896 }
1897 GETCHARINCTEST(c, eptr);
1898 if (
1899 #ifdef SUPPORT_UTF8
1900 c < 256 &&
1901 #endif
1902 (md->ctypes[c] & ctype_space) != 0
1903 )
1904 MRRETURN(MATCH_NOMATCH);
1905 ecode++;
1906 break;
1907
1908 case OP_WHITESPACE:
1909 if (eptr >= md->end_subject)
1910 {
1911 SCHECK_PARTIAL();
1912 MRRETURN(MATCH_NOMATCH);
1913 }
1914 GETCHARINCTEST(c, eptr);
1915 if (
1916 #ifdef SUPPORT_UTF8
1917 c >= 256 ||
1918 #endif
1919 (md->ctypes[c] & ctype_space) == 0
1920 )
1921 MRRETURN(MATCH_NOMATCH);
1922 ecode++;
1923 break;
1924
1925 case OP_NOT_WORDCHAR:
1926 if (eptr >= md->end_subject)
1927 {
1928 SCHECK_PARTIAL();
1929 MRRETURN(MATCH_NOMATCH);
1930 }
1931 GETCHARINCTEST(c, eptr);
1932 if (
1933 #ifdef SUPPORT_UTF8
1934 c < 256 &&
1935 #endif
1936 (md->ctypes[c] & ctype_word) != 0
1937 )
1938 MRRETURN(MATCH_NOMATCH);
1939 ecode++;
1940 break;
1941
1942 case OP_WORDCHAR:
1943 if (eptr >= md->end_subject)
1944 {
1945 SCHECK_PARTIAL();
1946 MRRETURN(MATCH_NOMATCH);
1947 }
1948 GETCHARINCTEST(c, eptr);
1949 if (
1950 #ifdef SUPPORT_UTF8
1951 c >= 256 ||
1952 #endif
1953 (md->ctypes[c] & ctype_word) == 0
1954 )
1955 MRRETURN(MATCH_NOMATCH);
1956 ecode++;
1957 break;
1958
1959 case OP_ANYNL:
1960 if (eptr >= md->end_subject)
1961 {
1962 SCHECK_PARTIAL();
1963 MRRETURN(MATCH_NOMATCH);
1964 }
1965 GETCHARINCTEST(c, eptr);
1966 switch(c)
1967 {
1968 default: MRRETURN(MATCH_NOMATCH);
1969 case 0x000d:
1970 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1971 break;
1972
1973 case 0x000a:
1974 break;
1975
1976 case 0x000b:
1977 case 0x000c:
1978 case 0x0085:
1979 case 0x2028:
1980 case 0x2029:
1981 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1982 break;
1983 }
1984 ecode++;
1985 break;
1986
1987 case OP_NOT_HSPACE:
1988 if (eptr >= md->end_subject)
1989 {
1990 SCHECK_PARTIAL();
1991 MRRETURN(MATCH_NOMATCH);
1992 }
1993 GETCHARINCTEST(c, eptr);
1994 switch(c)
1995 {
1996 default: break;
1997 case 0x09: /* HT */
1998 case 0x20: /* SPACE */
1999 case 0xa0: /* NBSP */
2000 case 0x1680: /* OGHAM SPACE MARK */
2001 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2002 case 0x2000: /* EN QUAD */
2003 case 0x2001: /* EM QUAD */
2004 case 0x2002: /* EN SPACE */
2005 case 0x2003: /* EM SPACE */
2006 case 0x2004: /* THREE-PER-EM SPACE */
2007 case 0x2005: /* FOUR-PER-EM SPACE */
2008 case 0x2006: /* SIX-PER-EM SPACE */
2009 case 0x2007: /* FIGURE SPACE */
2010 case 0x2008: /* PUNCTUATION SPACE */
2011 case 0x2009: /* THIN SPACE */
2012 case 0x200A: /* HAIR SPACE */
2013 case 0x202f: /* NARROW NO-BREAK SPACE */
2014 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2015 case 0x3000: /* IDEOGRAPHIC SPACE */
2016 MRRETURN(MATCH_NOMATCH);
2017 }
2018 ecode++;
2019 break;
2020
2021 case OP_HSPACE:
2022 if (eptr >= md->end_subject)
2023 {
2024 SCHECK_PARTIAL();
2025 MRRETURN(MATCH_NOMATCH);
2026 }
2027 GETCHARINCTEST(c, eptr);
2028 switch(c)
2029 {
2030 default: MRRETURN(MATCH_NOMATCH);
2031 case 0x09: /* HT */
2032 case 0x20: /* SPACE */
2033 case 0xa0: /* NBSP */
2034 case 0x1680: /* OGHAM SPACE MARK */
2035 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2036 case 0x2000: /* EN QUAD */
2037 case 0x2001: /* EM QUAD */
2038 case 0x2002: /* EN SPACE */
2039 case 0x2003: /* EM SPACE */
2040 case 0x2004: /* THREE-PER-EM SPACE */
2041 case 0x2005: /* FOUR-PER-EM SPACE */
2042 case 0x2006: /* SIX-PER-EM SPACE */
2043 case 0x2007: /* FIGURE SPACE */
2044 case 0x2008: /* PUNCTUATION SPACE */
2045 case 0x2009: /* THIN SPACE */
2046 case 0x200A: /* HAIR SPACE */
2047 case 0x202f: /* NARROW NO-BREAK SPACE */
2048 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2049 case 0x3000: /* IDEOGRAPHIC SPACE */
2050 break;
2051 }
2052 ecode++;
2053 break;
2054
2055 case OP_NOT_VSPACE:
2056 if (eptr >= md->end_subject)
2057 {
2058 SCHECK_PARTIAL();
2059 MRRETURN(MATCH_NOMATCH);
2060 }
2061 GETCHARINCTEST(c, eptr);
2062 switch(c)
2063 {
2064 default: break;
2065 case 0x0a: /* LF */
2066 case 0x0b: /* VT */
2067 case 0x0c: /* FF */
2068 case 0x0d: /* CR */
2069 case 0x85: /* NEL */
2070 case 0x2028: /* LINE SEPARATOR */
2071 case 0x2029: /* PARAGRAPH SEPARATOR */
2072 MRRETURN(MATCH_NOMATCH);
2073 }
2074 ecode++;
2075 break;
2076
2077 case OP_VSPACE:
2078 if (eptr >= md->end_subject)
2079 {
2080 SCHECK_PARTIAL();
2081 MRRETURN(MATCH_NOMATCH);
2082 }
2083 GETCHARINCTEST(c, eptr);
2084 switch(c)
2085 {
2086 default: MRRETURN(MATCH_NOMATCH);
2087 case 0x0a: /* LF */
2088 case 0x0b: /* VT */
2089 case 0x0c: /* FF */
2090 case 0x0d: /* CR */
2091 case 0x85: /* NEL */
2092 case 0x2028: /* LINE SEPARATOR */
2093 case 0x2029: /* PARAGRAPH SEPARATOR */
2094 break;
2095 }
2096 ecode++;
2097 break;
2098
2099 #ifdef SUPPORT_UCP
2100 /* Check the next character by Unicode property. We will get here only
2101 if the support is in the binary; otherwise a compile-time error occurs. */
2102
2103 case OP_PROP:
2104 case OP_NOTPROP:
2105 if (eptr >= md->end_subject)
2106 {
2107 SCHECK_PARTIAL();
2108 MRRETURN(MATCH_NOMATCH);
2109 }
2110 GETCHARINCTEST(c, eptr);
2111 {
2112 const ucd_record *prop = GET_UCD(c);
2113
2114 switch(ecode[1])
2115 {
2116 case PT_ANY:
2117 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2118 break;
2119
2120 case PT_LAMP:
2121 if ((prop->chartype == ucp_Lu ||
2122 prop->chartype == ucp_Ll ||
2123 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2124 MRRETURN(MATCH_NOMATCH);
2125 break;
2126
2127 case PT_GC:
2128 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2129 MRRETURN(MATCH_NOMATCH);
2130 break;
2131
2132 case PT_PC:
2133 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2134 MRRETURN(MATCH_NOMATCH);
2135 break;
2136
2137 case PT_SC:
2138 if ((ecode[2] != prop->script) == (op == OP_PROP))
2139 MRRETURN(MATCH_NOMATCH);
2140 break;
2141
2142 /* These are specials */
2143
2144 case PT_ALNUM:
2145 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2146 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2147 MRRETURN(MATCH_NOMATCH);
2148 break;
2149
2150 case PT_SPACE: /* Perl space */
2151 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2152 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2153 == (op == OP_NOTPROP))
2154 MRRETURN(MATCH_NOMATCH);
2155 break;
2156
2157 case PT_PXSPACE: /* POSIX space */
2158 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2159 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2160 c == CHAR_FF || c == CHAR_CR)
2161 == (op == OP_NOTPROP))
2162 MRRETURN(MATCH_NOMATCH);
2163 break;
2164
2165 case PT_WORD:
2166 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2167 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2168 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2169 MRRETURN(MATCH_NOMATCH);
2170 break;
2171
2172 /* This should never occur */
2173
2174 default:
2175 RRETURN(PCRE_ERROR_INTERNAL);
2176 }
2177
2178 ecode += 3;
2179 }
2180 break;
2181
2182 /* Match an extended Unicode sequence. We will get here only if the support
2183 is in the binary; otherwise a compile-time error occurs. */
2184
2185 case OP_EXTUNI:
2186 if (eptr >= md->end_subject)
2187 {
2188 SCHECK_PARTIAL();
2189 MRRETURN(MATCH_NOMATCH);
2190 }
2191 GETCHARINCTEST(c, eptr);
2192 {
2193 int category = UCD_CATEGORY(c);
2194 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2195 while (eptr < md->end_subject)
2196 {
2197 int len = 1;
2198 if (!utf8) c = *eptr; else
2199 {
2200 GETCHARLEN(c, eptr, len);
2201 }
2202 category = UCD_CATEGORY(c);
2203 if (category != ucp_M) break;
2204 eptr += len;
2205 }
2206 }
2207 ecode++;
2208 break;
2209 #endif
2210
2211
2212 /* Match a back reference, possibly repeatedly. Look past the end of the
2213 item to see if there is repeat information following. The code is similar
2214 to that for character classes, but repeated for efficiency. Then obey
2215 similar code to character type repeats - written out again for speed.
2216 However, if the referenced string is the empty string, always treat
2217 it as matched, any number of times (otherwise there could be infinite
2218 loops). */
2219
2220 case OP_REF:
2221 {
2222 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2223 ecode += 3;
2224
2225 /* If the reference is unset, there are two possibilities:
2226
2227 (a) In the default, Perl-compatible state, set the length to be longer
2228 than the amount of subject left; this ensures that every attempt at a
2229 match fails. We can't just fail here, because of the possibility of
2230 quantifiers with zero minima.
2231
2232 (b) If the JavaScript compatibility flag is set, set the length to zero
2233 so that the back reference matches an empty string.
2234
2235 Otherwise, set the length to the length of what was matched by the
2236 referenced subpattern. */
2237
2238 if (offset >= offset_top || md->offset_vector[offset] < 0)
2239 length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);
2240 else
2241 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2242
2243 /* Set up for repetition, or handle the non-repeated case */
2244
2245 switch (*ecode)
2246 {
2247 case OP_CRSTAR:
2248 case OP_CRMINSTAR:
2249 case OP_CRPLUS:
2250 case OP_CRMINPLUS:
2251 case OP_CRQUERY:
2252 case OP_CRMINQUERY:
2253 c = *ecode++ - OP_CRSTAR;
2254 minimize = (c & 1) != 0;
2255 min = rep_min[c]; /* Pick up values from tables; */
2256 max = rep_max[c]; /* zero for max => infinity */
2257 if (max == 0) max = INT_MAX;
2258 break;
2259
2260 case OP_CRRANGE:
2261 case OP_CRMINRANGE:
2262 minimize = (*ecode == OP_CRMINRANGE);
2263 min = GET2(ecode, 1);
2264 max = GET2(ecode, 3);
2265 if (max == 0) max = INT_MAX;
2266 ecode += 5;
2267 break;
2268
2269 default: /* No repeat follows */
2270 if (!match_ref(offset, eptr, length, md, ims))
2271 {
2272 CHECK_PARTIAL();
2273 MRRETURN(MATCH_NOMATCH);
2274 }
2275 eptr += length;
2276 continue; /* With the main loop */
2277 }
2278
2279 /* If the length of the reference is zero, just continue with the
2280 main loop. */
2281
2282 if (length == 0) continue;
2283
2284 /* First, ensure the minimum number of matches are present. We get back
2285 the length of the reference string explicitly rather than passing the
2286 address of eptr, so that eptr can be a register variable. */
2287
2288 for (i = 1; i <= min; i++)
2289 {
2290 if (!match_ref(offset, eptr, length, md, ims))
2291 {
2292 CHECK_PARTIAL();
2293 MRRETURN(MATCH_NOMATCH);
2294 }
2295 eptr += length;
2296 }
2297
2298 /* If min = max, continue at the same level without recursion.
2299 They are not both allowed to be zero. */
2300
2301 if (min == max) continue;
2302
2303 /* If minimizing, keep trying and advancing the pointer */
2304
2305 if (minimize)
2306 {
2307 for (fi = min;; fi++)
2308 {
2309 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2310 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2311 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2312 if (!match_ref(offset, eptr, length, md, ims))
2313 {
2314 CHECK_PARTIAL();
2315 MRRETURN(MATCH_NOMATCH);
2316 }
2317 eptr += length;
2318 }
2319 /* Control never gets here */
2320 }
2321
2322 /* If maximizing, find the longest string and work backwards */
2323
2324 else
2325 {
2326 pp = eptr;
2327 for (i = min; i < max; i++)
2328 {
2329 if (!match_ref(offset, eptr, length, md, ims))
2330 {
2331 CHECK_PARTIAL();
2332 break;
2333 }
2334 eptr += length;
2335 }
2336 while (eptr >= pp)
2337 {
2338 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2339 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2340 eptr -= length;
2341 }
2342 MRRETURN(MATCH_NOMATCH);
2343 }
2344 }
2345 /* Control never gets here */
2346
2347 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2348 used when all the characters in the class have values in the range 0-255,
2349 and either the matching is caseful, or the characters are in the range
2350 0-127 when UTF-8 processing is enabled. The only difference between
2351 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2352 encountered.
2353
2354 First, look past the end of the item to see if there is repeat information
2355 following. Then obey similar code to character type repeats - written out
2356 again for speed. */
2357
2358 case OP_NCLASS:
2359 case OP_CLASS:
2360 {
2361 data = ecode + 1; /* Save for matching */
2362 ecode += 33; /* Advance past the item */
2363
2364 switch (*ecode)
2365 {
2366 case OP_CRSTAR:
2367 case OP_CRMINSTAR:
2368 case OP_CRPLUS:
2369 case OP_CRMINPLUS:
2370 case OP_CRQUERY:
2371 case OP_CRMINQUERY:
2372 c = *ecode++ - OP_CRSTAR;
2373 minimize = (c & 1) != 0;
2374 min = rep_min[c]; /* Pick up values from tables; */
2375 max = rep_max[c]; /* zero for max => infinity */
2376 if (max == 0) max = INT_MAX;
2377 break;
2378
2379 case OP_CRRANGE:
2380 case OP_CRMINRANGE:
2381 minimize = (*ecode == OP_CRMINRANGE);
2382 min = GET2(ecode, 1);
2383 max = GET2(ecode, 3);
2384 if (max == 0) max = INT_MAX;
2385 ecode += 5;
2386 break;
2387
2388 default: /* No repeat follows */
2389 min = max = 1;
2390 break;
2391 }
2392
2393 /* First, ensure the minimum number of matches are present. */
2394
2395 #ifdef SUPPORT_UTF8
2396 /* UTF-8 mode */
2397 if (utf8)
2398 {
2399 for (i = 1; i <= min; i++)
2400 {
2401 if (eptr >= md->end_subject)
2402 {
2403 SCHECK_PARTIAL();
2404 MRRETURN(MATCH_NOMATCH);
2405 }
2406 GETCHARINC(c, eptr);
2407 if (c > 255)
2408 {
2409 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2410 }
2411 else
2412 {
2413 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2414 }
2415 }
2416 }
2417 else
2418 #endif
2419 /* Not UTF-8 mode */
2420 {
2421 for (i = 1; i <= min; i++)
2422 {
2423 if (eptr >= md->end_subject)
2424 {
2425 SCHECK_PARTIAL();
2426 MRRETURN(MATCH_NOMATCH);
2427 }
2428 c = *eptr++;
2429 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2430 }
2431 }
2432
2433 /* If max == min we can continue with the main loop without the
2434 need to recurse. */
2435
2436 if (min == max) continue;
2437
2438 /* If minimizing, keep testing the rest of the expression and advancing
2439 the pointer while it matches the class. */
2440
2441 if (minimize)
2442 {
2443 #ifdef SUPPORT_UTF8
2444 /* UTF-8 mode */
2445 if (utf8)
2446 {
2447 for (fi = min;; fi++)
2448 {
2449 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2450 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2451 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2452 if (eptr >= md->end_subject)
2453 {
2454 SCHECK_PARTIAL();
2455 MRRETURN(MATCH_NOMATCH);
2456 }
2457 GETCHARINC(c, eptr);
2458 if (c > 255)
2459 {
2460 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2461 }
2462 else
2463 {
2464 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2465 }
2466 }
2467 }
2468 else
2469 #endif
2470 /* Not UTF-8 mode */
2471 {
2472 for (fi = min;; fi++)
2473 {
2474 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2475 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2476 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2477 if (eptr >= md->end_subject)
2478 {
2479 SCHECK_PARTIAL();
2480 MRRETURN(MATCH_NOMATCH);
2481 }
2482 c = *eptr++;
2483 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2484 }
2485 }
2486 /* Control never gets here */
2487 }
2488
2489 /* If maximizing, find the longest possible run, then work backwards. */
2490
2491 else
2492 {
2493 pp = eptr;
2494
2495 #ifdef SUPPORT_UTF8
2496 /* UTF-8 mode */
2497 if (utf8)
2498 {
2499 for (i = min; i < max; i++)
2500 {
2501 int len = 1;
2502 if (eptr >= md->end_subject)
2503 {
2504 SCHECK_PARTIAL();
2505 break;
2506 }
2507 GETCHARLEN(c, eptr, len);
2508 if (c > 255)
2509 {
2510 if (op == OP_CLASS) break;
2511 }
2512 else
2513 {
2514 if ((data[c/8] & (1 << (c&7))) == 0) break;
2515 }
2516 eptr += len;
2517 }
2518 for (;;)
2519 {
2520 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2521 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2522 if (eptr-- == pp) break; /* Stop if tried at original pos */
2523 BACKCHAR(eptr);
2524 }
2525 }
2526 else
2527 #endif
2528 /* Not UTF-8 mode */
2529 {
2530 for (i = min; i < max; i++)
2531 {
2532 if (eptr >= md->end_subject)
2533 {
2534 SCHECK_PARTIAL();
2535 break;
2536 }
2537 c = *eptr;
2538 if ((data[c/8] & (1 << (c&7))) == 0) break;
2539 eptr++;
2540 }
2541 while (eptr >= pp)
2542 {
2543 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2544 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2545 eptr--;
2546 }
2547 }
2548
2549 MRRETURN(MATCH_NOMATCH);
2550 }
2551 }
2552 /* Control never gets here */
2553
2554
2555 /* Match an extended character class. This opcode is encountered only
2556 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2557 mode, because Unicode properties are supported in non-UTF-8 mode. */
2558
2559 #ifdef SUPPORT_UTF8
2560 case OP_XCLASS:
2561 {
2562 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2563 ecode += GET(ecode, 1); /* Advance past the item */
2564
2565 switch (*ecode)
2566 {
2567 case OP_CRSTAR:
2568 case OP_CRMINSTAR:
2569 case OP_CRPLUS:
2570 case OP_CRMINPLUS:
2571 case OP_CRQUERY:
2572 case OP_CRMINQUERY:
2573 c = *ecode++ - OP_CRSTAR;
2574 minimize = (c & 1) != 0;
2575 min = rep_min[c]; /* Pick up values from tables; */
2576 max = rep_max[c]; /* zero for max => infinity */
2577 if (max == 0) max = INT_MAX;
2578 break;
2579
2580 case OP_CRRANGE:
2581 case OP_CRMINRANGE:
2582 minimize = (*ecode == OP_CRMINRANGE);
2583 min = GET2(ecode, 1);
2584 max = GET2(ecode, 3);
2585 if (max == 0) max = INT_MAX;
2586 ecode += 5;
2587 break;
2588
2589 default: /* No repeat follows */
2590 min = max = 1;
2591 break;
2592 }
2593
2594 /* First, ensure the minimum number of matches are present. */
2595
2596 for (i = 1; i <= min; i++)
2597 {
2598 if (eptr >= md->end_subject)
2599 {
2600 SCHECK_PARTIAL();
2601 MRRETURN(MATCH_NOMATCH);
2602 }
2603 GETCHARINCTEST(c, eptr);
2604 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2605 }
2606
2607 /* If max == min we can continue with the main loop without the
2608 need to recurse. */
2609
2610 if (min == max) continue;
2611
2612 /* If minimizing, keep testing the rest of the expression and advancing
2613 the pointer while it matches the class. */
2614
2615 if (minimize)
2616 {
2617 for (fi = min;; fi++)
2618 {
2619 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2620 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2621 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2622 if (eptr >= md->end_subject)
2623 {
2624 SCHECK_PARTIAL();
2625 MRRETURN(MATCH_NOMATCH);
2626 }
2627 GETCHARINCTEST(c, eptr);
2628 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2629 }
2630 /* Control never gets here */
2631 }
2632
2633 /* If maximizing, find the longest possible run, then work backwards. */
2634
2635 else
2636 {
2637 pp = eptr;
2638 for (i = min; i < max; i++)
2639 {
2640 int len = 1;
2641 if (eptr >= md->end_subject)
2642 {
2643 SCHECK_PARTIAL();
2644 break;
2645 }
2646 GETCHARLENTEST(c, eptr, len);
2647 if (!_pcre_xclass(c, data)) break;
2648 eptr += len;
2649 }
2650 for(;;)
2651 {
2652 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2653 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2654 if (eptr-- == pp) break; /* Stop if tried at original pos */
2655 if (utf8) BACKCHAR(eptr);
2656 }
2657 MRRETURN(MATCH_NOMATCH);
2658 }
2659
2660 /* Control never gets here */
2661 }
2662 #endif /* End of XCLASS */
2663
2664 /* Match a single character, casefully */
2665
2666 case OP_CHAR:
2667 #ifdef SUPPORT_UTF8
2668 if (utf8)
2669 {
2670 length = 1;
2671 ecode++;
2672 GETCHARLEN(fc, ecode, length);
2673 if (length > md->end_subject - eptr)
2674 {
2675 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2676 MRRETURN(MATCH_NOMATCH);
2677 }
2678 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2679 }
2680 else
2681 #endif
2682
2683 /* Non-UTF-8 mode */
2684 {
2685 if (md->end_subject - eptr < 1)
2686 {
2687 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2688 MRRETURN(MATCH_NOMATCH);
2689 }
2690 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2691 ecode += 2;
2692 }
2693 break;
2694
2695 /* Match a single character, caselessly */
2696
2697 case OP_CHARNC:
2698 #ifdef SUPPORT_UTF8
2699 if (utf8)
2700 {
2701 length = 1;
2702 ecode++;
2703 GETCHARLEN(fc, ecode, length);
2704
2705 if (length > md->end_subject - eptr)
2706 {
2707 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2708 MRRETURN(MATCH_NOMATCH);
2709 }
2710
2711 /* If the pattern character's value is < 128, we have only one byte, and
2712 can use the fast lookup table. */
2713
2714 if (fc < 128)
2715 {
2716 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2717 }
2718
2719 /* Otherwise we must pick up the subject character */
2720
2721 else
2722 {
2723 unsigned int dc;
2724 GETCHARINC(dc, eptr);
2725 ecode += length;
2726
2727 /* If we have Unicode property support, we can use it to test the other
2728 case of the character, if there is one. */
2729
2730 if (fc != dc)
2731 {
2732 #ifdef SUPPORT_UCP
2733 if (dc != UCD_OTHERCASE(fc))
2734 #endif
2735 MRRETURN(MATCH_NOMATCH);
2736 }
2737 }
2738 }
2739 else
2740 #endif /* SUPPORT_UTF8 */
2741
2742 /* Non-UTF-8 mode */
2743 {
2744 if (md->end_subject - eptr < 1)
2745 {
2746 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2747 MRRETURN(MATCH_NOMATCH);
2748 }
2749 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2750 ecode += 2;
2751 }
2752 break;
2753
2754 /* Match a single character repeatedly. */
2755
2756 case OP_EXACT:
2757 min = max = GET2(ecode, 1);
2758 ecode += 3;
2759 goto REPEATCHAR;
2760
2761 case OP_POSUPTO:
2762 possessive = TRUE;
2763 /* Fall through */
2764
2765 case OP_UPTO:
2766 case OP_MINUPTO:
2767 min = 0;
2768 max = GET2(ecode, 1);
2769 minimize = *ecode == OP_MINUPTO;
2770 ecode += 3;
2771 goto REPEATCHAR;
2772
2773 case OP_POSSTAR:
2774 possessive = TRUE;
2775 min = 0;
2776 max = INT_MAX;
2777 ecode++;
2778 goto REPEATCHAR;
2779
2780 case OP_POSPLUS:
2781 possessive = TRUE;
2782 min = 1;
2783 max = INT_MAX;
2784 ecode++;
2785 goto REPEATCHAR;
2786
2787 case OP_POSQUERY:
2788 possessive = TRUE;
2789 min = 0;
2790 max = 1;
2791 ecode++;
2792 goto REPEATCHAR;
2793
2794 case OP_STAR:
2795 case OP_MINSTAR:
2796 case OP_PLUS:
2797 case OP_MINPLUS:
2798 case OP_QUERY:
2799 case OP_MINQUERY:
2800 c = *ecode++ - OP_STAR;
2801 minimize = (c & 1) != 0;
2802
2803 min = rep_min[c]; /* Pick up values from tables; */
2804 max = rep_max[c]; /* zero for max => infinity */
2805 if (max == 0) max = INT_MAX;
2806
2807 /* Common code for all repeated single-character matches. */
2808
2809 REPEATCHAR:
2810 #ifdef SUPPORT_UTF8
2811 if (utf8)
2812 {
2813 length = 1;
2814 charptr = ecode;
2815 GETCHARLEN(fc, ecode, length);
2816 ecode += length;
2817
2818 /* Handle multibyte character matching specially here. There is
2819 support for caseless matching if UCP support is present. */
2820
2821 if (length > 1)
2822 {
2823 #ifdef SUPPORT_UCP
2824 unsigned int othercase;
2825 if ((ims & PCRE_CASELESS) != 0 &&
2826 (othercase = UCD_OTHERCASE(fc)) != fc)
2827 oclength = _pcre_ord2utf8(othercase, occhars);
2828 else oclength = 0;
2829 #endif /* SUPPORT_UCP */
2830
2831 for (i = 1; i <= min; i++)
2832 {
2833 if (eptr <= md->end_subject - length &&
2834 memcmp(eptr, charptr, length) == 0) eptr += length;
2835 #ifdef SUPPORT_UCP
2836 else if (oclength > 0 &&
2837 eptr <= md->end_subject - oclength &&
2838 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2839 #endif /* SUPPORT_UCP */
2840 else
2841 {
2842 CHECK_PARTIAL();
2843 MRRETURN(MATCH_NOMATCH);
2844 }
2845 }
2846
2847 if (min == max) continue;
2848
2849 if (minimize)
2850 {
2851 for (fi = min;; fi++)
2852 {
2853 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2854 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2855 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2856 if (eptr <= md->end_subject - length &&
2857 memcmp(eptr, charptr, length) == 0) eptr += length;
2858 #ifdef SUPPORT_UCP
2859 else if (oclength > 0 &&
2860 eptr <= md->end_subject - oclength &&
2861 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2862 #endif /* SUPPORT_UCP */
2863 else
2864 {
2865 CHECK_PARTIAL();
2866 MRRETURN(MATCH_NOMATCH);
2867 }
2868 }
2869 /* Control never gets here */
2870 }
2871
2872 else /* Maximize */
2873 {
2874 pp = eptr;
2875 for (i = min; i < max; i++)
2876 {
2877 if (eptr <= md->end_subject - length &&
2878 memcmp(eptr, charptr, length) == 0) eptr += length;
2879 #ifdef SUPPORT_UCP
2880 else if (oclength > 0 &&
2881 eptr <= md->end_subject - oclength &&
2882 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2883 #endif /* SUPPORT_UCP */
2884 else
2885 {
2886 CHECK_PARTIAL();
2887 break;
2888 }
2889 }
2890
2891 if (possessive) continue;
2892
2893 for(;;)
2894 {
2895 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2896 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2897 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
2898 #ifdef SUPPORT_UCP
2899 eptr--;
2900 BACKCHAR(eptr);
2901 #else /* without SUPPORT_UCP */
2902 eptr -= length;
2903 #endif /* SUPPORT_UCP */
2904 }
2905 }
2906 /* Control never gets here */
2907 }
2908
2909 /* If the length of a UTF-8 character is 1, we fall through here, and
2910 obey the code as for non-UTF-8 characters below, though in this case the
2911 value of fc will always be < 128. */
2912 }
2913 else
2914 #endif /* SUPPORT_UTF8 */
2915
2916 /* When not in UTF-8 mode, load a single-byte character. */
2917
2918 fc = *ecode++;
2919
2920 /* The value of fc at this point is always less than 256, though we may or
2921 may not be in UTF-8 mode. The code is duplicated for the caseless and
2922 caseful cases, for speed, since matching characters is likely to be quite
2923 common. First, ensure the minimum number of matches are present. If min =
2924 max, continue at the same level without recursing. Otherwise, if
2925 minimizing, keep trying the rest of the expression and advancing one
2926 matching character if failing, up to the maximum. Alternatively, if
2927 maximizing, find the maximum number of characters and work backwards. */
2928
2929 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2930 max, eptr));
2931
2932 if ((ims & PCRE_CASELESS) != 0)
2933 {
2934 fc = md->lcc[fc];
2935 for (i = 1; i <= min; i++)
2936 {
2937 if (eptr >= md->end_subject)
2938 {
2939 SCHECK_PARTIAL();
2940 MRRETURN(MATCH_NOMATCH);
2941 }
2942 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2943 }
2944 if (min == max) continue;
2945 if (minimize)
2946 {
2947 for (fi = min;; fi++)
2948 {
2949 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2950 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2951 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2952 if (eptr >= md->end_subject)
2953 {
2954 SCHECK_PARTIAL();
2955 MRRETURN(MATCH_NOMATCH);
2956 }
2957 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2958 }
2959 /* Control never gets here */
2960 }
2961 else /* Maximize */
2962 {
2963 pp = eptr;
2964 for (i = min; i < max; i++)
2965 {
2966 if (eptr >= md->end_subject)
2967 {
2968 SCHECK_PARTIAL();
2969 break;
2970 }
2971 if (fc != md->lcc[*eptr]) break;
2972 eptr++;
2973 }
2974
2975 if (possessive) continue;
2976
2977 while (eptr >= pp)
2978 {
2979 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2980 eptr--;
2981 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2982 }
2983 MRRETURN(MATCH_NOMATCH);
2984 }
2985 /* Control never gets here */
2986 }
2987
2988 /* Caseful comparisons (includes all multi-byte characters) */
2989
2990 else
2991 {
2992 for (i = 1; i <= min; i++)
2993 {
2994 if (eptr >= md->end_subject)
2995 {
2996 SCHECK_PARTIAL();
2997 MRRETURN(MATCH_NOMATCH);
2998 }
2999 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3000 }
3001
3002 if (min == max) continue;
3003
3004 if (minimize)
3005 {
3006 for (fi = min;; fi++)
3007 {
3008 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
3009 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3010 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3011 if (eptr >= md->end_subject)
3012 {
3013 SCHECK_PARTIAL();
3014 MRRETURN(MATCH_NOMATCH);
3015 }
3016 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3017 }
3018 /* Control never gets here */
3019 }
3020 else /* Maximize */
3021 {
3022 pp = eptr;
3023 for (i = min; i < max; i++)
3024 {
3025 if (eptr >= md->end_subject)
3026 {
3027 SCHECK_PARTIAL();
3028 break;
3029 }
3030 if (fc != *eptr) break;
3031 eptr++;
3032 }
3033 if (possessive) continue;
3034
3035 while (eptr >= pp)
3036 {
3037 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
3038 eptr--;
3039 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3040 }
3041 MRRETURN(MATCH_NOMATCH);
3042 }
3043 }
3044 /* Control never gets here */
3045
3046 /* Match a negated single one-byte character. The character we are
3047 checking can be multibyte. */
3048
3049 case OP_NOT:
3050 if (eptr >= md->end_subject)
3051 {
3052 SCHECK_PARTIAL();
3053 MRRETURN(MATCH_NOMATCH);
3054 }
3055 ecode++;
3056 GETCHARINCTEST(c, eptr);
3057 if ((ims & PCRE_CASELESS) != 0)
3058 {
3059 #ifdef SUPPORT_UTF8
3060 if (c < 256)
3061 #endif
3062 c = md->lcc[c];
3063 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3064 }
3065 else
3066 {
3067 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3068 }
3069 break;
3070
3071 /* Match a negated single one-byte character repeatedly. This is almost a
3072 repeat of the code for a repeated single character, but I haven't found a
3073 nice way of commoning these up that doesn't require a test of the
3074 positive/negative option for each character match. Maybe that wouldn't add
3075 very much to the time taken, but character matching *is* what this is all
3076 about... */
3077
3078 case OP_NOTEXACT:
3079 min = max = GET2(ecode, 1);
3080 ecode += 3;
3081 goto REPEATNOTCHAR;
3082
3083 case OP_NOTUPTO:
3084 case OP_NOTMINUPTO:
3085 min = 0;
3086 max = GET2(ecode, 1);
3087 minimize = *ecode == OP_NOTMINUPTO;
3088 ecode += 3;
3089 goto REPEATNOTCHAR;
3090
3091 case OP_NOTPOSSTAR:
3092 possessive = TRUE;
3093 min = 0;
3094 max = INT_MAX;
3095 ecode++;
3096 goto REPEATNOTCHAR;
3097
3098 case OP_NOTPOSPLUS:
3099 possessive = TRUE;
3100 min = 1;
3101 max = INT_MAX;
3102 ecode++;
3103 goto REPEATNOTCHAR;
3104
3105 case OP_NOTPOSQUERY:
3106 possessive = TRUE;
3107 min = 0;
3108 max = 1;
3109 ecode++;
3110 goto REPEATNOTCHAR;
3111
3112 case OP_NOTPOSUPTO:
3113 possessive = TRUE;
3114 min = 0;
3115 max = GET2(ecode, 1);
3116 ecode += 3;
3117 goto REPEATNOTCHAR;
3118
3119 case OP_NOTSTAR:
3120 case OP_NOTMINSTAR:
3121 case OP_NOTPLUS:
3122 case OP_NOTMINPLUS:
3123 case OP_NOTQUERY:
3124 case OP_NOTMINQUERY:
3125 c = *ecode++ - OP_NOTSTAR;
3126 minimize = (c & 1) != 0;
3127 min = rep_min[c]; /* Pick up values from tables; */
3128 max = rep_max[c]; /* zero for max => infinity */
3129 if (max == 0) max = INT_MAX;
3130
3131 /* Common code for all repeated single-byte matches. */
3132
3133 REPEATNOTCHAR:
3134 fc = *ecode++;
3135
3136 /* The code is duplicated for the caseless and caseful cases, for speed,
3137 since matching characters is likely to be quite common. First, ensure the
3138 minimum number of matches are present. If min = max, continue at the same
3139 level without recursing. Otherwise, if minimizing, keep trying the rest of
3140 the expression and advancing one matching character if failing, up to the
3141 maximum. Alternatively, if maximizing, find the maximum number of
3142 characters and work backwards. */
3143
3144 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3145 max, eptr));
3146
3147 if ((ims & PCRE_CASELESS) != 0)
3148 {
3149 fc = md->lcc[fc];
3150
3151 #ifdef SUPPORT_UTF8
3152 /* UTF-8 mode */
3153 if (utf8)
3154 {
3155 register unsigned int d;
3156 for (i = 1; i <= min; i++)
3157 {
3158 if (eptr >= md->end_subject)
3159 {
3160 SCHECK_PARTIAL();
3161 MRRETURN(MATCH_NOMATCH);
3162 }
3163 GETCHARINC(d, eptr);
3164 if (d < 256) d = md->lcc[d];
3165 if (fc == d) MRRETURN(MATCH_NOMATCH);
3166 }
3167 }
3168 else
3169 #endif
3170
3171 /* Not UTF-8 mode */
3172 {
3173 for (i = 1; i <= min; i++)
3174 {
3175 if (eptr >= md->end_subject)
3176 {
3177 SCHECK_PARTIAL();
3178 MRRETURN(MATCH_NOMATCH);
3179 }
3180 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3181 }
3182 }
3183
3184 if (min == max) continue;
3185
3186 if (minimize)
3187 {
3188 #ifdef SUPPORT_UTF8
3189 /* UTF-8 mode */
3190 if (utf8)
3191 {
3192 register unsigned int d;
3193 for (fi = min;; fi++)
3194 {
3195 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3196 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3197 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3198 if (eptr >= md->end_subject)
3199 {
3200 SCHECK_PARTIAL();
3201 MRRETURN(MATCH_NOMATCH);
3202 }
3203 GETCHARINC(d, eptr);
3204 if (d < 256) d = md->lcc[d];
3205 if (fc == d) MRRETURN(MATCH_NOMATCH);
3206 }
3207 }
3208 else
3209 #endif
3210 /* Not UTF-8 mode */
3211 {
3212 for (fi = min;; fi++)
3213 {
3214 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3215 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3216 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3217 if (eptr >= md->end_subject)
3218 {
3219 SCHECK_PARTIAL();
3220 MRRETURN(MATCH_NOMATCH);
3221 }
3222 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3223 }
3224 }
3225 /* Control never gets here */
3226 }
3227
3228 /* Maximize case */
3229
3230 else
3231 {
3232 pp = eptr;
3233
3234 #ifdef SUPPORT_UTF8
3235 /* UTF-8 mode */
3236 if (utf8)
3237 {
3238 register unsigned int d;
3239 for (i = min; i < max; i++)
3240 {
3241 int len = 1;
3242 if (eptr >= md->end_subject)
3243 {
3244 SCHECK_PARTIAL();
3245 break;
3246 }
3247 GETCHARLEN(d, eptr, len);
3248 if (d < 256) d = md->lcc[d];
3249 if (fc == d) break;
3250 eptr += len;
3251 }
3252 if (possessive) continue;
3253 for(;;)
3254 {
3255 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3256 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3257 if (eptr-- == pp) break; /* Stop if tried at original pos */
3258 BACKCHAR(eptr);
3259 }
3260 }
3261 else
3262 #endif
3263 /* Not UTF-8 mode */
3264 {
3265 for (i = min; i < max; i++)
3266 {
3267 if (eptr >= md->end_subject)
3268 {
3269 SCHECK_PARTIAL();
3270 break;
3271 }
3272 if (fc == md->lcc[*eptr]) break;
3273 eptr++;
3274 }
3275 if (possessive) continue;
3276 while (eptr >= pp)
3277 {
3278 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3279 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3280 eptr--;
3281 }
3282 }
3283
3284 MRRETURN(MATCH_NOMATCH);
3285 }
3286 /* Control never gets here */
3287 }
3288
3289 /* Caseful comparisons */
3290
3291 else
3292 {
3293 #ifdef SUPPORT_UTF8
3294 /* UTF-8 mode */
3295 if (utf8)
3296 {
3297 register unsigned int d;
3298 for (i = 1; i <= min; i++)
3299 {
3300 if (eptr >= md->end_subject)
3301 {
3302 SCHECK_PARTIAL();
3303 MRRETURN(MATCH_NOMATCH);
3304 }
3305 GETCHARINC(d, eptr);
3306 if (fc == d) MRRETURN(MATCH_NOMATCH);
3307 }
3308 }
3309 else
3310 #endif
3311 /* Not UTF-8 mode */
3312 {
3313 for (i = 1; i <= min; i++)
3314 {
3315 if (eptr >= md->end_subject)
3316 {
3317 SCHECK_PARTIAL();
3318 MRRETURN(MATCH_NOMATCH);
3319 }
3320 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3321 }
3322 }
3323
3324 if (min == max) continue;
3325
3326 if (minimize)
3327 {
3328 #ifdef SUPPORT_UTF8
3329 /* UTF-8 mode */
3330 if (utf8)
3331 {
3332 register unsigned int d;
3333 for (fi = min;; fi++)
3334 {
3335 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3336 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3337 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3338 if (eptr >= md->end_subject)
3339 {
3340 SCHECK_PARTIAL();
3341 MRRETURN(MATCH_NOMATCH);
3342 }
3343 GETCHARINC(d, eptr);
3344 if (fc == d) MRRETURN(MATCH_NOMATCH);
3345 }
3346 }
3347 else
3348 #endif
3349 /* Not UTF-8 mode */
3350 {
3351 for (fi = min;; fi++)
3352 {
3353 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3354 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3355 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3356 if (eptr >= md->end_subject)
3357 {
3358 SCHECK_PARTIAL();
3359 MRRETURN(MATCH_NOMATCH);
3360 }
3361 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3362 }
3363 }
3364 /* Control never gets here */
3365 }
3366
3367 /* Maximize case */
3368
3369 else
3370 {
3371 pp = eptr;
3372
3373 #ifdef SUPPORT_UTF8
3374 /* UTF-8 mode */
3375 if (utf8)
3376 {
3377 register unsigned int d;
3378 for (i = min; i < max; i++)
3379 {
3380 int len = 1;
3381 if (eptr >= md->end_subject)
3382 {
3383 SCHECK_PARTIAL();
3384 break;
3385 }
3386 GETCHARLEN(d, eptr, len);
3387 if (fc == d) break;
3388 eptr += len;
3389 }
3390 if (possessive) continue;
3391 for(;;)
3392 {
3393 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3394 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3395 if (eptr-- == pp) break; /* Stop if tried at original pos */
3396 BACKCHAR(eptr);
3397 }
3398 }
3399 else
3400 #endif
3401 /* Not UTF-8 mode */
3402 {
3403 for (i = min; i < max; i++)
3404 {
3405 if (eptr >= md->end_subject)
3406 {
3407 SCHECK_PARTIAL();
3408 break;
3409 }
3410 if (fc == *eptr) break;
3411 eptr++;
3412 }
3413 if (possessive) continue;
3414 while (eptr >= pp)
3415 {
3416 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3417 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3418 eptr--;
3419 }
3420 }
3421
3422 MRRETURN(MATCH_NOMATCH);
3423 }
3424 }
3425 /* Control never gets here */
3426
3427 /* Match a single character type repeatedly; several different opcodes
3428 share code. This is very similar to the code for single characters, but we
3429 repeat it in the interests of efficiency. */
3430
3431 case OP_TYPEEXACT:
3432 min = max = GET2(ecode, 1);
3433 minimize = TRUE;
3434 ecode += 3;
3435 goto REPEATTYPE;
3436
3437 case OP_TYPEUPTO:
3438 case OP_TYPEMINUPTO:
3439 min = 0;
3440 max = GET2(ecode, 1);
3441 minimize = *ecode == OP_TYPEMINUPTO;
3442 ecode += 3;
3443 goto REPEATTYPE;
3444
3445 case OP_TYPEPOSSTAR:
3446 possessive = TRUE;
3447 min = 0;
3448 max = INT_MAX;
3449 ecode++;
3450 goto REPEATTYPE;
3451
3452 case OP_TYPEPOSPLUS:
3453 possessive = TRUE;
3454 min = 1;
3455 max = INT_MAX;
3456 ecode++;
3457 goto REPEATTYPE;
3458
3459 case OP_TYPEPOSQUERY:
3460 possessive = TRUE;
3461 min = 0;
3462 max = 1;
3463 ecode++;
3464 goto REPEATTYPE;
3465
3466 case OP_TYPEPOSUPTO:
3467 possessive = TRUE;
3468 min = 0;
3469 max = GET2(ecode, 1);
3470 ecode += 3;
3471 goto REPEATTYPE;
3472
3473 case OP_TYPESTAR:
3474 case OP_TYPEMINSTAR:
3475 case OP_TYPEPLUS:
3476 case OP_TYPEMINPLUS:
3477 case OP_TYPEQUERY:
3478 case OP_TYPEMINQUERY:
3479 c = *ecode++ - OP_TYPESTAR;
3480 minimize = (c & 1) != 0;
3481 min = rep_min[c]; /* Pick up values from tables; */
3482 max = rep_max[c]; /* zero for max => infinity */
3483 if (max == 0) max = INT_MAX;
3484
3485 /* Common code for all repeated single character type matches. Note that
3486 in UTF-8 mode, '.' matches a character of any length, but for the other
3487 character types, the valid characters are all one-byte long. */
3488
3489 REPEATTYPE:
3490 ctype = *ecode++; /* Code for the character type */
3491
3492 #ifdef SUPPORT_UCP
3493 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3494 {
3495 prop_fail_result = ctype == OP_NOTPROP;
3496 prop_type = *ecode++;
3497 prop_value = *ecode++;
3498 }
3499 else prop_type = -1;
3500 #endif
3501
3502 /* First, ensure the minimum number of matches are present. Use inline
3503 code for maximizing the speed, and do the type test once at the start
3504 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3505 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3506 and single-bytes. */
3507
3508 if (min > 0)
3509 {
3510 #ifdef SUPPORT_UCP
3511 if (prop_type >= 0)
3512 {
3513 switch(prop_type)
3514 {
3515 case PT_ANY:
3516 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3517 for (i = 1; i <= min; i++)
3518 {
3519 if (eptr >= md->end_subject)
3520 {
3521 SCHECK_PARTIAL();
3522 MRRETURN(MATCH_NOMATCH);
3523 }
3524 GETCHARINCTEST(c, eptr);
3525 }
3526 break;
3527
3528 case PT_LAMP:
3529 for (i = 1; i <= min; i++)
3530 {
3531 if (eptr >= md->end_subject)
3532 {
3533 SCHECK_PARTIAL();
3534 MRRETURN(MATCH_NOMATCH);
3535 }
3536 GETCHARINCTEST(c, eptr);
3537 prop_chartype = UCD_CHARTYPE(c);
3538 if ((prop_chartype == ucp_Lu ||
3539 prop_chartype == ucp_Ll ||
3540 prop_chartype == ucp_Lt) == prop_fail_result)
3541 MRRETURN(MATCH_NOMATCH);
3542 }
3543 break;
3544
3545 case PT_GC:
3546 for (i = 1; i <= min; i++)
3547 {
3548 if (eptr >= md->end_subject)
3549 {
3550 SCHECK_PARTIAL();
3551 MRRETURN(MATCH_NOMATCH);
3552 }
3553 GETCHARINCTEST(c, eptr);
3554 prop_category = UCD_CATEGORY(c);
3555 if ((prop_category == prop_value) == prop_fail_result)
3556 MRRETURN(MATCH_NOMATCH);
3557 }
3558 break;
3559
3560 case PT_PC:
3561 for (i = 1; i <= min; i++)
3562 {
3563 if (eptr >= md->end_subject)
3564 {
3565 SCHECK_PARTIAL();
3566 MRRETURN(MATCH_NOMATCH);
3567 }
3568 GETCHARINCTEST(c, eptr);
3569 prop_chartype = UCD_CHARTYPE(c);
3570 if ((prop_chartype == prop_value) == prop_fail_result)
3571 MRRETURN(MATCH_NOMATCH);
3572 }
3573 break;
3574
3575 case PT_SC:
3576 for (i = 1; i <= min; i++)
3577 {
3578 if (eptr >= md->end_subject)
3579 {
3580 SCHECK_PARTIAL();
3581 MRRETURN(MATCH_NOMATCH);
3582 }
3583 GETCHARINCTEST(c, eptr);
3584 prop_script = UCD_SCRIPT(c);
3585 if ((prop_script == prop_value) == prop_fail_result)
3586 MRRETURN(MATCH_NOMATCH);
3587 }
3588 break;
3589
3590 case PT_ALNUM:
3591 for (i = 1; i <= min; i++)
3592 {
3593 if (eptr >= md->end_subject)
3594 {
3595 SCHECK_PARTIAL();
3596 MRRETURN(MATCH_NOMATCH);
3597 }
3598 GETCHARINCTEST(c, eptr);
3599 prop_category = UCD_CATEGORY(c);
3600 if ((prop_category == ucp_L || prop_category == ucp_N)
3601 == prop_fail_result)
3602 MRRETURN(MATCH_NOMATCH);
3603 }
3604 break;
3605
3606 case PT_SPACE: /* Perl space */
3607 for (i = 1; i <= min; i++)
3608 {
3609 if (eptr >= md->end_subject)
3610 {
3611 SCHECK_PARTIAL();
3612 MRRETURN(MATCH_NOMATCH);
3613 }
3614 GETCHARINCTEST(c, eptr);
3615 prop_category = UCD_CATEGORY(c);
3616 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3617 c == CHAR_FF || c == CHAR_CR)
3618 == prop_fail_result)
3619 MRRETURN(MATCH_NOMATCH);
3620 }
3621 break;
3622
3623 case PT_PXSPACE: /* POSIX space */
3624 for (i = 1; i <= min; i++)
3625 {
3626 if (eptr >= md->end_subject)
3627 {
3628 SCHECK_PARTIAL();
3629 MRRETURN(MATCH_NOMATCH);
3630 }
3631 GETCHARINCTEST(c, eptr);
3632 prop_category = UCD_CATEGORY(c);
3633 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3634 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3635 == prop_fail_result)
3636 MRRETURN(MATCH_NOMATCH);
3637 }
3638 break;
3639
3640 case PT_WORD:
3641 for (i = 1; i <= min; i++)
3642 {
3643 if (eptr >= md->end_subject)
3644 {
3645 SCHECK_PARTIAL();
3646 MRRETURN(MATCH_NOMATCH);
3647 }
3648 GETCHARINCTEST(c, eptr);
3649 prop_category = UCD_CATEGORY(c);
3650 if ((prop_category == ucp_L || prop_category == ucp_N ||
3651 c == CHAR_UNDERSCORE)
3652 == prop_fail_result)
3653 MRRETURN(MATCH_NOMATCH);
3654 }
3655 break;
3656
3657 /* This should not occur */
3658
3659 default:
3660 RRETURN(PCRE_ERROR_INTERNAL);
3661 }
3662 }
3663
3664 /* Match extended Unicode sequences. We will get here only if the
3665 support is in the binary; otherwise a compile-time error occurs. */
3666
3667 else if (ctype == OP_EXTUNI)
3668 {
3669 for (i = 1; i <= min; i++)
3670 {
3671 if (eptr >= md->end_subject)
3672 {
3673 SCHECK_PARTIAL();
3674 MRRETURN(MATCH_NOMATCH);
3675 }
3676 GETCHARINCTEST(c, eptr);
3677 prop_category = UCD_CATEGORY(c);
3678 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3679 while (eptr < md->end_subject)
3680 {
3681 int len = 1;
3682 if (!utf8) c = *eptr;
3683 else { GETCHARLEN(c, eptr, len); }
3684 prop_category = UCD_CATEGORY(c);
3685 if (prop_category != ucp_M) break;
3686 eptr += len;
3687 }
3688 }
3689 }
3690
3691 else
3692 #endif /* SUPPORT_UCP */
3693
3694 /* Handle all other cases when the coding is UTF-8 */
3695
3696 #ifdef SUPPORT_UTF8
3697 if (utf8) switch(ctype)
3698 {
3699 case OP_ANY:
3700 for (i = 1; i <= min; i++)
3701 {
3702 if (eptr >= md->end_subject)
3703 {
3704 SCHECK_PARTIAL();
3705 MRRETURN(MATCH_NOMATCH);
3706 }
3707 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3708 eptr++;
3709 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3710 }
3711 break;
3712
3713 case OP_ALLANY:
3714 for (i = 1; i <= min; i++)
3715 {
3716 if (eptr >= md->end_subject)
3717 {
3718 SCHECK_PARTIAL();
3719 MRRETURN(MATCH_NOMATCH);
3720 }
3721 eptr++;
3722 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3723 }
3724 break;
3725
3726 case OP_ANYBYTE:
3727 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3728 eptr += min;
3729 break;
3730
3731 case OP_ANYNL:
3732 for (i = 1; i <= min; i++)
3733 {
3734 if (eptr >= md->end_subject)
3735 {
3736 SCHECK_PARTIAL();
3737 MRRETURN(MATCH_NOMATCH);
3738 }
3739 GETCHARINC(c, eptr);
3740 switch(c)
3741 {
3742 default: MRRETURN(MATCH_NOMATCH);
3743 case 0x000d:
3744 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3745 break;
3746
3747 case 0x000a:
3748 break;
3749
3750 case 0x000b:
3751 case 0x000c:
3752 case 0x0085:
3753 case 0x2028:
3754 case 0x2029:
3755 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3756 break;
3757 }
3758 }
3759 break;
3760
3761 case OP_NOT_HSPACE:
3762 for (i = 1; i <= min; i++)
3763 {
3764 if (eptr >= md->end_subject)
3765 {
3766 SCHECK_PARTIAL();
3767 MRRETURN(MATCH_NOMATCH);
3768 }
3769 GETCHARINC(c, eptr);
3770 switch(c)
3771 {
3772 default: break;
3773 case 0x09: /* HT */
3774 case 0x20: /* SPACE */
3775 case 0xa0: /* NBSP */
3776 case 0x1680: /* OGHAM SPACE MARK */
3777 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3778 case 0x2000: /* EN QUAD */
3779 case 0x2001: /* EM QUAD */
3780 case 0x2002: /* EN SPACE */
3781 case 0x2003: /* EM SPACE */
3782 case 0x2004: /* THREE-PER-EM SPACE */
3783 case 0x2005: /* FOUR-PER-EM SPACE */
3784 case 0x2006: /* SIX-PER-EM SPACE */
3785 case 0x2007: /* FIGURE SPACE */
3786 case 0x2008: /* PUNCTUATION SPACE */
3787 case 0x2009: /* THIN SPACE */
3788 case 0x200A: /* HAIR SPACE */
3789 case 0x202f: /* NARROW NO-BREAK SPACE */
3790 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3791 case 0x3000: /* IDEOGRAPHIC SPACE */
3792 MRRETURN(MATCH_NOMATCH);
3793 }
3794 }
3795 break;
3796
3797 case OP_HSPACE:
3798 for (i = 1; i <= min; i++)
3799 {
3800 if (eptr >= md->end_subject)
3801 {
3802 SCHECK_PARTIAL();
3803 MRRETURN(MATCH_NOMATCH);
3804 }
3805 GETCHARINC(c, eptr);
3806 switch(c)
3807 {
3808 default: MRRETURN(MATCH_NOMATCH);
3809 case 0x09: /* HT */
3810 case 0x20: /* SPACE */
3811 case 0xa0: /* NBSP */
3812 case 0x1680: /* OGHAM SPACE MARK */
3813 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3814 case 0x2000: /* EN QUAD */
3815 case 0x2001: /* EM QUAD */
3816 case 0x2002: /* EN SPACE */
3817 case 0x2003: /* EM SPACE */
3818 case 0x2004: /* THREE-PER-EM SPACE */
3819 case 0x2005: /* FOUR-PER-EM SPACE */
3820 case 0x2006: /* SIX-PER-EM SPACE */
3821 case 0x2007: /* FIGURE SPACE */
3822 case 0x2008: /* PUNCTUATION SPACE */
3823 case 0x2009: /* THIN SPACE */
3824 case 0x200A: /* HAIR SPACE */
3825 case 0x202f: /* NARROW NO-BREAK SPACE */
3826 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3827 case 0x3000: /* IDEOGRAPHIC SPACE */
3828 break;
3829 }
3830 }
3831 break;
3832
3833 case OP_NOT_VSPACE:
3834 for (i = 1; i <= min; i++)
3835 {
3836 if (eptr >= md->end_subject)
3837 {
3838 SCHECK_PARTIAL();
3839 MRRETURN(MATCH_NOMATCH);
3840 }
3841 GETCHARINC(c, eptr);
3842 switch(c)
3843 {
3844 default: break;
3845 case 0x0a: /* LF */
3846 case 0x0b: /* VT */
3847 case 0x0c: /* FF */
3848 case 0x0d: /* CR */
3849 case 0x85: /* NEL */
3850 case 0x2028: /* LINE SEPARATOR */
3851 case 0x2029: /* PARAGRAPH SEPARATOR */
3852 MRRETURN(MATCH_NOMATCH);
3853 }
3854 }
3855 break;
3856
3857 case OP_VSPACE:
3858 for (i = 1; i <= min; i++)
3859 {
3860 if (eptr >= md->end_subject)
3861 {
3862 SCHECK_PARTIAL();
3863 MRRETURN(MATCH_NOMATCH);
3864 }
3865 GETCHARINC(c, eptr);
3866 switch(c)
3867 {
3868 default: MRRETURN(MATCH_NOMATCH);
3869 case 0x0a: /* LF */
3870 case 0x0b: /* VT */
3871 case 0x0c: /* FF */
3872 case 0x0d: /* CR */
3873 case 0x85: /* NEL */
3874 case 0x2028: /* LINE SEPARATOR */
3875 case 0x2029: /* PARAGRAPH SEPARATOR */
3876 break;
3877 }
3878 }
3879 break;
3880
3881 case OP_NOT_DIGIT:
3882 for (i = 1; i <= min; i++)
3883 {
3884 if (eptr >= md->end_subject)
3885 {
3886 SCHECK_PARTIAL();
3887 MRRETURN(MATCH_NOMATCH);
3888 }
3889 GETCHARINC(c, eptr);
3890 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3891 MRRETURN(MATCH_NOMATCH);
3892 }
3893 break;
3894
3895 case OP_DIGIT:
3896 for (i = 1; i <= min; i++)
3897 {
3898 if (eptr >= md->end_subject)
3899 {
3900 SCHECK_PARTIAL();
3901 MRRETURN(MATCH_NOMATCH);
3902 }
3903 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3904 MRRETURN(MATCH_NOMATCH);
3905 /* No need to skip more bytes - we know it's a 1-byte character */
3906 }
3907 break;
3908
3909 case OP_NOT_WHITESPACE:
3910 for (i = 1; i <= min; i++)
3911 {
3912 if (eptr >= md->end_subject)
3913 {
3914 SCHECK_PARTIAL();
3915 MRRETURN(MATCH_NOMATCH);
3916 }
3917 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3918 MRRETURN(MATCH_NOMATCH);
3919 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3920 }
3921 break;
3922
3923 case OP_WHITESPACE:
3924 for (i = 1; i <= min; i++)
3925 {
3926 if (eptr >= md->end_subject)
3927 {
3928 SCHECK_PARTIAL();
3929 MRRETURN(MATCH_NOMATCH);
3930 }
3931 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3932 MRRETURN(MATCH_NOMATCH);
3933 /* No need to skip more bytes - we know it's a 1-byte character */
3934 }
3935 break;
3936
3937 case OP_NOT_WORDCHAR:
3938 for (i = 1; i <= min; i++)
3939 {
3940 if (eptr >= md->end_subject)
3941 {
3942 SCHECK_PARTIAL();
3943 MRRETURN(MATCH_NOMATCH);
3944 }
3945 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3946 MRRETURN(MATCH_NOMATCH);
3947 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3948 }
3949 break;
3950
3951 case OP_WORDCHAR:
3952 for (i = 1; i <= min; i++)
3953 {
3954 if (eptr >= md->end_subject)
3955 {
3956 SCHECK_PARTIAL();
3957 MRRETURN(MATCH_NOMATCH);
3958 }
3959 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3960 MRRETURN(MATCH_NOMATCH);
3961 /* No need to skip more bytes - we know it's a 1-byte character */
3962 }
3963 break;
3964
3965 default:
3966 RRETURN(PCRE_ERROR_INTERNAL);
3967 } /* End switch(ctype) */
3968
3969 else
3970 #endif /* SUPPORT_UTF8 */
3971
3972 /* Code for the non-UTF-8 case for minimum matching of operators other
3973 than OP_PROP and OP_NOTPROP. */
3974
3975 switch(ctype)
3976 {
3977 case OP_ANY:
3978 for (i = 1; i <= min; i++)
3979 {
3980 if (eptr >= md->end_subject)
3981 {
3982 SCHECK_PARTIAL();
3983 MRRETURN(MATCH_NOMATCH);
3984 }
3985 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3986 eptr++;
3987 }
3988 break;
3989
3990 case OP_ALLANY:
3991 if (eptr > md->end_subject - min)
3992 {
3993 SCHECK_PARTIAL();
3994 MRRETURN(MATCH_NOMATCH);
3995 }
3996 eptr += min;
3997 break;
3998
3999 case OP_ANYBYTE:
4000 if (eptr > md->end_subject - min)
4001 {
4002 SCHECK_PARTIAL();
4003 MRRETURN(MATCH_NOMATCH);
4004 }
4005 eptr += min;
4006 break;
4007
4008 case OP_ANYNL:
4009 for (i = 1; i <= min; i++)
4010 {
4011 if (eptr >= md->end_subject)
4012 {
4013 SCHECK_PARTIAL();
4014 MRRETURN(MATCH_NOMATCH);
4015 }
4016 switch(*eptr++)
4017 {
4018 default: MRRETURN(MATCH_NOMATCH);
4019 case 0x000d:
4020 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4021 break;
4022 case 0x000a:
4023 break;
4024
4025 case 0x000b:
4026 case 0x000c:
4027 case 0x0085:
4028 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4029 break;
4030 }
4031 }
4032 break;
4033
4034 case OP_NOT_HSPACE:
4035 for (i = 1; i <= min; i++)
4036 {
4037 if (eptr >= md->end_subject)
4038 {
4039 SCHECK_PARTIAL();
4040 MRRETURN(MATCH_NOMATCH);
4041 }
4042 switch(*eptr++)
4043 {
4044 default: break;
4045 case 0x09: /* HT */
4046 case 0x20: /* SPACE */
4047 case 0xa0: /* NBSP */
4048 MRRETURN(MATCH_NOMATCH);
4049 }
4050 }
4051 break;
4052
4053 case OP_HSPACE:
4054 for (i = 1; i <= min; i++)
4055 {
4056 if (eptr >= md->end_subject)
4057 {
4058 SCHECK_PARTIAL();
4059 MRRETURN(MATCH_NOMATCH);
4060 }
4061 switch(*eptr++)
4062 {
4063 default: MRRETURN(MATCH_NOMATCH);
4064 case 0x09: /* HT */
4065 case 0x20: /* SPACE */
4066 case 0xa0: /* NBSP */
4067 break;
4068 }
4069 }
4070 break;
4071
4072 case OP_NOT_VSPACE:
4073 for (i = 1; i <= min; i++)
4074 {
4075 if (eptr >= md->end_subject)
4076 {
4077 SCHECK_PARTIAL();
4078 MRRETURN(MATCH_NOMATCH);
4079 }
4080 switch(*eptr++)
4081 {
4082 default: break;
4083 case 0x0a: /* LF */
4084 case 0x0b: /* VT */
4085 case 0x0c: /* FF */
4086 case 0x0d: /* CR */
4087 case 0x85: /* NEL */
4088 MRRETURN(MATCH_NOMATCH);
4089 }
4090 }
4091 break;
4092
4093 case OP_VSPACE:
4094 for (i = 1; i <= min; i++)
4095 {
4096 if (eptr >= md->end_subject)
4097 {
4098 SCHECK_PARTIAL();
4099 MRRETURN(MATCH_NOMATCH);
4100 }
4101 switch(*eptr++)
4102 {
4103 default: MRRETURN(MATCH_NOMATCH);
4104 case 0x0a: /* LF */
4105 case 0x0b: /* VT */
4106 case 0x0c: /* FF */
4107 case 0x0d: /* CR */
4108 case 0x85: /* NEL */
4109 break;
4110 }
4111 }
4112 break;
4113
4114 case OP_NOT_DIGIT:
4115 for (i = 1; i <= min; i++)
4116 {
4117 if (eptr >= md->end_subject)
4118 {
4119 SCHECK_PARTIAL();
4120 MRRETURN(MATCH_NOMATCH);
4121 }
4122 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4123 }
4124 break;
4125
4126 case OP_DIGIT:
4127 for (i = 1; i <= min; i++)
4128 {
4129 if (eptr >= md->end_subject)
4130 {
4131 SCHECK_PARTIAL();
4132 MRRETURN(MATCH_NOMATCH);
4133 }
4134 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4135 }
4136 break;
4137
4138 case OP_NOT_WHITESPACE:
4139 for (i = 1; i <= min; i++)
4140 {
4141 if (eptr >= md->end_subject)
4142 {
4143 SCHECK_PARTIAL();
4144 MRRETURN(MATCH_NOMATCH);
4145 }
4146 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4147 }
4148 break;
4149
4150 case OP_WHITESPACE:
4151 for (i = 1; i <= min; i++)
4152 {
4153 if (eptr >= md->end_subject)
4154 {
4155 SCHECK_PARTIAL();
4156 MRRETURN(MATCH_NOMATCH);
4157 }
4158 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4159 }
4160 break;
4161
4162 case OP_NOT_WORDCHAR:
4163 for (i = 1; i <= min; i++)
4164 {
4165 if (eptr >= md->end_subject)
4166 {
4167 SCHECK_PARTIAL();
4168 MRRETURN(MATCH_NOMATCH);
4169 }
4170 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4171 MRRETURN(MATCH_NOMATCH);
4172 }
4173 break;
4174
4175 case OP_WORDCHAR:
4176 for (i = 1; i <= min; i++)
4177 {
4178 if (eptr >= md->end_subject)
4179 {
4180 SCHECK_PARTIAL();
4181 MRRETURN(MATCH_NOMATCH);
4182 }
4183 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4184 MRRETURN(MATCH_NOMATCH);
4185 }
4186 break;
4187
4188 default:
4189 RRETURN(PCRE_ERROR_INTERNAL);
4190 }
4191 }
4192
4193 /* If min = max, continue at the same level without recursing */
4194
4195 if (min == max) continue;
4196
4197 /* If minimizing, we have to test the rest of the pattern before each
4198 subsequent match. Again, separate the UTF-8 case for speed, and also
4199 separate the UCP cases. */
4200
4201 if (minimize)
4202 {
4203 #ifdef SUPPORT_UCP
4204 if (prop_type >= 0)
4205 {
4206 switch(prop_type)
4207 {
4208 case PT_ANY:
4209 for (fi = min;; fi++)
4210 {
4211 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
4212 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4213 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4214 if (eptr >= md->end_subject)
4215 {
4216 SCHECK_PARTIAL();
4217 MRRETURN(MATCH_NOMATCH);
4218 }
4219 GETCHARINCTEST(c, eptr);
4220 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4221 }
4222 /* Control never gets here */
4223
4224 case PT_LAMP:
4225 for (fi = min;; fi++)
4226 {
4227 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
4228 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4229 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4230 if (eptr >= md->end_subject)
4231 {
4232 SCHECK_PARTIAL();
4233 MRRETURN(MATCH_NOMATCH);
4234 }
4235 GETCHARINCTEST(c, eptr);
4236 prop_chartype = UCD_CHARTYPE(c);
4237 if ((prop_chartype == ucp_Lu ||
4238 prop_chartype == ucp_Ll ||
4239 prop_chartype == ucp_Lt) == prop_fail_result)
4240 MRRETURN(MATCH_NOMATCH);
4241 }
4242 /* Control never gets here */
4243
4244 case PT_GC:
4245 for (fi = min;; fi++)
4246 {
4247 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4248 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4249 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4250 if (eptr >= md->end_subject)
4251 {
4252 SCHECK_PARTIAL();
4253 MRRETURN(MATCH_NOMATCH);
4254 }
4255 GETCHARINCTEST(c, eptr);
4256 prop_category = UCD_CATEGORY(c);
4257 if ((prop_category == prop_value) == prop_fail_result)
4258 MRRETURN(MATCH_NOMATCH);
4259 }
4260 /* Control never gets here */
4261
4262 case PT_PC:
4263 for (fi = min;; fi++)
4264 {
4265 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4266 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4267 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4268 if (eptr >= md->end_subject)
4269 {
4270 SCHECK_PARTIAL();
4271 MRRETURN(MATCH_NOMATCH);
4272 }
4273 GETCHARINCTEST(c, eptr);
4274 prop_chartype = UCD_CHARTYPE(c);
4275 if ((prop_chartype == prop_value) == prop_fail_result)
4276 MRRETURN(MATCH_NOMATCH);
4277 }
4278 /* Control never gets here */
4279
4280 case PT_SC:
4281 for (fi = min;; fi++)
4282 {
4283 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4284 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4285 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4286 if (eptr >= md->end_subject)
4287 {
4288 SCHECK_PARTIAL();
4289 MRRETURN(MATCH_NOMATCH);
4290 }
4291 GETCHARINCTEST(c, eptr);
4292 prop_script = UCD_SCRIPT(c);
4293 if ((prop_script == prop_value) == prop_fail_result)
4294 MRRETURN(MATCH_NOMATCH);
4295 }
4296 /* Control never gets here */
4297
4298 case PT_ALNUM:
4299 for (fi = min;; fi++)
4300 {
4301 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM59);
4302 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4303 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4304 if (eptr >= md->end_subject)
4305 {
4306 SCHECK_PARTIAL();
4307 MRRETURN(MATCH_NOMATCH);
4308 }
4309 GETCHARINCTEST(c, eptr);
4310 prop_category = UCD_CATEGORY(c);
4311 if ((prop_category == ucp_L || prop_category == ucp_N)
4312 == prop_fail_result)
4313 MRRETURN(MATCH_NOMATCH);
4314 }
4315 /* Control never gets here */
4316
4317 case PT_SPACE: /* Perl space */
4318 for (fi = min;; fi++)
4319 {
4320 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM60);
4321 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4322 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4323 if (eptr >= md->end_subject)
4324 {
4325 SCHECK_PARTIAL();
4326 MRRETURN(MATCH_NOMATCH);
4327 }
4328 GETCHARINCTEST(c, eptr);
4329 prop_category = UCD_CATEGORY(c);
4330 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4331 c == CHAR_FF || c == CHAR_CR)
4332 == prop_fail_result)
4333 MRRETURN(MATCH_NOMATCH);
4334 }
4335 /* Control never gets here */
4336
4337 case PT_PXSPACE: /* POSIX space */
4338 for (fi = min;; fi++)
4339 {
4340 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM61);
4341 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4342 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4343 if (eptr >= md->end_subject)
4344 {
4345 SCHECK_PARTIAL();
4346 MRRETURN(MATCH_NOMATCH);
4347 }
4348 GETCHARINCTEST(c, eptr);
4349 prop_category = UCD_CATEGORY(c);
4350 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4351 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4352 == prop_fail_result)
4353 MRRETURN(MATCH_NOMATCH);
4354 }
4355 /* Control never gets here */
4356
4357 case PT_WORD:
4358 for (fi = min;; fi++)
4359 {
4360 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM62);
4361 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4362 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4363 if (eptr >= md->end_subject)
4364 {
4365 SCHECK_PARTIAL();
4366 MRRETURN(MATCH_NOMATCH);
4367 }
4368 GETCHARINCTEST(c, eptr);
4369 prop_category = UCD_CATEGORY(c);
4370 if ((prop_category == ucp_L ||
4371 prop_category == ucp_N ||
4372 c == CHAR_UNDERSCORE)
4373 == prop_fail_result)
4374 MRRETURN(MATCH_NOMATCH);
4375 }
4376 /* Control never gets here */
4377
4378 /* This should never occur */
4379
4380 default:
4381 RRETURN(PCRE_ERROR_INTERNAL);
4382 }
4383 }
4384
4385 /* Match extended Unicode sequences. We will get here only if the
4386 support is in the binary; otherwise a compile-time error occurs. */
4387
4388 else if (ctype == OP_EXTUNI)
4389 {
4390 for (fi = min;; fi++)
4391 {
4392 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4393 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4394 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4395 if (eptr >= md->end_subject)
4396 {
4397 SCHECK_PARTIAL();
4398 MRRETURN(MATCH_NOMATCH);
4399 }
4400 GETCHARINCTEST(c, eptr);
4401 prop_category = UCD_CATEGORY(c);
4402 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4403 while (eptr < md->end_subject)
4404 {
4405 int len = 1;
4406 if (!utf8) c = *eptr;
4407 else { GETCHARLEN(c, eptr, len); }
4408 prop_category = UCD_CATEGORY(c);
4409 if (prop_category != ucp_M) break;
4410 eptr += len;
4411 }
4412 }
4413 }
4414
4415 else
4416 #endif /* SUPPORT_UCP */
4417
4418 #ifdef SUPPORT_UTF8
4419 /* UTF-8 mode */
4420 if (utf8)
4421 {
4422 for (fi = min;; fi++)
4423 {
4424 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4425 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4426 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4427 if (eptr >= md->end_subject)
4428 {
4429 SCHECK_PARTIAL();
4430 MRRETURN(MATCH_NOMATCH);
4431 }
4432 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4433 MRRETURN(MATCH_NOMATCH);
4434 GETCHARINC(c, eptr);
4435 switch(ctype)
4436 {
4437 case OP_ANY: /* This is the non-NL case */
4438 case OP_ALLANY:
4439 case OP_ANYBYTE:
4440 break;
4441
4442 case OP_ANYNL:
4443 switch(c)
4444 {
4445 default: MRRETURN(MATCH_NOMATCH);
4446 case 0x000d:
4447 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4448 break;
4449 case 0x000a:
4450 break;
4451
4452 case 0x000b:
4453 case 0x000c:
4454 case 0x0085:
4455 case 0x2028:
4456 case 0x2029:
4457 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4458 break;
4459 }
4460 break;
4461
4462 case OP_NOT_HSPACE:
4463 switch(c)
4464 {
4465 default: break;
4466 case 0x09: /* HT */
4467 case 0x20: /* SPACE */
4468 case 0xa0: /* NBSP */
4469 case 0x1680: /* OGHAM SPACE MARK */
4470 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4471 case 0x2000: /* EN QUAD */
4472 case 0x2001: /* EM QUAD */
4473 case 0x2002: /* EN SPACE */
4474 case 0x2003: /* EM SPACE */
4475 case 0x2004: /* THREE-PER-EM SPACE */
4476 case 0x2005: /* FOUR-PER-EM SPACE */
4477 case 0x2006: /* SIX-PER-EM SPACE */
4478 case 0x2007: /* FIGURE SPACE */
4479 case 0x2008: /* PUNCTUATION SPACE */
4480 case 0x2009: /* THIN SPACE */
4481 case 0x200A: /* HAIR SPACE */
4482 case 0x202f: /* NARROW NO-BREAK SPACE */
4483 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4484 case 0x3000: /* IDEOGRAPHIC SPACE */
4485 MRRETURN(MATCH_NOMATCH);
4486 }
4487 break;
4488
4489 case OP_HSPACE:
4490 switch(c)
4491 {
4492 default: MRRETURN(MATCH_NOMATCH);
4493 case 0x09: /* HT */
4494 case 0x20: /* SPACE */
4495 case 0xa0: /* NBSP */
4496 case 0x1680: /* OGHAM SPACE MARK */
4497 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4498 case 0x2000: /* EN QUAD */
4499 case 0x2001: /* EM QUAD */
4500 case 0x2002: /* EN SPACE */
4501 case 0x2003: /* EM SPACE */
4502 case 0x2004: /* THREE-PER-EM SPACE */
4503 case 0x2005: /* FOUR-PER-EM SPACE */
4504 case 0x2006: /* SIX-PER-EM SPACE */
4505 case 0x2007: /* FIGURE SPACE */
4506 case 0x2008: /* PUNCTUATION SPACE */
4507 case 0x2009: /* THIN SPACE */
4508 case 0x200A: /* HAIR SPACE */
4509 case 0x202f: /* NARROW NO-BREAK SPACE */
4510 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4511 case 0x3000: /* IDEOGRAPHIC SPACE */
4512 break;
4513 }
4514 break;
4515
4516 case OP_NOT_VSPACE:
4517 switch(c)
4518 {
4519 default: break;
4520 case 0x0a: /* LF */
4521 case 0x0b: /* VT */
4522 case 0x0c: /* FF */
4523 case 0x0d: /* CR */
4524 case 0x85: /* NEL */
4525 case 0x2028: /* LINE SEPARATOR */
4526 case 0x2029: /* PARAGRAPH SEPARATOR */
4527 MRRETURN(MATCH_NOMATCH);
4528 }
4529 break;
4530
4531 case OP_VSPACE:
4532 switch(c)
4533 {
4534 default: MRRETURN(MATCH_NOMATCH);
4535 case 0x0a: /* LF */
4536 case 0x0b: /* VT */
4537 case 0x0c: /* FF */
4538 case 0x0d: /* CR */
4539 case 0x85: /* NEL */
4540 case 0x2028: /* LINE SEPARATOR */
4541 case 0x2029: /* PARAGRAPH SEPARATOR */
4542 break;
4543 }
4544 break;
4545
4546 case OP_NOT_DIGIT:
4547 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4548 MRRETURN(MATCH_NOMATCH);
4549 break;
4550
4551 case OP_DIGIT:
4552 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4553 MRRETURN(MATCH_NOMATCH);
4554 break;
4555
4556 case OP_NOT_WHITESPACE:
4557 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4558 MRRETURN(MATCH_NOMATCH);
4559 break;
4560
4561 case OP_WHITESPACE:
4562 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4563 MRRETURN(MATCH_NOMATCH);
4564 break;
4565
4566 case OP_NOT_WORDCHAR:
4567 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4568 MRRETURN(MATCH_NOMATCH);
4569 break;
4570
4571 case OP_WORDCHAR:
4572 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4573 MRRETURN(MATCH_NOMATCH);
4574 break;
4575
4576 default:
4577 RRETURN(PCRE_ERROR_INTERNAL);
4578 }
4579 }
4580 }
4581 else
4582 #endif
4583 /* Not UTF-8 mode */
4584 {
4585 for (fi = min;; fi++)
4586 {
4587 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4588 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4589 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4590 if (eptr >= md->end_subject)
4591 {
4592 SCHECK_PARTIAL();
4593 MRRETURN(MATCH_NOMATCH);
4594 }
4595 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4596 MRRETURN(MATCH_NOMATCH);
4597 c = *eptr++;
4598 switch(ctype)
4599 {
4600 case OP_ANY: /* This is the non-NL case */
4601 case OP_ALLANY:
4602 case OP_ANYBYTE:
4603 break;
4604
4605 case OP_ANYNL:
4606 switch(c)
4607 {
4608 default: MRRETURN(MATCH_NOMATCH);
4609 case 0x000d:
4610 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4611 break;
4612
4613 case 0x000a:
4614 break;
4615
4616 case 0x000b:
4617 case 0x000c:
4618 case 0x0085:
4619 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4620 break;
4621 }
4622 break;
4623
4624 case OP_NOT_HSPACE:
4625 switch(c)
4626 {
4627 default: break;
4628 case 0x09: /* HT */
4629 case 0x20: /* SPACE */
4630 case 0xa0: /* NBSP */
4631 MRRETURN(MATCH_NOMATCH);
4632 }
4633 break;
4634
4635 case OP_HSPACE:
4636 switch(c)
4637 {
4638 default: MRRETURN(MATCH_NOMATCH);
4639 case 0x09: /* HT */
4640 case 0x20: /* SPACE */
4641 case 0xa0: /* NBSP */
4642 break;
4643 }
4644 break;
4645
4646 case OP_NOT_VSPACE:
4647 switch(c)
4648 {
4649 default: break;
4650 case 0x0a: /* LF */
4651 case 0x0b: /* VT */
4652 case 0x0c: /* FF */
4653 case 0x0d: /* CR */
4654 case 0x85: /* NEL */
4655 MRRETURN(MATCH_NOMATCH);
4656 }
4657 break;
4658
4659 case OP_VSPACE:
4660 switch(c)
4661 {
4662 default: MRRETURN(MATCH_NOMATCH);
4663 case 0x0a: /* LF */
4664 case 0x0b: /* VT */
4665 case 0x0c: /* FF */
4666 case 0x0d: /* CR */
4667 case 0x85: /* NEL */
4668 break;
4669 }
4670 break;
4671
4672 case OP_NOT_DIGIT:
4673 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4674 break;
4675
4676 case OP_DIGIT:
4677 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4678 break;
4679
4680 case OP_NOT_WHITESPACE:
4681 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4682 break;
4683
4684 case OP_WHITESPACE:
4685 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4686 break;
4687
4688 case OP_NOT_WORDCHAR:
4689 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4690 break;
4691
4692 case OP_WORDCHAR:
4693 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4694 break;
4695
4696 default:
4697 RRETURN(PCRE_ERROR_INTERNAL);
4698 }
4699 }
4700 }
4701 /* Control never gets here */
4702 }
4703
4704 /* If maximizing, it is worth using inline code for speed, doing the type
4705 test once at the start (i.e. keep it out of the loop). Again, keep the
4706 UTF-8 and UCP stuff separate. */
4707
4708 else
4709 {
4710 pp = eptr; /* Remember where we started */
4711
4712 #ifdef SUPPORT_UCP
4713 if (prop_type >= 0)
4714 {
4715 switch(prop_type)
4716 {
4717 case PT_ANY:
4718 for (i = min; i < max; i++)
4719 {
4720 int len = 1;
4721 if (eptr >= md->end_subject)
4722 {
4723 SCHECK_PARTIAL();
4724 break;
4725 }
4726 GETCHARLENTEST(c, eptr, len);
4727 if (prop_fail_result) break;
4728 eptr+= len;
4729 }
4730 break;
4731
4732 case PT_LAMP:
4733 for (i = min; i < max; i++)
4734 {
4735 int len = 1;
4736 if (eptr >= md->end_subject)
4737 {
4738 SCHECK_PARTIAL();
4739 break;
4740 }
4741 GETCHARLENTEST(c, eptr, len);
4742 prop_chartype = UCD_CHARTYPE(c);
4743 if ((prop_chartype == ucp_Lu ||
4744 prop_chartype == ucp_Ll ||
4745 prop_chartype == ucp_Lt) == prop_fail_result)
4746 break;
4747 eptr+= len;
4748 }
4749 break;
4750
4751 case PT_GC:
4752 for (i = min; i < max; i++)
4753 {
4754 int len = 1;
4755 if (eptr >= md->end_subject)
4756 {
4757 SCHECK_PARTIAL();
4758 break;
4759 }
4760 GETCHARLENTEST(c, eptr, len);
4761 prop_category = UCD_CATEGORY(c);
4762 if ((prop_category == prop_value) == prop_fail_result)
4763 break;
4764 eptr+= len;
4765 }
4766 break;
4767
4768 case PT_PC:
4769 for (i = min; i < max; i++)
4770 {
4771 int len = 1;
4772 if (eptr >= md->end_subject)
4773 {
4774 SCHECK_PARTIAL();
4775 break;
4776 }
4777 GETCHARLENTEST(c, eptr, len);
4778 prop_chartype = UCD_CHARTYPE(c);
4779 if ((prop_chartype == prop_value) == prop_fail_result)
4780 break;
4781 eptr+= len;
4782 }
4783 break;
4784
4785 case PT_SC:
4786 for (i = min; i < max; i++)
4787 {
4788 int len = 1;
4789 if (eptr >= md->end_subject)
4790 {
4791 SCHECK_PARTIAL();
4792 break;
4793 }
4794 GETCHARLENTEST(c, eptr, len);
4795 prop_script = UCD_SCRIPT(c);
4796 if ((prop_script == prop_value) == prop_fail_result)
4797 break;
4798 eptr+= len;
4799 }
4800 break;
4801
4802 case PT_ALNUM:
4803 for (i = min; i < max; i++)
4804 {
4805 int len = 1;
4806 if (eptr >= md->end_subject)
4807 {
4808 SCHECK_PARTIAL();
4809 break;
4810 }
4811 GETCHARLENTEST(c, eptr, len);
4812 prop_category = UCD_CATEGORY(c);
4813 if ((prop_category == ucp_L || prop_category == ucp_N)
4814 == prop_fail_result)
4815 break;
4816 eptr+= len;
4817 }
4818 break;
4819
4820 case PT_SPACE: /* Perl space */
4821 for (i = min; i < max; i++)
4822 {
4823 int len = 1;
4824 if (eptr >= md->end_subject)
4825 {
4826 SCHECK_PARTIAL();
4827 break;
4828 }
4829 GETCHARLENTEST(c, eptr, len);
4830 prop_category = UCD_CATEGORY(c);
4831 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4832 c == CHAR_FF || c == CHAR_CR)
4833 == prop_fail_result)
4834 break;
4835 eptr+= len;
4836 }
4837 break;
4838
4839 case PT_PXSPACE: /* POSIX space */
4840 for (i = min; i < max; i++)
4841 {
4842 int len = 1;
4843 if (eptr >= md->end_subject)
4844 {
4845 SCHECK_PARTIAL();
4846 break;
4847 }
4848 GETCHARLENTEST(c, eptr, len);
4849 prop_category = UCD_CATEGORY(c);
4850 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4851 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4852 == prop_fail_result)
4853 break;
4854 eptr+= len;
4855 }
4856 break;
4857
4858 case PT_WORD:
4859 for (i = min; i < max; i++)
4860 {
4861 int len = 1;
4862 if (eptr >= md->end_subject)
4863 {
4864 SCHECK_PARTIAL();
4865 break;
4866 }
4867 GETCHARLENTEST(c, eptr, len);
4868 prop_category = UCD_CATEGORY(c);
4869 if ((prop_category == ucp_L || prop_category == ucp_N ||
4870 c == CHAR_UNDERSCORE) == prop_fail_result)
4871 break;
4872 eptr+= len;
4873 }
4874 break;
4875
4876 default:
4877 RRETURN(PCRE_ERROR_INTERNAL);
4878 }
4879
4880 /* eptr is now past the end of the maximum run */
4881
4882 if (possessive) continue;
4883 for(;;)
4884 {
4885 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4886 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4887 if (eptr-- == pp) break; /* Stop if tried at original pos */
4888 if (utf8) BACKCHAR(eptr);
4889 }
4890 }
4891
4892 /* Match extended Unicode sequences. We will get here only if the
4893 support is in the binary; otherwise a compile-time error occurs. */
4894
4895 else if (ctype == OP_EXTUNI)
4896 {
4897 for (i = min; i < max; i++)
4898 {
4899 if (eptr >= md->end_subject)
4900 {
4901 SCHECK_PARTIAL();
4902 break;
4903 }
4904 GETCHARINCTEST(c, eptr);
4905 prop_category = UCD_CATEGORY(c);
4906 if (prop_category == ucp_M) break;
4907 while (eptr < md->end_subject)
4908 {
4909 int len = 1;
4910 if (!utf8) c = *eptr; else
4911 {
4912 GETCHARLEN(c, eptr, len);
4913 }
4914 prop_category = UCD_CATEGORY(c);
4915 if (prop_category != ucp_M) break;
4916 eptr += len;
4917 }
4918 }
4919
4920 /* eptr is now past the end of the maximum run */
4921
4922 if (possessive) continue;
4923
4924 for(;;)
4925 {
4926 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4927 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4928 if (eptr-- == pp) break; /* Stop if tried at original pos */
4929 for (;;) /* Move back over one extended */
4930 {
4931 int len = 1;
4932 if (!utf8) c = *eptr; else
4933 {
4934 BACKCHAR(eptr);
4935 GETCHARLEN(c, eptr, len);
4936 }
4937 prop_category = UCD_CATEGORY(c);
4938 if (prop_category != ucp_M) break;
4939 eptr--;
4940 }
4941 }
4942 }
4943
4944 else
4945 #endif /* SUPPORT_UCP */
4946
4947 #ifdef SUPPORT_UTF8
4948 /* UTF-8 mode */
4949
4950 if (utf8)
4951 {
4952 switch(ctype)
4953 {
4954 case OP_ANY:
4955 if (max < INT_MAX)
4956 {
4957 for (i = min; i < max; i++)
4958 {
4959 if (eptr >= md->end_subject)
4960 {
4961 SCHECK_PARTIAL();
4962 break;
4963 }
4964 if (IS_NEWLINE(eptr)) break;
4965 eptr++;
4966 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4967 }
4968 }
4969
4970 /* Handle unlimited UTF-8 repeat */
4971
4972 else
4973 {
4974 for (i = min; i < max; i++)
4975 {
4976 if (eptr >= md->end_subject)
4977 {
4978 SCHECK_PARTIAL();
4979 break;
4980 }
4981 if (IS_NEWLINE(eptr)) break;
4982 eptr++;
4983 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4984 }
4985 }
4986 break;
4987
4988 case OP_ALLANY:
4989 if (max < INT_MAX)
4990 {
4991 for (i = min; i < max; i++)
4992 {
4993 if (eptr >= md->end_subject)
4994 {
4995 SCHECK_PARTIAL();
4996 break;
4997 }
4998 eptr++;
4999 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5000 }
5001 }
5002 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5003 break;
5004
5005 /* The byte case is the same as non-UTF8 */
5006
5007 case OP_ANYBYTE:
5008 c = max - min;
5009 if (c > (unsigned int)(md->end_subject - eptr))
5010 {
5011 eptr = md->end_subject;
5012 SCHECK_PARTIAL();
5013 }
5014 else eptr += c;
5015 break;
5016
5017 case OP_ANYNL:
5018 for (i = min; i < max; i++)
5019 {
5020 int len = 1;
5021 if (eptr >= md->end_subject)
5022 {
5023 SCHECK_PARTIAL();
5024 break;
5025 }
5026 GETCHARLEN(c, eptr, len);
5027 if (c == 0x000d)
5028 {
5029 if (++eptr >= md->end_subject) break;
5030 if (*eptr == 0x000a) eptr++;
5031 }
5032 else
5033 {
5034 if (c != 0x000a &&
5035 (md->bsr_anycrlf ||
5036 (c != 0x000b && c != 0x000c &&
5037 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5038 break;
5039 eptr += len;
5040 }
5041 }
5042 break;
5043
5044 case OP_NOT_HSPACE:
5045 case OP_HSPACE:
5046 for (i = min; i < max; i++)
5047 {
5048 BOOL gotspace;
5049 int len = 1;
5050 if (eptr >= md->end_subject)
5051 {
5052 SCHECK_PARTIAL();
5053 break;
5054 }
5055 GETCHARLEN(c, eptr, len);
5056 switch(c)
5057 {
5058 default: gotspace = FALSE; break;
5059 case 0x09: /* HT */
5060 case 0x20: /* SPACE */
5061 case 0xa0: /* NBSP */
5062 case 0x1680: /* OGHAM SPACE MARK */
5063 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5064 case 0x2000: /* EN QUAD */
5065 case 0x2001: /* EM QUAD */
5066 case 0x2002: /* EN SPACE */
5067 case 0x2003: /* EM SPACE */
5068 case 0x2004: /* THREE-PER-EM SPACE */
5069 case 0x2005: /* FOUR-PER-EM SPACE */
5070 case 0x2006: /* SIX-PER-EM SPACE */
5071 case 0x2007: /* FIGURE SPACE */
5072 case 0x2008: /* PUNCTUATION SPACE */
5073 case 0x2009: /* THIN SPACE */
5074 case 0x200A: /* HAIR SPACE */
5075 case 0x202f: /* NARROW NO-BREAK SPACE */
5076 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5077 case 0x3000: /* IDEOGRAPHIC SPACE */
5078 gotspace = TRUE;
5079 break;
5080 }
5081 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5082 eptr += len;
5083 }
5084 break;
5085
5086 case OP_NOT_VSPACE:
5087 case OP_VSPACE:
5088 for (i = min; i < max; i++)
5089 {
5090 BOOL gotspace;
5091 int len = 1;
5092 if (eptr >= md->end_subject)
5093 {
5094 SCHECK_PARTIAL();
5095 break;
5096 }
5097 GETCHARLEN(c, eptr, len);
5098 switch(c)
5099 {
5100 default: gotspace = FALSE; break;
5101 case 0x0a: /* LF */
5102 case 0x0b: /* VT */
5103 case 0x0c: /* FF */
5104 case 0x0d: /* CR */
5105 case 0x85: /* NEL */
5106 case 0x2028: /* LINE SEPARATOR */
5107 case 0x2029: /* PARAGRAPH SEPARATOR */
5108 gotspace = TRUE;
5109 break;
5110 }
5111 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5112 eptr += len;
5113 }
5114 break;
5115
5116 case OP_NOT_DIGIT:
5117 for (i = min; i < max; i++)
5118 {
5119 int len = 1;
5120 if (eptr >= md->end_subject)
5121 {
5122 SCHECK_PARTIAL();
5123 break;
5124 }
5125 GETCHARLEN(c, eptr, len);
5126 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5127 eptr+= len;
5128 }
5129 break;
5130
5131 case OP_DIGIT:
5132 for (i = min; i < max; i++)
5133 {
5134 int len = 1;
5135 if (eptr >= md->end_subject)
5136 {
5137 SCHECK_PARTIAL();
5138 break;
5139 }
5140 GETCHARLEN(c, eptr, len);
5141 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5142 eptr+= len;
5143 }
5144 break;
5145
5146 case OP_NOT_WHITESPACE:
5147 for (i = min; i < max; i++)
5148 {
5149 int len = 1;
5150 if (eptr >= md->end_subject)
5151 {
5152 SCHECK_PARTIAL();
5153 break;
5154 }
5155 GETCHARLEN(c, eptr, len);
5156 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5157 eptr+= len;
5158 }
5159 break;
5160
5161 case OP_WHITESPACE:
5162 for (i = min; i < max; i++)
5163 {
5164 int len = 1;
5165 if (eptr >= md->end_subject)
5166 {
5167 SCHECK_PARTIAL();
5168 break;
5169 }
5170 GETCHARLEN(c, eptr, len);
5171 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5172 eptr+= len;
5173 }
5174 break;
5175
5176 case OP_NOT_WORDCHAR:
5177 for (i = min; i < max; i++)
5178 {
5179 int len = 1;
5180 if (eptr >= md->end_subject)
5181 {
5182 SCHECK_PARTIAL();
5183 break;
5184 }
5185 GETCHARLEN(c, eptr, len);
5186 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5187 eptr+= len;
5188 }
5189 break;
5190
5191 case OP_WORDCHAR:
5192 for (i = min; i < max; i++)
5193 {
5194 int len = 1;
5195 if (eptr >= md->end_subject)
5196 {
5197 SCHECK_PARTIAL();
5198 break;
5199 }
5200 GETCHARLEN(c, eptr, len);
5201 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5202 eptr+= len;
5203 }
5204 break;
5205
5206 default:
5207 RRETURN(PCRE_ERROR_INTERNAL);
5208 }
5209
5210 /* eptr is now past the end of the maximum run */
5211
5212 if (possessive) continue;
5213 for(;;)
5214 {
5215 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
5216 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5217 if (eptr-- == pp) break; /* Stop if tried at original pos */
5218 BACKCHAR(eptr);
5219 }
5220 }
5221 else
5222 #endif /* SUPPORT_UTF8 */
5223
5224 /* Not UTF-8 mode */
5225 {
5226 switch(ctype)
5227 {
5228 case OP_ANY:
5229 for (i = min; i < max; i++)
5230 {
5231 if (eptr >= md->end_subject)
5232 {
5233 SCHECK_PARTIAL();
5234 break;
5235 }
5236 if (IS_NEWLINE(eptr)) break;
5237 eptr++;
5238 }
5239 break;
5240
5241 case OP_ALLANY:
5242 case OP_ANYBYTE:
5243 c = max - min;
5244 if (c > (unsigned int)(md->end_subject - eptr))
5245 {
5246 eptr = md->end_subject;
5247 SCHECK_PARTIAL();
5248 }
5249 else eptr += c;
5250 break;
5251
5252 case OP_ANYNL:
5253 for (i = min; i < max; i++)
5254 {
5255 if (eptr >= md->end_subject)
5256 {
5257 SCHECK_PARTIAL();
5258 break;
5259 }
5260 c = *eptr;
5261 if (c == 0x000d)
5262 {
5263 if (++eptr >= md->end_subject) break;
5264 if (*eptr == 0x000a) eptr++;
5265 }
5266 else
5267 {
5268 if (c != 0x000a &&
5269 (md->bsr_anycrlf ||
5270 (c != 0x000b && c != 0x000c && c != 0x0085)))
5271 break;
5272 eptr++;
5273 }
5274 }
5275 break;
5276
5277 case OP_NOT_HSPACE:
5278 for (i = min; i < max; i++)
5279 {
5280 if (eptr >= md->end_subject)
5281 {
5282 SCHECK_PARTIAL();
5283 break;
5284 }
5285 c = *eptr;
5286 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5287 eptr++;
5288 }
5289 break;
5290
5291 case OP_HSPACE:
5292 for (i = min; i < max; i++)
5293 {
5294 if (eptr >= md->end_subject)
5295 {
5296 SCHECK_PARTIAL();
5297 break;
5298 }
5299 c = *eptr;
5300 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5301 eptr++;
5302 }
5303 break;
5304
5305 case OP_NOT_VSPACE:
5306 for (i = min; i < max; i++)
5307 {
5308 if (eptr >= md->end_subject)
5309 {
5310 SCHECK_PARTIAL();
5311 break;
5312 }
5313 c = *eptr;
5314 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5315 break;
5316 eptr++;
5317 }
5318 break;
5319
5320 case OP_VSPACE:
5321 for (i = min; i < max; i++)
5322 {
5323 if (eptr >= md->end_subject)
5324 {
5325 SCHECK_PARTIAL();
5326 break;
5327 }
5328 c = *eptr;
5329 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5330 break;
5331 eptr++;
5332 }
5333 break;
5334
5335 case OP_NOT_DIGIT:
5336 for (i = min; i < max; i++)
5337 {
5338 if (eptr >= md->end_subject)
5339 {
5340 SCHECK_PARTIAL();
5341 break;
5342 }
5343 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5344 eptr++;
5345 }
5346 break;
5347
5348 case OP_DIGIT:
5349 for (i = min; i < max; i++)
5350 {
5351 if (eptr >= md->end_subject)
5352 {
5353 SCHECK_PARTIAL();
5354 break;
5355 }
5356 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5357 eptr++;
5358 }
5359 break;
5360
5361 case OP_NOT_WHITESPACE:
5362 for (i = min; i < max; i++)
5363 {
5364 if (eptr >= md->end_subject)
5365 {
5366 SCHECK_PARTIAL();
5367 break;
5368 }
5369 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5370 eptr++;
5371 }
5372 break;
5373
5374 case OP_WHITESPACE:
5375 for (i = min; i < max; i++)
5376 {
5377 if (eptr >= md->end_subject)
5378 {
5379 SCHECK_PARTIAL();
5380 break;
5381 }
5382 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5383 eptr++;
5384 }
5385 break;
5386
5387 case OP_NOT_WORDCHAR:
5388 for (i = min; i < max; i++)
5389 {
5390 if (eptr >= md->end_subject)
5391 {
5392 SCHECK_PARTIAL();
5393 break;
5394 }
5395 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5396 eptr++;
5397 }
5398 break;
5399
5400 case OP_WORDCHAR:
5401 for (i = min; i < max; i++)
5402 {
5403 if (eptr >= md->end_subject)
5404 {
5405 SCHECK_PARTIAL();
5406 break;
5407 }
5408 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5409 eptr++;
5410 }
5411 break;
5412
5413 default:
5414 RRETURN(PCRE_ERROR_INTERNAL);
5415 }
5416
5417 /* eptr is now past the end of the maximum run */
5418
5419 if (possessive) continue;
5420 while (eptr >= pp)
5421 {
5422 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5423 eptr--;
5424 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5425 }
5426 }
5427
5428 /* Get here if we can't make it match with any permitted repetitions */
5429
5430 MRRETURN(MATCH_NOMATCH);
5431 }
5432 /* Control never gets here */
5433
5434 /* There's been some horrible disaster. Arrival here can only mean there is
5435 something seriously wrong in the code above or the OP_xxx definitions. */
5436
5437 default:
5438 DPRINTF(("Unknown opcode %d\n", *ecode));
5439 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5440 }
5441
5442 /* Do not stick any code in here without much thought; it is assumed
5443 that "continue" in the code above comes out to here to repeat the main
5444 loop. */
5445
5446 } /* End of main loop */
5447 /* Control never reaches here */
5448
5449
5450 /* When compiling to use the heap rather than the stack for recursive calls to
5451 match(), the RRETURN() macro jumps here. The number that is saved in
5452 frame->Xwhere indicates which label we actually want to return to. */
5453
5454 #ifdef NO_RECURSE
5455 #define LBL(val) case val: goto L_RM##val;
5456 HEAP_RETURN:
5457 switch (frame->Xwhere)
5458 {
5459 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5460 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5461 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5462 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5463 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58)
5464 #ifdef SUPPORT_UTF8
5465 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5466 LBL(32) LBL(34) LBL(42) LBL(46)
5467 #ifdef SUPPORT_UCP
5468 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5469 LBL(59) LBL(60) LBL(61) LBL(62)
5470 #endif /* SUPPORT_UCP */
5471 #endif /* SUPPORT_UTF8 */
5472 default:
5473 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5474 return PCRE_ERROR_INTERNAL;
5475 }
5476 #undef LBL
5477 #endif /* NO_RECURSE */
5478 }
5479
5480
5481 /***************************************************************************
5482 ****************************************************************************
5483 RECURSION IN THE match() FUNCTION
5484
5485 Undefine all the macros that were defined above to handle this. */
5486 /* Only the NO_RECURSE build needs the #undefs in the conditional block: per the banner above, these names were macros defined earlier in this file for match(). Undefining them stops them rewriting identifiers in later code; pcre_exec() below uses length, ims and flags as ordinary variable names. */
5487 #ifdef NO_RECURSE
5488 #undef eptr
5489 #undef ecode
5490 #undef mstart
5491 #undef offset_top
5492 #undef ims
5493 #undef eptrb
5494 #undef flags
5495 /* NOTE(review): the blank-line grouping presumably mirrors the order of the corresponding #defines earlier in the file; confirm before reordering */
5496 #undef callpat
5497 #undef charptr
5498 #undef data
5499 #undef next
5500 #undef pp
5501 #undef prev
5502 #undef saved_eptr
5503
5504 #undef new_recursive
5505
5506 #undef cur_is_word
5507 #undef condition
5508 #undef prev_is_word
5509
5510 #undef original_ims
5511
5512 #undef ctype
5513 #undef length
5514 #undef max
5515 #undef min
5516 #undef number
5517 #undef offset
5518 #undef op
5519 #undef save_capture_last
5520 #undef save_offset1
5521 #undef save_offset2
5522 #undef save_offset3
5523 #undef stacksave
5524
5525 #undef newptrb
5526
5527 #endif /* NO_RECURSE */
5528
5529 /* fc and fi are defined as macros in both cases (with or without NO_RECURSE), so they are undefined outside the conditional block above */
5530
5531 #undef fc
5532 #undef fi
5533
5534 /***************************************************************************
5535 ***************************************************************************/
5536
5537
5538
5539 /*************************************************
5540 * Execute a Regular Expression *
5541 *************************************************/
5542
5543 /* This function applies a compiled re to a subject string and picks out
5544 portions of the string if it matches. Two elements in the vector are set for
5545 each substring: the offsets to the start and end of the substring.
5546
5547 Arguments:
5548 argument_re points to the compiled expression
5549 extra_data points to extra data or is NULL
5550 subject points to the subject string
5551 length length of subject string (may contain binary zeros)
5552 start_offset where to start in the subject string
5553 options option bits
5554 offsets points to a vector of ints to be filled in with offsets
5555 offsetcount the number of elements in the vector
5556
5557 Returns: > 0 => success; value is the number of elements filled in
5558 = 0 => success, but offsets is not big enough
5559 -1 => failed to match
5560 < -1 => some kind of unexpected problem
5561 */
5562
5563 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5564 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5565 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5566 int offsetcount)
5567 {
5568 int rc, resetcount, ocount;
5569 int first_byte = -1;
5570 int req_byte = -1;
5571 int req_byte2 = -1;
5572 int newline;
5573 unsigned long int ims;
5574 BOOL using_temporary_offsets = FALSE;
5575 BOOL anchored;
5576 BOOL startline;
5577 BOOL firstline;
5578 BOOL first_byte_caseless = FALSE;
5579 BOOL req_byte_caseless = FALSE;
5580 BOOL utf8;
5581 match_data match_block;
5582 match_data *md = &match_block;
5583 const uschar *tables;
5584 const uschar *start_bits = NULL;
5585 USPTR start_match = (USPTR)subject + start_offset;
5586 USPTR end_subject;
5587 USPTR start_partial = NULL;
5588 USPTR req_byte_ptr = start_match - 1;
5589
5590 pcre_study_data internal_study;
5591 const pcre_study_data *study;
5592
5593 real_pcre internal_re;
5594 const real_pcre *external_re = (const real_pcre *)argument_re;
5595 const real_pcre *re = external_re;
5596
5597 /* Plausibility checks */
5598
5599 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5600 if (re == NULL || subject == NULL ||
5601 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5602 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5603
5604 /* This information is for finding all the numbers associated with a given
5605 name, for condition testing. */
5606
5607 md->name_table = (uschar *)re + re->name_table_offset;
5608 md->name_count = re->name_count;
5609 md->name_entry_size = re->name_entry_size;
5610
5611 /* Fish out the optional data from the extra_data structure, first setting
5612 the default values. */
5613
5614 study = NULL;
5615 md->match_limit = MATCH_LIMIT;
5616 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5617 md->callout_data = NULL;
5618
5619 /* The table pointer is always in native byte order. */
5620
5621 tables = external_re->tables;
5622
5623 if (extra_data != NULL)
5624 {
5625 register unsigned int flags = extra_data->flags;
5626 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5627 study = (const pcre_study_data *)extra_data->study_data;
5628 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5629 md->match_limit = extra_data->match_limit;
5630 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5631 md->match_limit_recursion = extra_data->match_limit_recursion;
5632 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5633 md->callout_data = extra_data->callout_data;
5634 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5635 }
5636
5637 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5638 is a feature that makes it possible to save compiled regexes and re-use them
5639 in other programs later. */
5640
5641 if (tables == NULL) tables = _pcre_default_tables;
5642
5643 /* Check that the first field in the block is the magic number. If it is not,
5644 test for a regex that was compiled on a host of opposite endianness. If this is
5645 the case, flipped values are put in internal_re and internal_study if there was
5646 study data too. */
5647
5648 if (re->magic_number != MAGIC_NUMBER)
5649 {
5650 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5651 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5652 if (study != NULL) study = &internal_study;
5653 }
5654
5655 /* Set up other data */
5656
5657 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5658 startline = (re->flags & PCRE_STARTLINE) != 0;
5659 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5660
5661 /* The code starts after the real_pcre block and the capture name table. */
5662
5663 md->start_code = (const uschar *)external_re + re->name_table_offset +
5664 re->name_count * re->name_entry_size;
5665
5666 md->start_subject = (USPTR)subject;
5667 md->start_offset = start_offset;
5668 md->end_subject = md->start_subject + length;
5669 end_subject = md->end_subject;
5670
5671 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5672 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5673 md->use_ucp = (re->options & PCRE_UCP) != 0;
5674 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5675
5676 md->notbol = (options & PCRE_NOTBOL) != 0;
5677 md->noteol = (options & PCRE_NOTEOL) != 0;
5678 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5679 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5680 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5681 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5682 md->hitend = FALSE;
5683 md->mark = NULL; /* In case never set */
5684
5685 md->recursive = NULL; /* No recursion at top level */
5686
5687 md->lcc = tables + lcc_offset;
5688 md->ctypes = tables + ctypes_offset;
5689
5690 /* Handle different \R options. */
5691
5692 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5693 {
5694 case 0:
5695 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5696 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5697 else
5698 #ifdef BSR_ANYCRLF
5699 md->bsr_anycrlf = TRUE;
5700 #else
5701 md->bsr_anycrlf = FALSE;
5702 #endif
5703 break;
5704
5705 case PCRE_BSR_ANYCRLF:
5706 md->bsr_anycrlf = TRUE;
5707 break;
5708
5709 case PCRE_BSR_UNICODE:
5710 md->bsr_anycrlf = FALSE;
5711 break;
5712
5713 default: return PCRE_ERROR_BADNEWLINE;
5714 }
5715
5716 /* Handle different types of newline. The three bits give eight cases. If
5717 nothing is set at run time, whatever was used at compile time applies. */
5718
5719 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5720 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5721 {
5722 case 0: newline = NEWLINE; break; /* Compile-time default */
5723 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5724 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5725 case PCRE_NEWLINE_CR+
5726 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5727 case PCRE_NEWLINE_ANY: newline = -1; break;
5728 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5729 default: return PCRE_ERROR_BADNEWLINE;
5730 }
5731
5732 if (newline == -2)
5733 {
5734 md->nltype = NLTYPE_ANYCRLF;
5735 }
5736 else if (newline < 0)
5737 {
5738 md->nltype = NLTYPE_ANY;
5739 }
5740 else
5741 {
5742 md->nltype = NLTYPE_FIXED;
5743 if (newline > 255)
5744 {
5745 md->nllen = 2;
5746 md->nl[0] = (newline >> 8) & 255;
5747 md->nl[1] = newline & 255;
5748 }
5749 else
5750 {
5751 md->nllen = 1;
5752 md->nl[0] = newline;
5753 }
5754 }
5755
5756 /* Partial matching was originally supported only for a restricted set of
5757 regexes; from release 8.00 there are no restrictions, but the bits are still
5758 defined (though never set). So there's no harm in leaving this code. */
5759
5760 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5761 return PCRE_ERROR_BADPARTIAL;
5762
5763 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5764 back the character offset. */
5765
5766 #ifdef SUPPORT_UTF8
5767 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5768 {
5769 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5770 return PCRE_ERROR_BADUTF8;
5771 if (start_offset > 0 && start_offset < length)
5772 {
5773 int tb = ((USPTR)subject)[start_offset];
5774 if (tb > 127)
5775 {
5776 tb &= 0xc0;
5777 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5778 }
5779 }
5780 }
5781 #endif
5782
5783 /* The ims options can vary during the matching as a result of the presence
5784 of (?ims) items in the pattern. They are kept in a local variable so that
5785 restoring at the exit of a group is easy. */
5786
5787 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5788
5789 /* If the expression has got more back references than the offsets supplied can
5790 hold, we get a temporary chunk of working store to use during the matching.
5791 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5792 of 3. */
5793
5794 ocount = offsetcount - (offsetcount % 3);
5795
5796 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5797 {
5798 ocount = re->top_backref * 3 + 3;
5799 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5800 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5801 using_temporary_offsets = TRUE;
5802 DPRINTF(("Got memory to hold back references\n"));
5803 }
5804 else md->offset_vector = offsets;
5805
5806 md->offset_end = ocount;
5807 md->offset_max = (2*ocount)/3;
5808 md->offset_overflow = FALSE;
5809 md->capture_last = -1;
5810
5811 /* Compute the minimum number of offsets that we need to reset each time. Doing
5812 this makes a huge difference to execution time when there aren't many brackets
5813 in the pattern. */
5814
5815 resetcount = 2 + re->top_bracket * 2;
5816 if (resetcount > offsetcount) resetcount = ocount;
5817
5818 /* Reset the working variables associated with each extraction. These should
5819 never be used unless previously set, but they get saved and restored, and so we
5820 initialize them to avoid reading uninitialized locations. */
5821
5822 if (md->offset_vector != NULL)
5823 {
5824 register int *iptr = md->offset_vector + ocount;
5825 register int *iend = iptr - resetcount/2 + 1;
5826 while (--iptr >= iend) *iptr = -1;
5827 }
5828
5829 /* Set up the first character to match, if available. The first_byte value is
5830 never set for an anchored regular expression, but the anchoring may be forced
5831 at run time, so we have to test for anchoring. The first char may be unset for
5832 an unanchored pattern, of course. If there's no first char and the pattern was
5833 studied, there may be a bitmap of possible first characters. */
5834
5835 if (!anchored)
5836 {
5837 if ((re->flags & PCRE_FIRSTSET) != 0)
5838 {
5839 first_byte = re->first_byte & 255;
5840 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5841 first_byte = md->lcc[first_byte];
5842 }
5843 else
5844 if (!startline && study != NULL &&
5845 (study->flags & PCRE_STUDY_MAPPED) != 0)
5846 start_bits = study->start_bits;
5847 }
5848
5849 /* For anchored or unanchored matches, there may be a "last known required
5850 character" set. */
5851
5852 if ((re->flags & PCRE_REQCHSET) != 0)
5853 {
5854 req_byte = re->req_byte & 255;
5855 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5856 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5857 }
5858
5859
5860 /* ==========================================================================*/
5861
5862 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
5863 the loop runs just once. */
5864
5865 for(;;)
5866 {
5867 USPTR save_end_subject = end_subject;
5868 USPTR new_start_match;
5869
5870 /* Reset the maximum number of extractions we might see. */
5871
5872 if (md->offset_vector != NULL)
5873 {
5874 register int *iptr = md->offset_vector;
5875 register int *iend = iptr + resetcount;
5876 while (iptr < iend) *iptr++ = -1;
5877 }
5878
5879 /* If firstline is TRUE, the start of the match is constrained to the first
5880 line of a multiline string. That is, the match must be before or at the first
5881 newline. Implement this by temporarily adjusting end_subject so that we stop
5882 scanning at a newline. If the match fails at the newline, later code breaks
5883 this loop. */
5884
5885 if (firstline)
5886 {
5887 USPTR t = start_match;
5888 #ifdef SUPPORT_UTF8
5889 if (utf8)
5890 {
5891 while (t < md->end_subject && !IS_NEWLINE(t))
5892 {
5893 t++;
5894 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5895 }
5896 }
5897 else
5898 #endif
5899 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5900 end_subject = t;
5901 }
5902
5903 /* There are some optimizations that avoid running the match if a known
5904 starting point is not found, or if a known later character is not present.
5905 However, there is an option that disables these, for testing and for ensuring
5906 that all callouts do actually occur. */
5907
5908 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5909 {
5910 /* Advance to a unique first byte if there is one. */
5911
5912 if (first_byte >= 0)
5913 {
5914 if (first_byte_caseless)
5915 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5916 start_match++;
5917 else
5918 while (start_match < end_subject && *start_match != first_byte)
5919 start_match++;
5920 }
5921
5922 /* Or to just after a linebreak for a multiline match */
5923
5924 else if (startline)
5925 {
5926 if (start_match > md->start_subject + start_offset)
5927 {
5928 #ifdef SUPPORT_UTF8
5929 if (utf8)
5930 {
5931 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5932 {
5933 start_match++;
5934 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5935 start_match++;
5936 }
5937 }
5938 else
5939 #endif
5940 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5941 start_match++;
5942
5943 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5944 and we are now at a LF, advance the match position by one more character.
5945 */
5946
5947 if (start_match[-1] == CHAR_CR &&
5948 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5949 start_match < end_subject &&
5950 *start_match == CHAR_NL)
5951 start_match++;
5952 }
5953 }
5954
5955 /* Or to a non-unique first byte after study */
5956
5957 else if (start_bits != NULL)
5958 {
5959 while (start_match < end_subject)
5960 {
5961 register unsigned int c = *start_match;
5962 if ((start_bits[c/8] & (1 << (c&7))) == 0)
5963 {
5964 start_match++;
5965 #ifdef SUPPORT_UTF8
5966 if (utf8)
5967 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5968 start_match++;
5969 #endif
5970 }
5971 else break;
5972 }
5973 }
5974 } /* Starting optimizations */
5975
5976 /* Restore fudged end_subject */
5977
5978 end_subject = save_end_subject;
5979
5980 /* The following two optimizations are disabled for partial matching or if
5981 disabling is explicitly requested. */
5982
5983 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
5984 {
5985 /* If the pattern was studied, a minimum subject length may be set. This is
5986 only a lower bound; there may be no string of exactly that length that
5987 matches the pattern. Although the value is, strictly, in characters, we treat
5988 it as bytes to avoid spending too much time in this optimization. */
5989
5990 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
5991 (pcre_uint32)(end_subject - start_match) < study->minlength)
5992 {
5993 rc = MATCH_NOMATCH;
5994 break;
5995 }
5996
5997 /* If req_byte is set, we know that that character must appear in the
5998 subject for the match to succeed. If the first character is set, req_byte
5999 must be later in the subject; otherwise the test starts at the match point.
6000 This optimization can save a huge amount of backtracking in patterns with
6001 nested unlimited repeats that aren't going to match. Writing separate code
6002 for cased/caseless versions makes it go faster, as does using an
6003 autoincrement and backing off on a match.
6004
6005 HOWEVER: when the subject string is very, very long, searching to its end
6006 can take a long time, and give bad performance on quite ordinary patterns.
6007 This showed up when somebody was matching something like /^\d+C/ on a
6008 32-megabyte string... so we don't do this when the string is sufficiently
6009 long. */
6010
6011 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6012 {
6013 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6014
6015 /* We don't need to repeat the search if we haven't yet reached the
6016 place we found it at last time. */
6017
6018 if (p > req_byte_ptr)
6019 {
6020 if (req_byte_caseless)
6021 {
6022 while (p < end_subject)
6023 {
6024 register int pp = *p++;
6025 if (pp == req_byte || pp == req_byte2) { p--; break; }
6026 }
6027 }
6028 else
6029 {
6030 while (p < end_subject)
6031 {
6032 if (*p++ == req_byte) { p--; break; }
6033 }
6034 }
6035
6036 /* If we can't find the required character, break the matching loop,
6037 forcing a match failure. */
6038
6039 if (p >= end_subject)
6040 {
6041 rc = MATCH_NOMATCH;
6042 break;
6043 }
6044
6045 /* If we have found the required character, save the point where we
6046 found it, so that we don't search again next time round the loop if
6047 the start hasn't passed this character yet. */
6048
6049 req_byte_ptr = p;
6050 }
6051 }
6052 }
6053
6054 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6055 printf(">>>> Match against: ");
6056 pchars(start_match, end_subject - start_match, TRUE, md);
6057 printf("\n");
6058 #endif
6059
6060 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6061 first starting point for which a partial match was found. */
6062
6063 md->start_match_ptr = start_match;
6064 md->start_used_ptr = start_match;
6065 md->match_call_count = 0;
6066 rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL,
6067 0, 0);
6068 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6069
6070 switch(rc)
6071 {
6072 /* NOMATCH and PRUNE advance by one character. If MATCH_SKIP_ARG reaches
6073 this level it means that a MARK that matched the SKIP's arg was not found.
6074 We treat this as NOMATCH. THEN at this level acts exactly like PRUNE. */
6075
6076 case MATCH_NOMATCH:
6077 case MATCH_PRUNE:
6078 case MATCH_SKIP_ARG:
6079 case MATCH_THEN:
6080 new_start_match = start_match + 1;
6081 #ifdef SUPPORT_UTF8
6082 if (utf8)
6083 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6084 new_start_match++;
6085 #endif
6086 break;
6087
6088 /* SKIP passes back the next starting point explicitly. */
6089
6090 case MATCH_SKIP:
6091 new_start_match = md->start_match_ptr;
6092 break;
6093
6094 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6095
6096 case MATCH_COMMIT:
6097 rc = MATCH_NOMATCH;
6098 goto ENDLOOP;
6099
6100 /* Any other return is either a match, or some kind of error. */
6101
6102 default:
6103 goto ENDLOOP;
6104 }
6105
6106 /* Control reaches here for the various types of "no match at this point"
6107 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6108
6109 rc = MATCH_NOMATCH;
6110
6111 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6112 newline in the subject (though it may continue over the newline). Therefore,
6113 if we have just failed to match, starting at a newline, do not continue. */
6114
6115 if (firstline && IS_NEWLINE(start_match)) break;
6116
6117 /* Advance to new matching position */
6118
6119 start_match = new_start_match;
6120
6121 /* Break the loop if the pattern is anchored or if we have passed the end of
6122 the subject. */
6123
6124 if (anchored || start_match > end_subject) break;
6125
6126 /* If we have just passed a CR and we are now at a LF, and the pattern does
6127 not contain any explicit matches for \r or \n, and the newline option is CRLF
6128 or ANY or ANYCRLF, advance the match position by one more character. */
6129
6130 if (start_match[-1] == CHAR_CR &&
6131 start_match < end_subject &&
6132 *start_match == CHAR_NL &&
6133 (re->flags & PCRE_HASCRORLF) == 0 &&
6134 (md->nltype == NLTYPE_ANY ||
6135 md->nltype == NLTYPE_ANYCRLF ||
6136 md->nllen == 2))
6137 start_match++;
6138
6139 md->mark = NULL; /* Reset for start of next match attempt */
6140 } /* End of for(;;) "bumpalong" loop */
6141
6142 /* ==========================================================================*/
6143
6144 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6145 conditions is true:
6146
6147 (1) The pattern is anchored or the match was failed by (*COMMIT);
6148
6149 (2) We are past the end of the subject;
6150
6151 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6152 this option requests that a match occur at or before the first newline in
6153 the subject.
6154
6155 When we have a match and the offset vector is big enough to deal with any
6156 backreferences, captured substring offsets will already be set up. In the case
6157 where we had to get some local store to hold offsets for backreference
6158 processing, copy those that we can. In this case there need not be overflow if
6159 certain parts of the pattern were not used, even though there are more
6160 capturing parentheses than vector slots. */
6161
6162 ENDLOOP:
6163
6164 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6165 {
6166 if (using_temporary_offsets)
6167 {
6168 if (offsetcount >= 4)
6169 {
6170 memcpy(offsets + 2, md->offset_vector + 2,
6171 (offsetcount - 2) * sizeof(int));
6172 DPRINTF(("Copied offsets from temporary memory\n"));
6173 }
6174 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6175 DPRINTF(("Freeing temporary memory\n"));
6176 (pcre_free)(md->offset_vector);
6177 }
6178
6179 /* Set the return code to the number of captured strings, or 0 if there are
6180 too many to fit into the vector. */
6181
6182 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6183
6184 /* If there is space, set up the whole thing as substring 0. The value of
6185 md->start_match_ptr might be modified if \K was encountered on the successful
6186 matching path. */
6187
6188 if (offsetcount < 2) rc = 0; else
6189 {
6190 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6191 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6192 }
6193
6194 DPRINTF((">>>> returning %d\n", rc));
6195 goto RETURN_MARK;
6196 }
6197
6198 /* Control gets here if there has been an error, or if the overall match
6199 attempt has failed at all permitted starting positions. */
6200
6201 if (using_temporary_offsets)
6202 {
6203 DPRINTF(("Freeing temporary memory\n"));
6204 (pcre_free)(md->offset_vector);
6205 }
6206
6207 /* For anything other than nomatch or partial match, just return the code. */
6208
6209 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6210 {
6211 DPRINTF((">>>> error: returning %d\n", rc));
6212 return rc;
6213 }
6214
6215 /* Handle partial matches - disable any mark data */
6216
6217 if (start_partial != NULL)
6218 {
6219 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6220 md->mark = NULL;
6221 if (offsetcount > 1)
6222 {
6223 offsets[0] = (int)(start_partial - (USPTR)subject);
6224 offsets[1] = (int)(end_subject - (USPTR)subject);
6225 }
6226 rc = PCRE_ERROR_PARTIAL;
6227 }
6228
6229 /* This is the classic nomatch case */
6230
6231 else
6232 {
6233 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6234 rc = PCRE_ERROR_NOMATCH;
6235 }
6236
6237 /* Return the MARK data if it has been requested. */
6238
6239 RETURN_MARK:
6240
6241 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6242 *(extra_data->mark) = (unsigned char *)(md->mark);
6243 return rc;
6244 }
6245
6246 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12