/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 529 - (show annotations) (download)
Mon May 31 17:28:08 2010 UTC (4 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 184964 byte(s)
Fix crash for property test in non-UTF-8 mode.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73
74 #define MATCH_ACCEPT (-999)
75 #define MATCH_COMMIT (-998)
76 #define MATCH_PRUNE (-997)
77 #define MATCH_SKIP (-996)
78 #define MATCH_SKIP_ARG (-995)
79 #define MATCH_THEN (-994)
80
81 /* This is a convenience macro for code that occurs many times. */
82
/* MRRETURN(ra): copy the current mark pointer (markptr) into md->mark and then
leave match() with the value ra by way of RRETURN. The expansion is a braced
block so that both statements are governed by a preceding "if". */
83 #define MRRETURN(ra) \
84 { \
85 md->mark = markptr; \
86 RRETURN(ra); \
87 }
88
89 /* Maximum number of ints of offset to save on the stack for recursive calls.
90 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
91 because the offset vector is always a multiple of 3 long. */
92
93 #define REC_STACK_SAVE_MAX 30
94
95 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
96
/* Parallel tables: entry k of rep_min and rep_max gives the minimum and the
maximum count of the k-th common repeat (a maximum of 0 means unlimited). Read
pairwise the entries are (0,inf) (0,inf) (1,inf) (1,inf) (0,1) (0,1).
NOTE(review): this presumably corresponds to * *? + +? ? ?? in opcode order;
confirm against the code that indexes these tables. */
97 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
98 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
99
100
101
102 #ifdef PCRE_DEBUG
103 /*************************************************
104 * Debugging function to print chars *
105 *************************************************/
106
107 /* Print a sequence of chars in printable format, stopping at the end of the
108 subject if requested.
109
110 Arguments:
111 p points to characters
112 length number to print
113 is_subject TRUE if printing from within md->start_subject
114 md pointer to matching data block, if is_subject is TRUE
115
116 Returns: nothing
117 */
118
119 static void
120 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
121 {
122 unsigned int c;
123 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
124 while (length-- > 0)
125 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
126 }
127 #endif
128
129
130
131 /*************************************************
132 * Match a back-reference *
133 *************************************************/
134
135 /* If a back reference hasn't been set, the length that is passed is greater
136 than the number of characters left in the string, so the match fails.
137
138 Arguments:
139 offset index into the offset vector
140 eptr points into the subject
141 length length to be matched
142 md points to match data block
143 ims the ims flags
144
145 Returns: TRUE if matched
146 */
147
148 static BOOL
149 match_ref(int offset, register USPTR eptr, int length, match_data *md,
150 unsigned long int ims)
151 {
152 USPTR p = md->start_subject + md->offset_vector[offset];
153
154 #ifdef PCRE_DEBUG
155 if (eptr >= md->end_subject)
156 printf("matching subject <null>");
157 else
158 {
159 printf("matching subject ");
160 pchars(eptr, length, TRUE, md);
161 }
162 printf(" against backref ");
163 pchars(p, length, FALSE, md);
164 printf("\n");
165 #endif
166
167 /* Always fail if not enough characters left */
168
169 if (length > md->end_subject - eptr) return FALSE;
170
171 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
172 properly if Unicode properties are supported. Otherwise, we can check only
173 ASCII characters. */
174
175 if ((ims & PCRE_CASELESS) != 0)
176 {
177 #ifdef SUPPORT_UTF8
178 #ifdef SUPPORT_UCP
179 if (md->utf8)
180 {
181 USPTR endptr = eptr + length;
182 while (eptr < endptr)
183 {
184 int c, d;
185 GETCHARINC(c, eptr);
186 GETCHARINC(d, p);
187 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
188 }
189 }
190 else
191 #endif
192 #endif
193
194 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
195 is no UCP support. */
196
197 while (length-- > 0)
198 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
199 }
200
201 /* In the caseful case, we can just compare the bytes, whether or not we
202 are in UTF-8 mode. */
203
204 else
205 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
206
207 return TRUE;
208 }
209
210
211
212 /***************************************************************************
213 ****************************************************************************
214 RECURSION IN THE match() FUNCTION
215
216 The match() function is highly recursive, though not every recursive call
217 increases the recursive depth. Nevertheless, some regular expressions can cause
218 it to recurse to a great depth. I was writing for Unix, so I just let it call
219 itself recursively. This uses the stack for saving everything that has to be
220 saved for a recursive call. On Unix, the stack can be large, and this works
221 fine.
222
223 It turns out that on some non-Unix-like systems there are problems with
224 programs that use a lot of stack. (This despite the fact that every last chip
225 has oodles of memory these days, and techniques for extending the stack have
226 been known for decades.) So....
227
228 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
229 calls by keeping local variables that need to be preserved in blocks of memory
230 obtained from malloc() instead of on the stack. Macros are used to
231 achieve this so that the actual code doesn't look very different to what it
232 always used to.
233
234 The original heap-recursive code used longjmp(). However, it seems that this
235 can be very slow on some operating systems. Following a suggestion from Stan
236 Switzer, the use of longjmp() has been abolished, at the cost of having to
237 provide a unique number for each call to RMATCH. There is no way of generating
238 a sequence of numbers at compile time in C. I have given them names, to make
239 them stand out more clearly.
240
241 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
242 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
243 tests. Furthermore, not using longjmp() means that local dynamic variables
244 don't have indeterminate values; this has meant that the frame size can be
245 reduced because the result can be "passed back" by straight setting of the
246 variable instead of being passed in the frame.
247 ****************************************************************************
248 ***************************************************************************/
249
250 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
251 below must be updated in sync. */
252
/* Each RMn names one RMATCH call site. The heap-based (NO_RECURSE) RMATCH
stores the value in frame->Xwhere and plants a label L_RMn at the call site as
the place to resume; the stack-based RMATCH ignores it. Numbering starts at 1,
so no call site has the value 0. */
253 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
254 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
255 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
256 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
257 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
258 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
259 RM61, RM62 };
260
261 /* These versions of the macros use the stack, as normal. There are debugging
262 versions and production versions. Note that the "rw" argument of RMATCH isn't
263 actually used in this definition. */
264
265 #ifndef NO_RECURSE
/* Stack-based build: REGISTER expands to the "register" keyword, and is used
on the eptr and ecode parameters of match(). */
266 #define REGISTER register
267
268 #ifdef PCRE_DEBUG
/* Debugging variants. RMATCH calls match() one level deeper (rdepth+1),
passing the caller's current mstart and markptr, leaves the result in rrc, and
prints the source line before and after the call. RRETURN prints the value and
source line and then returns. Both expand to braced blocks. */
269 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
270 { \
271 printf("match() called in line %d\n", __LINE__); \
272 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
273 printf("to line %d\n", __LINE__); \
274 }
275 #define RRETURN(ra) \
276 { \
277 printf("match() returned %d from line %d ", ra, __LINE__); \
278 return ra; \
279 }
280 #else
/* Production variants. RMATCH is a single assignment of the recursive call's
result to rrc (no enclosing braces); RRETURN is a plain return statement. */
281 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
282 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
283 #define RRETURN(ra) return ra
284 #endif
285
286 #else
287
288
289 /* These versions of the macros manage a private stack on the heap. Note that
290 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
291 argument of match(), which never changes. */
292
/* In the heap-based build REGISTER expands to nothing. */

#define REGISTER

/* RMATCH simulates a recursive call of match(). It obtains a new frame from
pcre_stack_malloc, records in the current frame which call site (rw) to resume
at, fills in the new frame's argument slots (the rd argument is not used),
links the new frame to the current one, makes it current and jumps to
HEAP_RECURSE. The label L_<rw> marks the point at which execution continues
once the simulated call has returned; the result is then in rrc.

If the frame cannot be allocated, the current incarnation gives up with
PCRE_ERROR_NOMEMORY by way of RRETURN, which releases the current frame and
passes the negative code to the calling frame (or out of match() if this is the
top-level frame), instead of writing through a NULL pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xmarkptr = markptr;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN simulates a return from match(). The current frame is released with
pcre_stack_free and its predecessor becomes current. If there is a predecessor,
the value is passed back in rrc and control goes to HEAP_RETURN; otherwise the
top-level frame has just been released and match() really returns. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
328
329
330 /* Structure for remembering the local variables in a private frame */
331
332 typedef struct heapframe {
333 struct heapframe *Xprevframe;
334
335 /* Function arguments that may change */
336
337 USPTR Xeptr;
338 const uschar *Xecode;
339 USPTR Xmstart;
340 USPTR Xmarkptr;
341 int Xoffset_top;
342 long int Xims;
343 eptrblock *Xeptrb;
344 int Xflags;
345 unsigned int Xrdepth;
346
347 /* Function local variables */
348
349 USPTR Xcallpat;
350 #ifdef SUPPORT_UTF8
351 USPTR Xcharptr;
352 #endif
353 USPTR Xdata;
354 USPTR Xnext;
355 USPTR Xpp;
356 USPTR Xprev;
357 USPTR Xsaved_eptr;
358
359 recursion_info Xnew_recursive;
360
361 BOOL Xcur_is_word;
362 BOOL Xcondition;
363 BOOL Xprev_is_word;
364
365 unsigned long int Xoriginal_ims;
366
367 #ifdef SUPPORT_UCP
368 int Xprop_type;
369 int Xprop_value;
370 int Xprop_fail_result;
371 int Xprop_category;
372 int Xprop_chartype;
373 int Xprop_script;
374 int Xoclength;
375 uschar Xocchars[8];
376 #endif
377
378 int Xcodelink;
379 int Xctype;
380 unsigned int Xfc;
381 int Xfi;
382 int Xlength;
383 int Xmax;
384 int Xmin;
385 int Xnumber;
386 int Xoffset;
387 int Xop;
388 int Xsave_capture_last;
389 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
390 int Xstacksave[REC_STACK_SAVE_MAX];
391
392 eptrblock Xnewptrb;
393
394 /* Where to jump back to */
395
396 int Xwhere;
397
398 } heapframe;
399
400 #endif
401
402
403 /***************************************************************************
404 ***************************************************************************/
405
406
407
408 /*************************************************
409 * Match from current position *
410 *************************************************/
411
412 /* This function is called recursively in many circumstances. Whenever it
413 returns a negative (error) response, the outer incarnation must also return the
414 same response. */
415
416 /* These macros pack up tests that are used for partial matching, and which
417 appear several times in the code. We set the "hit end" flag if the pointer is
418 at the end of the subject and also past the start of the subject (i.e.
419 something has been matched). For hard partial matching, we then return
420 immediately. The second one is used when we already know we are past the end of
421 the subject. */
422
/* CHECK_PARTIAL: if partial matching is enabled (md->partial != 0), eptr has
reached or passed md->end_subject, and eptr is beyond mstart, set md->hitend.
If in addition md->partial > 1, leave match() at once with PCRE_ERROR_PARTIAL
by way of MRRETURN. */
423 #define CHECK_PARTIAL()\
424 if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
425 {\
426 md->hitend = TRUE;\
427 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
428 }
429
/* SCHECK_PARTIAL: the same, but without the test of eptr against
md->end_subject. */
430 #define SCHECK_PARTIAL()\
431 if (md->partial != 0 && eptr > mstart)\
432 {\
433 md->hitend = TRUE;\
434 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
435 }
436
437
438 /* Performance note: It might be tempting to extract commonly used fields from
439 the md structure (e.g. utf8, end_subject) into individual variables to improve
440 performance. Tests using gcc on a SPARC disproved this; in the first case, it
441 made performance worse.
442
443 Arguments:
444 eptr pointer to current character in subject
445 ecode pointer to current position in compiled code
446 mstart pointer to the current match start position (can be modified
447 by encountering \K)
448 markptr pointer to the most recent MARK name, or NULL
449 offset_top current top pointer
450 md pointer to "static" info for the match
451 ims current /i, /m, and /s options
452 eptrb pointer to chain of blocks containing eptr at start of
453 brackets - for testing for empty matches
454 flags can contain
455 match_condassert - this is an assertion condition
456 match_cbegroup - this is the start of an unlimited repeat
457 group that can match an empty string
458 rdepth the recursion depth
459
460 Returns: MATCH_MATCH if matched ) these values are >= 0
461 MATCH_NOMATCH if failed to match )
462 a negative MATCH_xxx value for PRUNE, SKIP, etc
463 a negative PCRE_ERROR_xxx value if aborted by an error condition
464 (e.g. stopped by repeated call or recursion limit)
465 */
466
467 static int
468 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
469 const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
470 eptrblock *eptrb, int flags, unsigned int rdepth)
471 {
472 /* These variables do not need to be preserved over recursion in this function,
473 so they can be ordinary variables in all cases. Mark some of them with
474 "register" because they are used a lot in loops. */
475
476 register int rrc; /* Returns from recursive calls */
477 register int i; /* Used for loops not involving calls to RMATCH() */
478 register unsigned int c; /* Character values not kept over RMATCH() calls */
479 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
480
481 BOOL minimize, possessive; /* Quantifier options */
482 int condcode;
483
484 /* When recursion is not being used, all "local" variables that have to be
485 preserved over calls to RMATCH() are part of a "frame" which is obtained from
486 heap storage. Set up the top-level frame here; others are obtained from the
487 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
488
489 #ifdef NO_RECURSE
490 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
491 frame->Xprevframe = NULL; /* Marks the top level */
492
493 /* Copy in the original argument variables */
494
495 frame->Xeptr = eptr;
496 frame->Xecode = ecode;
497 frame->Xmstart = mstart;
498 frame->Xmarkptr = markptr;
499 frame->Xoffset_top = offset_top;
500 frame->Xims = ims;
501 frame->Xeptrb = eptrb;
502 frame->Xflags = flags;
503 frame->Xrdepth = rdepth;
504
505 /* This is where control jumps back to in order to effect "recursion" */
506
507 HEAP_RECURSE:
508
509 /* Macros make the argument variables come from the current frame */
510
511 #define eptr frame->Xeptr
512 #define ecode frame->Xecode
513 #define mstart frame->Xmstart
514 #define markptr frame->Xmarkptr
515 #define offset_top frame->Xoffset_top
516 #define ims frame->Xims
517 #define eptrb frame->Xeptrb
518 #define flags frame->Xflags
519 #define rdepth frame->Xrdepth
520
521 /* Ditto for the local variables */
522
523 #ifdef SUPPORT_UTF8
524 #define charptr frame->Xcharptr
525 #endif
526 #define callpat frame->Xcallpat
527 #define codelink frame->Xcodelink
528 #define data frame->Xdata
529 #define next frame->Xnext
530 #define pp frame->Xpp
531 #define prev frame->Xprev
532 #define saved_eptr frame->Xsaved_eptr
533
534 #define new_recursive frame->Xnew_recursive
535
536 #define cur_is_word frame->Xcur_is_word
537 #define condition frame->Xcondition
538 #define prev_is_word frame->Xprev_is_word
539
540 #define original_ims frame->Xoriginal_ims
541
542 #ifdef SUPPORT_UCP
543 #define prop_type frame->Xprop_type
544 #define prop_value frame->Xprop_value
545 #define prop_fail_result frame->Xprop_fail_result
546 #define prop_category frame->Xprop_category
547 #define prop_chartype frame->Xprop_chartype
548 #define prop_script frame->Xprop_script
549 #define oclength frame->Xoclength
550 #define occhars frame->Xocchars
551 #endif
552
553 #define ctype frame->Xctype
554 #define fc frame->Xfc
555 #define fi frame->Xfi
556 #define length frame->Xlength
557 #define max frame->Xmax
558 #define min frame->Xmin
559 #define number frame->Xnumber
560 #define offset frame->Xoffset
561 #define op frame->Xop
562 #define save_capture_last frame->Xsave_capture_last
563 #define save_offset1 frame->Xsave_offset1
564 #define save_offset2 frame->Xsave_offset2
565 #define save_offset3 frame->Xsave_offset3
566 #define stacksave frame->Xstacksave
567
568 #define newptrb frame->Xnewptrb
569
570 /* When recursion is being used, local variables are allocated on the stack and
571 get preserved during recursion in the normal way. In this environment, fi and
572 i, and fc and c, can be the same variables. */
573
574 #else /* NO_RECURSE not defined */
575 #define fi i
576 #define fc c
577
578
579 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
580 const uschar *charptr; /* in small blocks of the code. My normal */
581 #endif /* style of coding would have declared */
582 const uschar *callpat; /* them within each of those blocks. */
583 const uschar *data; /* However, in order to accommodate the */
584 const uschar *next; /* version of this code that uses an */
585 USPTR pp; /* external "stack" implemented on the */
586 const uschar *prev; /* heap, it is easier to declare them all */
587 USPTR saved_eptr; /* here, so the declarations can be cut */
588 /* out in a block. The only declarations */
589 recursion_info new_recursive; /* within blocks below are for variables */
590 /* that do not have to be preserved over */
591 BOOL cur_is_word; /* a recursive call to RMATCH(). */
592 BOOL condition;
593 BOOL prev_is_word;
594
595 unsigned long int original_ims;
596
597 #ifdef SUPPORT_UCP
598 int prop_type;
599 int prop_value;
600 int prop_fail_result;
601 int prop_category;
602 int prop_chartype;
603 int prop_script;
604 int oclength;
605 uschar occhars[8];
606 #endif
607
608 int codelink;
609 int ctype;
610 int length;
611 int max;
612 int min;
613 int number;
614 int offset;
615 int op;
616 int save_capture_last;
617 int save_offset1, save_offset2, save_offset3;
618 int stacksave[REC_STACK_SAVE_MAX];
619
620 eptrblock newptrb;
621 #endif /* NO_RECURSE */
622
623 /* These statements are here to stop the compiler complaining about uninitialized
624 variables. */
625
626 #ifdef SUPPORT_UCP
627 prop_value = 0;
628 prop_fail_result = 0;
629 #endif
630
631
632 /* This label is used for tail recursion, which is used in a few cases even
633 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
634 used. Thanks to Ian Taylor for noticing this possibility and sending the
635 original patch. */
636
637 TAIL_RECURSE:
638
639 /* OK, now we can get on with the real code of the function. Recursive calls
640 are specified by the macro RMATCH and RRETURN is used to return. When
641 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
642 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
643 defined). However, RMATCH isn't like a function call because it's quite a
644 complicated macro. It has to be used in one particular way. This shouldn't,
645 however, impact performance when true recursion is being used. */
646
647 #ifdef SUPPORT_UTF8
648 utf8 = md->utf8; /* Local copy of the flag */
649 #else
650 utf8 = FALSE;
651 #endif
652
653 /* First check that we haven't called match() too many times, or that we
654 haven't exceeded the recursive call limit. */
655
656 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
657 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
658
659 original_ims = ims; /* Save for resetting on ')' */
660
661 /* At the start of a group with an unlimited repeat that may match an empty
662 string, the match_cbegroup flag is set. When this is the case, add the current
663 subject pointer to the chain of such remembered pointers, to be checked when we
664 hit the closing ket, in order to break infinite loops that match no characters.
665 When match() is called in other circumstances, don't add to the chain. The
666 match_cbegroup flag must NOT be used with tail recursion, because the memory
667 block that is used is on the stack, so a new one may be required for each
668 match(). */
669
670 if ((flags & match_cbegroup) != 0)
671 {
672 newptrb.epb_saved_eptr = eptr;
673 newptrb.epb_prev = eptrb;
674 eptrb = &newptrb;
675 }
676
677 /* Now start processing the opcodes. */
678
679 for (;;)
680 {
681 minimize = possessive = FALSE;
682 op = *ecode;
683
684 switch(op)
685 {
686 case OP_MARK:
687 markptr = ecode + 2;
688 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
689 ims, eptrb, flags, RM55);
690
691 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
692 argument, and we must check whether that argument matches this MARK's
693 argument. It is passed back in md->start_match_ptr (an overloading of that
694 variable). If it does match, we reset that variable to the current subject
695 position and return MATCH_SKIP. Otherwise, pass back the return code
696 unaltered. */
697
698 if (rrc == MATCH_SKIP_ARG &&
699 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
700 {
701 md->start_match_ptr = eptr;
702 RRETURN(MATCH_SKIP);
703 }
704
705 if (md->mark == NULL) md->mark = markptr;
706 RRETURN(rrc);
707
708 case OP_FAIL:
709 MRRETURN(MATCH_NOMATCH);
710
711 case OP_COMMIT:
712 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
713 ims, eptrb, flags, RM52);
714 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
715 MRRETURN(MATCH_COMMIT);
716
717 case OP_PRUNE:
718 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
719 ims, eptrb, flags, RM51);
720 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
721 MRRETURN(MATCH_PRUNE);
722
723 case OP_PRUNE_ARG:
724 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
725 ims, eptrb, flags, RM56);
726 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
727 md->mark = ecode + 2;
728 RRETURN(MATCH_PRUNE);
729
730 case OP_SKIP:
731 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 ims, eptrb, flags, RM53);
733 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
734 md->start_match_ptr = eptr; /* Pass back current position */
735 MRRETURN(MATCH_SKIP);
736
737 case OP_SKIP_ARG:
738 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
739 ims, eptrb, flags, RM57);
740 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
741
742 /* Pass back the current skip name by overloading md->start_match_ptr and
743 returning the special MATCH_SKIP_ARG return code. This will either be
744 caught by a matching MARK, or get to the top, where it is treated the same
745 as PRUNE. */
746
747 md->start_match_ptr = ecode + 2;
748 RRETURN(MATCH_SKIP_ARG);
749
750 case OP_THEN:
751 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
752 ims, eptrb, flags, RM54);
753 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
754 MRRETURN(MATCH_THEN);
755
756 case OP_THEN_ARG:
757 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
758 ims, eptrb, flags, RM58);
759 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
760 md->mark = ecode + 2;
761 RRETURN(MATCH_THEN);
762
763 /* Handle a capturing bracket. If there is space in the offset vector, save
764 the current subject position in the working slot at the top of the vector.
765 We mustn't change the current values of the data slot, because they may be
766 set from a previous iteration of this group, and be referred to by a
767 reference inside the group.
768
769 If the bracket fails to match, we need to restore this value and also the
770 values of the final offsets, in case they were set by a previous iteration
771 of the same bracket.
772
773 If there isn't enough space in the offset vector, treat this as if it were
774 a non-capturing bracket. Don't worry about setting the flag for the error
775 case here; that is handled in the code for KET. */
776
777 case OP_CBRA:
778 case OP_SCBRA:
779 number = GET2(ecode, 1+LINK_SIZE);
780 offset = number << 1;
781
782 #ifdef PCRE_DEBUG
783 printf("start bracket %d\n", number);
784 printf("subject=");
785 pchars(eptr, 16, TRUE, md);
786 printf("\n");
787 #endif
788
789 if (offset < md->offset_max)
790 {
791 save_offset1 = md->offset_vector[offset];
792 save_offset2 = md->offset_vector[offset+1];
793 save_offset3 = md->offset_vector[md->offset_end - number];
794 save_capture_last = md->capture_last;
795
796 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
797 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
798
799 flags = (op == OP_SCBRA)? match_cbegroup : 0;
800 do
801 {
802 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
803 ims, eptrb, flags, RM1);
804 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
805 md->capture_last = save_capture_last;
806 ecode += GET(ecode, 1);
807 }
808 while (*ecode == OP_ALT);
809
810 DPRINTF(("bracket %d failed\n", number));
811
812 md->offset_vector[offset] = save_offset1;
813 md->offset_vector[offset+1] = save_offset2;
814 md->offset_vector[md->offset_end - number] = save_offset3;
815
816 if (rrc != MATCH_THEN) md->mark = markptr;
817 RRETURN(MATCH_NOMATCH);
818 }
819
820 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
821 as a non-capturing bracket. */
822
823 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
824 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
825
826 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
827
828 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
829 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
830
831 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
832 final alternative within the brackets, we would return the result of a
833 recursive call to match() whatever happened. We can reduce stack usage by
834 turning this into a tail recursion, except in the case when match_cbegroup
835 is set.*/
836
837 case OP_BRA:
838 case OP_SBRA:
839 DPRINTF(("start non-capturing bracket\n"));
840 flags = (op >= OP_SBRA)? match_cbegroup : 0;
841 for (;;)
842 {
843 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
844 {
845 if (flags == 0) /* Not a possibly empty group */
846 {
847 ecode += _pcre_OP_lengths[*ecode];
848 DPRINTF(("bracket 0 tail recursion\n"));
849 goto TAIL_RECURSE;
850 }
851
852 /* Possibly empty group; can't use tail recursion. */
853
854 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
855 eptrb, flags, RM48);
856 if (rrc == MATCH_NOMATCH) md->mark = markptr;
857 RRETURN(rrc);
858 }
859
860 /* For non-final alternatives, continue the loop for a NOMATCH result;
861 otherwise return. */
862
863 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
864 eptrb, flags, RM2);
865 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
866 ecode += GET(ecode, 1);
867 }
868 /* Control never reaches here. */
869
870 /* Conditional group: compilation checked that there are no more than
871 two branches. If the condition is false, skipping the first branch takes us
872 past the end if there is only one branch, but that's OK because that is
873 exactly what going to the ket would do. As there is only one branch to be
874 obeyed, we can use tail recursion to avoid using another stack frame. */
875
876 case OP_COND:
877 case OP_SCOND:
878 codelink= GET(ecode, 1);
879
880 /* Because of the way auto-callout works during compile, a callout item is
881 inserted between OP_COND and an assertion condition. */
882
883 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
884 {
885 if (pcre_callout != NULL)
886 {
887 pcre_callout_block cb;
888 cb.version = 1; /* Version 1 of the callout block */
889 cb.callout_number = ecode[LINK_SIZE+2];
890 cb.offset_vector = md->offset_vector;
891 cb.subject = (PCRE_SPTR)md->start_subject;
892 cb.subject_length = md->end_subject - md->start_subject;
893 cb.start_match = mstart - md->start_subject;
894 cb.current_position = eptr - md->start_subject;
895 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
896 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
897 cb.capture_top = offset_top/2;
898 cb.capture_last = md->capture_last;
899 cb.callout_data = md->callout_data;
900 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
901 if (rrc < 0) RRETURN(rrc);
902 }
903 ecode += _pcre_OP_lengths[OP_CALLOUT];
904 }
905
906 condcode = ecode[LINK_SIZE+1];
907
908 /* Now see what the actual condition is */
909
910 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
911 {
912 if (md->recursive == NULL) /* Not recursing => FALSE */
913 {
914 condition = FALSE;
915 ecode += GET(ecode, 1);
916 }
917 else
918 {
919 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
920 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
921
922 /* If the test is for recursion into a specific subpattern, and it is
923 false, but the test was set up by name, scan the table to see if the
924 name refers to any other numbers, and test them. The condition is true
925 if any one is set. */
926
927 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
928 {
929 uschar *slotA = md->name_table;
930 for (i = 0; i < md->name_count; i++)
931 {
932 if (GET2(slotA, 0) == recno) break;
933 slotA += md->name_entry_size;
934 }
935
936 /* Found a name for the number - there can be only one; duplicate
937 names for different numbers are allowed, but not vice versa. First
938 scan down for duplicates. */
939
940 if (i < md->name_count)
941 {
942 uschar *slotB = slotA;
943 while (slotB > md->name_table)
944 {
945 slotB -= md->name_entry_size;
946 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
947 {
948 condition = GET2(slotB, 0) == md->recursive->group_num;
949 if (condition) break;
950 }
951 else break;
952 }
953
954 /* Scan up for duplicates */
955
956 if (!condition)
957 {
958 slotB = slotA;
959 for (i++; i < md->name_count; i++)
960 {
961 slotB += md->name_entry_size;
962 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
963 {
964 condition = GET2(slotB, 0) == md->recursive->group_num;
965 if (condition) break;
966 }
967 else break;
968 }
969 }
970 }
971 }
972
973 /* Choose branch according to the condition */
974
975 ecode += condition? 3 : GET(ecode, 1);
976 }
977 }
978
979 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
980 {
981 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
982 condition = offset < offset_top && md->offset_vector[offset] >= 0;
983
984 /* If the numbered capture is unset, but the reference was by name,
985 scan the table to see if the name refers to any other numbers, and test
986 them. The condition is true if any one is set. This is tediously similar
987 to the code above, but not close enough to try to amalgamate. */
988
989 if (!condition && condcode == OP_NCREF)
990 {
991 int refno = offset >> 1;
992 uschar *slotA = md->name_table;
993
994 for (i = 0; i < md->name_count; i++)
995 {
996 if (GET2(slotA, 0) == refno) break;
997 slotA += md->name_entry_size;
998 }
999
1000 /* Found a name for the number - there can be only one; duplicate names
1001 for different numbers are allowed, but not vice versa. First scan down
1002 for duplicates. */
1003
1004 if (i < md->name_count)
1005 {
1006 uschar *slotB = slotA;
1007 while (slotB > md->name_table)
1008 {
1009 slotB -= md->name_entry_size;
1010 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1011 {
1012 offset = GET2(slotB, 0) << 1;
1013 condition = offset < offset_top &&
1014 md->offset_vector[offset] >= 0;
1015 if (condition) break;
1016 }
1017 else break;
1018 }
1019
1020 /* Scan up for duplicates */
1021
1022 if (!condition)
1023 {
1024 slotB = slotA;
1025 for (i++; i < md->name_count; i++)
1026 {
1027 slotB += md->name_entry_size;
1028 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1029 {
1030 offset = GET2(slotB, 0) << 1;
1031 condition = offset < offset_top &&
1032 md->offset_vector[offset] >= 0;
1033 if (condition) break;
1034 }
1035 else break;
1036 }
1037 }
1038 }
1039 }
1040
1041 /* Choose branch according to the condition */
1042
1043 ecode += condition? 3 : GET(ecode, 1);
1044 }
1045
1046 else if (condcode == OP_DEF) /* DEFINE - always false */
1047 {
1048 condition = FALSE;
1049 ecode += GET(ecode, 1);
1050 }
1051
1052 /* The condition is an assertion. Call match() to evaluate it - passing
1053 match_condassert in the flags argument causes it to stop at the end of an
1054 assertion. */
1055
1056 else
1057 {
1058 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
1059 match_condassert, RM3);
1060 if (rrc == MATCH_MATCH)
1061 {
1062 condition = TRUE;
1063 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1064 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1065 }
1066 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1067 {
1068 RRETURN(rrc); /* Need braces because of following else */
1069 }
1070 else
1071 {
1072 condition = FALSE;
1073 ecode += codelink;
1074 }
1075 }
1076
1077 /* We are now at the branch that is to be obeyed. As there is only one,
1078 we can use tail recursion to avoid using another stack frame, except when
1079 match_cbegroup is required for an unlimited repeat of a possibly empty
1080 group. If the second alternative doesn't exist, we can just plough on. */
1081
1082 if (condition || *ecode == OP_ALT)
1083 {
1084 ecode += 1 + LINK_SIZE;
1085 if (op == OP_SCOND) /* Possibly empty group */
1086 {
1087 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1088 RRETURN(rrc);
1089 }
1090 else /* Group must match something */
1091 {
1092 flags = 0;
1093 goto TAIL_RECURSE;
1094 }
1095 }
1096 else /* Condition false & no alternative */
1097 {
1098 ecode += 1 + LINK_SIZE;
1099 }
1100 break;
1101
1102
1103 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1104 to close any currently open capturing brackets. */
1105
1106 case OP_CLOSE:
1107 number = GET2(ecode, 1);
1108 offset = number << 1;
1109
1110 #ifdef PCRE_DEBUG
1111 printf("end bracket %d at *ACCEPT", number);
1112 printf("\n");
1113 #endif
1114
1115 md->capture_last = number;
1116 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1117 {
1118 md->offset_vector[offset] =
1119 md->offset_vector[md->offset_end - number];
1120 md->offset_vector[offset+1] = eptr - md->start_subject;
1121 if (offset_top <= offset) offset_top = offset + 2;
1122 }
1123 ecode += 3;
1124 break;
1125
1126
1127 /* End of the pattern, either real or forced. If we are in a top-level
1128 recursion, we should restore the offsets appropriately and continue from
1129 after the call. */
1130
1131 case OP_ACCEPT:
1132 case OP_END:
1133 if (md->recursive != NULL && md->recursive->group_num == 0)
1134 {
1135 recursion_info *rec = md->recursive;
1136 DPRINTF(("End of pattern in a (?0) recursion\n"));
1137 md->recursive = rec->prevrec;
1138 memmove(md->offset_vector, rec->offset_save,
1139 rec->saved_max * sizeof(int));
1140 offset_top = rec->save_offset_top;
1141 ims = original_ims;
1142 ecode = rec->after_call;
1143 break;
1144 }
1145
1146 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1147 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1148 the subject. In both cases, backtracking will then try other alternatives,
1149 if any. */
1150
1151 if (eptr == mstart &&
1152 (md->notempty ||
1153 (md->notempty_atstart &&
1154 mstart == md->start_subject + md->start_offset)))
1155 MRRETURN(MATCH_NOMATCH);
1156
1157 /* Otherwise, we have a match. */
1158
1159 md->end_match_ptr = eptr; /* Record where we ended */
1160 md->end_offset_top = offset_top; /* and how many extracts were taken */
1161 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1162
1163 /* For some reason, the macros don't work properly if an expression is
1164 given as the argument to MRRETURN when the heap is in use. */
1165
1166 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1167 MRRETURN(rrc);
1168
1169 /* Change option settings */
1170
1171 case OP_OPT:
1172 ims = ecode[1];
1173 ecode += 2;
1174 DPRINTF(("ims set to %02lx\n", ims));
1175 break;
1176
1177 /* Assertion brackets. Check the alternative branches in turn - the
1178 matching won't pass the KET for an assertion. If any one branch matches,
1179 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1180 start of each branch to move the current point backwards, so the code at
1181 this level is identical to the lookahead case. */
1182
1183 case OP_ASSERT:
1184 case OP_ASSERTBACK:
1185 do
1186 {
1187 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1188 RM4);
1189 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1190 {
1191 mstart = md->start_match_ptr; /* In case \K reset it */
1192 break;
1193 }
1194 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1195 ecode += GET(ecode, 1);
1196 }
1197 while (*ecode == OP_ALT);
1198 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1199
1200 /* If checking an assertion for a condition, return MATCH_MATCH. */
1201
1202 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1203
1204 /* Continue from after the assertion, updating the offsets high water
1205 mark, since extracts may have been taken during the assertion. */
1206
1207 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1208 ecode += 1 + LINK_SIZE;
1209 offset_top = md->end_offset_top;
1210 continue;
1211
1212 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1213 PRUNE, or COMMIT means we must assume failure without checking subsequent
1214 branches. */
1215
1216 case OP_ASSERT_NOT:
1217 case OP_ASSERTBACK_NOT:
1218 do
1219 {
1220 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1221 RM5);
1222 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1223 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1224 {
1225 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1226 break;
1227 }
1228 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1229 ecode += GET(ecode,1);
1230 }
1231 while (*ecode == OP_ALT);
1232
1233 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1234
1235 ecode += 1 + LINK_SIZE;
1236 continue;
1237
1238 /* Move the subject pointer back. This occurs only at the start of
1239 each branch of a lookbehind assertion. If we are too close to the start to
1240 move back, this match function fails. When working with UTF-8 we move
1241 back a number of characters, not bytes. */
1242
1243 case OP_REVERSE:
1244 #ifdef SUPPORT_UTF8
1245 if (utf8)
1246 {
1247 i = GET(ecode, 1);
1248 while (i-- > 0)
1249 {
1250 eptr--;
1251 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1252 BACKCHAR(eptr);
1253 }
1254 }
1255 else
1256 #endif
1257
1258 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1259
1260 {
1261 eptr -= GET(ecode, 1);
1262 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1263 }
1264
1265 /* Save the earliest consulted character, then skip to next op code */
1266
1267 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1268 ecode += 1 + LINK_SIZE;
1269 break;
1270
1271 /* The callout item calls an external function, if one is provided, passing
1272 details of the match so far. This is mainly for debugging, though the
1273 function is able to force a failure. */
1274
1275 case OP_CALLOUT:
1276 if (pcre_callout != NULL)
1277 {
1278 pcre_callout_block cb;
1279 cb.version = 1; /* Version 1 of the callout block */
1280 cb.callout_number = ecode[1];
1281 cb.offset_vector = md->offset_vector;
1282 cb.subject = (PCRE_SPTR)md->start_subject;
1283 cb.subject_length = md->end_subject - md->start_subject;
1284 cb.start_match = mstart - md->start_subject;
1285 cb.current_position = eptr - md->start_subject;
1286 cb.pattern_position = GET(ecode, 2);
1287 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1288 cb.capture_top = offset_top/2;
1289 cb.capture_last = md->capture_last;
1290 cb.callout_data = md->callout_data;
1291 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1292 if (rrc < 0) RRETURN(rrc);
1293 }
1294 ecode += 2 + 2*LINK_SIZE;
1295 break;
1296
1297 /* Recursion either matches the current regex, or some subexpression. The
1298 offset data is the offset to the starting bracket from the start of the
1299 whole pattern. (This is so that it works from duplicated subpatterns.)
1300
1301 If there are any capturing brackets started but not finished, we have to
1302 save their starting points and reinstate them after the recursion. However,
1303 we don't know how many such there are (offset_top records the completed
1304 total) so we just have to save all the potential data. There may be up to
1305 65535 such values, which is too large to put on the stack, but using malloc
1306 for small numbers seems expensive. As a compromise, the stack is used when
1307 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1308 is used. A problem is what to do if the malloc fails ... there is no way of
1309 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
1310 values on the stack, and accept that the rest may be wrong.
1311
1312 There are also other values that have to be saved. We use a chained
1313 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1314 for the original version of this logic. */
1315
1316 case OP_RECURSE:
1317 {
1318 callpat = md->start_code + GET(ecode, 1);
1319 new_recursive.group_num = (callpat == md->start_code)? 0 :
1320 GET2(callpat, 1 + LINK_SIZE);
1321
1322 /* Add to "recursing stack" */
1323
1324 new_recursive.prevrec = md->recursive;
1325 md->recursive = &new_recursive;
1326
1327 /* Find where to continue from afterwards */
1328
1329 ecode += 1 + LINK_SIZE;
1330 new_recursive.after_call = ecode;
1331
1332 /* Now save the offset data. */
1333
1334 new_recursive.saved_max = md->offset_end;
1335 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1336 new_recursive.offset_save = stacksave;
1337 else
1338 {
1339 new_recursive.offset_save =
1340 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1341 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1342 }
1343
1344 memcpy(new_recursive.offset_save, md->offset_vector,
1345 new_recursive.saved_max * sizeof(int));
1346 new_recursive.save_offset_top = offset_top;
1347
1348 /* OK, now we can do the recursion. For each top-level alternative we
1349 restore the offset and recursion data. */
1350
1351 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1352 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1353 do
1354 {
1355 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1356 md, ims, eptrb, flags, RM6);
1357 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1358 {
1359 DPRINTF(("Recursion matched\n"));
1360 md->recursive = new_recursive.prevrec;
1361 if (new_recursive.offset_save != stacksave)
1362 (pcre_free)(new_recursive.offset_save);
1363 MRRETURN(MATCH_MATCH);
1364 }
1365 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1366 {
1367 DPRINTF(("Recursion gave error %d\n", rrc));
1368 if (new_recursive.offset_save != stacksave)
1369 (pcre_free)(new_recursive.offset_save);
1370 RRETURN(rrc);
1371 }
1372
1373 md->recursive = &new_recursive;
1374 memcpy(md->offset_vector, new_recursive.offset_save,
1375 new_recursive.saved_max * sizeof(int));
1376 callpat += GET(callpat, 1);
1377 }
1378 while (*callpat == OP_ALT);
1379
1380 DPRINTF(("Recursion didn't match\n"));
1381 md->recursive = new_recursive.prevrec;
1382 if (new_recursive.offset_save != stacksave)
1383 (pcre_free)(new_recursive.offset_save);
1384 MRRETURN(MATCH_NOMATCH);
1385 }
1386 /* Control never reaches here */
1387
1388 /* "Once" brackets are like assertion brackets except that after a match,
1389 the point in the subject string is not moved back. Thus there can never be
1390 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1391 Check the alternative branches in turn - the matching won't pass the KET
1392 for this kind of subpattern. If any one branch matches, we carry on as at
1393 the end of a normal bracket, leaving the subject pointer, but resetting
1394 the start-of-match value in case it was changed by \K. */
1395
1396 case OP_ONCE:
1397 prev = ecode;
1398 saved_eptr = eptr;
1399
1400 do
1401 {
1402 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1403 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1404 {
1405 mstart = md->start_match_ptr;
1406 break;
1407 }
1408 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1409 ecode += GET(ecode,1);
1410 }
1411 while (*ecode == OP_ALT);
1412
1413 /* If hit the end of the group (which could be repeated), fail */
1414
1415 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1416
1417 /* Continue as from after the assertion, updating the offsets high water
1418 mark, since extracts may have been taken. */
1419
1420 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1421
1422 offset_top = md->end_offset_top;
1423 eptr = md->end_match_ptr;
1424
1425 /* For a non-repeating ket, just continue at this level. This also
1426 happens for a repeating ket if no characters were matched in the group.
1427 This is the forcible breaking of infinite loops as implemented in Perl
1428 5.005. If there is an options reset, it will get obeyed in the normal
1429 course of events. */
1430
1431 if (*ecode == OP_KET || eptr == saved_eptr)
1432 {
1433 ecode += 1+LINK_SIZE;
1434 break;
1435 }
1436
1437 /* The repeating kets try the rest of the pattern or restart from the
1438 preceding bracket, in the appropriate order. The second "call" of match()
1439 uses tail recursion, to avoid using another stack frame. We need to reset
1440 any options that changed within the bracket before re-running it, so
1441 check the next opcode. */
1442
1443 if (ecode[1+LINK_SIZE] == OP_OPT)
1444 {
1445 ims = (ims & ~PCRE_IMS) | ecode[4];
1446 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1447 }
1448
1449 if (*ecode == OP_KETRMIN)
1450 {
1451 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1452 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1453 ecode = prev;
1454 flags = 0;
1455 goto TAIL_RECURSE;
1456 }
1457 else /* OP_KETRMAX */
1458 {
1459 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1460 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1461 ecode += 1 + LINK_SIZE;
1462 flags = 0;
1463 goto TAIL_RECURSE;
1464 }
1465 /* Control never gets here */
1466
1467 /* An alternation is the end of a branch; scan along to find the end of the
1468 bracketed group and go to there. */
1469
1470 case OP_ALT:
1471 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1472 break;
1473
1474 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1475 indicating that it may occur zero times. It may repeat infinitely, or not
1476 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1477 with fixed upper repeat limits are compiled as a number of copies, with the
1478 optional ones preceded by BRAZERO or BRAMINZERO. */
1479
1480 case OP_BRAZERO:
1481 {
1482 next = ecode+1;
1483 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1484 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1485 do next += GET(next,1); while (*next == OP_ALT);
1486 ecode = next + 1 + LINK_SIZE;
1487 }
1488 break;
1489
1490 case OP_BRAMINZERO:
1491 {
1492 next = ecode+1;
1493 do next += GET(next, 1); while (*next == OP_ALT);
1494 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1495 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1496 ecode++;
1497 }
1498 break;
1499
1500 case OP_SKIPZERO:
1501 {
1502 next = ecode+1;
1503 do next += GET(next,1); while (*next == OP_ALT);
1504 ecode = next + 1 + LINK_SIZE;
1505 }
1506 break;
1507
1508 /* End of a group, repeated or non-repeating. */
1509
1510 case OP_KET:
1511 case OP_KETRMIN:
1512 case OP_KETRMAX:
1513 prev = ecode - GET(ecode, 1);
1514
1515 /* If this was a group that remembered the subject start, in order to break
1516 infinite repeats of empty string matches, retrieve the subject start from
1517 the chain. Otherwise, set it NULL. */
1518
1519 if (*prev >= OP_SBRA)
1520 {
1521 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1522 eptrb = eptrb->epb_prev; /* Backup to previous group */
1523 }
1524 else saved_eptr = NULL;
1525
1526 /* If we are at the end of an assertion group or an atomic group, stop
1527 matching and return MATCH_MATCH, but record the current high water mark for
1528 use by positive assertions. We also need to record the match start in case
1529 it was changed by \K. */
1530
1531 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1532 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1533 *prev == OP_ONCE)
1534 {
1535 md->end_match_ptr = eptr; /* For ONCE */
1536 md->end_offset_top = offset_top;
1537 md->start_match_ptr = mstart;
1538 MRRETURN(MATCH_MATCH);
1539 }
1540
1541 /* For capturing groups we have to check the group number back at the start
1542 and if necessary complete handling an extraction by setting the offsets and
1543 bumping the high water mark. Note that whole-pattern recursion is coded as
1544 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1545 when the OP_END is reached. Other recursion is handled here. */
1546
1547 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1548 {
1549 number = GET2(prev, 1+LINK_SIZE);
1550 offset = number << 1;
1551
1552 #ifdef PCRE_DEBUG
1553 printf("end bracket %d", number);
1554 printf("\n");
1555 #endif
1556
1557 md->capture_last = number;
1558 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1559 {
1560 md->offset_vector[offset] =
1561 md->offset_vector[md->offset_end - number];
1562 md->offset_vector[offset+1] = eptr - md->start_subject;
1563 if (offset_top <= offset) offset_top = offset + 2;
1564 }
1565
1566 /* Handle a recursively called group. Restore the offsets
1567 appropriately and continue from after the call. */
1568
1569 if (md->recursive != NULL && md->recursive->group_num == number)
1570 {
1571 recursion_info *rec = md->recursive;
1572 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1573 md->recursive = rec->prevrec;
1574 memcpy(md->offset_vector, rec->offset_save,
1575 rec->saved_max * sizeof(int));
1576 offset_top = rec->save_offset_top;
1577 ecode = rec->after_call;
1578 ims = original_ims;
1579 break;
1580 }
1581 }
1582
1583 /* For both capturing and non-capturing groups, reset the value of the ims
1584 flags, in case they got changed during the group. */
1585
1586 ims = original_ims;
1587 DPRINTF(("ims reset to %02lx\n", ims));
1588
1589 /* For a non-repeating ket, just continue at this level. This also
1590 happens for a repeating ket if no characters were matched in the group.
1591 This is the forcible breaking of infinite loops as implemented in Perl
1592 5.005. If there is an options reset, it will get obeyed in the normal
1593 course of events. */
1594
1595 if (*ecode == OP_KET || eptr == saved_eptr)
1596 {
1597 ecode += 1 + LINK_SIZE;
1598 break;
1599 }
1600
1601 /* The repeating kets try the rest of the pattern or restart from the
1602 preceding bracket, in the appropriate order. In the second case, we can use
1603 tail recursion to avoid using another stack frame, unless we have an
1604 unlimited repeat of a group that can match an empty string. */
1605
1606 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1607
1608 if (*ecode == OP_KETRMIN)
1609 {
1610 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1611 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1612 if (flags != 0) /* Could match an empty string */
1613 {
1614 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1615 RRETURN(rrc);
1616 }
1617 ecode = prev;
1618 goto TAIL_RECURSE;
1619 }
1620 else /* OP_KETRMAX */
1621 {
1622 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1623 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1624 ecode += 1 + LINK_SIZE;
1625 flags = 0;
1626 goto TAIL_RECURSE;
1627 }
1628 /* Control never gets here */
1629
1630 /* Start of subject unless notbol, or after internal newline if multiline */
1631
1632 case OP_CIRC:
1633 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1634 if ((ims & PCRE_MULTILINE) != 0)
1635 {
1636 if (eptr != md->start_subject &&
1637 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1638 MRRETURN(MATCH_NOMATCH);
1639 ecode++;
1640 break;
1641 }
1642 /* ... else fall through */
1643
1644 /* Start of subject assertion */
1645
1646 case OP_SOD:
1647 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1648 ecode++;
1649 break;
1650
1651 /* Start of match assertion */
1652
1653 case OP_SOM:
1654 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1655 ecode++;
1656 break;
1657
1658 /* Reset the start of match point */
1659
1660 case OP_SET_SOM:
1661 mstart = eptr;
1662 ecode++;
1663 break;
1664
1665 /* Assert before internal newline if multiline, or before a terminating
1666 newline unless endonly is set, else end of subject unless noteol is set. */
1667
1668 case OP_DOLL:
1669 if ((ims & PCRE_MULTILINE) != 0)
1670 {
1671 if (eptr < md->end_subject)
1672 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1673 else
1674 { if (md->noteol) MRRETURN(MATCH_NOMATCH); }
1675 ecode++;
1676 break;
1677 }
1678 else
1679 {
1680 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1681 if (!md->endonly)
1682 {
1683 if (eptr != md->end_subject &&
1684 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1685 MRRETURN(MATCH_NOMATCH);
1686 ecode++;
1687 break;
1688 }
1689 }
1690 /* ... else fall through for endonly */
1691
1692 /* End of subject assertion (\z) */
1693
1694 case OP_EOD:
1695 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1696 ecode++;
1697 break;
1698
1699 /* End of subject or ending \n assertion (\Z) */
1700
1701 case OP_EODN:
1702 if (eptr != md->end_subject &&
1703 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1704 MRRETURN(MATCH_NOMATCH);
1705 ecode++;
1706 break;
1707
1708 /* Word boundary assertions */
1709
1710 case OP_NOT_WORD_BOUNDARY:
1711 case OP_WORD_BOUNDARY:
1712 {
1713
1714 /* Find out if the previous and current characters are "word" characters.
1715 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1716 be "non-word" characters. Remember the earliest consulted character for
1717 partial matching. */
1718
1719 #ifdef SUPPORT_UTF8
1720 if (utf8)
1721 {
1722 /* Get status of previous character */
1723
1724 if (eptr == md->start_subject) prev_is_word = FALSE; else
1725 {
1726 USPTR lastptr = eptr - 1;
1727 while((*lastptr & 0xc0) == 0x80) lastptr--;
1728 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1729 GETCHAR(c, lastptr);
1730 #ifdef SUPPORT_UCP
1731 if (md->use_ucp)
1732 {
1733 if (c == '_') prev_is_word = TRUE; else
1734 {
1735 int cat = UCD_CATEGORY(c);
1736 prev_is_word = (cat == ucp_L || cat == ucp_N);
1737 }
1738 }
1739 else
1740 #endif
1741 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1742 }
1743
1744 /* Get status of next character */
1745
1746 if (eptr >= md->end_subject)
1747 {
1748 SCHECK_PARTIAL();
1749 cur_is_word = FALSE;
1750 }
1751 else
1752 {
1753 GETCHAR(c, eptr);
1754 #ifdef SUPPORT_UCP
1755 if (md->use_ucp)
1756 {
1757 if (c == '_') cur_is_word = TRUE; else
1758 {
1759 int cat = UCD_CATEGORY(c);
1760 cur_is_word = (cat == ucp_L || cat == ucp_N);
1761 }
1762 }
1763 else
1764 #endif
1765 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1766 }
1767 }
1768 else
1769 #endif
1770
1771 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1772 consistency with the behaviour of \w we do use it in this case. */
1773
1774 {
1775 /* Get status of previous character */
1776
1777 if (eptr == md->start_subject) prev_is_word = FALSE; else
1778 {
1779 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1780 #ifdef SUPPORT_UCP
1781 if (md->use_ucp)
1782 {
1783 c = eptr[-1];
1784 if (c == '_') prev_is_word = TRUE; else
1785 {
1786 int cat = UCD_CATEGORY(c);
1787 prev_is_word = (cat == ucp_L || cat == ucp_N);
1788 }
1789 }
1790 else
1791 #endif
1792 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1793 }
1794
1795 /* Get status of next character */
1796
1797 if (eptr >= md->end_subject)
1798 {
1799 SCHECK_PARTIAL();
1800 cur_is_word = FALSE;
1801 }
1802 else
1803 #ifdef SUPPORT_UCP
1804 if (md->use_ucp)
1805 {
1806 c = *eptr;
1807 if (c == '_') cur_is_word = TRUE; else
1808 {
1809 int cat = UCD_CATEGORY(c);
1810 cur_is_word = (cat == ucp_L || cat == ucp_N);
1811 }
1812 }
1813 else
1814 #endif
1815 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1816 }
1817
1818 /* Now see if the situation is what we want */
1819
1820 if ((*ecode++ == OP_WORD_BOUNDARY)?
1821 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1822 MRRETURN(MATCH_NOMATCH);
1823 }
1824 break;
1825
1826 /* Match a single character type; inline for speed */
1827
1828 case OP_ANY:
1829 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1830 /* Fall through */
1831
1832 case OP_ALLANY:
1833 if (eptr++ >= md->end_subject)
1834 {
1835 SCHECK_PARTIAL();
1836 MRRETURN(MATCH_NOMATCH);
1837 }
1838 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1839 ecode++;
1840 break;
1841
1842 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1843 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1844
1845 case OP_ANYBYTE:
1846 if (eptr++ >= md->end_subject)
1847 {
1848 SCHECK_PARTIAL();
1849 MRRETURN(MATCH_NOMATCH);
1850 }
1851 ecode++;
1852 break;
1853
1854 case OP_NOT_DIGIT:
1855 if (eptr >= md->end_subject)
1856 {
1857 SCHECK_PARTIAL();
1858 MRRETURN(MATCH_NOMATCH);
1859 }
1860 GETCHARINCTEST(c, eptr);
1861 if (
1862 #ifdef SUPPORT_UTF8
1863 c < 256 &&
1864 #endif
1865 (md->ctypes[c] & ctype_digit) != 0
1866 )
1867 MRRETURN(MATCH_NOMATCH);
1868 ecode++;
1869 break;
1870
1871 case OP_DIGIT:
1872 if (eptr >= md->end_subject)
1873 {
1874 SCHECK_PARTIAL();
1875 MRRETURN(MATCH_NOMATCH);
1876 }
1877 GETCHARINCTEST(c, eptr);
1878 if (
1879 #ifdef SUPPORT_UTF8
1880 c >= 256 ||
1881 #endif
1882 (md->ctypes[c] & ctype_digit) == 0
1883 )
1884 MRRETURN(MATCH_NOMATCH);
1885 ecode++;
1886 break;
1887
1888 case OP_NOT_WHITESPACE:
1889 if (eptr >= md->end_subject)
1890 {
1891 SCHECK_PARTIAL();
1892 MRRETURN(MATCH_NOMATCH);
1893 }
1894 GETCHARINCTEST(c, eptr);
1895 if (
1896 #ifdef SUPPORT_UTF8
1897 c < 256 &&
1898 #endif
1899 (md->ctypes[c] & ctype_space) != 0
1900 )
1901 MRRETURN(MATCH_NOMATCH);
1902 ecode++;
1903 break;
1904
1905 case OP_WHITESPACE:
1906 if (eptr >= md->end_subject)
1907 {
1908 SCHECK_PARTIAL();
1909 MRRETURN(MATCH_NOMATCH);
1910 }
1911 GETCHARINCTEST(c, eptr);
1912 if (
1913 #ifdef SUPPORT_UTF8
1914 c >= 256 ||
1915 #endif
1916 (md->ctypes[c] & ctype_space) == 0
1917 )
1918 MRRETURN(MATCH_NOMATCH);
1919 ecode++;
1920 break;
1921
1922 case OP_NOT_WORDCHAR:
1923 if (eptr >= md->end_subject)
1924 {
1925 SCHECK_PARTIAL();
1926 MRRETURN(MATCH_NOMATCH);
1927 }
1928 GETCHARINCTEST(c, eptr);
1929 if (
1930 #ifdef SUPPORT_UTF8
1931 c < 256 &&
1932 #endif
1933 (md->ctypes[c] & ctype_word) != 0
1934 )
1935 MRRETURN(MATCH_NOMATCH);
1936 ecode++;
1937 break;
1938
1939 case OP_WORDCHAR:
1940 if (eptr >= md->end_subject)
1941 {
1942 SCHECK_PARTIAL();
1943 MRRETURN(MATCH_NOMATCH);
1944 }
1945 GETCHARINCTEST(c, eptr);
1946 if (
1947 #ifdef SUPPORT_UTF8
1948 c >= 256 ||
1949 #endif
1950 (md->ctypes[c] & ctype_word) == 0
1951 )
1952 MRRETURN(MATCH_NOMATCH);
1953 ecode++;
1954 break;
1955
1956 case OP_ANYNL:
1957 if (eptr >= md->end_subject)
1958 {
1959 SCHECK_PARTIAL();
1960 MRRETURN(MATCH_NOMATCH);
1961 }
1962 GETCHARINCTEST(c, eptr);
1963 switch(c)
1964 {
1965 default: MRRETURN(MATCH_NOMATCH);
1966 case 0x000d:
1967 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1968 break;
1969
1970 case 0x000a:
1971 break;
1972
1973 case 0x000b:
1974 case 0x000c:
1975 case 0x0085:
1976 case 0x2028:
1977 case 0x2029:
1978 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1979 break;
1980 }
1981 ecode++;
1982 break;
1983
1984 case OP_NOT_HSPACE:
1985 if (eptr >= md->end_subject)
1986 {
1987 SCHECK_PARTIAL();
1988 MRRETURN(MATCH_NOMATCH);
1989 }
1990 GETCHARINCTEST(c, eptr);
1991 switch(c)
1992 {
1993 default: break;
1994 case 0x09: /* HT */
1995 case 0x20: /* SPACE */
1996 case 0xa0: /* NBSP */
1997 case 0x1680: /* OGHAM SPACE MARK */
1998 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1999 case 0x2000: /* EN QUAD */
2000 case 0x2001: /* EM QUAD */
2001 case 0x2002: /* EN SPACE */
2002 case 0x2003: /* EM SPACE */
2003 case 0x2004: /* THREE-PER-EM SPACE */
2004 case 0x2005: /* FOUR-PER-EM SPACE */
2005 case 0x2006: /* SIX-PER-EM SPACE */
2006 case 0x2007: /* FIGURE SPACE */
2007 case 0x2008: /* PUNCTUATION SPACE */
2008 case 0x2009: /* THIN SPACE */
2009 case 0x200A: /* HAIR SPACE */
2010 case 0x202f: /* NARROW NO-BREAK SPACE */
2011 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2012 case 0x3000: /* IDEOGRAPHIC SPACE */
2013 MRRETURN(MATCH_NOMATCH);
2014 }
2015 ecode++;
2016 break;
2017
2018 case OP_HSPACE:
2019 if (eptr >= md->end_subject)
2020 {
2021 SCHECK_PARTIAL();
2022 MRRETURN(MATCH_NOMATCH);
2023 }
2024 GETCHARINCTEST(c, eptr);
2025 switch(c)
2026 {
2027 default: MRRETURN(MATCH_NOMATCH);
2028 case 0x09: /* HT */
2029 case 0x20: /* SPACE */
2030 case 0xa0: /* NBSP */
2031 case 0x1680: /* OGHAM SPACE MARK */
2032 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2033 case 0x2000: /* EN QUAD */
2034 case 0x2001: /* EM QUAD */
2035 case 0x2002: /* EN SPACE */
2036 case 0x2003: /* EM SPACE */
2037 case 0x2004: /* THREE-PER-EM SPACE */
2038 case 0x2005: /* FOUR-PER-EM SPACE */
2039 case 0x2006: /* SIX-PER-EM SPACE */
2040 case 0x2007: /* FIGURE SPACE */
2041 case 0x2008: /* PUNCTUATION SPACE */
2042 case 0x2009: /* THIN SPACE */
2043 case 0x200A: /* HAIR SPACE */
2044 case 0x202f: /* NARROW NO-BREAK SPACE */
2045 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2046 case 0x3000: /* IDEOGRAPHIC SPACE */
2047 break;
2048 }
2049 ecode++;
2050 break;
2051
2052 case OP_NOT_VSPACE:
2053 if (eptr >= md->end_subject)
2054 {
2055 SCHECK_PARTIAL();
2056 MRRETURN(MATCH_NOMATCH);
2057 }
2058 GETCHARINCTEST(c, eptr);
2059 switch(c)
2060 {
2061 default: break;
2062 case 0x0a: /* LF */
2063 case 0x0b: /* VT */
2064 case 0x0c: /* FF */
2065 case 0x0d: /* CR */
2066 case 0x85: /* NEL */
2067 case 0x2028: /* LINE SEPARATOR */
2068 case 0x2029: /* PARAGRAPH SEPARATOR */
2069 MRRETURN(MATCH_NOMATCH);
2070 }
2071 ecode++;
2072 break;
2073
2074 case OP_VSPACE:
2075 if (eptr >= md->end_subject)
2076 {
2077 SCHECK_PARTIAL();
2078 MRRETURN(MATCH_NOMATCH);
2079 }
2080 GETCHARINCTEST(c, eptr);
2081 switch(c)
2082 {
2083 default: MRRETURN(MATCH_NOMATCH);
2084 case 0x0a: /* LF */
2085 case 0x0b: /* VT */
2086 case 0x0c: /* FF */
2087 case 0x0d: /* CR */
2088 case 0x85: /* NEL */
2089 case 0x2028: /* LINE SEPARATOR */
2090 case 0x2029: /* PARAGRAPH SEPARATOR */
2091 break;
2092 }
2093 ecode++;
2094 break;
2095
2096 #ifdef SUPPORT_UCP
2097 /* Check the next character by Unicode property. We will get here only
2098 if the support is in the binary; otherwise a compile-time error occurs. */
2099
2100 case OP_PROP:
2101 case OP_NOTPROP:
2102 if (eptr >= md->end_subject)
2103 {
2104 SCHECK_PARTIAL();
2105 MRRETURN(MATCH_NOMATCH);
2106 }
2107 GETCHARINCTEST(c, eptr);
2108 {
2109 const ucd_record *prop = GET_UCD(c);
2110
2111 switch(ecode[1])
2112 {
2113 case PT_ANY:
2114 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2115 break;
2116
2117 case PT_LAMP:
2118 if ((prop->chartype == ucp_Lu ||
2119 prop->chartype == ucp_Ll ||
2120 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2121 MRRETURN(MATCH_NOMATCH);
2122 break;
2123
2124 case PT_GC:
2125 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2126 MRRETURN(MATCH_NOMATCH);
2127 break;
2128
2129 case PT_PC:
2130 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2131 MRRETURN(MATCH_NOMATCH);
2132 break;
2133
2134 case PT_SC:
2135 if ((ecode[2] != prop->script) == (op == OP_PROP))
2136 MRRETURN(MATCH_NOMATCH);
2137 break;
2138
2139 /* These are specials */
2140
2141 case PT_ALNUM:
2142 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2143 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2144 MRRETURN(MATCH_NOMATCH);
2145 break;
2146
2147 case PT_SPACE: /* Perl space */
2148 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2149 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2150 == (op == OP_NOTPROP))
2151 MRRETURN(MATCH_NOMATCH);
2152 break;
2153
2154 case PT_PXSPACE: /* POSIX space */
2155 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2156 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2157 c == CHAR_FF || c == CHAR_CR)
2158 == (op == OP_NOTPROP))
2159 MRRETURN(MATCH_NOMATCH);
2160 break;
2161
2162 case PT_WORD:
2163 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2164 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2165 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2166 MRRETURN(MATCH_NOMATCH);
2167 break;
2168
2169 /* This should never occur */
2170
2171 default:
2172 RRETURN(PCRE_ERROR_INTERNAL);
2173 }
2174
2175 ecode += 3;
2176 }
2177 break;
2178
2179 /* Match an extended Unicode sequence. We will get here only if the support
2180 is in the binary; otherwise a compile-time error occurs. */
2181
2182 case OP_EXTUNI:
2183 if (eptr >= md->end_subject)
2184 {
2185 SCHECK_PARTIAL();
2186 MRRETURN(MATCH_NOMATCH);
2187 }
2188 GETCHARINCTEST(c, eptr);
2189 {
2190 int category = UCD_CATEGORY(c);
2191 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2192 while (eptr < md->end_subject)
2193 {
2194 int len = 1;
2195 if (!utf8) c = *eptr; else
2196 {
2197 GETCHARLEN(c, eptr, len);
2198 }
2199 category = UCD_CATEGORY(c);
2200 if (category != ucp_M) break;
2201 eptr += len;
2202 }
2203 }
2204 ecode++;
2205 break;
2206 #endif
2207
2208
2209 /* Match a back reference, possibly repeatedly. Look past the end of the
2210 item to see if there is repeat information following. The code is similar
2211 to that for character classes, but repeated for efficiency. Then obey
2212 similar code to character type repeats - written out again for speed.
2213 However, if the referenced string is the empty string, always treat
2214 it as matched, any number of times (otherwise there could be infinite
2215 loops). */
2216
2217 case OP_REF:
2218 {
2219 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2220 ecode += 3;
2221
2222 /* If the reference is unset, there are two possibilities:
2223
2224 (a) In the default, Perl-compatible state, set the length to be longer
2225 than the amount of subject left; this ensures that every attempt at a
2226 match fails. We can't just fail here, because of the possibility of
2227 quantifiers with zero minima.
2228
2229 (b) If the JavaScript compatibility flag is set, set the length to zero
2230 so that the back reference matches an empty string.
2231
2232 Otherwise, set the length to the length of what was matched by the
2233 referenced subpattern. */
2234
2235 if (offset >= offset_top || md->offset_vector[offset] < 0)
2236 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
2237 else
2238 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2239
2240 /* Set up for repetition, or handle the non-repeated case */
2241
2242 switch (*ecode)
2243 {
2244 case OP_CRSTAR:
2245 case OP_CRMINSTAR:
2246 case OP_CRPLUS:
2247 case OP_CRMINPLUS:
2248 case OP_CRQUERY:
2249 case OP_CRMINQUERY:
2250 c = *ecode++ - OP_CRSTAR;
2251 minimize = (c & 1) != 0;
2252 min = rep_min[c]; /* Pick up values from tables; */
2253 max = rep_max[c]; /* zero for max => infinity */
2254 if (max == 0) max = INT_MAX;
2255 break;
2256
2257 case OP_CRRANGE:
2258 case OP_CRMINRANGE:
2259 minimize = (*ecode == OP_CRMINRANGE);
2260 min = GET2(ecode, 1);
2261 max = GET2(ecode, 3);
2262 if (max == 0) max = INT_MAX;
2263 ecode += 5;
2264 break;
2265
2266 default: /* No repeat follows */
2267 if (!match_ref(offset, eptr, length, md, ims))
2268 {
2269 CHECK_PARTIAL();
2270 MRRETURN(MATCH_NOMATCH);
2271 }
2272 eptr += length;
2273 continue; /* With the main loop */
2274 }
2275
2276 /* If the length of the reference is zero, just continue with the
2277 main loop. */
2278
2279 if (length == 0) continue;
2280
2281 /* First, ensure the minimum number of matches are present. We get back
2282 the length of the reference string explicitly rather than passing the
2283 address of eptr, so that eptr can be a register variable. */
2284
2285 for (i = 1; i <= min; i++)
2286 {
2287 if (!match_ref(offset, eptr, length, md, ims))
2288 {
2289 CHECK_PARTIAL();
2290 MRRETURN(MATCH_NOMATCH);
2291 }
2292 eptr += length;
2293 }
2294
2295 /* If min = max, continue at the same level without recursion.
2296 They are not both allowed to be zero. */
2297
2298 if (min == max) continue;
2299
2300 /* If minimizing, keep trying and advancing the pointer */
2301
2302 if (minimize)
2303 {
2304 for (fi = min;; fi++)
2305 {
2306 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2307 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2308 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2309 if (!match_ref(offset, eptr, length, md, ims))
2310 {
2311 CHECK_PARTIAL();
2312 MRRETURN(MATCH_NOMATCH);
2313 }
2314 eptr += length;
2315 }
2316 /* Control never gets here */
2317 }
2318
2319 /* If maximizing, find the longest string and work backwards */
2320
2321 else
2322 {
2323 pp = eptr;
2324 for (i = min; i < max; i++)
2325 {
2326 if (!match_ref(offset, eptr, length, md, ims))
2327 {
2328 CHECK_PARTIAL();
2329 break;
2330 }
2331 eptr += length;
2332 }
2333 while (eptr >= pp)
2334 {
2335 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2336 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2337 eptr -= length;
2338 }
2339 MRRETURN(MATCH_NOMATCH);
2340 }
2341 }
2342 /* Control never gets here */
2343
2344 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2345 used when all the characters in the class have values in the range 0-255,
2346 and either the matching is caseful, or the characters are in the range
2347 0-127 when UTF-8 processing is enabled. The only difference between
2348 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2349 encountered.
2350
2351 First, look past the end of the item to see if there is repeat information
2352 following. Then obey similar code to character type repeats - written out
2353 again for speed. */
2354
2355 case OP_NCLASS:
2356 case OP_CLASS:
2357 {
2358 data = ecode + 1; /* Save for matching */
2359 ecode += 33; /* Advance past the item */
2360
2361 switch (*ecode)
2362 {
2363 case OP_CRSTAR:
2364 case OP_CRMINSTAR:
2365 case OP_CRPLUS:
2366 case OP_CRMINPLUS:
2367 case OP_CRQUERY:
2368 case OP_CRMINQUERY:
2369 c = *ecode++ - OP_CRSTAR;
2370 minimize = (c & 1) != 0;
2371 min = rep_min[c]; /* Pick up values from tables; */
2372 max = rep_max[c]; /* zero for max => infinity */
2373 if (max == 0) max = INT_MAX;
2374 break;
2375
2376 case OP_CRRANGE:
2377 case OP_CRMINRANGE:
2378 minimize = (*ecode == OP_CRMINRANGE);
2379 min = GET2(ecode, 1);
2380 max = GET2(ecode, 3);
2381 if (max == 0) max = INT_MAX;
2382 ecode += 5;
2383 break;
2384
2385 default: /* No repeat follows */
2386 min = max = 1;
2387 break;
2388 }
2389
2390 /* First, ensure the minimum number of matches are present. */
2391
2392 #ifdef SUPPORT_UTF8
2393 /* UTF-8 mode */
2394 if (utf8)
2395 {
2396 for (i = 1; i <= min; i++)
2397 {
2398 if (eptr >= md->end_subject)
2399 {
2400 SCHECK_PARTIAL();
2401 MRRETURN(MATCH_NOMATCH);
2402 }
2403 GETCHARINC(c, eptr);
2404 if (c > 255)
2405 {
2406 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2407 }
2408 else
2409 {
2410 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2411 }
2412 }
2413 }
2414 else
2415 #endif
2416 /* Not UTF-8 mode */
2417 {
2418 for (i = 1; i <= min; i++)
2419 {
2420 if (eptr >= md->end_subject)
2421 {
2422 SCHECK_PARTIAL();
2423 MRRETURN(MATCH_NOMATCH);
2424 }
2425 c = *eptr++;
2426 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2427 }
2428 }
2429
2430 /* If max == min we can continue with the main loop without the
2431 need to recurse. */
2432
2433 if (min == max) continue;
2434
2435 /* If minimizing, keep testing the rest of the expression and advancing
2436 the pointer while it matches the class. */
2437
2438 if (minimize)
2439 {
2440 #ifdef SUPPORT_UTF8
2441 /* UTF-8 mode */
2442 if (utf8)
2443 {
2444 for (fi = min;; fi++)
2445 {
2446 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2447 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2448 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2449 if (eptr >= md->end_subject)
2450 {
2451 SCHECK_PARTIAL();
2452 MRRETURN(MATCH_NOMATCH);
2453 }
2454 GETCHARINC(c, eptr);
2455 if (c > 255)
2456 {
2457 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2458 }
2459 else
2460 {
2461 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2462 }
2463 }
2464 }
2465 else
2466 #endif
2467 /* Not UTF-8 mode */
2468 {
2469 for (fi = min;; fi++)
2470 {
2471 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2472 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2473 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2474 if (eptr >= md->end_subject)
2475 {
2476 SCHECK_PARTIAL();
2477 MRRETURN(MATCH_NOMATCH);
2478 }
2479 c = *eptr++;
2480 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2481 }
2482 }
2483 /* Control never gets here */
2484 }
2485
2486 /* If maximizing, find the longest possible run, then work backwards. */
2487
2488 else
2489 {
2490 pp = eptr;
2491
2492 #ifdef SUPPORT_UTF8
2493 /* UTF-8 mode */
2494 if (utf8)
2495 {
2496 for (i = min; i < max; i++)
2497 {
2498 int len = 1;
2499 if (eptr >= md->end_subject)
2500 {
2501 SCHECK_PARTIAL();
2502 break;
2503 }
2504 GETCHARLEN(c, eptr, len);
2505 if (c > 255)
2506 {
2507 if (op == OP_CLASS) break;
2508 }
2509 else
2510 {
2511 if ((data[c/8] & (1 << (c&7))) == 0) break;
2512 }
2513 eptr += len;
2514 }
2515 for (;;)
2516 {
2517 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2518 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2519 if (eptr-- == pp) break; /* Stop if tried at original pos */
2520 BACKCHAR(eptr);
2521 }
2522 }
2523 else
2524 #endif
2525 /* Not UTF-8 mode */
2526 {
2527 for (i = min; i < max; i++)
2528 {
2529 if (eptr >= md->end_subject)
2530 {
2531 SCHECK_PARTIAL();
2532 break;
2533 }
2534 c = *eptr;
2535 if ((data[c/8] & (1 << (c&7))) == 0) break;
2536 eptr++;
2537 }
2538 while (eptr >= pp)
2539 {
2540 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2541 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2542 eptr--;
2543 }
2544 }
2545
2546 MRRETURN(MATCH_NOMATCH);
2547 }
2548 }
2549 /* Control never gets here */
2550
2551
2552 /* Match an extended character class. This opcode is encountered only
2553 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2554 mode, because Unicode properties are supported in non-UTF-8 mode. */
2555
2556 #ifdef SUPPORT_UTF8
2557 case OP_XCLASS:
2558 {
2559 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2560 ecode += GET(ecode, 1); /* Advance past the item */
2561
2562 switch (*ecode)
2563 {
2564 case OP_CRSTAR:
2565 case OP_CRMINSTAR:
2566 case OP_CRPLUS:
2567 case OP_CRMINPLUS:
2568 case OP_CRQUERY:
2569 case OP_CRMINQUERY:
2570 c = *ecode++ - OP_CRSTAR;
2571 minimize = (c & 1) != 0;
2572 min = rep_min[c]; /* Pick up values from tables; */
2573 max = rep_max[c]; /* zero for max => infinity */
2574 if (max == 0) max = INT_MAX;
2575 break;
2576
2577 case OP_CRRANGE:
2578 case OP_CRMINRANGE:
2579 minimize = (*ecode == OP_CRMINRANGE);
2580 min = GET2(ecode, 1);
2581 max = GET2(ecode, 3);
2582 if (max == 0) max = INT_MAX;
2583 ecode += 5;
2584 break;
2585
2586 default: /* No repeat follows */
2587 min = max = 1;
2588 break;
2589 }
2590
2591 /* First, ensure the minimum number of matches are present. */
2592
2593 for (i = 1; i <= min; i++)
2594 {
2595 if (eptr >= md->end_subject)
2596 {
2597 SCHECK_PARTIAL();
2598 MRRETURN(MATCH_NOMATCH);
2599 }
2600 GETCHARINCTEST(c, eptr);
2601 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2602 }
2603
2604 /* If max == min we can continue with the main loop without the
2605 need to recurse. */
2606
2607 if (min == max) continue;
2608
2609 /* If minimizing, keep testing the rest of the expression and advancing
2610 the pointer while it matches the class. */
2611
2612 if (minimize)
2613 {
2614 for (fi = min;; fi++)
2615 {
2616 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2617 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2618 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2619 if (eptr >= md->end_subject)
2620 {
2621 SCHECK_PARTIAL();
2622 MRRETURN(MATCH_NOMATCH);
2623 }
2624 GETCHARINCTEST(c, eptr);
2625 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2626 }
2627 /* Control never gets here */
2628 }
2629
2630 /* If maximizing, find the longest possible run, then work backwards. */
2631
2632 else
2633 {
2634 pp = eptr;
2635 for (i = min; i < max; i++)
2636 {
2637 int len = 1;
2638 if (eptr >= md->end_subject)
2639 {
2640 SCHECK_PARTIAL();
2641 break;
2642 }
2643 GETCHARLENTEST(c, eptr, len);
2644 if (!_pcre_xclass(c, data)) break;
2645 eptr += len;
2646 }
2647 for(;;)
2648 {
2649 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2650 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2651 if (eptr-- == pp) break; /* Stop if tried at original pos */
2652 if (utf8) BACKCHAR(eptr);
2653 }
2654 MRRETURN(MATCH_NOMATCH);
2655 }
2656
2657 /* Control never gets here */
2658 }
2659 #endif /* End of XCLASS */
2660
2661 /* Match a single character, casefully */
2662
2663 case OP_CHAR:
2664 #ifdef SUPPORT_UTF8
2665 if (utf8)
2666 {
2667 length = 1;
2668 ecode++;
2669 GETCHARLEN(fc, ecode, length);
2670 if (length > md->end_subject - eptr)
2671 {
2672 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2673 MRRETURN(MATCH_NOMATCH);
2674 }
2675 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2676 }
2677 else
2678 #endif
2679
2680 /* Non-UTF-8 mode */
2681 {
2682 if (md->end_subject - eptr < 1)
2683 {
2684 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2685 MRRETURN(MATCH_NOMATCH);
2686 }
2687 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2688 ecode += 2;
2689 }
2690 break;
2691
2692 /* Match a single character, caselessly */
2693
2694 case OP_CHARNC:
2695 #ifdef SUPPORT_UTF8
2696 if (utf8)
2697 {
2698 length = 1;
2699 ecode++;
2700 GETCHARLEN(fc, ecode, length);
2701
2702 if (length > md->end_subject - eptr)
2703 {
2704 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2705 MRRETURN(MATCH_NOMATCH);
2706 }
2707
2708 /* If the pattern character's value is < 128, we have only one byte, and
2709 can use the fast lookup table. */
2710
2711 if (fc < 128)
2712 {
2713 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2714 }
2715
2716 /* Otherwise we must pick up the subject character */
2717
2718 else
2719 {
2720 unsigned int dc;
2721 GETCHARINC(dc, eptr);
2722 ecode += length;
2723
2724 /* If we have Unicode property support, we can use it to test the other
2725 case of the character, if there is one. */
2726
2727 if (fc != dc)
2728 {
2729 #ifdef SUPPORT_UCP
2730 if (dc != UCD_OTHERCASE(fc))
2731 #endif
2732 MRRETURN(MATCH_NOMATCH);
2733 }
2734 }
2735 }
2736 else
2737 #endif /* SUPPORT_UTF8 */
2738
2739 /* Non-UTF-8 mode */
2740 {
2741 if (md->end_subject - eptr < 1)
2742 {
2743 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2744 MRRETURN(MATCH_NOMATCH);
2745 }
2746 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2747 ecode += 2;
2748 }
2749 break;
2750
2751 /* Match a single character repeatedly. */
2752
2753 case OP_EXACT:
2754 min = max = GET2(ecode, 1);
2755 ecode += 3;
2756 goto REPEATCHAR;
2757
2758 case OP_POSUPTO:
2759 possessive = TRUE;
2760 /* Fall through */
2761
2762 case OP_UPTO:
2763 case OP_MINUPTO:
2764 min = 0;
2765 max = GET2(ecode, 1);
2766 minimize = *ecode == OP_MINUPTO;
2767 ecode += 3;
2768 goto REPEATCHAR;
2769
2770 case OP_POSSTAR:
2771 possessive = TRUE;
2772 min = 0;
2773 max = INT_MAX;
2774 ecode++;
2775 goto REPEATCHAR;
2776
2777 case OP_POSPLUS:
2778 possessive = TRUE;
2779 min = 1;
2780 max = INT_MAX;
2781 ecode++;
2782 goto REPEATCHAR;
2783
2784 case OP_POSQUERY:
2785 possessive = TRUE;
2786 min = 0;
2787 max = 1;
2788 ecode++;
2789 goto REPEATCHAR;
2790
2791 case OP_STAR:
2792 case OP_MINSTAR:
2793 case OP_PLUS:
2794 case OP_MINPLUS:
2795 case OP_QUERY:
2796 case OP_MINQUERY:
2797 c = *ecode++ - OP_STAR;
2798 minimize = (c & 1) != 0;
2799
2800 min = rep_min[c]; /* Pick up values from tables; */
2801 max = rep_max[c]; /* zero for max => infinity */
2802 if (max == 0) max = INT_MAX;
2803
2804 /* Common code for all repeated single-character matches. */
2805
2806 REPEATCHAR:
2807 #ifdef SUPPORT_UTF8
2808 if (utf8)
2809 {
2810 length = 1;
2811 charptr = ecode;
2812 GETCHARLEN(fc, ecode, length);
2813 ecode += length;
2814
2815 /* Handle multibyte character matching specially here. There is
2816 support for caseless matching if UCP support is present. */
2817
2818 if (length > 1)
2819 {
2820 #ifdef SUPPORT_UCP
2821 unsigned int othercase;
2822 if ((ims & PCRE_CASELESS) != 0 &&
2823 (othercase = UCD_OTHERCASE(fc)) != fc)
2824 oclength = _pcre_ord2utf8(othercase, occhars);
2825 else oclength = 0;
2826 #endif /* SUPPORT_UCP */
2827
2828 for (i = 1; i <= min; i++)
2829 {
2830 if (eptr <= md->end_subject - length &&
2831 memcmp(eptr, charptr, length) == 0) eptr += length;
2832 #ifdef SUPPORT_UCP
2833 else if (oclength > 0 &&
2834 eptr <= md->end_subject - oclength &&
2835 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2836 #endif /* SUPPORT_UCP */
2837 else
2838 {
2839 CHECK_PARTIAL();
2840 MRRETURN(MATCH_NOMATCH);
2841 }
2842 }
2843
2844 if (min == max) continue;
2845
2846 if (minimize)
2847 {
2848 for (fi = min;; fi++)
2849 {
2850 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2851 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2852 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2853 if (eptr <= md->end_subject - length &&
2854 memcmp(eptr, charptr, length) == 0) eptr += length;
2855 #ifdef SUPPORT_UCP
2856 else if (oclength > 0 &&
2857 eptr <= md->end_subject - oclength &&
2858 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2859 #endif /* SUPPORT_UCP */
2860 else
2861 {
2862 CHECK_PARTIAL();
2863 MRRETURN(MATCH_NOMATCH);
2864 }
2865 }
2866 /* Control never gets here */
2867 }
2868
2869 else /* Maximize */
2870 {
2871 pp = eptr;
2872 for (i = min; i < max; i++)
2873 {
2874 if (eptr <= md->end_subject - length &&
2875 memcmp(eptr, charptr, length) == 0) eptr += length;
2876 #ifdef SUPPORT_UCP
2877 else if (oclength > 0 &&
2878 eptr <= md->end_subject - oclength &&
2879 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2880 #endif /* SUPPORT_UCP */
2881 else
2882 {
2883 CHECK_PARTIAL();
2884 break;
2885 }
2886 }
2887
2888 if (possessive) continue;
2889
2890 for(;;)
2891 {
2892 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2894 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
2895 #ifdef SUPPORT_UCP
2896 eptr--;
2897 BACKCHAR(eptr);
2898 #else /* without SUPPORT_UCP */
2899 eptr -= length;
2900 #endif /* SUPPORT_UCP */
2901 }
2902 }
2903 /* Control never gets here */
2904 }
2905
2906 /* If the length of a UTF-8 character is 1, we fall through here, and
2907 obey the code as for non-UTF-8 characters below, though in this case the
2908 value of fc will always be < 128. */
2909 }
2910 else
2911 #endif /* SUPPORT_UTF8 */
2912
2913 /* When not in UTF-8 mode, load a single-byte character. */
2914
2915 fc = *ecode++;
2916
2917 /* The value of fc at this point is always less than 256, though we may or
2918 may not be in UTF-8 mode. The code is duplicated for the caseless and
2919 caseful cases, for speed, since matching characters is likely to be quite
2920 common. First, ensure the minimum number of matches are present. If min =
2921 max, continue at the same level without recursing. Otherwise, if
2922 minimizing, keep trying the rest of the expression and advancing one
2923 matching character if failing, up to the maximum. Alternatively, if
2924 maximizing, find the maximum number of characters and work backwards. */
2925
2926 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2927 max, eptr));
2928
2929 if ((ims & PCRE_CASELESS) != 0)
2930 {
2931 fc = md->lcc[fc];
2932 for (i = 1; i <= min; i++)
2933 {
2934 if (eptr >= md->end_subject)
2935 {
2936 SCHECK_PARTIAL();
2937 MRRETURN(MATCH_NOMATCH);
2938 }
2939 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2940 }
2941 if (min == max) continue;
2942 if (minimize)
2943 {
2944 for (fi = min;; fi++)
2945 {
2946 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2947 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2948 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2949 if (eptr >= md->end_subject)
2950 {
2951 SCHECK_PARTIAL();
2952 MRRETURN(MATCH_NOMATCH);
2953 }
2954 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2955 }
2956 /* Control never gets here */
2957 }
2958 else /* Maximize */
2959 {
2960 pp = eptr;
2961 for (i = min; i < max; i++)
2962 {
2963 if (eptr >= md->end_subject)
2964 {
2965 SCHECK_PARTIAL();
2966 break;
2967 }
2968 if (fc != md->lcc[*eptr]) break;
2969 eptr++;
2970 }
2971
2972 if (possessive) continue;
2973
2974 while (eptr >= pp)
2975 {
2976 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2977 eptr--;
2978 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2979 }
2980 MRRETURN(MATCH_NOMATCH);
2981 }
2982 /* Control never gets here */
2983 }
2984
2985 /* Caseful comparisons (includes all multi-byte characters) */
2986
2987 else
2988 {
2989 for (i = 1; i <= min; i++)
2990 {
2991 if (eptr >= md->end_subject)
2992 {
2993 SCHECK_PARTIAL();
2994 MRRETURN(MATCH_NOMATCH);
2995 }
2996 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
2997 }
2998
2999 if (min == max) continue;
3000
3001 if (minimize)
3002 {
3003 for (fi = min;; fi++)
3004 {
3005 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
3006 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3007 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3008 if (eptr >= md->end_subject)
3009 {
3010 SCHECK_PARTIAL();
3011 MRRETURN(MATCH_NOMATCH);
3012 }
3013 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3014 }
3015 /* Control never gets here */
3016 }
3017 else /* Maximize */
3018 {
3019 pp = eptr;
3020 for (i = min; i < max; i++)
3021 {
3022 if (eptr >= md->end_subject)
3023 {
3024 SCHECK_PARTIAL();
3025 break;
3026 }
3027 if (fc != *eptr) break;
3028 eptr++;
3029 }
3030 if (possessive) continue;
3031
3032 while (eptr >= pp)
3033 {
3034 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
3035 eptr--;
3036 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3037 }
3038 MRRETURN(MATCH_NOMATCH);
3039 }
3040 }
3041 /* Control never gets here */
3042
3043 /* Match a negated single one-byte character. The character we are
3044 checking can be multibyte. */
3045
3046 case OP_NOT:
3047 if (eptr >= md->end_subject)
3048 {
3049 SCHECK_PARTIAL();
3050 MRRETURN(MATCH_NOMATCH);
3051 }
3052 ecode++;
3053 GETCHARINCTEST(c, eptr);
3054 if ((ims & PCRE_CASELESS) != 0)
3055 {
3056 #ifdef SUPPORT_UTF8
3057 if (c < 256)
3058 #endif
3059 c = md->lcc[c];
3060 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3061 }
3062 else
3063 {
3064 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3065 }
3066 break;
3067
3068 /* Match a negated single one-byte character repeatedly. This is almost a
3069 repeat of the code for a repeated single character, but I haven't found a
3070 nice way of commoning these up that doesn't require a test of the
3071 positive/negative option for each character match. Maybe that wouldn't add
3072 very much to the time taken, but character matching *is* what this is all
3073 about... */
3074
3075 case OP_NOTEXACT:
3076 min = max = GET2(ecode, 1);
3077 ecode += 3;
3078 goto REPEATNOTCHAR;
3079
3080 case OP_NOTUPTO:
3081 case OP_NOTMINUPTO:
3082 min = 0;
3083 max = GET2(ecode, 1);
3084 minimize = *ecode == OP_NOTMINUPTO;
3085 ecode += 3;
3086 goto REPEATNOTCHAR;
3087
3088 case OP_NOTPOSSTAR:
3089 possessive = TRUE;
3090 min = 0;
3091 max = INT_MAX;
3092 ecode++;
3093 goto REPEATNOTCHAR;
3094
3095 case OP_NOTPOSPLUS:
3096 possessive = TRUE;
3097 min = 1;
3098 max = INT_MAX;
3099 ecode++;
3100 goto REPEATNOTCHAR;
3101
3102 case OP_NOTPOSQUERY:
3103 possessive = TRUE;
3104 min = 0;
3105 max = 1;
3106 ecode++;
3107 goto REPEATNOTCHAR;
3108
3109 case OP_NOTPOSUPTO:
3110 possessive = TRUE;
3111 min = 0;
3112 max = GET2(ecode, 1);
3113 ecode += 3;
3114 goto REPEATNOTCHAR;
3115
3116 case OP_NOTSTAR:
3117 case OP_NOTMINSTAR:
3118 case OP_NOTPLUS:
3119 case OP_NOTMINPLUS:
3120 case OP_NOTQUERY:
3121 case OP_NOTMINQUERY:
3122 c = *ecode++ - OP_NOTSTAR;
3123 minimize = (c & 1) != 0;
3124 min = rep_min[c]; /* Pick up values from tables; */
3125 max = rep_max[c]; /* zero for max => infinity */
3126 if (max == 0) max = INT_MAX;
3127
3128 /* Common code for all repeated single-byte matches. */
3129
3130 REPEATNOTCHAR:
3131 fc = *ecode++;
3132
3133 /* The code is duplicated for the caseless and caseful cases, for speed,
3134 since matching characters is likely to be quite common. First, ensure the
3135 minimum number of matches are present. If min = max, continue at the same
3136 level without recursing. Otherwise, if minimizing, keep trying the rest of
3137 the expression and advancing one matching character if failing, up to the
3138 maximum. Alternatively, if maximizing, find the maximum number of
3139 characters and work backwards. */
3140
3141 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3142 max, eptr));
3143
3144 if ((ims & PCRE_CASELESS) != 0)
3145 {
3146 fc = md->lcc[fc];
3147
3148 #ifdef SUPPORT_UTF8
3149 /* UTF-8 mode */
3150 if (utf8)
3151 {
3152 register unsigned int d;
3153 for (i = 1; i <= min; i++)
3154 {
3155 if (eptr >= md->end_subject)
3156 {
3157 SCHECK_PARTIAL();
3158 MRRETURN(MATCH_NOMATCH);
3159 }
3160 GETCHARINC(d, eptr);
3161 if (d < 256) d = md->lcc[d];
3162 if (fc == d) MRRETURN(MATCH_NOMATCH);
3163 }
3164 }
3165 else
3166 #endif
3167
3168 /* Not UTF-8 mode */
3169 {
3170 for (i = 1; i <= min; i++)
3171 {
3172 if (eptr >= md->end_subject)
3173 {
3174 SCHECK_PARTIAL();
3175 MRRETURN(MATCH_NOMATCH);
3176 }
3177 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3178 }
3179 }
3180
3181 if (min == max) continue;
3182
3183 if (minimize)
3184 {
3185 #ifdef SUPPORT_UTF8
3186 /* UTF-8 mode */
3187 if (utf8)
3188 {
3189 register unsigned int d;
3190 for (fi = min;; fi++)
3191 {
3192 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3193 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3194 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3195 if (eptr >= md->end_subject)
3196 {
3197 SCHECK_PARTIAL();
3198 MRRETURN(MATCH_NOMATCH);
3199 }
3200 GETCHARINC(d, eptr);
3201 if (d < 256) d = md->lcc[d];
3202 if (fc == d) MRRETURN(MATCH_NOMATCH);
3203 }
3204 }
3205 else
3206 #endif
3207 /* Not UTF-8 mode */
3208 {
3209 for (fi = min;; fi++)
3210 {
3211 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3212 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3213 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3214 if (eptr >= md->end_subject)
3215 {
3216 SCHECK_PARTIAL();
3217 MRRETURN(MATCH_NOMATCH);
3218 }
3219 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3220 }
3221 }
3222 /* Control never gets here */
3223 }
3224
3225 /* Maximize case */
3226
3227 else
3228 {
3229 pp = eptr;
3230
3231 #ifdef SUPPORT_UTF8
3232 /* UTF-8 mode */
3233 if (utf8)
3234 {
3235 register unsigned int d;
3236 for (i = min; i < max; i++)
3237 {
3238 int len = 1;
3239 if (eptr >= md->end_subject)
3240 {
3241 SCHECK_PARTIAL();
3242 break;
3243 }
3244 GETCHARLEN(d, eptr, len);
3245 if (d < 256) d = md->lcc[d];
3246 if (fc == d) break;
3247 eptr += len;
3248 }
3249 if (possessive) continue;
3250 for(;;)
3251 {
3252 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3253 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3254 if (eptr-- == pp) break; /* Stop if tried at original pos */
3255 BACKCHAR(eptr);
3256 }
3257 }
3258 else
3259 #endif
3260 /* Not UTF-8 mode */
3261 {
3262 for (i = min; i < max; i++)
3263 {
3264 if (eptr >= md->end_subject)
3265 {
3266 SCHECK_PARTIAL();
3267 break;
3268 }
3269 if (fc == md->lcc[*eptr]) break;
3270 eptr++;
3271 }
3272 if (possessive) continue;
3273 while (eptr >= pp)
3274 {
3275 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3276 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3277 eptr--;
3278 }
3279 }
3280
3281 MRRETURN(MATCH_NOMATCH);
3282 }
3283 /* Control never gets here */
3284 }
3285
3286 /* Caseful comparisons */
3287
3288 else
3289 {
3290 #ifdef SUPPORT_UTF8
3291 /* UTF-8 mode */
3292 if (utf8)
3293 {
3294 register unsigned int d;
3295 for (i = 1; i <= min; i++)
3296 {
3297 if (eptr >= md->end_subject)
3298 {
3299 SCHECK_PARTIAL();
3300 MRRETURN(MATCH_NOMATCH);
3301 }
3302 GETCHARINC(d, eptr);
3303 if (fc == d) MRRETURN(MATCH_NOMATCH);
3304 }
3305 }
3306 else
3307 #endif
3308 /* Not UTF-8 mode */
3309 {
3310 for (i = 1; i <= min; i++)
3311 {
3312 if (eptr >= md->end_subject)
3313 {
3314 SCHECK_PARTIAL();
3315 MRRETURN(MATCH_NOMATCH);
3316 }
3317 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3318 }
3319 }
3320
3321 if (min == max) continue;
3322
3323 if (minimize)
3324 {
3325 #ifdef SUPPORT_UTF8
3326 /* UTF-8 mode */
3327 if (utf8)
3328 {
3329 register unsigned int d;
3330 for (fi = min;; fi++)
3331 {
3332 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3333 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3334 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3335 if (eptr >= md->end_subject)
3336 {
3337 SCHECK_PARTIAL();
3338 MRRETURN(MATCH_NOMATCH);
3339 }
3340 GETCHARINC(d, eptr);
3341 if (fc == d) MRRETURN(MATCH_NOMATCH);
3342 }
3343 }
3344 else
3345 #endif
3346 /* Not UTF-8 mode */
3347 {
3348 for (fi = min;; fi++)
3349 {
3350 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3351 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3352 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3353 if (eptr >= md->end_subject)
3354 {
3355 SCHECK_PARTIAL();
3356 MRRETURN(MATCH_NOMATCH);
3357 }
3358 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3359 }
3360 }
3361 /* Control never gets here */
3362 }
3363
3364 /* Maximize case */
3365
3366 else
3367 {
3368 pp = eptr;
3369
3370 #ifdef SUPPORT_UTF8
3371 /* UTF-8 mode */
3372 if (utf8)
3373 {
3374 register unsigned int d;
3375 for (i = min; i < max; i++)
3376 {
3377 int len = 1;
3378 if (eptr >= md->end_subject)
3379 {
3380 SCHECK_PARTIAL();
3381 break;
3382 }
3383 GETCHARLEN(d, eptr, len);
3384 if (fc == d) break;
3385 eptr += len;
3386 }
3387 if (possessive) continue;
3388 for(;;)
3389 {
3390 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3391 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3392 if (eptr-- == pp) break; /* Stop if tried at original pos */
3393 BACKCHAR(eptr);
3394 }
3395 }
3396 else
3397 #endif
3398 /* Not UTF-8 mode */
3399 {
3400 for (i = min; i < max; i++)
3401 {
3402 if (eptr >= md->end_subject)
3403 {
3404 SCHECK_PARTIAL();
3405 break;
3406 }
3407 if (fc == *eptr) break;
3408 eptr++;
3409 }
3410 if (possessive) continue;
3411 while (eptr >= pp)
3412 {
3413 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3414 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3415 eptr--;
3416 }
3417 }
3418
3419 MRRETURN(MATCH_NOMATCH);
3420 }
3421 }
3422 /* Control never gets here */
3423
3424 /* Match a single character type repeatedly; several different opcodes
3425 share code. This is very similar to the code for single characters, but we
3426 repeat it in the interests of efficiency. */
3427
3428 case OP_TYPEEXACT:
3429 min = max = GET2(ecode, 1);
3430 minimize = TRUE;
3431 ecode += 3;
3432 goto REPEATTYPE;
3433
3434 case OP_TYPEUPTO:
3435 case OP_TYPEMINUPTO:
3436 min = 0;
3437 max = GET2(ecode, 1);
3438 minimize = *ecode == OP_TYPEMINUPTO;
3439 ecode += 3;
3440 goto REPEATTYPE;
3441
3442 case OP_TYPEPOSSTAR:
3443 possessive = TRUE;
3444 min = 0;
3445 max = INT_MAX;
3446 ecode++;
3447 goto REPEATTYPE;
3448
3449 case OP_TYPEPOSPLUS:
3450 possessive = TRUE;
3451 min = 1;
3452 max = INT_MAX;
3453 ecode++;
3454 goto REPEATTYPE;
3455
3456 case OP_TYPEPOSQUERY:
3457 possessive = TRUE;
3458 min = 0;
3459 max = 1;
3460 ecode++;
3461 goto REPEATTYPE;
3462
3463 case OP_TYPEPOSUPTO:
3464 possessive = TRUE;
3465 min = 0;
3466 max = GET2(ecode, 1);
3467 ecode += 3;
3468 goto REPEATTYPE;
3469
3470 case OP_TYPESTAR:
3471 case OP_TYPEMINSTAR:
3472 case OP_TYPEPLUS:
3473 case OP_TYPEMINPLUS:
3474 case OP_TYPEQUERY:
3475 case OP_TYPEMINQUERY:
3476 c = *ecode++ - OP_TYPESTAR;
3477 minimize = (c & 1) != 0;
3478 min = rep_min[c]; /* Pick up values from tables; */
3479 max = rep_max[c]; /* zero for max => infinity */
3480 if (max == 0) max = INT_MAX;
3481
3482 /* Common code for all repeated single character type matches. Note that
3483 in UTF-8 mode, '.' matches a character of any length; of the other
3484 character types, only \d, \s, and \w are limited to one-byte characters. */
3485
3486 REPEATTYPE:
3487 ctype = *ecode++; /* Code for the character type */
3488
3489 #ifdef SUPPORT_UCP
3490 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3491 {
3492 prop_fail_result = ctype == OP_NOTPROP;
3493 prop_type = *ecode++;
3494 prop_value = *ecode++;
3495 }
3496 else prop_type = -1;
3497 #endif
3498
3499 /* First, ensure the minimum number of matches are present. Use inline
3500 code for maximizing the speed, and do the type test once at the start
3501 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3502 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3503 and single-bytes. */
3504
3505 if (min > 0)
3506 {
3507 #ifdef SUPPORT_UCP
3508 if (prop_type >= 0)
3509 {
3510 switch(prop_type)
3511 {
3512 case PT_ANY:
3513 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3514 for (i = 1; i <= min; i++)
3515 {
3516 if (eptr >= md->end_subject)
3517 {
3518 SCHECK_PARTIAL();
3519 MRRETURN(MATCH_NOMATCH);
3520 }
3521 GETCHARINCTEST(c, eptr);
3522 }
3523 break;
3524
3525 case PT_LAMP:
3526 for (i = 1; i <= min; i++)
3527 {
3528 if (eptr >= md->end_subject)
3529 {
3530 SCHECK_PARTIAL();
3531 MRRETURN(MATCH_NOMATCH);
3532 }
3533 GETCHARINCTEST(c, eptr);
3534 prop_chartype = UCD_CHARTYPE(c);
3535 if ((prop_chartype == ucp_Lu ||
3536 prop_chartype == ucp_Ll ||
3537 prop_chartype == ucp_Lt) == prop_fail_result)
3538 MRRETURN(MATCH_NOMATCH);
3539 }
3540 break;
3541
3542 case PT_GC:
3543 for (i = 1; i <= min; i++)
3544 {
3545 if (eptr >= md->end_subject)
3546 {
3547 SCHECK_PARTIAL();
3548 MRRETURN(MATCH_NOMATCH);
3549 }
3550 GETCHARINCTEST(c, eptr);
3551 prop_category = UCD_CATEGORY(c);
3552 if ((prop_category == prop_value) == prop_fail_result)
3553 MRRETURN(MATCH_NOMATCH);
3554 }
3555 break;
3556
3557 case PT_PC:
3558 for (i = 1; i <= min; i++)
3559 {
3560 if (eptr >= md->end_subject)
3561 {
3562 SCHECK_PARTIAL();
3563 MRRETURN(MATCH_NOMATCH);
3564 }
3565 GETCHARINCTEST(c, eptr);
3566 prop_chartype = UCD_CHARTYPE(c);
3567 if ((prop_chartype == prop_value) == prop_fail_result)
3568 MRRETURN(MATCH_NOMATCH);
3569 }
3570 break;
3571
3572 case PT_SC:
3573 for (i = 1; i <= min; i++)
3574 {
3575 if (eptr >= md->end_subject)
3576 {
3577 SCHECK_PARTIAL();
3578 MRRETURN(MATCH_NOMATCH);
3579 }
3580 GETCHARINCTEST(c, eptr);
3581 prop_script = UCD_SCRIPT(c);
3582 if ((prop_script == prop_value) == prop_fail_result)
3583 MRRETURN(MATCH_NOMATCH);
3584 }
3585 break;
3586
3587 case PT_ALNUM:
3588 for (i = 1; i <= min; i++)
3589 {
3590 if (eptr >= md->end_subject)
3591 {
3592 SCHECK_PARTIAL();
3593 MRRETURN(MATCH_NOMATCH);
3594 }
3595 GETCHARINCTEST(c, eptr);
3596 prop_category = UCD_CATEGORY(c);
3597 if ((prop_category == ucp_L || prop_category == ucp_N)
3598 == prop_fail_result)
3599 MRRETURN(MATCH_NOMATCH);
3600 }
3601 break;
3602
3603 case PT_SPACE: /* Perl space */
3604 for (i = 1; i <= min; i++)
3605 {
3606 if (eptr >= md->end_subject)
3607 {
3608 SCHECK_PARTIAL();
3609 MRRETURN(MATCH_NOMATCH);
3610 }
3611 GETCHARINCTEST(c, eptr);
3612 prop_category = UCD_CATEGORY(c);
3613 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3614 c == CHAR_FF || c == CHAR_CR)
3615 == prop_fail_result)
3616 MRRETURN(MATCH_NOMATCH);
3617 }
3618 break;
3619
3620 case PT_PXSPACE: /* POSIX space */
3621 for (i = 1; i <= min; i++)
3622 {
3623 if (eptr >= md->end_subject)
3624 {
3625 SCHECK_PARTIAL();
3626 MRRETURN(MATCH_NOMATCH);
3627 }
3628 GETCHARINCTEST(c, eptr);
3629 prop_category = UCD_CATEGORY(c);
3630 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3631 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3632 == prop_fail_result)
3633 MRRETURN(MATCH_NOMATCH);
3634 }
3635 break;
3636
3637 case PT_WORD:
3638 for (i = 1; i <= min; i++)
3639 {
3640 if (eptr >= md->end_subject)
3641 {
3642 SCHECK_PARTIAL();
3643 MRRETURN(MATCH_NOMATCH);
3644 }
3645 GETCHARINCTEST(c, eptr);
3646 prop_category = UCD_CATEGORY(c);
3647 if ((prop_category == ucp_L || prop_category == ucp_N ||
3648 c == CHAR_UNDERSCORE)
3649 == prop_fail_result)
3650 MRRETURN(MATCH_NOMATCH);
3651 }
3652 break;
3653
3654 /* This should not occur */
3655
3656 default:
3657 RRETURN(PCRE_ERROR_INTERNAL);
3658 }
3659 }
3660
3661 /* Match extended Unicode sequences. We will get here only if the
3662 support is in the binary; otherwise a compile-time error occurs. */
3663
3664 else if (ctype == OP_EXTUNI)
3665 {
3666 for (i = 1; i <= min; i++)
3667 {
3668 if (eptr >= md->end_subject)
3669 {
3670 SCHECK_PARTIAL();
3671 MRRETURN(MATCH_NOMATCH);
3672 }
3673 GETCHARINCTEST(c, eptr);
3674 prop_category = UCD_CATEGORY(c);
3675 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3676 while (eptr < md->end_subject)
3677 {
3678 int len = 1;
3679 if (!utf8) c = *eptr;
3680 else { GETCHARLEN(c, eptr, len); }
3681 prop_category = UCD_CATEGORY(c);
3682 if (prop_category != ucp_M) break;
3683 eptr += len;
3684 }
3685 }
3686 }
3687
3688 else
3689 #endif /* SUPPORT_UCP */
3690
3691 /* Handle all other cases when the coding is UTF-8 */
3692
3693 #ifdef SUPPORT_UTF8
3694 if (utf8) switch(ctype)
3695 {
3696 case OP_ANY:
3697 for (i = 1; i <= min; i++)
3698 {
3699 if (eptr >= md->end_subject)
3700 {
3701 SCHECK_PARTIAL();
3702 MRRETURN(MATCH_NOMATCH);
3703 }
3704 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3705 eptr++;
3706 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3707 }
3708 break;
3709
3710 case OP_ALLANY:
3711 for (i = 1; i <= min; i++)
3712 {
3713 if (eptr >= md->end_subject)
3714 {
3715 SCHECK_PARTIAL();
3716 MRRETURN(MATCH_NOMATCH);
3717 }
3718 eptr++;
3719 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3720 }
3721 break;
3722
3723 case OP_ANYBYTE:
3724 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3725 eptr += min;
3726 break;
3727
3728 case OP_ANYNL:
3729 for (i = 1; i <= min; i++)
3730 {
3731 if (eptr >= md->end_subject)
3732 {
3733 SCHECK_PARTIAL();
3734 MRRETURN(MATCH_NOMATCH);
3735 }
3736 GETCHARINC(c, eptr);
3737 switch(c)
3738 {
3739 default: MRRETURN(MATCH_NOMATCH);
3740 case 0x000d:
3741 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3742 break;
3743
3744 case 0x000a:
3745 break;
3746
3747 case 0x000b:
3748 case 0x000c:
3749 case 0x0085:
3750 case 0x2028:
3751 case 0x2029:
3752 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3753 break;
3754 }
3755 }
3756 break;
3757
3758 case OP_NOT_HSPACE:
3759 for (i = 1; i <= min; i++)
3760 {
3761 if (eptr >= md->end_subject)
3762 {
3763 SCHECK_PARTIAL();
3764 MRRETURN(MATCH_NOMATCH);
3765 }
3766 GETCHARINC(c, eptr);
3767 switch(c)
3768 {
3769 default: break;
3770 case 0x09: /* HT */
3771 case 0x20: /* SPACE */
3772 case 0xa0: /* NBSP */
3773 case 0x1680: /* OGHAM SPACE MARK */
3774 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3775 case 0x2000: /* EN QUAD */
3776 case 0x2001: /* EM QUAD */
3777 case 0x2002: /* EN SPACE */
3778 case 0x2003: /* EM SPACE */
3779 case 0x2004: /* THREE-PER-EM SPACE */
3780 case 0x2005: /* FOUR-PER-EM SPACE */
3781 case 0x2006: /* SIX-PER-EM SPACE */
3782 case 0x2007: /* FIGURE SPACE */
3783 case 0x2008: /* PUNCTUATION SPACE */
3784 case 0x2009: /* THIN SPACE */
3785 case 0x200A: /* HAIR SPACE */
3786 case 0x202f: /* NARROW NO-BREAK SPACE */
3787 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3788 case 0x3000: /* IDEOGRAPHIC SPACE */
3789 MRRETURN(MATCH_NOMATCH);
3790 }
3791 }
3792 break;
3793
3794 case OP_HSPACE:
3795 for (i = 1; i <= min; i++)
3796 {
3797 if (eptr >= md->end_subject)
3798 {
3799 SCHECK_PARTIAL();
3800 MRRETURN(MATCH_NOMATCH);
3801 }
3802 GETCHARINC(c, eptr);
3803 switch(c)
3804 {
3805 default: MRRETURN(MATCH_NOMATCH);
3806 case 0x09: /* HT */
3807 case 0x20: /* SPACE */
3808 case 0xa0: /* NBSP */
3809 case 0x1680: /* OGHAM SPACE MARK */
3810 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3811 case 0x2000: /* EN QUAD */
3812 case 0x2001: /* EM QUAD */
3813 case 0x2002: /* EN SPACE */
3814 case 0x2003: /* EM SPACE */
3815 case 0x2004: /* THREE-PER-EM SPACE */
3816 case 0x2005: /* FOUR-PER-EM SPACE */
3817 case 0x2006: /* SIX-PER-EM SPACE */
3818 case 0x2007: /* FIGURE SPACE */
3819 case 0x2008: /* PUNCTUATION SPACE */
3820 case 0x2009: /* THIN SPACE */
3821 case 0x200A: /* HAIR SPACE */
3822 case 0x202f: /* NARROW NO-BREAK SPACE */
3823 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3824 case 0x3000: /* IDEOGRAPHIC SPACE */
3825 break;
3826 }
3827 }
3828 break;
3829
3830 case OP_NOT_VSPACE:
3831 for (i = 1; i <= min; i++)
3832 {
3833 if (eptr >= md->end_subject)
3834 {
3835 SCHECK_PARTIAL();
3836 MRRETURN(MATCH_NOMATCH);
3837 }
3838 GETCHARINC(c, eptr);
3839 switch(c)
3840 {
3841 default: break;
3842 case 0x0a: /* LF */
3843 case 0x0b: /* VT */
3844 case 0x0c: /* FF */
3845 case 0x0d: /* CR */
3846 case 0x85: /* NEL */
3847 case 0x2028: /* LINE SEPARATOR */
3848 case 0x2029: /* PARAGRAPH SEPARATOR */
3849 MRRETURN(MATCH_NOMATCH);
3850 }
3851 }
3852 break;
3853
3854 case OP_VSPACE:
3855 for (i = 1; i <= min; i++)
3856 {
3857 if (eptr >= md->end_subject)
3858 {
3859 SCHECK_PARTIAL();
3860 MRRETURN(MATCH_NOMATCH);
3861 }
3862 GETCHARINC(c, eptr);
3863 switch(c)
3864 {
3865 default: MRRETURN(MATCH_NOMATCH);
3866 case 0x0a: /* LF */
3867 case 0x0b: /* VT */
3868 case 0x0c: /* FF */
3869 case 0x0d: /* CR */
3870 case 0x85: /* NEL */
3871 case 0x2028: /* LINE SEPARATOR */
3872 case 0x2029: /* PARAGRAPH SEPARATOR */
3873 break;
3874 }
3875 }
3876 break;
3877
3878 case OP_NOT_DIGIT:
3879 for (i = 1; i <= min; i++)
3880 {
3881 if (eptr >= md->end_subject)
3882 {
3883 SCHECK_PARTIAL();
3884 MRRETURN(MATCH_NOMATCH);
3885 }
3886 GETCHARINC(c, eptr);
3887 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3888 MRRETURN(MATCH_NOMATCH);
3889 }
3890 break;
3891
3892 case OP_DIGIT:
3893 for (i = 1; i <= min; i++)
3894 {
3895 if (eptr >= md->end_subject)
3896 {
3897 SCHECK_PARTIAL();
3898 MRRETURN(MATCH_NOMATCH);
3899 }
3900 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3901 MRRETURN(MATCH_NOMATCH);
3902 /* No need to skip more bytes - we know it's a 1-byte character */
3903 }
3904 break;
3905
3906 case OP_NOT_WHITESPACE:
3907 for (i = 1; i <= min; i++)
3908 {
3909 if (eptr >= md->end_subject)
3910 {
3911 SCHECK_PARTIAL();
3912 MRRETURN(MATCH_NOMATCH);
3913 }
3914 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3915 MRRETURN(MATCH_NOMATCH);
3916 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3917 }
3918 break;
3919
3920 case OP_WHITESPACE:
3921 for (i = 1; i <= min; i++)
3922 {
3923 if (eptr >= md->end_subject)
3924 {
3925 SCHECK_PARTIAL();
3926 MRRETURN(MATCH_NOMATCH);
3927 }
3928 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3929 MRRETURN(MATCH_NOMATCH);
3930 /* No need to skip more bytes - we know it's a 1-byte character */
3931 }
3932 break;
3933
3934 case OP_NOT_WORDCHAR:
3935 for (i = 1; i <= min; i++)
3936 {
3937 if (eptr >= md->end_subject)
3938 {
3939 SCHECK_PARTIAL();
3940 MRRETURN(MATCH_NOMATCH);
3941 }
3942 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3943 MRRETURN(MATCH_NOMATCH);
3944 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3945 }
3946 break;
3947
3948 case OP_WORDCHAR:
3949 for (i = 1; i <= min; i++)
3950 {
3951 if (eptr >= md->end_subject)
3952 {
3953 SCHECK_PARTIAL();
3954 MRRETURN(MATCH_NOMATCH);
3955 }
3956 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3957 MRRETURN(MATCH_NOMATCH);
3958 /* No need to skip more bytes - we know it's a 1-byte character */
3959 }
3960 break;
3961
3962 default:
3963 RRETURN(PCRE_ERROR_INTERNAL);
3964 } /* End switch(ctype) */
3965
3966 else
3967 #endif /* SUPPORT_UTF8 */
3968
3969 /* Code for the non-UTF-8 case for minimum matching of operators other
3970 than OP_PROP and OP_NOTPROP. */
3971
3972 switch(ctype)
3973 {
3974 case OP_ANY:
3975 for (i = 1; i <= min; i++)
3976 {
3977 if (eptr >= md->end_subject)
3978 {
3979 SCHECK_PARTIAL();
3980 MRRETURN(MATCH_NOMATCH);
3981 }
3982 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3983 eptr++;
3984 }
3985 break;
3986
3987 case OP_ALLANY:
3988 if (eptr > md->end_subject - min)
3989 {
3990 SCHECK_PARTIAL();
3991 MRRETURN(MATCH_NOMATCH);
3992 }
3993 eptr += min;
3994 break;
3995
3996 case OP_ANYBYTE:
3997 if (eptr > md->end_subject - min)
3998 {
3999 SCHECK_PARTIAL();
4000 MRRETURN(MATCH_NOMATCH);
4001 }
4002 eptr += min;
4003 break;
4004
4005 case OP_ANYNL:
4006 for (i = 1; i <= min; i++)
4007 {
4008 if (eptr >= md->end_subject)
4009 {
4010 SCHECK_PARTIAL();
4011 MRRETURN(MATCH_NOMATCH);
4012 }
4013 switch(*eptr++)
4014 {
4015 default: MRRETURN(MATCH_NOMATCH);
4016 case 0x000d:
4017 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4018 break;
4019 case 0x000a:
4020 break;
4021
4022 case 0x000b:
4023 case 0x000c:
4024 case 0x0085:
4025 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4026 break;
4027 }
4028 }
4029 break;
4030
4031 case OP_NOT_HSPACE:
4032 for (i = 1; i <= min; i++)
4033 {
4034 if (eptr >= md->end_subject)
4035 {
4036 SCHECK_PARTIAL();
4037 MRRETURN(MATCH_NOMATCH);
4038 }
4039 switch(*eptr++)
4040 {
4041 default: break;
4042 case 0x09: /* HT */
4043 case 0x20: /* SPACE */
4044 case 0xa0: /* NBSP */
4045 MRRETURN(MATCH_NOMATCH);
4046 }
4047 }
4048 break;
4049
4050 case OP_HSPACE:
4051 for (i = 1; i <= min; i++)
4052 {
4053 if (eptr >= md->end_subject)
4054 {
4055 SCHECK_PARTIAL();
4056 MRRETURN(MATCH_NOMATCH);
4057 }
4058 switch(*eptr++)
4059 {
4060 default: MRRETURN(MATCH_NOMATCH);
4061 case 0x09: /* HT */
4062 case 0x20: /* SPACE */
4063 case 0xa0: /* NBSP */
4064 break;
4065 }
4066 }
4067 break;
4068
4069 case OP_NOT_VSPACE:
4070 for (i = 1; i <= min; i++)
4071 {
4072 if (eptr >= md->end_subject)
4073 {
4074 SCHECK_PARTIAL();
4075 MRRETURN(MATCH_NOMATCH);
4076 }
4077 switch(*eptr++)
4078 {
4079 default: break;
4080 case 0x0a: /* LF */
4081 case 0x0b: /* VT */
4082 case 0x0c: /* FF */
4083 case 0x0d: /* CR */
4084 case 0x85: /* NEL */
4085 MRRETURN(MATCH_NOMATCH);
4086 }
4087 }
4088 break;
4089
4090 case OP_VSPACE:
4091 for (i = 1; i <= min; i++)
4092 {
4093 if (eptr >= md->end_subject)
4094 {
4095 SCHECK_PARTIAL();
4096 MRRETURN(MATCH_NOMATCH);
4097 }
4098 switch(*eptr++)
4099 {
4100 default: MRRETURN(MATCH_NOMATCH);
4101 case 0x0a: /* LF */
4102 case 0x0b: /* VT */
4103 case 0x0c: /* FF */
4104 case 0x0d: /* CR */
4105 case 0x85: /* NEL */
4106 break;
4107 }
4108 }
4109 break;
4110
4111 case OP_NOT_DIGIT:
4112 for (i = 1; i <= min; i++)
4113 {
4114 if (eptr >= md->end_subject)
4115 {
4116 SCHECK_PARTIAL();
4117 MRRETURN(MATCH_NOMATCH);
4118 }
4119 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4120 }
4121 break;
4122
4123 case OP_DIGIT:
4124 for (i = 1; i <= min; i++)
4125 {
4126 if (eptr >= md->end_subject)
4127 {
4128 SCHECK_PARTIAL();
4129 MRRETURN(MATCH_NOMATCH);
4130 }
4131 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4132 }
4133 break;
4134
4135 case OP_NOT_WHITESPACE:
4136 for (i = 1; i <= min; i++)
4137 {
4138 if (eptr >= md->end_subject)
4139 {
4140 SCHECK_PARTIAL();
4141 MRRETURN(MATCH_NOMATCH);
4142 }
4143 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4144 }
4145 break;
4146
4147 case OP_WHITESPACE:
4148 for (i = 1; i <= min; i++)
4149 {
4150 if (eptr >= md->end_subject)
4151 {
4152 SCHECK_PARTIAL();
4153 MRRETURN(MATCH_NOMATCH);
4154 }
4155 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4156 }
4157 break;
4158
4159 case OP_NOT_WORDCHAR:
4160 for (i = 1; i <= min; i++)
4161 {
4162 if (eptr >= md->end_subject)
4163 {
4164 SCHECK_PARTIAL();
4165 MRRETURN(MATCH_NOMATCH);
4166 }
4167 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4168 MRRETURN(MATCH_NOMATCH);
4169 }
4170 break;
4171
4172 case OP_WORDCHAR:
4173 for (i = 1; i <= min; i++)
4174 {
4175 if (eptr >= md->end_subject)
4176 {
4177 SCHECK_PARTIAL();
4178 MRRETURN(MATCH_NOMATCH);
4179 }
4180 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4181 MRRETURN(MATCH_NOMATCH);
4182 }
4183 break;
4184
4185 default:
4186 RRETURN(PCRE_ERROR_INTERNAL);
4187 }
4188 }
4189
4190 /* If min = max, continue at the same level without recursing */
4191
4192 if (min == max) continue;
4193
4194 /* If minimizing, we have to test the rest of the pattern before each
4195 subsequent match. Again, separate the UTF-8 case for speed, and also
4196 separate the UCP cases. */
4197
4198 if (minimize)
4199 {
4200 #ifdef SUPPORT_UCP
4201 if (prop_type >= 0)
4202 {
4203 switch(prop_type)
4204 {
4205 case PT_ANY:
4206 for (fi = min;; fi++)
4207 {
4208 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
4209 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4210 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4211 if (eptr >= md->end_subject)
4212 {
4213 SCHECK_PARTIAL();
4214 MRRETURN(MATCH_NOMATCH);
4215 }
4216 GETCHARINCTEST(c, eptr);
4217 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4218 }
4219 /* Control never gets here */
4220
4221 case PT_LAMP:
4222 for (fi = min;; fi++)
4223 {
4224 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
4225 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4226 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4227 if (eptr >= md->end_subject)
4228 {
4229 SCHECK_PARTIAL();
4230 MRRETURN(MATCH_NOMATCH);
4231 }
4232 GETCHARINCTEST(c, eptr);
4233 prop_chartype = UCD_CHARTYPE(c);
4234 if ((prop_chartype == ucp_Lu ||
4235 prop_chartype == ucp_Ll ||
4236 prop_chartype == ucp_Lt) == prop_fail_result)
4237 MRRETURN(MATCH_NOMATCH);
4238 }
4239 /* Control never gets here */
4240
4241 case PT_GC:
4242 for (fi = min;; fi++)
4243 {
4244 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4245 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4246 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4247 if (eptr >= md->end_subject)
4248 {
4249 SCHECK_PARTIAL();
4250 MRRETURN(MATCH_NOMATCH);
4251 }
4252 GETCHARINCTEST(c, eptr);
4253 prop_category = UCD_CATEGORY(c);
4254 if ((prop_category == prop_value) == prop_fail_result)
4255 MRRETURN(MATCH_NOMATCH);
4256 }
4257 /* Control never gets here */
4258
4259 case PT_PC:
4260 for (fi = min;; fi++)
4261 {
4262 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4263 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4264 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4265 if (eptr >= md->end_subject)
4266 {
4267 SCHECK_PARTIAL();
4268 MRRETURN(MATCH_NOMATCH);
4269 }
4270 GETCHARINCTEST(c, eptr);
4271 prop_chartype = UCD_CHARTYPE(c);
4272 if ((prop_chartype == prop_value) == prop_fail_result)
4273 MRRETURN(MATCH_NOMATCH);
4274 }
4275 /* Control never gets here */
4276
4277 case PT_SC:
4278 for (fi = min;; fi++)
4279 {
4280 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4281 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4282 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4283 if (eptr >= md->end_subject)
4284 {
4285 SCHECK_PARTIAL();
4286 MRRETURN(MATCH_NOMATCH);
4287 }
4288 GETCHARINCTEST(c, eptr);
4289 prop_script = UCD_SCRIPT(c);
4290 if ((prop_script == prop_value) == prop_fail_result)
4291 MRRETURN(MATCH_NOMATCH);
4292 }
4293 /* Control never gets here */
4294
4295 case PT_ALNUM:
4296 for (fi = min;; fi++)
4297 {
4298 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM59);
4299 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4300 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4301 if (eptr >= md->end_subject)
4302 {
4303 SCHECK_PARTIAL();
4304 MRRETURN(MATCH_NOMATCH);
4305 }
4306 GETCHARINCTEST(c, eptr);
4307 prop_category = UCD_CATEGORY(c);
4308 if ((prop_category == ucp_L || prop_category == ucp_N)
4309 == prop_fail_result)
4310 MRRETURN(MATCH_NOMATCH);
4311 }
4312 /* Control never gets here */
4313
4314 case PT_SPACE: /* Perl space */
4315 for (fi = min;; fi++)
4316 {
4317 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM60);
4318 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4319 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4320 if (eptr >= md->end_subject)
4321 {
4322 SCHECK_PARTIAL();
4323 MRRETURN(MATCH_NOMATCH);
4324 }
4325 GETCHARINCTEST(c, eptr);
4326 prop_category = UCD_CATEGORY(c);
4327 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4328 c == CHAR_FF || c == CHAR_CR)
4329 == prop_fail_result)
4330 MRRETURN(MATCH_NOMATCH);
4331 }
4332 /* Control never gets here */
4333
4334 case PT_PXSPACE: /* POSIX space */
4335 for (fi = min;; fi++)
4336 {
4337 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM61);
4338 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4339 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4340 if (eptr >= md->end_subject)
4341 {
4342 SCHECK_PARTIAL();
4343 MRRETURN(MATCH_NOMATCH);
4344 }
4345 GETCHARINCTEST(c, eptr);
4346 prop_category = UCD_CATEGORY(c);
4347 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4348 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4349 == prop_fail_result)
4350 MRRETURN(MATCH_NOMATCH);
4351 }
4352 /* Control never gets here */
4353
4354 case PT_WORD:
4355 for (fi = min;; fi++)
4356 {
4357 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM62);
4358 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4359 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4360 if (eptr >= md->end_subject)
4361 {
4362 SCHECK_PARTIAL();
4363 MRRETURN(MATCH_NOMATCH);
4364 }
4365 GETCHARINCTEST(c, eptr);
4366 prop_category = UCD_CATEGORY(c);
4367 if ((prop_category == ucp_L ||
4368 prop_category == ucp_N ||
4369 c == CHAR_UNDERSCORE)
4370 == prop_fail_result)
4371 MRRETURN(MATCH_NOMATCH);
4372 }
4373 /* Control never gets here */
4374
4375 /* This should never occur */
4376
4377 default:
4378 RRETURN(PCRE_ERROR_INTERNAL);
4379 }
4380 }
4381
4382 /* Match extended Unicode sequences. We will get here only if the
4383 support is in the binary; otherwise a compile-time error occurs. */
4384
4385 else if (ctype == OP_EXTUNI)
4386 {
4387 for (fi = min;; fi++)
4388 {
4389 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4390 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4391 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4392 if (eptr >= md->end_subject)
4393 {
4394 SCHECK_PARTIAL();
4395 MRRETURN(MATCH_NOMATCH);
4396 }
4397 GETCHARINCTEST(c, eptr);
4398 prop_category = UCD_CATEGORY(c);
4399 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4400 while (eptr < md->end_subject)
4401 {
4402 int len = 1;
4403 if (!utf8) c = *eptr;
4404 else { GETCHARLEN(c, eptr, len); }
4405 prop_category = UCD_CATEGORY(c);
4406 if (prop_category != ucp_M) break;
4407 eptr += len;
4408 }
4409 }
4410 }
4411
4412 else
4413 #endif /* SUPPORT_UCP */
4414
4415 #ifdef SUPPORT_UTF8
4416 /* UTF-8 mode */
4417 if (utf8)
4418 {
4419 for (fi = min;; fi++)
4420 {
4421 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4422 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4423 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4424 if (eptr >= md->end_subject)
4425 {
4426 SCHECK_PARTIAL();
4427 MRRETURN(MATCH_NOMATCH);
4428 }
4429 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4430 MRRETURN(MATCH_NOMATCH);
4431 GETCHARINC(c, eptr);
4432 switch(ctype)
4433 {
4434 case OP_ANY: /* This is the non-NL case */
4435 case OP_ALLANY:
4436 case OP_ANYBYTE:
4437 break;
4438
4439 case OP_ANYNL:
4440 switch(c)
4441 {
4442 default: MRRETURN(MATCH_NOMATCH);
4443 case 0x000d:
4444 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4445 break;
4446 case 0x000a:
4447 break;
4448
4449 case 0x000b:
4450 case 0x000c:
4451 case 0x0085:
4452 case 0x2028:
4453 case 0x2029:
4454 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4455 break;
4456 }
4457 break;
4458
4459 case OP_NOT_HSPACE:
4460 switch(c)
4461 {
4462 default: break;
4463 case 0x09: /* HT */
4464 case 0x20: /* SPACE */
4465 case 0xa0: /* NBSP */
4466 case 0x1680: /* OGHAM SPACE MARK */
4467 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4468 case 0x2000: /* EN QUAD */
4469 case 0x2001: /* EM QUAD */
4470 case 0x2002: /* EN SPACE */
4471 case 0x2003: /* EM SPACE */
4472 case 0x2004: /* THREE-PER-EM SPACE */
4473 case 0x2005: /* FOUR-PER-EM SPACE */
4474 case 0x2006: /* SIX-PER-EM SPACE */
4475 case 0x2007: /* FIGURE SPACE */
4476 case 0x2008: /* PUNCTUATION SPACE */
4477 case 0x2009: /* THIN SPACE */
4478 case 0x200A: /* HAIR SPACE */
4479 case 0x202f: /* NARROW NO-BREAK SPACE */
4480 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4481 case 0x3000: /* IDEOGRAPHIC SPACE */
4482 MRRETURN(MATCH_NOMATCH);
4483 }
4484 break;
4485
4486 case OP_HSPACE:
4487 switch(c)
4488 {
4489 default: MRRETURN(MATCH_NOMATCH);
4490 case 0x09: /* HT */
4491 case 0x20: /* SPACE */
4492 case 0xa0: /* NBSP */
4493 case 0x1680: /* OGHAM SPACE MARK */
4494 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4495 case 0x2000: /* EN QUAD */
4496 case 0x2001: /* EM QUAD */
4497 case 0x2002: /* EN SPACE */
4498 case 0x2003: /* EM SPACE */
4499 case 0x2004: /* THREE-PER-EM SPACE */
4500 case 0x2005: /* FOUR-PER-EM SPACE */
4501 case 0x2006: /* SIX-PER-EM SPACE */
4502 case 0x2007: /* FIGURE SPACE */
4503 case 0x2008: /* PUNCTUATION SPACE */
4504 case 0x2009: /* THIN SPACE */
4505 case 0x200A: /* HAIR SPACE */
4506 case 0x202f: /* NARROW NO-BREAK SPACE */
4507 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4508 case 0x3000: /* IDEOGRAPHIC SPACE */
4509 break;
4510 }
4511 break;
4512
4513 case OP_NOT_VSPACE:
4514 switch(c)
4515 {
4516 default: break;
4517 case 0x0a: /* LF */
4518 case 0x0b: /* VT */
4519 case 0x0c: /* FF */
4520 case 0x0d: /* CR */
4521 case 0x85: /* NEL */
4522 case 0x2028: /* LINE SEPARATOR */
4523 case 0x2029: /* PARAGRAPH SEPARATOR */
4524 MRRETURN(MATCH_NOMATCH);
4525 }
4526 break;
4527
4528 case OP_VSPACE:
4529 switch(c)
4530 {
4531 default: MRRETURN(MATCH_NOMATCH);
4532 case 0x0a: /* LF */
4533 case 0x0b: /* VT */
4534 case 0x0c: /* FF */
4535 case 0x0d: /* CR */
4536 case 0x85: /* NEL */
4537 case 0x2028: /* LINE SEPARATOR */
4538 case 0x2029: /* PARAGRAPH SEPARATOR */
4539 break;
4540 }
4541 break;
4542
4543 case OP_NOT_DIGIT:
4544 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4545 MRRETURN(MATCH_NOMATCH);
4546 break;
4547
4548 case OP_DIGIT:
4549 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4550 MRRETURN(MATCH_NOMATCH);
4551 break;
4552
4553 case OP_NOT_WHITESPACE:
4554 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4555 MRRETURN(MATCH_NOMATCH);
4556 break;
4557
4558 case OP_WHITESPACE:
4559 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4560 MRRETURN(MATCH_NOMATCH);
4561 break;
4562
4563 case OP_NOT_WORDCHAR:
4564 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4565 MRRETURN(MATCH_NOMATCH);
4566 break;
4567
4568 case OP_WORDCHAR:
4569 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4570 MRRETURN(MATCH_NOMATCH);
4571 break;
4572
4573 default:
4574 RRETURN(PCRE_ERROR_INTERNAL);
4575 }
4576 }
4577 }
4578 else
4579 #endif
4580 /* Not UTF-8 mode */
4581 {
4582 for (fi = min;; fi++)
4583 {
4584 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4585 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4586 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4587 if (eptr >= md->end_subject)
4588 {
4589 SCHECK_PARTIAL();
4590 MRRETURN(MATCH_NOMATCH);
4591 }
4592 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4593 MRRETURN(MATCH_NOMATCH);
4594 c = *eptr++;
4595 switch(ctype)
4596 {
4597 case OP_ANY: /* This is the non-NL case */
4598 case OP_ALLANY:
4599 case OP_ANYBYTE:
4600 break;
4601
4602 case OP_ANYNL:
4603 switch(c)
4604 {
4605 default: MRRETURN(MATCH_NOMATCH);
4606 case 0x000d:
4607 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4608 break;
4609
4610 case 0x000a:
4611 break;
4612
4613 case 0x000b:
4614 case 0x000c:
4615 case 0x0085:
4616 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4617 break;
4618 }
4619 break;
4620
4621 case OP_NOT_HSPACE:
4622 switch(c)
4623 {
4624 default: break;
4625 case 0x09: /* HT */
4626 case 0x20: /* SPACE */
4627 case 0xa0: /* NBSP */
4628 MRRETURN(MATCH_NOMATCH);
4629 }
4630 break;
4631
4632 case OP_HSPACE:
4633 switch(c)
4634 {
4635 default: MRRETURN(MATCH_NOMATCH);
4636 case 0x09: /* HT */
4637 case 0x20: /* SPACE */
4638 case 0xa0: /* NBSP */
4639 break;
4640 }
4641 break;
4642
4643 case OP_NOT_VSPACE:
4644 switch(c)
4645 {
4646 default: break;
4647 case 0x0a: /* LF */
4648 case 0x0b: /* VT */
4649 case 0x0c: /* FF */
4650 case 0x0d: /* CR */
4651 case 0x85: /* NEL */
4652 MRRETURN(MATCH_NOMATCH);
4653 }
4654 break;
4655
4656 case OP_VSPACE:
4657 switch(c)
4658 {
4659 default: MRRETURN(MATCH_NOMATCH);
4660 case 0x0a: /* LF */
4661 case 0x0b: /* VT */
4662 case 0x0c: /* FF */
4663 case 0x0d: /* CR */
4664 case 0x85: /* NEL */
4665 break;
4666 }
4667 break;
4668
4669 case OP_NOT_DIGIT:
4670 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4671 break;
4672
4673 case OP_DIGIT:
4674 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4675 break;
4676
4677 case OP_NOT_WHITESPACE:
4678 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4679 break;
4680
4681 case OP_WHITESPACE:
4682 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4683 break;
4684
4685 case OP_NOT_WORDCHAR:
4686 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4687 break;
4688
4689 case OP_WORDCHAR:
4690 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4691 break;
4692
4693 default:
4694 RRETURN(PCRE_ERROR_INTERNAL);
4695 }
4696 }
4697 }
4698 /* Control never gets here */
4699 }
4700
4701 /* If maximizing, it is worth using inline code for speed, doing the type
4702 test once at the start (i.e. keep it out of the loop). Again, keep the
4703 UTF-8 and UCP stuff separate. */
4704
4705 else
4706 {
4707 pp = eptr; /* Remember where we started */
4708
4709 #ifdef SUPPORT_UCP
4710 if (prop_type >= 0)
4711 {
4712 switch(prop_type)
4713 {
4714 case PT_ANY:
4715 for (i = min; i < max; i++)
4716 {
4717 int len = 1;
4718 if (eptr >= md->end_subject)
4719 {
4720 SCHECK_PARTIAL();
4721 break;
4722 }
4723 GETCHARLENTEST(c, eptr, len);
4724 if (prop_fail_result) break;
4725 eptr+= len;
4726 }
4727 break;
4728
4729 case PT_LAMP:
4730 for (i = min; i < max; i++)
4731 {
4732 int len = 1;
4733 if (eptr >= md->end_subject)
4734 {
4735 SCHECK_PARTIAL();
4736 break;
4737 }
4738 GETCHARLENTEST(c, eptr, len);
4739 prop_chartype = UCD_CHARTYPE(c);
4740 if ((prop_chartype == ucp_Lu ||
4741 prop_chartype == ucp_Ll ||
4742 prop_chartype == ucp_Lt) == prop_fail_result)
4743 break;
4744 eptr+= len;
4745 }
4746 break;
4747
4748 case PT_GC:
4749 for (i = min; i < max; i++)
4750 {
4751 int len = 1;
4752 if (eptr >= md->end_subject)
4753 {
4754 SCHECK_PARTIAL();
4755 break;
4756 }
4757 GETCHARLENTEST(c, eptr, len);
4758 prop_category = UCD_CATEGORY(c);
4759 if ((prop_category == prop_value) == prop_fail_result)
4760 break;
4761 eptr+= len;
4762 }
4763 break;
4764
4765 case PT_PC:
4766 for (i = min; i < max; i++)
4767 {
4768 int len = 1;
4769 if (eptr >= md->end_subject)
4770 {
4771 SCHECK_PARTIAL();
4772 break;
4773 }
4774 GETCHARLENTEST(c, eptr, len);
4775 prop_chartype = UCD_CHARTYPE(c);
4776 if ((prop_chartype == prop_value) == prop_fail_result)
4777 break;
4778 eptr+= len;
4779 }
4780 break;
4781
4782 case PT_SC:
4783 for (i = min; i < max; i++)
4784 {
4785 int len = 1;
4786 if (eptr >= md->end_subject)
4787 {
4788 SCHECK_PARTIAL();
4789 break;
4790 }
4791 GETCHARLENTEST(c, eptr, len);
4792 prop_script = UCD_SCRIPT(c);
4793 if ((prop_script == prop_value) == prop_fail_result)
4794 break;
4795 eptr+= len;
4796 }
4797 break;
4798
4799 case PT_ALNUM:
4800 for (i = min; i < max; i++)
4801 {
4802 int len = 1;
4803 if (eptr >= md->end_subject)
4804 {
4805 SCHECK_PARTIAL();
4806 break;
4807 }
4808 GETCHARLENTEST(c, eptr, len);
4809 prop_category = UCD_CATEGORY(c);
4810 if ((prop_category == ucp_L || prop_category == ucp_N)
4811 == prop_fail_result)
4812 break;
4813 eptr+= len;
4814 }
4815 break;
4816
4817 case PT_SPACE: /* Perl space */
4818 for (i = min; i < max; i++)
4819 {
4820 int len = 1;
4821 if (eptr >= md->end_subject)
4822 {
4823 SCHECK_PARTIAL();
4824 break;
4825 }
4826 GETCHARLENTEST(c, eptr, len);
4827 prop_category = UCD_CATEGORY(c);
4828 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4829 c == CHAR_FF || c == CHAR_CR)
4830 == prop_fail_result)
4831 break;
4832 eptr+= len;
4833 }
4834 break;
4835
4836 case PT_PXSPACE: /* POSIX space */
4837 for (i = min; i < max; i++)
4838 {
4839 int len = 1;
4840 if (eptr >= md->end_subject)
4841 {
4842 SCHECK_PARTIAL();
4843 break;
4844 }
4845 GETCHARLENTEST(c, eptr, len);
4846 prop_category = UCD_CATEGORY(c);
4847 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4848 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4849 == prop_fail_result)
4850 break;
4851 eptr+= len;
4852 }
4853 break;
4854
4855 case PT_WORD:
4856 for (i = min; i < max; i++)
4857 {
4858 int len = 1;
4859 if (eptr >= md->end_subject)
4860 {
4861 SCHECK_PARTIAL();
4862 break;
4863 }
4864 GETCHARLENTEST(c, eptr, len);
4865 prop_category = UCD_CATEGORY(c);
4866 if ((prop_category == ucp_L || prop_category == ucp_N ||
4867 c == CHAR_UNDERSCORE) == prop_fail_result)
4868 break;
4869 eptr+= len;
4870 }
4871 break;
4872
4873 default:
4874 RRETURN(PCRE_ERROR_INTERNAL);
4875 }
4876
4877 /* eptr is now past the end of the maximum run */
4878
4879 if (possessive) continue;
4880 for(;;)
4881 {
4882 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4883 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4884 if (eptr-- == pp) break; /* Stop if tried at original pos */
4885 if (utf8) BACKCHAR(eptr);
4886 }
4887 }
4888
4889 /* Match extended Unicode sequences. We will get here only if the
4890 support is in the binary; otherwise a compile-time error occurs. */
4891
4892 else if (ctype == OP_EXTUNI)
4893 {
4894 for (i = min; i < max; i++)
4895 {
4896 if (eptr >= md->end_subject)
4897 {
4898 SCHECK_PARTIAL();
4899 break;
4900 }
4901 GETCHARINCTEST(c, eptr);
4902 prop_category = UCD_CATEGORY(c);
4903 if (prop_category == ucp_M) break;
4904 while (eptr < md->end_subject)
4905 {
4906 int len = 1;
4907 if (!utf8) c = *eptr; else
4908 {
4909 GETCHARLEN(c, eptr, len);
4910 }
4911 prop_category = UCD_CATEGORY(c);
4912 if (prop_category != ucp_M) break;
4913 eptr += len;
4914 }
4915 }
4916
4917 /* eptr is now past the end of the maximum run */
4918
4919 if (possessive) continue;
4920
4921 for(;;)
4922 {
4923 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4924 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4925 if (eptr-- == pp) break; /* Stop if tried at original pos */
4926 for (;;) /* Move back over one extended */
4927 {
4928 int len = 1;
4929 if (!utf8) c = *eptr; else
4930 {
4931 BACKCHAR(eptr);
4932 GETCHARLEN(c, eptr, len);
4933 }
4934 prop_category = UCD_CATEGORY(c);
4935 if (prop_category != ucp_M) break;
4936 eptr--;
4937 }
4938 }
4939 }
4940
4941 else
4942 #endif /* SUPPORT_UCP */
4943
4944 #ifdef SUPPORT_UTF8
4945 /* UTF-8 mode */
4946
4947 if (utf8)
4948 {
4949 switch(ctype)
4950 {
4951 case OP_ANY:
4952 if (max < INT_MAX)
4953 {
4954 for (i = min; i < max; i++)
4955 {
4956 if (eptr >= md->end_subject)
4957 {
4958 SCHECK_PARTIAL();
4959 break;
4960 }
4961 if (IS_NEWLINE(eptr)) break;
4962 eptr++;
4963 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4964 }
4965 }
4966
4967 /* Handle unlimited UTF-8 repeat */
4968
4969 else
4970 {
4971 for (i = min; i < max; i++)
4972 {
4973 if (eptr >= md->end_subject)
4974 {
4975 SCHECK_PARTIAL();
4976 break;
4977 }
4978 if (IS_NEWLINE(eptr)) break;
4979 eptr++;
4980 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4981 }
4982 }
4983 break;
4984
4985 case OP_ALLANY:
4986 if (max < INT_MAX)
4987 {
4988 for (i = min; i < max; i++)
4989 {
4990 if (eptr >= md->end_subject)
4991 {
4992 SCHECK_PARTIAL();
4993 break;
4994 }
4995 eptr++;
4996 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4997 }
4998 }
4999 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5000 break;
5001
5002 /* The byte case is the same as non-UTF8 */
5003
5004 case OP_ANYBYTE:
5005 c = max - min;
5006 if (c > (unsigned int)(md->end_subject - eptr))
5007 {
5008 eptr = md->end_subject;
5009 SCHECK_PARTIAL();
5010 }
5011 else eptr += c;
5012 break;
5013
5014 case OP_ANYNL:
5015 for (i = min; i < max; i++)
5016 {
5017 int len = 1;
5018 if (eptr >= md->end_subject)
5019 {
5020 SCHECK_PARTIAL();
5021 break;
5022 }
5023 GETCHARLEN(c, eptr, len);
5024 if (c == 0x000d)
5025 {
5026 if (++eptr >= md->end_subject) break;
5027 if (*eptr == 0x000a) eptr++;
5028 }
5029 else
5030 {
5031 if (c != 0x000a &&
5032 (md->bsr_anycrlf ||
5033 (c != 0x000b && c != 0x000c &&
5034 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5035 break;
5036 eptr += len;
5037 }
5038 }
5039 break;
5040
5041 case OP_NOT_HSPACE:
5042 case OP_HSPACE:
5043 for (i = min; i < max; i++)
5044 {
5045 BOOL gotspace;
5046 int len = 1;
5047 if (eptr >= md->end_subject)
5048 {
5049 SCHECK_PARTIAL();
5050 break;
5051 }
5052 GETCHARLEN(c, eptr, len);
5053 switch(c)
5054 {
5055 default: gotspace = FALSE; break;
5056 case 0x09: /* HT */
5057 case 0x20: /* SPACE */
5058 case 0xa0: /* NBSP */
5059 case 0x1680: /* OGHAM SPACE MARK */
5060 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5061 case 0x2000: /* EN QUAD */
5062 case 0x2001: /* EM QUAD */
5063 case 0x2002: /* EN SPACE */
5064 case 0x2003: /* EM SPACE */
5065 case 0x2004: /* THREE-PER-EM SPACE */
5066 case 0x2005: /* FOUR-PER-EM SPACE */
5067 case 0x2006: /* SIX-PER-EM SPACE */
5068 case 0x2007: /* FIGURE SPACE */
5069 case 0x2008: /* PUNCTUATION SPACE */
5070 case 0x2009: /* THIN SPACE */
5071 case 0x200A: /* HAIR SPACE */
5072 case 0x202f: /* NARROW NO-BREAK SPACE */
5073 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5074 case 0x3000: /* IDEOGRAPHIC SPACE */
5075 gotspace = TRUE;
5076 break;
5077 }
5078 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5079 eptr += len;
5080 }
5081 break;
5082
5083 case OP_NOT_VSPACE:
5084 case OP_VSPACE:
5085 for (i = min; i < max; i++)
5086 {
5087 BOOL gotspace;
5088 int len = 1;
5089 if (eptr >= md->end_subject)
5090 {
5091 SCHECK_PARTIAL();
5092 break;
5093 }
5094 GETCHARLEN(c, eptr, len);
5095 switch(c)
5096 {
5097 default: gotspace = FALSE; break;
5098 case 0x0a: /* LF */
5099 case 0x0b: /* VT */
5100 case 0x0c: /* FF */
5101 case 0x0d: /* CR */
5102 case 0x85: /* NEL */
5103 case 0x2028: /* LINE SEPARATOR */
5104 case 0x2029: /* PARAGRAPH SEPARATOR */
5105 gotspace = TRUE;
5106 break;
5107 }
5108 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5109 eptr += len;
5110 }
5111 break;
5112
5113 case OP_NOT_DIGIT:
5114 for (i = min; i < max; i++)
5115 {
5116 int len = 1;
5117 if (eptr >= md->end_subject)
5118 {
5119 SCHECK_PARTIAL();
5120 break;
5121 }
5122 GETCHARLEN(c, eptr, len);
5123 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5124 eptr+= len;
5125 }
5126 break;
5127
5128 case OP_DIGIT:
5129 for (i = min; i < max; i++)
5130 {
5131 int len = 1;
5132 if (eptr >= md->end_subject)
5133 {
5134 SCHECK_PARTIAL();
5135 break;
5136 }
5137 GETCHARLEN(c, eptr, len);
5138 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5139 eptr+= len;
5140 }
5141 break;
5142
5143 case OP_NOT_WHITESPACE:
5144 for (i = min; i < max; i++)
5145 {
5146 int len = 1;
5147 if (eptr >= md->end_subject)
5148 {
5149 SCHECK_PARTIAL();
5150 break;
5151 }
5152 GETCHARLEN(c, eptr, len);
5153 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5154 eptr+= len;
5155 }
5156 break;
5157
5158 case OP_WHITESPACE:
5159 for (i = min; i < max; i++)
5160 {
5161 int len = 1;
5162 if (eptr >= md->end_subject)
5163 {
5164 SCHECK_PARTIAL();
5165 break;
5166 }
5167 GETCHARLEN(c, eptr, len);
5168 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5169 eptr+= len;
5170 }
5171 break;
5172
5173 case OP_NOT_WORDCHAR:
5174 for (i = min; i < max; i++)
5175 {
5176 int len = 1;
5177 if (eptr >= md->end_subject)
5178 {
5179 SCHECK_PARTIAL();
5180 break;
5181 }
5182 GETCHARLEN(c, eptr, len);
5183 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5184 eptr+= len;
5185 }
5186 break;
5187
5188 case OP_WORDCHAR:
5189 for (i = min; i < max; i++)
5190 {
5191 int len = 1;
5192 if (eptr >= md->end_subject)
5193 {
5194 SCHECK_PARTIAL();
5195 break;
5196 }
5197 GETCHARLEN(c, eptr, len);
5198 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5199 eptr+= len;
5200 }
5201 break;
5202
5203 default:
5204 RRETURN(PCRE_ERROR_INTERNAL);
5205 }
5206
5207 /* eptr is now past the end of the maximum run */
5208
5209 if (possessive) continue;
5210 for(;;)
5211 {
5212 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
5213 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5214 if (eptr-- == pp) break; /* Stop if tried at original pos */
5215 BACKCHAR(eptr);
5216 }
5217 }
5218 else
5219 #endif /* SUPPORT_UTF8 */
5220
5221 /* Not UTF-8 mode */
5222 {
5223 switch(ctype)
5224 {
5225 case OP_ANY:
5226 for (i = min; i < max; i++)
5227 {
5228 if (eptr >= md->end_subject)
5229 {
5230 SCHECK_PARTIAL();
5231 break;
5232 }
5233 if (IS_NEWLINE(eptr)) break;
5234 eptr++;
5235 }
5236 break;
5237
5238 case OP_ALLANY:
5239 case OP_ANYBYTE:
5240 c = max - min;
5241 if (c > (unsigned int)(md->end_subject - eptr))
5242 {
5243 eptr = md->end_subject;
5244 SCHECK_PARTIAL();
5245 }
5246 else eptr += c;
5247 break;
5248
5249 case OP_ANYNL:
5250 for (i = min; i < max; i++)
5251 {
5252 if (eptr >= md->end_subject)
5253 {
5254 SCHECK_PARTIAL();
5255 break;
5256 }
5257 c = *eptr;
5258 if (c == 0x000d)
5259 {
5260 if (++eptr >= md->end_subject) break;
5261 if (*eptr == 0x000a) eptr++;
5262 }
5263 else
5264 {
5265 if (c != 0x000a &&
5266 (md->bsr_anycrlf ||
5267 (c != 0x000b && c != 0x000c && c != 0x0085)))
5268 break;
5269 eptr++;
5270 }
5271 }
5272 break;
5273
5274 case OP_NOT_HSPACE:
5275 for (i = min; i < max; i++)
5276 {
5277 if (eptr >= md->end_subject)
5278 {
5279 SCHECK_PARTIAL();
5280 break;
5281 }
5282 c = *eptr;
5283 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5284 eptr++;
5285 }
5286 break;
5287
5288 case OP_HSPACE:
5289 for (i = min; i < max; i++)
5290 {
5291 if (eptr >= md->end_subject)
5292 {
5293 SCHECK_PARTIAL();
5294 break;
5295 }
5296 c = *eptr;
5297 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5298 eptr++;
5299 }
5300 break;
5301
5302 case OP_NOT_VSPACE:
5303 for (i = min; i < max; i++)
5304 {
5305 if (eptr >= md->end_subject)
5306 {
5307 SCHECK_PARTIAL();
5308 break;
5309 }
5310 c = *eptr;
5311 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5312 break;
5313 eptr++;
5314 }
5315 break;
5316
5317 case OP_VSPACE:
5318 for (i = min; i < max; i++)
5319 {
5320 if (eptr >= md->end_subject)
5321 {
5322 SCHECK_PARTIAL();
5323 break;
5324 }
5325 c = *eptr;
5326 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5327 break;
5328 eptr++;
5329 }
5330 break;
5331
5332 case OP_NOT_DIGIT:
5333 for (i = min; i < max; i++)
5334 {
5335 if (eptr >= md->end_subject)
5336 {
5337 SCHECK_PARTIAL();
5338 break;
5339 }
5340 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5341 eptr++;
5342 }
5343 break;
5344
5345 case OP_DIGIT:
5346 for (i = min; i < max; i++)
5347 {
5348 if (eptr >= md->end_subject)
5349 {
5350 SCHECK_PARTIAL();
5351 break;
5352 }
5353 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5354 eptr++;
5355 }
5356 break;
5357
5358 case OP_NOT_WHITESPACE:
5359 for (i = min; i < max; i++)
5360 {
5361 if (eptr >= md->end_subject)
5362 {
5363 SCHECK_PARTIAL();
5364 break;
5365 }
5366 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5367 eptr++;
5368 }
5369 break;
5370
5371 case OP_WHITESPACE:
5372 for (i = min; i < max; i++)
5373 {
5374 if (eptr >= md->end_subject)
5375 {
5376 SCHECK_PARTIAL();
5377 break;
5378 }
5379 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5380 eptr++;
5381 }
5382 break;
5383
5384 case OP_NOT_WORDCHAR:
5385 for (i = min; i < max; i++)
5386 {
5387 if (eptr >= md->end_subject)
5388 {
5389 SCHECK_PARTIAL();
5390 break;
5391 }
5392 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5393 eptr++;
5394 }
5395 break;
5396
5397 case OP_WORDCHAR:
5398 for (i = min; i < max; i++)
5399 {
5400 if (eptr >= md->end_subject)
5401 {
5402 SCHECK_PARTIAL();
5403 break;
5404 }
5405 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5406 eptr++;
5407 }
5408 break;
5409
5410 default:
5411 RRETURN(PCRE_ERROR_INTERNAL);
5412 }
5413
5414 /* eptr is now past the end of the maximum run */
5415
5416 if (possessive) continue;
5417 while (eptr >= pp)
5418 {
5419 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5420 eptr--;
5421 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5422 }
5423 }
5424
5425 /* Get here if we can't make it match with any permitted repetitions */
5426
5427 MRRETURN(MATCH_NOMATCH);
5428 }
5429 /* Control never gets here */
5430
5431 /* There's been some horrible disaster. Arrival here can only mean there is
5432 something seriously wrong in the code above or the OP_xxx definitions. */
5433
5434 default:
5435 DPRINTF(("Unknown opcode %d\n", *ecode));
5436 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5437 }
5438
5439 /* Do not stick any code in here without much thought; it is assumed
5440 that "continue" in the code above comes out to here to repeat the main
5441 loop. */
5442
5443 } /* End of main loop */
5444 /* Control never reaches here */
5445
5446
5447 /* When compiling to use the heap rather than the stack for recursive calls to
5448 match(), the RRETURN() macro jumps here. The number that is saved in
5449 frame->Xwhere indicates which label we actually want to return to. */
5450
5451 #ifdef NO_RECURSE
5452 #define LBL(val) case val: goto L_RM##val;
5453 HEAP_RETURN:
5454 switch (frame->Xwhere)
5455 {
5456 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5457 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5458 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5459 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5460 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58)
5461 #ifdef SUPPORT_UTF8
5462 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5463 LBL(32) LBL(34) LBL(42) LBL(46)
5464 #ifdef SUPPORT_UCP
5465 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5466 LBL(59) LBL(60) LBL(61) LBL(62)
5467 #endif /* SUPPORT_UCP */
5468 #endif /* SUPPORT_UTF8 */
5469 default:
5470 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5471 return PCRE_ERROR_INTERNAL;
5472 }
5473 #undef LBL
5474 #endif /* NO_RECURSE */
5475 }
5476
5477
5478 /***************************************************************************
5479 ****************************************************************************
5480 RECURSION IN THE match() FUNCTION
5481
5482 Undefine all the macros that were defined above to handle this. */
5483
5484 #ifdef NO_RECURSE /* These names are macros only in the heap-based (NO_RECURSE) build */
5485 #undef eptr
5486 #undef ecode
5487 #undef mstart
5488 #undef offset_top
5489 #undef ims
5490 #undef eptrb
5491 #undef flags
5492
5493 #undef callpat
5494 #undef charptr
5495 #undef data
5496 #undef next
5497 #undef pp
5498 #undef prev
5499 #undef saved_eptr
5500
5501 #undef new_recursive
5502
5503 #undef cur_is_word
5504 #undef condition
5505 #undef prev_is_word
5506
5507 #undef original_ims
5508
5509 #undef ctype
5510 #undef length
5511 #undef max
5512 #undef min
5513 #undef number
5514 #undef offset
5515 #undef op
5516 #undef save_capture_last
5517 #undef save_offset1
5518 #undef save_offset2
5519 #undef save_offset3
5520 #undef stacksave
5521
5522 #undef newptrb
5523
5524 #endif /* NO_RECURSE */
5525
5526 /* These two are defined as macros in both cases (with or without NO_RECURSE), so they are undefined unconditionally. */
5527
5528 #undef fc
5529 #undef fi
5530
5531 /***************************************************************************
5532 ***************************************************************************/
5533
5534
5535
5536 /*************************************************
5537 * Execute a Regular Expression *
5538 *************************************************/
5539
5540 /* This function applies a compiled re to a subject string and picks out
5541 portions of the string if it matches. Two elements in the vector are set for
5542 each substring: the offsets to the start and end of the substring.
5543
5544 Arguments:
5545 argument_re points to the compiled expression
5546 extra_data points to extra data or is NULL
5547 subject points to the subject string
5548 length length of subject string (may contain binary zeros)
5549 start_offset where to start in the subject string
5550 options option bits
5551 offsets points to a vector of ints to be filled in with offsets
5552 offsetcount the number of elements in the vector
5553
5554 Returns: > 0 => success; value is the number of elements filled in
5555 = 0 => success, but offsets is not big enough
5556 -1 => failed to match
5557 < -1 => some kind of unexpected problem
5558 */
5559
5560 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5561 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5562 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5563 int offsetcount)
5564 {
5565 int rc, resetcount, ocount;
5566 int first_byte = -1;
5567 int req_byte = -1;
5568 int req_byte2 = -1;
5569 int newline;
5570 unsigned long int ims;
5571 BOOL using_temporary_offsets = FALSE;
5572 BOOL anchored;
5573 BOOL startline;
5574 BOOL firstline;
5575 BOOL first_byte_caseless = FALSE;
5576 BOOL req_byte_caseless = FALSE;
5577 BOOL utf8;
5578 match_data match_block;
5579 match_data *md = &match_block;
5580 const uschar *tables;
5581 const uschar *start_bits = NULL;
5582 USPTR start_match = (USPTR)subject + start_offset;
5583 USPTR end_subject;
5584 USPTR start_partial = NULL;
5585 USPTR req_byte_ptr = start_match - 1;
5586
5587 pcre_study_data internal_study;
5588 const pcre_study_data *study;
5589
5590 real_pcre internal_re;
5591 const real_pcre *external_re = (const real_pcre *)argument_re;
5592 const real_pcre *re = external_re;
5593
5594 /* Plausibility checks */
5595
5596 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5597 if (re == NULL || subject == NULL ||
5598 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5599 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5600
5601 /* This information is for finding all the numbers associated with a given
5602 name, for condition testing. */
5603
5604 md->name_table = (uschar *)re + re->name_table_offset;
5605 md->name_count = re->name_count;
5606 md->name_entry_size = re->name_entry_size;
5607
5608 /* Fish out the optional data from the extra_data structure, first setting
5609 the default values. */
5610
5611 study = NULL;
5612 md->match_limit = MATCH_LIMIT;
5613 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5614 md->callout_data = NULL;
5615
5616 /* The table pointer is always in native byte order. */
5617
5618 tables = external_re->tables;
5619
5620 if (extra_data != NULL)
5621 {
5622 register unsigned int flags = extra_data->flags;
5623 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5624 study = (const pcre_study_data *)extra_data->study_data;
5625 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5626 md->match_limit = extra_data->match_limit;
5627 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5628 md->match_limit_recursion = extra_data->match_limit_recursion;
5629 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5630 md->callout_data = extra_data->callout_data;
5631 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5632 }
5633
5634 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5635 is a feature that makes it possible to save compiled regexes and re-use them
5636 in other programs later. */
5637
5638 if (tables == NULL) tables = _pcre_default_tables;
5639
5640 /* Check that the first field in the block is the magic number. If it is not,
5641 test for a regex that was compiled on a host of opposite endianness. If this is
5642 the case, flipped values are put in internal_re and internal_study if there was
5643 study data too. */
5644
5645 if (re->magic_number != MAGIC_NUMBER)
5646 {
5647 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5648 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5649 if (study != NULL) study = &internal_study;
5650 }
5651
5652 /* Set up other data */
5653
5654 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5655 startline = (re->flags & PCRE_STARTLINE) != 0;
5656 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5657
5658 /* The code starts after the real_pcre block and the capture name table. */
5659
5660 md->start_code = (const uschar *)external_re + re->name_table_offset +
5661 re->name_count * re->name_entry_size;
5662
5663 md->start_subject = (USPTR)subject;
5664 md->start_offset = start_offset;
5665 md->end_subject = md->start_subject + length;
5666 end_subject = md->end_subject;
5667
5668 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5669 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5670 md->use_ucp = (re->options & PCRE_UCP) != 0;
5671 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5672
5673 md->notbol = (options & PCRE_NOTBOL) != 0;
5674 md->noteol = (options & PCRE_NOTEOL) != 0;
5675 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5676 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5677 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5678 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5679 md->hitend = FALSE;
5680 md->mark = NULL; /* In case never set */
5681
5682 md->recursive = NULL; /* No recursion at top level */
5683
5684 md->lcc = tables + lcc_offset;
5685 md->ctypes = tables + ctypes_offset;
5686
5687 /* Handle different \R options. */
5688
5689 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5690 {
5691 case 0:
5692 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5693 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5694 else
5695 #ifdef BSR_ANYCRLF
5696 md->bsr_anycrlf = TRUE;
5697 #else
5698 md->bsr_anycrlf = FALSE;
5699 #endif
5700 break;
5701
5702 case PCRE_BSR_ANYCRLF:
5703 md->bsr_anycrlf = TRUE;
5704 break;
5705
5706 case PCRE_BSR_UNICODE:
5707 md->bsr_anycrlf = FALSE;
5708 break;
5709
5710 default: return PCRE_ERROR_BADNEWLINE;
5711 }
5712
5713 /* Handle different types of newline. The three bits give eight cases. If
5714 nothing is set at run time, whatever was used at compile time applies. */
5715
5716 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5717 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5718 {
5719 case 0: newline = NEWLINE; break; /* Compile-time default */
5720 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5721 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5722 case PCRE_NEWLINE_CR+
5723 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5724 case PCRE_NEWLINE_ANY: newline = -1; break;
5725 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5726 default: return PCRE_ERROR_BADNEWLINE;
5727 }
5728
5729 if (newline == -2)
5730 {
5731 md->nltype = NLTYPE_ANYCRLF;
5732 }
5733 else if (newline < 0)
5734 {
5735 md->nltype = NLTYPE_ANY;
5736 }
5737 else
5738 {
5739 md->nltype = NLTYPE_FIXED;
5740 if (newline > 255)
5741 {
5742 md->nllen = 2;
5743 md->nl[0] = (newline >> 8) & 255;
5744 md->nl[1] = newline & 255;
5745 }
5746 else
5747 {
5748 md->nllen = 1;
5749 md->nl[0] = newline;
5750 }
5751 }
5752
5753 /* Partial matching was originally supported only for a restricted set of
5754 regexes; from release 8.00 there are no restrictions, but the bits are still
5755 defined (though never set). So there's no harm in leaving this code. */
5756
5757 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5758 return PCRE_ERROR_BADPARTIAL;
5759
5760 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5761 back the character offset. */
5762
5763 #ifdef SUPPORT_UTF8
5764 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5765 {
5766 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5767 return PCRE_ERROR_BADUTF8;
5768 if (start_offset > 0 && start_offset < length)
5769 {
5770 int tb = ((USPTR)subject)[start_offset];
5771 if (tb > 127)
5772 {
5773 tb &= 0xc0;
5774 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5775 }
5776 }
5777 }
5778 #endif
5779
5780 /* The ims options can vary during the matching as a result of the presence
5781 of (?ims) items in the pattern. They are kept in a local variable so that
5782 restoring at the exit of a group is easy. */
5783
5784 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5785
5786 /* If the expression has got more back references than the offsets supplied can
5787 hold, we get a temporary chunk of working store to use during the matching.
5788 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5789 of 3. */
5790
5791 ocount = offsetcount - (offsetcount % 3);
5792
5793 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5794 {
5795 ocount = re->top_backref * 3 + 3;
5796 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5797 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5798 using_temporary_offsets = TRUE;
5799 DPRINTF(("Got memory to hold back references\n"));
5800 }
5801 else md->offset_vector = offsets;
5802
5803 md->offset_end = ocount;
5804 md->offset_max = (2*ocount)/3;
5805 md->offset_overflow = FALSE;
5806 md->capture_last = -1;
5807
5808 /* Compute the minimum number of offsets that we need to reset each time. Doing
5809 this makes a huge difference to execution time when there aren't many brackets
5810 in the pattern. */
5811
5812 resetcount = 2 + re->top_bracket * 2;
5813 if (resetcount > offsetcount) resetcount = ocount;
5814
5815 /* Reset the working variable associated with each extraction. These should
5816 never be used unless previously set, but they get saved and restored, and so we
5817 initialize them to avoid reading uninitialized locations. */
5818
5819 if (md->offset_vector != NULL)
5820 {
5821 register int *iptr = md->offset_vector + ocount;
5822 register int *iend = iptr - resetcount/2 + 1;
5823 while (--iptr >= iend) *iptr = -1;
5824 }
5825
5826 /* Set up the first character to match, if available. The first_byte value is
5827 never set for an anchored regular expression, but the anchoring may be forced
5828 at run time, so we have to test for anchoring. The first char may be unset for
5829 an unanchored pattern, of course. If there's no first char and the pattern was
5830 studied, there may be a bitmap of possible first characters. */
5831
5832 if (!anchored)
5833 {
5834 if ((re->flags & PCRE_FIRSTSET) != 0)
5835 {
5836 first_byte = re->first_byte & 255;
5837 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5838 first_byte = md->lcc[first_byte];
5839 }
5840 else
5841 if (!startline && study != NULL &&
5842 (study->flags & PCRE_STUDY_MAPPED) != 0)
5843 start_bits = study->start_bits;
5844 }
5845
5846 /* For anchored or unanchored matches, there may be a "last known required
5847 character" set. */
5848
5849 if ((re->flags & PCRE_REQCHSET) != 0)
5850 {
5851 req_byte = re->req_byte & 255;
5852 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5853 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5854 }
5855
5856
5857 /* ==========================================================================*/
5858
5859 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
5860 the loop runs just once. */
5861
5862 for(;;)
5863 {
5864 USPTR save_end_subject = end_subject;
5865 USPTR new_start_match;
5866
5867 /* Reset the maximum number of extractions we might see. */
5868
5869 if (md->offset_vector != NULL)
5870 {
5871 register int *iptr = md->offset_vector;
5872 register int *iend = iptr + resetcount;
5873 while (iptr < iend) *iptr++ = -1;
5874 }
5875
5876 /* If firstline is TRUE, the start of the match is constrained to the first
5877 line of a multiline string. That is, the match must be before or at the first
5878 newline. Implement this by temporarily adjusting end_subject so that we stop
5879 scanning at a newline. If the match fails at the newline, later code breaks
5880 this loop. */
5881
5882 if (firstline)
5883 {
5884 USPTR t = start_match;
5885 #ifdef SUPPORT_UTF8
5886 if (utf8)
5887 {
5888 while (t < md->end_subject && !IS_NEWLINE(t))
5889 {
5890 t++;
5891 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5892 }
5893 }
5894 else
5895 #endif
5896 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5897 end_subject = t;
5898 }
5899
5900 /* There are some optimizations that avoid running the match if a known
5901 starting point is not found, or if a known later character is not present.
5902 However, there is an option that disables these, for testing and for ensuring
5903 that all callouts do actually occur. */
5904
5905 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5906 {
5907 /* Advance to a unique first byte if there is one. */
5908
5909 if (first_byte >= 0)
5910 {
5911 if (first_byte_caseless)
5912 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5913 start_match++;
5914 else
5915 while (start_match < end_subject && *start_match != first_byte)
5916 start_match++;
5917 }
5918
5919 /* Or to just after a linebreak for a multiline match */
5920
5921 else if (startline)
5922 {
5923 if (start_match > md->start_subject + start_offset)
5924 {
5925 #ifdef SUPPORT_UTF8
5926 if (utf8)
5927 {
5928 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5929 {
5930 start_match++;
5931 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5932 start_match++;
5933 }
5934 }
5935 else
5936 #endif
5937 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5938 start_match++;
5939
5940 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5941 and we are now at a LF, advance the match position by one more character.
5942 */
5943
5944 if (start_match[-1] == CHAR_CR &&
5945 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5946 start_match < end_subject &&
5947 *start_match == CHAR_NL)
5948 start_match++;
5949 }
5950 }
5951
5952 /* Or to a non-unique first byte after study */
5953
5954 else if (start_bits != NULL)
5955 {
5956 while (start_match < end_subject)
5957 {
5958 register unsigned int c = *start_match;
5959 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5960 else break;
5961 }
5962 }
5963 } /* Starting optimizations */
5964
5965 /* Restore fudged end_subject */
5966
5967 end_subject = save_end_subject;
5968
5969 /* The following two optimizations are disabled for partial matching or if
5970 disabling is explicitly requested. */
5971
5972 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
5973 {
5974 /* If the pattern was studied, a minimum subject length may be set. This is
5975 a lower bound; there may be no string of exactly that length that matches the
5976 pattern. Although the value is, strictly, in characters, we treat it as
5977 bytes to avoid spending too much time in this optimization. */
5978
5979 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
5980 (pcre_uint32)(end_subject - start_match) < study->minlength)
5981 {
5982 rc = MATCH_NOMATCH;
5983 break;
5984 }
5985
5986 /* If req_byte is set, we know that that character must appear in the
5987 subject for the match to succeed. If the first character is set, req_byte
5988 must be later in the subject; otherwise the test starts at the match point.
5989 This optimization can save a huge amount of backtracking in patterns with
5990 nested unlimited repeats that aren't going to match. Writing separate code
5991 for cased/caseless versions makes it go faster, as does using an
5992 autoincrement and backing off on a match.
5993
5994 HOWEVER: when the subject string is very, very long, searching to its end
5995 can take a long time, and give bad performance on quite ordinary patterns.
5996 This showed up when somebody was matching something like /^\d+C/ on a
5997 32-megabyte string... so we don't do this when the string is sufficiently
5998 long. */
5999
6000 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6001 {
6002 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6003
6004 /* We don't need to repeat the search if we haven't yet reached the
6005 place we found it at last time. */
6006
6007 if (p > req_byte_ptr)
6008 {
6009 if (req_byte_caseless)
6010 {
6011 while (p < end_subject)
6012 {
6013 register int pp = *p++;
6014 if (pp == req_byte || pp == req_byte2) { p--; break; }
6015 }
6016 }
6017 else
6018 {
6019 while (p < end_subject)
6020 {
6021 if (*p++ == req_byte) { p--; break; }
6022 }
6023 }
6024
6025 /* If we can't find the required character, break the matching loop,
6026 forcing a match failure. */
6027
6028 if (p >= end_subject)
6029 {
6030 rc = MATCH_NOMATCH;
6031 break;
6032 }
6033
6034 /* If we have found the required character, save the point where we
6035 found it, so that we don't search again next time round the loop if
6036 the start hasn't passed this character yet. */
6037
6038 req_byte_ptr = p;
6039 }
6040 }
6041 }
6042
6043 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6044 printf(">>>> Match against: ");
6045 pchars(start_match, end_subject - start_match, TRUE, md);
6046 printf("\n");
6047 #endif
6048
6049 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6050 first starting point for which a partial match was found. */
6051
6052 md->start_match_ptr = start_match;
6053 md->start_used_ptr = start_match;
6054 md->match_call_count = 0;
6055 rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL,
6056 0, 0);
6057 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6058
6059 switch(rc)
6060 {
6061 /* NOMATCH and PRUNE advance by one character. If MATCH_SKIP_ARG reaches
6062 this level it means that a MARK that matched the SKIP's arg was not found.
6063 We treat this as NOMATCH. THEN at this level acts exactly like PRUNE. */
6064
6065 case MATCH_NOMATCH:
6066 case MATCH_PRUNE:
6067 case MATCH_SKIP_ARG:
6068 case MATCH_THEN:
6069 new_start_match = start_match + 1;
6070 #ifdef SUPPORT_UTF8
6071 if (utf8)
6072 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6073 new_start_match++;
6074 #endif
6075 break;
6076
6077 /* SKIP passes back the next starting point explicitly. */
6078
6079 case MATCH_SKIP:
6080 new_start_match = md->start_match_ptr;
6081 break;
6082
6083 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6084
6085 case MATCH_COMMIT:
6086 rc = MATCH_NOMATCH;
6087 goto ENDLOOP;
6088
6089 /* Any other return is either a match, or some kind of error. */
6090
6091 default:
6092 goto ENDLOOP;
6093 }
6094
6095 /* Control reaches here for the various types of "no match at this point"
6096 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6097
6098 rc = MATCH_NOMATCH;
6099
6100 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6101 newline in the subject (though it may continue over the newline). Therefore,
6102 if we have just failed to match, starting at a newline, do not continue. */
6103
6104 if (firstline && IS_NEWLINE(start_match)) break;
6105
6106 /* Advance to new matching position */
6107
6108 start_match = new_start_match;
6109
6110 /* Break the loop if the pattern is anchored or if we have passed the end of
6111 the subject. */
6112
6113 if (anchored || start_match > end_subject) break;
6114
6115 /* If we have just passed a CR and we are now at a LF, and the pattern does
6116 not contain any explicit matches for \r or \n, and the newline option is CRLF
6117 or ANY or ANYCRLF, advance the match position by one more character. */
6118
6119 if (start_match[-1] == CHAR_CR &&
6120 start_match < end_subject &&
6121 *start_match == CHAR_NL &&
6122 (re->flags & PCRE_HASCRORLF) == 0 &&
6123 (md->nltype == NLTYPE_ANY ||
6124 md->nltype == NLTYPE_ANYCRLF ||
6125 md->nllen == 2))
6126 start_match++;
6127
6128 md->mark = NULL; /* Reset for start of next match attempt */
6129 } /* End of for(;;) "bumpalong" loop */
6130
6131 /* ==========================================================================*/
6132
6133 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6134 conditions is true:
6135
6136 (1) The pattern is anchored or the match was failed by (*COMMIT);
6137
6138 (2) We are past the end of the subject;
6139
6140 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6141 this option requests that a match occur at or before the first newline in
6142 the subject.
6143
6144 When we have a match and the offset vector is big enough to deal with any
6145 backreferences, captured substring offsets will already be set up. In the case
6146 where we had to get some local store to hold offsets for backreference
6147 processing, copy those that we can. In this case there need not be overflow if
6148 certain parts of the pattern were not used, even though there are more
6149 capturing parentheses than vector slots. */
6150
6151 ENDLOOP:
6152
6153 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6154 {
6155 if (using_temporary_offsets)
6156 {
6157 if (offsetcount >= 4)
6158 {
6159 memcpy(offsets + 2, md->offset_vector + 2,
6160 (offsetcount - 2) * sizeof(int));
6161 DPRINTF(("Copied offsets from temporary memory\n"));
6162 }
6163 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6164 DPRINTF(("Freeing temporary memory\n"));
6165 (pcre_free)(md->offset_vector);
6166 }
6167
6168 /* Set the return code to the number of captured strings, or 0 if there are
6169 too many to fit into the vector. */
6170
6171 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6172
6173 /* If there is space, set up the whole thing as substring 0. The value of
6174 md->start_match_ptr might be modified if \K was encountered on the successful
6175 matching path. */
6176
6177 if (offsetcount < 2) rc = 0; else
6178 {
6179 offsets[0] = md->start_match_ptr - md->start_subject;
6180 offsets[1] = md->end_match_ptr - md->start_subject;
6181 }
6182
6183 DPRINTF((">>>> returning %d\n", rc));
6184 goto RETURN_MARK;
6185 }
6186
6187 /* Control gets here if there has been an error, or if the overall match
6188 attempt has failed at all permitted starting positions. */
6189
6190 if (using_temporary_offsets)
6191 {
6192 DPRINTF(("Freeing temporary memory\n"));
6193 (pcre_free)(md->offset_vector);
6194 }
6195
6196 /* For anything other than nomatch or partial match, just return the code. */
6197
6198 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6199 {
6200 DPRINTF((">>>> error: returning %d\n", rc));
6201 return rc;
6202 }
6203
6204 /* Handle partial matches - disable any mark data */
6205
6206 if (start_partial != NULL)
6207 {
6208 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6209 md->mark = NULL;
6210 if (offsetcount > 1)
6211 {
6212 offsets[0] = start_partial - (USPTR)subject;
6213 offsets[1] = end_subject - (USPTR)subject;
6214 }
6215 rc = PCRE_ERROR_PARTIAL;
6216 }
6217
6218 /* This is the classic nomatch case */
6219
6220 else
6221 {
6222 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6223 rc = PCRE_ERROR_NOMATCH;
6224 }
6225
6226 /* Return the MARK data if it has been requested. */
6227
6228 RETURN_MARK:
6229
6230 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6231 *(extra_data->mark) = (unsigned char *)(md->mark);
6232 return rc;
6233 }
6234
6235 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12