/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 530 - (show annotations) (download)
Tue Jun 1 13:42:06 2010 UTC (3 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 185071 byte(s)
Added a lot of (int) casts to avoid compiler warnings in systems where      
size_t is 64-bit.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73
74 #define MATCH_ACCEPT (-999)
75 #define MATCH_COMMIT (-998)
76 #define MATCH_PRUNE (-997)
77 #define MATCH_SKIP (-996)
78 #define MATCH_SKIP_ARG (-995)
79 #define MATCH_THEN (-994)
80
81 /* This is a convenience macro for code that occurs many times: it copies the current markptr into md->mark and then returns ra by means of RRETURN. */
82
83 #define MRRETURN(ra) \
84 { \
85 md->mark = markptr; \
86 RRETURN(ra); \
87 }
88
89 /* Maximum number of ints of offset to save on the stack for recursive calls.
90 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
91 because the offset vector is always a multiple of 3 long. */
92
93 #define REC_STACK_SAVE_MAX 30
94
95 /* Min and max values for the common repeats; for the maxima, 0 => infinity. NOTE(review): the six entries presumably correspond to *, *?, +, +?, ? and ?? in that order - confirm against the code that indexes these tables. */
96
97 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
98 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
99
100
101
102 #ifdef PCRE_DEBUG
103 /*************************************************
104 * Debugging function to print chars *
105 *************************************************/
106
107 /* Print a sequence of chars in printable format, stopping at the end of the
108 subject if requested. Characters for which isprint() is true are output as themselves; all others are output as \x followed by their value in hexadecimal.
109
110 Arguments:
111 p points to characters
112 length number to print
113 is_subject TRUE if printing from within md->start_subject
114 md pointer to matching data block, if is_subject is TRUE (not dereferenced otherwise)
115
116 Returns: nothing
117 */
118
119 static void
120 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
121 {
122 unsigned int c; /* Character currently being printed */
123 if (is_subject && length > md->end_subject - p) length = md->end_subject - p; /* Clamp to what is left before md->end_subject */
124 while (length-- > 0)
125 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
126 }
127 #endif
128
129
130
131 /*************************************************
132 * Match a back-reference *
133 *************************************************/
134
135 /* Compare "length" characters at eptr with the text that starts at md->start_subject + md->offset_vector[offset]. If a back reference hasn't been set, the length that is passed is greater
136 than the number of characters left in the string, so the match fails.
137
138 Arguments:
139 offset index into the offset vector
140 eptr points into the subject
141 length length to be matched
142 md points to match data block
143 ims the ims flags; only PCRE_CASELESS is tested here
144
145 Returns: TRUE if matched, FALSE otherwise
146 */
147
148 static BOOL
149 match_ref(int offset, register USPTR eptr, int length, match_data *md,
150 unsigned long int ims)
151 {
152 USPTR p = md->start_subject + md->offset_vector[offset]; /* Start of the referenced text */
153
154 #ifdef PCRE_DEBUG
155 if (eptr >= md->end_subject)
156 printf("matching subject <null>");
157 else
158 {
159 printf("matching subject ");
160 pchars(eptr, length, TRUE, md);
161 }
162 printf(" against backref ");
163 pchars(p, length, FALSE, md); /* NOTE(review): this runs before the length check below, and with is_subject FALSE pchars does not clamp length */
164 printf("\n");
165 #endif
166
167 /* Always fail if fewer than "length" characters are left in the subject */
168
169 if (length > md->end_subject - eptr) return FALSE;
170
171 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
172 properly if Unicode properties are supported. Otherwise, we can check only
173 ASCII characters. */
174
175 if ((ims & PCRE_CASELESS) != 0)
176 {
177 #ifdef SUPPORT_UTF8
178 #ifdef SUPPORT_UCP
179 if (md->utf8)
180 {
181 USPTR endptr = eptr + length; /* End of the subject span to compare */
182 while (eptr < endptr)
183 {
184 int c, d;
185 GETCHARINC(c, eptr);
186 GETCHARINC(d, p);
187 if (c != d && c != UCD_OTHERCASE(d)) return FALSE; /* Fail unless c equals d or UCD_OTHERCASE(d) */
188 }
189 }
190 else
191 #endif
192 #endif
193
194 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
195 is no UCP support. Both sides are mapped through the md->lcc table before being compared. */
196
197 while (length-- > 0)
198 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
199 }
200
201 /* In the caseful case, we can just compare the bytes, whether or not we
202 are in UTF-8 mode. */
203
204 else
205 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
206
207 return TRUE;
208 }
209
210
211
212 /***************************************************************************
213 ****************************************************************************
214 RECURSION IN THE match() FUNCTION
215
216 The match() function is highly recursive, though not every recursive call
217 increases the recursive depth. Nevertheless, some regular expressions can cause
218 it to recurse to a great depth. I was writing for Unix, so I just let it call
219 itself recursively. This uses the stack for saving everything that has to be
220 saved for a recursive call. On Unix, the stack can be large, and this works
221 fine.
222
223 It turns out that on some non-Unix-like systems there are problems with
224 programs that use a lot of stack. (This despite the fact that every last chip
225 has oodles of memory these days, and techniques for extending the stack have
226 been known for decades.) So....
227
228 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
229 calls by keeping local variables that need to be preserved in blocks of memory
230 obtained from malloc() instead of on the stack. Macros are used to
231 achieve this so that the actual code doesn't look very different to what it
232 always used to.
233
234 The original heap-recursive code used longjmp(). However, it seems that this
235 can be very slow on some operating systems. Following a suggestion from Stan
236 Switzer, the use of longjmp() has been abolished, at the cost of having to
237 provide a unique number for each call to RMATCH. There is no way of generating
238 a sequence of numbers at compile time in C. I have given them names, to make
239 them stand out more clearly.
240
241 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
242 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
243 tests. Furthermore, not using longjmp() means that local dynamic variables
244 don't have indeterminate values; this has meant that the frame size can be
245 reduced because the result can be "passed back" by straight setting of the
246 variable instead of being passed in the frame.
247 ****************************************************************************
248 ***************************************************************************/
249
250 /* Numbers for RMATCH calls, passed as the final ("rw") argument of each RMATCH. The numbering starts at 1. When this list is changed, the code at HEAP_RETURN
251 below must be updated in sync. */
252
253 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
254 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
255 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
256 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
257 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
258 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
259 RM61, RM62 };
260
261 /* These versions of the macros use the stack, as normal. There are debugging
262 versions and production versions. Note that the "rw" argument of RMATCH isn't
263 actually used in this definition. RMATCH stores the result of the recursive call in rrc, and takes mstart, markptr and rdepth from the scope in which it is expanded. In the debugging version of RRETURN the argument ra is evaluated twice. */
264
265 #ifndef NO_RECURSE
266 #define REGISTER register
267
268 #ifdef PCRE_DEBUG
269 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
270 { \
271 printf("match() called in line %d\n", __LINE__); \
272 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
273 printf("to line %d\n", __LINE__); \
274 }
275 #define RRETURN(ra) \
276 { \
277 printf("match() returned %d from line %d ", ra, __LINE__); \
278 return ra; \
279 }
280 #else
281 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
282 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
283 #define RRETURN(ra) return ra
284 #endif
285
286 #else
287
288
289 /* These versions of the macros manage a private stack on the heap. Note that
290 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
291 argument of match(), which never changes. RMATCH saves its "rw" number in the current frame's Xwhere, fills in a newly allocated frame, makes it the current frame and jumps to HEAP_RECURSE; the L_##rw label it defines marks the point just after the jump. RRETURN frees the current frame and, unless that was the top-level frame, puts the result in rrc and jumps to HEAP_RETURN. NOTE(review): the pointer returned by pcre_stack_malloc is used without being checked for NULL. */
292
293 #define REGISTER
294
295 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
296 {\
297 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
298 frame->Xwhere = rw; \
299 newframe->Xeptr = ra;\
300 newframe->Xecode = rb;\
301 newframe->Xmstart = mstart;\
302 newframe->Xmarkptr = markptr;\
303 newframe->Xoffset_top = rc;\
304 newframe->Xims = re;\
305 newframe->Xeptrb = rf;\
306 newframe->Xflags = rg;\
307 newframe->Xrdepth = frame->Xrdepth + 1;\
308 newframe->Xprevframe = frame;\
309 frame = newframe;\
310 DPRINTF(("restarting from line %d\n", __LINE__));\
311 goto HEAP_RECURSE;\
312 L_##rw:\
313 DPRINTF(("jumped back to line %d\n", __LINE__));\
314 }
315
316 #define RRETURN(ra)\
317 {\
318 heapframe *oldframe = frame;\
319 frame = oldframe->Xprevframe;\
320 (pcre_stack_free)(oldframe);\
321 if (frame != NULL)\
322 {\
323 rrc = ra;\
324 goto HEAP_RETURN;\
325 }\
326 return ra;\
327 }
328
329
330 /* Structure for remembering the local variables in a private frame. It is used only when NO_RECURSE is defined; one frame is allocated for each RMATCH "call". */
331
332 typedef struct heapframe {
333 struct heapframe *Xprevframe; /* Frame of the caller; NULL marks the top level */
334
335 /* Function arguments that may change */
336
337 USPTR Xeptr;
338 const uschar *Xecode;
339 USPTR Xmstart;
340 USPTR Xmarkptr;
341 int Xoffset_top;
342 long int Xims; /* NOTE(review): match() declares its ims argument as unsigned long int, and Xoriginal_ims below is unsigned long int */
343 eptrblock *Xeptrb;
344 int Xflags;
345 unsigned int Xrdepth; /* RMATCH sets this to the calling frame's Xrdepth + 1 */
346
347 /* Function local variables */
348
349 USPTR Xcallpat;
350 #ifdef SUPPORT_UTF8
351 USPTR Xcharptr;
352 #endif
353 USPTR Xdata;
354 USPTR Xnext;
355 USPTR Xpp;
356 USPTR Xprev;
357 USPTR Xsaved_eptr;
358
359 recursion_info Xnew_recursive;
360
361 BOOL Xcur_is_word;
362 BOOL Xcondition;
363 BOOL Xprev_is_word;
364
365 unsigned long int Xoriginal_ims;
366
367 #ifdef SUPPORT_UCP
368 int Xprop_type;
369 int Xprop_value;
370 int Xprop_fail_result;
371 int Xprop_category;
372 int Xprop_chartype;
373 int Xprop_script;
374 int Xoclength;
375 uschar Xocchars[8];
376 #endif
377
378 int Xcodelink;
379 int Xctype;
380 unsigned int Xfc;
381 int Xfi;
382 int Xlength;
383 int Xmax;
384 int Xmin;
385 int Xnumber;
386 int Xoffset;
387 int Xop;
388 int Xsave_capture_last;
389 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
390 int Xstacksave[REC_STACK_SAVE_MAX];
391
392 eptrblock Xnewptrb;
393
394 /* Where to jump back to: the "rw" number stored by RMATCH before it switches to a new frame */
395
396 int Xwhere;
397
398 } heapframe;
399
400 #endif
401
402
403 /***************************************************************************
404 ***************************************************************************/
405
406
407
408 /*************************************************
409 * Match from current position *
410 *************************************************/
411
412 /* This function is called recursively in many circumstances. Whenever it
413 returns a negative (error) response, the outer incarnation must also return the
414 same response. */
415
416 /* These macros pack up tests that are used for partial matching, and which
417 appear several times in the code. We set the "hit end" flag (md->hitend) if the pointer is
418 at the end of the subject and also past the current match start, mstart (i.e.
419 something has been matched). For hard partial matching (md->partial > 1), we then return PCRE_ERROR_PARTIAL
420 immediately. The second one is used when we already know we are past the end of
421 the subject, so it omits the end-of-subject test. */
422
423 #define CHECK_PARTIAL()\
424 if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
425 {\
426 md->hitend = TRUE;\
427 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
428 }
429
430 #define SCHECK_PARTIAL()\
431 if (md->partial != 0 && eptr > mstart)\
432 {\
433 md->hitend = TRUE;\
434 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
435 }
436
437
438 /* Performance note: It might be tempting to extract commonly used fields from
439 the md structure (e.g. utf8, end_subject) into individual variables to improve
440 performance. Tests using gcc on a SPARC disproved this; in the first case, it
441 made performance worse.
442
443 Arguments:
444 eptr pointer to current character in subject
445 ecode pointer to current position in compiled code
446 mstart pointer to the current match start position (can be modified
447 by encountering \K)
448 markptr pointer to the most recent MARK name, or NULL
449 offset_top current top pointer
450 md pointer to "static" info for the match
451 ims current /i, /m, and /s options
452 eptrb pointer to chain of blocks containing eptr at start of
453 brackets - for testing for empty matches
454 flags can contain
455 match_condassert - this is an assertion condition
456 match_cbegroup - this is the start of an unlimited repeat
457 group that can match an empty string
458 rdepth the recursion depth
459
460 Returns: MATCH_MATCH if matched ) these values are >= 0
461 MATCH_NOMATCH if failed to match )
462 a negative MATCH_xxx value for PRUNE, SKIP, etc
463 a negative PCRE_ERROR_xxx value if aborted by an error condition
464 (e.g. stopped by repeated call or recursion limit)
465 */
466
467 static int
468 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
469 const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
470 eptrblock *eptrb, int flags, unsigned int rdepth)
471 {
472 /* These variables do not need to be preserved over recursion in this function,
473 so they can be ordinary variables in all cases. Mark some of them with
474 "register" because they are used a lot in loops. */
475
476 register int rrc; /* Returns from recursive calls */
477 register int i; /* Used for loops not involving calls to RMATCH() */
478 register unsigned int c; /* Character values not kept over RMATCH() calls */
479 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
480
481 BOOL minimize, possessive; /* Quantifier options */
482 int condcode;
483
484 /* When recursion is not being used, all "local" variables that have to be
485 preserved over calls to RMATCH() are part of a "frame" which is obtained from
486 heap storage. Set up the top-level frame here; others are obtained from the
487 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
488
489 #ifdef NO_RECURSE
490 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
491 frame->Xprevframe = NULL; /* Marks the top level */
492
493 /* Copy in the original argument variables */
494
495 frame->Xeptr = eptr;
496 frame->Xecode = ecode;
497 frame->Xmstart = mstart;
498 frame->Xmarkptr = markptr;
499 frame->Xoffset_top = offset_top;
500 frame->Xims = ims;
501 frame->Xeptrb = eptrb;
502 frame->Xflags = flags;
503 frame->Xrdepth = rdepth;
504
505 /* This is where control jumps back to in order to effect "recursion" */
506
507 HEAP_RECURSE:
508
509 /* Macros make the argument variables come from the current frame */
510
511 #define eptr frame->Xeptr
512 #define ecode frame->Xecode
513 #define mstart frame->Xmstart
514 #define markptr frame->Xmarkptr
515 #define offset_top frame->Xoffset_top
516 #define ims frame->Xims
517 #define eptrb frame->Xeptrb
518 #define flags frame->Xflags
519 #define rdepth frame->Xrdepth
520
521 /* Ditto for the local variables */
522
523 #ifdef SUPPORT_UTF8
524 #define charptr frame->Xcharptr
525 #endif
526 #define callpat frame->Xcallpat
527 #define codelink frame->Xcodelink
528 #define data frame->Xdata
529 #define next frame->Xnext
530 #define pp frame->Xpp
531 #define prev frame->Xprev
532 #define saved_eptr frame->Xsaved_eptr
533
534 #define new_recursive frame->Xnew_recursive
535
536 #define cur_is_word frame->Xcur_is_word
537 #define condition frame->Xcondition
538 #define prev_is_word frame->Xprev_is_word
539
540 #define original_ims frame->Xoriginal_ims
541
542 #ifdef SUPPORT_UCP
543 #define prop_type frame->Xprop_type
544 #define prop_value frame->Xprop_value
545 #define prop_fail_result frame->Xprop_fail_result
546 #define prop_category frame->Xprop_category
547 #define prop_chartype frame->Xprop_chartype
548 #define prop_script frame->Xprop_script
549 #define oclength frame->Xoclength
550 #define occhars frame->Xocchars
551 #endif
552
553 #define ctype frame->Xctype
554 #define fc frame->Xfc
555 #define fi frame->Xfi
556 #define length frame->Xlength
557 #define max frame->Xmax
558 #define min frame->Xmin
559 #define number frame->Xnumber
560 #define offset frame->Xoffset
561 #define op frame->Xop
562 #define save_capture_last frame->Xsave_capture_last
563 #define save_offset1 frame->Xsave_offset1
564 #define save_offset2 frame->Xsave_offset2
565 #define save_offset3 frame->Xsave_offset3
566 #define stacksave frame->Xstacksave
567
568 #define newptrb frame->Xnewptrb
569
570 /* When recursion is being used, local variables are allocated on the stack and
571 get preserved during recursion in the normal way. In this environment, fi and
572 i, and fc and c, can be the same variables. */
573
574 #else /* NO_RECURSE not defined */
575 #define fi i
576 #define fc c
577
578
579 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
580 const uschar *charptr; /* in small blocks of the code. My normal */
581 #endif /* style of coding would have declared */
582 const uschar *callpat; /* them within each of those blocks. */
583 const uschar *data; /* However, in order to accommodate the */
584 const uschar *next; /* version of this code that uses an */
585 USPTR pp; /* external "stack" implemented on the */
586 const uschar *prev; /* heap, it is easier to declare them all */
587 USPTR saved_eptr; /* here, so the declarations can be cut */
588 /* out in a block. The only declarations */
589 recursion_info new_recursive; /* within blocks below are for variables */
590 /* that do not have to be preserved over */
591 BOOL cur_is_word; /* a recursive call to RMATCH(). */
592 BOOL condition;
593 BOOL prev_is_word;
594
595 unsigned long int original_ims;
596
597 #ifdef SUPPORT_UCP
598 int prop_type;
599 int prop_value;
600 int prop_fail_result;
601 int prop_category;
602 int prop_chartype;
603 int prop_script;
604 int oclength;
605 uschar occhars[8];
606 #endif
607
608 int codelink;
609 int ctype;
610 int length;
611 int max;
612 int min;
613 int number;
614 int offset;
615 int op;
616 int save_capture_last;
617 int save_offset1, save_offset2, save_offset3;
618 int stacksave[REC_STACK_SAVE_MAX];
619
620 eptrblock newptrb;
621 #endif /* NO_RECURSE */
622
623 /* These statements are here to stop the compiler complaining about uninitialized
624 variables. */
625
626 #ifdef SUPPORT_UCP
627 prop_value = 0;
628 prop_fail_result = 0;
629 #endif
630
631
632 /* This label is used for tail recursion, which is used in a few cases even
633 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
634 used. Thanks to Ian Taylor for noticing this possibility and sending the
635 original patch. */
636
637 TAIL_RECURSE:
638
639 /* OK, now we can get on with the real code of the function. Recursive calls
640 are specified by the macro RMATCH and RRETURN is used to return. When
641 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
642 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
643 defined). However, RMATCH isn't like a function call because it's quite a
644 complicated macro. It has to be used in one particular way. This shouldn't,
645 however, impact performance when true recursion is being used. */
646
647 #ifdef SUPPORT_UTF8
648 utf8 = md->utf8; /* Local copy of the flag */
649 #else
650 utf8 = FALSE;
651 #endif
652
653 /* First check that we haven't called match() too many times, or that we
654 haven't exceeded the recursive call limit. */
655
656 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
657 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
658
659 original_ims = ims; /* Save for resetting on ')' */
660
661 /* At the start of a group with an unlimited repeat that may match an empty
662 string, the match_cbegroup flag is set. When this is the case, add the current
663 subject pointer to the chain of such remembered pointers, to be checked when we
664 hit the closing ket, in order to break infinite loops that match no characters.
665 When match() is called in other circumstances, don't add to the chain. The
666 match_cbegroup flag must NOT be used with tail recursion, because the memory
667 block that is used is on the stack, so a new one may be required for each
668 match(). */
669
670 if ((flags & match_cbegroup) != 0)
671 {
672 newptrb.epb_saved_eptr = eptr;
673 newptrb.epb_prev = eptrb;
674 eptrb = &newptrb;
675 }
676
677 /* Now start processing the opcodes. */
678
679 for (;;)
680 {
681 minimize = possessive = FALSE;
682 op = *ecode;
683
684 switch(op)
685 {
686 case OP_MARK:
687 markptr = ecode + 2;
688 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
689 ims, eptrb, flags, RM55);
690
691 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
692 argument, and we must check whether that argument matches this MARK's
693 argument. It is passed back in md->start_match_ptr (an overloading of that
694 variable). If it does match, we reset that variable to the current subject
695 position and return MATCH_SKIP. Otherwise, pass back the return code
696 unaltered. */
697
698 if (rrc == MATCH_SKIP_ARG &&
699 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
700 {
701 md->start_match_ptr = eptr;
702 RRETURN(MATCH_SKIP);
703 }
704
705 if (md->mark == NULL) md->mark = markptr;
706 RRETURN(rrc);
707
708 case OP_FAIL:
709 MRRETURN(MATCH_NOMATCH);
710
711 case OP_COMMIT:
712 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
713 ims, eptrb, flags, RM52);
714 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
715 MRRETURN(MATCH_COMMIT);
716
717 case OP_PRUNE:
718 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
719 ims, eptrb, flags, RM51);
720 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
721 MRRETURN(MATCH_PRUNE);
722
723 case OP_PRUNE_ARG:
724 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
725 ims, eptrb, flags, RM56);
726 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
727 md->mark = ecode + 2;
728 RRETURN(MATCH_PRUNE);
729
730 case OP_SKIP:
731 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 ims, eptrb, flags, RM53);
733 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
734 md->start_match_ptr = eptr; /* Pass back current position */
735 MRRETURN(MATCH_SKIP);
736
737 case OP_SKIP_ARG:
738 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
739 ims, eptrb, flags, RM57);
740 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
741
742 /* Pass back the current skip name by overloading md->start_match_ptr and
743 returning the special MATCH_SKIP_ARG return code. This will either be
744 caught by a matching MARK, or get to the top, where it is treated the same
745 as PRUNE. */
746
747 md->start_match_ptr = ecode + 2;
748 RRETURN(MATCH_SKIP_ARG);
749
750 case OP_THEN:
751 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
752 ims, eptrb, flags, RM54);
753 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
754 MRRETURN(MATCH_THEN);
755
756 case OP_THEN_ARG:
757 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
758 ims, eptrb, flags, RM58);
759 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
760 md->mark = ecode + 2;
761 RRETURN(MATCH_THEN);
762
763 /* Handle a capturing bracket. If there is space in the offset vector, save
764 the current subject position in the working slot at the top of the vector.
765 We mustn't change the current values of the data slot, because they may be
766 set from a previous iteration of this group, and be referred to by a
767 reference inside the group.
768
769 If the bracket fails to match, we need to restore this value and also the
770 values of the final offsets, in case they were set by a previous iteration
771 of the same bracket.
772
773 If there isn't enough space in the offset vector, treat this as if it were
774 a non-capturing bracket. Don't worry about setting the flag for the error
775 case here; that is handled in the code for KET. */
776
777 case OP_CBRA:
778 case OP_SCBRA:
779 number = GET2(ecode, 1+LINK_SIZE);
780 offset = number << 1;
781
782 #ifdef PCRE_DEBUG
783 printf("start bracket %d\n", number);
784 printf("subject=");
785 pchars(eptr, 16, TRUE, md);
786 printf("\n");
787 #endif
788
789 if (offset < md->offset_max)
790 {
791 save_offset1 = md->offset_vector[offset];
792 save_offset2 = md->offset_vector[offset+1];
793 save_offset3 = md->offset_vector[md->offset_end - number];
794 save_capture_last = md->capture_last;
795
796 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
797 md->offset_vector[md->offset_end - number] =
798 (int)(eptr - md->start_subject);
799
800 flags = (op == OP_SCBRA)? match_cbegroup : 0;
801 do
802 {
803 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
804 ims, eptrb, flags, RM1);
805 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
806 md->capture_last = save_capture_last;
807 ecode += GET(ecode, 1);
808 }
809 while (*ecode == OP_ALT);
810
811 DPRINTF(("bracket %d failed\n", number));
812
813 md->offset_vector[offset] = save_offset1;
814 md->offset_vector[offset+1] = save_offset2;
815 md->offset_vector[md->offset_end - number] = save_offset3;
816
817 if (rrc != MATCH_THEN) md->mark = markptr;
818 RRETURN(MATCH_NOMATCH);
819 }
820
821 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
822 as a non-capturing bracket. */
823
824 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
825 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
826
827 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
828
829 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
830 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
831
832 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
833 final alternative within the brackets, we would return the result of a
834 recursive call to match() whatever happened. We can reduce stack usage by
835 turning this into a tail recursion, except in the case when match_cbegroup
836 is set.*/
837
838 case OP_BRA:
839 case OP_SBRA:
840 DPRINTF(("start non-capturing bracket\n"));
841 flags = (op >= OP_SBRA)? match_cbegroup : 0;
842 for (;;)
843 {
844 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
845 {
846 if (flags == 0) /* Not a possibly empty group */
847 {
848 ecode += _pcre_OP_lengths[*ecode];
849 DPRINTF(("bracket 0 tail recursion\n"));
850 goto TAIL_RECURSE;
851 }
852
853 /* Possibly empty group; can't use tail recursion. */
854
855 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
856 eptrb, flags, RM48);
857 if (rrc == MATCH_NOMATCH) md->mark = markptr;
858 RRETURN(rrc);
859 }
860
861 /* For non-final alternatives, continue the loop for a NOMATCH result;
862 otherwise return. */
863
864 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
865 eptrb, flags, RM2);
866 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
867 ecode += GET(ecode, 1);
868 }
869 /* Control never reaches here. */
870
871 /* Conditional group: compilation checked that there are no more than
872 two branches. If the condition is false, skipping the first branch takes us
873 past the end if there is only one branch, but that's OK because that is
874 exactly what going to the ket would do. As there is only one branch to be
875 obeyed, we can use tail recursion to avoid using another stack frame. */
876
877 case OP_COND:
878 case OP_SCOND:
879 codelink= GET(ecode, 1);
880
881 /* Because of the way auto-callout works during compile, a callout item is
882 inserted between OP_COND and an assertion condition. */
883
884 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
885 {
886 if (pcre_callout != NULL)
887 {
888 pcre_callout_block cb;
889 cb.version = 1; /* Version 1 of the callout block */
890 cb.callout_number = ecode[LINK_SIZE+2];
891 cb.offset_vector = md->offset_vector;
892 cb.subject = (PCRE_SPTR)md->start_subject;
893 cb.subject_length = (int)(md->end_subject - md->start_subject);
894 cb.start_match = (int)(mstart - md->start_subject);
895 cb.current_position = (int)(eptr - md->start_subject);
896 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
897 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
898 cb.capture_top = offset_top/2;
899 cb.capture_last = md->capture_last;
900 cb.callout_data = md->callout_data;
901 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
902 if (rrc < 0) RRETURN(rrc);
903 }
904 ecode += _pcre_OP_lengths[OP_CALLOUT];
905 }
906
907 condcode = ecode[LINK_SIZE+1];
908
909 /* Now see what the actual condition is */
910
911 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
912 {
913 if (md->recursive == NULL) /* Not recursing => FALSE */
914 {
915 condition = FALSE;
916 ecode += GET(ecode, 1);
917 }
918 else
919 {
920 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
921 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
922
923 /* If the test is for recursion into a specific subpattern, and it is
924 false, but the test was set up by name, scan the table to see if the
925 name refers to any other numbers, and test them. The condition is true
926 if any one is set. */
927
928 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
929 {
930 uschar *slotA = md->name_table;
931 for (i = 0; i < md->name_count; i++)
932 {
933 if (GET2(slotA, 0) == recno) break;
934 slotA += md->name_entry_size;
935 }
936
937 /* Found a name for the number - there can be only one; duplicate
938 names for different numbers are allowed, but not vice versa. First
939 scan down for duplicates. */
940
941 if (i < md->name_count)
942 {
943 uschar *slotB = slotA;
944 while (slotB > md->name_table)
945 {
946 slotB -= md->name_entry_size;
947 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
948 {
949 condition = GET2(slotB, 0) == md->recursive->group_num;
950 if (condition) break;
951 }
952 else break;
953 }
954
955 /* Scan up for duplicates */
956
957 if (!condition)
958 {
959 slotB = slotA;
960 for (i++; i < md->name_count; i++)
961 {
962 slotB += md->name_entry_size;
963 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
964 {
965 condition = GET2(slotB, 0) == md->recursive->group_num;
966 if (condition) break;
967 }
968 else break;
969 }
970 }
971 }
972 }
973
974 /* Choose branch according to the condition */
975
976 ecode += condition? 3 : GET(ecode, 1);
977 }
978 }
979
980 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
981 {
982 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
983 condition = offset < offset_top && md->offset_vector[offset] >= 0;
984
985 /* If the numbered capture is unset, but the reference was by name,
986 scan the table to see if the name refers to any other numbers, and test
987 them. The condition is true if any one is set. This is tediously similar
988 to the code above, but not close enough to try to amalgamate. */
989
990 if (!condition && condcode == OP_NCREF)
991 {
992 int refno = offset >> 1;
993 uschar *slotA = md->name_table;
994
995 for (i = 0; i < md->name_count; i++)
996 {
997 if (GET2(slotA, 0) == refno) break;
998 slotA += md->name_entry_size;
999 }
1000
1001 /* Found a name for the number - there can be only one; duplicate names
1002 for different numbers are allowed, but not vice versa. First scan down
1003 for duplicates. */
1004
1005 if (i < md->name_count)
1006 {
1007 uschar *slotB = slotA;
1008 while (slotB > md->name_table)
1009 {
1010 slotB -= md->name_entry_size;
1011 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1012 {
1013 offset = GET2(slotB, 0) << 1;
1014 condition = offset < offset_top &&
1015 md->offset_vector[offset] >= 0;
1016 if (condition) break;
1017 }
1018 else break;
1019 }
1020
1021 /* Scan up for duplicates */
1022
1023 if (!condition)
1024 {
1025 slotB = slotA;
1026 for (i++; i < md->name_count; i++)
1027 {
1028 slotB += md->name_entry_size;
1029 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1030 {
1031 offset = GET2(slotB, 0) << 1;
1032 condition = offset < offset_top &&
1033 md->offset_vector[offset] >= 0;
1034 if (condition) break;
1035 }
1036 else break;
1037 }
1038 }
1039 }
1040 }
1041
1042 /* Choose branch according to the condition */
1043
1044 ecode += condition? 3 : GET(ecode, 1);
1045 }
1046
1047 else if (condcode == OP_DEF) /* DEFINE - always false */
1048 {
1049 condition = FALSE;
1050 ecode += GET(ecode, 1);
1051 }
1052
1053 /* The condition is an assertion. Call match() to evaluate it - setting
1054 the final argument match_condassert causes it to stop at the end of an
1055 assertion. */
1056
1057 else
1058 {
1059 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
1060 match_condassert, RM3);
1061 if (rrc == MATCH_MATCH)
1062 {
1063 condition = TRUE;
1064 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1065 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1066 }
1067 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1068 {
1069 RRETURN(rrc); /* Need braces because of following else */
1070 }
1071 else
1072 {
1073 condition = FALSE;
1074 ecode += codelink;
1075 }
1076 }
1077
1078 /* We are now at the branch that is to be obeyed. As there is only one,
1079 we can use tail recursion to avoid using another stack frame, except when
1080 match_cbegroup is required for an unlimited repeat of a possibly empty
1081 group. If the second alternative doesn't exist, we can just plough on. */
1082
1083 if (condition || *ecode == OP_ALT)
1084 {
1085 ecode += 1 + LINK_SIZE;
1086 if (op == OP_SCOND) /* Possibly empty group */
1087 {
1088 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1089 RRETURN(rrc);
1090 }
1091 else /* Group must match something */
1092 {
1093 flags = 0;
1094 goto TAIL_RECURSE;
1095 }
1096 }
1097 else /* Condition false & no alternative */
1098 {
1099 ecode += 1 + LINK_SIZE;
1100 }
1101 break;
1102
1103
1104 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1105 to close any currently open capturing brackets. */
1106
1107 case OP_CLOSE:
1108 number = GET2(ecode, 1);
1109 offset = number << 1;
1110
1111 #ifdef PCRE_DEBUG
1112 printf("end bracket %d at *ACCEPT", number);
1113 printf("\n");
1114 #endif
1115
1116 md->capture_last = number;
1117 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1118 {
1119 md->offset_vector[offset] =
1120 md->offset_vector[md->offset_end - number];
1121 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1122 if (offset_top <= offset) offset_top = offset + 2;
1123 }
1124 ecode += 3;
1125 break;
1126
1127
1128 /* End of the pattern, either real or forced. If we are in a top-level
1129 recursion, we should restore the offsets appropriately and continue from
1130 after the call. */
1131
1132 case OP_ACCEPT:
1133 case OP_END:
1134 if (md->recursive != NULL && md->recursive->group_num == 0)
1135 {
1136 recursion_info *rec = md->recursive;
1137 DPRINTF(("End of pattern in a (?0) recursion\n"));
1138 md->recursive = rec->prevrec;
1139 memmove(md->offset_vector, rec->offset_save,
1140 rec->saved_max * sizeof(int));
1141 offset_top = rec->save_offset_top;
1142 ims = original_ims;
1143 ecode = rec->after_call;
1144 break;
1145 }
1146
1147 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1148 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1149 the subject. In both cases, backtracking will then try other alternatives,
1150 if any. */
1151
1152 if (eptr == mstart &&
1153 (md->notempty ||
1154 (md->notempty_atstart &&
1155 mstart == md->start_subject + md->start_offset)))
1156 MRRETURN(MATCH_NOMATCH);
1157
1158 /* Otherwise, we have a match. */
1159
1160 md->end_match_ptr = eptr; /* Record where we ended */
1161 md->end_offset_top = offset_top; /* and how many extracts were taken */
1162 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1163
1164 /* For some reason, the macros don't work properly if an expression is
1165 given as the argument to MRRETURN when the heap is in use. */
1166
1167 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1168 MRRETURN(rrc);
1169
1170 /* Change option settings */
1171
1172 case OP_OPT:
1173 ims = ecode[1];
1174 ecode += 2;
1175 DPRINTF(("ims set to %02lx\n", ims));
1176 break;
1177
1178 /* Assertion brackets. Check the alternative branches in turn - the
1179 matching won't pass the KET for an assertion. If any one branch matches,
1180 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1181 start of each branch to move the current point backwards, so the code at
1182 this level is identical to the lookahead case. */
1183
1184 case OP_ASSERT:
1185 case OP_ASSERTBACK:
1186 do
1187 {
1188 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1189 RM4);
1190 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1191 {
1192 mstart = md->start_match_ptr; /* In case \K reset it */
1193 break;
1194 }
1195 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1196 ecode += GET(ecode, 1);
1197 }
1198 while (*ecode == OP_ALT);
1199 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1200
1201 /* If checking an assertion for a condition, return MATCH_MATCH. */
1202
1203 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1204
1205 /* Continue from after the assertion, updating the offsets high water
1206 mark, since extracts may have been taken during the assertion. */
1207
1208 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1209 ecode += 1 + LINK_SIZE;
1210 offset_top = md->end_offset_top;
1211 continue;
1212
1213 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1214 PRUNE, or COMMIT means we must assume failure without checking subsequent
1215 branches. */
1216
1217 case OP_ASSERT_NOT:
1218 case OP_ASSERTBACK_NOT:
1219 do
1220 {
1221 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1222 RM5);
1223 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1224 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1225 {
1226 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1227 break;
1228 }
1229 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1230 ecode += GET(ecode,1);
1231 }
1232 while (*ecode == OP_ALT);
1233
1234 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1235
1236 ecode += 1 + LINK_SIZE;
1237 continue;
1238
1239 /* Move the subject pointer back. This occurs only at the start of
1240 each branch of a lookbehind assertion. If we are too close to the start to
1241 move back, this match function fails. When working with UTF-8 we move
1242 back a number of characters, not bytes. */
1243
1244 case OP_REVERSE:
1245 #ifdef SUPPORT_UTF8
1246 if (utf8)
1247 {
1248 i = GET(ecode, 1);
1249 while (i-- > 0)
1250 {
1251 eptr--;
1252 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1253 BACKCHAR(eptr);
1254 }
1255 }
1256 else
1257 #endif
1258
1259 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1260
1261 {
1262 eptr -= GET(ecode, 1);
1263 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1264 }
1265
1266 /* Save the earliest consulted character, then skip to next op code */
1267
1268 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1269 ecode += 1 + LINK_SIZE;
1270 break;
1271
1272 /* The callout item calls an external function, if one is provided, passing
1273 details of the match so far. This is mainly for debugging, though the
1274 function is able to force a failure. */
1275
1276 case OP_CALLOUT:
1277 if (pcre_callout != NULL)
1278 {
1279 pcre_callout_block cb;
1280 cb.version = 1; /* Version 1 of the callout block */
1281 cb.callout_number = ecode[1];
1282 cb.offset_vector = md->offset_vector;
1283 cb.subject = (PCRE_SPTR)md->start_subject;
1284 cb.subject_length = (int)(md->end_subject - md->start_subject);
1285 cb.start_match = (int)(mstart - md->start_subject);
1286 cb.current_position = (int)(eptr - md->start_subject);
1287 cb.pattern_position = GET(ecode, 2);
1288 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1289 cb.capture_top = offset_top/2;
1290 cb.capture_last = md->capture_last;
1291 cb.callout_data = md->callout_data;
1292 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1293 if (rrc < 0) RRETURN(rrc);
1294 }
1295 ecode += 2 + 2*LINK_SIZE;
1296 break;
1297
1298 /* Recursion either matches the current regex, or some subexpression. The
1299 offset data is the offset to the starting bracket from the start of the
1300 whole pattern. (This is so that it works from duplicated subpatterns.)
1301
1302 If there are any capturing brackets started but not finished, we have to
1303 save their starting points and reinstate them after the recursion. However,
1304 we don't know how many such there are (offset_top records the completed
1305 total) so we just have to save all the potential data. There may be up to
1306 65535 such values, which is too large to put on the stack, but using malloc
1307 for small numbers seems expensive. As a compromise, the stack is used when
1308 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1309 is used. If the malloc fails, the recursion is not attempted: matching is
1310 abandoned at this point by returning PCRE_ERROR_NOMEMORY, before any of
1311 the offset data has been copied.
1312
1313 There are also other values that have to be saved. We use a chained
1314 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1315 for the original version of this logic. */
1316
1317 case OP_RECURSE:
1318 {
1319 callpat = md->start_code + GET(ecode, 1);
1320 new_recursive.group_num = (callpat == md->start_code)? 0 :
1321 GET2(callpat, 1 + LINK_SIZE);
1322
1323 /* Add to "recursing stack" */
1324
1325 new_recursive.prevrec = md->recursive;
1326 md->recursive = &new_recursive;
1327
1328 /* Find where to continue from afterwards */
1329
1330 ecode += 1 + LINK_SIZE;
1331 new_recursive.after_call = ecode;
1332
1333 /* Now save the offset data. */
1334
1335 new_recursive.saved_max = md->offset_end;
1336 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1337 new_recursive.offset_save = stacksave;
1338 else
1339 {
1340 new_recursive.offset_save =
1341 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1342 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1343 }
1344
1345 memcpy(new_recursive.offset_save, md->offset_vector,
1346 new_recursive.saved_max * sizeof(int));
1347 new_recursive.save_offset_top = offset_top;
1348
1349 /* OK, now we can do the recursion. For each top-level alternative we
1350 restore the offset and recursion data. */
1351
1352 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1353 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1354 do
1355 {
1356 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1357 md, ims, eptrb, flags, RM6);
1358 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1359 {
1360 DPRINTF(("Recursion matched\n"));
1361 md->recursive = new_recursive.prevrec;
1362 if (new_recursive.offset_save != stacksave)
1363 (pcre_free)(new_recursive.offset_save);
1364 MRRETURN(MATCH_MATCH);
1365 }
1366 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1367 {
1368 DPRINTF(("Recursion gave error %d\n", rrc));
1369 if (new_recursive.offset_save != stacksave)
1370 (pcre_free)(new_recursive.offset_save);
1371 RRETURN(rrc);
1372 }
1373
1374 md->recursive = &new_recursive;
1375 memcpy(md->offset_vector, new_recursive.offset_save,
1376 new_recursive.saved_max * sizeof(int));
1377 callpat += GET(callpat, 1);
1378 }
1379 while (*callpat == OP_ALT);
1380
1381 DPRINTF(("Recursion didn't match\n"));
1382 md->recursive = new_recursive.prevrec;
1383 if (new_recursive.offset_save != stacksave)
1384 (pcre_free)(new_recursive.offset_save);
1385 MRRETURN(MATCH_NOMATCH);
1386 }
1387 /* Control never reaches here */
1388
1389 /* "Once" brackets are like assertion brackets except that after a match,
1390 the point in the subject string is not moved back. Thus there can never be
1391 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1392 Check the alternative branches in turn - the matching won't pass the KET
1393 for this kind of subpattern. If any one branch matches, we carry on as at
1394 the end of a normal bracket, leaving the subject pointer, but resetting
1395 the start-of-match value in case it was changed by \K. */
1396
1397 case OP_ONCE:
1398 prev = ecode;
1399 saved_eptr = eptr;
1400
1401 do
1402 {
1403 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1404 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1405 {
1406 mstart = md->start_match_ptr;
1407 break;
1408 }
1409 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1410 ecode += GET(ecode,1);
1411 }
1412 while (*ecode == OP_ALT);
1413
1414 /* If hit the end of the group (which could be repeated), fail */
1415
1416 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1417
1418 /* Continue as from after the assertion, updating the offsets high water
1419 mark, since extracts may have been taken. */
1420
1421 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1422
1423 offset_top = md->end_offset_top;
1424 eptr = md->end_match_ptr;
1425
1426 /* For a non-repeating ket, just continue at this level. This also
1427 happens for a repeating ket if no characters were matched in the group.
1428 This is the forcible breaking of infinite loops as implemented in Perl
1429 5.005. If there is an options reset, it will get obeyed in the normal
1430 course of events. */
1431
1432 if (*ecode == OP_KET || eptr == saved_eptr)
1433 {
1434 ecode += 1+LINK_SIZE;
1435 break;
1436 }
1437
1438 /* The repeating kets try the rest of the pattern or restart from the
1439 preceding bracket, in the appropriate order. The second "call" of match()
1440 uses tail recursion, to avoid using another stack frame. We need to reset
1441 any options that changed within the bracket before re-running it, so
1442 check the next opcode. */
1443
1444 if (ecode[1+LINK_SIZE] == OP_OPT)
1445 {
1446 ims = (ims & ~PCRE_IMS) | ecode[4];
1447 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1448 }
1449
1450 if (*ecode == OP_KETRMIN)
1451 {
1452 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1453 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1454 ecode = prev;
1455 flags = 0;
1456 goto TAIL_RECURSE;
1457 }
1458 else /* OP_KETRMAX */
1459 {
1460 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1461 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1462 ecode += 1 + LINK_SIZE;
1463 flags = 0;
1464 goto TAIL_RECURSE;
1465 }
1466 /* Control never gets here */
1467
1468 /* An alternation is the end of a branch; scan along to find the end of the
1469 bracketed group and go to there. */
1470
1471 case OP_ALT:
1472 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1473 break;
1474
1475 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1476 indicating that it may occur zero times. It may repeat infinitely, or not
1477 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1478 with fixed upper repeat limits are compiled as a number of copies, with the
1479 optional ones preceded by BRAZERO or BRAMINZERO. */
1480
1481 case OP_BRAZERO:
1482 {
1483 next = ecode+1;
1484 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1485 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1486 do next += GET(next,1); while (*next == OP_ALT);
1487 ecode = next + 1 + LINK_SIZE;
1488 }
1489 break;
1490
1491 case OP_BRAMINZERO:
1492 {
1493 next = ecode+1;
1494 do next += GET(next, 1); while (*next == OP_ALT);
1495 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1496 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1497 ecode++;
1498 }
1499 break;
1500
1501 case OP_SKIPZERO:
1502 {
1503 next = ecode+1;
1504 do next += GET(next,1); while (*next == OP_ALT);
1505 ecode = next + 1 + LINK_SIZE;
1506 }
1507 break;
1508
1509 /* End of a group, repeated or non-repeating. */
1510
1511 case OP_KET:
1512 case OP_KETRMIN:
1513 case OP_KETRMAX:
1514 prev = ecode - GET(ecode, 1);
1515
1516 /* If this was a group that remembered the subject start, in order to break
1517 infinite repeats of empty string matches, retrieve the subject start from
1518 the chain. Otherwise, set it NULL. */
1519
1520 if (*prev >= OP_SBRA)
1521 {
1522 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1523 eptrb = eptrb->epb_prev; /* Backup to previous group */
1524 }
1525 else saved_eptr = NULL;
1526
1527 /* If we are at the end of an assertion group or an atomic group, stop
1528 matching and return MATCH_MATCH, but record the current high water mark for
1529 use by positive assertions. We also need to record the match start in case
1530 it was changed by \K. */
1531
1532 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1533 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1534 *prev == OP_ONCE)
1535 {
1536 md->end_match_ptr = eptr; /* For ONCE */
1537 md->end_offset_top = offset_top;
1538 md->start_match_ptr = mstart;
1539 MRRETURN(MATCH_MATCH);
1540 }
1541
1542 /* For capturing groups we have to check the group number back at the start
1543 and if necessary complete handling an extraction by setting the offsets and
1544 bumping the high water mark. Note that whole-pattern recursion is coded as
1545 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1546 when the OP_END is reached. Other recursion is handled here. */
1547
1548 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1549 {
1550 number = GET2(prev, 1+LINK_SIZE);
1551 offset = number << 1;
1552
1553 #ifdef PCRE_DEBUG
1554 printf("end bracket %d", number);
1555 printf("\n");
1556 #endif
1557
1558 md->capture_last = number;
1559 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1560 {
1561 md->offset_vector[offset] =
1562 md->offset_vector[md->offset_end - number];
1563 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1564 if (offset_top <= offset) offset_top = offset + 2;
1565 }
1566
1567 /* Handle a recursively called group. Restore the offsets
1568 appropriately and continue from after the call. */
1569
1570 if (md->recursive != NULL && md->recursive->group_num == number)
1571 {
1572 recursion_info *rec = md->recursive;
1573 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1574 md->recursive = rec->prevrec;
1575 memcpy(md->offset_vector, rec->offset_save,
1576 rec->saved_max * sizeof(int));
1577 offset_top = rec->save_offset_top;
1578 ecode = rec->after_call;
1579 ims = original_ims;
1580 break;
1581 }
1582 }
1583
1584 /* For both capturing and non-capturing groups, reset the value of the ims
1585 flags, in case they got changed during the group. */
1586
1587 ims = original_ims;
1588 DPRINTF(("ims reset to %02lx\n", ims));
1589
1590 /* For a non-repeating ket, just continue at this level. This also
1591 happens for a repeating ket if no characters were matched in the group.
1592 This is the forcible breaking of infinite loops as implemented in Perl
1593 5.005. If there is an options reset, it will get obeyed in the normal
1594 course of events. */
1595
1596 if (*ecode == OP_KET || eptr == saved_eptr)
1597 {
1598 ecode += 1 + LINK_SIZE;
1599 break;
1600 }
1601
1602 /* The repeating kets try the rest of the pattern or restart from the
1603 preceding bracket, in the appropriate order. In the second case, we can use
1604 tail recursion to avoid using another stack frame, unless we have an
1605 unlimited repeat of a group that can match an empty string. */
1606
1607 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1608
1609 if (*ecode == OP_KETRMIN)
1610 {
1611 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1612 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1613 if (flags != 0) /* Could match an empty string */
1614 {
1615 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1616 RRETURN(rrc);
1617 }
1618 ecode = prev;
1619 goto TAIL_RECURSE;
1620 }
1621 else /* OP_KETRMAX */
1622 {
1623 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1624 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1625 ecode += 1 + LINK_SIZE;
1626 flags = 0;
1627 goto TAIL_RECURSE;
1628 }
1629 /* Control never gets here */
1630
1631 /* Start of subject unless notbol, or after internal newline if multiline */
1632
1633 case OP_CIRC:
1634 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1635 if ((ims & PCRE_MULTILINE) != 0)
1636 {
1637 if (eptr != md->start_subject &&
1638 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1639 MRRETURN(MATCH_NOMATCH);
1640 ecode++;
1641 break;
1642 }
1643 /* ... else fall through */
1644
1645 /* Start of subject assertion */
1646
1647 case OP_SOD:
1648 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1649 ecode++;
1650 break;
1651
1652 /* Start of match assertion */
1653
1654 case OP_SOM:
1655 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1656 ecode++;
1657 break;
1658
1659 /* Reset the start of match point */
1660
1661 case OP_SET_SOM:
1662 mstart = eptr;
1663 ecode++;
1664 break;
1665
1666 /* Assert before internal newline if multiline, or before a terminating
1667 newline unless endonly is set, else end of subject unless noteol is set. */
1668
1669 case OP_DOLL:
1670 if ((ims & PCRE_MULTILINE) != 0)
1671 {
1672 if (eptr < md->end_subject)
1673 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1674 else
1675 { if (md->noteol) MRRETURN(MATCH_NOMATCH); }
1676 ecode++;
1677 break;
1678 }
1679 else
1680 {
1681 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1682 if (!md->endonly)
1683 {
1684 if (eptr != md->end_subject &&
1685 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1686 MRRETURN(MATCH_NOMATCH);
1687 ecode++;
1688 break;
1689 }
1690 }
1691 /* ... else fall through for endonly */
1692
1693 /* End of subject assertion (\z) */
1694
1695 case OP_EOD:
1696 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1697 ecode++;
1698 break;
1699
1700 /* End of subject or ending \n assertion (\Z) */
1701
1702 case OP_EODN:
1703 if (eptr != md->end_subject &&
1704 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1705 MRRETURN(MATCH_NOMATCH);
1706 ecode++;
1707 break;
1708
1709 /* Word boundary assertions */
1710
1711 case OP_NOT_WORD_BOUNDARY:
1712 case OP_WORD_BOUNDARY:
1713 {
1714
1715 /* Find out if the previous and current characters are "word" characters.
1716 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1717 be "non-word" characters. Remember the earliest consulted character for
1718 partial matching. */
1719
1720 #ifdef SUPPORT_UTF8
1721 if (utf8)
1722 {
1723 /* Get status of previous character */
1724
1725 if (eptr == md->start_subject) prev_is_word = FALSE; else
1726 {
1727 USPTR lastptr = eptr - 1;
1728 while((*lastptr & 0xc0) == 0x80) lastptr--;
1729 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1730 GETCHAR(c, lastptr);
1731 #ifdef SUPPORT_UCP
1732 if (md->use_ucp)
1733 {
1734 if (c == '_') prev_is_word = TRUE; else
1735 {
1736 int cat = UCD_CATEGORY(c);
1737 prev_is_word = (cat == ucp_L || cat == ucp_N);
1738 }
1739 }
1740 else
1741 #endif
1742 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1743 }
1744
1745 /* Get status of next character */
1746
1747 if (eptr >= md->end_subject)
1748 {
1749 SCHECK_PARTIAL();
1750 cur_is_word = FALSE;
1751 }
1752 else
1753 {
1754 GETCHAR(c, eptr);
1755 #ifdef SUPPORT_UCP
1756 if (md->use_ucp)
1757 {
1758 if (c == '_') cur_is_word = TRUE; else
1759 {
1760 int cat = UCD_CATEGORY(c);
1761 cur_is_word = (cat == ucp_L || cat == ucp_N);
1762 }
1763 }
1764 else
1765 #endif
1766 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1767 }
1768 }
1769 else
1770 #endif
1771
1772 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1773 consistency with the behaviour of \w we do use it in this case. */
1774
1775 {
1776 /* Get status of previous character */
1777
1778 if (eptr == md->start_subject) prev_is_word = FALSE; else
1779 {
1780 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1781 #ifdef SUPPORT_UCP
1782 if (md->use_ucp)
1783 {
1784 c = eptr[-1];
1785 if (c == '_') prev_is_word = TRUE; else
1786 {
1787 int cat = UCD_CATEGORY(c);
1788 prev_is_word = (cat == ucp_L || cat == ucp_N);
1789 }
1790 }
1791 else
1792 #endif
1793 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1794 }
1795
1796 /* Get status of next character */
1797
1798 if (eptr >= md->end_subject)
1799 {
1800 SCHECK_PARTIAL();
1801 cur_is_word = FALSE;
1802 }
1803 else
1804 #ifdef SUPPORT_UCP
1805 if (md->use_ucp)
1806 {
1807 c = *eptr;
1808 if (c == '_') cur_is_word = TRUE; else
1809 {
1810 int cat = UCD_CATEGORY(c);
1811 cur_is_word = (cat == ucp_L || cat == ucp_N);
1812 }
1813 }
1814 else
1815 #endif
1816 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1817 }
1818
1819 /* Now see if the situation is what we want */
1820
1821 if ((*ecode++ == OP_WORD_BOUNDARY)?
1822 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1823 MRRETURN(MATCH_NOMATCH);
1824 }
1825 break;
1826
1827 /* Match a single character type; inline for speed */
1828
1829 case OP_ANY:
1830 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1831 /* Fall through */
1832
1833 case OP_ALLANY:
1834 if (eptr++ >= md->end_subject)
1835 {
1836 SCHECK_PARTIAL();
1837 MRRETURN(MATCH_NOMATCH);
1838 }
1839 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1840 ecode++;
1841 break;
1842
1843 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1844 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1845
1846 case OP_ANYBYTE:
1847 if (eptr++ >= md->end_subject)
1848 {
1849 SCHECK_PARTIAL();
1850 MRRETURN(MATCH_NOMATCH);
1851 }
1852 ecode++;
1853 break;
1854
1855 case OP_NOT_DIGIT:
1856 if (eptr >= md->end_subject)
1857 {
1858 SCHECK_PARTIAL();
1859 MRRETURN(MATCH_NOMATCH);
1860 }
1861 GETCHARINCTEST(c, eptr);
1862 if (
1863 #ifdef SUPPORT_UTF8
1864 c < 256 &&
1865 #endif
1866 (md->ctypes[c] & ctype_digit) != 0
1867 )
1868 MRRETURN(MATCH_NOMATCH);
1869 ecode++;
1870 break;
1871
1872 case OP_DIGIT:
1873 if (eptr >= md->end_subject)
1874 {
1875 SCHECK_PARTIAL();
1876 MRRETURN(MATCH_NOMATCH);
1877 }
1878 GETCHARINCTEST(c, eptr);
1879 if (
1880 #ifdef SUPPORT_UTF8
1881 c >= 256 ||
1882 #endif
1883 (md->ctypes[c] & ctype_digit) == 0
1884 )
1885 MRRETURN(MATCH_NOMATCH);
1886 ecode++;
1887 break;
1888
1889 case OP_NOT_WHITESPACE:
1890 if (eptr >= md->end_subject)
1891 {
1892 SCHECK_PARTIAL();
1893 MRRETURN(MATCH_NOMATCH);
1894 }
1895 GETCHARINCTEST(c, eptr);
1896 if (
1897 #ifdef SUPPORT_UTF8
1898 c < 256 &&
1899 #endif
1900 (md->ctypes[c] & ctype_space) != 0
1901 )
1902 MRRETURN(MATCH_NOMATCH);
1903 ecode++;
1904 break;
1905
1906 case OP_WHITESPACE:
1907 if (eptr >= md->end_subject)
1908 {
1909 SCHECK_PARTIAL();
1910 MRRETURN(MATCH_NOMATCH);
1911 }
1912 GETCHARINCTEST(c, eptr);
1913 if (
1914 #ifdef SUPPORT_UTF8
1915 c >= 256 ||
1916 #endif
1917 (md->ctypes[c] & ctype_space) == 0
1918 )
1919 MRRETURN(MATCH_NOMATCH);
1920 ecode++;
1921 break;
1922
1923 case OP_NOT_WORDCHAR:
1924 if (eptr >= md->end_subject)
1925 {
1926 SCHECK_PARTIAL();
1927 MRRETURN(MATCH_NOMATCH);
1928 }
1929 GETCHARINCTEST(c, eptr);
1930 if (
1931 #ifdef SUPPORT_UTF8
1932 c < 256 &&
1933 #endif
1934 (md->ctypes[c] & ctype_word) != 0
1935 )
1936 MRRETURN(MATCH_NOMATCH);
1937 ecode++;
1938 break;
1939
1940 case OP_WORDCHAR:
1941 if (eptr >= md->end_subject)
1942 {
1943 SCHECK_PARTIAL();
1944 MRRETURN(MATCH_NOMATCH);
1945 }
1946 GETCHARINCTEST(c, eptr);
1947 if (
1948 #ifdef SUPPORT_UTF8
1949 c >= 256 ||
1950 #endif
1951 (md->ctypes[c] & ctype_word) == 0
1952 )
1953 MRRETURN(MATCH_NOMATCH);
1954 ecode++;
1955 break;
1956
1957 case OP_ANYNL:
1958 if (eptr >= md->end_subject)
1959 {
1960 SCHECK_PARTIAL();
1961 MRRETURN(MATCH_NOMATCH);
1962 }
1963 GETCHARINCTEST(c, eptr);
1964 switch(c)
1965 {
1966 default: MRRETURN(MATCH_NOMATCH);
1967 case 0x000d:
1968 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1969 break;
1970
1971 case 0x000a:
1972 break;
1973
1974 case 0x000b:
1975 case 0x000c:
1976 case 0x0085:
1977 case 0x2028:
1978 case 0x2029:
1979 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1980 break;
1981 }
1982 ecode++;
1983 break;
1984
1985 case OP_NOT_HSPACE:
1986 if (eptr >= md->end_subject)
1987 {
1988 SCHECK_PARTIAL();
1989 MRRETURN(MATCH_NOMATCH);
1990 }
1991 GETCHARINCTEST(c, eptr);
1992 switch(c)
1993 {
1994 default: break;
1995 case 0x09: /* HT */
1996 case 0x20: /* SPACE */
1997 case 0xa0: /* NBSP */
1998 case 0x1680: /* OGHAM SPACE MARK */
1999 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2000 case 0x2000: /* EN QUAD */
2001 case 0x2001: /* EM QUAD */
2002 case 0x2002: /* EN SPACE */
2003 case 0x2003: /* EM SPACE */
2004 case 0x2004: /* THREE-PER-EM SPACE */
2005 case 0x2005: /* FOUR-PER-EM SPACE */
2006 case 0x2006: /* SIX-PER-EM SPACE */
2007 case 0x2007: /* FIGURE SPACE */
2008 case 0x2008: /* PUNCTUATION SPACE */
2009 case 0x2009: /* THIN SPACE */
2010 case 0x200A: /* HAIR SPACE */
2011 case 0x202f: /* NARROW NO-BREAK SPACE */
2012 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2013 case 0x3000: /* IDEOGRAPHIC SPACE */
2014 MRRETURN(MATCH_NOMATCH);
2015 }
2016 ecode++;
2017 break;
2018
2019 case OP_HSPACE:
2020 if (eptr >= md->end_subject)
2021 {
2022 SCHECK_PARTIAL();
2023 MRRETURN(MATCH_NOMATCH);
2024 }
2025 GETCHARINCTEST(c, eptr);
2026 switch(c)
2027 {
2028 default: MRRETURN(MATCH_NOMATCH);
2029 case 0x09: /* HT */
2030 case 0x20: /* SPACE */
2031 case 0xa0: /* NBSP */
2032 case 0x1680: /* OGHAM SPACE MARK */
2033 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2034 case 0x2000: /* EN QUAD */
2035 case 0x2001: /* EM QUAD */
2036 case 0x2002: /* EN SPACE */
2037 case 0x2003: /* EM SPACE */
2038 case 0x2004: /* THREE-PER-EM SPACE */
2039 case 0x2005: /* FOUR-PER-EM SPACE */
2040 case 0x2006: /* SIX-PER-EM SPACE */
2041 case 0x2007: /* FIGURE SPACE */
2042 case 0x2008: /* PUNCTUATION SPACE */
2043 case 0x2009: /* THIN SPACE */
2044 case 0x200A: /* HAIR SPACE */
2045 case 0x202f: /* NARROW NO-BREAK SPACE */
2046 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2047 case 0x3000: /* IDEOGRAPHIC SPACE */
2048 break;
2049 }
2050 ecode++;
2051 break;
2052
2053 case OP_NOT_VSPACE:
2054 if (eptr >= md->end_subject)
2055 {
2056 SCHECK_PARTIAL();
2057 MRRETURN(MATCH_NOMATCH);
2058 }
2059 GETCHARINCTEST(c, eptr);
2060 switch(c)
2061 {
2062 default: break;
2063 case 0x0a: /* LF */
2064 case 0x0b: /* VT */
2065 case 0x0c: /* FF */
2066 case 0x0d: /* CR */
2067 case 0x85: /* NEL */
2068 case 0x2028: /* LINE SEPARATOR */
2069 case 0x2029: /* PARAGRAPH SEPARATOR */
2070 MRRETURN(MATCH_NOMATCH);
2071 }
2072 ecode++;
2073 break;
2074
2075 case OP_VSPACE:
2076 if (eptr >= md->end_subject)
2077 {
2078 SCHECK_PARTIAL();
2079 MRRETURN(MATCH_NOMATCH);
2080 }
2081 GETCHARINCTEST(c, eptr);
2082 switch(c)
2083 {
2084 default: MRRETURN(MATCH_NOMATCH);
2085 case 0x0a: /* LF */
2086 case 0x0b: /* VT */
2087 case 0x0c: /* FF */
2088 case 0x0d: /* CR */
2089 case 0x85: /* NEL */
2090 case 0x2028: /* LINE SEPARATOR */
2091 case 0x2029: /* PARAGRAPH SEPARATOR */
2092 break;
2093 }
2094 ecode++;
2095 break;
2096
2097 #ifdef SUPPORT_UCP
2098 /* Check the next character by Unicode property. We will get here only
2099 if the support is in the binary; otherwise a compile-time error occurs. */
2100
2101 case OP_PROP:
2102 case OP_NOTPROP:
2103 if (eptr >= md->end_subject)
2104 {
2105 SCHECK_PARTIAL();
2106 MRRETURN(MATCH_NOMATCH);
2107 }
2108 GETCHARINCTEST(c, eptr);
2109 {
2110 const ucd_record *prop = GET_UCD(c);
2111
2112 switch(ecode[1])
2113 {
2114 case PT_ANY:
2115 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2116 break;
2117
2118 case PT_LAMP:
2119 if ((prop->chartype == ucp_Lu ||
2120 prop->chartype == ucp_Ll ||
2121 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2122 MRRETURN(MATCH_NOMATCH);
2123 break;
2124
2125 case PT_GC:
2126 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2127 MRRETURN(MATCH_NOMATCH);
2128 break;
2129
2130 case PT_PC:
2131 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2132 MRRETURN(MATCH_NOMATCH);
2133 break;
2134
2135 case PT_SC:
2136 if ((ecode[2] != prop->script) == (op == OP_PROP))
2137 MRRETURN(MATCH_NOMATCH);
2138 break;
2139
2140 /* These are specials */
2141
2142 case PT_ALNUM:
2143 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2144 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2145 MRRETURN(MATCH_NOMATCH);
2146 break;
2147
2148 case PT_SPACE: /* Perl space */
2149 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2150 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2151 == (op == OP_NOTPROP))
2152 MRRETURN(MATCH_NOMATCH);
2153 break;
2154
2155 case PT_PXSPACE: /* POSIX space */
2156 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2157 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2158 c == CHAR_FF || c == CHAR_CR)
2159 == (op == OP_NOTPROP))
2160 MRRETURN(MATCH_NOMATCH);
2161 break;
2162
2163 case PT_WORD:
2164 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2165 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2166 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2167 MRRETURN(MATCH_NOMATCH);
2168 break;
2169
2170 /* This should never occur */
2171
2172 default:
2173 RRETURN(PCRE_ERROR_INTERNAL);
2174 }
2175
2176 ecode += 3;
2177 }
2178 break;
2179
2180 /* Match an extended Unicode sequence. We will get here only if the support
2181 is in the binary; otherwise a compile-time error occurs. */
2182
2183 case OP_EXTUNI:
2184 if (eptr >= md->end_subject)
2185 {
2186 SCHECK_PARTIAL();
2187 MRRETURN(MATCH_NOMATCH);
2188 }
2189 GETCHARINCTEST(c, eptr);
2190 {
2191 int category = UCD_CATEGORY(c);
2192 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2193 while (eptr < md->end_subject)
2194 {
2195 int len = 1;
2196 if (!utf8) c = *eptr; else
2197 {
2198 GETCHARLEN(c, eptr, len);
2199 }
2200 category = UCD_CATEGORY(c);
2201 if (category != ucp_M) break;
2202 eptr += len;
2203 }
2204 }
2205 ecode++;
2206 break;
2207 #endif
2208
2209
2210 /* Match a back reference, possibly repeatedly. Look past the end of the
2211 item to see if there is repeat information following. The code is similar
2212 to that for character classes, but repeated for efficiency. Then obey
2213 similar code to character type repeats - written out again for speed.
2214 However, if the referenced string is the empty string, always treat
2215 it as matched, any number of times (otherwise there could be infinite
2216 loops). */
2217
2218 case OP_REF:
2219 {
2220 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2221 ecode += 3;
2222
2223 /* If the reference is unset, there are two possibilities:
2224
2225 (a) In the default, Perl-compatible state, set the length to be longer
2226 than the amount of subject left; this ensures that every attempt at a
2227 match fails. We can't just fail here, because of the possibility of
2228 quantifiers with zero minima.
2229
2230 (b) If the JavaScript compatibility flag is set, set the length to zero
2231 so that the back reference matches an empty string.
2232
2233 Otherwise, set the length to the length of what was matched by the
2234 referenced subpattern. */
2235
2236 if (offset >= offset_top || md->offset_vector[offset] < 0)
2237 length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);
2238 else
2239 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2240
2241 /* Set up for repetition, or handle the non-repeated case */
2242
2243 switch (*ecode)
2244 {
2245 case OP_CRSTAR:
2246 case OP_CRMINSTAR:
2247 case OP_CRPLUS:
2248 case OP_CRMINPLUS:
2249 case OP_CRQUERY:
2250 case OP_CRMINQUERY:
2251 c = *ecode++ - OP_CRSTAR;
2252 minimize = (c & 1) != 0;
2253 min = rep_min[c]; /* Pick up values from tables; */
2254 max = rep_max[c]; /* zero for max => infinity */
2255 if (max == 0) max = INT_MAX;
2256 break;
2257
2258 case OP_CRRANGE:
2259 case OP_CRMINRANGE:
2260 minimize = (*ecode == OP_CRMINRANGE);
2261 min = GET2(ecode, 1);
2262 max = GET2(ecode, 3);
2263 if (max == 0) max = INT_MAX;
2264 ecode += 5;
2265 break;
2266
2267 default: /* No repeat follows */
2268 if (!match_ref(offset, eptr, length, md, ims))
2269 {
2270 CHECK_PARTIAL();
2271 MRRETURN(MATCH_NOMATCH);
2272 }
2273 eptr += length;
2274 continue; /* With the main loop */
2275 }
2276
2277 /* If the length of the reference is zero, just continue with the
2278 main loop. */
2279
2280 if (length == 0) continue;
2281
2282 /* First, ensure the minimum number of matches are present. We get back
2283 the length of the reference string explicitly rather than passing the
2284 address of eptr, so that eptr can be a register variable. */
2285
2286 for (i = 1; i <= min; i++)
2287 {
2288 if (!match_ref(offset, eptr, length, md, ims))
2289 {
2290 CHECK_PARTIAL();
2291 MRRETURN(MATCH_NOMATCH);
2292 }
2293 eptr += length;
2294 }
2295
2296 /* If min = max, continue at the same level without recursion.
2297 They are not both allowed to be zero. */
2298
2299 if (min == max) continue;
2300
2301 /* If minimizing, keep trying and advancing the pointer */
2302
2303 if (minimize)
2304 {
2305 for (fi = min;; fi++)
2306 {
2307 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2308 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2309 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2310 if (!match_ref(offset, eptr, length, md, ims))
2311 {
2312 CHECK_PARTIAL();
2313 MRRETURN(MATCH_NOMATCH);
2314 }
2315 eptr += length;
2316 }
2317 /* Control never gets here */
2318 }
2319
2320 /* If maximizing, find the longest string and work backwards */
2321
2322 else
2323 {
2324 pp = eptr;
2325 for (i = min; i < max; i++)
2326 {
2327 if (!match_ref(offset, eptr, length, md, ims))
2328 {
2329 CHECK_PARTIAL();
2330 break;
2331 }
2332 eptr += length;
2333 }
2334 while (eptr >= pp)
2335 {
2336 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2337 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2338 eptr -= length;
2339 }
2340 MRRETURN(MATCH_NOMATCH);
2341 }
2342 }
2343 /* Control never gets here */
2344
2345 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2346 used when all the characters in the class have values in the range 0-255,
2347 and either the matching is caseful, or the characters are in the range
2348 0-127 when UTF-8 processing is enabled. The only difference between
2349 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2350 encountered.
2351
2352 First, look past the end of the item to see if there is repeat information
2353 following. Then obey similar code to character type repeats - written out
2354 again for speed. */
2355
2356 case OP_NCLASS:
2357 case OP_CLASS:
2358 {
2359 data = ecode + 1; /* Save for matching */
2360 ecode += 33; /* Advance past the item */
2361
2362 switch (*ecode)
2363 {
2364 case OP_CRSTAR:
2365 case OP_CRMINSTAR:
2366 case OP_CRPLUS:
2367 case OP_CRMINPLUS:
2368 case OP_CRQUERY:
2369 case OP_CRMINQUERY:
2370 c = *ecode++ - OP_CRSTAR;
2371 minimize = (c & 1) != 0;
2372 min = rep_min[c]; /* Pick up values from tables; */
2373 max = rep_max[c]; /* zero for max => infinity */
2374 if (max == 0) max = INT_MAX;
2375 break;
2376
2377 case OP_CRRANGE:
2378 case OP_CRMINRANGE:
2379 minimize = (*ecode == OP_CRMINRANGE);
2380 min = GET2(ecode, 1);
2381 max = GET2(ecode, 3);
2382 if (max == 0) max = INT_MAX;
2383 ecode += 5;
2384 break;
2385
2386 default: /* No repeat follows */
2387 min = max = 1;
2388 break;
2389 }
2390
2391 /* First, ensure the minimum number of matches are present. */
2392
2393 #ifdef SUPPORT_UTF8
2394 /* UTF-8 mode */
2395 if (utf8)
2396 {
2397 for (i = 1; i <= min; i++)
2398 {
2399 if (eptr >= md->end_subject)
2400 {
2401 SCHECK_PARTIAL();
2402 MRRETURN(MATCH_NOMATCH);
2403 }
2404 GETCHARINC(c, eptr);
2405 if (c > 255)
2406 {
2407 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2408 }
2409 else
2410 {
2411 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2412 }
2413 }
2414 }
2415 else
2416 #endif
2417 /* Not UTF-8 mode */
2418 {
2419 for (i = 1; i <= min; i++)
2420 {
2421 if (eptr >= md->end_subject)
2422 {
2423 SCHECK_PARTIAL();
2424 MRRETURN(MATCH_NOMATCH);
2425 }
2426 c = *eptr++;
2427 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2428 }
2429 }
2430
2431 /* If max == min we can continue with the main loop without the
2432 need to recurse. */
2433
2434 if (min == max) continue;
2435
2436 /* If minimizing, keep testing the rest of the expression and advancing
2437 the pointer while it matches the class. */
2438
2439 if (minimize)
2440 {
2441 #ifdef SUPPORT_UTF8
2442 /* UTF-8 mode */
2443 if (utf8)
2444 {
2445 for (fi = min;; fi++)
2446 {
2447 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2448 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2449 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2450 if (eptr >= md->end_subject)
2451 {
2452 SCHECK_PARTIAL();
2453 MRRETURN(MATCH_NOMATCH);
2454 }
2455 GETCHARINC(c, eptr);
2456 if (c > 255)
2457 {
2458 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2459 }
2460 else
2461 {
2462 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2463 }
2464 }
2465 }
2466 else
2467 #endif
2468 /* Not UTF-8 mode */
2469 {
2470 for (fi = min;; fi++)
2471 {
2472 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2473 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2474 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2475 if (eptr >= md->end_subject)
2476 {
2477 SCHECK_PARTIAL();
2478 MRRETURN(MATCH_NOMATCH);
2479 }
2480 c = *eptr++;
2481 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2482 }
2483 }
2484 /* Control never gets here */
2485 }
2486
2487 /* If maximizing, find the longest possible run, then work backwards. */
2488
2489 else
2490 {
2491 pp = eptr;
2492
2493 #ifdef SUPPORT_UTF8
2494 /* UTF-8 mode */
2495 if (utf8)
2496 {
2497 for (i = min; i < max; i++)
2498 {
2499 int len = 1;
2500 if (eptr >= md->end_subject)
2501 {
2502 SCHECK_PARTIAL();
2503 break;
2504 }
2505 GETCHARLEN(c, eptr, len);
2506 if (c > 255)
2507 {
2508 if (op == OP_CLASS) break;
2509 }
2510 else
2511 {
2512 if ((data[c/8] & (1 << (c&7))) == 0) break;
2513 }
2514 eptr += len;
2515 }
2516 for (;;)
2517 {
2518 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2519 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2520 if (eptr-- == pp) break; /* Stop if tried at original pos */
2521 BACKCHAR(eptr);
2522 }
2523 }
2524 else
2525 #endif
2526 /* Not UTF-8 mode */
2527 {
2528 for (i = min; i < max; i++)
2529 {
2530 if (eptr >= md->end_subject)
2531 {
2532 SCHECK_PARTIAL();
2533 break;
2534 }
2535 c = *eptr;
2536 if ((data[c/8] & (1 << (c&7))) == 0) break;
2537 eptr++;
2538 }
2539 while (eptr >= pp)
2540 {
2541 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2542 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2543 eptr--;
2544 }
2545 }
2546
2547 MRRETURN(MATCH_NOMATCH);
2548 }
2549 }
2550 /* Control never gets here */
2551
2552
2553 /* Match an extended character class. This opcode is encountered only
2554 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2555 mode, because Unicode properties are supported in non-UTF-8 mode. */
2556
2557 #ifdef SUPPORT_UTF8
2558 case OP_XCLASS:
2559 {
2560 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2561 ecode += GET(ecode, 1); /* Advance past the item */
2562
2563 switch (*ecode)
2564 {
2565 case OP_CRSTAR:
2566 case OP_CRMINSTAR:
2567 case OP_CRPLUS:
2568 case OP_CRMINPLUS:
2569 case OP_CRQUERY:
2570 case OP_CRMINQUERY:
2571 c = *ecode++ - OP_CRSTAR;
2572 minimize = (c & 1) != 0;
2573 min = rep_min[c]; /* Pick up values from tables; */
2574 max = rep_max[c]; /* zero for max => infinity */
2575 if (max == 0) max = INT_MAX;
2576 break;
2577
2578 case OP_CRRANGE:
2579 case OP_CRMINRANGE:
2580 minimize = (*ecode == OP_CRMINRANGE);
2581 min = GET2(ecode, 1);
2582 max = GET2(ecode, 3);
2583 if (max == 0) max = INT_MAX;
2584 ecode += 5;
2585 break;
2586
2587 default: /* No repeat follows */
2588 min = max = 1;
2589 break;
2590 }
2591
2592 /* First, ensure the minimum number of matches are present. */
2593
2594 for (i = 1; i <= min; i++)
2595 {
2596 if (eptr >= md->end_subject)
2597 {
2598 SCHECK_PARTIAL();
2599 MRRETURN(MATCH_NOMATCH);
2600 }
2601 GETCHARINCTEST(c, eptr);
2602 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2603 }
2604
2605 /* If max == min we can continue with the main loop without the
2606 need to recurse. */
2607
2608 if (min == max) continue;
2609
2610 /* If minimizing, keep testing the rest of the expression and advancing
2611 the pointer while it matches the class. */
2612
2613 if (minimize)
2614 {
2615 for (fi = min;; fi++)
2616 {
2617 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2619 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2620 if (eptr >= md->end_subject)
2621 {
2622 SCHECK_PARTIAL();
2623 MRRETURN(MATCH_NOMATCH);
2624 }
2625 GETCHARINCTEST(c, eptr);
2626 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2627 }
2628 /* Control never gets here */
2629 }
2630
2631 /* If maximizing, find the longest possible run, then work backwards. */
2632
2633 else
2634 {
2635 pp = eptr;
2636 for (i = min; i < max; i++)
2637 {
2638 int len = 1;
2639 if (eptr >= md->end_subject)
2640 {
2641 SCHECK_PARTIAL();
2642 break;
2643 }
2644 GETCHARLENTEST(c, eptr, len);
2645 if (!_pcre_xclass(c, data)) break;
2646 eptr += len;
2647 }
2648 for(;;)
2649 {
2650 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2651 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2652 if (eptr-- == pp) break; /* Stop if tried at original pos */
2653 if (utf8) BACKCHAR(eptr);
2654 }
2655 MRRETURN(MATCH_NOMATCH);
2656 }
2657
2658 /* Control never gets here */
2659 }
2660 #endif /* End of XCLASS */
2661
2662 /* Match a single character, casefully */
2663
2664 case OP_CHAR:
2665 #ifdef SUPPORT_UTF8
2666 if (utf8)
2667 {
2668 length = 1;
2669 ecode++;
2670 GETCHARLEN(fc, ecode, length);
2671 if (length > md->end_subject - eptr)
2672 {
2673 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2674 MRRETURN(MATCH_NOMATCH);
2675 }
2676 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2677 }
2678 else
2679 #endif
2680
2681 /* Non-UTF-8 mode */
2682 {
2683 if (md->end_subject - eptr < 1)
2684 {
2685 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2686 MRRETURN(MATCH_NOMATCH);
2687 }
2688 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2689 ecode += 2;
2690 }
2691 break;
2692
2693 /* Match a single character, caselessly */
2694
2695 case OP_CHARNC:
2696 #ifdef SUPPORT_UTF8
2697 if (utf8)
2698 {
2699 length = 1;
2700 ecode++;
2701 GETCHARLEN(fc, ecode, length);
2702
2703 if (length > md->end_subject - eptr)
2704 {
2705 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2706 MRRETURN(MATCH_NOMATCH);
2707 }
2708
2709 /* If the pattern character's value is < 128, we have only one byte, and
2710 can use the fast lookup table. */
2711
2712 if (fc < 128)
2713 {
2714 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2715 }
2716
2717 /* Otherwise we must pick up the subject character */
2718
2719 else
2720 {
2721 unsigned int dc;
2722 GETCHARINC(dc, eptr);
2723 ecode += length;
2724
2725 /* If we have Unicode property support, we can use it to test the other
2726 case of the character, if there is one. */
2727
2728 if (fc != dc)
2729 {
2730 #ifdef SUPPORT_UCP
2731 if (dc != UCD_OTHERCASE(fc))
2732 #endif
2733 MRRETURN(MATCH_NOMATCH);
2734 }
2735 }
2736 }
2737 else
2738 #endif /* SUPPORT_UTF8 */
2739
2740 /* Non-UTF-8 mode */
2741 {
2742 if (md->end_subject - eptr < 1)
2743 {
2744 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2745 MRRETURN(MATCH_NOMATCH);
2746 }
2747 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2748 ecode += 2;
2749 }
2750 break;
2751
2752 /* Match a single character repeatedly. */
2753
2754 case OP_EXACT:
2755 min = max = GET2(ecode, 1);
2756 ecode += 3;
2757 goto REPEATCHAR;
2758
2759 case OP_POSUPTO:
2760 possessive = TRUE;
2761 /* Fall through */
2762
2763 case OP_UPTO:
2764 case OP_MINUPTO:
2765 min = 0;
2766 max = GET2(ecode, 1);
2767 minimize = *ecode == OP_MINUPTO;
2768 ecode += 3;
2769 goto REPEATCHAR;
2770
2771 case OP_POSSTAR:
2772 possessive = TRUE;
2773 min = 0;
2774 max = INT_MAX;
2775 ecode++;
2776 goto REPEATCHAR;
2777
2778 case OP_POSPLUS:
2779 possessive = TRUE;
2780 min = 1;
2781 max = INT_MAX;
2782 ecode++;
2783 goto REPEATCHAR;
2784
2785 case OP_POSQUERY:
2786 possessive = TRUE;
2787 min = 0;
2788 max = 1;
2789 ecode++;
2790 goto REPEATCHAR;
2791
2792 case OP_STAR:
2793 case OP_MINSTAR:
2794 case OP_PLUS:
2795 case OP_MINPLUS:
2796 case OP_QUERY:
2797 case OP_MINQUERY:
2798 c = *ecode++ - OP_STAR;
2799 minimize = (c & 1) != 0;
2800
2801 min = rep_min[c]; /* Pick up values from tables; */
2802 max = rep_max[c]; /* zero for max => infinity */
2803 if (max == 0) max = INT_MAX;
2804
2805 /* Common code for all repeated single-character matches. */
2806
2807 REPEATCHAR:
2808 #ifdef SUPPORT_UTF8
2809 if (utf8)
2810 {
2811 length = 1;
2812 charptr = ecode;
2813 GETCHARLEN(fc, ecode, length);
2814 ecode += length;
2815
2816 /* Handle multibyte character matching specially here. There is
2817 support for caseless matching if UCP support is present. */
2818
2819 if (length > 1)
2820 {
2821 #ifdef SUPPORT_UCP
2822 unsigned int othercase;
2823 if ((ims & PCRE_CASELESS) != 0 &&
2824 (othercase = UCD_OTHERCASE(fc)) != fc)
2825 oclength = _pcre_ord2utf8(othercase, occhars);
2826 else oclength = 0;
2827 #endif /* SUPPORT_UCP */
2828
2829 for (i = 1; i <= min; i++)
2830 {
2831 if (eptr <= md->end_subject - length &&
2832 memcmp(eptr, charptr, length) == 0) eptr += length;
2833 #ifdef SUPPORT_UCP
2834 else if (oclength > 0 &&
2835 eptr <= md->end_subject - oclength &&
2836 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2837 #endif /* SUPPORT_UCP */
2838 else
2839 {
2840 CHECK_PARTIAL();
2841 MRRETURN(MATCH_NOMATCH);
2842 }
2843 }
2844
2845 if (min == max) continue;
2846
2847 if (minimize)
2848 {
2849 for (fi = min;; fi++)
2850 {
2851 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2852 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2853 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2854 if (eptr <= md->end_subject - length &&
2855 memcmp(eptr, charptr, length) == 0) eptr += length;
2856 #ifdef SUPPORT_UCP
2857 else if (oclength > 0 &&
2858 eptr <= md->end_subject - oclength &&
2859 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2860 #endif /* SUPPORT_UCP */
2861 else
2862 {
2863 CHECK_PARTIAL();
2864 MRRETURN(MATCH_NOMATCH);
2865 }
2866 }
2867 /* Control never gets here */
2868 }
2869
2870 else /* Maximize */
2871 {
2872 pp = eptr;
2873 for (i = min; i < max; i++)
2874 {
2875 if (eptr <= md->end_subject - length &&
2876 memcmp(eptr, charptr, length) == 0) eptr += length;
2877 #ifdef SUPPORT_UCP
2878 else if (oclength > 0 &&
2879 eptr <= md->end_subject - oclength &&
2880 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2881 #endif /* SUPPORT_UCP */
2882 else
2883 {
2884 CHECK_PARTIAL();
2885 break;
2886 }
2887 }
2888
2889 if (possessive) continue;
2890
2891 for(;;)
2892 {
2893 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2894 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2895 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
2896 #ifdef SUPPORT_UCP
2897 eptr--;
2898 BACKCHAR(eptr);
2899 #else /* without SUPPORT_UCP */
2900 eptr -= length;
2901 #endif /* SUPPORT_UCP */
2902 }
2903 }
2904 /* Control never gets here */
2905 }
2906
2907 /* If the length of a UTF-8 character is 1, we fall through here, and
2908 obey the code as for non-UTF-8 characters below, though in this case the
2909 value of fc will always be < 128. */
2910 }
2911 else
2912 #endif /* SUPPORT_UTF8 */
2913
2914 /* When not in UTF-8 mode, load a single-byte character. */
2915
2916 fc = *ecode++;
2917
2918 /* The value of fc at this point is always less than 256, though we may or
2919 may not be in UTF-8 mode. The code is duplicated for the caseless and
2920 caseful cases, for speed, since matching characters is likely to be quite
2921 common. First, ensure the minimum number of matches are present. If min =
2922 max, continue at the same level without recursing. Otherwise, if
2923 minimizing, keep trying the rest of the expression and advancing one
2924 matching character if failing, up to the maximum. Alternatively, if
2925 maximizing, find the maximum number of characters and work backwards. */
2926
2927 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2928 max, eptr));
2929
2930 if ((ims & PCRE_CASELESS) != 0)
2931 {
2932 fc = md->lcc[fc];
2933 for (i = 1; i <= min; i++)
2934 {
2935 if (eptr >= md->end_subject)
2936 {
2937 SCHECK_PARTIAL();
2938 MRRETURN(MATCH_NOMATCH);
2939 }
2940 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2941 }
2942 if (min == max) continue;
2943 if (minimize)
2944 {
2945 for (fi = min;; fi++)
2946 {
2947 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2948 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2949 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2950 if (eptr >= md->end_subject)
2951 {
2952 SCHECK_PARTIAL();
2953 MRRETURN(MATCH_NOMATCH);
2954 }
2955 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2956 }
2957 /* Control never gets here */
2958 }
2959 else /* Maximize */
2960 {
2961 pp = eptr;
2962 for (i = min; i < max; i++)
2963 {
2964 if (eptr >= md->end_subject)
2965 {
2966 SCHECK_PARTIAL();
2967 break;
2968 }
2969 if (fc != md->lcc[*eptr]) break;
2970 eptr++;
2971 }
2972
2973 if (possessive) continue;
2974
2975 while (eptr >= pp)
2976 {
2977 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2978 eptr--;
2979 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2980 }
2981 MRRETURN(MATCH_NOMATCH);
2982 }
2983 /* Control never gets here */
2984 }
2985
2986 /* Caseful comparisons (includes all multi-byte characters) */
2987
2988 else
2989 {
2990 for (i = 1; i <= min; i++)
2991 {
2992 if (eptr >= md->end_subject)
2993 {
2994 SCHECK_PARTIAL();
2995 MRRETURN(MATCH_NOMATCH);
2996 }
2997 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
2998 }
2999
3000 if (min == max) continue;
3001
3002 if (minimize)
3003 {
3004 for (fi = min;; fi++)
3005 {
3006 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
3007 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3008 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3009 if (eptr >= md->end_subject)
3010 {
3011 SCHECK_PARTIAL();
3012 MRRETURN(MATCH_NOMATCH);
3013 }
3014 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3015 }
3016 /* Control never gets here */
3017 }
3018 else /* Maximize */
3019 {
3020 pp = eptr;
3021 for (i = min; i < max; i++)
3022 {
3023 if (eptr >= md->end_subject)
3024 {
3025 SCHECK_PARTIAL();
3026 break;
3027 }
3028 if (fc != *eptr) break;
3029 eptr++;
3030 }
3031 if (possessive) continue;
3032
3033 while (eptr >= pp)
3034 {
3035 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
3036 eptr--;
3037 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3038 }
3039 MRRETURN(MATCH_NOMATCH);
3040 }
3041 }
3042 /* Control never gets here */
3043
3044 /* Match a negated single one-byte character. The character we are
3045 checking can be multibyte. */
3046
3047 case OP_NOT:
3048 if (eptr >= md->end_subject)
3049 {
3050 SCHECK_PARTIAL();
3051 MRRETURN(MATCH_NOMATCH);
3052 }
3053 ecode++;
3054 GETCHARINCTEST(c, eptr);
3055 if ((ims & PCRE_CASELESS) != 0)
3056 {
3057 #ifdef SUPPORT_UTF8
3058 if (c < 256)
3059 #endif
3060 c = md->lcc[c];
3061 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3062 }
3063 else
3064 {
3065 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3066 }
3067 break;
3068
3069 /* Match a negated single one-byte character repeatedly. This is almost a
3070 repeat of the code for a repeated single character, but I haven't found a
3071 nice way of commoning these up that doesn't require a test of the
3072 positive/negative option for each character match. Maybe that wouldn't add
3073 very much to the time taken, but character matching *is* what this is all
3074 about... */
3075
3076 case OP_NOTEXACT:
3077 min = max = GET2(ecode, 1);
3078 ecode += 3;
3079 goto REPEATNOTCHAR;
3080
3081 case OP_NOTUPTO:
3082 case OP_NOTMINUPTO:
3083 min = 0;
3084 max = GET2(ecode, 1);
3085 minimize = *ecode == OP_NOTMINUPTO;
3086 ecode += 3;
3087 goto REPEATNOTCHAR;
3088
3089 case OP_NOTPOSSTAR:
3090 possessive = TRUE;
3091 min = 0;
3092 max = INT_MAX;
3093 ecode++;
3094 goto REPEATNOTCHAR;
3095
3096 case OP_NOTPOSPLUS:
3097 possessive = TRUE;
3098 min = 1;
3099 max = INT_MAX;
3100 ecode++;
3101 goto REPEATNOTCHAR;
3102
3103 case OP_NOTPOSQUERY:
3104 possessive = TRUE;
3105 min = 0;
3106 max = 1;
3107 ecode++;
3108 goto REPEATNOTCHAR;
3109
3110 case OP_NOTPOSUPTO:
3111 possessive = TRUE;
3112 min = 0;
3113 max = GET2(ecode, 1);
3114 ecode += 3;
3115 goto REPEATNOTCHAR;
3116
3117 case OP_NOTSTAR:
3118 case OP_NOTMINSTAR:
3119 case OP_NOTPLUS:
3120 case OP_NOTMINPLUS:
3121 case OP_NOTQUERY:
3122 case OP_NOTMINQUERY:
3123 c = *ecode++ - OP_NOTSTAR;
3124 minimize = (c & 1) != 0;
3125 min = rep_min[c]; /* Pick up values from tables; */
3126 max = rep_max[c]; /* zero for max => infinity */
3127 if (max == 0) max = INT_MAX;
3128
3129 /* Common code for all repeated single-byte matches. */
3130
3131 REPEATNOTCHAR:
3132 fc = *ecode++;
3133
3134 /* The code is duplicated for the caseless and caseful cases, for speed,
3135 since matching characters is likely to be quite common. First, ensure the
3136 minimum number of matches are present. If min = max, continue at the same
3137 level without recursing. Otherwise, if minimizing, keep trying the rest of
3138 the expression and advancing one matching character if failing, up to the
3139 maximum. Alternatively, if maximizing, find the maximum number of
3140 characters and work backwards. */
3141
3142 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3143 max, eptr));
3144
3145 if ((ims & PCRE_CASELESS) != 0)
3146 {
3147 fc = md->lcc[fc];
3148
3149 #ifdef SUPPORT_UTF8
3150 /* UTF-8 mode */
3151 if (utf8)
3152 {
3153 register unsigned int d;
3154 for (i = 1; i <= min; i++)
3155 {
3156 if (eptr >= md->end_subject)
3157 {
3158 SCHECK_PARTIAL();
3159 MRRETURN(MATCH_NOMATCH);
3160 }
3161 GETCHARINC(d, eptr);
3162 if (d < 256) d = md->lcc[d];
3163 if (fc == d) MRRETURN(MATCH_NOMATCH);
3164 }
3165 }
3166 else
3167 #endif
3168
3169 /* Not UTF-8 mode */
3170 {
3171 for (i = 1; i <= min; i++)
3172 {
3173 if (eptr >= md->end_subject)
3174 {
3175 SCHECK_PARTIAL();
3176 MRRETURN(MATCH_NOMATCH);
3177 }
3178 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3179 }
3180 }
3181
3182 if (min == max) continue;
3183
3184 if (minimize)
3185 {
3186 #ifdef SUPPORT_UTF8
3187 /* UTF-8 mode */
3188 if (utf8)
3189 {
3190 register unsigned int d;
3191 for (fi = min;; fi++)
3192 {
3193 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3194 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3195 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3196 if (eptr >= md->end_subject)
3197 {
3198 SCHECK_PARTIAL();
3199 MRRETURN(MATCH_NOMATCH);
3200 }
3201 GETCHARINC(d, eptr);
3202 if (d < 256) d = md->lcc[d];
3203 if (fc == d) MRRETURN(MATCH_NOMATCH);
3204 }
3205 }
3206 else
3207 #endif
3208 /* Not UTF-8 mode */
3209 {
3210 for (fi = min;; fi++)
3211 {
3212 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3213 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3214 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3215 if (eptr >= md->end_subject)
3216 {
3217 SCHECK_PARTIAL();
3218 MRRETURN(MATCH_NOMATCH);
3219 }
3220 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3221 }
3222 }
3223 /* Control never gets here */
3224 }
3225
3226 /* Maximize case */
3227
3228 else
3229 {
3230 pp = eptr;
3231
3232 #ifdef SUPPORT_UTF8
3233 /* UTF-8 mode */
3234 if (utf8)
3235 {
3236 register unsigned int d;
3237 for (i = min; i < max; i++)
3238 {
3239 int len = 1;
3240 if (eptr >= md->end_subject)
3241 {
3242 SCHECK_PARTIAL();
3243 break;
3244 }
3245 GETCHARLEN(d, eptr, len);
3246 if (d < 256) d = md->lcc[d];
3247 if (fc == d) break;
3248 eptr += len;
3249 }
3250 if (possessive) continue;
3251 for(;;)
3252 {
3253 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3254 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3255 if (eptr-- == pp) break; /* Stop if tried at original pos */
3256 BACKCHAR(eptr);
3257 }
3258 }
3259 else
3260 #endif
3261 /* Not UTF-8 mode */
3262 {
3263 for (i = min; i < max; i++)
3264 {
3265 if (eptr >= md->end_subject)
3266 {
3267 SCHECK_PARTIAL();
3268 break;
3269 }
3270 if (fc == md->lcc[*eptr]) break;
3271 eptr++;
3272 }
3273 if (possessive) continue;
3274 while (eptr >= pp)
3275 {
3276 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3277 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3278 eptr--;
3279 }
3280 }
3281
3282 MRRETURN(MATCH_NOMATCH);
3283 }
3284 /* Control never gets here */
3285 }
3286
3287 /* Caseful comparisons */
3288
3289 else
3290 {
3291 #ifdef SUPPORT_UTF8
3292 /* UTF-8 mode */
3293 if (utf8)
3294 {
3295 register unsigned int d;
3296 for (i = 1; i <= min; i++)
3297 {
3298 if (eptr >= md->end_subject)
3299 {
3300 SCHECK_PARTIAL();
3301 MRRETURN(MATCH_NOMATCH);
3302 }
3303 GETCHARINC(d, eptr);
3304 if (fc == d) MRRETURN(MATCH_NOMATCH);
3305 }
3306 }
3307 else
3308 #endif
3309 /* Not UTF-8 mode */
3310 {
3311 for (i = 1; i <= min; i++)
3312 {
3313 if (eptr >= md->end_subject)
3314 {
3315 SCHECK_PARTIAL();
3316 MRRETURN(MATCH_NOMATCH);
3317 }
3318 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3319 }
3320 }
3321
3322 if (min == max) continue;
3323
3324 if (minimize)
3325 {
3326 #ifdef SUPPORT_UTF8
3327 /* UTF-8 mode */
3328 if (utf8)
3329 {
3330 register unsigned int d;
3331 for (fi = min;; fi++)
3332 {
3333 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3334 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3335 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3336 if (eptr >= md->end_subject)
3337 {
3338 SCHECK_PARTIAL();
3339 MRRETURN(MATCH_NOMATCH);
3340 }
3341 GETCHARINC(d, eptr);
3342 if (fc == d) MRRETURN(MATCH_NOMATCH);
3343 }
3344 }
3345 else
3346 #endif
3347 /* Not UTF-8 mode */
3348 {
3349 for (fi = min;; fi++)
3350 {
3351 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3352 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3353 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3354 if (eptr >= md->end_subject)
3355 {
3356 SCHECK_PARTIAL();
3357 MRRETURN(MATCH_NOMATCH);
3358 }
3359 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3360 }
3361 }
3362 /* Control never gets here */
3363 }
3364
3365 /* Maximize case */
3366
3367 else
3368 {
3369 pp = eptr;
3370
3371 #ifdef SUPPORT_UTF8
3372 /* UTF-8 mode */
3373 if (utf8)
3374 {
3375 register unsigned int d;
3376 for (i = min; i < max; i++)
3377 {
3378 int len = 1;
3379 if (eptr >= md->end_subject)
3380 {
3381 SCHECK_PARTIAL();
3382 break;
3383 }
3384 GETCHARLEN(d, eptr, len);
3385 if (fc == d) break;
3386 eptr += len;
3387 }
3388 if (possessive) continue;
3389 for(;;)
3390 {
3391 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3392 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3393 if (eptr-- == pp) break; /* Stop if tried at original pos */
3394 BACKCHAR(eptr);
3395 }
3396 }
3397 else
3398 #endif
3399 /* Not UTF-8 mode */
3400 {
3401 for (i = min; i < max; i++)
3402 {
3403 if (eptr >= md->end_subject)
3404 {
3405 SCHECK_PARTIAL();
3406 break;
3407 }
3408 if (fc == *eptr) break;
3409 eptr++;
3410 }
3411 if (possessive) continue;
3412 while (eptr >= pp)
3413 {
3414 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3415 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3416 eptr--;
3417 }
3418 }
3419
3420 MRRETURN(MATCH_NOMATCH);
3421 }
3422 }
3423 /* Control never gets here */
3424
3425 /* Match a single character type repeatedly; several different opcodes
3426 share code. This is very similar to the code for single characters, but we
3427 repeat it in the interests of efficiency. */
3428
3429 case OP_TYPEEXACT:
3430 min = max = GET2(ecode, 1);
3431 minimize = TRUE;
3432 ecode += 3;
3433 goto REPEATTYPE;
3434
3435 case OP_TYPEUPTO:
3436 case OP_TYPEMINUPTO:
3437 min = 0;
3438 max = GET2(ecode, 1);
3439 minimize = *ecode == OP_TYPEMINUPTO;
3440 ecode += 3;
3441 goto REPEATTYPE;
3442
3443 case OP_TYPEPOSSTAR:
3444 possessive = TRUE;
3445 min = 0;
3446 max = INT_MAX;
3447 ecode++;
3448 goto REPEATTYPE;
3449
3450 case OP_TYPEPOSPLUS:
3451 possessive = TRUE;
3452 min = 1;
3453 max = INT_MAX;
3454 ecode++;
3455 goto REPEATTYPE;
3456
3457 case OP_TYPEPOSQUERY:
3458 possessive = TRUE;
3459 min = 0;
3460 max = 1;
3461 ecode++;
3462 goto REPEATTYPE;
3463
3464 case OP_TYPEPOSUPTO:
3465 possessive = TRUE;
3466 min = 0;
3467 max = GET2(ecode, 1);
3468 ecode += 3;
3469 goto REPEATTYPE;
3470
3471 case OP_TYPESTAR:
3472 case OP_TYPEMINSTAR:
3473 case OP_TYPEPLUS:
3474 case OP_TYPEMINPLUS:
3475 case OP_TYPEQUERY:
3476 case OP_TYPEMINQUERY:
3477 c = *ecode++ - OP_TYPESTAR;
3478 minimize = (c & 1) != 0;
3479 min = rep_min[c]; /* Pick up values from tables; */
3480 max = rep_max[c]; /* zero for max => infinity */
3481 if (max == 0) max = INT_MAX;
3482
3483 /* Common code for all repeated single character type matches. Note that
3484 in UTF-8 mode, '.' matches a character of any length, but for the other
3485 character types, the valid characters are all one-byte long. */
3486
3487 REPEATTYPE:
3488 ctype = *ecode++; /* Code for the character type */
3489
3490 #ifdef SUPPORT_UCP
3491 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3492 {
3493 prop_fail_result = ctype == OP_NOTPROP;
3494 prop_type = *ecode++;
3495 prop_value = *ecode++;
3496 }
3497 else prop_type = -1;
3498 #endif
3499
3500 /* First, ensure the minimum number of matches are present. Use inline
3501 code for maximizing the speed, and do the type test once at the start
3502 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3503 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3504 and single-bytes. */
3505
3506 if (min > 0)
3507 {
3508 #ifdef SUPPORT_UCP
3509 if (prop_type >= 0)
3510 {
3511 switch(prop_type)
3512 {
3513 case PT_ANY:
3514 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3515 for (i = 1; i <= min; i++)
3516 {
3517 if (eptr >= md->end_subject)
3518 {
3519 SCHECK_PARTIAL();
3520 MRRETURN(MATCH_NOMATCH);
3521 }
3522 GETCHARINCTEST(c, eptr);
3523 }
3524 break;
3525
3526 case PT_LAMP:
3527 for (i = 1; i <= min; i++)
3528 {
3529 if (eptr >= md->end_subject)
3530 {
3531 SCHECK_PARTIAL();
3532 MRRETURN(MATCH_NOMATCH);
3533 }
3534 GETCHARINCTEST(c, eptr);
3535 prop_chartype = UCD_CHARTYPE(c);
3536 if ((prop_chartype == ucp_Lu ||
3537 prop_chartype == ucp_Ll ||
3538 prop_chartype == ucp_Lt) == prop_fail_result)
3539 MRRETURN(MATCH_NOMATCH);
3540 }
3541 break;
3542
3543 case PT_GC:
3544 for (i = 1; i <= min; i++)
3545 {
3546 if (eptr >= md->end_subject)
3547 {
3548 SCHECK_PARTIAL();
3549 MRRETURN(MATCH_NOMATCH);
3550 }
3551 GETCHARINCTEST(c, eptr);
3552 prop_category = UCD_CATEGORY(c);
3553 if ((prop_category == prop_value) == prop_fail_result)
3554 MRRETURN(MATCH_NOMATCH);
3555 }
3556 break;
3557
3558 case PT_PC:
3559 for (i = 1; i <= min; i++)
3560 {
3561 if (eptr >= md->end_subject)
3562 {
3563 SCHECK_PARTIAL();
3564 MRRETURN(MATCH_NOMATCH);
3565 }
3566 GETCHARINCTEST(c, eptr);
3567 prop_chartype = UCD_CHARTYPE(c);
3568 if ((prop_chartype == prop_value) == prop_fail_result)
3569 MRRETURN(MATCH_NOMATCH);
3570 }
3571 break;
3572
3573 case PT_SC:
3574 for (i = 1; i <= min; i++)
3575 {
3576 if (eptr >= md->end_subject)
3577 {
3578 SCHECK_PARTIAL();
3579 MRRETURN(MATCH_NOMATCH);
3580 }
3581 GETCHARINCTEST(c, eptr);
3582 prop_script = UCD_SCRIPT(c);
3583 if ((prop_script == prop_value) == prop_fail_result)
3584 MRRETURN(MATCH_NOMATCH);
3585 }
3586 break;
3587
3588 case PT_ALNUM:
3589 for (i = 1; i <= min; i++)
3590 {
3591 if (eptr >= md->end_subject)
3592 {
3593 SCHECK_PARTIAL();
3594 MRRETURN(MATCH_NOMATCH);
3595 }
3596 GETCHARINCTEST(c, eptr);
3597 prop_category = UCD_CATEGORY(c);
3598 if ((prop_category == ucp_L || prop_category == ucp_N)
3599 == prop_fail_result)
3600 MRRETURN(MATCH_NOMATCH);
3601 }
3602 break;
3603
3604 case PT_SPACE: /* Perl space */
3605 for (i = 1; i <= min; i++)
3606 {
3607 if (eptr >= md->end_subject)
3608 {
3609 SCHECK_PARTIAL();
3610 MRRETURN(MATCH_NOMATCH);
3611 }
3612 GETCHARINCTEST(c, eptr);
3613 prop_category = UCD_CATEGORY(c);
3614 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3615 c == CHAR_FF || c == CHAR_CR)
3616 == prop_fail_result)
3617 MRRETURN(MATCH_NOMATCH);
3618 }
3619 break;
3620
3621 case PT_PXSPACE: /* POSIX space */
3622 for (i = 1; i <= min; i++)
3623 {
3624 if (eptr >= md->end_subject)
3625 {
3626 SCHECK_PARTIAL();
3627 MRRETURN(MATCH_NOMATCH);
3628 }
3629 GETCHARINCTEST(c, eptr);
3630 prop_category = UCD_CATEGORY(c);
3631 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3632 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3633 == prop_fail_result)
3634 MRRETURN(MATCH_NOMATCH);
3635 }
3636 break;
3637
3638 case PT_WORD:
3639 for (i = 1; i <= min; i++)
3640 {
3641 if (eptr >= md->end_subject)
3642 {
3643 SCHECK_PARTIAL();
3644 MRRETURN(MATCH_NOMATCH);
3645 }
3646 GETCHARINCTEST(c, eptr);
3647 prop_category = UCD_CATEGORY(c);
3648 if ((prop_category == ucp_L || prop_category == ucp_N ||
3649 c == CHAR_UNDERSCORE)
3650 == prop_fail_result)
3651 MRRETURN(MATCH_NOMATCH);
3652 }
3653 break;
3654
3655 /* This should not occur */
3656
3657 default:
3658 RRETURN(PCRE_ERROR_INTERNAL);
3659 }
3660 }
3661
3662 /* Match extended Unicode sequences. We will get here only if the
3663 support is in the binary; otherwise a compile-time error occurs. */
3664
3665 else if (ctype == OP_EXTUNI)
3666 {
3667 for (i = 1; i <= min; i++)
3668 {
3669 if (eptr >= md->end_subject)
3670 {
3671 SCHECK_PARTIAL();
3672 MRRETURN(MATCH_NOMATCH);
3673 }
3674 GETCHARINCTEST(c, eptr);
3675 prop_category = UCD_CATEGORY(c);
3676 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3677 while (eptr < md->end_subject)
3678 {
3679 int len = 1;
3680 if (!utf8) c = *eptr;
3681 else { GETCHARLEN(c, eptr, len); }
3682 prop_category = UCD_CATEGORY(c);
3683 if (prop_category != ucp_M) break;
3684 eptr += len;
3685 }
3686 }
3687 }
3688
3689 else
3690 #endif /* SUPPORT_UCP */
3691
3692 /* Handle all other cases when the coding is UTF-8 */
3693
3694 #ifdef SUPPORT_UTF8
3695 if (utf8) switch(ctype)
3696 {
3697 case OP_ANY:
3698 for (i = 1; i <= min; i++)
3699 {
3700 if (eptr >= md->end_subject)
3701 {
3702 SCHECK_PARTIAL();
3703 MRRETURN(MATCH_NOMATCH);
3704 }
3705 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3706 eptr++;
3707 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3708 }
3709 break;
3710
3711 case OP_ALLANY:
3712 for (i = 1; i <= min; i++)
3713 {
3714 if (eptr >= md->end_subject)
3715 {
3716 SCHECK_PARTIAL();
3717 MRRETURN(MATCH_NOMATCH);
3718 }
3719 eptr++;
3720 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3721 }
3722 break;
3723
3724 case OP_ANYBYTE:
3725 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3726 eptr += min;
3727 break;
3728
3729 case OP_ANYNL:
3730 for (i = 1; i <= min; i++)
3731 {
3732 if (eptr >= md->end_subject)
3733 {
3734 SCHECK_PARTIAL();
3735 MRRETURN(MATCH_NOMATCH);
3736 }
3737 GETCHARINC(c, eptr);
3738 switch(c)
3739 {
3740 default: MRRETURN(MATCH_NOMATCH);
3741 case 0x000d:
3742 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3743 break;
3744
3745 case 0x000a:
3746 break;
3747
3748 case 0x000b:
3749 case 0x000c:
3750 case 0x0085:
3751 case 0x2028:
3752 case 0x2029:
3753 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3754 break;
3755 }
3756 }
3757 break;
3758
3759 case OP_NOT_HSPACE:
3760 for (i = 1; i <= min; i++)
3761 {
3762 if (eptr >= md->end_subject)
3763 {
3764 SCHECK_PARTIAL();
3765 MRRETURN(MATCH_NOMATCH);
3766 }
3767 GETCHARINC(c, eptr);
3768 switch(c)
3769 {
3770 default: break;
3771 case 0x09: /* HT */
3772 case 0x20: /* SPACE */
3773 case 0xa0: /* NBSP */
3774 case 0x1680: /* OGHAM SPACE MARK */
3775 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3776 case 0x2000: /* EN QUAD */
3777 case 0x2001: /* EM QUAD */
3778 case 0x2002: /* EN SPACE */
3779 case 0x2003: /* EM SPACE */
3780 case 0x2004: /* THREE-PER-EM SPACE */
3781 case 0x2005: /* FOUR-PER-EM SPACE */
3782 case 0x2006: /* SIX-PER-EM SPACE */
3783 case 0x2007: /* FIGURE SPACE */
3784 case 0x2008: /* PUNCTUATION SPACE */
3785 case 0x2009: /* THIN SPACE */
3786 case 0x200A: /* HAIR SPACE */
3787 case 0x202f: /* NARROW NO-BREAK SPACE */
3788 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3789 case 0x3000: /* IDEOGRAPHIC SPACE */
3790 MRRETURN(MATCH_NOMATCH);
3791 }
3792 }
3793 break;
3794
3795 case OP_HSPACE:
3796 for (i = 1; i <= min; i++)
3797 {
3798 if (eptr >= md->end_subject)
3799 {
3800 SCHECK_PARTIAL();
3801 MRRETURN(MATCH_NOMATCH);
3802 }
3803 GETCHARINC(c, eptr);
3804 switch(c)
3805 {
3806 default: MRRETURN(MATCH_NOMATCH);
3807 case 0x09: /* HT */
3808 case 0x20: /* SPACE */
3809 case 0xa0: /* NBSP */
3810 case 0x1680: /* OGHAM SPACE MARK */
3811 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3812 case 0x2000: /* EN QUAD */
3813 case 0x2001: /* EM QUAD */
3814 case 0x2002: /* EN SPACE */
3815 case 0x2003: /* EM SPACE */
3816 case 0x2004: /* THREE-PER-EM SPACE */
3817 case 0x2005: /* FOUR-PER-EM SPACE */
3818 case 0x2006: /* SIX-PER-EM SPACE */
3819 case 0x2007: /* FIGURE SPACE */
3820 case 0x2008: /* PUNCTUATION SPACE */
3821 case 0x2009: /* THIN SPACE */
3822 case 0x200A: /* HAIR SPACE */
3823 case 0x202f: /* NARROW NO-BREAK SPACE */
3824 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3825 case 0x3000: /* IDEOGRAPHIC SPACE */
3826 break;
3827 }
3828 }
3829 break;
3830
3831 case OP_NOT_VSPACE:
3832 for (i = 1; i <= min; i++)
3833 {
3834 if (eptr >= md->end_subject)
3835 {
3836 SCHECK_PARTIAL();
3837 MRRETURN(MATCH_NOMATCH);
3838 }
3839 GETCHARINC(c, eptr);
3840 switch(c)
3841 {
3842 default: break;
3843 case 0x0a: /* LF */
3844 case 0x0b: /* VT */
3845 case 0x0c: /* FF */
3846 case 0x0d: /* CR */
3847 case 0x85: /* NEL */
3848 case 0x2028: /* LINE SEPARATOR */
3849 case 0x2029: /* PARAGRAPH SEPARATOR */
3850 MRRETURN(MATCH_NOMATCH);
3851 }
3852 }
3853 break;
3854
3855 case OP_VSPACE:
3856 for (i = 1; i <= min; i++)
3857 {
3858 if (eptr >= md->end_subject)
3859 {
3860 SCHECK_PARTIAL();
3861 MRRETURN(MATCH_NOMATCH);
3862 }
3863 GETCHARINC(c, eptr);
3864 switch(c)
3865 {
3866 default: MRRETURN(MATCH_NOMATCH);
3867 case 0x0a: /* LF */
3868 case 0x0b: /* VT */
3869 case 0x0c: /* FF */
3870 case 0x0d: /* CR */
3871 case 0x85: /* NEL */
3872 case 0x2028: /* LINE SEPARATOR */
3873 case 0x2029: /* PARAGRAPH SEPARATOR */
3874 break;
3875 }
3876 }
3877 break;
3878
3879 case OP_NOT_DIGIT:
3880 for (i = 1; i <= min; i++)
3881 {
3882 if (eptr >= md->end_subject)
3883 {
3884 SCHECK_PARTIAL();
3885 MRRETURN(MATCH_NOMATCH);
3886 }
3887 GETCHARINC(c, eptr);
3888 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3889 MRRETURN(MATCH_NOMATCH);
3890 }
3891 break;
3892
3893 case OP_DIGIT:
3894 for (i = 1; i <= min; i++)
3895 {
3896 if (eptr >= md->end_subject)
3897 {
3898 SCHECK_PARTIAL();
3899 MRRETURN(MATCH_NOMATCH);
3900 }
3901 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3902 MRRETURN(MATCH_NOMATCH);
3903 /* No need to skip more bytes - we know it's a 1-byte character */
3904 }
3905 break;
3906
3907 case OP_NOT_WHITESPACE:
3908 for (i = 1; i <= min; i++)
3909 {
3910 if (eptr >= md->end_subject)
3911 {
3912 SCHECK_PARTIAL();
3913 MRRETURN(MATCH_NOMATCH);
3914 }
3915 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3916 MRRETURN(MATCH_NOMATCH);
3917 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3918 }
3919 break;
3920
3921 case OP_WHITESPACE:
3922 for (i = 1; i <= min; i++)
3923 {
3924 if (eptr >= md->end_subject)
3925 {
3926 SCHECK_PARTIAL();
3927 MRRETURN(MATCH_NOMATCH);
3928 }
3929 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3930 MRRETURN(MATCH_NOMATCH);
3931 /* No need to skip more bytes - we know it's a 1-byte character */
3932 }
3933 break;
3934
3935 case OP_NOT_WORDCHAR:
3936 for (i = 1; i <= min; i++)
3937 {
3938 if (eptr >= md->end_subject)
3939 {
3940 SCHECK_PARTIAL();
3941 MRRETURN(MATCH_NOMATCH);
3942 }
3943 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3944 MRRETURN(MATCH_NOMATCH);
3945 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3946 }
3947 break;
3948
3949 case OP_WORDCHAR:
3950 for (i = 1; i <= min; i++)
3951 {
3952 if (eptr >= md->end_subject)
3953 {
3954 SCHECK_PARTIAL();
3955 MRRETURN(MATCH_NOMATCH);
3956 }
3957 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3958 MRRETURN(MATCH_NOMATCH);
3959 /* No need to skip more bytes - we know it's a 1-byte character */
3960 }
3961 break;
3962
3963 default:
3964 RRETURN(PCRE_ERROR_INTERNAL);
3965 } /* End switch(ctype) */
3966
3967 else
3968 #endif /* SUPPORT_UTF8 */
3969
3970 /* Code for the non-UTF-8 case for minimum matching of operators other
3971 than OP_PROP and OP_NOTPROP. */
3972
3973 switch(ctype)
3974 {
3975 case OP_ANY:
3976 for (i = 1; i <= min; i++)
3977 {
3978 if (eptr >= md->end_subject)
3979 {
3980 SCHECK_PARTIAL();
3981 MRRETURN(MATCH_NOMATCH);
3982 }
3983 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3984 eptr++;
3985 }
3986 break;
3987
3988 case OP_ALLANY:
3989 if (eptr > md->end_subject - min)
3990 {
3991 SCHECK_PARTIAL();
3992 MRRETURN(MATCH_NOMATCH);
3993 }
3994 eptr += min;
3995 break;
3996
3997 case OP_ANYBYTE:
3998 if (eptr > md->end_subject - min)
3999 {
4000 SCHECK_PARTIAL();
4001 MRRETURN(MATCH_NOMATCH);
4002 }
4003 eptr += min;
4004 break;
4005
4006 case OP_ANYNL:
4007 for (i = 1; i <= min; i++)
4008 {
4009 if (eptr >= md->end_subject)
4010 {
4011 SCHECK_PARTIAL();
4012 MRRETURN(MATCH_NOMATCH);
4013 }
4014 switch(*eptr++)
4015 {
4016 default: MRRETURN(MATCH_NOMATCH);
4017 case 0x000d:
4018 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4019 break;
4020 case 0x000a:
4021 break;
4022
4023 case 0x000b:
4024 case 0x000c:
4025 case 0x0085:
4026 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4027 break;
4028 }
4029 }
4030 break;
4031
4032 case OP_NOT_HSPACE:
4033 for (i = 1; i <= min; i++)
4034 {
4035 if (eptr >= md->end_subject)
4036 {
4037 SCHECK_PARTIAL();
4038 MRRETURN(MATCH_NOMATCH);
4039 }
4040 switch(*eptr++)
4041 {
4042 default: break;
4043 case 0x09: /* HT */
4044 case 0x20: /* SPACE */
4045 case 0xa0: /* NBSP */
4046 MRRETURN(MATCH_NOMATCH);
4047 }
4048 }
4049 break;
4050
4051 case OP_HSPACE:
4052 for (i = 1; i <= min; i++)
4053 {
4054 if (eptr >= md->end_subject)
4055 {
4056 SCHECK_PARTIAL();
4057 MRRETURN(MATCH_NOMATCH);
4058 }
4059 switch(*eptr++)
4060 {
4061 default: MRRETURN(MATCH_NOMATCH);
4062 case 0x09: /* HT */
4063 case 0x20: /* SPACE */
4064 case 0xa0: /* NBSP */
4065 break;
4066 }
4067 }
4068 break;
4069
4070 case OP_NOT_VSPACE:
4071 for (i = 1; i <= min; i++)
4072 {
4073 if (eptr >= md->end_subject)
4074 {
4075 SCHECK_PARTIAL();
4076 MRRETURN(MATCH_NOMATCH);
4077 }
4078 switch(*eptr++)
4079 {
4080 default: break;
4081 case 0x0a: /* LF */
4082 case 0x0b: /* VT */
4083 case 0x0c: /* FF */
4084 case 0x0d: /* CR */
4085 case 0x85: /* NEL */
4086 MRRETURN(MATCH_NOMATCH);
4087 }
4088 }
4089 break;
4090
4091 case OP_VSPACE:
4092 for (i = 1; i <= min; i++)
4093 {
4094 if (eptr >= md->end_subject)
4095 {
4096 SCHECK_PARTIAL();
4097 MRRETURN(MATCH_NOMATCH);
4098 }
4099 switch(*eptr++)
4100 {
4101 default: MRRETURN(MATCH_NOMATCH);
4102 case 0x0a: /* LF */
4103 case 0x0b: /* VT */
4104 case 0x0c: /* FF */
4105 case 0x0d: /* CR */
4106 case 0x85: /* NEL */
4107 break;
4108 }
4109 }
4110 break;
4111
4112 case OP_NOT_DIGIT:
4113 for (i = 1; i <= min; i++)
4114 {
4115 if (eptr >= md->end_subject)
4116 {
4117 SCHECK_PARTIAL();
4118 MRRETURN(MATCH_NOMATCH);
4119 }
4120 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4121 }
4122 break;
4123
4124 case OP_DIGIT:
4125 for (i = 1; i <= min; i++)
4126 {
4127 if (eptr >= md->end_subject)
4128 {
4129 SCHECK_PARTIAL();
4130 MRRETURN(MATCH_NOMATCH);
4131 }
4132 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4133 }
4134 break;
4135
4136 case OP_NOT_WHITESPACE:
4137 for (i = 1; i <= min; i++)
4138 {
4139 if (eptr >= md->end_subject)
4140 {
4141 SCHECK_PARTIAL();
4142 MRRETURN(MATCH_NOMATCH);
4143 }
4144 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4145 }
4146 break;
4147
4148 case OP_WHITESPACE:
4149 for (i = 1; i <= min; i++)
4150 {
4151 if (eptr >= md->end_subject)
4152 {
4153 SCHECK_PARTIAL();
4154 MRRETURN(MATCH_NOMATCH);
4155 }
4156 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4157 }
4158 break;
4159
4160 case OP_NOT_WORDCHAR:
4161 for (i = 1; i <= min; i++)
4162 {
4163 if (eptr >= md->end_subject)
4164 {
4165 SCHECK_PARTIAL();
4166 MRRETURN(MATCH_NOMATCH);
4167 }
4168 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4169 MRRETURN(MATCH_NOMATCH);
4170 }
4171 break;
4172
4173 case OP_WORDCHAR:
4174 for (i = 1; i <= min; i++)
4175 {
4176 if (eptr >= md->end_subject)
4177 {
4178 SCHECK_PARTIAL();
4179 MRRETURN(MATCH_NOMATCH);
4180 }
4181 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4182 MRRETURN(MATCH_NOMATCH);
4183 }
4184 break;
4185
4186 default:
4187 RRETURN(PCRE_ERROR_INTERNAL);
4188 }
4189 }
4190
4191 /* If min = max, continue at the same level without recursing */
4192
4193 if (min == max) continue;
4194
4195 /* If minimizing, we have to test the rest of the pattern before each
4196 subsequent match. Again, separate the UTF-8 case for speed, and also
4197 separate the UCP cases. */
4198
4199 if (minimize)
4200 {
4201 #ifdef SUPPORT_UCP
4202 if (prop_type >= 0)
4203 {
4204 switch(prop_type)
4205 {
4206 case PT_ANY:
4207 for (fi = min;; fi++)
4208 {
4209 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
4210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4211 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4212 if (eptr >= md->end_subject)
4213 {
4214 SCHECK_PARTIAL();
4215 MRRETURN(MATCH_NOMATCH);
4216 }
4217 GETCHARINCTEST(c, eptr);
4218 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4219 }
4220 /* Control never gets here */
4221
4222 case PT_LAMP:
4223 for (fi = min;; fi++)
4224 {
4225 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
4226 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4227 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4228 if (eptr >= md->end_subject)
4229 {
4230 SCHECK_PARTIAL();
4231 MRRETURN(MATCH_NOMATCH);
4232 }
4233 GETCHARINCTEST(c, eptr);
4234 prop_chartype = UCD_CHARTYPE(c);
4235 if ((prop_chartype == ucp_Lu ||
4236 prop_chartype == ucp_Ll ||
4237 prop_chartype == ucp_Lt) == prop_fail_result)
4238 MRRETURN(MATCH_NOMATCH);
4239 }
4240 /* Control never gets here */
4241
4242 case PT_GC:
4243 for (fi = min;; fi++)
4244 {
4245 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4246 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4247 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4248 if (eptr >= md->end_subject)
4249 {
4250 SCHECK_PARTIAL();
4251 MRRETURN(MATCH_NOMATCH);
4252 }
4253 GETCHARINCTEST(c, eptr);
4254 prop_category = UCD_CATEGORY(c);
4255 if ((prop_category == prop_value) == prop_fail_result)
4256 MRRETURN(MATCH_NOMATCH);
4257 }
4258 /* Control never gets here */
4259
4260 case PT_PC:
4261 for (fi = min;; fi++)
4262 {
4263 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4264 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4265 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4266 if (eptr >= md->end_subject)
4267 {
4268 SCHECK_PARTIAL();
4269 MRRETURN(MATCH_NOMATCH);
4270 }
4271 GETCHARINCTEST(c, eptr);
4272 prop_chartype = UCD_CHARTYPE(c);
4273 if ((prop_chartype == prop_value) == prop_fail_result)
4274 MRRETURN(MATCH_NOMATCH);
4275 }
4276 /* Control never gets here */
4277
4278 case PT_SC:
4279 for (fi = min;; fi++)
4280 {
4281 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4282 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4283 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4284 if (eptr >= md->end_subject)
4285 {
4286 SCHECK_PARTIAL();
4287 MRRETURN(MATCH_NOMATCH);
4288 }
4289 GETCHARINCTEST(c, eptr);
4290 prop_script = UCD_SCRIPT(c);
4291 if ((prop_script == prop_value) == prop_fail_result)
4292 MRRETURN(MATCH_NOMATCH);
4293 }
4294 /* Control never gets here */
4295
4296 case PT_ALNUM:
4297 for (fi = min;; fi++)
4298 {
4299 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM59);
4300 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4301 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4302 if (eptr >= md->end_subject)
4303 {
4304 SCHECK_PARTIAL();
4305 MRRETURN(MATCH_NOMATCH);
4306 }
4307 GETCHARINCTEST(c, eptr);
4308 prop_category = UCD_CATEGORY(c);
4309 if ((prop_category == ucp_L || prop_category == ucp_N)
4310 == prop_fail_result)
4311 MRRETURN(MATCH_NOMATCH);
4312 }
4313 /* Control never gets here */
4314
4315 case PT_SPACE: /* Perl space */
4316 for (fi = min;; fi++)
4317 {
4318 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM60);
4319 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4320 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4321 if (eptr >= md->end_subject)
4322 {
4323 SCHECK_PARTIAL();
4324 MRRETURN(MATCH_NOMATCH);
4325 }
4326 GETCHARINCTEST(c, eptr);
4327 prop_category = UCD_CATEGORY(c);
4328 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4329 c == CHAR_FF || c == CHAR_CR)
4330 == prop_fail_result)
4331 MRRETURN(MATCH_NOMATCH);
4332 }
4333 /* Control never gets here */
4334
4335 case PT_PXSPACE: /* POSIX space */
4336 for (fi = min;; fi++)
4337 {
4338 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM61);
4339 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4340 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4341 if (eptr >= md->end_subject)
4342 {
4343 SCHECK_PARTIAL();
4344 MRRETURN(MATCH_NOMATCH);
4345 }
4346 GETCHARINCTEST(c, eptr);
4347 prop_category = UCD_CATEGORY(c);
4348 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4349 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4350 == prop_fail_result)
4351 MRRETURN(MATCH_NOMATCH);
4352 }
4353 /* Control never gets here */
4354
4355 case PT_WORD:
4356 for (fi = min;; fi++)
4357 {
4358 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM62);
4359 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4360 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4361 if (eptr >= md->end_subject)
4362 {
4363 SCHECK_PARTIAL();
4364 MRRETURN(MATCH_NOMATCH);
4365 }
4366 GETCHARINCTEST(c, eptr);
4367 prop_category = UCD_CATEGORY(c);
4368 if ((prop_category == ucp_L ||
4369 prop_category == ucp_N ||
4370 c == CHAR_UNDERSCORE)
4371 == prop_fail_result)
4372 MRRETURN(MATCH_NOMATCH);
4373 }
4374 /* Control never gets here */
4375
4376 /* This should never occur */
4377
4378 default:
4379 RRETURN(PCRE_ERROR_INTERNAL);
4380 }
4381 }
4382
4383 /* Match extended Unicode sequences. We will get here only if the
4384 support is in the binary; otherwise a compile-time error occurs. */
4385
4386 else if (ctype == OP_EXTUNI)
4387 {
4388 for (fi = min;; fi++)
4389 {
4390 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4391 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4392 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4393 if (eptr >= md->end_subject)
4394 {
4395 SCHECK_PARTIAL();
4396 MRRETURN(MATCH_NOMATCH);
4397 }
4398 GETCHARINCTEST(c, eptr);
4399 prop_category = UCD_CATEGORY(c);
4400 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4401 while (eptr < md->end_subject)
4402 {
4403 int len = 1;
4404 if (!utf8) c = *eptr;
4405 else { GETCHARLEN(c, eptr, len); }
4406 prop_category = UCD_CATEGORY(c);
4407 if (prop_category != ucp_M) break;
4408 eptr += len;
4409 }
4410 }
4411 }
4412
4413 else
4414 #endif /* SUPPORT_UCP */
4415
4416 #ifdef SUPPORT_UTF8
4417 /* UTF-8 mode */
4418 if (utf8)
4419 {
4420 for (fi = min;; fi++)
4421 {
4422 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4423 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4424 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4425 if (eptr >= md->end_subject)
4426 {
4427 SCHECK_PARTIAL();
4428 MRRETURN(MATCH_NOMATCH);
4429 }
4430 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4431 MRRETURN(MATCH_NOMATCH);
4432 GETCHARINC(c, eptr);
4433 switch(ctype)
4434 {
4435 case OP_ANY: /* This is the non-NL case */
4436 case OP_ALLANY:
4437 case OP_ANYBYTE:
4438 break;
4439
4440 case OP_ANYNL:
4441 switch(c)
4442 {
4443 default: MRRETURN(MATCH_NOMATCH);
4444 case 0x000d:
4445 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4446 break;
4447 case 0x000a:
4448 break;
4449
4450 case 0x000b:
4451 case 0x000c:
4452 case 0x0085:
4453 case 0x2028:
4454 case 0x2029:
4455 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4456 break;
4457 }
4458 break;
4459
4460 case OP_NOT_HSPACE:
4461 switch(c)
4462 {
4463 default: break;
4464 case 0x09: /* HT */
4465 case 0x20: /* SPACE */
4466 case 0xa0: /* NBSP */
4467 case 0x1680: /* OGHAM SPACE MARK */
4468 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4469 case 0x2000: /* EN QUAD */
4470 case 0x2001: /* EM QUAD */
4471 case 0x2002: /* EN SPACE */
4472 case 0x2003: /* EM SPACE */
4473 case 0x2004: /* THREE-PER-EM SPACE */
4474 case 0x2005: /* FOUR-PER-EM SPACE */
4475 case 0x2006: /* SIX-PER-EM SPACE */
4476 case 0x2007: /* FIGURE SPACE */
4477 case 0x2008: /* PUNCTUATION SPACE */
4478 case 0x2009: /* THIN SPACE */
4479 case 0x200A: /* HAIR SPACE */
4480 case 0x202f: /* NARROW NO-BREAK SPACE */
4481 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4482 case 0x3000: /* IDEOGRAPHIC SPACE */
4483 MRRETURN(MATCH_NOMATCH);
4484 }
4485 break;
4486
4487 case OP_HSPACE:
4488 switch(c)
4489 {
4490 default: MRRETURN(MATCH_NOMATCH);
4491 case 0x09: /* HT */
4492 case 0x20: /* SPACE */
4493 case 0xa0: /* NBSP */
4494 case 0x1680: /* OGHAM SPACE MARK */
4495 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4496 case 0x2000: /* EN QUAD */
4497 case 0x2001: /* EM QUAD */
4498 case 0x2002: /* EN SPACE */
4499 case 0x2003: /* EM SPACE */
4500 case 0x2004: /* THREE-PER-EM SPACE */
4501 case 0x2005: /* FOUR-PER-EM SPACE */
4502 case 0x2006: /* SIX-PER-EM SPACE */
4503 case 0x2007: /* FIGURE SPACE */
4504 case 0x2008: /* PUNCTUATION SPACE */
4505 case 0x2009: /* THIN SPACE */
4506 case 0x200A: /* HAIR SPACE */
4507 case 0x202f: /* NARROW NO-BREAK SPACE */
4508 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4509 case 0x3000: /* IDEOGRAPHIC SPACE */
4510 break;
4511 }
4512 break;
4513
4514 case OP_NOT_VSPACE:
4515 switch(c)
4516 {
4517 default: break;
4518 case 0x0a: /* LF */
4519 case 0x0b: /* VT */
4520 case 0x0c: /* FF */
4521 case 0x0d: /* CR */
4522 case 0x85: /* NEL */
4523 case 0x2028: /* LINE SEPARATOR */
4524 case 0x2029: /* PARAGRAPH SEPARATOR */
4525 MRRETURN(MATCH_NOMATCH);
4526 }
4527 break;
4528
4529 case OP_VSPACE:
4530 switch(c)
4531 {
4532 default: MRRETURN(MATCH_NOMATCH);
4533 case 0x0a: /* LF */
4534 case 0x0b: /* VT */
4535 case 0x0c: /* FF */
4536 case 0x0d: /* CR */
4537 case 0x85: /* NEL */
4538 case 0x2028: /* LINE SEPARATOR */
4539 case 0x2029: /* PARAGRAPH SEPARATOR */
4540 break;
4541 }
4542 break;
4543
4544 case OP_NOT_DIGIT:
4545 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4546 MRRETURN(MATCH_NOMATCH);
4547 break;
4548
4549 case OP_DIGIT:
4550 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4551 MRRETURN(MATCH_NOMATCH);
4552 break;
4553
4554 case OP_NOT_WHITESPACE:
4555 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4556 MRRETURN(MATCH_NOMATCH);
4557 break;
4558
4559 case OP_WHITESPACE:
4560 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4561 MRRETURN(MATCH_NOMATCH);
4562 break;
4563
4564 case OP_NOT_WORDCHAR:
4565 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4566 MRRETURN(MATCH_NOMATCH);
4567 break;
4568
4569 case OP_WORDCHAR:
4570 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4571 MRRETURN(MATCH_NOMATCH);
4572 break;
4573
4574 default:
4575 RRETURN(PCRE_ERROR_INTERNAL);
4576 }
4577 }
4578 }
4579 else
4580 #endif
4581 /* Not UTF-8 mode */
4582 {
4583 for (fi = min;; fi++)
4584 {
4585 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4586 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4587 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4588 if (eptr >= md->end_subject)
4589 {
4590 SCHECK_PARTIAL();
4591 MRRETURN(MATCH_NOMATCH);
4592 }
4593 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4594 MRRETURN(MATCH_NOMATCH);
4595 c = *eptr++;
4596 switch(ctype)
4597 {
4598 case OP_ANY: /* This is the non-NL case */
4599 case OP_ALLANY:
4600 case OP_ANYBYTE:
4601 break;
4602
4603 case OP_ANYNL:
4604 switch(c)
4605 {
4606 default: MRRETURN(MATCH_NOMATCH);
4607 case 0x000d:
4608 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4609 break;
4610
4611 case 0x000a:
4612 break;
4613
4614 case 0x000b:
4615 case 0x000c:
4616 case 0x0085:
4617 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4618 break;
4619 }
4620 break;
4621
4622 case OP_NOT_HSPACE:
4623 switch(c)
4624 {
4625 default: break;
4626 case 0x09: /* HT */
4627 case 0x20: /* SPACE */
4628 case 0xa0: /* NBSP */
4629 MRRETURN(MATCH_NOMATCH);
4630 }
4631 break;
4632
4633 case OP_HSPACE:
4634 switch(c)
4635 {
4636 default: MRRETURN(MATCH_NOMATCH);
4637 case 0x09: /* HT */
4638 case 0x20: /* SPACE */
4639 case 0xa0: /* NBSP */
4640 break;
4641 }
4642 break;
4643
4644 case OP_NOT_VSPACE:
4645 switch(c)
4646 {
4647 default: break;
4648 case 0x0a: /* LF */
4649 case 0x0b: /* VT */
4650 case 0x0c: /* FF */
4651 case 0x0d: /* CR */
4652 case 0x85: /* NEL */
4653 MRRETURN(MATCH_NOMATCH);
4654 }
4655 break;
4656
4657 case OP_VSPACE:
4658 switch(c)
4659 {
4660 default: MRRETURN(MATCH_NOMATCH);
4661 case 0x0a: /* LF */
4662 case 0x0b: /* VT */
4663 case 0x0c: /* FF */
4664 case 0x0d: /* CR */
4665 case 0x85: /* NEL */
4666 break;
4667 }
4668 break;
4669
4670 case OP_NOT_DIGIT:
4671 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4672 break;
4673
4674 case OP_DIGIT:
4675 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4676 break;
4677
4678 case OP_NOT_WHITESPACE:
4679 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4680 break;
4681
4682 case OP_WHITESPACE:
4683 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4684 break;
4685
4686 case OP_NOT_WORDCHAR:
4687 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4688 break;
4689
4690 case OP_WORDCHAR:
4691 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4692 break;
4693
4694 default:
4695 RRETURN(PCRE_ERROR_INTERNAL);
4696 }
4697 }
4698 }
4699 /* Control never gets here */
4700 }
4701
4702 /* If maximizing, it is worth using inline code for speed, doing the type
4703 test once at the start (i.e. keep it out of the loop). Again, keep the
4704 UTF-8 and UCP stuff separate. */
4705
4706 else
4707 {
4708 pp = eptr; /* Remember where we started */
4709
4710 #ifdef SUPPORT_UCP
4711 if (prop_type >= 0)
4712 {
4713 switch(prop_type)
4714 {
4715 case PT_ANY:
4716 for (i = min; i < max; i++)
4717 {
4718 int len = 1;
4719 if (eptr >= md->end_subject)
4720 {
4721 SCHECK_PARTIAL();
4722 break;
4723 }
4724 GETCHARLENTEST(c, eptr, len);
4725 if (prop_fail_result) break;
4726 eptr+= len;
4727 }
4728 break;
4729
4730 case PT_LAMP:
4731 for (i = min; i < max; i++)
4732 {
4733 int len = 1;
4734 if (eptr >= md->end_subject)
4735 {
4736 SCHECK_PARTIAL();
4737 break;
4738 }
4739 GETCHARLENTEST(c, eptr, len);
4740 prop_chartype = UCD_CHARTYPE(c);
4741 if ((prop_chartype == ucp_Lu ||
4742 prop_chartype == ucp_Ll ||
4743 prop_chartype == ucp_Lt) == prop_fail_result)
4744 break;
4745 eptr+= len;
4746 }
4747 break;
4748
4749 case PT_GC:
4750 for (i = min; i < max; i++)
4751 {
4752 int len = 1;
4753 if (eptr >= md->end_subject)
4754 {
4755 SCHECK_PARTIAL();
4756 break;
4757 }
4758 GETCHARLENTEST(c, eptr, len);
4759 prop_category = UCD_CATEGORY(c);
4760 if ((prop_category == prop_value) == prop_fail_result)
4761 break;
4762 eptr+= len;
4763 }
4764 break;
4765
4766 case PT_PC:
4767 for (i = min; i < max; i++)
4768 {
4769 int len = 1;
4770 if (eptr >= md->end_subject)
4771 {
4772 SCHECK_PARTIAL();
4773 break;
4774 }
4775 GETCHARLENTEST(c, eptr, len);
4776 prop_chartype = UCD_CHARTYPE(c);
4777 if ((prop_chartype == prop_value) == prop_fail_result)
4778 break;
4779 eptr+= len;
4780 }
4781 break;
4782
4783 case PT_SC:
4784 for (i = min; i < max; i++)
4785 {
4786 int len = 1;
4787 if (eptr >= md->end_subject)
4788 {
4789 SCHECK_PARTIAL();
4790 break;
4791 }
4792 GETCHARLENTEST(c, eptr, len);
4793 prop_script = UCD_SCRIPT(c);
4794 if ((prop_script == prop_value) == prop_fail_result)
4795 break;
4796 eptr+= len;
4797 }
4798 break;
4799
4800 case PT_ALNUM:
4801 for (i = min; i < max; i++)
4802 {
4803 int len = 1;
4804 if (eptr >= md->end_subject)
4805 {
4806 SCHECK_PARTIAL();
4807 break;
4808 }
4809 GETCHARLENTEST(c, eptr, len);
4810 prop_category = UCD_CATEGORY(c);
4811 if ((prop_category == ucp_L || prop_category == ucp_N)
4812 == prop_fail_result)
4813 break;
4814 eptr+= len;
4815 }
4816 break;
4817
4818 case PT_SPACE: /* Perl space */
4819 for (i = min; i < max; i++)
4820 {
4821 int len = 1;
4822 if (eptr >= md->end_subject)
4823 {
4824 SCHECK_PARTIAL();
4825 break;
4826 }
4827 GETCHARLENTEST(c, eptr, len);
4828 prop_category = UCD_CATEGORY(c);
4829 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4830 c == CHAR_FF || c == CHAR_CR)
4831 == prop_fail_result)
4832 break;
4833 eptr+= len;
4834 }
4835 break;
4836
4837 case PT_PXSPACE: /* POSIX space */
4838 for (i = min; i < max; i++)
4839 {
4840 int len = 1;
4841 if (eptr >= md->end_subject)
4842 {
4843 SCHECK_PARTIAL();
4844 break;
4845 }
4846 GETCHARLENTEST(c, eptr, len);
4847 prop_category = UCD_CATEGORY(c);
4848 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4849 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4850 == prop_fail_result)
4851 break;
4852 eptr+= len;
4853 }
4854 break;
4855
4856 case PT_WORD:
4857 for (i = min; i < max; i++)
4858 {
4859 int len = 1;
4860 if (eptr >= md->end_subject)
4861 {
4862 SCHECK_PARTIAL();
4863 break;
4864 }
4865 GETCHARLENTEST(c, eptr, len);
4866 prop_category = UCD_CATEGORY(c);
4867 if ((prop_category == ucp_L || prop_category == ucp_N ||
4868 c == CHAR_UNDERSCORE) == prop_fail_result)
4869 break;
4870 eptr+= len;
4871 }
4872 break;
4873
4874 default:
4875 RRETURN(PCRE_ERROR_INTERNAL);
4876 }
4877
4878 /* eptr is now past the end of the maximum run */
4879
4880 if (possessive) continue;
4881 for(;;)
4882 {
4883 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4884 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4885 if (eptr-- == pp) break; /* Stop if tried at original pos */
4886 if (utf8) BACKCHAR(eptr);
4887 }
4888 }
4889
4890 /* Match extended Unicode sequences. We will get here only if the
4891 support is in the binary; otherwise a compile-time error occurs. */
4892
4893 else if (ctype == OP_EXTUNI)
4894 {
4895 for (i = min; i < max; i++)
4896 {
4897 if (eptr >= md->end_subject)
4898 {
4899 SCHECK_PARTIAL();
4900 break;
4901 }
4902 GETCHARINCTEST(c, eptr);
4903 prop_category = UCD_CATEGORY(c);
4904 if (prop_category == ucp_M) break;
4905 while (eptr < md->end_subject)
4906 {
4907 int len = 1;
4908 if (!utf8) c = *eptr; else
4909 {
4910 GETCHARLEN(c, eptr, len);
4911 }
4912 prop_category = UCD_CATEGORY(c);
4913 if (prop_category != ucp_M) break;
4914 eptr += len;
4915 }
4916 }
4917
4918 /* eptr is now past the end of the maximum run */
4919
4920 if (possessive) continue;
4921
4922 for(;;)
4923 {
4924 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4925 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4926 if (eptr-- == pp) break; /* Stop if tried at original pos */
4927 for (;;) /* Move back over one extended */
4928 {
4929 int len = 1;
4930 if (!utf8) c = *eptr; else
4931 {
4932 BACKCHAR(eptr);
4933 GETCHARLEN(c, eptr, len);
4934 }
4935 prop_category = UCD_CATEGORY(c);
4936 if (prop_category != ucp_M) break;
4937 eptr--;
4938 }
4939 }
4940 }
4941
4942 else
4943 #endif /* SUPPORT_UCP */
4944
4945 #ifdef SUPPORT_UTF8
4946 /* UTF-8 mode */
4947
4948 if (utf8)
4949 {
4950 switch(ctype)
4951 {
4952 case OP_ANY:
4953 if (max < INT_MAX)
4954 {
4955 for (i = min; i < max; i++)
4956 {
4957 if (eptr >= md->end_subject)
4958 {
4959 SCHECK_PARTIAL();
4960 break;
4961 }
4962 if (IS_NEWLINE(eptr)) break;
4963 eptr++;
4964 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4965 }
4966 }
4967
4968 /* Handle unlimited UTF-8 repeat */
4969
4970 else
4971 {
4972 for (i = min; i < max; i++)
4973 {
4974 if (eptr >= md->end_subject)
4975 {
4976 SCHECK_PARTIAL();
4977 break;
4978 }
4979 if (IS_NEWLINE(eptr)) break;
4980 eptr++;
4981 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4982 }
4983 }
4984 break;
4985
4986 case OP_ALLANY:
4987 if (max < INT_MAX)
4988 {
4989 for (i = min; i < max; i++)
4990 {
4991 if (eptr >= md->end_subject)
4992 {
4993 SCHECK_PARTIAL();
4994 break;
4995 }
4996 eptr++;
4997 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4998 }
4999 }
5000 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5001 break;
5002
5003 /* The byte case is the same as non-UTF8 */
5004
5005 case OP_ANYBYTE:
5006 c = max - min;
5007 if (c > (unsigned int)(md->end_subject - eptr))
5008 {
5009 eptr = md->end_subject;
5010 SCHECK_PARTIAL();
5011 }
5012 else eptr += c;
5013 break;
5014
5015 case OP_ANYNL:
5016 for (i = min; i < max; i++)
5017 {
5018 int len = 1;
5019 if (eptr >= md->end_subject)
5020 {
5021 SCHECK_PARTIAL();
5022 break;
5023 }
5024 GETCHARLEN(c, eptr, len);
5025 if (c == 0x000d)
5026 {
5027 if (++eptr >= md->end_subject) break;
5028 if (*eptr == 0x000a) eptr++;
5029 }
5030 else
5031 {
5032 if (c != 0x000a &&
5033 (md->bsr_anycrlf ||
5034 (c != 0x000b && c != 0x000c &&
5035 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5036 break;
5037 eptr += len;
5038 }
5039 }
5040 break;
5041
5042 case OP_NOT_HSPACE:
5043 case OP_HSPACE:
5044 for (i = min; i < max; i++)
5045 {
5046 BOOL gotspace;
5047 int len = 1;
5048 if (eptr >= md->end_subject)
5049 {
5050 SCHECK_PARTIAL();
5051 break;
5052 }
5053 GETCHARLEN(c, eptr, len);
5054 switch(c)
5055 {
5056 default: gotspace = FALSE; break;
5057 case 0x09: /* HT */
5058 case 0x20: /* SPACE */
5059 case 0xa0: /* NBSP */
5060 case 0x1680: /* OGHAM SPACE MARK */
5061 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5062 case 0x2000: /* EN QUAD */
5063 case 0x2001: /* EM QUAD */
5064 case 0x2002: /* EN SPACE */
5065 case 0x2003: /* EM SPACE */
5066 case 0x2004: /* THREE-PER-EM SPACE */
5067 case 0x2005: /* FOUR-PER-EM SPACE */
5068 case 0x2006: /* SIX-PER-EM SPACE */
5069 case 0x2007: /* FIGURE SPACE */
5070 case 0x2008: /* PUNCTUATION SPACE */
5071 case 0x2009: /* THIN SPACE */
5072 case 0x200A: /* HAIR SPACE */
5073 case 0x202f: /* NARROW NO-BREAK SPACE */
5074 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5075 case 0x3000: /* IDEOGRAPHIC SPACE */
5076 gotspace = TRUE;
5077 break;
5078 }
5079 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5080 eptr += len;
5081 }
5082 break;
5083
5084 case OP_NOT_VSPACE:
5085 case OP_VSPACE:
5086 for (i = min; i < max; i++)
5087 {
5088 BOOL gotspace;
5089 int len = 1;
5090 if (eptr >= md->end_subject)
5091 {
5092 SCHECK_PARTIAL();
5093 break;
5094 }
5095 GETCHARLEN(c, eptr, len);
5096 switch(c)
5097 {
5098 default: gotspace = FALSE; break;
5099 case 0x0a: /* LF */
5100 case 0x0b: /* VT */
5101 case 0x0c: /* FF */
5102 case 0x0d: /* CR */
5103 case 0x85: /* NEL */
5104 case 0x2028: /* LINE SEPARATOR */
5105 case 0x2029: /* PARAGRAPH SEPARATOR */
5106 gotspace = TRUE;
5107 break;
5108 }
5109 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5110 eptr += len;
5111 }
5112 break;
5113
5114 case OP_NOT_DIGIT:
5115 for (i = min; i < max; i++)
5116 {
5117 int len = 1;
5118 if (eptr >= md->end_subject)
5119 {
5120 SCHECK_PARTIAL();
5121 break;
5122 }
5123 GETCHARLEN(c, eptr, len);
5124 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5125 eptr+= len;
5126 }
5127 break;
5128
5129 case OP_DIGIT:
5130 for (i = min; i < max; i++)
5131 {
5132 int len = 1;
5133 if (eptr >= md->end_subject)
5134 {
5135 SCHECK_PARTIAL();
5136 break;
5137 }
5138 GETCHARLEN(c, eptr, len);
5139 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5140 eptr+= len;
5141 }
5142 break;
5143
5144 case OP_NOT_WHITESPACE:
5145 for (i = min; i < max; i++)
5146 {
5147 int len = 1;
5148 if (eptr >= md->end_subject)
5149 {
5150 SCHECK_PARTIAL();
5151 break;
5152 }
5153 GETCHARLEN(c, eptr, len);
5154 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5155 eptr+= len;
5156 }
5157 break;
5158
5159 case OP_WHITESPACE:
5160 for (i = min; i < max; i++)
5161 {
5162 int len = 1;
5163 if (eptr >= md->end_subject)
5164 {
5165 SCHECK_PARTIAL();
5166 break;
5167 }
5168 GETCHARLEN(c, eptr, len);
5169 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5170 eptr+= len;
5171 }
5172 break;
5173
5174 case OP_NOT_WORDCHAR:
5175 for (i = min; i < max; i++)
5176 {
5177 int len = 1;
5178 if (eptr >= md->end_subject)
5179 {
5180 SCHECK_PARTIAL();
5181 break;
5182 }
5183 GETCHARLEN(c, eptr, len);
5184 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5185 eptr+= len;
5186 }
5187 break;
5188
5189 case OP_WORDCHAR:
5190 for (i = min; i < max; i++)
5191 {
5192 int len = 1;
5193 if (eptr >= md->end_subject)
5194 {
5195 SCHECK_PARTIAL();
5196 break;
5197 }
5198 GETCHARLEN(c, eptr, len);
5199 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5200 eptr+= len;
5201 }
5202 break;
5203
5204 default:
5205 RRETURN(PCRE_ERROR_INTERNAL);
5206 }
5207
5208 /* eptr is now past the end of the maximum run */
5209
5210 if (possessive) continue;
5211 for(;;)
5212 {
5213 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
5214 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5215 if (eptr-- == pp) break; /* Stop if tried at original pos */
5216 BACKCHAR(eptr);
5217 }
5218 }
5219 else
5220 #endif /* SUPPORT_UTF8 */
5221
5222 /* Not UTF-8 mode */
5223 {
5224 switch(ctype)
5225 {
5226 case OP_ANY:
5227 for (i = min; i < max; i++)
5228 {
5229 if (eptr >= md->end_subject)
5230 {
5231 SCHECK_PARTIAL();
5232 break;
5233 }
5234 if (IS_NEWLINE(eptr)) break;
5235 eptr++;
5236 }
5237 break;
5238
5239 case OP_ALLANY:
5240 case OP_ANYBYTE:
5241 c = max - min;
5242 if (c > (unsigned int)(md->end_subject - eptr))
5243 {
5244 eptr = md->end_subject;
5245 SCHECK_PARTIAL();
5246 }
5247 else eptr += c;
5248 break;
5249
5250 case OP_ANYNL:
5251 for (i = min; i < max; i++)
5252 {
5253 if (eptr >= md->end_subject)
5254 {
5255 SCHECK_PARTIAL();
5256 break;
5257 }
5258 c = *eptr;
5259 if (c == 0x000d)
5260 {
5261 if (++eptr >= md->end_subject) break;
5262 if (*eptr == 0x000a) eptr++;
5263 }
5264 else
5265 {
5266 if (c != 0x000a &&
5267 (md->bsr_anycrlf ||
5268 (c != 0x000b && c != 0x000c && c != 0x0085)))
5269 break;
5270 eptr++;
5271 }
5272 }
5273 break;
5274
5275 case OP_NOT_HSPACE:
5276 for (i = min; i < max; i++)
5277 {
5278 if (eptr >= md->end_subject)
5279 {
5280 SCHECK_PARTIAL();
5281 break;
5282 }
5283 c = *eptr;
5284 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5285 eptr++;
5286 }
5287 break;
5288
5289 case OP_HSPACE:
5290 for (i = min; i < max; i++)
5291 {
5292 if (eptr >= md->end_subject)
5293 {
5294 SCHECK_PARTIAL();
5295 break;
5296 }
5297 c = *eptr;
5298 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5299 eptr++;
5300 }
5301 break;
5302
5303 case OP_NOT_VSPACE:
5304 for (i = min; i < max; i++)
5305 {
5306 if (eptr >= md->end_subject)
5307 {
5308 SCHECK_PARTIAL();
5309 break;
5310 }
5311 c = *eptr;
5312 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5313 break;
5314 eptr++;
5315 }
5316 break;
5317
5318 case OP_VSPACE:
5319 for (i = min; i < max; i++)
5320 {
5321 if (eptr >= md->end_subject)
5322 {
5323 SCHECK_PARTIAL();
5324 break;
5325 }
5326 c = *eptr;
5327 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5328 break;
5329 eptr++;
5330 }
5331 break;
5332
5333 case OP_NOT_DIGIT:
5334 for (i = min; i < max; i++)
5335 {
5336 if (eptr >= md->end_subject)
5337 {
5338 SCHECK_PARTIAL();
5339 break;
5340 }
5341 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5342 eptr++;
5343 }
5344 break;
5345
5346 case OP_DIGIT:
5347 for (i = min; i < max; i++)
5348 {
5349 if (eptr >= md->end_subject)
5350 {
5351 SCHECK_PARTIAL();
5352 break;
5353 }
5354 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5355 eptr++;
5356 }
5357 break;
5358
5359 case OP_NOT_WHITESPACE:
5360 for (i = min; i < max; i++)
5361 {
5362 if (eptr >= md->end_subject)
5363 {
5364 SCHECK_PARTIAL();
5365 break;
5366 }
5367 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5368 eptr++;
5369 }
5370 break;
5371
5372 case OP_WHITESPACE:
5373 for (i = min; i < max; i++)
5374 {
5375 if (eptr >= md->end_subject)
5376 {
5377 SCHECK_PARTIAL();
5378 break;
5379 }
5380 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5381 eptr++;
5382 }
5383 break;
5384
5385 case OP_NOT_WORDCHAR:
5386 for (i = min; i < max; i++)
5387 {
5388 if (eptr >= md->end_subject)
5389 {
5390 SCHECK_PARTIAL();
5391 break;
5392 }
5393 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5394 eptr++;
5395 }
5396 break;
5397
5398 case OP_WORDCHAR:
5399 for (i = min; i < max; i++)
5400 {
5401 if (eptr >= md->end_subject)
5402 {
5403 SCHECK_PARTIAL();
5404 break;
5405 }
5406 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5407 eptr++;
5408 }
5409 break;
5410
5411 default:
5412 RRETURN(PCRE_ERROR_INTERNAL);
5413 }
5414
5415 /* eptr is now past the end of the maximum run */
5416
5417 if (possessive) continue;
5418 while (eptr >= pp)
5419 {
5420 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5421 eptr--;
5422 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5423 }
5424 }
5425
5426 /* Get here if we can't make it match with any permitted repetitions */
5427
5428 MRRETURN(MATCH_NOMATCH);
5429 }
5430 /* Control never gets here */
5431
5432 /* There's been some horrible disaster. Arrival here can only mean there is
5433 something seriously wrong in the code above or the OP_xxx definitions. */
5434
5435 default:
5436 DPRINTF(("Unknown opcode %d\n", *ecode));
5437 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5438 }
5439
5440 /* Do not stick any code in here without much thought; it is assumed
5441 that "continue" in the code above comes out to here to repeat the main
5442 loop. */
5443
5444 } /* End of main loop */
5445 /* Control never reaches here */
5446
5447
5448 /* When compiling to use the heap rather than the stack for recursive calls to
5449 match(), the RRETURN() macro jumps here. The number that is saved in
5450 frame->Xwhere indicates which label we actually want to return to. */
5451
5452 #ifdef NO_RECURSE
5453 #define LBL(val) case val: goto L_RM##val;
5454 HEAP_RETURN:
5455 switch (frame->Xwhere)
5456 {
5457 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5458 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5459 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5460 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5461 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58)
5462 #ifdef SUPPORT_UTF8
5463 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5464 LBL(32) LBL(34) LBL(42) LBL(46)
5465 #ifdef SUPPORT_UCP
5466 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5467 LBL(59) LBL(60) LBL(61) LBL(62)
5468 #endif /* SUPPORT_UCP */
5469 #endif /* SUPPORT_UTF8 */
5470 default:
5471 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5472 return PCRE_ERROR_INTERNAL;
5473 }
5474 #undef LBL
5475 #endif /* NO_RECURSE */
5476 }
5477
5478
5479 /***************************************************************************
5480 ****************************************************************************
5481 RECURSION IN THE match() FUNCTION
5482
5483 Undefine all the macros that were defined above to handle this. */
5484
5485 #ifdef NO_RECURSE /* build in which match() keeps its recursion state on the heap */
5486 #undef eptr
5487 #undef ecode
5488 #undef mstart
5489 #undef offset_top
5490 #undef ims
5491 #undef eptrb
5492 #undef flags
5493
5494 #undef callpat
5495 #undef charptr
5496 #undef data
5497 #undef next
5498 #undef pp
5499 #undef prev
5500 #undef saved_eptr
5501
5502 #undef new_recursive
5503
5504 #undef cur_is_word
5505 #undef condition
5506 #undef prev_is_word
5507
5508 #undef original_ims
5509
5510 #undef ctype
5511 #undef length
5512 #undef max
5513 #undef min
5514 #undef number
5515 #undef offset
5516 #undef op
5517 #undef save_capture_last
5518 #undef save_offset1
5519 #undef save_offset2
5520 #undef save_offset3
5521 #undef stacksave
5522
5523 #undef newptrb
5524
5525 #endif /* NO_RECURSE */
5526
5527 /* fc and fi are macros in both cases (presumably with and without NO_RECURSE), hence undefined outside the #ifdef */
5528
5529 #undef fc
5530 #undef fi
5531
5532 /***************************************************************************
5533 ***************************************************************************/
5534
5535
5536
5537 /*************************************************
5538 * Execute a Regular Expression *
5539 *************************************************/
5540
5541 /* This function applies a compiled re to a subject string and picks out
5542 portions of the string if it matches. Two elements in the vector are set for
5543 each substring: the offsets to the start and end of the substring.
5544
5545 Arguments:
5546 argument_re points to the compiled expression
5547 extra_data points to extra data or is NULL
5548 subject points to the subject string
5549 length length of subject string in bytes (the subject may contain binary zeros)
5550 start_offset byte offset in the subject string at which to start matching
5551 options option bits
5552 offsets points to a vector of ints to be filled in with offsets
5553 offsetcount the number of elements in the vector
5554
5555 Returns: > 0 => success; value is the number of elements filled in
5556 = 0 => success, but offsets is not big enough
5557 -1 => failed to match
5558 < -1 => some kind of unexpected problem
5559 */
5560
5561 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5562 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5563 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5564 int offsetcount)
5565 {
5566 int rc, resetcount, ocount;
5567 int first_byte = -1;
5568 int req_byte = -1;
5569 int req_byte2 = -1;
5570 int newline;
5571 unsigned long int ims;
5572 BOOL using_temporary_offsets = FALSE;
5573 BOOL anchored;
5574 BOOL startline;
5575 BOOL firstline;
5576 BOOL first_byte_caseless = FALSE;
5577 BOOL req_byte_caseless = FALSE;
5578 BOOL utf8;
5579 match_data match_block;
5580 match_data *md = &match_block;
5581 const uschar *tables;
5582 const uschar *start_bits = NULL;
5583 USPTR start_match = (USPTR)subject + start_offset;
5584 USPTR end_subject;
5585 USPTR start_partial = NULL;
5586 USPTR req_byte_ptr = start_match - 1;
5587
5588 pcre_study_data internal_study;
5589 const pcre_study_data *study;
5590
5591 real_pcre internal_re;
5592 const real_pcre *external_re = (const real_pcre *)argument_re;
5593 const real_pcre *re = external_re;
5594
5595 /* Plausibility checks */
5596
5597 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5598 if (re == NULL || subject == NULL ||
5599 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5600 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5601
5602 /* This information is for finding all the numbers associated with a given
5603 name, for condition testing. */
5604
5605 md->name_table = (uschar *)re + re->name_table_offset;
5606 md->name_count = re->name_count;
5607 md->name_entry_size = re->name_entry_size;
5608
5609 /* Fish out the optional data from the extra_data structure, first setting
5610 the default values. */
5611
5612 study = NULL;
5613 md->match_limit = MATCH_LIMIT;
5614 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5615 md->callout_data = NULL;
5616
5617 /* The table pointer is always in native byte order. */
5618
5619 tables = external_re->tables;
5620
5621 if (extra_data != NULL)
5622 {
5623 register unsigned int flags = extra_data->flags;
5624 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5625 study = (const pcre_study_data *)extra_data->study_data;
5626 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5627 md->match_limit = extra_data->match_limit;
5628 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5629 md->match_limit_recursion = extra_data->match_limit_recursion;
5630 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5631 md->callout_data = extra_data->callout_data;
5632 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5633 }
5634
5635 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5636 is a feature that makes it possible to save compiled regexes and re-use them
5637 in other programs later. */
5638
5639 if (tables == NULL) tables = _pcre_default_tables;
5640
5641 /* Check that the first field in the block is the magic number. If it is not,
5642 test for a regex that was compiled on a host of opposite endianness. If this is
5643 the case, flipped values are put in internal_re and internal_study if there was
5644 study data too. */
5645
5646 if (re->magic_number != MAGIC_NUMBER)
5647 {
5648 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5649 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5650 if (study != NULL) study = &internal_study;
5651 }
5652
5653 /* Set up other data */
5654
5655 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5656 startline = (re->flags & PCRE_STARTLINE) != 0;
5657 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5658
5659 /* The code starts after the real_pcre block and the capture name table. */
5660
5661 md->start_code = (const uschar *)external_re + re->name_table_offset +
5662 re->name_count * re->name_entry_size;
5663
5664 md->start_subject = (USPTR)subject;
5665 md->start_offset = start_offset;
5666 md->end_subject = md->start_subject + length;
5667 end_subject = md->end_subject;
5668
5669 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5670 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5671 md->use_ucp = (re->options & PCRE_UCP) != 0;
5672 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5673
5674 md->notbol = (options & PCRE_NOTBOL) != 0;
5675 md->noteol = (options & PCRE_NOTEOL) != 0;
5676 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5677 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5678 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5679 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5680 md->hitend = FALSE;
5681 md->mark = NULL; /* In case never set */
5682
5683 md->recursive = NULL; /* No recursion at top level */
5684
5685 md->lcc = tables + lcc_offset;
5686 md->ctypes = tables + ctypes_offset;
5687
5688 /* Handle different \R options. */
5689
5690 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5691 {
5692 case 0:
5693 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5694 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5695 else
5696 #ifdef BSR_ANYCRLF
5697 md->bsr_anycrlf = TRUE;
5698 #else
5699 md->bsr_anycrlf = FALSE;
5700 #endif
5701 break;
5702
5703 case PCRE_BSR_ANYCRLF:
5704 md->bsr_anycrlf = TRUE;
5705 break;
5706
5707 case PCRE_BSR_UNICODE:
5708 md->bsr_anycrlf = FALSE;
5709 break;
5710
5711 default: return PCRE_ERROR_BADNEWLINE;
5712 }
5713
5714 /* Handle different types of newline. The three bits give eight cases. If
5715 nothing is set at run time, whatever was used at compile time applies. */
5716
5717 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5718 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5719 {
5720 case 0: newline = NEWLINE; break; /* Compile-time default */
5721 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5722 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5723 case PCRE_NEWLINE_CR+
5724 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5725 case PCRE_NEWLINE_ANY: newline = -1; break;
5726 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5727 default: return PCRE_ERROR_BADNEWLINE;
5728 }
5729
5730 if (newline == -2)
5731 {
5732 md->nltype = NLTYPE_ANYCRLF;
5733 }
5734 else if (newline < 0)
5735 {
5736 md->nltype = NLTYPE_ANY;
5737 }
5738 else
5739 {
5740 md->nltype = NLTYPE_FIXED;
5741 if (newline > 255)
5742 {
5743 md->nllen = 2;
5744 md->nl[0] = (newline >> 8) & 255;
5745 md->nl[1] = newline & 255;
5746 }
5747 else
5748 {
5749 md->nllen = 1;
5750 md->nl[0] = newline;
5751 }
5752 }
5753
5754 /* Partial matching was originally supported only for a restricted set of
5755 regexes; from release 8.00 there are no restrictions, but the bits are still
5756 defined (though never set). So there's no harm in leaving this code. */
5757
5758 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5759 return PCRE_ERROR_BADPARTIAL;
5760
5761 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5762 back the character offset. */
5763
5764 #ifdef SUPPORT_UTF8
5765 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5766 {
5767 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5768 return PCRE_ERROR_BADUTF8;
5769 if (start_offset > 0 && start_offset < length)
5770 {
5771 int tb = ((USPTR)subject)[start_offset];
5772 if (tb > 127)
5773 {
5774 tb &= 0xc0;
5775 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5776 }
5777 }
5778 }
5779 #endif
5780
5781 /* The ims options can vary during the matching as a result of the presence
5782 of (?ims) items in the pattern. They are kept in a local variable so that
5783 restoring at the exit of a group is easy. */
5784
5785 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5786
5787 /* If the expression has got more back references than the offsets supplied can
5788 hold, we get a temporary chunk of working store to use during the matching.
5789 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5790 of 3. */
5791
5792 ocount = offsetcount - (offsetcount % 3);
5793
5794 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5795 {
5796 ocount = re->top_backref * 3 + 3;
5797 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5798 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5799 using_temporary_offsets = TRUE;
5800 DPRINTF(("Got memory to hold back references\n"));
5801 }
5802 else md->offset_vector = offsets;
5803
5804 md->offset_end = ocount;
5805 md->offset_max = (2*ocount)/3;
5806 md->offset_overflow = FALSE;
5807 md->capture_last = -1;
5808
5809 /* Compute the minimum number of offsets that we need to reset each time. Doing
5810 this makes a huge difference to execution time when there aren't many brackets
5811 in the pattern. */
5812
5813 resetcount = 2 + re->top_bracket * 2;
5814 if (resetcount > offsetcount) resetcount = ocount;
5815
5816 /* Reset the working variable associated with each extraction. These should
5817 never be used unless previously set, but they get saved and restored, and so we
5818 initialize them to avoid reading uninitialized locations. */
5819
5820 if (md->offset_vector != NULL)
5821 {
5822 register int *iptr = md->offset_vector + ocount;
5823 register int *iend = iptr - resetcount/2 + 1;
5824 while (--iptr >= iend) *iptr = -1;
5825 }
5826
5827 /* Set up the first character to match, if available. The first_byte value is
5828 never set for an anchored regular expression, but the anchoring may be forced
5829 at run time, so we have to test for anchoring. The first char may be unset for
5830 an unanchored pattern, of course. If there's no first char and the pattern was
5831 studied, there may be a bitmap of possible first characters. */
5832
5833 if (!anchored)
5834 {
5835 if ((re->flags & PCRE_FIRSTSET) != 0)
5836 {
5837 first_byte = re->first_byte & 255;
5838 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5839 first_byte = md->lcc[first_byte];
5840 }
5841 else
5842 if (!startline && study != NULL &&
5843 (study->flags & PCRE_STUDY_MAPPED) != 0)
5844 start_bits = study->start_bits;
5845 }
5846
5847 /* For anchored or unanchored matches, there may be a "last known required
5848 character" set. */
5849
5850 if ((re->flags & PCRE_REQCHSET) != 0)
5851 {
5852 req_byte = re->req_byte & 255;
5853 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5854 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5855 }
5856
5857
5858 /* ==========================================================================*/
5859
5860 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
5861 the loop runs just once. */
5862
5863 for(;;)
5864 {
5865 USPTR save_end_subject = end_subject;
5866 USPTR new_start_match;
5867
5868 /* Reset the maximum number of extractions we might see. */
5869
5870 if (md->offset_vector != NULL)
5871 {
5872 register int *iptr = md->offset_vector;
5873 register int *iend = iptr + resetcount;
5874 while (iptr < iend) *iptr++ = -1;
5875 }
5876
5877 /* If firstline is TRUE, the start of the match is constrained to the first
5878 line of a multiline string. That is, the match must be before or at the first
5879 newline. Implement this by temporarily adjusting end_subject so that we stop
5880 scanning at a newline. If the match fails at the newline, later code breaks
5881 this loop. */
5882
5883 if (firstline)
5884 {
5885 USPTR t = start_match;
5886 #ifdef SUPPORT_UTF8
5887 if (utf8)
5888 {
5889 while (t < md->end_subject && !IS_NEWLINE(t))
5890 {
5891 t++;
5892 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5893 }
5894 }
5895 else
5896 #endif
5897 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5898 end_subject = t;
5899 }
5900
5901 /* There are some optimizations that avoid running the match if a known
5902 starting point is not found, or if a known later character is not present.
5903 However, there is an option that disables these, for testing and for ensuring
5904 that all callouts do actually occur. */
5905
5906 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5907 {
5908 /* Advance to a unique first byte if there is one. */
5909
5910 if (first_byte >= 0)
5911 {
5912 if (first_byte_caseless)
5913 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5914 start_match++;
5915 else
5916 while (start_match < end_subject && *start_match != first_byte)
5917 start_match++;
5918 }
5919
5920 /* Or to just after a linebreak for a multiline match */
5921
5922 else if (startline)
5923 {
5924 if (start_match > md->start_subject + start_offset)
5925 {
5926 #ifdef SUPPORT_UTF8
5927 if (utf8)
5928 {
5929 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5930 {
5931 start_match++;
5932 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5933 start_match++;
5934 }
5935 }
5936 else
5937 #endif
5938 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5939 start_match++;
5940
5941 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5942 and we are now at a LF, advance the match position by one more character.
5943 */
5944
5945 if (start_match[-1] == CHAR_CR &&
5946 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5947 start_match < end_subject &&
5948 *start_match == CHAR_NL)
5949 start_match++;
5950 }
5951 }
5952
5953 /* Or to a non-unique first byte after study */
5954
5955 else if (start_bits != NULL)
5956 {
5957 while (start_match < end_subject)
5958 {
5959 register unsigned int c = *start_match;
5960 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5961 else break;
5962 }
5963 }
5964 } /* Starting optimizations */
5965
5966 /* Restore fudged end_subject */
5967
5968 end_subject = save_end_subject;
5969
5970 /* The following two optimizations are disabled for partial matching or if
5971 disabling is explicitly requested. */
5972
5973 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
5974 {
5975 /* If the pattern was studied, a minimum subject length may be set. This is
5976 a lower bound only: there need not be any string of exactly that length
5977 that matches the pattern. Although the value is, strictly, in characters,
5978 we treat it as bytes to avoid spending too much time in this optimization. */
5979
5980 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
5981 (pcre_uint32)(end_subject - start_match) < study->minlength)
5982 {
5983 rc = MATCH_NOMATCH;
5984 break;
5985 }
5986
5987 /* If req_byte is set, we know that that character must appear in the
5988 subject for the match to succeed. If the first character is set, req_byte
5989 must be later in the subject; otherwise the test starts at the match point.
5990 This optimization can save a huge amount of backtracking in patterns with
5991 nested unlimited repeats that aren't going to match. Writing separate code
5992 for cased/caseless versions makes it go faster, as does using an
5993 autoincrement and backing off on a match.
5994
5995 HOWEVER: when the subject string is very, very long, searching to its end
5996 can take a long time, and give bad performance on quite ordinary patterns.
5997 This showed up when somebody was matching something like /^\d+C/ on a
5998 32-megabyte string... so we don't do this when the string is sufficiently
5999 long. */
6000
6001 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6002 {
6003 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6004
6005 /* We don't need to repeat the search if we haven't yet reached the
6006 place we found it at last time. */
6007
6008 if (p > req_byte_ptr)
6009 {
6010 if (req_byte_caseless)
6011 {
6012 while (p < end_subject)
6013 {
6014 register int pp = *p++;
6015 if (pp == req_byte || pp == req_byte2) { p--; break; }
6016 }
6017 }
6018 else
6019 {
6020 while (p < end_subject)
6021 {
6022 if (*p++ == req_byte) { p--; break; }
6023 }
6024 }
6025
6026 /* If we can't find the required character, break the matching loop,
6027 forcing a match failure. */
6028
6029 if (p >= end_subject)
6030 {
6031 rc = MATCH_NOMATCH;
6032 break;
6033 }
6034
6035 /* If we have found the required character, save the point where we
6036 found it, so that we don't search again next time round the loop if
6037 the start hasn't passed this character yet. */
6038
6039 req_byte_ptr = p;
6040 }
6041 }
6042 }
6043
6044 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6045 printf(">>>> Match against: ");
6046 pchars(start_match, end_subject - start_match, TRUE, md);
6047 printf("\n");
6048 #endif
6049
6050 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6051 first starting point for which a partial match was found. */
6052
6053 md->start_match_ptr = start_match;
6054 md->start_used_ptr = start_match;
6055 md->match_call_count = 0;
6056 rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL,
6057 0, 0);
6058 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6059
6060 switch(rc)
6061 {
6062 /* NOMATCH and PRUNE advance by one character. If MATCH_SKIP_ARG reaches
6063 this level it means that a MARK that matched the SKIP's arg was not found.
6064 We treat this as NOMATCH. THEN at this level acts exactly like PRUNE. */
6065
6066 case MATCH_NOMATCH:
6067 case MATCH_PRUNE:
6068 case MATCH_SKIP_ARG:
6069 case MATCH_THEN:
6070 new_start_match = start_match + 1;
6071 #ifdef SUPPORT_UTF8
6072 if (utf8)
6073 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6074 new_start_match++;
6075 #endif
6076 break;
6077
6078 /* SKIP passes back the next starting point explicitly. */
6079
6080 case MATCH_SKIP:
6081 new_start_match = md->start_match_ptr;
6082 break;
6083
6084 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6085
6086 case MATCH_COMMIT:
6087 rc = MATCH_NOMATCH;
6088 goto ENDLOOP;
6089
6090 /* Any other return is either a match, or some kind of error. */
6091
6092 default:
6093 goto ENDLOOP;
6094 }
6095
6096 /* Control reaches here for the various types of "no match at this point"
6097 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6098
6099 rc = MATCH_NOMATCH;
6100
6101 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6102 newline in the subject (though it may continue over the newline). Therefore,
6103 if we have just failed to match, starting at a newline, do not continue. */
6104
6105 if (firstline && IS_NEWLINE(start_match)) break;
6106
6107 /* Advance to new matching position */
6108
6109 start_match = new_start_match;
6110
6111 /* Break the loop if the pattern is anchored or if we have passed the end of
6112 the subject. */
6113
6114 if (anchored || start_match > end_subject) break;
6115
6116 /* If we have just passed a CR and we are now at a LF, and the pattern does
6117 not contain any explicit matches for \r or \n, and the newline option is CRLF
6118 or ANY or ANYCRLF, advance the match position by one more character. */
6119
6120 if (start_match[-1] == CHAR_CR &&
6121 start_match < end_subject &&
6122 *start_match == CHAR_NL &&
6123 (re->flags & PCRE_HASCRORLF) == 0 &&
6124 (md->nltype == NLTYPE_ANY ||
6125 md->nltype == NLTYPE_ANYCRLF ||
6126 md->nllen == 2))
6127 start_match++;
6128
6129 md->mark = NULL; /* Reset for start of next match attempt */
6130 } /* End of for(;;) "bumpalong" loop */
6131
6132 /* ==========================================================================*/
6133
6134 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6135 conditions is true:
6136
6137 (1) The pattern is anchored or the match was failed by (*COMMIT);
6138
6139 (2) We are past the end of the subject;
6140
6141 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6142 this option requests that a match occur at or before the first newline in
6143 the subject.
6144
6145 When we have a match and the offset vector is big enough to deal with any
6146 backreferences, captured substring offsets will already be set up. In the case
6147 where we had to get some local store to hold offsets for backreference
6148 processing, copy those that we can. In this case there need not be overflow if
6149 certain parts of the pattern were not used, even though there are more
6150 capturing parentheses than vector slots. */
6151
6152 ENDLOOP:
6153
6154 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6155 {
6156 if (using_temporary_offsets)
6157 {
6158 if (offsetcount >= 4)
6159 {
6160 memcpy(offsets + 2, md->offset_vector + 2,
6161 (offsetcount - 2) * sizeof(int));
6162 DPRINTF(("Copied offsets from temporary memory\n"));
6163 }
6164 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6165 DPRINTF(("Freeing temporary memory\n"));
6166 (pcre_free)(md->offset_vector);
6167 }
6168
6169 /* Set the return code to the number of captured strings, or 0 if there are
6170 too many to fit into the vector. */
6171
6172 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6173
6174 /* If there is space, set up the whole thing as substring 0. The value of
6175 md->start_match_ptr might be modified if \K was encountered on the
6176 successful matching path. */
6177
6178 if (offsetcount < 2) rc = 0; else
6179 {
6180 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6181 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6182 }
6183
6184 DPRINTF((">>>> returning %d\n", rc));
6185 goto RETURN_MARK;
6186 }
6187
6188 /* Control gets here if there has been an error, or if the overall match
6189 attempt has failed at all permitted starting positions. */
6190
6191 if (using_temporary_offsets)
6192 {
6193 DPRINTF(("Freeing temporary memory\n"));
6194 (pcre_free)(md->offset_vector);
6195 }
6196
6197 /* For anything other than nomatch or partial match, just return the code. */
6198
6199 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6200 {
6201 DPRINTF((">>>> error: returning %d\n", rc));
6202 return rc;
6203 }
6204
6205 /* Handle partial matches - disable any mark data */
6206
6207 if (start_partial != NULL)
6208 {
6209 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6210 md->mark = NULL;
6211 if (offsetcount > 1)
6212 {
6213 offsets[0] = (int)(start_partial - (USPTR)subject);
6214 offsets[1] = (int)(end_subject - (USPTR)subject);
6215 }
6216 rc = PCRE_ERROR_PARTIAL;
6217 }
6218
6219 /* This is the classic nomatch case */
6220
6221 else
6222 {
6223 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6224 rc = PCRE_ERROR_NOMATCH;
6225 }
6226
6227 /* Return the MARK data if it has been requested. */
6228
6229 RETURN_MARK:
6230
6231 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6232 *(extra_data->mark) = (unsigned char *)(md->mark);
6233 return rc;
6234 }
6235
6236 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12