/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 518 - (show annotations) (download)
Tue May 18 15:47:01 2010 UTC (3 years, 11 months ago) by ph10
File MIME type: text/plain
File size: 185162 byte(s)
Added PCRE_UCP and related stuff to make \w etc use Unicode properties.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73
74 #define MATCH_ACCEPT (-999)
75 #define MATCH_COMMIT (-998)
76 #define MATCH_PRUNE (-997)
77 #define MATCH_SKIP (-996)
78 #define MATCH_SKIP_ARG (-995)
79 #define MATCH_THEN (-994)
80
81 /* This is a convenience macro for code that occurs many times. */
82
83 #define MRRETURN(ra) \
84 { \
85 md->mark = markptr; \
86 RRETURN(ra); \
87 }
88
89 /* Maximum number of ints of offset to save on the stack for recursive calls.
90 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
91 because the offset vector is always a multiple of 3 long. */
92
93 #define REC_STACK_SAVE_MAX 30
94
95 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
96
97 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
98 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
99
100
101
102 #ifdef PCRE_DEBUG
103 /*************************************************
104 * Debugging function to print chars *
105 *************************************************/
106
107 /* Print a sequence of chars in printable format, stopping at the end of the
108 subject if requested.
109
110 Arguments:
111 p points to characters
112 length number to print
113 is_subject TRUE if printing from within md->start_subject
114 md pointer to matching data block, if is_subject is TRUE
115
116 Returns: nothing
117 */
118
119 static void
120 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
121 {
122 unsigned int c;
123 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
124 while (length-- > 0)
125 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
126 }
127 #endif
128
129
130
131 /*************************************************
132 * Match a back-reference *
133 *************************************************/
134
135 /* If a back reference hasn't been set, the length that is passed is greater
136 than the number of characters left in the string, so the match fails.
137
138 Arguments:
139 offset index into the offset vector
140 eptr points into the subject
141 length length to be matched
142 md points to match data block
143 ims the ims flags
144
145 Returns: TRUE if matched
146 */
147
148 static BOOL
149 match_ref(int offset, register USPTR eptr, int length, match_data *md,
150 unsigned long int ims)
151 {
152 USPTR p = md->start_subject + md->offset_vector[offset];
153
154 #ifdef PCRE_DEBUG
155 if (eptr >= md->end_subject)
156 printf("matching subject <null>");
157 else
158 {
159 printf("matching subject ");
160 pchars(eptr, length, TRUE, md);
161 }
162 printf(" against backref ");
163 pchars(p, length, FALSE, md);
164 printf("\n");
165 #endif
166
167 /* Always fail if not enough characters left */
168
169 if (length > md->end_subject - eptr) return FALSE;
170
171 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
172 properly if Unicode properties are supported. Otherwise, we can check only
173 ASCII characters. */
174
175 if ((ims & PCRE_CASELESS) != 0)
176 {
177 #ifdef SUPPORT_UTF8
178 #ifdef SUPPORT_UCP
179 if (md->utf8)
180 {
181 USPTR endptr = eptr + length;
182 while (eptr < endptr)
183 {
184 int c, d;
185 GETCHARINC(c, eptr);
186 GETCHARINC(d, p);
187 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
188 }
189 }
190 else
191 #endif
192 #endif
193
194 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
195 is no UCP support. */
196
197 while (length-- > 0)
198 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
199 }
200
201 /* In the caseful case, we can just compare the bytes, whether or not we
202 are in UTF-8 mode. */
203
204 else
205 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
206
207 return TRUE;
208 }
209
210
211
212 /***************************************************************************
213 ****************************************************************************
214 RECURSION IN THE match() FUNCTION
215
216 The match() function is highly recursive, though not every recursive call
217 increases the recursive depth. Nevertheless, some regular expressions can cause
218 it to recurse to a great depth. I was writing for Unix, so I just let it call
219 itself recursively. This uses the stack for saving everything that has to be
220 saved for a recursive call. On Unix, the stack can be large, and this works
221 fine.
222
223 It turns out that on some non-Unix-like systems there are problems with
224 programs that use a lot of stack. (This despite the fact that every last chip
225 has oodles of memory these days, and techniques for extending the stack have
226 been known for decades.) So....
227
228 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
229 calls by keeping local variables that need to be preserved in blocks of memory
230 obtained from malloc() instead of on the stack. Macros are used to
231 achieve this so that the actual code doesn't look very different to what it
232 always used to.
233
234 The original heap-recursive code used longjmp(). However, it seems that this
235 can be very slow on some operating systems. Following a suggestion from Stan
236 Switzer, the use of longjmp() has been abolished, at the cost of having to
237 provide a unique number for each call to RMATCH. There is no way of generating
238 a sequence of numbers at compile time in C. I have given them names, to make
239 them stand out more clearly.
240
241 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
242 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
243 tests. Furthermore, not using longjmp() means that local dynamic variables
244 don't have indeterminate values; this has meant that the frame size can be
245 reduced because the result can be "passed back" by straight setting of the
246 variable instead of being passed in the frame.
247 ****************************************************************************
248 ***************************************************************************/
249
250 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
251 below must be updated in sync. */
252
253 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
254 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
255 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
256 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
257 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
258 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58 };
259
260 /* These versions of the macros use the stack, as normal. There are debugging
261 versions and production versions. Note that the "rw" argument of RMATCH isn't
262 actually used in this definition. */
263
264 #ifndef NO_RECURSE
265 #define REGISTER register
266
267 #ifdef PCRE_DEBUG
268 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
269 { \
270 printf("match() called in line %d\n", __LINE__); \
271 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
272 printf("to line %d\n", __LINE__); \
273 }
274 #define RRETURN(ra) \
275 { \
276 printf("match() returned %d from line %d ", ra, __LINE__); \
277 return ra; \
278 }
279 #else
280 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
281 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
282 #define RRETURN(ra) return ra
283 #endif
284
285 #else
286
287
288 /* These versions of the macros manage a private stack on the heap. Note that
289 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
290 argument of match(), which never changes. */
291
292 #define REGISTER
293
/* RMATCH (heap version): simulate a recursive call to match(). A new frame is
obtained from pcre_stack_malloc, the resume point "rw" is recorded in the
current frame, the "arguments" are copied into the new frame, and control jumps
to HEAP_RECURSE. The label L_<rw> that the macro plants is where execution
resumes after the simulated call returns (the dispatch on Xwhere is done by the
HEAP_RETURN code elsewhere in match()).

If the frame cannot be allocated, the current simulated call is abandoned with
PCRE_ERROR_NOMEMORY via RRETURN, rather than dereferencing a NULL pointer. As
negative returns are passed up by every caller, this unwinds and frees all the
outstanding frames. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xmarkptr = markptr;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN (heap version): simulate a return from match(). The current frame is
freed and the previous one becomes current. If there is a previous frame, the
result is left in rrc and control jumps to HEAP_RETURN; if the frame just freed
was the top-level one (Xprevframe == NULL), this is a genuine return from
match(). */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
327
328
329 /* Structure for remembering the local variables in a private frame */
330
331 typedef struct heapframe {
332 struct heapframe *Xprevframe;
333
334 /* Function arguments that may change */
335
336 USPTR Xeptr;
337 const uschar *Xecode;
338 USPTR Xmstart;
339 USPTR Xmarkptr;
340 int Xoffset_top;
341 long int Xims;
342 eptrblock *Xeptrb;
343 int Xflags;
344 unsigned int Xrdepth;
345
346 /* Function local variables */
347
348 USPTR Xcallpat;
349 #ifdef SUPPORT_UTF8
350 USPTR Xcharptr;
351 #endif
352 USPTR Xdata;
353 USPTR Xnext;
354 USPTR Xpp;
355 USPTR Xprev;
356 USPTR Xsaved_eptr;
357
358 recursion_info Xnew_recursive;
359
360 BOOL Xcur_is_word;
361 BOOL Xcondition;
362 BOOL Xprev_is_word;
363
364 unsigned long int Xoriginal_ims;
365
366 #ifdef SUPPORT_UCP
367 int Xprop_type;
368 int Xprop_value;
369 int Xprop_fail_result;
370 int Xprop_category;
371 int Xprop_chartype;
372 int Xprop_script;
373 int Xoclength;
374 uschar Xocchars[8];
375 #endif
376
377 int Xcodelink;
378 int Xctype;
379 unsigned int Xfc;
380 int Xfi;
381 int Xlength;
382 int Xmax;
383 int Xmin;
384 int Xnumber;
385 int Xoffset;
386 int Xop;
387 int Xsave_capture_last;
388 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
389 int Xstacksave[REC_STACK_SAVE_MAX];
390
391 eptrblock Xnewptrb;
392
393 /* Where to jump back to */
394
395 int Xwhere;
396
397 } heapframe;
398
399 #endif
400
401
402 /***************************************************************************
403 ***************************************************************************/
404
405
406
407 /*************************************************
408 * Match from current position *
409 *************************************************/
410
411 /* This function is called recursively in many circumstances. Whenever it
412 returns a negative (error) response, the outer incarnation must also return the
413 same response. */
414
415 /* These macros pack up tests that are used for partial matching, and which
416 appear several times in the code. We set the "hit end" flag if the pointer is
417 at the end of the subject and also past the start of the subject (i.e.
418 something has been matched). For hard partial matching, we then return
419 immediately. The second one is used when we already know we are past the end of
420 the subject. */
421
422 #define CHECK_PARTIAL()\
423 if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
424 {\
425 md->hitend = TRUE;\
426 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
427 }
428
429 #define SCHECK_PARTIAL()\
430 if (md->partial != 0 && eptr > mstart)\
431 {\
432 md->hitend = TRUE;\
433 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
434 }
435
436
437 /* Performance note: It might be tempting to extract commonly used fields from
438 the md structure (e.g. utf8, end_subject) into individual variables to improve
439 performance. Tests using gcc on a SPARC disproved this; in the first case, it
440 made performance worse.
441
442 Arguments:
443 eptr pointer to current character in subject
444 ecode pointer to current position in compiled code
445 mstart pointer to the current match start position (can be modified
446 by encountering \K)
447 markptr pointer to the most recent MARK name, or NULL
448 offset_top current top pointer
449 md pointer to "static" info for the match
450 ims current /i, /m, and /s options
451 eptrb pointer to chain of blocks containing eptr at start of
452 brackets - for testing for empty matches
453 flags can contain
454 match_condassert - this is an assertion condition
455 match_cbegroup - this is the start of an unlimited repeat
456 group that can match an empty string
457 rdepth the recursion depth
458
459 Returns: MATCH_MATCH if matched ) these values are >= 0
460 MATCH_NOMATCH if failed to match )
461 a negative MATCH_xxx value for PRUNE, SKIP, etc
462 a negative PCRE_ERROR_xxx value if aborted by an error condition
463 (e.g. stopped by repeated call or recursion limit)
464 */
465
466 static int
467 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
468 const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
469 eptrblock *eptrb, int flags, unsigned int rdepth)
470 {
471 /* These variables do not need to be preserved over recursion in this function,
472 so they can be ordinary variables in all cases. Mark some of them with
473 "register" because they are used a lot in loops. */
474
475 register int rrc; /* Returns from recursive calls */
476 register int i; /* Used for loops not involving calls to RMATCH() */
477 register unsigned int c; /* Character values not kept over RMATCH() calls */
478 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
479
480 BOOL minimize, possessive; /* Quantifier options */
481 int condcode;
482
483 /* When recursion is not being used, all "local" variables that have to be
484 preserved over calls to RMATCH() are part of a "frame" which is obtained from
485 heap storage. Set up the top-level frame here; others are obtained from the
486 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
487
488 #ifdef NO_RECURSE
489 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
490 frame->Xprevframe = NULL; /* Marks the top level */
491
492 /* Copy in the original argument variables */
493
494 frame->Xeptr = eptr;
495 frame->Xecode = ecode;
496 frame->Xmstart = mstart;
497 frame->Xmarkptr = markptr;
498 frame->Xoffset_top = offset_top;
499 frame->Xims = ims;
500 frame->Xeptrb = eptrb;
501 frame->Xflags = flags;
502 frame->Xrdepth = rdepth;
503
504 /* This is where control jumps back to to effect "recursion" */
505
506 HEAP_RECURSE:
507
508 /* Macros make the argument variables come from the current frame */
509
510 #define eptr frame->Xeptr
511 #define ecode frame->Xecode
512 #define mstart frame->Xmstart
513 #define markptr frame->Xmarkptr
514 #define offset_top frame->Xoffset_top
515 #define ims frame->Xims
516 #define eptrb frame->Xeptrb
517 #define flags frame->Xflags
518 #define rdepth frame->Xrdepth
519
520 /* Ditto for the local variables */
521
522 #ifdef SUPPORT_UTF8
523 #define charptr frame->Xcharptr
524 #endif
525 #define callpat frame->Xcallpat
526 #define codelink frame->Xcodelink
527 #define data frame->Xdata
528 #define next frame->Xnext
529 #define pp frame->Xpp
530 #define prev frame->Xprev
531 #define saved_eptr frame->Xsaved_eptr
532
533 #define new_recursive frame->Xnew_recursive
534
535 #define cur_is_word frame->Xcur_is_word
536 #define condition frame->Xcondition
537 #define prev_is_word frame->Xprev_is_word
538
539 #define original_ims frame->Xoriginal_ims
540
541 #ifdef SUPPORT_UCP
542 #define prop_type frame->Xprop_type
543 #define prop_value frame->Xprop_value
544 #define prop_fail_result frame->Xprop_fail_result
545 #define prop_category frame->Xprop_category
546 #define prop_chartype frame->Xprop_chartype
547 #define prop_script frame->Xprop_script
548 #define oclength frame->Xoclength
549 #define occhars frame->Xocchars
550 #endif
551
552 #define ctype frame->Xctype
553 #define fc frame->Xfc
554 #define fi frame->Xfi
555 #define length frame->Xlength
556 #define max frame->Xmax
557 #define min frame->Xmin
558 #define number frame->Xnumber
559 #define offset frame->Xoffset
560 #define op frame->Xop
561 #define save_capture_last frame->Xsave_capture_last
562 #define save_offset1 frame->Xsave_offset1
563 #define save_offset2 frame->Xsave_offset2
564 #define save_offset3 frame->Xsave_offset3
565 #define stacksave frame->Xstacksave
566
567 #define newptrb frame->Xnewptrb
568
569 /* When recursion is being used, local variables are allocated on the stack and
570 get preserved during recursion in the normal way. In this environment, fi and
571 i, and fc and c, can be the same variables. */
572
573 #else /* NO_RECURSE not defined */
574 #define fi i
575 #define fc c
576
577
578 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
579 const uschar *charptr; /* in small blocks of the code. My normal */
580 #endif /* style of coding would have declared */
581 const uschar *callpat; /* them within each of those blocks. */
582 const uschar *data; /* However, in order to accommodate the */
583 const uschar *next; /* version of this code that uses an */
584 USPTR pp; /* external "stack" implemented on the */
585 const uschar *prev; /* heap, it is easier to declare them all */
586 USPTR saved_eptr; /* here, so the declarations can be cut */
587 /* out in a block. The only declarations */
588 recursion_info new_recursive; /* within blocks below are for variables */
589 /* that do not have to be preserved over */
590 BOOL cur_is_word; /* a recursive call to RMATCH(). */
591 BOOL condition;
592 BOOL prev_is_word;
593
594 unsigned long int original_ims;
595
596 #ifdef SUPPORT_UCP
597 int prop_type;
598 int prop_value;
599 int prop_fail_result;
600 int prop_category;
601 int prop_chartype;
602 int prop_script;
603 int oclength;
604 uschar occhars[8];
605 #endif
606
607 int codelink;
608 int ctype;
609 int length;
610 int max;
611 int min;
612 int number;
613 int offset;
614 int op;
615 int save_capture_last;
616 int save_offset1, save_offset2, save_offset3;
617 int stacksave[REC_STACK_SAVE_MAX];
618
619 eptrblock newptrb;
620 #endif /* NO_RECURSE */
621
622 /* These statements are here to stop the compiler complaining about uninitialized
623 variables. */
624
625 #ifdef SUPPORT_UCP
626 prop_value = 0;
627 prop_fail_result = 0;
628 #endif
629
630
631 /* This label is used for tail recursion, which is used in a few cases even
632 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
633 used. Thanks to Ian Taylor for noticing this possibility and sending the
634 original patch. */
635
636 TAIL_RECURSE:
637
638 /* OK, now we can get on with the real code of the function. Recursive calls
639 are specified by the macro RMATCH and RRETURN is used to return. When
640 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
641 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
642 defined). However, RMATCH isn't like a function call because it's quite a
643 complicated macro. It has to be used in one particular way. This shouldn't,
644 however, impact performance when true recursion is being used. */
645
646 #ifdef SUPPORT_UTF8
647 utf8 = md->utf8; /* Local copy of the flag */
648 #else
649 utf8 = FALSE;
650 #endif
651
652 /* First check that we haven't called match() too many times, or that we
653 haven't exceeded the recursive call limit. */
654
655 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
656 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
657
658 original_ims = ims; /* Save for resetting on ')' */
659
660 /* At the start of a group with an unlimited repeat that may match an empty
661 string, the match_cbegroup flag is set. When this is the case, add the current
662 subject pointer to the chain of such remembered pointers, to be checked when we
663 hit the closing ket, in order to break infinite loops that match no characters.
664 When match() is called in other circumstances, don't add to the chain. The
665 match_cbegroup flag must NOT be used with tail recursion, because the memory
666 block that is used is on the stack, so a new one may be required for each
667 match(). */
668
669 if ((flags & match_cbegroup) != 0)
670 {
671 newptrb.epb_saved_eptr = eptr;
672 newptrb.epb_prev = eptrb;
673 eptrb = &newptrb;
674 }
675
676 /* Now start processing the opcodes. */
677
678 for (;;)
679 {
680 minimize = possessive = FALSE;
681 op = *ecode;
682
683 switch(op)
684 {
685 case OP_MARK:
686 markptr = ecode + 2;
687 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
688 ims, eptrb, flags, RM55);
689
690 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
691 argument, and we must check whether that argument matches this MARK's
692 argument. It is passed back in md->start_match_ptr (an overloading of that
693 variable). If it does match, we reset that variable to the current subject
694 position and return MATCH_SKIP. Otherwise, pass back the return code
695 unaltered. */
696
697 if (rrc == MATCH_SKIP_ARG &&
698 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
699 {
700 md->start_match_ptr = eptr;
701 RRETURN(MATCH_SKIP);
702 }
703
704 if (md->mark == NULL) md->mark = markptr;
705 RRETURN(rrc);
706
707 case OP_FAIL:
708 MRRETURN(MATCH_NOMATCH);
709
710 case OP_COMMIT:
711 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
712 ims, eptrb, flags, RM52);
713 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
714 MRRETURN(MATCH_COMMIT);
715
716 case OP_PRUNE:
717 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
718 ims, eptrb, flags, RM51);
719 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
720 MRRETURN(MATCH_PRUNE);
721
722 case OP_PRUNE_ARG:
723 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
724 ims, eptrb, flags, RM56);
725 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
726 md->mark = ecode + 2;
727 RRETURN(MATCH_PRUNE);
728
729 case OP_SKIP:
730 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
731 ims, eptrb, flags, RM53);
732 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
733 md->start_match_ptr = eptr; /* Pass back current position */
734 MRRETURN(MATCH_SKIP);
735
736 case OP_SKIP_ARG:
737 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
738 ims, eptrb, flags, RM57);
739 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
740
741 /* Pass back the current skip name by overloading md->start_match_ptr and
742 returning the special MATCH_SKIP_ARG return code. This will either be
743 caught by a matching MARK, or get to the top, where it is treated the same
744 as PRUNE. */
745
746 md->start_match_ptr = ecode + 2;
747 RRETURN(MATCH_SKIP_ARG);
748
749 case OP_THEN:
750 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
751 ims, eptrb, flags, RM54);
752 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
753 MRRETURN(MATCH_THEN);
754
755 case OP_THEN_ARG:
756 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 ims, eptrb, flags, RM58);
758 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
759 md->mark = ecode + 2;
760 RRETURN(MATCH_THEN);
761
762 /* Handle a capturing bracket. If there is space in the offset vector, save
763 the current subject position in the working slot at the top of the vector.
764 We mustn't change the current values of the data slot, because they may be
765 set from a previous iteration of this group, and be referred to by a
766 reference inside the group.
767
768 If the bracket fails to match, we need to restore this value and also the
769 values of the final offsets, in case they were set by a previous iteration
770 of the same bracket.
771
772 If there isn't enough space in the offset vector, treat this as if it were
773 a non-capturing bracket. Don't worry about setting the flag for the error
774 case here; that is handled in the code for KET. */
775
776 case OP_CBRA:
777 case OP_SCBRA:
778 number = GET2(ecode, 1+LINK_SIZE);
779 offset = number << 1;
780
781 #ifdef PCRE_DEBUG
782 printf("start bracket %d\n", number);
783 printf("subject=");
784 pchars(eptr, 16, TRUE, md);
785 printf("\n");
786 #endif
787
788 if (offset < md->offset_max)
789 {
790 save_offset1 = md->offset_vector[offset];
791 save_offset2 = md->offset_vector[offset+1];
792 save_offset3 = md->offset_vector[md->offset_end - number];
793 save_capture_last = md->capture_last;
794
795 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
796 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
797
798 flags = (op == OP_SCBRA)? match_cbegroup : 0;
799 do
800 {
801 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
802 ims, eptrb, flags, RM1);
803 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
804 md->capture_last = save_capture_last;
805 ecode += GET(ecode, 1);
806 }
807 while (*ecode == OP_ALT);
808
809 DPRINTF(("bracket %d failed\n", number));
810
811 md->offset_vector[offset] = save_offset1;
812 md->offset_vector[offset+1] = save_offset2;
813 md->offset_vector[md->offset_end - number] = save_offset3;
814
815 if (rrc != MATCH_THEN) md->mark = markptr;
816 RRETURN(MATCH_NOMATCH);
817 }
818
819 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
820 as a non-capturing bracket. */
821
822 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
823 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
824
825 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
826
827 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
828 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
829
830 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
831 final alternative within the brackets, we would return the result of a
832 recursive call to match() whatever happened. We can reduce stack usage by
833 turning this into a tail recursion, except in the case when match_cbegroup
834 is set.*/
835
836 case OP_BRA:
837 case OP_SBRA:
838 DPRINTF(("start non-capturing bracket\n"));
839 flags = (op >= OP_SBRA)? match_cbegroup : 0;
840 for (;;)
841 {
842 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
843 {
844 if (flags == 0) /* Not a possibly empty group */
845 {
846 ecode += _pcre_OP_lengths[*ecode];
847 DPRINTF(("bracket 0 tail recursion\n"));
848 goto TAIL_RECURSE;
849 }
850
851 /* Possibly empty group; can't use tail recursion. */
852
853 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
854 eptrb, flags, RM48);
855 if (rrc == MATCH_NOMATCH) md->mark = markptr;
856 RRETURN(rrc);
857 }
858
859 /* For non-final alternatives, continue the loop for a NOMATCH result;
860 otherwise return. */
861
862 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
863 eptrb, flags, RM2);
864 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
865 ecode += GET(ecode, 1);
866 }
867 /* Control never reaches here. */
868
869 /* Conditional group: compilation checked that there are no more than
870 two branches. If the condition is false, skipping the first branch takes us
871 past the end if there is only one branch, but that's OK because that is
872 exactly what going to the ket would do. As there is only one branch to be
873 obeyed, we can use tail recursion to avoid using another stack frame. */
874
875 case OP_COND:
876 case OP_SCOND:
877 codelink= GET(ecode, 1);
878
879 /* Because of the way auto-callout works during compile, a callout item is
880 inserted between OP_COND and an assertion condition. */
881
882 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
883 {
884 if (pcre_callout != NULL)
885 {
886 pcre_callout_block cb;
887 cb.version = 1; /* Version 1 of the callout block */
888 cb.callout_number = ecode[LINK_SIZE+2];
889 cb.offset_vector = md->offset_vector;
890 cb.subject = (PCRE_SPTR)md->start_subject;
891 cb.subject_length = md->end_subject - md->start_subject;
892 cb.start_match = mstart - md->start_subject;
893 cb.current_position = eptr - md->start_subject;
894 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
895 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
896 cb.capture_top = offset_top/2;
897 cb.capture_last = md->capture_last;
898 cb.callout_data = md->callout_data;
899 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
900 if (rrc < 0) RRETURN(rrc);
901 }
902 ecode += _pcre_OP_lengths[OP_CALLOUT];
903 }
904
905 condcode = ecode[LINK_SIZE+1];
906
907 /* Now see what the actual condition is */
908
909 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
910 {
911 if (md->recursive == NULL) /* Not recursing => FALSE */
912 {
913 condition = FALSE;
914 ecode += GET(ecode, 1);
915 }
916 else
917 {
918 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
919 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
920
921 /* If the test is for recursion into a specific subpattern, and it is
922 false, but the test was set up by name, scan the table to see if the
923 name refers to any other numbers, and test them. The condition is true
924 if any one is set. */
925
926 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
927 {
928 uschar *slotA = md->name_table;
929 for (i = 0; i < md->name_count; i++)
930 {
931 if (GET2(slotA, 0) == recno) break;
932 slotA += md->name_entry_size;
933 }
934
935 /* Found a name for the number - there can be only one; duplicate
936 names for different numbers are allowed, but not vice versa. First
937 scan down for duplicates. */
938
939 if (i < md->name_count)
940 {
941 uschar *slotB = slotA;
942 while (slotB > md->name_table)
943 {
944 slotB -= md->name_entry_size;
945 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
946 {
947 condition = GET2(slotB, 0) == md->recursive->group_num;
948 if (condition) break;
949 }
950 else break;
951 }
952
953 /* Scan up for duplicates */
954
955 if (!condition)
956 {
957 slotB = slotA;
958 for (i++; i < md->name_count; i++)
959 {
960 slotB += md->name_entry_size;
961 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
962 {
963 condition = GET2(slotB, 0) == md->recursive->group_num;
964 if (condition) break;
965 }
966 else break;
967 }
968 }
969 }
970 }
971
972 /* Choose branch according to the condition */
973
974 ecode += condition? 3 : GET(ecode, 1);
975 }
976 }
977
978 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
979 {
980 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
981 condition = offset < offset_top && md->offset_vector[offset] >= 0;
982
983 /* If the numbered capture is unset, but the reference was by name,
984 scan the table to see if the name refers to any other numbers, and test
985 them. The condition is true if any one is set. This is tediously similar
986 to the code above, but not close enough to try to amalgamate. */
987
988 if (!condition && condcode == OP_NCREF)
989 {
990 int refno = offset >> 1;
991 uschar *slotA = md->name_table;
992
993 for (i = 0; i < md->name_count; i++)
994 {
995 if (GET2(slotA, 0) == refno) break;
996 slotA += md->name_entry_size;
997 }
998
999 /* Found a name for the number - there can be only one; duplicate names
1000 for different numbers are allowed, but not vice versa. First scan down
1001 for duplicates. */
1002
1003 if (i < md->name_count)
1004 {
1005 uschar *slotB = slotA;
1006 while (slotB > md->name_table)
1007 {
1008 slotB -= md->name_entry_size;
1009 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1010 {
1011 offset = GET2(slotB, 0) << 1;
1012 condition = offset < offset_top &&
1013 md->offset_vector[offset] >= 0;
1014 if (condition) break;
1015 }
1016 else break;
1017 }
1018
1019 /* Scan up for duplicates */
1020
1021 if (!condition)
1022 {
1023 slotB = slotA;
1024 for (i++; i < md->name_count; i++)
1025 {
1026 slotB += md->name_entry_size;
1027 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1028 {
1029 offset = GET2(slotB, 0) << 1;
1030 condition = offset < offset_top &&
1031 md->offset_vector[offset] >= 0;
1032 if (condition) break;
1033 }
1034 else break;
1035 }
1036 }
1037 }
1038 }
1039
1040 /* Choose branch according to the condition */
1041
1042 ecode += condition? 3 : GET(ecode, 1);
1043 }
1044
1045 else if (condcode == OP_DEF) /* DEFINE - always false */
1046 {
1047 condition = FALSE;
1048 ecode += GET(ecode, 1);
1049 }
1050
1051 /* The condition is an assertion. Call match() to evaluate it - setting
1052 the final argument match_condassert causes it to stop at the end of an
1053 assertion. */
1054
1055 else
1056 {
1057 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
1058 match_condassert, RM3);
1059 if (rrc == MATCH_MATCH)
1060 {
1061 condition = TRUE;
1062 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1063 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1064 }
1065 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1066 {
1067 RRETURN(rrc); /* Need braces because of following else */
1068 }
1069 else
1070 {
1071 condition = FALSE;
1072 ecode += codelink;
1073 }
1074 }
1075
1076 /* We are now at the branch that is to be obeyed. As there is only one,
1077 we can use tail recursion to avoid using another stack frame, except when
1078 match_cbegroup is required for an unlimited repeat of a possibly empty
1079 group. If the second alternative doesn't exist, we can just plough on. */
1080
1081 if (condition || *ecode == OP_ALT)
1082 {
1083 ecode += 1 + LINK_SIZE;
1084 if (op == OP_SCOND) /* Possibly empty group */
1085 {
1086 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1087 RRETURN(rrc);
1088 }
1089 else /* Group must match something */
1090 {
1091 flags = 0;
1092 goto TAIL_RECURSE;
1093 }
1094 }
1095 else /* Condition false & no alternative */
1096 {
1097 ecode += 1 + LINK_SIZE;
1098 }
1099 break;
1100
1101
1102 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1103 to close any currently open capturing brackets. */
1104
1105 case OP_CLOSE:
1106 number = GET2(ecode, 1);
1107 offset = number << 1;
1108
1109 #ifdef PCRE_DEBUG
1110 printf("end bracket %d at *ACCEPT", number);
1111 printf("\n");
1112 #endif
1113
1114 md->capture_last = number;
1115 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1116 {
1117 md->offset_vector[offset] =
1118 md->offset_vector[md->offset_end - number];
1119 md->offset_vector[offset+1] = eptr - md->start_subject;
1120 if (offset_top <= offset) offset_top = offset + 2;
1121 }
1122 ecode += 3;
1123 break;
1124
1125
1126 /* End of the pattern, either real or forced. If we are in a top-level
1127 recursion, we should restore the offsets appropriately and continue from
1128 after the call. */
1129
1130 case OP_ACCEPT:
1131 case OP_END:
1132 if (md->recursive != NULL && md->recursive->group_num == 0)
1133 {
1134 recursion_info *rec = md->recursive;
1135 DPRINTF(("End of pattern in a (?0) recursion\n"));
1136 md->recursive = rec->prevrec;
1137 memmove(md->offset_vector, rec->offset_save,
1138 rec->saved_max * sizeof(int));
1139 offset_top = rec->save_offset_top;
1140 ims = original_ims;
1141 ecode = rec->after_call;
1142 break;
1143 }
1144
1145 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1146 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1147 the subject. In both cases, backtracking will then try other alternatives,
1148 if any. */
1149
1150 if (eptr == mstart &&
1151 (md->notempty ||
1152 (md->notempty_atstart &&
1153 mstart == md->start_subject + md->start_offset)))
1154 MRRETURN(MATCH_NOMATCH);
1155
1156 /* Otherwise, we have a match. */
1157
1158 md->end_match_ptr = eptr; /* Record where we ended */
1159 md->end_offset_top = offset_top; /* and how many extracts were taken */
1160 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1161
1162 /* For some reason, the macros don't work properly if an expression is
1163 given as the argument to MRRETURN when the heap is in use. */
1164
1165 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1166 MRRETURN(rrc);
1167
1168 /* Change option settings */
1169
1170 case OP_OPT:
1171 ims = ecode[1];
1172 ecode += 2;
1173 DPRINTF(("ims set to %02lx\n", ims));
1174 break;
1175
1176 /* Assertion brackets. Check the alternative branches in turn - the
1177 matching won't pass the KET for an assertion. If any one branch matches,
1178 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1179 start of each branch to move the current point backwards, so the code at
1180 this level is identical to the lookahead case. */
1181
1182 case OP_ASSERT:
1183 case OP_ASSERTBACK:
1184 do
1185 {
1186 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1187 RM4);
1188 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1189 {
1190 mstart = md->start_match_ptr; /* In case \K reset it */
1191 break;
1192 }
1193 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1194 ecode += GET(ecode, 1);
1195 }
1196 while (*ecode == OP_ALT);
1197 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1198
1199 /* If checking an assertion for a condition, return MATCH_MATCH. */
1200
1201 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1202
1203 /* Continue from after the assertion, updating the offsets high water
1204 mark, since extracts may have been taken during the assertion. */
1205
1206 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1207 ecode += 1 + LINK_SIZE;
1208 offset_top = md->end_offset_top;
1209 continue;
1210
1211 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1212 PRUNE, or COMMIT means we must assume failure without checking subsequent
1213 branches. */
1214
1215 case OP_ASSERT_NOT:
1216 case OP_ASSERTBACK_NOT:
1217 do
1218 {
1219 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1220 RM5);
1221 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1222 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1223 {
1224 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1225 break;
1226 }
1227 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1228 ecode += GET(ecode,1);
1229 }
1230 while (*ecode == OP_ALT);
1231
1232 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1233
1234 ecode += 1 + LINK_SIZE;
1235 continue;
1236
1237 /* Move the subject pointer back. This occurs only at the start of
1238 each branch of a lookbehind assertion. If we are too close to the start to
1239 move back, this match function fails. When working with UTF-8 we move
1240 back a number of characters, not bytes. */
1241
1242 case OP_REVERSE:
1243 #ifdef SUPPORT_UTF8
1244 if (utf8)
1245 {
1246 i = GET(ecode, 1);
1247 while (i-- > 0)
1248 {
1249 eptr--;
1250 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1251 BACKCHAR(eptr);
1252 }
1253 }
1254 else
1255 #endif
1256
1257 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1258
1259 {
1260 eptr -= GET(ecode, 1);
1261 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1262 }
1263
1264 /* Save the earliest consulted character, then skip to next op code */
1265
1266 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1267 ecode += 1 + LINK_SIZE;
1268 break;
1269
1270 /* The callout item calls an external function, if one is provided, passing
1271 details of the match so far. This is mainly for debugging, though the
1272 function is able to force a failure. */
1273
1274 case OP_CALLOUT:
1275 if (pcre_callout != NULL)
1276 {
1277 pcre_callout_block cb;
1278 cb.version = 1; /* Version 1 of the callout block */
1279 cb.callout_number = ecode[1];
1280 cb.offset_vector = md->offset_vector;
1281 cb.subject = (PCRE_SPTR)md->start_subject;
1282 cb.subject_length = md->end_subject - md->start_subject;
1283 cb.start_match = mstart - md->start_subject;
1284 cb.current_position = eptr - md->start_subject;
1285 cb.pattern_position = GET(ecode, 2);
1286 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1287 cb.capture_top = offset_top/2;
1288 cb.capture_last = md->capture_last;
1289 cb.callout_data = md->callout_data;
1290 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1291 if (rrc < 0) RRETURN(rrc);
1292 }
1293 ecode += 2 + 2*LINK_SIZE;
1294 break;
1295
1296 /* Recursion either matches the current regex, or some subexpression. The
1297 offset data is the offset to the starting bracket from the start of the
1298 whole pattern. (This is so that it works from duplicated subpatterns.)
1299
1300 If there are any capturing brackets started but not finished, we have to
1301 save their starting points and reinstate them after the recursion. However,
1302 we don't know how many such there are (offset_top records the completed
1303 total) so we just have to save all the potential data. There may be up to
1304 65535 such values, which is too large to put on the stack, but using malloc
1305 for small numbers seems expensive. As a compromise, the stack is used when
1306 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1307 is used. If the malloc fails, the saved data cannot be set up, so this
1308 call of match() gives up and returns PCRE_ERROR_NOMEMORY; no attempt is
1309 made to carry on with only part of the offset data saved.
1310
1311 There are also other values that have to be saved. We use a chained
1312 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1313 for the original version of this logic. */
1314
1315 case OP_RECURSE:
1316 {
1317 callpat = md->start_code + GET(ecode, 1);
1318 new_recursive.group_num = (callpat == md->start_code)? 0 :
1319 GET2(callpat, 1 + LINK_SIZE);
1320
1321 /* Add to "recursing stack" */
1322
1323 new_recursive.prevrec = md->recursive;
1324 md->recursive = &new_recursive;
1325
1326 /* Find where to continue from afterwards */
1327
1328 ecode += 1 + LINK_SIZE;
1329 new_recursive.after_call = ecode;
1330
1331 /* Now save the offset data. */
1332
1333 new_recursive.saved_max = md->offset_end;
1334 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1335 new_recursive.offset_save = stacksave;
1336 else
1337 {
1338 new_recursive.offset_save =
1339 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1340 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1341 }
1342
1343 memcpy(new_recursive.offset_save, md->offset_vector,
1344 new_recursive.saved_max * sizeof(int));
1345 new_recursive.save_offset_top = offset_top;
1346
1347 /* OK, now we can do the recursion. For each top-level alternative we
1348 restore the offset and recursion data. */
1349
1350 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1351 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1352 do
1353 {
1354 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1355 md, ims, eptrb, flags, RM6);
1356 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1357 {
1358 DPRINTF(("Recursion matched\n"));
1359 md->recursive = new_recursive.prevrec;
1360 if (new_recursive.offset_save != stacksave)
1361 (pcre_free)(new_recursive.offset_save);
1362 MRRETURN(MATCH_MATCH);
1363 }
1364 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1365 {
1366 DPRINTF(("Recursion gave error %d\n", rrc));
1367 if (new_recursive.offset_save != stacksave)
1368 (pcre_free)(new_recursive.offset_save);
1369 RRETURN(rrc);
1370 }
1371
1372 md->recursive = &new_recursive;
1373 memcpy(md->offset_vector, new_recursive.offset_save,
1374 new_recursive.saved_max * sizeof(int));
1375 callpat += GET(callpat, 1);
1376 }
1377 while (*callpat == OP_ALT);
1378
1379 DPRINTF(("Recursion didn't match\n"));
1380 md->recursive = new_recursive.prevrec;
1381 if (new_recursive.offset_save != stacksave)
1382 (pcre_free)(new_recursive.offset_save);
1383 MRRETURN(MATCH_NOMATCH);
1384 }
1385 /* Control never reaches here */
1386
1387 /* "Once" brackets are like assertion brackets except that after a match,
1388 the point in the subject string is not moved back. Thus there can never be
1389 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1390 Check the alternative branches in turn - the matching won't pass the KET
1391 for this kind of subpattern. If any one branch matches, we carry on as at
1392 the end of a normal bracket, leaving the subject pointer, but resetting
1393 the start-of-match value in case it was changed by \K. */
1394
1395 case OP_ONCE:
1396 prev = ecode;
1397 saved_eptr = eptr;
1398
1399 do
1400 {
1401 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1402 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1403 {
1404 mstart = md->start_match_ptr;
1405 break;
1406 }
1407 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1408 ecode += GET(ecode,1);
1409 }
1410 while (*ecode == OP_ALT);
1411
1412 /* If we hit the end of the group (which could be repeated), fail */
1413
1414 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1415
1416 /* Continue as from after the assertion, updating the offsets high water
1417 mark, since extracts may have been taken. */
1418
1419 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1420
1421 offset_top = md->end_offset_top;
1422 eptr = md->end_match_ptr;
1423
1424 /* For a non-repeating ket, just continue at this level. This also
1425 happens for a repeating ket if no characters were matched in the group.
1426 This is the forcible breaking of infinite loops as implemented in Perl
1427 5.005. If there is an options reset, it will get obeyed in the normal
1428 course of events. */
1429
1430 if (*ecode == OP_KET || eptr == saved_eptr)
1431 {
1432 ecode += 1+LINK_SIZE;
1433 break;
1434 }
1435
1436 /* The repeating kets try the rest of the pattern or restart from the
1437 preceding bracket, in the appropriate order. The second "call" of match()
1438 uses tail recursion, to avoid using another stack frame. We need to reset
1439 any options that changed within the bracket before re-running it, so
1440 check the next opcode. */
1441
1442 if (ecode[1+LINK_SIZE] == OP_OPT)
1443 {
1444 ims = (ims & ~PCRE_IMS) | ecode[4];
1445 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1446 }
1447
1448 if (*ecode == OP_KETRMIN)
1449 {
1450 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1451 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1452 ecode = prev;
1453 flags = 0;
1454 goto TAIL_RECURSE;
1455 }
1456 else /* OP_KETRMAX */
1457 {
1458 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1459 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1460 ecode += 1 + LINK_SIZE;
1461 flags = 0;
1462 goto TAIL_RECURSE;
1463 }
1464 /* Control never gets here */
1465
1466 /* An alternation is the end of a branch; scan along to find the end of the
1467 bracketed group and go to there. */
1468
1469 case OP_ALT:
1470 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1471 break;
1472
1473 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1474 indicating that it may occur zero times. It may repeat infinitely, or not
1475 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1476 with fixed upper repeat limits are compiled as a number of copies, with the
1477 optional ones preceded by BRAZERO or BRAMINZERO. */
1478
1479 case OP_BRAZERO:
1480 {
1481 next = ecode+1;
1482 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1483 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1484 do next += GET(next,1); while (*next == OP_ALT);
1485 ecode = next + 1 + LINK_SIZE;
1486 }
1487 break;
1488
1489 case OP_BRAMINZERO:
1490 {
1491 next = ecode+1;
1492 do next += GET(next, 1); while (*next == OP_ALT);
1493 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1494 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1495 ecode++;
1496 }
1497 break;
1498
1499 case OP_SKIPZERO:
1500 {
1501 next = ecode+1;
1502 do next += GET(next,1); while (*next == OP_ALT);
1503 ecode = next + 1 + LINK_SIZE;
1504 }
1505 break;
1506
1507 /* End of a group, repeated or non-repeating. */
1508
1509 case OP_KET:
1510 case OP_KETRMIN:
1511 case OP_KETRMAX:
1512 prev = ecode - GET(ecode, 1);
1513
1514 /* If this was a group that remembered the subject start, in order to break
1515 infinite repeats of empty string matches, retrieve the subject start from
1516 the chain. Otherwise, set it NULL. */
1517
1518 if (*prev >= OP_SBRA)
1519 {
1520 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1521 eptrb = eptrb->epb_prev; /* Backup to previous group */
1522 }
1523 else saved_eptr = NULL;
1524
1525 /* If we are at the end of an assertion group or an atomic group, stop
1526 matching and return MATCH_MATCH, but record the current high water mark for
1527 use by positive assertions. We also need to record the match start in case
1528 it was changed by \K. */
1529
1530 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1531 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1532 *prev == OP_ONCE)
1533 {
1534 md->end_match_ptr = eptr; /* For ONCE */
1535 md->end_offset_top = offset_top;
1536 md->start_match_ptr = mstart;
1537 MRRETURN(MATCH_MATCH);
1538 }
1539
1540 /* For capturing groups we have to check the group number back at the start
1541 and if necessary complete handling an extraction by setting the offsets and
1542 bumping the high water mark. Note that whole-pattern recursion is coded as
1543 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1544 when the OP_END is reached. Other recursion is handled here. */
1545
1546 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1547 {
1548 number = GET2(prev, 1+LINK_SIZE);
1549 offset = number << 1;
1550
1551 #ifdef PCRE_DEBUG
1552 printf("end bracket %d", number);
1553 printf("\n");
1554 #endif
1555
1556 md->capture_last = number;
1557 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1558 {
1559 md->offset_vector[offset] =
1560 md->offset_vector[md->offset_end - number];
1561 md->offset_vector[offset+1] = eptr - md->start_subject;
1562 if (offset_top <= offset) offset_top = offset + 2;
1563 }
1564
1565 /* Handle a recursively called group. Restore the offsets
1566 appropriately and continue from after the call. */
1567
1568 if (md->recursive != NULL && md->recursive->group_num == number)
1569 {
1570 recursion_info *rec = md->recursive;
1571 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1572 md->recursive = rec->prevrec;
1573 memcpy(md->offset_vector, rec->offset_save,
1574 rec->saved_max * sizeof(int));
1575 offset_top = rec->save_offset_top;
1576 ecode = rec->after_call;
1577 ims = original_ims;
1578 break;
1579 }
1580 }
1581
1582 /* For both capturing and non-capturing groups, reset the value of the ims
1583 flags, in case they got changed during the group. */
1584
1585 ims = original_ims;
1586 DPRINTF(("ims reset to %02lx\n", ims));
1587
1588 /* For a non-repeating ket, just continue at this level. This also
1589 happens for a repeating ket if no characters were matched in the group.
1590 This is the forcible breaking of infinite loops as implemented in Perl
1591 5.005. If there is an options reset, it will get obeyed in the normal
1592 course of events. */
1593
1594 if (*ecode == OP_KET || eptr == saved_eptr)
1595 {
1596 ecode += 1 + LINK_SIZE;
1597 break;
1598 }
1599
1600 /* The repeating kets try the rest of the pattern or restart from the
1601 preceding bracket, in the appropriate order. In the second case, we can use
1602 tail recursion to avoid using another stack frame, unless we have an
1603 unlimited repeat of a group that can match an empty string. */
1604
1605 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1606
1607 if (*ecode == OP_KETRMIN)
1608 {
1609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1610 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1611 if (flags != 0) /* Could match an empty string */
1612 {
1613 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1614 RRETURN(rrc);
1615 }
1616 ecode = prev;
1617 goto TAIL_RECURSE;
1618 }
1619 else /* OP_KETRMAX */
1620 {
1621 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1622 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1623 ecode += 1 + LINK_SIZE;
1624 flags = 0;
1625 goto TAIL_RECURSE;
1626 }
1627 /* Control never gets here */
1628
1629 /* Start of subject unless notbol, or after internal newline if multiline */
1630
1631 case OP_CIRC:
1632 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1633 if ((ims & PCRE_MULTILINE) != 0)
1634 {
1635 if (eptr != md->start_subject &&
1636 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1637 MRRETURN(MATCH_NOMATCH);
1638 ecode++;
1639 break;
1640 }
1641 /* ... else fall through */
1642
1643 /* Start of subject assertion */
1644
1645 case OP_SOD:
1646 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1647 ecode++;
1648 break;
1649
1650 /* Start of match assertion */
1651
1652 case OP_SOM:
1653 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1654 ecode++;
1655 break;
1656
1657 /* Reset the start of match point */
1658
1659 case OP_SET_SOM:
1660 mstart = eptr;
1661 ecode++;
1662 break;
1663
1664 /* Assert before internal newline if multiline, or before a terminating
1665 newline unless endonly is set, else end of subject unless noteol is set. */
1666
1667 case OP_DOLL:
1668 if ((ims & PCRE_MULTILINE) != 0)
1669 {
1670 if (eptr < md->end_subject)
1671 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1672 else
1673 { if (md->noteol) MRRETURN(MATCH_NOMATCH); }
1674 ecode++;
1675 break;
1676 }
1677 else
1678 {
1679 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1680 if (!md->endonly)
1681 {
1682 if (eptr != md->end_subject &&
1683 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1684 MRRETURN(MATCH_NOMATCH);
1685 ecode++;
1686 break;
1687 }
1688 }
1689 /* ... else fall through for endonly */
1690
1691 /* End of subject assertion (\z) */
1692
1693 case OP_EOD:
1694 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1695 ecode++;
1696 break;
1697
1698 /* End of subject or ending \n assertion (\Z) */
1699
1700 case OP_EODN:
1701 if (eptr != md->end_subject &&
1702 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1703 MRRETURN(MATCH_NOMATCH);
1704 ecode++;
1705 break;
1706
1707 /* Word boundary assertions */
1708
1709 case OP_NOT_WORD_BOUNDARY:
1710 case OP_WORD_BOUNDARY:
1711 {
1712
1713 /* Find out if the previous and current characters are "word" characters.
1714 It takes a bit more work in UTF-8 mode. Unless Unicode properties are in use
1715 (md->use_ucp), characters > 255 are assumed to be "non-word" characters.
1716 Remember the earliest consulted character for partial matching. */
1717
1718 #ifdef SUPPORT_UTF8
1719 if (utf8)
1720 {
1721 /* Get status of previous character */
1722
1723 if (eptr == md->start_subject) prev_is_word = FALSE; else
1724 {
1725 USPTR lastptr = eptr - 1;
1726 while((*lastptr & 0xc0) == 0x80) lastptr--;
1727 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1728 GETCHAR(c, lastptr);
1729 #ifdef SUPPORT_UCP
1730 if (md->use_ucp)
1731 {
1732 if (c == '_') prev_is_word = TRUE; else
1733 {
1734 int cat = UCD_CATEGORY(c);
1735 prev_is_word = (cat == ucp_L || cat == ucp_N);
1736 }
1737 }
1738 else
1739 #endif
1740 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1741 }
1742
1743 /* Get status of next character */
1744
1745 if (eptr >= md->end_subject)
1746 {
1747 SCHECK_PARTIAL();
1748 cur_is_word = FALSE;
1749 }
1750 else
1751 {
1752 GETCHAR(c, eptr);
1753 #ifdef SUPPORT_UCP
1754 if (md->use_ucp)
1755 {
1756 if (c == '_') cur_is_word = TRUE; else
1757 {
1758 int cat = UCD_CATEGORY(c);
1759 cur_is_word = (cat == ucp_L || cat == ucp_N);
1760 }
1761 }
1762 else
1763 #endif
1764 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1765 }
1766 }
1767 else
1768 #endif
1769
1770 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1771 consistency with the behaviour of \w we do use it in this case. */
1772
1773 {
1774 /* Get status of previous character */
1775
1776 if (eptr == md->start_subject) prev_is_word = FALSE; else
1777 {
1778 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1779 #ifdef SUPPORT_UCP
1780 if (md->use_ucp)
1781 {
1782 c = eptr[-1];
1783 if (c == '_') prev_is_word = TRUE; else
1784 {
1785 int cat = UCD_CATEGORY(c);
1786 prev_is_word = (cat == ucp_L || cat == ucp_N);
1787 }
1788 }
1789 else
1790 #endif
1791 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1792 }
1793
1794 /* Get status of next character */
1795
1796 if (eptr >= md->end_subject)
1797 {
1798 SCHECK_PARTIAL();
1799 cur_is_word = FALSE;
1800 }
1801 else
1802 #ifdef SUPPORT_UCP
1803 if (md->use_ucp)
1804 {
1805 c = *eptr;
1806 if (c == '_') cur_is_word = TRUE; else
1807 {
1808 int cat = UCD_CATEGORY(c);
1809 cur_is_word = (cat == ucp_L || cat == ucp_N);
1810 }
1811 }
1812 else
1813 #endif
1814 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1815 }
1816
1817 /* Now see if the situation is what we want */
1818
1819 if ((*ecode++ == OP_WORD_BOUNDARY)?
1820 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1821 MRRETURN(MATCH_NOMATCH);
1822 }
1823 break;
1824
1825 /* Match a single character type; inline for speed */
1826
1827 case OP_ANY:
1828 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1829 /* Fall through */
1830
1831 case OP_ALLANY:
1832 if (eptr++ >= md->end_subject)
1833 {
1834 SCHECK_PARTIAL();
1835 MRRETURN(MATCH_NOMATCH);
1836 }
1837 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1838 ecode++;
1839 break;
1840
1841 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1842 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1843
1844 case OP_ANYBYTE:
1845 if (eptr++ >= md->end_subject)
1846 {
1847 SCHECK_PARTIAL();
1848 MRRETURN(MATCH_NOMATCH);
1849 }
1850 ecode++;
1851 break;
1852
1853 case OP_NOT_DIGIT:
1854 if (eptr >= md->end_subject)
1855 {
1856 SCHECK_PARTIAL();
1857 MRRETURN(MATCH_NOMATCH);
1858 }
1859 GETCHARINCTEST(c, eptr);
1860 if (
1861 #ifdef SUPPORT_UTF8
1862 c < 256 &&
1863 #endif
1864 (md->ctypes[c] & ctype_digit) != 0
1865 )
1866 MRRETURN(MATCH_NOMATCH);
1867 ecode++;
1868 break;
1869
1870 case OP_DIGIT:
1871 if (eptr >= md->end_subject)
1872 {
1873 SCHECK_PARTIAL();
1874 MRRETURN(MATCH_NOMATCH);
1875 }
1876 GETCHARINCTEST(c, eptr);
1877 if (
1878 #ifdef SUPPORT_UTF8
1879 c >= 256 ||
1880 #endif
1881 (md->ctypes[c] & ctype_digit) == 0
1882 )
1883 MRRETURN(MATCH_NOMATCH);
1884 ecode++;
1885 break;
1886
1887 case OP_NOT_WHITESPACE:
1888 if (eptr >= md->end_subject)
1889 {
1890 SCHECK_PARTIAL();
1891 MRRETURN(MATCH_NOMATCH);
1892 }
1893 GETCHARINCTEST(c, eptr);
1894 if (
1895 #ifdef SUPPORT_UTF8
1896 c < 256 &&
1897 #endif
1898 (md->ctypes[c] & ctype_space) != 0
1899 )
1900 MRRETURN(MATCH_NOMATCH);
1901 ecode++;
1902 break;
1903
1904 case OP_WHITESPACE:
1905 if (eptr >= md->end_subject)
1906 {
1907 SCHECK_PARTIAL();
1908 MRRETURN(MATCH_NOMATCH);
1909 }
1910 GETCHARINCTEST(c, eptr);
1911 if (
1912 #ifdef SUPPORT_UTF8
1913 c >= 256 ||
1914 #endif
1915 (md->ctypes[c] & ctype_space) == 0
1916 )
1917 MRRETURN(MATCH_NOMATCH);
1918 ecode++;
1919 break;
1920
1921 case OP_NOT_WORDCHAR:
1922 if (eptr >= md->end_subject)
1923 {
1924 SCHECK_PARTIAL();
1925 MRRETURN(MATCH_NOMATCH);
1926 }
1927 GETCHARINCTEST(c, eptr);
1928 if (
1929 #ifdef SUPPORT_UTF8
1930 c < 256 &&
1931 #endif
1932 (md->ctypes[c] & ctype_word) != 0
1933 )
1934 MRRETURN(MATCH_NOMATCH);
1935 ecode++;
1936 break;
1937
1938 case OP_WORDCHAR:
1939 if (eptr >= md->end_subject)
1940 {
1941 SCHECK_PARTIAL();
1942 MRRETURN(MATCH_NOMATCH);
1943 }
1944 GETCHARINCTEST(c, eptr);
1945 if (
1946 #ifdef SUPPORT_UTF8
1947 c >= 256 ||
1948 #endif
1949 (md->ctypes[c] & ctype_word) == 0
1950 )
1951 MRRETURN(MATCH_NOMATCH);
1952 ecode++;
1953 break;
1954
1955 case OP_ANYNL:
1956 if (eptr >= md->end_subject)
1957 {
1958 SCHECK_PARTIAL();
1959 MRRETURN(MATCH_NOMATCH);
1960 }
1961 GETCHARINCTEST(c, eptr);
1962 switch(c)
1963 {
1964 default: MRRETURN(MATCH_NOMATCH);
1965 case 0x000d:
1966 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1967 break;
1968
1969 case 0x000a:
1970 break;
1971
1972 case 0x000b:
1973 case 0x000c:
1974 case 0x0085:
1975 case 0x2028:
1976 case 0x2029:
1977 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1978 break;
1979 }
1980 ecode++;
1981 break;
1982
1983 case OP_NOT_HSPACE:
1984 if (eptr >= md->end_subject)
1985 {
1986 SCHECK_PARTIAL();
1987 MRRETURN(MATCH_NOMATCH);
1988 }
1989 GETCHARINCTEST(c, eptr);
1990 switch(c)
1991 {
1992 default: break;
1993 case 0x09: /* HT */
1994 case 0x20: /* SPACE */
1995 case 0xa0: /* NBSP */
1996 case 0x1680: /* OGHAM SPACE MARK */
1997 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1998 case 0x2000: /* EN QUAD */
1999 case 0x2001: /* EM QUAD */
2000 case 0x2002: /* EN SPACE */
2001 case 0x2003: /* EM SPACE */
2002 case 0x2004: /* THREE-PER-EM SPACE */
2003 case 0x2005: /* FOUR-PER-EM SPACE */
2004 case 0x2006: /* SIX-PER-EM SPACE */
2005 case 0x2007: /* FIGURE SPACE */
2006 case 0x2008: /* PUNCTUATION SPACE */
2007 case 0x2009: /* THIN SPACE */
2008 case 0x200A: /* HAIR SPACE */
2009 case 0x202f: /* NARROW NO-BREAK SPACE */
2010 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2011 case 0x3000: /* IDEOGRAPHIC SPACE */
2012 MRRETURN(MATCH_NOMATCH);
2013 }
2014 ecode++;
2015 break;
2016
2017 case OP_HSPACE:
2018 if (eptr >= md->end_subject)
2019 {
2020 SCHECK_PARTIAL();
2021 MRRETURN(MATCH_NOMATCH);
2022 }
2023 GETCHARINCTEST(c, eptr);
2024 switch(c)
2025 {
2026 default: MRRETURN(MATCH_NOMATCH);
2027 case 0x09: /* HT */
2028 case 0x20: /* SPACE */
2029 case 0xa0: /* NBSP */
2030 case 0x1680: /* OGHAM SPACE MARK */
2031 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2032 case 0x2000: /* EN QUAD */
2033 case 0x2001: /* EM QUAD */
2034 case 0x2002: /* EN SPACE */
2035 case 0x2003: /* EM SPACE */
2036 case 0x2004: /* THREE-PER-EM SPACE */
2037 case 0x2005: /* FOUR-PER-EM SPACE */
2038 case 0x2006: /* SIX-PER-EM SPACE */
2039 case 0x2007: /* FIGURE SPACE */
2040 case 0x2008: /* PUNCTUATION SPACE */
2041 case 0x2009: /* THIN SPACE */
2042 case 0x200A: /* HAIR SPACE */
2043 case 0x202f: /* NARROW NO-BREAK SPACE */
2044 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2045 case 0x3000: /* IDEOGRAPHIC SPACE */
2046 break;
2047 }
2048 ecode++;
2049 break;
2050
2051 case OP_NOT_VSPACE:
2052 if (eptr >= md->end_subject)
2053 {
2054 SCHECK_PARTIAL();
2055 MRRETURN(MATCH_NOMATCH);
2056 }
2057 GETCHARINCTEST(c, eptr);
2058 switch(c)
2059 {
2060 default: break;
2061 case 0x0a: /* LF */
2062 case 0x0b: /* VT */
2063 case 0x0c: /* FF */
2064 case 0x0d: /* CR */
2065 case 0x85: /* NEL */
2066 case 0x2028: /* LINE SEPARATOR */
2067 case 0x2029: /* PARAGRAPH SEPARATOR */
2068 MRRETURN(MATCH_NOMATCH);
2069 }
2070 ecode++;
2071 break;
2072
2073 case OP_VSPACE:
2074 if (eptr >= md->end_subject)
2075 {
2076 SCHECK_PARTIAL();
2077 MRRETURN(MATCH_NOMATCH);
2078 }
2079 GETCHARINCTEST(c, eptr);
2080 switch(c)
2081 {
2082 default: MRRETURN(MATCH_NOMATCH);
2083 case 0x0a: /* LF */
2084 case 0x0b: /* VT */
2085 case 0x0c: /* FF */
2086 case 0x0d: /* CR */
2087 case 0x85: /* NEL */
2088 case 0x2028: /* LINE SEPARATOR */
2089 case 0x2029: /* PARAGRAPH SEPARATOR */
2090 break;
2091 }
2092 ecode++;
2093 break;
2094
2095 #ifdef SUPPORT_UCP
2096 /* Check the next character by Unicode property. We will get here only
2097 if the support is in the binary; otherwise a compile-time error occurs. */
2098
2099 case OP_PROP:
2100 case OP_NOTPROP:
2101 if (eptr >= md->end_subject)
2102 {
2103 SCHECK_PARTIAL();
2104 MRRETURN(MATCH_NOMATCH);
2105 }
2106 GETCHARINCTEST(c, eptr);
2107 {
2108 const ucd_record *prop = GET_UCD(c);
2109
2110 switch(ecode[1])
2111 {
2112 case PT_ANY:
2113 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2114 break;
2115
2116 case PT_LAMP:
2117 if ((prop->chartype == ucp_Lu ||
2118 prop->chartype == ucp_Ll ||
2119 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2120 MRRETURN(MATCH_NOMATCH);
2121 break;
2122
2123 case PT_GC:
2124 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2125 MRRETURN(MATCH_NOMATCH);
2126 break;
2127
2128 case PT_PC:
2129 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2130 MRRETURN(MATCH_NOMATCH);
2131 break;
2132
2133 case PT_SC:
2134 if ((ecode[2] != prop->script) == (op == OP_PROP))
2135 MRRETURN(MATCH_NOMATCH);
2136 break;
2137
2138 /* These are specials */
2139
2140 case PT_ALNUM:
2141 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2142 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2143 MRRETURN(MATCH_NOMATCH);
2144 break;
2145
2146 case PT_SPACE: /* Perl space */
2147 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2148 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2149 == (op == OP_NOTPROP))
2150 MRRETURN(MATCH_NOMATCH);
2151 break;
2152
2153 case PT_PXSPACE: /* POSIX space */
2154 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2155 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2156 c == CHAR_FF || c == CHAR_CR)
2157 == (op == OP_NOTPROP))
2158 MRRETURN(MATCH_NOMATCH);
2159 break;
2160
2161 case PT_WORD:
2162 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2163 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2164 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2165 MRRETURN(MATCH_NOMATCH);
2166 break;
2167
2168 /* This should never occur */
2169
2170 default:
2171 RRETURN(PCRE_ERROR_INTERNAL);
2172 }
2173
2174 ecode += 3;
2175 }
2176 break;
2177
2178 /* Match an extended Unicode sequence. We will get here only if the support
2179 is in the binary; otherwise a compile-time error occurs. */
2180
2181 case OP_EXTUNI:
2182 if (eptr >= md->end_subject)
2183 {
2184 SCHECK_PARTIAL();
2185 MRRETURN(MATCH_NOMATCH);
2186 }
2187 GETCHARINCTEST(c, eptr);
2188 {
2189 int category = UCD_CATEGORY(c);
2190 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2191 while (eptr < md->end_subject)
2192 {
2193 int len = 1;
2194 if (!utf8) c = *eptr; else
2195 {
2196 GETCHARLEN(c, eptr, len);
2197 }
2198 category = UCD_CATEGORY(c);
2199 if (category != ucp_M) break;
2200 eptr += len;
2201 }
2202 }
2203 ecode++;
2204 break;
2205 #endif
2206
2207
2208 /* Match a back reference, possibly repeatedly. Look past the end of the
2209 item to see if there is repeat information following. The code is similar
2210 to that for character classes, but repeated for efficiency. Then obey
2211 similar code to character type repeats - written out again for speed.
2212 However, if the referenced string is the empty string, always treat
2213 it as matched, any number of times (otherwise there could be infinite
2214 loops). */
2215
2216 case OP_REF:
2217 {
2218 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2219 ecode += 3;
2220
2221 /* If the reference is unset, there are two possibilities:
2222
2223 (a) In the default, Perl-compatible state, set the length to be longer
2224 than the amount of subject left; this ensures that every attempt at a
2225 match fails. We can't just fail here, because of the possibility of
2226 quantifiers with zero minima.
2227
2228 (b) If the JavaScript compatibility flag is set, set the length to zero
2229 so that the back reference matches an empty string.
2230
2231 Otherwise, set the length to the length of what was matched by the
2232 referenced subpattern. */
2233
2234 if (offset >= offset_top || md->offset_vector[offset] < 0)
2235 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
2236 else
2237 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2238
2239 /* Set up for repetition, or handle the non-repeated case */
2240
2241 switch (*ecode)
2242 {
2243 case OP_CRSTAR:
2244 case OP_CRMINSTAR:
2245 case OP_CRPLUS:
2246 case OP_CRMINPLUS:
2247 case OP_CRQUERY:
2248 case OP_CRMINQUERY:
2249 c = *ecode++ - OP_CRSTAR;
2250 minimize = (c & 1) != 0;
2251 min = rep_min[c]; /* Pick up values from tables; */
2252 max = rep_max[c]; /* zero for max => infinity */
2253 if (max == 0) max = INT_MAX;
2254 break;
2255
2256 case OP_CRRANGE:
2257 case OP_CRMINRANGE:
2258 minimize = (*ecode == OP_CRMINRANGE);
2259 min = GET2(ecode, 1);
2260 max = GET2(ecode, 3);
2261 if (max == 0) max = INT_MAX;
2262 ecode += 5;
2263 break;
2264
2265 default: /* No repeat follows */
2266 if (!match_ref(offset, eptr, length, md, ims))
2267 {
2268 CHECK_PARTIAL();
2269 MRRETURN(MATCH_NOMATCH);
2270 }
2271 eptr += length;
2272 continue; /* With the main loop */
2273 }
2274
2275 /* If the length of the reference is zero, just continue with the
2276 main loop. */
2277
2278 if (length == 0) continue;
2279
2280 /* First, ensure the minimum number of matches are present. We get back
2281 the length of the reference string explicitly rather than passing the
2282 address of eptr, so that eptr can be a register variable. */
2283
2284 for (i = 1; i <= min; i++)
2285 {
2286 if (!match_ref(offset, eptr, length, md, ims))
2287 {
2288 CHECK_PARTIAL();
2289 MRRETURN(MATCH_NOMATCH);
2290 }
2291 eptr += length;
2292 }
2293
2294 /* If min = max, continue at the same level without recursion.
2295 They are not both allowed to be zero. */
2296
2297 if (min == max) continue;
2298
2299 /* If minimizing, keep trying and advancing the pointer */
2300
2301 if (minimize)
2302 {
2303 for (fi = min;; fi++)
2304 {
2305 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2306 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2307 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2308 if (!match_ref(offset, eptr, length, md, ims))
2309 {
2310 CHECK_PARTIAL();
2311 MRRETURN(MATCH_NOMATCH);
2312 }
2313 eptr += length;
2314 }
2315 /* Control never gets here */
2316 }
2317
2318 /* If maximizing, find the longest string and work backwards */
2319
2320 else
2321 {
2322 pp = eptr;
2323 for (i = min; i < max; i++)
2324 {
2325 if (!match_ref(offset, eptr, length, md, ims))
2326 {
2327 CHECK_PARTIAL();
2328 break;
2329 }
2330 eptr += length;
2331 }
2332 while (eptr >= pp)
2333 {
2334 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2335 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2336 eptr -= length;
2337 }
2338 MRRETURN(MATCH_NOMATCH);
2339 }
2340 }
2341 /* Control never gets here */
2342
2343 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2344 used when all the characters in the class have values in the range 0-255,
2345 and either the matching is caseful, or the characters are in the range
2346 0-127 when UTF-8 processing is enabled. The only difference between
2347 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2348 encountered.
2349
2350 First, look past the end of the item to see if there is repeat information
2351 following. Then obey similar code to character type repeats - written out
2352 again for speed. */
2353
2354 case OP_NCLASS:
2355 case OP_CLASS:
2356 {
2357 data = ecode + 1; /* Save for matching */
2358 ecode += 33; /* Advance past the item */
2359
2360 switch (*ecode)
2361 {
2362 case OP_CRSTAR:
2363 case OP_CRMINSTAR:
2364 case OP_CRPLUS:
2365 case OP_CRMINPLUS:
2366 case OP_CRQUERY:
2367 case OP_CRMINQUERY:
2368 c = *ecode++ - OP_CRSTAR;
2369 minimize = (c & 1) != 0;
2370 min = rep_min[c]; /* Pick up values from tables; */
2371 max = rep_max[c]; /* zero for max => infinity */
2372 if (max == 0) max = INT_MAX;
2373 break;
2374
2375 case OP_CRRANGE:
2376 case OP_CRMINRANGE:
2377 minimize = (*ecode == OP_CRMINRANGE);
2378 min = GET2(ecode, 1);
2379 max = GET2(ecode, 3);
2380 if (max == 0) max = INT_MAX;
2381 ecode += 5;
2382 break;
2383
2384 default: /* No repeat follows */
2385 min = max = 1;
2386 break;
2387 }
2388
2389 /* First, ensure the minimum number of matches are present. */
2390
2391 #ifdef SUPPORT_UTF8
2392 /* UTF-8 mode */
2393 if (utf8)
2394 {
2395 for (i = 1; i <= min; i++)
2396 {
2397 if (eptr >= md->end_subject)
2398 {
2399 SCHECK_PARTIAL();
2400 MRRETURN(MATCH_NOMATCH);
2401 }
2402 GETCHARINC(c, eptr);
2403 if (c > 255)
2404 {
2405 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2406 }
2407 else
2408 {
2409 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2410 }
2411 }
2412 }
2413 else
2414 #endif
2415 /* Not UTF-8 mode */
2416 {
2417 for (i = 1; i <= min; i++)
2418 {
2419 if (eptr >= md->end_subject)
2420 {
2421 SCHECK_PARTIAL();
2422 MRRETURN(MATCH_NOMATCH);
2423 }
2424 c = *eptr++;
2425 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2426 }
2427 }
2428
2429 /* If max == min we can continue with the main loop without the
2430 need to recurse. */
2431
2432 if (min == max) continue;
2433
2434 /* If minimizing, keep testing the rest of the expression and advancing
2435 the pointer while it matches the class. */
2436
2437 if (minimize)
2438 {
2439 #ifdef SUPPORT_UTF8
2440 /* UTF-8 mode */
2441 if (utf8)
2442 {
2443 for (fi = min;; fi++)
2444 {
2445 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2446 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2447 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2448 if (eptr >= md->end_subject)
2449 {
2450 SCHECK_PARTIAL();
2451 MRRETURN(MATCH_NOMATCH);
2452 }
2453 GETCHARINC(c, eptr);
2454 if (c > 255)
2455 {
2456 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2457 }
2458 else
2459 {
2460 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2461 }
2462 }
2463 }
2464 else
2465 #endif
2466 /* Not UTF-8 mode */
2467 {
2468 for (fi = min;; fi++)
2469 {
2470 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2471 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2472 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2473 if (eptr >= md->end_subject)
2474 {
2475 SCHECK_PARTIAL();
2476 MRRETURN(MATCH_NOMATCH);
2477 }
2478 c = *eptr++;
2479 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2480 }
2481 }
2482 /* Control never gets here */
2483 }
2484
2485 /* If maximizing, find the longest possible run, then work backwards. */
2486
2487 else
2488 {
2489 pp = eptr;
2490
2491 #ifdef SUPPORT_UTF8
2492 /* UTF-8 mode */
2493 if (utf8)
2494 {
2495 for (i = min; i < max; i++)
2496 {
2497 int len = 1;
2498 if (eptr >= md->end_subject)
2499 {
2500 SCHECK_PARTIAL();
2501 break;
2502 }
2503 GETCHARLEN(c, eptr, len);
2504 if (c > 255)
2505 {
2506 if (op == OP_CLASS) break;
2507 }
2508 else
2509 {
2510 if ((data[c/8] & (1 << (c&7))) == 0) break;
2511 }
2512 eptr += len;
2513 }
2514 for (;;)
2515 {
2516 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2517 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2518 if (eptr-- == pp) break; /* Stop if tried at original pos */
2519 BACKCHAR(eptr);
2520 }
2521 }
2522 else
2523 #endif
2524 /* Not UTF-8 mode */
2525 {
2526 for (i = min; i < max; i++)
2527 {
2528 if (eptr >= md->end_subject)
2529 {
2530 SCHECK_PARTIAL();
2531 break;
2532 }
2533 c = *eptr;
2534 if ((data[c/8] & (1 << (c&7))) == 0) break;
2535 eptr++;
2536 }
2537 while (eptr >= pp)
2538 {
2539 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2540 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2541 eptr--;
2542 }
2543 }
2544
2545 MRRETURN(MATCH_NOMATCH);
2546 }
2547 }
2548 /* Control never gets here */
2549
2550
2551 /* Match an extended character class. This opcode is encountered only
2552 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2553 mode, because Unicode properties are supported in non-UTF-8 mode. */
2554
2555 #ifdef SUPPORT_UTF8
2556 case OP_XCLASS:
2557 {
2558 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2559 ecode += GET(ecode, 1); /* Advance past the item */
2560
2561 switch (*ecode)
2562 {
2563 case OP_CRSTAR:
2564 case OP_CRMINSTAR:
2565 case OP_CRPLUS:
2566 case OP_CRMINPLUS:
2567 case OP_CRQUERY:
2568 case OP_CRMINQUERY:
2569 c = *ecode++ - OP_CRSTAR;
2570 minimize = (c & 1) != 0;
2571 min = rep_min[c]; /* Pick up values from tables; */
2572 max = rep_max[c]; /* zero for max => infinity */
2573 if (max == 0) max = INT_MAX;
2574 break;
2575
2576 case OP_CRRANGE:
2577 case OP_CRMINRANGE:
2578 minimize = (*ecode == OP_CRMINRANGE);
2579 min = GET2(ecode, 1);
2580 max = GET2(ecode, 3);
2581 if (max == 0) max = INT_MAX;
2582 ecode += 5;
2583 break;
2584
2585 default: /* No repeat follows */
2586 min = max = 1;
2587 break;
2588 }
2589
2590 /* First, ensure the minimum number of matches are present. */
2591
2592 for (i = 1; i <= min; i++)
2593 {
2594 if (eptr >= md->end_subject)
2595 {
2596 SCHECK_PARTIAL();
2597 MRRETURN(MATCH_NOMATCH);
2598 }
2599 GETCHARINCTEST(c, eptr);
2600 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2601 }
2602
2603 /* If max == min we can continue with the main loop without the
2604 need to recurse. */
2605
2606 if (min == max) continue;
2607
2608 /* If minimizing, keep testing the rest of the expression and advancing
2609 the pointer while it matches the class. */
2610
2611 if (minimize)
2612 {
2613 for (fi = min;; fi++)
2614 {
2615 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2616 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2617 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2618 if (eptr >= md->end_subject)
2619 {
2620 SCHECK_PARTIAL();
2621 MRRETURN(MATCH_NOMATCH);
2622 }
2623 GETCHARINCTEST(c, eptr);
2624 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2625 }
2626 /* Control never gets here */
2627 }
2628
2629 /* If maximizing, find the longest possible run, then work backwards. */
2630
2631 else
2632 {
2633 pp = eptr;
2634 for (i = min; i < max; i++)
2635 {
2636 int len = 1;
2637 if (eptr >= md->end_subject)
2638 {
2639 SCHECK_PARTIAL();
2640 break;
2641 }
2642 GETCHARLENTEST(c, eptr, len);
2643 if (!_pcre_xclass(c, data)) break;
2644 eptr += len;
2645 }
2646 for(;;)
2647 {
2648 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2649 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2650 if (eptr-- == pp) break; /* Stop if tried at original pos */
2651 if (utf8) BACKCHAR(eptr);
2652 }
2653 MRRETURN(MATCH_NOMATCH);
2654 }
2655
2656 /* Control never gets here */
2657 }
2658 #endif /* End of XCLASS */
2659
2660 /* Match a single character, casefully */
2661
2662 case OP_CHAR:
2663 #ifdef SUPPORT_UTF8
2664 if (utf8)
2665 {
2666 length = 1;
2667 ecode++;
2668 GETCHARLEN(fc, ecode, length);
2669 if (length > md->end_subject - eptr)
2670 {
2671 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2672 MRRETURN(MATCH_NOMATCH);
2673 }
2674 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2675 }
2676 else
2677 #endif
2678
2679 /* Non-UTF-8 mode */
2680 {
2681 if (md->end_subject - eptr < 1)
2682 {
2683 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2684 MRRETURN(MATCH_NOMATCH);
2685 }
2686 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2687 ecode += 2;
2688 }
2689 break;
2690
2691 /* Match a single character, caselessly */
2692
2693 case OP_CHARNC:
2694 #ifdef SUPPORT_UTF8
2695 if (utf8)
2696 {
2697 length = 1;
2698 ecode++;
2699 GETCHARLEN(fc, ecode, length);
2700
2701 if (length > md->end_subject - eptr)
2702 {
2703 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2704 MRRETURN(MATCH_NOMATCH);
2705 }
2706
2707 /* If the pattern character's value is < 128, we have only one byte, and
2708 can use the fast lookup table. */
2709
2710 if (fc < 128)
2711 {
2712 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2713 }
2714
2715 /* Otherwise we must pick up the subject character */
2716
2717 else
2718 {
2719 unsigned int dc;
2720 GETCHARINC(dc, eptr);
2721 ecode += length;
2722
2723 /* If we have Unicode property support, we can use it to test the other
2724 case of the character, if there is one. */
2725
2726 if (fc != dc)
2727 {
2728 #ifdef SUPPORT_UCP
2729 if (dc != UCD_OTHERCASE(fc))
2730 #endif
2731 MRRETURN(MATCH_NOMATCH);
2732 }
2733 }
2734 }
2735 else
2736 #endif /* SUPPORT_UTF8 */
2737
2738 /* Non-UTF-8 mode */
2739 {
2740 if (md->end_subject - eptr < 1)
2741 {
2742 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2743 MRRETURN(MATCH_NOMATCH);
2744 }
2745 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2746 ecode += 2;
2747 }
2748 break;
2749
2750 /* Match a single character repeatedly. */
2751
2752 case OP_EXACT:
2753 min = max = GET2(ecode, 1);
2754 ecode += 3;
2755 goto REPEATCHAR;
2756
2757 case OP_POSUPTO:
2758 possessive = TRUE;
2759 /* Fall through */
2760
2761 case OP_UPTO:
2762 case OP_MINUPTO:
2763 min = 0;
2764 max = GET2(ecode, 1);
2765 minimize = *ecode == OP_MINUPTO;
2766 ecode += 3;
2767 goto REPEATCHAR;
2768
2769 case OP_POSSTAR:
2770 possessive = TRUE;
2771 min = 0;
2772 max = INT_MAX;
2773 ecode++;
2774 goto REPEATCHAR;
2775
2776 case OP_POSPLUS:
2777 possessive = TRUE;
2778 min = 1;
2779 max = INT_MAX;
2780 ecode++;
2781 goto REPEATCHAR;
2782
2783 case OP_POSQUERY:
2784 possessive = TRUE;
2785 min = 0;
2786 max = 1;
2787 ecode++;
2788 goto REPEATCHAR;
2789
2790 case OP_STAR:
2791 case OP_MINSTAR:
2792 case OP_PLUS:
2793 case OP_MINPLUS:
2794 case OP_QUERY:
2795 case OP_MINQUERY:
2796 c = *ecode++ - OP_STAR;
2797 minimize = (c & 1) != 0;
2798
2799 min = rep_min[c]; /* Pick up values from tables; */
2800 max = rep_max[c]; /* zero for max => infinity */
2801 if (max == 0) max = INT_MAX;
2802
2803 /* Common code for all repeated single-character matches. */
2804
2805 REPEATCHAR:
2806 #ifdef SUPPORT_UTF8
2807 if (utf8)
2808 {
2809 length = 1;
2810 charptr = ecode;
2811 GETCHARLEN(fc, ecode, length);
2812 ecode += length;
2813
2814 /* Handle multibyte character matching specially here. There is
2815 support for caseless matching if UCP support is present. */
2816
2817 if (length > 1)
2818 {
2819 #ifdef SUPPORT_UCP
2820 unsigned int othercase;
2821 if ((ims & PCRE_CASELESS) != 0 &&
2822 (othercase = UCD_OTHERCASE(fc)) != fc)
2823 oclength = _pcre_ord2utf8(othercase, occhars);
2824 else oclength = 0;
2825 #endif /* SUPPORT_UCP */
2826
2827 for (i = 1; i <= min; i++)
2828 {
2829 if (eptr <= md->end_subject - length &&
2830 memcmp(eptr, charptr, length) == 0) eptr += length;
2831 #ifdef SUPPORT_UCP
2832 else if (oclength > 0 &&
2833 eptr <= md->end_subject - oclength &&
2834 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2835 #endif /* SUPPORT_UCP */
2836 else
2837 {
2838 CHECK_PARTIAL();
2839 MRRETURN(MATCH_NOMATCH);
2840 }
2841 }
2842
2843 if (min == max) continue;
2844
2845 if (minimize)
2846 {
2847 for (fi = min;; fi++)
2848 {
2849 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2850 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2851 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2852 if (eptr <= md->end_subject - length &&
2853 memcmp(eptr, charptr, length) == 0) eptr += length;
2854 #ifdef SUPPORT_UCP
2855 else if (oclength > 0 &&
2856 eptr <= md->end_subject - oclength &&
2857 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2858 #endif /* SUPPORT_UCP */
2859 else
2860 {
2861 CHECK_PARTIAL();
2862 MRRETURN(MATCH_NOMATCH);
2863 }
2864 }
2865 /* Control never gets here */
2866 }
2867
2868 else /* Maximize */
2869 {
2870 pp = eptr;
2871 for (i = min; i < max; i++)
2872 {
2873 if (eptr <= md->end_subject - length &&
2874 memcmp(eptr, charptr, length) == 0) eptr += length;
2875 #ifdef SUPPORT_UCP
2876 else if (oclength > 0 &&
2877 eptr <= md->end_subject - oclength &&
2878 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2879 #endif /* SUPPORT_UCP */
2880 else
2881 {
2882 CHECK_PARTIAL();
2883 break;
2884 }
2885 }
2886
2887 if (possessive) continue;
2888
2889 for(;;)
2890 {
2891 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2892 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2893 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
2894 #ifdef SUPPORT_UCP
2895 eptr--;
2896 BACKCHAR(eptr);
2897 #else /* without SUPPORT_UCP */
2898 eptr -= length;
2899 #endif /* SUPPORT_UCP */
2900 }
2901 }
2902 /* Control never gets here */
2903 }
2904
2905 /* If the length of a UTF-8 character is 1, we fall through here, and
2906 obey the code as for non-UTF-8 characters below, though in this case the
2907 value of fc will always be < 128. */
2908 }
2909 else
2910 #endif /* SUPPORT_UTF8 */
2911
2912 /* When not in UTF-8 mode, load a single-byte character. */
2913
2914 fc = *ecode++;
2915
2916 /* The value of fc at this point is always less than 256, though we may or
2917 may not be in UTF-8 mode. The code is duplicated for the caseless and
2918 caseful cases, for speed, since matching characters is likely to be quite
2919 common. First, ensure the minimum number of matches are present. If min =
2920 max, continue at the same level without recursing. Otherwise, if
2921 minimizing, keep trying the rest of the expression and advancing one
2922 matching character if failing, up to the maximum. Alternatively, if
2923 maximizing, find the maximum number of characters and work backwards. */
2924
2925 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2926 max, eptr));
2927
2928 if ((ims & PCRE_CASELESS) != 0)
2929 {
2930 fc = md->lcc[fc];
2931 for (i = 1; i <= min; i++)
2932 {
2933 if (eptr >= md->end_subject)
2934 {
2935 SCHECK_PARTIAL();
2936 MRRETURN(MATCH_NOMATCH);
2937 }
2938 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2939 }
2940 if (min == max) continue;
2941 if (minimize)
2942 {
2943 for (fi = min;; fi++)
2944 {
2945 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2946 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2947 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2948 if (eptr >= md->end_subject)
2949 {
2950 SCHECK_PARTIAL();
2951 MRRETURN(MATCH_NOMATCH);
2952 }
2953 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2954 }
2955 /* Control never gets here */
2956 }
2957 else /* Maximize */
2958 {
2959 pp = eptr;
2960 for (i = min; i < max; i++)
2961 {
2962 if (eptr >= md->end_subject)
2963 {
2964 SCHECK_PARTIAL();
2965 break;
2966 }
2967 if (fc != md->lcc[*eptr]) break;
2968 eptr++;
2969 }
2970
2971 if (possessive) continue;
2972
2973 while (eptr >= pp)
2974 {
2975 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2976 eptr--;
2977 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2978 }
2979 MRRETURN(MATCH_NOMATCH);
2980 }
2981 /* Control never gets here */
2982 }
2983
2984 /* Caseful comparisons (includes all multi-byte characters) */
2985
2986 else
2987 {
2988 for (i = 1; i <= min; i++)
2989 {
2990 if (eptr >= md->end_subject)
2991 {
2992 SCHECK_PARTIAL();
2993 MRRETURN(MATCH_NOMATCH);
2994 }
2995 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
2996 }
2997
2998 if (min == max) continue;
2999
3000 if (minimize)
3001 {
3002 for (fi = min;; fi++)
3003 {
3004 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
3005 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3006 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3007 if (eptr >= md->end_subject)
3008 {
3009 SCHECK_PARTIAL();
3010 MRRETURN(MATCH_NOMATCH);
3011 }
3012 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3013 }
3014 /* Control never gets here */
3015 }
3016 else /* Maximize */
3017 {
3018 pp = eptr;
3019 for (i = min; i < max; i++)
3020 {
3021 if (eptr >= md->end_subject)
3022 {
3023 SCHECK_PARTIAL();
3024 break;
3025 }
3026 if (fc != *eptr) break;
3027 eptr++;
3028 }
3029 if (possessive) continue;
3030
3031 while (eptr >= pp)
3032 {
3033 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
3034 eptr--;
3035 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3036 }
3037 MRRETURN(MATCH_NOMATCH);
3038 }
3039 }
3040 /* Control never gets here */
3041
3042 /* Match a negated single one-byte character. The character we are
3043 checking can be multibyte. */
3044
3045 case OP_NOT:
3046 if (eptr >= md->end_subject)
3047 {
3048 SCHECK_PARTIAL();
3049 MRRETURN(MATCH_NOMATCH);
3050 }
3051 ecode++;
3052 GETCHARINCTEST(c, eptr);
3053 if ((ims & PCRE_CASELESS) != 0)
3054 {
3055 #ifdef SUPPORT_UTF8
3056 if (c < 256)
3057 #endif
3058 c = md->lcc[c];
3059 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3060 }
3061 else
3062 {
3063 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3064 }
3065 break;
3066
3067 /* Match a negated single one-byte character repeatedly. This is almost a
3068 repeat of the code for a repeated single character, but I haven't found a
3069 nice way of commoning these up that doesn't require a test of the
3070 positive/negative option for each character match. Maybe that wouldn't add
3071 very much to the time taken, but character matching *is* what this is all
3072 about... */
3073
3074 case OP_NOTEXACT:
3075 min = max = GET2(ecode, 1);
3076 ecode += 3;
3077 goto REPEATNOTCHAR;
3078
3079 case OP_NOTUPTO:
3080 case OP_NOTMINUPTO:
3081 min = 0;
3082 max = GET2(ecode, 1);
3083 minimize = *ecode == OP_NOTMINUPTO;
3084 ecode += 3;
3085 goto REPEATNOTCHAR;
3086
3087 case OP_NOTPOSSTAR:
3088 possessive = TRUE;
3089 min = 0;
3090 max = INT_MAX;
3091 ecode++;
3092 goto REPEATNOTCHAR;
3093
3094 case OP_NOTPOSPLUS:
3095 possessive = TRUE;
3096 min = 1;
3097 max = INT_MAX;
3098 ecode++;
3099 goto REPEATNOTCHAR;
3100
3101 case OP_NOTPOSQUERY:
3102 possessive = TRUE;
3103 min = 0;
3104 max = 1;
3105 ecode++;
3106 goto REPEATNOTCHAR;
3107
3108 case OP_NOTPOSUPTO:
3109 possessive = TRUE;
3110 min = 0;
3111 max = GET2(ecode, 1);
3112 ecode += 3;
3113 goto REPEATNOTCHAR;
3114
3115 case OP_NOTSTAR:
3116 case OP_NOTMINSTAR:
3117 case OP_NOTPLUS:
3118 case OP_NOTMINPLUS:
3119 case OP_NOTQUERY:
3120 case OP_NOTMINQUERY:
3121 c = *ecode++ - OP_NOTSTAR;
3122 minimize = (c & 1) != 0;
3123 min = rep_min[c]; /* Pick up values from tables; */
3124 max = rep_max[c]; /* zero for max => infinity */
3125 if (max == 0) max = INT_MAX;
3126
3127 /* Common code for all repeated single-byte matches. */
3128
3129 REPEATNOTCHAR:
3130 fc = *ecode++;
3131
3132 /* The code is duplicated for the caseless and caseful cases, for speed,
3133 since matching characters is likely to be quite common. First, ensure the
3134 minimum number of matches are present. If min = max, continue at the same
3135 level without recursing. Otherwise, if minimizing, keep trying the rest of
3136 the expression and advancing one matching character if failing, up to the
3137 maximum. Alternatively, if maximizing, find the maximum number of
3138 characters and work backwards. */
3139
3140 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3141 max, eptr));
3142
3143 if ((ims & PCRE_CASELESS) != 0)
3144 {
3145 fc = md->lcc[fc];
3146
3147 #ifdef SUPPORT_UTF8
3148 /* UTF-8 mode */
3149 if (utf8)
3150 {
3151 register unsigned int d;
3152 for (i = 1; i <= min; i++)
3153 {
3154 if (eptr >= md->end_subject)
3155 {
3156 SCHECK_PARTIAL();
3157 MRRETURN(MATCH_NOMATCH);
3158 }
3159 GETCHARINC(d, eptr);
3160 if (d < 256) d = md->lcc[d];
3161 if (fc == d) MRRETURN(MATCH_NOMATCH);
3162 }
3163 }
3164 else
3165 #endif
3166
3167 /* Not UTF-8 mode */
3168 {
3169 for (i = 1; i <= min; i++)
3170 {
3171 if (eptr >= md->end_subject)
3172 {
3173 SCHECK_PARTIAL();
3174 MRRETURN(MATCH_NOMATCH);
3175 }
3176 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3177 }
3178 }
3179
3180 if (min == max) continue;
3181
3182 if (minimize)
3183 {
3184 #ifdef SUPPORT_UTF8
3185 /* UTF-8 mode */
3186 if (utf8)
3187 {
3188 register unsigned int d;
3189 for (fi = min;; fi++)
3190 {
3191 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3192 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3193 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3194 if (eptr >= md->end_subject)
3195 {
3196 SCHECK_PARTIAL();
3197 MRRETURN(MATCH_NOMATCH);
3198 }
3199 GETCHARINC(d, eptr);
3200 if (d < 256) d = md->lcc[d];
3201 if (fc == d) MRRETURN(MATCH_NOMATCH);
3202 }
3203 }
3204 else
3205 #endif
3206 /* Not UTF-8 mode */
3207 {
3208 for (fi = min;; fi++)
3209 {
3210 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3211 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3212 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3213 if (eptr >= md->end_subject)
3214 {
3215 SCHECK_PARTIAL();
3216 MRRETURN(MATCH_NOMATCH);
3217 }
3218 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3219 }
3220 }
3221 /* Control never gets here */
3222 }
3223
3224 /* Maximize case */
3225
3226 else
3227 {
3228 pp = eptr;
3229
3230 #ifdef SUPPORT_UTF8
3231 /* UTF-8 mode */
3232 if (utf8)
3233 {
3234 register unsigned int d;
3235 for (i = min; i < max; i++)
3236 {
3237 int len = 1;
3238 if (eptr >= md->end_subject)
3239 {
3240 SCHECK_PARTIAL();
3241 break;
3242 }
3243 GETCHARLEN(d, eptr, len);
3244 if (d < 256) d = md->lcc[d];
3245 if (fc == d) break;
3246 eptr += len;
3247 }
3248 if (possessive) continue;
3249 for(;;)
3250 {
3251 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3252 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3253 if (eptr-- == pp) break; /* Stop if tried at original pos */
3254 BACKCHAR(eptr);
3255 }
3256 }
3257 else
3258 #endif
3259 /* Not UTF-8 mode */
3260 {
3261 for (i = min; i < max; i++)
3262 {
3263 if (eptr >= md->end_subject)
3264 {
3265 SCHECK_PARTIAL();
3266 break;
3267 }
3268 if (fc == md->lcc[*eptr]) break;
3269 eptr++;
3270 }
3271 if (possessive) continue;
3272 while (eptr >= pp)
3273 {
3274 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3275 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3276 eptr--;
3277 }
3278 }
3279
3280 MRRETURN(MATCH_NOMATCH);
3281 }
3282 /* Control never gets here */
3283 }
3284
3285 /* Caseful comparisons */
3286
3287 else
3288 {
3289 #ifdef SUPPORT_UTF8
3290 /* UTF-8 mode */
3291 if (utf8)
3292 {
3293 register unsigned int d;
3294 for (i = 1; i <= min; i++)
3295 {
3296 if (eptr >= md->end_subject)
3297 {
3298 SCHECK_PARTIAL();
3299 MRRETURN(MATCH_NOMATCH);
3300 }
3301 GETCHARINC(d, eptr);
3302 if (fc == d) MRRETURN(MATCH_NOMATCH);
3303 }
3304 }
3305 else
3306 #endif
3307 /* Not UTF-8 mode */
3308 {
3309 for (i = 1; i <= min; i++)
3310 {
3311 if (eptr >= md->end_subject)
3312 {
3313 SCHECK_PARTIAL();
3314 MRRETURN(MATCH_NOMATCH);
3315 }
3316 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3317 }
3318 }
3319
3320 if (min == max) continue;
3321
3322 if (minimize)
3323 {
3324 #ifdef SUPPORT_UTF8
3325 /* UTF-8 mode */
3326 if (utf8)
3327 {
3328 register unsigned int d;
3329 for (fi = min;; fi++)
3330 {
3331 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3332 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3333 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3334 if (eptr >= md->end_subject)
3335 {
3336 SCHECK_PARTIAL();
3337 MRRETURN(MATCH_NOMATCH);
3338 }
3339 GETCHARINC(d, eptr);
3340 if (fc == d) MRRETURN(MATCH_NOMATCH);
3341 }
3342 }
3343 else
3344 #endif
3345 /* Not UTF-8 mode */
3346 {
3347 for (fi = min;; fi++)
3348 {
3349 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3350 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3351 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3352 if (eptr >= md->end_subject)
3353 {
3354 SCHECK_PARTIAL();
3355 MRRETURN(MATCH_NOMATCH);
3356 }
3357 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3358 }
3359 }
3360 /* Control never gets here */
3361 }
3362
3363 /* Maximize case */
3364
3365 else
3366 {
3367 pp = eptr;
3368
3369 #ifdef SUPPORT_UTF8
3370 /* UTF-8 mode */
3371 if (utf8)
3372 {
3373 register unsigned int d;
3374 for (i = min; i < max; i++)
3375 {
3376 int len = 1;
3377 if (eptr >= md->end_subject)
3378 {
3379 SCHECK_PARTIAL();
3380 break;
3381 }
3382 GETCHARLEN(d, eptr, len);
3383 if (fc == d) break;
3384 eptr += len;
3385 }
3386 if (possessive) continue;
3387 for(;;)
3388 {
3389 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3390 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3391 if (eptr-- == pp) break; /* Stop if tried at original pos */
3392 BACKCHAR(eptr);
3393 }
3394 }
3395 else
3396 #endif
3397 /* Not UTF-8 mode */
3398 {
3399 for (i = min; i < max; i++)
3400 {
3401 if (eptr >= md->end_subject)
3402 {
3403 SCHECK_PARTIAL();
3404 break;
3405 }
3406 if (fc == *eptr) break;
3407 eptr++;
3408 }
3409 if (possessive) continue;
3410 while (eptr >= pp)
3411 {
3412 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3413 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3414 eptr--;
3415 }
3416 }
3417
3418 MRRETURN(MATCH_NOMATCH);
3419 }
3420 }
3421 /* Control never gets here */
3422
3423 /* Match a single character type repeatedly; several different opcodes
3424 share code. This is very similar to the code for single characters, but we
3425 repeat it in the interests of efficiency. */
3426
3427 case OP_TYPEEXACT:
3428 min = max = GET2(ecode, 1);
3429 minimize = TRUE;
3430 ecode += 3;
3431 goto REPEATTYPE;
3432
3433 case OP_TYPEUPTO:
3434 case OP_TYPEMINUPTO:
3435 min = 0;
3436 max = GET2(ecode, 1);
3437 minimize = *ecode == OP_TYPEMINUPTO;
3438 ecode += 3;
3439 goto REPEATTYPE;
3440
3441 case OP_TYPEPOSSTAR:
3442 possessive = TRUE;
3443 min = 0;
3444 max = INT_MAX;
3445 ecode++;
3446 goto REPEATTYPE;
3447
3448 case OP_TYPEPOSPLUS:
3449 possessive = TRUE;
3450 min = 1;
3451 max = INT_MAX;
3452 ecode++;
3453 goto REPEATTYPE;
3454
3455 case OP_TYPEPOSQUERY:
3456 possessive = TRUE;
3457 min = 0;
3458 max = 1;
3459 ecode++;
3460 goto REPEATTYPE;
3461
3462 case OP_TYPEPOSUPTO:
3463 possessive = TRUE;
3464 min = 0;
3465 max = GET2(ecode, 1);
3466 ecode += 3;
3467 goto REPEATTYPE;
3468
3469 case OP_TYPESTAR:
3470 case OP_TYPEMINSTAR:
3471 case OP_TYPEPLUS:
3472 case OP_TYPEMINPLUS:
3473 case OP_TYPEQUERY:
3474 case OP_TYPEMINQUERY:
3475 c = *ecode++ - OP_TYPESTAR;
3476 minimize = (c & 1) != 0;
3477 min = rep_min[c]; /* Pick up values from tables; */
3478 max = rep_max[c]; /* zero for max => infinity */
3479 if (max == 0) max = INT_MAX;
3480
3481 /* Common code for all repeated single character type matches. Note that
3482 in UTF-8 mode, '.' matches a character of any length, but for the other
3483 character types, the valid characters are all one-byte long. */
3484
3485 REPEATTYPE:
3486 ctype = *ecode++; /* Code for the character type */
3487
3488 #ifdef SUPPORT_UCP
3489 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3490 {
3491 prop_fail_result = ctype == OP_NOTPROP;
3492 prop_type = *ecode++;
3493 prop_value = *ecode++;
3494 }
3495 else prop_type = -1;
3496 #endif
3497
3498 /* First, ensure the minimum number of matches are present. Use inline
3499 code for maximizing the speed, and do the type test once at the start
3500 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3501 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3502 and single-bytes. */
3503
3504 if (min > 0)
3505 {
3506 #ifdef SUPPORT_UCP
3507 if (prop_type >= 0)
3508 {
3509 switch(prop_type)
3510 {
3511 case PT_ANY:
3512 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3513 for (i = 1; i <= min; i++)
3514 {
3515 if (eptr >= md->end_subject)
3516 {
3517 SCHECK_PARTIAL();
3518 MRRETURN(MATCH_NOMATCH);
3519 }
3520 GETCHARINCTEST(c, eptr);
3521 }
3522 break;
3523
3524 case PT_LAMP:
3525 for (i = 1; i <= min; i++)
3526 {
3527 if (eptr >= md->end_subject)
3528 {
3529 SCHECK_PARTIAL();
3530 MRRETURN(MATCH_NOMATCH);
3531 }
3532 GETCHARINCTEST(c, eptr);
3533 prop_chartype = UCD_CHARTYPE(c);
3534 if ((prop_chartype == ucp_Lu ||
3535 prop_chartype == ucp_Ll ||
3536 prop_chartype == ucp_Lt) == prop_fail_result)
3537 MRRETURN(MATCH_NOMATCH);
3538 }
3539 break;
3540
3541 case PT_GC:
3542 for (i = 1; i <= min; i++)
3543 {
3544 if (eptr >= md->end_subject)
3545 {
3546 SCHECK_PARTIAL();
3547 MRRETURN(MATCH_NOMATCH);
3548 }
3549 GETCHARINCTEST(c, eptr);
3550 prop_category = UCD_CATEGORY(c);
3551 if ((prop_category == prop_value) == prop_fail_result)
3552 MRRETURN(MATCH_NOMATCH);
3553 }
3554 break;
3555
3556 case PT_PC:
3557 for (i = 1; i <= min; i++)
3558 {
3559 if (eptr >= md->end_subject)
3560 {
3561 SCHECK_PARTIAL();
3562 MRRETURN(MATCH_NOMATCH);
3563 }
3564 GETCHARINCTEST(c, eptr);
3565 prop_chartype = UCD_CHARTYPE(c);
3566 if ((prop_chartype == prop_value) == prop_fail_result)
3567 MRRETURN(MATCH_NOMATCH);
3568 }
3569 break;
3570
3571 case PT_SC:
3572 for (i = 1; i <= min; i++)
3573 {
3574 if (eptr >= md->end_subject)
3575 {
3576 SCHECK_PARTIAL();
3577 MRRETURN(MATCH_NOMATCH);
3578 }
3579 GETCHARINCTEST(c, eptr);
3580 prop_script = UCD_SCRIPT(c);
3581 if ((prop_script == prop_value) == prop_fail_result)
3582 MRRETURN(MATCH_NOMATCH);
3583 }
3584 break;
3585
3586 case PT_ALNUM:
3587 for (i = 1; i <= min; i++)
3588 {
3589 if (eptr >= md->end_subject)
3590 {
3591 SCHECK_PARTIAL();
3592 MRRETURN(MATCH_NOMATCH);
3593 }
3594 GETCHARINCTEST(c, eptr);
3595 prop_category = UCD_CATEGORY(c);
3596 if ((prop_category == ucp_L || prop_category == ucp_N)
3597 == prop_fail_result)
3598 MRRETURN(MATCH_NOMATCH);
3599 }
3600 break;
3601
3602 case PT_SPACE: /* Perl space */
3603 for (i = 1; i <= min; i++)
3604 {
3605 if (eptr >= md->end_subject)
3606 {
3607 SCHECK_PARTIAL();
3608 MRRETURN(MATCH_NOMATCH);
3609 }
3610 GETCHARINCTEST(c, eptr);
3611 prop_category = UCD_CATEGORY(c);
3612 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3613 c == CHAR_FF || c == CHAR_CR)
3614 == prop_fail_result)
3615 MRRETURN(MATCH_NOMATCH);
3616 }
3617 break;
3618
3619 case PT_PXSPACE: /* POSIX space */
3620 for (i = 1; i <= min; i++)
3621 {
3622 if (eptr >= md->end_subject)
3623 {
3624 SCHECK_PARTIAL();
3625 MRRETURN(MATCH_NOMATCH);
3626 }
3627 GETCHARINCTEST(c, eptr);
3628 prop_category = UCD_CATEGORY(c);
3629 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3630 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3631 == prop_fail_result)
3632 MRRETURN(MATCH_NOMATCH);
3633 }
3634 break;
3635
3636 case PT_WORD:
3637 for (i = 1; i <= min; i++)
3638 {
3639 if (eptr >= md->end_subject)
3640 {
3641 SCHECK_PARTIAL();
3642 MRRETURN(MATCH_NOMATCH);
3643 }
3644 GETCHARINCTEST(c, eptr);
3645 prop_category = UCD_CATEGORY(c);
3646 if ((prop_category == ucp_L || prop_category == ucp_N ||
3647 c == CHAR_UNDERSCORE)
3648 == prop_fail_result)
3649 MRRETURN(MATCH_NOMATCH);
3650 }
3651 break;
3652
3653 /* This should not occur */
3654
3655 default:
3656 RRETURN(PCRE_ERROR_INTERNAL);
3657 }
3658 }
3659
3660 /* Match extended Unicode sequences. We will get here only if the
3661 support is in the binary; otherwise a compile-time error occurs. */
3662
3663 else if (ctype == OP_EXTUNI)
3664 {
3665 for (i = 1; i <= min; i++)
3666 {
3667 if (eptr >= md->end_subject)
3668 {
3669 SCHECK_PARTIAL();
3670 MRRETURN(MATCH_NOMATCH);
3671 }
3672 GETCHARINCTEST(c, eptr);
3673 prop_category = UCD_CATEGORY(c);
3674 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3675 while (eptr < md->end_subject)
3676 {
3677 int len = 1;
3678 if (!utf8) c = *eptr;
3679 else { GETCHARLEN(c, eptr, len); }
3680 prop_category = UCD_CATEGORY(c);
3681 if (prop_category != ucp_M) break;
3682 eptr += len;
3683 }
3684 }
3685 }
3686
3687 else
3688 #endif /* SUPPORT_UCP */
3689
3690 /* Handle all other cases when the coding is UTF-8 */
3691
3692 #ifdef SUPPORT_UTF8
3693 if (utf8) switch(ctype)
3694 {
3695 case OP_ANY:
3696 for (i = 1; i <= min; i++)
3697 {
3698 if (eptr >= md->end_subject)
3699 {
3700 SCHECK_PARTIAL();
3701 MRRETURN(MATCH_NOMATCH);
3702 }
3703 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3704 eptr++;
3705 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3706 }
3707 break;
3708
3709 case OP_ALLANY:
3710 for (i = 1; i <= min; i++)
3711 {
3712 if (eptr >= md->end_subject)
3713 {
3714 SCHECK_PARTIAL();
3715 MRRETURN(MATCH_NOMATCH);
3716 }
3717 eptr++;
3718 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3719 }
3720 break;
3721
3722 case OP_ANYBYTE:
3723 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3724 eptr += min;
3725 break;
3726
3727 case OP_ANYNL:
3728 for (i = 1; i <= min; i++)
3729 {
3730 if (eptr >= md->end_subject)
3731 {
3732 SCHECK_PARTIAL();
3733 MRRETURN(MATCH_NOMATCH);
3734 }
3735 GETCHARINC(c, eptr);
3736 switch(c)
3737 {
3738 default: MRRETURN(MATCH_NOMATCH);
3739 case 0x000d:
3740 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3741 break;
3742
3743 case 0x000a:
3744 break;
3745
3746 case 0x000b:
3747 case 0x000c:
3748 case 0x0085:
3749 case 0x2028:
3750 case 0x2029:
3751 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3752 break;
3753 }
3754 }
3755 break;
3756
3757 case OP_NOT_HSPACE:
3758 for (i = 1; i <= min; i++)
3759 {
3760 if (eptr >= md->end_subject)
3761 {
3762 SCHECK_PARTIAL();
3763 MRRETURN(MATCH_NOMATCH);
3764 }
3765 GETCHARINC(c, eptr);
3766 switch(c)
3767 {
3768 default: break;
3769 case 0x09: /* HT */
3770 case 0x20: /* SPACE */
3771 case 0xa0: /* NBSP */
3772 case 0x1680: /* OGHAM SPACE MARK */
3773 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3774 case 0x2000: /* EN QUAD */
3775 case 0x2001: /* EM QUAD */
3776 case 0x2002: /* EN SPACE */
3777 case 0x2003: /* EM SPACE */
3778 case 0x2004: /* THREE-PER-EM SPACE */
3779 case 0x2005: /* FOUR-PER-EM SPACE */
3780 case 0x2006: /* SIX-PER-EM SPACE */
3781 case 0x2007: /* FIGURE SPACE */
3782 case 0x2008: /* PUNCTUATION SPACE */
3783 case 0x2009: /* THIN SPACE */
3784 case 0x200A: /* HAIR SPACE */
3785 case 0x202f: /* NARROW NO-BREAK SPACE */
3786 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3787 case 0x3000: /* IDEOGRAPHIC SPACE */
3788 MRRETURN(MATCH_NOMATCH);
3789 }
3790 }
3791 break;
3792
3793 case OP_HSPACE:
3794 for (i = 1; i <= min; i++)
3795 {
3796 if (eptr >= md->end_subject)
3797 {
3798 SCHECK_PARTIAL();
3799 MRRETURN(MATCH_NOMATCH);
3800 }
3801 GETCHARINC(c, eptr);
3802 switch(c)
3803 {
3804 default: MRRETURN(MATCH_NOMATCH);
3805 case 0x09: /* HT */
3806 case 0x20: /* SPACE */
3807 case 0xa0: /* NBSP */
3808 case 0x1680: /* OGHAM SPACE MARK */
3809 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3810 case 0x2000: /* EN QUAD */
3811 case 0x2001: /* EM QUAD */
3812 case 0x2002: /* EN SPACE */
3813 case 0x2003: /* EM SPACE */
3814 case 0x2004: /* THREE-PER-EM SPACE */
3815 case 0x2005: /* FOUR-PER-EM SPACE */
3816 case 0x2006: /* SIX-PER-EM SPACE */
3817 case 0x2007: /* FIGURE SPACE */
3818 case 0x2008: /* PUNCTUATION SPACE */
3819 case 0x2009: /* THIN SPACE */
3820 case 0x200A: /* HAIR SPACE */
3821 case 0x202f: /* NARROW NO-BREAK SPACE */
3822 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3823 case 0x3000: /* IDEOGRAPHIC SPACE */
3824 break;
3825 }
3826 }
3827 break;
3828
3829 case OP_NOT_VSPACE:
3830 for (i = 1; i <= min; i++)
3831 {
3832 if (eptr >= md->end_subject)
3833 {
3834 SCHECK_PARTIAL();
3835 MRRETURN(MATCH_NOMATCH);
3836 }
3837 GETCHARINC(c, eptr);
3838 switch(c)
3839 {
3840 default: break;
3841 case 0x0a: /* LF */
3842 case 0x0b: /* VT */
3843 case 0x0c: /* FF */
3844 case 0x0d: /* CR */
3845 case 0x85: /* NEL */
3846 case 0x2028: /* LINE SEPARATOR */
3847 case 0x2029: /* PARAGRAPH SEPARATOR */
3848 MRRETURN(MATCH_NOMATCH);
3849 }
3850 }
3851 break;
3852
3853 case OP_VSPACE:
3854 for (i = 1; i <= min; i++)
3855 {
3856 if (eptr >= md->end_subject)
3857 {
3858 SCHECK_PARTIAL();
3859 MRRETURN(MATCH_NOMATCH);
3860 }
3861 GETCHARINC(c, eptr);
3862 switch(c)
3863 {
3864 default: MRRETURN(MATCH_NOMATCH);
3865 case 0x0a: /* LF */
3866 case 0x0b: /* VT */
3867 case 0x0c: /* FF */
3868 case 0x0d: /* CR */
3869 case 0x85: /* NEL */
3870 case 0x2028: /* LINE SEPARATOR */
3871 case 0x2029: /* PARAGRAPH SEPARATOR */
3872 break;
3873 }
3874 }
3875 break;
3876
3877 case OP_NOT_DIGIT:
3878 for (i = 1; i <= min; i++)
3879 {
3880 if (eptr >= md->end_subject)
3881 {
3882 SCHECK_PARTIAL();
3883 MRRETURN(MATCH_NOMATCH);
3884 }
3885 GETCHARINC(c, eptr);
3886 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3887 MRRETURN(MATCH_NOMATCH);
3888 }
3889 break;
3890
3891 case OP_DIGIT:
3892 for (i = 1; i <= min; i++)
3893 {
3894 if (eptr >= md->end_subject)
3895 {
3896 SCHECK_PARTIAL();
3897 MRRETURN(MATCH_NOMATCH);
3898 }
3899 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3900 MRRETURN(MATCH_NOMATCH);
3901 /* No need to skip more bytes - we know it's a 1-byte character */
3902 }
3903 break;
3904
3905 case OP_NOT_WHITESPACE:
3906 for (i = 1; i <= min; i++)
3907 {
3908 if (eptr >= md->end_subject)
3909 {
3910 SCHECK_PARTIAL();
3911 MRRETURN(MATCH_NOMATCH);
3912 }
3913 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3914 MRRETURN(MATCH_NOMATCH);
3915 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3916 }
3917 break;
3918
3919 case OP_WHITESPACE:
3920 for (i = 1; i <= min; i++)
3921 {
3922 if (eptr >= md->end_subject)
3923 {
3924 SCHECK_PARTIAL();
3925 MRRETURN(MATCH_NOMATCH);
3926 }
3927 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3928 MRRETURN(MATCH_NOMATCH);
3929 /* No need to skip more bytes - we know it's a 1-byte character */
3930 }
3931 break;
3932
3933 case OP_NOT_WORDCHAR:
3934 for (i = 1; i <= min; i++)
3935 {
3936 if (eptr >= md->end_subject)
3937 {
3938 SCHECK_PARTIAL();
3939 MRRETURN(MATCH_NOMATCH);
3940 }
3941 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3942 MRRETURN(MATCH_NOMATCH);
3943 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3944 }
3945 break;
3946
3947 case OP_WORDCHAR:
3948 for (i = 1; i <= min; i++)
3949 {
3950 if (eptr >= md->end_subject)
3951 {
3952 SCHECK_PARTIAL();
3953 MRRETURN(MATCH_NOMATCH);
3954 }
3955 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3956 MRRETURN(MATCH_NOMATCH);
3957 /* No need to skip more bytes - we know it's a 1-byte character */
3958 }
3959 break;
3960
3961 default:
3962 RRETURN(PCRE_ERROR_INTERNAL);
3963 } /* End switch(ctype) */
3964
3965 else
3966 #endif /* SUPPORT_UTF8 */
3967
3968 /* Code for the non-UTF-8 case for minimum matching of operators other
3969 than OP_PROP and OP_NOTPROP. */
3970
3971 switch(ctype)
3972 {
3973 case OP_ANY:
3974 for (i = 1; i <= min; i++)
3975 {
3976 if (eptr >= md->end_subject)
3977 {
3978 SCHECK_PARTIAL();
3979 MRRETURN(MATCH_NOMATCH);
3980 }
3981 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3982 eptr++;
3983 }
3984 break;
3985
3986 case OP_ALLANY:
3987 if (eptr > md->end_subject - min)
3988 {
3989 SCHECK_PARTIAL();
3990 MRRETURN(MATCH_NOMATCH);
3991 }
3992 eptr += min;
3993 break;
3994
3995 case OP_ANYBYTE:
3996 if (eptr > md->end_subject - min)
3997 {
3998 SCHECK_PARTIAL();
3999 MRRETURN(MATCH_NOMATCH);
4000 }
4001 eptr += min;
4002 break;
4003
4004 case OP_ANYNL:
4005 for (i = 1; i <= min; i++)
4006 {
4007 if (eptr >= md->end_subject)
4008 {
4009 SCHECK_PARTIAL();
4010 MRRETURN(MATCH_NOMATCH);
4011 }
4012 switch(*eptr++)
4013 {
4014 default: MRRETURN(MATCH_NOMATCH);
4015 case 0x000d:
4016 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4017 break;
4018 case 0x000a:
4019 break;
4020
4021 case 0x000b:
4022 case 0x000c:
4023 case 0x0085:
4024 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4025 break;
4026 }
4027 }
4028 break;
4029
4030 case OP_NOT_HSPACE:
4031 for (i = 1; i <= min; i++)
4032 {
4033 if (eptr >= md->end_subject)
4034 {
4035 SCHECK_PARTIAL();
4036 MRRETURN(MATCH_NOMATCH);
4037 }
4038 switch(*eptr++)
4039 {
4040 default: break;
4041 case 0x09: /* HT */
4042 case 0x20: /* SPACE */
4043 case 0xa0: /* NBSP */
4044 MRRETURN(MATCH_NOMATCH);
4045 }
4046 }
4047 break;
4048
4049 case OP_HSPACE:
4050 for (i = 1; i <= min; i++)
4051 {
4052 if (eptr >= md->end_subject)
4053 {
4054 SCHECK_PARTIAL();
4055 MRRETURN(MATCH_NOMATCH);
4056 }
4057 switch(*eptr++)
4058 {
4059 default: MRRETURN(MATCH_NOMATCH);
4060 case 0x09: /* HT */
4061 case 0x20: /* SPACE */
4062 case 0xa0: /* NBSP */
4063 break;
4064 }
4065 }
4066 break;
4067
4068 case OP_NOT_VSPACE:
4069 for (i = 1; i <= min; i++)
4070 {
4071 if (eptr >= md->end_subject)
4072 {
4073 SCHECK_PARTIAL();
4074 MRRETURN(MATCH_NOMATCH);
4075 }
4076 switch(*eptr++)
4077 {
4078 default: break;
4079 case 0x0a: /* LF */
4080 case 0x0b: /* VT */
4081 case 0x0c: /* FF */
4082 case 0x0d: /* CR */
4083 case 0x85: /* NEL */
4084 MRRETURN(MATCH_NOMATCH);
4085 }
4086 }
4087 break;
4088
4089 case OP_VSPACE:
4090 for (i = 1; i <= min; i++)
4091 {
4092 if (eptr >= md->end_subject)
4093 {
4094 SCHECK_PARTIAL();
4095 MRRETURN(MATCH_NOMATCH);
4096 }
4097 switch(*eptr++)
4098 {
4099 default: MRRETURN(MATCH_NOMATCH);
4100 case 0x0a: /* LF */
4101 case 0x0b: /* VT */
4102 case 0x0c: /* FF */
4103 case 0x0d: /* CR */
4104 case 0x85: /* NEL */
4105 break;
4106 }
4107 }
4108 break;
4109
4110 case OP_NOT_DIGIT:
4111 for (i = 1; i <= min; i++)
4112 {
4113 if (eptr >= md->end_subject)
4114 {
4115 SCHECK_PARTIAL();
4116 MRRETURN(MATCH_NOMATCH);
4117 }
4118 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4119 }
4120 break;
4121
4122 case OP_DIGIT:
4123 for (i = 1; i <= min; i++)
4124 {
4125 if (eptr >= md->end_subject)
4126 {
4127 SCHECK_PARTIAL();
4128 MRRETURN(MATCH_NOMATCH);
4129 }
4130 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4131 }
4132 break;
4133
4134 case OP_NOT_WHITESPACE:
4135 for (i = 1; i <= min; i++)
4136 {
4137 if (eptr >= md->end_subject)
4138 {
4139 SCHECK_PARTIAL();
4140 MRRETURN(MATCH_NOMATCH);
4141 }
4142 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4143 }
4144 break;
4145
4146 case OP_WHITESPACE:
4147 for (i = 1; i <= min; i++)
4148 {
4149 if (eptr >= md->end_subject)
4150 {
4151 SCHECK_PARTIAL();
4152 MRRETURN(MATCH_NOMATCH);
4153 }
4154 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4155 }
4156 break;
4157
4158 case OP_NOT_WORDCHAR:
4159 for (i = 1; i <= min; i++)
4160 {
4161 if (eptr >= md->end_subject)
4162 {
4163 SCHECK_PARTIAL();
4164 MRRETURN(MATCH_NOMATCH);
4165 }
4166 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4167 MRRETURN(MATCH_NOMATCH);
4168 }
4169 break;
4170
4171 case OP_WORDCHAR:
4172 for (i = 1; i <= min; i++)
4173 {
4174 if (eptr >= md->end_subject)
4175 {
4176 SCHECK_PARTIAL();
4177 MRRETURN(MATCH_NOMATCH);
4178 }
4179 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4180 MRRETURN(MATCH_NOMATCH);
4181 }
4182 break;
4183
4184 default:
4185 RRETURN(PCRE_ERROR_INTERNAL);
4186 }
4187 }
4188
4189 /* If min = max, continue at the same level without recursing */
4190
4191 if (min == max) continue;
4192
4193 /* If minimizing, we have to test the rest of the pattern before each
4194 subsequent match. Again, separate the UTF-8 case for speed, and also
4195 separate the UCP cases. */
4196
4197 if (minimize)
4198 {
4199 #ifdef SUPPORT_UCP
4200 if (prop_type >= 0)
4201 {
4202 switch(prop_type)
4203 {
4204 case PT_ANY:
4205 for (fi = min;; fi++)
4206 {
4207 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
4208 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4209 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4210 if (eptr >= md->end_subject)
4211 {
4212 SCHECK_PARTIAL();
4213 MRRETURN(MATCH_NOMATCH);
4214 }
4215 GETCHARINC(c, eptr);
4216 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4217 }
4218 /* Control never gets here */
4219
4220 case PT_LAMP:
4221 for (fi = min;; fi++)
4222 {
4223 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
4224 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4225 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4226 if (eptr >= md->end_subject)
4227 {
4228 SCHECK_PARTIAL();
4229 MRRETURN(MATCH_NOMATCH);
4230 }
4231 GETCHARINC(c, eptr);
4232 prop_chartype = UCD_CHARTYPE(c);
4233 if ((prop_chartype == ucp_Lu ||
4234 prop_chartype == ucp_Ll ||
4235 prop_chartype == ucp_Lt) == prop_fail_result)
4236 MRRETURN(MATCH_NOMATCH);
4237 }
4238 /* Control never gets here */
4239
4240 case PT_GC:
4241 for (fi = min;; fi++)
4242 {
4243 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4244 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4245 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4246 if (eptr >= md->end_subject)
4247 {
4248 SCHECK_PARTIAL();
4249 MRRETURN(MATCH_NOMATCH);
4250 }
4251 GETCHARINC(c, eptr);
4252 prop_category = UCD_CATEGORY(c);
4253 if ((prop_category == prop_value) == prop_fail_result)
4254 MRRETURN(MATCH_NOMATCH);
4255 }
4256 /* Control never gets here */
4257
4258 case PT_PC:
4259 for (fi = min;; fi++)
4260 {
4261 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4262 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4263 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4264 if (eptr >= md->end_subject)
4265 {
4266 SCHECK_PARTIAL();
4267 MRRETURN(MATCH_NOMATCH);
4268 }
4269 GETCHARINC(c, eptr);
4270 prop_chartype = UCD_CHARTYPE(c);
4271 if ((prop_chartype == prop_value) == prop_fail_result)
4272 MRRETURN(MATCH_NOMATCH);
4273 }
4274 /* Control never gets here */
4275
4276 case PT_SC:
4277 for (fi = min;; fi++)
4278 {
4279 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4280 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4281 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4282 if (eptr >= md->end_subject)
4283 {
4284 SCHECK_PARTIAL();
4285 MRRETURN(MATCH_NOMATCH);
4286 }
4287 GETCHARINC(c, eptr);
4288 prop_script = UCD_SCRIPT(c);
4289 if ((prop_script == prop_value) == prop_fail_result)
4290 MRRETURN(MATCH_NOMATCH);
4291 }
4292 /* Control never gets here */
4293
4294 case PT_ALNUM:
4295 for (fi = min;; fi++)
4296 {
4297 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4298 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4299 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4300 if (eptr >= md->end_subject)
4301 {
4302 SCHECK_PARTIAL();
4303 MRRETURN(MATCH_NOMATCH);
4304 }
4305 GETCHARINC(c, eptr);
4306 prop_category = UCD_CATEGORY(c);
4307 if ((prop_category == ucp_L || prop_category == ucp_N)
4308 == prop_fail_result)
4309 MRRETURN(MATCH_NOMATCH);
4310 }
4311 /* Control never gets here */
4312
4313 case PT_SPACE: /* Perl space */
4314 for (fi = min;; fi++)
4315 {
4316 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4317 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4318 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4319 if (eptr >= md->end_subject)
4320 {
4321 SCHECK_PARTIAL();
4322 MRRETURN(MATCH_NOMATCH);
4323 }
4324 GETCHARINC(c, eptr);
4325 prop_category = UCD_CATEGORY(c);
4326 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4327 c == CHAR_FF || c == CHAR_CR)
4328 == prop_fail_result)
4329 MRRETURN(MATCH_NOMATCH);
4330 }
4331 /* Control never gets here */
4332
4333 case PT_PXSPACE: /* POSIX space */
4334 for (fi = min;; fi++)
4335 {
4336 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4337 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4338 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4339 if (eptr >= md->end_subject)
4340 {
4341 SCHECK_PARTIAL();
4342 MRRETURN(MATCH_NOMATCH);
4343 }
4344 GETCHARINC(c, eptr);
4345 prop_category = UCD_CATEGORY(c);
4346 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4347 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4348 == prop_fail_result)
4349 MRRETURN(MATCH_NOMATCH);
4350 }
4351 /* Control never gets here */
4352
4353 case PT_WORD:
4354 for (fi = min;; fi++)
4355 {
4356 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4357 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4358 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4359 if (eptr >= md->end_subject)
4360 {
4361 SCHECK_PARTIAL();
4362 MRRETURN(MATCH_NOMATCH);
4363 }
4364 GETCHARINC(c, eptr);
4365 prop_category = UCD_CATEGORY(c);
4366 if ((prop_category == ucp_L ||
4367 prop_category == ucp_N ||
4368 c == CHAR_UNDERSCORE)
4369 == prop_fail_result)
4370 MRRETURN(MATCH_NOMATCH);
4371 }
4372 /* Control never gets here */
4373
4374 /* This should never occur */
4375
4376 default:
4377 RRETURN(PCRE_ERROR_INTERNAL);
4378 }
4379 }
4380
4381 /* Match extended Unicode sequences. We will get here only if the
4382 support is in the binary; otherwise a compile-time error occurs. */
4383
4384 else if (ctype == OP_EXTUNI)
4385 {
4386 for (fi = min;; fi++)
4387 {
4388 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4389 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4390 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4391 if (eptr >= md->end_subject)
4392 {
4393 SCHECK_PARTIAL();
4394 MRRETURN(MATCH_NOMATCH);
4395 }
4396 GETCHARINCTEST(c, eptr);
4397 prop_category = UCD_CATEGORY(c);
4398 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4399 while (eptr < md->end_subject)
4400 {
4401 int len = 1;
4402 if (!utf8) c = *eptr;
4403 else { GETCHARLEN(c, eptr, len); }
4404 prop_category = UCD_CATEGORY(c);
4405 if (prop_category != ucp_M) break;
4406 eptr += len;
4407 }
4408 }
4409 }
4410
4411 else
4412 #endif /* SUPPORT_UCP */
4413
4414 #ifdef SUPPORT_UTF8
4415 /* UTF-8 mode */
4416 if (utf8)
4417 {
4418 for (fi = min;; fi++)
4419 {
4420 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4421 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4422 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4423 if (eptr >= md->end_subject)
4424 {
4425 SCHECK_PARTIAL();
4426 MRRETURN(MATCH_NOMATCH);
4427 }
4428 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4429 MRRETURN(MATCH_NOMATCH);
4430 GETCHARINC(c, eptr);
4431 switch(ctype)
4432 {
4433 case OP_ANY: /* This is the non-NL case */
4434 case OP_ALLANY:
4435 case OP_ANYBYTE:
4436 break;
4437
4438 case OP_ANYNL:
4439 switch(c)
4440 {
4441 default: MRRETURN(MATCH_NOMATCH);
4442 case 0x000d:
4443 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4444 break;
4445 case 0x000a:
4446 break;
4447
4448 case 0x000b:
4449 case 0x000c:
4450 case 0x0085:
4451 case 0x2028:
4452 case 0x2029:
4453 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4454 break;
4455 }
4456 break;
4457
4458 case OP_NOT_HSPACE:
4459 switch(c)
4460 {
4461 default: break;
4462 case 0x09: /* HT */
4463 case 0x20: /* SPACE */
4464 case 0xa0: /* NBSP */
4465 case 0x1680: /* OGHAM SPACE MARK */
4466 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4467 case 0x2000: /* EN QUAD */
4468 case 0x2001: /* EM QUAD */
4469 case 0x2002: /* EN SPACE */
4470 case 0x2003: /* EM SPACE */
4471 case 0x2004: /* THREE-PER-EM SPACE */
4472 case 0x2005: /* FOUR-PER-EM SPACE */
4473 case 0x2006: /* SIX-PER-EM SPACE */
4474 case 0x2007: /* FIGURE SPACE */
4475 case 0x2008: /* PUNCTUATION SPACE */
4476 case 0x2009: /* THIN SPACE */
4477 case 0x200A: /* HAIR SPACE */
4478 case 0x202f: /* NARROW NO-BREAK SPACE */
4479 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4480 case 0x3000: /* IDEOGRAPHIC SPACE */
4481 MRRETURN(MATCH_NOMATCH);
4482 }
4483 break;
4484
4485 case OP_HSPACE:
4486 switch(c)
4487 {
4488 default: MRRETURN(MATCH_NOMATCH);
4489 case 0x09: /* HT */
4490 case 0x20: /* SPACE */
4491 case 0xa0: /* NBSP */
4492 case 0x1680: /* OGHAM SPACE MARK */
4493 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4494 case 0x2000: /* EN QUAD */
4495 case 0x2001: /* EM QUAD */
4496 case 0x2002: /* EN SPACE */
4497 case 0x2003: /* EM SPACE */
4498 case 0x2004: /* THREE-PER-EM SPACE */
4499 case 0x2005: /* FOUR-PER-EM SPACE */
4500 case 0x2006: /* SIX-PER-EM SPACE */
4501 case 0x2007: /* FIGURE SPACE */
4502 case 0x2008: /* PUNCTUATION SPACE */
4503 case 0x2009: /* THIN SPACE */
4504 case 0x200A: /* HAIR SPACE */
4505 case 0x202f: /* NARROW NO-BREAK SPACE */
4506 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4507 case 0x3000: /* IDEOGRAPHIC SPACE */
4508 break;
4509 }
4510 break;
4511
4512 case OP_NOT_VSPACE:
4513 switch(c)
4514 {
4515 default: break;
4516 case 0x0a: /* LF */
4517 case 0x0b: /* VT */
4518 case 0x0c: /* FF */
4519 case 0x0d: /* CR */
4520 case 0x85: /* NEL */
4521 case 0x2028: /* LINE SEPARATOR */
4522 case 0x2029: /* PARAGRAPH SEPARATOR */
4523 MRRETURN(MATCH_NOMATCH);
4524 }
4525 break;
4526
4527 case OP_VSPACE:
4528 switch(c)
4529 {
4530 default: MRRETURN(MATCH_NOMATCH);
4531 case 0x0a: /* LF */
4532 case 0x0b: /* VT */
4533 case 0x0c: /* FF */
4534 case 0x0d: /* CR */
4535 case 0x85: /* NEL */
4536 case 0x2028: /* LINE SEPARATOR */
4537 case 0x2029: /* PARAGRAPH SEPARATOR */
4538 break;
4539 }
4540 break;
4541
4542 case OP_NOT_DIGIT:
4543 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4544 MRRETURN(MATCH_NOMATCH);
4545 break;
4546
4547 case OP_DIGIT:
4548 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4549 MRRETURN(MATCH_NOMATCH);
4550 break;
4551
4552 case OP_NOT_WHITESPACE:
4553 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4554 MRRETURN(MATCH_NOMATCH);
4555 break;
4556
4557 case OP_WHITESPACE:
4558 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4559 MRRETURN(MATCH_NOMATCH);
4560 break;
4561
4562 case OP_NOT_WORDCHAR:
4563 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4564 MRRETURN(MATCH_NOMATCH);
4565 break;
4566
4567 case OP_WORDCHAR:
4568 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4569 MRRETURN(MATCH_NOMATCH);
4570 break;
4571
4572 default:
4573 RRETURN(PCRE_ERROR_INTERNAL);
4574 }
4575 }
4576 }
4577 else
4578 #endif
4579 /* Not UTF-8 mode */
4580 {
4581 for (fi = min;; fi++)
4582 {
4583 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4584 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4585 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4586 if (eptr >= md->end_subject)
4587 {
4588 SCHECK_PARTIAL();
4589 MRRETURN(MATCH_NOMATCH);
4590 }
4591 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4592 MRRETURN(MATCH_NOMATCH);
4593 c = *eptr++;
4594 switch(ctype)
4595 {
4596 case OP_ANY: /* This is the non-NL case */
4597 case OP_ALLANY:
4598 case OP_ANYBYTE:
4599 break;
4600
4601 case OP_ANYNL:
4602 switch(c)
4603 {
4604 default: MRRETURN(MATCH_NOMATCH);
4605 case 0x000d:
4606 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4607 break;
4608
4609 case 0x000a:
4610 break;
4611
4612 case 0x000b:
4613 case 0x000c:
4614 case 0x0085:
4615 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4616 break;
4617 }
4618 break;
4619
4620 case OP_NOT_HSPACE:
4621 switch(c)
4622 {
4623 default: break;
4624 case 0x09: /* HT */
4625 case 0x20: /* SPACE */
4626 case 0xa0: /* NBSP */
4627 MRRETURN(MATCH_NOMATCH);
4628 }
4629 break;
4630
4631 case OP_HSPACE:
4632 switch(c)
4633 {
4634 default: MRRETURN(MATCH_NOMATCH);
4635 case 0x09: /* HT */
4636 case 0x20: /* SPACE */
4637 case 0xa0: /* NBSP */
4638 break;
4639 }
4640 break;
4641
4642 case OP_NOT_VSPACE:
4643 switch(c)
4644 {
4645 default: break;
4646 case 0x0a: /* LF */
4647 case 0x0b: /* VT */
4648 case 0x0c: /* FF */
4649 case 0x0d: /* CR */
4650 case 0x85: /* NEL */
4651 MRRETURN(MATCH_NOMATCH);
4652 }
4653 break;
4654
4655 case OP_VSPACE:
4656 switch(c)
4657 {
4658 default: MRRETURN(MATCH_NOMATCH);
4659 case 0x0a: /* LF */
4660 case 0x0b: /* VT */
4661 case 0x0c: /* FF */
4662 case 0x0d: /* CR */
4663 case 0x85: /* NEL */
4664 break;
4665 }
4666 break;
4667
4668 case OP_NOT_DIGIT:
4669 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4670 break;
4671
4672 case OP_DIGIT:
4673 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4674 break;
4675
4676 case OP_NOT_WHITESPACE:
4677 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4678 break;
4679
4680 case OP_WHITESPACE:
4681 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4682 break;
4683
4684 case OP_NOT_WORDCHAR:
4685 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4686 break;
4687
4688 case OP_WORDCHAR:
4689 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4690 break;
4691
4692 default:
4693 RRETURN(PCRE_ERROR_INTERNAL);
4694 }
4695 }
4696 }
4697 /* Control never gets here */
4698 }
4699
4700 /* If maximizing, it is worth using inline code for speed, doing the type
4701 test once at the start (i.e. keep it out of the loop). Again, keep the
4702 UTF-8 and UCP stuff separate. */
4703
4704 else
4705 {
4706 pp = eptr; /* Remember where we started */
4707
4708 #ifdef SUPPORT_UCP
4709 if (prop_type >= 0)
4710 {
4711 switch(prop_type)
4712 {
4713 case PT_ANY:
4714 for (i = min; i < max; i++)
4715 {
4716 int len = 1;
4717 if (eptr >= md->end_subject)
4718 {
4719 SCHECK_PARTIAL();
4720 break;
4721 }
4722 GETCHARLEN(c, eptr, len);
4723 if (prop_fail_result) break;
4724 eptr+= len;
4725 }
4726 break;
4727
4728 case PT_LAMP:
4729 for (i = min; i < max; i++)
4730 {
4731 int len = 1;
4732 if (eptr >= md->end_subject)
4733 {
4734 SCHECK_PARTIAL();
4735 break;
4736 }
4737 GETCHARLEN(c, eptr, len);
4738 prop_chartype = UCD_CHARTYPE(c);
4739 if ((prop_chartype == ucp_Lu ||
4740 prop_chartype == ucp_Ll ||
4741 prop_chartype == ucp_Lt) == prop_fail_result)
4742 break;
4743 eptr+= len;
4744 }
4745 break;
4746
4747 case PT_GC:
4748 for (i = min; i < max; i++)
4749 {
4750 int len = 1;
4751 if (eptr >= md->end_subject)
4752 {
4753 SCHECK_PARTIAL();
4754 break;
4755 }
4756 GETCHARLEN(c, eptr, len);
4757 prop_category = UCD_CATEGORY(c);
4758 if ((prop_category == prop_value) == prop_fail_result)
4759 break;
4760 eptr+= len;
4761 }
4762 break;
4763
4764 case PT_PC:
4765 for (i = min; i < max; i++)
4766 {
4767 int len = 1;
4768 if (eptr >= md->end_subject)
4769 {
4770 SCHECK_PARTIAL();
4771 break;
4772 }
4773 GETCHARLEN(c, eptr, len);
4774 prop_chartype = UCD_CHARTYPE(c);
4775 if ((prop_chartype == prop_value) == prop_fail_result)
4776 break;
4777 eptr+= len;
4778 }
4779 break;
4780
4781 case PT_SC:
4782 for (i = min; i < max; i++)
4783 {
4784 int len = 1;
4785 if (eptr >= md->end_subject)
4786 {
4787 SCHECK_PARTIAL();
4788 break;
4789 }
4790 GETCHARLEN(c, eptr, len);
4791 prop_script = UCD_SCRIPT(c);
4792 if ((prop_script == prop_value) == prop_fail_result)
4793 break;
4794 eptr+= len;
4795 }
4796 break;
4797
4798 case PT_ALNUM:
4799 for (i = min; i < max; i++)
4800 {
4801 int len = 1;
4802 if (eptr >= md->end_subject)
4803 {
4804 SCHECK_PARTIAL();
4805 break;
4806 }
4807 GETCHARLEN(c, eptr, len);
4808 prop_category = UCD_CATEGORY(c);
4809 if ((prop_category == ucp_L || prop_category == ucp_N)
4810 == prop_fail_result)
4811 break;
4812 eptr+= len;
4813 }
4814 break;
4815
4816 case PT_SPACE: /* Perl space */
4817 for (i = min; i < max; i++)
4818 {
4819 int len = 1;
4820 if (eptr >= md->end_subject)
4821 {
4822 SCHECK_PARTIAL();
4823 break;
4824 }
4825 GETCHARLEN(c, eptr, len);
4826 prop_category = UCD_CATEGORY(c);
4827 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4828 c == CHAR_FF || c == CHAR_CR)
4829 == prop_fail_result)
4830 break;
4831 eptr+= len;
4832 }
4833 break;
4834
4835 case PT_PXSPACE: /* POSIX space */
4836 for (i = min; i < max; i++)
4837 {
4838 int len = 1;
4839 if (eptr >= md->end_subject)
4840 {
4841 SCHECK_PARTIAL();
4842 break;
4843 }
4844 GETCHARLEN(c, eptr, len);
4845 prop_category = UCD_CATEGORY(c);
4846 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4847 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4848 == prop_fail_result)
4849 break;
4850 eptr+= len;
4851 }
4852 break;
4853
4854 case PT_WORD:
4855 for (i = min; i < max; i++)
4856 {
4857 int len = 1;
4858 if (eptr >= md->end_subject)
4859 {
4860 SCHECK_PARTIAL();
4861 break;
4862 }
4863 GETCHARLEN(c, eptr, len);
4864 prop_category = UCD_CATEGORY(c);
4865 if ((prop_category == ucp_L || prop_category == ucp_N ||
4866 c == CHAR_UNDERSCORE) == prop_fail_result)
4867 break;
4868 eptr+= len;
4869 }
4870 break;
4871
4872 default:
4873 RRETURN(PCRE_ERROR_INTERNAL);
4874 }
4875
4876 /* eptr is now past the end of the maximum run */
4877
4878 if (possessive) continue;
4879 for(;;)
4880 {
4881 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4882 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4883 if (eptr-- == pp) break; /* Stop if tried at original pos */
4884 if (utf8) BACKCHAR(eptr);
4885 }
4886 }
4887
4888 /* Match extended Unicode sequences. We will get here only if the
4889 support is in the binary; otherwise a compile-time error occurs. */
4890
4891 else if (ctype == OP_EXTUNI)
4892 {
4893 for (i = min; i < max; i++)
4894 {
4895 if (eptr >= md->end_subject)
4896 {
4897 SCHECK_PARTIAL();
4898 break;
4899 }
4900 GETCHARINCTEST(c, eptr);
4901 prop_category = UCD_CATEGORY(c);
4902 if (prop_category == ucp_M) break;
4903 while (eptr < md->end_subject)
4904 {
4905 int len = 1;
4906 if (!utf8) c = *eptr; else
4907 {
4908 GETCHARLEN(c, eptr, len);
4909 }
4910 prop_category = UCD_CATEGORY(c);
4911 if (prop_category != ucp_M) break;
4912 eptr += len;
4913 }
4914 }
4915
4916 /* eptr is now past the end of the maximum run */
4917
4918 if (possessive) continue;
4919
4920 for(;;)
4921 {
4922 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4923 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4924 if (eptr-- == pp) break; /* Stop if tried at original pos */
4925 for (;;) /* Move back over one extended */
4926 {
4927 int len = 1;
4928 if (!utf8) c = *eptr; else
4929 {
4930 BACKCHAR(eptr);
4931 GETCHARLEN(c, eptr, len);
4932 }
4933 prop_category = UCD_CATEGORY(c);
4934 if (prop_category != ucp_M) break;
4935 eptr--;
4936 }
4937 }
4938 }
4939
4940 else
4941 #endif /* SUPPORT_UCP */
4942
4943 #ifdef SUPPORT_UTF8
4944 /* UTF-8 mode */
4945
4946 if (utf8)
4947 {
4948 switch(ctype)
4949 {
4950 case OP_ANY:
4951 if (max < INT_MAX)
4952 {
4953 for (i = min; i < max; i++)
4954 {
4955 if (eptr >= md->end_subject)
4956 {
4957 SCHECK_PARTIAL();
4958 break;
4959 }
4960 if (IS_NEWLINE(eptr)) break;
4961 eptr++;
4962 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4963 }
4964 }
4965
4966 /* Handle unlimited UTF-8 repeat */
4967
4968 else
4969 {
4970 for (i = min; i < max; i++)
4971 {
4972 if (eptr >= md->end_subject)
4973 {
4974 SCHECK_PARTIAL();
4975 break;
4976 }
4977 if (IS_NEWLINE(eptr)) break;
4978 eptr++;
4979 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4980 }
4981 }
4982 break;
4983
4984 case OP_ALLANY:
4985 if (max < INT_MAX)
4986 {
4987 for (i = min; i < max; i++)
4988 {
4989 if (eptr >= md->end_subject)
4990 {
4991 SCHECK_PARTIAL();
4992 break;
4993 }
4994 eptr++;
4995 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4996 }
4997 }
4998 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
4999 break;
5000
5001 /* The byte case is the same as non-UTF8 */
5002
5003 case OP_ANYBYTE:
5004 c = max - min;
5005 if (c > (unsigned int)(md->end_subject - eptr))
5006 {
5007 eptr = md->end_subject;
5008 SCHECK_PARTIAL();
5009 }
5010 else eptr += c;
5011 break;
5012
5013 case OP_ANYNL:
5014 for (i = min; i < max; i++)
5015 {
5016 int len = 1;
5017 if (eptr >= md->end_subject)
5018 {
5019 SCHECK_PARTIAL();
5020 break;
5021 }
5022 GETCHARLEN(c, eptr, len);
5023 if (c == 0x000d)
5024 {
5025 if (++eptr >= md->end_subject) break;
5026 if (*eptr == 0x000a) eptr++;
5027 }
5028 else
5029 {
5030 if (c != 0x000a &&
5031 (md->bsr_anycrlf ||
5032 (c != 0x000b && c != 0x000c &&
5033 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5034 break;
5035 eptr += len;
5036 }
5037 }
5038 break;
5039
5040 case OP_NOT_HSPACE:
5041 case OP_HSPACE:
5042 for (i = min; i < max; i++)
5043 {
5044 BOOL gotspace;
5045 int len = 1;
5046 if (eptr >= md->end_subject)
5047 {
5048 SCHECK_PARTIAL();
5049 break;
5050 }
5051 GETCHARLEN(c, eptr, len);
5052 switch(c)
5053 {
5054 default: gotspace = FALSE; break;
5055 case 0x09: /* HT */
5056 case 0x20: /* SPACE */
5057 case 0xa0: /* NBSP */
5058 case 0x1680: /* OGHAM SPACE MARK */
5059 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5060 case 0x2000: /* EN QUAD */
5061 case 0x2001: /* EM QUAD */
5062 case 0x2002: /* EN SPACE */
5063 case 0x2003: /* EM SPACE */
5064 case 0x2004: /* THREE-PER-EM SPACE */
5065 case 0x2005: /* FOUR-PER-EM SPACE */
5066 case 0x2006: /* SIX-PER-EM SPACE */
5067 case 0x2007: /* FIGURE SPACE */
5068 case 0x2008: /* PUNCTUATION SPACE */
5069 case 0x2009: /* THIN SPACE */
5070 case 0x200A: /* HAIR SPACE */
5071 case 0x202f: /* NARROW NO-BREAK SPACE */
5072 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5073 case 0x3000: /* IDEOGRAPHIC SPACE */
5074 gotspace = TRUE;
5075 break;
5076 }
5077 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5078 eptr += len;
5079 }
5080 break;
5081
5082 case OP_NOT_VSPACE:
5083 case OP_VSPACE:
5084 for (i = min; i < max; i++)
5085 {
5086 BOOL gotspace;
5087 int len = 1;
5088 if (eptr >= md->end_subject)
5089 {
5090 SCHECK_PARTIAL();
5091 break;
5092 }
5093 GETCHARLEN(c, eptr, len);
5094 switch(c)
5095 {
5096 default: gotspace = FALSE; break;
5097 case 0x0a: /* LF */
5098 case 0x0b: /* VT */
5099 case 0x0c: /* FF */
5100 case 0x0d: /* CR */
5101 case 0x85: /* NEL */
5102 case 0x2028: /* LINE SEPARATOR */
5103 case 0x2029: /* PARAGRAPH SEPARATOR */
5104 gotspace = TRUE;
5105 break;
5106 }
5107 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5108 eptr += len;
5109 }
5110 break;
5111
5112 case OP_NOT_DIGIT:
5113 for (i = min; i < max; i++)
5114 {
5115 int len = 1;
5116 if (eptr >= md->end_subject)
5117 {
5118 SCHECK_PARTIAL();
5119 break;
5120 }
5121 GETCHARLEN(c, eptr, len);
5122 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5123 eptr+= len;
5124 }
5125 break;
5126
5127 case OP_DIGIT:
5128 for (i = min; i < max; i++)
5129 {
5130 int len = 1;
5131 if (eptr >= md->end_subject)
5132 {
5133 SCHECK_PARTIAL();
5134 break;
5135 }
5136 GETCHARLEN(c, eptr, len);
5137 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5138 eptr+= len;
5139 }
5140 break;
5141
5142 case OP_NOT_WHITESPACE:
5143 for (i = min; i < max; i++)
5144 {
5145 int len = 1;
5146 if (eptr >= md->end_subject)
5147 {
5148 SCHECK_PARTIAL();
5149 break;
5150 }
5151 GETCHARLEN(c, eptr, len);
5152 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5153 eptr+= len;
5154 }
5155 break;
5156
5157 case OP_WHITESPACE:
5158 for (i = min; i < max; i++)
5159 {
5160 int len = 1;
5161 if (eptr >= md->end_subject)
5162 {
5163 SCHECK_PARTIAL();
5164 break;
5165 }
5166 GETCHARLEN(c, eptr, len);
5167 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5168 eptr+= len;
5169 }
5170 break;
5171
5172 case OP_NOT_WORDCHAR:
5173 for (i = min; i < max; i++)
5174 {
5175 int len = 1;
5176 if (eptr >= md->end_subject)
5177 {
5178 SCHECK_PARTIAL();
5179 break;
5180 }
5181 GETCHARLEN(c, eptr, len);
5182 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5183 eptr+= len;
5184 }
5185 break;
5186
5187 case OP_WORDCHAR:
5188 for (i = min; i < max; i++)
5189 {
5190 int len = 1;
5191 if (eptr >= md->end_subject)
5192 {
5193 SCHECK_PARTIAL();
5194 break;
5195 }
5196 GETCHARLEN(c, eptr, len);
5197 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5198 eptr+= len;
5199 }
5200 break;
5201
5202 default:
5203 RRETURN(PCRE_ERROR_INTERNAL);
5204 }
5205
5206 /* eptr is now past the end of the maximum run */
5207
5208 if (possessive) continue;
5209 for(;;)
5210 {
5211 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
5212 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5213 if (eptr-- == pp) break; /* Stop if tried at original pos */
5214 BACKCHAR(eptr);
5215 }
5216 }
5217 else
5218 #endif /* SUPPORT_UTF8 */
5219
5220 /* Not UTF-8 mode */
5221 {
5222 switch(ctype)
5223 {
5224 case OP_ANY:
5225 for (i = min; i < max; i++)
5226 {
5227 if (eptr >= md->end_subject)
5228 {
5229 SCHECK_PARTIAL();
5230 break;
5231 }
5232 if (IS_NEWLINE(eptr)) break;
5233 eptr++;
5234 }
5235 break;
5236
5237 case OP_ALLANY:
5238 case OP_ANYBYTE:
5239 c = max - min;
5240 if (c > (unsigned int)(md->end_subject - eptr))
5241 {
5242 eptr = md->end_subject;
5243 SCHECK_PARTIAL();
5244 }
5245 else eptr += c;
5246 break;
5247
5248 case OP_ANYNL:
5249 for (i = min; i < max; i++)
5250 {
5251 if (eptr >= md->end_subject)
5252 {
5253 SCHECK_PARTIAL();
5254 break;
5255 }
5256 c = *eptr;
5257 if (c == 0x000d)
5258 {
5259 if (++eptr >= md->end_subject) break;
5260 if (*eptr == 0x000a) eptr++;
5261 }
5262 else
5263 {
5264 if (c != 0x000a &&
5265 (md->bsr_anycrlf ||
5266 (c != 0x000b && c != 0x000c && c != 0x0085)))
5267 break;
5268 eptr++;
5269 }
5270 }
5271 break;
5272
5273 case OP_NOT_HSPACE:
5274 for (i = min; i < max; i++)
5275 {
5276 if (eptr >= md->end_subject)
5277 {
5278 SCHECK_PARTIAL();
5279 break;
5280 }
5281 c = *eptr;
5282 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5283 eptr++;
5284 }
5285 break;
5286
5287 case OP_HSPACE:
5288 for (i = min; i < max; i++)
5289 {
5290 if (eptr >= md->end_subject)
5291 {
5292 SCHECK_PARTIAL();
5293 break;
5294 }
5295 c = *eptr;
5296 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5297 eptr++;
5298 }
5299 break;
5300
5301 case OP_NOT_VSPACE:
5302 for (i = min; i < max; i++)
5303 {
5304 if (eptr >= md->end_subject)
5305 {
5306 SCHECK_PARTIAL();
5307 break;
5308 }
5309 c = *eptr;
5310 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5311 break;
5312 eptr++;
5313 }
5314 break;
5315
5316 case OP_VSPACE:
5317 for (i = min; i < max; i++)
5318 {
5319 if (eptr >= md->end_subject)
5320 {
5321 SCHECK_PARTIAL();
5322 break;
5323 }
5324 c = *eptr;
5325 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5326 break;
5327 eptr++;
5328 }
5329 break;
5330
5331 case OP_NOT_DIGIT:
5332 for (i = min; i < max; i++)
5333 {
5334 if (eptr >= md->end_subject)
5335 {
5336 SCHECK_PARTIAL();
5337 break;
5338 }
5339 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5340 eptr++;
5341 }
5342 break;
5343
5344 case OP_DIGIT:
5345 for (i = min; i < max; i++)
5346 {
5347 if (eptr >= md->end_subject)
5348 {
5349 SCHECK_PARTIAL();
5350 break;
5351 }
5352 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5353 eptr++;
5354 }
5355 break;
5356
5357 case OP_NOT_WHITESPACE:
5358 for (i = min; i < max; i++)
5359 {
5360 if (eptr >= md->end_subject)
5361 {
5362 SCHECK_PARTIAL();
5363 break;
5364 }
5365 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5366 eptr++;
5367 }
5368 break;
5369
5370 case OP_WHITESPACE:
5371 for (i = min; i < max; i++)
5372 {
5373 if (eptr >= md->end_subject)
5374 {
5375 SCHECK_PARTIAL();
5376 break;
5377 }
5378 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5379 eptr++;
5380 }
5381 break;
5382
5383 case OP_NOT_WORDCHAR:
5384 for (i = min; i < max; i++)
5385 {
5386 if (eptr >= md->end_subject)
5387 {
5388 SCHECK_PARTIAL();
5389 break;
5390 }
5391 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5392 eptr++;
5393 }
5394 break;
5395
5396 case OP_WORDCHAR:
5397 for (i = min; i < max; i++)
5398 {
5399 if (eptr >= md->end_subject)
5400 {
5401 SCHECK_PARTIAL();
5402 break;
5403 }
5404 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5405 eptr++;
5406 }
5407 break;
5408
5409 default:
5410 RRETURN(PCRE_ERROR_INTERNAL);
5411 }
5412
5413 /* eptr is now past the end of the maximum run */
5414
5415 if (possessive) continue;
5416 while (eptr >= pp)
5417 {
5418 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5419 eptr--;
5420 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5421 }
5422 }
5423
5424 /* Get here if we can't make it match with any permitted repetitions */
5425
5426 MRRETURN(MATCH_NOMATCH);
5427 }
5428 /* Control never gets here */
5429
5430 /* There's been some horrible disaster. Arrival here can only mean there is
5431 something seriously wrong in the code above or the OP_xxx definitions. */
5432
5433 default:
5434 DPRINTF(("Unknown opcode %d\n", *ecode));
5435 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5436 }
5437
5438 /* Do not stick any code in here without much thought; it is assumed
5439 that "continue" in the code above comes out to here to repeat the main
5440 loop. */
5441
5442 } /* End of main loop */
5443 /* Control never reaches here */
5444
5445
5446 /* When compiling to use the heap rather than the stack for recursive calls to
5447 match(), the RRETURN() macro jumps here. The number that is saved in
5448 frame->Xwhere indicates which label we actually want to return to. */
5449
5450 #ifdef NO_RECURSE
5451 #define LBL(val) case val: goto L_RM##val;
5452 HEAP_RETURN:
5453 switch (frame->Xwhere)
5454 {
5455 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5456 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5457 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5458 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5459 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58)
5460 #ifdef SUPPORT_UTF8
5461 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5462 LBL(32) LBL(34) LBL(42) LBL(46)
5463 #ifdef SUPPORT_UCP
5464 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5465 #endif /* SUPPORT_UCP */
5466 #endif /* SUPPORT_UTF8 */
5467 default:
5468 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5469 return PCRE_ERROR_INTERNAL;
5470 }
5471 #undef LBL
5472 #endif /* NO_RECURSE */
5473 }
5474
5475
5476 /***************************************************************************
5477 ****************************************************************************
5478 RECURSION IN THE match() FUNCTION
5479
5480 Undefine all the macros that were defined above to handle this. */
5481
5482 #ifdef NO_RECURSE /* heap-based recursion build; see HEAP_RETURN above */
5483 #undef eptr
5484 #undef ecode
5485 #undef mstart
5486 #undef offset_top
5487 #undef ims
5488 #undef eptrb
5489 #undef flags
5490
5491 #undef callpat
5492 #undef charptr
5493 #undef data
5494 #undef next
5495 #undef pp
5496 #undef prev
5497 #undef saved_eptr
5498
5499 #undef new_recursive
5500
5501 #undef cur_is_word
5502 #undef condition
5503 #undef prev_is_word
5504
5505 #undef original_ims
5506
5507 #undef ctype
5508 #undef length
5509 #undef max
5510 #undef min
5511 #undef number
5512 #undef offset
5513 #undef op
5514 #undef save_capture_last
5515 #undef save_offset1
5516 #undef save_offset2
5517 #undef save_offset3
5518 #undef stacksave
5519
5520 #undef newptrb
5521
5522 #endif /* NO_RECURSE */
5523
5524 /* These two are defined as macros whether or not NO_RECURSE is set, so
5525 they are undefined unconditionally, outside the conditional above. */
5526 #undef fc
5527 #undef fi
5528
5529 /***************************************************************************
5530 ***************************************************************************/
5531
5532
5533
5534 /*************************************************
5535 * Execute a Regular Expression *
5536 *************************************************/
5537
5538 /* This function applies a compiled re to a subject string and picks out
5539 portions of the string if it matches. Two elements in the vector are set for
5540 each substring: the offsets to the start and end of the substring.
5541
5542 Arguments:
5543 argument_re points to the compiled expression
5544 extra_data points to extra data or is NULL
5545 subject points to the subject string
5546 length length of subject string (may contain binary zeros)
5547 start_offset where to start in the subject string
5548 options option bits
5549 offsets points to a vector of ints to be filled in with offsets
5550 offsetcount the number of elements in the vector
5551
5552 Returns: > 0 => success; value is the number of elements filled in
5553 = 0 => success, but offsets is not big enough
5554 -1 => failed to match
5555 < -1 => some kind of unexpected problem
5556 */
5557
5558 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5559 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5560 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5561 int offsetcount)
5562 {
5563 int rc, resetcount, ocount;
5564 int first_byte = -1;
5565 int req_byte = -1;
5566 int req_byte2 = -1;
5567 int newline;
5568 unsigned long int ims;
5569 BOOL using_temporary_offsets = FALSE;
5570 BOOL anchored;
5571 BOOL startline;
5572 BOOL firstline;
5573 BOOL first_byte_caseless = FALSE;
5574 BOOL req_byte_caseless = FALSE;
5575 BOOL utf8;
5576 match_data match_block;
5577 match_data *md = &match_block;
5578 const uschar *tables;
5579 const uschar *start_bits = NULL;
5580 USPTR start_match = (USPTR)subject + start_offset;
5581 USPTR end_subject;
5582 USPTR start_partial = NULL;
5583 USPTR req_byte_ptr = start_match - 1;
5584
5585 pcre_study_data internal_study;
5586 const pcre_study_data *study;
5587
5588 real_pcre internal_re;
5589 const real_pcre *external_re = (const real_pcre *)argument_re;
5590 const real_pcre *re = external_re;
5591
5592 /* Plausibility checks */
5593
5594 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5595 if (re == NULL || subject == NULL ||
5596 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5597 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5598
5599 /* This information is for finding all the numbers associated with a given
5600 name, for condition testing. */
5601
5602 md->name_table = (uschar *)re + re->name_table_offset;
5603 md->name_count = re->name_count;
5604 md->name_entry_size = re->name_entry_size;
5605
5606 /* Fish out the optional data from the extra_data structure, first setting
5607 the default values. */
5608
5609 study = NULL;
5610 md->match_limit = MATCH_LIMIT;
5611 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5612 md->callout_data = NULL;
5613
5614 /* The table pointer is always in native byte order. */
5615
5616 tables = external_re->tables;
5617
5618 if (extra_data != NULL)
5619 {
5620 register unsigned int flags = extra_data->flags;
5621 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5622 study = (const pcre_study_data *)extra_data->study_data;
5623 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5624 md->match_limit = extra_data->match_limit;
5625 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5626 md->match_limit_recursion = extra_data->match_limit_recursion;
5627 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5628 md->callout_data = extra_data->callout_data;
5629 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5630 }
5631
5632 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5633 is a feature that makes it possible to save compiled regexes and re-use them
5634 in other programs later. */
5635
5636 if (tables == NULL) tables = _pcre_default_tables;
5637
5638 /* Check that the first field in the block is the magic number. If it is not,
5639 test for a regex that was compiled on a host of opposite endianness. If this is
5640 the case, flipped values are put in internal_re and internal_study if there was
5641 study data too. */
5642
5643 if (re->magic_number != MAGIC_NUMBER)
5644 {
5645 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5646 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5647 if (study != NULL) study = &internal_study;
5648 }
5649
5650 /* Set up other data */
5651
5652 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5653 startline = (re->flags & PCRE_STARTLINE) != 0;
5654 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5655
5656 /* The code starts after the real_pcre block and the capture name table. */
5657
5658 md->start_code = (const uschar *)external_re + re->name_table_offset +
5659 re->name_count * re->name_entry_size;
5660
5661 md->start_subject = (USPTR)subject;
5662 md->start_offset = start_offset;
5663 md->end_subject = md->start_subject + length;
5664 end_subject = md->end_subject;
5665
5666 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5667 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5668 md->use_ucp = (re->options & PCRE_UCP) != 0;
5669 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5670
5671 md->notbol = (options & PCRE_NOTBOL) != 0;
5672 md->noteol = (options & PCRE_NOTEOL) != 0;
5673 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5674 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5675 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5676 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5677 md->hitend = FALSE;
5678 md->mark = NULL; /* In case never set */
5679
5680 md->recursive = NULL; /* No recursion at top level */
5681
5682 md->lcc = tables + lcc_offset;
5683 md->ctypes = tables + ctypes_offset;
5684
5685 /* Handle different \R options. */
5686
5687 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5688 {
5689 case 0:
5690 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5691 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5692 else
5693 #ifdef BSR_ANYCRLF
5694 md->bsr_anycrlf = TRUE;
5695 #else
5696 md->bsr_anycrlf = FALSE;
5697 #endif
5698 break;
5699
5700 case PCRE_BSR_ANYCRLF:
5701 md->bsr_anycrlf = TRUE;
5702 break;
5703
5704 case PCRE_BSR_UNICODE:
5705 md->bsr_anycrlf = FALSE;
5706 break;
5707
5708 default: return PCRE_ERROR_BADNEWLINE;
5709 }
5710
5711 /* Handle different types of newline. The three bits give eight cases. If
5712 nothing is set at run time, whatever was used at compile time applies. */
5713
5714 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5715 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5716 {
5717 case 0: newline = NEWLINE; break; /* Compile-time default */
5718 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5719 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5720 case PCRE_NEWLINE_CR+
5721 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5722 case PCRE_NEWLINE_ANY: newline = -1; break;
5723 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5724 default: return PCRE_ERROR_BADNEWLINE;
5725 }
5726
5727 if (newline == -2)
5728 {
5729 md->nltype = NLTYPE_ANYCRLF;
5730 }
5731 else if (newline < 0)
5732 {
5733 md->nltype = NLTYPE_ANY;
5734 }
5735 else
5736 {
5737 md->nltype = NLTYPE_FIXED;
5738 if (newline > 255)
5739 {
5740 md->nllen = 2;
5741 md->nl[0] = (newline >> 8) & 255;
5742 md->nl[1] = newline & 255;
5743 }
5744 else
5745 {
5746 md->nllen = 1;
5747 md->nl[0] = newline;
5748 }
5749 }
5750
5751 /* Partial matching was originally supported only for a restricted set of
5752 regexes; from release 8.00 there are no restrictions, but the bits are still
5753 defined (though never set). So there's no harm in leaving this code. */
5754
5755 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5756 return PCRE_ERROR_BADPARTIAL;
5757
5758 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5759 back the character offset. */
5760
5761 #ifdef SUPPORT_UTF8
5762 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5763 {
5764 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5765 return PCRE_ERROR_BADUTF8;
5766 if (start_offset > 0 && start_offset < length)
5767 {
5768 int tb = ((USPTR)subject)[start_offset];
5769 if (tb > 127)
5770 {
5771 tb &= 0xc0;
5772 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5773 }
5774 }
5775 }
5776 #endif
5777
5778 /* The ims options can vary during the matching as a result of the presence
5779 of (?ims) items in the pattern. They are kept in a local variable so that
5780 restoring at the exit of a group is easy. */
5781
5782 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5783
5784 /* If the expression has got more back references than the offsets supplied can
5785 hold, we get a temporary chunk of working store to use during the matching.
5786 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5787 of 3. */
5788
5789 ocount = offsetcount - (offsetcount % 3);
5790
5791 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5792 {
5793 ocount = re->top_backref * 3 + 3;
5794 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5795 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5796 using_temporary_offsets = TRUE;
5797 DPRINTF(("Got memory to hold back references\n"));
5798 }
5799 else md->offset_vector = offsets;
5800
5801 md->offset_end = ocount;
5802 md->offset_max = (2*ocount)/3;
5803 md->offset_overflow = FALSE;
5804 md->capture_last = -1;
5805
5806 /* Compute the minimum number of offsets that we need to reset each time. Doing
5807 this makes a huge difference to execution time when there aren't many brackets
5808 in the pattern. */
5809
5810 resetcount = 2 + re->top_bracket * 2;
5811 if (resetcount > offsetcount) resetcount = ocount;
5812
5813 /* Reset the working variable associated with each extraction. These should
5814 never be used unless previously set, but they get saved and restored, and so we
5815 initialize them to avoid reading uninitialized locations. */
5816
5817 if (md->offset_vector != NULL)
5818 {
5819 register int *iptr = md->offset_vector + ocount;
5820 register int *iend = iptr - resetcount/2 + 1;
5821 while (--iptr >= iend) *iptr = -1;
5822 }
5823
5824 /* Set up the first character to match, if available. The first_byte value is
5825 never set for an anchored regular expression, but the anchoring may be forced
5826 at run time, so we have to test for anchoring. The first char may be unset for
5827 an unanchored pattern, of course. If there's no first char and the pattern was
5828 studied, there may be a bitmap of possible first characters. */
5829
5830 if (!anchored)
5831 {
5832 if ((re->flags & PCRE_FIRSTSET) != 0)
5833 {
5834 first_byte = re->first_byte & 255;
5835 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5836 first_byte = md->lcc[first_byte];
5837 }
5838 else
5839 if (!startline && study != NULL &&
5840 (study->flags & PCRE_STUDY_MAPPED) != 0)
5841 start_bits = study->start_bits;
5842 }
5843
5844 /* For anchored or unanchored matches, there may be a "last known required
5845 character" set. */
5846
5847 if ((re->flags & PCRE_REQCHSET) != 0)
5848 {
5849 req_byte = re->req_byte & 255;
5850 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5851 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5852 }
5853
5854
5855 /* ==========================================================================*/
5856
5857 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
5858 the loop runs just once. */
5859
5860 for(;;)
5861 {
5862 USPTR save_end_subject = end_subject;
5863 USPTR new_start_match;
5864
5865 /* Reset the maximum number of extractions we might see. */
5866
5867 if (md->offset_vector != NULL)
5868 {
5869 register int *iptr = md->offset_vector;
5870 register int *iend = iptr + resetcount;
5871 while (iptr < iend) *iptr++ = -1;
5872 }
5873
5874 /* If firstline is TRUE, the start of the match is constrained to the first
5875 line of a multiline string. That is, the match must be before or at the first
5876 newline. Implement this by temporarily adjusting end_subject so that we stop
5877 scanning at a newline. If the match fails at the newline, later code breaks
5878 this loop. */
5879
5880 if (firstline)
5881 {
5882 USPTR t = start_match;
5883 #ifdef SUPPORT_UTF8
5884 if (utf8)
5885 {
5886 while (t < md->end_subject && !IS_NEWLINE(t))
5887 {
5888 t++;
5889 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5890 }
5891 }
5892 else
5893 #endif
5894 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5895 end_subject = t;
5896 }
5897
5898 /* There are some optimizations that avoid running the match if a known
5899 starting point is not found, or if a known later character is not present.
5900 However, there is an option that disables these, for testing and for ensuring
5901 that all callouts do actually occur. */
5902
5903 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5904 {
5905 /* Advance to a unique first byte if there is one. */
5906
5907 if (first_byte >= 0)
5908 {
5909 if (first_byte_caseless)
5910 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5911 start_match++;
5912 else
5913 while (start_match < end_subject && *start_match != first_byte)
5914 start_match++;
5915 }
5916
5917 /* Or to just after a linebreak for a multiline match */
5918
5919 else if (startline)
5920 {
5921 if (start_match > md->start_subject + start_offset)
5922 {
5923 #ifdef SUPPORT_UTF8
5924 if (utf8)
5925 {
5926 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5927 {
5928 start_match++;
5929 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5930 start_match++;
5931 }
5932 }
5933 else
5934 #endif
5935 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5936 start_match++;
5937
5938 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5939 and we are now at a LF, advance the match position by one more character.
5940 */
5941
5942 if (start_match[-1] == CHAR_CR &&
5943 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5944 start_match < end_subject &&
5945 *start_match == CHAR_NL)
5946 start_match++;
5947 }
5948 }
5949
5950 /* Or to a non-unique first byte after study */
5951
5952 else if (start_bits != NULL)
5953 {
5954 while (start_match < end_subject)
5955 {
5956 register unsigned int c = *start_match;
5957 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5958 else break;
5959 }
5960 }
5961 } /* Starting optimizations */
5962
5963 /* Restore fudged end_subject */
5964
5965 end_subject = save_end_subject;
5966
5967 /* The following two optimizations are disabled for partial matching or if
5968 disabling is explicitly requested. */
5969
5970 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
5971 {
5972 /* If the pattern was studied, a minimum subject length may be set. This is
5973 a lower bound; no actual string of that length may actually match the
5974 pattern. Although the value is, strictly, in characters, we treat it as
5975 bytes to avoid spending too much time in this optimization. */
5976
5977 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
5978 (pcre_uint32)(end_subject - start_match) < study->minlength)
5979 {
5980 rc = MATCH_NOMATCH;
5981 break;
5982 }
5983
5984 /* If req_byte is set, we know that that character must appear in the
5985 subject for the match to succeed. If the first character is set, req_byte
5986 must be later in the subject; otherwise the test starts at the match point.
5987 This optimization can save a huge amount of backtracking in patterns with
5988 nested unlimited repeats that aren't going to match. Writing separate code
5989 for cased/caseless versions makes it go faster, as does using an
5990 autoincrement and backing off on a match.
5991
5992 HOWEVER: when the subject string is very, very long, searching to its end
5993 can take a long time, and give bad performance on quite ordinary patterns.
5994 This showed up when somebody was matching something like /^\d+C/ on a
5995 32-megabyte string... so we don't do this when the string is sufficiently
5996 long. */
5997
5998 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
5999 {
6000 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6001
6002 /* We don't need to repeat the search if we haven't yet reached the
6003 place we found it at last time. */
6004
6005 if (p > req_byte_ptr)
6006 {
6007 if (req_byte_caseless)
6008 {
6009 while (p < end_subject)
6010 {
6011 register int pp = *p++;
6012 if (pp == req_byte || pp == req_byte2) { p--; break; }
6013 }
6014 }
6015 else
6016 {
6017 while (p < end_subject)
6018 {
6019 if (*p++ == req_byte) { p--; break; }
6020 }
6021 }
6022
6023 /* If we can't find the required character, break the matching loop,
6024 forcing a match failure. */
6025
6026 if (p >= end_subject)
6027 {
6028 rc = MATCH_NOMATCH;
6029 break;
6030 }
6031
6032 /* If we have found the required character, save the point where we
6033 found it, so that we don't search again next time round the loop if
6034 the start hasn't passed this character yet. */
6035
6036 req_byte_ptr = p;
6037 }
6038 }
6039 }
6040
6041 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6042 printf(">>>> Match against: ");
6043 pchars(start_match, end_subject - start_match, TRUE, md);
6044 printf("\n");
6045 #endif
6046
6047 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6048 first starting point for which a partial match was found. */
6049
6050 md->start_match_ptr = start_match;
6051 md->start_used_ptr = start_match;
6052 md->match_call_count = 0;
6053 rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL,
6054 0, 0);
6055 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6056
6057 switch(rc)
6058 {
6059 /* NOMATCH and PRUNE advance by one character. If MATCH_SKIP_ARG reaches
6060 this level it means that a MARK that matched the SKIP's arg was not found.
6061 We treat this as NOMATCH. THEN at this level acts exactly like PRUNE. */
6062
6063 case MATCH_NOMATCH:
6064 case MATCH_PRUNE:
6065 case MATCH_SKIP_ARG:
6066 case MATCH_THEN:
6067 new_start_match = start_match + 1;
6068 #ifdef SUPPORT_UTF8
6069 if (utf8)
6070 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6071 new_start_match++;
6072 #endif
6073 break;
6074
6075 /* SKIP passes back the next starting point explicitly. */
6076
6077 case MATCH_SKIP:
6078 new_start_match = md->start_match_ptr;
6079 break;
6080
6081 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6082
6083 case MATCH_COMMIT:
6084 rc = MATCH_NOMATCH;
6085 goto ENDLOOP;
6086
6087 /* Any other return is either a match, or some kind of error. */
6088
6089 default:
6090 goto ENDLOOP;
6091 }
6092
6093 /* Control reaches here for the various types of "no match at this point"
6094 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6095
6096 rc = MATCH_NOMATCH;
6097
6098 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6099 newline in the subject (though it may continue over the newline). Therefore,
6100 if we have just failed to match, starting at a newline, do not continue. */
6101
6102 if (firstline && IS_NEWLINE(start_match)) break;
6103
6104 /* Advance to new matching position */
6105
6106 start_match = new_start_match;
6107
6108 /* Break the loop if the pattern is anchored or if we have passed the end of
6109 the subject. */
6110
6111 if (anchored || start_match > end_subject) break;
6112
6113 /* If we have just passed a CR and we are now at a LF, and the pattern does
6114 not contain any explicit matches for \r or \n, and the newline option is CRLF
6115 or ANY or ANYCRLF, advance the match position by one more character. */
6116
6117 if (start_match[-1] == CHAR_CR &&
6118 start_match < end_subject &&
6119 *start_match == CHAR_NL &&
6120 (re->flags & PCRE_HASCRORLF) == 0 &&
6121 (md->nltype == NLTYPE_ANY ||
6122 md->nltype == NLTYPE_ANYCRLF ||
6123 md->nllen == 2))
6124 start_match++;
6125
6126 md->mark = NULL; /* Reset for start of next match attempt */
6127 } /* End of for(;;) "bumpalong" loop */
6128
6129 /* ==========================================================================*/
6130
6131 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6132 conditions is true:
6133
6134 (1) The pattern is anchored or the match was failed by (*COMMIT);
6135
6136 (2) We are past the end of the subject;
6137
6138 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6139 this option requests that a match occur at or before the first newline in
6140 the subject.
6141
6142 When we have a match and the offset vector is big enough to deal with any
6143 backreferences, captured substring offsets will already be set up. In the case
6144 where we had to get some local store to hold offsets for backreference
6145 processing, copy those that we can. In this case there need not be overflow if
6146 certain parts of the pattern were not used, even though there are more
6147 capturing parentheses than vector slots. */
6148
6149 ENDLOOP:
6150
6151 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6152 {
6153 if (using_temporary_offsets)
6154 {
6155 if (offsetcount >= 4)
6156 {
6157 memcpy(offsets + 2, md->offset_vector + 2,
6158 (offsetcount - 2) * sizeof(int));
6159 DPRINTF(("Copied offsets from temporary memory\n"));
6160 }
6161 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6162 DPRINTF(("Freeing temporary memory\n"));
6163 (pcre_free)(md->offset_vector);
6164 }
6165
6166 /* Set the return code to the number of captured strings, or 0 if there are
6167 too many to fit into the vector. */
6168
6169 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6170
6171 /* If there is space, set up the whole thing as substring 0. The value of
6172 md->start_match_ptr might be modified if \K was encountered on the successful
6173 matching path. */
6174
6175 if (offsetcount < 2) rc = 0; else
6176 {
6177 offsets[0] = md->start_match_ptr - md->start_subject;
6178 offsets[1] = md->end_match_ptr - md->start_subject;
6179 }
6180
6181 DPRINTF((">>>> returning %d\n", rc));
6182 goto RETURN_MARK;
6183 }
6184
6185 /* Control gets here if there has been an error, or if the overall match
6186 attempt has failed at all permitted starting positions. */
6187
6188 if (using_temporary_offsets)
6189 {
6190 DPRINTF(("Freeing temporary memory\n"));
6191 (pcre_free)(md->offset_vector);
6192 }
6193
6194 /* For anything other than nomatch or partial match, just return the code. */
6195
6196 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6197 {
6198 DPRINTF((">>>> error: returning %d\n", rc));
6199 return rc;
6200 }
6201
6202 /* Handle partial matches - disable any mark data */
6203
6204 if (start_partial != NULL)
6205 {
6206 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6207 md->mark = NULL;
6208 if (offsetcount > 1)
6209 {
6210 offsets[0] = start_partial - (USPTR)subject;
6211 offsets[1] = end_subject - (USPTR)subject;
6212 }
6213 rc = PCRE_ERROR_PARTIAL;
6214 }
6215
6216 /* This is the classic nomatch case */
6217
6218 else
6219 {
6220 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6221 rc = PCRE_ERROR_NOMATCH;
6222 }
6223
6224 /* Return the MARK data if it has been requested. */
6225
6226 RETURN_MARK:
6227
6228 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6229 *(extra_data->mark) = (unsigned char *)(md->mark);
6230 return rc;
6231 }
6232
6233 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12