/[pcre]/code/tags/pcre-8.02/pcre_exec.c
ViewVC logotype

Contents of /code/tags/pcre-8.02/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 509 - (show annotations) (download)
Fri Mar 19 10:26:50 2010 UTC (4 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 171447 byte(s)
Tag release 8.02.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Bits that may be set in the "flags" argument of match() */

#define match_condassert     0x01  /* Called to check a condition assertion */
#define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */

/* Ordinary (non-error) results of match(). Error results are the externally
defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Results of match() that are used only internally. They are chosen to lie
well below zero so that they keep clear of the external error codes. */

#define MATCH_COMMIT       (-999)
#define MATCH_PRUNE        (-998)
#define MATCH_SKIP         (-997)
#define MATCH_THEN         (-996)

/* The largest number of offset-vector ints that is saved on the stack for a
recursive call; malloc is used when the offset vector is bigger. Keep this a
multiple of 3, because the length of the offset vector is always a multiple
of 3. */

#define REC_STACK_SAVE_MAX 30

/* Minimum and maximum counts for the common repeats. In rep_max, a value of
0 stands for "no upper limit". */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89
90
91
92 #ifdef PCRE_DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if the requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef PCRE_DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 {
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 }
179 }
180 else
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. */
186
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
/* Identifying numbers for the RMATCH calls, one per call site, counting up
from 1. If this list is changed, the code at HEAP_RETURN below must be updated
to stay in sync with it. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10,    RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19,
  RM20,    RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29,
  RM30,    RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39,
  RM40,    RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49,
  RM50,    RM51, RM52, RM53, RM54
};
249
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
253
254 #ifndef NO_RECURSE
#define REGISTER register

#ifndef PCRE_DEBUG

/* Production versions: RMATCH is a direct recursive call of match() one level
deeper, with its result left in rrc, and RRETURN is a plain return. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra

#else

/* Debugging versions: the same call and return, with the source line number
printed around them. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#endif
274
275 #else
276
277
278 /* These versions of the macros manage a private stack on the heap. Note that
279 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280 argument of match(), which never changes. */
281
282 #define REGISTER
283
/* Heap version of RMATCH. A new frame is obtained from pcre_stack_malloc and
loaded with the arguments for the "called" instance of match(); the current
frame remembers in Xwhere which call site (rw) is making the call. Control then
jumps to HEAP_RECURSE with the new frame as the current one. The label L_<rw>
marks the place where this call site is resumed (presumably reached from the
HEAP_RETURN code, which is not part of this macro). The "rd" argument is not
used here.

If the new frame cannot be obtained, the current instance of match() gives up
with PCRE_ERROR_NOMEMORY by way of RRETURN, instead of writing through a NULL
pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xmarkptr = markptr;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
304
/* Heap version of RRETURN. The current frame is released with pcre_stack_free
and the previous frame becomes the current one. If there is no previous frame,
the frame just released was the top-level one, so this is a real return from
match(). Otherwise the result is handed over in rrc and control jumps to
HEAP_RETURN. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
317
318
319 /* Structure for remembering the local variables in a private frame */
320
321 typedef struct heapframe {
322 struct heapframe *Xprevframe;
323
324 /* Function arguments that may change */
325
326 USPTR Xeptr;
327 const uschar *Xecode;
328 USPTR Xmstart;
329 USPTR Xmarkptr;
330 int Xoffset_top;
331 long int Xims;
332 eptrblock *Xeptrb;
333 int Xflags;
334 unsigned int Xrdepth;
335
336 /* Function local variables */
337
338 USPTR Xcallpat;
339 #ifdef SUPPORT_UTF8
340 USPTR Xcharptr;
341 #endif
342 USPTR Xdata;
343 USPTR Xnext;
344 USPTR Xpp;
345 USPTR Xprev;
346 USPTR Xsaved_eptr;
347
348 recursion_info Xnew_recursive;
349
350 BOOL Xcur_is_word;
351 BOOL Xcondition;
352 BOOL Xprev_is_word;
353
354 unsigned long int Xoriginal_ims;
355
356 #ifdef SUPPORT_UCP
357 int Xprop_type;
358 int Xprop_value;
359 int Xprop_fail_result;
360 int Xprop_category;
361 int Xprop_chartype;
362 int Xprop_script;
363 int Xoclength;
364 uschar Xocchars[8];
365 #endif
366
367 int Xcodelink;
368 int Xctype;
369 unsigned int Xfc;
370 int Xfi;
371 int Xlength;
372 int Xmax;
373 int Xmin;
374 int Xnumber;
375 int Xoffset;
376 int Xop;
377 int Xsave_capture_last;
378 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
379 int Xstacksave[REC_STACK_SAVE_MAX];
380
381 eptrblock Xnewptrb;
382
383 /* Where to jump back to */
384
385 int Xwhere;
386
387 } heapframe;
388
389 #endif
390
391
392 /***************************************************************************
393 ***************************************************************************/
394
395
396
397 /*************************************************
398 * Match from current position *
399 *************************************************/
400
401 /* This function is called recursively in many circumstances. Whenever it
402 returns a negative (error) response, the outer incarnation must also return the
403 same response. */
404
/* These macros pack up the tests for partial matching that appear several
times in the code. CHECK_PARTIAL sets the "hit end" flag when partial matching
is enabled and the subject pointer is at or beyond the end of the subject and
also past the start of the match (i.e. something has been matched). For hard
partial matching (md->partial > 1) it then returns PCRE_ERROR_PARTIAL
immediately. SCHECK_PARTIAL leaves out the end-of-subject test; it is for use
when we already know we are past the end of the subject. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr > mstart && eptr >= md->end_subject)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
    }

#define SCHECK_PARTIAL()\
  if (eptr > mstart && md->partial != 0)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
    }
425
426
427 /* Performance note: It might be tempting to extract commonly used fields from
428 the md structure (e.g. utf8, end_subject) into individual variables to improve
429 performance. Tests using gcc on a SPARC disproved this; in the first case, it
430 made performance worse.
431
432 Arguments:
433 eptr pointer to current character in subject
434 ecode pointer to current position in compiled code
435 mstart pointer to the current match start position (can be modified
436 by encountering \K)
437 markptr pointer to the most recent MARK name, or NULL
438 offset_top current top pointer
439 md pointer to "static" info for the match
440 ims current /i, /m, and /s options
441 eptrb pointer to chain of blocks containing eptr at start of
442 brackets - for testing for empty matches
443 flags can contain
444 match_condassert - this is an assertion condition
445 match_cbegroup - this is the start of an unlimited repeat
446 group that can match an empty string
447 rdepth the recursion depth
448
449 Returns: MATCH_MATCH if matched ) these values are >= 0
450 MATCH_NOMATCH if failed to match )
451 a negative PCRE_ERROR_xxx value if aborted by an error condition
452 (e.g. stopped by repeated call or recursion limit)
453 */
454
455 static int
456 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart, USPTR
457 markptr, int offset_top, match_data *md, unsigned long int ims,
458 eptrblock *eptrb, int flags, unsigned int rdepth)
459 {
460 /* These variables do not need to be preserved over recursion in this function,
461 so they can be ordinary variables in all cases. Mark some of them with
462 "register" because they are used a lot in loops. */
463
464 register int rrc; /* Returns from recursive calls */
465 register int i; /* Used for loops not involving calls to RMATCH() */
466 register unsigned int c; /* Character values not kept over RMATCH() calls */
467 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
468
469 BOOL minimize, possessive; /* Quantifier options */
470 int condcode;
471
472 /* When recursion is not being used, all "local" variables that have to be
473 preserved over calls to RMATCH() are part of a "frame" which is obtained from
474 heap storage. Set up the top-level frame here; others are obtained from the
475 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
476
477 #ifdef NO_RECURSE
478 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
479 frame->Xprevframe = NULL; /* Marks the top level */
480
481 /* Copy in the original argument variables */
482
483 frame->Xeptr = eptr;
484 frame->Xecode = ecode;
485 frame->Xmstart = mstart;
486 frame->Xmarkptr = markptr;
487 frame->Xoffset_top = offset_top;
488 frame->Xims = ims;
489 frame->Xeptrb = eptrb;
490 frame->Xflags = flags;
491 frame->Xrdepth = rdepth;
492
493 /* This is where control jumps back to to effect "recursion" */
494
495 HEAP_RECURSE:
496
497 /* Macros make the argument variables come from the current frame */
498
499 #define eptr frame->Xeptr
500 #define ecode frame->Xecode
501 #define mstart frame->Xmstart
502 #define markptr frame->Xmarkptr
503 #define offset_top frame->Xoffset_top
504 #define ims frame->Xims
505 #define eptrb frame->Xeptrb
506 #define flags frame->Xflags
507 #define rdepth frame->Xrdepth
508
509 /* Ditto for the local variables */
510
511 #ifdef SUPPORT_UTF8
512 #define charptr frame->Xcharptr
513 #endif
514 #define callpat frame->Xcallpat
515 #define codelink frame->Xcodelink
516 #define data frame->Xdata
517 #define next frame->Xnext
518 #define pp frame->Xpp
519 #define prev frame->Xprev
520 #define saved_eptr frame->Xsaved_eptr
521
522 #define new_recursive frame->Xnew_recursive
523
524 #define cur_is_word frame->Xcur_is_word
525 #define condition frame->Xcondition
526 #define prev_is_word frame->Xprev_is_word
527
528 #define original_ims frame->Xoriginal_ims
529
530 #ifdef SUPPORT_UCP
531 #define prop_type frame->Xprop_type
532 #define prop_value frame->Xprop_value
533 #define prop_fail_result frame->Xprop_fail_result
534 #define prop_category frame->Xprop_category
535 #define prop_chartype frame->Xprop_chartype
536 #define prop_script frame->Xprop_script
537 #define oclength frame->Xoclength
538 #define occhars frame->Xocchars
539 #endif
540
541 #define ctype frame->Xctype
542 #define fc frame->Xfc
543 #define fi frame->Xfi
544 #define length frame->Xlength
545 #define max frame->Xmax
546 #define min frame->Xmin
547 #define number frame->Xnumber
548 #define offset frame->Xoffset
549 #define op frame->Xop
550 #define save_capture_last frame->Xsave_capture_last
551 #define save_offset1 frame->Xsave_offset1
552 #define save_offset2 frame->Xsave_offset2
553 #define save_offset3 frame->Xsave_offset3
554 #define stacksave frame->Xstacksave
555
556 #define newptrb frame->Xnewptrb
557
558 /* When recursion is being used, local variables are allocated on the stack and
559 get preserved during recursion in the normal way. In this environment, fi and
560 i, and fc and c, can be the same variables. */
561
562 #else /* NO_RECURSE not defined */
563 #define fi i
564 #define fc c
565
566
567 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
568 const uschar *charptr; /* in small blocks of the code. My normal */
569 #endif /* style of coding would have declared */
570 const uschar *callpat; /* them within each of those blocks. */
571 const uschar *data; /* However, in order to accommodate the */
572 const uschar *next; /* version of this code that uses an */
573 USPTR pp; /* external "stack" implemented on the */
574 const uschar *prev; /* heap, it is easier to declare them all */
575 USPTR saved_eptr; /* here, so the declarations can be cut */
576 /* out in a block. The only declarations */
577 recursion_info new_recursive; /* within blocks below are for variables */
578 /* that do not have to be preserved over */
579 BOOL cur_is_word; /* a recursive call to RMATCH(). */
580 BOOL condition;
581 BOOL prev_is_word;
582
583 unsigned long int original_ims;
584
585 #ifdef SUPPORT_UCP
586 int prop_type;
587 int prop_value;
588 int prop_fail_result;
589 int prop_category;
590 int prop_chartype;
591 int prop_script;
592 int oclength;
593 uschar occhars[8];
594 #endif
595
596 int codelink;
597 int ctype;
598 int length;
599 int max;
600 int min;
601 int number;
602 int offset;
603 int op;
604 int save_capture_last;
605 int save_offset1, save_offset2, save_offset3;
606 int stacksave[REC_STACK_SAVE_MAX];
607
608 eptrblock newptrb;
609 #endif /* NO_RECURSE */
610
611 /* These statements are here to stop the compiler complaining about unitialized
612 variables. */
613
614 #ifdef SUPPORT_UCP
615 prop_value = 0;
616 prop_fail_result = 0;
617 #endif
618
619
620 /* This label is used for tail recursion, which is used in a few cases even
621 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
622 used. Thanks to Ian Taylor for noticing this possibility and sending the
623 original patch. */
624
625 TAIL_RECURSE:
626
627 /* OK, now we can get on with the real code of the function. Recursive calls
628 are specified by the macro RMATCH and RRETURN is used to return. When
629 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
630 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
631 defined). However, RMATCH isn't like a function call because it's quite a
632 complicated macro. It has to be used in one particular way. This shouldn't,
633 however, impact performance when true recursion is being used. */
634
635 #ifdef SUPPORT_UTF8
636 utf8 = md->utf8; /* Local copy of the flag */
637 #else
638 utf8 = FALSE;
639 #endif
640
641 /* First check that we haven't called match() too many times, or that we
642 haven't exceeded the recursive call limit. */
643
644 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
645 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
646
647 original_ims = ims; /* Save for resetting on ')' */
648
649 /* At the start of a group with an unlimited repeat that may match an empty
650 string, the match_cbegroup flag is set. When this is the case, add the current
651 subject pointer to the chain of such remembered pointers, to be checked when we
652 hit the closing ket, in order to break infinite loops that match no characters.
653 When match() is called in other circumstances, don't add to the chain. The
654 match_cbegroup flag must NOT be used with tail recursion, because the memory
655 block that is used is on the stack, so a new one may be required for each
656 match(). */
657
658 if ((flags & match_cbegroup) != 0)
659 {
660 newptrb.epb_saved_eptr = eptr;
661 newptrb.epb_prev = eptrb;
662 eptrb = &newptrb;
663 }
664
665 /* Now start processing the opcodes. */
666
667 for (;;)
668 {
669 minimize = possessive = FALSE;
670 op = *ecode;
671
672 switch(op)
673 {
674 case OP_FAIL:
675 RRETURN(MATCH_NOMATCH);
676
677 case OP_PRUNE:
678 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
679 ims, eptrb, flags, RM51);
680 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
681 RRETURN(MATCH_PRUNE);
682
683 case OP_COMMIT:
684 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
685 ims, eptrb, flags, RM52);
686 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
687 RRETURN(MATCH_COMMIT);
688
689 case OP_SKIP:
690 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
691 ims, eptrb, flags, RM53);
692 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
693 md->start_match_ptr = eptr; /* Pass back current position */
694 RRETURN(MATCH_SKIP);
695
696 case OP_THEN:
697 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
698 ims, eptrb, flags, RM54);
699 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
700 RRETURN(MATCH_THEN);
701
702 /* Handle a capturing bracket. If there is space in the offset vector, save
703 the current subject position in the working slot at the top of the vector.
704 We mustn't change the current values of the data slot, because they may be
705 set from a previous iteration of this group, and be referred to by a
706 reference inside the group.
707
708 If the bracket fails to match, we need to restore this value and also the
709 values of the final offsets, in case they were set by a previous iteration
710 of the same bracket.
711
712 If there isn't enough space in the offset vector, treat this as if it were
713 a non-capturing bracket. Don't worry about setting the flag for the error
714 case here; that is handled in the code for KET. */
715
716 case OP_CBRA:
717 case OP_SCBRA:
718 number = GET2(ecode, 1+LINK_SIZE);
719 offset = number << 1;
720
721 #ifdef PCRE_DEBUG
722 printf("start bracket %d\n", number);
723 printf("subject=");
724 pchars(eptr, 16, TRUE, md);
725 printf("\n");
726 #endif
727
728 if (offset < md->offset_max)
729 {
730 save_offset1 = md->offset_vector[offset];
731 save_offset2 = md->offset_vector[offset+1];
732 save_offset3 = md->offset_vector[md->offset_end - number];
733 save_capture_last = md->capture_last;
734
735 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
736 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
737
738 flags = (op == OP_SCBRA)? match_cbegroup : 0;
739 do
740 {
741 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
742 ims, eptrb, flags, RM1);
743 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
744 md->capture_last = save_capture_last;
745 ecode += GET(ecode, 1);
746 }
747 while (*ecode == OP_ALT);
748
749 DPRINTF(("bracket %d failed\n", number));
750
751 md->offset_vector[offset] = save_offset1;
752 md->offset_vector[offset+1] = save_offset2;
753 md->offset_vector[md->offset_end - number] = save_offset3;
754
755 RRETURN(MATCH_NOMATCH);
756 }
757
758 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
759 as a non-capturing bracket. */
760
761 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
762 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
763
764 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
765
766 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
767 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
768
769 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
770 final alternative within the brackets, we would return the result of a
771 recursive call to match() whatever happened. We can reduce stack usage by
772 turning this into a tail recursion, except in the case when match_cbegroup
773 is set.*/
774
775 case OP_BRA:
776 case OP_SBRA:
777 DPRINTF(("start non-capturing bracket\n"));
778 flags = (op >= OP_SBRA)? match_cbegroup : 0;
779 for (;;)
780 {
781 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
782 {
783 if (flags == 0) /* Not a possibly empty group */
784 {
785 ecode += _pcre_OP_lengths[*ecode];
786 DPRINTF(("bracket 0 tail recursion\n"));
787 goto TAIL_RECURSE;
788 }
789
790 /* Possibly empty group; can't use tail recursion. */
791
792 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
793 eptrb, flags, RM48);
794 RRETURN(rrc);
795 }
796
797 /* For non-final alternatives, continue the loop for a NOMATCH result;
798 otherwise return. */
799
800 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
801 eptrb, flags, RM2);
802 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
803 ecode += GET(ecode, 1);
804 }
805 /* Control never reaches here. */
806
807 /* Conditional group: compilation checked that there are no more than
808 two branches. If the condition is false, skipping the first branch takes us
809 past the end if there is only one branch, but that's OK because that is
810 exactly what going to the ket would do. As there is only one branch to be
811 obeyed, we can use tail recursion to avoid using another stack frame. */
812
813 case OP_COND:
814 case OP_SCOND:
815 codelink= GET(ecode, 1);
816
817 /* Because of the way auto-callout works during compile, a callout item is
818 inserted between OP_COND and an assertion condition. */
819
820 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
821 {
822 if (pcre_callout != NULL)
823 {
824 pcre_callout_block cb;
825 cb.version = 1; /* Version 1 of the callout block */
826 cb.callout_number = ecode[LINK_SIZE+2];
827 cb.offset_vector = md->offset_vector;
828 cb.subject = (PCRE_SPTR)md->start_subject;
829 cb.subject_length = md->end_subject - md->start_subject;
830 cb.start_match = mstart - md->start_subject;
831 cb.current_position = eptr - md->start_subject;
832 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
833 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
834 cb.capture_top = offset_top/2;
835 cb.capture_last = md->capture_last;
836 cb.callout_data = md->callout_data;
837 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
838 if (rrc < 0) RRETURN(rrc);
839 }
840 ecode += _pcre_OP_lengths[OP_CALLOUT];
841 }
842
843 condcode = ecode[LINK_SIZE+1];
844
845 /* Now see what the actual condition is */
846
847 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
848 {
849 if (md->recursive == NULL) /* Not recursing => FALSE */
850 {
851 condition = FALSE;
852 ecode += GET(ecode, 1);
853 }
854 else
855 {
856 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
857 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
858
859 /* If the test is for recursion into a specific subpattern, and it is
860 false, but the test was set up by name, scan the table to see if the
861 name refers to any other numbers, and test them. The condition is true
862 if any one is set. */
863
864 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
865 {
866 uschar *slotA = md->name_table;
867 for (i = 0; i < md->name_count; i++)
868 {
869 if (GET2(slotA, 0) == recno) break;
870 slotA += md->name_entry_size;
871 }
872
873 /* Found a name for the number - there can be only one; duplicate
874 names for different numbers are allowed, but not vice versa. First
875 scan down for duplicates. */
876
877 if (i < md->name_count)
878 {
879 uschar *slotB = slotA;
880 while (slotB > md->name_table)
881 {
882 slotB -= md->name_entry_size;
883 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
884 {
885 condition = GET2(slotB, 0) == md->recursive->group_num;
886 if (condition) break;
887 }
888 else break;
889 }
890
891 /* Scan up for duplicates */
892
893 if (!condition)
894 {
895 slotB = slotA;
896 for (i++; i < md->name_count; i++)
897 {
898 slotB += md->name_entry_size;
899 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
900 {
901 condition = GET2(slotB, 0) == md->recursive->group_num;
902 if (condition) break;
903 }
904 else break;
905 }
906 }
907 }
908 }
909
910 /* Chose branch according to the condition */
911
912 ecode += condition? 3 : GET(ecode, 1);
913 }
914 }
915
916 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
917 {
918 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
919 condition = offset < offset_top && md->offset_vector[offset] >= 0;
920
921 /* If the numbered capture is unset, but the reference was by name,
922 scan the table to see if the name refers to any other numbers, and test
923 them. The condition is true if any one is set. This is tediously similar
924 to the code above, but not close enough to try to amalgamate. */
925
926 if (!condition && condcode == OP_NCREF)
927 {
928 int refno = offset >> 1;
929 uschar *slotA = md->name_table;
930
931 for (i = 0; i < md->name_count; i++)
932 {
933 if (GET2(slotA, 0) == refno) break;
934 slotA += md->name_entry_size;
935 }
936
937 /* Found a name for the number - there can be only one; duplicate names
938 for different numbers are allowed, but not vice versa. First scan down
939 for duplicates. */
940
941 if (i < md->name_count)
942 {
943 uschar *slotB = slotA;
944 while (slotB > md->name_table)
945 {
946 slotB -= md->name_entry_size;
947 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
948 {
949 offset = GET2(slotB, 0) << 1;
950 condition = offset < offset_top &&
951 md->offset_vector[offset] >= 0;
952 if (condition) break;
953 }
954 else break;
955 }
956
957 /* Scan up for duplicates */
958
959 if (!condition)
960 {
961 slotB = slotA;
962 for (i++; i < md->name_count; i++)
963 {
964 slotB += md->name_entry_size;
965 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
966 {
967 offset = GET2(slotB, 0) << 1;
968 condition = offset < offset_top &&
969 md->offset_vector[offset] >= 0;
970 if (condition) break;
971 }
972 else break;
973 }
974 }
975 }
976 }
977
978 /* Choose branch according to the condition */
979
980 ecode += condition? 3 : GET(ecode, 1);
981 }
982
983 else if (condcode == OP_DEF) /* DEFINE - always false */
984 {
985 condition = FALSE;
986 ecode += GET(ecode, 1);
987 }
988
989 /* The condition is an assertion. Call match() to evaluate it - setting
990 the final argument match_condassert causes it to stop at the end of an
991 assertion. */
992
993 else
994 {
995 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
996 match_condassert, RM3);
997 if (rrc == MATCH_MATCH)
998 {
999 condition = TRUE;
1000 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1001 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1002 }
1003 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1004 {
1005 RRETURN(rrc); /* Need braces because of following else */
1006 }
1007 else
1008 {
1009 condition = FALSE;
1010 ecode += codelink;
1011 }
1012 }
1013
1014 /* We are now at the branch that is to be obeyed. As there is only one,
1015 we can use tail recursion to avoid using another stack frame, except when
1016 match_cbegroup is required for an unlimited repeat of a possibly empty
1017 group. If the second alternative doesn't exist, we can just plough on. */
1018
1019 if (condition || *ecode == OP_ALT)
1020 {
1021 ecode += 1 + LINK_SIZE;
1022 if (op == OP_SCOND) /* Possibly empty group */
1023 {
1024 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1025 RRETURN(rrc);
1026 }
1027 else /* Group must match something */
1028 {
1029 flags = 0;
1030 goto TAIL_RECURSE;
1031 }
1032 }
1033 else /* Condition false & no alternative */
1034 {
1035 ecode += 1 + LINK_SIZE;
1036 }
1037 break;
1038
1039
1040 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1041 to close any currently open capturing brackets. */
1042
1043 case OP_CLOSE:
1044 number = GET2(ecode, 1);
1045 offset = number << 1;
1046
1047 #ifdef PCRE_DEBUG
1048 printf("end bracket %d at *ACCEPT", number);
1049 printf("\n");
1050 #endif
1051
1052 md->capture_last = number;
1053 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1054 {
1055 md->offset_vector[offset] =
1056 md->offset_vector[md->offset_end - number];
1057 md->offset_vector[offset+1] = eptr - md->start_subject;
1058 if (offset_top <= offset) offset_top = offset + 2;
1059 }
1060 ecode += 3;
1061 break;
1062
1063
1064 /* End of the pattern, either real or forced. If we are in a top-level
1065 recursion, we should restore the offsets appropriately and continue from
1066 after the call. */
1067
1068 case OP_ACCEPT:
1069 case OP_END:
1070 if (md->recursive != NULL && md->recursive->group_num == 0)
1071 {
1072 recursion_info *rec = md->recursive;
1073 DPRINTF(("End of pattern in a (?0) recursion\n"));
1074 md->recursive = rec->prevrec;
1075 memmove(md->offset_vector, rec->offset_save,
1076 rec->saved_max * sizeof(int));
1077 offset_top = rec->save_offset_top;
1078 ims = original_ims;
1079 ecode = rec->after_call;
1080 break;
1081 }
1082
1083 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1084 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1085 the subject. In both cases, backtracking will then try other alternatives,
1086 if any. */
1087
1088 if (eptr == mstart &&
1089 (md->notempty ||
1090 (md->notempty_atstart &&
1091 mstart == md->start_subject + md->start_offset)))
1092 RRETURN(MATCH_NOMATCH);
1093
1094 /* Otherwise, we have a match. */
1095
1096 md->end_match_ptr = eptr; /* Record where we ended */
1097 md->end_offset_top = offset_top; /* and how many extracts were taken */
1098 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1099 RRETURN(MATCH_MATCH);
1100
1101 /* Change option settings */
1102
1103 case OP_OPT:
1104 ims = ecode[1];
1105 ecode += 2;
1106 DPRINTF(("ims set to %02lx\n", ims));
1107 break;
1108
1109 /* Assertion brackets. Check the alternative branches in turn - the
1110 matching won't pass the KET for an assertion. If any one branch matches,
1111 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1112 start of each branch to move the current point backwards, so the code at
1113 this level is identical to the lookahead case. */
1114
1115 case OP_ASSERT:
1116 case OP_ASSERTBACK:
1117 do
1118 {
1119 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1120 RM4);
1121 if (rrc == MATCH_MATCH)
1122 {
1123 mstart = md->start_match_ptr; /* In case \K reset it */
1124 break;
1125 }
1126 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1127 ecode += GET(ecode, 1);
1128 }
1129 while (*ecode == OP_ALT);
1130 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1131
1132 /* If checking an assertion for a condition, return MATCH_MATCH. */
1133
1134 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1135
1136 /* Continue from after the assertion, updating the offsets high water
1137 mark, since extracts may have been taken during the assertion. */
1138
1139 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1140 ecode += 1 + LINK_SIZE;
1141 offset_top = md->end_offset_top;
1142 continue;
1143
1144 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1145 PRUNE, or COMMIT means we must assume failure without checking subsequent
1146 branches. */
1147
1148 case OP_ASSERT_NOT:
1149 case OP_ASSERTBACK_NOT:
1150 do
1151 {
1152 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1153 RM5);
1154 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
1155 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1156 {
1157 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1158 break;
1159 }
1160 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1161 ecode += GET(ecode,1);
1162 }
1163 while (*ecode == OP_ALT);
1164
1165 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1166
1167 ecode += 1 + LINK_SIZE;
1168 continue;
1169
1170 /* Move the subject pointer back. This occurs only at the start of
1171 each branch of a lookbehind assertion. If we are too close to the start to
1172 move back, this match function fails. When working with UTF-8 we move
1173 back a number of characters, not bytes. */
1174
1175 case OP_REVERSE:
1176 #ifdef SUPPORT_UTF8
1177 if (utf8)
1178 {
1179 i = GET(ecode, 1);
1180 while (i-- > 0)
1181 {
1182 eptr--;
1183 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1184 BACKCHAR(eptr);
1185 }
1186 }
1187 else
1188 #endif
1189
1190 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1191
1192 {
1193 eptr -= GET(ecode, 1);
1194 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1195 }
1196
1197 /* Save the earliest consulted character, then skip to next op code */
1198
1199 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1200 ecode += 1 + LINK_SIZE;
1201 break;
1202
1203 /* The callout item calls an external function, if one is provided, passing
1204 details of the match so far. This is mainly for debugging, though the
1205 function is able to force a failure. */
1206
1207 case OP_CALLOUT:
1208 if (pcre_callout != NULL)
1209 {
1210 pcre_callout_block cb;
1211 cb.version = 1; /* Version 1 of the callout block */
1212 cb.callout_number = ecode[1];
1213 cb.offset_vector = md->offset_vector;
1214 cb.subject = (PCRE_SPTR)md->start_subject;
1215 cb.subject_length = md->end_subject - md->start_subject;
1216 cb.start_match = mstart - md->start_subject;
1217 cb.current_position = eptr - md->start_subject;
1218 cb.pattern_position = GET(ecode, 2);
1219 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1220 cb.capture_top = offset_top/2;
1221 cb.capture_last = md->capture_last;
1222 cb.callout_data = md->callout_data;
1223 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1224 if (rrc < 0) RRETURN(rrc);
1225 }
1226 ecode += 2 + 2*LINK_SIZE;
1227 break;
1228
1229 /* Recursion either matches the current regex, or some subexpression. The
1230 offset data is the offset to the starting bracket from the start of the
1231 whole pattern. (This is so that it works from duplicated subpatterns.)
1232
1233 If there are any capturing brackets started but not finished, we have to
1234 save their starting points and reinstate them after the recursion. However,
1235 we don't know how many such there are (offset_top records the completed
1236 total) so we just have to save all the potential data. There may be up to
1237 65535 such values, which is too large to put on the stack, but using malloc
1238 for small numbers seems expensive. As a compromise, the stack is used when
1239 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1240 is used. A problem is what to do if the malloc fails ... there is no way of
1241 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
1242 values on the stack, and accept that the rest may be wrong.
1243
1244 There are also other values that have to be saved. We use a chained
1245 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1246 for the original version of this logic. */
1247
1248 case OP_RECURSE:
1249 {
1250 callpat = md->start_code + GET(ecode, 1);
1251 new_recursive.group_num = (callpat == md->start_code)? 0 :
1252 GET2(callpat, 1 + LINK_SIZE);
1253
1254 /* Add to "recursing stack" */
1255
1256 new_recursive.prevrec = md->recursive;
1257 md->recursive = &new_recursive;
1258
1259 /* Find where to continue from afterwards */
1260
1261 ecode += 1 + LINK_SIZE;
1262 new_recursive.after_call = ecode;
1263
1264 /* Now save the offset data. */
1265
1266 new_recursive.saved_max = md->offset_end;
1267 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1268 new_recursive.offset_save = stacksave;
1269 else
1270 {
1271 new_recursive.offset_save =
1272 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1273 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1274 }
1275
1276 memcpy(new_recursive.offset_save, md->offset_vector,
1277 new_recursive.saved_max * sizeof(int));
1278 new_recursive.save_offset_top = offset_top;
1279
1280 /* OK, now we can do the recursion. For each top-level alternative we
1281 restore the offset and recursion data. */
1282
1283 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1284 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1285 do
1286 {
1287 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1288 md, ims, eptrb, flags, RM6);
1289 if (rrc == MATCH_MATCH)
1290 {
1291 DPRINTF(("Recursion matched\n"));
1292 md->recursive = new_recursive.prevrec;
1293 if (new_recursive.offset_save != stacksave)
1294 (pcre_free)(new_recursive.offset_save);
1295 RRETURN(MATCH_MATCH);
1296 }
1297 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1298 {
1299 DPRINTF(("Recursion gave error %d\n", rrc));
1300 if (new_recursive.offset_save != stacksave)
1301 (pcre_free)(new_recursive.offset_save);
1302 RRETURN(rrc);
1303 }
1304
1305 md->recursive = &new_recursive;
1306 memcpy(md->offset_vector, new_recursive.offset_save,
1307 new_recursive.saved_max * sizeof(int));
1308 callpat += GET(callpat, 1);
1309 }
1310 while (*callpat == OP_ALT);
1311
1312 DPRINTF(("Recursion didn't match\n"));
1313 md->recursive = new_recursive.prevrec;
1314 if (new_recursive.offset_save != stacksave)
1315 (pcre_free)(new_recursive.offset_save);
1316 RRETURN(MATCH_NOMATCH);
1317 }
1318 /* Control never reaches here */
1319
1320 /* "Once" brackets are like assertion brackets except that after a match,
1321 the point in the subject string is not moved back. Thus there can never be
1322 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1323 Check the alternative branches in turn - the matching won't pass the KET
1324 for this kind of subpattern. If any one branch matches, we carry on as at
1325 the end of a normal bracket, leaving the subject pointer, but resetting
1326 the start-of-match value in case it was changed by \K. */
1327
1328 case OP_ONCE:
1329 prev = ecode;
1330 saved_eptr = eptr;
1331
1332 do
1333 {
1334 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1335 if (rrc == MATCH_MATCH)
1336 {
1337 mstart = md->start_match_ptr;
1338 break;
1339 }
1340 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1341 ecode += GET(ecode,1);
1342 }
1343 while (*ecode == OP_ALT);
1344
1345 /* If we hit the end of the group (which could be repeated), fail */
1346
1347 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1348
1349 /* Continue as from after the assertion, updating the offsets high water
1350 mark, since extracts may have been taken. */
1351
1352 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1353
1354 offset_top = md->end_offset_top;
1355 eptr = md->end_match_ptr;
1356
1357 /* For a non-repeating ket, just continue at this level. This also
1358 happens for a repeating ket if no characters were matched in the group.
1359 This is the forcible breaking of infinite loops as implemented in Perl
1360 5.005. If there is an options reset, it will get obeyed in the normal
1361 course of events. */
1362
1363 if (*ecode == OP_KET || eptr == saved_eptr)
1364 {
1365 ecode += 1+LINK_SIZE;
1366 break;
1367 }
1368
1369 /* The repeating kets try the rest of the pattern or restart from the
1370 preceding bracket, in the appropriate order. The second "call" of match()
1371 uses tail recursion, to avoid using another stack frame. We need to reset
1372 any options that changed within the bracket before re-running it, so
1373 check the next opcode. */
1374
1375 if (ecode[1+LINK_SIZE] == OP_OPT)
1376 {
1377 ims = (ims & ~PCRE_IMS) | ecode[4];
1378 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1379 }
1380
1381 if (*ecode == OP_KETRMIN)
1382 {
1383 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1384 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1385 ecode = prev;
1386 flags = 0;
1387 goto TAIL_RECURSE;
1388 }
1389 else /* OP_KETRMAX */
1390 {
1391 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1392 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1393 ecode += 1 + LINK_SIZE;
1394 flags = 0;
1395 goto TAIL_RECURSE;
1396 }
1397 /* Control never gets here */
1398
1399 /* An alternation is the end of a branch; scan along to find the end of the
1400 bracketed group and go to there. */
1401
1402 case OP_ALT:
1403 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1404 break;
1405
1406 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1407 indicating that it may occur zero times. It may repeat infinitely, or not
1408 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1409 with fixed upper repeat limits are compiled as a number of copies, with the
1410 optional ones preceded by BRAZERO or BRAMINZERO. */
1411
1412 case OP_BRAZERO:
1413 {
1414 next = ecode+1;
1415 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1416 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1417 do next += GET(next,1); while (*next == OP_ALT);
1418 ecode = next + 1 + LINK_SIZE;
1419 }
1420 break;
1421
1422 case OP_BRAMINZERO:
1423 {
1424 next = ecode+1;
1425 do next += GET(next, 1); while (*next == OP_ALT);
1426 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1427 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1428 ecode++;
1429 }
1430 break;
1431
1432 case OP_SKIPZERO:
1433 {
1434 next = ecode+1;
1435 do next += GET(next,1); while (*next == OP_ALT);
1436 ecode = next + 1 + LINK_SIZE;
1437 }
1438 break;
1439
1440 /* End of a group, repeated or non-repeating. */
1441
1442 case OP_KET:
1443 case OP_KETRMIN:
1444 case OP_KETRMAX:
1445 prev = ecode - GET(ecode, 1);
1446
1447 /* If this was a group that remembered the subject start, in order to break
1448 infinite repeats of empty string matches, retrieve the subject start from
1449 the chain. Otherwise, set it NULL. */
1450
1451 if (*prev >= OP_SBRA)
1452 {
1453 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1454 eptrb = eptrb->epb_prev; /* Backup to previous group */
1455 }
1456 else saved_eptr = NULL;
1457
1458 /* If we are at the end of an assertion group or an atomic group, stop
1459 matching and return MATCH_MATCH, but record the current high water mark for
1460 use by positive assertions. We also need to record the match start in case
1461 it was changed by \K. */
1462
1463 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1464 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1465 *prev == OP_ONCE)
1466 {
1467 md->end_match_ptr = eptr; /* For ONCE */
1468 md->end_offset_top = offset_top;
1469 md->start_match_ptr = mstart;
1470 RRETURN(MATCH_MATCH);
1471 }
1472
1473 /* For capturing groups we have to check the group number back at the start
1474 and if necessary complete handling an extraction by setting the offsets and
1475 bumping the high water mark. Note that whole-pattern recursion is coded as
1476 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1477 when the OP_END is reached. Other recursion is handled here. */
1478
1479 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1480 {
1481 number = GET2(prev, 1+LINK_SIZE);
1482 offset = number << 1;
1483
1484 #ifdef PCRE_DEBUG
1485 printf("end bracket %d", number);
1486 printf("\n");
1487 #endif
1488
1489 md->capture_last = number;
1490 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1491 {
1492 md->offset_vector[offset] =
1493 md->offset_vector[md->offset_end - number];
1494 md->offset_vector[offset+1] = eptr - md->start_subject;
1495 if (offset_top <= offset) offset_top = offset + 2;
1496 }
1497
1498 /* Handle a recursively called group. Restore the offsets
1499 appropriately and continue from after the call. */
1500
1501 if (md->recursive != NULL && md->recursive->group_num == number)
1502 {
1503 recursion_info *rec = md->recursive;
1504 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1505 md->recursive = rec->prevrec;
1506 memcpy(md->offset_vector, rec->offset_save,
1507 rec->saved_max * sizeof(int));
1508 offset_top = rec->save_offset_top;
1509 ecode = rec->after_call;
1510 ims = original_ims;
1511 break;
1512 }
1513 }
1514
1515 /* For both capturing and non-capturing groups, reset the value of the ims
1516 flags, in case they got changed during the group. */
1517
1518 ims = original_ims;
1519 DPRINTF(("ims reset to %02lx\n", ims));
1520
1521 /* For a non-repeating ket, just continue at this level. This also
1522 happens for a repeating ket if no characters were matched in the group.
1523 This is the forcible breaking of infinite loops as implemented in Perl
1524 5.005. If there is an options reset, it will get obeyed in the normal
1525 course of events. */
1526
1527 if (*ecode == OP_KET || eptr == saved_eptr)
1528 {
1529 ecode += 1 + LINK_SIZE;
1530 break;
1531 }
1532
1533 /* The repeating kets try the rest of the pattern or restart from the
1534 preceding bracket, in the appropriate order. In the second case, we can use
1535 tail recursion to avoid using another stack frame, unless we have an
1536 unlimited repeat of a group that can match an empty string. */
1537
1538 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1539
1540 if (*ecode == OP_KETRMIN)
1541 {
1542 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1543 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1544 if (flags != 0) /* Could match an empty string */
1545 {
1546 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1547 RRETURN(rrc);
1548 }
1549 ecode = prev;
1550 goto TAIL_RECURSE;
1551 }
1552 else /* OP_KETRMAX */
1553 {
1554 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1555 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1556 ecode += 1 + LINK_SIZE;
1557 flags = 0;
1558 goto TAIL_RECURSE;
1559 }
1560 /* Control never gets here */
1561
1562 /* Start of subject unless notbol, or after internal newline if multiline */
1563
1564 case OP_CIRC:
1565 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1566 if ((ims & PCRE_MULTILINE) != 0)
1567 {
1568 if (eptr != md->start_subject &&
1569 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1570 RRETURN(MATCH_NOMATCH);
1571 ecode++;
1572 break;
1573 }
1574 /* ... else fall through */
1575
1576 /* Start of subject assertion */
1577
1578 case OP_SOD:
1579 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1580 ecode++;
1581 break;
1582
1583 /* Start of match assertion */
1584
1585 case OP_SOM:
1586 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1587 ecode++;
1588 break;
1589
1590 /* Reset the start of match point */
1591
1592 case OP_SET_SOM:
1593 mstart = eptr;
1594 ecode++;
1595 break;
1596
1597 /* Assert before internal newline if multiline, or before a terminating
1598 newline unless endonly is set, else end of subject unless noteol is set. */
1599
1600 case OP_DOLL:
1601 if ((ims & PCRE_MULTILINE) != 0)
1602 {
1603 if (eptr < md->end_subject)
1604 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1605 else
1606 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1607 ecode++;
1608 break;
1609 }
1610 else
1611 {
1612 if (md->noteol) RRETURN(MATCH_NOMATCH);
1613 if (!md->endonly)
1614 {
1615 if (eptr != md->end_subject &&
1616 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1617 RRETURN(MATCH_NOMATCH);
1618 ecode++;
1619 break;
1620 }
1621 }
1622 /* ... else fall through for endonly */
1623
1624 /* End of subject assertion (\z) */
1625
1626 case OP_EOD:
1627 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1628 ecode++;
1629 break;
1630
1631 /* End of subject or ending \n assertion (\Z) */
1632
1633 case OP_EODN:
1634 if (eptr != md->end_subject &&
1635 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1636 RRETURN(MATCH_NOMATCH);
1637 ecode++;
1638 break;
1639
1640 /* Word boundary assertions */
1641
1642 case OP_NOT_WORD_BOUNDARY:
1643 case OP_WORD_BOUNDARY:
1644 {
1645
1646 /* Find out if the previous and current characters are "word" characters.
1647 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1648 be "non-word" characters. Remember the earliest consulted character for
1649 partial matching. */
1650
1651 #ifdef SUPPORT_UTF8
1652 if (utf8)
1653 {
1654 if (eptr == md->start_subject) prev_is_word = FALSE; else
1655 {
1656 USPTR lastptr = eptr - 1;
1657 while((*lastptr & 0xc0) == 0x80) lastptr--;
1658 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1659 GETCHAR(c, lastptr);
1660 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1661 }
1662 if (eptr >= md->end_subject)
1663 {
1664 SCHECK_PARTIAL();
1665 cur_is_word = FALSE;
1666 }
1667 else
1668 {
1669 GETCHAR(c, eptr);
1670 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1671 }
1672 }
1673 else
1674 #endif
1675
1676 /* Not in UTF-8 mode */
1677
1678 {
1679 if (eptr == md->start_subject) prev_is_word = FALSE; else
1680 {
1681 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1682 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1683 }
1684 if (eptr >= md->end_subject)
1685 {
1686 SCHECK_PARTIAL();
1687 cur_is_word = FALSE;
1688 }
1689 else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1690 }
1691
1692 /* Now see if the situation is what we want */
1693
1694 if ((*ecode++ == OP_WORD_BOUNDARY)?
1695 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1696 RRETURN(MATCH_NOMATCH);
1697 }
1698 break;
1699
1700 /* Match a single character type; inline for speed */
1701
1702 case OP_ANY:
1703 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1704 /* Fall through */
1705
1706 case OP_ALLANY:
1707 if (eptr++ >= md->end_subject)
1708 {
1709 SCHECK_PARTIAL();
1710 RRETURN(MATCH_NOMATCH);
1711 }
1712 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1713 ecode++;
1714 break;
1715
1716 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1717 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1718
1719 case OP_ANYBYTE:
1720 if (eptr++ >= md->end_subject)
1721 {
1722 SCHECK_PARTIAL();
1723 RRETURN(MATCH_NOMATCH);
1724 }
1725 ecode++;
1726 break;
1727
1728 case OP_NOT_DIGIT:
1729 if (eptr >= md->end_subject)
1730 {
1731 SCHECK_PARTIAL();
1732 RRETURN(MATCH_NOMATCH);
1733 }
1734 GETCHARINCTEST(c, eptr);
1735 if (
1736 #ifdef SUPPORT_UTF8
1737 c < 256 &&
1738 #endif
1739 (md->ctypes[c] & ctype_digit) != 0
1740 )
1741 RRETURN(MATCH_NOMATCH);
1742 ecode++;
1743 break;
1744
1745 case OP_DIGIT:
1746 if (eptr >= md->end_subject)
1747 {
1748 SCHECK_PARTIAL();
1749 RRETURN(MATCH_NOMATCH);
1750 }
1751 GETCHARINCTEST(c, eptr);
1752 if (
1753 #ifdef SUPPORT_UTF8
1754 c >= 256 ||
1755 #endif
1756 (md->ctypes[c] & ctype_digit) == 0
1757 )
1758 RRETURN(MATCH_NOMATCH);
1759 ecode++;
1760 break;
1761
1762 case OP_NOT_WHITESPACE:
1763 if (eptr >= md->end_subject)
1764 {
1765 SCHECK_PARTIAL();
1766 RRETURN(MATCH_NOMATCH);
1767 }
1768 GETCHARINCTEST(c, eptr);
1769 if (
1770 #ifdef SUPPORT_UTF8
1771 c < 256 &&
1772 #endif
1773 (md->ctypes[c] & ctype_space) != 0
1774 )
1775 RRETURN(MATCH_NOMATCH);
1776 ecode++;
1777 break;
1778
1779 case OP_WHITESPACE:
1780 if (eptr >= md->end_subject)
1781 {
1782 SCHECK_PARTIAL();
1783 RRETURN(MATCH_NOMATCH);
1784 }
1785 GETCHARINCTEST(c, eptr);
1786 if (
1787 #ifdef SUPPORT_UTF8
1788 c >= 256 ||
1789 #endif
1790 (md->ctypes[c] & ctype_space) == 0
1791 )
1792 RRETURN(MATCH_NOMATCH);
1793 ecode++;
1794 break;
1795
1796 case OP_NOT_WORDCHAR:
1797 if (eptr >= md->end_subject)
1798 {
1799 SCHECK_PARTIAL();
1800 RRETURN(MATCH_NOMATCH);
1801 }
1802 GETCHARINCTEST(c, eptr);
1803 if (
1804 #ifdef SUPPORT_UTF8
1805 c < 256 &&
1806 #endif
1807 (md->ctypes[c] & ctype_word) != 0
1808 )
1809 RRETURN(MATCH_NOMATCH);
1810 ecode++;
1811 break;
1812
1813 case OP_WORDCHAR:
1814 if (eptr >= md->end_subject)
1815 {
1816 SCHECK_PARTIAL();
1817 RRETURN(MATCH_NOMATCH);
1818 }
1819 GETCHARINCTEST(c, eptr);
1820 if (
1821 #ifdef SUPPORT_UTF8
1822 c >= 256 ||
1823 #endif
1824 (md->ctypes[c] & ctype_word) == 0
1825 )
1826 RRETURN(MATCH_NOMATCH);
1827 ecode++;
1828 break;
1829
1830 case OP_ANYNL:
1831 if (eptr >= md->end_subject)
1832 {
1833 SCHECK_PARTIAL();
1834 RRETURN(MATCH_NOMATCH);
1835 }
1836 GETCHARINCTEST(c, eptr);
1837 switch(c)
1838 {
1839 default: RRETURN(MATCH_NOMATCH);
1840 case 0x000d:
1841 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1842 break;
1843
1844 case 0x000a:
1845 break;
1846
1847 case 0x000b:
1848 case 0x000c:
1849 case 0x0085:
1850 case 0x2028:
1851 case 0x2029:
1852 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1853 break;
1854 }
1855 ecode++;
1856 break;
1857
1858 case OP_NOT_HSPACE:
1859 if (eptr >= md->end_subject)
1860 {
1861 SCHECK_PARTIAL();
1862 RRETURN(MATCH_NOMATCH);
1863 }
1864 GETCHARINCTEST(c, eptr);
1865 switch(c)
1866 {
1867 default: break;
1868 case 0x09: /* HT */
1869 case 0x20: /* SPACE */
1870 case 0xa0: /* NBSP */
1871 case 0x1680: /* OGHAM SPACE MARK */
1872 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1873 case 0x2000: /* EN QUAD */
1874 case 0x2001: /* EM QUAD */
1875 case 0x2002: /* EN SPACE */
1876 case 0x2003: /* EM SPACE */
1877 case 0x2004: /* THREE-PER-EM SPACE */
1878 case 0x2005: /* FOUR-PER-EM SPACE */
1879 case 0x2006: /* SIX-PER-EM SPACE */
1880 case 0x2007: /* FIGURE SPACE */
1881 case 0x2008: /* PUNCTUATION SPACE */
1882 case 0x2009: /* THIN SPACE */
1883 case 0x200A: /* HAIR SPACE */
1884 case 0x202f: /* NARROW NO-BREAK SPACE */
1885 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1886 case 0x3000: /* IDEOGRAPHIC SPACE */
1887 RRETURN(MATCH_NOMATCH);
1888 }
1889 ecode++;
1890 break;
1891
1892 case OP_HSPACE:
1893 if (eptr >= md->end_subject)
1894 {
1895 SCHECK_PARTIAL();
1896 RRETURN(MATCH_NOMATCH);
1897 }
1898 GETCHARINCTEST(c, eptr);
1899 switch(c)
1900 {
1901 default: RRETURN(MATCH_NOMATCH);
1902 case 0x09: /* HT */
1903 case 0x20: /* SPACE */
1904 case 0xa0: /* NBSP */
1905 case 0x1680: /* OGHAM SPACE MARK */
1906 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1907 case 0x2000: /* EN QUAD */
1908 case 0x2001: /* EM QUAD */
1909 case 0x2002: /* EN SPACE */
1910 case 0x2003: /* EM SPACE */
1911 case 0x2004: /* THREE-PER-EM SPACE */
1912 case 0x2005: /* FOUR-PER-EM SPACE */
1913 case 0x2006: /* SIX-PER-EM SPACE */
1914 case 0x2007: /* FIGURE SPACE */
1915 case 0x2008: /* PUNCTUATION SPACE */
1916 case 0x2009: /* THIN SPACE */
1917 case 0x200A: /* HAIR SPACE */
1918 case 0x202f: /* NARROW NO-BREAK SPACE */
1919 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1920 case 0x3000: /* IDEOGRAPHIC SPACE */
1921 break;
1922 }
1923 ecode++;
1924 break;
1925
1926 case OP_NOT_VSPACE:
1927 if (eptr >= md->end_subject)
1928 {
1929 SCHECK_PARTIAL();
1930 RRETURN(MATCH_NOMATCH);
1931 }
1932 GETCHARINCTEST(c, eptr);
1933 switch(c)
1934 {
1935 default: break;
1936 case 0x0a: /* LF */
1937 case 0x0b: /* VT */
1938 case 0x0c: /* FF */
1939 case 0x0d: /* CR */
1940 case 0x85: /* NEL */
1941 case 0x2028: /* LINE SEPARATOR */
1942 case 0x2029: /* PARAGRAPH SEPARATOR */
1943 RRETURN(MATCH_NOMATCH);
1944 }
1945 ecode++;
1946 break;
1947
1948 case OP_VSPACE:
1949 if (eptr >= md->end_subject)
1950 {
1951 SCHECK_PARTIAL();
1952 RRETURN(MATCH_NOMATCH);
1953 }
1954 GETCHARINCTEST(c, eptr);
1955 switch(c)
1956 {
1957 default: RRETURN(MATCH_NOMATCH);
1958 case 0x0a: /* LF */
1959 case 0x0b: /* VT */
1960 case 0x0c: /* FF */
1961 case 0x0d: /* CR */
1962 case 0x85: /* NEL */
1963 case 0x2028: /* LINE SEPARATOR */
1964 case 0x2029: /* PARAGRAPH SEPARATOR */
1965 break;
1966 }
1967 ecode++;
1968 break;
1969
1970 #ifdef SUPPORT_UCP
1971 /* Check the next character by Unicode property. We will get here only
1972 if the support is in the binary; otherwise a compile-time error occurs. */
1973
1974 case OP_PROP:
1975 case OP_NOTPROP:
1976 if (eptr >= md->end_subject)
1977 {
1978 SCHECK_PARTIAL();
1979 RRETURN(MATCH_NOMATCH);
1980 }
1981 GETCHARINCTEST(c, eptr);
1982 {
1983 const ucd_record *prop = GET_UCD(c);
1984
1985 switch(ecode[1])
1986 {
1987 case PT_ANY:
1988 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1989 break;
1990
1991 case PT_LAMP:
1992 if ((prop->chartype == ucp_Lu ||
1993 prop->chartype == ucp_Ll ||
1994 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1995 RRETURN(MATCH_NOMATCH);
1996 break;
1997
1998 case PT_GC:
1999 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2000 RRETURN(MATCH_NOMATCH);
2001 break;
2002
2003 case PT_PC:
2004 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2005 RRETURN(MATCH_NOMATCH);
2006 break;
2007
2008 case PT_SC:
2009 if ((ecode[2] != prop->script) == (op == OP_PROP))
2010 RRETURN(MATCH_NOMATCH);
2011 break;
2012
2013 default:
2014 RRETURN(PCRE_ERROR_INTERNAL);
2015 }
2016
2017 ecode += 3;
2018 }
2019 break;
2020
2021 /* Match an extended Unicode sequence. We will get here only if the support
2022 is in the binary; otherwise a compile-time error occurs. */
2023
2024 case OP_EXTUNI:
2025 if (eptr >= md->end_subject)
2026 {
2027 SCHECK_PARTIAL();
2028 RRETURN(MATCH_NOMATCH);
2029 }
2030 GETCHARINCTEST(c, eptr);
2031 {
2032 int category = UCD_CATEGORY(c);
2033 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
2034 while (eptr < md->end_subject)
2035 {
2036 int len = 1;
2037 if (!utf8) c = *eptr; else
2038 {
2039 GETCHARLEN(c, eptr, len);
2040 }
2041 category = UCD_CATEGORY(c);
2042 if (category != ucp_M) break;
2043 eptr += len;
2044 }
2045 }
2046 ecode++;
2047 break;
2048 #endif
2049
2050
2051 /* Match a back reference, possibly repeatedly. Look past the end of the
2052 item to see if there is repeat information following. The code is similar
2053 to that for character classes, but repeated for efficiency. Then obey
2054 similar code to character type repeats - written out again for speed.
2055 However, if the referenced string is the empty string, always treat
2056 it as matched, any number of times (otherwise there could be infinite
2057 loops). */
2058
2059 case OP_REF:
2060 {
2061 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2062 ecode += 3;
2063
2064 /* If the reference is unset, there are two possibilities:
2065
2066 (a) In the default, Perl-compatible state, set the length to be longer
2067 than the amount of subject left; this ensures that every attempt at a
2068 match fails. We can't just fail here, because of the possibility of
2069 quantifiers with zero minima.
2070
2071 (b) If the JavaScript compatibility flag is set, set the length to zero
2072 so that the back reference matches an empty string.
2073
2074 Otherwise, set the length to the length of what was matched by the
2075 referenced subpattern. */
2076
2077 if (offset >= offset_top || md->offset_vector[offset] < 0)
2078 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
2079 else
2080 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2081
2082 /* Set up for repetition, or handle the non-repeated case */
2083
2084 switch (*ecode)
2085 {
2086 case OP_CRSTAR:
2087 case OP_CRMINSTAR:
2088 case OP_CRPLUS:
2089 case OP_CRMINPLUS:
2090 case OP_CRQUERY:
2091 case OP_CRMINQUERY:
2092 c = *ecode++ - OP_CRSTAR;
2093 minimize = (c & 1) != 0;
2094 min = rep_min[c]; /* Pick up values from tables; */
2095 max = rep_max[c]; /* zero for max => infinity */
2096 if (max == 0) max = INT_MAX;
2097 break;
2098
2099 case OP_CRRANGE:
2100 case OP_CRMINRANGE:
2101 minimize = (*ecode == OP_CRMINRANGE);
2102 min = GET2(ecode, 1);
2103 max = GET2(ecode, 3);
2104 if (max == 0) max = INT_MAX;
2105 ecode += 5;
2106 break;
2107
2108 default: /* No repeat follows */
2109 if (!match_ref(offset, eptr, length, md, ims))
2110 {
2111 CHECK_PARTIAL();
2112 RRETURN(MATCH_NOMATCH);
2113 }
2114 eptr += length;
2115 continue; /* With the main loop */
2116 }
2117
2118 /* If the length of the reference is zero, just continue with the
2119 main loop. */
2120
2121 if (length == 0) continue;
2122
2123 /* First, ensure the minimum number of matches are present. We get back
2124 the length of the reference string explicitly rather than passing the
2125 address of eptr, so that eptr can be a register variable. */
2126
2127 for (i = 1; i <= min; i++)
2128 {
2129 if (!match_ref(offset, eptr, length, md, ims))
2130 {
2131 CHECK_PARTIAL();
2132 RRETURN(MATCH_NOMATCH);
2133 }
2134 eptr += length;
2135 }
2136
2137 /* If min = max, continue at the same level without recursion.
2138 They are not both allowed to be zero. */
2139
2140 if (min == max) continue;
2141
2142 /* If minimizing, keep trying and advancing the pointer */
2143
2144 if (minimize)
2145 {
2146 for (fi = min;; fi++)
2147 {
2148 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2149 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2150 if (fi >= max) RRETURN(MATCH_NOMATCH);
2151 if (!match_ref(offset, eptr, length, md, ims))
2152 {
2153 CHECK_PARTIAL();
2154 RRETURN(MATCH_NOMATCH);
2155 }
2156 eptr += length;
2157 }
2158 /* Control never gets here */
2159 }
2160
2161 /* If maximizing, find the longest string and work backwards */
2162
2163 else
2164 {
2165 pp = eptr;
2166 for (i = min; i < max; i++)
2167 {
2168 if (!match_ref(offset, eptr, length, md, ims))
2169 {
2170 CHECK_PARTIAL();
2171 break;
2172 }
2173 eptr += length;
2174 }
2175 while (eptr >= pp)
2176 {
2177 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2178 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2179 eptr -= length;
2180 }
2181 RRETURN(MATCH_NOMATCH);
2182 }
2183 }
2184 /* Control never gets here */
2185
2186 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2187 used when all the characters in the class have values in the range 0-255,
2188 and either the matching is caseful, or the characters are in the range
2189 0-127 when UTF-8 processing is enabled. The only difference between
2190 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2191 encountered.
2192
2193 First, look past the end of the item to see if there is repeat information
2194 following. Then obey similar code to character type repeats - written out
2195 again for speed. */
2196
2197 case OP_NCLASS:
2198 case OP_CLASS:
2199 {
2200 data = ecode + 1; /* Save for matching */
2201 ecode += 33; /* Advance past the item */
2202
2203 switch (*ecode)
2204 {
2205 case OP_CRSTAR:
2206 case OP_CRMINSTAR:
2207 case OP_CRPLUS:
2208 case OP_CRMINPLUS:
2209 case OP_CRQUERY:
2210 case OP_CRMINQUERY:
2211 c = *ecode++ - OP_CRSTAR;
2212 minimize = (c & 1) != 0;
2213 min = rep_min[c]; /* Pick up values from tables; */
2214 max = rep_max[c]; /* zero for max => infinity */
2215 if (max == 0) max = INT_MAX;
2216 break;
2217
2218 case OP_CRRANGE:
2219 case OP_CRMINRANGE:
2220 minimize = (*ecode == OP_CRMINRANGE);
2221 min = GET2(ecode, 1);
2222 max = GET2(ecode, 3);
2223 if (max == 0) max = INT_MAX;
2224 ecode += 5;
2225 break;
2226
2227 default: /* No repeat follows */
2228 min = max = 1;
2229 break;
2230 }
2231
2232 /* First, ensure the minimum number of matches are present. */
2233
2234 #ifdef SUPPORT_UTF8
2235 /* UTF-8 mode */
2236 if (utf8)
2237 {
2238 for (i = 1; i <= min; i++)
2239 {
2240 if (eptr >= md->end_subject)
2241 {
2242 SCHECK_PARTIAL();
2243 RRETURN(MATCH_NOMATCH);
2244 }
2245 GETCHARINC(c, eptr);
2246 if (c > 255)
2247 {
2248 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2249 }
2250 else
2251 {
2252 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2253 }
2254 }
2255 }
2256 else
2257 #endif
2258 /* Not UTF-8 mode */
2259 {
2260 for (i = 1; i <= min; i++)
2261 {
2262 if (eptr >= md->end_subject)
2263 {
2264 SCHECK_PARTIAL();
2265 RRETURN(MATCH_NOMATCH);
2266 }
2267 c = *eptr++;
2268 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2269 }
2270 }
2271
2272 /* If max == min we can continue with the main loop without the
2273 need to recurse. */
2274
2275 if (min == max) continue;
2276
2277 /* If minimizing, keep testing the rest of the expression and advancing
2278 the pointer while it matches the class. */
2279
2280 if (minimize)
2281 {
2282 #ifdef SUPPORT_UTF8
2283 /* UTF-8 mode */
2284 if (utf8)
2285 {
2286 for (fi = min;; fi++)
2287 {
2288 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2289 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2290 if (fi >= max) RRETURN(MATCH_NOMATCH);
2291 if (eptr >= md->end_subject)
2292 {
2293 SCHECK_PARTIAL();
2294 RRETURN(MATCH_NOMATCH);
2295 }
2296 GETCHARINC(c, eptr);
2297 if (c > 255)
2298 {
2299 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2300 }
2301 else
2302 {
2303 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2304 }
2305 }
2306 }
2307 else
2308 #endif
2309 /* Not UTF-8 mode */
2310 {
2311 for (fi = min;; fi++)
2312 {
2313 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2314 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2315 if (fi >= max) RRETURN(MATCH_NOMATCH);
2316 if (eptr >= md->end_subject)
2317 {
2318 SCHECK_PARTIAL();
2319 RRETURN(MATCH_NOMATCH);
2320 }
2321 c = *eptr++;
2322 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2323 }
2324 }
2325 /* Control never gets here */
2326 }
2327
2328 /* If maximizing, find the longest possible run, then work backwards. */
2329
2330 else
2331 {
2332 pp = eptr;
2333
2334 #ifdef SUPPORT_UTF8
2335 /* UTF-8 mode */
2336 if (utf8)
2337 {
2338 for (i = min; i < max; i++)
2339 {
2340 int len = 1;
2341 if (eptr >= md->end_subject)
2342 {
2343 SCHECK_PARTIAL();
2344 break;
2345 }
2346 GETCHARLEN(c, eptr, len);
2347 if (c > 255)
2348 {
2349 if (op == OP_CLASS) break;
2350 }
2351 else
2352 {
2353 if ((data[c/8] & (1 << (c&7))) == 0) break;
2354 }
2355 eptr += len;
2356 }
2357 for (;;)
2358 {
2359 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2360 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2361 if (eptr-- == pp) break; /* Stop if tried at original pos */
2362 BACKCHAR(eptr);
2363 }
2364 }
2365 else
2366 #endif
2367 /* Not UTF-8 mode */
2368 {
2369 for (i = min; i < max; i++)
2370 {
2371 if (eptr >= md->end_subject)
2372 {
2373 SCHECK_PARTIAL();
2374 break;
2375 }
2376 c = *eptr;
2377 if ((data[c/8] & (1 << (c&7))) == 0) break;
2378 eptr++;
2379 }
2380 while (eptr >= pp)
2381 {
2382 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2383 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2384 eptr--;
2385 }
2386 }
2387
2388 RRETURN(MATCH_NOMATCH);
2389 }
2390 }
2391 /* Control never gets here */
2392
2393
2394 /* Match an extended character class. This opcode is encountered only
2395 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2396 mode, because Unicode properties are supported in non-UTF-8 mode. */
2397
2398 #ifdef SUPPORT_UTF8
2399 case OP_XCLASS:
2400 {
2401 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2402 ecode += GET(ecode, 1); /* Advance past the item */
2403
2404 switch (*ecode)
2405 {
2406 case OP_CRSTAR:
2407 case OP_CRMINSTAR:
2408 case OP_CRPLUS:
2409 case OP_CRMINPLUS:
2410 case OP_CRQUERY:
2411 case OP_CRMINQUERY:
2412 c = *ecode++ - OP_CRSTAR;
2413 minimize = (c & 1) != 0;
2414 min = rep_min[c]; /* Pick up values from tables; */
2415 max = rep_max[c]; /* zero for max => infinity */
2416 if (max == 0) max = INT_MAX;
2417 break;
2418
2419 case OP_CRRANGE:
2420 case OP_CRMINRANGE:
2421 minimize = (*ecode == OP_CRMINRANGE);
2422 min = GET2(ecode, 1);
2423 max = GET2(ecode, 3);
2424 if (max == 0) max = INT_MAX;
2425 ecode += 5;
2426 break;
2427
2428 default: /* No repeat follows */
2429 min = max = 1;
2430 break;
2431 }
2432
2433 /* First, ensure the minimum number of matches are present. */
2434
2435 for (i = 1; i <= min; i++)
2436 {
2437 if (eptr >= md->end_subject)
2438 {
2439 SCHECK_PARTIAL();
2440 RRETURN(MATCH_NOMATCH);
2441 }
2442 GETCHARINCTEST(c, eptr);
2443 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2444 }
2445
2446 /* If max == min we can continue with the main loop without the
2447 need to recurse. */
2448
2449 if (min == max) continue;
2450
2451 /* If minimizing, keep testing the rest of the expression and advancing
2452 the pointer while it matches the class. */
2453
2454 if (minimize)
2455 {
2456 for (fi = min;; fi++)
2457 {
2458 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2459 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2460 if (fi >= max) RRETURN(MATCH_NOMATCH);
2461 if (eptr >= md->end_subject)
2462 {
2463 SCHECK_PARTIAL();
2464 RRETURN(MATCH_NOMATCH);
2465 }
2466 GETCHARINCTEST(c, eptr);
2467 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2468 }
2469 /* Control never gets here */
2470 }
2471
2472 /* If maximizing, find the longest possible run, then work backwards. */
2473
2474 else
2475 {
2476 pp = eptr;
2477 for (i = min; i < max; i++)
2478 {
2479 int len = 1;
2480 if (eptr >= md->end_subject)
2481 {
2482 SCHECK_PARTIAL();
2483 break;
2484 }
2485 GETCHARLENTEST(c, eptr, len);
2486 if (!_pcre_xclass(c, data)) break;
2487 eptr += len;
2488 }
2489 for(;;)
2490 {
2491 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2492 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2493 if (eptr-- == pp) break; /* Stop if tried at original pos */
2494 if (utf8) BACKCHAR(eptr);
2495 }
2496 RRETURN(MATCH_NOMATCH);
2497 }
2498
2499 /* Control never gets here */
2500 }
2501 #endif /* End of XCLASS */
2502
2503 /* Match a single character, casefully */
2504
2505 case OP_CHAR:
2506 #ifdef SUPPORT_UTF8
2507 if (utf8)
2508 {
2509 length = 1;
2510 ecode++;
2511 GETCHARLEN(fc, ecode, length);
2512 if (length > md->end_subject - eptr)
2513 {
2514 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2515 RRETURN(MATCH_NOMATCH);
2516 }
2517 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2518 }
2519 else
2520 #endif
2521
2522 /* Non-UTF-8 mode */
2523 {
2524 if (md->end_subject - eptr < 1)
2525 {
2526 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2527 RRETURN(MATCH_NOMATCH);
2528 }
2529 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2530 ecode += 2;
2531 }
2532 break;
2533
2534 /* Match a single character, caselessly */
2535
2536 case OP_CHARNC:
2537 #ifdef SUPPORT_UTF8
2538 if (utf8)
2539 {
2540 length = 1;
2541 ecode++;
2542 GETCHARLEN(fc, ecode, length);
2543
2544 if (length > md->end_subject - eptr)
2545 {
2546 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2547 RRETURN(MATCH_NOMATCH);
2548 }
2549
2550 /* If the pattern character's value is < 128, we have only one byte, and
2551 can use the fast lookup table. */
2552
2553 if (fc < 128)
2554 {
2555 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2556 }
2557
2558 /* Otherwise we must pick up the subject character */
2559
2560 else
2561 {
2562 unsigned int dc;
2563 GETCHARINC(dc, eptr);
2564 ecode += length;
2565
2566 /* If we have Unicode property support, we can use it to test the other
2567 case of the character, if there is one. */
2568
2569 if (fc != dc)
2570 {
2571 #ifdef SUPPORT_UCP
2572 if (dc != UCD_OTHERCASE(fc))
2573 #endif
2574 RRETURN(MATCH_NOMATCH);
2575 }
2576 }
2577 }
2578 else
2579 #endif /* SUPPORT_UTF8 */
2580
2581 /* Non-UTF-8 mode */
2582 {
2583 if (md->end_subject - eptr < 1)
2584 {
2585 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2586 RRETURN(MATCH_NOMATCH);
2587 }
2588 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2589 ecode += 2;
2590 }
2591 break;
2592
2593 /* Match a single character repeatedly. */
2594
2595 case OP_EXACT:
2596 min = max = GET2(ecode, 1);
2597 ecode += 3;
2598 goto REPEATCHAR;
2599
2600 case OP_POSUPTO:
2601 possessive = TRUE;
2602 /* Fall through */
2603
2604 case OP_UPTO:
2605 case OP_MINUPTO:
2606 min = 0;
2607 max = GET2(ecode, 1);
2608 minimize = *ecode == OP_MINUPTO;
2609 ecode += 3;
2610 goto REPEATCHAR;
2611
2612 case OP_POSSTAR:
2613 possessive = TRUE;
2614 min = 0;
2615 max = INT_MAX;
2616 ecode++;
2617 goto REPEATCHAR;
2618
2619 case OP_POSPLUS:
2620 possessive = TRUE;
2621 min = 1;
2622 max = INT_MAX;
2623 ecode++;
2624 goto REPEATCHAR;
2625
2626 case OP_POSQUERY:
2627 possessive = TRUE;
2628 min = 0;
2629 max = 1;
2630 ecode++;
2631 goto REPEATCHAR;
2632
2633 case OP_STAR:
2634 case OP_MINSTAR:
2635 case OP_PLUS:
2636 case OP_MINPLUS:
2637 case OP_QUERY:
2638 case OP_MINQUERY:
2639 c = *ecode++ - OP_STAR;
2640 minimize = (c & 1) != 0;
2641
2642 min = rep_min[c]; /* Pick up values from tables; */
2643 max = rep_max[c]; /* zero for max => infinity */
2644 if (max == 0) max = INT_MAX;
2645
2646 /* Common code for all repeated single-character matches. */
2647
2648 REPEATCHAR:
2649 #ifdef SUPPORT_UTF8
2650 if (utf8)
2651 {
2652 length = 1;
2653 charptr = ecode;
2654 GETCHARLEN(fc, ecode, length);
2655 ecode += length;
2656
2657 /* Handle multibyte character matching specially here. There is
2658 support for caseless matching if UCP support is present. */
2659
2660 if (length > 1)
2661 {
2662 #ifdef SUPPORT_UCP
2663 unsigned int othercase;
2664 if ((ims & PCRE_CASELESS) != 0 &&
2665 (othercase = UCD_OTHERCASE(fc)) != fc)
2666 oclength = _pcre_ord2utf8(othercase, occhars);
2667 else oclength = 0;
2668 #endif /* SUPPORT_UCP */
2669
2670 for (i = 1; i <= min; i++)
2671 {
2672 if (eptr <= md->end_subject - length &&
2673 memcmp(eptr, charptr, length) == 0) eptr += length;
2674 #ifdef SUPPORT_UCP
2675 else if (oclength > 0 &&
2676 eptr <= md->end_subject - oclength &&
2677 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2678 #endif /* SUPPORT_UCP */
2679 else
2680 {
2681 CHECK_PARTIAL();
2682 RRETURN(MATCH_NOMATCH);
2683 }
2684 }
2685
2686 if (min == max) continue;
2687
2688 if (minimize)
2689 {
2690 for (fi = min;; fi++)
2691 {
2692 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2693 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2694 if (fi >= max) RRETURN(MATCH_NOMATCH);
2695 if (eptr <= md->end_subject - length &&
2696 memcmp(eptr, charptr, length) == 0) eptr += length;
2697 #ifdef SUPPORT_UCP
2698 else if (oclength > 0 &&
2699 eptr <= md->end_subject - oclength &&
2700 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2701 #endif /* SUPPORT_UCP */
2702 else
2703 {
2704 CHECK_PARTIAL();
2705 RRETURN(MATCH_NOMATCH);
2706 }
2707 }
2708 /* Control never gets here */
2709 }
2710
2711 else /* Maximize */
2712 {
2713 pp = eptr;
2714 for (i = min; i < max; i++)
2715 {
2716 if (eptr <= md->end_subject - length &&
2717 memcmp(eptr, charptr, length) == 0) eptr += length;
2718 #ifdef SUPPORT_UCP
2719 else if (oclength > 0 &&
2720 eptr <= md->end_subject - oclength &&
2721 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2722 #endif /* SUPPORT_UCP */
2723 else
2724 {
2725 CHECK_PARTIAL();
2726 break;
2727 }
2728 }
2729
2730 if (possessive) continue;
2731
2732 for(;;)
2733 {
2734 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2735 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2736 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
2737 #ifdef SUPPORT_UCP
2738 eptr--;
2739 BACKCHAR(eptr);
2740 #else /* without SUPPORT_UCP */
2741 eptr -= length;
2742 #endif /* SUPPORT_UCP */
2743 }
2744 }
2745 /* Control never gets here */
2746 }
2747
2748 /* If the length of a UTF-8 character is 1, we fall through here, and
2749 obey the code as for non-UTF-8 characters below, though in this case the
2750 value of fc will always be < 128. */
2751 }
2752 else
2753 #endif /* SUPPORT_UTF8 */
2754
2755 /* When not in UTF-8 mode, load a single-byte character. */
2756
2757 fc = *ecode++;
2758
2759 /* The value of fc at this point is always less than 256, though we may or
2760 may not be in UTF-8 mode. The code is duplicated for the caseless and
2761 caseful cases, for speed, since matching characters is likely to be quite
2762 common. First, ensure the minimum number of matches are present. If min =
2763 max, continue at the same level without recursing. Otherwise, if
2764 minimizing, keep trying the rest of the expression and advancing one
2765 matching character if failing, up to the maximum. Alternatively, if
2766 maximizing, find the maximum number of characters and work backwards. */
2767
2768 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2769 max, eptr));
2770
2771 if ((ims & PCRE_CASELESS) != 0)
2772 {
2773 fc = md->lcc[fc];
2774 for (i = 1; i <= min; i++)
2775 {
2776 if (eptr >= md->end_subject)
2777 {
2778 SCHECK_PARTIAL();
2779 RRETURN(MATCH_NOMATCH);
2780 }
2781 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2782 }
2783 if (min == max) continue;
2784 if (minimize)
2785 {
2786 for (fi = min;; fi++)
2787 {
2788 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2789 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2790 if (fi >= max) RRETURN(MATCH_NOMATCH);
2791 if (eptr >= md->end_subject)
2792 {
2793 SCHECK_PARTIAL();
2794 RRETURN(MATCH_NOMATCH);
2795 }
2796 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2797 }
2798 /* Control never gets here */
2799 }
2800 else /* Maximize */
2801 {
2802 pp = eptr;
2803 for (i = min; i < max; i++)
2804 {
2805 if (eptr >= md->end_subject)
2806 {
2807 SCHECK_PARTIAL();
2808 break;
2809 }
2810 if (fc != md->lcc[*eptr]) break;
2811 eptr++;
2812 }
2813
2814 if (possessive) continue;
2815
2816 while (eptr >= pp)
2817 {
2818 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2819 eptr--;
2820 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2821 }
2822 RRETURN(MATCH_NOMATCH);
2823 }
2824 /* Control never gets here */
2825 }
2826
2827 /* Caseful comparisons (single-byte only; multi-byte characters were handled above) */
2828
2829 else
2830 {
2831 for (i = 1; i <= min; i++)
2832 {
2833 if (eptr >= md->end_subject)
2834 {
2835 SCHECK_PARTIAL();
2836 RRETURN(MATCH_NOMATCH);
2837 }
2838 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2839 }
2840
2841 if (min == max) continue;
2842
2843 if (minimize)
2844 {
2845 for (fi = min;; fi++)
2846 {
2847 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2848 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2849 if (fi >= max) RRETURN(MATCH_NOMATCH);
2850 if (eptr >= md->end_subject)
2851 {
2852 SCHECK_PARTIAL();
2853 RRETURN(MATCH_NOMATCH);
2854 }
2855 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2856 }
2857 /* Control never gets here */
2858 }
2859 else /* Maximize */
2860 {
2861 pp = eptr;
2862 for (i = min; i < max; i++)
2863 {
2864 if (eptr >= md->end_subject)
2865 {
2866 SCHECK_PARTIAL();
2867 break;
2868 }
2869 if (fc != *eptr) break;
2870 eptr++;
2871 }
2872 if (possessive) continue;
2873
2874 while (eptr >= pp)
2875 {
2876 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2877 eptr--;
2878 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2879 }
2880 RRETURN(MATCH_NOMATCH);
2881 }
2882 }
2883 /* Control never gets here */
2884
2885 /* Match a negated single one-byte character. The character we are
2886 checking can be multibyte. */
2887
2888 case OP_NOT:
2889 if (eptr >= md->end_subject)
2890 {
2891 SCHECK_PARTIAL();
2892 RRETURN(MATCH_NOMATCH);
2893 }
2894 ecode++;
2895 GETCHARINCTEST(c, eptr);
2896 if ((ims & PCRE_CASELESS) != 0)
2897 {
2898 #ifdef SUPPORT_UTF8
2899 if (c < 256)
2900 #endif
2901 c = md->lcc[c];
2902 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2903 }
2904 else
2905 {
2906 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2907 }
2908 break;
2909
2910 /* Match a negated single one-byte character repeatedly. This is almost a
2911 repeat of the code for a repeated single character, but I haven't found a
2912 nice way of commoning these up that doesn't require a test of the
2913 positive/negative option for each character match. Maybe that wouldn't add
2914 very much to the time taken, but character matching *is* what this is all
2915 about... */
2916
2917 case OP_NOTEXACT:
2918 min = max = GET2(ecode, 1);
2919 ecode += 3;
2920 goto REPEATNOTCHAR;
2921
2922 case OP_NOTUPTO:
2923 case OP_NOTMINUPTO:
2924 min = 0;
2925 max = GET2(ecode, 1);
2926 minimize = *ecode == OP_NOTMINUPTO;
2927 ecode += 3;
2928 goto REPEATNOTCHAR;
2929
2930 case OP_NOTPOSSTAR:
2931 possessive = TRUE;
2932 min = 0;
2933 max = INT_MAX;
2934 ecode++;
2935 goto REPEATNOTCHAR;
2936
2937 case OP_NOTPOSPLUS:
2938 possessive = TRUE;
2939 min = 1;
2940 max = INT_MAX;
2941 ecode++;
2942 goto REPEATNOTCHAR;
2943
2944 case OP_NOTPOSQUERY:
2945 possessive = TRUE;
2946 min = 0;
2947 max = 1;
2948 ecode++;
2949 goto REPEATNOTCHAR;
2950
2951 case OP_NOTPOSUPTO:
2952 possessive = TRUE;
2953 min = 0;
2954 max = GET2(ecode, 1);
2955 ecode += 3;
2956 goto REPEATNOTCHAR;
2957
2958 case OP_NOTSTAR:
2959 case OP_NOTMINSTAR:
2960 case OP_NOTPLUS:
2961 case OP_NOTMINPLUS:
2962 case OP_NOTQUERY:
2963 case OP_NOTMINQUERY:
2964 c = *ecode++ - OP_NOTSTAR;
2965 minimize = (c & 1) != 0;
2966 min = rep_min[c]; /* Pick up values from tables; */
2967 max = rep_max[c]; /* zero for max => infinity */
2968 if (max == 0) max = INT_MAX;
2969
2970 /* Common code for all repeated single-byte matches. */
2971
2972 REPEATNOTCHAR:
2973 fc = *ecode++;
2974
2975 /* The code is duplicated for the caseless and caseful cases, for speed,
2976 since matching characters is likely to be quite common. First, ensure the
2977 minimum number of matches are present. If min = max, continue at the same
2978 level without recursing. Otherwise, if minimizing, keep trying the rest of
2979 the expression and advancing one matching character if failing, up to the
2980 maximum. Alternatively, if maximizing, find the maximum number of
2981 characters and work backwards. */
2982
2983 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2984 max, eptr));
2985
2986 if ((ims & PCRE_CASELESS) != 0)
2987 {
2988 fc = md->lcc[fc];
2989
2990 #ifdef SUPPORT_UTF8
2991 /* UTF-8 mode */
2992 if (utf8)
2993 {
2994 register unsigned int d;
2995 for (i = 1; i <= min; i++)
2996 {
2997 if (eptr >= md->end_subject)
2998 {
2999 SCHECK_PARTIAL();
3000 RRETURN(MATCH_NOMATCH);
3001 }
3002 GETCHARINC(d, eptr);
3003 if (d < 256) d = md->lcc[d];
3004 if (fc == d) RRETURN(MATCH_NOMATCH);
3005 }
3006 }
3007 else
3008 #endif
3009
3010 /* Not UTF-8 mode */
3011 {
3012 for (i = 1; i <= min; i++)
3013 {
3014 if (eptr >= md->end_subject)
3015 {
3016 SCHECK_PARTIAL();
3017 RRETURN(MATCH_NOMATCH);
3018 }
3019 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3020 }
3021 }
3022
3023 if (min == max) continue;
3024
3025 if (minimize)
3026 {
3027 #ifdef SUPPORT_UTF8
3028 /* UTF-8 mode */
3029 if (utf8)
3030 {
3031 register unsigned int d;
3032 for (fi = min;; fi++)
3033 {
3034 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3035 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3036 if (fi >= max) RRETURN(MATCH_NOMATCH);
3037 if (eptr >= md->end_subject)
3038 {
3039 SCHECK_PARTIAL();
3040 RRETURN(MATCH_NOMATCH);
3041 }
3042 GETCHARINC(d, eptr);
3043 if (d < 256) d = md->lcc[d];
3044 if (fc == d) RRETURN(MATCH_NOMATCH);
3045 }
3046 }
3047 else
3048 #endif
3049 /* Not UTF-8 mode */
3050 {
3051 for (fi = min;; fi++)
3052 {
3053 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3054 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3055 if (fi >= max) RRETURN(MATCH_NOMATCH);
3056 if (eptr >= md->end_subject)
3057 {
3058 SCHECK_PARTIAL();
3059 RRETURN(MATCH_NOMATCH);
3060 }
3061 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3062 }
3063 }
3064 /* Control never gets here */
3065 }
3066
3067 /* Maximize case */
3068
3069 else
3070 {
3071 pp = eptr;
3072
3073 #ifdef SUPPORT_UTF8
3074 /* UTF-8 mode */
3075 if (utf8)
3076 {
3077 register unsigned int d;
3078 for (i = min; i < max; i++)
3079 {
3080 int len = 1;
3081 if (eptr >= md->end_subject)
3082 {
3083 SCHECK_PARTIAL();
3084 break;
3085 }
3086 GETCHARLEN(d, eptr, len);
3087 if (d < 256) d = md->lcc[d];
3088 if (fc == d) break;
3089 eptr += len;
3090 }
3091 if (possessive) continue;
3092 for(;;)
3093 {
3094 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3095 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3096 if (eptr-- == pp) break; /* Stop if tried at original pos */
3097 BACKCHAR(eptr);
3098 }
3099 }
3100 else
3101 #endif
3102 /* Not UTF-8 mode */
3103 {
3104 for (i = min; i < max; i++)
3105 {
3106 if (eptr >= md->end_subject)
3107 {
3108 SCHECK_PARTIAL();
3109 break;
3110 }
3111 if (fc == md->lcc[*eptr]) break;
3112 eptr++;
3113 }
3114 if (possessive) continue;
3115 while (eptr >= pp)
3116 {
3117 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3118 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3119 eptr--;
3120 }
3121 }
3122
3123 RRETURN(MATCH_NOMATCH);
3124 }
3125 /* Control never gets here */
3126 }
3127
3128 /* Caseful comparisons */
3129
3130 else
3131 {
3132 #ifdef SUPPORT_UTF8
3133 /* UTF-8 mode */
3134 if (utf8)
3135 {
3136 register unsigned int d;
3137 for (i = 1; i <= min; i++)
3138 {
3139 if (eptr >= md->end_subject)
3140 {
3141 SCHECK_PARTIAL();
3142 RRETURN(MATCH_NOMATCH);
3143 }
3144 GETCHARINC(d, eptr);
3145 if (fc == d) RRETURN(MATCH_NOMATCH);
3146 }
3147 }
3148 else
3149 #endif
3150 /* Not UTF-8 mode */
3151 {
3152 for (i = 1; i <= min; i++)
3153 {
3154 if (eptr >= md->end_subject)
3155 {
3156 SCHECK_PARTIAL();
3157 RRETURN(MATCH_NOMATCH);
3158 }
3159 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3160 }
3161 }
3162
3163 if (min == max) continue;
3164
3165 if (minimize)
3166 {
3167 #ifdef SUPPORT_UTF8
3168 /* UTF-8 mode */
3169 if (utf8)
3170 {
3171 register unsigned int d;
3172 for (fi = min;; fi++)
3173 {
3174 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3175 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3176 if (fi >= max) RRETURN(MATCH_NOMATCH);
3177 if (eptr >= md->end_subject)
3178 {
3179 SCHECK_PARTIAL();
3180 RRETURN(MATCH_NOMATCH);
3181 }
3182 GETCHARINC(d, eptr);
3183 if (fc == d) RRETURN(MATCH_NOMATCH);
3184 }
3185 }
3186 else
3187 #endif
3188 /* Not UTF-8 mode */
3189 {
3190 for (fi = min;; fi++)
3191 {
3192 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3193 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3194 if (fi >= max) RRETURN(MATCH_NOMATCH);
3195 if (eptr >= md->end_subject)
3196 {
3197 SCHECK_PARTIAL();
3198 RRETURN(MATCH_NOMATCH);
3199 }
3200 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3201 }
3202 }
3203 /* Control never gets here */
3204 }
3205
3206 /* Maximize case */
3207
3208 else
3209 {
3210 pp = eptr;
3211
3212 #ifdef SUPPORT_UTF8
3213 /* UTF-8 mode */
3214 if (utf8)
3215 {
3216 register unsigned int d;
3217 for (i = min; i < max; i++)
3218 {
3219 int len = 1;
3220 if (eptr >= md->end_subject)
3221 {
3222 SCHECK_PARTIAL();
3223 break;
3224 }
3225 GETCHARLEN(d, eptr, len);
3226 if (fc == d) break;
3227 eptr += len;
3228 }
3229 if (possessive) continue;
3230 for(;;)
3231 {
3232 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3233 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3234 if (eptr-- == pp) break; /* Stop if tried at original pos */
3235 BACKCHAR(eptr);
3236 }
3237 }
3238 else
3239 #endif
3240 /* Not UTF-8 mode */
3241 {
3242 for (i = min; i < max; i++)
3243 {
3244 if (eptr >= md->end_subject)
3245 {
3246 SCHECK_PARTIAL();
3247 break;
3248 }
3249 if (fc == *eptr) break;
3250 eptr++;
3251 }
3252 if (possessive) continue;
3253 while (eptr >= pp)
3254 {
3255 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3256 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3257 eptr--;
3258 }
3259 }
3260
3261 RRETURN(MATCH_NOMATCH);
3262 }
3263 }
3264 /* Control never gets here */
3265
3266 /* Match a single character type repeatedly; several different opcodes
3267 share code. This is very similar to the code for single characters, but we
3268 repeat it in the interests of efficiency. */
3269
3270 case OP_TYPEEXACT:
3271 min = max = GET2(ecode, 1);
3272 minimize = TRUE;
3273 ecode += 3;
3274 goto REPEATTYPE;
3275
3276 case OP_TYPEUPTO:
3277 case OP_TYPEMINUPTO:
3278 min = 0;
3279 max = GET2(ecode, 1);
3280 minimize = *ecode == OP_TYPEMINUPTO;
3281 ecode += 3;
3282 goto REPEATTYPE;
3283
3284 case OP_TYPEPOSSTAR:
3285 possessive = TRUE;
3286 min = 0;
3287 max = INT_MAX;
3288 ecode++;
3289 goto REPEATTYPE;
3290
3291 case OP_TYPEPOSPLUS:
3292 possessive = TRUE;
3293 min = 1;
3294 max = INT_MAX;
3295 ecode++;
3296 goto REPEATTYPE;
3297
3298 case OP_TYPEPOSQUERY:
3299 possessive = TRUE;
3300 min = 0;
3301 max = 1;
3302 ecode++;
3303 goto REPEATTYPE;
3304
3305 case OP_TYPEPOSUPTO:
3306 possessive = TRUE;
3307 min = 0;
3308 max = GET2(ecode, 1);
3309 ecode += 3;
3310 goto REPEATTYPE;
3311
3312 case OP_TYPESTAR:
3313 case OP_TYPEMINSTAR:
3314 case OP_TYPEPLUS:
3315 case OP_TYPEMINPLUS:
3316 case OP_TYPEQUERY:
3317 case OP_TYPEMINQUERY:
3318 c = *ecode++ - OP_TYPESTAR;
3319 minimize = (c & 1) != 0;
3320 min = rep_min[c]; /* Pick up values from tables; */
3321 max = rep_max[c]; /* zero for max => infinity */
3322 if (max == 0) max = INT_MAX;
3323
3324 /* Common code for all repeated single character type matches. Note that
3325 in UTF-8 mode, '.' matches a character of any length, but for the other
3326 character types, the valid characters are all one-byte long. */
3327
3328 REPEATTYPE:
3329 ctype = *ecode++; /* Code for the character type */
3330
3331 #ifdef SUPPORT_UCP
3332 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3333 {
3334 prop_fail_result = ctype == OP_NOTPROP;
3335 prop_type = *ecode++;
3336 prop_value = *ecode++;
3337 }
3338 else prop_type = -1;
3339 #endif
3340
3341 /* First, ensure the minimum number of matches are present. Use inline
3342 code for maximizing the speed, and do the type test once at the start
3343 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3344 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3345 and single-bytes. */
3346
3347 if (min > 0)
3348 {
3349 #ifdef SUPPORT_UCP
3350 if (prop_type >= 0)
3351 {
3352 switch(prop_type)
3353 {
3354 case PT_ANY:
3355 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3356 for (i = 1; i <= min; i++)
3357 {
3358 if (eptr >= md->end_subject)
3359 {
3360 SCHECK_PARTIAL();
3361 RRETURN(MATCH_NOMATCH);
3362 }
3363 GETCHARINCTEST(c, eptr);
3364 }
3365 break;
3366
3367 case PT_LAMP:
3368 for (i = 1; i <= min; i++)
3369 {
3370 if (eptr >= md->end_subject)
3371 {
3372 SCHECK_PARTIAL();
3373 RRETURN(MATCH_NOMATCH);
3374 }
3375 GETCHARINCTEST(c, eptr);
3376 prop_chartype = UCD_CHARTYPE(c);
3377 if ((prop_chartype == ucp_Lu ||
3378 prop_chartype == ucp_Ll ||
3379 prop_chartype == ucp_Lt) == prop_fail_result)
3380 RRETURN(MATCH_NOMATCH);
3381 }
3382 break;
3383
3384 case PT_GC:
3385 for (i = 1; i <= min; i++)
3386 {
3387 if (eptr >= md->end_subject)
3388 {
3389 SCHECK_PARTIAL();
3390 RRETURN(MATCH_NOMATCH);
3391 }
3392 GETCHARINCTEST(c, eptr);
3393 prop_category = UCD_CATEGORY(c);
3394 if ((prop_category == prop_value) == prop_fail_result)
3395 RRETURN(MATCH_NOMATCH);
3396 }
3397 break;
3398
3399 case PT_PC:
3400 for (i = 1; i <= min; i++)
3401 {
3402 if (eptr >= md->end_subject)
3403 {
3404 SCHECK_PARTIAL();
3405 RRETURN(MATCH_NOMATCH);
3406 }
3407 GETCHARINCTEST(c, eptr);
3408 prop_chartype = UCD_CHARTYPE(c);
3409 if ((prop_chartype == prop_value) == prop_fail_result)
3410 RRETURN(MATCH_NOMATCH);
3411 }
3412 break;
3413
3414 case PT_SC:
3415 for (i = 1; i <= min; i++)
3416 {
3417 if (eptr >= md->end_subject)
3418 {
3419 SCHECK_PARTIAL();
3420 RRETURN(MATCH_NOMATCH);
3421 }
3422 GETCHARINCTEST(c, eptr);
3423 prop_script = UCD_SCRIPT(c);
3424 if ((prop_script == prop_value) == prop_fail_result)
3425 RRETURN(MATCH_NOMATCH);
3426 }
3427 break;
3428
3429 default:
3430 RRETURN(PCRE_ERROR_INTERNAL);
3431 }
3432 }
3433
3434 /* Match extended Unicode sequences. We will get here only if the
3435 support is in the binary; otherwise a compile-time error occurs. */
3436
3437 else if (ctype == OP_EXTUNI)
3438 {
3439 for (i = 1; i <= min; i++)
3440 {
3441 if (eptr >= md->end_subject)
3442 {
3443 SCHECK_PARTIAL();
3444 RRETURN(MATCH_NOMATCH);
3445 }
3446 GETCHARINCTEST(c, eptr);
3447 prop_category = UCD_CATEGORY(c);
3448 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3449 while (eptr < md->end_subject)
3450 {
3451 int len = 1;
3452 if (!utf8) c = *eptr;
3453 else { GETCHARLEN(c, eptr, len); }
3454 prop_category = UCD_CATEGORY(c);
3455 if (prop_category != ucp_M) break;
3456 eptr += len;
3457 }
3458 }
3459 }
3460
3461 else
3462 #endif /* SUPPORT_UCP */
3463
3464 /* Handle all other cases when the coding is UTF-8 */
3465
3466 #ifdef SUPPORT_UTF8
3467 if (utf8) switch(ctype)
3468 {
3469 case OP_ANY:
3470 for (i = 1; i <= min; i++)
3471 {
3472 if (eptr >= md->end_subject)
3473 {
3474 SCHECK_PARTIAL();
3475 RRETURN(MATCH_NOMATCH);
3476 }
3477 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3478 eptr++;
3479 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3480 }
3481 break;
3482
3483 case OP_ALLANY:
3484 for (i = 1; i <= min; i++)
3485 {
3486 if (eptr >= md->end_subject)
3487 {
3488 SCHECK_PARTIAL();
3489 RRETURN(MATCH_NOMATCH);
3490 }
3491 eptr++;
3492 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3493 }
3494 break;
3495
3496 case OP_ANYBYTE:
3497 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3498 eptr += min;
3499 break;
3500
3501 case OP_ANYNL:
3502 for (i = 1; i <= min; i++)
3503 {
3504 if (eptr >= md->end_subject)
3505 {
3506 SCHECK_PARTIAL();
3507 RRETURN(MATCH_NOMATCH);
3508 }
3509 GETCHARINC(c, eptr);
3510 switch(c)
3511 {
3512 default: RRETURN(MATCH_NOMATCH);
3513 case 0x000d:
3514 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3515 break;
3516
3517 case 0x000a:
3518 break;
3519
3520 case 0x000b:
3521 case 0x000c:
3522 case 0x0085:
3523 case 0x2028:
3524 case 0x2029:
3525 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3526 break;
3527 }
3528 }
3529 break;
3530
3531 case OP_NOT_HSPACE:
3532 for (i = 1; i <= min; i++)
3533 {
3534 if (eptr >= md->end_subject)
3535 {
3536 SCHECK_PARTIAL();
3537 RRETURN(MATCH_NOMATCH);
3538 }
3539 GETCHARINC(c, eptr);
3540 switch(c)
3541 {
3542 default: break;
3543 case 0x09: /* HT */
3544 case 0x20: /* SPACE */
3545 case 0xa0: /* NBSP */
3546 case 0x1680: /* OGHAM SPACE MARK */
3547 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3548 case 0x2000: /* EN QUAD */
3549 case 0x2001: /* EM QUAD */
3550 case 0x2002: /* EN SPACE */
3551 case 0x2003: /* EM SPACE */
3552 case 0x2004: /* THREE-PER-EM SPACE */
3553 case 0x2005: /* FOUR-PER-EM SPACE */
3554 case 0x2006: /* SIX-PER-EM SPACE */
3555 case 0x2007: /* FIGURE SPACE */
3556 case 0x2008: /* PUNCTUATION SPACE */
3557 case 0x2009: /* THIN SPACE */
3558 case 0x200A: /* HAIR SPACE */
3559 case 0x202f: /* NARROW NO-BREAK SPACE */
3560 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3561 case 0x3000: /* IDEOGRAPHIC SPACE */
3562 RRETURN(MATCH_NOMATCH);
3563 }
3564 }
3565 break;
3566
3567 case OP_HSPACE:
3568 for (i = 1; i <= min; i++)
3569 {
3570 if (eptr >= md->end_subject)
3571 {
3572 SCHECK_PARTIAL();
3573 RRETURN(MATCH_NOMATCH);
3574 }
3575 GETCHARINC(c, eptr);
3576 switch(c)
3577 {
3578 default: RRETURN(MATCH_NOMATCH);
3579 case 0x09: /* HT */
3580 case 0x20: /* SPACE */
3581 case 0xa0: /* NBSP */
3582 case 0x1680: /* OGHAM SPACE MARK */
3583 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3584 case 0x2000: /* EN QUAD */
3585 case 0x2001: /* EM QUAD */
3586 case 0x2002: /* EN SPACE */
3587 case 0x2003: /* EM SPACE */
3588 case 0x2004: /* THREE-PER-EM SPACE */
3589 case 0x2005: /* FOUR-PER-EM SPACE */
3590 case 0x2006: /* SIX-PER-EM SPACE */
3591 case 0x2007: /* FIGURE SPACE */
3592 case 0x2008: /* PUNCTUATION SPACE */
3593 case 0x2009: /* THIN SPACE */
3594 case 0x200A: /* HAIR SPACE */
3595 case 0x202f: /* NARROW NO-BREAK SPACE */
3596 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3597 case 0x3000: /* IDEOGRAPHIC SPACE */
3598 break;
3599 }
3600 }
3601 break;
3602
3603 case OP_NOT_VSPACE:
3604 for (i = 1; i <= min; i++)
3605 {
3606 if (eptr >= md->end_subject)
3607 {
3608 SCHECK_PARTIAL();
3609 RRETURN(MATCH_NOMATCH);
3610 }
3611 GETCHARINC(c, eptr);
3612 switch(c)
3613 {
3614 default: break;
3615 case 0x0a: /* LF */
3616 case 0x0b: /* VT */
3617 case 0x0c: /* FF */
3618 case 0x0d: /* CR */
3619 case 0x85: /* NEL */
3620 case 0x2028: /* LINE SEPARATOR */
3621 case 0x2029: /* PARAGRAPH SEPARATOR */
3622 RRETURN(MATCH_NOMATCH);
3623 }
3624 }
3625 break;
3626
3627 case OP_VSPACE:
3628 for (i = 1; i <= min; i++)
3629 {
3630 if (eptr >= md->end_subject)
3631 {
3632 SCHECK_PARTIAL();
3633 RRETURN(MATCH_NOMATCH);
3634 }
3635 GETCHARINC(c, eptr);
3636 switch(c)
3637 {
3638 default: RRETURN(MATCH_NOMATCH);
3639 case 0x0a: /* LF */
3640 case 0x0b: /* VT */
3641 case 0x0c: /* FF */
3642 case 0x0d: /* CR */
3643 case 0x85: /* NEL */
3644 case 0x2028: /* LINE SEPARATOR */
3645 case 0x2029: /* PARAGRAPH SEPARATOR */
3646 break;
3647 }
3648 }
3649 break;
3650
3651 case OP_NOT_DIGIT:
3652 for (i = 1; i <= min; i++)
3653 {
3654 if (eptr >= md->end_subject)
3655 {
3656 SCHECK_PARTIAL();
3657 RRETURN(MATCH_NOMATCH);
3658 }
3659 GETCHARINC(c, eptr);
3660 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3661 RRETURN(MATCH_NOMATCH);
3662 }
3663 break;
3664
3665 case OP_DIGIT:
3666 for (i = 1; i <= min; i++)
3667 {
3668 if (eptr >= md->end_subject)
3669 {
3670 SCHECK_PARTIAL();
3671 RRETURN(MATCH_NOMATCH);
3672 }
3673 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3674 RRETURN(MATCH_NOMATCH);
3675 /* No need to skip more bytes - we know it's a 1-byte character */
3676 }
3677 break;
3678
3679 case OP_NOT_WHITESPACE:
3680 for (i = 1; i <= min; i++)
3681 {
3682 if (eptr >= md->end_subject)
3683 {
3684 SCHECK_PARTIAL();
3685 RRETURN(MATCH_NOMATCH);
3686 }
3687 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3688 RRETURN(MATCH_NOMATCH);
3689 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3690 }
3691 break;
3692
3693 case OP_WHITESPACE:
3694 for (i = 1; i <= min; i++)
3695 {
3696 if (eptr >= md->end_subject)
3697 {
3698 SCHECK_PARTIAL();
3699 RRETURN(MATCH_NOMATCH);
3700 }
3701 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3702 RRETURN(MATCH_NOMATCH);
3703 /* No need to skip more bytes - we know it's a 1-byte character */
3704 }
3705 break;
3706
3707 case OP_NOT_WORDCHAR:
3708 for (i = 1; i <= min; i++)
3709 {
3710 if (eptr >= md->end_subject)
3711 {
3712 SCHECK_PARTIAL();
3713 RRETURN(MATCH_NOMATCH);
3714 }
3715 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3716 RRETURN(MATCH_NOMATCH);
3717 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3718 }
3719 break;
3720
3721 case OP_WORDCHAR:
3722 for (i = 1; i <= min; i++)
3723 {
3724 if (eptr >= md->end_subject)
3725 {
3726 SCHECK_PARTIAL();
3727 RRETURN(MATCH_NOMATCH);
3728 }
3729 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3730 RRETURN(MATCH_NOMATCH);
3731 /* No need to skip more bytes - we know it's a 1-byte character */
3732 }
3733 break;
3734
3735 default:
3736 RRETURN(PCRE_ERROR_INTERNAL);
3737 } /* End switch(ctype) */
3738
3739 else
3740 #endif /* SUPPORT_UTF8 */
3741
3742 /* Code for the non-UTF-8 case for minimum matching of operators other
3743 than OP_PROP and OP_NOTPROP. */
3744
3745 switch(ctype)
3746 {
3747 case OP_ANY:
3748 for (i = 1; i <= min; i++)
3749 {
3750 if (eptr >= md->end_subject)
3751 {
3752 SCHECK_PARTIAL();
3753 RRETURN(MATCH_NOMATCH);
3754 }
3755 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3756 eptr++;
3757 }
3758 break;
3759
3760 case OP_ALLANY:
3761 if (eptr > md->end_subject - min)
3762 {
3763 SCHECK_PARTIAL();
3764 RRETURN(MATCH_NOMATCH);
3765 }
3766 eptr += min;
3767 break;
3768
3769 case OP_ANYBYTE:
3770 if (eptr > md->end_subject - min)
3771 {
3772 SCHECK_PARTIAL();
3773 RRETURN(MATCH_NOMATCH);
3774 }
3775 eptr += min;
3776 break;
3777
3778 case OP_ANYNL:
3779 for (i = 1; i <= min; i++)
3780 {
3781 if (eptr >= md->end_subject)
3782 {
3783 SCHECK_PARTIAL();
3784 RRETURN(MATCH_NOMATCH);
3785 }
3786 switch(*eptr++)
3787 {
3788 default: RRETURN(MATCH_NOMATCH);
3789 case 0x000d:
3790 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3791 break;
3792 case 0x000a:
3793 break;
3794
3795 case 0x000b:
3796 case 0x000c:
3797 case 0x0085:
3798 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3799 break;
3800 }
3801 }
3802 break;
3803
3804 case OP_NOT_HSPACE:
3805 for (i = 1; i <= min; i++)
3806 {
3807 if (eptr >= md->end_subject)
3808 {
3809 SCHECK_PARTIAL();
3810 RRETURN(MATCH_NOMATCH);
3811 }
3812 switch(*eptr++)
3813 {
3814 default: break;
3815 case 0x09: /* HT */
3816 case 0x20: /* SPACE */
3817 case 0xa0: /* NBSP */
3818 RRETURN(MATCH_NOMATCH);
3819 }
3820 }
3821 break;
3822
3823 case OP_HSPACE:
3824 for (i = 1; i <= min; i++)
3825 {
3826 if (eptr >= md->end_subject)
3827 {
3828 SCHECK_PARTIAL();
3829 RRETURN(MATCH_NOMATCH);
3830 }
3831 switch(*eptr++)
3832 {
3833 default: RRETURN(MATCH_NOMATCH);
3834 case 0x09: /* HT */
3835 case 0x20: /* SPACE */
3836 case 0xa0: /* NBSP */
3837 break;
3838 }
3839 }
3840 break;
3841
3842 case OP_NOT_VSPACE:
3843 for (i = 1; i <= min; i++)
3844 {
3845 if (eptr >= md->end_subject)
3846 {
3847 SCHECK_PARTIAL();
3848 RRETURN(MATCH_NOMATCH);
3849 }
3850 switch(*eptr++)
3851 {
3852 default: break;
3853 case 0x0a: /* LF */
3854 case 0x0b: /* VT */
3855 case 0x0c: /* FF */
3856 case 0x0d: /* CR */
3857 case 0x85: /* NEL */
3858 RRETURN(MATCH_NOMATCH);
3859 }
3860 }
3861 break;
3862
3863 case OP_VSPACE:
3864 for (i = 1; i <= min; i++)
3865 {
3866 if (eptr >= md->end_subject)
3867 {
3868 SCHECK_PARTIAL();
3869 RRETURN(MATCH_NOMATCH);
3870 }
3871 switch(*eptr++)
3872 {
3873 default: RRETURN(MATCH_NOMATCH);
3874 case 0x0a: /* LF */
3875 case 0x0b: /* VT */
3876 case 0x0c: /* FF */
3877 case 0x0d: /* CR */
3878 case 0x85: /* NEL */
3879 break;
3880 }
3881 }
3882 break;
3883
3884 case OP_NOT_DIGIT:
3885 for (i = 1; i <= min; i++)
3886 {
3887 if (eptr >= md->end_subject)
3888 {
3889 SCHECK_PARTIAL();
3890 RRETURN(MATCH_NOMATCH);
3891 }
3892 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3893 }
3894 break;
3895
3896 case OP_DIGIT:
3897 for (i = 1; i <= min; i++)
3898 {
3899 if (eptr >= md->end_subject)
3900 {
3901 SCHECK_PARTIAL();
3902 RRETURN(MATCH_NOMATCH);
3903 }
3904 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3905 }
3906 break;
3907
3908 case OP_NOT_WHITESPACE:
3909 for (i = 1; i <= min; i++)
3910 {
3911 if (eptr >= md->end_subject)
3912 {
3913 SCHECK_PARTIAL();
3914 RRETURN(MATCH_NOMATCH);
3915 }
3916 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3917 }
3918 break;
3919
3920 case OP_WHITESPACE:
3921 for (i = 1; i <= min; i++)
3922 {
3923 if (eptr >= md->end_subject)
3924 {
3925 SCHECK_PARTIAL();
3926 RRETURN(MATCH_NOMATCH);
3927 }
3928 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3929 }
3930 break;
3931
3932 case OP_NOT_WORDCHAR:
3933 for (i = 1; i <= min; i++)
3934 {
3935 if (eptr >= md->end_subject)
3936 {
3937 SCHECK_PARTIAL();
3938 RRETURN(MATCH_NOMATCH);
3939 }
3940 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3941 RRETURN(MATCH_NOMATCH);
3942 }
3943 break;
3944
3945 case OP_WORDCHAR:
3946 for (i = 1; i <= min; i++)
3947 {
3948 if (eptr >= md->end_subject)
3949 {
3950 SCHECK_PARTIAL();
3951 RRETURN(MATCH_NOMATCH);
3952 }
3953 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3954 RRETURN(MATCH_NOMATCH);
3955 }
3956 break;
3957
3958 default:
3959 RRETURN(PCRE_ERROR_INTERNAL);
3960 }
3961 }
3962
3963 /* If min = max, continue at the same level without recursing */
3964
3965 if (min == max) continue;
3966
3967 /* If minimizing, we have to test the rest of the pattern before each
3968 subsequent match. Again, separate the UTF-8 case for speed, and also
3969 separate the UCP cases. */
3970
3971 if (minimize)
3972 {
3973 #ifdef SUPPORT_UCP
3974 if (prop_type >= 0)
3975 {
3976 switch(prop_type)
3977 {
3978 case PT_ANY:
3979 for (fi = min;; fi++)
3980 {
3981 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3982 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3983 if (fi >= max) RRETURN(MATCH_NOMATCH);
3984 if (eptr >= md->end_subject)
3985 {
3986 SCHECK_PARTIAL();
3987 RRETURN(MATCH_NOMATCH);
3988 }
3989 GETCHARINC(c, eptr);
3990 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3991 }
3992 /* Control never gets here */
3993
3994 case PT_LAMP:
3995 for (fi = min;; fi++)
3996 {
3997 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3998 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3999 if (fi >= max) RRETURN(MATCH_NOMATCH);
4000 if (eptr >= md->end_subject)
4001 {
4002 SCHECK_PARTIAL();
4003 RRETURN(MATCH_NOMATCH);
4004 }
4005 GETCHARINC(c, eptr);
4006 prop_chartype = UCD_CHARTYPE(c);
4007 if ((prop_chartype == ucp_Lu ||
4008 prop_chartype == ucp_Ll ||
4009 prop_chartype == ucp_Lt) == prop_fail_result)
4010 RRETURN(MATCH_NOMATCH);
4011 }
4012 /* Control never gets here */
4013
4014 case PT_GC:
4015 for (fi = min;; fi++)
4016 {
4017 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4018 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4019 if (fi >= max) RRETURN(MATCH_NOMATCH);
4020 if (eptr >= md->end_subject)
4021 {
4022 SCHECK_PARTIAL();
4023 RRETURN(MATCH_NOMATCH);
4024 }
4025 GETCHARINC(c, eptr);
4026 prop_category = UCD_CATEGORY(c);
4027 if ((prop_category == prop_value) == prop_fail_result)
4028 RRETURN(MATCH_NOMATCH);
4029 }
4030 /* Control never gets here */
4031
4032 case PT_PC:
4033 for (fi = min;; fi++)
4034 {
4035 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4036 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4037 if (fi >= max) RRETURN(MATCH_NOMATCH);
4038 if (eptr >= md->end_subject)
4039 {
4040 SCHECK_PARTIAL();
4041 RRETURN(MATCH_NOMATCH);
4042 }
4043 GETCHARINC(c, eptr);
4044 prop_chartype = UCD_CHARTYPE(c);
4045 if ((prop_chartype == prop_value) == prop_fail_result)
4046 RRETURN(MATCH_NOMATCH);
4047 }
4048 /* Control never gets here */
4049
4050 case PT_SC:
4051 for (fi = min;; fi++)
4052 {
4053 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4054 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4055 if (fi >= max) RRETURN(MATCH_NOMATCH);
4056 if (eptr >= md->end_subject)
4057 {
4058 SCHECK_PARTIAL();
4059 RRETURN(MATCH_NOMATCH);
4060 }
4061 GETCHARINC(c, eptr);
4062 prop_script = UCD_SCRIPT(c);
4063 if ((prop_script == prop_value) == prop_fail_result)
4064 RRETURN(MATCH_NOMATCH);
4065 }
4066 /* Control never gets here */
4067
4068 default:
4069 RRETURN(PCRE_ERROR_INTERNAL);
4070 }
4071 }
4072
4073 /* Match extended Unicode sequences. We will get here only if the
4074 support is in the binary; otherwise a compile-time error occurs. */
4075
4076 else if (ctype == OP_EXTUNI)
4077 {
4078 for (fi = min;; fi++)
4079 {
4080 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4081 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4082 if (fi >= max) RRETURN(MATCH_NOMATCH);
4083 if (eptr >= md->end_subject)
4084 {
4085 SCHECK_PARTIAL();
4086 RRETURN(MATCH_NOMATCH);
4087 }
4088 GETCHARINCTEST(c, eptr);
4089 prop_category = UCD_CATEGORY(c);
4090 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
4091 while (eptr < md->end_subject)
4092 {
4093 int len = 1;
4094 if (!utf8) c = *eptr;
4095 else { GETCHARLEN(c, eptr, len); }
4096 prop_category = UCD_CATEGORY(c);
4097 if (prop_category != ucp_M) break;
4098 eptr += len;
4099 }
4100 }
4101 }
4102
4103 else
4104 #endif /* SUPPORT_UCP */
4105
4106 #ifdef SUPPORT_UTF8
4107 /* UTF-8 mode */
4108 if (utf8)
4109 {
4110 for (fi = min;; fi++)
4111 {
4112 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4113 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4114 if (fi >= max) RRETURN(MATCH_NOMATCH);
4115 if (eptr >= md->end_subject)
4116 {
4117 SCHECK_PARTIAL();
4118 RRETURN(MATCH_NOMATCH);
4119 }
4120 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4121 RRETURN(MATCH_NOMATCH);
4122 GETCHARINC(c, eptr);
4123 switch(ctype)
4124 {
4125 case OP_ANY: /* This is the non-NL case */
4126 case OP_ALLANY:
4127 case OP_ANYBYTE:
4128 break;
4129
4130 case OP_ANYNL:
4131 switch(c)
4132 {
4133 default: RRETURN(MATCH_NOMATCH);
4134 case 0x000d:
4135 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4136 break;
4137 case 0x000a:
4138 break;
4139
4140 case 0x000b:
4141 case 0x000c:
4142 case 0x0085:
4143 case 0x2028:
4144 case 0x2029:
4145 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4146 break;
4147 }
4148 break;
4149
4150 case OP_NOT_HSPACE:
4151 switch(c)
4152 {
4153 default: break;
4154 case 0x09: /* HT */
4155 case 0x20: /* SPACE */
4156 case 0xa0: /* NBSP */
4157 case 0x1680: /* OGHAM SPACE MARK */
4158 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4159 case 0x2000: /* EN QUAD */
4160 case 0x2001: /* EM QUAD */
4161 case 0x2002: /* EN SPACE */
4162 case 0x2003: /* EM SPACE */
4163 case 0x2004: /* THREE-PER-EM SPACE */
4164 case 0x2005: /* FOUR-PER-EM SPACE */
4165 case 0x2006: /* SIX-PER-EM SPACE */
4166 case 0x2007: /* FIGURE SPACE */
4167 case 0x2008: /* PUNCTUATION SPACE */
4168 case 0x2009: /* THIN SPACE */
4169 case 0x200A: /* HAIR SPACE */
4170 case 0x202f: /* NARROW NO-BREAK SPACE */
4171 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4172 case 0x3000: /* IDEOGRAPHIC SPACE */
4173 RRETURN(MATCH_NOMATCH);
4174 }
4175 break;
4176
4177 case OP_HSPACE:
4178 switch(c)
4179 {
4180 default: RRETURN(MATCH_NOMATCH);
4181 case 0x09: /* HT */
4182 case 0x20: /* SPACE */
4183 case 0xa0: /* NBSP */
4184 case 0x1680: /* OGHAM SPACE MARK */
4185 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4186 case 0x2000: /* EN QUAD */
4187 case 0x2001: /* EM QUAD */
4188 case 0x2002: /* EN SPACE */
4189 case 0x2003: /* EM SPACE */
4190 case 0x2004: /* THREE-PER-EM SPACE */
4191 case 0x2005: /* FOUR-PER-EM SPACE */
4192 case 0x2006: /* SIX-PER-EM SPACE */
4193 case 0x2007: /* FIGURE SPACE */
4194 case 0x2008: /* PUNCTUATION SPACE */
4195 case 0x2009: /* THIN SPACE */
4196 case 0x200A: /* HAIR SPACE */
4197 case 0x202f: /* NARROW NO-BREAK SPACE */
4198 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4199 case 0x3000: /* IDEOGRAPHIC SPACE */
4200 break;
4201 }
4202 break;
4203
4204 case OP_NOT_VSPACE:
4205 switch(c)
4206 {
4207 default: break;
4208 case 0x0a: /* LF */
4209 case 0x0b: /* VT */
4210 case 0x0c: /* FF */
4211 case 0x0d: /* CR */
4212 case 0x85: /* NEL */
4213 case 0x2028: /* LINE SEPARATOR */
4214 case 0x2029: /* PARAGRAPH SEPARATOR */
4215 RRETURN(MATCH_NOMATCH);
4216 }
4217 break;
4218
4219 case OP_VSPACE:
4220 switch(c)
4221 {
4222 default: RRETURN(MATCH_NOMATCH);
4223 case 0x0a: /* LF */
4224 case 0x0b: /* VT */
4225 case 0x0c: /* FF */
4226 case 0x0d: /* CR */
4227 case 0x85: /* NEL */
4228 case 0x2028: /* LINE SEPARATOR */
4229 case 0x2029: /* PARAGRAPH SEPARATOR */
4230 break;
4231 }
4232 break;
4233
4234 case OP_NOT_DIGIT:
4235 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4236 RRETURN(MATCH_NOMATCH);
4237 break;
4238
4239 case OP_DIGIT:
4240 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4241 RRETURN(MATCH_NOMATCH);
4242 break;
4243
4244 case OP_NOT_WHITESPACE:
4245 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4246 RRETURN(MATCH_NOMATCH);
4247 break;
4248
4249 case OP_WHITESPACE:
4250 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4251 RRETURN(MATCH_NOMATCH);
4252 break;
4253
4254 case OP_NOT_WORDCHAR:
4255 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4256 RRETURN(MATCH_NOMATCH);
4257 break;
4258
4259 case OP_WORDCHAR:
4260 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4261 RRETURN(MATCH_NOMATCH);
4262 break;
4263
4264 default:
4265 RRETURN(PCRE_ERROR_INTERNAL);
4266 }
4267 }
4268 }
4269 else
4270 #endif
4271 /* Not UTF-8 mode */
4272 {
4273 for (fi = min;; fi++)
4274 {
4275 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4276 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4277 if (fi >= max) RRETURN(MATCH_NOMATCH);
4278 if (eptr >= md->end_subject)
4279 {
4280 SCHECK_PARTIAL();
4281 RRETURN(MATCH_NOMATCH);
4282 }
4283 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4284 RRETURN(MATCH_NOMATCH);
4285 c = *eptr++;
4286 switch(ctype)
4287 {
4288 case OP_ANY: /* This is the non-NL case */
4289 case OP_ALLANY:
4290 case OP_ANYBYTE:
4291 break;
4292
4293 case OP_ANYNL:
4294 switch(c)
4295 {
4296 default: RRETURN(MATCH_NOMATCH);
4297 case 0x000d:
4298 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4299 break;
4300
4301 case 0x000a:
4302 break;
4303
4304 case 0x000b:
4305 case 0x000c:
4306 case 0x0085:
4307 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4308 break;
4309 }
4310 break;
4311
4312 case OP_NOT_HSPACE:
4313 switch(c)
4314 {
4315 default: break;
4316 case 0x09: /* HT */
4317 case 0x20: /* SPACE */
4318 case 0xa0: /* NBSP */
4319 RRETURN(MATCH_NOMATCH);
4320 }
4321 break;
4322
4323 case OP_HSPACE:
4324 switch(c)
4325 {
4326 default: RRETURN(MATCH_NOMATCH);
4327 case 0x09: /* HT */
4328 case 0x20: /* SPACE */
4329 case 0xa0: /* NBSP */
4330 break;
4331 }
4332 break;
4333
4334 case OP_NOT_VSPACE:
4335 switch(c)
4336 {
4337 default: break;
4338 case 0x0a: /* LF */
4339 case 0x0b: /* VT */
4340 case 0x0c: /* FF */
4341 case 0x0d: /* CR */
4342 case 0x85: /* NEL */
4343 RRETURN(MATCH_NOMATCH);
4344 }
4345 break;
4346
4347 case OP_VSPACE:
4348 switch(c)
4349 {
4350 default: RRETURN(MATCH_NOMATCH);
4351 case 0x0a: /* LF */
4352 case 0x0b: /* VT */
4353 case 0x0c: /* FF */
4354 case 0x0d: /* CR */
4355 case 0x85: /* NEL */
4356 break;
4357 }
4358 break;
4359
4360 case OP_NOT_DIGIT:
4361 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
4362 break;
4363
4364 case OP_DIGIT:
4365 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
4366 break;
4367
4368 case OP_NOT_WHITESPACE:
4369 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
4370 break;
4371
4372 case OP_WHITESPACE:
4373 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
4374 break;
4375
4376 case OP_NOT_WORDCHAR:
4377 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
4378 break;
4379
4380 case OP_WORDCHAR:
4381 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
4382 break;
4383
4384 default:
4385 RRETURN(PCRE_ERROR_INTERNAL);
4386 }
4387 }
4388 }
4389 /* Control never gets here */
4390 }
4391
4392 /* If maximizing, it is worth using inline code for speed, doing the type
4393 test once at the start (i.e. keep it out of the loop). Again, keep the
4394 UTF-8 and UCP stuff separate. */
4395
4396 else
4397 {
4398 pp = eptr; /* Remember where we started */
4399
4400 #ifdef SUPPORT_UCP
4401 if (prop_type >= 0)
4402 {
4403 switch(prop_type)
4404 {
4405 case PT_ANY:
4406 for (i = min; i < max; i++)
4407 {
4408 int len = 1;
4409 if (eptr >= md->end_subject)
4410 {
4411 SCHECK_PARTIAL();
4412 break;
4413 }
4414 GETCHARLEN(c, eptr, len);
4415 if (prop_fail_result) break;
4416 eptr+= len;
4417 }
4418 break;
4419
4420 case PT_LAMP:
4421 for (i = min; i < max; i++)
4422 {
4423 int len = 1;
4424 if (eptr >= md->end_subject)
4425 {
4426 SCHECK_PARTIAL();
4427 break;
4428 }
4429 GETCHARLEN(c, eptr, len);
4430 prop_chartype = UCD_CHARTYPE(c);
4431 if ((prop_chartype == ucp_Lu ||
4432 prop_chartype == ucp_Ll ||
4433 prop_chartype == ucp_Lt) == prop_fail_result)
4434 break;
4435 eptr+= len;
4436 }
4437 break;
4438
4439 case PT_GC:
4440 for (i = min; i < max; i++)
4441 {
4442 int len = 1;
4443 if (eptr >= md->end_subject)
4444 {
4445 SCHECK_PARTIAL();
4446 break;
4447 }
4448 GETCHARLEN(c, eptr, len);
4449 prop_category = UCD_CATEGORY(c);
4450 if ((prop_category == prop_value) == prop_fail_result)
4451 break;
4452 eptr+= len;
4453 }
4454 break;
4455
4456 case PT_PC:
4457 for (i = min; i < max; i++)
4458 {
4459 int len = 1;
4460 if (eptr >= md->end_subject)
4461 {
4462 SCHECK_PARTIAL();
4463 break;
4464 }
4465 GETCHARLEN(c, eptr, len);
4466 prop_chartype = UCD_CHARTYPE(c);
4467 if ((prop_chartype == prop_value) == prop_fail_result)
4468 break;
4469 eptr+= len;
4470 }
4471 break;
4472
4473 case PT_SC:
4474 for (i = min; i < max; i++)
4475 {
4476 int len = 1;
4477 if (eptr >= md->end_subject)
4478 {
4479 SCHECK_PARTIAL();
4480 break;
4481 }
4482 GETCHARLEN(c, eptr, len);
4483 prop_script = UCD_SCRIPT(c);
4484 if ((prop_script == prop_value) == prop_fail_result)
4485 break;
4486 eptr+= len;
4487 }
4488 break;
4489 }
4490
4491 /* eptr is now past the end of the maximum run */
4492
4493 if (possessive) continue;
4494 for(;;)
4495 {
4496 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4497 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4498 if (eptr-- == pp) break; /* Stop if tried at original pos */
4499 if (utf8) BACKCHAR(eptr);
4500 }
4501 }
4502
4503 /* Match extended Unicode sequences. We will get here only if the
4504 support is in the binary; otherwise a compile-time error occurs. */
4505
4506 else if (ctype == OP_EXTUNI)
4507 {
4508 for (i = min; i < max; i++)
4509 {
4510 if (eptr >= md->end_subject)
4511 {
4512 SCHECK_PARTIAL();
4513 break;
4514 }
4515 GETCHARINCTEST(c, eptr);
4516 prop_category = UCD_CATEGORY(c);
4517 if (prop_category == ucp_M) break;
4518 while (eptr < md->end_subject)
4519 {
4520 int len = 1;
4521 if (!utf8) c = *eptr; else
4522 {
4523 GETCHARLEN(c, eptr, len);
4524 }
4525 prop_category = UCD_CATEGORY(c);
4526 if (prop_category != ucp_M) break;
4527 eptr += len;
4528 }
4529 }
4530
4531 /* eptr is now past the end of the maximum run */
4532
4533 if (possessive) continue;
4534
4535 for(;;)
4536 {
4537 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4538 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4539 if (eptr-- == pp) break; /* Stop if tried at original pos */
4540 for (;;) /* Move back over one extended */
4541 {
4542 int len = 1;
4543 if (!utf8) c = *eptr; else
4544 {
4545 BACKCHAR(eptr);
4546 GETCHARLEN(c, eptr, len);
4547 }
4548 prop_category = UCD_CATEGORY(c);
4549 if (prop_category != ucp_M) break;
4550 eptr--;
4551 }
4552 }
4553 }
4554
4555 else
4556 #endif /* SUPPORT_UCP */
4557
4558 #ifdef SUPPORT_UTF8
4559 /* UTF-8 mode */
4560
4561 if (utf8)
4562 {
4563 switch(ctype)
4564 {
4565 case OP_ANY:
4566 if (max < INT_MAX)
4567 {
4568 for (i = min; i < max; i++)
4569 {
4570 if (eptr >= md->end_subject)
4571 {
4572 SCHECK_PARTIAL();
4573 break;
4574 }
4575 if (IS_NEWLINE(eptr)) break;
4576 eptr++;
4577 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4578 }
4579 }
4580
4581 /* Handle unlimited UTF-8 repeat */
4582
4583 else
4584 {
4585 for (i = min; i < max; i++)
4586 {
4587 if (eptr >= md->end_subject)
4588 {
4589 SCHECK_PARTIAL();
4590 break;
4591 }
4592 if (IS_NEWLINE(eptr)) break;
4593 eptr++;
4594 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4595 }
4596 }
4597 break;
4598
4599 case OP_ALLANY:
4600 if (max < INT_MAX)
4601 {
4602 for (i = min; i < max; i++)
4603 {
4604 if (eptr >= md->end_subject)
4605 {
4606 SCHECK_PARTIAL();
4607 break;
4608 }
4609 eptr++;
4610 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4611 }
4612 }
4613 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
4614 break;
4615
4616 /* The byte case is the same as non-UTF8 */
4617
4618 case OP_ANYBYTE:
4619 c = max - min;
4620 if (c > (unsigned int)(md->end_subject - eptr))
4621 {
4622 eptr = md->end_subject;
4623 SCHECK_PARTIAL();
4624 }
4625 else eptr += c;
4626 break;
4627
4628 case OP_ANYNL:
4629 for (i = min; i < max; i++)
4630 {
4631 int len = 1;
4632 if (eptr >= md->end_subject)
4633 {
4634 SCHECK_PARTIAL();
4635 break;
4636 }
4637 GETCHARLEN(c, eptr, len);
4638 if (c == 0x000d)
4639 {
4640 if (++eptr >= md->end_subject) break;
4641 if (*eptr == 0x000a) eptr++;
4642 }
4643 else
4644 {
4645 if (c != 0x000a &&
4646 (md->bsr_anycrlf ||
4647 (c != 0x000b && c != 0x000c &&
4648 c != 0x0085 && c != 0x2028 && c != 0x2029)))
4649 break;
4650 eptr += len;
4651 }
4652 }
4653 break;
4654
4655 case OP_NOT_HSPACE:
4656 case OP_HSPACE:
4657 for (i = min; i < max; i++)
4658 {
4659 BOOL gotspace;
4660 int len = 1;
4661 if (eptr >= md->end_subject)
4662 {
4663 SCHECK_PARTIAL();
4664 break;
4665 }
4666 GETCHARLEN(c, eptr, len);
4667 switch(c)
4668 {
4669 default: gotspace = FALSE; break;
4670 case 0x09: /* HT */
4671 case 0x20: /* SPACE */
4672 case 0xa0: /* NBSP */
4673 case 0x1680: /* OGHAM SPACE MARK */
4674 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4675 case 0x2000: /* EN QUAD */
4676 case 0x2001: /* EM QUAD */
4677 case 0x2002: /* EN SPACE */
4678 case 0x2003: /* EM SPACE */
4679 case 0x2004: /* THREE-PER-EM SPACE */
4680 case 0x2005: /* FOUR-PER-EM SPACE */
4681 case 0x2006: /* SIX-PER-EM SPACE */
4682 case 0x2007: /* FIGURE SPACE */
4683 case 0x2008: /* PUNCTUATION SPACE */
4684 case 0x2009: /* THIN SPACE */
4685 case 0x200A: /* HAIR SPACE */
4686 case 0x202f: /* NARROW NO-BREAK SPACE */
4687 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4688 case 0x3000: /* IDEOGRAPHIC SPACE */
4689 gotspace = TRUE;
4690 break;
4691 }
4692 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4693 eptr += len;
4694 }
4695 break;
4696
4697 case OP_NOT_VSPACE:
4698 case OP_VSPACE:
4699 for (i = min; i < max; i++)
4700 {
4701 BOOL gotspace;
4702 int len = 1;
4703 if (eptr >= md->end_subject)
4704 {
4705 SCHECK_PARTIAL();
4706 break;
4707 }
4708 GETCHARLEN(c, eptr, len);
4709 switch(c)
4710 {
4711 default: gotspace = FALSE; break;
4712 case 0x0a: /* LF */
4713 case 0x0b: /* VT */
4714 case 0x0c: /* FF */
4715 case 0x0d: /* CR */
4716 case 0x85: /* NEL */
4717 case 0x2028: /* LINE SEPARATOR */
4718 case 0x2029: /* PARAGRAPH SEPARATOR */
4719 gotspace = TRUE;
4720 break;
4721 }
4722 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4723 eptr += len;
4724 }
4725 break;
4726
4727 case OP_NOT_DIGIT:
4728 for (i = min; i < max; i++)
4729 {
4730 int len = 1;
4731 if (eptr >= md->end_subject)
4732 {
4733 SCHECK_PARTIAL();
4734 break;
4735 }
4736 GETCHARLEN(c, eptr, len);
4737 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4738 eptr+= len;
4739 }
4740 break;
4741
4742 case OP_DIGIT:
4743 for (i = min; i < max; i++)
4744 {
4745 int len = 1;
4746 if (eptr >= md->end_subject)
4747 {
4748 SCHECK_PARTIAL();
4749 break;
4750 }
4751 GETCHARLEN(c, eptr, len);
4752 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4753 eptr+= len;
4754 }
4755 break;
4756
4757 case OP_NOT_WHITESPACE:
4758 for (i = min; i < max; i++)
4759 {
4760 int len = 1;
4761 if (eptr >= md->end_subject)
4762 {
4763 SCHECK_PARTIAL();
4764 break;
4765 }
4766 GETCHARLEN(c, eptr, len);
4767 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4768 eptr+= len;
4769 }
4770 break;
4771
4772 case OP_WHITESPACE:
4773 for (i = min; i < max; i++)
4774 {
4775 int len = 1;
4776 if (eptr >= md->end_subject)
4777 {
4778 SCHECK_PARTIAL();
4779 break;
4780 }
4781 GETCHARLEN(c, eptr, len);
4782 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4783 eptr+= len;
4784 }
4785 break;
4786
4787 case OP_NOT_WORDCHAR:
4788 for (i = min; i < max; i++)
4789 {
4790 int len = 1;
4791 if (eptr >= md->end_subject)
4792 {
4793 SCHECK_PARTIAL();
4794 break;
4795 }
4796 GETCHARLEN(c, eptr, len);
4797 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4798 eptr+= len;
4799 }
4800 break;
4801
4802 case OP_WORDCHAR:
4803 for (i = min; i < max; i++)
4804 {
4805 int len = 1;
4806 if (eptr >= md->end_subject)
4807 {
4808 SCHECK_PARTIAL();
4809 break;
4810 }
4811 GETCHARLEN(c, eptr, len);
4812 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4813 eptr+= len;
4814 }
4815 break;
4816
4817 default:
4818 RRETURN(PCRE_ERROR_INTERNAL);
4819 }
4820
4821 /* eptr is now past the end of the maximum run */
4822
4823 if (possessive) continue;
4824 for(;;)
4825 {
4826 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4827 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4828 if (eptr-- == pp) break; /* Stop if tried at original pos */
4829 BACKCHAR(eptr);
4830 }
4831 }
4832 else
4833 #endif /* SUPPORT_UTF8 */
4834
4835 /* Not UTF-8 mode */
4836 {
4837 switch(ctype)
4838 {
4839 case OP_ANY:
4840 for (i = min; i < max; i++)
4841 {
4842 if (eptr >= md->end_subject)
4843 {
4844 SCHECK_PARTIAL();
4845 break;
4846 }
4847 if (IS_NEWLINE(eptr)) break;
4848 eptr++;
4849 }
4850 break;
4851
4852 case OP_ALLANY:
4853 case OP_ANYBYTE:
4854 c = max - min;
4855 if (c > (unsigned int)(md->end_subject - eptr))
4856 {
4857 eptr = md->end_subject;
4858 SCHECK_PARTIAL();
4859 }
4860 else eptr += c;
4861 break;
4862
4863 case OP_ANYNL:
4864 for (i = min; i < max; i++)
4865 {
4866 if (eptr >= md->end_subject)
4867 {
4868 SCHECK_PARTIAL();
4869 break;
4870 }
4871 c = *eptr;
4872 if (c == 0x000d)
4873 {
4874 if (++eptr >= md->end_subject) break;
4875 if (*eptr == 0x000a) eptr++;
4876 }
4877 else
4878 {
4879 if (c != 0x000a &&
4880 (md->bsr_anycrlf ||
4881 (c != 0x000b && c != 0x000c && c != 0x0085)))
4882 break;
4883 eptr++;
4884 }
4885 }
4886 break;
4887
4888 case OP_NOT_HSPACE:
4889 for (i = min; i < max; i++)
4890 {
4891 if (eptr >= md->end_subject)
4892 {
4893 SCHECK_PARTIAL();
4894 break;
4895 }
4896 c = *eptr;
4897 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4898 eptr++;
4899 }
4900 break;
4901
4902 case OP_HSPACE:
4903 for (i = min; i < max; i++)
4904 {
4905 if (eptr >= md->end_subject)
4906 {
4907 SCHECK_PARTIAL();
4908 break;
4909 }
4910 c = *eptr;
4911 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4912 eptr++;
4913 }
4914 break;
4915
4916 case OP_NOT_VSPACE:
4917 for (i = min; i < max; i++)
4918 {
4919 if (eptr >= md->end_subject)
4920 {
4921 SCHECK_PARTIAL();
4922 break;
4923 }
4924 c = *eptr;
4925 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4926 break;
4927 eptr++;
4928 }
4929 break;
4930
4931 case OP_VSPACE:
4932 for (i = min; i < max; i++)
4933 {
4934 if (eptr >= md->end_subject)
4935 {
4936 SCHECK_PARTIAL();
4937 break;
4938 }
4939 c = *eptr;
4940 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4941 break;
4942 eptr++;
4943 }
4944 break;
4945
4946 case OP_NOT_DIGIT:
4947 for (i = min; i < max; i++)
4948 {
4949 if (eptr >= md->end_subject)
4950 {
4951 SCHECK_PARTIAL();
4952 break;
4953 }
4954 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
4955 eptr++;
4956 }
4957 break;
4958
4959 case OP_DIGIT:
4960 for (i = min; i < max; i++)
4961 {
4962 if (eptr >= md->end_subject)
4963 {
4964 SCHECK_PARTIAL();
4965 break;
4966 }
4967 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
4968 eptr++;
4969 }
4970 break;
4971
4972 case OP_NOT_WHITESPACE:
4973 for (i = min; i < max; i++)
4974 {
4975 if (eptr >= md->end_subject)
4976 {
4977 SCHECK_PARTIAL();
4978 break;
4979 }
4980 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
4981 eptr++;
4982 }
4983 break;
4984
4985 case OP_WHITESPACE:
4986 for (i = min; i < max; i++)
4987 {
4988 if (eptr >= md->end_subject)
4989 {
4990 SCHECK_PARTIAL();
4991 break;
4992 }
4993 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
4994 eptr++;
4995 }
4996 break;
4997
4998 case OP_NOT_WORDCHAR:
4999 for (i = min; i < max; i++)
5000 {
5001 if (eptr >= md->end_subject)
5002 {
5003 SCHECK_PARTIAL();
5004 break;
5005 }
5006 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5007 eptr++;
5008 }
5009 break;
5010
5011 case OP_WORDCHAR:
5012 for (i = min; i < max; i++)
5013 {
5014 if (eptr >= md->end_subject)
5015 {
5016 SCHECK_PARTIAL();
5017 break;
5018 }
5019 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5020 eptr++;
5021 }
5022 break;
5023
5024 default:
5025 RRETURN(PCRE_ERROR_INTERNAL);
5026 }
5027
5028 /* eptr is now past the end of the maximum run */
5029
5030 if (possessive) continue;
5031 while (eptr >= pp)
5032 {
5033 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5034 eptr--;
5035 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5036 }
5037 }
5038
5039 /* Get here if we can't make it match with any permitted repetitions */
5040
5041 RRETURN(MATCH_NOMATCH);
5042 }
5043 /* Control never gets here */
5044
5045 /* There's been some horrible disaster. Arrival here can only mean there is
5046 something seriously wrong in the code above or the OP_xxx definitions. */
5047
5048 default:
5049 DPRINTF(("Unknown opcode %d\n", *ecode));
5050 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5051 }
5052
5053 /* Do not stick any code in here without much thought; it is assumed
5054 that "continue" in the code above comes out to here to repeat the main
5055 loop. */
5056
5057 } /* End of main loop */
5058 /* Control never reaches here */
5059
5060
5061 /* When compiling to use the heap rather than the stack for recursive calls to
5062 match(), the RRETURN() macro jumps here. The number that is saved in
5063 frame->Xwhere indicates which label we actually want to return to. */
5064
5065 #ifdef NO_RECURSE
5066 #define LBL(val) case val: goto L_RM##val;
5067 HEAP_RETURN:
5068 switch (frame->Xwhere)
5069 {
5070 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5071 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5072 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5073 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5074 LBL(53) LBL(54)
5075 #ifdef SUPPORT_UTF8
5076 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5077 LBL(32) LBL(34) LBL(42) LBL(46)
5078 #ifdef SUPPORT_UCP
5079 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5080 #endif /* SUPPORT_UCP */
5081 #endif /* SUPPORT_UTF8 */
5082 default:
5083 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5084 return PCRE_ERROR_INTERNAL;
5085 }
5086 #undef LBL
5087 #endif /* NO_RECURSE */
5088 }
5089
5090
5091 /***************************************************************************
5092 ****************************************************************************
5093 RECURSION IN THE match() FUNCTION
5094
5095 Undefine all the macros that were defined above to handle this. */
5096
/* The frame-variable macros exist only in the NO_RECURSE build, so they are
removed only in that build. They are listed alphabetically; the order of
#undef directives has no effect. */

#ifdef NO_RECURSE

#undef callpat
#undef charptr
#undef condition
#undef ctype
#undef cur_is_word
#undef data
#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef ims
#undef length
#undef max
#undef min
#undef mstart
#undef new_recursive
#undef newptrb
#undef next
#undef number
#undef offset
#undef offset_top
#undef op
#undef original_ims
#undef pp
#undef prev
#undef prev_is_word
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef saved_eptr
#undef stacksave

#endif  /* NO_RECURSE */

/* fi and fc are macros whether or not NO_RECURSE is defined, so they are
always removed. */

#undef fi
#undef fc
5143
5144 /***************************************************************************
5145 ***************************************************************************/
5146
5147
5148
5149 /*************************************************
5150 * Execute a Regular Expression *
5151 *************************************************/
5152
5153 /* This function applies a compiled re to a subject string and picks out
5154 portions of the string if it matches. Two elements in the vector are set for
5155 each substring: the offsets to the start and end of the substring.
5156
5157 Arguments:
5158 argument_re points to the compiled expression
5159 extra_data points to extra data or is NULL
5160 subject points to the subject string
5161 length length of subject string (may contain binary zeros)
5162 start_offset where to start in the subject string
5163 options option bits
5164 offsets points to a vector of ints to be filled in with offsets
5165 offsetcount the number of elements in the vector
5166
5167 Returns: > 0 => success; value is the number of elements filled in
5168 = 0 => success, but offsets is not big enough
5169 -1 => failed to match
5170 < -1 => some kind of unexpected problem
5171 */
5172
5173 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5174 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5175 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5176 int offsetcount)
5177 {
5178 int rc, resetcount, ocount;
5179 int first_byte = -1;
5180 int req_byte = -1;
5181 int req_byte2 = -1;
5182 int newline;
5183 unsigned long int ims;
5184 BOOL using_temporary_offsets = FALSE;
5185 BOOL anchored;
5186 BOOL startline;
5187 BOOL firstline;
5188 BOOL first_byte_caseless = FALSE;
5189 BOOL req_byte_caseless = FALSE;
5190 BOOL utf8;
5191 match_data match_block;
5192 match_data *md = &match_block;
5193 const uschar *tables;
5194 const uschar *start_bits = NULL;
5195 USPTR start_match = (USPTR)subject + start_offset;
5196 USPTR end_subject;
5197 USPTR start_partial = NULL;
5198 USPTR req_byte_ptr = start_match - 1;
5199
5200 pcre_study_data internal_study;
5201 const pcre_study_data *study;
5202
5203 real_pcre internal_re;
5204 const real_pcre *external_re = (const real_pcre *)argument_re;
5205 const real_pcre *re = external_re;
5206
5207 /* Plausibility checks */
5208
5209 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5210 if (re == NULL || subject == NULL ||
5211 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5212 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5213
5214 /* This information is for finding all the numbers associated with a given
5215 name, for condition testing. */
5216
5217 md->name_table = (uschar *)re + re->name_table_offset;
5218 md->name_count = re->name_count;
5219 md->name_entry_size = re->name_entry_size;
5220
5221 /* Fish out the optional data from the extra_data structure, first setting
5222 the default values. */
5223
5224 study = NULL;
5225 md->match_limit = MATCH_LIMIT;
5226 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5227 md->callout_data = NULL;
5228
5229 /* The table pointer is always in native byte order. */
5230
5231 tables = external_re->tables;
5232
5233 if (extra_data != NULL)
5234 {
5235 register unsigned int flags = extra_data->flags;
5236 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5237 study = (const pcre_study_data *)extra_data->study_data;
5238 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5239 md->match_limit = extra_data->match_limit;
5240 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5241 md->match_limit_recursion = extra_data->match_limit_recursion;
5242 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5243 md->callout_data = extra_data->callout_data;
5244 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5245 }
5246
5247 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5248 is a feature that makes it possible to save compiled regex and re-use them
5249 in other programs later. */
5250
5251 if (tables == NULL) tables = _pcre_default_tables;
5252
5253 /* Check that the first field in the block is the magic number. If it is not,
5254 test for a regex that was compiled on a host of opposite endianness. If this is
5255 the case, flipped values are put in internal_re and internal_study if there was
5256 study data too. */
5257
5258 if (re->magic_number != MAGIC_NUMBER)
5259 {
5260 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5261 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5262 if (study != NULL) study = &internal_study;
5263 }
5264
5265 /* Set up other data */
5266
5267 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5268 startline = (re->flags & PCRE_STARTLINE) != 0;
5269 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5270
5271 /* The code starts after the real_pcre block and the capture name table. */
5272
5273 md->start_code = (const uschar *)external_re + re->name_table_offset +
5274 re->name_count * re->name_entry_size;
5275
5276 md->start_subject = (USPTR)subject;
5277 md->start_offset = start_offset;
5278 md->end_subject = md->start_subject + length;
5279 end_subject = md->end_subject;
5280
5281 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5282 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5283 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5284
5285 md->notbol = (options & PCRE_NOTBOL) != 0;
5286 md->noteol = (options & PCRE_NOTEOL) != 0;
5287 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5288 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5289 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5290 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5291 md->hitend = FALSE;
5292
5293 md->recursive = NULL; /* No recursion at top level */
5294
5295 md->lcc = tables + lcc_offset;
5296 md->ctypes = tables + ctypes_offset;
5297
5298 /* Handle different \R options. */
5299
5300 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5301 {
5302 case 0:
5303 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5304 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5305 else
5306 #ifdef BSR_ANYCRLF
5307 md->bsr_anycrlf = TRUE;
5308 #else
5309 md->bsr_anycrlf = FALSE;
5310 #endif
5311 break;
5312
5313 case PCRE_BSR_ANYCRLF:
5314 md->bsr_anycrlf = TRUE;
5315 break;
5316
5317 case PCRE_BSR_UNICODE:
5318 md->bsr_anycrlf = FALSE;
5319 break;
5320
5321 default: return PCRE_ERROR_BADNEWLINE;
5322 }
5323
5324 /* Handle different types of newline. The three bits give eight cases. If
5325 nothing is set at run time, whatever was used at compile time applies. */
5326
5327 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5328 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5329 {
5330 case 0: newline = NEWLINE; break; /* Compile-time default */
5331 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5332 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5333 case PCRE_NEWLINE_CR+
5334 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5335 case PCRE_NEWLINE_ANY: newline = -1; break;
5336 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5337 default: return PCRE_ERROR_BADNEWLINE;
5338 }
5339
5340 if (newline == -2)
5341 {
5342 md->nltype = NLTYPE_ANYCRLF;
5343 }
5344 else if (newline < 0)
5345 {
5346 md->nltype = NLTYPE_ANY;
5347 }
5348 else
5349 {
5350 md->nltype = NLTYPE_FIXED;
5351 if (newline > 255)
5352 {
5353 md->nllen = 2;
5354 md->nl[0] = (newline >> 8) & 255;
5355 md->nl[1] = newline & 255;
5356 }
5357 else
5358 {
5359 md->nllen = 1;
5360 md->nl[0] = newline;
5361 }
5362 }
5363
5364 /* Partial matching was originally supported only for a restricted set of
5365 regexes; from release 8.00 there are no restrictions, but the bits are still
5366 defined (though never set). So there's no harm in leaving this code. */
5367
5368 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5369 return PCRE_ERROR_BADPARTIAL;
5370
5371 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5372 back the character offset. */
5373
5374 #ifdef SUPPORT_UTF8
5375 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5376 {
5377 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5378 return PCRE_ERROR_BADUTF8;
5379 if (start_offset > 0 && start_offset < length)
5380 {
5381 int tb = ((USPTR)subject)[start_offset];
5382 if (tb > 127)
5383 {
5384 tb &= 0xc0;
5385 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5386 }
5387 }
5388 }
5389 #endif
5390
5391 /* The ims options can vary during the matching as a result of the presence
5392 of (?ims) items in the pattern. They are kept in a local variable so that
5393 restoring at the exit of a group is easy. */
5394
5395 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5396
5397 /* If the expression has got more back references than the offsets supplied can
5398 hold, we get a temporary chunk of working store to use during the matching.
5399 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5400 of 3. */
5401
5402 ocount = offsetcount - (offsetcount % 3);
5403
5404 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5405 {
5406 ocount = re->top_backref * 3 + 3;
5407 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5408 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5409 using_temporary_offsets = TRUE;
5410 DPRINTF(("Got memory to hold back references\n"));
5411 }
5412 else md->offset_vector = offsets;
5413
5414 md->offset_end = ocount;
5415 md->offset_max = (2*ocount)/3;
5416 md->offset_overflow = FALSE;
5417 md->capture_last = -1;
5418
5419 /* Compute the minimum number of offsets that we need to reset each time. Doing
5420 this makes a huge difference to execution time when there aren't many brackets
5421 in the pattern. */
5422
5423 resetcount = 2 + re->top_bracket * 2;
5424 if (resetcount > offsetcount) resetcount = ocount;
5425
5426 /* Reset the working variable associated with each extraction. These should
5427 never be used unless previously set, but they get saved and restored, and so we
5428 initialize them to avoid reading uninitialized locations. */
5429
5430 if (md->offset_vector != NULL)
5431 {
5432 register int *iptr = md->offset_vector + ocount;
5433 register int *iend = iptr - resetcount/2 + 1;
5434 while (--iptr >= iend) *iptr = -1;
5435 }
5436
5437 /* Set up the first character to match, if available. The first_byte value is
5438 never set for an anchored regular expression, but the anchoring may be forced
5439 at run time, so we have to test for anchoring. The first char may be unset for
5440 an unanchored pattern, of course. If there's no first char and the pattern was
5441 studied, there may be a bitmap of possible first characters. */
5442
5443 if (!anchored)
5444 {
5445 if ((re->flags & PCRE_FIRSTSET) != 0)
5446 {
5447 first_byte = re->first_byte & 255;
5448 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5449 first_byte = md->lcc[first_byte];
5450 }
5451 else
5452 if (!startline && study != NULL &&
5453 (study->flags & PCRE_STUDY_MAPPED) != 0)
5454 start_bits = study->start_bits;
5455 }
5456
5457 /* For anchored or unanchored matches, there may be a "last known required
5458 character" set. */
5459
5460 if ((re->flags & PCRE_REQCHSET) != 0)
5461 {
5462 req_byte = re->req_byte & 255;
5463 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5464 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5465 }
5466
5467
5468 /* ==========================================================================*/
5469
5470 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5471 the loop runs just once. */
5472
5473 for(;;)
5474 {
5475 USPTR save_end_subject = end_subject;
5476 USPTR new_start_match;
5477
5478 /* Reset the maximum number of extractions we might see. */
5479
5480 if (md->offset_vector != NULL)
5481 {
5482 register int *iptr = md->offset_vector;
5483 register int *iend = iptr + resetcount;
5484 while (iptr < iend) *iptr++ = -1;
5485 }
5486
5487 /* If firstline is TRUE, the start of the match is constrained to the first
5488 line of a multiline string. That is, the match must be before or at the first
5489 newline. Implement this by temporarily adjusting end_subject so that we stop
5490 scanning at a newline. If the match fails at the newline, later code breaks
5491 this loop. */
5492
5493 if (firstline)
5494 {
5495 USPTR t = start_match;
5496 #ifdef SUPPORT_UTF8
5497 if (utf8)
5498 {
5499 while (t < md->end_subject && !IS_NEWLINE(t))
5500 {
5501 t++;
5502 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5503 }
5504 }
5505 else
5506 #endif
5507 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5508 end_subject = t;
5509 }
5510
5511 /* There are some optimizations that avoid running the match if a known
5512 starting point is not found, or if a known later character is not present.
5513 However, there is an option that disables these, for testing and for ensuring
5514 that all callouts do actually occur. */
5515
5516 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5517 {
5518 /* Advance to a unique first byte if there is one. */
5519
5520 if (first_byte >= 0)
5521 {
5522 if (first_byte_caseless)
5523 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5524 start_match++;
5525 else
5526 while (start_match < end_subject && *start_match != first_byte)
5527 start_match++;
5528 }
5529
5530 /* Or to just after a linebreak for a multiline match */
5531
5532 else if (startline)
5533 {
5534 if (start_match > md->start_subject + start_offset)
5535 {
5536 #ifdef SUPPORT_UTF8
5537 if (utf8)
5538 {
5539 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5540 {
5541 start_match++;
5542 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5543 start_match++;
5544 }
5545 }
5546 else
5547 #endif
5548 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5549 start_match++;
5550
5551 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5552 and we are now at a LF, advance the match position by one more character.
5553 */
5554
5555 if (start_match[-1] == CHAR_CR &&
5556 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5557 start_match < end_subject &&
5558 *start_match == CHAR_NL)
5559 start_match++;
5560 }
5561 }
5562
5563 /* Or to a non-unique first byte after study */
5564
5565 else if (start_bits != NULL)
5566 {
5567 while (start_match < end_subject)
5568 {
5569 register unsigned int c = *start_match;
5570 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5571 else break;
5572 }
5573 }
5574 } /* Starting optimizations */
5575
5576 /* Restore fudged end_subject */
5577
5578 end_subject = save_end_subject;
5579
5580 /* The following two optimizations are disabled for partial matching or if
5581 disabling is explicitly requested. */
5582
5583 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
5584 {
5585 /* If the pattern was studied, a minimum subject length may be set. This is
5586 a lower bound; no actual string of that length may actually match the
5587 pattern. Although the value is, strictly, in characters, we treat it as
5588 bytes to avoid spending too much time in this optimization. */
5589
5590 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
5591 (pcre_uint32)(end_subject - start_match) < study->minlength)
5592 {
5593 rc = MATCH_NOMATCH;
5594 break;
5595 }
5596
5597 /* If req_byte is set, we know that that character must appear in the
5598 subject for the match to succeed. If the first character is set, req_byte
5599 must be later in the subject; otherwise the test starts at the match point.
5600 This optimization can save a huge amount of backtracking in patterns with
5601 nested unlimited repeats that aren't going to match. Writing separate code
5602 for cased/caseless versions makes it go faster, as does using an
5603 autoincrement and backing off on a match.
5604
5605 HOWEVER: when the subject string is very, very long, searching to its end
5606 can take a long time, and give bad performance on quite ordinary patterns.
5607 This showed up when somebody was matching something like /^\d+C/ on a
5608 32-megabyte string... so we don't do this when the string is sufficiently
5609 long. */
5610
5611 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
5612 {
5613 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
5614
5615 /* We don't need to repeat the search if we haven't yet reached the
5616 place we found it at last time. */
5617
5618 if (p > req_byte_ptr)
5619 {
5620 if (req_byte_caseless)
5621 {
5622 while (p < end_subject)
5623 {
5624 register int pp = *p++;
5625 if (pp == req_byte || pp == req_byte2) { p--; break; }
5626 }
5627 }
5628 else
5629 {
5630 while (p < end_subject)
5631 {
5632 if (*p++ == req_byte) { p--; break; }
5633 }
5634 }
5635
5636 /* If we can't find the required character, break the matching loop,
5637 forcing a match failure. */
5638
5639 if (p >= end_subject)
5640 {
5641 rc = MATCH_NOMATCH;
5642 break;
5643 }
5644
5645 /* If we have found the required character, save the point where we
5646 found it, so that we don't search again next time round the loop if
5647 the start hasn't passed this character yet. */
5648
5649 req_byte_ptr = p;
5650 }
5651 }
5652 }
5653
5654 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
5655 printf(">>>> Match against: ");
5656 pchars(start_match, end_subject - start_match, TRUE, md);
5657 printf("\n");
5658 #endif
5659
5660 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
5661 first starting point for which a partial match was found. */
5662
5663 md->start_match_ptr = start_match;
5664 md->start_used_ptr = start_match;
5665 md->match_call_count = 0;
5666 rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL,
5667 0, 0);
5668 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
5669
5670 switch(rc)
5671 {
5672 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
5673 exactly like PRUNE. */
5674
5675 case MATCH_NOMATCH:
5676 case MATCH_PRUNE:
5677 case MATCH_THEN:
5678 new_start_match = start_match + 1;
5679 #ifdef SUPPORT_UTF8
5680 if (utf8)
5681 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
5682 new_start_match++;
5683 #endif
5684 break;
5685
5686 /* SKIP passes back the next starting point explicitly. */
5687
5688 case MATCH_SKIP:
5689 new_start_match = md->start_match_ptr;
5690 break;
5691
5692 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
5693
5694 case MATCH_COMMIT:
5695 rc = MATCH_NOMATCH;
5696 goto ENDLOOP;
5697
5698 /* Any other return is either a match, or some kind of error. */
5699
5700 default:
5701 goto ENDLOOP;
5702 }
5703
5704 /* Control reaches here for the various types of "no match at this point"
5705 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
5706
5707 rc = MATCH_NOMATCH;
5708
5709 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
5710 newline in the subject (though it may continue over the newline). Therefore,
5711 if we have just failed to match, starting at a newline, do not continue. */
5712
5713 if (firstline && IS_NEWLINE(start_match)) break;
5714
5715 /* Advance to new matching position */
5716
5717 start_match = new_start_match;
5718
5719 /* Break the loop if the pattern is anchored or if we have passed the end of
5720 the subject. */
5721
5722 if (anchored || start_match > end_subject) break;
5723
5724 /* If we have just passed a CR and we are now at a LF, and the pattern does
5725 not contain any explicit matches for \r or \n, and the newline option is CRLF
5726 or ANY or ANYCRLF, advance the match position by one more character. */
5727
5728 if (start_match[-1] == CHAR_CR &&
5729 start_match < end_subject &&
5730 *start_match == CHAR_NL &&
5731 (re->flags & PCRE_HASCRORLF) == 0 &&
5732 (md->nltype == NLTYPE_ANY ||
5733 md->nltype == NLTYPE_ANYCRLF ||
5734 md->nllen == 2))
5735 start_match++;
5736
5737 } /* End of for(;;) "bumpalong" loop */
5738
5739 /* ==========================================================================*/
5740
5741 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
5742 conditions is true:
5743
5744 (1) The pattern is anchored or the match was failed by (*COMMIT);
5745
5746 (2) We are past the end of the subject;
5747
5748 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
5749 this option requests that a match occur at or before the first newline in
5750 the subject.
5751
5752 When we have a match and the offset vector is big enough to deal with any
5753 backreferences, captured substring offsets will already be set up. In the case
5754 where we had to get some local store to hold offsets for backreference
5755 processing, copy those that we can. In this case there need not be overflow if
5756 certain parts of the pattern were not used, even though there are more
5757 capturing parentheses than vector slots. */
5758
5759 ENDLOOP:
5760
5761 if (rc == MATCH_MATCH)
5762 {
5763 if (using_temporary_offsets)
5764 {
5765 if (offsetcount >= 4)
5766 {
5767 memcpy(offsets + 2, md->offset_vector + 2,
5768 (offsetcount - 2) * sizeof(int));
5769 DPRINTF(("Copied offsets from temporary memory\n"));
5770 }
5771 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5772 DPRINTF(("Freeing temporary memory\n"));
5773 (pcre_free)(md->offset_vector);
5774 }
5775
5776 /* Set the return code to the number of captured strings, or 0 if there are
5777 too many to fit into the vector. */
5778
5779 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5780
5781 /* If there is space, set up the whole thing as substring 0. The value of
5782 md->start_match_ptr might be modified if \K was encountered on the success
5783 matching path. */
5784
5785 if (offsetcount < 2) rc = 0; else
5786 {
5787 offsets[0] = md->start_match_ptr - md->start_subject;
5788 offsets[1] = md->end_match_ptr - md->start_subject;
5789 }
5790
5791 DPRINTF((">>>> returning %d\n", rc));
5792 return rc;
5793 }
5794
5795 /* Control gets here if there has been an error, or if the overall match
5796 attempt has failed at all permitted starting positions. */
5797
5798 if (using_temporary_offsets)
5799 {
5800 DPRINTF(("Freeing temporary memory\n"));
5801 (pcre_free)(md->offset_vector);
5802 }
5803
5804 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
5805 {
5806 DPRINTF((">>>> error: returning %d\n", rc));
5807 return rc;
5808 }
5809 else if (start_partial != NULL)
5810 {
5811 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5812 if (offsetcount > 1)
5813 {
5814 offsets[0] = start_partial - (USPTR)subject;
5815 offsets[1] = end_subject - (USPTR)subject;
5816 }
5817 return PCRE_ERROR_PARTIAL;
5818 }
5819 else
5820 {
5821 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5822 return PCRE_ERROR_NOMATCH;
5823 }
5824 }
5825
5826 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12