/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 446 - (show annotations) (download)
Tue Sep 15 10:49:50 2009 UTC (4 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 161775 byte(s)
Correct returned capture count after recursion has matched more than outer.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73
74 #define MATCH_COMMIT (-999)
75 #define MATCH_PRUNE (-998)
76 #define MATCH_SKIP (-997)
77 #define MATCH_THEN (-996)
78
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. */
82
83 #define REC_STACK_SAVE_MAX 30
84
/* Lower and upper repeat bounds for the common repeats. The two tables are
indexed in parallel. In rep_max, an entry of 0 means "no upper limit". */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
89
90
91
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Write a run of bytes to stdout in a readable form. A byte for which
isprint() is true is written as itself; any other byte is written as a \xhh
escape. When the bytes belong to the subject, the run is clipped so that it
stops at md->end_subject.

Arguments:
  p           points to the first byte to print
  length      number of bytes requested
  is_subject  TRUE if printing from within the subject string
  md          pointer to matching data block; read only if is_subject is TRUE

Returns:      nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
int remaining = length;

/* Clip to the end of the subject when asked to. */

if (is_subject && remaining > md->end_subject - p)
  remaining = md->end_subject - p;

for (; remaining > 0; remaining--)
  {
  unsigned int ch = *p++;
  if (isprint(ch))
    printf("%c", ch);
  else
    printf("\\x%02x", ch);
  }
}
#endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 {
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 }
179 }
180 else
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. */
186
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
240 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241 below must be updated in sync. */
242
243 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
244 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
245 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
246 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
247 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
248 RM51, RM52, RM53, RM54 };
249
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
253
254 #ifndef NO_RECURSE
255 #define REGISTER register
256
#ifndef DEBUG

/* Production versions: RMATCH is a direct recursive call to match() one level
deeper, leaving the result in rrc; RRETURN is a plain return. The last RMATCH
argument (the call number) is not used here. */

#define RMATCH(subj,code,top,mdata,opts,epb,fl,id) \
  rrc = match(subj,code,mstart,top,mdata,opts,epb,fl,rdepth+1)
#define RRETURN(val) return val

#else

/* Debugging versions: the same call and return, bracketed by trace output
that reports the source line involved. */

#define RMATCH(subj,code,top,mdata,opts,epb,fl,id) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(subj,code,mstart,top,mdata,opts,epb,fl,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(val) \
  { \
  printf("match() returned %d from line %d ", val, __LINE__); \
  return val; \
  }

#endif
274
275 #else
276
277
278 /* These versions of the macros manage a private stack on the heap. Note that
279 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280 argument of match(), which never changes. */
281
282 #define REGISTER
283
/* RMATCH for the heap-frame build. It simulates a recursive call to match():

  . a new frame is obtained from pcre_stack_malloc();
  . the call number rw is stored in the current frame so that the code at
    HEAP_RETURN knows which L_<rw> label to jump back to;
  . the "arguments" are copied into the new frame, which is linked to the
    current one and then becomes the current frame;
  . control jumps to HEAP_RECURSE to run the function body on the new frame.

If the frame cannot be allocated, the current frame is unwound with
PCRE_ERROR_NOMEMORY instead of dereferencing a NULL pointer. The rd argument
(the md pointer) is accepted for parity with the stack-based RMATCH and is not
used. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
303
/* RRETURN for the heap-frame build. The current frame is unlinked and handed
back to pcre_stack_free(). If that was the top-level frame (its Xprevframe was
NULL), this is a real return from match(). Otherwise the result is placed in
rrc and control goes to HEAP_RETURN to resume the previous frame. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
316
317
318 /* Structure for remembering the local variables in a private frame */
319
320 typedef struct heapframe {
321 struct heapframe *Xprevframe;
322
323 /* Function arguments that may change */
324
325 USPTR Xeptr;
326 const uschar *Xecode;
327 USPTR Xmstart;
328 int Xoffset_top;
329 long int Xims;
330 eptrblock *Xeptrb;
331 int Xflags;
332 unsigned int Xrdepth;
333
334 /* Function local variables */
335
336 USPTR Xcallpat;
337 #ifdef SUPPORT_UTF8
338 USPTR Xcharptr;
339 #endif
340 USPTR Xdata;
341 USPTR Xnext;
342 USPTR Xpp;
343 USPTR Xprev;
344 USPTR Xsaved_eptr;
345
346 recursion_info Xnew_recursive;
347
348 BOOL Xcur_is_word;
349 BOOL Xcondition;
350 BOOL Xprev_is_word;
351
352 unsigned long int Xoriginal_ims;
353
354 #ifdef SUPPORT_UCP
355 int Xprop_type;
356 int Xprop_value;
357 int Xprop_fail_result;
358 int Xprop_category;
359 int Xprop_chartype;
360 int Xprop_script;
361 int Xoclength;
362 uschar Xocchars[8];
363 #endif
364
365 int Xcodelink;
366 int Xctype;
367 unsigned int Xfc;
368 int Xfi;
369 int Xlength;
370 int Xmax;
371 int Xmin;
372 int Xnumber;
373 int Xoffset;
374 int Xop;
375 int Xsave_capture_last;
376 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
377 int Xstacksave[REC_STACK_SAVE_MAX];
378
379 eptrblock Xnewptrb;
380
381 /* Where to jump back to */
382
383 int Xwhere;
384
385 } heapframe;
386
387 #endif
388
389
390 /***************************************************************************
391 ***************************************************************************/
392
393
394
395 /*************************************************
396 * Match from current position *
397 *************************************************/
398
399 /* This function is called recursively in many circumstances. Whenever it
400 returns a negative (error) response, the outer incarnation must also return the
401 same response. */
402
403 /* These macros pack up tests that are used for partial matching, and which
404 appear several times in the code. We set the "hit end" flag if the pointer is
405 at the end of the subject and also past the start of the subject (i.e.
406 something has been matched). For hard partial matching, we then return
407 immediately. The second one is used when we already know we are past the end of
408 the subject. */
409
410 #define CHECK_PARTIAL()\
411 if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
412 {\
413 md->hitend = TRUE;\
414 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
415 }
416
417 #define SCHECK_PARTIAL()\
418 if (md->partial && eptr > mstart)\
419 {\
420 md->hitend = TRUE;\
421 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
422 }
423
424
425 /* Performance note: It might be tempting to extract commonly used fields from
426 the md structure (e.g. utf8, end_subject) into individual variables to improve
427 performance. Tests using gcc on a SPARC disproved this; in the first case, it
428 made performance worse.
429
430 Arguments:
431 eptr pointer to current character in subject
432 ecode pointer to current position in compiled code
433 mstart pointer to the current match start position (can be modified
434 by encountering \K)
435 offset_top current top pointer
436 md pointer to "static" info for the match
437 ims current /i, /m, and /s options
438 eptrb pointer to chain of blocks containing eptr at start of
439 brackets - for testing for empty matches
440 flags can contain
441 match_condassert - this is an assertion condition
442 match_cbegroup - this is the start of an unlimited repeat
443 group that can match an empty string
444 rdepth the recursion depth
445
446 Returns: MATCH_MATCH if matched ) these values are >= 0
447 MATCH_NOMATCH if failed to match )
448 a negative PCRE_ERROR_xxx value if aborted by an error condition
449 (e.g. stopped by repeated call or recursion limit)
450 */
451
452 static int
453 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
454 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
455 int flags, unsigned int rdepth)
456 {
457 /* These variables do not need to be preserved over recursion in this function,
458 so they can be ordinary variables in all cases. Mark some of them with
459 "register" because they are used a lot in loops. */
460
461 register int rrc; /* Returns from recursive calls */
462 register int i; /* Used for loops not involving calls to RMATCH() */
463 register unsigned int c; /* Character values not kept over RMATCH() calls */
464 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
465
466 BOOL minimize, possessive; /* Quantifier options */
467 int condcode;
468
469 /* When recursion is not being used, all "local" variables that have to be
470 preserved over calls to RMATCH() are part of a "frame" which is obtained from
471 heap storage. Set up the top-level frame here; others are obtained from the
472 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
473
474 #ifdef NO_RECURSE
475 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
476 frame->Xprevframe = NULL; /* Marks the top level */
477
478 /* Copy in the original argument variables */
479
480 frame->Xeptr = eptr;
481 frame->Xecode = ecode;
482 frame->Xmstart = mstart;
483 frame->Xoffset_top = offset_top;
484 frame->Xims = ims;
485 frame->Xeptrb = eptrb;
486 frame->Xflags = flags;
487 frame->Xrdepth = rdepth;
488
489 /* This is where control jumps back to to effect "recursion" */
490
491 HEAP_RECURSE:
492
493 /* Macros make the argument variables come from the current frame */
494
495 #define eptr frame->Xeptr
496 #define ecode frame->Xecode
497 #define mstart frame->Xmstart
498 #define offset_top frame->Xoffset_top
499 #define ims frame->Xims
500 #define eptrb frame->Xeptrb
501 #define flags frame->Xflags
502 #define rdepth frame->Xrdepth
503
504 /* Ditto for the local variables */
505
506 #ifdef SUPPORT_UTF8
507 #define charptr frame->Xcharptr
508 #endif
509 #define callpat frame->Xcallpat
510 #define codelink frame->Xcodelink
511 #define data frame->Xdata
512 #define next frame->Xnext
513 #define pp frame->Xpp
514 #define prev frame->Xprev
515 #define saved_eptr frame->Xsaved_eptr
516
517 #define new_recursive frame->Xnew_recursive
518
519 #define cur_is_word frame->Xcur_is_word
520 #define condition frame->Xcondition
521 #define prev_is_word frame->Xprev_is_word
522
523 #define original_ims frame->Xoriginal_ims
524
525 #ifdef SUPPORT_UCP
526 #define prop_type frame->Xprop_type
527 #define prop_value frame->Xprop_value
528 #define prop_fail_result frame->Xprop_fail_result
529 #define prop_category frame->Xprop_category
530 #define prop_chartype frame->Xprop_chartype
531 #define prop_script frame->Xprop_script
532 #define oclength frame->Xoclength
533 #define occhars frame->Xocchars
534 #endif
535
536 #define ctype frame->Xctype
537 #define fc frame->Xfc
538 #define fi frame->Xfi
539 #define length frame->Xlength
540 #define max frame->Xmax
541 #define min frame->Xmin
542 #define number frame->Xnumber
543 #define offset frame->Xoffset
544 #define op frame->Xop
545 #define save_capture_last frame->Xsave_capture_last
546 #define save_offset1 frame->Xsave_offset1
547 #define save_offset2 frame->Xsave_offset2
548 #define save_offset3 frame->Xsave_offset3
549 #define stacksave frame->Xstacksave
550
551 #define newptrb frame->Xnewptrb
552
553 /* When recursion is being used, local variables are allocated on the stack and
554 get preserved during recursion in the normal way. In this environment, fi and
555 i, and fc and c, can be the same variables. */
556
557 #else /* NO_RECURSE not defined */
558 #define fi i
559 #define fc c
560
561
562 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
563 const uschar *charptr; /* in small blocks of the code. My normal */
564 #endif /* style of coding would have declared */
565 const uschar *callpat; /* them within each of those blocks. */
566 const uschar *data; /* However, in order to accommodate the */
567 const uschar *next; /* version of this code that uses an */
568 USPTR pp; /* external "stack" implemented on the */
569 const uschar *prev; /* heap, it is easier to declare them all */
570 USPTR saved_eptr; /* here, so the declarations can be cut */
571 /* out in a block. The only declarations */
572 recursion_info new_recursive; /* within blocks below are for variables */
573 /* that do not have to be preserved over */
574 BOOL cur_is_word; /* a recursive call to RMATCH(). */
575 BOOL condition;
576 BOOL prev_is_word;
577
578 unsigned long int original_ims;
579
580 #ifdef SUPPORT_UCP
581 int prop_type;
582 int prop_value;
583 int prop_fail_result;
584 int prop_category;
585 int prop_chartype;
586 int prop_script;
587 int oclength;
588 uschar occhars[8];
589 #endif
590
591 int codelink;
592 int ctype;
593 int length;
594 int max;
595 int min;
596 int number;
597 int offset;
598 int op;
599 int save_capture_last;
600 int save_offset1, save_offset2, save_offset3;
601 int stacksave[REC_STACK_SAVE_MAX];
602
603 eptrblock newptrb;
604 #endif /* NO_RECURSE */
605
606 /* These statements are here to stop the compiler complaining about
607 uninitialized variables. */
608
609 #ifdef SUPPORT_UCP
610 prop_value = 0;
611 prop_fail_result = 0;
612 #endif
613
614
615 /* This label is used for tail recursion, which is used in a few cases even
616 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
617 used. Thanks to Ian Taylor for noticing this possibility and sending the
618 original patch. */
619
620 TAIL_RECURSE:
621
622 /* OK, now we can get on with the real code of the function. Recursive calls
623 are specified by the macro RMATCH and RRETURN is used to return. When
624 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
625 and a "return", respectively (possibly with some debugging if DEBUG is
626 defined). However, RMATCH isn't like a function call because it's quite a
627 complicated macro. It has to be used in one particular way. This shouldn't,
628 however, impact performance when true recursion is being used. */
629
630 #ifdef SUPPORT_UTF8
631 utf8 = md->utf8; /* Local copy of the flag */
632 #else
633 utf8 = FALSE;
634 #endif
635
636 /* First check that we haven't called match() too many times, or that we
637 haven't exceeded the recursive call limit. */
638
639 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
640 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
641
642 original_ims = ims; /* Save for resetting on ')' */
643
644 /* At the start of a group with an unlimited repeat that may match an empty
645 string, the match_cbegroup flag is set. When this is the case, add the current
646 subject pointer to the chain of such remembered pointers, to be checked when we
647 hit the closing ket, in order to break infinite loops that match no characters.
648 When match() is called in other circumstances, don't add to the chain. The
649 match_cbegroup flag must NOT be used with tail recursion, because the memory
650 block that is used is on the stack, so a new one may be required for each
651 match(). */
652
653 if ((flags & match_cbegroup) != 0)
654 {
655 newptrb.epb_saved_eptr = eptr;
656 newptrb.epb_prev = eptrb;
657 eptrb = &newptrb;
658 }
659
660 /* Now start processing the opcodes. */
661
662 for (;;)
663 {
664 minimize = possessive = FALSE;
665 op = *ecode;
666
667 switch(op)
668 {
669 case OP_FAIL:
670 RRETURN(MATCH_NOMATCH);
671
672 case OP_PRUNE:
673 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
674 ims, eptrb, flags, RM51);
675 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
676 RRETURN(MATCH_PRUNE);
677
678 case OP_COMMIT:
679 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
680 ims, eptrb, flags, RM52);
681 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
682 RRETURN(MATCH_COMMIT);
683
684 case OP_SKIP:
685 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
686 ims, eptrb, flags, RM53);
687 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
688 md->start_match_ptr = eptr; /* Pass back current position */
689 RRETURN(MATCH_SKIP);
690
691 case OP_THEN:
692 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
693 ims, eptrb, flags, RM54);
694 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
695 RRETURN(MATCH_THEN);
696
697 /* Handle a capturing bracket. If there is space in the offset vector, save
698 the current subject position in the working slot at the top of the vector.
699 We mustn't change the current values of the data slot, because they may be
700 set from a previous iteration of this group, and be referred to by a
701 reference inside the group.
702
703 If the bracket fails to match, we need to restore this value and also the
704 values of the final offsets, in case they were set by a previous iteration
705 of the same bracket.
706
707 If there isn't enough space in the offset vector, treat this as if it were
708 a non-capturing bracket. Don't worry about setting the flag for the error
709 case here; that is handled in the code for KET. */
710
711 case OP_CBRA:
712 case OP_SCBRA:
713 number = GET2(ecode, 1+LINK_SIZE);
714 offset = number << 1;
715
716 #ifdef DEBUG
717 printf("start bracket %d\n", number);
718 printf("subject=");
719 pchars(eptr, 16, TRUE, md);
720 printf("\n");
721 #endif
722
723 if (offset < md->offset_max)
724 {
725 save_offset1 = md->offset_vector[offset];
726 save_offset2 = md->offset_vector[offset+1];
727 save_offset3 = md->offset_vector[md->offset_end - number];
728 save_capture_last = md->capture_last;
729
730 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
731 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
732
733 flags = (op == OP_SCBRA)? match_cbegroup : 0;
734 do
735 {
736 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
737 ims, eptrb, flags, RM1);
738 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
739 md->capture_last = save_capture_last;
740 ecode += GET(ecode, 1);
741 }
742 while (*ecode == OP_ALT);
743
744 DPRINTF(("bracket %d failed\n", number));
745
746 md->offset_vector[offset] = save_offset1;
747 md->offset_vector[offset+1] = save_offset2;
748 md->offset_vector[md->offset_end - number] = save_offset3;
749
750 RRETURN(MATCH_NOMATCH);
751 }
752
753 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
754 as a non-capturing bracket. */
755
756 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
757 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
758
759 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
760
761 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
762 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
763
764 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
765 final alternative within the brackets, we would return the result of a
766 recursive call to match() whatever happened. We can reduce stack usage by
767 turning this into a tail recursion, except in the case when match_cbegroup
768 is set.*/
769
770 case OP_BRA:
771 case OP_SBRA:
772 DPRINTF(("start non-capturing bracket\n"));
773 flags = (op >= OP_SBRA)? match_cbegroup : 0;
774 for (;;)
775 {
776 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
777 {
778 if (flags == 0) /* Not a possibly empty group */
779 {
780 ecode += _pcre_OP_lengths[*ecode];
781 DPRINTF(("bracket 0 tail recursion\n"));
782 goto TAIL_RECURSE;
783 }
784
785 /* Possibly empty group; can't use tail recursion. */
786
787 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
788 eptrb, flags, RM48);
789 RRETURN(rrc);
790 }
791
792 /* For non-final alternatives, continue the loop for a NOMATCH result;
793 otherwise return. */
794
795 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
796 eptrb, flags, RM2);
797 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
798 ecode += GET(ecode, 1);
799 }
800 /* Control never reaches here. */
801
802 /* Conditional group: compilation checked that there are no more than
803 two branches. If the condition is false, skipping the first branch takes us
804 past the end if there is only one branch, but that's OK because that is
805 exactly what going to the ket would do. As there is only one branch to be
806 obeyed, we can use tail recursion to avoid using another stack frame. */
807
808 case OP_COND:
809 case OP_SCOND:
810 codelink= GET(ecode, 1);
811
812 /* Because of the way auto-callout works during compile, a callout item is
813 inserted between OP_COND and an assertion condition. */
814
815 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
816 {
817 if (pcre_callout != NULL)
818 {
819 pcre_callout_block cb;
820 cb.version = 1; /* Version 1 of the callout block */
821 cb.callout_number = ecode[LINK_SIZE+2];
822 cb.offset_vector = md->offset_vector;
823 cb.subject = (PCRE_SPTR)md->start_subject;
824 cb.subject_length = md->end_subject - md->start_subject;
825 cb.start_match = mstart - md->start_subject;
826 cb.current_position = eptr - md->start_subject;
827 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
828 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
829 cb.capture_top = offset_top/2;
830 cb.capture_last = md->capture_last;
831 cb.callout_data = md->callout_data;
832 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
833 if (rrc < 0) RRETURN(rrc);
834 }
835 ecode += _pcre_OP_lengths[OP_CALLOUT];
836 }
837
838 condcode = ecode[LINK_SIZE+1];
839
840 /* Now see what the actual condition is */
841
842 if (condcode == OP_RREF) /* Recursion test */
843 {
844 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
845 condition = md->recursive != NULL &&
846 (offset == RREF_ANY || offset == md->recursive->group_num);
847 ecode += condition? 3 : GET(ecode, 1);
848 }
849
850 else if (condcode == OP_CREF) /* Group used test */
851 {
852 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
853 condition = offset < offset_top && md->offset_vector[offset] >= 0;
854 ecode += condition? 3 : GET(ecode, 1);
855 }
856
857 else if (condcode == OP_DEF) /* DEFINE - always false */
858 {
859 condition = FALSE;
860 ecode += GET(ecode, 1);
861 }
862
863 /* The condition is an assertion. Call match() to evaluate it - setting
864 the final argument match_condassert causes it to stop at the end of an
865 assertion. */
866
867 else
868 {
869 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
870 match_condassert, RM3);
871 if (rrc == MATCH_MATCH)
872 {
873 condition = TRUE;
874 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
875 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
876 }
877 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
878 {
879 RRETURN(rrc); /* Need braces because of following else */
880 }
881 else
882 {
883 condition = FALSE;
884 ecode += codelink;
885 }
886 }
887
888 /* We are now at the branch that is to be obeyed. As there is only one,
889 we can use tail recursion to avoid using another stack frame, except when
890 match_cbegroup is required for an unlimited repeat of a possibly empty
891 group. If the second alternative doesn't exist, we can just plough on. */
892
893 if (condition || *ecode == OP_ALT)
894 {
895 ecode += 1 + LINK_SIZE;
896 if (op == OP_SCOND) /* Possibly empty group */
897 {
898 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
899 RRETURN(rrc);
900 }
901 else /* Group must match something */
902 {
903 flags = 0;
904 goto TAIL_RECURSE;
905 }
906 }
907 else /* Condition false & no alternative */
908 {
909 ecode += 1 + LINK_SIZE;
910 }
911 break;
912
913
914 /* End of the pattern, either real or forced. If we are in a top-level
915 recursion, we should restore the offsets appropriately and continue from
916 after the call. */
917
918 case OP_ACCEPT:
919 case OP_END:
920 if (md->recursive != NULL && md->recursive->group_num == 0)
921 {
922 recursion_info *rec = md->recursive;
923 DPRINTF(("End of pattern in a (?0) recursion\n"));
924 md->recursive = rec->prevrec;
925 memmove(md->offset_vector, rec->offset_save,
926 rec->saved_max * sizeof(int));
927 offset_top = rec->offset_top;
928 mstart = rec->save_start;
929 ims = original_ims;
930 ecode = rec->after_call;
931 break;
932 }
933
934 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
935 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
936 the subject. In both cases, backtracking will then try other alternatives,
937 if any. */
938
939 if (eptr == mstart &&
940 (md->notempty ||
941 (md->notempty_atstart &&
942 mstart == md->start_subject + md->start_offset)))
943 RRETURN(MATCH_NOMATCH);
944
945 /* Otherwise, we have a match. */
946
947 md->end_match_ptr = eptr; /* Record where we ended */
948 md->end_offset_top = offset_top; /* and how many extracts were taken */
949 md->start_match_ptr = mstart; /* and the start (\K can modify) */
950 RRETURN(MATCH_MATCH);
951
952 /* Change option settings */
953
954 case OP_OPT:
955 ims = ecode[1];
956 ecode += 2;
957 DPRINTF(("ims set to %02lx\n", ims));
958 break;
959
960 /* Assertion brackets. Check the alternative branches in turn - the
961 matching won't pass the KET for an assertion. If any one branch matches,
962 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
963 start of each branch to move the current point backwards, so the code at
964 this level is identical to the lookahead case. */
965
966 case OP_ASSERT:
967 case OP_ASSERTBACK:
968 do
969 {
970 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
971 RM4);
972 if (rrc == MATCH_MATCH) break;
973 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
974 ecode += GET(ecode, 1);
975 }
976 while (*ecode == OP_ALT);
977 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
978
979 /* If checking an assertion for a condition, return MATCH_MATCH. */
980
981 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
982
983 /* Continue from after the assertion, updating the offsets high water
984 mark, since extracts may have been taken during the assertion. */
985
986 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
987 ecode += 1 + LINK_SIZE;
988 offset_top = md->end_offset_top;
989 continue;
990
991 /* Negative assertion: all branches must fail to match */
992
993 case OP_ASSERT_NOT:
994 case OP_ASSERTBACK_NOT:
995 do
996 {
997 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
998 RM5);
999 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
1000 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1001 ecode += GET(ecode,1);
1002 }
1003 while (*ecode == OP_ALT);
1004
1005 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1006
1007 ecode += 1 + LINK_SIZE;
1008 continue;
1009
1010 /* Move the subject pointer back. This occurs only at the start of
1011 each branch of a lookbehind assertion. If we are too close to the start to
1012 move back, this match function fails. When working with UTF-8 we move
1013 back a number of characters, not bytes. */
1014
1015 case OP_REVERSE:
1016 #ifdef SUPPORT_UTF8
1017 if (utf8)
1018 {
1019 i = GET(ecode, 1);
1020 while (i-- > 0)
1021 {
1022 eptr--;
1023 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1024 BACKCHAR(eptr);
1025 }
1026 }
1027 else
1028 #endif
1029
1030 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1031
1032 {
1033 eptr -= GET(ecode, 1);
1034 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1035 }
1036
1037 /* Save the earliest consulted character, then skip to next op code */
1038
1039 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1040 ecode += 1 + LINK_SIZE;
1041 break;
1042
1043 /* The callout item calls an external function, if one is provided, passing
1044 details of the match so far. This is mainly for debugging, though the
1045 function is able to force a failure. */
1046
1047 case OP_CALLOUT:
1048 if (pcre_callout != NULL)
1049 {
1050 pcre_callout_block cb;
1051 cb.version = 1; /* Version 1 of the callout block */
1052 cb.callout_number = ecode[1];
1053 cb.offset_vector = md->offset_vector;
1054 cb.subject = (PCRE_SPTR)md->start_subject;
1055 cb.subject_length = md->end_subject - md->start_subject;
1056 cb.start_match = mstart - md->start_subject;
1057 cb.current_position = eptr - md->start_subject;
1058 cb.pattern_position = GET(ecode, 2);
1059 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1060 cb.capture_top = offset_top/2;
1061 cb.capture_last = md->capture_last;
1062 cb.callout_data = md->callout_data;
1063 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1064 if (rrc < 0) RRETURN(rrc);
1065 }
1066 ecode += 2 + 2*LINK_SIZE;
1067 break;
1068
1069 /* Recursion either matches the current regex, or some subexpression. The
1070 offset data is the offset to the starting bracket from the start of the
1071 whole pattern. (This is so that it works from duplicated subpatterns.)
1072
1073 If there are any capturing brackets started but not finished, we have to
1074 save their starting points and reinstate them after the recursion. However,
1075 we don't know how many such there are (offset_top records the completed
1076 total) so we just have to save all the potential data. There may be up to
1077 65535 such values, which is too large to put on the stack, but using malloc
1078 for small numbers seems expensive. As a compromise, the stack is used when
1079 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1080 is used. If the malloc fails, the recursion is not attempted: the match is
1081 abandoned by returning PCRE_ERROR_NOMEMORY, which is passed back in the
1082 same way as any other error return.
1083
1084 There are also other values that have to be saved. We use a chained
1085 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1086 for the original version of this logic. */
1087
1088 case OP_RECURSE:
1089 {
1090 callpat = md->start_code + GET(ecode, 1);
1091 new_recursive.group_num = (callpat == md->start_code)? 0 :
1092 GET2(callpat, 1 + LINK_SIZE);
1093
1094 /* Add to "recursing stack" */
1095
1096 new_recursive.prevrec = md->recursive;
1097 md->recursive = &new_recursive;
1098
1099 /* Find where to continue from afterwards */
1100
1101 ecode += 1 + LINK_SIZE;
1102 new_recursive.after_call = ecode;
1103
1104 /* Now save the offset data. */
1105
1106 new_recursive.saved_max = md->offset_end;
1107 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1108 new_recursive.offset_save = stacksave;
1109 else
1110 {
1111 new_recursive.offset_save =
1112 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1113 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1114 }
1115
1116 memcpy(new_recursive.offset_save, md->offset_vector,
1117 new_recursive.saved_max * sizeof(int));
1118 new_recursive.save_start = mstart;
1119 new_recursive.offset_top = offset_top;
1120 mstart = eptr;
1121
1122 /* OK, now we can do the recursion. For each top-level alternative we
1123 restore the offset and recursion data. */
1124
1125 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1126 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1127 do
1128 {
1129 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1130 md, ims, eptrb, flags, RM6);
1131 if (rrc == MATCH_MATCH)
1132 {
1133 DPRINTF(("Recursion matched\n"));
1134 md->recursive = new_recursive.prevrec;
1135 if (new_recursive.offset_save != stacksave)
1136 (pcre_free)(new_recursive.offset_save);
1137 RRETURN(MATCH_MATCH);
1138 }
1139 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1140 {
1141 DPRINTF(("Recursion gave error %d\n", rrc));
1142 if (new_recursive.offset_save != stacksave)
1143 (pcre_free)(new_recursive.offset_save);
1144 RRETURN(rrc);
1145 }
1146
1147 md->recursive = &new_recursive;
1148 memcpy(md->offset_vector, new_recursive.offset_save,
1149 new_recursive.saved_max * sizeof(int));
1150 callpat += GET(callpat, 1);
1151 }
1152 while (*callpat == OP_ALT);
1153
1154 DPRINTF(("Recursion didn't match\n"));
1155 md->recursive = new_recursive.prevrec;
1156 if (new_recursive.offset_save != stacksave)
1157 (pcre_free)(new_recursive.offset_save);
1158 RRETURN(MATCH_NOMATCH);
1159 }
1160 /* Control never reaches here */
1161
1162 /* "Once" brackets are like assertion brackets except that after a match,
1163 the point in the subject string is not moved back. Thus there can never be
1164 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1165 Check the alternative branches in turn - the matching won't pass the KET
1166 for this kind of subpattern. If any one branch matches, we carry on as at
1167 the end of a normal bracket, leaving the subject pointer. */
1168
1169 case OP_ONCE:
1170 prev = ecode;
1171 saved_eptr = eptr;
1172
1173 do
1174 {
1175 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1176 if (rrc == MATCH_MATCH) break;
1177 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1178 ecode += GET(ecode,1);
1179 }
1180 while (*ecode == OP_ALT);
1181
1182 /* If hit the end of the group (which could be repeated), fail */
1183
1184 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1185
1186 /* Continue as from after the assertion, updating the offsets high water
1187 mark, since extracts may have been taken. */
1188
1189 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1190
1191 offset_top = md->end_offset_top;
1192 eptr = md->end_match_ptr;
1193
1194 /* For a non-repeating ket, just continue at this level. This also
1195 happens for a repeating ket if no characters were matched in the group.
1196 This is the forcible breaking of infinite loops as implemented in Perl
1197 5.005. If there is an options reset, it will get obeyed in the normal
1198 course of events. */
1199
1200 if (*ecode == OP_KET || eptr == saved_eptr)
1201 {
1202 ecode += 1+LINK_SIZE;
1203 break;
1204 }
1205
1206 /* The repeating kets try the rest of the pattern or restart from the
1207 preceding bracket, in the appropriate order. The second "call" of match()
1208 uses tail recursion, to avoid using another stack frame. We need to reset
1209 any options that changed within the bracket before re-running it, so
1210 check the next opcode. */
1211
1212 if (ecode[1+LINK_SIZE] == OP_OPT)
1213 {
1214 ims = (ims & ~PCRE_IMS) | ecode[4];
1215 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1216 }
1217
1218 if (*ecode == OP_KETRMIN)
1219 {
1220 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1221 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1222 ecode = prev;
1223 flags = 0;
1224 goto TAIL_RECURSE;
1225 }
1226 else /* OP_KETRMAX */
1227 {
1228 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1229 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1230 ecode += 1 + LINK_SIZE;
1231 flags = 0;
1232 goto TAIL_RECURSE;
1233 }
1234 /* Control never gets here */
1235
1236 /* An alternation is the end of a branch; scan along to find the end of the
1237 bracketed group and go to there. */
1238
1239 case OP_ALT:
1240 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1241 break;
1242
1243 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1244 indicating that it may occur zero times. It may repeat infinitely, or not
1245 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1246 with fixed upper repeat limits are compiled as a number of copies, with the
1247 optional ones preceded by BRAZERO or BRAMINZERO. */
1248
1249 case OP_BRAZERO:
1250 {
1251 next = ecode+1;
1252 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1253 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1254 do next += GET(next,1); while (*next == OP_ALT);
1255 ecode = next + 1 + LINK_SIZE;
1256 }
1257 break;
1258
1259 case OP_BRAMINZERO:
1260 {
1261 next = ecode+1;
1262 do next += GET(next, 1); while (*next == OP_ALT);
1263 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1264 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1265 ecode++;
1266 }
1267 break;
1268
1269 case OP_SKIPZERO:
1270 {
1271 next = ecode+1;
1272 do next += GET(next,1); while (*next == OP_ALT);
1273 ecode = next + 1 + LINK_SIZE;
1274 }
1275 break;
1276
1277 /* End of a group, repeated or non-repeating. */
1278
1279 case OP_KET:
1280 case OP_KETRMIN:
1281 case OP_KETRMAX:
1282 prev = ecode - GET(ecode, 1);
1283
1284 /* If this was a group that remembered the subject start, in order to break
1285 infinite repeats of empty string matches, retrieve the subject start from
1286 the chain. Otherwise, set it NULL. */
1287
1288 if (*prev >= OP_SBRA)
1289 {
1290 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1291 eptrb = eptrb->epb_prev; /* Backup to previous group */
1292 }
1293 else saved_eptr = NULL;
1294
1295 /* If we are at the end of an assertion group, stop matching and return
1296 MATCH_MATCH, but record the current high water mark for use by positive
1297 assertions. Do this also for the "once" (atomic) groups. */
1298
1299 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1300 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1301 *prev == OP_ONCE)
1302 {
1303 md->end_match_ptr = eptr; /* For ONCE */
1304 md->end_offset_top = offset_top;
1305 RRETURN(MATCH_MATCH);
1306 }
1307
1308 /* For capturing groups we have to check the group number back at the start
1309 and if necessary complete handling an extraction by setting the offsets and
1310 bumping the high water mark. Note that whole-pattern recursion is coded as
1311 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1312 when the OP_END is reached. Other recursion is handled here. */
1313
1314 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1315 {
1316 number = GET2(prev, 1+LINK_SIZE);
1317 offset = number << 1;
1318
1319 #ifdef DEBUG
1320 printf("end bracket %d", number);
1321 printf("\n");
1322 #endif
1323
1324 md->capture_last = number;
1325 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1326 {
1327 md->offset_vector[offset] =
1328 md->offset_vector[md->offset_end - number];
1329 md->offset_vector[offset+1] = eptr - md->start_subject;
1330 if (offset_top <= offset) offset_top = offset + 2;
1331 }
1332
1333 /* Handle a recursively called group. Restore the offsets
1334 appropriately and continue from after the call. */
1335
1336 if (md->recursive != NULL && md->recursive->group_num == number)
1337 {
1338 recursion_info *rec = md->recursive;
1339 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1340 md->recursive = rec->prevrec;
1341 mstart = rec->save_start;
1342 memcpy(md->offset_vector, rec->offset_save,
1343 rec->saved_max * sizeof(int));
1344 offset_top = rec->offset_top;
1345 ecode = rec->after_call;
1346 ims = original_ims;
1347 break;
1348 }
1349 }
1350
1351 /* For both capturing and non-capturing groups, reset the value of the ims
1352 flags, in case they got changed during the group. */
1353
1354 ims = original_ims;
1355 DPRINTF(("ims reset to %02lx\n", ims));
1356
1357 /* For a non-repeating ket, just continue at this level. This also
1358 happens for a repeating ket if no characters were matched in the group.
1359 This is the forcible breaking of infinite loops as implemented in Perl
1360 5.005. If there is an options reset, it will get obeyed in the normal
1361 course of events. */
1362
1363 if (*ecode == OP_KET || eptr == saved_eptr)
1364 {
1365 ecode += 1 + LINK_SIZE;
1366 break;
1367 }
1368
1369 /* The repeating kets try the rest of the pattern or restart from the
1370 preceding bracket, in the appropriate order. In the second case, we can use
1371 tail recursion to avoid using another stack frame, unless we have an
1372 unlimited repeat of a group that can match an empty string. */
1373
1374 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1375
1376 if (*ecode == OP_KETRMIN)
1377 {
1378 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1379 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1380 if (flags != 0) /* Could match an empty string */
1381 {
1382 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1383 RRETURN(rrc);
1384 }
1385 ecode = prev;
1386 goto TAIL_RECURSE;
1387 }
1388 else /* OP_KETRMAX */
1389 {
1390 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1391 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1392 ecode += 1 + LINK_SIZE;
1393 flags = 0;
1394 goto TAIL_RECURSE;
1395 }
1396 /* Control never gets here */
1397
1398 /* Start of subject unless notbol, or after internal newline if multiline */
1399
1400 case OP_CIRC:
1401 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1402 if ((ims & PCRE_MULTILINE) != 0)
1403 {
1404 if (eptr != md->start_subject &&
1405 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1406 RRETURN(MATCH_NOMATCH);
1407 ecode++;
1408 break;
1409 }
1410 /* ... else fall through */
1411
1412 /* Start of subject assertion */
1413
1414 case OP_SOD:
1415 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1416 ecode++;
1417 break;
1418
1419 /* Start of match assertion */
1420
1421 case OP_SOM:
1422 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1423 ecode++;
1424 break;
1425
1426 /* Reset the start of match point */
1427
1428 case OP_SET_SOM:
1429 mstart = eptr;
1430 ecode++;
1431 break;
1432
1433 /* Assert before internal newline if multiline, or before a terminating
1434 newline unless endonly is set, else end of subject unless noteol is set. */
1435
1436 case OP_DOLL:
1437 if ((ims & PCRE_MULTILINE) != 0)
1438 {
1439 if (eptr < md->end_subject)
1440 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1441 else
1442 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1443 ecode++;
1444 break;
1445 }
1446 else
1447 {
1448 if (md->noteol) RRETURN(MATCH_NOMATCH);
1449 if (!md->endonly)
1450 {
1451 if (eptr != md->end_subject &&
1452 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1453 RRETURN(MATCH_NOMATCH);
1454 ecode++;
1455 break;
1456 }
1457 }
1458 /* ... else fall through for endonly */
1459
1460 /* End of subject assertion (\z) */
1461
1462 case OP_EOD:
1463 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1464 ecode++;
1465 break;
1466
1467 /* End of subject or ending \n assertion (\Z) */
1468
1469 case OP_EODN:
1470 if (eptr != md->end_subject &&
1471 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1472 RRETURN(MATCH_NOMATCH);
1473 ecode++;
1474 break;
1475
1476 /* Word boundary assertions */
1477
1478 case OP_NOT_WORD_BOUNDARY:
1479 case OP_WORD_BOUNDARY:
1480 {
1481
1482 /* Find out if the previous and current characters are "word" characters.
1483 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1484 be "non-word" characters. Remember the earliest consulted character for
1485 partial matching. */
1486
1487 #ifdef SUPPORT_UTF8
1488 if (utf8)
1489 {
1490 if (eptr == md->start_subject) prev_is_word = FALSE; else
1491 {
1492 USPTR lastptr = eptr - 1;
1493 while((*lastptr & 0xc0) == 0x80) lastptr--;
1494 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1495 GETCHAR(c, lastptr);
1496 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1497 }
1498 if (eptr >= md->end_subject)
1499 {
1500 SCHECK_PARTIAL();
1501 cur_is_word = FALSE;
1502 }
1503 else
1504 {
1505 GETCHAR(c, eptr);
1506 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1507 }
1508 }
1509 else
1510 #endif
1511
1512 /* Not in UTF-8 mode */
1513
1514 {
1515 if (eptr == md->start_subject) prev_is_word = FALSE; else
1516 {
1517 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1518 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1519 }
1520 if (eptr >= md->end_subject)
1521 {
1522 SCHECK_PARTIAL();
1523 cur_is_word = FALSE;
1524 }
1525 else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1526 }
1527
1528 /* Now see if the situation is what we want */
1529
1530 if ((*ecode++ == OP_WORD_BOUNDARY)?
1531 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1532 RRETURN(MATCH_NOMATCH);
1533 }
1534 break;
1535
1536 /* Match a single character type; inline for speed */
1537
1538 case OP_ANY:
1539 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1540 /* Fall through */
1541
1542 case OP_ALLANY:
1543 if (eptr++ >= md->end_subject)
1544 {
1545 SCHECK_PARTIAL();
1546 RRETURN(MATCH_NOMATCH);
1547 }
1548 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1549 ecode++;
1550 break;
1551
1552 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1553 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1554
1555 case OP_ANYBYTE:
1556 if (eptr++ >= md->end_subject)
1557 {
1558 SCHECK_PARTIAL();
1559 RRETURN(MATCH_NOMATCH);
1560 }
1561 ecode++;
1562 break;
1563
1564 case OP_NOT_DIGIT:
1565 if (eptr >= md->end_subject)
1566 {
1567 SCHECK_PARTIAL();
1568 RRETURN(MATCH_NOMATCH);
1569 }
1570 GETCHARINCTEST(c, eptr);
1571 if (
1572 #ifdef SUPPORT_UTF8
1573 c < 256 &&
1574 #endif
1575 (md->ctypes[c] & ctype_digit) != 0
1576 )
1577 RRETURN(MATCH_NOMATCH);
1578 ecode++;
1579 break;
1580
1581 case OP_DIGIT:
1582 if (eptr >= md->end_subject)
1583 {
1584 SCHECK_PARTIAL();
1585 RRETURN(MATCH_NOMATCH);
1586 }
1587 GETCHARINCTEST(c, eptr);
1588 if (
1589 #ifdef SUPPORT_UTF8
1590 c >= 256 ||
1591 #endif
1592 (md->ctypes[c] & ctype_digit) == 0
1593 )
1594 RRETURN(MATCH_NOMATCH);
1595 ecode++;
1596 break;
1597
1598 case OP_NOT_WHITESPACE:
1599 if (eptr >= md->end_subject)
1600 {
1601 SCHECK_PARTIAL();
1602 RRETURN(MATCH_NOMATCH);
1603 }
1604 GETCHARINCTEST(c, eptr);
1605 if (
1606 #ifdef SUPPORT_UTF8
1607 c < 256 &&
1608 #endif
1609 (md->ctypes[c] & ctype_space) != 0
1610 )
1611 RRETURN(MATCH_NOMATCH);
1612 ecode++;
1613 break;
1614
1615 case OP_WHITESPACE:
1616 if (eptr >= md->end_subject)
1617 {
1618 SCHECK_PARTIAL();
1619 RRETURN(MATCH_NOMATCH);
1620 }
1621 GETCHARINCTEST(c, eptr);
1622 if (
1623 #ifdef SUPPORT_UTF8
1624 c >= 256 ||
1625 #endif
1626 (md->ctypes[c] & ctype_space) == 0
1627 )
1628 RRETURN(MATCH_NOMATCH);
1629 ecode++;
1630 break;
1631
1632 case OP_NOT_WORDCHAR:
1633 if (eptr >= md->end_subject)
1634 {
1635 SCHECK_PARTIAL();
1636 RRETURN(MATCH_NOMATCH);
1637 }
1638 GETCHARINCTEST(c, eptr);
1639 if (
1640 #ifdef SUPPORT_UTF8
1641 c < 256 &&
1642 #endif
1643 (md->ctypes[c] & ctype_word) != 0
1644 )
1645 RRETURN(MATCH_NOMATCH);
1646 ecode++;
1647 break;
1648
1649 case OP_WORDCHAR:
1650 if (eptr >= md->end_subject)
1651 {
1652 SCHECK_PARTIAL();
1653 RRETURN(MATCH_NOMATCH);
1654 }
1655 GETCHARINCTEST(c, eptr);
1656 if (
1657 #ifdef SUPPORT_UTF8
1658 c >= 256 ||
1659 #endif
1660 (md->ctypes[c] & ctype_word) == 0
1661 )
1662 RRETURN(MATCH_NOMATCH);
1663 ecode++;
1664 break;
1665
1666 case OP_ANYNL:
1667 if (eptr >= md->end_subject)
1668 {
1669 SCHECK_PARTIAL();
1670 RRETURN(MATCH_NOMATCH);
1671 }
1672 GETCHARINCTEST(c, eptr);
1673 switch(c)
1674 {
1675 default: RRETURN(MATCH_NOMATCH);
1676 case 0x000d:
1677 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1678 break;
1679
1680 case 0x000a:
1681 break;
1682
1683 case 0x000b:
1684 case 0x000c:
1685 case 0x0085:
1686 case 0x2028:
1687 case 0x2029:
1688 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1689 break;
1690 }
1691 ecode++;
1692 break;
1693
1694 case OP_NOT_HSPACE:
1695 if (eptr >= md->end_subject)
1696 {
1697 SCHECK_PARTIAL();
1698 RRETURN(MATCH_NOMATCH);
1699 }
1700 GETCHARINCTEST(c, eptr);
1701 switch(c)
1702 {
1703 default: break;
1704 case 0x09: /* HT */
1705 case 0x20: /* SPACE */
1706 case 0xa0: /* NBSP */
1707 case 0x1680: /* OGHAM SPACE MARK */
1708 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1709 case 0x2000: /* EN QUAD */
1710 case 0x2001: /* EM QUAD */
1711 case 0x2002: /* EN SPACE */
1712 case 0x2003: /* EM SPACE */
1713 case 0x2004: /* THREE-PER-EM SPACE */
1714 case 0x2005: /* FOUR-PER-EM SPACE */
1715 case 0x2006: /* SIX-PER-EM SPACE */
1716 case 0x2007: /* FIGURE SPACE */
1717 case 0x2008: /* PUNCTUATION SPACE */
1718 case 0x2009: /* THIN SPACE */
1719 case 0x200A: /* HAIR SPACE */
1720 case 0x202f: /* NARROW NO-BREAK SPACE */
1721 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1722 case 0x3000: /* IDEOGRAPHIC SPACE */
1723 RRETURN(MATCH_NOMATCH);
1724 }
1725 ecode++;
1726 break;
1727
1728 case OP_HSPACE:
1729 if (eptr >= md->end_subject)
1730 {
1731 SCHECK_PARTIAL();
1732 RRETURN(MATCH_NOMATCH);
1733 }
1734 GETCHARINCTEST(c, eptr);
1735 switch(c)
1736 {
1737 default: RRETURN(MATCH_NOMATCH);
1738 case 0x09: /* HT */
1739 case 0x20: /* SPACE */
1740 case 0xa0: /* NBSP */
1741 case 0x1680: /* OGHAM SPACE MARK */
1742 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1743 case 0x2000: /* EN QUAD */
1744 case 0x2001: /* EM QUAD */
1745 case 0x2002: /* EN SPACE */
1746 case 0x2003: /* EM SPACE */
1747 case 0x2004: /* THREE-PER-EM SPACE */
1748 case 0x2005: /* FOUR-PER-EM SPACE */
1749 case 0x2006: /* SIX-PER-EM SPACE */
1750 case 0x2007: /* FIGURE SPACE */
1751 case 0x2008: /* PUNCTUATION SPACE */
1752 case 0x2009: /* THIN SPACE */
1753 case 0x200A: /* HAIR SPACE */
1754 case 0x202f: /* NARROW NO-BREAK SPACE */
1755 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1756 case 0x3000: /* IDEOGRAPHIC SPACE */
1757 break;
1758 }
1759 ecode++;
1760 break;
1761
1762 case OP_NOT_VSPACE:
1763 if (eptr >= md->end_subject)
1764 {
1765 SCHECK_PARTIAL();
1766 RRETURN(MATCH_NOMATCH);
1767 }
1768 GETCHARINCTEST(c, eptr);
1769 switch(c)
1770 {
1771 default: break;
1772 case 0x0a: /* LF */
1773 case 0x0b: /* VT */
1774 case 0x0c: /* FF */
1775 case 0x0d: /* CR */
1776 case 0x85: /* NEL */
1777 case 0x2028: /* LINE SEPARATOR */
1778 case 0x2029: /* PARAGRAPH SEPARATOR */
1779 RRETURN(MATCH_NOMATCH);
1780 }
1781 ecode++;
1782 break;
1783
1784 case OP_VSPACE:
1785 if (eptr >= md->end_subject)
1786 {
1787 SCHECK_PARTIAL();
1788 RRETURN(MATCH_NOMATCH);
1789 }
1790 GETCHARINCTEST(c, eptr);
1791 switch(c)
1792 {
1793 default: RRETURN(MATCH_NOMATCH);
1794 case 0x0a: /* LF */
1795 case 0x0b: /* VT */
1796 case 0x0c: /* FF */
1797 case 0x0d: /* CR */
1798 case 0x85: /* NEL */
1799 case 0x2028: /* LINE SEPARATOR */
1800 case 0x2029: /* PARAGRAPH SEPARATOR */
1801 break;
1802 }
1803 ecode++;
1804 break;
1805
1806 #ifdef SUPPORT_UCP
1807 /* Check the next character by Unicode property. We will get here only
1808 if the support is in the binary; otherwise a compile-time error occurs. */
1809
1810 case OP_PROP:
1811 case OP_NOTPROP:
1812 if (eptr >= md->end_subject)
1813 {
1814 SCHECK_PARTIAL();
1815 RRETURN(MATCH_NOMATCH);
1816 }
1817 GETCHARINCTEST(c, eptr);
1818 {
1819 const ucd_record *prop = GET_UCD(c);
1820
1821 switch(ecode[1])
1822 {
1823 case PT_ANY:
1824 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1825 break;
1826
1827 case PT_LAMP:
1828 if ((prop->chartype == ucp_Lu ||
1829 prop->chartype == ucp_Ll ||
1830 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1831 RRETURN(MATCH_NOMATCH);
1832 break;
1833
1834 case PT_GC:
1835 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1836 RRETURN(MATCH_NOMATCH);
1837 break;
1838
1839 case PT_PC:
1840 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1841 RRETURN(MATCH_NOMATCH);
1842 break;
1843
1844 case PT_SC:
1845 if ((ecode[2] != prop->script) == (op == OP_PROP))
1846 RRETURN(MATCH_NOMATCH);
1847 break;
1848
1849 default:
1850 RRETURN(PCRE_ERROR_INTERNAL);
1851 }
1852
1853 ecode += 3;
1854 }
1855 break;
1856
1857 /* Match an extended Unicode sequence. We will get here only if the support
1858 is in the binary; otherwise a compile-time error occurs. */
1859
1860 case OP_EXTUNI:
1861 if (eptr >= md->end_subject)
1862 {
1863 SCHECK_PARTIAL();
1864 RRETURN(MATCH_NOMATCH);
1865 }
1866 GETCHARINCTEST(c, eptr);
1867 {
1868 int category = UCD_CATEGORY(c);
1869 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1870 while (eptr < md->end_subject)
1871 {
1872 int len = 1;
1873 if (!utf8) c = *eptr; else
1874 {
1875 GETCHARLEN(c, eptr, len);
1876 }
1877 category = UCD_CATEGORY(c);
1878 if (category != ucp_M) break;
1879 eptr += len;
1880 }
1881 }
1882 ecode++;
1883 break;
1884 #endif
1885
1886
1887 /* Match a back reference, possibly repeatedly. Look past the end of the
1888 item to see if there is repeat information following. The code is similar
1889 to that for character classes, but repeated for efficiency. Then obey
1890 similar code to character type repeats - written out again for speed.
1891 However, if the referenced string is the empty string, always treat
1892 it as matched, any number of times (otherwise there could be infinite
1893 loops). */
1894
1895 case OP_REF:
1896 {
1897 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1898 ecode += 3;
1899
1900 /* If the reference is unset, there are two possibilities:
1901
1902 (a) In the default, Perl-compatible state, set the length to be longer
1903 than the amount of subject left; this ensures that every attempt at a
1904 match fails. We can't just fail here, because of the possibility of
1905 quantifiers with zero minima.
1906
1907 (b) If the JavaScript compatibility flag is set, set the length to zero
1908 so that the back reference matches an empty string.
1909
1910 Otherwise, set the length to the length of what was matched by the
1911 referenced subpattern. */
1912
1913 if (offset >= offset_top || md->offset_vector[offset] < 0)
1914 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1915 else
1916 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1917
1918 /* Set up for repetition, or handle the non-repeated case */
1919
1920 switch (*ecode)
1921 {
1922 case OP_CRSTAR:
1923 case OP_CRMINSTAR:
1924 case OP_CRPLUS:
1925 case OP_CRMINPLUS:
1926 case OP_CRQUERY:
1927 case OP_CRMINQUERY:
1928 c = *ecode++ - OP_CRSTAR;
1929 minimize = (c & 1) != 0;
1930 min = rep_min[c]; /* Pick up values from tables; */
1931 max = rep_max[c]; /* zero for max => infinity */
1932 if (max == 0) max = INT_MAX;
1933 break;
1934
1935 case OP_CRRANGE:
1936 case OP_CRMINRANGE:
1937 minimize = (*ecode == OP_CRMINRANGE);
1938 min = GET2(ecode, 1);
1939 max = GET2(ecode, 3);
1940 if (max == 0) max = INT_MAX;
1941 ecode += 5;
1942 break;
1943
1944 default: /* No repeat follows */
1945 if (!match_ref(offset, eptr, length, md, ims))
1946 {
1947 CHECK_PARTIAL();
1948 RRETURN(MATCH_NOMATCH);
1949 }
1950 eptr += length;
1951 continue; /* With the main loop */
1952 }
1953
1954 /* If the length of the reference is zero, just continue with the
1955 main loop. */
1956
1957 if (length == 0) continue;
1958
1959 /* First, ensure the minimum number of matches are present. We get back
1960 the length of the reference string explicitly rather than passing the
1961 address of eptr, so that eptr can be a register variable. */
1962
1963 for (i = 1; i <= min; i++)
1964 {
1965 if (!match_ref(offset, eptr, length, md, ims))
1966 {
1967 CHECK_PARTIAL();
1968 RRETURN(MATCH_NOMATCH);
1969 }
1970 eptr += length;
1971 }
1972
1973 /* If min = max, continue at the same level without recursion.
1974 They are not both allowed to be zero. */
1975
1976 if (min == max) continue;
1977
1978 /* If minimizing, keep trying and advancing the pointer */
1979
1980 if (minimize)
1981 {
1982 for (fi = min;; fi++)
1983 {
1984 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1985 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1986 if (fi >= max) RRETURN(MATCH_NOMATCH);
1987 if (!match_ref(offset, eptr, length, md, ims))
1988 {
1989 CHECK_PARTIAL();
1990 RRETURN(MATCH_NOMATCH);
1991 }
1992 eptr += length;
1993 }
1994 /* Control never gets here */
1995 }
1996
1997 /* If maximizing, find the longest string and work backwards */
1998
1999 else
2000 {
2001 pp = eptr;
2002 for (i = min; i < max; i++)
2003 {
2004 if (!match_ref(offset, eptr, length, md, ims)) break;
2005 eptr += length;
2006 }
2007 while (eptr >= pp)
2008 {
2009 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2011 eptr -= length;
2012 }
2013 RRETURN(MATCH_NOMATCH);
2014 }
2015 }
2016 /* Control never gets here */
2017
2018 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2019 used when all the characters in the class have values in the range 0-255,
2020 and either the matching is caseful, or the characters are in the range
2021 0-127 when UTF-8 processing is enabled. The only difference between
2022 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2023 encountered.
2024
2025 First, look past the end of the item to see if there is repeat information
2026 following. Then obey similar code to character type repeats - written out
2027 again for speed. */
2028
2029 case OP_NCLASS:
2030 case OP_CLASS:
2031 {
2032 data = ecode + 1; /* Save for matching */
2033 ecode += 33; /* Advance past the item */
2034
2035 switch (*ecode)
2036 {
2037 case OP_CRSTAR:
2038 case OP_CRMINSTAR:
2039 case OP_CRPLUS:
2040 case OP_CRMINPLUS:
2041 case OP_CRQUERY:
2042 case OP_CRMINQUERY:
2043 c = *ecode++ - OP_CRSTAR;
2044 minimize = (c & 1) != 0;
2045 min = rep_min[c]; /* Pick up values from tables; */
2046 max = rep_max[c]; /* zero for max => infinity */
2047 if (max == 0) max = INT_MAX;
2048 break;
2049
2050 case OP_CRRANGE:
2051 case OP_CRMINRANGE:
2052 minimize = (*ecode == OP_CRMINRANGE);
2053 min = GET2(ecode, 1);
2054 max = GET2(ecode, 3);
2055 if (max == 0) max = INT_MAX;
2056 ecode += 5;
2057 break;
2058
2059 default: /* No repeat follows */
2060 min = max = 1;
2061 break;
2062 }
2063
2064 /* First, ensure the minimum number of matches are present. */
2065
2066 #ifdef SUPPORT_UTF8
2067 /* UTF-8 mode */
2068 if (utf8)
2069 {
2070 for (i = 1; i <= min; i++)
2071 {
2072 if (eptr >= md->end_subject)
2073 {
2074 SCHECK_PARTIAL();
2075 RRETURN(MATCH_NOMATCH);
2076 }
2077 GETCHARINC(c, eptr);
2078 if (c > 255)
2079 {
2080 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2081 }
2082 else
2083 {
2084 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2085 }
2086 }
2087 }
2088 else
2089 #endif
2090 /* Not UTF-8 mode */
2091 {
2092 for (i = 1; i <= min; i++)
2093 {
2094 if (eptr >= md->end_subject)
2095 {
2096 SCHECK_PARTIAL();
2097 RRETURN(MATCH_NOMATCH);
2098 }
2099 c = *eptr++;
2100 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2101 }
2102 }
2103
2104 /* If max == min we can continue with the main loop without the
2105 need to recurse. */
2106
2107 if (min == max) continue;
2108
2109 /* If minimizing, keep testing the rest of the expression and advancing
2110 the pointer while it matches the class. */
2111
2112 if (minimize)
2113 {
2114 #ifdef SUPPORT_UTF8
2115 /* UTF-8 mode */
2116 if (utf8)
2117 {
2118 for (fi = min;; fi++)
2119 {
2120 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2121 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2122 if (fi >= max) RRETURN(MATCH_NOMATCH);
2123 if (eptr >= md->end_subject)
2124 {
2125 SCHECK_PARTIAL();
2126 RRETURN(MATCH_NOMATCH);
2127 }
2128 GETCHARINC(c, eptr);
2129 if (c > 255)
2130 {
2131 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2132 }
2133 else
2134 {
2135 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2136 }
2137 }
2138 }
2139 else
2140 #endif
2141 /* Not UTF-8 mode */
2142 {
2143 for (fi = min;; fi++)
2144 {
2145 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2146 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2147 if (fi >= max) RRETURN(MATCH_NOMATCH);
2148 if (eptr >= md->end_subject)
2149 {
2150 SCHECK_PARTIAL();
2151 RRETURN(MATCH_NOMATCH);
2152 }
2153 c = *eptr++;
2154 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2155 }
2156 }
2157 /* Control never gets here */
2158 }
2159
2160 /* If maximizing, find the longest possible run, then work backwards. */
2161
2162 else
2163 {
2164 pp = eptr;
2165
2166 #ifdef SUPPORT_UTF8
2167 /* UTF-8 mode */
2168 if (utf8)
2169 {
2170 for (i = min; i < max; i++)
2171 {
2172 int len = 1;
2173 if (eptr >= md->end_subject) break;
2174 GETCHARLEN(c, eptr, len);
2175 if (c > 255)
2176 {
2177 if (op == OP_CLASS) break;
2178 }
2179 else
2180 {
2181 if ((data[c/8] & (1 << (c&7))) == 0) break;
2182 }
2183 eptr += len;
2184 }
2185 for (;;)
2186 {
2187 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2188 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2189 if (eptr-- == pp) break; /* Stop if tried at original pos */
2190 BACKCHAR(eptr);
2191 }
2192 }
2193 else
2194 #endif
2195 /* Not UTF-8 mode */
2196 {
2197 for (i = min; i < max; i++)
2198 {
2199 if (eptr >= md->end_subject) break;
2200 c = *eptr;
2201 if ((data[c/8] & (1 << (c&7))) == 0) break;
2202 eptr++;
2203 }
2204 while (eptr >= pp)
2205 {
2206 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2207 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2208 eptr--;
2209 }
2210 }
2211
2212 RRETURN(MATCH_NOMATCH);
2213 }
2214 }
2215 /* Control never gets here */
2216
2217
2218 /* Match an extended character class. This opcode is encountered only
2219 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2220 mode, because Unicode properties are supported in non-UTF-8 mode. */
2221
2222 #ifdef SUPPORT_UTF8
2223 case OP_XCLASS:
2224 {
2225 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2226 ecode += GET(ecode, 1); /* Advance past the item */
2227
2228 switch (*ecode)
2229 {
2230 case OP_CRSTAR:
2231 case OP_CRMINSTAR:
2232 case OP_CRPLUS:
2233 case OP_CRMINPLUS:
2234 case OP_CRQUERY:
2235 case OP_CRMINQUERY:
2236 c = *ecode++ - OP_CRSTAR;
2237 minimize = (c & 1) != 0;
2238 min = rep_min[c]; /* Pick up values from tables; */
2239 max = rep_max[c]; /* zero for max => infinity */
2240 if (max == 0) max = INT_MAX;
2241 break;
2242
2243 case OP_CRRANGE:
2244 case OP_CRMINRANGE:
2245 minimize = (*ecode == OP_CRMINRANGE);
2246 min = GET2(ecode, 1);
2247 max = GET2(ecode, 3);
2248 if (max == 0) max = INT_MAX;
2249 ecode += 5;
2250 break;
2251
2252 default: /* No repeat follows */
2253 min = max = 1;
2254 break;
2255 }
2256
2257 /* First, ensure the minimum number of matches are present. */
2258
2259 for (i = 1; i <= min; i++)
2260 {
2261 if (eptr >= md->end_subject)
2262 {
2263 SCHECK_PARTIAL();
2264 RRETURN(MATCH_NOMATCH);
2265 }
2266 GETCHARINCTEST(c, eptr);
2267 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2268 }
2269
2270 /* If max == min we can continue with the main loop without the
2271 need to recurse. */
2272
2273 if (min == max) continue;
2274
2275 /* If minimizing, keep testing the rest of the expression and advancing
2276 the pointer while it matches the class. */
2277
2278 if (minimize)
2279 {
2280 for (fi = min;; fi++)
2281 {
2282 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2283 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2284 if (fi >= max) RRETURN(MATCH_NOMATCH);
2285 if (eptr >= md->end_subject)
2286 {
2287 SCHECK_PARTIAL();
2288 RRETURN(MATCH_NOMATCH);
2289 }
2290 GETCHARINCTEST(c, eptr);
2291 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2292 }
2293 /* Control never gets here */
2294 }
2295
2296 /* If maximizing, find the longest possible run, then work backwards. */
2297
2298 else
2299 {
2300 pp = eptr;
2301 for (i = min; i < max; i++)
2302 {
2303 int len = 1;
2304 if (eptr >= md->end_subject) break;
2305 GETCHARLENTEST(c, eptr, len);
2306 if (!_pcre_xclass(c, data)) break;
2307 eptr += len;
2308 }
2309 for(;;)
2310 {
2311 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2312 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2313 if (eptr-- == pp) break; /* Stop if tried at original pos */
2314 if (utf8) BACKCHAR(eptr);
2315 }
2316 RRETURN(MATCH_NOMATCH);
2317 }
2318
2319 /* Control never gets here */
2320 }
2321 #endif /* End of XCLASS */
2322
2323 /* Match a single character, casefully */
2324
2325 case OP_CHAR:
2326 #ifdef SUPPORT_UTF8
2327 if (utf8)
2328 {
2329 length = 1;
2330 ecode++;
2331 GETCHARLEN(fc, ecode, length);
2332 if (length > md->end_subject - eptr)
2333 {
2334 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2335 RRETURN(MATCH_NOMATCH);
2336 }
2337 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2338 }
2339 else
2340 #endif
2341
2342 /* Non-UTF-8 mode */
2343 {
2344 if (md->end_subject - eptr < 1)
2345 {
2346 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2347 RRETURN(MATCH_NOMATCH);
2348 }
2349 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2350 ecode += 2;
2351 }
2352 break;
2353
2354 /* Match a single character, caselessly */
2355
2356 case OP_CHARNC:
2357 #ifdef SUPPORT_UTF8
2358 if (utf8)
2359 {
2360 length = 1;
2361 ecode++;
2362 GETCHARLEN(fc, ecode, length);
2363
2364 if (length > md->end_subject - eptr)
2365 {
2366 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2367 RRETURN(MATCH_NOMATCH);
2368 }
2369
2370 /* If the pattern character's value is < 128, we have only one byte, and
2371 can use the fast lookup table. */
2372
2373 if (fc < 128)
2374 {
2375 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2376 }
2377
2378 /* Otherwise we must pick up the subject character */
2379
2380 else
2381 {
2382 unsigned int dc;
2383 GETCHARINC(dc, eptr);
2384 ecode += length;
2385
2386 /* If we have Unicode property support, we can use it to test the other
2387 case of the character, if there is one. */
2388
2389 if (fc != dc)
2390 {
2391 #ifdef SUPPORT_UCP
2392 if (dc != UCD_OTHERCASE(fc))
2393 #endif
2394 RRETURN(MATCH_NOMATCH);
2395 }
2396 }
2397 }
2398 else
2399 #endif /* SUPPORT_UTF8 */
2400
2401 /* Non-UTF-8 mode */
2402 {
2403 if (md->end_subject - eptr < 1)
2404 {
2405 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2406 RRETURN(MATCH_NOMATCH);
2407 }
2408 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2409 ecode += 2;
2410 }
2411 break;
2412
2413 /* Match a single character repeatedly. */
2414
2415 case OP_EXACT:
2416 min = max = GET2(ecode, 1);
2417 ecode += 3;
2418 goto REPEATCHAR;
2419
2420 case OP_POSUPTO:
2421 possessive = TRUE;
2422 /* Fall through */
2423
2424 case OP_UPTO:
2425 case OP_MINUPTO:
2426 min = 0;
2427 max = GET2(ecode, 1);
2428 minimize = *ecode == OP_MINUPTO;
2429 ecode += 3;
2430 goto REPEATCHAR;
2431
2432 case OP_POSSTAR:
2433 possessive = TRUE;
2434 min = 0;
2435 max = INT_MAX;
2436 ecode++;
2437 goto REPEATCHAR;
2438
2439 case OP_POSPLUS:
2440 possessive = TRUE;
2441 min = 1;
2442 max = INT_MAX;
2443 ecode++;
2444 goto REPEATCHAR;
2445
2446 case OP_POSQUERY:
2447 possessive = TRUE;
2448 min = 0;
2449 max = 1;
2450 ecode++;
2451 goto REPEATCHAR;
2452
2453 case OP_STAR:
2454 case OP_MINSTAR:
2455 case OP_PLUS:
2456 case OP_MINPLUS:
2457 case OP_QUERY:
2458 case OP_MINQUERY:
2459 c = *ecode++ - OP_STAR;
2460 minimize = (c & 1) != 0;
2461
2462 min = rep_min[c]; /* Pick up values from tables; */
2463 max = rep_max[c]; /* zero for max => infinity */
2464 if (max == 0) max = INT_MAX;
2465
2466 /* Common code for all repeated single-character matches. */
2467
2468 REPEATCHAR:
2469 #ifdef SUPPORT_UTF8
2470 if (utf8)
2471 {
2472 length = 1;
2473 charptr = ecode;
2474 GETCHARLEN(fc, ecode, length);
2475 ecode += length;
2476
2477 /* Handle multibyte character matching specially here. There is
2478 support for caseless matching if UCP support is present. */
2479
2480 if (length > 1)
2481 {
2482 #ifdef SUPPORT_UCP
2483 unsigned int othercase;
2484 if ((ims & PCRE_CASELESS) != 0 &&
2485 (othercase = UCD_OTHERCASE(fc)) != fc)
2486 oclength = _pcre_ord2utf8(othercase, occhars);
2487 else oclength = 0;
2488 #endif /* SUPPORT_UCP */
2489
2490 for (i = 1; i <= min; i++)
2491 {
2492 if (eptr <= md->end_subject - length &&
2493 memcmp(eptr, charptr, length) == 0) eptr += length;
2494 #ifdef SUPPORT_UCP
2495 else if (oclength > 0 &&
2496 eptr <= md->end_subject - oclength &&
2497 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2498 #endif /* SUPPORT_UCP */
2499 else
2500 {
2501 CHECK_PARTIAL();
2502 RRETURN(MATCH_NOMATCH);
2503 }
2504 }
2505
2506 if (min == max) continue;
2507
2508 if (minimize)
2509 {
2510 for (fi = min;; fi++)
2511 {
2512 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2513 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2514 if (fi >= max) RRETURN(MATCH_NOMATCH);
2515 if (eptr <= md->end_subject - length &&
2516 memcmp(eptr, charptr, length) == 0) eptr += length;
2517 #ifdef SUPPORT_UCP
2518 else if (oclength > 0 &&
2519 eptr <= md->end_subject - oclength &&
2520 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2521 #endif /* SUPPORT_UCP */
2522 else
2523 {
2524 CHECK_PARTIAL();
2525 RRETURN(MATCH_NOMATCH);
2526 }
2527 }
2528 /* Control never gets here */
2529 }
2530
2531 else /* Maximize */
2532 {
2533 pp = eptr;
2534 for (i = min; i < max; i++)
2535 {
2536 if (eptr <= md->end_subject - length &&
2537 memcmp(eptr, charptr, length) == 0) eptr += length;
2538 #ifdef SUPPORT_UCP
2539 else if (oclength > 0 &&
2540 eptr <= md->end_subject - oclength &&
2541 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2542 #endif /* SUPPORT_UCP */
2543 else break;
2544 }
2545
2546 if (possessive) continue;
2547
2548 for(;;)
2549 {
2550 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2551 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2552 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
2553 #ifdef SUPPORT_UCP
2554 eptr--;
2555 BACKCHAR(eptr);
2556 #else /* without SUPPORT_UCP */
2557 eptr -= length;
2558 #endif /* SUPPORT_UCP */
2559 }
2560 }
2561 /* Control never gets here */
2562 }
2563
2564 /* If the length of a UTF-8 character is 1, we fall through here, and
2565 obey the code as for non-UTF-8 characters below, though in this case the
2566 value of fc will always be < 128. */
2567 }
2568 else
2569 #endif /* SUPPORT_UTF8 */
2570
2571 /* When not in UTF-8 mode, load a single-byte character. */
2572
2573 fc = *ecode++;
2574
2575 /* The value of fc at this point is always less than 256, though we may or
2576 may not be in UTF-8 mode. The code is duplicated for the caseless and
2577 caseful cases, for speed, since matching characters is likely to be quite
2578 common. First, ensure the minimum number of matches are present. If min =
2579 max, continue at the same level without recursing. Otherwise, if
2580 minimizing, keep trying the rest of the expression and advancing one
2581 matching character if failing, up to the maximum. Alternatively, if
2582 maximizing, find the maximum number of characters and work backwards. */
2583
2584 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2585 max, eptr));
2586
2587 if ((ims & PCRE_CASELESS) != 0)
2588 {
2589 fc = md->lcc[fc];
2590 for (i = 1; i <= min; i++)
2591 {
2592 if (eptr >= md->end_subject)
2593 {
2594 SCHECK_PARTIAL();
2595 RRETURN(MATCH_NOMATCH);
2596 }
2597 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2598 }
2599 if (min == max) continue;
2600 if (minimize)
2601 {
2602 for (fi = min;; fi++)
2603 {
2604 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2605 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2606 if (fi >= max) RRETURN(MATCH_NOMATCH);
2607 if (eptr >= md->end_subject)
2608 {
2609 SCHECK_PARTIAL();
2610 RRETURN(MATCH_NOMATCH);
2611 }
2612 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2613 }
2614 /* Control never gets here */
2615 }
2616 else /* Maximize */
2617 {
2618 pp = eptr;
2619 for (i = min; i < max; i++)
2620 {
2621 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2622 eptr++;
2623 }
2624
2625 if (possessive) continue;
2626
2627 while (eptr >= pp)
2628 {
2629 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2630 eptr--;
2631 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2632 }
2633 RRETURN(MATCH_NOMATCH);
2634 }
2635 /* Control never gets here */
2636 }
2637
2638 /* Caseful comparisons (includes all multi-byte characters) */
2639
2640 else
2641 {
2642 for (i = 1; i <= min; i++)
2643 {
2644 if (eptr >= md->end_subject)
2645 {
2646 SCHECK_PARTIAL();
2647 RRETURN(MATCH_NOMATCH);
2648 }
2649 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2650 }
2651
2652 if (min == max) continue;
2653
2654 if (minimize)
2655 {
2656 for (fi = min;; fi++)
2657 {
2658 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2659 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2660 if (fi >= max) RRETURN(MATCH_NOMATCH);
2661 if (eptr >= md->end_subject)
2662 {
2663 SCHECK_PARTIAL();
2664 RRETURN(MATCH_NOMATCH);
2665 }
2666 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2667 }
2668 /* Control never gets here */
2669 }
2670 else /* Maximize */
2671 {
2672 pp = eptr;
2673 for (i = min; i < max; i++)
2674 {
2675 if (eptr >= md->end_subject || fc != *eptr) break;
2676 eptr++;
2677 }
2678 if (possessive) continue;
2679
2680 while (eptr >= pp)
2681 {
2682 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2683 eptr--;
2684 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2685 }
2686 RRETURN(MATCH_NOMATCH);
2687 }
2688 }
2689 /* Control never gets here */
2690
2691 /* Match a negated single one-byte character. The character we are
2692 checking can be multibyte. */
2693
2694 case OP_NOT:
2695 if (eptr >= md->end_subject)
2696 {
2697 SCHECK_PARTIAL();
2698 RRETURN(MATCH_NOMATCH);
2699 }
2700 ecode++;
2701 GETCHARINCTEST(c, eptr);
2702 if ((ims & PCRE_CASELESS) != 0)
2703 {
2704 #ifdef SUPPORT_UTF8
2705 if (c < 256)
2706 #endif
2707 c = md->lcc[c];
2708 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2709 }
2710 else
2711 {
2712 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2713 }
2714 break;
2715
2716 /* Match a negated single one-byte character repeatedly. This is almost a
2717 repeat of the code for a repeated single character, but I haven't found a
2718 nice way of commoning these up that doesn't require a test of the
2719 positive/negative option for each character match. Maybe that wouldn't add
2720 very much to the time taken, but character matching *is* what this is all
2721 about... */
2722
2723 case OP_NOTEXACT:
2724 min = max = GET2(ecode, 1);
2725 ecode += 3;
2726 goto REPEATNOTCHAR;
2727
2728 case OP_NOTUPTO:
2729 case OP_NOTMINUPTO:
2730 min = 0;
2731 max = GET2(ecode, 1);
2732 minimize = *ecode == OP_NOTMINUPTO;
2733 ecode += 3;
2734 goto REPEATNOTCHAR;
2735
2736 case OP_NOTPOSSTAR:
2737 possessive = TRUE;
2738 min = 0;
2739 max = INT_MAX;
2740 ecode++;
2741 goto REPEATNOTCHAR;
2742
2743 case OP_NOTPOSPLUS:
2744 possessive = TRUE;
2745 min = 1;
2746 max = INT_MAX;
2747 ecode++;
2748 goto REPEATNOTCHAR;
2749
2750 case OP_NOTPOSQUERY:
2751 possessive = TRUE;
2752 min = 0;
2753 max = 1;
2754 ecode++;
2755 goto REPEATNOTCHAR;
2756
2757 case OP_NOTPOSUPTO:
2758 possessive = TRUE;
2759 min = 0;
2760 max = GET2(ecode, 1);
2761 ecode += 3;
2762 goto REPEATNOTCHAR;
2763
2764 case OP_NOTSTAR:
2765 case OP_NOTMINSTAR:
2766 case OP_NOTPLUS:
2767 case OP_NOTMINPLUS:
2768 case OP_NOTQUERY:
2769 case OP_NOTMINQUERY:
2770 c = *ecode++ - OP_NOTSTAR;
2771 minimize = (c & 1) != 0;
2772 min = rep_min[c]; /* Pick up values from tables; */
2773 max = rep_max[c]; /* zero for max => infinity */
2774 if (max == 0) max = INT_MAX;
2775
2776 /* Common code for all repeated single-byte matches. */
2777
2778 REPEATNOTCHAR:
2779 fc = *ecode++;
2780
2781 /* The code is duplicated for the caseless and caseful cases, for speed,
2782 since matching characters is likely to be quite common. First, ensure the
2783 minimum number of matches are present. If min = max, continue at the same
2784 level without recursing. Otherwise, if minimizing, keep trying the rest of
2785 the expression and advancing one matching character if failing, up to the
2786 maximum. Alternatively, if maximizing, find the maximum number of
2787 characters and work backwards. */
2788
2789 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2790 max, eptr));
2791
2792 if ((ims & PCRE_CASELESS) != 0)
2793 {
2794 fc = md->lcc[fc];
2795
2796 #ifdef SUPPORT_UTF8
2797 /* UTF-8 mode */
2798 if (utf8)
2799 {
2800 register unsigned int d;
2801 for (i = 1; i <= min; i++)
2802 {
2803 if (eptr >= md->end_subject)
2804 {
2805 SCHECK_PARTIAL();
2806 RRETURN(MATCH_NOMATCH);
2807 }
2808 GETCHARINC(d, eptr);
2809 if (d < 256) d = md->lcc[d];
2810 if (fc == d) RRETURN(MATCH_NOMATCH);
2811 }
2812 }
2813 else
2814 #endif
2815
2816 /* Not UTF-8 mode */
2817 {
2818 for (i = 1; i <= min; i++)
2819 {
2820 if (eptr >= md->end_subject)
2821 {
2822 SCHECK_PARTIAL();
2823 RRETURN(MATCH_NOMATCH);
2824 }
2825 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2826 }
2827 }
2828
2829 if (min == max) continue;
2830
2831 if (minimize)
2832 {
2833 #ifdef SUPPORT_UTF8
2834 /* UTF-8 mode */
2835 if (utf8)
2836 {
2837 register unsigned int d;
2838 for (fi = min;; fi++)
2839 {
2840 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2841 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2842 if (fi >= max) RRETURN(MATCH_NOMATCH);
2843 if (eptr >= md->end_subject)
2844 {
2845 SCHECK_PARTIAL();
2846 RRETURN(MATCH_NOMATCH);
2847 }
2848 GETCHARINC(d, eptr);
2849 if (d < 256) d = md->lcc[d];
2850 if (fc == d) RRETURN(MATCH_NOMATCH);
2851 }
2852 }
2853 else
2854 #endif
2855 /* Not UTF-8 mode */
2856 {
2857 for (fi = min;; fi++)
2858 {
2859 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2860 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2861 if (fi >= max) RRETURN(MATCH_NOMATCH);
2862 if (eptr >= md->end_subject)
2863 {
2864 SCHECK_PARTIAL();
2865 RRETURN(MATCH_NOMATCH);
2866 }
2867 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2868 }
2869 }
2870 /* Control never gets here */
2871 }
2872
2873 /* Maximize case */
2874
2875 else
2876 {
2877 pp = eptr;
2878
2879 #ifdef SUPPORT_UTF8
2880 /* UTF-8 mode */
2881 if (utf8)
2882 {
2883 register unsigned int d;
2884 for (i = min; i < max; i++)
2885 {
2886 int len = 1;
2887 if (eptr >= md->end_subject) break;
2888 GETCHARLEN(d, eptr, len);
2889 if (d < 256) d = md->lcc[d];
2890 if (fc == d) break;
2891 eptr += len;
2892 }
2893 if (possessive) continue;
2894 for(;;)
2895 {
2896 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2897 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2898 if (eptr-- == pp) break; /* Stop if tried at original pos */
2899 BACKCHAR(eptr);
2900 }
2901 }
2902 else
2903 #endif
2904 /* Not UTF-8 mode */
2905 {
2906 for (i = min; i < max; i++)
2907 {
2908 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2909 eptr++;
2910 }
2911 if (possessive) continue;
2912 while (eptr >= pp)
2913 {
2914 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2915 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2916 eptr--;
2917 }
2918 }
2919
2920 RRETURN(MATCH_NOMATCH);
2921 }
2922 /* Control never gets here */
2923 }
2924
2925 /* Caseful comparisons */
2926
2927 else
2928 {
2929 #ifdef SUPPORT_UTF8
2930 /* UTF-8 mode */
2931 if (utf8)
2932 {
2933 register unsigned int d;
2934 for (i = 1; i <= min; i++)
2935 {
2936 if (eptr >= md->end_subject)
2937 {
2938 SCHECK_PARTIAL();
2939 RRETURN(MATCH_NOMATCH);
2940 }
2941 GETCHARINC(d, eptr);
2942 if (fc == d) RRETURN(MATCH_NOMATCH);
2943 }
2944 }
2945 else
2946 #endif
2947 /* Not UTF-8 mode */
2948 {
2949 for (i = 1; i <= min; i++)
2950 {
2951 if (eptr >= md->end_subject)
2952 {
2953 SCHECK_PARTIAL();
2954 RRETURN(MATCH_NOMATCH);
2955 }
2956 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2957 }
2958 }
2959
2960 if (min == max) continue;
2961
2962 if (minimize)
2963 {
2964 #ifdef SUPPORT_UTF8
2965 /* UTF-8 mode */
2966 if (utf8)
2967 {
2968 register unsigned int d;
2969 for (fi = min;; fi++)
2970 {
2971 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2972 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2973 if (fi >= max) RRETURN(MATCH_NOMATCH);
2974 if (eptr >= md->end_subject)
2975 {
2976 SCHECK_PARTIAL();
2977 RRETURN(MATCH_NOMATCH);
2978 }
2979 GETCHARINC(d, eptr);
2980 if (fc == d) RRETURN(MATCH_NOMATCH);
2981 }
2982 }
2983 else
2984 #endif
2985 /* Not UTF-8 mode */
2986 {
2987 for (fi = min;; fi++)
2988 {
2989 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2990 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2991 if (fi >= max) RRETURN(MATCH_NOMATCH);
2992 if (eptr >= md->end_subject)
2993 {
2994 SCHECK_PARTIAL();
2995 RRETURN(MATCH_NOMATCH);
2996 }
2997 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2998 }
2999 }
3000 /* Control never gets here */
3001 }
3002
3003 /* Maximize case */
3004
3005 else
3006 {
3007 pp = eptr;
3008
3009 #ifdef SUPPORT_UTF8
3010 /* UTF-8 mode */
3011 if (utf8)
3012 {
3013 register unsigned int d;
3014 for (i = min; i < max; i++)
3015 {
3016 int len = 1;
3017 if (eptr >= md->end_subject) break;
3018 GETCHARLEN(d, eptr, len);
3019 if (fc == d) break;
3020 eptr += len;
3021 }
3022 if (possessive) continue;
3023 for(;;)
3024 {
3025 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3026 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3027 if (eptr-- == pp) break; /* Stop if tried at original pos */
3028 BACKCHAR(eptr);
3029 }
3030 }
3031 else
3032 #endif
3033 /* Not UTF-8 mode */
3034 {
3035 for (i = min; i < max; i++)
3036 {
3037 if (eptr >= md->end_subject || fc == *eptr) break;
3038 eptr++;
3039 }
3040 if (possessive) continue;
3041 while (eptr >= pp)
3042 {
3043 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3044 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3045 eptr--;
3046 }
3047 }
3048
3049 RRETURN(MATCH_NOMATCH);
3050 }
3051 }
3052 /* Control never gets here */
3053
3054 /* Match a single character type repeatedly; several different opcodes
3055 share code. This is very similar to the code for single characters, but we
3056 repeat it in the interests of efficiency. */
3057
3058 case OP_TYPEEXACT:
3059 min = max = GET2(ecode, 1);
3060 minimize = TRUE;
3061 ecode += 3;
3062 goto REPEATTYPE;
3063
3064 case OP_TYPEUPTO:
3065 case OP_TYPEMINUPTO:
3066 min = 0;
3067 max = GET2(ecode, 1);
3068 minimize = *ecode == OP_TYPEMINUPTO;
3069 ecode += 3;
3070 goto REPEATTYPE;
3071
3072 case OP_TYPEPOSSTAR:
3073 possessive = TRUE;
3074 min = 0;
3075 max = INT_MAX;
3076 ecode++;
3077 goto REPEATTYPE;
3078
3079 case OP_TYPEPOSPLUS:
3080 possessive = TRUE;
3081 min = 1;
3082 max = INT_MAX;
3083 ecode++;
3084 goto REPEATTYPE;
3085
3086 case OP_TYPEPOSQUERY:
3087 possessive = TRUE;
3088 min = 0;
3089 max = 1;
3090 ecode++;
3091 goto REPEATTYPE;
3092
3093 case OP_TYPEPOSUPTO:
3094 possessive = TRUE;
3095 min = 0;
3096 max = GET2(ecode, 1);
3097 ecode += 3;
3098 goto REPEATTYPE;
3099
3100 case OP_TYPESTAR:
3101 case OP_TYPEMINSTAR:
3102 case OP_TYPEPLUS:
3103 case OP_TYPEMINPLUS:
3104 case OP_TYPEQUERY:
3105 case OP_TYPEMINQUERY:
3106 c = *ecode++ - OP_TYPESTAR;
3107 minimize = (c & 1) != 0;
3108 min = rep_min[c]; /* Pick up values from tables; */
3109 max = rep_max[c]; /* zero for max => infinity */
3110 if (max == 0) max = INT_MAX;
3111
3112 /* Common code for all repeated single character type matches. Note that
3113 in UTF-8 mode, '.' matches a character of any length, but for the other
3114 character types, the valid characters are all one-byte long. */
3115
3116 REPEATTYPE:
3117 ctype = *ecode++; /* Code for the character type */
3118
3119 #ifdef SUPPORT_UCP
3120 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3121 {
3122 prop_fail_result = ctype == OP_NOTPROP;
3123 prop_type = *ecode++;
3124 prop_value = *ecode++;
3125 }
3126 else prop_type = -1;
3127 #endif
3128
3129 /* First, ensure the minimum number of matches are present. Use inline
3130 code for maximizing the speed, and do the type test once at the start
3131 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3132 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3133 and single-bytes. */
3134
3135 if (min > 0)
3136 {
3137 #ifdef SUPPORT_UCP
3138 if (prop_type >= 0)
3139 {
3140 switch(prop_type)
3141 {
3142 case PT_ANY:
3143 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3144 for (i = 1; i <= min; i++)
3145 {
3146 if (eptr >= md->end_subject)
3147 {
3148 SCHECK_PARTIAL();
3149 RRETURN(MATCH_NOMATCH);
3150 }
3151 GETCHARINCTEST(c, eptr);
3152 }
3153 break;
3154
3155 case PT_LAMP:
3156 for (i = 1; i <= min; i++)
3157 {
3158 if (eptr >= md->end_subject)
3159 {
3160 SCHECK_PARTIAL();
3161 RRETURN(MATCH_NOMATCH);
3162 }
3163 GETCHARINCTEST(c, eptr);
3164 prop_chartype = UCD_CHARTYPE(c);
3165 if ((prop_chartype == ucp_Lu ||
3166 prop_chartype == ucp_Ll ||
3167 prop_chartype == ucp_Lt) == prop_fail_result)
3168 RRETURN(MATCH_NOMATCH);
3169 }
3170 break;
3171
3172 case PT_GC:
3173 for (i = 1; i <= min; i++)
3174 {
3175 if (eptr >= md->end_subject)
3176 {
3177 SCHECK_PARTIAL();
3178 RRETURN(MATCH_NOMATCH);
3179 }
3180 GETCHARINCTEST(c, eptr);
3181 prop_category = UCD_CATEGORY(c);
3182 if ((prop_category == prop_value) == prop_fail_result)
3183 RRETURN(MATCH_NOMATCH);
3184 }
3185 break;
3186
3187 case PT_PC:
3188 for (i = 1; i <= min; i++)
3189 {
3190 if (eptr >= md->end_subject)
3191 {
3192 SCHECK_PARTIAL();
3193 RRETURN(MATCH_NOMATCH);
3194 }
3195 GETCHARINCTEST(c, eptr);
3196 prop_chartype = UCD_CHARTYPE(c);
3197 if ((prop_chartype == prop_value) == prop_fail_result)
3198 RRETURN(MATCH_NOMATCH);
3199 }
3200 break;
3201
3202 case PT_SC:
3203 for (i = 1; i <= min; i++)
3204 {
3205 if (eptr >= md->end_subject)
3206 {
3207 SCHECK_PARTIAL();
3208 RRETURN(MATCH_NOMATCH);
3209 }
3210 GETCHARINCTEST(c, eptr);
3211 prop_script = UCD_SCRIPT(c);
3212 if ((prop_script == prop_value) == prop_fail_result)
3213 RRETURN(MATCH_NOMATCH);
3214 }
3215 break;
3216
3217 default:
3218 RRETURN(PCRE_ERROR_INTERNAL);
3219 }
3220 }
3221
3222 /* Match extended Unicode sequences. We will get here only if the
3223 support is in the binary; otherwise a compile-time error occurs. */
3224
3225 else if (ctype == OP_EXTUNI)
3226 {
3227 for (i = 1; i <= min; i++)
3228 {
3229 if (eptr >= md->end_subject)
3230 {
3231 SCHECK_PARTIAL();
3232 RRETURN(MATCH_NOMATCH);
3233 }
3234 GETCHARINCTEST(c, eptr);
3235 prop_category = UCD_CATEGORY(c);
3236 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3237 while (eptr < md->end_subject)
3238 {
3239 int len = 1;
3240 if (!utf8) c = *eptr;
3241 else { GETCHARLEN(c, eptr, len); }
3242 prop_category = UCD_CATEGORY(c);
3243 if (prop_category != ucp_M) break;
3244 eptr += len;
3245 }
3246 }
3247 }
3248
3249 else
3250 #endif /* SUPPORT_UCP */
3251
3252 /* Handle all other cases when the coding is UTF-8 */
3253
3254 #ifdef SUPPORT_UTF8
3255 if (utf8) switch(ctype)
3256 {
3257 case OP_ANY:
3258 for (i = 1; i <= min; i++)
3259 {
3260 if (eptr >= md->end_subject)
3261 {
3262 SCHECK_PARTIAL();
3263 RRETURN(MATCH_NOMATCH);
3264 }
3265 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3266 eptr++;
3267 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3268 }
3269 break;
3270
3271 case OP_ALLANY:
3272 for (i = 1; i <= min; i++)
3273 {
3274 if (eptr >= md->end_subject)
3275 {
3276 SCHECK_PARTIAL();
3277 RRETURN(MATCH_NOMATCH);
3278 }
3279 eptr++;
3280 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3281 }
3282 break;
3283
3284 case OP_ANYBYTE:
3285 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3286 eptr += min;
3287 break;
3288
3289 case OP_ANYNL:
3290 for (i = 1; i <= min; i++)
3291 {
3292 if (eptr >= md->end_subject)
3293 {
3294 SCHECK_PARTIAL();
3295 RRETURN(MATCH_NOMATCH);
3296 }
3297 GETCHARINC(c, eptr);
3298 switch(c)
3299 {
3300 default: RRETURN(MATCH_NOMATCH);
3301 case 0x000d:
3302 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3303 break;
3304
3305 case 0x000a:
3306 break;
3307
3308 case 0x000b:
3309 case 0x000c:
3310 case 0x0085:
3311 case 0x2028:
3312 case 0x2029:
3313 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3314 break;
3315 }
3316 }
3317 break;
3318
3319 case OP_NOT_HSPACE:
3320 for (i = 1; i <= min; i++)
3321 {
3322 if (eptr >= md->end_subject)
3323 {
3324 SCHECK_PARTIAL();
3325 RRETURN(MATCH_NOMATCH);
3326 }
3327 GETCHARINC(c, eptr);
3328 switch(c)
3329 {
3330 default: break;
3331 case 0x09: /* HT */
3332 case 0x20: /* SPACE */
3333 case 0xa0: /* NBSP */
3334 case 0x1680: /* OGHAM SPACE MARK */
3335 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3336 case 0x2000: /* EN QUAD */
3337 case 0x2001: /* EM QUAD */
3338 case 0x2002: /* EN SPACE */
3339 case 0x2003: /* EM SPACE */
3340 case 0x2004: /* THREE-PER-EM SPACE */
3341 case 0x2005: /* FOUR-PER-EM SPACE */
3342 case 0x2006: /* SIX-PER-EM SPACE */
3343 case 0x2007: /* FIGURE SPACE */
3344 case 0x2008: /* PUNCTUATION SPACE */
3345 case 0x2009: /* THIN SPACE */
3346 case 0x200A: /* HAIR SPACE */
3347 case 0x202f: /* NARROW NO-BREAK SPACE */
3348 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3349 case 0x3000: /* IDEOGRAPHIC SPACE */
3350 RRETURN(MATCH_NOMATCH);
3351 }
3352 }
3353 break;
3354
3355 case OP_HSPACE:
3356 for (i = 1; i <= min; i++)
3357 {
3358 if (eptr >= md->end_subject)
3359 {
3360 SCHECK_PARTIAL();
3361 RRETURN(MATCH_NOMATCH);
3362 }
3363 GETCHARINC(c, eptr);
3364 switch(c)
3365 {
3366 default: RRETURN(MATCH_NOMATCH);
3367 case 0x09: /* HT */
3368 case 0x20: /* SPACE */
3369 case 0xa0: /* NBSP */
3370 case 0x1680: /* OGHAM SPACE MARK */
3371 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3372 case 0x2000: /* EN QUAD */
3373 case 0x2001: /* EM QUAD */
3374 case 0x2002: /* EN SPACE */
3375 case 0x2003: /* EM SPACE */
3376 case 0x2004: /* THREE-PER-EM SPACE */
3377 case 0x2005: /* FOUR-PER-EM SPACE */
3378 case 0x2006: /* SIX-PER-EM SPACE */
3379 case 0x2007: /* FIGURE SPACE */
3380 case 0x2008: /* PUNCTUATION SPACE */
3381 case 0x2009: /* THIN SPACE */
3382 case 0x200A: /* HAIR SPACE */
3383 case 0x202f: /* NARROW NO-BREAK SPACE */
3384 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3385 case 0x3000: /* IDEOGRAPHIC SPACE */
3386 break;
3387 }
3388 }
3389 break;
3390
3391 case OP_NOT_VSPACE:
3392 for (i = 1; i <= min; i++)
3393 {
3394 if (eptr >= md->end_subject)
3395 {
3396 SCHECK_PARTIAL();
3397 RRETURN(MATCH_NOMATCH);
3398 }
3399 GETCHARINC(c, eptr);
3400 switch(c)
3401 {
3402 default: break;
3403 case 0x0a: /* LF */
3404 case 0x0b: /* VT */
3405 case 0x0c: /* FF */
3406 case 0x0d: /* CR */
3407 case 0x85: /* NEL */
3408 case 0x2028: /* LINE SEPARATOR */
3409 case 0x2029: /* PARAGRAPH SEPARATOR */
3410 RRETURN(MATCH_NOMATCH);
3411 }
3412 }
3413 break;
3414
3415 case OP_VSPACE:
3416 for (i = 1; i <= min; i++)
3417 {
3418 if (eptr >= md->end_subject)
3419 {
3420 SCHECK_PARTIAL();
3421 RRETURN(MATCH_NOMATCH);
3422 }
3423 GETCHARINC(c, eptr);
3424 switch(c)
3425 {
3426 default: RRETURN(MATCH_NOMATCH);
3427 case 0x0a: /* LF */
3428 case 0x0b: /* VT */
3429 case 0x0c: /* FF */
3430 case 0x0d: /* CR */
3431 case 0x85: /* NEL */
3432 case 0x2028: /* LINE SEPARATOR */
3433 case 0x2029: /* PARAGRAPH SEPARATOR */
3434 break;
3435 }
3436 }
3437 break;
3438
3439 case OP_NOT_DIGIT:
3440 for (i = 1; i <= min; i++)
3441 {
3442 if (eptr >= md->end_subject)
3443 {
3444 SCHECK_PARTIAL();
3445 RRETURN(MATCH_NOMATCH);
3446 }
3447 GETCHARINC(c, eptr);
3448 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3449 RRETURN(MATCH_NOMATCH);
3450 }
3451 break;
3452
3453 case OP_DIGIT:
3454 for (i = 1; i <= min; i++)
3455 {
3456 if (eptr >= md->end_subject)
3457 {
3458 SCHECK_PARTIAL();
3459 RRETURN(MATCH_NOMATCH);
3460 }
3461 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3462 RRETURN(MATCH_NOMATCH);
3463 /* No need to skip more bytes - we know it's a 1-byte character */
3464 }
3465 break;
3466
3467 case OP_NOT_WHITESPACE:
3468 for (i = 1; i <= min; i++)
3469 {
3470 if (eptr >= md->end_subject)
3471 {
3472 SCHECK_PARTIAL();
3473 RRETURN(MATCH_NOMATCH);
3474 }
3475 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3476 RRETURN(MATCH_NOMATCH);
3477 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3478 }
3479 break;
3480
3481 case OP_WHITESPACE:
3482 for (i = 1; i <= min; i++)
3483 {
3484 if (eptr >= md->end_subject)
3485 {
3486 SCHECK_PARTIAL();
3487 RRETURN(MATCH_NOMATCH);
3488 }
3489 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3490 RRETURN(MATCH_NOMATCH);
3491 /* No need to skip more bytes - we know it's a 1-byte character */
3492 }
3493 break;
3494
3495 case OP_NOT_WORDCHAR:
3496 for (i = 1; i <= min; i++)
3497 {
3498 if (eptr >= md->end_subject ||
3499 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3500 RRETURN(MATCH_NOMATCH);
3501 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3502 }
3503 break;
3504
3505 case OP_WORDCHAR:
3506 for (i = 1; i <= min; i++)
3507 {
3508 if (eptr >= md->end_subject)
3509 {
3510 SCHECK_PARTIAL();
3511 RRETURN(MATCH_NOMATCH);
3512 }
3513 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3514 RRETURN(MATCH_NOMATCH);
3515 /* No need to skip more bytes - we know it's a 1-byte character */
3516 }
3517 break;
3518
3519 default:
3520 RRETURN(PCRE_ERROR_INTERNAL);
3521 } /* End switch(ctype) */
3522
3523 else
3524 #endif /* SUPPORT_UTF8 */
3525
3526 /* Code for the non-UTF-8 case for minimum matching of operators other
3527 than OP_PROP and OP_NOTPROP. */
3528
3529 switch(ctype)
3530 {
3531 case OP_ANY:
3532 for (i = 1; i <= min; i++)
3533 {
3534 if (eptr >= md->end_subject)
3535 {
3536 SCHECK_PARTIAL();
3537 RRETURN(MATCH_NOMATCH);
3538 }
3539 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3540 eptr++;
3541 }
3542 break;
3543
3544 case OP_ALLANY:
3545 if (eptr > md->end_subject - min)
3546 {
3547 SCHECK_PARTIAL();
3548 RRETURN(MATCH_NOMATCH);
3549 }
3550 eptr += min;
3551 break;
3552
3553 case OP_ANYBYTE:
3554 if (eptr > md->end_subject - min)
3555 {
3556 SCHECK_PARTIAL();
3557 RRETURN(MATCH_NOMATCH);
3558 }
3559 eptr += min;
3560 break;
3561
3562 case OP_ANYNL:
3563 for (i = 1; i <= min; i++)
3564 {
3565 if (eptr >= md->end_subject)
3566 {
3567 SCHECK_PARTIAL();
3568 RRETURN(MATCH_NOMATCH);
3569 }
3570 switch(*eptr++)
3571 {
3572 default: RRETURN(MATCH_NOMATCH);
3573 case 0x000d:
3574 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3575 break;
3576 case 0x000a:
3577 break;
3578
3579 case 0x000b:
3580 case 0x000c:
3581 case 0x0085:
3582 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3583 break;
3584 }
3585 }
3586 break;
3587
3588 case OP_NOT_HSPACE:
3589 for (i = 1; i <= min; i++)
3590 {
3591 if (eptr >= md->end_subject)
3592 {
3593 SCHECK_PARTIAL();
3594 RRETURN(MATCH_NOMATCH);
3595 }
3596 switch(*eptr++)
3597 {
3598 default: break;
3599 case 0x09: /* HT */
3600 case 0x20: /* SPACE */
3601 case 0xa0: /* NBSP */
3602 RRETURN(MATCH_NOMATCH);
3603 }
3604 }
3605 break;
3606
3607 case OP_HSPACE:
3608 for (i = 1; i <= min; i++)
3609 {
3610 if (eptr >= md->end_subject)
3611 {
3612 SCHECK_PARTIAL();
3613 RRETURN(MATCH_NOMATCH);
3614 }
3615 switch(*eptr++)
3616 {
3617 default: RRETURN(MATCH_NOMATCH);
3618 case 0x09: /* HT */
3619 case 0x20: /* SPACE */
3620 case 0xa0: /* NBSP */
3621 break;
3622 }
3623 }
3624 break;
3625
3626 case OP_NOT_VSPACE:
3627 for (i = 1; i <= min; i++)
3628 {
3629 if (eptr >= md->end_subject)
3630 {
3631 SCHECK_PARTIAL();
3632 RRETURN(MATCH_NOMATCH);
3633 }
3634 switch(*eptr++)
3635 {
3636 default: break;
3637 case 0x0a: /* LF */
3638 case 0x0b: /* VT */
3639 case 0x0c: /* FF */
3640 case 0x0d: /* CR */
3641 case 0x85: /* NEL */
3642 RRETURN(MATCH_NOMATCH);
3643 }
3644 }
3645 break;
3646
3647 case OP_VSPACE:
3648 for (i = 1; i <= min; i++)
3649 {
3650 if (eptr >= md->end_subject)
3651 {
3652 SCHECK_PARTIAL();
3653 RRETURN(MATCH_NOMATCH);
3654 }
3655 switch(*eptr++)
3656 {
3657 default: RRETURN(MATCH_NOMATCH);
3658 case 0x0a: /* LF */
3659 case 0x0b: /* VT */
3660 case 0x0c: /* FF */
3661 case 0x0d: /* CR */
3662 case 0x85: /* NEL */
3663 break;
3664 }
3665 }
3666 break;
3667
3668 case OP_NOT_DIGIT:
3669 for (i = 1; i <= min; i++)
3670 {
3671 if (eptr >= md->end_subject)
3672 {
3673 SCHECK_PARTIAL();
3674 RRETURN(MATCH_NOMATCH);
3675 }
3676 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3677 }
3678 break;
3679
3680 case OP_DIGIT:
3681 for (i = 1; i <= min; i++)
3682 {
3683 if (eptr >= md->end_subject)
3684 {
3685 SCHECK_PARTIAL();
3686 RRETURN(MATCH_NOMATCH);
3687 }
3688 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3689 }
3690 break;
3691
3692 case OP_NOT_WHITESPACE:
3693 for (i = 1; i <= min; i++)
3694 {
3695 if (eptr >= md->end_subject)
3696 {
3697 SCHECK_PARTIAL();
3698 RRETURN(MATCH_NOMATCH);
3699 }
3700 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3701 }
3702 break;
3703
3704 case OP_WHITESPACE:
3705 for (i = 1; i <= min; i++)
3706 {
3707 if (eptr >= md->end_subject)
3708 {
3709 SCHECK_PARTIAL();
3710 RRETURN(MATCH_NOMATCH);
3711 }
3712 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3713 }
3714 break;
3715
3716 case OP_NOT_WORDCHAR:
3717 for (i = 1; i <= min; i++)
3718 {
3719 if (eptr >= md->end_subject)
3720 {
3721 SCHECK_PARTIAL();
3722 RRETURN(MATCH_NOMATCH);
3723 }
3724 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3725 RRETURN(MATCH_NOMATCH);
3726 }
3727 break;
3728
3729 case OP_WORDCHAR:
3730 for (i = 1; i <= min; i++)
3731 {
3732 if (eptr >= md->end_subject)
3733 {
3734 SCHECK_PARTIAL();
3735 RRETURN(MATCH_NOMATCH);
3736 }
3737 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3738 RRETURN(MATCH_NOMATCH);
3739 }
3740 break;
3741
3742 default:
3743 RRETURN(PCRE_ERROR_INTERNAL);
3744 }
3745 }
3746
3747 /* If min = max, continue at the same level without recursing */
3748
3749 if (min == max) continue;
3750
3751 /* If minimizing, we have to test the rest of the pattern before each
3752 subsequent match. Again, separate the UTF-8 case for speed, and also
3753 separate the UCP cases. */
3754
3755 if (minimize)
3756 {
3757 #ifdef SUPPORT_UCP
3758 if (prop_type >= 0)
3759 {
3760 switch(prop_type)
3761 {
3762 case PT_ANY:
3763 for (fi = min;; fi++)
3764 {
3765 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3766 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3767 if (fi >= max) RRETURN(MATCH_NOMATCH);
3768 if (eptr >= md->end_subject)
3769 {
3770 SCHECK_PARTIAL();
3771 RRETURN(MATCH_NOMATCH);
3772 }
3773 GETCHARINC(c, eptr);
3774 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3775 }
3776 /* Control never gets here */
3777
3778 case PT_LAMP:
3779 for (fi = min;; fi++)
3780 {
3781 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3782 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3783 if (fi >= max) RRETURN(MATCH_NOMATCH);
3784 if (eptr >= md->end_subject)
3785 {
3786 SCHECK_PARTIAL();
3787 RRETURN(MATCH_NOMATCH);
3788 }
3789 GETCHARINC(c, eptr);
3790 prop_chartype = UCD_CHARTYPE(c);
3791 if ((prop_chartype == ucp_Lu ||
3792 prop_chartype == ucp_Ll ||
3793 prop_chartype == ucp_Lt) == prop_fail_result)
3794 RRETURN(MATCH_NOMATCH);
3795 }
3796 /* Control never gets here */
3797
3798 case PT_GC:
3799 for (fi = min;; fi++)
3800 {
3801 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3802 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3803 if (fi >= max) RRETURN(MATCH_NOMATCH);
3804 if (eptr >= md->end_subject)
3805 {
3806 SCHECK_PARTIAL();
3807 RRETURN(MATCH_NOMATCH);
3808 }
3809 GETCHARINC(c, eptr);
3810 prop_category = UCD_CATEGORY(c);
3811 if ((prop_category == prop_value) == prop_fail_result)
3812 RRETURN(MATCH_NOMATCH);
3813 }
3814 /* Control never gets here */
3815
3816 case PT_PC:
3817 for (fi = min;; fi++)
3818 {
3819 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3820 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3821 if (fi >= max) RRETURN(MATCH_NOMATCH);
3822 if (eptr >= md->end_subject)
3823 {
3824 SCHECK_PARTIAL();
3825 RRETURN(MATCH_NOMATCH);
3826 }
3827 GETCHARINC(c, eptr);
3828 prop_chartype = UCD_CHARTYPE(c);
3829 if ((prop_chartype == prop_value) == prop_fail_result)
3830 RRETURN(MATCH_NOMATCH);
3831 }
3832 /* Control never gets here */
3833
3834 case PT_SC:
3835 for (fi = min;; fi++)
3836 {
3837 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3838 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3839 if (fi >= max) RRETURN(MATCH_NOMATCH);
3840 if (eptr >= md->end_subject)
3841 {
3842 SCHECK_PARTIAL();
3843 RRETURN(MATCH_NOMATCH);
3844 }
3845 GETCHARINC(c, eptr);
3846 prop_script = UCD_SCRIPT(c);
3847 if ((prop_script == prop_value) == prop_fail_result)
3848 RRETURN(MATCH_NOMATCH);
3849 }
3850 /* Control never gets here */
3851
3852 default:
3853 RRETURN(PCRE_ERROR_INTERNAL);
3854 }
3855 }
3856
3857 /* Match extended Unicode sequences. We will get here only if the
3858 support is in the binary; otherwise a compile-time error occurs. */
3859
3860 else if (ctype == OP_EXTUNI)
3861 {
3862 for (fi = min;; fi++)
3863 {
3864 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3865 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3866 if (fi >= max) RRETURN(MATCH_NOMATCH);
3867 if (eptr >= md->end_subject)
3868 {
3869 SCHECK_PARTIAL();
3870 RRETURN(MATCH_NOMATCH);
3871 }
3872 GETCHARINCTEST(c, eptr);
3873 prop_category = UCD_CATEGORY(c);
3874 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3875 while (eptr < md->end_subject)
3876 {
3877 int len = 1;
3878 if (!utf8) c = *eptr;
3879 else { GETCHARLEN(c, eptr, len); }
3880 prop_category = UCD_CATEGORY(c);
3881 if (prop_category != ucp_M) break;
3882 eptr += len;
3883 }
3884 }
3885 }
3886
3887 else
3888 #endif /* SUPPORT_UCP */
3889
3890 #ifdef SUPPORT_UTF8
3891 /* UTF-8 mode */
3892 if (utf8)
3893 {
3894 for (fi = min;; fi++)
3895 {
3896 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3897 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3898 if (fi >= max) RRETURN(MATCH_NOMATCH);
3899 if (eptr >= md->end_subject)
3900 {
3901 SCHECK_PARTIAL();
3902 RRETURN(MATCH_NOMATCH);
3903 }
3904 if (ctype == OP_ANY && IS_NEWLINE(eptr))
3905 RRETURN(MATCH_NOMATCH);
3906 GETCHARINC(c, eptr);
3907 switch(ctype)
3908 {
3909 case OP_ANY: /* This is the non-NL case */
3910 case OP_ALLANY:
3911 case OP_ANYBYTE:
3912 break;
3913
3914 case OP_ANYNL:
3915 switch(c)
3916 {
3917 default: RRETURN(MATCH_NOMATCH);
3918 case 0x000d:
3919 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3920 break;
3921 case 0x000a:
3922 break;
3923
3924 case 0x000b:
3925 case 0x000c:
3926 case 0x0085:
3927 case 0x2028:
3928 case 0x2029:
3929 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3930 break;
3931 }
3932 break;
3933
3934 case OP_NOT_HSPACE:
3935 switch(c)
3936 {
3937 default: break;
3938 case 0x09: /* HT */
3939 case 0x20: /* SPACE */
3940 case 0xa0: /* NBSP */
3941 case 0x1680: /* OGHAM SPACE MARK */
3942 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3943 case 0x2000: /* EN QUAD */
3944 case 0x2001: /* EM QUAD */
3945 case 0x2002: /* EN SPACE */
3946 case 0x2003: /* EM SPACE */
3947 case 0x2004: /* THREE-PER-EM SPACE */
3948 case 0x2005: /* FOUR-PER-EM SPACE */
3949 case 0x2006: /* SIX-PER-EM SPACE */
3950 case 0x2007: /* FIGURE SPACE */
3951 case 0x2008: /* PUNCTUATION SPACE */
3952 case 0x2009: /* THIN SPACE */
3953 case 0x200A: /* HAIR SPACE */
3954 case 0x202f: /* NARROW NO-BREAK SPACE */
3955 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3956 case 0x3000: /* IDEOGRAPHIC SPACE */
3957 RRETURN(MATCH_NOMATCH);
3958 }
3959 break;
3960
3961 case OP_HSPACE:
3962 switch(c)
3963 {
3964 default: RRETURN(MATCH_NOMATCH);
3965 case 0x09: /* HT */
3966 case 0x20: /* SPACE */
3967 case 0xa0: /* NBSP */
3968 case 0x1680: /* OGHAM SPACE MARK */
3969 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3970 case 0x2000: /* EN QUAD */
3971 case 0x2001: /* EM QUAD */
3972 case 0x2002: /* EN SPACE */
3973 case 0x2003: /* EM SPACE */
3974 case 0x2004: /* THREE-PER-EM SPACE */
3975 case 0x2005: /* FOUR-PER-EM SPACE */
3976 case 0x2006: /* SIX-PER-EM SPACE */
3977 case 0x2007: /* FIGURE SPACE */
3978 case 0x2008: /* PUNCTUATION SPACE */
3979 case 0x2009: /* THIN SPACE */
3980 case 0x200A: /* HAIR SPACE */
3981 case 0x202f: /* NARROW NO-BREAK SPACE */
3982 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3983 case 0x3000: /* IDEOGRAPHIC SPACE */
3984 break;
3985 }
3986 break;
3987
3988 case OP_NOT_VSPACE:
3989 switch(c)
3990 {
3991 default: break;
3992 case 0x0a: /* LF */
3993 case 0x0b: /* VT */
3994 case 0x0c: /* FF */
3995 case 0x0d: /* CR */
3996 case 0x85: /* NEL */
3997 case 0x2028: /* LINE SEPARATOR */
3998 case 0x2029: /* PARAGRAPH SEPARATOR */
3999 RRETURN(MATCH_NOMATCH);
4000 }
4001 break;
4002
4003 case OP_VSPACE:
4004 switch(c)
4005 {
4006 default: RRETURN(MATCH_NOMATCH);
4007 case 0x0a: /* LF */
4008 case 0x0b: /* VT */
4009 case 0x0c: /* FF */
4010 case 0x0d: /* CR */
4011 case 0x85: /* NEL */
4012 case 0x2028: /* LINE SEPARATOR */
4013 case 0x2029: /* PARAGRAPH SEPARATOR */
4014 break;
4015 }
4016 break;
4017
4018 case OP_NOT_DIGIT:
4019 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4020 RRETURN(MATCH_NOMATCH);
4021 break;
4022
4023 case OP_DIGIT:
4024 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4025 RRETURN(MATCH_NOMATCH);
4026 break;
4027
4028 case OP_NOT_WHITESPACE:
4029 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4030 RRETURN(MATCH_NOMATCH);
4031 break;
4032
4033 case OP_WHITESPACE:
4034 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4035 RRETURN(MATCH_NOMATCH);
4036 break;
4037
4038 case OP_NOT_WORDCHAR:
4039 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4040 RRETURN(MATCH_NOMATCH);
4041 break;
4042
4043 case OP_WORDCHAR:
4044 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4045 RRETURN(MATCH_NOMATCH);
4046 break;
4047
4048 default:
4049 RRETURN(PCRE_ERROR_INTERNAL);
4050 }
4051 }
4052 }
4053 else
4054 #endif
4055 /* Not UTF-8 mode */
4056 {
4057 for (fi = min;; fi++)
4058 {
4059 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4060 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4061 if (fi >= max) RRETURN(MATCH_NOMATCH);
4062 if (eptr >= md->end_subject)
4063 {
4064 SCHECK_PARTIAL();
4065 RRETURN(MATCH_NOMATCH);
4066 }
4067 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4068 RRETURN(MATCH_NOMATCH);
4069 c = *eptr++;
4070 switch(ctype)
4071 {
4072 case OP_ANY: /* This is the non-NL case */
4073 case OP_ALLANY:
4074 case OP_ANYBYTE:
4075 break;
4076
4077 case OP_ANYNL:
4078 switch(c)
4079 {
4080 default: RRETURN(MATCH_NOMATCH);
4081 case 0x000d:
4082 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4083 break;
4084
4085 case 0x000a:
4086 break;
4087
4088 case 0x000b:
4089 case 0x000c:
4090 case 0x0085:
4091 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4092 break;
4093 }
4094 break;
4095
4096 case OP_NOT_HSPACE:
4097 switch(c)
4098 {
4099 default: break;
4100 case 0x09: /* HT */
4101 case 0x20: /* SPACE */
4102 case 0xa0: /* NBSP */
4103 RRETURN(MATCH_NOMATCH);
4104 }
4105 break;
4106
4107 case OP_HSPACE:
4108 switch(c)
4109 {
4110 default: RRETURN(MATCH_NOMATCH);
4111 case 0x09: /* HT */
4112 case 0x20: /* SPACE */
4113 case 0xa0: /* NBSP */
4114 break;
4115 }
4116 break;
4117
4118 case OP_NOT_VSPACE:
4119 switch(c)
4120 {
4121 default: break;
4122 case 0x0a: /* LF */
4123 case 0x0b: /* VT */
4124 case 0x0c: /* FF */
4125 case 0x0d: /* CR */
4126 case 0x85: /* NEL */
4127 RRETURN(MATCH_NOMATCH);
4128 }
4129 break;
4130
4131 case OP_VSPACE:
4132 switch(c)
4133 {
4134 default: RRETURN(MATCH_NOMATCH);
4135 case 0x0a: /* LF */
4136 case 0x0b: /* VT */
4137 case 0x0c: /* FF */
4138 case 0x0d: /* CR */
4139 case 0x85: /* NEL */
4140 break;
4141 }
4142 break;
4143
4144 case OP_NOT_DIGIT:
4145 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
4146 break;
4147
4148 case OP_DIGIT:
4149 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
4150 break;
4151
4152 case OP_NOT_WHITESPACE:
4153 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
4154 break;
4155
4156 case OP_WHITESPACE:
4157 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
4158 break;
4159
4160 case OP_NOT_WORDCHAR:
4161 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
4162 break;
4163
4164 case OP_WORDCHAR:
4165 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
4166 break;
4167
4168 default:
4169 RRETURN(PCRE_ERROR_INTERNAL);
4170 }
4171 }
4172 }
4173 /* Control never gets here */
4174 }
4175
4176 /* If maximizing, it is worth using inline code for speed, doing the type
4177 test once at the start (i.e. keep it out of the loop). Again, keep the
4178 UTF-8 and UCP stuff separate. */
4179
4180 else
4181 {
4182 pp = eptr; /* Remember where we started */
4183
4184 #ifdef SUPPORT_UCP
4185 if (prop_type >= 0)
4186 {
4187 switch(prop_type)
4188 {
4189 case PT_ANY:
4190 for (i = min; i < max; i++)
4191 {
4192 int len = 1;
4193 if (eptr >= md->end_subject) break;
4194 GETCHARLEN(c, eptr, len);
4195 if (prop_fail_result) break;
4196 eptr+= len;
4197 }
4198 break;
4199
4200 case PT_LAMP:
4201 for (i = min; i < max; i++)
4202 {
4203 int len = 1;
4204 if (eptr >= md->end_subject) break;
4205 GETCHARLEN(c, eptr, len);
4206 prop_chartype = UCD_CHARTYPE(c);
4207 if ((prop_chartype == ucp_Lu ||
4208 prop_chartype == ucp_Ll ||
4209 prop_chartype == ucp_Lt) == prop_fail_result)
4210 break;
4211 eptr+= len;
4212 }
4213 break;
4214
4215 case PT_GC:
4216 for (i = min; i < max; i++)
4217 {
4218 int len = 1;
4219 if (eptr >= md->end_subject) break;
4220 GETCHARLEN(c, eptr, len);
4221 prop_category = UCD_CATEGORY(c);
4222 if ((prop_category == prop_value) == prop_fail_result)
4223 break;
4224 eptr+= len;
4225 }
4226 break;
4227
4228 case PT_PC:
4229 for (i = min; i < max; i++)
4230 {
4231 int len = 1;
4232 if (eptr >= md->end_subject) break;
4233 GETCHARLEN(c, eptr, len);
4234 prop_chartype = UCD_CHARTYPE(c);
4235 if ((prop_chartype == prop_value) == prop_fail_result)
4236 break;
4237 eptr+= len;
4238 }
4239 break;
4240
4241 case PT_SC:
4242 for (i = min; i < max; i++)
4243 {
4244 int len = 1;
4245 if (eptr >= md->end_subject) break;
4246 GETCHARLEN(c, eptr, len);
4247 prop_script = UCD_SCRIPT(c);
4248 if ((prop_script == prop_value) == prop_fail_result)
4249 break;
4250 eptr+= len;
4251 }
4252 break;
4253 }
4254
4255 /* eptr is now past the end of the maximum run */
4256
4257 if (possessive) continue;
4258 for(;;)
4259 {
4260 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4261 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4262 if (eptr-- == pp) break; /* Stop if tried at original pos */
4263 if (utf8) BACKCHAR(eptr);
4264 }
4265 }
4266
4267 /* Match extended Unicode sequences. We will get here only if the
4268 support is in the binary; otherwise a compile-time error occurs. */
4269
4270 else if (ctype == OP_EXTUNI)
4271 {
4272 for (i = min; i < max; i++)
4273 {
4274 if (eptr >= md->end_subject) break;
4275 GETCHARINCTEST(c, eptr);
4276 prop_category = UCD_CATEGORY(c);
4277 if (prop_category == ucp_M) break;
4278 while (eptr < md->end_subject)
4279 {
4280 int len = 1;
4281 if (!utf8) c = *eptr; else
4282 {
4283 GETCHARLEN(c, eptr, len);
4284 }
4285 prop_category = UCD_CATEGORY(c);
4286 if (prop_category != ucp_M) break;
4287 eptr += len;
4288 }
4289 }
4290
4291 /* eptr is now past the end of the maximum run */
4292
4293 if (possessive) continue;
4294 for(;;)
4295 {
4296 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4297 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4298 if (eptr-- == pp) break; /* Stop if tried at original pos */
4299 for (;;) /* Move back over one extended */
4300 {
4301 int len = 1;
4302 if (!utf8) c = *eptr; else
4303 {
4304 BACKCHAR(eptr);
4305 GETCHARLEN(c, eptr, len);
4306 }
4307 prop_category = UCD_CATEGORY(c);
4308 if (prop_category != ucp_M) break;
4309 eptr--;
4310 }
4311 }
4312 }
4313
4314 else
4315 #endif /* SUPPORT_UCP */
4316
4317 #ifdef SUPPORT_UTF8
4318 /* UTF-8 mode */
4319
4320 if (utf8)
4321 {
4322 switch(ctype)
4323 {
4324 case OP_ANY:
4325 if (max < INT_MAX)
4326 {
4327 for (i = min; i < max; i++)
4328 {
4329 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4330 eptr++;
4331 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4332 }
4333 }
4334
4335 /* Handle unlimited UTF-8 repeat */
4336
4337 else
4338 {
4339 for (i = min; i < max; i++)
4340 {
4341 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4342 eptr++;
4343 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4344 }
4345 }
4346 break;
4347
4348 case OP_ALLANY:
4349 if (max < INT_MAX)
4350 {
4351 for (i = min; i < max; i++)
4352 {
4353 if (eptr >= md->end_subject) break;
4354 eptr++;
4355 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4356 }
4357 }
4358 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
4359 break;
4360
4361 /* The byte case is the same as non-UTF8 */
4362
4363 case OP_ANYBYTE:
4364 c = max - min;
4365 if (c > (unsigned int)(md->end_subject - eptr))
4366 c = md->end_subject - eptr;
4367 eptr += c;
4368 break;
4369
4370 case OP_ANYNL:
4371 for (i = min; i < max; i++)
4372 {
4373 int len = 1;
4374 if (eptr >= md->end_subject) break;
4375 GETCHARLEN(c, eptr, len);
4376 if (c == 0x000d)
4377 {
4378 if (++eptr >= md->end_subject) break;
4379 if (*eptr == 0x000a) eptr++;
4380 }
4381 else
4382 {
4383 if (c != 0x000a &&
4384 (md->bsr_anycrlf ||
4385 (c != 0x000b && c != 0x000c &&
4386 c != 0x0085 && c != 0x2028 && c != 0x2029)))
4387 break;
4388 eptr += len;
4389 }
4390 }
4391 break;
4392
4393 case OP_NOT_HSPACE:
4394 case OP_HSPACE:
4395 for (i = min; i < max; i++)
4396 {
4397 BOOL gotspace;
4398 int len = 1;
4399 if (eptr >= md->end_subject) break;
4400 GETCHARLEN(c, eptr, len);
4401 switch(c)
4402 {
4403 default: gotspace = FALSE; break;
4404 case 0x09: /* HT */
4405 case 0x20: /* SPACE */
4406 case 0xa0: /* NBSP */
4407 case 0x1680: /* OGHAM SPACE MARK */
4408 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4409 case 0x2000: /* EN QUAD */
4410 case 0x2001: /* EM QUAD */
4411 case 0x2002: /* EN SPACE */
4412 case 0x2003: /* EM SPACE */
4413 case 0x2004: /* THREE-PER-EM SPACE */
4414 case 0x2005: /* FOUR-PER-EM SPACE */
4415 case 0x2006: /* SIX-PER-EM SPACE */
4416 case 0x2007: /* FIGURE SPACE */
4417 case 0x2008: /* PUNCTUATION SPACE */
4418 case 0x2009: /* THIN SPACE */
4419 case 0x200A: /* HAIR SPACE */
4420 case 0x202f: /* NARROW NO-BREAK SPACE */
4421 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4422 case 0x3000: /* IDEOGRAPHIC SPACE */
4423 gotspace = TRUE;
4424 break;
4425 }
4426 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4427 eptr += len;
4428 }
4429 break;
4430
4431 case OP_NOT_VSPACE:
4432 case OP_VSPACE:
4433 for (i = min; i < max; i++)
4434 {
4435 BOOL gotspace;
4436 int len = 1;
4437 if (eptr >= md->end_subject) break;
4438 GETCHARLEN(c, eptr, len);
4439 switch(c)
4440 {
4441 default: gotspace = FALSE; break;
4442 case 0x0a: /* LF */
4443 case 0x0b: /* VT */
4444 case 0x0c: /* FF */
4445 case 0x0d: /* CR */
4446 case 0x85: /* NEL */
4447 case 0x2028: /* LINE SEPARATOR */
4448 case 0x2029: /* PARAGRAPH SEPARATOR */
4449 gotspace = TRUE;
4450 break;
4451 }
4452 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4453 eptr += len;
4454 }
4455 break;
4456
4457 case OP_NOT_DIGIT:
4458 for (i = min; i < max; i++)
4459 {
4460 int len = 1;
4461 if (eptr >= md->end_subject) break;
4462 GETCHARLEN(c, eptr, len);
4463 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4464 eptr+= len;
4465 }
4466 break;
4467
4468 case OP_DIGIT:
4469 for (i = min; i < max; i++)
4470 {
4471 int len = 1;
4472 if (eptr >= md->end_subject) break;
4473 GETCHARLEN(c, eptr, len);
4474 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4475 eptr+= len;
4476 }
4477 break;
4478
4479 case OP_NOT_WHITESPACE:
4480 for (i = min; i < max; i++)
4481 {
4482 int len = 1;
4483 if (eptr >= md->end_subject) break;
4484 GETCHARLEN(c, eptr, len);
4485 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4486 eptr+= len;
4487 }
4488 break;
4489
4490 case OP_WHITESPACE:
4491 for (i = min; i < max; i++)
4492 {
4493 int len = 1;
4494 if (eptr >= md->end_subject) break;
4495 GETCHARLEN(c, eptr, len);
4496 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4497 eptr+= len;
4498 }
4499 break;
4500
4501 case OP_NOT_WORDCHAR:
4502 for (i = min; i < max; i++)
4503 {
4504 int len = 1;
4505 if (eptr >= md->end_subject) break;
4506 GETCHARLEN(c, eptr, len);
4507 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4508 eptr+= len;
4509 }
4510 break;
4511
4512 case OP_WORDCHAR:
4513 for (i = min; i < max; i++)
4514 {
4515 int len = 1;
4516 if (eptr >= md->end_subject) break;
4517 GETCHARLEN(c, eptr, len);
4518 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4519 eptr+= len;
4520 }
4521 break;
4522
4523 default:
4524 RRETURN(PCRE_ERROR_INTERNAL);
4525 }
4526
4527 /* eptr is now past the end of the maximum run */
4528
4529 if (possessive) continue;
4530 for(;;)
4531 {
4532 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4533 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4534 if (eptr-- == pp) break; /* Stop if tried at original pos */
4535 BACKCHAR(eptr);
4536 }
4537 }
4538 else
4539 #endif /* SUPPORT_UTF8 */
4540
4541 /* Not UTF-8 mode */
4542 {
4543 switch(ctype)
4544 {
4545 case OP_ANY:
4546 for (i = min; i < max; i++)
4547 {
4548 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4549 eptr++;
4550 }
4551 break;
4552
4553 case OP_ALLANY:
4554 case OP_ANYBYTE:
4555 c = max - min;
4556 if (c > (unsigned int)(md->end_subject - eptr))
4557 c = md->end_subject - eptr;
4558 eptr += c;
4559 break;
4560
4561 case OP_ANYNL:
4562 for (i = min; i < max; i++)
4563 {
4564 if (eptr >= md->end_subject) break;
4565 c = *eptr;
4566 if (c == 0x000d)
4567 {
4568 if (++eptr >= md->end_subject) break;
4569 if (*eptr == 0x000a) eptr++;
4570 }
4571 else
4572 {
4573 if (c != 0x000a &&
4574 (md->bsr_anycrlf ||
4575 (c != 0x000b && c != 0x000c && c != 0x0085)))
4576 break;
4577 eptr++;
4578 }
4579 }
4580 break;
4581
4582 case OP_NOT_HSPACE:
4583 for (i = min; i < max; i++)
4584 {
4585 if (eptr >= md->end_subject) break;
4586 c = *eptr;
4587 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4588 eptr++;
4589 }
4590 break;
4591
4592 case OP_HSPACE:
4593 for (i = min; i < max; i++)
4594 {
4595 if (eptr >= md->end_subject) break;
4596 c = *eptr;
4597 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4598 eptr++;
4599 }
4600 break;
4601
4602 case OP_NOT_VSPACE:
4603 for (i = min; i < max; i++)
4604 {
4605 if (eptr >= md->end_subject) break;
4606 c = *eptr;
4607 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4608 break;
4609 eptr++;
4610 }
4611 break;
4612
4613 case OP_VSPACE:
4614 for (i = min; i < max; i++)
4615 {
4616 if (eptr >= md->end_subject) break;
4617 c = *eptr;
4618 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4619 break;
4620 eptr++;
4621 }
4622 break;
4623
4624 case OP_NOT_DIGIT:
4625 for (i = min; i < max; i++)
4626 {
4627 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4628 break;
4629 eptr++;
4630 }
4631 break;
4632
4633 case OP_DIGIT:
4634 for (i = min; i < max; i++)
4635 {
4636 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4637 break;
4638 eptr++;
4639 }
4640 break;
4641
4642 case OP_NOT_WHITESPACE:
4643 for (i = min; i < max; i++)
4644 {
4645 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4646 break;
4647 eptr++;
4648 }
4649 break;
4650
4651 case OP_WHITESPACE:
4652 for (i = min; i < max; i++)
4653 {
4654 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4655 break;
4656 eptr++;
4657 }
4658 break;
4659
4660 case OP_NOT_WORDCHAR:
4661 for (i = min; i < max; i++)
4662 {
4663 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4664 break;
4665 eptr++;
4666 }
4667 break;
4668
4669 case OP_WORDCHAR:
4670 for (i = min; i < max; i++)
4671 {
4672 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4673 break;
4674 eptr++;
4675 }
4676 break;
4677
4678 default:
4679 RRETURN(PCRE_ERROR_INTERNAL);
4680 }
4681
4682 /* eptr is now past the end of the maximum run */
4683
4684 if (possessive) continue;
4685 while (eptr >= pp)
4686 {
4687 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4688 eptr--;
4689 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4690 }
4691 }
4692
4693 /* Get here if we can't make it match with any permitted repetitions */
4694
4695 RRETURN(MATCH_NOMATCH);
4696 }
4697 /* Control never gets here */
4698
4699 /* There's been some horrible disaster. Arrival here can only mean there is
4700 something seriously wrong in the code above or the OP_xxx definitions. */
4701
4702 default:
4703 DPRINTF(("Unknown opcode %d\n", *ecode));
4704 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4705 }
4706
4707 /* Do not stick any code in here without much thought; it is assumed
4708 that "continue" in the code above comes out to here to repeat the main
4709 loop. */
4710
4711 } /* End of main loop */
4712 /* Control never reaches here */
4713
4714
4715 /* When compiling to use the heap rather than the stack for recursive calls to
4716 match(), the RRETURN() macro jumps here. The number that is saved in
4717 frame->Xwhere indicates which label we actually want to return to. */
4718
4719 #ifdef NO_RECURSE
4720 #define LBL(val) case val: goto L_RM##val;
4721 HEAP_RETURN:
4722 switch (frame->Xwhere)
4723 {
4724 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4725 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4726 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4727 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4728 LBL(53) LBL(54)
4729 #ifdef SUPPORT_UTF8
4730 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4731 LBL(32) LBL(34) LBL(42) LBL(46)
4732 #ifdef SUPPORT_UCP
4733 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4734 #endif /* SUPPORT_UCP */
4735 #endif /* SUPPORT_UTF8 */
4736 default:
4737 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4738 return PCRE_ERROR_INTERNAL;
4739 }
4740 #undef LBL
4741 #endif /* NO_RECURSE */
4742 }
4743
4744
4745 /***************************************************************************
4746 ****************************************************************************
4747 RECURSION IN THE match() FUNCTION
4748
4749 Undefine all the macros that were defined above to handle this. */
4750
4751 #ifdef NO_RECURSE
/* Everything up to the matching #endif is cleanup that applies only to the
NO_RECURSE build. The banner comment just above states that these names were
defined as macros to handle recursion in match().
NOTE(review): the corresponding #defines are outside this chunk - confirm that
this list stays in step with them. */

/* Presumably the names of match()'s own arguments (compare the RMATCH() and
match() call sites) - TODO confirm against the #define list. */
4752 #undef eptr
4753 #undef ecode
4754 #undef mstart
4755 #undef offset_top
4756 #undef ims
4757 #undef eptrb
4758 #undef flags
4759
/* The remaining groups look like match()'s working variables, kept in the
same blank-line-separated groups - presumably mirroring the order of the
#defines; verify before reordering. */
4760 #undef callpat
4761 #undef charptr
4762 #undef data
4763 #undef next
4764 #undef pp
4765 #undef prev
4766 #undef saved_eptr
4767
4768 #undef new_recursive
4769
4770 #undef cur_is_word
4771 #undef condition
4772 #undef prev_is_word
4773
4774 #undef original_ims
4775
4776 #undef ctype
4777 #undef length
4778 #undef max
4779 #undef min
4780 #undef number
4781 #undef offset
4782 #undef op
4783 #undef save_capture_last
4784 #undef save_offset1
4785 #undef save_offset2
4786 #undef save_offset3
4787 #undef stacksave
4788
4789 #undef newptrb
4790
4791 #endif
4792
4793 /* These two are defined as macros in both cases */
/* "Both cases" = with and without NO_RECURSE, which is why these two #undefs
sit outside the conditional block. */
4794
4795 #undef fc
4796 #undef fi
4797
4798 /***************************************************************************
4799 ***************************************************************************/
4800
4801
4802
4803 /*************************************************
4804 * Execute a Regular Expression *
4805 *************************************************/
4806
4807 /* This function applies a compiled re to a subject string and picks out
4808 portions of the string if it matches. Two elements in the vector are set for
4809 each substring: the offsets to the start and end of the substring.
4810
4811 Arguments:
4812 argument_re points to the compiled expression
4813 extra_data points to extra data or is NULL
4814 subject points to the subject string
4815 length length of subject string (may contain binary zeros)
4816 start_offset where to start in the subject string
4817 options option bits
4818 offsets points to a vector of ints to be filled in with offsets
4819 offsetcount the number of elements in the vector
4820
4821 Returns: > 0 => success; value is the number of substrings (offset pairs) set, counting the whole match
4822 = 0 => success, but offsets is not big enough
4823 -1 => failed to match
4824 < -1 => some kind of unexpected problem
4825 */
4826
4827 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4828 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4829 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4830 int offsetcount)
4831 {
/* Overview of the steps below, in order:
   1. reject bad option bits, NULL pointers and a negative offsetcount;
   2. pick up match limits, callout data, study data and tables from
      extra_data, falling back to defaults;
   3. if the magic number is wrong, try a byte-flipped copy of the pattern;
   4. fill in the match_data block (subject bounds, option flags, backslash-R
      and newline conventions) and optionally validate the subject as UTF-8;
   5. choose the offsets working vector (the caller's, or a temporary one);
   6. run the "bumpalong" loop, calling match() at successive start points;
   7. translate the final rc into the public return value.
Codes returned directly by this function: PCRE_ERROR_BADOPTION,
PCRE_ERROR_NULL, PCRE_ERROR_BADCOUNT, PCRE_ERROR_BADMAGIC,
PCRE_ERROR_BADNEWLINE, PCRE_ERROR_BADPARTIAL, PCRE_ERROR_BADUTF8,
PCRE_ERROR_BADUTF8_OFFSET, PCRE_ERROR_NOMEMORY, PCRE_ERROR_PARTIAL and
PCRE_ERROR_NOMATCH. Any other non-match rc coming back from match() is
returned to the caller unchanged. */
4832 int rc, resetcount, ocount;
4833 int first_byte = -1;
4834 int req_byte = -1;
4835 int req_byte2 = -1;
4836 int newline;
4837 unsigned long int ims;
4838 BOOL using_temporary_offsets = FALSE;
4839 BOOL anchored;
4840 BOOL startline;
4841 BOOL firstline;
4842 BOOL first_byte_caseless = FALSE;
4843 BOOL req_byte_caseless = FALSE;
4844 BOOL utf8;
4845 match_data match_block;
4846 match_data *md = &match_block;
4847 const uschar *tables;
4848 const uschar *start_bits = NULL;
4849 USPTR start_match = (USPTR)subject + start_offset;
4850 USPTR end_subject;
4851 USPTR start_partial = NULL;
/* req_byte_ptr starts one byte before start_match so that the first
"p > req_byte_ptr" test in the bumpalong loop is always true and the initial
search for the required byte is performed. */
4852 USPTR req_byte_ptr = start_match - 1;
4853
4854 pcre_study_data internal_study;
4855 const pcre_study_data *study;
4856
4857 real_pcre internal_re;
4858 const real_pcre *external_re = (const real_pcre *)argument_re;
4859 const real_pcre *re = external_re;
4860
4861 /* Plausibility checks */
4862
4863 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4864 if (re == NULL || subject == NULL ||
4865 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4866 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
/* NOTE(review): no range check on length or start_offset is visible in this
function; both are used below to form subject pointers. Confirm that callers
guarantee 0 <= start_offset <= length. */
4867
4868 /* Fish out the optional data from the extra_data structure, first setting
4869 the default values. */
4870
4871 study = NULL;
4872 md->match_limit = MATCH_LIMIT;
4873 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4874 md->callout_data = NULL;
4875
4876 /* The table pointer is always in native byte order. */
4877
4878 tables = external_re->tables;
4879
4880 if (extra_data != NULL)
4881 {
4882 register unsigned int flags = extra_data->flags;
4883 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4884 study = (const pcre_study_data *)extra_data->study_data;
4885 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4886 md->match_limit = extra_data->match_limit;
4887 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4888 md->match_limit_recursion = extra_data->match_limit_recursion;
4889 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4890 md->callout_data = extra_data->callout_data;
4891 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4892 }
4893
4894 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4895 is a feature that makes it possible to save compiled regex and re-use them
4896 in other programs later. */
4897
4898 if (tables == NULL) tables = _pcre_default_tables;
4899
4900 /* Check that the first field in the block is the magic number. If it is not,
4901 test for a regex that was compiled on a host of opposite endianness. If this is
4902 the case, flipped values are put in internal_re and internal_study if there was
4903 study data too. */
4904
4905 if (re->magic_number != MAGIC_NUMBER)
4906 {
4907 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4908 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4909 if (study != NULL) study = &internal_study;
4910 }
4911
4912 /* Set up other data */
4913
4914 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4915 startline = (re->flags & PCRE_STARTLINE) != 0;
4916 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4917
4918 /* The code starts after the real_pcre block and the capture name table. */
4919
/* NOTE(review): the base address is external_re (the caller's block), not re,
while the offsets come from re - presumably because a flipped internal_re holds
only the header fields; confirm against _pcre_try_flipped. */
4920 md->start_code = (const uschar *)external_re + re->name_table_offset +
4921 re->name_count * re->name_entry_size;
4922
4923 md->start_subject = (USPTR)subject;
4924 md->start_offset = start_offset;
4925 md->end_subject = md->start_subject + length;
4926 end_subject = md->end_subject;
4927
4928 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4929 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4930 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4931
4932 md->notbol = (options & PCRE_NOTBOL) != 0;
4933 md->noteol = (options & PCRE_NOTEOL) != 0;
4934 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4935 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
/* md->partial is a three-valued setting: 2 for PCRE_PARTIAL_HARD, 1 for
PCRE_PARTIAL_SOFT, 0 when neither is set. HARD wins if both bits are given. */
4936 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
4937 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
4938 md->hitend = FALSE;
4939
4940 md->recursive = NULL; /* No recursion at top level */
4941
4942 md->lcc = tables + lcc_offset;
4943 md->ctypes = tables + ctypes_offset;
4944
4945 /* Handle different \R options. */
4946
4947 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4948 {
4949 case 0:
4950 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4951 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4952 else
4953 #ifdef BSR_ANYCRLF
4954 md->bsr_anycrlf = TRUE;
4955 #else
4956 md->bsr_anycrlf = FALSE;
4957 #endif
4958 break;
4959
4960 case PCRE_BSR_ANYCRLF:
4961 md->bsr_anycrlf = TRUE;
4962 break;
4963
4964 case PCRE_BSR_UNICODE:
4965 md->bsr_anycrlf = FALSE;
4966 break;
4967
4968 default: return PCRE_ERROR_BADNEWLINE;
4969 }
4970
4971 /* Handle different types of newline. The three bits give eight cases. If
4972 nothing is set at run time, whatever was used at compile time applies. */
4973
4974 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4975 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4976 {
4977 case 0: newline = NEWLINE; break; /* Compile-time default */
4978 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4979 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4980 case PCRE_NEWLINE_CR+
4981 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4982 case PCRE_NEWLINE_ANY: newline = -1; break;
4983 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4984 default: return PCRE_ERROR_BADNEWLINE;
4985 }
4986
/* Decode the value chosen above: -2 means ANYCRLF, any other negative value
means ANY; otherwise it is a fixed sequence, either a single byte or two bytes
packed as (first << 8) | second, which is why "> 255" selects the 2-byte case. */
4987 if (newline == -2)
4988 {
4989 md->nltype = NLTYPE_ANYCRLF;
4990 }
4991 else if (newline < 0)
4992 {
4993 md->nltype = NLTYPE_ANY;
4994 }
4995 else
4996 {
4997 md->nltype = NLTYPE_FIXED;
4998 if (newline > 255)
4999 {
5000 md->nllen = 2;
5001 md->nl[0] = (newline >> 8) & 255;
5002 md->nl[1] = newline & 255;
5003 }
5004 else
5005 {
5006 md->nllen = 1;
5007 md->nl[0] = newline;
5008 }
5009 }
5010
5011 /* Partial matching was originally supported only for a restricted set of
5012 regexes; from release 8.00 there are no restrictions, but the bits are still
5013 defined (though never set). So there's no harm in leaving this code. */
5014
5015 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5016 return PCRE_ERROR_BADPARTIAL;
5017
5018 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5019 back the character offset. */
5020
5021 #ifdef SUPPORT_UTF8
5022 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5023 {
5024 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5025 return PCRE_ERROR_BADUTF8;
/* The start offset must not land on a continuation byte: for a byte above 127
the top two bits are either 11 (lead byte, accepted) or 10 (continuation byte,
rejected). Offsets of 0 or >= length are not examined. */
5026 if (start_offset > 0 && start_offset < length)
5027 {
5028 int tb = ((USPTR)subject)[start_offset];
5029 if (tb > 127)
5030 {
5031 tb &= 0xc0;
5032 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5033 }
5034 }
5035 }
5036 #endif
5037
5038 /* The ims options can vary during the matching as a result of the presence
5039 of (?ims) items in the pattern. They are kept in a local variable so that
5040 restoring at the exit of a group is easy. */
5041
5042 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5043
5044 /* If the expression has got more back references than the offsets supplied can
5045 hold, we get a temporary chunk of working store to use during the matching.
5046 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5047 of 3. */
5048
5049 ocount = offsetcount - (offsetcount % 3);
5050
5051 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5052 {
5053 ocount = re->top_backref * 3 + 3;
5054 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5055 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5056 using_temporary_offsets = TRUE;
5057 DPRINTF(("Got memory to hold back references\n"));
5058 }
5059 else md->offset_vector = offsets;
5060
5061 md->offset_end = ocount;
5062 md->offset_max = (2*ocount)/3;
5063 md->offset_overflow = FALSE;
5064 md->capture_last = -1;
5065
5066 /* Compute the minimum number of offsets that we need to reset each time. Doing
5067 this makes a huge difference to execution time when there aren't many brackets
5068 in the pattern. */
5069
5070 resetcount = 2 + re->top_bracket * 2;
/* NOTE(review): the test compares against offsetcount (the caller's size) but
the clamp assigns ocount, which is either offsetcount rounded down to a
multiple of 3 or the temporary vector's size - confirm this asymmetry is
intended. */
5071 if (resetcount > offsetcount) resetcount = ocount;
5072
5073 /* Reset the working variable associated with each extraction. These should
5074 never be used unless previously set, but they get saved and restored, and so we
5075 initialize them to avoid reading uninitialized locations. */
5076
5077 if (md->offset_vector != NULL)
5078 {
/* Walks downwards from the last element of the vector and writes -1 into the
topmost (resetcount/2 - 1) ints. */
5079 register int *iptr = md->offset_vector + ocount;
5080 register int *iend = iptr - resetcount/2 + 1;
5081 while (--iptr >= iend) *iptr = -1;
5082 }
5083
5084 /* Set up the first character to match, if available. The first_byte value is
5085 never set for an anchored regular expression, but the anchoring may be forced
5086 at run time, so we have to test for anchoring. The first char may be unset for
5087 an unanchored pattern, of course. If there's no first char and the pattern was
5088 studied, there may be a bitmap of possible first characters. */
5089
5090 if (!anchored)
5091 {
5092 if ((re->flags & PCRE_FIRSTSET) != 0)
5093 {
5094 first_byte = re->first_byte & 255;
/* For a caseless first byte, keep its md->lcc[] mapping instead of the raw
byte, so the scan in the bumpalong loop can compare md->lcc[*start_match]
against it directly. */
5095 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5096 first_byte = md->lcc[first_byte];
5097 }
5098 else
5099 if (!startline && study != NULL &&
5100 (study->options & PCRE_STUDY_MAPPED) != 0)
5101 start_bits = study->start_bits;
5102 }
5103
5104 /* For anchored or unanchored matches, there may be a "last known required
5105 character" set. */
5106
5107 if ((re->flags & PCRE_REQCHSET) != 0)
5108 {
5109 req_byte = re->req_byte & 255;
5110 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5111 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5112 }
5113
5114
5115 /* ==========================================================================*/
5116
5117 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5118 the loop runs just once. */
5119
5120 for(;;)
5121 {
5122 USPTR save_end_subject = end_subject;
5123 USPTR new_start_match;
5124
5125 /* Reset the maximum number of extractions we might see. */
5126
5127 if (md->offset_vector != NULL)
5128 {
5129 register int *iptr = md->offset_vector;
5130 register int *iend = iptr + resetcount;
5131 while (iptr < iend) *iptr++ = -1;
5132 }
5133
5134 /* If firstline is TRUE, the start of the match is constrained to the first
5135 line of a multiline string. That is, the match must be before or at the first
5136 newline. Implement this by temporarily adjusting end_subject so that we stop
5137 scanning at a newline. If the match fails at the newline, later code breaks
5138 this loop. */
5139
5140 if (firstline)
5141 {
5142 USPTR t = start_match;
5143 #ifdef SUPPORT_UTF8
5144 if (utf8)
5145 {
5146 while (t < md->end_subject && !IS_NEWLINE(t))
5147 {
5148 t++;
5149 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5150 }
5151 }
5152 else
5153 #endif
5154 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5155 end_subject = t;
5156 }
5157
5158 /* There are some optimizations that avoid running the match if a known
5159 starting point is not found, or if a known later character is not present.
5160 However, there is an option that disables these, for testing and for ensuring
5161 that all callouts do actually occur. */
5162
5163 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5164 {
5165 /* Advance to a unique first byte if there is one. */
5166
5167 if (first_byte >= 0)
5168 {
5169 if (first_byte_caseless)
5170 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5171 start_match++;
5172 else
5173 while (start_match < end_subject && *start_match != first_byte)
5174 start_match++;
5175 }
5176
5177 /* Or to just after a linebreak for a multiline match */
5178
5179 else if (startline)
5180 {
5181 if (start_match > md->start_subject + start_offset)
5182 {
5183 #ifdef SUPPORT_UTF8
5184 if (utf8)
5185 {
5186 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5187 {
5188 start_match++;
5189 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5190 start_match++;
5191 }
5192 }
5193 else
5194 #endif
5195 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5196 start_match++;
5197
5198 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5199 and we are now at a LF, advance the match position by one more character.
5200 */
5201
5202 if (start_match[-1] == CHAR_CR &&
5203 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5204 start_match < end_subject &&
5205 *start_match == CHAR_NL)
5206 start_match++;
5207 }
5208 }
5209
5210 /* Or to a non-unique first byte after study */
5211
5212 else if (start_bits != NULL)
5213 {
5214 while (start_match < end_subject)
5215 {
5216 register unsigned int c = *start_match;
5217 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5218 else break;
5219 }
5220 }
5221 } /* Starting optimizations */
5222
5223 /* Restore fudged end_subject */
5224
5225 end_subject = save_end_subject;
5226
5227 #ifdef DEBUG /* Sigh. Some compilers never learn. */
5228 printf(">>>> Match against: ");
5229 pchars(start_match, end_subject - start_match, TRUE, md);
5230 printf("\n");
5231 #endif
5232
5233 /* If req_byte is set, we know that that character must appear in the
5234 subject for the match to succeed. If the first character is set, req_byte
5235 must be later in the subject; otherwise the test starts at the match point.
5236 This optimization can save a huge amount of backtracking in patterns with
5237 nested unlimited repeats that aren't going to match. Writing separate code
5238 for cased/caseless versions makes it go faster, as does using an
5239 autoincrement and backing off on a match.
5240
5241 HOWEVER: when the subject string is very, very long, searching to its end
5242 can take a long time, and give bad performance on quite ordinary patterns.
5243 This showed up when somebody was matching something like /^\d+C/ on a
5244 32-megabyte string... so we don't do this when the string is sufficiently
5245 long.
5246
5247 ALSO: this processing is disabled when partial matching is requested, or if
5248 disabling is explicitly requested. */
5249
5250 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
5251 req_byte >= 0 &&
5252 end_subject - start_match < REQ_BYTE_MAX &&
5253 !md->partial)
5254 {
5255 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
5256
5257 /* We don't need to repeat the search if we haven't yet reached the
5258 place we found it at last time. */
5259
5260 if (p > req_byte_ptr)
5261 {
5262 if (req_byte_caseless)
5263 {
5264 while (p < end_subject)
5265 {
5266 register int pp = *p++;
5267 if (pp == req_byte || pp == req_byte2) { p--; break; }
5268 }
5269 }
5270 else
5271 {
5272 while (p < end_subject)
5273 {
5274 if (*p++ == req_byte) { p--; break; }
5275 }
5276 }
5277
5278 /* If we can't find the required character, break the matching loop,
5279 forcing a match failure. */
5280
5281 if (p >= end_subject)
5282 {
5283 rc = MATCH_NOMATCH;
5284 break;
5285 }
5286
5287 /* If we have found the required character, save the point where we
5288 found it, so that we don't search again next time round the loop if
5289 the start hasn't passed this character yet. */
5290
5291 req_byte_ptr = p;
5292 }
5293 }
5294
5295 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
5296 first starting point for which a partial match was found. */
5297
5298 md->start_match_ptr = start_match;
5299 md->start_used_ptr = start_match;
5300 md->match_call_count = 0;
5301 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
5302 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
5303
5304 switch(rc)
5305 {
5306 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
5307 exactly like PRUNE. */
5308
5309 case MATCH_NOMATCH:
5310 case MATCH_PRUNE:
5311 case MATCH_THEN:
5312 new_start_match = start_match + 1;
5313 #ifdef SUPPORT_UTF8
5314 if (utf8)
5315 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
5316 new_start_match++;
5317 #endif
5318 break;
5319
5320 /* SKIP passes back the next starting point explicitly. */
5321
5322 case MATCH_SKIP:
5323 new_start_match = md->start_match_ptr;
5324 break;
5325
5326 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
5327
5328 case MATCH_COMMIT:
5329 rc = MATCH_NOMATCH;
5330 goto ENDLOOP;
5331
5332 /* Any other return is either a match, or some kind of error. */
5333
5334 default:
5335 goto ENDLOOP;
5336 }
5337
5338 /* Control reaches here for the various types of "no match at this point"
5339 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
5340
5341 rc = MATCH_NOMATCH;
5342
5343 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
5344 newline in the subject (though it may continue over the newline). Therefore,
5345 if we have just failed to match, starting at a newline, do not continue. */
5346
5347 if (firstline && IS_NEWLINE(start_match)) break;
5348
5349 /* Advance to new matching position */
5350
5351 start_match = new_start_match;
5352
5353 /* Break the loop if the pattern is anchored or if we have passed the end of
5354 the subject. */
5355
5356 if (anchored || start_match > end_subject) break;
5357
5358 /* If we have just passed a CR and we are now at a LF, and the pattern does
5359 not contain any explicit matches for \r or \n, and the newline option is CRLF
5360 or ANY or ANYCRLF, advance the match position by one more character. */
5361
5362 if (start_match[-1] == CHAR_CR &&
5363 start_match < end_subject &&
5364 *start_match == CHAR_NL &&
5365 (re->flags & PCRE_HASCRORLF) == 0 &&
5366 (md->nltype == NLTYPE_ANY ||
5367 md->nltype == NLTYPE_ANYCRLF ||
5368 md->nllen == 2))
5369 start_match++;
5370
5371 } /* End of for(;;) "bumpalong" loop */
5372
5373 /* ==========================================================================*/
5374
5375 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
5376 conditions is true:
5377
5378 (1) The pattern is anchored or the match was failed by (*COMMIT);
5379
5380 (2) We are past the end of the subject;
5381
5382 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
5383 this option requests that a match occur at or before the first newline in
5384 the subject.
5385
5386 When we have a match and the offset vector is big enough to deal with any
5387 backreferences, captured substring offsets will already be set up. In the case
5388 where we had to get some local store to hold offsets for backreference
5389 processing, copy those that we can. In this case there need not be overflow if
5390 certain parts of the pattern were not used, even though there are more
5391 capturing parentheses than vector slots. */
5392
5393 ENDLOOP:
5394
5395 if (rc == MATCH_MATCH)
5396 {
5397 if (using_temporary_offsets)
5398 {
/* Elements 0 and 1 are written explicitly further down, so the copy from the
temporary vector starts at index 2 and is skipped when the caller's vector has
fewer than 4 ints. */
5399 if (offsetcount >= 4)
5400 {
5401 memcpy(offsets + 2, md->offset_vector + 2,
5402 (offsetcount - 2) * sizeof(int));
5403 DPRINTF(("Copied offsets from temporary memory\n"));
5404 }
5405 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5406 DPRINTF(("Freeing temporary memory\n"));
5407 (pcre_free)(md->offset_vector);
5408 }
5409
5410 /* Set the return code to the number of captured strings, or 0 if there are
5411 too many to fit into the vector. */
5412
5413 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5414
5415 /* If there is space, set up the whole thing as substring 0. The value of
5416 md->start_match_ptr might be modified if \K was encountered on the success
5417 matching path. */
5418
5419 if (offsetcount < 2) rc = 0; else
5420 {
5421 offsets[0] = md->start_match_ptr - md->start_subject;
5422 offsets[1] = md->end_match_ptr - md->start_subject;
5423 }
5424
5425 DPRINTF((">>>> returning %d\n", rc));
5426 return rc;
5427 }
5428
5429 /* Control gets here if there has been an error, or if the overall match
5430 attempt has failed at all permitted starting positions. */
5431
5432 if (using_temporary_offsets)
5433 {
5434 DPRINTF(("Freeing temporary memory\n"));
5435 (pcre_free)(md->offset_vector);
5436 }
5437
5438 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
5439 {
5440 DPRINTF((">>>> error: returning %d\n", rc));
5441 return rc;
5442 }
5443 else if (start_partial != NULL)
5444 {
5445 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
/* Report the partial-match span when the caller supplied at least two ints:
it runs from the first start point at which md->hitend was seen to the end of
the subject. */
5446 if (offsetcount > 1)
5447 {
5448 offsets[0] = start_partial - (USPTR)subject;
5449 offsets[1] = end_subject - (USPTR)subject;
5450 }
5451 return PCRE_ERROR_PARTIAL;
5452 }
5453 else
5454 {
5455 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5456 return PCRE_ERROR_NOMATCH;
5457 }
5458 }
5459
5460 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12