/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 444 - (show annotations) (download)
Sun Sep 13 16:26:39 2009 UTC (4 years, 11 months ago) by ph10
File MIME type: text/plain
File size: 161641 byte(s)
Fix comment in code.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function. They are passed in its "flags"
argument. */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. Each one is returned by the
opcode case of the same name in match() when the remainder of the pattern has
failed to match. The bracket and assertion alternation loops treat MATCH_THEN
like MATCH_NOMATCH and move on to the next alternative; the other three are
passed straight back to the caller. */
73
74 #define MATCH_COMMIT (-999) /* Returned by the OP_COMMIT case */
75 #define MATCH_PRUNE (-998) /* Returned by the OP_PRUNE case */
76 #define MATCH_SKIP (-997) /* Returned by OP_SKIP, after md->start_match_ptr is set to eptr */
77 #define MATCH_THEN (-996) /* Returned by the OP_THEN case */
78
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. The value is also the
dimension of the stacksave[] array (Xstacksave[] in a heap frame). */
82
83 #define REC_STACK_SAVE_MAX 30
84
/* Minimum and maximum repeat counts for the common repeats, held as two
parallel six-entry tables. In rep_max, a value of 0 stands for "no upper
limit". NOTE(review): the index is presumably the repeat opcode's offset from
the first opcode of the group - confirm against the code that reads the
tables. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if the requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 {
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 }
179 }
180 else
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. */
186
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
/* Identifiers for the individual RMATCH call sites: consecutive integers
starting at 1. The heap-based RMATCH stores one of these in the calling frame
and also pastes its name into a local label (L_RMn) that marks the point to
resume at. When this list is changed, the code at HEAP_RETURN must be updated
in sync. */

enum {
  RM1 = 1,
  RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18,
  RM19, RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27,
  RM28, RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36,
  RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45,
  RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54
};
249
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
253
254 #ifndef NO_RECURSE
/* Stack-recursion build. REGISTER expands to the "register" keyword. RMATCH
makes a genuine recursive call of match(), passing on the enclosing function's
mstart and rdepth+1, and leaves the result in rrc; its final argument (the
call-site identifier) is not referenced. RRETURN is an ordinary return. The
DEBUG variants do the same, with tracing output on stdout. */

#define REGISTER register

#ifdef DEBUG

#define RMATCH(r_eptr,r_ecode,r_offtop,r_md,r_ims,r_eptrb,r_flags,r_where) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(r_eptr,r_ecode,mstart,r_offtop,r_md,r_ims,r_eptrb,r_flags, \
    rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

#define RRETURN(r_value) \
  { \
  printf("match() returned %d from line %d ", r_value, __LINE__); \
  return r_value; \
  }

#else  /* DEBUG not defined */

#define RMATCH(r_eptr,r_ecode,r_offtop,r_md,r_ims,r_eptrb,r_flags,r_where) \
  rrc = match(r_eptr,r_ecode,mstart,r_offtop,r_md,r_ims,r_eptrb,r_flags, \
    rdepth+1)

#define RRETURN(r_value) return r_value

#endif  /* DEBUG */
274
275 #else
276
277
/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes.

RMATCH obtains a new frame from pcre_stack_malloc, records the call-site
identifier (rw) in the *calling* frame so that HEAP_RETURN can find its way
back, copies the call arguments into the new frame, chains it to the caller,
and jumps to HEAP_RECURSE. The label L_<rw> marks the point where execution
resumes after the simulated call, with the result in rrc.

If the allocation fails, no frame is touched: the macro gives up via
RRETURN(PCRE_ERROR_NOMEMORY), which releases the current frame and hands the
(negative) error code back to the caller like any other error return. */

#define REGISTER

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
303
/* Heap version of "return". The current frame is unlinked and handed back to
pcre_stack_free. If that was the top-level frame (no previous frame), this is
a real return from match(); otherwise the value is placed in rrc and control
goes to HEAP_RETURN to resume the calling frame. */

#define RRETURN(rvalue)\
  {\
  heapframe *finished = frame;\
  frame = finished->Xprevframe;\
  (pcre_stack_free)(finished);\
  if (frame == NULL) return rvalue;\
  rrc = rvalue;\
  goto HEAP_RETURN;\
  }
316
317
318 /* Structure for remembering the local variables in a private frame */
319
320 typedef struct heapframe {
321 struct heapframe *Xprevframe;
322
323 /* Function arguments that may change */
324
325 USPTR Xeptr;
326 const uschar *Xecode;
327 USPTR Xmstart;
328 int Xoffset_top;
329 long int Xims;
330 eptrblock *Xeptrb;
331 int Xflags;
332 unsigned int Xrdepth;
333
334 /* Function local variables */
335
336 USPTR Xcallpat;
337 #ifdef SUPPORT_UTF8
338 USPTR Xcharptr;
339 #endif
340 USPTR Xdata;
341 USPTR Xnext;
342 USPTR Xpp;
343 USPTR Xprev;
344 USPTR Xsaved_eptr;
345
346 recursion_info Xnew_recursive;
347
348 BOOL Xcur_is_word;
349 BOOL Xcondition;
350 BOOL Xprev_is_word;
351
352 unsigned long int Xoriginal_ims;
353
354 #ifdef SUPPORT_UCP
355 int Xprop_type;
356 int Xprop_value;
357 int Xprop_fail_result;
358 int Xprop_category;
359 int Xprop_chartype;
360 int Xprop_script;
361 int Xoclength;
362 uschar Xocchars[8];
363 #endif
364
365 int Xcodelink;
366 int Xctype;
367 unsigned int Xfc;
368 int Xfi;
369 int Xlength;
370 int Xmax;
371 int Xmin;
372 int Xnumber;
373 int Xoffset;
374 int Xop;
375 int Xsave_capture_last;
376 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
377 int Xstacksave[REC_STACK_SAVE_MAX];
378
379 eptrblock Xnewptrb;
380
381 /* Where to jump back to */
382
383 int Xwhere;
384
385 } heapframe;
386
387 #endif
388
389
390 /***************************************************************************
391 ***************************************************************************/
392
393
394
395 /*************************************************
396 * Match from current position *
397 *************************************************/
398
399 /* This function is called recursively in many circumstances. Whenever it
400 returns a negative (error) response, the outer incarnation must also return the
401 same response. */
402
/* These macros pack up the tests that are used for partial matching, and
which appear several times in the code. When partial matching is enabled and
the subject pointer is past the match start position (i.e. something has been
matched), the "hit end" flag is set; if md->partial is greater than 1 (hard
partial matching), match() then returns PCRE_ERROR_PARTIAL immediately.
CHECK_PARTIAL also tests that the pointer has reached the end of the subject;
SCHECK_PARTIAL omits that test, for use where it is already known to be
true. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr > mstart && eptr >= md->end_subject)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > mstart)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
    }
423
424
425 /* Performance note: It might be tempting to extract commonly used fields from
426 the md structure (e.g. utf8, end_subject) into individual variables to improve
427 performance. Tests using gcc on a SPARC disproved this; in the first case, it
428 made performance worse.
429
430 Arguments:
431 eptr pointer to current character in subject
432 ecode pointer to current position in compiled code
433 mstart pointer to the current match start position (can be modified
434 by encountering \K)
435 offset_top current top pointer
436 md pointer to "static" info for the match
437 ims current /i, /m, and /s options
438 eptrb pointer to chain of blocks containing eptr at start of
439 brackets - for testing for empty matches
440 flags can contain
441 match_condassert - this is an assertion condition
442 match_cbegroup - this is the start of an unlimited repeat
443 group that can match an empty string
444 rdepth the recursion depth
445
446 Returns: MATCH_MATCH if matched ) these values are >= 0
447 MATCH_NOMATCH if failed to match )
448 a negative PCRE_ERROR_xxx value if aborted by an error condition
449 (e.g. stopped by repeated call or recursion limit)
450 */
451
452 static int
453 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
454 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
455 int flags, unsigned int rdepth)
456 {
457 /* These variables do not need to be preserved over recursion in this function,
458 so they can be ordinary variables in all cases. Mark some of them with
459 "register" because they are used a lot in loops. */
460
461 register int rrc; /* Returns from recursive calls */
462 register int i; /* Used for loops not involving calls to RMATCH() */
463 register unsigned int c; /* Character values not kept over RMATCH() calls */
464 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
465
466 BOOL minimize, possessive; /* Quantifier options */
467 int condcode;
468
469 /* When recursion is not being used, all "local" variables that have to be
470 preserved over calls to RMATCH() are part of a "frame" which is obtained from
471 heap storage. Set up the top-level frame here; others are obtained from the
472 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
473
474 #ifdef NO_RECURSE
475 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
476 frame->Xprevframe = NULL; /* Marks the top level */
477
478 /* Copy in the original argument variables */
479
480 frame->Xeptr = eptr;
481 frame->Xecode = ecode;
482 frame->Xmstart = mstart;
483 frame->Xoffset_top = offset_top;
484 frame->Xims = ims;
485 frame->Xeptrb = eptrb;
486 frame->Xflags = flags;
487 frame->Xrdepth = rdepth;
488
489 /* This is where control jumps back to to effect "recursion" */
490
491 HEAP_RECURSE:
492
493 /* Macros make the argument variables come from the current frame */
494
495 #define eptr frame->Xeptr
496 #define ecode frame->Xecode
497 #define mstart frame->Xmstart
498 #define offset_top frame->Xoffset_top
499 #define ims frame->Xims
500 #define eptrb frame->Xeptrb
501 #define flags frame->Xflags
502 #define rdepth frame->Xrdepth
503
504 /* Ditto for the local variables */
505
506 #ifdef SUPPORT_UTF8
507 #define charptr frame->Xcharptr
508 #endif
509 #define callpat frame->Xcallpat
510 #define codelink frame->Xcodelink
511 #define data frame->Xdata
512 #define next frame->Xnext
513 #define pp frame->Xpp
514 #define prev frame->Xprev
515 #define saved_eptr frame->Xsaved_eptr
516
517 #define new_recursive frame->Xnew_recursive
518
519 #define cur_is_word frame->Xcur_is_word
520 #define condition frame->Xcondition
521 #define prev_is_word frame->Xprev_is_word
522
523 #define original_ims frame->Xoriginal_ims
524
525 #ifdef SUPPORT_UCP
526 #define prop_type frame->Xprop_type
527 #define prop_value frame->Xprop_value
528 #define prop_fail_result frame->Xprop_fail_result
529 #define prop_category frame->Xprop_category
530 #define prop_chartype frame->Xprop_chartype
531 #define prop_script frame->Xprop_script
532 #define oclength frame->Xoclength
533 #define occhars frame->Xocchars
534 #endif
535
536 #define ctype frame->Xctype
537 #define fc frame->Xfc
538 #define fi frame->Xfi
539 #define length frame->Xlength
540 #define max frame->Xmax
541 #define min frame->Xmin
542 #define number frame->Xnumber
543 #define offset frame->Xoffset
544 #define op frame->Xop
545 #define save_capture_last frame->Xsave_capture_last
546 #define save_offset1 frame->Xsave_offset1
547 #define save_offset2 frame->Xsave_offset2
548 #define save_offset3 frame->Xsave_offset3
549 #define stacksave frame->Xstacksave
550
551 #define newptrb frame->Xnewptrb
552
553 /* When recursion is being used, local variables are allocated on the stack and
554 get preserved during recursion in the normal way. In this environment, fi and
555 i, and fc and c, can be the same variables. */
556
557 #else /* NO_RECURSE not defined */
558 #define fi i
559 #define fc c
560
561
562 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
563 const uschar *charptr; /* in small blocks of the code. My normal */
564 #endif /* style of coding would have declared */
565 const uschar *callpat; /* them within each of those blocks. */
566 const uschar *data; /* However, in order to accommodate the */
567 const uschar *next; /* version of this code that uses an */
568 USPTR pp; /* external "stack" implemented on the */
569 const uschar *prev; /* heap, it is easier to declare them all */
570 USPTR saved_eptr; /* here, so the declarations can be cut */
571 /* out in a block. The only declarations */
572 recursion_info new_recursive; /* within blocks below are for variables */
573 /* that do not have to be preserved over */
574 BOOL cur_is_word; /* a recursive call to RMATCH(). */
575 BOOL condition;
576 BOOL prev_is_word;
577
578 unsigned long int original_ims;
579
580 #ifdef SUPPORT_UCP
581 int prop_type;
582 int prop_value;
583 int prop_fail_result;
584 int prop_category;
585 int prop_chartype;
586 int prop_script;
587 int oclength;
588 uschar occhars[8];
589 #endif
590
591 int codelink;
592 int ctype;
593 int length;
594 int max;
595 int min;
596 int number;
597 int offset;
598 int op;
599 int save_capture_last;
600 int save_offset1, save_offset2, save_offset3;
601 int stacksave[REC_STACK_SAVE_MAX];
602
603 eptrblock newptrb;
604 #endif /* NO_RECURSE */
605
606 /* These statements are here to stop the compiler complaining about uninitialized
607 variables. */
608
609 #ifdef SUPPORT_UCP
610 prop_value = 0;
611 prop_fail_result = 0;
612 #endif
613
614
615 /* This label is used for tail recursion, which is used in a few cases even
616 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
617 used. Thanks to Ian Taylor for noticing this possibility and sending the
618 original patch. */
619
620 TAIL_RECURSE:
621
622 /* OK, now we can get on with the real code of the function. Recursive calls
623 are specified by the macro RMATCH and RRETURN is used to return. When
624 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
625 and a "return", respectively (possibly with some debugging if DEBUG is
626 defined). However, RMATCH isn't like a function call because it's quite a
627 complicated macro. It has to be used in one particular way. This shouldn't,
628 however, impact performance when true recursion is being used. */
629
630 #ifdef SUPPORT_UTF8
631 utf8 = md->utf8; /* Local copy of the flag */
632 #else
633 utf8 = FALSE;
634 #endif
635
636 /* First check that we haven't called match() too many times, or that we
637 haven't exceeded the recursive call limit. */
638
639 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
640 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
641
642 original_ims = ims; /* Save for resetting on ')' */
643
644 /* At the start of a group with an unlimited repeat that may match an empty
645 string, the match_cbegroup flag is set. When this is the case, add the current
646 subject pointer to the chain of such remembered pointers, to be checked when we
647 hit the closing ket, in order to break infinite loops that match no characters.
648 When match() is called in other circumstances, don't add to the chain. The
649 match_cbegroup flag must NOT be used with tail recursion, because the memory
650 block that is used is on the stack, so a new one may be required for each
651 match(). */
652
653 if ((flags & match_cbegroup) != 0)
654 {
655 newptrb.epb_saved_eptr = eptr;
656 newptrb.epb_prev = eptrb;
657 eptrb = &newptrb;
658 }
659
660 /* Now start processing the opcodes. */
661
662 for (;;)
663 {
664 minimize = possessive = FALSE;
665 op = *ecode;
666
667 switch(op)
668 {
669 case OP_FAIL:
670 RRETURN(MATCH_NOMATCH);
671
672 case OP_PRUNE:
673 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
674 ims, eptrb, flags, RM51);
675 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
676 RRETURN(MATCH_PRUNE);
677
678 case OP_COMMIT:
679 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
680 ims, eptrb, flags, RM52);
681 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
682 RRETURN(MATCH_COMMIT);
683
684 case OP_SKIP:
685 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
686 ims, eptrb, flags, RM53);
687 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
688 md->start_match_ptr = eptr; /* Pass back current position */
689 RRETURN(MATCH_SKIP);
690
691 case OP_THEN:
692 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
693 ims, eptrb, flags, RM54);
694 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
695 RRETURN(MATCH_THEN);
696
697 /* Handle a capturing bracket. If there is space in the offset vector, save
698 the current subject position in the working slot at the top of the vector.
699 We mustn't change the current values of the data slot, because they may be
700 set from a previous iteration of this group, and be referred to by a
701 reference inside the group.
702
703 If the bracket fails to match, we need to restore this value and also the
704 values of the final offsets, in case they were set by a previous iteration
705 of the same bracket.
706
707 If there isn't enough space in the offset vector, treat this as if it were
708 a non-capturing bracket. Don't worry about setting the flag for the error
709 case here; that is handled in the code for KET. */
710
711 case OP_CBRA:
712 case OP_SCBRA:
713 number = GET2(ecode, 1+LINK_SIZE);
714 offset = number << 1;
715
716 #ifdef DEBUG
717 printf("start bracket %d\n", number);
718 printf("subject=");
719 pchars(eptr, 16, TRUE, md);
720 printf("\n");
721 #endif
722
723 if (offset < md->offset_max)
724 {
725 save_offset1 = md->offset_vector[offset];
726 save_offset2 = md->offset_vector[offset+1];
727 save_offset3 = md->offset_vector[md->offset_end - number];
728 save_capture_last = md->capture_last;
729
730 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
731 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
732
733 flags = (op == OP_SCBRA)? match_cbegroup : 0;
734 do
735 {
736 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
737 ims, eptrb, flags, RM1);
738 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
739 md->capture_last = save_capture_last;
740 ecode += GET(ecode, 1);
741 }
742 while (*ecode == OP_ALT);
743
744 DPRINTF(("bracket %d failed\n", number));
745
746 md->offset_vector[offset] = save_offset1;
747 md->offset_vector[offset+1] = save_offset2;
748 md->offset_vector[md->offset_end - number] = save_offset3;
749
750 RRETURN(MATCH_NOMATCH);
751 }
752
753 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
754 as a non-capturing bracket. */
755
756 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
757 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
758
759 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
760
761 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
762 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
763
764 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
765 final alternative within the brackets, we would return the result of a
766 recursive call to match() whatever happened. We can reduce stack usage by
767 turning this into a tail recursion, except in the case when match_cbegroup
768 is set.*/
769
770 case OP_BRA:
771 case OP_SBRA:
772 DPRINTF(("start non-capturing bracket\n"));
773 flags = (op >= OP_SBRA)? match_cbegroup : 0;
774 for (;;)
775 {
776 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
777 {
778 if (flags == 0) /* Not a possibly empty group */
779 {
780 ecode += _pcre_OP_lengths[*ecode];
781 DPRINTF(("bracket 0 tail recursion\n"));
782 goto TAIL_RECURSE;
783 }
784
785 /* Possibly empty group; can't use tail recursion. */
786
787 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
788 eptrb, flags, RM48);
789 RRETURN(rrc);
790 }
791
792 /* For non-final alternatives, continue the loop for a NOMATCH result;
793 otherwise return. */
794
795 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
796 eptrb, flags, RM2);
797 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
798 ecode += GET(ecode, 1);
799 }
800 /* Control never reaches here. */
801
802 /* Conditional group: compilation checked that there are no more than
803 two branches. If the condition is false, skipping the first branch takes us
804 past the end if there is only one branch, but that's OK because that is
805 exactly what going to the ket would do. As there is only one branch to be
806 obeyed, we can use tail recursion to avoid using another stack frame. */
807
808 case OP_COND:
809 case OP_SCOND:
810 codelink= GET(ecode, 1);
811
812 /* Because of the way auto-callout works during compile, a callout item is
813 inserted between OP_COND and an assertion condition. */
814
815 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
816 {
817 if (pcre_callout != NULL)
818 {
819 pcre_callout_block cb;
820 cb.version = 1; /* Version 1 of the callout block */
821 cb.callout_number = ecode[LINK_SIZE+2];
822 cb.offset_vector = md->offset_vector;
823 cb.subject = (PCRE_SPTR)md->start_subject;
824 cb.subject_length = md->end_subject - md->start_subject;
825 cb.start_match = mstart - md->start_subject;
826 cb.current_position = eptr - md->start_subject;
827 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
828 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
829 cb.capture_top = offset_top/2;
830 cb.capture_last = md->capture_last;
831 cb.callout_data = md->callout_data;
832 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
833 if (rrc < 0) RRETURN(rrc);
834 }
835 ecode += _pcre_OP_lengths[OP_CALLOUT];
836 }
837
838 condcode = ecode[LINK_SIZE+1];
839
840 /* Now see what the actual condition is */
841
842 if (condcode == OP_RREF) /* Recursion test */
843 {
844 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
845 condition = md->recursive != NULL &&
846 (offset == RREF_ANY || offset == md->recursive->group_num);
847 ecode += condition? 3 : GET(ecode, 1);
848 }
849
850 else if (condcode == OP_CREF) /* Group used test */
851 {
852 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
853 condition = offset < offset_top && md->offset_vector[offset] >= 0;
854 ecode += condition? 3 : GET(ecode, 1);
855 }
856
857 else if (condcode == OP_DEF) /* DEFINE - always false */
858 {
859 condition = FALSE;
860 ecode += GET(ecode, 1);
861 }
862
863 /* The condition is an assertion. Call match() to evaluate it - setting
864 the final argument match_condassert causes it to stop at the end of an
865 assertion. */
866
867 else
868 {
869 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
870 match_condassert, RM3);
871 if (rrc == MATCH_MATCH)
872 {
873 condition = TRUE;
874 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
875 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
876 }
877 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
878 {
879 RRETURN(rrc); /* Need braces because of following else */
880 }
881 else
882 {
883 condition = FALSE;
884 ecode += codelink;
885 }
886 }
887
888 /* We are now at the branch that is to be obeyed. As there is only one,
889 we can use tail recursion to avoid using another stack frame, except when
890 match_cbegroup is required for an unlimited repeat of a possibly empty
891 group. If the second alternative doesn't exist, we can just plough on. */
892
893 if (condition || *ecode == OP_ALT)
894 {
895 ecode += 1 + LINK_SIZE;
896 if (op == OP_SCOND) /* Possibly empty group */
897 {
898 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
899 RRETURN(rrc);
900 }
901 else /* Group must match something */
902 {
903 flags = 0;
904 goto TAIL_RECURSE;
905 }
906 }
907 else /* Condition false & no alternative */
908 {
909 ecode += 1 + LINK_SIZE;
910 }
911 break;
912
913
914 /* End of the pattern, either real or forced. If we are in a top-level
915 recursion, we should restore the offsets appropriately and continue from
916 after the call. */
917
918 case OP_ACCEPT:
919 case OP_END:
920 if (md->recursive != NULL && md->recursive->group_num == 0)
921 {
922 recursion_info *rec = md->recursive;
923 DPRINTF(("End of pattern in a (?0) recursion\n"));
924 md->recursive = rec->prevrec;
925 memmove(md->offset_vector, rec->offset_save,
926 rec->saved_max * sizeof(int));
927 mstart = rec->save_start;
928 ims = original_ims;
929 ecode = rec->after_call;
930 break;
931 }
932
933 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
934 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
935 the subject. In both cases, backtracking will then try other alternatives,
936 if any. */
937
938 if (eptr == mstart &&
939 (md->notempty ||
940 (md->notempty_atstart &&
941 mstart == md->start_subject + md->start_offset)))
942 RRETURN(MATCH_NOMATCH);
943
944 /* Otherwise, we have a match. */
945
946 md->end_match_ptr = eptr; /* Record where we ended */
947 md->end_offset_top = offset_top; /* and how many extracts were taken */
948 md->start_match_ptr = mstart; /* and the start (\K can modify) */
949 RRETURN(MATCH_MATCH);
950
951 /* Change option settings */
952
953 case OP_OPT:
954 ims = ecode[1];
955 ecode += 2;
956 DPRINTF(("ims set to %02lx\n", ims));
957 break;
958
959 /* Assertion brackets. Check the alternative branches in turn - the
960 matching won't pass the KET for an assertion. If any one branch matches,
961 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
962 start of each branch to move the current point backwards, so the code at
963 this level is identical to the lookahead case. */
964
965 case OP_ASSERT:
966 case OP_ASSERTBACK:
967 do
968 {
969 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
970 RM4);
971 if (rrc == MATCH_MATCH) break;
972 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
973 ecode += GET(ecode, 1);
974 }
975 while (*ecode == OP_ALT);
976 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
977
978 /* If checking an assertion for a condition, return MATCH_MATCH. */
979
980 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
981
982 /* Continue from after the assertion, updating the offsets high water
983 mark, since extracts may have been taken during the assertion. */
984
985 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
986 ecode += 1 + LINK_SIZE;
987 offset_top = md->end_offset_top;
988 continue;
989
990 /* Negative assertion: all branches must fail to match */
991
992 case OP_ASSERT_NOT:
993 case OP_ASSERTBACK_NOT:
994 do
995 {
996 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
997 RM5);
998 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
999 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1000 ecode += GET(ecode,1);
1001 }
1002 while (*ecode == OP_ALT);
1003
1004 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1005
1006 ecode += 1 + LINK_SIZE;
1007 continue;
1008
1009 /* Move the subject pointer back. This occurs only at the start of
1010 each branch of a lookbehind assertion. If we are too close to the start to
1011 move back, this match function fails. When working with UTF-8 we move
1012 back a number of characters, not bytes. */
1013
1014 case OP_REVERSE:
1015 #ifdef SUPPORT_UTF8
1016 if (utf8)
1017 {
1018 i = GET(ecode, 1);
1019 while (i-- > 0)
1020 {
1021 eptr--;
1022 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1023 BACKCHAR(eptr);
1024 }
1025 }
1026 else
1027 #endif
1028
1029 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1030
1031 {
1032 eptr -= GET(ecode, 1);
1033 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1034 }
1035
1036 /* Save the earliest consulted character, then skip to next op code */
1037
1038 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1039 ecode += 1 + LINK_SIZE;
1040 break;
1041
1042 /* The callout item calls an external function, if one is provided, passing
1043 details of the match so far. This is mainly for debugging, though the
1044 function is able to force a failure. */
1045
1046 case OP_CALLOUT:
1047 if (pcre_callout != NULL)
1048 {
1049 pcre_callout_block cb;
1050 cb.version = 1; /* Version 1 of the callout block */
1051 cb.callout_number = ecode[1];
1052 cb.offset_vector = md->offset_vector;
1053 cb.subject = (PCRE_SPTR)md->start_subject;
1054 cb.subject_length = md->end_subject - md->start_subject;
1055 cb.start_match = mstart - md->start_subject;
1056 cb.current_position = eptr - md->start_subject;
1057 cb.pattern_position = GET(ecode, 2);
1058 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1059 cb.capture_top = offset_top/2;
1060 cb.capture_last = md->capture_last;
1061 cb.callout_data = md->callout_data;
1062 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1063 if (rrc < 0) RRETURN(rrc);
1064 }
1065 ecode += 2 + 2*LINK_SIZE;
1066 break;
1067
1068 /* Recursion either matches the current regex, or some subexpression. The
1069 offset data is the offset to the starting bracket from the start of the
1070 whole pattern. (This is so that it works from duplicated subpatterns.)
1071
1072 If there are any capturing brackets started but not finished, we have to
1073 save their starting points and reinstate them after the recursion. However,
1074 we don't know how many such there are (offset_top records the completed
1075 total) so we just have to save all the potential data. There may be up to
1076 65535 such values, which is too large to put on the stack, but using malloc
1077 for small numbers seems expensive. As a compromise, the stack is used when
1078 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1079 is used. If the malloc fails, the offsets cannot be saved, so the recursion
1080 is not attempted and this call of match() returns PCRE_ERROR_NOMEMORY
1081 straight away.
1082
1083 There are also other values that have to be saved. We use a chained
1084 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1085 for the original version of this logic. */
1086
1087 case OP_RECURSE:
1088 {
1089 callpat = md->start_code + GET(ecode, 1);
1090 new_recursive.group_num = (callpat == md->start_code)? 0 :
1091 GET2(callpat, 1 + LINK_SIZE);
1092
1093 /* Add to "recursing stack" */
1094
1095 new_recursive.prevrec = md->recursive;
1096 md->recursive = &new_recursive;
1097
1098 /* Find where to continue from afterwards */
1099
1100 ecode += 1 + LINK_SIZE;
1101 new_recursive.after_call = ecode;
1102
1103 /* Now save the offset data. */
1104
1105 new_recursive.saved_max = md->offset_end;
1106 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1107 new_recursive.offset_save = stacksave;
1108 else
1109 {
1110 new_recursive.offset_save =
1111 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1112 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1113 }
1114
1115 memcpy(new_recursive.offset_save, md->offset_vector,
1116 new_recursive.saved_max * sizeof(int));
1117 new_recursive.save_start = mstart;
1118 mstart = eptr;
1119
1120 /* OK, now we can do the recursion. For each top-level alternative we
1121 restore the offset and recursion data. */
1122
1123 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1124 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1125 do
1126 {
1127 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1128 md, ims, eptrb, flags, RM6);
1129 if (rrc == MATCH_MATCH)
1130 {
1131 DPRINTF(("Recursion matched\n"));
1132 md->recursive = new_recursive.prevrec;
1133 if (new_recursive.offset_save != stacksave)
1134 (pcre_free)(new_recursive.offset_save);
1135 RRETURN(MATCH_MATCH);
1136 }
1137 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1138 {
1139 DPRINTF(("Recursion gave error %d\n", rrc));
1140 if (new_recursive.offset_save != stacksave)
1141 (pcre_free)(new_recursive.offset_save);
1142 RRETURN(rrc);
1143 }
1144
1145 md->recursive = &new_recursive;
1146 memcpy(md->offset_vector, new_recursive.offset_save,
1147 new_recursive.saved_max * sizeof(int));
1148 callpat += GET(callpat, 1);
1149 }
1150 while (*callpat == OP_ALT);
1151
1152 DPRINTF(("Recursion didn't match\n"));
1153 md->recursive = new_recursive.prevrec;
1154 if (new_recursive.offset_save != stacksave)
1155 (pcre_free)(new_recursive.offset_save);
1156 RRETURN(MATCH_NOMATCH);
1157 }
1158 /* Control never reaches here */
1159
1160 /* "Once" brackets are like assertion brackets except that after a match,
1161 the point in the subject string is not moved back. Thus there can never be
1162 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1163 Check the alternative branches in turn - the matching won't pass the KET
1164 for this kind of subpattern. If any one branch matches, we carry on as at
1165 the end of a normal bracket, leaving the subject pointer. */
1166
1167 case OP_ONCE:
1168 prev = ecode;
1169 saved_eptr = eptr;
1170
1171 do
1172 {
1173 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1174 if (rrc == MATCH_MATCH) break;
1175 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1176 ecode += GET(ecode,1);
1177 }
1178 while (*ecode == OP_ALT);
1179
1180 /* If we hit the end of the group (which could be repeated), fail */
1181
1182 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1183
1184 /* Continue as from after the assertion, updating the offsets high water
1185 mark, since extracts may have been taken. */
1186
1187 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1188
1189 offset_top = md->end_offset_top;
1190 eptr = md->end_match_ptr;
1191
1192 /* For a non-repeating ket, just continue at this level. This also
1193 happens for a repeating ket if no characters were matched in the group.
1194 This is the forcible breaking of infinite loops as implemented in Perl
1195 5.005. If there is an options reset, it will get obeyed in the normal
1196 course of events. */
1197
1198 if (*ecode == OP_KET || eptr == saved_eptr)
1199 {
1200 ecode += 1+LINK_SIZE;
1201 break;
1202 }
1203
1204 /* The repeating kets try the rest of the pattern or restart from the
1205 preceding bracket, in the appropriate order. The second "call" of match()
1206 uses tail recursion, to avoid using another stack frame. We need to reset
1207 any options that changed within the bracket before re-running it, so
1208 check the next opcode. */
1209
1210 if (ecode[1+LINK_SIZE] == OP_OPT)
1211 {
1212 ims = (ims & ~PCRE_IMS) | ecode[4];
1213 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1214 }
1215
1216 if (*ecode == OP_KETRMIN)
1217 {
1218 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1219 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1220 ecode = prev;
1221 flags = 0;
1222 goto TAIL_RECURSE;
1223 }
1224 else /* OP_KETRMAX */
1225 {
1226 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1227 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1228 ecode += 1 + LINK_SIZE;
1229 flags = 0;
1230 goto TAIL_RECURSE;
1231 }
1232 /* Control never gets here */
1233
1234 /* An alternation is the end of a branch; scan along to find the end of the
1235 bracketed group and go to there. */
1236
1237 case OP_ALT:
1238 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1239 break;
1240
1241 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1242 indicating that it may occur zero times. It may repeat infinitely, or not
1243 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1244 with fixed upper repeat limits are compiled as a number of copies, with the
1245 optional ones preceded by BRAZERO or BRAMINZERO. */
1246
1247 case OP_BRAZERO:
1248 {
1249 next = ecode+1;
1250 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1251 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1252 do next += GET(next,1); while (*next == OP_ALT);
1253 ecode = next + 1 + LINK_SIZE;
1254 }
1255 break;
1256
1257 case OP_BRAMINZERO:
1258 {
1259 next = ecode+1;
1260 do next += GET(next, 1); while (*next == OP_ALT);
1261 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1262 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1263 ecode++;
1264 }
1265 break;
1266
1267 case OP_SKIPZERO:
1268 {
1269 next = ecode+1;
1270 do next += GET(next,1); while (*next == OP_ALT);
1271 ecode = next + 1 + LINK_SIZE;
1272 }
1273 break;
1274
1275 /* End of a group, repeated or non-repeating. */
1276
1277 case OP_KET:
1278 case OP_KETRMIN:
1279 case OP_KETRMAX:
1280 prev = ecode - GET(ecode, 1);
1281
1282 /* If this was a group that remembered the subject start, in order to break
1283 infinite repeats of empty string matches, retrieve the subject start from
1284 the chain. Otherwise, set it NULL. */
1285
1286 if (*prev >= OP_SBRA)
1287 {
1288 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1289 eptrb = eptrb->epb_prev; /* Backup to previous group */
1290 }
1291 else saved_eptr = NULL;
1292
1293 /* If we are at the end of an assertion group, stop matching and return
1294 MATCH_MATCH, but record the current high water mark for use by positive
1295 assertions. Do this also for the "once" (atomic) groups. */
1296
1297 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1298 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1299 *prev == OP_ONCE)
1300 {
1301 md->end_match_ptr = eptr; /* For ONCE */
1302 md->end_offset_top = offset_top;
1303 RRETURN(MATCH_MATCH);
1304 }
1305
1306 /* For capturing groups we have to check the group number back at the start
1307 and if necessary complete handling an extraction by setting the offsets and
1308 bumping the high water mark. Note that whole-pattern recursion is coded as
1309 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1310 when the OP_END is reached. Other recursion is handled here. */
1311
1312 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1313 {
1314 number = GET2(prev, 1+LINK_SIZE);
1315 offset = number << 1;
1316
1317 #ifdef DEBUG
1318 printf("end bracket %d", number);
1319 printf("\n");
1320 #endif
1321
1322 md->capture_last = number;
1323 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1324 {
1325 md->offset_vector[offset] =
1326 md->offset_vector[md->offset_end - number];
1327 md->offset_vector[offset+1] = eptr - md->start_subject;
1328 if (offset_top <= offset) offset_top = offset + 2;
1329 }
1330
1331 /* Handle a recursively called group. Restore the offsets
1332 appropriately and continue from after the call. */
1333
1334 if (md->recursive != NULL && md->recursive->group_num == number)
1335 {
1336 recursion_info *rec = md->recursive;
1337 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1338 md->recursive = rec->prevrec;
1339 mstart = rec->save_start;
1340 memcpy(md->offset_vector, rec->offset_save,
1341 rec->saved_max * sizeof(int));
1342 ecode = rec->after_call;
1343 ims = original_ims;
1344 break;
1345 }
1346 }
1347
1348 /* For both capturing and non-capturing groups, reset the value of the ims
1349 flags, in case they got changed during the group. */
1350
1351 ims = original_ims;
1352 DPRINTF(("ims reset to %02lx\n", ims));
1353
1354 /* For a non-repeating ket, just continue at this level. This also
1355 happens for a repeating ket if no characters were matched in the group.
1356 This is the forcible breaking of infinite loops as implemented in Perl
1357 5.005. If there is an options reset, it will get obeyed in the normal
1358 course of events. */
1359
1360 if (*ecode == OP_KET || eptr == saved_eptr)
1361 {
1362 ecode += 1 + LINK_SIZE;
1363 break;
1364 }
1365
1366 /* The repeating kets try the rest of the pattern or restart from the
1367 preceding bracket, in the appropriate order. In the second case, we can use
1368 tail recursion to avoid using another stack frame, unless we have an
1369 unlimited repeat of a group that can match an empty string. */
1370
1371 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1372
1373 if (*ecode == OP_KETRMIN)
1374 {
1375 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1376 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1377 if (flags != 0) /* Could match an empty string */
1378 {
1379 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1380 RRETURN(rrc);
1381 }
1382 ecode = prev;
1383 goto TAIL_RECURSE;
1384 }
1385 else /* OP_KETRMAX */
1386 {
1387 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1388 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1389 ecode += 1 + LINK_SIZE;
1390 flags = 0;
1391 goto TAIL_RECURSE;
1392 }
1393 /* Control never gets here */
1394
1395 /* Start of subject unless notbol, or after internal newline if multiline */
1396
1397 case OP_CIRC:
1398 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1399 if ((ims & PCRE_MULTILINE) != 0)
1400 {
1401 if (eptr != md->start_subject &&
1402 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1403 RRETURN(MATCH_NOMATCH);
1404 ecode++;
1405 break;
1406 }
1407 /* ... else fall through */
1408
1409 /* Start of subject assertion */
1410
1411 case OP_SOD:
1412 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1413 ecode++;
1414 break;
1415
1416 /* Start of match assertion */
1417
1418 case OP_SOM:
1419 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1420 ecode++;
1421 break;
1422
1423 /* Reset the start of match point */
1424
1425 case OP_SET_SOM:
1426 mstart = eptr;
1427 ecode++;
1428 break;
1429
1430 /* Assert before internal newline if multiline, or before a terminating
1431 newline unless endonly is set, else end of subject unless noteol is set. */
1432
1433 case OP_DOLL:
1434 if ((ims & PCRE_MULTILINE) != 0)
1435 {
1436 if (eptr < md->end_subject)
1437 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1438 else
1439 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1440 ecode++;
1441 break;
1442 }
1443 else
1444 {
1445 if (md->noteol) RRETURN(MATCH_NOMATCH);
1446 if (!md->endonly)
1447 {
1448 if (eptr != md->end_subject &&
1449 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1450 RRETURN(MATCH_NOMATCH);
1451 ecode++;
1452 break;
1453 }
1454 }
1455 /* ... else fall through for endonly */
1456
1457 /* End of subject assertion (\z) */
1458
1459 case OP_EOD:
1460 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1461 ecode++;
1462 break;
1463
1464 /* End of subject or ending \n assertion (\Z) */
1465
1466 case OP_EODN:
1467 if (eptr != md->end_subject &&
1468 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1469 RRETURN(MATCH_NOMATCH);
1470 ecode++;
1471 break;
1472
1473 /* Word boundary assertions */
1474
1475 case OP_NOT_WORD_BOUNDARY:
1476 case OP_WORD_BOUNDARY:
1477 {
1478
1479 /* Find out if the previous and current characters are "word" characters.
1480 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1481 be "non-word" characters. Remember the earliest consulted character for
1482 partial matching. */
1483
1484 #ifdef SUPPORT_UTF8
1485 if (utf8)
1486 {
1487 if (eptr == md->start_subject) prev_is_word = FALSE; else
1488 {
1489 USPTR lastptr = eptr - 1;
1490 while((*lastptr & 0xc0) == 0x80) lastptr--;
1491 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1492 GETCHAR(c, lastptr);
1493 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1494 }
1495 if (eptr >= md->end_subject)
1496 {
1497 SCHECK_PARTIAL();
1498 cur_is_word = FALSE;
1499 }
1500 else
1501 {
1502 GETCHAR(c, eptr);
1503 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1504 }
1505 }
1506 else
1507 #endif
1508
1509 /* Not in UTF-8 mode */
1510
1511 {
1512 if (eptr == md->start_subject) prev_is_word = FALSE; else
1513 {
1514 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1515 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1516 }
1517 if (eptr >= md->end_subject)
1518 {
1519 SCHECK_PARTIAL();
1520 cur_is_word = FALSE;
1521 }
1522 else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1523 }
1524
1525 /* Now see if the situation is what we want */
1526
1527 if ((*ecode++ == OP_WORD_BOUNDARY)?
1528 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1529 RRETURN(MATCH_NOMATCH);
1530 }
1531 break;
1532
1533 /* Match a single character type; inline for speed */
1534
1535 case OP_ANY:
1536 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1537 /* Fall through */
1538
1539 case OP_ALLANY:
1540 if (eptr++ >= md->end_subject)
1541 {
1542 SCHECK_PARTIAL();
1543 RRETURN(MATCH_NOMATCH);
1544 }
1545 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1546 ecode++;
1547 break;
1548
1549 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1550 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1551
1552 case OP_ANYBYTE:
1553 if (eptr++ >= md->end_subject)
1554 {
1555 SCHECK_PARTIAL();
1556 RRETURN(MATCH_NOMATCH);
1557 }
1558 ecode++;
1559 break;
1560
1561 case OP_NOT_DIGIT:
1562 if (eptr >= md->end_subject)
1563 {
1564 SCHECK_PARTIAL();
1565 RRETURN(MATCH_NOMATCH);
1566 }
1567 GETCHARINCTEST(c, eptr);
1568 if (
1569 #ifdef SUPPORT_UTF8
1570 c < 256 &&
1571 #endif
1572 (md->ctypes[c] & ctype_digit) != 0
1573 )
1574 RRETURN(MATCH_NOMATCH);
1575 ecode++;
1576 break;
1577
1578 case OP_DIGIT:
1579 if (eptr >= md->end_subject)
1580 {
1581 SCHECK_PARTIAL();
1582 RRETURN(MATCH_NOMATCH);
1583 }
1584 GETCHARINCTEST(c, eptr);
1585 if (
1586 #ifdef SUPPORT_UTF8
1587 c >= 256 ||
1588 #endif
1589 (md->ctypes[c] & ctype_digit) == 0
1590 )
1591 RRETURN(MATCH_NOMATCH);
1592 ecode++;
1593 break;
1594
1595 case OP_NOT_WHITESPACE:
1596 if (eptr >= md->end_subject)
1597 {
1598 SCHECK_PARTIAL();
1599 RRETURN(MATCH_NOMATCH);
1600 }
1601 GETCHARINCTEST(c, eptr);
1602 if (
1603 #ifdef SUPPORT_UTF8
1604 c < 256 &&
1605 #endif
1606 (md->ctypes[c] & ctype_space) != 0
1607 )
1608 RRETURN(MATCH_NOMATCH);
1609 ecode++;
1610 break;
1611
1612 case OP_WHITESPACE:
1613 if (eptr >= md->end_subject)
1614 {
1615 SCHECK_PARTIAL();
1616 RRETURN(MATCH_NOMATCH);
1617 }
1618 GETCHARINCTEST(c, eptr);
1619 if (
1620 #ifdef SUPPORT_UTF8
1621 c >= 256 ||
1622 #endif
1623 (md->ctypes[c] & ctype_space) == 0
1624 )
1625 RRETURN(MATCH_NOMATCH);
1626 ecode++;
1627 break;
1628
1629 case OP_NOT_WORDCHAR:
1630 if (eptr >= md->end_subject)
1631 {
1632 SCHECK_PARTIAL();
1633 RRETURN(MATCH_NOMATCH);
1634 }
1635 GETCHARINCTEST(c, eptr);
1636 if (
1637 #ifdef SUPPORT_UTF8
1638 c < 256 &&
1639 #endif
1640 (md->ctypes[c] & ctype_word) != 0
1641 )
1642 RRETURN(MATCH_NOMATCH);
1643 ecode++;
1644 break;
1645
1646 case OP_WORDCHAR:
1647 if (eptr >= md->end_subject)
1648 {
1649 SCHECK_PARTIAL();
1650 RRETURN(MATCH_NOMATCH);
1651 }
1652 GETCHARINCTEST(c, eptr);
1653 if (
1654 #ifdef SUPPORT_UTF8
1655 c >= 256 ||
1656 #endif
1657 (md->ctypes[c] & ctype_word) == 0
1658 )
1659 RRETURN(MATCH_NOMATCH);
1660 ecode++;
1661 break;
1662
1663 case OP_ANYNL:
1664 if (eptr >= md->end_subject)
1665 {
1666 SCHECK_PARTIAL();
1667 RRETURN(MATCH_NOMATCH);
1668 }
1669 GETCHARINCTEST(c, eptr);
1670 switch(c)
1671 {
1672 default: RRETURN(MATCH_NOMATCH);
1673 case 0x000d:
1674 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1675 break;
1676
1677 case 0x000a:
1678 break;
1679
1680 case 0x000b:
1681 case 0x000c:
1682 case 0x0085:
1683 case 0x2028:
1684 case 0x2029:
1685 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1686 break;
1687 }
1688 ecode++;
1689 break;
1690
1691 case OP_NOT_HSPACE:
1692 if (eptr >= md->end_subject)
1693 {
1694 SCHECK_PARTIAL();
1695 RRETURN(MATCH_NOMATCH);
1696 }
1697 GETCHARINCTEST(c, eptr);
1698 switch(c)
1699 {
1700 default: break;
1701 case 0x09: /* HT */
1702 case 0x20: /* SPACE */
1703 case 0xa0: /* NBSP */
1704 case 0x1680: /* OGHAM SPACE MARK */
1705 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1706 case 0x2000: /* EN QUAD */
1707 case 0x2001: /* EM QUAD */
1708 case 0x2002: /* EN SPACE */
1709 case 0x2003: /* EM SPACE */
1710 case 0x2004: /* THREE-PER-EM SPACE */
1711 case 0x2005: /* FOUR-PER-EM SPACE */
1712 case 0x2006: /* SIX-PER-EM SPACE */
1713 case 0x2007: /* FIGURE SPACE */
1714 case 0x2008: /* PUNCTUATION SPACE */
1715 case 0x2009: /* THIN SPACE */
1716 case 0x200A: /* HAIR SPACE */
1717 case 0x202f: /* NARROW NO-BREAK SPACE */
1718 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1719 case 0x3000: /* IDEOGRAPHIC SPACE */
1720 RRETURN(MATCH_NOMATCH);
1721 }
1722 ecode++;
1723 break;
1724
1725 case OP_HSPACE:
1726 if (eptr >= md->end_subject)
1727 {
1728 SCHECK_PARTIAL();
1729 RRETURN(MATCH_NOMATCH);
1730 }
1731 GETCHARINCTEST(c, eptr);
1732 switch(c)
1733 {
1734 default: RRETURN(MATCH_NOMATCH);
1735 case 0x09: /* HT */
1736 case 0x20: /* SPACE */
1737 case 0xa0: /* NBSP */
1738 case 0x1680: /* OGHAM SPACE MARK */
1739 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1740 case 0x2000: /* EN QUAD */
1741 case 0x2001: /* EM QUAD */
1742 case 0x2002: /* EN SPACE */
1743 case 0x2003: /* EM SPACE */
1744 case 0x2004: /* THREE-PER-EM SPACE */
1745 case 0x2005: /* FOUR-PER-EM SPACE */
1746 case 0x2006: /* SIX-PER-EM SPACE */
1747 case 0x2007: /* FIGURE SPACE */
1748 case 0x2008: /* PUNCTUATION SPACE */
1749 case 0x2009: /* THIN SPACE */
1750 case 0x200A: /* HAIR SPACE */
1751 case 0x202f: /* NARROW NO-BREAK SPACE */
1752 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1753 case 0x3000: /* IDEOGRAPHIC SPACE */
1754 break;
1755 }
1756 ecode++;
1757 break;
1758
1759 case OP_NOT_VSPACE:
1760 if (eptr >= md->end_subject)
1761 {
1762 SCHECK_PARTIAL();
1763 RRETURN(MATCH_NOMATCH);
1764 }
1765 GETCHARINCTEST(c, eptr);
1766 switch(c)
1767 {
1768 default: break;
1769 case 0x0a: /* LF */
1770 case 0x0b: /* VT */
1771 case 0x0c: /* FF */
1772 case 0x0d: /* CR */
1773 case 0x85: /* NEL */
1774 case 0x2028: /* LINE SEPARATOR */
1775 case 0x2029: /* PARAGRAPH SEPARATOR */
1776 RRETURN(MATCH_NOMATCH);
1777 }
1778 ecode++;
1779 break;
1780
1781 case OP_VSPACE:
1782 if (eptr >= md->end_subject)
1783 {
1784 SCHECK_PARTIAL();
1785 RRETURN(MATCH_NOMATCH);
1786 }
1787 GETCHARINCTEST(c, eptr);
1788 switch(c)
1789 {
1790 default: RRETURN(MATCH_NOMATCH);
1791 case 0x0a: /* LF */
1792 case 0x0b: /* VT */
1793 case 0x0c: /* FF */
1794 case 0x0d: /* CR */
1795 case 0x85: /* NEL */
1796 case 0x2028: /* LINE SEPARATOR */
1797 case 0x2029: /* PARAGRAPH SEPARATOR */
1798 break;
1799 }
1800 ecode++;
1801 break;
1802
1803 #ifdef SUPPORT_UCP
1804 /* Check the next character by Unicode property. We will get here only
1805 if the support is in the binary; otherwise a compile-time error occurs. */
1806
1807 case OP_PROP:
1808 case OP_NOTPROP:
1809 if (eptr >= md->end_subject)
1810 {
1811 SCHECK_PARTIAL();
1812 RRETURN(MATCH_NOMATCH);
1813 }
1814 GETCHARINCTEST(c, eptr);
1815 {
1816 const ucd_record *prop = GET_UCD(c);
1817
1818 switch(ecode[1])
1819 {
1820 case PT_ANY:
1821 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1822 break;
1823
1824 case PT_LAMP:
1825 if ((prop->chartype == ucp_Lu ||
1826 prop->chartype == ucp_Ll ||
1827 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1828 RRETURN(MATCH_NOMATCH);
1829 break;
1830
1831 case PT_GC:
1832 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1833 RRETURN(MATCH_NOMATCH);
1834 break;
1835
1836 case PT_PC:
1837 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1838 RRETURN(MATCH_NOMATCH);
1839 break;
1840
1841 case PT_SC:
1842 if ((ecode[2] != prop->script) == (op == OP_PROP))
1843 RRETURN(MATCH_NOMATCH);
1844 break;
1845
1846 default:
1847 RRETURN(PCRE_ERROR_INTERNAL);
1848 }
1849
1850 ecode += 3;
1851 }
1852 break;
1853
1854 /* Match an extended Unicode sequence. We will get here only if the support
1855 is in the binary; otherwise a compile-time error occurs. */
1856
1857 case OP_EXTUNI:
1858 if (eptr >= md->end_subject)
1859 {
1860 SCHECK_PARTIAL();
1861 RRETURN(MATCH_NOMATCH);
1862 }
1863 GETCHARINCTEST(c, eptr);
1864 {
1865 int category = UCD_CATEGORY(c);
1866 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1867 while (eptr < md->end_subject)
1868 {
1869 int len = 1;
1870 if (!utf8) c = *eptr; else
1871 {
1872 GETCHARLEN(c, eptr, len);
1873 }
1874 category = UCD_CATEGORY(c);
1875 if (category != ucp_M) break;
1876 eptr += len;
1877 }
1878 }
1879 ecode++;
1880 break;
1881 #endif
1882
1883
1884 /* Match a back reference, possibly repeatedly. Look past the end of the
1885 item to see if there is repeat information following. The code is similar
1886 to that for character classes, but repeated for efficiency. Then obey
1887 similar code to character type repeats - written out again for speed.
1888 However, if the referenced string is the empty string, always treat
1889 it as matched, any number of times (otherwise there could be infinite
1890 loops). */
1891
1892 case OP_REF:
1893 {
1894 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1895 ecode += 3;
1896
1897 /* If the reference is unset, there are two possibilities:
1898
1899 (a) In the default, Perl-compatible state, set the length to be longer
1900 than the amount of subject left; this ensures that every attempt at a
1901 match fails. We can't just fail here, because of the possibility of
1902 quantifiers with zero minima.
1903
1904 (b) If the JavaScript compatibility flag is set, set the length to zero
1905 so that the back reference matches an empty string.
1906
1907 Otherwise, set the length to the length of what was matched by the
1908 referenced subpattern. */
1909
1910 if (offset >= offset_top || md->offset_vector[offset] < 0)
1911 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1912 else
1913 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1914
1915 /* Set up for repetition, or handle the non-repeated case */
1916
1917 switch (*ecode)
1918 {
1919 case OP_CRSTAR:
1920 case OP_CRMINSTAR:
1921 case OP_CRPLUS:
1922 case OP_CRMINPLUS:
1923 case OP_CRQUERY:
1924 case OP_CRMINQUERY:
1925 c = *ecode++ - OP_CRSTAR;
1926 minimize = (c & 1) != 0;
1927 min = rep_min[c]; /* Pick up values from tables; */
1928 max = rep_max[c]; /* zero for max => infinity */
1929 if (max == 0) max = INT_MAX;
1930 break;
1931
1932 case OP_CRRANGE:
1933 case OP_CRMINRANGE:
1934 minimize = (*ecode == OP_CRMINRANGE);
1935 min = GET2(ecode, 1);
1936 max = GET2(ecode, 3);
1937 if (max == 0) max = INT_MAX;
1938 ecode += 5;
1939 break;
1940
1941 default: /* No repeat follows */
1942 if (!match_ref(offset, eptr, length, md, ims))
1943 {
1944 CHECK_PARTIAL();
1945 RRETURN(MATCH_NOMATCH);
1946 }
1947 eptr += length;
1948 continue; /* With the main loop */
1949 }
1950
1951 /* If the length of the reference is zero, just continue with the
1952 main loop. */
1953
1954 if (length == 0) continue;
1955
1956 /* First, ensure the minimum number of matches are present. We get back
1957 the length of the reference string explicitly rather than passing the
1958 address of eptr, so that eptr can be a register variable. */
1959
1960 for (i = 1; i <= min; i++)
1961 {
1962 if (!match_ref(offset, eptr, length, md, ims))
1963 {
1964 CHECK_PARTIAL();
1965 RRETURN(MATCH_NOMATCH);
1966 }
1967 eptr += length;
1968 }
1969
1970 /* If min = max, continue at the same level without recursion.
1971 They are not both allowed to be zero. */
1972
1973 if (min == max) continue;
1974
1975 /* If minimizing, keep trying and advancing the pointer */
1976
1977 if (minimize)
1978 {
1979 for (fi = min;; fi++)
1980 {
1981 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1982 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1983 if (fi >= max) RRETURN(MATCH_NOMATCH);
1984 if (!match_ref(offset, eptr, length, md, ims))
1985 {
1986 CHECK_PARTIAL();
1987 RRETURN(MATCH_NOMATCH);
1988 }
1989 eptr += length;
1990 }
1991 /* Control never gets here */
1992 }
1993
1994 /* If maximizing, find the longest string and work backwards */
1995
1996 else
1997 {
1998 pp = eptr;
1999 for (i = min; i < max; i++)
2000 {
2001 if (!match_ref(offset, eptr, length, md, ims)) break;
2002 eptr += length;
2003 }
2004 while (eptr >= pp)
2005 {
2006 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2007 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2008 eptr -= length;
2009 }
2010 RRETURN(MATCH_NOMATCH);
2011 }
2012 }
2013 /* Control never gets here */
2014
2015 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2016 used when all the characters in the class have values in the range 0-255,
2017 and either the matching is caseful, or the characters are in the range
2018 0-127 when UTF-8 processing is enabled. The only difference between
2019 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2020 encountered.
2021
2022 First, look past the end of the item to see if there is repeat information
2023 following. Then obey similar code to character type repeats - written out
2024 again for speed. */
2025
2026 case OP_NCLASS:
2027 case OP_CLASS:
2028 {
2029 data = ecode + 1; /* Save for matching */
2030 ecode += 33; /* Advance past the item */
2031
2032 switch (*ecode)
2033 {
2034 case OP_CRSTAR:
2035 case OP_CRMINSTAR:
2036 case OP_CRPLUS:
2037 case OP_CRMINPLUS:
2038 case OP_CRQUERY:
2039 case OP_CRMINQUERY:
2040 c = *ecode++ - OP_CRSTAR;
2041 minimize = (c & 1) != 0;
2042 min = rep_min[c]; /* Pick up values from tables; */
2043 max = rep_max[c]; /* zero for max => infinity */
2044 if (max == 0) max = INT_MAX;
2045 break;
2046
2047 case OP_CRRANGE:
2048 case OP_CRMINRANGE:
2049 minimize = (*ecode == OP_CRMINRANGE);
2050 min = GET2(ecode, 1);
2051 max = GET2(ecode, 3);
2052 if (max == 0) max = INT_MAX;
2053 ecode += 5;
2054 break;
2055
2056 default: /* No repeat follows */
2057 min = max = 1;
2058 break;
2059 }
2060
2061 /* First, ensure the minimum number of matches are present. */
2062
2063 #ifdef SUPPORT_UTF8
2064 /* UTF-8 mode */
2065 if (utf8)
2066 {
2067 for (i = 1; i <= min; i++)
2068 {
2069 if (eptr >= md->end_subject)
2070 {
2071 SCHECK_PARTIAL();
2072 RRETURN(MATCH_NOMATCH);
2073 }
2074 GETCHARINC(c, eptr);
2075 if (c > 255)
2076 {
2077 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2078 }
2079 else
2080 {
2081 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2082 }
2083 }
2084 }
2085 else
2086 #endif
2087 /* Not UTF-8 mode */
2088 {
2089 for (i = 1; i <= min; i++)
2090 {
2091 if (eptr >= md->end_subject)
2092 {
2093 SCHECK_PARTIAL();
2094 RRETURN(MATCH_NOMATCH);
2095 }
2096 c = *eptr++;
2097 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2098 }
2099 }
2100
2101 /* If max == min we can continue with the main loop without the
2102 need to recurse. */
2103
2104 if (min == max) continue;
2105
2106 /* If minimizing, keep testing the rest of the expression and advancing
2107 the pointer while it matches the class. */
2108
2109 if (minimize)
2110 {
2111 #ifdef SUPPORT_UTF8
2112 /* UTF-8 mode */
2113 if (utf8)
2114 {
2115 for (fi = min;; fi++)
2116 {
2117 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2118 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2119 if (fi >= max) RRETURN(MATCH_NOMATCH);
2120 if (eptr >= md->end_subject)
2121 {
2122 SCHECK_PARTIAL();
2123 RRETURN(MATCH_NOMATCH);
2124 }
2125 GETCHARINC(c, eptr);
2126 if (c > 255)
2127 {
2128 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2129 }
2130 else
2131 {
2132 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2133 }
2134 }
2135 }
2136 else
2137 #endif
2138 /* Not UTF-8 mode */
2139 {
2140 for (fi = min;; fi++)
2141 {
2142 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2143 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2144 if (fi >= max) RRETURN(MATCH_NOMATCH);
2145 if (eptr >= md->end_subject)
2146 {
2147 SCHECK_PARTIAL();
2148 RRETURN(MATCH_NOMATCH);
2149 }
2150 c = *eptr++;
2151 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2152 }
2153 }
2154 /* Control never gets here */
2155 }
2156
2157 /* If maximizing, find the longest possible run, then work backwards. */
2158
2159 else
2160 {
2161 pp = eptr;
2162
2163 #ifdef SUPPORT_UTF8
2164 /* UTF-8 mode */
2165 if (utf8)
2166 {
2167 for (i = min; i < max; i++)
2168 {
2169 int len = 1;
2170 if (eptr >= md->end_subject) break;
2171 GETCHARLEN(c, eptr, len);
2172 if (c > 255)
2173 {
2174 if (op == OP_CLASS) break;
2175 }
2176 else
2177 {
2178 if ((data[c/8] & (1 << (c&7))) == 0) break;
2179 }
2180 eptr += len;
2181 }
2182 for (;;)
2183 {
2184 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2185 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2186 if (eptr-- == pp) break; /* Stop if tried at original pos */
2187 BACKCHAR(eptr);
2188 }
2189 }
2190 else
2191 #endif
2192 /* Not UTF-8 mode */
2193 {
2194 for (i = min; i < max; i++)
2195 {
2196 if (eptr >= md->end_subject) break;
2197 c = *eptr;
2198 if ((data[c/8] & (1 << (c&7))) == 0) break;
2199 eptr++;
2200 }
2201 while (eptr >= pp)
2202 {
2203 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2204 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2205 eptr--;
2206 }
2207 }
2208
2209 RRETURN(MATCH_NOMATCH);
2210 }
2211 }
2212 /* Control never gets here */
2213
2214
2215 /* Match an extended character class. This opcode is encountered only
2216 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2217 mode, because Unicode properties are supported in non-UTF-8 mode. */
2218
2219 #ifdef SUPPORT_UTF8
2220 case OP_XCLASS:
2221 {
2222 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2223 ecode += GET(ecode, 1); /* Advance past the item */
2224
2225 switch (*ecode)
2226 {
2227 case OP_CRSTAR:
2228 case OP_CRMINSTAR:
2229 case OP_CRPLUS:
2230 case OP_CRMINPLUS:
2231 case OP_CRQUERY:
2232 case OP_CRMINQUERY:
2233 c = *ecode++ - OP_CRSTAR;
2234 minimize = (c & 1) != 0;
2235 min = rep_min[c]; /* Pick up values from tables; */
2236 max = rep_max[c]; /* zero for max => infinity */
2237 if (max == 0) max = INT_MAX;
2238 break;
2239
2240 case OP_CRRANGE:
2241 case OP_CRMINRANGE:
2242 minimize = (*ecode == OP_CRMINRANGE);
2243 min = GET2(ecode, 1);
2244 max = GET2(ecode, 3);
2245 if (max == 0) max = INT_MAX;
2246 ecode += 5;
2247 break;
2248
2249 default: /* No repeat follows */
2250 min = max = 1;
2251 break;
2252 }
2253
2254 /* First, ensure the minimum number of matches are present. */
2255
2256 for (i = 1; i <= min; i++)
2257 {
2258 if (eptr >= md->end_subject)
2259 {
2260 SCHECK_PARTIAL();
2261 RRETURN(MATCH_NOMATCH);
2262 }
2263 GETCHARINCTEST(c, eptr);
2264 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2265 }
2266
2267 /* If max == min we can continue with the main loop without the
2268 need to recurse. */
2269
2270 if (min == max) continue;
2271
2272 /* If minimizing, keep testing the rest of the expression and advancing
2273 the pointer while it matches the class. */
2274
2275 if (minimize)
2276 {
2277 for (fi = min;; fi++)
2278 {
2279 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2280 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2281 if (fi >= max) RRETURN(MATCH_NOMATCH);
2282 if (eptr >= md->end_subject)
2283 {
2284 SCHECK_PARTIAL();
2285 RRETURN(MATCH_NOMATCH);
2286 }
2287 GETCHARINCTEST(c, eptr);
2288 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2289 }
2290 /* Control never gets here */
2291 }
2292
2293 /* If maximizing, find the longest possible run, then work backwards. */
2294
2295 else
2296 {
2297 pp = eptr;
2298 for (i = min; i < max; i++)
2299 {
2300 int len = 1;
2301 if (eptr >= md->end_subject) break;
2302 GETCHARLENTEST(c, eptr, len);
2303 if (!_pcre_xclass(c, data)) break;
2304 eptr += len;
2305 }
2306 for(;;)
2307 {
2308 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2309 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2310 if (eptr-- == pp) break; /* Stop if tried at original pos */
2311 if (utf8) BACKCHAR(eptr);
2312 }
2313 RRETURN(MATCH_NOMATCH);
2314 }
2315
2316 /* Control never gets here */
2317 }
2318 #endif /* End of XCLASS */
2319
2320 /* Match a single character, casefully */
2321
2322 case OP_CHAR:
2323 #ifdef SUPPORT_UTF8
2324 if (utf8)
2325 {
2326 length = 1;
2327 ecode++;
2328 GETCHARLEN(fc, ecode, length);
2329 if (length > md->end_subject - eptr)
2330 {
2331 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2332 RRETURN(MATCH_NOMATCH);
2333 }
2334 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2335 }
2336 else
2337 #endif
2338
2339 /* Non-UTF-8 mode */
2340 {
2341 if (md->end_subject - eptr < 1)
2342 {
2343 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2344 RRETURN(MATCH_NOMATCH);
2345 }
2346 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2347 ecode += 2;
2348 }
2349 break;
2350
2351 /* Match a single character, caselessly */
2352
2353 case OP_CHARNC:
2354 #ifdef SUPPORT_UTF8
2355 if (utf8)
2356 {
2357 length = 1;
2358 ecode++;
2359 GETCHARLEN(fc, ecode, length);
2360
2361 if (length > md->end_subject - eptr)
2362 {
2363 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2364 RRETURN(MATCH_NOMATCH);
2365 }
2366
2367 /* If the pattern character's value is < 128, we have only one byte, and
2368 can use the fast lookup table. */
2369
2370 if (fc < 128)
2371 {
2372 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2373 }
2374
2375 /* Otherwise we must pick up the subject character */
2376
2377 else
2378 {
2379 unsigned int dc;
2380 GETCHARINC(dc, eptr);
2381 ecode += length;
2382
2383 /* If we have Unicode property support, we can use it to test the other
2384 case of the character, if there is one. */
2385
2386 if (fc != dc)
2387 {
2388 #ifdef SUPPORT_UCP
2389 if (dc != UCD_OTHERCASE(fc))
2390 #endif
2391 RRETURN(MATCH_NOMATCH);
2392 }
2393 }
2394 }
2395 else
2396 #endif /* SUPPORT_UTF8 */
2397
2398 /* Non-UTF-8 mode */
2399 {
2400 if (md->end_subject - eptr < 1)
2401 {
2402 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2403 RRETURN(MATCH_NOMATCH);
2404 }
2405 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2406 ecode += 2;
2407 }
2408 break;
2409
2410 /* Match a single character repeatedly. */
2411
2412 case OP_EXACT:
2413 min = max = GET2(ecode, 1);
2414 ecode += 3;
2415 goto REPEATCHAR;
2416
2417 case OP_POSUPTO:
2418 possessive = TRUE;
2419 /* Fall through */
2420
2421 case OP_UPTO:
2422 case OP_MINUPTO:
2423 min = 0;
2424 max = GET2(ecode, 1);
2425 minimize = *ecode == OP_MINUPTO;
2426 ecode += 3;
2427 goto REPEATCHAR;
2428
2429 case OP_POSSTAR:
2430 possessive = TRUE;
2431 min = 0;
2432 max = INT_MAX;
2433 ecode++;
2434 goto REPEATCHAR;
2435
2436 case OP_POSPLUS:
2437 possessive = TRUE;
2438 min = 1;
2439 max = INT_MAX;
2440 ecode++;
2441 goto REPEATCHAR;
2442
2443 case OP_POSQUERY:
2444 possessive = TRUE;
2445 min = 0;
2446 max = 1;
2447 ecode++;
2448 goto REPEATCHAR;
2449
2450 case OP_STAR:
2451 case OP_MINSTAR:
2452 case OP_PLUS:
2453 case OP_MINPLUS:
2454 case OP_QUERY:
2455 case OP_MINQUERY:
2456 c = *ecode++ - OP_STAR;
2457 minimize = (c & 1) != 0;
2458
2459 min = rep_min[c]; /* Pick up values from tables; */
2460 max = rep_max[c]; /* zero for max => infinity */
2461 if (max == 0) max = INT_MAX;
2462
2463 /* Common code for all repeated single-character matches. */
2464
2465 REPEATCHAR:
2466 #ifdef SUPPORT_UTF8
2467 if (utf8)
2468 {
2469 length = 1;
2470 charptr = ecode;
2471 GETCHARLEN(fc, ecode, length);
2472 ecode += length;
2473
2474 /* Handle multibyte character matching specially here. There is
2475 support for caseless matching if UCP support is present. */
2476
2477 if (length > 1)
2478 {
2479 #ifdef SUPPORT_UCP
2480 unsigned int othercase;
2481 if ((ims & PCRE_CASELESS) != 0 &&
2482 (othercase = UCD_OTHERCASE(fc)) != fc)
2483 oclength = _pcre_ord2utf8(othercase, occhars);
2484 else oclength = 0;
2485 #endif /* SUPPORT_UCP */
2486
2487 for (i = 1; i <= min; i++)
2488 {
2489 if (eptr <= md->end_subject - length &&
2490 memcmp(eptr, charptr, length) == 0) eptr += length;
2491 #ifdef SUPPORT_UCP
2492 else if (oclength > 0 &&
2493 eptr <= md->end_subject - oclength &&
2494 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2495 #endif /* SUPPORT_UCP */
2496 else
2497 {
2498 CHECK_PARTIAL();
2499 RRETURN(MATCH_NOMATCH);
2500 }
2501 }
2502
2503 if (min == max) continue;
2504
2505 if (minimize)
2506 {
2507 for (fi = min;; fi++)
2508 {
2509 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2511 if (fi >= max) RRETURN(MATCH_NOMATCH);
2512 if (eptr <= md->end_subject - length &&
2513 memcmp(eptr, charptr, length) == 0) eptr += length;
2514 #ifdef SUPPORT_UCP
2515 else if (oclength > 0 &&
2516 eptr <= md->end_subject - oclength &&
2517 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2518 #endif /* SUPPORT_UCP */
2519 else
2520 {
2521 CHECK_PARTIAL();
2522 RRETURN(MATCH_NOMATCH);
2523 }
2524 }
2525 /* Control never gets here */
2526 }
2527
2528 else /* Maximize */
2529 {
2530 pp = eptr;
2531 for (i = min; i < max; i++)
2532 {
2533 if (eptr <= md->end_subject - length &&
2534 memcmp(eptr, charptr, length) == 0) eptr += length;
2535 #ifdef SUPPORT_UCP
2536 else if (oclength > 0 &&
2537 eptr <= md->end_subject - oclength &&
2538 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2539 #endif /* SUPPORT_UCP */
2540 else break;
2541 }
2542
2543 if (possessive) continue;
2544
2545 for(;;)
2546 {
2547 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2548 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2549 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
2550 #ifdef SUPPORT_UCP
2551 eptr--;
2552 BACKCHAR(eptr);
2553 #else /* without SUPPORT_UCP */
2554 eptr -= length;
2555 #endif /* SUPPORT_UCP */
2556 }
2557 }
2558 /* Control never gets here */
2559 }
2560
2561 /* If the length of a UTF-8 character is 1, we fall through here, and
2562 obey the code as for non-UTF-8 characters below, though in this case the
2563 value of fc will always be < 128. */
2564 }
2565 else
2566 #endif /* SUPPORT_UTF8 */
2567
2568 /* When not in UTF-8 mode, load a single-byte character. */
2569
2570 fc = *ecode++;
2571
2572 /* The value of fc at this point is always less than 256, though we may or
2573 may not be in UTF-8 mode. The code is duplicated for the caseless and
2574 caseful cases, for speed, since matching characters is likely to be quite
2575 common. First, ensure the minimum number of matches are present. If min =
2576 max, continue at the same level without recursing. Otherwise, if
2577 minimizing, keep trying the rest of the expression and advancing one
2578 matching character if failing, up to the maximum. Alternatively, if
2579 maximizing, find the maximum number of characters and work backwards. */
2580
2581 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2582 max, eptr));
2583
2584 if ((ims & PCRE_CASELESS) != 0)
2585 {
2586 fc = md->lcc[fc];
2587 for (i = 1; i <= min; i++)
2588 {
2589 if (eptr >= md->end_subject)
2590 {
2591 SCHECK_PARTIAL();
2592 RRETURN(MATCH_NOMATCH);
2593 }
2594 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2595 }
2596 if (min == max) continue;
2597 if (minimize)
2598 {
2599 for (fi = min;; fi++)
2600 {
2601 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2602 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2603 if (fi >= max) RRETURN(MATCH_NOMATCH);
2604 if (eptr >= md->end_subject)
2605 {
2606 SCHECK_PARTIAL();
2607 RRETURN(MATCH_NOMATCH);
2608 }
2609 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2610 }
2611 /* Control never gets here */
2612 }
2613 else /* Maximize */
2614 {
2615 pp = eptr;
2616 for (i = min; i < max; i++)
2617 {
2618 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2619 eptr++;
2620 }
2621
2622 if (possessive) continue;
2623
2624 while (eptr >= pp)
2625 {
2626 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2627 eptr--;
2628 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2629 }
2630 RRETURN(MATCH_NOMATCH);
2631 }
2632 /* Control never gets here */
2633 }
2634
2635 /* Caseful comparisons (includes all multi-byte characters) */
2636
2637 else
2638 {
2639 for (i = 1; i <= min; i++)
2640 {
2641 if (eptr >= md->end_subject)
2642 {
2643 SCHECK_PARTIAL();
2644 RRETURN(MATCH_NOMATCH);
2645 }
2646 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2647 }
2648
2649 if (min == max) continue;
2650
2651 if (minimize)
2652 {
2653 for (fi = min;; fi++)
2654 {
2655 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2656 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2657 if (fi >= max) RRETURN(MATCH_NOMATCH);
2658 if (eptr >= md->end_subject)
2659 {
2660 SCHECK_PARTIAL();
2661 RRETURN(MATCH_NOMATCH);
2662 }
2663 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2664 }
2665 /* Control never gets here */
2666 }
2667 else /* Maximize */
2668 {
2669 pp = eptr;
2670 for (i = min; i < max; i++)
2671 {
2672 if (eptr >= md->end_subject || fc != *eptr) break;
2673 eptr++;
2674 }
2675 if (possessive) continue;
2676
2677 while (eptr >= pp)
2678 {
2679 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2680 eptr--;
2681 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2682 }
2683 RRETURN(MATCH_NOMATCH);
2684 }
2685 }
2686 /* Control never gets here */
2687
2688 /* Match a negated single one-byte character. The subject character we are
2689 checking can be multibyte. */
2690
2691 case OP_NOT:
2692 if (eptr >= md->end_subject)
2693 {
2694 SCHECK_PARTIAL();
2695 RRETURN(MATCH_NOMATCH);
2696 }
2697 ecode++;
2698 GETCHARINCTEST(c, eptr);
2699 if ((ims & PCRE_CASELESS) != 0)
2700 {
2701 #ifdef SUPPORT_UTF8
2702 if (c < 256)
2703 #endif
2704 c = md->lcc[c];
2705 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2706 }
2707 else
2708 {
2709 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2710 }
2711 break;
2712
2713 /* Match a negated single one-byte character repeatedly. This is almost a
2714 repeat of the code for a repeated single character, but I haven't found a
2715 nice way of commoning these up that doesn't require a test of the
2716 positive/negative option for each character match. Maybe that wouldn't add
2717 very much to the time taken, but character matching *is* what this is all
2718 about... */
2719
2720 case OP_NOTEXACT:
2721 min = max = GET2(ecode, 1);
2722 ecode += 3;
2723 goto REPEATNOTCHAR;
2724
2725 case OP_NOTUPTO:
2726 case OP_NOTMINUPTO:
2727 min = 0;
2728 max = GET2(ecode, 1);
2729 minimize = *ecode == OP_NOTMINUPTO;
2730 ecode += 3;
2731 goto REPEATNOTCHAR;
2732
2733 case OP_NOTPOSSTAR:
2734 possessive = TRUE;
2735 min = 0;
2736 max = INT_MAX;
2737 ecode++;
2738 goto REPEATNOTCHAR;
2739
2740 case OP_NOTPOSPLUS:
2741 possessive = TRUE;
2742 min = 1;
2743 max = INT_MAX;
2744 ecode++;
2745 goto REPEATNOTCHAR;
2746
2747 case OP_NOTPOSQUERY:
2748 possessive = TRUE;
2749 min = 0;
2750 max = 1;
2751 ecode++;
2752 goto REPEATNOTCHAR;
2753
2754 case OP_NOTPOSUPTO:
2755 possessive = TRUE;
2756 min = 0;
2757 max = GET2(ecode, 1);
2758 ecode += 3;
2759 goto REPEATNOTCHAR;
2760
2761 case OP_NOTSTAR:
2762 case OP_NOTMINSTAR:
2763 case OP_NOTPLUS:
2764 case OP_NOTMINPLUS:
2765 case OP_NOTQUERY:
2766 case OP_NOTMINQUERY:
2767 c = *ecode++ - OP_NOTSTAR;
2768 minimize = (c & 1) != 0;
2769 min = rep_min[c]; /* Pick up values from tables; */
2770 max = rep_max[c]; /* zero for max => infinity */
2771 if (max == 0) max = INT_MAX;
2772
2773 /* Common code for all repeated single-byte matches. */
2774
2775 REPEATNOTCHAR:
2776 fc = *ecode++;
2777
2778 /* The code is duplicated for the caseless and caseful cases, for speed,
2779 since matching characters is likely to be quite common. First, ensure the
2780 minimum number of matches are present. If min = max, continue at the same
2781 level without recursing. Otherwise, if minimizing, keep trying the rest of
2782 the expression and advancing one matching character if failing, up to the
2783 maximum. Alternatively, if maximizing, find the maximum number of
2784 characters and work backwards. */
2785
2786 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2787 max, eptr));
2788
2789 if ((ims & PCRE_CASELESS) != 0)
2790 {
2791 fc = md->lcc[fc];
2792
2793 #ifdef SUPPORT_UTF8
2794 /* UTF-8 mode */
2795 if (utf8)
2796 {
2797 register unsigned int d;
2798 for (i = 1; i <= min; i++)
2799 {
2800 if (eptr >= md->end_subject)
2801 {
2802 SCHECK_PARTIAL();
2803 RRETURN(MATCH_NOMATCH);
2804 }
2805 GETCHARINC(d, eptr);
2806 if (d < 256) d = md->lcc[d];
2807 if (fc == d) RRETURN(MATCH_NOMATCH);
2808 }
2809 }
2810 else
2811 #endif
2812
2813 /* Not UTF-8 mode */
2814 {
2815 for (i = 1; i <= min; i++)
2816 {
2817 if (eptr >= md->end_subject)
2818 {
2819 SCHECK_PARTIAL();
2820 RRETURN(MATCH_NOMATCH);
2821 }
2822 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2823 }
2824 }
2825
2826 if (min == max) continue;
2827
2828 if (minimize)
2829 {
2830 #ifdef SUPPORT_UTF8
2831 /* UTF-8 mode */
2832 if (utf8)
2833 {
2834 register unsigned int d;
2835 for (fi = min;; fi++)
2836 {
2837 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2838 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2839 if (fi >= max) RRETURN(MATCH_NOMATCH);
2840 if (eptr >= md->end_subject)
2841 {
2842 SCHECK_PARTIAL();
2843 RRETURN(MATCH_NOMATCH);
2844 }
2845 GETCHARINC(d, eptr);
2846 if (d < 256) d = md->lcc[d];
2847 if (fc == d) RRETURN(MATCH_NOMATCH);
2848 }
2849 }
2850 else
2851 #endif
2852 /* Not UTF-8 mode */
2853 {
2854 for (fi = min;; fi++)
2855 {
2856 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2857 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2858 if (fi >= max) RRETURN(MATCH_NOMATCH);
2859 if (eptr >= md->end_subject)
2860 {
2861 SCHECK_PARTIAL();
2862 RRETURN(MATCH_NOMATCH);
2863 }
2864 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2865 }
2866 }
2867 /* Control never gets here */
2868 }
2869
2870 /* Maximize case */
2871
2872 else
2873 {
2874 pp = eptr;
2875
2876 #ifdef SUPPORT_UTF8
2877 /* UTF-8 mode */
2878 if (utf8)
2879 {
2880 register unsigned int d;
2881 for (i = min; i < max; i++)
2882 {
2883 int len = 1;
2884 if (eptr >= md->end_subject) break;
2885 GETCHARLEN(d, eptr, len);
2886 if (d < 256) d = md->lcc[d];
2887 if (fc == d) break;
2888 eptr += len;
2889 }
2890 if (possessive) continue;
2891 for(;;)
2892 {
2893 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2894 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2895 if (eptr-- == pp) break; /* Stop if tried at original pos */
2896 BACKCHAR(eptr);
2897 }
2898 }
2899 else
2900 #endif
2901 /* Not UTF-8 mode */
2902 {
2903 for (i = min; i < max; i++)
2904 {
2905 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2906 eptr++;
2907 }
2908 if (possessive) continue;
2909 while (eptr >= pp)
2910 {
2911 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2912 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2913 eptr--;
2914 }
2915 }
2916
2917 RRETURN(MATCH_NOMATCH);
2918 }
2919 /* Control never gets here */
2920 }
2921
2922 /* Caseful comparisons */
2923
2924 else
2925 {
2926 #ifdef SUPPORT_UTF8
2927 /* UTF-8 mode */
2928 if (utf8)
2929 {
2930 register unsigned int d;
2931 for (i = 1; i <= min; i++)
2932 {
2933 if (eptr >= md->end_subject)
2934 {
2935 SCHECK_PARTIAL();
2936 RRETURN(MATCH_NOMATCH);
2937 }
2938 GETCHARINC(d, eptr);
2939 if (fc == d) RRETURN(MATCH_NOMATCH);
2940 }
2941 }
2942 else
2943 #endif
2944 /* Not UTF-8 mode */
2945 {
2946 for (i = 1; i <= min; i++)
2947 {
2948 if (eptr >= md->end_subject)
2949 {
2950 SCHECK_PARTIAL();
2951 RRETURN(MATCH_NOMATCH);
2952 }
2953 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2954 }
2955 }
2956
2957 if (min == max) continue;
2958
2959 if (minimize)
2960 {
2961 #ifdef SUPPORT_UTF8
2962 /* UTF-8 mode */
2963 if (utf8)
2964 {
2965 register unsigned int d;
2966 for (fi = min;; fi++)
2967 {
2968 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2969 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2970 if (fi >= max) RRETURN(MATCH_NOMATCH);
2971 if (eptr >= md->end_subject)
2972 {
2973 SCHECK_PARTIAL();
2974 RRETURN(MATCH_NOMATCH);
2975 }
2976 GETCHARINC(d, eptr);
2977 if (fc == d) RRETURN(MATCH_NOMATCH);
2978 }
2979 }
2980 else
2981 #endif
2982 /* Not UTF-8 mode */
2983 {
2984 for (fi = min;; fi++)
2985 {
2986 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2987 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2988 if (fi >= max) RRETURN(MATCH_NOMATCH);
2989 if (eptr >= md->end_subject)
2990 {
2991 SCHECK_PARTIAL();
2992 RRETURN(MATCH_NOMATCH);
2993 }
2994 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2995 }
2996 }
2997 /* Control never gets here */
2998 }
2999
3000 /* Maximize case */
3001
3002 else
3003 {
3004 pp = eptr;
3005
3006 #ifdef SUPPORT_UTF8
3007 /* UTF-8 mode */
3008 if (utf8)
3009 {
3010 register unsigned int d;
3011 for (i = min; i < max; i++)
3012 {
3013 int len = 1;
3014 if (eptr >= md->end_subject) break;
3015 GETCHARLEN(d, eptr, len);
3016 if (fc == d) break;
3017 eptr += len;
3018 }
3019 if (possessive) continue;
3020 for(;;)
3021 {
3022 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3023 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3024 if (eptr-- == pp) break; /* Stop if tried at original pos */
3025 BACKCHAR(eptr);
3026 }
3027 }
3028 else
3029 #endif
3030 /* Not UTF-8 mode */
3031 {
3032 for (i = min; i < max; i++)
3033 {
3034 if (eptr >= md->end_subject || fc == *eptr) break;
3035 eptr++;
3036 }
3037 if (possessive) continue;
3038 while (eptr >= pp)
3039 {
3040 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3041 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3042 eptr--;
3043 }
3044 }
3045
3046 RRETURN(MATCH_NOMATCH);
3047 }
3048 }
3049 /* Control never gets here */
3050
3051 /* Match a single character type repeatedly; several different opcodes
3052 share code. This is very similar to the code for single characters, but we
3053 repeat it in the interests of efficiency. */
3054
3055 case OP_TYPEEXACT:
3056 min = max = GET2(ecode, 1);
3057 minimize = TRUE;
3058 ecode += 3;
3059 goto REPEATTYPE;
3060
3061 case OP_TYPEUPTO:
3062 case OP_TYPEMINUPTO:
3063 min = 0;
3064 max = GET2(ecode, 1);
3065 minimize = *ecode == OP_TYPEMINUPTO;
3066 ecode += 3;
3067 goto REPEATTYPE;
3068
3069 case OP_TYPEPOSSTAR:
3070 possessive = TRUE;
3071 min = 0;
3072 max = INT_MAX;
3073 ecode++;
3074 goto REPEATTYPE;
3075
3076 case OP_TYPEPOSPLUS:
3077 possessive = TRUE;
3078 min = 1;
3079 max = INT_MAX;
3080 ecode++;
3081 goto REPEATTYPE;
3082
3083 case OP_TYPEPOSQUERY:
3084 possessive = TRUE;
3085 min = 0;
3086 max = 1;
3087 ecode++;
3088 goto REPEATTYPE;
3089
3090 case OP_TYPEPOSUPTO:
3091 possessive = TRUE;
3092 min = 0;
3093 max = GET2(ecode, 1);
3094 ecode += 3;
3095 goto REPEATTYPE;
3096
3097 case OP_TYPESTAR:
3098 case OP_TYPEMINSTAR:
3099 case OP_TYPEPLUS:
3100 case OP_TYPEMINPLUS:
3101 case OP_TYPEQUERY:
3102 case OP_TYPEMINQUERY:
3103 c = *ecode++ - OP_TYPESTAR;
3104 minimize = (c & 1) != 0;
3105 min = rep_min[c]; /* Pick up values from tables; */
3106 max = rep_max[c]; /* zero for max => infinity */
3107 if (max == 0) max = INT_MAX;
3108
3109 /* Common code for all repeated single character type matches. Note that
3110 in UTF-8 mode, '.' matches a character of any length, but for the other
3111 character types, the valid characters are all one-byte long. */
3112
3113 REPEATTYPE:
3114 ctype = *ecode++; /* Code for the character type */
3115
3116 #ifdef SUPPORT_UCP
3117 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3118 {
3119 prop_fail_result = ctype == OP_NOTPROP;
3120 prop_type = *ecode++;
3121 prop_value = *ecode++;
3122 }
3123 else prop_type = -1;
3124 #endif
3125
3126 /* First, ensure the minimum number of matches are present. Use inline
3127 code for maximizing the speed, and do the type test once at the start
3128 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3129 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3130 and single-bytes. */
3131
3132 if (min > 0)
3133 {
3134 #ifdef SUPPORT_UCP
3135 if (prop_type >= 0)
3136 {
3137 switch(prop_type)
3138 {
3139 case PT_ANY:
3140 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3141 for (i = 1; i <= min; i++)
3142 {
3143 if (eptr >= md->end_subject)
3144 {
3145 SCHECK_PARTIAL();
3146 RRETURN(MATCH_NOMATCH);
3147 }
3148 GETCHARINCTEST(c, eptr);
3149 }
3150 break;
3151
3152 case PT_LAMP:
3153 for (i = 1; i <= min; i++)
3154 {
3155 if (eptr >= md->end_subject)
3156 {
3157 SCHECK_PARTIAL();
3158 RRETURN(MATCH_NOMATCH);
3159 }
3160 GETCHARINCTEST(c, eptr);
3161 prop_chartype = UCD_CHARTYPE(c);
3162 if ((prop_chartype == ucp_Lu ||
3163 prop_chartype == ucp_Ll ||
3164 prop_chartype == ucp_Lt) == prop_fail_result)
3165 RRETURN(MATCH_NOMATCH);
3166 }
3167 break;
3168
3169 case PT_GC:
3170 for (i = 1; i <= min; i++)
3171 {
3172 if (eptr >= md->end_subject)
3173 {
3174 SCHECK_PARTIAL();
3175 RRETURN(MATCH_NOMATCH);
3176 }
3177 GETCHARINCTEST(c, eptr);
3178 prop_category = UCD_CATEGORY(c);
3179 if ((prop_category == prop_value) == prop_fail_result)
3180 RRETURN(MATCH_NOMATCH);
3181 }
3182 break;
3183
3184 case PT_PC:
3185 for (i = 1; i <= min; i++)
3186 {
3187 if (eptr >= md->end_subject)
3188 {
3189 SCHECK_PARTIAL();
3190 RRETURN(MATCH_NOMATCH);
3191 }
3192 GETCHARINCTEST(c, eptr);
3193 prop_chartype = UCD_CHARTYPE(c);
3194 if ((prop_chartype == prop_value) == prop_fail_result)
3195 RRETURN(MATCH_NOMATCH);
3196 }
3197 break;
3198
3199 case PT_SC:
3200 for (i = 1; i <= min; i++)
3201 {
3202 if (eptr >= md->end_subject)
3203 {
3204 SCHECK_PARTIAL();
3205 RRETURN(MATCH_NOMATCH);
3206 }
3207 GETCHARINCTEST(c, eptr);
3208 prop_script = UCD_SCRIPT(c);
3209 if ((prop_script == prop_value) == prop_fail_result)
3210 RRETURN(MATCH_NOMATCH);
3211 }
3212 break;
3213
3214 default:
3215 RRETURN(PCRE_ERROR_INTERNAL);
3216 }
3217 }
3218
3219 /* Match extended Unicode sequences. We will get here only if the
3220 support is in the binary; otherwise a compile-time error occurs. */
3221
3222 else if (ctype == OP_EXTUNI)
3223 {
3224 for (i = 1; i <= min; i++)
3225 {
3226 if (eptr >= md->end_subject)
3227 {
3228 SCHECK_PARTIAL();
3229 RRETURN(MATCH_NOMATCH);
3230 }
3231 GETCHARINCTEST(c, eptr);
3232 prop_category = UCD_CATEGORY(c);
3233 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3234 while (eptr < md->end_subject)
3235 {
3236 int len = 1;
3237 if (!utf8) c = *eptr;
3238 else { GETCHARLEN(c, eptr, len); }
3239 prop_category = UCD_CATEGORY(c);
3240 if (prop_category != ucp_M) break;
3241 eptr += len;
3242 }
3243 }
3244 }
3245
3246 else
3247 #endif /* SUPPORT_UCP */
3248
3249 /* Handle all other cases when the coding is UTF-8 */
3250
3251 #ifdef SUPPORT_UTF8
3252 if (utf8) switch(ctype)
3253 {
3254 case OP_ANY:
3255 for (i = 1; i <= min; i++)
3256 {
3257 if (eptr >= md->end_subject)
3258 {
3259 SCHECK_PARTIAL();
3260 RRETURN(MATCH_NOMATCH);
3261 }
3262 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3263 eptr++;
3264 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3265 }
3266 break;
3267
3268 case OP_ALLANY:
3269 for (i = 1; i <= min; i++)
3270 {
3271 if (eptr >= md->end_subject)
3272 {
3273 SCHECK_PARTIAL();
3274 RRETURN(MATCH_NOMATCH);
3275 }
3276 eptr++;
3277 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3278 }
3279 break;
3280
3281 case OP_ANYBYTE:
3282 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3283 eptr += min;
3284 break;
3285
3286 case OP_ANYNL:
3287 for (i = 1; i <= min; i++)
3288 {
3289 if (eptr >= md->end_subject)
3290 {
3291 SCHECK_PARTIAL();
3292 RRETURN(MATCH_NOMATCH);
3293 }
3294 GETCHARINC(c, eptr);
3295 switch(c)
3296 {
3297 default: RRETURN(MATCH_NOMATCH);
3298 case 0x000d:
3299 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3300 break;
3301
3302 case 0x000a:
3303 break;
3304
3305 case 0x000b:
3306 case 0x000c:
3307 case 0x0085:
3308 case 0x2028:
3309 case 0x2029:
3310 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3311 break;
3312 }
3313 }
3314 break;
3315
3316 case OP_NOT_HSPACE:
3317 for (i = 1; i <= min; i++)
3318 {
3319 if (eptr >= md->end_subject)
3320 {
3321 SCHECK_PARTIAL();
3322 RRETURN(MATCH_NOMATCH);
3323 }
3324 GETCHARINC(c, eptr);
3325 switch(c)
3326 {
3327 default: break;
3328 case 0x09: /* HT */
3329 case 0x20: /* SPACE */
3330 case 0xa0: /* NBSP */
3331 case 0x1680: /* OGHAM SPACE MARK */
3332 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3333 case 0x2000: /* EN QUAD */
3334 case 0x2001: /* EM QUAD */
3335 case 0x2002: /* EN SPACE */
3336 case 0x2003: /* EM SPACE */
3337 case 0x2004: /* THREE-PER-EM SPACE */
3338 case 0x2005: /* FOUR-PER-EM SPACE */
3339 case 0x2006: /* SIX-PER-EM SPACE */
3340 case 0x2007: /* FIGURE SPACE */
3341 case 0x2008: /* PUNCTUATION SPACE */
3342 case 0x2009: /* THIN SPACE */
3343 case 0x200A: /* HAIR SPACE */
3344 case 0x202f: /* NARROW NO-BREAK SPACE */
3345 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3346 case 0x3000: /* IDEOGRAPHIC SPACE */
3347 RRETURN(MATCH_NOMATCH);
3348 }
3349 }
3350 break;
3351
3352 case OP_HSPACE:
3353 for (i = 1; i <= min; i++)
3354 {
3355 if (eptr >= md->end_subject)
3356 {
3357 SCHECK_PARTIAL();
3358 RRETURN(MATCH_NOMATCH);
3359 }
3360 GETCHARINC(c, eptr);
3361 switch(c)
3362 {
3363 default: RRETURN(MATCH_NOMATCH);
3364 case 0x09: /* HT */
3365 case 0x20: /* SPACE */
3366 case 0xa0: /* NBSP */
3367 case 0x1680: /* OGHAM SPACE MARK */
3368 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3369 case 0x2000: /* EN QUAD */
3370 case 0x2001: /* EM QUAD */
3371 case 0x2002: /* EN SPACE */
3372 case 0x2003: /* EM SPACE */
3373 case 0x2004: /* THREE-PER-EM SPACE */
3374 case 0x2005: /* FOUR-PER-EM SPACE */
3375 case 0x2006: /* SIX-PER-EM SPACE */
3376 case 0x2007: /* FIGURE SPACE */
3377 case 0x2008: /* PUNCTUATION SPACE */
3378 case 0x2009: /* THIN SPACE */
3379 case 0x200A: /* HAIR SPACE */
3380 case 0x202f: /* NARROW NO-BREAK SPACE */
3381 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3382 case 0x3000: /* IDEOGRAPHIC SPACE */
3383 break;
3384 }
3385 }
3386 break;
3387
3388 case OP_NOT_VSPACE:
3389 for (i = 1; i <= min; i++)
3390 {
3391 if (eptr >= md->end_subject)
3392 {
3393 SCHECK_PARTIAL();
3394 RRETURN(MATCH_NOMATCH);
3395 }
3396 GETCHARINC(c, eptr);
3397 switch(c)
3398 {
3399 default: break;
3400 case 0x0a: /* LF */
3401 case 0x0b: /* VT */
3402 case 0x0c: /* FF */
3403 case 0x0d: /* CR */
3404 case 0x85: /* NEL */
3405 case 0x2028: /* LINE SEPARATOR */
3406 case 0x2029: /* PARAGRAPH SEPARATOR */
3407 RRETURN(MATCH_NOMATCH);
3408 }
3409 }
3410 break;
3411
3412 case OP_VSPACE:
3413 for (i = 1; i <= min; i++)
3414 {
3415 if (eptr >= md->end_subject)
3416 {
3417 SCHECK_PARTIAL();
3418 RRETURN(MATCH_NOMATCH);
3419 }
3420 GETCHARINC(c, eptr);
3421 switch(c)
3422 {
3423 default: RRETURN(MATCH_NOMATCH);
3424 case 0x0a: /* LF */
3425 case 0x0b: /* VT */
3426 case 0x0c: /* FF */
3427 case 0x0d: /* CR */
3428 case 0x85: /* NEL */
3429 case 0x2028: /* LINE SEPARATOR */
3430 case 0x2029: /* PARAGRAPH SEPARATOR */
3431 break;
3432 }
3433 }
3434 break;
3435
3436 case OP_NOT_DIGIT:
3437 for (i = 1; i <= min; i++)
3438 {
3439 if (eptr >= md->end_subject)
3440 {
3441 SCHECK_PARTIAL();
3442 RRETURN(MATCH_NOMATCH);
3443 }
3444 GETCHARINC(c, eptr);
3445 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3446 RRETURN(MATCH_NOMATCH);
3447 }
3448 break;
3449
3450 case OP_DIGIT:
3451 for (i = 1; i <= min; i++)
3452 {
3453 if (eptr >= md->end_subject)
3454 {
3455 SCHECK_PARTIAL();
3456 RRETURN(MATCH_NOMATCH);
3457 }
3458 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3459 RRETURN(MATCH_NOMATCH);
3460 /* No need to skip more bytes - we know it's a 1-byte character */
3461 }
3462 break;
3463
3464 case OP_NOT_WHITESPACE:
3465 for (i = 1; i <= min; i++)
3466 {
3467 if (eptr >= md->end_subject)
3468 {
3469 SCHECK_PARTIAL();
3470 RRETURN(MATCH_NOMATCH);
3471 }
3472 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3473 RRETURN(MATCH_NOMATCH);
3474 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3475 }
3476 break;
3477
3478 case OP_WHITESPACE:
3479 for (i = 1; i <= min; i++)
3480 {
3481 if (eptr >= md->end_subject)
3482 {
3483 SCHECK_PARTIAL();
3484 RRETURN(MATCH_NOMATCH);
3485 }
3486 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3487 RRETURN(MATCH_NOMATCH);
3488 /* No need to skip more bytes - we know it's a 1-byte character */
3489 }
3490 break;
3491
3492 case OP_NOT_WORDCHAR:
3493 for (i = 1; i <= min; i++)
3494 {
3495 if (eptr >= md->end_subject ||
3496 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3497 RRETURN(MATCH_NOMATCH);
3498 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3499 }
3500 break;
3501
3502 case OP_WORDCHAR:
3503 for (i = 1; i <= min; i++)
3504 {
3505 if (eptr >= md->end_subject)
3506 {
3507 SCHECK_PARTIAL();
3508 RRETURN(MATCH_NOMATCH);
3509 }
3510 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3511 RRETURN(MATCH_NOMATCH);
3512 /* No need to skip more bytes - we know it's a 1-byte character */
3513 }
3514 break;
3515
3516 default:
3517 RRETURN(PCRE_ERROR_INTERNAL);
3518 } /* End switch(ctype) */
3519
3520 else
3521 #endif /* SUPPORT_UTF8 */
3522
3523 /* Code for the non-UTF-8 case for minimum matching of operators other
3524 than OP_PROP and OP_NOTPROP. */
3525
3526 switch(ctype)
3527 {
3528 case OP_ANY:
3529 for (i = 1; i <= min; i++)
3530 {
3531 if (eptr >= md->end_subject)
3532 {
3533 SCHECK_PARTIAL();
3534 RRETURN(MATCH_NOMATCH);
3535 }
3536 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3537 eptr++;
3538 }
3539 break;
3540
3541 case OP_ALLANY:
3542 if (eptr > md->end_subject - min)
3543 {
3544 SCHECK_PARTIAL();
3545 RRETURN(MATCH_NOMATCH);
3546 }
3547 eptr += min;
3548 break;
3549
3550 case OP_ANYBYTE:
3551 if (eptr > md->end_subject - min)
3552 {
3553 SCHECK_PARTIAL();
3554 RRETURN(MATCH_NOMATCH);
3555 }
3556 eptr += min;
3557 break;
3558
3559 case OP_ANYNL:
3560 for (i = 1; i <= min; i++)
3561 {
3562 if (eptr >= md->end_subject)
3563 {
3564 SCHECK_PARTIAL();
3565 RRETURN(MATCH_NOMATCH);
3566 }
3567 switch(*eptr++)
3568 {
3569 default: RRETURN(MATCH_NOMATCH);
3570 case 0x000d:
3571 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3572 break;
3573 case 0x000a:
3574 break;
3575
3576 case 0x000b:
3577 case 0x000c:
3578 case 0x0085:
3579 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3580 break;
3581 }
3582 }
3583 break;
3584
3585 case OP_NOT_HSPACE:
3586 for (i = 1; i <= min; i++)
3587 {
3588 if (eptr >= md->end_subject)
3589 {
3590 SCHECK_PARTIAL();
3591 RRETURN(MATCH_NOMATCH);
3592 }
3593 switch(*eptr++)
3594 {
3595 default: break;
3596 case 0x09: /* HT */
3597 case 0x20: /* SPACE */
3598 case 0xa0: /* NBSP */
3599 RRETURN(MATCH_NOMATCH);
3600 }
3601 }
3602 break;
3603
3604 case OP_HSPACE:
3605 for (i = 1; i <= min; i++)
3606 {
3607 if (eptr >= md->end_subject)
3608 {
3609 SCHECK_PARTIAL();
3610 RRETURN(MATCH_NOMATCH);
3611 }
3612 switch(*eptr++)
3613 {
3614 default: RRETURN(MATCH_NOMATCH);
3615 case 0x09: /* HT */
3616 case 0x20: /* SPACE */
3617 case 0xa0: /* NBSP */
3618 break;
3619 }
3620 }
3621 break;
3622
3623 case OP_NOT_VSPACE:
3624 for (i = 1; i <= min; i++)
3625 {
3626 if (eptr >= md->end_subject)
3627 {
3628 SCHECK_PARTIAL();
3629 RRETURN(MATCH_NOMATCH);
3630 }
3631 switch(*eptr++)
3632 {
3633 default: break;
3634 case 0x0a: /* LF */
3635 case 0x0b: /* VT */
3636 case 0x0c: /* FF */
3637 case 0x0d: /* CR */
3638 case 0x85: /* NEL */
3639 RRETURN(MATCH_NOMATCH);
3640 }
3641 }
3642 break;
3643
3644 case OP_VSPACE:
3645 for (i = 1; i <= min; i++)
3646 {
3647 if (eptr >= md->end_subject)
3648 {
3649 SCHECK_PARTIAL();
3650 RRETURN(MATCH_NOMATCH);
3651 }
3652 switch(*eptr++)
3653 {
3654 default: RRETURN(MATCH_NOMATCH);
3655 case 0x0a: /* LF */
3656 case 0x0b: /* VT */
3657 case 0x0c: /* FF */
3658 case 0x0d: /* CR */
3659 case 0x85: /* NEL */
3660 break;
3661 }
3662 }
3663 break;
3664
3665 case OP_NOT_DIGIT:
3666 for (i = 1; i <= min; i++)
3667 {
3668 if (eptr >= md->end_subject)
3669 {
3670 SCHECK_PARTIAL();
3671 RRETURN(MATCH_NOMATCH);
3672 }
3673 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3674 }
3675 break;
3676
3677 case OP_DIGIT:
3678 for (i = 1; i <= min; i++)
3679 {
3680 if (eptr >= md->end_subject)
3681 {
3682 SCHECK_PARTIAL();
3683 RRETURN(MATCH_NOMATCH);
3684 }
3685 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3686 }
3687 break;
3688
3689 case OP_NOT_WHITESPACE:
3690 for (i = 1; i <= min; i++)
3691 {
3692 if (eptr >= md->end_subject)
3693 {
3694 SCHECK_PARTIAL();
3695 RRETURN(MATCH_NOMATCH);
3696 }
3697 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3698 }
3699 break;
3700
3701 case OP_WHITESPACE:
3702 for (i = 1; i <= min; i++)
3703 {
3704 if (eptr >= md->end_subject)
3705 {
3706 SCHECK_PARTIAL();
3707 RRETURN(MATCH_NOMATCH);
3708 }
3709 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3710 }
3711 break;
3712
3713 case OP_NOT_WORDCHAR:
3714 for (i = 1; i <= min; i++)
3715 {
3716 if (eptr >= md->end_subject)
3717 {
3718 SCHECK_PARTIAL();
3719 RRETURN(MATCH_NOMATCH);
3720 }
3721 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3722 RRETURN(MATCH_NOMATCH);
3723 }
3724 break;
3725
3726 case OP_WORDCHAR:
3727 for (i = 1; i <= min; i++)
3728 {
3729 if (eptr >= md->end_subject)
3730 {
3731 SCHECK_PARTIAL();
3732 RRETURN(MATCH_NOMATCH);
3733 }
3734 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3735 RRETURN(MATCH_NOMATCH);
3736 }
3737 break;
3738
3739 default:
3740 RRETURN(PCRE_ERROR_INTERNAL);
3741 }
3742 }
3743
3744 /* If min = max, continue at the same level without recursing */
3745
3746 if (min == max) continue;
3747
3748 /* If minimizing, we have to test the rest of the pattern before each
3749 subsequent match. Again, separate the UTF-8 case for speed, and also
3750 separate the UCP cases. */
3751
3752 if (minimize)
3753 {
3754 #ifdef SUPPORT_UCP
3755 if (prop_type >= 0)
3756 {
3757 switch(prop_type)
3758 {
3759 case PT_ANY:
3760 for (fi = min;; fi++)
3761 {
3762 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3763 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3764 if (fi >= max) RRETURN(MATCH_NOMATCH);
3765 if (eptr >= md->end_subject)
3766 {
3767 SCHECK_PARTIAL();
3768 RRETURN(MATCH_NOMATCH);
3769 }
3770 GETCHARINC(c, eptr);
3771 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3772 }
3773 /* Control never gets here */
3774
3775 case PT_LAMP:
3776 for (fi = min;; fi++)
3777 {
3778 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3779 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3780 if (fi >= max) RRETURN(MATCH_NOMATCH);
3781 if (eptr >= md->end_subject)
3782 {
3783 SCHECK_PARTIAL();
3784 RRETURN(MATCH_NOMATCH);
3785 }
3786 GETCHARINC(c, eptr);
3787 prop_chartype = UCD_CHARTYPE(c);
3788 if ((prop_chartype == ucp_Lu ||
3789 prop_chartype == ucp_Ll ||
3790 prop_chartype == ucp_Lt) == prop_fail_result)
3791 RRETURN(MATCH_NOMATCH);
3792 }
3793 /* Control never gets here */
3794
3795 case PT_GC:
3796 for (fi = min;; fi++)
3797 {
3798 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3799 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3800 if (fi >= max) RRETURN(MATCH_NOMATCH);
3801 if (eptr >= md->end_subject)
3802 {
3803 SCHECK_PARTIAL();
3804 RRETURN(MATCH_NOMATCH);
3805 }
3806 GETCHARINC(c, eptr);
3807 prop_category = UCD_CATEGORY(c);
3808 if ((prop_category == prop_value) == prop_fail_result)
3809 RRETURN(MATCH_NOMATCH);
3810 }
3811 /* Control never gets here */
3812
3813 case PT_PC:
3814 for (fi = min;; fi++)
3815 {
3816 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3817 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3818 if (fi >= max) RRETURN(MATCH_NOMATCH);
3819 if (eptr >= md->end_subject)
3820 {
3821 SCHECK_PARTIAL();
3822 RRETURN(MATCH_NOMATCH);
3823 }
3824 GETCHARINC(c, eptr);
3825 prop_chartype = UCD_CHARTYPE(c);
3826 if ((prop_chartype == prop_value) == prop_fail_result)
3827 RRETURN(MATCH_NOMATCH);
3828 }
3829 /* Control never gets here */
3830
3831 case PT_SC:
3832 for (fi = min;; fi++)
3833 {
3834 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3835 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3837 if (eptr >= md->end_subject)
3838 {
3839 SCHECK_PARTIAL();
3840 RRETURN(MATCH_NOMATCH);
3841 }
3842 GETCHARINC(c, eptr);
3843 prop_script = UCD_SCRIPT(c);
3844 if ((prop_script == prop_value) == prop_fail_result)
3845 RRETURN(MATCH_NOMATCH);
3846 }
3847 /* Control never gets here */
3848
3849 default:
3850 RRETURN(PCRE_ERROR_INTERNAL);
3851 }
3852 }
3853
3854 /* Match extended Unicode sequences. We will get here only if the
3855 support is in the binary; otherwise a compile-time error occurs. */
3856
3857 else if (ctype == OP_EXTUNI)
3858 {
3859 for (fi = min;; fi++)
3860 {
3861 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3862 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3863 if (fi >= max) RRETURN(MATCH_NOMATCH);
3864 if (eptr >= md->end_subject)
3865 {
3866 SCHECK_PARTIAL();
3867 RRETURN(MATCH_NOMATCH);
3868 }
3869 GETCHARINCTEST(c, eptr);
3870 prop_category = UCD_CATEGORY(c);
3871 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3872 while (eptr < md->end_subject)
3873 {
3874 int len = 1;
3875 if (!utf8) c = *eptr;
3876 else { GETCHARLEN(c, eptr, len); }
3877 prop_category = UCD_CATEGORY(c);
3878 if (prop_category != ucp_M) break;
3879 eptr += len;
3880 }
3881 }
3882 }
3883
3884 else
3885 #endif /* SUPPORT_UCP */
3886
3887 #ifdef SUPPORT_UTF8
3888 /* UTF-8 mode */
3889 if (utf8)
3890 {
3891 for (fi = min;; fi++)
3892 {
3893 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3894 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3895 if (fi >= max) RRETURN(MATCH_NOMATCH);
3896 if (eptr >= md->end_subject)
3897 {
3898 SCHECK_PARTIAL();
3899 RRETURN(MATCH_NOMATCH);
3900 }
3901 if (ctype == OP_ANY && IS_NEWLINE(eptr))
3902 RRETURN(MATCH_NOMATCH);
3903 GETCHARINC(c, eptr);
3904 switch(ctype)
3905 {
3906 case OP_ANY: /* This is the non-NL case */
3907 case OP_ALLANY:
3908 case OP_ANYBYTE:
3909 break;
3910
3911 case OP_ANYNL:
3912 switch(c)
3913 {
3914 default: RRETURN(MATCH_NOMATCH);
3915 case 0x000d:
3916 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3917 break;
3918 case 0x000a:
3919 break;
3920
3921 case 0x000b:
3922 case 0x000c:
3923 case 0x0085:
3924 case 0x2028:
3925 case 0x2029:
3926 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3927 break;
3928 }
3929 break;
3930
3931 case OP_NOT_HSPACE:
3932 switch(c)
3933 {
3934 default: break;
3935 case 0x09: /* HT */
3936 case 0x20: /* SPACE */
3937 case 0xa0: /* NBSP */
3938 case 0x1680: /* OGHAM SPACE MARK */
3939 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3940 case 0x2000: /* EN QUAD */
3941 case 0x2001: /* EM QUAD */
3942 case 0x2002: /* EN SPACE */
3943 case 0x2003: /* EM SPACE */
3944 case 0x2004: /* THREE-PER-EM SPACE */
3945 case 0x2005: /* FOUR-PER-EM SPACE */
3946 case 0x2006: /* SIX-PER-EM SPACE */
3947 case 0x2007: /* FIGURE SPACE */
3948 case 0x2008: /* PUNCTUATION SPACE */
3949 case 0x2009: /* THIN SPACE */
3950 case 0x200A: /* HAIR SPACE */
3951 case 0x202f: /* NARROW NO-BREAK SPACE */
3952 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3953 case 0x3000: /* IDEOGRAPHIC SPACE */
3954 RRETURN(MATCH_NOMATCH);
3955 }
3956 break;
3957
3958 case OP_HSPACE:
3959 switch(c)
3960 {
3961 default: RRETURN(MATCH_NOMATCH);
3962 case 0x09: /* HT */
3963 case 0x20: /* SPACE */
3964 case 0xa0: /* NBSP */
3965 case 0x1680: /* OGHAM SPACE MARK */
3966 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3967 case 0x2000: /* EN QUAD */
3968 case 0x2001: /* EM QUAD */
3969 case 0x2002: /* EN SPACE */
3970 case 0x2003: /* EM SPACE */
3971 case 0x2004: /* THREE-PER-EM SPACE */
3972 case 0x2005: /* FOUR-PER-EM SPACE */
3973 case 0x2006: /* SIX-PER-EM SPACE */
3974 case 0x2007: /* FIGURE SPACE */
3975 case 0x2008: /* PUNCTUATION SPACE */
3976 case 0x2009: /* THIN SPACE */
3977 case 0x200A: /* HAIR SPACE */
3978 case 0x202f: /* NARROW NO-BREAK SPACE */
3979 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3980 case 0x3000: /* IDEOGRAPHIC SPACE */
3981 break;
3982 }
3983 break;
3984
3985 case OP_NOT_VSPACE:
3986 switch(c)
3987 {
3988 default: break;
3989 case 0x0a: /* LF */
3990 case 0x0b: /* VT */
3991 case 0x0c: /* FF */
3992 case 0x0d: /* CR */
3993 case 0x85: /* NEL */
3994 case 0x2028: /* LINE SEPARATOR */
3995 case 0x2029: /* PARAGRAPH SEPARATOR */
3996 RRETURN(MATCH_NOMATCH);
3997 }
3998 break;
3999
4000 case OP_VSPACE:
4001 switch(c)
4002 {
4003 default: RRETURN(MATCH_NOMATCH);
4004 case 0x0a: /* LF */
4005 case 0x0b: /* VT */
4006 case 0x0c: /* FF */
4007 case 0x0d: /* CR */
4008 case 0x85: /* NEL */
4009 case 0x2028: /* LINE SEPARATOR */
4010 case 0x2029: /* PARAGRAPH SEPARATOR */
4011 break;
4012 }
4013 break;
4014
4015 case OP_NOT_DIGIT:
4016 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4017 RRETURN(MATCH_NOMATCH);
4018 break;
4019
4020 case OP_DIGIT:
4021 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4022 RRETURN(MATCH_NOMATCH);
4023 break;
4024
4025 case OP_NOT_WHITESPACE:
4026 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4027 RRETURN(MATCH_NOMATCH);
4028 break;
4029
4030 case OP_WHITESPACE:
4031 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4032 RRETURN(MATCH_NOMATCH);
4033 break;
4034
4035 case OP_NOT_WORDCHAR:
4036 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4037 RRETURN(MATCH_NOMATCH);
4038 break;
4039
4040 case OP_WORDCHAR:
4041 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4042 RRETURN(MATCH_NOMATCH);
4043 break;
4044
4045 default:
4046 RRETURN(PCRE_ERROR_INTERNAL);
4047 }
4048 }
4049 }
4050 else
4051 #endif
4052 /* Not UTF-8 mode */
4053 {
4054 for (fi = min;; fi++)
4055 {
4056 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4057 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4058 if (fi >= max) RRETURN(MATCH_NOMATCH);
4059 if (eptr >= md->end_subject)
4060 {
4061 SCHECK_PARTIAL();
4062 RRETURN(MATCH_NOMATCH);
4063 }
4064 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4065 RRETURN(MATCH_NOMATCH);
4066 c = *eptr++;
4067 switch(ctype)
4068 {
4069 case OP_ANY: /* This is the non-NL case */
4070 case OP_ALLANY:
4071 case OP_ANYBYTE:
4072 break;
4073
4074 case OP_ANYNL:
4075 switch(c)
4076 {
4077 default: RRETURN(MATCH_NOMATCH);
4078 case 0x000d:
4079 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4080 break;
4081
4082 case 0x000a:
4083 break;
4084
4085 case 0x000b:
4086 case 0x000c:
4087 case 0x0085:
4088 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4089 break;
4090 }
4091 break;
4092
4093 case OP_NOT_HSPACE:
4094 switch(c)
4095 {
4096 default: break;
4097 case 0x09: /* HT */
4098 case 0x20: /* SPACE */
4099 case 0xa0: /* NBSP */
4100 RRETURN(MATCH_NOMATCH);
4101 }
4102 break;
4103
4104 case OP_HSPACE:
4105 switch(c)
4106 {
4107 default: RRETURN(MATCH_NOMATCH);
4108 case 0x09: /* HT */
4109 case 0x20: /* SPACE */
4110 case 0xa0: /* NBSP */
4111 break;
4112 }
4113 break;
4114
4115 case OP_NOT_VSPACE:
4116 switch(c)
4117 {
4118 default: break;
4119 case 0x0a: /* LF */
4120 case 0x0b: /* VT */
4121 case 0x0c: /* FF */
4122 case 0x0d: /* CR */
4123 case 0x85: /* NEL */
4124 RRETURN(MATCH_NOMATCH);
4125 }
4126 break;
4127
4128 case OP_VSPACE:
4129 switch(c)
4130 {
4131 default: RRETURN(MATCH_NOMATCH);
4132 case 0x0a: /* LF */
4133 case 0x0b: /* VT */
4134 case 0x0c: /* FF */
4135 case 0x0d: /* CR */
4136 case 0x85: /* NEL */
4137 break;
4138 }
4139 break;
4140
4141 case OP_NOT_DIGIT:
4142 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
4143 break;
4144
4145 case OP_DIGIT:
4146 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
4147 break;
4148
4149 case OP_NOT_WHITESPACE:
4150 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
4151 break;
4152
4153 case OP_WHITESPACE:
4154 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
4155 break;
4156
4157 case OP_NOT_WORDCHAR:
4158 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
4159 break;
4160
4161 case OP_WORDCHAR:
4162 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
4163 break;
4164
4165 default:
4166 RRETURN(PCRE_ERROR_INTERNAL);
4167 }
4168 }
4169 }
4170 /* Control never gets here */
4171 }
4172
4173 /* If maximizing, it is worth using inline code for speed, doing the type
4174 test once at the start (i.e. keep it out of the loop). Again, keep the
4175 UTF-8 and UCP stuff separate. */
4176
4177 else
4178 {
4179 pp = eptr; /* Remember where we started */
4180
4181 #ifdef SUPPORT_UCP
4182 if (prop_type >= 0)
4183 {
4184 switch(prop_type)
4185 {
4186 case PT_ANY:
4187 for (i = min; i < max; i++)
4188 {
4189 int len = 1;
4190 if (eptr >= md->end_subject) break;
4191 GETCHARLEN(c, eptr, len);
4192 if (prop_fail_result) break;
4193 eptr+= len;
4194 }
4195 break;
4196
4197 case PT_LAMP:
4198 for (i = min; i < max; i++)
4199 {
4200 int len = 1;
4201 if (eptr >= md->end_subject) break;
4202 GETCHARLEN(c, eptr, len);
4203 prop_chartype = UCD_CHARTYPE(c);
4204 if ((prop_chartype == ucp_Lu ||
4205 prop_chartype == ucp_Ll ||
4206 prop_chartype == ucp_Lt) == prop_fail_result)
4207 break;
4208 eptr+= len;
4209 }
4210 break;
4211
4212 case PT_GC:
4213 for (i = min; i < max; i++)
4214 {
4215 int len = 1;
4216 if (eptr >= md->end_subject) break;
4217 GETCHARLEN(c, eptr, len);
4218 prop_category = UCD_CATEGORY(c);
4219 if ((prop_category == prop_value) == prop_fail_result)
4220 break;
4221 eptr+= len;
4222 }
4223 break;
4224
4225 case PT_PC:
4226 for (i = min; i < max; i++)
4227 {
4228 int len = 1;
4229 if (eptr >= md->end_subject) break;
4230 GETCHARLEN(c, eptr, len);
4231 prop_chartype = UCD_CHARTYPE(c);
4232 if ((prop_chartype == prop_value) == prop_fail_result)
4233 break;
4234 eptr+= len;
4235 }
4236 break;
4237
4238 case PT_SC:
4239 for (i = min; i < max; i++)
4240 {
4241 int len = 1;
4242 if (eptr >= md->end_subject) break;
4243 GETCHARLEN(c, eptr, len);
4244 prop_script = UCD_SCRIPT(c);
4245 if ((prop_script == prop_value) == prop_fail_result)
4246 break;
4247 eptr+= len;
4248 }
4249 break;
4250 }
4251
4252 /* eptr is now past the end of the maximum run */
4253
4254 if (possessive) continue;
4255 for(;;)
4256 {
4257 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4258 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4259 if (eptr-- == pp) break; /* Stop if tried at original pos */
4260 if (utf8) BACKCHAR(eptr);
4261 }
4262 }
4263
4264 /* Match extended Unicode sequences. We will get here only if the
4265 support is in the binary; otherwise a compile-time error occurs. */
4266
4267 else if (ctype == OP_EXTUNI)
4268 {
4269 for (i = min; i < max; i++)
4270 {
4271 if (eptr >= md->end_subject) break;
4272 GETCHARINCTEST(c, eptr);
4273 prop_category = UCD_CATEGORY(c);
4274 if (prop_category == ucp_M) break;
4275 while (eptr < md->end_subject)
4276 {
4277 int len = 1;
4278 if (!utf8) c = *eptr; else
4279 {
4280 GETCHARLEN(c, eptr, len);
4281 }
4282 prop_category = UCD_CATEGORY(c);
4283 if (prop_category != ucp_M) break;
4284 eptr += len;
4285 }
4286 }
4287
4288 /* eptr is now past the end of the maximum run */
4289
4290 if (possessive) continue;
4291 for(;;)
4292 {
4293 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4294 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4295 if (eptr-- == pp) break; /* Stop if tried at original pos */
4296 for (;;) /* Move back over one extended */
4297 {
4298 int len = 1;
4299 if (!utf8) c = *eptr; else
4300 {
4301 BACKCHAR(eptr);
4302 GETCHARLEN(c, eptr, len);
4303 }
4304 prop_category = UCD_CATEGORY(c);
4305 if (prop_category != ucp_M) break;
4306 eptr--;
4307 }
4308 }
4309 }
4310
4311 else
4312 #endif /* SUPPORT_UCP */
4313
4314 #ifdef SUPPORT_UTF8
4315 /* UTF-8 mode */
4316
4317 if (utf8)
4318 {
4319 switch(ctype)
4320 {
4321 case OP_ANY:
4322 if (max < INT_MAX)
4323 {
4324 for (i = min; i < max; i++)
4325 {
4326 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4327 eptr++;
4328 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4329 }
4330 }
4331
4332 /* Handle unlimited UTF-8 repeat */
4333
4334 else
4335 {
4336 for (i = min; i < max; i++)
4337 {
4338 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4339 eptr++;
4340 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4341 }
4342 }
4343 break;
4344
4345 case OP_ALLANY:
4346 if (max < INT_MAX)
4347 {
4348 for (i = min; i < max; i++)
4349 {
4350 if (eptr >= md->end_subject) break;
4351 eptr++;
4352 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4353 }
4354 }
4355 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
4356 break;
4357
4358 /* The byte case is the same as non-UTF8 */
4359
4360 case OP_ANYBYTE:
4361 c = max - min;
4362 if (c > (unsigned int)(md->end_subject - eptr))
4363 c = md->end_subject - eptr;
4364 eptr += c;
4365 break;
4366
4367 case OP_ANYNL:
4368 for (i = min; i < max; i++)
4369 {
4370 int len = 1;
4371 if (eptr >= md->end_subject) break;
4372 GETCHARLEN(c, eptr, len);
4373 if (c == 0x000d)
4374 {
4375 if (++eptr >= md->end_subject) break;
4376 if (*eptr == 0x000a) eptr++;
4377 }
4378 else
4379 {
4380 if (c != 0x000a &&
4381 (md->bsr_anycrlf ||
4382 (c != 0x000b && c != 0x000c &&
4383 c != 0x0085 && c != 0x2028 && c != 0x2029)))
4384 break;
4385 eptr += len;
4386 }
4387 }
4388 break;
4389
4390 case OP_NOT_HSPACE:
4391 case OP_HSPACE:
4392 for (i = min; i < max; i++)
4393 {
4394 BOOL gotspace;
4395 int len = 1;
4396 if (eptr >= md->end_subject) break;
4397 GETCHARLEN(c, eptr, len);
4398 switch(c)
4399 {
4400 default: gotspace = FALSE; break;
4401 case 0x09: /* HT */
4402 case 0x20: /* SPACE */
4403 case 0xa0: /* NBSP */
4404 case 0x1680: /* OGHAM SPACE MARK */
4405 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4406 case 0x2000: /* EN QUAD */
4407 case 0x2001: /* EM QUAD */
4408 case 0x2002: /* EN SPACE */
4409 case 0x2003: /* EM SPACE */
4410 case 0x2004: /* THREE-PER-EM SPACE */
4411 case 0x2005: /* FOUR-PER-EM SPACE */
4412 case 0x2006: /* SIX-PER-EM SPACE */
4413 case 0x2007: /* FIGURE SPACE */
4414 case 0x2008: /* PUNCTUATION SPACE */
4415 case 0x2009: /* THIN SPACE */
4416 case 0x200A: /* HAIR SPACE */
4417 case 0x202f: /* NARROW NO-BREAK SPACE */
4418 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4419 case 0x3000: /* IDEOGRAPHIC SPACE */
4420 gotspace = TRUE;
4421 break;
4422 }
4423 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4424 eptr += len;
4425 }
4426 break;
4427
4428 case OP_NOT_VSPACE:
4429 case OP_VSPACE:
4430 for (i = min; i < max; i++)
4431 {
4432 BOOL gotspace;
4433 int len = 1;
4434 if (eptr >= md->end_subject) break;
4435 GETCHARLEN(c, eptr, len);
4436 switch(c)
4437 {
4438 default: gotspace = FALSE; break;
4439 case 0x0a: /* LF */
4440 case 0x0b: /* VT */
4441 case 0x0c: /* FF */
4442 case 0x0d: /* CR */
4443 case 0x85: /* NEL */
4444 case 0x2028: /* LINE SEPARATOR */
4445 case 0x2029: /* PARAGRAPH SEPARATOR */
4446 gotspace = TRUE;
4447 break;
4448 }
4449 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4450 eptr += len;
4451 }
4452 break;
4453
4454 case OP_NOT_DIGIT:
4455 for (i = min; i < max; i++)
4456 {
4457 int len = 1;
4458 if (eptr >= md->end_subject) break;
4459 GETCHARLEN(c, eptr, len);
4460 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4461 eptr+= len;
4462 }
4463 break;
4464
4465 case OP_DIGIT:
4466 for (i = min; i < max; i++)
4467 {
4468 int len = 1;
4469 if (eptr >= md->end_subject) break;
4470 GETCHARLEN(c, eptr, len);
4471 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4472 eptr+= len;
4473 }
4474 break;
4475
4476 case OP_NOT_WHITESPACE:
4477 for (i = min; i < max; i++)
4478 {
4479 int len = 1;
4480 if (eptr >= md->end_subject) break;
4481 GETCHARLEN(c, eptr, len);
4482 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4483 eptr+= len;
4484 }
4485 break;
4486
4487 case OP_WHITESPACE:
4488 for (i = min; i < max; i++)
4489 {
4490 int len = 1;
4491 if (eptr >= md->end_subject) break;
4492 GETCHARLEN(c, eptr, len);
4493 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4494 eptr+= len;
4495 }
4496 break;
4497
4498 case OP_NOT_WORDCHAR:
4499 for (i = min; i < max; i++)
4500 {
4501 int len = 1;
4502 if (eptr >= md->end_subject) break;
4503 GETCHARLEN(c, eptr, len);
4504 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4505 eptr+= len;
4506 }
4507 break;
4508
4509 case OP_WORDCHAR:
4510 for (i = min; i < max; i++)
4511 {
4512 int len = 1;
4513 if (eptr >= md->end_subject) break;
4514 GETCHARLEN(c, eptr, len);
4515 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4516 eptr+= len;
4517 }
4518 break;
4519
4520 default:
4521 RRETURN(PCRE_ERROR_INTERNAL);
4522 }
4523
4524 /* eptr is now past the end of the maximum run */
4525
4526 if (possessive) continue;
4527 for(;;)
4528 {
4529 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4530 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4531 if (eptr-- == pp) break; /* Stop if tried at original pos */
4532 BACKCHAR(eptr);
4533 }
4534 }
4535 else
4536 #endif /* SUPPORT_UTF8 */
4537
4538 /* Not UTF-8 mode */
4539 {
4540 switch(ctype)
4541 {
4542 case OP_ANY:
4543 for (i = min; i < max; i++)
4544 {
4545 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4546 eptr++;
4547 }
4548 break;
4549
4550 case OP_ALLANY:
4551 case OP_ANYBYTE:
4552 c = max - min;
4553 if (c > (unsigned int)(md->end_subject - eptr))
4554 c = md->end_subject - eptr;
4555 eptr += c;
4556 break;
4557
4558 case OP_ANYNL:
4559 for (i = min; i < max; i++)
4560 {
4561 if (eptr >= md->end_subject) break;
4562 c = *eptr;
4563 if (c == 0x000d)
4564 {
4565 if (++eptr >= md->end_subject) break;
4566 if (*eptr == 0x000a) eptr++;
4567 }
4568 else
4569 {
4570 if (c != 0x000a &&
4571 (md->bsr_anycrlf ||
4572 (c != 0x000b && c != 0x000c && c != 0x0085)))
4573 break;
4574 eptr++;
4575 }
4576 }
4577 break;
4578
4579 case OP_NOT_HSPACE:
4580 for (i = min; i < max; i++)
4581 {
4582 if (eptr >= md->end_subject) break;
4583 c = *eptr;
4584 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4585 eptr++;
4586 }
4587 break;
4588
4589 case OP_HSPACE:
4590 for (i = min; i < max; i++)
4591 {
4592 if (eptr >= md->end_subject) break;
4593 c = *eptr;
4594 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4595 eptr++;
4596 }
4597 break;
4598
4599 case OP_NOT_VSPACE:
4600 for (i = min; i < max; i++)
4601 {
4602 if (eptr >= md->end_subject) break;
4603 c = *eptr;
4604 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4605 break;
4606 eptr++;
4607 }
4608 break;
4609
4610 case OP_VSPACE:
4611 for (i = min; i < max; i++)
4612 {
4613 if (eptr >= md->end_subject) break;
4614 c = *eptr;
4615 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4616 break;
4617 eptr++;
4618 }
4619 break;
4620
4621 case OP_NOT_DIGIT:
4622 for (i = min; i < max; i++)
4623 {
4624 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4625 break;
4626 eptr++;
4627 }
4628 break;
4629
4630 case OP_DIGIT:
4631 for (i = min; i < max; i++)
4632 {
4633 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4634 break;
4635 eptr++;
4636 }
4637 break;
4638
4639 case OP_NOT_WHITESPACE:
4640 for (i = min; i < max; i++)
4641 {
4642 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4643 break;
4644 eptr++;
4645 }
4646 break;
4647
4648 case OP_WHITESPACE:
4649 for (i = min; i < max; i++)
4650 {
4651 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4652 break;
4653 eptr++;
4654 }
4655 break;
4656
4657 case OP_NOT_WORDCHAR:
4658 for (i = min; i < max; i++)
4659 {
4660 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4661 break;
4662 eptr++;
4663 }
4664 break;
4665
4666 case OP_WORDCHAR:
4667 for (i = min; i < max; i++)
4668 {
4669 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4670 break;
4671 eptr++;
4672 }
4673 break;
4674
4675 default:
4676 RRETURN(PCRE_ERROR_INTERNAL);
4677 }
4678
4679 /* eptr is now past the end of the maximum run */
4680
4681 if (possessive) continue;
4682 while (eptr >= pp)
4683 {
4684 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4685 eptr--;
4686 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4687 }
4688 }
4689
4690 /* Get here if we can't make it match with any permitted repetitions */
4691
4692 RRETURN(MATCH_NOMATCH);
4693 }
4694 /* Control never gets here */
4695
4696 /* There's been some horrible disaster. Arrival here can only mean there is
4697 something seriously wrong in the code above or the OP_xxx definitions. */
4698
4699 default:
4700 DPRINTF(("Unknown opcode %d\n", *ecode));
4701 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4702 }
4703
4704 /* Do not stick any code in here without much thought; it is assumed
4705 that "continue" in the code above comes out to here to repeat the main
4706 loop. */
4707
4708 } /* End of main loop */
4709 /* Control never reaches here */
4710
4711
4712 /* When compiling to use the heap rather than the stack for recursive calls to
4713 match(), the RRETURN() macro jumps here. The number that is saved in
4714 frame->Xwhere indicates which label we actually want to return to. */
4715
4716 #ifdef NO_RECURSE
4717 #define LBL(val) case val: goto L_RM##val;
4718 HEAP_RETURN:
4719 switch (frame->Xwhere)
4720 {
4721 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4722 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4723 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4724 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4725 LBL(53) LBL(54)
4726 #ifdef SUPPORT_UTF8
4727 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4728 LBL(32) LBL(34) LBL(42) LBL(46)
4729 #ifdef SUPPORT_UCP
4730 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4731 #endif /* SUPPORT_UCP */
4732 #endif /* SUPPORT_UTF8 */
4733 default:
4734 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4735 return PCRE_ERROR_INTERNAL;
4736 }
4737 #undef LBL
4738 #endif /* NO_RECURSE */
4739 }
4740
4741
4742 /***************************************************************************
4743 ****************************************************************************
4744 RECURSION IN THE match() FUNCTION
4745
4746 Undefine all the macros that were defined above to handle this. */
4747
/* Names that are macros only in the NO_RECURSE (heap frame) build of match().
They are removed here so that the same identifiers can be used as ordinary
variable and parameter names in the code that follows. The order of the
removals is irrelevant; they are listed alphabetically. */

#ifdef NO_RECURSE

#undef callpat
#undef charptr
#undef condition
#undef ctype
#undef cur_is_word
#undef data
#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef ims
#undef length
#undef max
#undef min
#undef mstart
#undef new_recursive
#undef newptrb
#undef next
#undef number
#undef offset
#undef offset_top
#undef op
#undef original_ims
#undef pp
#undef prev
#undef prev_is_word
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef saved_eptr
#undef stacksave

#endif  /* NO_RECURSE */

/* fc and fi are macros whether or not NO_RECURSE is set, so they are always
removed. */

#undef fi
#undef fc
4794
4795 /***************************************************************************
4796 ***************************************************************************/
4797
4798
4799
4800 /*************************************************
4801 * Execute a Regular Expression *
4802 *************************************************/
4803
4804 /* This function applies a compiled re to a subject string and picks out
4805 portions of the string if it matches. Two elements in the vector are set for
4806 each substring: the offsets to the start and end of the substring.
4807
4808 Arguments:
4809 argument_re points to the compiled expression
4810 extra_data points to extra data or is NULL
4811 subject points to the subject string
4812 length length of subject string (may contain binary zeros)
4813 start_offset where to start in the subject string
4814 options option bits
4815 offsets points to a vector of ints to be filled in with offsets
4816 offsetcount the number of elements in the vector
4817
4818 Returns: > 0 => success; value is the number of elements filled in
4819 = 0 => success, but offsets is not big enough
4820 -1 => failed to match
4821 < -1 => some kind of unexpected problem
4822 */
4823
4824 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4825 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4826 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4827 int offsetcount)
4828 {
4829 int rc, resetcount, ocount;
4830 int first_byte = -1;
4831 int req_byte = -1;
4832 int req_byte2 = -1;
4833 int newline;
4834 unsigned long int ims;
4835 BOOL using_temporary_offsets = FALSE;
4836 BOOL anchored;
4837 BOOL startline;
4838 BOOL firstline;
4839 BOOL first_byte_caseless = FALSE;
4840 BOOL req_byte_caseless = FALSE;
4841 BOOL utf8;
4842 match_data match_block;
4843 match_data *md = &match_block;
4844 const uschar *tables;
4845 const uschar *start_bits = NULL;
4846 USPTR start_match = (USPTR)subject + start_offset;
4847 USPTR end_subject;
4848 USPTR start_partial = NULL;
4849 USPTR req_byte_ptr = start_match - 1;
4850
4851 pcre_study_data internal_study;
4852 const pcre_study_data *study;
4853
4854 real_pcre internal_re;
4855 const real_pcre *external_re = (const real_pcre *)argument_re;
4856 const real_pcre *re = external_re;
4857
4858 /* Plausibility checks */
4859
4860 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4861 if (re == NULL || subject == NULL ||
4862 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4863 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4864
4865 /* Fish out the optional data from the extra_data structure, first setting
4866 the default values. */
4867
4868 study = NULL;
4869 md->match_limit = MATCH_LIMIT;
4870 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4871 md->callout_data = NULL;
4872
4873 /* The table pointer is always in native byte order. */
4874
4875 tables = external_re->tables;
4876
4877 if (extra_data != NULL)
4878 {
4879 register unsigned int flags = extra_data->flags;
4880 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4881 study = (const pcre_study_data *)extra_data->study_data;
4882 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4883 md->match_limit = extra_data->match_limit;
4884 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4885 md->match_limit_recursion = extra_data->match_limit_recursion;
4886 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4887 md->callout_data = extra_data->callout_data;
4888 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4889 }
4890
4891 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4892 is a feature that makes it possible to save compiled regex and re-use them
4893 in other programs later. */
4894
4895 if (tables == NULL) tables = _pcre_default_tables;
4896
4897 /* Check that the first field in the block is the magic number. If it is not,
4898 test for a regex that was compiled on a host of opposite endianness. If this is
4899 the case, flipped values are put in internal_re and internal_study if there was
4900 study data too. */
4901
4902 if (re->magic_number != MAGIC_NUMBER)
4903 {
4904 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4905 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4906 if (study != NULL) study = &internal_study;
4907 }
4908
4909 /* Set up other data */
4910
4911 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4912 startline = (re->flags & PCRE_STARTLINE) != 0;
4913 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4914
4915 /* The code starts after the real_pcre block and the capture name table. */
4916
4917 md->start_code = (const uschar *)external_re + re->name_table_offset +
4918 re->name_count * re->name_entry_size;
4919
4920 md->start_subject = (USPTR)subject;
4921 md->start_offset = start_offset;
4922 md->end_subject = md->start_subject + length;
4923 end_subject = md->end_subject;
4924
4925 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4926 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4927 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4928
4929 md->notbol = (options & PCRE_NOTBOL) != 0;
4930 md->noteol = (options & PCRE_NOTEOL) != 0;
4931 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4932 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
4933 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
4934 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
4935 md->hitend = FALSE;
4936
4937 md->recursive = NULL; /* No recursion at top level */
4938
4939 md->lcc = tables + lcc_offset;
4940 md->ctypes = tables + ctypes_offset;
4941
4942 /* Handle different \R options. */
4943
4944 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4945 {
4946 case 0:
4947 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4948 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4949 else
4950 #ifdef BSR_ANYCRLF
4951 md->bsr_anycrlf = TRUE;
4952 #else
4953 md->bsr_anycrlf = FALSE;
4954 #endif
4955 break;
4956
4957 case PCRE_BSR_ANYCRLF:
4958 md->bsr_anycrlf = TRUE;
4959 break;
4960
4961 case PCRE_BSR_UNICODE:
4962 md->bsr_anycrlf = FALSE;
4963 break;
4964
4965 default: return PCRE_ERROR_BADNEWLINE;
4966 }
4967
4968 /* Handle different types of newline. The three bits give eight cases. If
4969 nothing is set at run time, whatever was used at compile time applies. */
4970
4971 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4972 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4973 {
4974 case 0: newline = NEWLINE; break; /* Compile-time default */
4975 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4976 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4977 case PCRE_NEWLINE_CR+
4978 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4979 case PCRE_NEWLINE_ANY: newline = -1; break;
4980 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4981 default: return PCRE_ERROR_BADNEWLINE;
4982 }
4983
4984 if (newline == -2)
4985 {
4986 md->nltype = NLTYPE_ANYCRLF;
4987 }
4988 else if (newline < 0)
4989 {
4990 md->nltype = NLTYPE_ANY;
4991 }
4992 else
4993 {
4994 md->nltype = NLTYPE_FIXED;
4995 if (newline > 255)
4996 {
4997 md->nllen = 2;
4998 md->nl[0] = (newline >> 8) & 255;
4999 md->nl[1] = newline & 255;
5000 }
5001 else
5002 {
5003 md->nllen = 1;
5004 md->nl[0] = newline;
5005 }
5006 }
5007
5008 /* Partial matching was originally supported only for a restricted set of
5009 regexes; from release 8.00 there are no restrictions, but the bits are still
5010 defined (though never set). So there's no harm in leaving this code. */
5011
5012 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5013 return PCRE_ERROR_BADPARTIAL;
5014
5015 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5016 back the character offset. */
5017
5018 #ifdef SUPPORT_UTF8
5019 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5020 {
5021 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5022 return PCRE_ERROR_BADUTF8;
5023 if (start_offset > 0 && start_offset < length)
5024 {
5025 int tb = ((USPTR)subject)[start_offset];
5026 if (tb > 127)
5027 {
5028 tb &= 0xc0;
5029 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5030 }
5031 }
5032 }
5033 #endif
5034
5035 /* The ims options can vary during the matching as a result of the presence
5036 of (?ims) items in the pattern. They are kept in a local variable so that
5037 restoring at the exit of a group is easy. */
5038
5039 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5040
5041 /* If the expression has got more back references than the offsets supplied can
5042 hold, we get a temporary chunk of working store to use during the matching.
5043 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5044 of 3. */
5045
5046 ocount = offsetcount - (offsetcount % 3);
5047
5048 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5049 {
5050 ocount = re->top_backref * 3 + 3;
5051 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5052 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5053 using_temporary_offsets = TRUE;
5054 DPRINTF(("Got memory to hold back references\n"));
5055 }
5056 else md->offset_vector = offsets;
5057
5058 md->offset_end = ocount;
5059 md->offset_max = (2*ocount)/3;
5060 md->offset_overflow = FALSE;
5061 md->capture_last = -1;
5062
5063 /* Compute the minimum number of offsets that we need to reset each time. Doing
5064 this makes a huge difference to execution time when there aren't many brackets
5065 in the pattern. */
5066
5067 resetcount = 2 + re->top_bracket * 2;
5068 if (resetcount > offsetcount) resetcount = ocount;
5069
5070 /* Reset the working variable associated with each extraction. These should
5071 never be used unless previously set, but they get saved and restored, and so we
5072 initialize them to avoid reading uninitialized locations. */
5073
5074 if (md->offset_vector != NULL)
5075 {
5076 register int *iptr = md->offset_vector + ocount;
5077 register int *iend = iptr - resetcount/2 + 1;
5078 while (--iptr >= iend) *iptr = -1;
5079 }
5080
5081 /* Set up the first character to match, if available. The first_byte value is
5082 never set for an anchored regular expression, but the anchoring may be forced
5083 at run time, so we have to test for anchoring. The first char may be unset for
5084 an unanchored pattern, of course. If there's no first char and the pattern was
5085 studied, there may be a bitmap of possible first characters. */
5086
5087 if (!anchored)
5088 {
5089 if ((re->flags & PCRE_FIRSTSET) != 0)
5090 {
5091 first_byte = re->first_byte & 255;
5092 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5093 first_byte = md->lcc[first_byte];
5094 }
5095 else
5096 if (!startline && study != NULL &&
5097 (study->options & PCRE_STUDY_MAPPED) != 0)
5098 start_bits = study->start_bits;
5099 }
5100
5101 /* For anchored or unanchored matches, there may be a "last known required
5102 character" set. */
5103
5104 if ((re->flags & PCRE_REQCHSET) != 0)
5105 {
5106 req_byte = re->req_byte & 255;
5107 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5108 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5109 }
5110
5111
5112 /* ==========================================================================*/
5113
5114 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5115 the loop runs just once. */
5116
5117 for(;;)
5118 {
5119 USPTR save_end_subject = end_subject;
5120 USPTR new_start_match;
5121
5122 /* Reset the maximum number of extractions we might see. */
5123
5124 if (md->offset_vector != NULL)
5125 {
5126 register int *iptr = md->offset_vector;
5127 register int *iend = iptr + resetcount;
5128 while (iptr < iend) *iptr++ = -1;
5129 }
5130
5131 /* If firstline is TRUE, the start of the match is constrained to the first
5132 line of a multiline string. That is, the match must be before or at the first
5133 newline. Implement this by temporarily adjusting end_subject so that we stop
5134 scanning at a newline. If the match fails at the newline, later code breaks
5135 this loop. */
5136
5137 if (firstline)
5138 {
5139 USPTR t = start_match;
5140 #ifdef SUPPORT_UTF8
5141 if (utf8)
5142 {
5143 while (t < md->end_subject && !IS_NEWLINE(t))
5144 {
5145 t++;
5146 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5147 }
5148 }
5149 else
5150 #endif
5151 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5152 end_subject = t;
5153 }
5154
5155 /* There are some optimizations that avoid running the match if a known
5156 starting point is not found, or if a known later character is not present.
5157 However, there is an option that disables these, for testing and for ensuring
5158 that all callouts do actually occur. */
5159
5160 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5161 {
5162 /* Advance to a unique first byte if there is one. */
5163
5164 if (first_byte >= 0)
5165 {
5166 if (first_byte_caseless)
5167 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5168 start_match++;
5169 else
5170 while (start_match < end_subject && *start_match != first_byte)
5171 start_match++;
5172 }
5173
5174 /* Or to just after a linebreak for a multiline match */
5175
5176 else if (startline)
5177 {
5178 if (start_match > md->start_subject + start_offset)
5179 {
5180 #ifdef SUPPORT_UTF8
5181 if (utf8)
5182 {
5183 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5184 {
5185 start_match++;
5186 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5187 start_match++;
5188 }
5189 }
5190 else
5191 #endif
5192 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5193 start_match++;
5194
5195 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5196 and we are now at a LF, advance the match position by one more character.
5197 */
5198
5199 if (start_match[-1] == CHAR_CR &&
5200 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5201 start_match < end_subject &&
5202 *start_match == CHAR_NL)
5203 start_match++;
5204 }
5205 }
5206
5207 /* Or to a non-unique first byte after study */
5208
5209 else if (start_bits != NULL)
5210 {
5211 while (start_match < end_subject)
5212 {
5213 register unsigned int c = *start_match;
5214 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5215 else break;
5216 }
5217 }
5218 } /* Starting optimizations */
5219
5220 /* Restore fudged end_subject */
5221
5222 end_subject = save_end_subject;
5223
5224 #ifdef DEBUG /* Sigh. Some compilers never learn. */
5225 printf(">>>> Match against: ");
5226 pchars(start_match, end_subject - start_match, TRUE, md);
5227 printf("\n");
5228 #endif
5229
5230 /* If req_byte is set, we know that that character must appear in the
5231 subject for the match to succeed. If the first character is set, req_byte
5232 must be later in the subject; otherwise the test starts at the match point.
5233 This optimization can save a huge amount of backtracking in patterns with
5234 nested unlimited repeats that aren't going to match. Writing separate code
5235 for cased/caseless versions makes it go faster, as does using an
5236 autoincrement and backing off on a match.
5237
5238 HOWEVER: when the subject string is very, very long, searching to its end
5239 can take a long time, and give bad performance on quite ordinary patterns.
5240 This showed up when somebody was matching something like /^\d+C/ on a
5241 32-megabyte string... so we don't do this when the string is sufficiently
5242 long.
5243
5244 ALSO: this processing is disabled when partial matching is requested, or if
5245 disabling is explicitly requested. */
5246
5247 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
5248 req_byte >= 0 &&
5249 end_subject - start_match < REQ_BYTE_MAX &&
5250 !md->partial)
5251 {
5252 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
5253
5254 /* We don't need to repeat the search if we haven't yet reached the
5255 place we found it at last time. */
5256
5257 if (p > req_byte_ptr)
5258 {
5259 if (req_byte_caseless)
5260 {
5261 while (p < end_subject)
5262 {
5263 register int pp = *p++;
5264 if (pp == req_byte || pp == req_byte2) { p--; break; }
5265 }
5266 }
5267 else
5268 {
5269 while (p < end_subject)
5270 {
5271 if (*p++ == req_byte) { p--; break; }
5272 }
5273 }
5274
5275 /* If we can't find the required character, break the matching loop,
5276 forcing a match failure. */
5277
5278 if (p >= end_subject)
5279 {
5280 rc = MATCH_NOMATCH;
5281 break;
5282 }
5283
5284 /* If we have found the required character, save the point where we
5285 found it, so that we don't search again next time round the loop if
5286 the start hasn't passed this character yet. */
5287
5288 req_byte_ptr = p;
5289 }
5290 }
5291
5292 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
5293 first starting point for which a partial match was found. */
5294
5295 md->start_match_ptr = start_match;
5296 md->start_used_ptr = start_match;
5297 md->match_call_count = 0;
5298 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
5299 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
5300
5301 switch(rc)
5302 {
5303 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
5304 exactly like PRUNE. */
5305
5306 case MATCH_NOMATCH:
5307 case MATCH_PRUNE:
5308 case MATCH_THEN:
5309 new_start_match = start_match + 1;
5310 #ifdef SUPPORT_UTF8
5311 if (utf8)
5312 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
5313 new_start_match++;
5314 #endif
5315 break;
5316
5317 /* SKIP passes back the next starting point explicitly. */
5318
5319 case MATCH_SKIP:
5320 new_start_match = md->start_match_ptr;
5321 break;
5322
5323 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
5324
5325 case MATCH_COMMIT:
5326 rc = MATCH_NOMATCH;
5327 goto ENDLOOP;
5328
5329 /* Any other return is either a match, or some kind of error. */
5330
5331 default:
5332 goto ENDLOOP;
5333 }
5334
5335 /* Control reaches here for the various types of "no match at this point"
5336 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
5337
5338 rc = MATCH_NOMATCH;
5339
5340 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
5341 newline in the subject (though it may continue over the newline). Therefore,
5342 if we have just failed to match, starting at a newline, do not continue. */
5343
5344 if (firstline && IS_NEWLINE(start_match)) break;
5345
5346 /* Advance to new matching position */
5347
5348 start_match = new_start_match;
5349
5350 /* Break the loop if the pattern is anchored or if we have passed the end of
5351 the subject. */
5352
5353 if (anchored || start_match > end_subject) break;
5354
5355 /* If we have just passed a CR and we are now at a LF, and the pattern does
5356 not contain any explicit matches for \r or \n, and the newline option is CRLF
5357 or ANY or ANYCRLF, advance the match position by one more character. */
5358
5359 if (start_match[-1] == CHAR_CR &&
5360 start_match < end_subject &&
5361 *start_match == CHAR_NL &&
5362 (re->flags & PCRE_HASCRORLF) == 0 &&
5363 (md->nltype == NLTYPE_ANY ||
5364 md->nltype == NLTYPE_ANYCRLF ||
5365 md->nllen == 2))
5366 start_match++;
5367
5368 } /* End of for(;;) "bumpalong" loop */
5369
5370 /* ==========================================================================*/
5371
5372 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
5373 conditions is true:
5374
5375 (1) The pattern is anchored or the match was failed by (*COMMIT);
5376
5377 (2) We are past the end of the subject;
5378
5379 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
5380 this option requests that a match occur at or before the first newline in
5381 the subject.
5382
5383 When we have a match and the offset vector is big enough to deal with any
5384 backreferences, captured substring offsets will already be set up. In the case
5385 where we had to get some local store to hold offsets for backreference
5386 processing, copy those that we can. In this case there need not be overflow if
5387 certain parts of the pattern were not used, even though there are more
5388 capturing parentheses than vector slots. */
5389
5390 ENDLOOP:
5391
5392 if (rc == MATCH_MATCH)
5393 {
5394 if (using_temporary_offsets)
5395 {
5396 if (offsetcount >= 4)
5397 {
5398 memcpy(offsets + 2, md->offset_vector + 2,
5399 (offsetcount - 2) * sizeof(int));
5400 DPRINTF(("Copied offsets from temporary memory\n"));
5401 }
5402 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5403 DPRINTF(("Freeing temporary memory\n"));
5404 (pcre_free)(md->offset_vector);
5405 }
5406
5407 /* Set the return code to the number of captured strings, or 0 if there are
5408 too many to fit into the vector. */
5409
5410 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5411
5412 /* If there is space, set up the whole thing as substring 0. The value of
5413 md->start_match_ptr might be modified if \K was encountered on the success
5414 matching path. */
5415
5416 if (offsetcount < 2) rc = 0; else
5417 {
5418 offsets[0] = md->start_match_ptr - md->start_subject;
5419 offsets[1] = md->end_match_ptr - md->start_subject;
5420 }
5421
5422 DPRINTF((">>>> returning %d\n", rc));
5423 return rc;
5424 }
5425
5426 /* Control gets here if there has been an error, or if the overall match
5427 attempt has failed at all permitted starting positions. */
5428
5429 if (using_temporary_offsets)
5430 {
5431 DPRINTF(("Freeing temporary memory\n"));
5432 (pcre_free)(md->offset_vector);
5433 }
5434
5435 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
5436 {
5437 DPRINTF((">>>> error: returning %d\n", rc));
5438 return rc;
5439 }
5440 else if (start_partial != NULL)
5441 {
5442 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5443 if (offsetcount > 1)
5444 {
5445 offsets[0] = start_partial - (USPTR)subject;
5446 offsets[1] = end_subject - (USPTR)subject;
5447 }
5448 return PCRE_ERROR_PARTIAL;
5449 }
5450 else
5451 {
5452 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5453 return PCRE_ERROR_NOMATCH;
5454 }
5455 }
5456
5457 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12