/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 428 - (show annotations) (download)
Mon Aug 31 17:10:26 2009 UTC (5 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 161004 byte(s)
Further partial match change: add PCRE_PARTIAL_HARD and make more intuitive.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73
74 #define MATCH_COMMIT (-999)
75 #define MATCH_PRUNE (-998)
76 #define MATCH_SKIP (-997)
77 #define MATCH_THEN (-996)
78
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. */
82
83 #define REC_STACK_SAVE_MAX 30
84
85 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86
/* Parallel six-entry tables: entry i of rep_min and entry i of rep_max
together give the bounds of one kind of common repeat, with 0 in rep_max
standing for "no upper limit".
NOTE(review): the value pairs (0..inf, 0..inf, 1..inf, 1..inf, 0..1, 0..1)
look like the order *, *?, +, +?, ?, ?? - confirm against the code that
indexes these tables, which is not visible in this part of the file. */
87 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
88 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 {
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 }
179 }
180 else
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. */
186
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
240 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241 below must be updated in sync. */
242
243 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
244 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
245 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
246 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
247 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
248 RM51, RM52, RM53, RM54 };
249
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
253
254 #ifndef NO_RECURSE
255 #define REGISTER register
256
/* Stack-recursion versions of RMATCH and RRETURN. Both variants of RMATCH call
match() directly, passing mstart from the expansion site and rdepth+1, and
leave the result in rrc; the rw argument is ignored. */

/* DEBUG variant: as well as making the call, trace it on stdout with the
source line number before and after. */
257 #ifdef DEBUG
258 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
259 { \
260 printf("match() called in line %d\n", __LINE__); \
261 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
262 printf("to line %d\n", __LINE__); \
263 }
/* DEBUG variant of RRETURN: print the value and line number, then return.
Note that the argument is expanded twice (once for printf, once for return). */
264 #define RRETURN(ra) \
265 { \
266 printf("match() returned %d from line %d ", ra, __LINE__); \
267 return ra; \
268 }
269 #else
/* Production variants: a bare recursive call and a bare return. */
270 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
271 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
272 #define RRETURN(ra) return ra
273 #endif
274
275 #else
276
277
278 /* These versions of the macros manage a private stack on the heap. Note that
279 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280 argument of match(), which never changes. */
281
282 #define REGISTER
283
/* Heap-frame version of RMATCH: simulate a recursive call to match() without
using the C stack.

  ra  new eptr            rb  new ecode          rc  new offset_top
  rd  md (unused here)    re  new ims            rf  new eptrb
  rg  new flags           rw  RMn code naming this call site

A new heapframe is obtained from pcre_stack_malloc and filled in with the
"arguments" (mstart is copied from the current frame and the depth is the
current depth plus one). The current frame records rw in Xwhere, the new frame
becomes current, and control jumps to HEAP_RECURSE. When the simulated call
returns, control comes back to the label L_<rw> generated at the end of this
macro.

If the allocation fails, the current frame gives up at once with
PCRE_ERROR_NOMEMORY via RRETURN, instead of writing through a NULL pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
303
/* Heap-frame version of RRETURN: simulate a return from match(). The current
frame is released with pcre_stack_free and the previous frame becomes current.
If there is a previous frame, the result is placed in rrc and control jumps to
HEAP_RETURN; if the released frame was the top-level one (Xprevframe was NULL),
this is a real return from the function with the value ra. */
304 #define RRETURN(ra)\
305 {\
306 heapframe *newframe = frame;\
307 frame = newframe->Xprevframe;\
308 (pcre_stack_free)(newframe);\
309 if (frame != NULL)\
310 {\
311 rrc = ra;\
312 goto HEAP_RETURN;\
313 }\
314 return ra;\
315 }
316
317
318 /* Structure for remembering the local variables in a private frame */
319
320 typedef struct heapframe {
321 struct heapframe *Xprevframe;
322
323 /* Function arguments that may change */
324
325 USPTR Xeptr;
326 const uschar *Xecode;
327 USPTR Xmstart;
328 int Xoffset_top;
329 long int Xims;
330 eptrblock *Xeptrb;
331 int Xflags;
332 unsigned int Xrdepth;
333
334 /* Function local variables */
335
336 USPTR Xcallpat;
337 #ifdef SUPPORT_UTF8
338 USPTR Xcharptr;
339 #endif
340 USPTR Xdata;
341 USPTR Xnext;
342 USPTR Xpp;
343 USPTR Xprev;
344 USPTR Xsaved_eptr;
345
346 recursion_info Xnew_recursive;
347
348 BOOL Xcur_is_word;
349 BOOL Xcondition;
350 BOOL Xprev_is_word;
351
352 unsigned long int Xoriginal_ims;
353
354 #ifdef SUPPORT_UCP
355 int Xprop_type;
356 int Xprop_value;
357 int Xprop_fail_result;
358 int Xprop_category;
359 int Xprop_chartype;
360 int Xprop_script;
361 int Xoclength;
362 uschar Xocchars[8];
363 #endif
364
365 int Xcodelink;
366 int Xctype;
367 unsigned int Xfc;
368 int Xfi;
369 int Xlength;
370 int Xmax;
371 int Xmin;
372 int Xnumber;
373 int Xoffset;
374 int Xop;
375 int Xsave_capture_last;
376 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
377 int Xstacksave[REC_STACK_SAVE_MAX];
378
379 eptrblock Xnewptrb;
380
381 /* Where to jump back to */
382
383 int Xwhere;
384
385 } heapframe;
386
387 #endif
388
389
390 /***************************************************************************
391 ***************************************************************************/
392
393
394
395 /*************************************************
396 * Match from current position *
397 *************************************************/
398
399 /* This function is called recursively in many circumstances. Whenever it
400 returns a negative (error) response, the outer incarnation must also return the
401 same response. */
402
403 /* These macros pack up tests that are used for partial matching, and which
404 appear several times in the code. We set the "hit end" flag if the pointer is
405 at the end of the subject and also past the start of the subject (i.e.
406 something has been matched). For hard partial matching, we then return
407 immediately. The second one is used when we already know we are past the end of
408 the subject. */
409
/* CHECK_PARTIAL: when partial matching is enabled (md->partial non-zero) and
eptr is at or beyond md->end_subject and also beyond mstart, set md->hitend.
If md->partial is greater than 1, also leave the current match() frame with
PCRE_ERROR_PARTIAL. The macro uses md, eptr and mstart from the expansion
site, and expands to a bare "if" statement with no enclosing wrapper, so it
should not be placed directly before an "else". */
410 #define CHECK_PARTIAL()\
411 if (md->partial && eptr >= md->end_subject && eptr > mstart)\
412 {\
413 md->hitend = TRUE;\
414 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
415 }
416
/* SCHECK_PARTIAL: the same as CHECK_PARTIAL but without the test of eptr
against md->end_subject. */
417 #define SCHECK_PARTIAL()\
418 if (md->partial && eptr > mstart)\
419 {\
420 md->hitend = TRUE;\
421 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
422 }
423
424
425 /* Performance note: It might be tempting to extract commonly used fields from
426 the md structure (e.g. utf8, end_subject) into individual variables to improve
427 performance. Tests using gcc on a SPARC disproved this; in the first case, it
428 made performance worse.
429
430 Arguments:
431 eptr pointer to current character in subject
432 ecode pointer to current position in compiled code
433 mstart pointer to the current match start position (can be modified
434 by encountering \K)
435 offset_top current top pointer
436 md pointer to "static" info for the match
437 ims current /i, /m, and /s options
438 eptrb pointer to chain of blocks containing eptr at start of
439 brackets - for testing for empty matches
440 flags can contain
441 match_condassert - this is an assertion condition
442 match_cbegroup - this is the start of an unlimited repeat
443 group that can match an empty string
444 rdepth the recursion depth
445
446 Returns: MATCH_MATCH if matched ) these values are >= 0
447 MATCH_NOMATCH if failed to match )
448 a negative PCRE_ERROR_xxx value if aborted by an error condition
449 (e.g. stopped by repeated call or recursion limit)
450 */
451
452 static int
453 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
454 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
455 int flags, unsigned int rdepth)
456 {
457 /* These variables do not need to be preserved over recursion in this function,
458 so they can be ordinary variables in all cases. Mark some of them with
459 "register" because they are used a lot in loops. */
460
461 register int rrc; /* Returns from recursive calls */
462 register int i; /* Used for loops not involving calls to RMATCH() */
463 register unsigned int c; /* Character values not kept over RMATCH() calls */
464 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
465
466 BOOL minimize, possessive; /* Quantifier options */
467 int condcode;
468
469 /* When recursion is not being used, all "local" variables that have to be
470 preserved over calls to RMATCH() are part of a "frame" which is obtained from
471 heap storage. Set up the top-level frame here; others are obtained from the
472 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
473
474 #ifdef NO_RECURSE
475 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
476 frame->Xprevframe = NULL; /* Marks the top level */
477
478 /* Copy in the original argument variables */
479
480 frame->Xeptr = eptr;
481 frame->Xecode = ecode;
482 frame->Xmstart = mstart;
483 frame->Xoffset_top = offset_top;
484 frame->Xims = ims;
485 frame->Xeptrb = eptrb;
486 frame->Xflags = flags;
487 frame->Xrdepth = rdepth;
488
489 /* This is where control jumps back to to effect "recursion" */
490
491 HEAP_RECURSE:
492
493 /* Macros make the argument variables come from the current frame */
494
495 #define eptr frame->Xeptr
496 #define ecode frame->Xecode
497 #define mstart frame->Xmstart
498 #define offset_top frame->Xoffset_top
499 #define ims frame->Xims
500 #define eptrb frame->Xeptrb
501 #define flags frame->Xflags
502 #define rdepth frame->Xrdepth
503
504 /* Ditto for the local variables */
505
506 #ifdef SUPPORT_UTF8
507 #define charptr frame->Xcharptr
508 #endif
509 #define callpat frame->Xcallpat
510 #define codelink frame->Xcodelink
511 #define data frame->Xdata
512 #define next frame->Xnext
513 #define pp frame->Xpp
514 #define prev frame->Xprev
515 #define saved_eptr frame->Xsaved_eptr
516
517 #define new_recursive frame->Xnew_recursive
518
519 #define cur_is_word frame->Xcur_is_word
520 #define condition frame->Xcondition
521 #define prev_is_word frame->Xprev_is_word
522
523 #define original_ims frame->Xoriginal_ims
524
525 #ifdef SUPPORT_UCP
526 #define prop_type frame->Xprop_type
527 #define prop_value frame->Xprop_value
528 #define prop_fail_result frame->Xprop_fail_result
529 #define prop_category frame->Xprop_category
530 #define prop_chartype frame->Xprop_chartype
531 #define prop_script frame->Xprop_script
532 #define oclength frame->Xoclength
533 #define occhars frame->Xocchars
534 #endif
535
536 #define ctype frame->Xctype
537 #define fc frame->Xfc
538 #define fi frame->Xfi
539 #define length frame->Xlength
540 #define max frame->Xmax
541 #define min frame->Xmin
542 #define number frame->Xnumber
543 #define offset frame->Xoffset
544 #define op frame->Xop
545 #define save_capture_last frame->Xsave_capture_last
546 #define save_offset1 frame->Xsave_offset1
547 #define save_offset2 frame->Xsave_offset2
548 #define save_offset3 frame->Xsave_offset3
549 #define stacksave frame->Xstacksave
550
551 #define newptrb frame->Xnewptrb
552
553 /* When recursion is being used, local variables are allocated on the stack and
554 get preserved during recursion in the normal way. In this environment, fi and
555 i, and fc and c, can be the same variables. */
556
557 #else /* NO_RECURSE not defined */
558 #define fi i
559 #define fc c
560
561
562 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
563 const uschar *charptr; /* in small blocks of the code. My normal */
564 #endif /* style of coding would have declared */
565 const uschar *callpat; /* them within each of those blocks. */
566 const uschar *data; /* However, in order to accommodate the */
567 const uschar *next; /* version of this code that uses an */
568 USPTR pp; /* external "stack" implemented on the */
569 const uschar *prev; /* heap, it is easier to declare them all */
570 USPTR saved_eptr; /* here, so the declarations can be cut */
571 /* out in a block. The only declarations */
572 recursion_info new_recursive; /* within blocks below are for variables */
573 /* that do not have to be preserved over */
574 BOOL cur_is_word; /* a recursive call to RMATCH(). */
575 BOOL condition;
576 BOOL prev_is_word;
577
578 unsigned long int original_ims;
579
580 #ifdef SUPPORT_UCP
581 int prop_type;
582 int prop_value;
583 int prop_fail_result;
584 int prop_category;
585 int prop_chartype;
586 int prop_script;
587 int oclength;
588 uschar occhars[8];
589 #endif
590
591 int codelink;
592 int ctype;
593 int length;
594 int max;
595 int min;
596 int number;
597 int offset;
598 int op;
599 int save_capture_last;
600 int save_offset1, save_offset2, save_offset3;
601 int stacksave[REC_STACK_SAVE_MAX];
602
603 eptrblock newptrb;
604 #endif /* NO_RECURSE */
605
606 /* These statements are here to stop the compiler complaining about uninitialized
607 variables. */
608
609 #ifdef SUPPORT_UCP
610 prop_value = 0;
611 prop_fail_result = 0;
612 #endif
613
614
615 /* This label is used for tail recursion, which is used in a few cases even
616 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
617 used. Thanks to Ian Taylor for noticing this possibility and sending the
618 original patch. */
619
620 TAIL_RECURSE:
621
622 /* OK, now we can get on with the real code of the function. Recursive calls
623 are specified by the macro RMATCH and RRETURN is used to return. When
624 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
625 and a "return", respectively (possibly with some debugging if DEBUG is
626 defined). However, RMATCH isn't like a function call because it's quite a
627 complicated macro. It has to be used in one particular way. This shouldn't,
628 however, impact performance when true recursion is being used. */
629
630 #ifdef SUPPORT_UTF8
631 utf8 = md->utf8; /* Local copy of the flag */
632 #else
633 utf8 = FALSE;
634 #endif
635
636 /* First check that we haven't called match() too many times, or that we
637 haven't exceeded the recursive call limit. */
638
639 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
640 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
641
642 original_ims = ims; /* Save for resetting on ')' */
643
644 /* At the start of a group with an unlimited repeat that may match an empty
645 string, the match_cbegroup flag is set. When this is the case, add the current
646 subject pointer to the chain of such remembered pointers, to be checked when we
647 hit the closing ket, in order to break infinite loops that match no characters.
648 When match() is called in other circumstances, don't add to the chain. The
649 match_cbegroup flag must NOT be used with tail recursion, because the memory
650 block that is used is on the stack, so a new one may be required for each
651 match(). */
652
653 if ((flags & match_cbegroup) != 0)
654 {
655 newptrb.epb_saved_eptr = eptr;
656 newptrb.epb_prev = eptrb;
657 eptrb = &newptrb;
658 }
659
660 /* Now start processing the opcodes. */
661
662 for (;;)
663 {
664 minimize = possessive = FALSE;
665 op = *ecode;
666
667 switch(op)
668 {
669 case OP_FAIL:
670 RRETURN(MATCH_NOMATCH);
671
672 case OP_PRUNE:
673 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
674 ims, eptrb, flags, RM51);
675 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
676 RRETURN(MATCH_PRUNE);
677
678 case OP_COMMIT:
679 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
680 ims, eptrb, flags, RM52);
681 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
682 RRETURN(MATCH_COMMIT);
683
684 case OP_SKIP:
685 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
686 ims, eptrb, flags, RM53);
687 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
688 md->start_match_ptr = eptr; /* Pass back current position */
689 RRETURN(MATCH_SKIP);
690
691 case OP_THEN:
692 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
693 ims, eptrb, flags, RM54);
694 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
695 RRETURN(MATCH_THEN);
696
697 /* Handle a capturing bracket. If there is space in the offset vector, save
698 the current subject position in the working slot at the top of the vector.
699 We mustn't change the current values of the data slot, because they may be
700 set from a previous iteration of this group, and be referred to by a
701 reference inside the group.
702
703 If the bracket fails to match, we need to restore this value and also the
704 values of the final offsets, in case they were set by a previous iteration
705 of the same bracket.
706
707 If there isn't enough space in the offset vector, treat this as if it were
708 a non-capturing bracket. Don't worry about setting the flag for the error
709 case here; that is handled in the code for KET. */
710
711 case OP_CBRA:
712 case OP_SCBRA:
713 number = GET2(ecode, 1+LINK_SIZE);
714 offset = number << 1;
715
716 #ifdef DEBUG
717 printf("start bracket %d\n", number);
718 printf("subject=");
719 pchars(eptr, 16, TRUE, md);
720 printf("\n");
721 #endif
722
723 if (offset < md->offset_max)
724 {
725 save_offset1 = md->offset_vector[offset];
726 save_offset2 = md->offset_vector[offset+1];
727 save_offset3 = md->offset_vector[md->offset_end - number];
728 save_capture_last = md->capture_last;
729
730 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
731 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
732
733 flags = (op == OP_SCBRA)? match_cbegroup : 0;
734 do
735 {
736 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
737 ims, eptrb, flags, RM1);
738 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
739 md->capture_last = save_capture_last;
740 ecode += GET(ecode, 1);
741 }
742 while (*ecode == OP_ALT);
743
744 DPRINTF(("bracket %d failed\n", number));
745
746 md->offset_vector[offset] = save_offset1;
747 md->offset_vector[offset+1] = save_offset2;
748 md->offset_vector[md->offset_end - number] = save_offset3;
749
750 RRETURN(MATCH_NOMATCH);
751 }
752
753 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
754 as a non-capturing bracket. */
755
756 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
757 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
758
759 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
760
761 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
762 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
763
764 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
765 final alternative within the brackets, we would return the result of a
766 recursive call to match() whatever happened. We can reduce stack usage by
767 turning this into a tail recursion, except in the case when match_cbegroup
768 is set.*/
769
770 case OP_BRA:
771 case OP_SBRA:
772 DPRINTF(("start non-capturing bracket\n"));
773 flags = (op >= OP_SBRA)? match_cbegroup : 0;
774 for (;;)
775 {
776 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
777 {
778 if (flags == 0) /* Not a possibly empty group */
779 {
780 ecode += _pcre_OP_lengths[*ecode];
781 DPRINTF(("bracket 0 tail recursion\n"));
782 goto TAIL_RECURSE;
783 }
784
785 /* Possibly empty group; can't use tail recursion. */
786
787 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
788 eptrb, flags, RM48);
789 RRETURN(rrc);
790 }
791
792 /* For non-final alternatives, continue the loop for a NOMATCH result;
793 otherwise return. */
794
795 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
796 eptrb, flags, RM2);
797 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
798 ecode += GET(ecode, 1);
799 }
800 /* Control never reaches here. */
801
802 /* Conditional group: compilation checked that there are no more than
803 two branches. If the condition is false, skipping the first branch takes us
804 past the end if there is only one branch, but that's OK because that is
805 exactly what going to the ket would do. As there is only one branch to be
806 obeyed, we can use tail recursion to avoid using another stack frame. */
807
808 case OP_COND:
809 case OP_SCOND:
810 codelink= GET(ecode, 1);
811
812 /* Because of the way auto-callout works during compile, a callout item is
813 inserted between OP_COND and an assertion condition. */
814
815 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
816 {
817 if (pcre_callout != NULL)
818 {
819 pcre_callout_block cb;
820 cb.version = 1; /* Version 1 of the callout block */
821 cb.callout_number = ecode[LINK_SIZE+2];
822 cb.offset_vector = md->offset_vector;
823 cb.subject = (PCRE_SPTR)md->start_subject;
824 cb.subject_length = md->end_subject - md->start_subject;
825 cb.start_match = mstart - md->start_subject;
826 cb.current_position = eptr - md->start_subject;
827 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
828 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
829 cb.capture_top = offset_top/2;
830 cb.capture_last = md->capture_last;
831 cb.callout_data = md->callout_data;
832 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
833 if (rrc < 0) RRETURN(rrc);
834 }
835 ecode += _pcre_OP_lengths[OP_CALLOUT];
836 }
837
838 condcode = ecode[LINK_SIZE+1];
839
840 /* Now see what the actual condition is */
841
842 if (condcode == OP_RREF) /* Recursion test */
843 {
844 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
845 condition = md->recursive != NULL &&
846 (offset == RREF_ANY || offset == md->recursive->group_num);
847 ecode += condition? 3 : GET(ecode, 1);
848 }
849
850 else if (condcode == OP_CREF) /* Group used test */
851 {
852 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
853 condition = offset < offset_top && md->offset_vector[offset] >= 0;
854 ecode += condition? 3 : GET(ecode, 1);
855 }
856
857 else if (condcode == OP_DEF) /* DEFINE - always false */
858 {
859 condition = FALSE;
860 ecode += GET(ecode, 1);
861 }
862
863 /* The condition is an assertion. Call match() to evaluate it - setting
864 the final argument match_condassert causes it to stop at the end of an
865 assertion. */
866
867 else
868 {
869 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
870 match_condassert, RM3);
871 if (rrc == MATCH_MATCH)
872 {
873 condition = TRUE;
874 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
875 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
876 }
877 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
878 {
879 RRETURN(rrc); /* Need braces because of following else */
880 }
881 else
882 {
883 condition = FALSE;
884 ecode += codelink;
885 }
886 }
887
888 /* We are now at the branch that is to be obeyed. As there is only one,
889 we can use tail recursion to avoid using another stack frame, except when
890 match_cbegroup is required for an unlimited repeat of a possibly empty
891 group. If the second alternative doesn't exist, we can just plough on. */
892
893 if (condition || *ecode == OP_ALT)
894 {
895 ecode += 1 + LINK_SIZE;
896 if (op == OP_SCOND) /* Possibly empty group */
897 {
898 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
899 RRETURN(rrc);
900 }
901 else /* Group must match something */
902 {
903 flags = 0;
904 goto TAIL_RECURSE;
905 }
906 }
907 else /* Condition false & no alternative */
908 {
909 ecode += 1 + LINK_SIZE;
910 }
911 break;
912
913
914 /* End of the pattern, either real or forced. If we are in a top-level
915 recursion, we should restore the offsets appropriately and continue from
916 after the call. */
917
918 case OP_ACCEPT:
919 case OP_END:
920 if (md->recursive != NULL && md->recursive->group_num == 0)
921 {
922 recursion_info *rec = md->recursive;
923 DPRINTF(("End of pattern in a (?0) recursion\n"));
924 md->recursive = rec->prevrec;
925 memmove(md->offset_vector, rec->offset_save,
926 rec->saved_max * sizeof(int));
927 mstart = rec->save_start;
928 ims = original_ims;
929 ecode = rec->after_call;
930 break;
931 }
932
933 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
934 string - backtracking will then try other alternatives, if any. */
935
936 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
937 md->end_match_ptr = eptr; /* Record where we ended */
938 md->end_offset_top = offset_top; /* and how many extracts were taken */
939 md->start_match_ptr = mstart; /* and the start (\K can modify) */
940 RRETURN(MATCH_MATCH);
941
942 /* Change option settings */
943
944 case OP_OPT:
945 ims = ecode[1];
946 ecode += 2;
947 DPRINTF(("ims set to %02lx\n", ims));
948 break;
949
950 /* Assertion brackets. Check the alternative branches in turn - the
951 matching won't pass the KET for an assertion. If any one branch matches,
952 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
953 start of each branch to move the current point backwards, so the code at
954 this level is identical to the lookahead case. */
955
956 case OP_ASSERT:
957 case OP_ASSERTBACK:
958 do
959 {
960 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
961 RM4);
962 if (rrc == MATCH_MATCH) break;
963 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
964 ecode += GET(ecode, 1);
965 }
966 while (*ecode == OP_ALT);
967 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
968
969 /* If checking an assertion for a condition, return MATCH_MATCH. */
970
971 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
972
973 /* Continue from after the assertion, updating the offsets high water
974 mark, since extracts may have been taken during the assertion. */
975
976 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
977 ecode += 1 + LINK_SIZE;
978 offset_top = md->end_offset_top;
979 continue;
980
981 /* Negative assertion: all branches must fail to match */
982
983 case OP_ASSERT_NOT:
984 case OP_ASSERTBACK_NOT:
985 do
986 {
987 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
988 RM5);
989 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
990 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
991 ecode += GET(ecode,1);
992 }
993 while (*ecode == OP_ALT);
994
995 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
996
997 ecode += 1 + LINK_SIZE;
998 continue;
999
1000 /* Move the subject pointer back. This occurs only at the start of
1001 each branch of a lookbehind assertion. If we are too close to the start to
1002 move back, this match function fails. When working with UTF-8 we move
1003 back a number of characters, not bytes. */
1004
1005 case OP_REVERSE:
1006 #ifdef SUPPORT_UTF8
1007 if (utf8)
1008 {
1009 i = GET(ecode, 1);
1010 while (i-- > 0)
1011 {
1012 eptr--;
1013 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1014 BACKCHAR(eptr);
1015 }
1016 }
1017 else
1018 #endif
1019
1020 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1021
1022 {
1023 eptr -= GET(ecode, 1);
1024 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1025 }
1026
1027 /* Skip to next op code */
1028
1029 ecode += 1 + LINK_SIZE;
1030 break;
1031
1032 /* The callout item calls an external function, if one is provided, passing
1033 details of the match so far. This is mainly for debugging, though the
1034 function is able to force a failure. */
1035
1036 case OP_CALLOUT:
1037 if (pcre_callout != NULL)
1038 {
1039 pcre_callout_block cb;
1040 cb.version = 1; /* Version 1 of the callout block */
1041 cb.callout_number = ecode[1];
1042 cb.offset_vector = md->offset_vector;
1043 cb.subject = (PCRE_SPTR)md->start_subject;
1044 cb.subject_length = md->end_subject - md->start_subject;
1045 cb.start_match = mstart - md->start_subject;
1046 cb.current_position = eptr - md->start_subject;
1047 cb.pattern_position = GET(ecode, 2);
1048 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1049 cb.capture_top = offset_top/2;
1050 cb.capture_last = md->capture_last;
1051 cb.callout_data = md->callout_data;
1052 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1053 if (rrc < 0) RRETURN(rrc);
1054 }
1055 ecode += 2 + 2*LINK_SIZE;
1056 break;
1057
1058 /* Recursion either matches the current regex, or some subexpression. The
1059 offset data is the offset to the starting bracket from the start of the
1060 whole pattern. (This is so that it works from duplicated subpatterns.)
1061
1062 If there are any capturing brackets started but not finished, we have to
1063 save their starting points and reinstate them after the recursion. However,
1064 we don't know how many such there are (offset_top records the completed
1065 total) so we just have to save all the potential data. There may be up to
1066 65535 such values, which is too large to put on the stack, but using malloc
1067 for small numbers seems expensive. As a compromise, the stack is used when
1068 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1069 is used. If the malloc fails, the saved offsets cannot be stored, so the
1070 recursion is not attempted and PCRE_ERROR_NOMEMORY is returned instead,
1071 abandoning the match at this point.
1072
1073 There are also other values that have to be saved. We use a chained
1074 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1075 for the original version of this logic. */
1076
1077 case OP_RECURSE:
1078 {
1079 callpat = md->start_code + GET(ecode, 1);
1080 new_recursive.group_num = (callpat == md->start_code)? 0 :
1081 GET2(callpat, 1 + LINK_SIZE);
1082
1083 /* Add to "recursing stack" */
1084
1085 new_recursive.prevrec = md->recursive;
1086 md->recursive = &new_recursive;
1087
1088 /* Find where to continue from afterwards */
1089
1090 ecode += 1 + LINK_SIZE;
1091 new_recursive.after_call = ecode;
1092
1093 /* Now save the offset data. */
1094
1095 new_recursive.saved_max = md->offset_end;
1096 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1097 new_recursive.offset_save = stacksave;
1098 else
1099 {
1100 new_recursive.offset_save =
1101 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1102 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1103 }
1104
1105 memcpy(new_recursive.offset_save, md->offset_vector,
1106 new_recursive.saved_max * sizeof(int));
1107 new_recursive.save_start = mstart;
1108 mstart = eptr;
1109
1110 /* OK, now we can do the recursion. For each top-level alternative we
1111 restore the offset and recursion data. */
1112
1113 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1114 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1115 do
1116 {
1117 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1118 md, ims, eptrb, flags, RM6);
1119 if (rrc == MATCH_MATCH)
1120 {
1121 DPRINTF(("Recursion matched\n"));
1122 md->recursive = new_recursive.prevrec;
1123 if (new_recursive.offset_save != stacksave)
1124 (pcre_free)(new_recursive.offset_save);
1125 RRETURN(MATCH_MATCH);
1126 }
1127 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1128 {
1129 DPRINTF(("Recursion gave error %d\n", rrc));
1130 if (new_recursive.offset_save != stacksave)
1131 (pcre_free)(new_recursive.offset_save);
1132 RRETURN(rrc);
1133 }
1134
1135 md->recursive = &new_recursive;
1136 memcpy(md->offset_vector, new_recursive.offset_save,
1137 new_recursive.saved_max * sizeof(int));
1138 callpat += GET(callpat, 1);
1139 }
1140 while (*callpat == OP_ALT);
1141
1142 DPRINTF(("Recursion didn't match\n"));
1143 md->recursive = new_recursive.prevrec;
1144 if (new_recursive.offset_save != stacksave)
1145 (pcre_free)(new_recursive.offset_save);
1146 RRETURN(MATCH_NOMATCH);
1147 }
1148 /* Control never reaches here */
1149
1150 /* "Once" brackets are like assertion brackets except that after a match,
1151 the point in the subject string is not moved back. Thus there can never be
1152 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1153 Check the alternative branches in turn - the matching won't pass the KET
1154 for this kind of subpattern. If any one branch matches, we carry on as at
1155 the end of a normal bracket, leaving the subject pointer. */
1156
1157 case OP_ONCE:
1158 prev = ecode;
1159 saved_eptr = eptr;
1160
1161 do
1162 {
1163 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1164 if (rrc == MATCH_MATCH) break;
1165 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1166 ecode += GET(ecode,1);
1167 }
1168 while (*ecode == OP_ALT);
1169
1170 /* If we have hit the end of the group (which could be repeated), fail */
1171
1172 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1173
1174 /* Continue as from after the assertion, updating the offsets high water
1175 mark, since extracts may have been taken. */
1176
1177 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1178
1179 offset_top = md->end_offset_top;
1180 eptr = md->end_match_ptr;
1181
1182 /* For a non-repeating ket, just continue at this level. This also
1183 happens for a repeating ket if no characters were matched in the group.
1184 This is the forcible breaking of infinite loops as implemented in Perl
1185 5.005. If there is an options reset, it will get obeyed in the normal
1186 course of events. */
1187
1188 if (*ecode == OP_KET || eptr == saved_eptr)
1189 {
1190 ecode += 1+LINK_SIZE;
1191 break;
1192 }
1193
1194 /* The repeating kets try the rest of the pattern or restart from the
1195 preceding bracket, in the appropriate order. The second "call" of match()
1196 uses tail recursion, to avoid using another stack frame. We need to reset
1197 any options that changed within the bracket before re-running it, so
1198 check the next opcode. */
1199
1200 if (ecode[1+LINK_SIZE] == OP_OPT)
1201 {
1202 ims = (ims & ~PCRE_IMS) | ecode[4];
1203 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1204 }
1205
1206 if (*ecode == OP_KETRMIN)
1207 {
1208 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1209 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1210 ecode = prev;
1211 flags = 0;
1212 goto TAIL_RECURSE;
1213 }
1214 else /* OP_KETRMAX */
1215 {
1216 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1217 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1218 ecode += 1 + LINK_SIZE;
1219 flags = 0;
1220 goto TAIL_RECURSE;
1221 }
1222 /* Control never gets here */
1223
1224 /* An alternation is the end of a branch; scan along to find the end of the
1225 bracketed group and go to there. */
1226
1227 case OP_ALT:
1228 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1229 break;
1230
1231 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1232 indicating that it may occur zero times. It may repeat infinitely, or not
1233 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1234 with fixed upper repeat limits are compiled as a number of copies, with the
1235 optional ones preceded by BRAZERO or BRAMINZERO. */
1236
1237 case OP_BRAZERO:
1238 {
1239 next = ecode+1;
1240 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1241 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1242 do next += GET(next,1); while (*next == OP_ALT);
1243 ecode = next + 1 + LINK_SIZE;
1244 }
1245 break;
1246
1247 case OP_BRAMINZERO:
1248 {
1249 next = ecode+1;
1250 do next += GET(next, 1); while (*next == OP_ALT);
1251 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1252 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1253 ecode++;
1254 }
1255 break;
1256
1257 case OP_SKIPZERO:
1258 {
1259 next = ecode+1;
1260 do next += GET(next,1); while (*next == OP_ALT);
1261 ecode = next + 1 + LINK_SIZE;
1262 }
1263 break;
1264
1265 /* End of a group, repeated or non-repeating. */
1266
1267 case OP_KET:
1268 case OP_KETRMIN:
1269 case OP_KETRMAX:
1270 prev = ecode - GET(ecode, 1);
1271
1272 /* If this was a group that remembered the subject start, in order to break
1273 infinite repeats of empty string matches, retrieve the subject start from
1274 the chain. Otherwise, set it NULL. */
1275
1276 if (*prev >= OP_SBRA)
1277 {
1278 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1279 eptrb = eptrb->epb_prev; /* Backup to previous group */
1280 }
1281 else saved_eptr = NULL;
1282
1283 /* If we are at the end of an assertion group, stop matching and return
1284 MATCH_MATCH, but record the current high water mark for use by positive
1285 assertions. Do this also for the "once" (atomic) groups. */
1286
1287 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1288 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1289 *prev == OP_ONCE)
1290 {
1291 md->end_match_ptr = eptr; /* For ONCE */
1292 md->end_offset_top = offset_top;
1293 RRETURN(MATCH_MATCH);
1294 }
1295
1296 /* For capturing groups we have to check the group number back at the start
1297 and if necessary complete handling an extraction by setting the offsets and
1298 bumping the high water mark. Note that whole-pattern recursion is coded as
1299 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1300 when the OP_END is reached. Other recursion is handled here. */
1301
1302 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1303 {
1304 number = GET2(prev, 1+LINK_SIZE);
1305 offset = number << 1;
1306
1307 #ifdef DEBUG
1308 printf("end bracket %d", number);
1309 printf("\n");
1310 #endif
1311
1312 md->capture_last = number;
1313 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1314 {
1315 md->offset_vector[offset] =
1316 md->offset_vector[md->offset_end - number];
1317 md->offset_vector[offset+1] = eptr - md->start_subject;
1318 if (offset_top <= offset) offset_top = offset + 2;
1319 }
1320
1321 /* Handle a recursively called group. Restore the offsets
1322 appropriately and continue from after the call. */
1323
1324 if (md->recursive != NULL && md->recursive->group_num == number)
1325 {
1326 recursion_info *rec = md->recursive;
1327 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1328 md->recursive = rec->prevrec;
1329 mstart = rec->save_start;
1330 memcpy(md->offset_vector, rec->offset_save,
1331 rec->saved_max * sizeof(int));
1332 ecode = rec->after_call;
1333 ims = original_ims;
1334 break;
1335 }
1336 }
1337
1338 /* For both capturing and non-capturing groups, reset the value of the ims
1339 flags, in case they got changed during the group. */
1340
1341 ims = original_ims;
1342 DPRINTF(("ims reset to %02lx\n", ims));
1343
1344 /* For a non-repeating ket, just continue at this level. This also
1345 happens for a repeating ket if no characters were matched in the group.
1346 This is the forcible breaking of infinite loops as implemented in Perl
1347 5.005. If there is an options reset, it will get obeyed in the normal
1348 course of events. */
1349
1350 if (*ecode == OP_KET || eptr == saved_eptr)
1351 {
1352 ecode += 1 + LINK_SIZE;
1353 break;
1354 }
1355
1356 /* The repeating kets try the rest of the pattern or restart from the
1357 preceding bracket, in the appropriate order. In the second case, we can use
1358 tail recursion to avoid using another stack frame, unless we have an
1359 unlimited repeat of a group that can match an empty string. */
1360
1361 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1362
1363 if (*ecode == OP_KETRMIN)
1364 {
1365 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1366 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1367 if (flags != 0) /* Could match an empty string */
1368 {
1369 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1370 RRETURN(rrc);
1371 }
1372 ecode = prev;
1373 goto TAIL_RECURSE;
1374 }
1375 else /* OP_KETRMAX */
1376 {
1377 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1378 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1379 ecode += 1 + LINK_SIZE;
1380 flags = 0;
1381 goto TAIL_RECURSE;
1382 }
1383 /* Control never gets here */
1384
1385 /* Start of subject unless notbol, or after internal newline if multiline */
1386
1387 case OP_CIRC:
1388 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1389 if ((ims & PCRE_MULTILINE) != 0)
1390 {
1391 if (eptr != md->start_subject &&
1392 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1393 RRETURN(MATCH_NOMATCH);
1394 ecode++;
1395 break;
1396 }
1397 /* ... else fall through */
1398
1399 /* Start of subject assertion */
1400
1401 case OP_SOD:
1402 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1403 ecode++;
1404 break;
1405
1406 /* Start of match assertion */
1407
1408 case OP_SOM:
1409 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1410 ecode++;
1411 break;
1412
1413 /* Reset the start of match point */
1414
1415 case OP_SET_SOM:
1416 mstart = eptr;
1417 ecode++;
1418 break;
1419
1420 /* Assert before internal newline if multiline, or before a terminating
1421 newline unless endonly is set, else end of subject unless noteol is set. */
1422
1423 case OP_DOLL:
1424 if ((ims & PCRE_MULTILINE) != 0)
1425 {
1426 if (eptr < md->end_subject)
1427 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1428 else
1429 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1430 ecode++;
1431 break;
1432 }
1433 else
1434 {
1435 if (md->noteol) RRETURN(MATCH_NOMATCH);
1436 if (!md->endonly)
1437 {
1438 if (eptr != md->end_subject &&
1439 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1440 RRETURN(MATCH_NOMATCH);
1441 ecode++;
1442 break;
1443 }
1444 }
1445 /* ... else fall through for endonly */
1446
1447 /* End of subject assertion (\z) */
1448
1449 case OP_EOD:
1450 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1451 ecode++;
1452 break;
1453
1454 /* End of subject or ending \n assertion (\Z) */
1455
1456 case OP_EODN:
1457 if (eptr != md->end_subject &&
1458 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1459 RRETURN(MATCH_NOMATCH);
1460 ecode++;
1461 break;
1462
1463 /* Word boundary assertions */
1464
1465 case OP_NOT_WORD_BOUNDARY:
1466 case OP_WORD_BOUNDARY:
1467 {
1468
1469 /* Find out if the previous and current characters are "word" characters.
1470 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1471 be "non-word" characters. */
1472
1473 #ifdef SUPPORT_UTF8
1474 if (utf8)
1475 {
1476 if (eptr == md->start_subject) prev_is_word = FALSE; else
1477 {
1478 USPTR lastptr = eptr - 1;
1479 while((*lastptr & 0xc0) == 0x80) lastptr--;
1480 GETCHAR(c, lastptr);
1481 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1482 }
1483 if (eptr >= md->end_subject)
1484 {
1485 SCHECK_PARTIAL();
1486 cur_is_word = FALSE;
1487 }
1488 else
1489 {
1490 GETCHAR(c, eptr);
1491 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1492 }
1493 }
1494 else
1495 #endif
1496
1497 /* Not in UTF-8 mode */
1498
1499 {
1500 prev_is_word = (eptr != md->start_subject) &&
1501 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1502 if (eptr >= md->end_subject)
1503 {
1504 SCHECK_PARTIAL();
1505 cur_is_word = FALSE;
1506 }
1507 else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1508 }
1509
1510 /* Now see if the situation is what we want */
1511
1512 if ((*ecode++ == OP_WORD_BOUNDARY)?
1513 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1514 RRETURN(MATCH_NOMATCH);
1515 }
1516 break;
1517
1518 /* Match a single character type; inline for speed */
1519
1520 case OP_ANY:
1521 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1522 /* Fall through */
1523
1524 case OP_ALLANY:
1525 if (eptr++ >= md->end_subject)
1526 {
1527 SCHECK_PARTIAL();
1528 RRETURN(MATCH_NOMATCH);
1529 }
1530 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1531 ecode++;
1532 break;
1533
1534 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1535 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1536
1537 case OP_ANYBYTE:
1538 if (eptr++ >= md->end_subject)
1539 {
1540 SCHECK_PARTIAL();
1541 RRETURN(MATCH_NOMATCH);
1542 }
1543 ecode++;
1544 break;
1545
1546 case OP_NOT_DIGIT:
1547 if (eptr >= md->end_subject)
1548 {
1549 SCHECK_PARTIAL();
1550 RRETURN(MATCH_NOMATCH);
1551 }
1552 GETCHARINCTEST(c, eptr);
1553 if (
1554 #ifdef SUPPORT_UTF8
1555 c < 256 &&
1556 #endif
1557 (md->ctypes[c] & ctype_digit) != 0
1558 )
1559 RRETURN(MATCH_NOMATCH);
1560 ecode++;
1561 break;
1562
1563 case OP_DIGIT:
1564 if (eptr >= md->end_subject)
1565 {
1566 SCHECK_PARTIAL();
1567 RRETURN(MATCH_NOMATCH);
1568 }
1569 GETCHARINCTEST(c, eptr);
1570 if (
1571 #ifdef SUPPORT_UTF8
1572 c >= 256 ||
1573 #endif
1574 (md->ctypes[c] & ctype_digit) == 0
1575 )
1576 RRETURN(MATCH_NOMATCH);
1577 ecode++;
1578 break;
1579
1580 case OP_NOT_WHITESPACE:
1581 if (eptr >= md->end_subject)
1582 {
1583 SCHECK_PARTIAL();
1584 RRETURN(MATCH_NOMATCH);
1585 }
1586 GETCHARINCTEST(c, eptr);
1587 if (
1588 #ifdef SUPPORT_UTF8
1589 c < 256 &&
1590 #endif
1591 (md->ctypes[c] & ctype_space) != 0
1592 )
1593 RRETURN(MATCH_NOMATCH);
1594 ecode++;
1595 break;
1596
1597 case OP_WHITESPACE:
1598 if (eptr >= md->end_subject)
1599 {
1600 SCHECK_PARTIAL();
1601 RRETURN(MATCH_NOMATCH);
1602 }
1603 GETCHARINCTEST(c, eptr);
1604 if (
1605 #ifdef SUPPORT_UTF8
1606 c >= 256 ||
1607 #endif
1608 (md->ctypes[c] & ctype_space) == 0
1609 )
1610 RRETURN(MATCH_NOMATCH);
1611 ecode++;
1612 break;
1613
1614 case OP_NOT_WORDCHAR:
1615 if (eptr >= md->end_subject)
1616 {
1617 SCHECK_PARTIAL();
1618 RRETURN(MATCH_NOMATCH);
1619 }
1620 GETCHARINCTEST(c, eptr);
1621 if (
1622 #ifdef SUPPORT_UTF8
1623 c < 256 &&
1624 #endif
1625 (md->ctypes[c] & ctype_word) != 0
1626 )
1627 RRETURN(MATCH_NOMATCH);
1628 ecode++;
1629 break;
1630
1631 case OP_WORDCHAR:
1632 if (eptr >= md->end_subject)
1633 {
1634 SCHECK_PARTIAL();
1635 RRETURN(MATCH_NOMATCH);
1636 }
1637 GETCHARINCTEST(c, eptr);
1638 if (
1639 #ifdef SUPPORT_UTF8
1640 c >= 256 ||
1641 #endif
1642 (md->ctypes[c] & ctype_word) == 0
1643 )
1644 RRETURN(MATCH_NOMATCH);
1645 ecode++;
1646 break;
1647
1648 case OP_ANYNL:
1649 if (eptr >= md->end_subject)
1650 {
1651 SCHECK_PARTIAL();
1652 RRETURN(MATCH_NOMATCH);
1653 }
1654 GETCHARINCTEST(c, eptr);
1655 switch(c)
1656 {
1657 default: RRETURN(MATCH_NOMATCH);
1658 case 0x000d:
1659 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1660 break;
1661
1662 case 0x000a:
1663 break;
1664
1665 case 0x000b:
1666 case 0x000c:
1667 case 0x0085:
1668 case 0x2028:
1669 case 0x2029:
1670 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1671 break;
1672 }
1673 ecode++;
1674 break;
1675
1676 case OP_NOT_HSPACE:
1677 if (eptr >= md->end_subject)
1678 {
1679 SCHECK_PARTIAL();
1680 RRETURN(MATCH_NOMATCH);
1681 }
1682 GETCHARINCTEST(c, eptr);
1683 switch(c)
1684 {
1685 default: break;
1686 case 0x09: /* HT */
1687 case 0x20: /* SPACE */
1688 case 0xa0: /* NBSP */
1689 case 0x1680: /* OGHAM SPACE MARK */
1690 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1691 case 0x2000: /* EN QUAD */
1692 case 0x2001: /* EM QUAD */
1693 case 0x2002: /* EN SPACE */
1694 case 0x2003: /* EM SPACE */
1695 case 0x2004: /* THREE-PER-EM SPACE */
1696 case 0x2005: /* FOUR-PER-EM SPACE */
1697 case 0x2006: /* SIX-PER-EM SPACE */
1698 case 0x2007: /* FIGURE SPACE */
1699 case 0x2008: /* PUNCTUATION SPACE */
1700 case 0x2009: /* THIN SPACE */
1701 case 0x200A: /* HAIR SPACE */
1702 case 0x202f: /* NARROW NO-BREAK SPACE */
1703 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1704 case 0x3000: /* IDEOGRAPHIC SPACE */
1705 RRETURN(MATCH_NOMATCH);
1706 }
1707 ecode++;
1708 break;
1709
1710 case OP_HSPACE:
1711 if (eptr >= md->end_subject)
1712 {
1713 SCHECK_PARTIAL();
1714 RRETURN(MATCH_NOMATCH);
1715 }
1716 GETCHARINCTEST(c, eptr);
1717 switch(c)
1718 {
1719 default: RRETURN(MATCH_NOMATCH);
1720 case 0x09: /* HT */
1721 case 0x20: /* SPACE */
1722 case 0xa0: /* NBSP */
1723 case 0x1680: /* OGHAM SPACE MARK */
1724 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1725 case 0x2000: /* EN QUAD */
1726 case 0x2001: /* EM QUAD */
1727 case 0x2002: /* EN SPACE */
1728 case 0x2003: /* EM SPACE */
1729 case 0x2004: /* THREE-PER-EM SPACE */
1730 case 0x2005: /* FOUR-PER-EM SPACE */
1731 case 0x2006: /* SIX-PER-EM SPACE */
1732 case 0x2007: /* FIGURE SPACE */
1733 case 0x2008: /* PUNCTUATION SPACE */
1734 case 0x2009: /* THIN SPACE */
1735 case 0x200A: /* HAIR SPACE */
1736 case 0x202f: /* NARROW NO-BREAK SPACE */
1737 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1738 case 0x3000: /* IDEOGRAPHIC SPACE */
1739 break;
1740 }
1741 ecode++;
1742 break;
1743
1744 case OP_NOT_VSPACE:
1745 if (eptr >= md->end_subject)
1746 {
1747 SCHECK_PARTIAL();
1748 RRETURN(MATCH_NOMATCH);
1749 }
1750 GETCHARINCTEST(c, eptr);
1751 switch(c)
1752 {
1753 default: break;
1754 case 0x0a: /* LF */
1755 case 0x0b: /* VT */
1756 case 0x0c: /* FF */
1757 case 0x0d: /* CR */
1758 case 0x85: /* NEL */
1759 case 0x2028: /* LINE SEPARATOR */
1760 case 0x2029: /* PARAGRAPH SEPARATOR */
1761 RRETURN(MATCH_NOMATCH);
1762 }
1763 ecode++;
1764 break;
1765
1766 case OP_VSPACE:
1767 if (eptr >= md->end_subject)
1768 {
1769 SCHECK_PARTIAL();
1770 RRETURN(MATCH_NOMATCH);
1771 }
1772 GETCHARINCTEST(c, eptr);
1773 switch(c)
1774 {
1775 default: RRETURN(MATCH_NOMATCH);
1776 case 0x0a: /* LF */
1777 case 0x0b: /* VT */
1778 case 0x0c: /* FF */
1779 case 0x0d: /* CR */
1780 case 0x85: /* NEL */
1781 case 0x2028: /* LINE SEPARATOR */
1782 case 0x2029: /* PARAGRAPH SEPARATOR */
1783 break;
1784 }
1785 ecode++;
1786 break;
1787
1788 #ifdef SUPPORT_UCP
1789 /* Check the next character by Unicode property. We will get here only
1790 if the support is in the binary; otherwise a compile-time error occurs. */
1791
1792 case OP_PROP:
1793 case OP_NOTPROP:
1794 if (eptr >= md->end_subject)
1795 {
1796 SCHECK_PARTIAL();
1797 RRETURN(MATCH_NOMATCH);
1798 }
1799 GETCHARINCTEST(c, eptr);
1800 {
1801 const ucd_record *prop = GET_UCD(c);
1802
1803 switch(ecode[1])
1804 {
1805 case PT_ANY:
1806 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1807 break;
1808
1809 case PT_LAMP:
1810 if ((prop->chartype == ucp_Lu ||
1811 prop->chartype == ucp_Ll ||
1812 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1813 RRETURN(MATCH_NOMATCH);
1814 break;
1815
1816 case PT_GC:
1817 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1818 RRETURN(MATCH_NOMATCH);
1819 break;
1820
1821 case PT_PC:
1822 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1823 RRETURN(MATCH_NOMATCH);
1824 break;
1825
1826 case PT_SC:
1827 if ((ecode[2] != prop->script) == (op == OP_PROP))
1828 RRETURN(MATCH_NOMATCH);
1829 break;
1830
1831 default:
1832 RRETURN(PCRE_ERROR_INTERNAL);
1833 }
1834
1835 ecode += 3;
1836 }
1837 break;
1838
1839 /* Match an extended Unicode sequence. We will get here only if the support
1840 is in the binary; otherwise a compile-time error occurs. */
1841
1842 case OP_EXTUNI:
1843 if (eptr >= md->end_subject)
1844 {
1845 SCHECK_PARTIAL();
1846 RRETURN(MATCH_NOMATCH);
1847 }
1848 GETCHARINCTEST(c, eptr);
1849 {
1850 int category = UCD_CATEGORY(c);
1851 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1852 while (eptr < md->end_subject)
1853 {
1854 int len = 1;
1855 if (!utf8) c = *eptr; else
1856 {
1857 GETCHARLEN(c, eptr, len);
1858 }
1859 category = UCD_CATEGORY(c);
1860 if (category != ucp_M) break;
1861 eptr += len;
1862 }
1863 }
1864 ecode++;
1865 break;
1866 #endif
1867
1868
1869 /* Match a back reference, possibly repeatedly. Look past the end of the
1870 item to see if there is repeat information following. The code is similar
1871 to that for character classes, but repeated for efficiency. Then obey
1872 similar code to character type repeats - written out again for speed.
1873 However, if the referenced string is the empty string, always treat
1874 it as matched, any number of times (otherwise there could be infinite
1875 loops). */
1876
1877 case OP_REF:
1878 {
1879 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1880 ecode += 3;
1881
1882 /* If the reference is unset, there are two possibilities:
1883
1884 (a) In the default, Perl-compatible state, set the length to be longer
1885 than the amount of subject left; this ensures that every attempt at a
1886 match fails. We can't just fail here, because of the possibility of
1887 quantifiers with zero minima.
1888
1889 (b) If the JavaScript compatibility flag is set, set the length to zero
1890 so that the back reference matches an empty string.
1891
1892 Otherwise, set the length to the length of what was matched by the
1893 referenced subpattern. */
1894
1895 if (offset >= offset_top || md->offset_vector[offset] < 0)
1896 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1897 else
1898 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1899
1900 /* Set up for repetition, or handle the non-repeated case */
1901
1902 switch (*ecode)
1903 {
1904 case OP_CRSTAR:
1905 case OP_CRMINSTAR:
1906 case OP_CRPLUS:
1907 case OP_CRMINPLUS:
1908 case OP_CRQUERY:
1909 case OP_CRMINQUERY:
1910 c = *ecode++ - OP_CRSTAR;
1911 minimize = (c & 1) != 0;
1912 min = rep_min[c]; /* Pick up values from tables; */
1913 max = rep_max[c]; /* zero for max => infinity */
1914 if (max == 0) max = INT_MAX;
1915 break;
1916
1917 case OP_CRRANGE:
1918 case OP_CRMINRANGE:
1919 minimize = (*ecode == OP_CRMINRANGE);
1920 min = GET2(ecode, 1);
1921 max = GET2(ecode, 3);
1922 if (max == 0) max = INT_MAX;
1923 ecode += 5;
1924 break;
1925
1926 default: /* No repeat follows */
1927 if (!match_ref(offset, eptr, length, md, ims))
1928 {
1929 CHECK_PARTIAL();
1930 RRETURN(MATCH_NOMATCH);
1931 }
1932 eptr += length;
1933 continue; /* With the main loop */
1934 }
1935
1936 /* If the length of the reference is zero, just continue with the
1937 main loop. */
1938
1939 if (length == 0) continue;
1940
1941 /* First, ensure the minimum number of matches are present. We get back
1942 the length of the reference string explicitly rather than passing the
1943 address of eptr, so that eptr can be a register variable. */
1944
1945 for (i = 1; i <= min; i++)
1946 {
1947 if (!match_ref(offset, eptr, length, md, ims))
1948 {
1949 CHECK_PARTIAL();
1950 RRETURN(MATCH_NOMATCH);
1951 }
1952 eptr += length;
1953 }
1954
1955 /* If min = max, continue at the same level without recursion.
1956 They are not both allowed to be zero. */
1957
1958 if (min == max) continue;
1959
1960 /* If minimizing, keep trying and advancing the pointer */
1961
1962 if (minimize)
1963 {
1964 for (fi = min;; fi++)
1965 {
1966 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1967 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1968 if (fi >= max) RRETURN(MATCH_NOMATCH);
1969 if (!match_ref(offset, eptr, length, md, ims))
1970 {
1971 CHECK_PARTIAL();
1972 RRETURN(MATCH_NOMATCH);
1973 }
1974 eptr += length;
1975 }
1976 /* Control never gets here */
1977 }
1978
1979 /* If maximizing, find the longest string and work backwards */
1980
1981 else
1982 {
1983 pp = eptr;
1984 for (i = min; i < max; i++)
1985 {
1986 if (!match_ref(offset, eptr, length, md, ims)) break;
1987 eptr += length;
1988 }
1989 while (eptr >= pp)
1990 {
1991 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1992 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1993 eptr -= length;
1994 }
1995 RRETURN(MATCH_NOMATCH);
1996 }
1997 }
1998 /* Control never gets here */
1999
2000 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2001 used when all the characters in the class have values in the range 0-255,
2002 and either the matching is caseful, or the characters are in the range
2003 0-127 when UTF-8 processing is enabled. The only difference between
2004 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2005 encountered.
2006
2007 First, look past the end of the item to see if there is repeat information
2008 following. Then obey similar code to character type repeats - written out
2009 again for speed. */
2010
2011 case OP_NCLASS:
2012 case OP_CLASS:
2013 {
2014 data = ecode + 1; /* Save for matching */
2015 ecode += 33; /* Advance past the item */
2016
2017 switch (*ecode)
2018 {
2019 case OP_CRSTAR:
2020 case OP_CRMINSTAR:
2021 case OP_CRPLUS:
2022 case OP_CRMINPLUS:
2023 case OP_CRQUERY:
2024 case OP_CRMINQUERY:
2025 c = *ecode++ - OP_CRSTAR;
2026 minimize = (c & 1) != 0;
2027 min = rep_min[c]; /* Pick up values from tables; */
2028 max = rep_max[c]; /* zero for max => infinity */
2029 if (max == 0) max = INT_MAX;
2030 break;
2031
2032 case OP_CRRANGE:
2033 case OP_CRMINRANGE:
2034 minimize = (*ecode == OP_CRMINRANGE);
2035 min = GET2(ecode, 1);
2036 max = GET2(ecode, 3);
2037 if (max == 0) max = INT_MAX;
2038 ecode += 5;
2039 break;
2040
2041 default: /* No repeat follows */
2042 min = max = 1;
2043 break;
2044 }
2045
2046 /* First, ensure the minimum number of matches are present. */
2047
2048 #ifdef SUPPORT_UTF8
2049 /* UTF-8 mode */
2050 if (utf8)
2051 {
2052 for (i = 1; i <= min; i++)
2053 {
2054 if (eptr >= md->end_subject)
2055 {
2056 SCHECK_PARTIAL();
2057 RRETURN(MATCH_NOMATCH);
2058 }
2059 GETCHARINC(c, eptr);
2060 if (c > 255)
2061 {
2062 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2063 }
2064 else
2065 {
2066 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2067 }
2068 }
2069 }
2070 else
2071 #endif
2072 /* Not UTF-8 mode */
2073 {
2074 for (i = 1; i <= min; i++)
2075 {
2076 if (eptr >= md->end_subject)
2077 {
2078 SCHECK_PARTIAL();
2079 RRETURN(MATCH_NOMATCH);
2080 }
2081 c = *eptr++;
2082 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2083 }
2084 }
2085
2086 /* If max == min we can continue with the main loop without the
2087 need to recurse. */
2088
2089 if (min == max) continue;
2090
2091 /* If minimizing, keep testing the rest of the expression and advancing
2092 the pointer while it matches the class. */
2093
2094 if (minimize)
2095 {
2096 #ifdef SUPPORT_UTF8
2097 /* UTF-8 mode */
2098 if (utf8)
2099 {
2100 for (fi = min;; fi++)
2101 {
2102 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2103 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2104 if (fi >= max) RRETURN(MATCH_NOMATCH);
2105 if (eptr >= md->end_subject)
2106 {
2107 SCHECK_PARTIAL();
2108 RRETURN(MATCH_NOMATCH);
2109 }
2110 GETCHARINC(c, eptr);
2111 if (c > 255)
2112 {
2113 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2114 }
2115 else
2116 {
2117 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2118 }
2119 }
2120 }
2121 else
2122 #endif
2123 /* Not UTF-8 mode */
2124 {
2125 for (fi = min;; fi++)
2126 {
2127 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2128 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2129 if (fi >= max) RRETURN(MATCH_NOMATCH);
2130 if (eptr >= md->end_subject)
2131 {
2132 SCHECK_PARTIAL();
2133 RRETURN(MATCH_NOMATCH);
2134 }
2135 c = *eptr++;
2136 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2137 }
2138 }
2139 /* Control never gets here */
2140 }
2141
2142 /* If maximizing, find the longest possible run, then work backwards. */
2143
2144 else
2145 {
2146 pp = eptr;
2147
2148 #ifdef SUPPORT_UTF8
2149 /* UTF-8 mode */
2150 if (utf8)
2151 {
2152 for (i = min; i < max; i++)
2153 {
2154 int len = 1;
2155 if (eptr >= md->end_subject) break;
2156 GETCHARLEN(c, eptr, len);
2157 if (c > 255)
2158 {
2159 if (op == OP_CLASS) break;
2160 }
2161 else
2162 {
2163 if ((data[c/8] & (1 << (c&7))) == 0) break;
2164 }
2165 eptr += len;
2166 }
2167 for (;;)
2168 {
2169 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2170 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2171 if (eptr-- == pp) break; /* Stop if tried at original pos */
2172 BACKCHAR(eptr);
2173 }
2174 }
2175 else
2176 #endif
2177 /* Not UTF-8 mode */
2178 {
2179 for (i = min; i < max; i++)
2180 {
2181 if (eptr >= md->end_subject) break;
2182 c = *eptr;
2183 if ((data[c/8] & (1 << (c&7))) == 0) break;
2184 eptr++;
2185 }
2186 while (eptr >= pp)
2187 {
2188 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2189 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2190 eptr--;
2191 }
2192 }
2193
2194 RRETURN(MATCH_NOMATCH);
2195 }
2196 }
2197 /* Control never gets here */
2198
2199
2200 /* Match an extended character class. This opcode is encountered only
2201 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2202 mode, because Unicode properties are supported in non-UTF-8 mode. */
2203
2204 #ifdef SUPPORT_UTF8
2205 case OP_XCLASS:
2206 {
2207 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2208 ecode += GET(ecode, 1); /* Advance past the item */
2209
2210 switch (*ecode)
2211 {
2212 case OP_CRSTAR:
2213 case OP_CRMINSTAR:
2214 case OP_CRPLUS:
2215 case OP_CRMINPLUS:
2216 case OP_CRQUERY:
2217 case OP_CRMINQUERY:
2218 c = *ecode++ - OP_CRSTAR;
2219 minimize = (c & 1) != 0;
2220 min = rep_min[c]; /* Pick up values from tables; */
2221 max = rep_max[c]; /* zero for max => infinity */
2222 if (max == 0) max = INT_MAX;
2223 break;
2224
2225 case OP_CRRANGE:
2226 case OP_CRMINRANGE:
2227 minimize = (*ecode == OP_CRMINRANGE);
2228 min = GET2(ecode, 1);
2229 max = GET2(ecode, 3);
2230 if (max == 0) max = INT_MAX;
2231 ecode += 5;
2232 break;
2233
2234 default: /* No repeat follows */
2235 min = max = 1;
2236 break;
2237 }
2238
2239 /* First, ensure the minimum number of matches are present. */
2240
2241 for (i = 1; i <= min; i++)
2242 {
2243 if (eptr >= md->end_subject)
2244 {
2245 SCHECK_PARTIAL();
2246 RRETURN(MATCH_NOMATCH);
2247 }
2248 GETCHARINCTEST(c, eptr);
2249 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2250 }
2251
2252 /* If max == min we can continue with the main loop without the
2253 need to recurse. */
2254
2255 if (min == max) continue;
2256
2257 /* If minimizing, keep testing the rest of the expression and advancing
2258 the pointer while it matches the class. */
2259
2260 if (minimize)
2261 {
2262 for (fi = min;; fi++)
2263 {
2264 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2265 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2266 if (fi >= max) RRETURN(MATCH_NOMATCH);
2267 if (eptr >= md->end_subject)
2268 {
2269 SCHECK_PARTIAL();
2270 RRETURN(MATCH_NOMATCH);
2271 }
2272 GETCHARINCTEST(c, eptr);
2273 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2274 }
2275 /* Control never gets here */
2276 }
2277
2278 /* If maximizing, find the longest possible run, then work backwards. */
2279
2280 else
2281 {
2282 pp = eptr;
2283 for (i = min; i < max; i++)
2284 {
2285 int len = 1;
2286 if (eptr >= md->end_subject) break;
2287 GETCHARLENTEST(c, eptr, len);
2288 if (!_pcre_xclass(c, data)) break;
2289 eptr += len;
2290 }
2291 for(;;)
2292 {
2293 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2294 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2295 if (eptr-- == pp) break; /* Stop if tried at original pos */
2296 if (utf8) BACKCHAR(eptr);
2297 }
2298 RRETURN(MATCH_NOMATCH);
2299 }
2300
2301 /* Control never gets here */
2302 }
2303 #endif /* End of XCLASS */
2304
2305 /* Match a single character, casefully */
2306
2307 case OP_CHAR:
2308 #ifdef SUPPORT_UTF8
2309 if (utf8)
2310 {
2311 length = 1;
2312 ecode++;
2313 GETCHARLEN(fc, ecode, length);
2314 if (length > md->end_subject - eptr)
2315 {
2316 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2317 RRETURN(MATCH_NOMATCH);
2318 }
2319 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2320 }
2321 else
2322 #endif
2323
2324 /* Non-UTF-8 mode */
2325 {
2326 if (md->end_subject - eptr < 1)
2327 {
2328 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2329 RRETURN(MATCH_NOMATCH);
2330 }
2331 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2332 ecode += 2;
2333 }
2334 break;
2335
2336 /* Match a single character, caselessly */
2337
2338 case OP_CHARNC:
2339 #ifdef SUPPORT_UTF8
2340 if (utf8)
2341 {
2342 length = 1;
2343 ecode++;
2344 GETCHARLEN(fc, ecode, length);
2345
2346 if (length > md->end_subject - eptr)
2347 {
2348 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2349 RRETURN(MATCH_NOMATCH);
2350 }
2351
2352 /* If the pattern character's value is < 128, we have only one byte, and
2353 can use the fast lookup table. */
2354
2355 if (fc < 128)
2356 {
2357 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2358 }
2359
2360 /* Otherwise we must pick up the subject character */
2361
2362 else
2363 {
2364 unsigned int dc;
2365 GETCHARINC(dc, eptr);
2366 ecode += length;
2367
2368 /* If we have Unicode property support, we can use it to test the other
2369 case of the character, if there is one. */
2370
2371 if (fc != dc)
2372 {
2373 #ifdef SUPPORT_UCP
2374 if (dc != UCD_OTHERCASE(fc))
2375 #endif
2376 RRETURN(MATCH_NOMATCH);
2377 }
2378 }
2379 }
2380 else
2381 #endif /* SUPPORT_UTF8 */
2382
2383 /* Non-UTF-8 mode */
2384 {
2385 if (md->end_subject - eptr < 1)
2386 {
2387 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2388 RRETURN(MATCH_NOMATCH);
2389 }
2390 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2391 ecode += 2;
2392 }
2393 break;
2394
2395 /* Match a single character repeatedly. */
2396
2397 case OP_EXACT:
2398 min = max = GET2(ecode, 1);
2399 ecode += 3;
2400 goto REPEATCHAR;
2401
2402 case OP_POSUPTO:
2403 possessive = TRUE;
2404 /* Fall through */
2405
2406 case OP_UPTO:
2407 case OP_MINUPTO:
2408 min = 0;
2409 max = GET2(ecode, 1);
2410 minimize = *ecode == OP_MINUPTO;
2411 ecode += 3;
2412 goto REPEATCHAR;
2413
2414 case OP_POSSTAR:
2415 possessive = TRUE;
2416 min = 0;
2417 max = INT_MAX;
2418 ecode++;
2419 goto REPEATCHAR;
2420
2421 case OP_POSPLUS:
2422 possessive = TRUE;
2423 min = 1;
2424 max = INT_MAX;
2425 ecode++;
2426 goto REPEATCHAR;
2427
2428 case OP_POSQUERY:
2429 possessive = TRUE;
2430 min = 0;
2431 max = 1;
2432 ecode++;
2433 goto REPEATCHAR;
2434
2435 case OP_STAR:
2436 case OP_MINSTAR:
2437 case OP_PLUS:
2438 case OP_MINPLUS:
2439 case OP_QUERY:
2440 case OP_MINQUERY:
2441 c = *ecode++ - OP_STAR;
2442 minimize = (c & 1) != 0;
2443
2444 min = rep_min[c]; /* Pick up values from tables; */
2445 max = rep_max[c]; /* zero for max => infinity */
2446 if (max == 0) max = INT_MAX;
2447
2448 /* Common code for all repeated single-character matches. */
2449
2450 REPEATCHAR:
2451 #ifdef SUPPORT_UTF8
2452 if (utf8)
2453 {
2454 length = 1;
2455 charptr = ecode;
2456 GETCHARLEN(fc, ecode, length);
2457 ecode += length;
2458
2459 /* Handle multibyte character matching specially here. There is
2460 support for caseless matching if UCP support is present. */
2461
2462 if (length > 1)
2463 {
2464 #ifdef SUPPORT_UCP
2465 unsigned int othercase;
2466 if ((ims & PCRE_CASELESS) != 0 &&
2467 (othercase = UCD_OTHERCASE(fc)) != fc)
2468 oclength = _pcre_ord2utf8(othercase, occhars);
2469 else oclength = 0;
2470 #endif /* SUPPORT_UCP */
2471
2472 for (i = 1; i <= min; i++)
2473 {
2474 if (eptr <= md->end_subject - length &&
2475 memcmp(eptr, charptr, length) == 0) eptr += length;
2476 #ifdef SUPPORT_UCP
2477 else if (oclength > 0 &&
2478 eptr <= md->end_subject - oclength &&
2479 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2480 #endif /* SUPPORT_UCP */
2481 else
2482 {
2483 CHECK_PARTIAL();
2484 RRETURN(MATCH_NOMATCH);
2485 }
2486 }
2487
2488 if (min == max) continue;
2489
2490 if (minimize)
2491 {
2492 for (fi = min;; fi++)
2493 {
2494 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2495 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2496 if (fi >= max) RRETURN(MATCH_NOMATCH);
2497 if (eptr <= md->end_subject - length &&
2498 memcmp(eptr, charptr, length) == 0) eptr += length;
2499 #ifdef SUPPORT_UCP
2500 else if (oclength > 0 &&
2501 eptr <= md->end_subject - oclength &&
2502 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2503 #endif /* SUPPORT_UCP */
2504 else
2505 {
2506 CHECK_PARTIAL();
2507 RRETURN(MATCH_NOMATCH);
2508 }
2509 }
2510 /* Control never gets here */
2511 }
2512
2513 else /* Maximize */
2514 {
2515 pp = eptr;
2516 for (i = min; i < max; i++)
2517 {
2518 if (eptr <= md->end_subject - length &&
2519 memcmp(eptr, charptr, length) == 0) eptr += length;
2520 #ifdef SUPPORT_UCP
2521 else if (oclength > 0 &&
2522 eptr <= md->end_subject - oclength &&
2523 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2524 #endif /* SUPPORT_UCP */
2525 else break;
2526 }
2527
2528 if (possessive) continue;
2529
2530 for(;;)
2531 {
2532 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2533 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2534 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
2535 #ifdef SUPPORT_UCP
2536 eptr--;
2537 BACKCHAR(eptr);
2538 #else /* without SUPPORT_UCP */
2539 eptr -= length;
2540 #endif /* SUPPORT_UCP */
2541 }
2542 }
2543 /* Control never gets here */
2544 }
2545
2546 /* If the length of a UTF-8 character is 1, we fall through here, and
2547 obey the code as for non-UTF-8 characters below, though in this case the
2548 value of fc will always be < 128. */
2549 }
2550 else
2551 #endif /* SUPPORT_UTF8 */
2552
2553 /* When not in UTF-8 mode, load a single-byte character. */
2554
2555 fc = *ecode++;
2556
2557 /* The value of fc at this point is always less than 256, though we may or
2558 may not be in UTF-8 mode. The code is duplicated for the caseless and
2559 caseful cases, for speed, since matching characters is likely to be quite
2560 common. First, ensure the minimum number of matches are present. If min =
2561 max, continue at the same level without recursing. Otherwise, if
2562 minimizing, keep trying the rest of the expression and advancing one
2563 matching character if failing, up to the maximum. Alternatively, if
2564 maximizing, find the maximum number of characters and work backwards. */
2565
2566 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2567 max, eptr));
2568
2569 if ((ims & PCRE_CASELESS) != 0)
2570 {
2571 fc = md->lcc[fc];
2572 for (i = 1; i <= min; i++)
2573 {
2574 if (eptr >= md->end_subject)
2575 {
2576 SCHECK_PARTIAL();
2577 RRETURN(MATCH_NOMATCH);
2578 }
2579 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2580 }
2581 if (min == max) continue;
2582 if (minimize)
2583 {
2584 for (fi = min;; fi++)
2585 {
2586 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2587 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2588 if (fi >= max) RRETURN(MATCH_NOMATCH);
2589 if (eptr >= md->end_subject)
2590 {
2591 SCHECK_PARTIAL();
2592 RRETURN(MATCH_NOMATCH);
2593 }
2594 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2595 }
2596 /* Control never gets here */
2597 }
2598 else /* Maximize */
2599 {
2600 pp = eptr;
2601 for (i = min; i < max; i++)
2602 {
2603 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2604 eptr++;
2605 }
2606
2607 if (possessive) continue;
2608
2609 while (eptr >= pp)
2610 {
2611 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2612 eptr--;
2613 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2614 }
2615 RRETURN(MATCH_NOMATCH);
2616 }
2617 /* Control never gets here */
2618 }
2619
2620 /* Caseful comparisons (includes all multi-byte characters) */
2621
2622 else
2623 {
2624 for (i = 1; i <= min; i++)
2625 {
2626 if (eptr >= md->end_subject)
2627 {
2628 SCHECK_PARTIAL();
2629 RRETURN(MATCH_NOMATCH);
2630 }
2631 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2632 }
2633
2634 if (min == max) continue;
2635
2636 if (minimize)
2637 {
2638 for (fi = min;; fi++)
2639 {
2640 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2641 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2642 if (fi >= max) RRETURN(MATCH_NOMATCH);
2643 if (eptr >= md->end_subject)
2644 {
2645 SCHECK_PARTIAL();
2646 RRETURN(MATCH_NOMATCH);
2647 }
2648 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2649 }
2650 /* Control never gets here */
2651 }
2652 else /* Maximize */
2653 {
2654 pp = eptr;
2655 for (i = min; i < max; i++)
2656 {
2657 if (eptr >= md->end_subject || fc != *eptr) break;
2658 eptr++;
2659 }
2660 if (possessive) continue;
2661
2662 while (eptr >= pp)
2663 {
2664 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2665 eptr--;
2666 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2667 }
2668 RRETURN(MATCH_NOMATCH);
2669 }
2670 }
2671 /* Control never gets here */
2672
2673 /* Match a negated single one-byte pattern character. The subject character
2674 we are checking can be multibyte. */
2675
2676 case OP_NOT:
2677 if (eptr >= md->end_subject)
2678 {
2679 SCHECK_PARTIAL();
2680 RRETURN(MATCH_NOMATCH);
2681 }
2682 ecode++;
2683 GETCHARINCTEST(c, eptr);
2684 if ((ims & PCRE_CASELESS) != 0)
2685 {
2686 #ifdef SUPPORT_UTF8
2687 if (c < 256)
2688 #endif
2689 c = md->lcc[c];
2690 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2691 }
2692 else
2693 {
2694 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2695 }
2696 break;
2697
2698 /* Match a negated single one-byte character repeatedly. This is almost a
2699 repeat of the code for a repeated single character, but I haven't found a
2700 nice way of commoning these up that doesn't require a test of the
2701 positive/negative option for each character match. Maybe that wouldn't add
2702 very much to the time taken, but character matching *is* what this is all
2703 about... */
2704
2705 case OP_NOTEXACT:
2706 min = max = GET2(ecode, 1);
2707 ecode += 3;
2708 goto REPEATNOTCHAR;
2709
2710 case OP_NOTUPTO:
2711 case OP_NOTMINUPTO:
2712 min = 0;
2713 max = GET2(ecode, 1);
2714 minimize = *ecode == OP_NOTMINUPTO;
2715 ecode += 3;
2716 goto REPEATNOTCHAR;
2717
2718 case OP_NOTPOSSTAR:
2719 possessive = TRUE;
2720 min = 0;
2721 max = INT_MAX;
2722 ecode++;
2723 goto REPEATNOTCHAR;
2724
2725 case OP_NOTPOSPLUS:
2726 possessive = TRUE;
2727 min = 1;
2728 max = INT_MAX;
2729 ecode++;
2730 goto REPEATNOTCHAR;
2731
2732 case OP_NOTPOSQUERY:
2733 possessive = TRUE;
2734 min = 0;
2735 max = 1;
2736 ecode++;
2737 goto REPEATNOTCHAR;
2738
2739 case OP_NOTPOSUPTO:
2740 possessive = TRUE;
2741 min = 0;
2742 max = GET2(ecode, 1);
2743 ecode += 3;
2744 goto REPEATNOTCHAR;
2745
2746 case OP_NOTSTAR:
2747 case OP_NOTMINSTAR:
2748 case OP_NOTPLUS:
2749 case OP_NOTMINPLUS:
2750 case OP_NOTQUERY:
2751 case OP_NOTMINQUERY:
2752 c = *ecode++ - OP_NOTSTAR;
2753 minimize = (c & 1) != 0;
2754 min = rep_min[c]; /* Pick up values from tables; */
2755 max = rep_max[c]; /* zero for max => infinity */
2756 if (max == 0) max = INT_MAX;
2757
2758 /* Common code for all repeated single-byte matches. */
2759
2760 REPEATNOTCHAR:
2761 fc = *ecode++;
2762
2763 /* The code is duplicated for the caseless and caseful cases, for speed,
2764 since matching characters is likely to be quite common. First, ensure the
2765 minimum number of matches are present. If min = max, continue at the same
2766 level without recursing. Otherwise, if minimizing, keep trying the rest of
2767 the expression and advancing one matching character if failing, up to the
2768 maximum. Alternatively, if maximizing, find the maximum number of
2769 characters and work backwards. */
2770
2771 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2772 max, eptr));
2773
2774 if ((ims & PCRE_CASELESS) != 0)
2775 {
2776 fc = md->lcc[fc];
2777
2778 #ifdef SUPPORT_UTF8
2779 /* UTF-8 mode */
2780 if (utf8)
2781 {
2782 register unsigned int d;
2783 for (i = 1; i <= min; i++)
2784 {
2785 if (eptr >= md->end_subject)
2786 {
2787 SCHECK_PARTIAL();
2788 RRETURN(MATCH_NOMATCH);
2789 }
2790 GETCHARINC(d, eptr);
2791 if (d < 256) d = md->lcc[d];
2792 if (fc == d) RRETURN(MATCH_NOMATCH);
2793 }
2794 }
2795 else
2796 #endif
2797
2798 /* Not UTF-8 mode */
2799 {
2800 for (i = 1; i <= min; i++)
2801 {
2802 if (eptr >= md->end_subject)
2803 {
2804 SCHECK_PARTIAL();
2805 RRETURN(MATCH_NOMATCH);
2806 }
2807 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2808 }
2809 }
2810
2811 if (min == max) continue;
2812
2813 if (minimize)
2814 {
2815 #ifdef SUPPORT_UTF8
2816 /* UTF-8 mode */
2817 if (utf8)
2818 {
2819 register unsigned int d;
2820 for (fi = min;; fi++)
2821 {
2822 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2823 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2824 if (fi >= max) RRETURN(MATCH_NOMATCH);
2825 if (eptr >= md->end_subject)
2826 {
2827 SCHECK_PARTIAL();
2828 RRETURN(MATCH_NOMATCH);
2829 }
2830 GETCHARINC(d, eptr);
2831 if (d < 256) d = md->lcc[d];
2832 if (fc == d) RRETURN(MATCH_NOMATCH);
2833 }
2834 }
2835 else
2836 #endif
2837 /* Not UTF-8 mode */
2838 {
2839 for (fi = min;; fi++)
2840 {
2841 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2842 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2843 if (fi >= max) RRETURN(MATCH_NOMATCH);
2844 if (eptr >= md->end_subject)
2845 {
2846 SCHECK_PARTIAL();
2847 RRETURN(MATCH_NOMATCH);
2848 }
2849 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2850 }
2851 }
2852 /* Control never gets here */
2853 }
2854
2855 /* Maximize case */
2856
2857 else
2858 {
2859 pp = eptr;
2860
2861 #ifdef SUPPORT_UTF8
2862 /* UTF-8 mode */
2863 if (utf8)
2864 {
2865 register unsigned int d;
2866 for (i = min; i < max; i++)
2867 {
2868 int len = 1;
2869 if (eptr >= md->end_subject) break;
2870 GETCHARLEN(d, eptr, len);
2871 if (d < 256) d = md->lcc[d];
2872 if (fc == d) break;
2873 eptr += len;
2874 }
2875 if (possessive) continue;
2876 for(;;)
2877 {
2878 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2879 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2880 if (eptr-- == pp) break; /* Stop if tried at original pos */
2881 BACKCHAR(eptr);
2882 }
2883 }
2884 else
2885 #endif
2886 /* Not UTF-8 mode */
2887 {
2888 for (i = min; i < max; i++)
2889 {
2890 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2891 eptr++;
2892 }
2893 if (possessive) continue;
2894 while (eptr >= pp)
2895 {
2896 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2897 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2898 eptr--;
2899 }
2900 }
2901
2902 RRETURN(MATCH_NOMATCH);
2903 }
2904 /* Control never gets here */
2905 }
2906
2907 /* Caseful comparisons */
2908
2909 else
2910 {
2911 #ifdef SUPPORT_UTF8
2912 /* UTF-8 mode */
2913 if (utf8)
2914 {
2915 register unsigned int d;
2916 for (i = 1; i <= min; i++)
2917 {
2918 if (eptr >= md->end_subject)
2919 {
2920 SCHECK_PARTIAL();
2921 RRETURN(MATCH_NOMATCH);
2922 }
2923 GETCHARINC(d, eptr);
2924 if (fc == d) RRETURN(MATCH_NOMATCH);
2925 }
2926 }
2927 else
2928 #endif
2929 /* Not UTF-8 mode */
2930 {
2931 for (i = 1; i <= min; i++)
2932 {
2933 if (eptr >= md->end_subject)
2934 {
2935 SCHECK_PARTIAL();
2936 RRETURN(MATCH_NOMATCH);
2937 }
2938 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2939 }
2940 }
2941
2942 if (min == max) continue;
2943
2944 if (minimize)
2945 {
2946 #ifdef SUPPORT_UTF8
2947 /* UTF-8 mode */
2948 if (utf8)
2949 {
2950 register unsigned int d;
2951 for (fi = min;; fi++)
2952 {
2953 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2954 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2955 if (fi >= max) RRETURN(MATCH_NOMATCH);
2956 if (eptr >= md->end_subject)
2957 {
2958 SCHECK_PARTIAL();
2959 RRETURN(MATCH_NOMATCH);
2960 }
2961 GETCHARINC(d, eptr);
2962 if (fc == d) RRETURN(MATCH_NOMATCH);
2963 }
2964 }
2965 else
2966 #endif
2967 /* Not UTF-8 mode */
2968 {
2969 for (fi = min;; fi++)
2970 {
2971 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2972 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2973 if (fi >= max) RRETURN(MATCH_NOMATCH);
2974 if (eptr >= md->end_subject)
2975 {
2976 SCHECK_PARTIAL();
2977 RRETURN(MATCH_NOMATCH);
2978 }
2979 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2980 }
2981 }
2982 /* Control never gets here */
2983 }
2984
2985 /* Maximize case */
2986
2987 else
2988 {
2989 pp = eptr;
2990
2991 #ifdef SUPPORT_UTF8
2992 /* UTF-8 mode */
2993 if (utf8)
2994 {
2995 register unsigned int d;
2996 for (i = min; i < max; i++)
2997 {
2998 int len = 1;
2999 if (eptr >= md->end_subject) break;
3000 GETCHARLEN(d, eptr, len);
3001 if (fc == d) break;
3002 eptr += len;
3003 }
3004 if (possessive) continue;
3005 for(;;)
3006 {
3007 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3008 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3009 if (eptr-- == pp) break; /* Stop if tried at original pos */
3010 BACKCHAR(eptr);
3011 }
3012 }
3013 else
3014 #endif
3015 /* Not UTF-8 mode */
3016 {
3017 for (i = min; i < max; i++)
3018 {
3019 if (eptr >= md->end_subject || fc == *eptr) break;
3020 eptr++;
3021 }
3022 if (possessive) continue;
3023 while (eptr >= pp)
3024 {
3025 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3026 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3027 eptr--;
3028 }
3029 }
3030
3031 RRETURN(MATCH_NOMATCH);
3032 }
3033 }
3034 /* Control never gets here */
3035
3036 /* Match a single character type repeatedly; several different opcodes
3037 share code. This is very similar to the code for single characters, but we
3038 repeat it in the interests of efficiency. */
3039
3040 case OP_TYPEEXACT:
3041 min = max = GET2(ecode, 1);
3042 minimize = TRUE;
3043 ecode += 3;
3044 goto REPEATTYPE;
3045
3046 case OP_TYPEUPTO:
3047 case OP_TYPEMINUPTO:
3048 min = 0;
3049 max = GET2(ecode, 1);
3050 minimize = *ecode == OP_TYPEMINUPTO;
3051 ecode += 3;
3052 goto REPEATTYPE;
3053
3054 case OP_TYPEPOSSTAR:
3055 possessive = TRUE;
3056 min = 0;
3057 max = INT_MAX;
3058 ecode++;
3059 goto REPEATTYPE;
3060
3061 case OP_TYPEPOSPLUS:
3062 possessive = TRUE;
3063 min = 1;
3064 max = INT_MAX;
3065 ecode++;
3066 goto REPEATTYPE;
3067
3068 case OP_TYPEPOSQUERY:
3069 possessive = TRUE;
3070 min = 0;
3071 max = 1;
3072 ecode++;
3073 goto REPEATTYPE;
3074
3075 case OP_TYPEPOSUPTO:
3076 possessive = TRUE;
3077 min = 0;
3078 max = GET2(ecode, 1);
3079 ecode += 3;
3080 goto REPEATTYPE;
3081
3082 case OP_TYPESTAR:
3083 case OP_TYPEMINSTAR:
3084 case OP_TYPEPLUS:
3085 case OP_TYPEMINPLUS:
3086 case OP_TYPEQUERY:
3087 case OP_TYPEMINQUERY:
3088 c = *ecode++ - OP_TYPESTAR;
3089 minimize = (c & 1) != 0;
3090 min = rep_min[c]; /* Pick up values from tables; */
3091 max = rep_max[c]; /* zero for max => infinity */
3092 if (max == 0) max = INT_MAX;
3093
3094 /* Common code for all repeated single character type matches. Note that
3095 in UTF-8 mode, '.' matches a character of any length, but for the other
3096 character types, the valid characters are all one-byte long. */
3097
3098 REPEATTYPE:
3099 ctype = *ecode++; /* Code for the character type */
3100
3101 #ifdef SUPPORT_UCP
3102 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3103 {
3104 prop_fail_result = ctype == OP_NOTPROP;
3105 prop_type = *ecode++;
3106 prop_value = *ecode++;
3107 }
3108 else prop_type = -1;
3109 #endif
3110
3111 /* First, ensure the minimum number of matches are present. Use inline
3112 code for maximizing the speed, and do the type test once at the start
3113 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3114 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3115 and single-bytes. */
3116
3117 if (min > 0)
3118 {
3119 #ifdef SUPPORT_UCP
3120 if (prop_type >= 0)
3121 {
3122 switch(prop_type)
3123 {
3124 case PT_ANY:
3125 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3126 for (i = 1; i <= min; i++)
3127 {
3128 if (eptr >= md->end_subject)
3129 {
3130 SCHECK_PARTIAL();
3131 RRETURN(MATCH_NOMATCH);
3132 }
3133 GETCHARINCTEST(c, eptr);
3134 }
3135 break;
3136
3137 case PT_LAMP:
3138 for (i = 1; i <= min; i++)
3139 {
3140 if (eptr >= md->end_subject)
3141 {
3142 SCHECK_PARTIAL();
3143 RRETURN(MATCH_NOMATCH);
3144 }
3145 GETCHARINCTEST(c, eptr);
3146 prop_chartype = UCD_CHARTYPE(c);
3147 if ((prop_chartype == ucp_Lu ||
3148 prop_chartype == ucp_Ll ||
3149 prop_chartype == ucp_Lt) == prop_fail_result)
3150 RRETURN(MATCH_NOMATCH);
3151 }
3152 break;
3153
3154 case PT_GC:
3155 for (i = 1; i <= min; i++)
3156 {
3157 if (eptr >= md->end_subject)
3158 {
3159 SCHECK_PARTIAL();
3160 RRETURN(MATCH_NOMATCH);
3161 }
3162 GETCHARINCTEST(c, eptr);
3163 prop_category = UCD_CATEGORY(c);
3164 if ((prop_category == prop_value) == prop_fail_result)
3165 RRETURN(MATCH_NOMATCH);
3166 }
3167 break;
3168
3169 case PT_PC:
3170 for (i = 1; i <= min; i++)
3171 {
3172 if (eptr >= md->end_subject)
3173 {
3174 SCHECK_PARTIAL();
3175 RRETURN(MATCH_NOMATCH);
3176 }
3177 GETCHARINCTEST(c, eptr);
3178 prop_chartype = UCD_CHARTYPE(c);
3179 if ((prop_chartype == prop_value) == prop_fail_result)
3180 RRETURN(MATCH_NOMATCH);
3181 }
3182 break;
3183
3184 case PT_SC:
3185 for (i = 1; i <= min; i++)
3186 {
3187 if (eptr >= md->end_subject)
3188 {
3189 SCHECK_PARTIAL();
3190 RRETURN(MATCH_NOMATCH);
3191 }
3192 GETCHARINCTEST(c, eptr);
3193 prop_script = UCD_SCRIPT(c);
3194 if ((prop_script == prop_value) == prop_fail_result)
3195 RRETURN(MATCH_NOMATCH);
3196 }
3197 break;
3198
3199 default:
3200 RRETURN(PCRE_ERROR_INTERNAL);
3201 }
3202 }
3203
3204 /* Match extended Unicode sequences. We will get here only if the
3205 support is in the binary; otherwise a compile-time error occurs. */
3206
3207 else if (ctype == OP_EXTUNI)
3208 {
3209 for (i = 1; i <= min; i++)
3210 {
3211 if (eptr >= md->end_subject)
3212 {
3213 SCHECK_PARTIAL();
3214 RRETURN(MATCH_NOMATCH);
3215 }
3216 GETCHARINCTEST(c, eptr);
3217 prop_category = UCD_CATEGORY(c);
3218 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3219 while (eptr < md->end_subject)
3220 {
3221 int len = 1;
3222 if (!utf8) c = *eptr;
3223 else { GETCHARLEN(c, eptr, len); }
3224 prop_category = UCD_CATEGORY(c);
3225 if (prop_category != ucp_M) break;
3226 eptr += len;
3227 }
3228 }
3229 }
3230
3231 else
3232 #endif /* SUPPORT_UCP */
3233
3234 /* Handle all other cases when the coding is UTF-8 */
3235
3236 #ifdef SUPPORT_UTF8
3237 if (utf8) switch(ctype)
3238 {
3239 case OP_ANY:
3240 for (i = 1; i <= min; i++)
3241 {
3242 if (eptr >= md->end_subject)
3243 {
3244 SCHECK_PARTIAL();
3245 RRETURN(MATCH_NOMATCH);
3246 }
3247 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3248 eptr++;
3249 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3250 }
3251 break;
3252
3253 case OP_ALLANY:
3254 for (i = 1; i <= min; i++)
3255 {
3256 if (eptr >= md->end_subject)
3257 {
3258 SCHECK_PARTIAL();
3259 RRETURN(MATCH_NOMATCH);
3260 }
3261 eptr++;
3262 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3263 }
3264 break;
3265
3266 case OP_ANYBYTE:
3267 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3268 eptr += min;
3269 break;
3270
3271 case OP_ANYNL:
3272 for (i = 1; i <= min; i++)
3273 {
3274 if (eptr >= md->end_subject)
3275 {
3276 SCHECK_PARTIAL();
3277 RRETURN(MATCH_NOMATCH);
3278 }
3279 GETCHARINC(c, eptr);
3280 switch(c)
3281 {
3282 default: RRETURN(MATCH_NOMATCH);
3283 case 0x000d:
3284 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3285 break;
3286
3287 case 0x000a:
3288 break;
3289
3290 case 0x000b:
3291 case 0x000c:
3292 case 0x0085:
3293 case 0x2028:
3294 case 0x2029:
3295 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3296 break;
3297 }
3298 }
3299 break;
3300
3301 case OP_NOT_HSPACE:
3302 for (i = 1; i <= min; i++)
3303 {
3304 if (eptr >= md->end_subject)
3305 {
3306 SCHECK_PARTIAL();
3307 RRETURN(MATCH_NOMATCH);
3308 }
3309 GETCHARINC(c, eptr);
3310 switch(c)
3311 {
3312 default: break;
3313 case 0x09: /* HT */
3314 case 0x20: /* SPACE */
3315 case 0xa0: /* NBSP */
3316 case 0x1680: /* OGHAM SPACE MARK */
3317 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3318 case 0x2000: /* EN QUAD */
3319 case 0x2001: /* EM QUAD */
3320 case 0x2002: /* EN SPACE */
3321 case 0x2003: /* EM SPACE */
3322 case 0x2004: /* THREE-PER-EM SPACE */
3323 case 0x2005: /* FOUR-PER-EM SPACE */
3324 case 0x2006: /* SIX-PER-EM SPACE */
3325 case 0x2007: /* FIGURE SPACE */
3326 case 0x2008: /* PUNCTUATION SPACE */
3327 case 0x2009: /* THIN SPACE */
3328 case 0x200A: /* HAIR SPACE */
3329 case 0x202f: /* NARROW NO-BREAK SPACE */
3330 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3331 case 0x3000: /* IDEOGRAPHIC SPACE */
3332 RRETURN(MATCH_NOMATCH);
3333 }
3334 }
3335 break;
3336
3337 case OP_HSPACE:
3338 for (i = 1; i <= min; i++)
3339 {
3340 if (eptr >= md->end_subject)
3341 {
3342 SCHECK_PARTIAL();
3343 RRETURN(MATCH_NOMATCH);
3344 }
3345 GETCHARINC(c, eptr);
3346 switch(c)
3347 {
3348 default: RRETURN(MATCH_NOMATCH);
3349 case 0x09: /* HT */
3350 case 0x20: /* SPACE */
3351 case 0xa0: /* NBSP */
3352 case 0x1680: /* OGHAM SPACE MARK */
3353 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3354 case 0x2000: /* EN QUAD */
3355 case 0x2001: /* EM QUAD */
3356 case 0x2002: /* EN SPACE */
3357 case 0x2003: /* EM SPACE */
3358 case 0x2004: /* THREE-PER-EM SPACE */
3359 case 0x2005: /* FOUR-PER-EM SPACE */
3360 case 0x2006: /* SIX-PER-EM SPACE */
3361 case 0x2007: /* FIGURE SPACE */
3362 case 0x2008: /* PUNCTUATION SPACE */
3363 case 0x2009: /* THIN SPACE */
3364 case 0x200A: /* HAIR SPACE */
3365 case 0x202f: /* NARROW NO-BREAK SPACE */
3366 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3367 case 0x3000: /* IDEOGRAPHIC SPACE */
3368 break;
3369 }
3370 }
3371 break;
3372
3373 case OP_NOT_VSPACE:
3374 for (i = 1; i <= min; i++)
3375 {
3376 if (eptr >= md->end_subject)
3377 {
3378 SCHECK_PARTIAL();
3379 RRETURN(MATCH_NOMATCH);
3380 }
3381 GETCHARINC(c, eptr);
3382 switch(c)
3383 {
3384 default: break;
3385 case 0x0a: /* LF */
3386 case 0x0b: /* VT */
3387 case 0x0c: /* FF */
3388 case 0x0d: /* CR */
3389 case 0x85: /* NEL */
3390 case 0x2028: /* LINE SEPARATOR */
3391 case 0x2029: /* PARAGRAPH SEPARATOR */
3392 RRETURN(MATCH_NOMATCH);
3393 }
3394 }
3395 break;
3396
3397 case OP_VSPACE:
3398 for (i = 1; i <= min; i++)
3399 {
3400 if (eptr >= md->end_subject)
3401 {
3402 SCHECK_PARTIAL();
3403 RRETURN(MATCH_NOMATCH);
3404 }
3405 GETCHARINC(c, eptr);
3406 switch(c)
3407 {
3408 default: RRETURN(MATCH_NOMATCH);
3409 case 0x0a: /* LF */
3410 case 0x0b: /* VT */
3411 case 0x0c: /* FF */
3412 case 0x0d: /* CR */
3413 case 0x85: /* NEL */
3414 case 0x2028: /* LINE SEPARATOR */
3415 case 0x2029: /* PARAGRAPH SEPARATOR */
3416 break;
3417 }
3418 }
3419 break;
3420
3421 case OP_NOT_DIGIT:
3422 for (i = 1; i <= min; i++)
3423 {
3424 if (eptr >= md->end_subject)
3425 {
3426 SCHECK_PARTIAL();
3427 RRETURN(MATCH_NOMATCH);
3428 }
3429 GETCHARINC(c, eptr);
3430 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3431 RRETURN(MATCH_NOMATCH);
3432 }
3433 break;
3434
3435 case OP_DIGIT:
3436 for (i = 1; i <= min; i++)
3437 {
3438 if (eptr >= md->end_subject)
3439 {
3440 SCHECK_PARTIAL();
3441 RRETURN(MATCH_NOMATCH);
3442 }
3443 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3444 RRETURN(MATCH_NOMATCH);
3445 /* No need to skip more bytes - we know it's a 1-byte character */
3446 }
3447 break;
3448
3449 case OP_NOT_WHITESPACE:
3450 for (i = 1; i <= min; i++)
3451 {
3452 if (eptr >= md->end_subject)
3453 {
3454 SCHECK_PARTIAL();
3455 RRETURN(MATCH_NOMATCH);
3456 }
3457 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3458 RRETURN(MATCH_NOMATCH);
3459 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3460 }
3461 break;
3462
3463 case OP_WHITESPACE:
3464 for (i = 1; i <= min; i++)
3465 {
3466 if (eptr >= md->end_subject)
3467 {
3468 SCHECK_PARTIAL();
3469 RRETURN(MATCH_NOMATCH);
3470 }
3471 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3472 RRETURN(MATCH_NOMATCH);
3473 /* No need to skip more bytes - we know it's a 1-byte character */
3474 }
3475 break;
3476
3477 case OP_NOT_WORDCHAR: /* Minimum repeats of a non-word character (UTF-8 mode) */
3478 for (i = 1; i <= min; i++)
3479 {
3480 if (eptr >= md->end_subject) { SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } /* Out of subject: do the partial-match check, as the sibling cases do */
3481 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3482 RRETURN(MATCH_NOMATCH); /* A one-byte word character fails the match */
3483 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); /* Step over the character, skipping bytes of the form 10xxxxxx */
3484 }
3485 break;
3486
3487 case OP_WORDCHAR:
3488 for (i = 1; i <= min; i++)
3489 {
3490 if (eptr >= md->end_subject)
3491 {
3492 SCHECK_PARTIAL();
3493 RRETURN(MATCH_NOMATCH);
3494 }
3495 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3496 RRETURN(MATCH_NOMATCH);
3497 /* No need to skip more bytes - we know it's a 1-byte character */
3498 }
3499 break;
3500
3501 default:
3502 RRETURN(PCRE_ERROR_INTERNAL);
3503 } /* End switch(ctype) */
3504
3505 else
3506 #endif /* SUPPORT_UTF8 */
3507
3508 /* Code for the non-UTF-8 case for minimum matching of operators other
3509 than OP_PROP and OP_NOTPROP. */
3510
3511 switch(ctype)
3512 {
3513 case OP_ANY:
3514 for (i = 1; i <= min; i++)
3515 {
3516 if (eptr >= md->end_subject)
3517 {
3518 SCHECK_PARTIAL();
3519 RRETURN(MATCH_NOMATCH);
3520 }
3521 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3522 eptr++;
3523 }
3524 break;
3525
3526 case OP_ALLANY:
3527 if (eptr > md->end_subject - min)
3528 {
3529 SCHECK_PARTIAL();
3530 RRETURN(MATCH_NOMATCH);
3531 }
3532 eptr += min;
3533 break;
3534
3535 case OP_ANYBYTE:
3536 if (eptr > md->end_subject - min)
3537 {
3538 SCHECK_PARTIAL();
3539 RRETURN(MATCH_NOMATCH);
3540 }
3541 eptr += min;
3542 break;
3543
3544 case OP_ANYNL:
3545 for (i = 1; i <= min; i++)
3546 {
3547 if (eptr >= md->end_subject)
3548 {
3549 SCHECK_PARTIAL();
3550 RRETURN(MATCH_NOMATCH);
3551 }
3552 switch(*eptr++)
3553 {
3554 default: RRETURN(MATCH_NOMATCH);
3555 case 0x000d:
3556 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3557 break;
3558 case 0x000a:
3559 break;
3560
3561 case 0x000b:
3562 case 0x000c:
3563 case 0x0085:
3564 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3565 break;
3566 }
3567 }
3568 break;
3569
3570 case OP_NOT_HSPACE:
3571 for (i = 1; i <= min; i++)
3572 {
3573 if (eptr >= md->end_subject)
3574 {
3575 SCHECK_PARTIAL();
3576 RRETURN(MATCH_NOMATCH);
3577 }
3578 switch(*eptr++)
3579 {
3580 default: break;
3581 case 0x09: /* HT */
3582 case 0x20: /* SPACE */
3583 case 0xa0: /* NBSP */
3584 RRETURN(MATCH_NOMATCH);
3585 }
3586 }
3587 break;
3588
3589 case OP_HSPACE:
3590 for (i = 1; i <= min; i++)
3591 {
3592 if (eptr >= md->end_subject)
3593 {
3594 SCHECK_PARTIAL();
3595 RRETURN(MATCH_NOMATCH);
3596 }
3597 switch(*eptr++)
3598 {
3599 default: RRETURN(MATCH_NOMATCH);
3600 case 0x09: /* HT */
3601 case 0x20: /* SPACE */
3602 case 0xa0: /* NBSP */
3603 break;
3604 }
3605 }
3606 break;
3607
3608 case OP_NOT_VSPACE:
3609 for (i = 1; i <= min; i++)
3610 {
3611 if (eptr >= md->end_subject)
3612 {
3613 SCHECK_PARTIAL();
3614 RRETURN(MATCH_NOMATCH);
3615 }
3616 switch(*eptr++)
3617 {
3618 default: break;
3619 case 0x0a: /* LF */
3620 case 0x0b: /* VT */
3621 case 0x0c: /* FF */
3622 case 0x0d: /* CR */
3623 case 0x85: /* NEL */
3624 RRETURN(MATCH_NOMATCH);
3625 }
3626 }
3627 break;
3628
3629 case OP_VSPACE:
3630 for (i = 1; i <= min; i++)
3631 {
3632 if (eptr >= md->end_subject)
3633 {
3634 SCHECK_PARTIAL();
3635 RRETURN(MATCH_NOMATCH);
3636 }
3637 switch(*eptr++)
3638 {
3639 default: RRETURN(MATCH_NOMATCH);
3640 case 0x0a: /* LF */
3641 case 0x0b: /* VT */
3642 case 0x0c: /* FF */
3643 case 0x0d: /* CR */
3644 case 0x85: /* NEL */
3645 break;
3646 }
3647 }
3648 break;
3649
3650 case OP_NOT_DIGIT:
3651 for (i = 1; i <= min; i++)
3652 {
3653 if (eptr >= md->end_subject)
3654 {
3655 SCHECK_PARTIAL();
3656 RRETURN(MATCH_NOMATCH);
3657 }
3658 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3659 }
3660 break;
3661
3662 case OP_DIGIT:
3663 for (i = 1; i <= min; i++)
3664 {
3665 if (eptr >= md->end_subject)
3666 {
3667 SCHECK_PARTIAL();
3668 RRETURN(MATCH_NOMATCH);
3669 }
3670 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3671 }
3672 break;
3673
3674 case OP_NOT_WHITESPACE:
3675 for (i = 1; i <= min; i++)
3676 {
3677 if (eptr >= md->end_subject)
3678 {
3679 SCHECK_PARTIAL();
3680 RRETURN(MATCH_NOMATCH);
3681 }
3682 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3683 }
3684 break;
3685
3686 case OP_WHITESPACE:
3687 for (i = 1; i <= min; i++)
3688 {
3689 if (eptr >= md->end_subject)
3690 {
3691 SCHECK_PARTIAL();
3692 RRETURN(MATCH_NOMATCH);
3693 }
3694 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3695 }
3696 break;
3697
3698 case OP_NOT_WORDCHAR:
3699 for (i = 1; i <= min; i++)
3700 {
3701 if (eptr >= md->end_subject)
3702 {
3703 SCHECK_PARTIAL();
3704 RRETURN(MATCH_NOMATCH);
3705 }
3706 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3707 RRETURN(MATCH_NOMATCH);
3708 }
3709 break;
3710
3711 case OP_WORDCHAR:
3712 for (i = 1; i <= min; i++)
3713 {
3714 if (eptr >= md->end_subject)
3715 {
3716 SCHECK_PARTIAL();
3717 RRETURN(MATCH_NOMATCH);
3718 }
3719 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3720 RRETURN(MATCH_NOMATCH);
3721 }
3722 break;
3723
3724 default:
3725 RRETURN(PCRE_ERROR_INTERNAL);
3726 }
3727 }
3728
3729 /* If min = max, continue at the same level without recursing */
3730
3731 if (min == max) continue;
3732
3733 /* If minimizing, we have to test the rest of the pattern before each
3734 subsequent match. Again, separate the UTF-8 case for speed, and also
3735 separate the UCP cases. */
3736
3737 if (minimize)
3738 {
3739 #ifdef SUPPORT_UCP
3740 if (prop_type >= 0)
3741 {
3742 switch(prop_type)
3743 {
3744 case PT_ANY:
3745 for (fi = min;; fi++)
3746 {
3747 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3748 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3749 if (fi >= max) RRETURN(MATCH_NOMATCH);
3750 if (eptr >= md->end_subject)
3751 {
3752 SCHECK_PARTIAL();
3753 RRETURN(MATCH_NOMATCH);
3754 }
3755 GETCHARINC(c, eptr);
3756 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3757 }
3758 /* Control never gets here */
3759
3760 case PT_LAMP:
3761 for (fi = min;; fi++)
3762 {
3763 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3764 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3765 if (fi >= max) RRETURN(MATCH_NOMATCH);
3766 if (eptr >= md->end_subject)
3767 {
3768 SCHECK_PARTIAL();
3769 RRETURN(MATCH_NOMATCH);
3770 }
3771 GETCHARINC(c, eptr);
3772 prop_chartype = UCD_CHARTYPE(c);
3773 if ((prop_chartype == ucp_Lu ||
3774 prop_chartype == ucp_Ll ||
3775 prop_chartype == ucp_Lt) == prop_fail_result)
3776 RRETURN(MATCH_NOMATCH);
3777 }
3778 /* Control never gets here */
3779
3780 case PT_GC:
3781 for (fi = min;; fi++)
3782 {
3783 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3784 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3785 if (fi >= max) RRETURN(MATCH_NOMATCH);
3786 if (eptr >= md->end_subject)
3787 {
3788 SCHECK_PARTIAL();
3789 RRETURN(MATCH_NOMATCH);
3790 }
3791 GETCHARINC(c, eptr);
3792 prop_category = UCD_CATEGORY(c);
3793 if ((prop_category == prop_value) == prop_fail_result)
3794 RRETURN(MATCH_NOMATCH);
3795 }
3796 /* Control never gets here */
3797
3798 case PT_PC:
3799 for (fi = min;; fi++)
3800 {
3801 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3802 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3803 if (fi >= max) RRETURN(MATCH_NOMATCH);
3804 if (eptr >= md->end_subject)
3805 {
3806 SCHECK_PARTIAL();
3807 RRETURN(MATCH_NOMATCH);
3808 }
3809 GETCHARINC(c, eptr);
3810 prop_chartype = UCD_CHARTYPE(c);
3811 if ((prop_chartype == prop_value) == prop_fail_result)
3812 RRETURN(MATCH_NOMATCH);
3813 }
3814 /* Control never gets here */
3815
3816 case PT_SC:
3817 for (fi = min;; fi++)
3818 {
3819 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3820 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3821 if (fi >= max) RRETURN(MATCH_NOMATCH);
3822 if (eptr >= md->end_subject)
3823 {
3824 SCHECK_PARTIAL();
3825 RRETURN(MATCH_NOMATCH);
3826 }
3827 GETCHARINC(c, eptr);
3828 prop_script = UCD_SCRIPT(c);
3829 if ((prop_script == prop_value) == prop_fail_result)
3830 RRETURN(MATCH_NOMATCH);
3831 }
3832 /* Control never gets here */
3833
3834 default:
3835 RRETURN(PCRE_ERROR_INTERNAL);
3836 }
3837 }
3838
3839 /* Match extended Unicode sequences. We will get here only if the
3840 support is in the binary; otherwise a compile-time error occurs. */
3841
3842 else if (ctype == OP_EXTUNI)
3843 {
3844 for (fi = min;; fi++)
3845 {
3846 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3848 if (fi >= max) RRETURN(MATCH_NOMATCH);
3849 if (eptr >= md->end_subject)
3850 {
3851 SCHECK_PARTIAL();
3852 RRETURN(MATCH_NOMATCH);
3853 }
3854 GETCHARINCTEST(c, eptr);
3855 prop_category = UCD_CATEGORY(c);
3856 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3857 while (eptr < md->end_subject)
3858 {
3859 int len = 1;
3860 if (!utf8) c = *eptr;
3861 else { GETCHARLEN(c, eptr, len); }
3862 prop_category = UCD_CATEGORY(c);
3863 if (prop_category != ucp_M) break;
3864 eptr += len;
3865 }
3866 }
3867 }
3868
3869 else
3870 #endif /* SUPPORT_UCP */
3871
3872 #ifdef SUPPORT_UTF8
3873 /* UTF-8 mode */
3874 if (utf8)
3875 {
3876 for (fi = min;; fi++)
3877 {
3878 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3879 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3880 if (fi >= max) RRETURN(MATCH_NOMATCH);
3881 if (eptr >= md->end_subject)
3882 {
3883 SCHECK_PARTIAL();
3884 RRETURN(MATCH_NOMATCH);
3885 }
3886 if (ctype == OP_ANY && IS_NEWLINE(eptr))
3887 RRETURN(MATCH_NOMATCH);
3888 GETCHARINC(c, eptr);
3889 switch(ctype)
3890 {
3891 case OP_ANY: /* This is the non-NL case */
3892 case OP_ALLANY:
3893 case OP_ANYBYTE:
3894 break;
3895
3896 case OP_ANYNL:
3897 switch(c)
3898 {
3899 default: RRETURN(MATCH_NOMATCH);
3900 case 0x000d:
3901 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3902 break;
3903 case 0x000a:
3904 break;
3905
3906 case 0x000b:
3907 case 0x000c:
3908 case 0x0085:
3909 case 0x2028:
3910 case 0x2029:
3911 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3912 break;
3913 }
3914 break;
3915
3916 case OP_NOT_HSPACE:
3917 switch(c)
3918 {
3919 default: break;
3920 case 0x09: /* HT */
3921 case 0x20: /* SPACE */
3922 case 0xa0: /* NBSP */
3923 case 0x1680: /* OGHAM SPACE MARK */
3924 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3925 case 0x2000: /* EN QUAD */
3926 case 0x2001: /* EM QUAD */
3927 case 0x2002: /* EN SPACE */
3928 case 0x2003: /* EM SPACE */
3929 case 0x2004: /* THREE-PER-EM SPACE */
3930 case 0x2005: /* FOUR-PER-EM SPACE */
3931 case 0x2006: /* SIX-PER-EM SPACE */
3932 case 0x2007: /* FIGURE SPACE */
3933 case 0x2008: /* PUNCTUATION SPACE */
3934 case 0x2009: /* THIN SPACE */
3935 case 0x200A: /* HAIR SPACE */
3936 case 0x202f: /* NARROW NO-BREAK SPACE */
3937 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3938 case 0x3000: /* IDEOGRAPHIC SPACE */
3939 RRETURN(MATCH_NOMATCH);
3940 }
3941 break;
3942
3943 case OP_HSPACE:
3944 switch(c)
3945 {
3946 default: RRETURN(MATCH_NOMATCH);
3947 case 0x09: /* HT */
3948 case 0x20: /* SPACE */
3949 case 0xa0: /* NBSP */
3950 case 0x1680: /* OGHAM SPACE MARK */
3951 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3952 case 0x2000: /* EN QUAD */
3953 case 0x2001: /* EM QUAD */
3954 case 0x2002: /* EN SPACE */
3955 case 0x2003: /* EM SPACE */
3956 case 0x2004: /* THREE-PER-EM SPACE */
3957 case 0x2005: /* FOUR-PER-EM SPACE */
3958 case 0x2006: /* SIX-PER-EM SPACE */
3959 case 0x2007: /* FIGURE SPACE */
3960 case 0x2008: /* PUNCTUATION SPACE */
3961 case 0x2009: /* THIN SPACE */
3962 case 0x200A: /* HAIR SPACE */
3963 case 0x202f: /* NARROW NO-BREAK SPACE */
3964 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3965 case 0x3000: /* IDEOGRAPHIC SPACE */
3966 break;
3967 }
3968 break;
3969
3970 case OP_NOT_VSPACE:
3971 switch(c)
3972 {
3973 default: break;
3974 case 0x0a: /* LF */
3975 case 0x0b: /* VT */
3976 case 0x0c: /* FF */
3977 case 0x0d: /* CR */
3978 case 0x85: /* NEL */
3979 case 0x2028: /* LINE SEPARATOR */
3980 case 0x2029: /* PARAGRAPH SEPARATOR */
3981 RRETURN(MATCH_NOMATCH);
3982 }
3983 break;
3984
3985 case OP_VSPACE:
3986 switch(c)
3987 {
3988 default: RRETURN(MATCH_NOMATCH);
3989 case 0x0a: /* LF */
3990 case 0x0b: /* VT */
3991 case 0x0c: /* FF */
3992 case 0x0d: /* CR */
3993 case 0x85: /* NEL */
3994 case 0x2028: /* LINE SEPARATOR */
3995 case 0x2029: /* PARAGRAPH SEPARATOR */
3996 break;
3997 }
3998 break;
3999
4000 case OP_NOT_DIGIT:
4001 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4002 RRETURN(MATCH_NOMATCH);
4003 break;
4004
4005 case OP_DIGIT:
4006 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4007 RRETURN(MATCH_NOMATCH);
4008 break;
4009
4010 case OP_NOT_WHITESPACE:
4011 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4012 RRETURN(MATCH_NOMATCH);
4013 break;
4014
4015 case OP_WHITESPACE:
4016 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4017 RRETURN(MATCH_NOMATCH);
4018 break;
4019
4020 case OP_NOT_WORDCHAR:
4021 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4022 RRETURN(MATCH_NOMATCH);
4023 break;
4024
4025 case OP_WORDCHAR:
4026 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4027 RRETURN(MATCH_NOMATCH);
4028 break;
4029
4030 default:
4031 RRETURN(PCRE_ERROR_INTERNAL);
4032 }
4033 }
4034 }
4035 else
4036 #endif
4037 /* Not UTF-8 mode */
4038 {
4039 for (fi = min;; fi++)
4040 {
4041 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4042 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4043 if (fi >= max) RRETURN(MATCH_NOMATCH);
4044 if (eptr >= md->end_subject)
4045 {
4046 SCHECK_PARTIAL();
4047 RRETURN(MATCH_NOMATCH);
4048 }
4049 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4050 RRETURN(MATCH_NOMATCH);
4051 c = *eptr++;
4052 switch(ctype)
4053 {
4054 case OP_ANY: /* This is the non-NL case */
4055 case OP_ALLANY:
4056 case OP_ANYBYTE:
4057 break;
4058
4059 case OP_ANYNL:
4060 switch(c)
4061 {
4062 default: RRETURN(MATCH_NOMATCH);
4063 case 0x000d:
4064 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4065 break;
4066
4067 case 0x000a:
4068 break;
4069
4070 case 0x000b:
4071 case 0x000c:
4072 case 0x0085:
4073 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4074 break;
4075 }
4076 break;
4077
4078 case OP_NOT_HSPACE:
4079 switch(c)
4080 {
4081 default: break;
4082 case 0x09: /* HT */
4083 case 0x20: /* SPACE */
4084 case 0xa0: /* NBSP */
4085 RRETURN(MATCH_NOMATCH);
4086 }
4087 break;
4088
4089 case OP_HSPACE:
4090 switch(c)
4091 {
4092 default: RRETURN(MATCH_NOMATCH);
4093 case 0x09: /* HT */
4094 case 0x20: /* SPACE */
4095 case 0xa0: /* NBSP */
4096 break;
4097 }
4098 break;
4099
4100 case OP_NOT_VSPACE:
4101 switch(c)
4102 {
4103 default: break;
4104 case 0x0a: /* LF */
4105 case 0x0b: /* VT */
4106 case 0x0c: /* FF */
4107 case 0x0d: /* CR */
4108 case 0x85: /* NEL */
4109 RRETURN(MATCH_NOMATCH);
4110 }
4111 break;
4112
4113 case OP_VSPACE:
4114 switch(c)
4115 {
4116 default: RRETURN(MATCH_NOMATCH);
4117 case 0x0a: /* LF */
4118 case 0x0b: /* VT */
4119 case 0x0c: /* FF */
4120 case 0x0d: /* CR */
4121 case 0x85: /* NEL */
4122 break;
4123 }
4124 break;
4125
4126 case OP_NOT_DIGIT:
4127 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
4128 break;
4129
4130 case OP_DIGIT:
4131 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
4132 break;
4133
4134 case OP_NOT_WHITESPACE:
4135 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
4136 break;
4137
4138 case OP_WHITESPACE:
4139 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
4140 break;
4141
4142 case OP_NOT_WORDCHAR:
4143 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
4144 break;
4145
4146 case OP_WORDCHAR:
4147 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
4148 break;
4149
4150 default:
4151 RRETURN(PCRE_ERROR_INTERNAL);
4152 }
4153 }
4154 }
4155 /* Control never gets here */
4156 }
4157
4158 /* If maximizing, it is worth using inline code for speed, doing the type
4159 test once at the start (i.e. keep it out of the loop). Again, keep the
4160 UTF-8 and UCP stuff separate. */
4161
4162 else
4163 {
4164 pp = eptr; /* Remember where we started */
4165
4166 #ifdef SUPPORT_UCP
4167 if (prop_type >= 0)
4168 {
4169 switch(prop_type)
4170 {
4171 case PT_ANY:
4172 for (i = min; i < max; i++)
4173 {
4174 int len = 1;
4175 if (eptr >= md->end_subject) break;
4176 GETCHARLEN(c, eptr, len);
4177 if (prop_fail_result) break;
4178 eptr+= len;
4179 }
4180 break;
4181
4182 case PT_LAMP:
4183 for (i = min; i < max; i++)
4184 {
4185 int len = 1;
4186 if (eptr >= md->end_subject) break;
4187 GETCHARLEN(c, eptr, len);
4188 prop_chartype = UCD_CHARTYPE(c);
4189 if ((prop_chartype == ucp_Lu ||
4190 prop_chartype == ucp_Ll ||
4191 prop_chartype == ucp_Lt) == prop_fail_result)
4192 break;
4193 eptr+= len;
4194 }
4195 break;
4196
4197 case PT_GC:
4198 for (i = min; i < max; i++)
4199 {
4200 int len = 1;
4201 if (eptr >= md->end_subject) break;
4202 GETCHARLEN(c, eptr, len);
4203 prop_category = UCD_CATEGORY(c);
4204 if ((prop_category == prop_value) == prop_fail_result)
4205 break;
4206 eptr+= len;
4207 }
4208 break;
4209
4210 case PT_PC:
4211 for (i = min; i < max; i++)
4212 {
4213 int len = 1;
4214 if (eptr >= md->end_subject) break;
4215 GETCHARLEN(c, eptr, len);
4216 prop_chartype = UCD_CHARTYPE(c);
4217 if ((prop_chartype == prop_value) == prop_fail_result)
4218 break;
4219 eptr+= len;
4220 }
4221 break;
4222
4223 case PT_SC:
4224 for (i = min; i < max; i++)
4225 {
4226 int len = 1;
4227 if (eptr >= md->end_subject) break;
4228 GETCHARLEN(c, eptr, len);
4229 prop_script = UCD_SCRIPT(c);
4230 if ((prop_script == prop_value) == prop_fail_result)
4231 break;
4232 eptr+= len;
4233 }
4234 break;
4235 }
4236
4237 /* eptr is now past the end of the maximum run */
4238
4239 if (possessive) continue;
4240 for(;;)
4241 {
4242 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4243 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4244 if (eptr-- == pp) break; /* Stop if tried at original pos */
4245 if (utf8) BACKCHAR(eptr);
4246 }
4247 }
4248
4249 /* Match extended Unicode sequences. We will get here only if the
4250 support is in the binary; otherwise a compile-time error occurs. */
4251
4252 else if (ctype == OP_EXTUNI)
4253 {
4254 for (i = min; i < max; i++)
4255 {
4256 if (eptr >= md->end_subject) break;
4257 GETCHARINCTEST(c, eptr);
4258 prop_category = UCD_CATEGORY(c);
4259 if (prop_category == ucp_M) break;
4260 while (eptr < md->end_subject)
4261 {
4262 int len = 1;
4263 if (!utf8) c = *eptr; else
4264 {
4265 GETCHARLEN(c, eptr, len);
4266 }
4267 prop_category = UCD_CATEGORY(c);
4268 if (prop_category != ucp_M) break;
4269 eptr += len;
4270 }
4271 }
4272
4273 /* eptr is now past the end of the maximum run */
4274
4275 if (possessive) continue;
4276 for(;;)
4277 {
4278 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4279 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4280 if (eptr-- == pp) break; /* Stop if tried at original pos */
4281 for (;;) /* Move back over one extended */
4282 {
4283 int len = 1;
4284 if (!utf8) c = *eptr; else
4285 {
4286 BACKCHAR(eptr);
4287 GETCHARLEN(c, eptr, len);
4288 }
4289 prop_category = UCD_CATEGORY(c);
4290 if (prop_category != ucp_M) break;
4291 eptr--;
4292 }
4293 }
4294 }
4295
4296 else
4297 #endif /* SUPPORT_UCP */
4298
4299 #ifdef SUPPORT_UTF8
4300 /* UTF-8 mode */
4301
4302 if (utf8)
4303 {
4304 switch(ctype)
4305 {
4306 case OP_ANY:
4307 if (max < INT_MAX)
4308 {
4309 for (i = min; i < max; i++)
4310 {
4311 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4312 eptr++;
4313 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4314 }
4315 }
4316
4317 /* Handle unlimited UTF-8 repeat */
4318
4319 else
4320 {
4321 for (i = min; i < max; i++)
4322 {
4323 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4324 eptr++;
4325 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4326 }
4327 }
4328 break;
4329
4330 case OP_ALLANY:
4331 if (max < INT_MAX)
4332 {
4333 for (i = min; i < max; i++)
4334 {
4335 if (eptr >= md->end_subject) break;
4336 eptr++;
4337 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4338 }
4339 }
4340 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
4341 break;
4342
4343 /* The byte case is the same as non-UTF8 */
4344
4345 case OP_ANYBYTE:
4346 c = max - min;
4347 if (c > (unsigned int)(md->end_subject - eptr))
4348 c = md->end_subject - eptr;
4349 eptr += c;
4350 break;
4351
4352 case OP_ANYNL:
4353 for (i = min; i < max; i++)
4354 {
4355 int len = 1;
4356 if (eptr >= md->end_subject) break;
4357 GETCHARLEN(c, eptr, len);
4358 if (c == 0x000d)
4359 {
4360 if (++eptr >= md->end_subject) break;
4361 if (*eptr == 0x000a) eptr++;
4362 }
4363 else
4364 {
4365 if (c != 0x000a &&
4366 (md->bsr_anycrlf ||
4367 (c != 0x000b && c != 0x000c &&
4368 c != 0x0085 && c != 0x2028 && c != 0x2029)))
4369 break;
4370 eptr += len;
4371 }
4372 }
4373 break;
4374
4375 case OP_NOT_HSPACE:
4376 case OP_HSPACE:
4377 for (i = min; i < max; i++)
4378 {
4379 BOOL gotspace;
4380 int len = 1;
4381 if (eptr >= md->end_subject) break;
4382 GETCHARLEN(c, eptr, len);
4383 switch(c)
4384 {
4385 default: gotspace = FALSE; break;
4386 case 0x09: /* HT */
4387 case 0x20: /* SPACE */
4388 case 0xa0: /* NBSP */
4389 case 0x1680: /* OGHAM SPACE MARK */
4390 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4391 case 0x2000: /* EN QUAD */
4392 case 0x2001: /* EM QUAD */
4393 case 0x2002: /* EN SPACE */
4394 case 0x2003: /* EM SPACE */
4395 case 0x2004: /* THREE-PER-EM SPACE */
4396 case 0x2005: /* FOUR-PER-EM SPACE */
4397 case 0x2006: /* SIX-PER-EM SPACE */
4398 case 0x2007: /* FIGURE SPACE */
4399 case 0x2008: /* PUNCTUATION SPACE */
4400 case 0x2009: /* THIN SPACE */
4401 case 0x200A: /* HAIR SPACE */
4402 case 0x202f: /* NARROW NO-BREAK SPACE */
4403 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4404 case 0x3000: /* IDEOGRAPHIC SPACE */
4405 gotspace = TRUE;
4406 break;
4407 }
4408 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4409 eptr += len;
4410 }
4411 break;
4412
4413 case OP_NOT_VSPACE:
4414 case OP_VSPACE:
4415 for (i = min; i < max; i++)
4416 {
4417 BOOL gotspace;
4418 int len = 1;
4419 if (eptr >= md->end_subject) break;
4420 GETCHARLEN(c, eptr, len);
4421 switch(c)
4422 {
4423 default: gotspace = FALSE; break;
4424 case 0x0a: /* LF */
4425 case 0x0b: /* VT */
4426 case 0x0c: /* FF */
4427 case 0x0d: /* CR */
4428 case 0x85: /* NEL */
4429 case 0x2028: /* LINE SEPARATOR */
4430 case 0x2029: /* PARAGRAPH SEPARATOR */
4431 gotspace = TRUE;
4432 break;
4433 }
4434 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4435 eptr += len;
4436 }
4437 break;
4438
4439 case OP_NOT_DIGIT:
4440 for (i = min; i < max; i++)
4441 {
4442 int len = 1;
4443 if (eptr >= md->end_subject) break;
4444 GETCHARLEN(c, eptr, len);
4445 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4446 eptr+= len;
4447 }
4448 break;
4449
4450 case OP_DIGIT:
4451 for (i = min; i < max; i++)
4452 {
4453 int len = 1;
4454 if (eptr >= md->end_subject) break;
4455 GETCHARLEN(c, eptr, len);
4456 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4457 eptr+= len;
4458 }
4459 break;
4460
4461 case OP_NOT_WHITESPACE:
4462 for (i = min; i < max; i++)
4463 {
4464 int len = 1;
4465 if (eptr >= md->end_subject) break;
4466 GETCHARLEN(c, eptr, len);
4467 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4468 eptr+= len;
4469 }
4470 break;
4471
4472 case OP_WHITESPACE:
4473 for (i = min; i < max; i++)
4474 {
4475 int len = 1;
4476 if (eptr >= md->end_subject) break;
4477 GETCHARLEN(c, eptr, len);
4478 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4479 eptr+= len;
4480 }
4481 break;
4482
4483 case OP_NOT_WORDCHAR:
4484 for (i = min; i < max; i++)
4485 {
4486 int len = 1;
4487 if (eptr >= md->end_subject) break;
4488 GETCHARLEN(c, eptr, len);
4489 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4490 eptr+= len;
4491 }
4492 break;
4493
4494 case OP_WORDCHAR:
4495 for (i = min; i < max; i++)
4496 {
4497 int len = 1;
4498 if (eptr >= md->end_subject) break;
4499 GETCHARLEN(c, eptr, len);
4500 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4501 eptr+= len;
4502 }
4503 break;
4504
4505 default:
4506 RRETURN(PCRE_ERROR_INTERNAL);
4507 }
4508
4509 /* eptr is now past the end of the maximum run */
4510
4511 if (possessive) continue;
4512 for(;;)
4513 {
4514 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4515 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4516 if (eptr-- == pp) break; /* Stop if tried at original pos */
4517 BACKCHAR(eptr);
4518 }
4519 }
4520 else
4521 #endif /* SUPPORT_UTF8 */
4522
4523 /* Not UTF-8 mode */
4524 {
4525 switch(ctype)
4526 {
4527 case OP_ANY:
4528 for (i = min; i < max; i++)
4529 {
4530 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4531 eptr++;
4532 }
4533 break;
4534
4535 case OP_ALLANY:
4536 case OP_ANYBYTE:
4537 c = max - min;
4538 if (c > (unsigned int)(md->end_subject - eptr))
4539 c = md->end_subject - eptr;
4540 eptr += c;
4541 break;
4542
4543 case OP_ANYNL:
4544 for (i = min; i < max; i++)
4545 {
4546 if (eptr >= md->end_subject) break;
4547 c = *eptr;
4548 if (c == 0x000d)
4549 {
4550 if (++eptr >= md->end_subject) break;
4551 if (*eptr == 0x000a) eptr++;
4552 }
4553 else
4554 {
4555 if (c != 0x000a &&
4556 (md->bsr_anycrlf ||
4557 (c != 0x000b && c != 0x000c && c != 0x0085)))
4558 break;
4559 eptr++;
4560 }
4561 }
4562 break;
4563
4564 case OP_NOT_HSPACE:
4565 for (i = min; i < max; i++)
4566 {
4567 if (eptr >= md->end_subject) break;
4568 c = *eptr;
4569 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4570 eptr++;
4571 }
4572 break;
4573
4574 case OP_HSPACE:
4575 for (i = min; i < max; i++)
4576 {
4577 if (eptr >= md->end_subject) break;
4578 c = *eptr;
4579 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4580 eptr++;
4581 }
4582 break;
4583
4584 case OP_NOT_VSPACE:
4585 for (i = min; i < max; i++)
4586 {
4587 if (eptr >= md->end_subject) break;
4588 c = *eptr;
4589 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4590 break;
4591 eptr++;
4592 }
4593 break;
4594
4595 case OP_VSPACE:
4596 for (i = min; i < max; i++)
4597 {
4598 if (eptr >= md->end_subject) break;
4599 c = *eptr;
4600 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4601 break;
4602 eptr++;
4603 }
4604 break;
4605
4606 case OP_NOT_DIGIT:
4607 for (i = min; i < max; i++)
4608 {
4609 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4610 break;
4611 eptr++;
4612 }
4613 break;
4614
4615 case OP_DIGIT:
4616 for (i = min; i < max; i++)
4617 {
4618 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4619 break;
4620 eptr++;
4621 }
4622 break;
4623
4624 case OP_NOT_WHITESPACE:
4625 for (i = min; i < max; i++)
4626 {
4627 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4628 break;
4629 eptr++;
4630 }
4631 break;
4632
4633 case OP_WHITESPACE:
4634 for (i = min; i < max; i++)
4635 {
4636 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4637 break;
4638 eptr++;
4639 }
4640 break;
4641
4642 case OP_NOT_WORDCHAR:
4643 for (i = min; i < max; i++)
4644 {
4645 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4646 break;
4647 eptr++;
4648 }
4649 break;
4650
4651 case OP_WORDCHAR:
4652 for (i = min; i < max; i++)
4653 {
4654 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4655 break;
4656 eptr++;
4657 }
4658 break;
4659
4660 default:
4661 RRETURN(PCRE_ERROR_INTERNAL);
4662 }
4663
4664 /* eptr is now past the end of the maximum run */
4665
4666 if (possessive) continue;
4667 while (eptr >= pp)
4668 {
4669 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4670 eptr--;
4671 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4672 }
4673 }
4674
4675 /* Get here if we can't make it match with any permitted repetitions */
4676
4677 RRETURN(MATCH_NOMATCH);
4678 }
4679 /* Control never gets here */
4680
4681 /* There's been some horrible disaster. Arrival here can only mean there is
4682 something seriously wrong in the code above or the OP_xxx definitions. */
4683
4684 default:
4685 DPRINTF(("Unknown opcode %d\n", *ecode));
4686 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4687 }
4688
4689 /* Do not stick any code in here without much thought; it is assumed
4690 that "continue" in the code above comes out to here to repeat the main
4691 loop. */
4692
4693 } /* End of main loop */
4694 /* Control never reaches here */
4695
4696
4697 /* When compiling to use the heap rather than the stack for recursive calls to
4698 match(), the RRETURN() macro jumps here. The number that is saved in
4699 frame->Xwhere indicates which label we actually want to return to. */
4700
4701 #ifdef NO_RECURSE
4702 #define LBL(val) case val: goto L_RM##val;
4703 HEAP_RETURN:
4704 switch (frame->Xwhere)
4705 {
4706 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4707 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4708 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4709 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4710 LBL(53) LBL(54)
4711 #ifdef SUPPORT_UTF8
4712 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4713 LBL(32) LBL(34) LBL(42) LBL(46)
4714 #ifdef SUPPORT_UCP
4715 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4716 #endif /* SUPPORT_UCP */
4717 #endif /* SUPPORT_UTF8 */
4718 default:
4719 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4720 return PCRE_ERROR_INTERNAL;
4721 }
4722 #undef LBL
4723 #endif /* NO_RECURSE */
4724 }
4725
4726
4727 /***************************************************************************
4728 ****************************************************************************
4729 RECURSION IN THE match() FUNCTION
4730
4731 Undefine all the macros that were defined above to handle this. */
4732
/* In the NO_RECURSE build the names below exist as macros for the benefit of
match(). They must not leak into the rest of the file, where several of them
(for example "length", "ims" and "flags") are used as ordinary identifiers.
The list is kept in alphabetical order; the order of #undef directives has no
effect on the result. */

#ifdef NO_RECURSE

#undef callpat
#undef charptr
#undef condition
#undef ctype
#undef cur_is_word
#undef data
#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef ims
#undef length
#undef max
#undef min
#undef mstart
#undef new_recursive
#undef newptrb
#undef next
#undef number
#undef offset
#undef offset_top
#undef op
#undef original_ims
#undef pp
#undef prev
#undef prev_is_word
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef saved_eptr
#undef stacksave

#endif  /* NO_RECURSE */

/* These two names are macros whether or not NO_RECURSE is defined, so they are
removed unconditionally. */

#undef fi
#undef fc
4779
4780 /***************************************************************************
4781 ***************************************************************************/
4782
4783
4784
4785 /*************************************************
4786 * Execute a Regular Expression *
4787 *************************************************/
4788
4789 /* This function applies a compiled re to a subject string and picks out
4790 portions of the string if it matches. Two elements in the vector are set for
4791 each substring: the offsets to the start and end of the substring.
4792
4793 Arguments:
4794 argument_re points to the compiled expression
4795 extra_data points to extra data or is NULL
4796 subject points to the subject string
4797 length length of subject string (may contain binary zeros)
4798 start_offset where to start in the subject string
4799 options option bits
4800 offsets points to a vector of ints to be filled in with offsets
4801 offsetcount the number of elements in the vector
4802
4803 Returns: > 0 => success; value is the number of elements filled in
4804 = 0 => success, but offsets is not big enough
4805 -1 => failed to match
4806 < -1 => some kind of unexpected problem
4807 */
4808
4809 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4810 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4811 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4812 int offsetcount)
4813 {
4814 int rc, resetcount, ocount;
4815 int first_byte = -1;
4816 int req_byte = -1;
4817 int req_byte2 = -1;
4818 int newline;
4819 unsigned long int ims;
4820 BOOL using_temporary_offsets = FALSE;
4821 BOOL anchored;
4822 BOOL startline;
4823 BOOL firstline;
4824 BOOL first_byte_caseless = FALSE;
4825 BOOL req_byte_caseless = FALSE;
4826 BOOL utf8;
4827 match_data match_block;
4828 match_data *md = &match_block;
4829 const uschar *tables;
4830 const uschar *start_bits = NULL;
4831 USPTR start_match = (USPTR)subject + start_offset;
4832 USPTR end_subject;
4833 USPTR start_partial = NULL;
4834 USPTR req_byte_ptr = start_match - 1;
4835
4836 pcre_study_data internal_study;
4837 const pcre_study_data *study;
4838
4839 real_pcre internal_re;
4840 const real_pcre *external_re = (const real_pcre *)argument_re;
4841 const real_pcre *re = external_re;
4842
4843 /* Plausibility checks */
4844
4845 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4846 if (re == NULL || subject == NULL ||
4847 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4848 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4849
4850 /* Fish out the optional data from the extra_data structure, first setting
4851 the default values. */
4852
4853 study = NULL;
4854 md->match_limit = MATCH_LIMIT;
4855 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4856 md->callout_data = NULL;
4857
4858 /* The table pointer is always in native byte order. */
4859
4860 tables = external_re->tables;
4861
4862 if (extra_data != NULL)
4863 {
4864 register unsigned int flags = extra_data->flags;
4865 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4866 study = (const pcre_study_data *)extra_data->study_data;
4867 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4868 md->match_limit = extra_data->match_limit;
4869 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4870 md->match_limit_recursion = extra_data->match_limit_recursion;
4871 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4872 md->callout_data = extra_data->callout_data;
4873 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4874 }
4875
4876 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4877 is a feature that makes it possible to save compiled regex and re-use them
4878 in other programs later. */
4879
4880 if (tables == NULL) tables = _pcre_default_tables;
4881
4882 /* Check that the first field in the block is the magic number. If it is not,
4883 test for a regex that was compiled on a host of opposite endianness. If this is
4884 the case, flipped values are put in internal_re and internal_study if there was
4885 study data too. */
4886
4887 if (re->magic_number != MAGIC_NUMBER)
4888 {
4889 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4890 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4891 if (study != NULL) study = &internal_study;
4892 }
4893
4894 /* Set up other data */
4895
4896 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4897 startline = (re->flags & PCRE_STARTLINE) != 0;
4898 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4899
4900 /* The code starts after the real_pcre block and the capture name table. */
4901
4902 md->start_code = (const uschar *)external_re + re->name_table_offset +
4903 re->name_count * re->name_entry_size;
4904
4905 md->start_subject = (USPTR)subject;
4906 md->start_offset = start_offset;
4907 md->end_subject = md->start_subject + length;
4908 end_subject = md->end_subject;
4909
4910 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4911 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4912 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4913
4914 md->notbol = (options & PCRE_NOTBOL) != 0;
4915 md->noteol = (options & PCRE_NOTEOL) != 0;
4916 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4917 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
4918 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
4919 md->hitend = FALSE;
4920
4921 md->recursive = NULL; /* No recursion at top level */
4922
4923 md->lcc = tables + lcc_offset;
4924 md->ctypes = tables + ctypes_offset;
4925
4926 /* Handle different \R options. */
4927
4928 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4929 {
4930 case 0:
4931 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4932 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4933 else
4934 #ifdef BSR_ANYCRLF
4935 md->bsr_anycrlf = TRUE;
4936 #else
4937 md->bsr_anycrlf = FALSE;
4938 #endif
4939 break;
4940
4941 case PCRE_BSR_ANYCRLF:
4942 md->bsr_anycrlf = TRUE;
4943 break;
4944
4945 case PCRE_BSR_UNICODE:
4946 md->bsr_anycrlf = FALSE;
4947 break;
4948
4949 default: return PCRE_ERROR_BADNEWLINE;
4950 }
4951
4952 /* Handle different types of newline. The three bits give eight cases. If
4953 nothing is set at run time, whatever was used at compile time applies. */
4954
4955 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4956 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4957 {
4958 case 0: newline = NEWLINE; break; /* Compile-time default */
4959 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4960 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4961 case PCRE_NEWLINE_CR+
4962 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4963 case PCRE_NEWLINE_ANY: newline = -1; break;
4964 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4965 default: return PCRE_ERROR_BADNEWLINE;
4966 }
4967
4968 if (newline == -2)
4969 {
4970 md->nltype = NLTYPE_ANYCRLF;
4971 }
4972 else if (newline < 0)
4973 {
4974 md->nltype = NLTYPE_ANY;
4975 }
4976 else
4977 {
4978 md->nltype = NLTYPE_FIXED;
4979 if (newline > 255)
4980 {
4981 md->nllen = 2;
4982 md->nl[0] = (newline >> 8) & 255;
4983 md->nl[1] = newline & 255;
4984 }
4985 else
4986 {
4987 md->nllen = 1;
4988 md->nl[0] = newline;
4989 }
4990 }
4991
4992 /* Partial matching was originally supported only for a restricted set of
4993 regexes; from release 8.00 there are no restrictions, but the bits are still
4994 defined (though never set). So there's no harm in leaving this code. */
4995
4996 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4997 return PCRE_ERROR_BADPARTIAL;
4998
4999 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5000 back the character offset. */
5001
5002 #ifdef SUPPORT_UTF8
5003 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5004 {
5005 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5006 return PCRE_ERROR_BADUTF8;
5007 if (start_offset > 0 && start_offset < length)
5008 {
5009 int tb = ((USPTR)subject)[start_offset];
5010 if (tb > 127)
5011 {
5012 tb &= 0xc0;
5013 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5014 }
5015 }
5016 }
5017 #endif
5018
5019 /* The ims options can vary during the matching as a result of the presence
5020 of (?ims) items in the pattern. They are kept in a local variable so that
5021 restoring at the exit of a group is easy. */
5022
5023 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5024
5025 /* If the expression has got more back references than the offsets supplied can
5026 hold, we get a temporary chunk of working store to use during the matching.
5027 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5028 of 3. */
5029
5030 ocount = offsetcount - (offsetcount % 3);
5031
5032 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5033 {
5034 ocount = re->top_backref * 3 + 3;
5035 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5036 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5037 using_temporary_offsets = TRUE;
5038 DPRINTF(("Got memory to hold back references\n"));
5039 }
5040 else md->offset_vector = offsets;
5041
5042 md->offset_end = ocount;
5043 md->offset_max = (2*ocount)/3;
5044 md->offset_overflow = FALSE;
5045 md->capture_last = -1;
5046
5047 /* Compute the minimum number of offsets that we need to reset each time. Doing
5048 this makes a huge difference to execution time when there aren't many brackets
5049 in the pattern. */
5050
5051 resetcount = 2 + re->top_bracket * 2;
5052 if (resetcount > offsetcount) resetcount = ocount;
5053
5054 /* Reset the working variable associated with each extraction. These should
5055 never be used unless previously set, but they get saved and restored, and so we
5056 initialize them to avoid reading uninitialized locations. */
5057
5058 if (md->offset_vector != NULL)
5059 {
5060 register int *iptr = md->offset_vector + ocount;
5061 register int *iend = iptr - resetcount/2 + 1;
5062 while (--iptr >= iend) *iptr = -1;
5063 }
5064
5065 /* Set up the first character to match, if available. The first_byte value is
5066 never set for an anchored regular expression, but the anchoring may be forced
5067 at run time, so we have to test for anchoring. The first char may be unset for
5068 an unanchored pattern, of course. If there's no first char and the pattern was
5069 studied, there may be a bitmap of possible first characters. */
5070
5071 if (!anchored)
5072 {
5073 if ((re->flags & PCRE_FIRSTSET) != 0)
5074 {
5075 first_byte = re->first_byte & 255;
5076 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5077 first_byte = md->lcc[first_byte];
5078 }
5079 else
5080 if (!startline && study != NULL &&
5081 (study->options & PCRE_STUDY_MAPPED) != 0)
5082 start_bits = study->start_bits;
5083 }
5084
5085 /* For anchored or unanchored matches, there may be a "last known required
5086 character" set. */
5087
5088 if ((re->flags & PCRE_REQCHSET) != 0)
5089 {
5090 req_byte = re->req_byte & 255;
5091 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5092 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5093 }
5094
5095
5096 /* ==========================================================================*/
5097
5098 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5099 the loop runs just once. */
5100
5101 for(;;)
5102 {
5103 USPTR save_end_subject = end_subject;
5104 USPTR new_start_match;
5105
5106 /* Reset the maximum number of extractions we might see. */
5107
5108 if (md->offset_vector != NULL)
5109 {
5110 register int *iptr = md->offset_vector;
5111 register int *iend = iptr + resetcount;
5112 while (iptr < iend) *iptr++ = -1;
5113 }
5114
5115 /* If firstline is TRUE, the start of the match is constrained to the first
5116 line of a multiline string. That is, the match must be before or at the first
5117 newline. Implement this by temporarily adjusting end_subject so that we stop
5118 scanning at a newline. If the match fails at the newline, later code breaks
5119 this loop. */
5120
5121 if (firstline)
5122 {
5123 USPTR t = start_match;
5124 #ifdef SUPPORT_UTF8
5125 if (utf8)
5126 {
5127 while (t < md->end_subject && !IS_NEWLINE(t))
5128 {
5129 t++;
5130 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5131 }
5132 }
5133 else
5134 #endif
5135 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5136 end_subject = t;
5137 }
5138
5139 /* There are some optimizations that avoid running the match if a known
5140 starting point is not found, or if a known later character is not present.
5141 However, there is an option that disables these, for testing and for ensuring
5142 that all callouts do actually occur. */
5143
5144 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5145 {
5146 /* Advance to a unique first byte if there is one. */
5147
5148 if (first_byte >= 0)
5149 {
5150 if (first_byte_caseless)
5151 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5152 start_match++;
5153 else
5154 while (start_match < end_subject && *start_match != first_byte)
5155 start_match++;
5156 }
5157
5158 /* Or to just after a linebreak for a multiline match */
5159
5160 else if (startline)
5161 {
5162 if (start_match > md->start_subject + start_offset)
5163 {
5164 #ifdef SUPPORT_UTF8
5165 if (utf8)
5166 {
5167 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5168 {
5169 start_match++;
5170 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5171 start_match++;
5172 }
5173 }
5174 else
5175 #endif
5176 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5177 start_match++;
5178
5179 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5180 and we are now at a LF, advance the match position by one more character.
5181 */
5182
5183 if (start_match[-1] == CHAR_CR &&
5184 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5185 start_match < end_subject &&
5186 *start_match == CHAR_NL)
5187 start_match++;
5188 }
5189 }
5190
5191 /* Or to a non-unique first byte after study */
5192
5193 else if (start_bits != NULL)
5194 {
5195 while (start_match < end_subject)
5196 {
5197 register unsigned int c = *start_match;
5198 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5199 else break;
5200 }
5201 }
5202 } /* Starting optimizations */
5203
5204 /* Restore fudged end_subject */
5205
5206 end_subject = save_end_subject;
5207
5208 #ifdef DEBUG /* Sigh. Some compilers never learn. */
5209 printf(">>>> Match against: ");
5210 pchars(start_match, end_subject - start_match, TRUE, md);
5211 printf("\n");
5212 #endif
5213
5214 /* If req_byte is set, we know that that character must appear in the
5215 subject for the match to succeed. If the first character is set, req_byte
5216 must be later in the subject; otherwise the test starts at the match point.
5217 This optimization can save a huge amount of backtracking in patterns with
5218 nested unlimited repeats that aren't going to match. Writing separate code
5219 for cased/caseless versions makes it go faster, as does using an
5220 autoincrement and backing off on a match.
5221
5222 HOWEVER: when the subject string is very, very long, searching to its end
5223 can take a long time, and give bad performance on quite ordinary patterns.
5224 This showed up when somebody was matching something like /^\d+C/ on a
5225 32-megabyte string... so we don't do this when the string is sufficiently
5226 long.
5227
5228 ALSO: this processing is disabled when partial matching is requested, or if
5229 disabling is explicitly requested. */
5230
5231 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
5232 req_byte >= 0 &&
5233 end_subject - start_match < REQ_BYTE_MAX &&
5234 !md->partial)
5235 {
5236 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
5237
5238 /* We don't need to repeat the search if we haven't yet reached the
5239 place we found it at last time. */
5240
5241 if (p > req_byte_ptr)
5242 {
5243 if (req_byte_caseless)
5244 {
5245 while (p < end_subject)
5246 {
5247 register int pp = *p++;
5248 if (pp == req_byte || pp == req_byte2) { p--; break; }
5249 }
5250 }
5251 else
5252 {
5253 while (p < end_subject)
5254 {
5255 if (*p++ == req_byte) { p--; break; }
5256 }
5257 }
5258
5259 /* If we can't find the required character, break the matching loop,
5260 forcing a match failure. */
5261
5262 if (p >= end_subject)
5263 {
5264 rc = MATCH_NOMATCH;
5265 break;
5266 }
5267
5268 /* If we have found the required character, save the point where we
5269 found it, so that we don't search again next time round the loop if
5270 the start hasn't passed this character yet. */
5271
5272 req_byte_ptr = p;
5273 }
5274 }
5275
5276 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
5277 first starting point for which a partial match was found. */
5278
5279 md->start_match_ptr = start_match;
5280 md->match_call_count = 0;
5281 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
5282 if (md->hitend && start_partial == NULL) start_partial = start_match;
5283
5284 switch(rc)
5285 {
5286 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
5287 exactly like PRUNE. */
5288
5289 case MATCH_NOMATCH:
5290 case MATCH_PRUNE:
5291 case MATCH_THEN:
5292 new_start_match = start_match + 1;
5293 #ifdef SUPPORT_UTF8
5294 if (utf8)
5295 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
5296 new_start_match++;
5297 #endif
5298 break;
5299
5300 /* SKIP passes back the next starting point explicitly. */
5301
5302 case MATCH_SKIP:
5303 new_start_match = md->start_match_ptr;
5304 break;
5305
5306 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
5307
5308 case MATCH_COMMIT:
5309 rc = MATCH_NOMATCH;
5310 goto ENDLOOP;
5311
5312 /* Any other return is some kind of error. */
5313
5314 default:
5315 goto ENDLOOP;
5316 }
5317
5318 /* Control reaches here for the various types of "no match at this point"
5319 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
5320
5321 rc = MATCH_NOMATCH;
5322
5323 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
5324 newline in the subject (though it may continue over the newline). Therefore,
5325 if we have just failed to match, starting at a newline, do not continue. */
5326
5327 if (firstline && IS_NEWLINE(start_match)) break;
5328
5329 /* Advance to new matching position */
5330
5331 start_match = new_start_match;
5332
5333 /* Break the loop if the pattern is anchored or if we have passed the end of
5334 the subject. */
5335
5336 if (anchored || start_match > end_subject) break;
5337
5338 /* If we have just passed a CR and we are now at a LF, and the pattern does
5339 not contain any explicit matches for \r or \n, and the newline option is CRLF
5340 or ANY or ANYCRLF, advance the match position by one more character. */
5341
5342 if (start_match[-1] == CHAR_CR &&
5343 start_match < end_subject &&
5344 *start_match == CHAR_NL &&
5345 (re->flags & PCRE_HASCRORLF) == 0 &&
5346 (md->nltype == NLTYPE_ANY ||
5347 md->nltype == NLTYPE_ANYCRLF ||
5348 md->nllen == 2))
5349 start_match++;
5350
5351 } /* End of for(;;) "bumpalong" loop */
5352
5353 /* ==========================================================================*/
5354
5355 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
5356 conditions is true:
5357
5358 (1) The pattern is anchored or the match was failed by (*COMMIT);
5359
5360 (2) We are past the end of the subject;
5361
5362 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
5363 this option requests that a match occur at or before the first newline in
5364 the subject.
5365
5366 When we have a match and the offset vector is big enough to deal with any
5367 backreferences, captured substring offsets will already be set up. In the case
5368 where we had to get some local store to hold offsets for backreference
5369 processing, copy those that we can. In this case there need not be overflow if
5370 certain parts of the pattern were not used, even though there are more
5371 capturing parentheses than vector slots. */
5372
5373 ENDLOOP:
5374
5375 if (rc == MATCH_MATCH)
5376 {
5377 if (using_temporary_offsets)
5378 {
5379 if (offsetcount >= 4)
5380 {
5381 memcpy(offsets + 2, md->offset_vector + 2,
5382 (offsetcount - 2) * sizeof(int));
5383 DPRINTF(("Copied offsets from temporary memory\n"));
5384 }
5385 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5386 DPRINTF(("Freeing temporary memory\n"));
5387 (pcre_free)(md->offset_vector);
5388 }
5389
5390 /* Set the return code to the number of captured strings, or 0 if there are
5391 too many to fit into the vector. */
5392
5393 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5394
5395 /* If there is space, set up the whole thing as substring 0. The value of
5396 md->start_match_ptr might be modified if \K was encountered on the success
5397 matching path. */
5398
5399 if (offsetcount < 2) rc = 0; else
5400 {
5401 offsets[0] = md->start_match_ptr - md->start_subject;
5402 offsets[1] = md->end_match_ptr - md->start_subject;
5403 }
5404
5405 DPRINTF((">>>> returning %d\n", rc));
5406 return rc;
5407 }
5408
5409 /* Control gets here if there has been an error, or if the overall match
5410 attempt has failed at all permitted starting positions. */
5411
5412 if (using_temporary_offsets)
5413 {
5414 DPRINTF(("Freeing temporary memory\n"));
5415 (pcre_free)(md->offset_vector);
5416 }
5417
5418 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
5419 {
5420 DPRINTF((">>>> error: returning %d\n", rc));
5421 return rc;
5422 }
5423 else if (start_partial != NULL)
5424 {
5425 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5426 if (offsetcount > 1)
5427 {
5428 offsets[0] = start_partial - (USPTR)subject;
5429 offsets[1] = end_subject - (USPTR)subject;
5430 }
5431 return PCRE_ERROR_PARTIAL;
5432 }
5433 else
5434 {
5435 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5436 return PCRE_ERROR_NOMATCH;
5437 }
5438 }
5439
5440 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12