/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 619 - (show annotations) (download)
Sun Jul 17 13:23:14 2011 UTC (3 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 194867 byte(s)
Fix capturing not happening in assertion conditions.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* This is a convenience macro for code that occurs many times. */
86
87 #define MRRETURN(ra) \
88 { \
89 md->mark = markptr; \
90 RRETURN(ra); \
91 }
92
93 /* Maximum number of ints of offset to save on the stack for recursive calls.
94 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95 because the offset vector is always a multiple of 3 long. */
96
97 #define REC_STACK_SAVE_MAX 30
98
/* Lower and upper repeat counts for the common repeats. In rep_max, an entry
of 0 stands for "no upper limit". */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Print a sequence of chars in printable format, stopping at the end of the
112 subject if requested.
113
114 Arguments:
115 p points to characters
116 length number to print
117 is_subject TRUE if printing from within md->start_subject
118 md pointer to matching data block, if is_subject is TRUE
119
120 Returns: nothing
121 */
122
123 static void
124 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int c;
127 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128 while (length-- > 0)
129 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 USPTR eptr_start = eptr;
159 register USPTR p = md->start_subject + md->offset_vector[offset];
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF8
185 #ifdef SUPPORT_UCP
186 if (md->utf8)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 USPTR endptr = p + length;
197 while (p < endptr)
198 {
199 int c, d;
200 if (eptr >= md->end_subject) return -1;
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (eptr + length > md->end_subject) return -1;
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (eptr + length > md->end_subject) return -1;
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return eptr - eptr_start;
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
/* Identifying numbers for the RMATCH calls, counting up from 1. Whenever this
list is altered, the code at HEAP_RETURN further down has to be changed to
match. */

enum {
  RM1 = 1,
         RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
  RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
  RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
  RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
  RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
  RM61,  RM62, RM63, RM64, RM65, RM66
};
281
282 /* These versions of the macros use the stack, as normal. There are debugging
283 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 actually used in this definition. */
285
286 #ifndef NO_RECURSE
287 #define REGISTER register
288
289 #ifdef PCRE_DEBUG
290 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 { \
292 printf("match() called in line %d\n", __LINE__); \
293 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 printf("to line %d\n", __LINE__); \
295 }
296 #define RRETURN(ra) \
297 { \
298 printf("match() returned %d from line %d ", ra, __LINE__); \
299 return ra; \
300 }
301 #else
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 #define RRETURN(ra) return ra
305 #endif
306
307 #else
308
309
310 /* These versions of the macros manage a private stack on the heap. Note that
311 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312 argument of match(), which never changes. */
313
314 #define REGISTER
315
/* Heap-based "recursive call". A new heapframe is obtained with
pcre_stack_malloc; if that fails, PCRE_ERROR_NOMEMORY is passed back through
RRETURN. The call-site number rw is recorded in the current frame's Xwhere,
the new frame is filled in from ra, rb, rc and re together with the current
mstart and markptr, its depth is one more than the current frame's, and it is
linked back to the current frame through Xprevframe. The new frame then becomes
the current one and control jumps to HEAP_RECURSE. The label L_<rw> at the end
of the expansion is where execution resumes for this call site. */
316 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 {\
318 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 frame->Xwhere = rw; \
321 newframe->Xeptr = ra;\
322 newframe->Xecode = rb;\
323 newframe->Xmstart = mstart;\
324 newframe->Xmarkptr = markptr;\
325 newframe->Xoffset_top = rc;\
326 newframe->Xeptrb = re;\
327 newframe->Xrdepth = frame->Xrdepth + 1;\
328 newframe->Xprevframe = frame;\
329 frame = newframe;\
330 DPRINTF(("restarting from line %d\n", __LINE__));\
331 goto HEAP_RECURSE;\
332 L_##rw:\
333 DPRINTF(("jumped back to line %d\n", __LINE__));\
334 }
335
/* Heap-based "return". The current frame is unlinked and released with
pcre_stack_free. If there is a previous frame, it becomes current again, ra is
stored in rrc, and control goes to HEAP_RETURN; otherwise (Xprevframe was
NULL) ra is returned from the function itself. */
336 #define RRETURN(ra)\
337 {\
338 heapframe *oldframe = frame;\
339 frame = oldframe->Xprevframe;\
340 (pcre_stack_free)(oldframe);\
341 if (frame != NULL)\
342 {\
343 rrc = ra;\
344 goto HEAP_RETURN;\
345 }\
346 return ra;\
347 }
348
349
350 /* Structure for remembering the local variables in a private frame */
351
352 typedef struct heapframe {
353 struct heapframe *Xprevframe;
354
355 /* Function arguments that may change */
356
357 USPTR Xeptr;
358 const uschar *Xecode;
359 USPTR Xmstart;
360 USPTR Xmarkptr;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth;
364
365 /* Function local variables */
366
367 USPTR Xcallpat;
368 #ifdef SUPPORT_UTF8
369 USPTR Xcharptr;
370 #endif
371 USPTR Xdata;
372 USPTR Xnext;
373 USPTR Xpp;
374 USPTR Xprev;
375 USPTR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word;
380 BOOL Xcondition;
381 BOOL Xprev_is_word;
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xprop_category;
388 int Xprop_chartype;
389 int Xprop_script;
390 int Xoclength;
391 uschar Xocchars[8];
392 #endif
393
394 int Xcodelink;
395 int Xctype;
396 unsigned int Xfc;
397 int Xfi;
398 int Xlength;
399 int Xmax;
400 int Xmin;
401 int Xnumber;
402 int Xoffset;
403 int Xop;
404 int Xsave_capture_last;
405 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
406 int Xstacksave[REC_STACK_SAVE_MAX];
407
408 eptrblock Xnewptrb;
409
410 /* Where to jump back to */
411
412 int Xwhere;
413
414 } heapframe;
415
416 #endif
417
418
419 /***************************************************************************
420 ***************************************************************************/
421
422
423
424 /*************************************************
425 * Match from current position *
426 *************************************************/
427
428 /* This function is called recursively in many circumstances. Whenever it
429 returns a negative (error) response, the outer incarnation must also return the
430 same response. */
431
432 /* These macros pack up tests that are used for partial matching, and which
433 appear several times in the code. We set the "hit end" flag if the pointer is
434 at the end of the subject and also past the start of the subject (i.e.
435 something has been matched). For hard partial matching, we then return
436 immediately. The second one is used when we already know we are past the end of
437 the subject. */
438
/* General partial-match check. It does something only when partial matching
is enabled (md->partial != 0), eptr has reached md->end_subject, and eptr is
beyond md->start_used_ptr. In that case md->hitend is set, and if md->partial
is greater than 1 the macro returns PCRE_ERROR_PARTIAL at once via MRRETURN. */
439 #define CHECK_PARTIAL()\
440 if (md->partial != 0 && eptr >= md->end_subject && \
441 eptr > md->start_used_ptr) \
442 { \
443 md->hitend = TRUE; \
444 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
445 }
446
/* Variant that omits the end-of-subject comparison, for places where the
caller has already established that the end of the subject was reached. The
remaining tests and actions are the same as in CHECK_PARTIAL. */
447 #define SCHECK_PARTIAL()\
448 if (md->partial != 0 && eptr > md->start_used_ptr) \
449 { \
450 md->hitend = TRUE; \
451 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
452 }
453
454
455 /* Performance note: It might be tempting to extract commonly used fields from
456 the md structure (e.g. utf8, end_subject) into individual variables to improve
457 performance. Tests using gcc on a SPARC disproved this; in the first case, it
458 made performance worse.
459
460 Arguments:
461 eptr pointer to current character in subject
462 ecode pointer to current position in compiled code
463 mstart pointer to the current match start position (can be modified
464 by encountering \K)
465 markptr pointer to the most recent MARK name, or NULL
466 offset_top current top pointer
467 md pointer to "static" info for the match
468 eptrb pointer to chain of blocks containing eptr at start of
469 brackets - for testing for empty matches
470 rdepth the recursion depth
471
472 Returns: MATCH_MATCH if matched ) these values are >= 0
473 MATCH_NOMATCH if failed to match )
474 a negative MATCH_xxx value for PRUNE, SKIP, etc
475 a negative PCRE_ERROR_xxx value if aborted by an error condition
476 (e.g. stopped by repeated call or recursion limit)
477 */
478
479 static int
480 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
481 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
482 unsigned int rdepth)
483 {
484 /* These variables do not need to be preserved over recursion in this function,
485 so they can be ordinary variables in all cases. Mark some of them with
486 "register" because they are used a lot in loops. */
487
488 register int rrc; /* Returns from recursive calls */
489 register int i; /* Used for loops not involving calls to RMATCH() */
490 register unsigned int c; /* Character values not kept over RMATCH() calls */
491 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
492
493 BOOL minimize, possessive; /* Quantifier options */
494 BOOL caseless;
495 int condcode;
496
497 /* When recursion is not being used, all "local" variables that have to be
498 preserved over calls to RMATCH() are part of a "frame" which is obtained from
499 heap storage. Set up the top-level frame here; others are obtained from the
500 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
501
502 #ifdef NO_RECURSE
503 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
504 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
505 frame->Xprevframe = NULL; /* Marks the top level */
506
507 /* Copy in the original argument variables */
508
509 frame->Xeptr = eptr;
510 frame->Xecode = ecode;
511 frame->Xmstart = mstart;
512 frame->Xmarkptr = markptr;
513 frame->Xoffset_top = offset_top;
514 frame->Xeptrb = eptrb;
515 frame->Xrdepth = rdepth;
516
517 /* This is where control jumps back to to effect "recursion" */
518
519 HEAP_RECURSE:
520
521 /* Macros make the argument variables come from the current frame */
522
523 #define eptr frame->Xeptr
524 #define ecode frame->Xecode
525 #define mstart frame->Xmstart
526 #define markptr frame->Xmarkptr
527 #define offset_top frame->Xoffset_top
528 #define eptrb frame->Xeptrb
529 #define rdepth frame->Xrdepth
530
531 /* Ditto for the local variables */
532
533 #ifdef SUPPORT_UTF8
534 #define charptr frame->Xcharptr
535 #endif
536 #define callpat frame->Xcallpat
537 #define codelink frame->Xcodelink
538 #define data frame->Xdata
539 #define next frame->Xnext
540 #define pp frame->Xpp
541 #define prev frame->Xprev
542 #define saved_eptr frame->Xsaved_eptr
543
544 #define new_recursive frame->Xnew_recursive
545
546 #define cur_is_word frame->Xcur_is_word
547 #define condition frame->Xcondition
548 #define prev_is_word frame->Xprev_is_word
549
550 #ifdef SUPPORT_UCP
551 #define prop_type frame->Xprop_type
552 #define prop_value frame->Xprop_value
553 #define prop_fail_result frame->Xprop_fail_result
554 #define prop_category frame->Xprop_category
555 #define prop_chartype frame->Xprop_chartype
556 #define prop_script frame->Xprop_script
557 #define oclength frame->Xoclength
558 #define occhars frame->Xocchars
559 #endif
560
561 #define ctype frame->Xctype
562 #define fc frame->Xfc
563 #define fi frame->Xfi
564 #define length frame->Xlength
565 #define max frame->Xmax
566 #define min frame->Xmin
567 #define number frame->Xnumber
568 #define offset frame->Xoffset
569 #define op frame->Xop
570 #define save_capture_last frame->Xsave_capture_last
571 #define save_offset1 frame->Xsave_offset1
572 #define save_offset2 frame->Xsave_offset2
573 #define save_offset3 frame->Xsave_offset3
574 #define stacksave frame->Xstacksave
575
576 #define newptrb frame->Xnewptrb
577
578 /* When recursion is being used, local variables are allocated on the stack and
579 get preserved during recursion in the normal way. In this environment, fi and
580 i, and fc and c, can be the same variables. */
581
582 #else /* NO_RECURSE not defined */
583 #define fi i
584 #define fc c
585
586 /* Many of the following variables are used only in small blocks of the code.
587 My normal style of coding would have declared them within each of those blocks.
588 However, in order to accommodate the version of this code that uses an external
589 "stack" implemented on the heap, it is easier to declare them all here, so the
590 declarations can be cut out in a block. The only declarations within blocks
591 below are for variables that do not have to be preserved over a recursive call
592 to RMATCH(). */
593
594 #ifdef SUPPORT_UTF8
595 const uschar *charptr;
596 #endif
597 const uschar *callpat;
598 const uschar *data;
599 const uschar *next;
600 USPTR pp;
601 const uschar *prev;
602 USPTR saved_eptr;
603
604 recursion_info new_recursive;
605
606 BOOL cur_is_word;
607 BOOL condition;
608 BOOL prev_is_word;
609
610 #ifdef SUPPORT_UCP
611 int prop_type;
612 int prop_value;
613 int prop_fail_result;
614 int prop_category;
615 int prop_chartype;
616 int prop_script;
617 int oclength;
618 uschar occhars[8];
619 #endif
620
621 int codelink;
622 int ctype;
623 int length;
624 int max;
625 int min;
626 int number;
627 int offset;
628 int op;
629 int save_capture_last;
630 int save_offset1, save_offset2, save_offset3;
631 int stacksave[REC_STACK_SAVE_MAX];
632
633 eptrblock newptrb;
634 #endif /* NO_RECURSE */
635
636 /* To save space on the stack and in the heap frame, I have doubled up on some
637 of the local variables that are used only in localised parts of the code, but
638 still need to be preserved over recursive calls of match(). These macros define
639 the alternative names that are used. */
640
641 #define allow_zero cur_is_word
642 #define cbegroup condition
643 #define code_offset codelink
644 #define condassert condition
645 #define matched_once prev_is_word
646
647 /* These statements are here to stop the compiler complaining about uninitialized
648 variables. */
649
650 #ifdef SUPPORT_UCP
651 prop_value = 0;
652 prop_fail_result = 0;
653 #endif
654
655
656 /* This label is used for tail recursion, which is used in a few cases even
657 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
658 used. Thanks to Ian Taylor for noticing this possibility and sending the
659 original patch. */
660
661 TAIL_RECURSE:
662
663 /* OK, now we can get on with the real code of the function. Recursive calls
664 are specified by the macro RMATCH and RRETURN is used to return. When
665 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
666 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
667 defined). However, RMATCH isn't like a function call because it's quite a
668 complicated macro. It has to be used in one particular way. This shouldn't,
669 however, impact performance when true recursion is being used. */
670
671 #ifdef SUPPORT_UTF8
672 utf8 = md->utf8; /* Local copy of the flag */
673 #else
674 utf8 = FALSE;
675 #endif
676
677 /* First check that we haven't called match() too many times, or that we
678 haven't exceeded the recursive call limit. */
679
680 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
681 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
682
683 /* At the start of a group with an unlimited repeat that may match an empty
684 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
685 done this way to save having to use another function argument, which would take
686 up space on the stack. See also MATCH_CONDASSERT below.
687
688 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
689 such remembered pointers, to be checked when we hit the closing ket, in order
690 to break infinite loops that match no characters. When match() is called in
691 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
692 NOT be used with tail recursion, because the memory block that is used is on
693 the stack, so a new one may be required for each match(). */
694
695 if (md->match_function_type == MATCH_CBEGROUP)
696 {
697 newptrb.epb_saved_eptr = eptr;
698 newptrb.epb_prev = eptrb;
699 eptrb = &newptrb;
700 md->match_function_type = 0;
701 }
702
703 /* Now start processing the opcodes. */
704
705 for (;;)
706 {
707 minimize = possessive = FALSE;
708 op = *ecode;
709
710 switch(op)
711 {
712 case OP_MARK:
713 markptr = ecode + 2;
714 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
715 eptrb, RM55);
716
717 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
718 argument, and we must check whether that argument matches this MARK's
719 argument. It is passed back in md->start_match_ptr (an overloading of that
720 variable). If it does match, we reset that variable to the current subject
721 position and return MATCH_SKIP. Otherwise, pass back the return code
722 unaltered. */
723
724 if (rrc == MATCH_SKIP_ARG &&
725 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
726 {
727 md->start_match_ptr = eptr;
728 RRETURN(MATCH_SKIP);
729 }
730
731 if (md->mark == NULL) md->mark = markptr;
732 RRETURN(rrc);
733
734 case OP_FAIL:
735 MRRETURN(MATCH_NOMATCH);
736
737 /* COMMIT overrides PRUNE, SKIP, and THEN */
738
739 case OP_COMMIT:
740 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
741 eptrb, RM52);
742 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
743 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
744 rrc != MATCH_THEN)
745 RRETURN(rrc);
746 MRRETURN(MATCH_COMMIT);
747
748 /* PRUNE overrides THEN */
749
750 case OP_PRUNE:
751 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
752 eptrb, RM51);
753 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
754 MRRETURN(MATCH_PRUNE);
755
756 case OP_PRUNE_ARG:
757 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
758 eptrb, RM56);
759 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
760 md->mark = ecode + 2;
761 RRETURN(MATCH_PRUNE);
762
763 /* SKIP overrides PRUNE and THEN */
764
765 case OP_SKIP:
766 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
767 eptrb, RM53);
768 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
769 RRETURN(rrc);
770 md->start_match_ptr = eptr; /* Pass back current position */
771 MRRETURN(MATCH_SKIP);
772
773 case OP_SKIP_ARG:
774 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
775 eptrb, RM57);
776 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
777 RRETURN(rrc);
778
779 /* Pass back the current skip name by overloading md->start_match_ptr and
780 returning the special MATCH_SKIP_ARG return code. This will either be
781 caught by a matching MARK, or get to the top, where it is treated the same
782 as PRUNE. */
783
784 md->start_match_ptr = ecode + 2;
785 RRETURN(MATCH_SKIP_ARG);
786
787 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
788 the alt that is at the start of the current branch. This makes it possible
789 to skip back past alternatives that precede the THEN within the current
790 branch. */
791
792 case OP_THEN:
793 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
794 eptrb, RM54);
795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
796 md->start_match_ptr = ecode - GET(ecode, 1);
797 MRRETURN(MATCH_THEN);
798
799 case OP_THEN_ARG:
800 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
801 offset_top, md, eptrb, RM58);
802 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
803 md->start_match_ptr = ecode - GET(ecode, 1);
804 md->mark = ecode + LINK_SIZE + 2;
805 RRETURN(MATCH_THEN);
806
807 /* Handle a capturing bracket, other than those that are possessive with an
808 unlimited repeat. If there is space in the offset vector, save the current
809 subject position in the working slot at the top of the vector. We mustn't
810 change the current values of the data slot, because they may be set from a
811 previous iteration of this group, and be referred to by a reference inside
812 the group. A failure to match might occur after the group has succeeded,
813 if something later on doesn't match. For this reason, we need to restore
814 the working value and also the values of the final offsets, in case they
815 were set by a previous iteration of the same bracket.
816
817 If there isn't enough space in the offset vector, treat this as if it were
818 a non-capturing bracket. Don't worry about setting the flag for the error
819 case here; that is handled in the code for KET. */
820
821 case OP_CBRA:
822 case OP_SCBRA:
823 number = GET2(ecode, 1+LINK_SIZE);
824 offset = number << 1;
825
826 #ifdef PCRE_DEBUG
827 printf("start bracket %d\n", number);
828 printf("subject=");
829 pchars(eptr, 16, TRUE, md);
830 printf("\n");
831 #endif
832
833 if (offset < md->offset_max)
834 {
835 save_offset1 = md->offset_vector[offset];
836 save_offset2 = md->offset_vector[offset+1];
837 save_offset3 = md->offset_vector[md->offset_end - number];
838 save_capture_last = md->capture_last;
839
840 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
841 md->offset_vector[md->offset_end - number] =
842 (int)(eptr - md->start_subject);
843
844 for (;;)
845 {
846 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
847 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
848 eptrb, RM1);
849 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
850 if (rrc != MATCH_NOMATCH &&
851 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
852 RRETURN(rrc);
853 md->capture_last = save_capture_last;
854 ecode += GET(ecode, 1);
855 if (*ecode != OP_ALT) break;
856 }
857
858 DPRINTF(("bracket %d failed\n", number));
859 md->offset_vector[offset] = save_offset1;
860 md->offset_vector[offset+1] = save_offset2;
861 md->offset_vector[md->offset_end - number] = save_offset3;
862
863 /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or
864 MATCH_THEN. */
865
866 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
867 RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));
868 }
869
870 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
871 as a non-capturing bracket. */
872
873 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
874 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
875
876 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
877
878 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
879 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
880
881 /* Non-capturing or atomic group, except for possessive with unlimited
882 repeat. Loop for all the alternatives. When we get to the final alternative
883 within the brackets, we used to return the result of a recursive call to
884 match() whatever happened so it was possible to reduce stack usage by
885 turning this into a tail recursion, except in the case of a possibly empty
886 group. However, now that there is the possibility of (*THEN) occurring in
887 the final alternative, this optimization is no longer possible.
888
889 MATCH_ONCE is returned when the end of an atomic group is successfully
890 reached, but subsequent matching fails. It passes back up the tree (causing
891 captured values to be reset) until the original atomic group level is
892 reached. This is tested by comparing md->once_target with the start of the
893 group. At this point, the return is converted into MATCH_NOMATCH so that
894 previous backup points can be taken. */
895
896 case OP_ONCE:
897 case OP_BRA:
898 case OP_SBRA:
899 DPRINTF(("start non-capturing bracket\n"));
900
901 for (;;)
902 {
903 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
904 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
905 RM2);
906 if (rrc != MATCH_NOMATCH &&
907 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
908 {
909 if (rrc == MATCH_ONCE)
910 {
911 const uschar *scode = ecode;
912 if (*scode != OP_ONCE) /* If not at start, find it */
913 {
914 while (*scode == OP_ALT) scode += GET(scode, 1);
915 scode -= GET(scode, 1);
916 }
917 if (md->once_target == scode) rrc = MATCH_NOMATCH;
918 }
919 RRETURN(rrc);
920 }
921 ecode += GET(ecode, 1);
922 if (*ecode != OP_ALT) break;
923 }
924 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
925 RRETURN(MATCH_NOMATCH);
926
927 /* Handle possessive capturing brackets with an unlimited repeat. We come
928 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
929 handled similarly to the normal case above. However, the matching is
930 different. The end of these brackets will always be OP_KETRPOS, which
931 returns MATCH_KETRPOS without going further in the pattern. By this means
932 we can handle the group by iteration rather than recursion, thereby
933 reducing the amount of stack needed. */
934
935 case OP_CBRAPOS:
936 case OP_SCBRAPOS:
937 allow_zero = FALSE;
938
939 POSSESSIVE_CAPTURE:
940 number = GET2(ecode, 1+LINK_SIZE);
941 offset = number << 1;
942
943 #ifdef PCRE_DEBUG
944 printf("start possessive bracket %d\n", number);
945 printf("subject=");
946 pchars(eptr, 16, TRUE, md);
947 printf("\n");
948 #endif
949
950 if (offset < md->offset_max)
951 {
952 matched_once = FALSE;
953 code_offset = ecode - md->start_code;
954
955 save_offset1 = md->offset_vector[offset];
956 save_offset2 = md->offset_vector[offset+1];
957 save_offset3 = md->offset_vector[md->offset_end - number];
958 save_capture_last = md->capture_last;
959
960 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
961
962 /* Each time round the loop, save the current subject position for use
963 when the group matches. For MATCH_MATCH, the group has matched, so we
964 restart it with a new subject starting position, remembering that we had
965 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
966 usual. If we haven't matched any alternatives in any iteration, check to
967 see if a previous iteration matched. If so, the group has matched;
968 continue from afterwards. Otherwise it has failed; restore the previous
969 capture values before returning NOMATCH. */
970
971 for (;;)
972 {
973 md->offset_vector[md->offset_end - number] =
974 (int)(eptr - md->start_subject);
975 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
976 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
977 eptrb, RM63);
978 if (rrc == MATCH_KETRPOS)
979 {
980 offset_top = md->end_offset_top;
981 eptr = md->end_match_ptr;
982 ecode = md->start_code + code_offset;
983 save_capture_last = md->capture_last;
984 matched_once = TRUE;
985 continue;
986 }
987 if (rrc != MATCH_NOMATCH &&
988 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
989 RRETURN(rrc);
990 md->capture_last = save_capture_last;
991 ecode += GET(ecode, 1);
992 if (*ecode != OP_ALT) break;
993 }
994
995 if (!matched_once)
996 {
997 md->offset_vector[offset] = save_offset1;
998 md->offset_vector[offset+1] = save_offset2;
999 md->offset_vector[md->offset_end - number] = save_offset3;
1000 }
1001
1002 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
1003 if (allow_zero || matched_once)
1004 {
1005 ecode += 1 + LINK_SIZE;
1006 break;
1007 }
1008
1009 RRETURN(MATCH_NOMATCH);
1010 }
1011
1012 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1013 as a non-capturing bracket. */
1014
1015 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1016 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1017
1018 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1019
1020 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1021 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1022
1023 /* Non-capturing possessive bracket with unlimited repeat. We come here
1024 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1025 without the capturing complication. It is written out separately for speed
1026 and cleanliness. */
1027
1028 case OP_BRAPOS:
1029 case OP_SBRAPOS:
1030 allow_zero = FALSE;
1031
1032 POSSESSIVE_NON_CAPTURE:
1033 matched_once = FALSE;
1034 code_offset = ecode - md->start_code;
1035
1036 for (;;)
1037 {
1038 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1039 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1040 eptrb, RM48);
1041 if (rrc == MATCH_KETRPOS)
1042 {
1043 offset_top = md->end_offset_top;
1044 eptr = md->end_match_ptr;
1045 ecode = md->start_code + code_offset;
1046 matched_once = TRUE;
1047 continue;
1048 }
1049 if (rrc != MATCH_NOMATCH &&
1050 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1051 RRETURN(rrc);
1052 ecode += GET(ecode, 1);
1053 if (*ecode != OP_ALT) break;
1054 }
1055
1056 if (matched_once || allow_zero)
1057 {
1058 ecode += 1 + LINK_SIZE;
1059 break;
1060 }
1061 RRETURN(MATCH_NOMATCH);
1062
1063 /* Control never reaches here. */
1064
1065 /* Conditional group: compilation checked that there are no more than
1066 two branches. If the condition is false, skipping the first branch takes us
1067 past the end if there is only one branch, but that's OK because that is
1068 exactly what going to the ket would do. */
1069
1070 case OP_COND:
1071 case OP_SCOND:
1072 codelink = GET(ecode, 1);
1073
1074 /* Because of the way auto-callout works during compile, a callout item is
1075 inserted between OP_COND and an assertion condition. */
1076
1077 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1078 {
1079 if (pcre_callout != NULL)
1080 {
1081 pcre_callout_block cb;
1082 cb.version = 1; /* Version 1 of the callout block */
1083 cb.callout_number = ecode[LINK_SIZE+2];
1084 cb.offset_vector = md->offset_vector;
1085 cb.subject = (PCRE_SPTR)md->start_subject;
1086 cb.subject_length = (int)(md->end_subject - md->start_subject);
1087 cb.start_match = (int)(mstart - md->start_subject);
1088 cb.current_position = (int)(eptr - md->start_subject);
1089 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1090 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1091 cb.capture_top = offset_top/2;
1092 cb.capture_last = md->capture_last;
1093 cb.callout_data = md->callout_data;
1094 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1095 if (rrc < 0) RRETURN(rrc);
1096 }
1097 ecode += _pcre_OP_lengths[OP_CALLOUT];
1098 }
1099
1100 condcode = ecode[LINK_SIZE+1];
1101
1102 /* Now see what the actual condition is */
1103
1104 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1105 {
1106 if (md->recursive == NULL) /* Not recursing => FALSE */
1107 {
1108 condition = FALSE;
1109 ecode += GET(ecode, 1);
1110 }
1111 else
1112 {
1113 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1114 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1115
1116 /* If the test is for recursion into a specific subpattern, and it is
1117 false, but the test was set up by name, scan the table to see if the
1118 name refers to any other numbers, and test them. The condition is true
1119 if any one is set. */
1120
1121 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1122 {
1123 uschar *slotA = md->name_table;
1124 for (i = 0; i < md->name_count; i++)
1125 {
1126 if (GET2(slotA, 0) == recno) break;
1127 slotA += md->name_entry_size;
1128 }
1129
1130 /* Found a name for the number - there can be only one; duplicate
1131 names for different numbers are allowed, but not vice versa. First
1132 scan down for duplicates. */
1133
1134 if (i < md->name_count)
1135 {
1136 uschar *slotB = slotA;
1137 while (slotB > md->name_table)
1138 {
1139 slotB -= md->name_entry_size;
1140 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1141 {
1142 condition = GET2(slotB, 0) == md->recursive->group_num;
1143 if (condition) break;
1144 }
1145 else break;
1146 }
1147
1148 /* Scan up for duplicates */
1149
1150 if (!condition)
1151 {
1152 slotB = slotA;
1153 for (i++; i < md->name_count; i++)
1154 {
1155 slotB += md->name_entry_size;
1156 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1157 {
1158 condition = GET2(slotB, 0) == md->recursive->group_num;
1159 if (condition) break;
1160 }
1161 else break;
1162 }
1163 }
1164 }
1165 }
1166
1167 /* Choose branch according to the condition */
1168
1169 ecode += condition? 3 : GET(ecode, 1);
1170 }
1171 }
1172
1173 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1174 {
1175 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1176 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1177
1178 /* If the numbered capture is unset, but the reference was by name,
1179 scan the table to see if the name refers to any other numbers, and test
1180 them. The condition is true if any one is set. This is tediously similar
1181 to the code above, but not close enough to try to amalgamate. */
1182
1183 if (!condition && condcode == OP_NCREF)
1184 {
1185 int refno = offset >> 1;
1186 uschar *slotA = md->name_table;
1187
1188 for (i = 0; i < md->name_count; i++)
1189 {
1190 if (GET2(slotA, 0) == refno) break;
1191 slotA += md->name_entry_size;
1192 }
1193
1194 /* Found a name for the number - there can be only one; duplicate names
1195 for different numbers are allowed, but not vice versa. First scan down
1196 for duplicates. */
1197
1198 if (i < md->name_count)
1199 {
1200 uschar *slotB = slotA;
1201 while (slotB > md->name_table)
1202 {
1203 slotB -= md->name_entry_size;
1204 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1205 {
1206 offset = GET2(slotB, 0) << 1;
1207 condition = offset < offset_top &&
1208 md->offset_vector[offset] >= 0;
1209 if (condition) break;
1210 }
1211 else break;
1212 }
1213
1214 /* Scan up for duplicates */
1215
1216 if (!condition)
1217 {
1218 slotB = slotA;
1219 for (i++; i < md->name_count; i++)
1220 {
1221 slotB += md->name_entry_size;
1222 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1223 {
1224 offset = GET2(slotB, 0) << 1;
1225 condition = offset < offset_top &&
1226 md->offset_vector[offset] >= 0;
1227 if (condition) break;
1228 }
1229 else break;
1230 }
1231 }
1232 }
1233 }
1234
1235 /* Choose branch according to the condition */
1236
1237 ecode += condition? 3 : GET(ecode, 1);
1238 }
1239
1240 else if (condcode == OP_DEF) /* DEFINE - always false */
1241 {
1242 condition = FALSE;
1243 ecode += GET(ecode, 1);
1244 }
1245
1246 /* The condition is an assertion. Call match() to evaluate it - setting
1247 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1248 an assertion. */
1249
1250 else
1251 {
1252 md->match_function_type = MATCH_CONDASSERT;
1253 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1254 if (rrc == MATCH_MATCH)
1255 {
1256 if (md->end_offset_top > offset_top)
1257 offset_top = md->end_offset_top; /* Captures may have happened */
1258 condition = TRUE;
1259 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1260 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1261 }
1262 else if (rrc != MATCH_NOMATCH &&
1263 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1264 {
1265 RRETURN(rrc); /* Need braces because of following else */
1266 }
1267 else
1268 {
1269 condition = FALSE;
1270 ecode += codelink;
1271 }
1272 }
1273
1274 /* We are now at the branch that is to be obeyed. As there is only one,
1275 we used to use tail recursion to avoid using another stack frame, except
1276 when there was unlimited repeat of a possibly empty group. However, that
1277 strategy no longer works because of the possibility of (*THEN) being
1278 encountered in the branch. A recursive call to match() is always required,
1279 unless the second alternative doesn't exist, in which case we can just
1280 plough on. */
1281
1282 if (condition || *ecode == OP_ALT)
1283 {
1284 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1285 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1286 if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1287 rrc = MATCH_NOMATCH;
1288 RRETURN(rrc);
1289 }
1290 else /* Condition false & no alternative */
1291 {
1292 ecode += 1 + LINK_SIZE;
1293 }
1294 break;
1295
1296
1297 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1298 to close any currently open capturing brackets. */
1299
1300 case OP_CLOSE:
1301 number = GET2(ecode, 1);
1302 offset = number << 1;
1303
1304 #ifdef PCRE_DEBUG
1305 printf("end bracket %d at *ACCEPT", number);
1306 printf("\n");
1307 #endif
1308
1309 md->capture_last = number;
1310 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1311 {
1312 md->offset_vector[offset] =
1313 md->offset_vector[md->offset_end - number];
1314 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1315 if (offset_top <= offset) offset_top = offset + 2;
1316 }
1317 ecode += 3;
1318 break;
1319
1320
1321 /* End of the pattern, either real or forced. */
1322
1323 case OP_END:
1324 case OP_ACCEPT:
1325 case OP_ASSERT_ACCEPT:
1326
1327 /* If we have matched an empty string, fail if not in an assertion and not
1328 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1329 is set and we have matched at the start of the subject. In both cases,
1330 backtracking will then try other alternatives, if any. */
1331
1332 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1333 md->recursive == NULL &&
1334 (md->notempty ||
1335 (md->notempty_atstart &&
1336 mstart == md->start_subject + md->start_offset)))
1337 MRRETURN(MATCH_NOMATCH);
1338
1339 /* Otherwise, we have a match. */
1340
1341 md->end_match_ptr = eptr; /* Record where we ended */
1342 md->end_offset_top = offset_top; /* and how many extracts were taken */
1343 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1344
1345 /* For some reason, the macros don't work properly if an expression is
1346 given as the argument to MRRETURN when the heap is in use. */
1347
1348 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1349 MRRETURN(rrc);
1350
1351 /* Assertion brackets. Check the alternative branches in turn - the
1352 matching won't pass the KET for an assertion. If any one branch matches,
1353 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1354 start of each branch to move the current point backwards, so the code at
1355 this level is identical to the lookahead case. When the assertion is part
1356 of a condition, we want to return immediately afterwards. The caller of
1357 this incarnation of the match() function will have set MATCH_CONDASSERT in
1358 md->match_function_type, and one of these opcodes will be the first opcode
1359 that is processed. We use a local variable that is preserved over calls to
1360 match() to remember this case. */
1361
1362 case OP_ASSERT:
1363 case OP_ASSERTBACK:
1364 if (md->match_function_type == MATCH_CONDASSERT)
1365 {
1366 condassert = TRUE;
1367 md->match_function_type = 0;
1368 }
1369 else condassert = FALSE;
1370
1371 do
1372 {
1373 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1374 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1375 {
1376 mstart = md->start_match_ptr; /* In case \K reset it */
1377 break;
1378 }
1379 if (rrc != MATCH_NOMATCH &&
1380 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1381 RRETURN(rrc);
1382 ecode += GET(ecode, 1);
1383 }
1384 while (*ecode == OP_ALT);
1385
1386 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1387
1388 /* If checking an assertion for a condition, return MATCH_MATCH. */
1389
1390 if (condassert) RRETURN(MATCH_MATCH);
1391
1392 /* Continue from after the assertion, updating the offsets high water
1393 mark, since extracts may have been taken during the assertion. */
1394
1395 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1396 ecode += 1 + LINK_SIZE;
1397 offset_top = md->end_offset_top;
1398 continue;
1399
1400 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1401 PRUNE, or COMMIT means we must assume failure without checking subsequent
1402 branches. */
1403
1404 case OP_ASSERT_NOT:
1405 case OP_ASSERTBACK_NOT:
1406 if (md->match_function_type == MATCH_CONDASSERT)
1407 {
1408 condassert = TRUE;
1409 md->match_function_type = 0;
1410 }
1411 else condassert = FALSE;
1412
1413 do
1414 {
1415 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1416 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1417 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1418 {
1419 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1420 break;
1421 }
1422 if (rrc != MATCH_NOMATCH &&
1423 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1424 RRETURN(rrc);
1425 ecode += GET(ecode,1);
1426 }
1427 while (*ecode == OP_ALT);
1428
1429 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1430
1431 ecode += 1 + LINK_SIZE;
1432 continue;
1433
1434 /* Move the subject pointer back. This occurs only at the start of
1435 each branch of a lookbehind assertion. If we are too close to the start to
1436 move back, this match function fails. When working with UTF-8 we move
1437 back a number of characters, not bytes. */
1438
1439 case OP_REVERSE:
1440 #ifdef SUPPORT_UTF8
1441 if (utf8)
1442 {
1443 i = GET(ecode, 1);
1444 while (i-- > 0)
1445 {
1446 eptr--;
1447 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1448 BACKCHAR(eptr);
1449 }
1450 }
1451 else
1452 #endif
1453
1454 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1455
1456 {
1457 eptr -= GET(ecode, 1);
1458 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1459 }
1460
1461 /* Save the earliest consulted character, then skip to next op code */
1462
1463 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1464 ecode += 1 + LINK_SIZE;
1465 break;
1466
1467 /* The callout item calls an external function, if one is provided, passing
1468 details of the match so far. This is mainly for debugging, though the
1469 function is able to force a failure. */
1470
1471 case OP_CALLOUT:
1472 if (pcre_callout != NULL)
1473 {
1474 pcre_callout_block cb;
1475 cb.version = 1; /* Version 1 of the callout block */
1476 cb.callout_number = ecode[1];
1477 cb.offset_vector = md->offset_vector;
1478 cb.subject = (PCRE_SPTR)md->start_subject;
1479 cb.subject_length = (int)(md->end_subject - md->start_subject);
1480 cb.start_match = (int)(mstart - md->start_subject);
1481 cb.current_position = (int)(eptr - md->start_subject);
1482 cb.pattern_position = GET(ecode, 2);
1483 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1484 cb.capture_top = offset_top/2;
1485 cb.capture_last = md->capture_last;
1486 cb.callout_data = md->callout_data;
1487 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1488 if (rrc < 0) RRETURN(rrc);
1489 }
1490 ecode += 2 + 2*LINK_SIZE;
1491 break;
1492
1493 /* Recursion either matches the current regex, or some subexpression. The
1494 offset data is the offset to the starting bracket from the start of the
1495 whole pattern. (This is so that it works from duplicated subpatterns.)
1496
1497 The state of the capturing groups is preserved over recursion, and
1498 re-instated afterwards. We don't know how many are started and not yet
1499 finished (offset_top records the completed total) so we just have to save
1500 all the potential data. There may be up to 65535 such values, which is too
1501 large to put on the stack, but using malloc for small numbers seems
1502 expensive. As a compromise, the stack is used when there are no more than
1503 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1504
1505 There are also other values that have to be saved. We use a chained
1506 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1507 for the original version of this logic. It has, however, been hacked around
1508 a lot, so he is not to blame for the current way it works. */
1509
1510 case OP_RECURSE:
1511 {
1512 callpat = md->start_code + GET(ecode, 1);
1513 new_recursive.group_num = (callpat == md->start_code)? 0 :
1514 GET2(callpat, 1 + LINK_SIZE);
1515
1516 /* Add to "recursing stack" */
1517
1518 new_recursive.prevrec = md->recursive;
1519 md->recursive = &new_recursive;
1520
1521 /* Where to continue from afterwards */
1522
1523 ecode += 1 + LINK_SIZE;
1524
1525 /* Now save the offset data */
1526
1527 new_recursive.saved_max = md->offset_end;
1528 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1529 new_recursive.offset_save = stacksave;
1530 else
1531 {
1532 new_recursive.offset_save =
1533 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1534 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1535 }
1536 memcpy(new_recursive.offset_save, md->offset_vector,
1537 new_recursive.saved_max * sizeof(int));
1538
1539 /* OK, now we can do the recursion. After processing each alternative,
1540 restore the offset data. If there were nested recursions, md->recursive
1541 might be changed, so reset it before looping. */
1542
1543 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1544 cbegroup = (*callpat >= OP_SBRA);
1545 do
1546 {
1547 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1548 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1549 md, eptrb, RM6);
1550 memcpy(md->offset_vector, new_recursive.offset_save,
1551 new_recursive.saved_max * sizeof(int));
1552 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1553 {
1554 DPRINTF(("Recursion matched\n"));
1555 md->recursive = new_recursive.prevrec;
1556 if (new_recursive.offset_save != stacksave)
1557 (pcre_free)(new_recursive.offset_save);
1558
1559 /* Set where we got to in the subject, and reset the start in case
1560 it was changed by \K. This *is* propagated back out of a recursion,
1561 for Perl compatibility. */
1562
1563 eptr = md->end_match_ptr;
1564 mstart = md->start_match_ptr;
1565 goto RECURSION_MATCHED; /* Exit loop; end processing */
1566 }
1567 else if (rrc != MATCH_NOMATCH &&
1568 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1569 {
1570 DPRINTF(("Recursion gave error %d\n", rrc));
1571 if (new_recursive.offset_save != stacksave)
1572 (pcre_free)(new_recursive.offset_save);
1573 RRETURN(rrc);
1574 }
1575
1576 md->recursive = &new_recursive;
1577 callpat += GET(callpat, 1);
1578 }
1579 while (*callpat == OP_ALT);
1580
1581 DPRINTF(("Recursion didn't match\n"));
1582 md->recursive = new_recursive.prevrec;
1583 if (new_recursive.offset_save != stacksave)
1584 (pcre_free)(new_recursive.offset_save);
1585 MRRETURN(MATCH_NOMATCH);
1586 }
1587
1588 RECURSION_MATCHED:
1589 break;
1590
1591 /* An alternation is the end of a branch; scan along to find the end of the
1592 bracketed group and go to there. */
1593
1594 case OP_ALT:
1595 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1596 break;
1597
1598 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1599 indicating that it may occur zero times. It may repeat infinitely, or not
1600 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1601 with fixed upper repeat limits are compiled as a number of copies, with the
1602 optional ones preceded by BRAZERO or BRAMINZERO. */
1603
1604 case OP_BRAZERO:
1605 next = ecode + 1;
1606 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1607 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1608 do next += GET(next, 1); while (*next == OP_ALT);
1609 ecode = next + 1 + LINK_SIZE;
1610 break;
1611
1612 case OP_BRAMINZERO:
1613 next = ecode + 1;
1614 do next += GET(next, 1); while (*next == OP_ALT);
1615 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1616 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1617 ecode++;
1618 break;
1619
1620 case OP_SKIPZERO:
1621 next = ecode+1;
1622 do next += GET(next,1); while (*next == OP_ALT);
1623 ecode = next + 1 + LINK_SIZE;
1624 break;
1625
1626 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1627 here; just jump to the group, with allow_zero set TRUE. */
1628
1629 case OP_BRAPOSZERO:
1630 op = *(++ecode);
1631 allow_zero = TRUE;
1632 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1633 goto POSSESSIVE_NON_CAPTURE;
1634
1635 /* End of a group, repeated or non-repeating. */
1636
1637 case OP_KET:
1638 case OP_KETRMIN:
1639 case OP_KETRMAX:
1640 case OP_KETRPOS:
1641 prev = ecode - GET(ecode, 1);
1642
1643 /* If this was a group that remembered the subject start, in order to break
1644 infinite repeats of empty string matches, retrieve the subject start from
1645 the chain. Otherwise, set it NULL. */
1646
1647 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1648 {
1649 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1650 eptrb = eptrb->epb_prev; /* Backup to previous group */
1651 }
1652 else saved_eptr = NULL;
1653
1654 /* If we are at the end of an assertion group, stop matching and return
1655 MATCH_MATCH, but record the current high water mark for use by positive
1656 assertions. We also need to record the match start in case it was changed
1657 by \K. */
1658
1659 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1660 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
1661 {
1662 md->end_match_ptr = eptr; /* For ONCE */
1663 md->end_offset_top = offset_top;
1664 md->start_match_ptr = mstart;
1665 MRRETURN(MATCH_MATCH);
1666 }
1667
1668 /* For capturing groups we have to check the group number back at the start
1669 and if necessary complete handling an extraction by setting the offsets and
1670 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1671 into group 0, so it won't be picked up here. Instead, we catch it when the
1672 OP_END is reached. Other recursion is handled here. We just have to record
1673 the current subject position and start match pointer and give a MATCH
1674 return. */
1675
1676 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1677 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1678 {
1679 number = GET2(prev, 1+LINK_SIZE);
1680 offset = number << 1;
1681
1682 #ifdef PCRE_DEBUG
1683 printf("end bracket %d", number);
1684 printf("\n");
1685 #endif
1686
1687 /* Handle a recursively called group. */
1688
1689 if (md->recursive != NULL && md->recursive->group_num == number)
1690 {
1691 md->end_match_ptr = eptr;
1692 md->start_match_ptr = mstart;
1693 RRETURN(MATCH_MATCH);
1694 }
1695
1696 /* Deal with capturing */
1697
1698 md->capture_last = number;
1699 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1700 {
1701 /* If offset is greater than offset_top, it means that we are
1702 "skipping" a capturing group, and that group's offsets must be marked
1703 unset. In earlier versions of PCRE, all the offsets were unset at the
1704 start of matching, but this doesn't work because atomic groups and
1705 assertions can cause a value to be set that should later be unset.
1706 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1707 part of the atomic group, but this is not on the final matching path,
1708 so must be unset when 2 is set. (If there is no group 2, there is no
1709 problem, because offset_top will then be 2, indicating no capture.) */
1710
1711 if (offset > offset_top)
1712 {
1713 register int *iptr = md->offset_vector + offset_top;
1714 register int *iend = md->offset_vector + offset;
1715 while (iptr < iend) *iptr++ = -1;
1716 }
1717
1718 /* Now make the extraction */
1719
1720 md->offset_vector[offset] =
1721 md->offset_vector[md->offset_end - number];
1722 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1723 if (offset_top <= offset) offset_top = offset + 2;
1724 }
1725 }
1726
1727 /* For an ordinary non-repeating ket, just continue at this level. This
1728 also happens for a repeating ket if no characters were matched in the
1729 group. This is the forcible breaking of infinite loops as implemented in
1730 Perl 5.005. For a non-repeating atomic group, establish a backup point by
1731 processing the rest of the pattern at a lower level. If this results in a
1732 NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1733 bypassing intermediate backup points, but resetting any captures that
1734 happened along the way. */
1735
1736 if (*ecode == OP_KET || eptr == saved_eptr)
1737 {
1738 if (*prev == OP_ONCE)
1739 {
1740 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1741 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1742 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1743 RRETURN(MATCH_ONCE);
1744 }
1745 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1746 break;
1747 }
1748
1749 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1750 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1751 at a time from the outer level, thus saving stack. */
1752
1753 if (*ecode == OP_KETRPOS)
1754 {
1755 md->end_match_ptr = eptr;
1756 md->end_offset_top = offset_top;
1757 RRETURN(MATCH_KETRPOS);
1758 }
1759
1760 /* The normal repeating kets try the rest of the pattern or restart from
1761 the preceding bracket, in the appropriate order. In the second case, we can
1762 use tail recursion to avoid using another stack frame, unless we have an
1763 atomic group or an unlimited repeat of a group that can match an empty
1764 string. */
1765
1766 if (*ecode == OP_KETRMIN)
1767 {
1768 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
1769 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1770 if (*prev == OP_ONCE)
1771 {
1772 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
1773 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1774 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1775 RRETURN(MATCH_ONCE);
1776 }
1777 if (*prev >= OP_SBRA) /* Could match an empty string */
1778 {
1779 md->match_function_type = MATCH_CBEGROUP;
1780 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1781 RRETURN(rrc);
1782 }
1783 ecode = prev;
1784 goto TAIL_RECURSE;
1785 }
1786 else /* OP_KETRMAX */
1787 {
1788 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1789 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1790 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1792 if (*prev == OP_ONCE)
1793 {
1794 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
1795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1796 md->once_target = prev;
1797 RRETURN(MATCH_ONCE);
1798 }
1799 ecode += 1 + LINK_SIZE;
1800 goto TAIL_RECURSE;
1801 }
1802 /* Control never gets here */
1803
1804 /* Not multiline mode: start of subject assertion, unless notbol. */
1805
1806 case OP_CIRC:
1807 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1808
1809 /* Start of subject assertion */
1810
1811 case OP_SOD:
1812 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1813 ecode++;
1814 break;
1815
1816 /* Multiline mode: start of subject unless notbol, or after any newline. */
1817
1818 case OP_CIRCM:
1819 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1820 if (eptr != md->start_subject &&
1821 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1822 MRRETURN(MATCH_NOMATCH);
1823 ecode++;
1824 break;
1825
1826 /* Start of match assertion */
1827
1828 case OP_SOM:
1829 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1830 ecode++;
1831 break;
1832
1833 /* Reset the start of match point */
1834
1835 case OP_SET_SOM:
1836 mstart = eptr;
1837 ecode++;
1838 break;
1839
1840 /* Multiline mode: assert before any newline, or before end of subject
1841 unless noteol is set. */
1842
1843 case OP_DOLLM:
1844 if (eptr < md->end_subject)
1845 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1846 else
1847 {
1848 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1849 SCHECK_PARTIAL();
1850 }
1851 ecode++;
1852 break;
1853
1854 /* Not multiline mode: assert before a terminating newline or before end of
1855 subject unless noteol is set. */
1856
1857 case OP_DOLL:
1858 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1859 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1860
1861 /* ... else fall through for endonly */
1862
1863 /* End of subject assertion (\z) */
1864
1865 case OP_EOD:
1866 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1867 SCHECK_PARTIAL();
1868 ecode++;
1869 break;
1870
1871 /* End of subject or ending \n assertion (\Z) */
1872
1873 case OP_EODN:
1874 ASSERT_NL_OR_EOS:
1875 if (eptr < md->end_subject &&
1876 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1877 MRRETURN(MATCH_NOMATCH);
1878
1879 /* Either at end of string or \n before end. */
1880
1881 SCHECK_PARTIAL();
1882 ecode++;
1883 break;
1884
1885 /* Word boundary assertions */
1886
1887 case OP_NOT_WORD_BOUNDARY:
1888 case OP_WORD_BOUNDARY:
1889 {
1890
1891 /* Find out if the previous and current characters are "word" characters.
1892 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1893 be "non-word" characters. Remember the earliest consulted character for
1894 partial matching. */
1895
1896 #ifdef SUPPORT_UTF8
1897 if (utf8)
1898 {
1899 /* Get status of previous character */
1900
1901 if (eptr == md->start_subject) prev_is_word = FALSE; else
1902 {
1903 USPTR lastptr = eptr - 1;
1904 while((*lastptr & 0xc0) == 0x80) lastptr--;
1905 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1906 GETCHAR(c, lastptr);
1907 #ifdef SUPPORT_UCP
1908 if (md->use_ucp)
1909 {
1910 if (c == '_') prev_is_word = TRUE; else
1911 {
1912 int cat = UCD_CATEGORY(c);
1913 prev_is_word = (cat == ucp_L || cat == ucp_N);
1914 }
1915 }
1916 else
1917 #endif
1918 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1919 }
1920
1921 /* Get status of next character */
1922
1923 if (eptr >= md->end_subject)
1924 {
1925 SCHECK_PARTIAL();
1926 cur_is_word = FALSE;
1927 }
1928 else
1929 {
1930 GETCHAR(c, eptr);
1931 #ifdef SUPPORT_UCP
1932 if (md->use_ucp)
1933 {
1934 if (c == '_') cur_is_word = TRUE; else
1935 {
1936 int cat = UCD_CATEGORY(c);
1937 cur_is_word = (cat == ucp_L || cat == ucp_N);
1938 }
1939 }
1940 else
1941 #endif
1942 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1943 }
1944 }
1945 else
1946 #endif
1947
1948 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1949 consistency with the behaviour of \w we do use it in this case. */
1950
1951 {
1952 /* Get status of previous character */
1953
1954 if (eptr == md->start_subject) prev_is_word = FALSE; else
1955 {
1956 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1957 #ifdef SUPPORT_UCP
1958 if (md->use_ucp)
1959 {
1960 c = eptr[-1];
1961 if (c == '_') prev_is_word = TRUE; else
1962 {
1963 int cat = UCD_CATEGORY(c);
1964 prev_is_word = (cat == ucp_L || cat == ucp_N);
1965 }
1966 }
1967 else
1968 #endif
1969 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1970 }
1971
1972 /* Get status of next character */
1973
1974 if (eptr >= md->end_subject)
1975 {
1976 SCHECK_PARTIAL();
1977 cur_is_word = FALSE;
1978 }
1979 else
1980 #ifdef SUPPORT_UCP
1981 if (md->use_ucp)
1982 {
1983 c = *eptr;
1984 if (c == '_') cur_is_word = TRUE; else
1985 {
1986 int cat = UCD_CATEGORY(c);
1987 cur_is_word = (cat == ucp_L || cat == ucp_N);
1988 }
1989 }
1990 else
1991 #endif
1992 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1993 }
1994
1995 /* Now see if the situation is what we want */
1996
1997 if ((*ecode++ == OP_WORD_BOUNDARY)?
1998 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1999 MRRETURN(MATCH_NOMATCH);
2000 }
2001 break;
2002
2003 /* Match a single character type; inline for speed */
2004
2005 case OP_ANY:
2006 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2007 /* Fall through */
2008
2009 case OP_ALLANY:
2010 if (eptr++ >= md->end_subject)
2011 {
2012 SCHECK_PARTIAL();
2013 MRRETURN(MATCH_NOMATCH);
2014 }
2015 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2016 ecode++;
2017 break;
2018
2019 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2020 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2021
2022 case OP_ANYBYTE:
2023 if (eptr++ >= md->end_subject)
2024 {
2025 SCHECK_PARTIAL();
2026 MRRETURN(MATCH_NOMATCH);
2027 }
2028 ecode++;
2029 break;
2030
2031 case OP_NOT_DIGIT:
2032 if (eptr >= md->end_subject)
2033 {
2034 SCHECK_PARTIAL();
2035 MRRETURN(MATCH_NOMATCH);
2036 }
2037 GETCHARINCTEST(c, eptr);
2038 if (
2039 #ifdef SUPPORT_UTF8
2040 c < 256 &&
2041 #endif
2042 (md->ctypes[c] & ctype_digit) != 0
2043 )
2044 MRRETURN(MATCH_NOMATCH);
2045 ecode++;
2046 break;
2047
2048 case OP_DIGIT:
2049 if (eptr >= md->end_subject)
2050 {
2051 SCHECK_PARTIAL();
2052 MRRETURN(MATCH_NOMATCH);
2053 }
2054 GETCHARINCTEST(c, eptr);
2055 if (
2056 #ifdef SUPPORT_UTF8
2057 c >= 256 ||
2058 #endif
2059 (md->ctypes[c] & ctype_digit) == 0
2060 )
2061 MRRETURN(MATCH_NOMATCH);
2062 ecode++;
2063 break;
2064
2065 case OP_NOT_WHITESPACE:
2066 if (eptr >= md->end_subject)
2067 {
2068 SCHECK_PARTIAL();
2069 MRRETURN(MATCH_NOMATCH);
2070 }
2071 GETCHARINCTEST(c, eptr);
2072 if (
2073 #ifdef SUPPORT_UTF8
2074 c < 256 &&
2075 #endif
2076 (md->ctypes[c] & ctype_space) != 0
2077 )
2078 MRRETURN(MATCH_NOMATCH);
2079 ecode++;
2080 break;
2081
2082 case OP_WHITESPACE:
2083 if (eptr >= md->end_subject)
2084 {
2085 SCHECK_PARTIAL();
2086 MRRETURN(MATCH_NOMATCH);
2087 }
2088 GETCHARINCTEST(c, eptr);
2089 if (
2090 #ifdef SUPPORT_UTF8
2091 c >= 256 ||
2092 #endif
2093 (md->ctypes[c] & ctype_space) == 0
2094 )
2095 MRRETURN(MATCH_NOMATCH);
2096 ecode++;
2097 break;
2098
2099 case OP_NOT_WORDCHAR:
2100 if (eptr >= md->end_subject)
2101 {
2102 SCHECK_PARTIAL();
2103 MRRETURN(MATCH_NOMATCH);
2104 }
2105 GETCHARINCTEST(c, eptr);
2106 if (
2107 #ifdef SUPPORT_UTF8
2108 c < 256 &&
2109 #endif
2110 (md->ctypes[c] & ctype_word) != 0
2111 )
2112 MRRETURN(MATCH_NOMATCH);
2113 ecode++;
2114 break;
2115
2116 case OP_WORDCHAR:
2117 if (eptr >= md->end_subject)
2118 {
2119 SCHECK_PARTIAL();
2120 MRRETURN(MATCH_NOMATCH);
2121 }
2122 GETCHARINCTEST(c, eptr);
2123 if (
2124 #ifdef SUPPORT_UTF8
2125 c >= 256 ||
2126 #endif
2127 (md->ctypes[c] & ctype_word) == 0
2128 )
2129 MRRETURN(MATCH_NOMATCH);
2130 ecode++;
2131 break;
2132
2133 case OP_ANYNL:
2134 if (eptr >= md->end_subject)
2135 {
2136 SCHECK_PARTIAL();
2137 MRRETURN(MATCH_NOMATCH);
2138 }
2139 GETCHARINCTEST(c, eptr);
2140 switch(c)
2141 {
2142 default: MRRETURN(MATCH_NOMATCH);
2143
2144 case 0x000d:
2145 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2146 break;
2147
2148 case 0x000a:
2149 break;
2150
2151 case 0x000b:
2152 case 0x000c:
2153 case 0x0085:
2154 case 0x2028:
2155 case 0x2029:
2156 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2157 break;
2158 }
2159 ecode++;
2160 break;
2161
2162 case OP_NOT_HSPACE:
2163 if (eptr >= md->end_subject)
2164 {
2165 SCHECK_PARTIAL();
2166 MRRETURN(MATCH_NOMATCH);
2167 }
2168 GETCHARINCTEST(c, eptr);
2169 switch(c)
2170 {
2171 default: break;
2172 case 0x09: /* HT */
2173 case 0x20: /* SPACE */
2174 case 0xa0: /* NBSP */
2175 case 0x1680: /* OGHAM SPACE MARK */
2176 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2177 case 0x2000: /* EN QUAD */
2178 case 0x2001: /* EM QUAD */
2179 case 0x2002: /* EN SPACE */
2180 case 0x2003: /* EM SPACE */
2181 case 0x2004: /* THREE-PER-EM SPACE */
2182 case 0x2005: /* FOUR-PER-EM SPACE */
2183 case 0x2006: /* SIX-PER-EM SPACE */
2184 case 0x2007: /* FIGURE SPACE */
2185 case 0x2008: /* PUNCTUATION SPACE */
2186 case 0x2009: /* THIN SPACE */
2187 case 0x200A: /* HAIR SPACE */
2188 case 0x202f: /* NARROW NO-BREAK SPACE */
2189 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2190 case 0x3000: /* IDEOGRAPHIC SPACE */
2191 MRRETURN(MATCH_NOMATCH);
2192 }
2193 ecode++;
2194 break;
2195
2196 case OP_HSPACE:
2197 if (eptr >= md->end_subject)
2198 {
2199 SCHECK_PARTIAL();
2200 MRRETURN(MATCH_NOMATCH);
2201 }
2202 GETCHARINCTEST(c, eptr);
2203 switch(c)
2204 {
2205 default: MRRETURN(MATCH_NOMATCH);
2206 case 0x09: /* HT */
2207 case 0x20: /* SPACE */
2208 case 0xa0: /* NBSP */
2209 case 0x1680: /* OGHAM SPACE MARK */
2210 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2211 case 0x2000: /* EN QUAD */
2212 case 0x2001: /* EM QUAD */
2213 case 0x2002: /* EN SPACE */
2214 case 0x2003: /* EM SPACE */
2215 case 0x2004: /* THREE-PER-EM SPACE */
2216 case 0x2005: /* FOUR-PER-EM SPACE */
2217 case 0x2006: /* SIX-PER-EM SPACE */
2218 case 0x2007: /* FIGURE SPACE */
2219 case 0x2008: /* PUNCTUATION SPACE */
2220 case 0x2009: /* THIN SPACE */
2221 case 0x200A: /* HAIR SPACE */
2222 case 0x202f: /* NARROW NO-BREAK SPACE */
2223 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2224 case 0x3000: /* IDEOGRAPHIC SPACE */
2225 break;
2226 }
2227 ecode++;
2228 break;
2229
2230 case OP_NOT_VSPACE:
2231 if (eptr >= md->end_subject)
2232 {
2233 SCHECK_PARTIAL();
2234 MRRETURN(MATCH_NOMATCH);
2235 }
2236 GETCHARINCTEST(c, eptr);
2237 switch(c)
2238 {
2239 default: break;
2240 case 0x0a: /* LF */
2241 case 0x0b: /* VT */
2242 case 0x0c: /* FF */
2243 case 0x0d: /* CR */
2244 case 0x85: /* NEL */
2245 case 0x2028: /* LINE SEPARATOR */
2246 case 0x2029: /* PARAGRAPH SEPARATOR */
2247 MRRETURN(MATCH_NOMATCH);
2248 }
2249 ecode++;
2250 break;
2251
2252 case OP_VSPACE:
2253 if (eptr >= md->end_subject)
2254 {
2255 SCHECK_PARTIAL();
2256 MRRETURN(MATCH_NOMATCH);
2257 }
2258 GETCHARINCTEST(c, eptr);
2259 switch(c)
2260 {
2261 default: MRRETURN(MATCH_NOMATCH);
2262 case 0x0a: /* LF */
2263 case 0x0b: /* VT */
2264 case 0x0c: /* FF */
2265 case 0x0d: /* CR */
2266 case 0x85: /* NEL */
2267 case 0x2028: /* LINE SEPARATOR */
2268 case 0x2029: /* PARAGRAPH SEPARATOR */
2269 break;
2270 }
2271 ecode++;
2272 break;
2273
2274 #ifdef SUPPORT_UCP
2275 /* Check the next character by Unicode property. We will get here only
2276 if the support is in the binary; otherwise a compile-time error occurs. */
2277
2278 case OP_PROP:
2279 case OP_NOTPROP:
2280 if (eptr >= md->end_subject)
2281 {
2282 SCHECK_PARTIAL();
2283 MRRETURN(MATCH_NOMATCH);
2284 }
2285 GETCHARINCTEST(c, eptr);
2286 {
2287 const ucd_record *prop = GET_UCD(c);
2288
2289 switch(ecode[1])
2290 {
2291 case PT_ANY:
2292 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2293 break;
2294
2295 case PT_LAMP:
2296 if ((prop->chartype == ucp_Lu ||
2297 prop->chartype == ucp_Ll ||
2298 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2299 MRRETURN(MATCH_NOMATCH);
2300 break;
2301
2302 case PT_GC:
2303 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2304 MRRETURN(MATCH_NOMATCH);
2305 break;
2306
2307 case PT_PC:
2308 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2309 MRRETURN(MATCH_NOMATCH);
2310 break;
2311
2312 case PT_SC:
2313 if ((ecode[2] != prop->script) == (op == OP_PROP))
2314 MRRETURN(MATCH_NOMATCH);
2315 break;
2316
2317 /* These are specials */
2318
2319 case PT_ALNUM:
2320 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2321 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2322 MRRETURN(MATCH_NOMATCH);
2323 break;
2324
2325 case PT_SPACE: /* Perl space */
2326 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2327 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2328 == (op == OP_NOTPROP))
2329 MRRETURN(MATCH_NOMATCH);
2330 break;
2331
2332 case PT_PXSPACE: /* POSIX space */
2333 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2334 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2335 c == CHAR_FF || c == CHAR_CR)
2336 == (op == OP_NOTPROP))
2337 MRRETURN(MATCH_NOMATCH);
2338 break;
2339
2340 case PT_WORD:
2341 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2342 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2343 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2344 MRRETURN(MATCH_NOMATCH);
2345 break;
2346
2347 /* This should never occur */
2348
2349 default:
2350 RRETURN(PCRE_ERROR_INTERNAL);
2351 }
2352
2353 ecode += 3;
2354 }
2355 break;
2356
2357 /* Match an extended Unicode sequence. We will get here only if the support
2358 is in the binary; otherwise a compile-time error occurs. */
2359
2360 case OP_EXTUNI:
2361 if (eptr >= md->end_subject)
2362 {
2363 SCHECK_PARTIAL();
2364 MRRETURN(MATCH_NOMATCH);
2365 }
2366 GETCHARINCTEST(c, eptr);
2367 {
2368 int category = UCD_CATEGORY(c);
2369 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2370 while (eptr < md->end_subject)
2371 {
2372 int len = 1;
2373 if (!utf8) c = *eptr; else
2374 {
2375 GETCHARLEN(c, eptr, len);
2376 }
2377 category = UCD_CATEGORY(c);
2378 if (category != ucp_M) break;
2379 eptr += len;
2380 }
2381 }
2382 ecode++;
2383 break;
2384 #endif
2385
2386
2387 /* Match a back reference, possibly repeatedly. Look past the end of the
2388 item to see if there is repeat information following. The code is similar
2389 to that for character classes, but repeated for efficiency. Then obey
2390 similar code to character type repeats - written out again for speed.
2391 However, if the referenced string is the empty string, always treat
2392 it as matched, any number of times (otherwise there could be infinite
2393 loops). */
2394
2395 case OP_REF:
2396 case OP_REFI:
2397 caseless = op == OP_REFI;
2398 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2399 ecode += 3;
2400
2401 /* If the reference is unset, there are two possibilities:
2402
2403 (a) In the default, Perl-compatible state, set the length negative;
2404 this ensures that every attempt at a match fails. We can't just fail
2405 here, because of the possibility of quantifiers with zero minima.
2406
2407 (b) If the JavaScript compatibility flag is set, set the length to zero
2408 so that the back reference matches an empty string.
2409
2410 Otherwise, set the length to the length of what was matched by the
2411 referenced subpattern. */
2412
2413 if (offset >= offset_top || md->offset_vector[offset] < 0)
2414 length = (md->jscript_compat)? 0 : -1;
2415 else
2416 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2417
2418 /* Set up for repetition, or handle the non-repeated case */
2419
2420 switch (*ecode)
2421 {
2422 case OP_CRSTAR:
2423 case OP_CRMINSTAR:
2424 case OP_CRPLUS:
2425 case OP_CRMINPLUS:
2426 case OP_CRQUERY:
2427 case OP_CRMINQUERY:
2428 c = *ecode++ - OP_CRSTAR;
2429 minimize = (c & 1) != 0;
2430 min = rep_min[c]; /* Pick up values from tables; */
2431 max = rep_max[c]; /* zero for max => infinity */
2432 if (max == 0) max = INT_MAX;
2433 break;
2434
2435 case OP_CRRANGE:
2436 case OP_CRMINRANGE:
2437 minimize = (*ecode == OP_CRMINRANGE);
2438 min = GET2(ecode, 1);
2439 max = GET2(ecode, 3);
2440 if (max == 0) max = INT_MAX;
2441 ecode += 5;
2442 break;
2443
2444 default: /* No repeat follows */
2445 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2446 {
2447 CHECK_PARTIAL();
2448 MRRETURN(MATCH_NOMATCH);
2449 }
2450 eptr += length;
2451 continue; /* With the main loop */
2452 }
2453
2454 /* Handle repeated back references. If the length of the reference is
2455 zero, just continue with the main loop. */
2456
2457 if (length == 0) continue;
2458
2459 /* First, ensure the minimum number of matches are present. We get back
2460 the length of the reference string explicitly rather than passing the
2461 address of eptr, so that eptr can be a register variable. */
2462
2463 for (i = 1; i <= min; i++)
2464 {
2465 int slength;
2466 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2467 {
2468 CHECK_PARTIAL();
2469 MRRETURN(MATCH_NOMATCH);
2470 }
2471 eptr += slength;
2472 }
2473
2474 /* If min = max, continue at the same level without recursion.
2475 They are not both allowed to be zero. */
2476
2477 if (min == max) continue;
2478
2479 /* If minimizing, keep trying and advancing the pointer */
2480
2481 if (minimize)
2482 {
2483 for (fi = min;; fi++)
2484 {
2485 int slength;
2486 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2487 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2488 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2489 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2490 {
2491 CHECK_PARTIAL();
2492 MRRETURN(MATCH_NOMATCH);
2493 }
2494 eptr += slength;
2495 }
2496 /* Control never gets here */
2497 }
2498
2499 /* If maximizing, find the longest string and work backwards */
2500
2501 else
2502 {
2503 pp = eptr;
2504 for (i = min; i < max; i++)
2505 {
2506 int slength;
2507 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2508 {
2509 CHECK_PARTIAL();
2510 break;
2511 }
2512 eptr += slength;
2513 }
2514 while (eptr >= pp)
2515 {
2516 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2517 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2518 eptr -= length;
2519 }
2520 MRRETURN(MATCH_NOMATCH);
2521 }
2522 /* Control never gets here */
2523
2524 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2525 used when all the characters in the class have values in the range 0-255,
2526 and either the matching is caseful, or the characters are in the range
2527 0-127 when UTF-8 processing is enabled. The only difference between
2528 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2529 encountered.
2530
2531 First, look past the end of the item to see if there is repeat information
2532 following. Then obey similar code to character type repeats - written out
2533 again for speed. */
2534
2535 case OP_NCLASS:
2536 case OP_CLASS:
2537 {
2538 data = ecode + 1; /* Save for matching */
2539 ecode += 33; /* Advance past the item */
2540
2541 switch (*ecode)
2542 {
2543 case OP_CRSTAR:
2544 case OP_CRMINSTAR:
2545 case OP_CRPLUS:
2546 case OP_CRMINPLUS:
2547 case OP_CRQUERY:
2548 case OP_CRMINQUERY:
2549 c = *ecode++ - OP_CRSTAR;
2550 minimize = (c & 1) != 0;
2551 min = rep_min[c]; /* Pick up values from tables; */
2552 max = rep_max[c]; /* zero for max => infinity */
2553 if (max == 0) max = INT_MAX;
2554 break;
2555
2556 case OP_CRRANGE:
2557 case OP_CRMINRANGE:
2558 minimize = (*ecode == OP_CRMINRANGE);
2559 min = GET2(ecode, 1);
2560 max = GET2(ecode, 3);
2561 if (max == 0) max = INT_MAX;
2562 ecode += 5;
2563 break;
2564
2565 default: /* No repeat follows */
2566 min = max = 1;
2567 break;
2568 }
2569
2570 /* First, ensure the minimum number of matches are present. */
2571
2572 #ifdef SUPPORT_UTF8
2573 /* UTF-8 mode */
2574 if (utf8)
2575 {
2576 for (i = 1; i <= min; i++)
2577 {
2578 if (eptr >= md->end_subject)
2579 {
2580 SCHECK_PARTIAL();
2581 MRRETURN(MATCH_NOMATCH);
2582 }
2583 GETCHARINC(c, eptr);
2584 if (c > 255)
2585 {
2586 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2587 }
2588 else
2589 {
2590 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2591 }
2592 }
2593 }
2594 else
2595 #endif
2596 /* Not UTF-8 mode */
2597 {
2598 for (i = 1; i <= min; i++)
2599 {
2600 if (eptr >= md->end_subject)
2601 {
2602 SCHECK_PARTIAL();
2603 MRRETURN(MATCH_NOMATCH);
2604 }
2605 c = *eptr++;
2606 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2607 }
2608 }
2609
2610 /* If max == min we can continue with the main loop without the
2611 need to recurse. */
2612
2613 if (min == max) continue;
2614
2615 /* If minimizing, keep testing the rest of the expression and advancing
2616 the pointer while it matches the class. */
2617
2618 if (minimize)
2619 {
2620 #ifdef SUPPORT_UTF8
2621 /* UTF-8 mode */
2622 if (utf8)
2623 {
2624 for (fi = min;; fi++)
2625 {
2626 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2627 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2628 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2629 if (eptr >= md->end_subject)
2630 {
2631 SCHECK_PARTIAL();
2632 MRRETURN(MATCH_NOMATCH);
2633 }
2634 GETCHARINC(c, eptr);
2635 if (c > 255)
2636 {
2637 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2638 }
2639 else
2640 {
2641 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2642 }
2643 }
2644 }
2645 else
2646 #endif
2647 /* Not UTF-8 mode */
2648 {
2649 for (fi = min;; fi++)
2650 {
2651 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2652 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2653 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2654 if (eptr >= md->end_subject)
2655 {
2656 SCHECK_PARTIAL();
2657 MRRETURN(MATCH_NOMATCH);
2658 }
2659 c = *eptr++;
2660 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2661 }
2662 }
2663 /* Control never gets here */
2664 }
2665
2666 /* If maximizing, find the longest possible run, then work backwards. */
2667
2668 else
2669 {
2670 pp = eptr;
2671
2672 #ifdef SUPPORT_UTF8
2673 /* UTF-8 mode */
2674 if (utf8)
2675 {
2676 for (i = min; i < max; i++)
2677 {
2678 int len = 1;
2679 if (eptr >= md->end_subject)
2680 {
2681 SCHECK_PARTIAL();
2682 break;
2683 }
2684 GETCHARLEN(c, eptr, len);
2685 if (c > 255)
2686 {
2687 if (op == OP_CLASS) break;
2688 }
2689 else
2690 {
2691 if ((data[c/8] & (1 << (c&7))) == 0) break;
2692 }
2693 eptr += len;
2694 }
2695 for (;;)
2696 {
2697 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2698 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2699 if (eptr-- == pp) break; /* Stop if tried at original pos */
2700 BACKCHAR(eptr);
2701 }
2702 }
2703 else
2704 #endif
2705 /* Not UTF-8 mode */
2706 {
2707 for (i = min; i < max; i++)
2708 {
2709 if (eptr >= md->end_subject)
2710 {
2711 SCHECK_PARTIAL();
2712 break;
2713 }
2714 c = *eptr;
2715 if ((data[c/8] & (1 << (c&7))) == 0) break;
2716 eptr++;
2717 }
2718 while (eptr >= pp)
2719 {
2720 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2721 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2722 eptr--;
2723 }
2724 }
2725
2726 MRRETURN(MATCH_NOMATCH);
2727 }
2728 }
2729 /* Control never gets here */
2730
2731
2732 /* Match an extended character class. This opcode is encountered only
2733 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2734 mode, because Unicode properties are supported in non-UTF-8 mode. */
2735
2736 #ifdef SUPPORT_UTF8
2737 case OP_XCLASS:
2738 {
2739 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2740 ecode += GET(ecode, 1); /* Advance past the item */
2741
2742 switch (*ecode)
2743 {
2744 case OP_CRSTAR:
2745 case OP_CRMINSTAR:
2746 case OP_CRPLUS:
2747 case OP_CRMINPLUS:
2748 case OP_CRQUERY:
2749 case OP_CRMINQUERY:
2750 c = *ecode++ - OP_CRSTAR;
2751 minimize = (c & 1) != 0;
2752 min = rep_min[c]; /* Pick up values from tables; */
2753 max = rep_max[c]; /* zero for max => infinity */
2754 if (max == 0) max = INT_MAX;
2755 break;
2756
2757 case OP_CRRANGE:
2758 case OP_CRMINRANGE:
2759 minimize = (*ecode == OP_CRMINRANGE);
2760 min = GET2(ecode, 1);
2761 max = GET2(ecode, 3);
2762 if (max == 0) max = INT_MAX;
2763 ecode += 5;
2764 break;
2765
2766 default: /* No repeat follows */
2767 min = max = 1;
2768 break;
2769 }
2770
2771 /* First, ensure the minimum number of matches are present. */
2772
2773 for (i = 1; i <= min; i++)
2774 {
2775 if (eptr >= md->end_subject)
2776 {
2777 SCHECK_PARTIAL();
2778 MRRETURN(MATCH_NOMATCH);
2779 }
2780 GETCHARINCTEST(c, eptr);
2781 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2782 }
2783
2784 /* If max == min we can continue with the main loop without the
2785 need to recurse. */
2786
2787 if (min == max) continue;
2788
2789 /* If minimizing, keep testing the rest of the expression and advancing
2790 the pointer while it matches the class. */
2791
2792 if (minimize)
2793 {
2794 for (fi = min;; fi++)
2795 {
2796 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2797 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2798 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2799 if (eptr >= md->end_subject)
2800 {
2801 SCHECK_PARTIAL();
2802 MRRETURN(MATCH_NOMATCH);
2803 }
2804 GETCHARINCTEST(c, eptr);
2805 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2806 }
2807 /* Control never gets here */
2808 }
2809
2810 /* If maximizing, find the longest possible run, then work backwards. */
2811
2812 else
2813 {
2814 pp = eptr;
2815 for (i = min; i < max; i++)
2816 {
2817 int len = 1;
2818 if (eptr >= md->end_subject)
2819 {
2820 SCHECK_PARTIAL();
2821 break;
2822 }
2823 GETCHARLENTEST(c, eptr, len);
2824 if (!_pcre_xclass(c, data)) break;
2825 eptr += len;
2826 }
2827 for(;;)
2828 {
2829 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2831 if (eptr-- == pp) break; /* Stop if tried at original pos */
2832 if (utf8) BACKCHAR(eptr);
2833 }
2834 MRRETURN(MATCH_NOMATCH);
2835 }
2836
2837 /* Control never gets here */
2838 }
2839 #endif /* End of XCLASS */
2840
2841 /* Match a single character, casefully */
2842
2843 case OP_CHAR:
2844 #ifdef SUPPORT_UTF8
2845 if (utf8)
2846 {
2847 length = 1;
2848 ecode++;
2849 GETCHARLEN(fc, ecode, length);
2850 if (length > md->end_subject - eptr)
2851 {
2852 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2853 MRRETURN(MATCH_NOMATCH);
2854 }
2855 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2856 }
2857 else
2858 #endif
2859
2860 /* Non-UTF-8 mode */
2861 {
2862 if (md->end_subject - eptr < 1)
2863 {
2864 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2865 MRRETURN(MATCH_NOMATCH);
2866 }
2867 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2868 ecode += 2;
2869 }
2870 break;
2871
2872 /* Match a single character, caselessly */
2873
2874 case OP_CHARI:
2875 #ifdef SUPPORT_UTF8
2876 if (utf8)
2877 {
2878 length = 1;
2879 ecode++;
2880 GETCHARLEN(fc, ecode, length);
2881
2882 if (length > md->end_subject - eptr)
2883 {
2884 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2885 MRRETURN(MATCH_NOMATCH);
2886 }
2887
2888 /* If the pattern character's value is < 128, we have only one byte, and
2889 can use the fast lookup table. */
2890
2891 if (fc < 128)
2892 {
2893 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2894 }
2895
2896 /* Otherwise we must pick up the subject character */
2897
2898 else
2899 {
2900 unsigned int dc;
2901 GETCHARINC(dc, eptr);
2902 ecode += length;
2903
2904 /* If we have Unicode property support, we can use it to test the other
2905 case of the character, if there is one. */
2906
2907 if (fc != dc)
2908 {
2909 #ifdef SUPPORT_UCP
2910 if (dc != UCD_OTHERCASE(fc))
2911 #endif
2912 MRRETURN(MATCH_NOMATCH);
2913 }
2914 }
2915 }
2916 else
2917 #endif /* SUPPORT_UTF8 */
2918
2919 /* Non-UTF-8 mode */
2920 {
2921 if (md->end_subject - eptr < 1)
2922 {
2923 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2924 MRRETURN(MATCH_NOMATCH);
2925 }
2926 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2927 ecode += 2;
2928 }
2929 break;
2930
2931 /* Match a single character repeatedly. */
2932
2933 case OP_EXACT:
2934 case OP_EXACTI:
2935 min = max = GET2(ecode, 1);
2936 ecode += 3;
2937 goto REPEATCHAR;
2938
2939 case OP_POSUPTO:
2940 case OP_POSUPTOI:
2941 possessive = TRUE;
2942 /* Fall through */
2943
2944 case OP_UPTO:
2945 case OP_UPTOI:
2946 case OP_MINUPTO:
2947 case OP_MINUPTOI:
2948 min = 0;
2949 max = GET2(ecode, 1);
2950 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2951 ecode += 3;
2952 goto REPEATCHAR;
2953
2954 case OP_POSSTAR:
2955 case OP_POSSTARI:
2956 possessive = TRUE;
2957 min = 0;
2958 max = INT_MAX;
2959 ecode++;
2960 goto REPEATCHAR;
2961
2962 case OP_POSPLUS:
2963 case OP_POSPLUSI:
2964 possessive = TRUE;
2965 min = 1;
2966 max = INT_MAX;
2967 ecode++;
2968 goto REPEATCHAR;
2969
2970 case OP_POSQUERY:
2971 case OP_POSQUERYI:
2972 possessive = TRUE;
2973 min = 0;
2974 max = 1;
2975 ecode++;
2976 goto REPEATCHAR;
2977
2978 case OP_STAR:
2979 case OP_STARI:
2980 case OP_MINSTAR:
2981 case OP_MINSTARI:
2982 case OP_PLUS:
2983 case OP_PLUSI:
2984 case OP_MINPLUS:
2985 case OP_MINPLUSI:
2986 case OP_QUERY:
2987 case OP_QUERYI:
2988 case OP_MINQUERY:
2989 case OP_MINQUERYI:
2990 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
2991 minimize = (c & 1) != 0;
2992 min = rep_min[c]; /* Pick up values from tables; */
2993 max = rep_max[c]; /* zero for max => infinity */
2994 if (max == 0) max = INT_MAX;
2995
2996 /* Common code for all repeated single-character matches. */
2997
2998 REPEATCHAR:
2999 #ifdef SUPPORT_UTF8
3000 if (utf8)
3001 {
3002 length = 1;
3003 charptr = ecode;
3004 GETCHARLEN(fc, ecode, length);
3005 ecode += length;
3006
3007 /* Handle multibyte character matching specially here. There is
3008 support for caseless matching if UCP support is present. */
3009
3010 if (length > 1)
3011 {
3012 #ifdef SUPPORT_UCP
3013 unsigned int othercase;
3014 if (op >= OP_STARI && /* Caseless */
3015 (othercase = UCD_OTHERCASE(fc)) != fc)
3016 oclength = _pcre_ord2utf8(othercase, occhars);
3017 else oclength = 0;
3018 #endif /* SUPPORT_UCP */
3019
3020 for (i = 1; i <= min; i++)
3021 {
3022 if (eptr <= md->end_subject - length &&
3023 memcmp(eptr, charptr, length) == 0) eptr += length;
3024 #ifdef SUPPORT_UCP
3025 else if (oclength > 0 &&
3026 eptr <= md->end_subject - oclength &&
3027 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3028 #endif /* SUPPORT_UCP */
3029 else
3030 {
3031 CHECK_PARTIAL();
3032 MRRETURN(MATCH_NOMATCH);
3033 }
3034 }
3035
3036 if (min == max) continue;
3037
3038 if (minimize)
3039 {
3040 for (fi = min;; fi++)
3041 {
3042 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3043 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3044 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3045 if (eptr <= md->end_subject - length &&
3046 memcmp(eptr, charptr, length) == 0) eptr += length;
3047 #ifdef SUPPORT_UCP
3048 else if (oclength > 0 &&
3049 eptr <= md->end_subject - oclength &&
3050 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3051 #endif /* SUPPORT_UCP */
3052 else
3053 {
3054 CHECK_PARTIAL();
3055 MRRETURN(MATCH_NOMATCH);
3056 }
3057 }
3058 /* Control never gets here */
3059 }
3060
3061 else /* Maximize */
3062 {
3063 pp = eptr;
3064 for (i = min; i < max; i++)
3065 {
3066 if (eptr <= md->end_subject - length &&
3067 memcmp(eptr, charptr, length) == 0) eptr += length;
3068 #ifdef SUPPORT_UCP
3069 else if (oclength > 0 &&
3070 eptr <= md->end_subject - oclength &&
3071 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3072 #endif /* SUPPORT_UCP */
3073 else
3074 {
3075 CHECK_PARTIAL();
3076 break;
3077 }
3078 }
3079
3080 if (possessive) continue;
3081
3082 for(;;)
3083 {
3084 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3085 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3086 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3087 #ifdef SUPPORT_UCP
3088 eptr--;
3089 BACKCHAR(eptr);
3090 #else /* without SUPPORT_UCP */
3091 eptr -= length;
3092 #endif /* SUPPORT_UCP */
3093 }
3094 }
3095 /* Control never gets here */
3096 }
3097
3098 /* If the length of a UTF-8 character is 1, we fall through here, and
3099 obey the code as for non-UTF-8 characters below, though in this case the
3100 value of fc will always be < 128. */
3101 }
3102 else
3103 #endif /* SUPPORT_UTF8 */
3104
3105 /* When not in UTF-8 mode, load a single-byte character. */
3106
3107 fc = *ecode++;
3108
3109 /* The value of fc at this point is always less than 256, though we may or
3110 may not be in UTF-8 mode. The code is duplicated for the caseless and
3111 caseful cases, for speed, since matching characters is likely to be quite
3112 common. First, ensure the minimum number of matches are present. If min =
3113 max, continue at the same level without recursing. Otherwise, if
3114 minimizing, keep trying the rest of the expression and advancing one
3115 matching character if failing, up to the maximum. Alternatively, if
3116 maximizing, find the maximum number of characters and work backwards. */
3117
3118 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3119 max, eptr));
3120
3121 if (op >= OP_STARI) /* Caseless */
3122 {
3123 fc = md->lcc[fc];
3124 for (i = 1; i <= min; i++)
3125 {
3126 if (eptr >= md->end_subject)
3127 {
3128 SCHECK_PARTIAL();
3129 MRRETURN(MATCH_NOMATCH);
3130 }
3131 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3132 }
3133 if (min == max) continue;
3134 if (minimize)
3135 {
3136 for (fi = min;; fi++)
3137 {
3138 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3139 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3140 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3141 if (eptr >= md->end_subject)
3142 {
3143 SCHECK_PARTIAL();
3144 MRRETURN(MATCH_NOMATCH);
3145 }
3146 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3147 }
3148 /* Control never gets here */
3149 }
3150 else /* Maximize */
3151 {
3152 pp = eptr;
3153 for (i = min; i < max; i++)
3154 {
3155 if (eptr >= md->end_subject)
3156 {
3157 SCHECK_PARTIAL();
3158 break;
3159 }
3160 if (fc != md->lcc[*eptr]) break;
3161 eptr++;
3162 }
3163
3164 if (possessive) continue;
3165
3166 while (eptr >= pp)
3167 {
3168 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3169 eptr--;
3170 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3171 }
3172 MRRETURN(MATCH_NOMATCH);
3173 }
3174 /* Control never gets here */
3175 }
3176
3177 /* Caseful comparisons (includes all multi-byte characters) */
3178
3179 else
3180 {
3181 for (i = 1; i <= min; i++)
3182 {
3183 if (eptr >= md->end_subject)
3184 {
3185 SCHECK_PARTIAL();
3186 MRRETURN(MATCH_NOMATCH);
3187 }
3188 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3189 }
3190
3191 if (min == max) continue;
3192
3193 if (minimize)
3194 {
3195 for (fi = min;; fi++)
3196 {
3197 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3198 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3199 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3200 if (eptr >= md->end_subject)
3201 {
3202 SCHECK_PARTIAL();
3203 MRRETURN(MATCH_NOMATCH);
3204 }
3205 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3206 }
3207 /* Control never gets here */
3208 }
3209 else /* Maximize */
3210 {
3211 pp = eptr;
3212 for (i = min; i < max; i++)
3213 {
3214 if (eptr >= md->end_subject)
3215 {
3216 SCHECK_PARTIAL();
3217 break;
3218 }
3219 if (fc != *eptr) break;
3220 eptr++;
3221 }
3222 if (possessive) continue;
3223
3224 while (eptr >= pp)
3225 {
3226 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3227 eptr--;
3228 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3229 }
3230 MRRETURN(MATCH_NOMATCH);
3231 }
3232 }
3233 /* Control never gets here */
3234
3235 /* Match a negated single one-byte character. The pattern character is always
3236 a single byte, but the subject character we are checking can be multibyte. */
3237
3238 case OP_NOT:
3239 case OP_NOTI:
3240 if (eptr >= md->end_subject)
3241 {
3242 SCHECK_PARTIAL();
3243 MRRETURN(MATCH_NOMATCH);
3244 }
3245 ecode++;
3246 GETCHARINCTEST(c, eptr);
3247 if (op == OP_NOTI) /* The caseless case */
3248 {
3249 #ifdef SUPPORT_UTF8
3250 if (c < 256)
3251 #endif
3252 c = md->lcc[c];
3253 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3254 }
3255 else /* Caseful */
3256 {
3257 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3258 }
3259 break;
3260
3261 /* Match a negated single one-byte character repeatedly. This is almost a
3262 repeat of the code for a repeated single character, but I haven't found a
3263 nice way of commoning these up that doesn't require a test of the
3264 positive/negative option for each character match. Maybe that wouldn't add
3265 very much to the time taken, but character matching *is* what this is all
3266 about... */
3267
3268 case OP_NOTEXACT:
3269 case OP_NOTEXACTI:
3270 min = max = GET2(ecode, 1);
3271 ecode += 3;
3272 goto REPEATNOTCHAR;
3273
3274 case OP_NOTUPTO:
3275 case OP_NOTUPTOI:
3276 case OP_NOTMINUPTO:
3277 case OP_NOTMINUPTOI:
3278 min = 0;
3279 max = GET2(ecode, 1);
3280 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3281 ecode += 3;
3282 goto REPEATNOTCHAR;
3283
3284 case OP_NOTPOSSTAR:
3285 case OP_NOTPOSSTARI:
3286 possessive = TRUE;
3287 min = 0;
3288 max = INT_MAX;
3289 ecode++;
3290 goto REPEATNOTCHAR;
3291
3292 case OP_NOTPOSPLUS:
3293 case OP_NOTPOSPLUSI:
3294 possessive = TRUE;
3295 min = 1;
3296 max = INT_MAX;
3297 ecode++;
3298 goto REPEATNOTCHAR;
3299
3300 case OP_NOTPOSQUERY:
3301 case OP_NOTPOSQUERYI:
3302 possessive = TRUE;
3303 min = 0;
3304 max = 1;
3305 ecode++;
3306 goto REPEATNOTCHAR;
3307
3308 case OP_NOTPOSUPTO:
3309 case OP_NOTPOSUPTOI:
3310 possessive = TRUE;
3311 min = 0;
3312 max = GET2(ecode, 1);
3313 ecode += 3;
3314 goto REPEATNOTCHAR;
3315
3316 case OP_NOTSTAR:
3317 case OP_NOTSTARI:
3318 case OP_NOTMINSTAR:
3319 case OP_NOTMINSTARI:
3320 case OP_NOTPLUS:
3321 case OP_NOTPLUSI:
3322 case OP_NOTMINPLUS:
3323 case OP_NOTMINPLUSI:
3324 case OP_NOTQUERY:
3325 case OP_NOTQUERYI:
3326 case OP_NOTMINQUERY:
3327 case OP_NOTMINQUERYI:
3328 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3329 minimize = (c & 1) != 0;
3330 min = rep_min[c]; /* Pick up values from tables; */
3331 max = rep_max[c]; /* zero for max => infinity */
3332 if (max == 0) max = INT_MAX;
3333
3334 /* Common code for all repeated negated single-byte character matches. */
3335
3336 REPEATNOTCHAR:
3337 fc = *ecode++;
3338
3339 /* The code is duplicated for the caseless and caseful cases, for speed,
3340 since matching characters is likely to be quite common. First, ensure the
3341 minimum number of matches are present. If min = max, continue at the same
3342 level without recursing. Otherwise, if minimizing, keep trying the rest of
3343 the expression and advancing one matching character if failing, up to the
3344 maximum. Alternatively, if maximizing, find the maximum number of
3345 characters and work backwards. */
3346
3347 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3348 max, eptr));
3349
3350 if (op >= OP_NOTSTARI) /* Caseless */
3351 {
3352 fc = md->lcc[fc];
3353
3354 #ifdef SUPPORT_UTF8
3355 /* UTF-8 mode */
3356 if (utf8)
3357 {
3358 register unsigned int d;
3359 for (i = 1; i <= min; i++)
3360 {
3361 if (eptr >= md->end_subject)
3362 {
3363 SCHECK_PARTIAL();
3364 MRRETURN(MATCH_NOMATCH);
3365 }
3366 GETCHARINC(d, eptr);
3367 if (d < 256) d = md->lcc[d];
3368 if (fc == d) MRRETURN(MATCH_NOMATCH);
3369 }
3370 }
3371 else
3372 #endif
3373
3374 /* Not UTF-8 mode */
3375 {
3376 for (i = 1; i <= min; i++)
3377 {
3378 if (eptr >= md->end_subject)
3379 {
3380 SCHECK_PARTIAL();
3381 MRRETURN(MATCH_NOMATCH);
3382 }
3383 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3384 }
3385 }
3386
3387 if (min == max) continue;
3388
3389 if (minimize)
3390 {
3391 #ifdef SUPPORT_UTF8
3392 /* UTF-8 mode */
3393 if (utf8)
3394 {
3395 register unsigned int d;
3396 for (fi = min;; fi++)
3397 {
3398 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3399 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3400 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3401 if (eptr >= md->end_subject)
3402 {
3403 SCHECK_PARTIAL();
3404 MRRETURN(MATCH_NOMATCH);
3405 }
3406 GETCHARINC(d, eptr);
3407 if (d < 256) d = md->lcc[d];
3408 if (fc == d) MRRETURN(MATCH_NOMATCH);
3409 }
3410 }
3411 else
3412 #endif
3413 /* Not UTF-8 mode */
3414 {
3415 for (fi = min;; fi++)
3416 {
3417 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3418 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3419 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3420 if (eptr >= md->end_subject)
3421 {
3422 SCHECK_PARTIAL();
3423 MRRETURN(MATCH_NOMATCH);
3424 }
3425 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3426 }
3427 }
3428 /* Control never gets here */
3429 }
3430
3431 /* Maximize case */
3432
3433 else
3434 {
3435 pp = eptr;
3436
3437 #ifdef SUPPORT_UTF8
3438 /* UTF-8 mode */
3439 if (utf8)
3440 {
3441 register unsigned int d;
3442 for (i = min; i < max; i++)
3443 {
3444 int len = 1;
3445 if (eptr >= md->end_subject)
3446 {
3447 SCHECK_PARTIAL();
3448 break;
3449 }
3450 GETCHARLEN(d, eptr, len);
3451 if (d < 256) d = md->lcc[d];
3452 if (fc == d) break;
3453 eptr += len;
3454 }
3455 if (possessive) continue;
3456 for(;;)
3457 {
3458 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3459 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3460 if (eptr-- == pp) break; /* Stop if tried at original pos */
3461 BACKCHAR(eptr);
3462 }
3463 }
3464 else
3465 #endif
3466 /* Not UTF-8 mode */
3467 {
3468 for (i = min; i < max; i++)
3469 {
3470 if (eptr >= md->end_subject)
3471 {
3472 SCHECK_PARTIAL();
3473 break;
3474 }
3475 if (fc == md->lcc[*eptr]) break;
3476 eptr++;
3477 }
3478 if (possessive) continue;
3479 while (eptr >= pp)
3480 {
3481 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3482 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3483 eptr--;
3484 }
3485 }
3486
3487 MRRETURN(MATCH_NOMATCH);
3488 }
3489 /* Control never gets here */
3490 }
3491
3492 /* Caseful comparisons */
3493
3494 else
3495 {
3496 #ifdef SUPPORT_UTF8
3497 /* UTF-8 mode */
3498 if (utf8)
3499 {
3500 register unsigned int d;
3501 for (i = 1; i <= min; i++)
3502 {
3503 if (eptr >= md->end_subject)
3504 {
3505 SCHECK_PARTIAL();
3506 MRRETURN(MATCH_NOMATCH);
3507 }
3508 GETCHARINC(d, eptr);
3509 if (fc == d) MRRETURN(MATCH_NOMATCH);
3510 }
3511 }
3512 else
3513 #endif
3514 /* Not UTF-8 mode */
3515 {
3516 for (i = 1; i <= min; i++)
3517 {
3518 if (eptr >= md->end_subject)
3519 {
3520 SCHECK_PARTIAL();
3521 MRRETURN(MATCH_NOMATCH);
3522 }
3523 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3524 }
3525 }
3526
3527 if (min == max) continue;
3528
3529 if (minimize)
3530 {
3531 #ifdef SUPPORT_UTF8
3532 /* UTF-8 mode */
3533 if (utf8)
3534 {
3535 register unsigned int d;
3536 for (fi = min;; fi++)
3537 {
3538 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3539 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3540 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3541 if (eptr >= md->end_subject)
3542 {
3543 SCHECK_PARTIAL();
3544 MRRETURN(MATCH_NOMATCH);
3545 }
3546 GETCHARINC(d, eptr);
3547 if (fc == d) MRRETURN(MATCH_NOMATCH);
3548 }
3549 }
3550 else
3551 #endif
3552 /* Not UTF-8 mode */
3553 {
3554 for (fi = min;; fi++)
3555 {
3556 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3557 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3558 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3559 if (eptr >= md->end_subject)
3560 {
3561 SCHECK_PARTIAL();
3562 MRRETURN(MATCH_NOMATCH);
3563 }
3564 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3565 }
3566 }
3567 /* Control never gets here */
3568 }
3569
3570 /* Maximize case */
3571
3572 else
3573 {
3574 pp = eptr;
3575
3576 #ifdef SUPPORT_UTF8
3577 /* UTF-8 mode */
3578 if (utf8)
3579 {
3580 register unsigned int d;
3581 for (i = min; i < max; i++)
3582 {
3583 int len = 1;
3584 if (eptr >= md->end_subject)
3585 {
3586 SCHECK_PARTIAL();
3587 break;
3588 }
3589 GETCHARLEN(d, eptr, len);
3590 if (fc == d) break;
3591 eptr += len;
3592 }
3593 if (possessive) continue;
3594 for(;;)
3595 {
3596 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3597 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3598 if (eptr-- == pp) break; /* Stop if tried at original pos */
3599 BACKCHAR(eptr);
3600 }
3601 }
3602 else
3603 #endif
3604 /* Not UTF-8 mode */
3605 {
3606 for (i = min; i < max; i++)
3607 {
3608 if (eptr >= md->end_subject)
3609 {
3610 SCHECK_PARTIAL();
3611 break;
3612 }
3613 if (fc == *eptr) break;
3614 eptr++;
3615 }
3616 if (possessive) continue;
3617 while (eptr >= pp)
3618 {
3619 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3620 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3621 eptr--;
3622 }
3623 }
3624
3625 MRRETURN(MATCH_NOMATCH);
3626 }
3627 }
3628 /* Control never gets here */
3629
3630 /* Match a single character type repeatedly; several different opcodes
3631 share code. This is very similar to the code for single characters, but we
3632 repeat it in the interests of efficiency. */
3633
3634 case OP_TYPEEXACT:
3635 min = max = GET2(ecode, 1);
3636 minimize = TRUE;
3637 ecode += 3;
3638 goto REPEATTYPE;
3639
3640 case OP_TYPEUPTO:
3641 case OP_TYPEMINUPTO:
3642 min = 0;
3643 max = GET2(ecode, 1);
3644 minimize = *ecode == OP_TYPEMINUPTO;
3645 ecode += 3;
3646 goto REPEATTYPE;
3647
3648 case OP_TYPEPOSSTAR:
3649 possessive = TRUE;
3650 min = 0;
3651 max = INT_MAX;
3652 ecode++;
3653 goto REPEATTYPE;
3654
3655 case OP_TYPEPOSPLUS:
3656 possessive = TRUE;
3657 min = 1;
3658 max = INT_MAX;
3659 ecode++;
3660 goto REPEATTYPE;
3661
3662 case OP_TYPEPOSQUERY:
3663 possessive = TRUE;
3664 min = 0;
3665 max = 1;
3666 ecode++;
3667 goto REPEATTYPE;
3668
3669 case OP_TYPEPOSUPTO:
3670 possessive = TRUE;
3671 min = 0;
3672 max = GET2(ecode, 1);
3673 ecode += 3;
3674 goto REPEATTYPE;
3675
3676 case OP_TYPESTAR:
3677 case OP_TYPEMINSTAR:
3678 case OP_TYPEPLUS:
3679 case OP_TYPEMINPLUS:
3680 case OP_TYPEQUERY:
3681 case OP_TYPEMINQUERY:
3682 c = *ecode++ - OP_TYPESTAR;
3683 minimize = (c & 1) != 0;
3684 min = rep_min[c]; /* Pick up values from tables; */
3685 max = rep_max[c]; /* zero for max => infinity */
3686 if (max == 0) max = INT_MAX;
3687
3688 /* Common code for all repeated single character type matches. Note that in
3689 UTF-8 mode, '.' matches a character of any length, as can OP_ANYNL, OP_HSPACE,
3690 OP_VSPACE, the negated types, and Unicode properties; only for OP_DIGIT, OP_WHITESPACE, and OP_WORDCHAR are the valid characters all one-byte long. */
3691
3692 REPEATTYPE:
3693 ctype = *ecode++; /* Code for the character type */
3694
3695 #ifdef SUPPORT_UCP
3696 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3697 {
3698 prop_fail_result = ctype == OP_NOTPROP;
3699 prop_type = *ecode++;
3700 prop_value = *ecode++;
3701 }
3702 else prop_type = -1;
3703 #endif
3704
3705 /* First, ensure the minimum number of matches are present. Use inline
3706 code for maximizing the speed, and do the type test once at the start
3707 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3708 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3709 and single-bytes. */
3710
3711 if (min > 0)
3712 {
3713 #ifdef SUPPORT_UCP
3714 if (prop_type >= 0)
3715 {
3716 switch(prop_type)
3717 {
3718 case PT_ANY:
3719 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3720 for (i = 1; i <= min; i++)
3721 {
3722 if (eptr >= md->end_subject)
3723 {
3724 SCHECK_PARTIAL();
3725 MRRETURN(MATCH_NOMATCH);
3726 }
3727 GETCHARINCTEST(c, eptr);
3728 }
3729 break;
3730
3731 case PT_LAMP:
3732 for (i = 1; i <= min; i++)
3733 {
3734 if (eptr >= md->end_subject)
3735 {
3736 SCHECK_PARTIAL();
3737 MRRETURN(MATCH_NOMATCH);
3738 }
3739 GETCHARINCTEST(c, eptr);
3740 prop_chartype = UCD_CHARTYPE(c);
3741 if ((prop_chartype == ucp_Lu ||
3742 prop_chartype == ucp_Ll ||
3743 prop_chartype == ucp_Lt) == prop_fail_result)
3744 MRRETURN(MATCH_NOMATCH);
3745 }
3746 break;
3747
3748 case PT_GC:
3749 for (i = 1; i <= min; i++)
3750 {
3751 if (eptr >= md->end_subject)
3752 {
3753 SCHECK_PARTIAL();
3754 MRRETURN(MATCH_NOMATCH);
3755 }
3756 GETCHARINCTEST(c, eptr);
3757 prop_category = UCD_CATEGORY(c);
3758 if ((prop_category == prop_value) == prop_fail_result)
3759 MRRETURN(MATCH_NOMATCH);
3760 }
3761 break;
3762
3763 case PT_PC:
3764 for (i = 1; i <= min; i++)
3765 {
3766 if (eptr >= md->end_subject)
3767 {
3768 SCHECK_PARTIAL();
3769 MRRETURN(MATCH_NOMATCH);
3770 }
3771 GETCHARINCTEST(c, eptr);
3772 prop_chartype = UCD_CHARTYPE(c);
3773 if ((prop_chartype == prop_value) == prop_fail_result)
3774 MRRETURN(MATCH_NOMATCH);
3775 }
3776 break;
3777
3778 case PT_SC:
3779 for (i = 1; i <= min; i++)
3780 {
3781 if (eptr >= md->end_subject)
3782 {
3783 SCHECK_PARTIAL();
3784 MRRETURN(MATCH_NOMATCH);
3785 }
3786 GETCHARINCTEST(c, eptr);
3787 prop_script = UCD_SCRIPT(c);
3788 if ((prop_script == prop_value) == prop_fail_result)
3789 MRRETURN(MATCH_NOMATCH);
3790 }
3791 break;
3792
3793 case PT_ALNUM:
3794 for (i = 1; i <= min; i++)
3795 {
3796 if (eptr >= md->end_subject)
3797 {
3798 SCHECK_PARTIAL();
3799 MRRETURN(MATCH_NOMATCH);
3800 }
3801 GETCHARINCTEST(c, eptr);
3802 prop_category = UCD_CATEGORY(c);
3803 if ((prop_category == ucp_L || prop_category == ucp_N)
3804 == prop_fail_result)
3805 MRRETURN(MATCH_NOMATCH);
3806 }
3807 break;
3808
3809 case PT_SPACE: /* Perl space */
3810 for (i = 1; i <= min; i++)
3811 {
3812 if (eptr >= md->end_subject)
3813 {
3814 SCHECK_PARTIAL();
3815 MRRETURN(MATCH_NOMATCH);
3816 }
3817 GETCHARINCTEST(c, eptr);
3818 prop_category = UCD_CATEGORY(c);
3819 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3820 c == CHAR_FF || c == CHAR_CR)
3821 == prop_fail_result)
3822 MRRETURN(MATCH_NOMATCH);
3823 }
3824 break;
3825
3826 case PT_PXSPACE: /* POSIX space */
3827 for (i = 1; i <= min; i++)
3828 {
3829 if (eptr >= md->end_subject)
3830 {
3831 SCHECK_PARTIAL();
3832 MRRETURN(MATCH_NOMATCH);
3833 }
3834 GETCHARINCTEST(c, eptr);
3835 prop_category = UCD_CATEGORY(c);
3836 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3837 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3838 == prop_fail_result)
3839 MRRETURN(MATCH_NOMATCH);
3840 }
3841 break;
3842
3843 case PT_WORD:
3844 for (i = 1; i <= min; i++)
3845 {
3846 if (eptr >= md->end_subject)
3847 {
3848 SCHECK_PARTIAL();
3849 MRRETURN(MATCH_NOMATCH);
3850 }
3851 GETCHARINCTEST(c, eptr);
3852 prop_category = UCD_CATEGORY(c);
3853 if ((prop_category == ucp_L || prop_category == ucp_N ||
3854 c == CHAR_UNDERSCORE)
3855 == prop_fail_result)
3856 MRRETURN(MATCH_NOMATCH);
3857 }
3858 break;
3859
3860 /* This should not occur */
3861
3862 default:
3863 RRETURN(PCRE_ERROR_INTERNAL);
3864 }
3865 }
3866
3867 /* Match extended Unicode sequences. We will get here only if the
3868 support is in the binary; otherwise a compile-time error occurs. */
3869
3870 else if (ctype == OP_EXTUNI)
3871 {
3872 for (i = 1; i <= min; i++)
3873 {
3874 if (eptr >= md->end_subject)
3875 {
3876 SCHECK_PARTIAL();
3877 MRRETURN(MATCH_NOMATCH);
3878 }
3879 GETCHARINCTEST(c, eptr);
3880 prop_category = UCD_CATEGORY(c);
3881 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3882 while (eptr < md->end_subject)
3883 {
3884 int len = 1;
3885 if (!utf8) c = *eptr;
3886 else { GETCHARLEN(c, eptr, len); }
3887 prop_category = UCD_CATEGORY(c);
3888 if (prop_category != ucp_M) break;
3889 eptr += len;
3890 }
3891 }
3892 }
3893
3894 else
3895 #endif /* SUPPORT_UCP */
3896
3897 /* Handle all other cases when the coding is UTF-8 */
3898
3899 #ifdef SUPPORT_UTF8
3900 if (utf8) switch(ctype)
3901 {
3902 case OP_ANY:
3903 for (i = 1; i <= min; i++)
3904 {
3905 if (eptr >= md->end_subject)
3906 {
3907 SCHECK_PARTIAL();
3908 MRRETURN(MATCH_NOMATCH);
3909 }
3910 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3911 eptr++;
3912 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3913 }
3914 break;
3915
3916 case OP_ALLANY:
3917 for (i = 1; i <= min; i++)
3918 {
3919 if (eptr >= md->end_subject)
3920 {
3921 SCHECK_PARTIAL();
3922 MRRETURN(MATCH_NOMATCH);
3923 }
3924 eptr++;
3925 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3926 }
3927 break;
3928
3929 case OP_ANYBYTE:
3930 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3931 eptr += min;
3932 break;
3933
3934 case OP_ANYNL:
3935 for (i = 1; i <= min; i++)
3936 {
3937 if (eptr >= md->end_subject)
3938 {
3939 SCHECK_PARTIAL();
3940 MRRETURN(MATCH_NOMATCH);
3941 }
3942 GETCHARINC(c, eptr);
3943 switch(c)
3944 {
3945 default: MRRETURN(MATCH_NOMATCH);
3946
3947 case 0x000d:
3948 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3949 break;
3950
3951 case 0x000a:
3952 break;
3953
3954 case 0x000b:
3955 case 0x000c:
3956 case 0x0085:
3957 case 0x2028:
3958 case 0x2029:
3959 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3960 break;
3961 }
3962 }
3963 break;
3964
3965 case OP_NOT_HSPACE:
3966 for (i = 1; i <= min; i++)
3967 {
3968 if (eptr >= md->end_subject)
3969 {
3970 SCHECK_PARTIAL();
3971 MRRETURN(MATCH_NOMATCH);
3972 }
3973 GETCHARINC(c, eptr);
3974 switch(c)
3975 {
3976 default: break;
3977 case 0x09: /* HT */
3978 case 0x20: /* SPACE */
3979 case 0xa0: /* NBSP */
3980 case 0x1680: /* OGHAM SPACE MARK */
3981 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3982 case 0x2000: /* EN QUAD */
3983 case 0x2001: /* EM QUAD */
3984 case 0x2002: /* EN SPACE */
3985 case 0x2003: /* EM SPACE */
3986 case 0x2004: /* THREE-PER-EM SPACE */
3987 case 0x2005: /* FOUR-PER-EM SPACE */
3988 case 0x2006: /* SIX-PER-EM SPACE */
3989 case 0x2007: /* FIGURE SPACE */
3990 case 0x2008: /* PUNCTUATION SPACE */
3991 case 0x2009: /* THIN SPACE */
3992 case 0x200A: /* HAIR SPACE */
3993 case 0x202f: /* NARROW NO-BREAK SPACE */
3994 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3995 case 0x3000: /* IDEOGRAPHIC SPACE */
3996 MRRETURN(MATCH_NOMATCH);
3997 }
3998 }
3999 break;
4000
4001 case OP_HSPACE:
4002 for (i = 1; i <= min; i++)
4003 {
4004 if (eptr >= md->end_subject)
4005 {
4006 SCHECK_PARTIAL();
4007 MRRETURN(MATCH_NOMATCH);
4008 }
4009 GETCHARINC(c, eptr);
4010 switch(c)
4011 {
4012 default: MRRETURN(MATCH_NOMATCH);
4013 case 0x09: /* HT */
4014 case 0x20: /* SPACE */
4015 case 0xa0: /* NBSP */
4016 case 0x1680: /* OGHAM SPACE MARK */
4017 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4018 case 0x2000: /* EN QUAD */
4019 case 0x2001: /* EM QUAD */
4020 case 0x2002: /* EN SPACE */
4021 case 0x2003: /* EM SPACE */
4022 case 0x2004: /* THREE-PER-EM SPACE */
4023 case 0x2005: /* FOUR-PER-EM SPACE */
4024 case 0x2006: /* SIX-PER-EM SPACE */
4025 case 0x2007: /* FIGURE SPACE */
4026 case 0x2008: /* PUNCTUATION SPACE */
4027 case 0x2009: /* THIN SPACE */
4028 case 0x200A: /* HAIR SPACE */
4029 case 0x202f: /* NARROW NO-BREAK SPACE */
4030 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4031 case 0x3000: /* IDEOGRAPHIC SPACE */
4032 break;
4033 }
4034 }
4035 break;
4036
4037 case OP_NOT_VSPACE:
4038 for (i = 1; i <= min; i++)
4039 {
4040 if (eptr >= md->end_subject)
4041 {
4042 SCHECK_PARTIAL();
4043 MRRETURN(MATCH_NOMATCH);
4044 }
4045 GETCHARINC(c, eptr);
4046 switch(c)
4047 {
4048 default: break;
4049 case 0x0a: /* LF */
4050 case 0x0b: /* VT */
4051 case 0x0c: /* FF */
4052 case 0x0d: /* CR */
4053 case 0x85: /* NEL */
4054 case 0x2028: /* LINE SEPARATOR */
4055 case 0x2029: /* PARAGRAPH SEPARATOR */
4056 MRRETURN(MATCH_NOMATCH);
4057 }
4058 }
4059 break;
4060
4061 case OP_VSPACE:
4062 for (i = 1; i <= min; i++)
4063 {
4064 if (eptr >= md->end_subject)
4065 {
4066 SCHECK_PARTIAL();
4067 MRRETURN(MATCH_NOMATCH);
4068 }
4069 GETCHARINC(c, eptr);
4070 switch(c)
4071 {
4072 default: MRRETURN(MATCH_NOMATCH);
4073 case 0x0a: /* LF */
4074 case 0x0b: /* VT */
4075 case 0x0c: /* FF */
4076 case 0x0d: /* CR */
4077 case 0x85: /* NEL */
4078 case 0x2028: /* LINE SEPARATOR */
4079 case 0x2029: /* PARAGRAPH SEPARATOR */
4080 break;
4081 }
4082 }
4083 break;
4084
4085 case OP_NOT_DIGIT:
4086 for (i = 1; i <= min; i++)
4087 {
4088 if (eptr >= md->end_subject)
4089 {
4090 SCHECK_PARTIAL();
4091 MRRETURN(MATCH_NOMATCH);
4092 }
4093 GETCHARINC(c, eptr);
4094 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4095 MRRETURN(MATCH_NOMATCH);
4096 }
4097 break;
4098
4099 case OP_DIGIT:
4100 for (i = 1; i <= min; i++)
4101 {
4102 if (eptr >= md->end_subject)
4103 {
4104 SCHECK_PARTIAL();
4105 MRRETURN(MATCH_NOMATCH);
4106 }
4107 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4108 MRRETURN(MATCH_NOMATCH);
4109 /* No need to skip more bytes - we know it's a 1-byte character */
4110 }
4111 break;
4112
4113 case OP_NOT_WHITESPACE:
4114 for (i = 1; i <= min; i++)
4115 {
4116 if (eptr >= md->end_subject)
4117 {
4118 SCHECK_PARTIAL();
4119 MRRETURN(MATCH_NOMATCH);
4120 }
4121 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4122 MRRETURN(MATCH_NOMATCH);
4123 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4124 }
4125 break;
4126
4127 case OP_WHITESPACE:
4128 for (i = 1; i <= min; i++)
4129 {
4130 if (eptr >= md->end_subject)
4131 {
4132 SCHECK_PARTIAL();
4133 MRRETURN(MATCH_NOMATCH);
4134 }
4135 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4136 MRRETURN(MATCH_NOMATCH);
4137 /* No need to skip more bytes - we know it's a 1-byte character */
4138 }
4139 break;
4140
4141 case OP_NOT_WORDCHAR:
4142 for (i = 1; i <= min; i++)
4143 {
4144 if (eptr >= md->end_subject)
4145 {
4146 SCHECK_PARTIAL();
4147 MRRETURN(MATCH_NOMATCH);
4148 }
4149 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4150 MRRETURN(MATCH_NOMATCH);
4151 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4152 }
4153 break;
4154
4155 case OP_WORDCHAR:
4156 for (i = 1; i <= min; i++)
4157 {
4158 if (eptr >= md->end_subject)
4159 {
4160 SCHECK_PARTIAL();
4161 MRRETURN(MATCH_NOMATCH);
4162 }
4163 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4164 MRRETURN(MATCH_NOMATCH);
4165 /* No need to skip more bytes - we know it's a 1-byte character */
4166 }
4167 break;
4168
4169 default:
4170 RRETURN(PCRE_ERROR_INTERNAL);
4171 } /* End switch(ctype) */
4172
4173 else
4174 #endif /* SUPPORT_UTF8 */
4175
4176 /* Code for the non-UTF-8 case for minimum matching of operators other
4177 than OP_PROP and OP_NOTPROP. */
4178
4179 switch(ctype)
4180 {
4181 case OP_ANY:
4182 for (i = 1; i <= min; i++)
4183 {
4184 if (eptr >= md->end_subject)
4185 {
4186 SCHECK_PARTIAL();
4187 MRRETURN(MATCH_NOMATCH);
4188 }
4189 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4190 eptr++;
4191 }
4192 break;
4193
4194 case OP_ALLANY:
4195 if (eptr > md->end_subject - min)
4196 {
4197 SCHECK_PARTIAL();
4198 MRRETURN(MATCH_NOMATCH);
4199 }
4200 eptr += min;
4201 break;
4202
4203 case OP_ANYBYTE:
4204 if (eptr > md->end_subject - min)
4205 {
4206 SCHECK_PARTIAL();
4207 MRRETURN(MATCH_NOMATCH);
4208 }
4209 eptr += min;
4210 break;
4211
4212 case OP_ANYNL:
4213 for (i = 1; i <= min; i++)
4214 {
4215 if (eptr >= md->end_subject)
4216 {
4217 SCHECK_PARTIAL();
4218 MRRETURN(MATCH_NOMATCH);
4219 }
4220 switch(*eptr++)
4221 {
4222 default: MRRETURN(MATCH_NOMATCH);
4223
4224 case 0x000d:
4225 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4226 break;
4227
4228 case 0x000a:
4229 break;
4230
4231 case 0x000b:
4232 case 0x000c:
4233 case 0x0085:
4234 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4235 break;
4236 }
4237 }
4238 break;
4239
4240 case OP_NOT_HSPACE:
4241 for (i = 1; i <= min; i++)
4242 {
4243 if (eptr >= md->end_subject)
4244 {
4245 SCHECK_PARTIAL();
4246 MRRETURN(MATCH_NOMATCH);
4247 }
4248 switch(*eptr++)
4249 {
4250 default: break;
4251 case 0x09: /* HT */
4252 case 0x20: /* SPACE */
4253 case 0xa0: /* NBSP */
4254 MRRETURN(MATCH_NOMATCH);
4255 }
4256 }
4257 break;
4258
4259 case OP_HSPACE:
4260 for (i = 1; i <= min; i++)
4261 {
4262 if (eptr >= md->end_subject)
4263 {
4264 SCHECK_PARTIAL();
4265 MRRETURN(MATCH_NOMATCH);
4266 }
4267 switch(*eptr++)
4268 {
4269 default: MRRETURN(MATCH_NOMATCH);
4270 case 0x09: /* HT */
4271 case 0x20: /* SPACE */
4272 case 0xa0: /* NBSP */
4273 break;
4274 }
4275 }
4276 break;
4277
4278 case OP_NOT_VSPACE:
4279 for (i = 1; i <= min; i++)
4280 {
4281 if (eptr >= md->end_subject)
4282 {
4283 SCHECK_PARTIAL();
4284 MRRETURN(MATCH_NOMATCH);
4285 }
4286 switch(*eptr++)
4287 {
4288 default: break;
4289 case 0x0a: /* LF */
4290 case 0x0b: /* VT */
4291 case 0x0c: /* FF */
4292 case 0x0d: /* CR */
4293 case 0x85: /* NEL */
4294 MRRETURN(MATCH_NOMATCH);
4295 }
4296 }
4297 break;
4298
4299 case OP_VSPACE:
4300 for (i = 1; i <= min; i++)
4301 {
4302 if (eptr >= md->end_subject)
4303 {
4304 SCHECK_PARTIAL();
4305 MRRETURN(MATCH_NOMATCH);
4306 }
4307 switch(*eptr++)
4308 {
4309 default: MRRETURN(MATCH_NOMATCH);
4310 case 0x0a: /* LF */
4311 case 0x0b: /* VT */
4312 case 0x0c: /* FF */
4313 case 0x0d: /* CR */
4314 case 0x85: /* NEL */
4315 break;
4316 }
4317 }
4318 break;
4319
4320 case OP_NOT_DIGIT:
4321 for (i = 1; i <= min; i++)
4322 {
4323 if (eptr >= md->end_subject)
4324 {
4325 SCHECK_PARTIAL();
4326 MRRETURN(MATCH_NOMATCH);
4327 }
4328 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4329 }
4330 break;
4331
4332 case OP_DIGIT:
4333 for (i = 1; i <= min; i++)
4334 {
4335 if (eptr >= md->end_subject)
4336 {
4337 SCHECK_PARTIAL();
4338 MRRETURN(MATCH_NOMATCH);
4339 }
4340 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4341 }
4342 break;
4343
4344 case OP_NOT_WHITESPACE:
4345 for (i = 1; i <= min; i++)
4346 {
4347 if (eptr >= md->end_subject)
4348 {
4349 SCHECK_PARTIAL();
4350 MRRETURN(MATCH_NOMATCH);
4351 }
4352 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4353 }
4354 break;
4355
4356 case OP_WHITESPACE:
4357 for (i = 1; i <= min; i++)
4358 {
4359 if (eptr >= md->end_subject)
4360 {
4361 SCHECK_PARTIAL();
4362 MRRETURN(MATCH_NOMATCH);
4363 }
4364 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4365 }
4366 break;
4367
4368 case OP_NOT_WORDCHAR:
4369 for (i = 1; i <= min; i++)
4370 {
4371 if (eptr >= md->end_subject)
4372 {
4373 SCHECK_PARTIAL();
4374 MRRETURN(MATCH_NOMATCH);
4375 }
4376 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4377 MRRETURN(MATCH_NOMATCH);
4378 }
4379 break;
4380
4381 case OP_WORDCHAR:
4382 for (i = 1; i <= min; i++)
4383 {
4384 if (eptr >= md->end_subject)
4385 {
4386 SCHECK_PARTIAL();
4387 MRRETURN(MATCH_NOMATCH);
4388 }
4389 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4390 MRRETURN(MATCH_NOMATCH);
4391 }
4392 break;
4393
4394 default:
4395 RRETURN(PCRE_ERROR_INTERNAL);
4396 }
4397 }
4398
4399 /* If min = max, continue at the same level without recursing */
4400
4401 if (min == max) continue;
4402
4403 /* If minimizing, we have to test the rest of the pattern before each
4404 subsequent match. Again, separate the UTF-8 case for speed, and also
4405 separate the UCP cases. */
4406
4407 if (minimize)
4408 {
4409 #ifdef SUPPORT_UCP
4410 if (prop_type >= 0)
4411 {
4412 switch(prop_type)
4413 {
4414 case PT_ANY:
4415 for (fi = min;; fi++)
4416 {
4417 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4418 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4419 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4420 if (eptr >= md->end_subject)
4421 {
4422 SCHECK_PARTIAL();
4423 MRRETURN(MATCH_NOMATCH);
4424 }
4425 GETCHARINCTEST(c, eptr);
4426 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4427 }
4428 /* Control never gets here */
4429
4430 case PT_LAMP:
4431 for (fi = min;; fi++)
4432 {
4433 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4434 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4435 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4436 if (eptr >= md->end_subject)
4437 {
4438 SCHECK_PARTIAL();
4439 MRRETURN(MATCH_NOMATCH);
4440 }
4441 GETCHARINCTEST(c, eptr);
4442 prop_chartype = UCD_CHARTYPE(c);
4443 if ((prop_chartype == ucp_Lu ||
4444 prop_chartype == ucp_Ll ||
4445 prop_chartype == ucp_Lt) == prop_fail_result)
4446 MRRETURN(MATCH_NOMATCH);
4447 }
4448 /* Control never gets here */
4449
4450 case PT_GC:
4451 for (fi = min;; fi++)
4452 {
4453 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4454 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4455 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4456 if (eptr >= md->end_subject)
4457 {
4458 SCHECK_PARTIAL();
4459 MRRETURN(MATCH_NOMATCH);
4460 }
4461 GETCHARINCTEST(c, eptr);
4462 prop_category = UCD_CATEGORY(c);
4463 if ((prop_category == prop_value) == prop_fail_result)
4464 MRRETURN(MATCH_NOMATCH);
4465 }
4466 /* Control never gets here */
4467
4468 case PT_PC:
4469 for (fi = min;; fi++)
4470 {
4471 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4472 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4473 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4474 if (eptr >= md->end_subject)
4475 {
4476 SCHECK_PARTIAL();
4477 MRRETURN(MATCH_NOMATCH);
4478 }
4479 GETCHARINCTEST(c, eptr);
4480 prop_chartype = UCD_CHARTYPE(c);
4481 if ((prop_chartype == prop_value) == prop_fail_result)
4482 MRRETURN(MATCH_NOMATCH);
4483 }
4484 /* Control never gets here */
4485
4486 case PT_SC:
4487 for (fi = min;; fi++)
4488 {
4489 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4490 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4491 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4492 if (eptr >= md->end_subject)
4493 {
4494 SCHECK_PARTIAL();
4495 MRRETURN(MATCH_NOMATCH);
4496 }
4497 GETCHARINCTEST(c, eptr);
4498 prop_script = UCD_SCRIPT(c);
4499 if ((prop_script == prop_value) == prop_fail_result)
4500 MRRETURN(MATCH_NOMATCH);
4501 }
4502 /* Control never gets here */
4503
4504 case PT_ALNUM:
4505 for (fi = min;; fi++)
4506 {
4507 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4508 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4509 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4510 if (eptr >= md->end_subject)
4511 {
4512 SCHECK_PARTIAL();
4513 MRRETURN(MATCH_NOMATCH);
4514 }
4515 GETCHARINCTEST(c, eptr);
4516 prop_category = UCD_CATEGORY(c);
4517 if ((prop_category == ucp_L || prop_category == ucp_N)
4518 == prop_fail_result)
4519 MRRETURN(MATCH_NOMATCH);
4520 }
4521 /* Control never gets here */
4522
4523 case PT_SPACE: /* Perl space */
4524 for (fi = min;; fi++)
4525 {
4526 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4527 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4528 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4529 if (eptr >= md->end_subject)
4530 {
4531 SCHECK_PARTIAL();
4532 MRRETURN(MATCH_NOMATCH);
4533 }
4534 GETCHARINCTEST(c, eptr);
4535 prop_category = UCD_CATEGORY(c);
4536 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4537 c == CHAR_FF || c == CHAR_CR)
4538 == prop_fail_result)
4539 MRRETURN(MATCH_NOMATCH);
4540 }
4541 /* Control never gets here */
4542
4543 case PT_PXSPACE: /* POSIX space */
4544 for (fi = min;; fi++)
4545 {
4546 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4547 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4548 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4549 if (eptr >= md->end_subject)
4550 {
4551 SCHECK_PARTIAL();
4552 MRRETURN(MATCH_NOMATCH);
4553 }
4554 GETCHARINCTEST(c, eptr);
4555 prop_category = UCD_CATEGORY(c);
4556 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4557 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4558 == prop_fail_result)
4559 MRRETURN(MATCH_NOMATCH);
4560 }
4561 /* Control never gets here */
4562
4563 case PT_WORD:
4564 for (fi = min;; fi++)
4565 {
4566 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4567 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4568 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4569 if (eptr >= md->end_subject)
4570 {
4571 SCHECK_PARTIAL();
4572 MRRETURN(MATCH_NOMATCH);
4573 }
4574 GETCHARINCTEST(c, eptr);
4575 prop_category = UCD_CATEGORY(c);
4576 if ((prop_category == ucp_L ||
4577 prop_category == ucp_N ||
4578 c == CHAR_UNDERSCORE)
4579 == prop_fail_result)
4580 MRRETURN(MATCH_NOMATCH);
4581 }
4582 /* Control never gets here */
4583
4584 /* This should never occur */
4585
4586 default:
4587 RRETURN(PCRE_ERROR_INTERNAL);
4588 }
4589 }
4590
4591 /* Match extended Unicode sequences. We will get here only if the
4592 support is in the binary; otherwise a compile-time error occurs. */
4593
4594 else if (ctype == OP_EXTUNI)
4595 {
4596 for (fi = min;; fi++)
4597 {
4598 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4599 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4600 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4601 if (eptr >= md->end_subject)
4602 {
4603 SCHECK_PARTIAL();
4604 MRRETURN(MATCH_NOMATCH);
4605 }
4606 GETCHARINCTEST(c, eptr);
4607 prop_category = UCD_CATEGORY(c);
4608 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4609 while (eptr < md->end_subject)
4610 {
4611 int len = 1;
4612 if (!utf8) c = *eptr;
4613 else { GETCHARLEN(c, eptr, len); }
4614 prop_category = UCD_CATEGORY(c);
4615 if (prop_category != ucp_M) break;
4616 eptr += len;
4617 }
4618 }
4619 }
4620
4621 else
4622 #endif /* SUPPORT_UCP */
4623
4624 #ifdef SUPPORT_UTF8
4625 /* UTF-8 mode */
4626 if (utf8)
4627 {
4628 for (fi = min;; fi++)
4629 {
4630 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4631 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4632 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4633 if (eptr >= md->end_subject)
4634 {
4635 SCHECK_PARTIAL();
4636 MRRETURN(MATCH_NOMATCH);
4637 }
4638 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4639 MRRETURN(MATCH_NOMATCH);
4640 GETCHARINC(c, eptr);
4641 switch(ctype)
4642 {
4643 case OP_ANY: /* This is the non-NL case */
4644 case OP_ALLANY:
4645 case OP_ANYBYTE:
4646 break;
4647
4648 case OP_ANYNL:
4649 switch(c)
4650 {
4651 default: MRRETURN(MATCH_NOMATCH);
4652 case 0x000d:
4653 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4654 break;
4655 case 0x000a:
4656 break;
4657
4658 case 0x000b:
4659 case 0x000c:
4660 case 0x0085:
4661 case 0x2028:
4662 case 0x2029:
4663 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4664 break;
4665 }
4666 break;
4667
4668 case OP_NOT_HSPACE:
4669 switch(c)
4670 {
4671 default: break;
4672 case 0x09: /* HT */
4673 case 0x20: /* SPACE */
4674 case 0xa0: /* NBSP */
4675 case 0x1680: /* OGHAM SPACE MARK */
4676 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4677 case 0x2000: /* EN QUAD */
4678 case 0x2001: /* EM QUAD */
4679 case 0x2002: /* EN SPACE */
4680 case 0x2003: /* EM SPACE */
4681 case 0x2004: /* THREE-PER-EM SPACE */
4682 case 0x2005: /* FOUR-PER-EM SPACE */
4683 case 0x2006: /* SIX-PER-EM SPACE */
4684 case 0x2007: /* FIGURE SPACE */
4685 case 0x2008: /* PUNCTUATION SPACE */
4686 case 0x2009: /* THIN SPACE */
4687 case 0x200A: /* HAIR SPACE */
4688 case 0x202f: /* NARROW NO-BREAK SPACE */
4689 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4690 case 0x3000: /* IDEOGRAPHIC SPACE */
4691 MRRETURN(MATCH_NOMATCH);
4692 }
4693 break;
4694
4695 case OP_HSPACE:
4696 switch(c)
4697 {
4698 default: MRRETURN(MATCH_NOMATCH);
4699 case 0x09: /* HT */
4700 case 0x20: /* SPACE */
4701 case 0xa0: /* NBSP */
4702 case 0x1680: /* OGHAM SPACE MARK */
4703 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4704 case 0x2000: /* EN QUAD */
4705 case 0x2001: /* EM QUAD */
4706 case 0x2002: /* EN SPACE */
4707 case 0x2003: /* EM SPACE */
4708 case 0x2004: /* THREE-PER-EM SPACE */
4709 case 0x2005: /* FOUR-PER-EM SPACE */
4710 case 0x2006: /* SIX-PER-EM SPACE */
4711 case 0x2007: /* FIGURE SPACE */
4712 case 0x2008: /* PUNCTUATION SPACE */
4713 case 0x2009: /* THIN SPACE */
4714 case 0x200A: /* HAIR SPACE */
4715 case 0x202f: /* NARROW NO-BREAK SPACE */
4716 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4717 case 0x3000: /* IDEOGRAPHIC SPACE */
4718 break;
4719 }
4720 break;
4721
4722 case OP_NOT_VSPACE:
4723 switch(c)
4724 {
4725 default: break;
4726 case 0x0a: /* LF */
4727 case 0x0b: /* VT */
4728 case 0x0c: /* FF */
4729 case 0x0d: /* CR */
4730 case 0x85: /* NEL */
4731 case 0x2028: /* LINE SEPARATOR */
4732 case 0x2029: /* PARAGRAPH SEPARATOR */
4733 MRRETURN(MATCH_NOMATCH);
4734 }
4735 break;
4736
4737 case OP_VSPACE:
4738 switch(c)
4739 {
4740 default: MRRETURN(MATCH_NOMATCH);
4741 case 0x0a: /* LF */
4742 case 0x0b: /* VT */
4743 case 0x0c: /* FF */
4744 case 0x0d: /* CR */
4745 case 0x85: /* NEL */
4746 case 0x2028: /* LINE SEPARATOR */
4747 case 0x2029: /* PARAGRAPH SEPARATOR */
4748 break;
4749 }
4750 break;
4751
4752 case OP_NOT_DIGIT:
4753 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4754 MRRETURN(MATCH_NOMATCH);
4755 break;
4756
4757 case OP_DIGIT:
4758 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4759 MRRETURN(MATCH_NOMATCH);
4760 break;
4761
4762 case OP_NOT_WHITESPACE:
4763 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4764 MRRETURN(MATCH_NOMATCH);
4765 break;
4766
4767 case OP_WHITESPACE:
4768 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4769 MRRETURN(MATCH_NOMATCH);
4770 break;
4771
4772 case OP_NOT_WORDCHAR:
4773 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4774 MRRETURN(MATCH_NOMATCH);
4775 break;
4776
4777 case OP_WORDCHAR:
4778 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4779 MRRETURN(MATCH_NOMATCH);
4780 break;
4781
4782 default:
4783 RRETURN(PCRE_ERROR_INTERNAL);
4784 }
4785 }
4786 }
4787 else
4788 #endif
4789 /* Not UTF-8 mode */
4790 {
4791 for (fi = min;; fi++)
4792 {
4793 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4794 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4795 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4796 if (eptr >= md->end_subject)
4797 {
4798 SCHECK_PARTIAL();
4799 MRRETURN(MATCH_NOMATCH);
4800 }
4801 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4802 MRRETURN(MATCH_NOMATCH);
4803 c = *eptr++;
4804 switch(ctype)
4805 {
4806 case OP_ANY: /* This is the non-NL case */
4807 case OP_ALLANY:
4808 case OP_ANYBYTE:
4809 break;
4810
4811 case OP_ANYNL:
4812 switch(c)
4813 {
4814 default: MRRETURN(MATCH_NOMATCH);
4815 case 0x000d:
4816 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4817 break;
4818
4819 case 0x000a:
4820 break;
4821
4822 case 0x000b:
4823 case 0x000c:
4824 case 0x0085:
4825 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4826 break;
4827 }
4828 break;
4829
4830 case OP_NOT_HSPACE:
4831 switch(c)
4832 {
4833 default: break;
4834 case 0x09: /* HT */
4835 case 0x20: /* SPACE */
4836 case 0xa0: /* NBSP */
4837 MRRETURN(MATCH_NOMATCH);
4838 }
4839 break;
4840
4841 case OP_HSPACE:
4842 switch(c)
4843 {
4844 default: MRRETURN(MATCH_NOMATCH);
4845 case 0x09: /* HT */
4846 case 0x20: /* SPACE */
4847 case 0xa0: /* NBSP */
4848 break;
4849 }
4850 break;
4851
4852 case OP_NOT_VSPACE:
4853 switch(c)
4854 {
4855 default: break;
4856 case 0x0a: /* LF */
4857 case 0x0b: /* VT */
4858 case 0x0c: /* FF */
4859 case 0x0d: /* CR */
4860 case 0x85: /* NEL */
4861 MRRETURN(MATCH_NOMATCH);
4862 }
4863 break;
4864
4865 case OP_VSPACE:
4866 switch(c)
4867 {
4868 default: MRRETURN(MATCH_NOMATCH);
4869 case 0x0a: /* LF */
4870 case 0x0b: /* VT */
4871 case 0x0c: /* FF */
4872 case 0x0d: /* CR */
4873 case 0x85: /* NEL */
4874 break;
4875 }
4876 break;
4877
4878 case OP_NOT_DIGIT:
4879 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4880 break;
4881
4882 case OP_DIGIT:
4883 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4884 break;
4885
4886 case OP_NOT_WHITESPACE:
4887 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4888 break;
4889
4890 case OP_WHITESPACE:
4891 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4892 break;
4893
4894 case OP_NOT_WORDCHAR:
4895 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4896 break;
4897
4898 case OP_WORDCHAR:
4899 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4900 break;
4901
4902 default:
4903 RRETURN(PCRE_ERROR_INTERNAL);
4904 }
4905 }
4906 }
4907 /* Control never gets here */
4908 }
4909
4910 /* If maximizing, it is worth using inline code for speed, doing the type
4911 test once at the start (i.e. keep it out of the loop). Again, keep the
4912 UTF-8 and UCP stuff separate. */
4913
4914 else
4915 {
4916 pp = eptr; /* Remember where we started */
4917
4918 #ifdef SUPPORT_UCP
4919 if (prop_type >= 0)
4920 {
4921 switch(prop_type)
4922 {
4923 case PT_ANY:
4924 for (i = min; i < max; i++)
4925 {
4926 int len = 1;
4927 if (eptr >= md->end_subject)
4928 {
4929 SCHECK_PARTIAL();
4930 break;
4931 }
4932 GETCHARLENTEST(c, eptr, len);
4933 if (prop_fail_result) break;
4934 eptr+= len;
4935 }
4936 break;
4937
4938 case PT_LAMP:
4939 for (i = min; i < max; i++)
4940 {
4941 int len = 1;
4942 if (eptr >= md->end_subject)
4943 {
4944 SCHECK_PARTIAL();
4945 break;
4946 }
4947 GETCHARLENTEST(c, eptr, len);
4948 prop_chartype = UCD_CHARTYPE(c);
4949 if ((prop_chartype == ucp_Lu ||
4950 prop_chartype == ucp_Ll ||
4951 prop_chartype == ucp_Lt) == prop_fail_result)
4952 break;
4953 eptr+= len;
4954 }
4955 break;
4956
4957 case PT_GC:
4958 for (i = min; i < max; i++)
4959 {
4960 int len = 1;
4961 if (eptr >= md->end_subject)
4962 {
4963 SCHECK_PARTIAL();
4964 break;
4965 }
4966 GETCHARLENTEST(c, eptr, len);
4967 prop_category = UCD_CATEGORY(c);
4968 if ((prop_category == prop_value) == prop_fail_result)
4969 break;
4970 eptr+= len;
4971 }
4972 break;
4973
4974 case PT_PC:
4975 for (i = min; i < max; i++)
4976 {
4977 int len = 1;
4978 if (eptr >= md->end_subject)
4979 {
4980 SCHECK_PARTIAL();
4981 break;
4982 }
4983 GETCHARLENTEST(c, eptr, len);
4984 prop_chartype = UCD_CHARTYPE(c);
4985 if ((prop_chartype == prop_value) == prop_fail_result)
4986 break;
4987 eptr+= len;
4988 }
4989 break;
4990
4991 case PT_SC:
4992 for (i = min; i < max; i++)
4993 {
4994 int len = 1;
4995 if (eptr >= md->end_subject)
4996 {
4997 SCHECK_PARTIAL();
4998 break;
4999 }
5000 GETCHARLENTEST(c, eptr, len);
5001 prop_script = UCD_SCRIPT(c);
5002 if ((prop_script == prop_value) == prop_fail_result)
5003 break;
5004 eptr+= len;
5005 }
5006 break;
5007
5008 case PT_ALNUM:
5009 for (i = min; i < max; i++)
5010 {
5011 int len = 1;
5012 if (eptr >= md->end_subject)
5013 {
5014 SCHECK_PARTIAL();
5015 break;
5016 }
5017 GETCHARLENTEST(c, eptr, len);
5018 prop_category = UCD_CATEGORY(c);
5019 if ((prop_category == ucp_L || prop_category == ucp_N)
5020 == prop_fail_result)
5021 break;
5022 eptr+= len;
5023 }
5024 break;
5025
5026 case PT_SPACE: /* Perl space */
5027 for (i = min; i < max; i++)
5028 {
5029 int len = 1;
5030 if (eptr >= md->end_subject)
5031 {
5032 SCHECK_PARTIAL();
5033 break;
5034 }
5035 GETCHARLENTEST(c, eptr, len);
5036 prop_category = UCD_CATEGORY(c);
5037 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5038 c == CHAR_FF || c == CHAR_CR)
5039 == prop_fail_result)
5040 break;
5041 eptr+= len;
5042 }
5043 break;
5044
5045 case PT_PXSPACE: /* POSIX space */
5046 for (i = min; i < max; i++)
5047 {
5048 int len = 1;
5049 if (eptr >= md->end_subject)
5050 {
5051 SCHECK_PARTIAL();
5052 break;
5053 }
5054 GETCHARLENTEST(c, eptr, len);
5055 prop_category = UCD_CATEGORY(c);
5056 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5057 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5058 == prop_fail_result)
5059 break;
5060 eptr+= len;
5061 }
5062 break;
5063
5064 case PT_WORD:
5065 for (i = min; i < max; i++)
5066 {
5067 int len = 1;
5068 if (eptr >= md->end_subject)
5069 {
5070 SCHECK_PARTIAL();
5071 break;
5072 }
5073 GETCHARLENTEST(c, eptr, len);
5074 prop_category = UCD_CATEGORY(c);
5075 if ((prop_category == ucp_L || prop_category == ucp_N ||
5076 c == CHAR_UNDERSCORE) == prop_fail_result)
5077 break;
5078 eptr+= len;
5079 }
5080 break;
5081
5082 default:
5083 RRETURN(PCRE_ERROR_INTERNAL);
5084 }
5085
5086 /* eptr is now past the end of the maximum run */
5087
5088 if (possessive) continue;
5089 for(;;)
5090 {
5091 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5092 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5093 if (eptr-- == pp) break; /* Stop if tried at original pos */
5094 if (utf8) BACKCHAR(eptr);
5095 }
5096 }
5097
5098 /* Match extended Unicode sequences. We will get here only if the
5099 support is in the binary; otherwise a compile-time error occurs. */
5100
5101 else if (ctype == OP_EXTUNI)
5102 {
5103 for (i = min; i < max; i++)
5104 {
5105 if (eptr >= md->end_subject)
5106 {
5107 SCHECK_PARTIAL();
5108 break;
5109 }
5110 GETCHARINCTEST(c, eptr);
5111 prop_category = UCD_CATEGORY(c);
5112 if (prop_category == ucp_M) break;
5113 while (eptr < md->end_subject)
5114 {
5115 int len = 1;
5116 if (!utf8) c = *eptr; else
5117 {
5118 GETCHARLEN(c, eptr, len);
5119 }
5120 prop_category = UCD_CATEGORY(c);
5121 if (prop_category != ucp_M) break;
5122 eptr += len;
5123 }
5124 }
5125
5126 /* eptr is now past the end of the maximum run */
5127
5128 if (possessive) continue;
5129
5130 for(;;)
5131 {
5132 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5133 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5134 if (eptr-- == pp) break; /* Stop if tried at original pos */
5135 for (;;) /* Move back over one extended */
5136 {
5137 int len = 1;
5138 if (!utf8) c = *eptr; else
5139 {
5140 BACKCHAR(eptr);
5141 GETCHARLEN(c, eptr, len);
5142 }
5143 prop_category = UCD_CATEGORY(c);
5144 if (prop_category != ucp_M) break;
5145 eptr--;
5146 }
5147 }
5148 }
5149
5150 else
5151 #endif /* SUPPORT_UCP */
5152
5153 #ifdef SUPPORT_UTF8
5154 /* UTF-8 mode */
5155
5156 if (utf8)
5157 {
5158 switch(ctype)
5159 {
5160 case OP_ANY:
5161 if (max < INT_MAX)
5162 {
5163 for (i = min; i < max; i++)
5164 {
5165 if (eptr >= md->end_subject)
5166 {
5167 SCHECK_PARTIAL();
5168 break;
5169 }
5170 if (IS_NEWLINE(eptr)) break;
5171 eptr++;
5172 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5173 }
5174 }
5175
5176 /* Handle unlimited UTF-8 repeat */
5177
5178 else
5179 {
5180 for (i = min; i < max; i++)
5181 {
5182 if (eptr >= md->end_subject)
5183 {
5184 SCHECK_PARTIAL();
5185 break;
5186 }
5187 if (IS_NEWLINE(eptr)) break;
5188 eptr++;
5189 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5190 }
5191 }
5192 break;
5193
5194 case OP_ALLANY:
5195 if (max < INT_MAX)
5196 {
5197 for (i = min; i < max; i++)
5198 {
5199 if (eptr >= md->end_subject)
5200 {
5201 SCHECK_PARTIAL();
5202 break;
5203 }
5204 eptr++;
5205 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5206 }
5207 }
5208 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5209 break;
5210
5211 /* The byte case is the same as non-UTF8 */
5212
5213 case OP_ANYBYTE:
5214 c = max - min;
5215 if (c > (unsigned int)(md->end_subject - eptr))
5216 {
5217 eptr = md->end_subject;
5218 SCHECK_PARTIAL();
5219 }
5220 else eptr += c;
5221 break;
5222
5223 case OP_ANYNL:
5224 for (i = min; i < max; i++)
5225 {
5226 int len = 1;
5227 if (eptr >= md->end_subject)
5228 {
5229 SCHECK_PARTIAL();
5230 break;
5231 }
5232 GETCHARLEN(c, eptr, len);
5233 if (c == 0x000d)
5234 {
5235 if (++eptr >= md->end_subject) break;
5236 if (*eptr == 0x000a) eptr++;
5237 }
5238 else
5239 {
5240 if (c != 0x000a &&
5241 (md->bsr_anycrlf ||
5242 (c != 0x000b && c != 0x000c &&
5243 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5244 break;
5245 eptr += len;
5246 }
5247 }
5248 break;
5249
5250 case OP_NOT_HSPACE:
5251 case OP_HSPACE:
5252 for (i = min; i < max; i++)
5253 {
5254 BOOL gotspace;
5255 int len = 1;
5256 if (eptr >= md->end_subject)
5257 {
5258 SCHECK_PARTIAL();
5259 break;
5260 }
5261 GETCHARLEN(c, eptr, len);
5262 switch(c)
5263 {
5264 default: gotspace = FALSE; break;
5265 case 0x09: /* HT */
5266 case 0x20: /* SPACE */
5267 case 0xa0: /* NBSP */
5268 case 0x1680: /* OGHAM SPACE MARK */
5269 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5270 case 0x2000: /* EN QUAD */
5271 case 0x2001: /* EM QUAD */
5272 case 0x2002: /* EN SPACE */
5273 case 0x2003: /* EM SPACE */
5274 case 0x2004: /* THREE-PER-EM SPACE */
5275 case 0x2005: /* FOUR-PER-EM SPACE */
5276 case 0x2006: /* SIX-PER-EM SPACE */
5277 case 0x2007: /* FIGURE SPACE */
5278 case 0x2008: /* PUNCTUATION SPACE */
5279 case 0x2009: /* THIN SPACE */
5280 case 0x200A: /* HAIR SPACE */
5281 case 0x202f: /* NARROW NO-BREAK SPACE */
5282 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5283 case 0x3000: /* IDEOGRAPHIC SPACE */
5284 gotspace = TRUE;
5285 break;
5286 }
5287 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5288 eptr += len;
5289 }
5290 break;
5291
5292 case OP_NOT_VSPACE:
5293 case OP_VSPACE:
5294 for (i = min; i < max; i++)
5295 {
5296 BOOL gotspace;
5297 int len = 1;
5298 if (eptr >= md->end_subject)
5299 {
5300 SCHECK_PARTIAL();
5301 break;
5302 }
5303 GETCHARLEN(c, eptr, len);
5304 switch(c)
5305 {
5306 default: gotspace = FALSE; break;
5307 case 0x0a: /* LF */
5308 case 0x0b: /* VT */
5309 case 0x0c: /* FF */
5310 case 0x0d: /* CR */
5311 case 0x85: /* NEL */
5312 case 0x2028: /* LINE SEPARATOR */
5313 case 0x2029: /* PARAGRAPH SEPARATOR */
5314 gotspace = TRUE;
5315 break;
5316 }
5317 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5318 eptr += len;
5319 }
5320 break;
5321
5322 case OP_NOT_DIGIT:
5323 for (i = min; i < max; i++)
5324 {
5325 int len = 1;
5326 if (eptr >= md->end_subject)
5327 {
5328 SCHECK_PARTIAL();
5329 break;
5330 }
5331 GETCHARLEN(c, eptr, len);
5332 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5333 eptr+= len;
5334 }
5335 break;
5336
5337 case OP_DIGIT:
5338 for (i = min; i < max; i++)
5339 {
5340 int len = 1;
5341 if (eptr >= md->end_subject)
5342 {
5343 SCHECK_PARTIAL();
5344 break;
5345 }
5346 GETCHARLEN(c, eptr, len);
5347 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5348 eptr+= len;
5349 }
5350 break;
5351
5352 case OP_NOT_WHITESPACE:
5353 for (i = min; i < max; i++)
5354 {
5355 int len = 1;
5356 if (eptr >= md->end_subject)
5357 {
5358 SCHECK_PARTIAL();
5359 break;
5360 }
5361 GETCHARLEN(c, eptr, len);
5362 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5363 eptr+= len;
5364 }
5365 break;
5366
5367 case OP_WHITESPACE:
5368 for (i = min; i < max; i++)
5369 {
5370 int len = 1;
5371 if (eptr >= md->end_subject)
5372 {
5373 SCHECK_PARTIAL();
5374 break;
5375 }
5376 GETCHARLEN(c, eptr, len);
5377 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5378 eptr+= len;
5379 }
5380 break;
5381
5382 case OP_NOT_WORDCHAR:
5383 for (i = min; i < max; i++)
5384 {
5385 int len = 1;
5386 if (eptr >= md->end_subject)
5387 {
5388 SCHECK_PARTIAL();
5389 break;
5390 }
5391 GETCHARLEN(c, eptr, len);
5392 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5393 eptr+= len;
5394 }
5395 break;
5396
5397 case OP_WORDCHAR:
5398 for (i = min; i < max; i++)
5399 {
5400 int len = 1;
5401 if (eptr >= md->end_subject)
5402 {
5403 SCHECK_PARTIAL();
5404 break;
5405 }
5406 GETCHARLEN(c, eptr, len);
5407 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5408 eptr+= len;
5409 }
5410 break;
5411
5412 default:
5413 RRETURN(PCRE_ERROR_INTERNAL);
5414 }
5415
5416 /* eptr is now past the end of the maximum run. If possessive, we are
5417 done (no backing up). Otherwise, match at this position; anything other
5418 than no match is immediately returned. For nomatch, back up one
5419 character, unless we are matching \R and the last thing matched was
5420 \r\n, in which case, back up two bytes. */
5421
5422 if (possessive) continue;
5423 for(;;)
5424 {
5425 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5427 if (eptr-- == pp) break; /* Stop if tried at original pos */
5428 BACKCHAR(eptr);
5429 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5430 eptr[-1] == '\r') eptr--;
5431 }
5432 }
5433 else
5434 #endif /* SUPPORT_UTF8 */
5435
5436 /* Not UTF-8 mode */
5437 {
5438 switch(ctype)
5439 {
5440 case OP_ANY:
5441 for (i = min; i < max; i++)
5442 {
5443 if (eptr >= md->end_subject)
5444 {
5445 SCHECK_PARTIAL();
5446 break;
5447 }
5448 if (IS_NEWLINE(eptr)) break;
5449 eptr++;
5450 }
5451 break;
5452
5453 case OP_ALLANY:
5454 case OP_ANYBYTE:
5455 c = max - min;
5456 if (c > (unsigned int)(md->end_subject - eptr))
5457 {
5458 eptr = md->end_subject;
5459 SCHECK_PARTIAL();
5460 }
5461 else eptr += c;
5462 break;
5463
5464 case OP_ANYNL:
5465 for (i = min; i < max; i++)
5466 {
5467 if (eptr >= md->end_subject)
5468 {
5469 SCHECK_PARTIAL();
5470 break;
5471 }
5472 c = *eptr;
5473 if (c == 0x000d)
5474 {
5475 if (++eptr >= md->end_subject) break;
5476 if (*eptr == 0x000a) eptr++;
5477 }
5478 else
5479 {
5480 if (c != 0x000a &&
5481 (md->bsr_anycrlf ||
5482 (c != 0x000b && c != 0x000c && c != 0x0085)))
5483 break;
5484 eptr++;
5485 }
5486 }
5487 break;
5488
5489 case OP_NOT_HSPACE:
5490 for (i = min; i < max; i++)
5491 {
5492 if (eptr >= md->end_subject)
5493 {
5494 SCHECK_PARTIAL();
5495 break;
5496 }
5497 c = *eptr;
5498 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5499 eptr++;
5500 }
5501 break;
5502
5503 case OP_HSPACE:
5504 for (i = min; i < max; i++)
5505 {
5506 if (eptr >= md->end_subject)
5507 {
5508 SCHECK_PARTIAL();
5509 break;
5510 }
5511 c = *eptr;
5512 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5513 eptr++;
5514 }
5515 break;
5516
5517 case OP_NOT_VSPACE:
5518 for (i = min; i < max; i++)
5519 {
5520 if (eptr >= md->end_subject)
5521 {
5522 SCHECK_PARTIAL();
5523 break;
5524 }
5525 c = *eptr;
5526 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5527 break;
5528 eptr++;
5529 }
5530 break;
5531
5532 case OP_VSPACE:
5533 for (i = min; i < max; i++)
5534 {
5535 if (eptr >= md->end_subject)
5536 {
5537 SCHECK_PARTIAL();
5538 break;
5539 }
5540 c = *eptr;
5541 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5542 break;
5543 eptr++;
5544 }
5545 break;
5546
5547 case OP_NOT_DIGIT:
5548 for (i = min; i < max; i++)
5549 {
5550 if (eptr >= md->end_subject)
5551 {
5552 SCHECK_PARTIAL();
5553 break;
5554 }
5555 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5556 eptr++;
5557 }
5558 break;
5559
5560 case OP_DIGIT:
5561 for (i = min; i < max; i++)
5562 {
5563 if (eptr >= md->end_subject)
5564 {
5565 SCHECK_PARTIAL();
5566 break;
5567 }
5568 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5569 eptr++;
5570 }
5571 break;
5572
5573 case OP_NOT_WHITESPACE:
5574 for (i = min; i < max; i++)
5575 {
5576 if (eptr >= md->end_subject)
5577 {
5578 SCHECK_PARTIAL();
5579 break;
5580 }
5581 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5582 eptr++;
5583 }
5584 break;
5585
5586 case OP_WHITESPACE:
5587 for (i = min; i < max; i++)
5588 {
5589 if (eptr >= md->end_subject)
5590 {
5591 SCHECK_PARTIAL();
5592 break;
5593 }
5594 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5595 eptr++;
5596 }
5597 break;
5598
5599 case OP_NOT_WORDCHAR:
5600 for (i = min; i < max; i++)
5601 {
5602 if (eptr >= md->end_subject)
5603 {
5604 SCHECK_PARTIAL();
5605 break;
5606 }
5607 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5608 eptr++;
5609 }
5610 break;
5611
5612 case OP_WORDCHAR:
5613 for (i = min; i < max; i++)
5614 {
5615 if (eptr >= md->end_subject)
5616 {
5617 SCHECK_PARTIAL();
5618 break;
5619 }
5620 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5621 eptr++;
5622 }
5623 break;
5624
5625 default:
5626 RRETURN(PCRE_ERROR_INTERNAL);
5627 }
5628
5629 /* eptr is now past the end of the maximum run. If possessive, we are
5630 done (no backing up). Otherwise, match at this position; anything other
5631 than no match is immediately returned. For nomatch, back up one
5632 character (byte), unless we are matching \R and the last thing matched
5633 was \r\n, in which case, back up two bytes. */
5634
5635 if (possessive) continue;
5636 while (eptr >= pp)
5637 {
5638 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5639 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5640 eptr--;
5641 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5642 eptr[-1] == '\r') eptr--;
5643 }
5644 }
5645
5646 /* Get here if we can't make it match with any permitted repetitions */
5647
5648 MRRETURN(MATCH_NOMATCH);
5649 }
5650 /* Control never gets here */
5651
5652 /* There's been some horrible disaster. Arrival here can only mean there is
5653 something seriously wrong in the code above or the OP_xxx definitions. */
5654
5655 default:
5656 DPRINTF(("Unknown opcode %d\n", *ecode));
5657 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5658 }
5659
5660 /* Do not stick any code in here without much thought; it is assumed
5661 that "continue" in the code above comes out to here to repeat the main
5662 loop. */
5663
5664 } /* End of main loop */
5665 /* Control never reaches here */
5666
5667
5668 /* When compiling to use the heap rather than the stack for recursive calls to
5669 match(), the RRETURN() macro jumps here. The number that is saved in
5670 frame->Xwhere indicates which label we actually want to return to. */
5671
5672 #ifdef NO_RECURSE
5673 #define LBL(val) case val: goto L_RM##val;
5674 HEAP_RETURN:
5675 switch (frame->Xwhere)
5676 {
5677 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5678 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5679 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5680 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5681 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5682 LBL(65) LBL(66)
5683 #ifdef SUPPORT_UTF8
5684 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5685 LBL(32) LBL(34) LBL(42) LBL(46)
5686 #ifdef SUPPORT_UCP
5687 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5688 LBL(59) LBL(60) LBL(61) LBL(62)
5689 #endif /* SUPPORT_UCP */
5690 #endif /* SUPPORT_UTF8 */
5691 default:
5692 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5693 return PCRE_ERROR_INTERNAL;
5694 }
5695 #undef LBL
5696 #endif /* NO_RECURSE */
5697 }
5698
5699
5700 /***************************************************************************
5701 ****************************************************************************
5702 RECURSION IN THE match() FUNCTION
5703
5704 Undefine all the macros that were defined above to handle this. */
5705
5706 #ifdef NO_RECURSE /* Build that uses the heap rather than the stack for recursive calls to match(). */
5707 #undef eptr /* This first group includes the values passed in the RMATCH() calls above (eptr, ecode, offset_top, eptrb). */
5708 #undef ecode
5709 #undef mstart
5710 #undef offset_top
5711 #undef eptrb
5712 #undef flags
5713 /* The names below include working variables used in the code above (pp, ctype, max, min). NOTE(review): the #defines are outside this view; presumably each maps to a heap-frame field in the style of frame->Xwhere -- confirm. */
5714 #undef callpat
5715 #undef charptr
5716 #undef data
5717 #undef next
5718 #undef pp
5719 #undef prev
5720 #undef saved_eptr
5721
5722 #undef new_recursive
5723
5724 #undef cur_is_word
5725 #undef condition
5726 #undef prev_is_word
5727
5728 #undef ctype
5729 #undef length
5730 #undef max
5731 #undef min
5732 #undef number
5733 #undef offset
5734 #undef op
5735 #undef save_capture_last
5736 #undef save_offset1
5737 #undef save_offset2
5738 #undef save_offset3
5739 #undef stacksave
5740
5741 #undef newptrb
5742
5743 #endif /* NO_RECURSE */
5744
5745 /* These two are defined as macros in both cases (with and without NO_RECURSE), so they are undefined outside the conditional above. */
5746
5747 #undef fc
5748 #undef fi /* fi is the counter of the minimizing "for (fi = min;; fi++)" loops above. */
5749
5750 /***************************************************************************
5751 ***************************************************************************/
5752
5753
5754
5755 /*************************************************
5756 * Execute a Regular Expression *
5757 *************************************************/
5758
5759 /* This function applies a compiled re to a subject string and picks out
5760 portions of the string if it matches. Two elements in the vector are set for
5761 each substring: the offsets to the start and end of the substring.
5762
5763 Arguments:
5764 argument_re points to the compiled expression
5765 extra_data points to extra data or is NULL
5766 subject points to the subject string
5767 length length of subject string (may contain binary zeros)
5768 start_offset where to start in the subject string
5769 options option bits
5770 offsets points to a vector of ints to be filled in with offsets
5771 offsetcount the number of elements in the vector
5772
5773 Returns: > 0 => success; value is one more than the highest numbered offset pair that was set
5774 = 0 => success, but offsets is not big enough
5775 -1 => failed to match
5776 < -1 => some kind of unexpected problem
5777 */
5778
5779 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5780 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5781 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5782 int offsetcount)
5783 {
5784 int rc, ocount;
5785 int first_byte = -1;
5786 int req_byte = -1;
5787 int req_byte2 = -1;
5788 int newline;
5789 BOOL using_temporary_offsets = FALSE;
5790 BOOL anchored;
5791 BOOL startline;
5792 BOOL firstline;
5793 BOOL first_byte_caseless = FALSE;
5794 BOOL req_byte_caseless = FALSE;
5795 BOOL utf8;
5796 match_data match_block;
5797 match_data *md = &match_block;
5798 const uschar *tables;
5799 const uschar *start_bits = NULL;
5800 USPTR start_match = (USPTR)subject + start_offset;
5801 USPTR end_subject;
5802 USPTR start_partial = NULL;
5803 USPTR req_byte_ptr = start_match - 1;
5804
5805 pcre_study_data internal_study;
5806 const pcre_study_data *study;
5807
5808 real_pcre internal_re;
5809 const real_pcre *external_re = (const real_pcre *)argument_re;
5810 const real_pcre *re = external_re;
5811
5812 /* Plausibility checks */
5813
5814 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5815 if (re == NULL || subject == NULL ||
5816 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5817 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5818 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5819
5820 /* This information is for finding all the numbers associated with a given
5821 name, for condition testing. */
5822
5823 md->name_table = (uschar *)re + re->name_table_offset;
5824 md->name_count = re->name_count;
5825 md->name_entry_size = re->name_entry_size;
5826
5827 /* Fish out the optional data from the extra_data structure, first setting
5828 the default values. */
5829
5830 study = NULL;
5831 md->match_limit = MATCH_LIMIT;
5832 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5833 md->callout_data = NULL;
5834
5835 /* The table pointer is always in native byte order. */
5836
5837 tables = external_re->tables;
5838
5839 if (extra_data != NULL)
5840 {
5841 register unsigned int flags = extra_data->flags;
5842 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5843 study = (const pcre_study_data *)extra_data->study_data;
5844 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5845 md->match_limit = extra_data->match_limit;
5846 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5847 md->match_limit_recursion = extra_data->match_limit_recursion;
5848 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5849 md->callout_data = extra_data->callout_data;
5850 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5851 }
5852
5853 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5854 is a feature that makes it possible to save compiled regex and re-use them
5855 in other programs later. */
5856
5857 if (tables == NULL) tables = _pcre_default_tables;
5858
5859 /* Check that the first field in the block is the magic number. If it is not,
5860 test for a regex that was compiled on a host of opposite endianness. If this is
5861 the case, flipped values are put in internal_re and internal_study if there was
5862 study data too. */
5863
5864 if (re->magic_number != MAGIC_NUMBER)
5865 {
5866 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5867 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5868 if (study != NULL) study = &internal_study;
5869 }
5870
5871 /* Set up other data */
5872
5873 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5874 startline = (re->flags & PCRE_STARTLINE) != 0;
5875 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5876
5877 /* The code starts after the real_pcre block and the capture name table. */
5878
5879 md->start_code = (const uschar *)external_re + re->name_table_offset +
5880 re->name_count * re->name_entry_size;
5881
5882 md->start_subject = (USPTR)subject;
5883 md->start_offset = start_offset;
5884 md->end_subject = md->start_subject + length;
5885 end_subject = md->end_subject;
5886
5887 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5888 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5889 md->use_ucp = (re->options & PCRE_UCP) != 0;
5890 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5891
5892 /* Some options are unpacked into BOOL variables in the hope that testing
5893 them will be faster than individual option bits. */
5894
5895 md->notbol = (options & PCRE_NOTBOL) != 0;
5896 md->noteol = (options & PCRE_NOTEOL) != 0;
5897 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5898 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5899 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5900 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5901
5902
5903 md->hitend = FALSE;
5904 md->mark = NULL; /* In case never set */
5905
5906 md->recursive = NULL; /* No recursion at top level */
5907
5908 md->lcc = tables + lcc_offset;
5909 md->ctypes = tables + ctypes_offset;
5910
5911 /* Handle different \R options. */
5912
5913 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5914 {
5915 case 0:
5916 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5917 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5918 else
5919 #ifdef BSR_ANYCRLF
5920 md->bsr_anycrlf = TRUE;
5921 #else
5922 md->bsr_anycrlf = FALSE;
5923 #endif
5924 break;
5925
5926 case PCRE_BSR_ANYCRLF:
5927 md->bsr_anycrlf = TRUE;
5928 break;
5929
5930 case PCRE_BSR_UNICODE:
5931 md->bsr_anycrlf = FALSE;
5932 break;
5933
5934 default: return PCRE_ERROR_BADNEWLINE;
5935 }
5936
5937 /* Handle different types of newline. The three bits give eight cases. If
5938 nothing is set at run time, whatever was used at compile time applies. */
5939
5940 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5941 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5942 {
5943 case 0: newline = NEWLINE; break; /* Compile-time default */
5944 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5945 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5946 case PCRE_NEWLINE_CR+
5947 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5948 case PCRE_NEWLINE_ANY: newline = -1; break;
5949 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5950 default: return PCRE_ERROR_BADNEWLINE;
5951 }
5952
5953 if (newline == -2)
5954 {
5955 md->nltype = NLTYPE_ANYCRLF;
5956 }
5957 else if (newline < 0)
5958 {
5959 md->nltype = NLTYPE_ANY;
5960 }
5961 else
5962 {
5963 md->nltype = NLTYPE_FIXED;
5964 if (newline > 255)
5965 {
5966 md->nllen = 2;
5967 md->nl[0] = (newline >> 8) & 255;
5968 md->nl[1] = newline & 255;
5969 }
5970 else
5971 {
5972 md->nllen = 1;
5973 md->nl[0] = newline;
5974 }
5975 }
5976
5977 /* Partial matching was originally supported only for a restricted set of
5978 regexes; from release 8.00 there are no restrictions, but the bits are still
5979 defined (though never set). So there's no harm in leaving this code. */
5980
5981 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5982 return PCRE_ERROR_BADPARTIAL;
5983
5984 /* Check a UTF-8 string if required. Pass back the character offset and error
5985 code for an invalid string if a results vector is available. */
5986
5987 #ifdef SUPPORT_UTF8
5988 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5989 {
5990 int erroroffset;
5991 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5992 if (errorcode != 0)
5993 {
5994 if (offsetcount >= 2)
5995 {
5996 offsets[0] = erroroffset;
5997 offsets[1] = errorcode;
5998 }
5999 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6000 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6001 }
6002
6003 /* Check that a start_offset points to the start of a UTF-8 character. */
6004
6005 if (start_offset > 0 && start_offset < length &&
6006 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
6007 return PCRE_ERROR_BADUTF8_OFFSET;
6008 }
6009 #endif
6010
6011 /* If the expression has got more back references than the offsets supplied can
6012 hold, we get a temporary chunk of working store to use during the matching.
6013 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6014 of 3. */
6015
6016 ocount = offsetcount - (offsetcount % 3);
6017
6018 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6019 {
6020 ocount = re->top_backref * 3 + 3;
6021 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6022 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6023 using_temporary_offsets = TRUE;
6024 DPRINTF(("Got memory to hold back references\n"));
6025 }
6026 else md->offset_vector = offsets;
6027
6028 md->offset_end = ocount;
6029 md->offset_max = (2*ocount)/3;
6030 md->offset_overflow = FALSE;
6031 md->capture_last = -1;
6032
6033 /* Reset the working variable associated with each extraction. These should
6034 never be used unless previously set, but they get saved and restored, and so we
6035 initialize them to avoid reading uninitialized locations. Also, unset the
6036 offsets for the matched string. This is really just for tidiness with callouts,
6037 in case they inspect these fields. */
6038
6039 if (md->offset_vector != NULL)
6040 {
6041 register int *iptr = md->offset_vector + ocount;
6042 register int *iend = iptr - re->top_bracket;
6043 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6044 while (--iptr >= iend) *iptr = -1;
6045 md->offset_vector[0] = md->offset_vector[1] = -1;
6046 }
6047
6048 /* Set up the first character to match, if available. The first_byte value is
6049 never set for an anchored regular expression, but the anchoring may be forced
6050 at run time, so we have to test for anchoring. The first char may be unset for
6051 an unanchored pattern, of course. If there's no first char and the pattern was
6052 studied, there may be a bitmap of possible first characters. */
6053
6054 if (!anchored)
6055 {
6056 if ((re->flags & PCRE_FIRSTSET) != 0)
6057 {
6058 first_byte = re->first_byte & 255;
6059 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6060 first_byte = md->lcc[first_byte];
6061 }
6062 else
6063 if (!startline && study != NULL &&
6064 (study->flags & PCRE_STUDY_MAPPED) != 0)
6065 start_bits = study->start_bits;
6066 }
6067
6068 /* For anchored or unanchored matches, there may be a "last known required
6069 character" set. */
6070
6071 if ((re->flags & PCRE_REQCHSET) != 0)
6072 {
6073 req_byte = re->req_byte & 255;
6074 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6075 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6076 }
6077
6078
6079
6080
6081 /* ==========================================================================*/
6082
6083 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6084 the loop runs just once. */
6085
6086 for(;;)
6087 {
6088 USPTR save_end_subject = end_subject;
6089 USPTR new_start_match;
6090
6091 /* If firstline is TRUE, the start of the match is constrained to the first
6092 line of a multiline string. That is, the match must be before or at the first
6093 newline. Implement this by temporarily adjusting end_subject so that we stop
6094 scanning at a newline. If the match fails at the newline, later code breaks
6095 this loop. */
6096
6097 if (firstline)
6098 {
6099 USPTR t = start_match;
6100 #ifdef SUPPORT_UTF8
6101 if (utf8)
6102 {
6103 while (t < md->end_subject && !IS_NEWLINE(t))
6104 {
6105 t++;
6106 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6107 }
6108 }
6109 else
6110 #endif
6111 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6112 end_subject = t;
6113 }
6114
6115 /* There are some optimizations that avoid running the match if a known
6116 starting point is not found, or if a known later character is not present.
6117 However, there is an option that disables these, for testing and for ensuring
6118 that all callouts do actually occur. The option can be set in the regex by
6119 (*NO_START_OPT) or passed in match-time options. */
6120
6121 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6122 {
6123 /* Advance to a unique first byte if there is one. */
6124
6125 if (first_byte >= 0)
6126 {
6127 if (first_byte_caseless)
6128 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6129 start_match++;
6130 else
6131 while (start_match < end_subject && *start_match != first_byte)
6132 start_match++;
6133 }
6134
6135 /* Or to just after a linebreak for a multiline match */
6136
6137 else if (startline)
6138 {
6139 if (start_match > md->start_subject + start_offset)
6140 {
6141 #ifdef SUPPORT_UTF8
6142 if (utf8)
6143 {
6144 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6145 {
6146 start_match++;
6147 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6148 start_match++;
6149 }
6150 }
6151 else
6152 #endif
6153 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6154 start_match++;
6155
6156 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6157 and we are now at a LF, advance the match position by one more character.
6158 */
6159
6160 if (start_match[-1] == CHAR_CR &&
6161 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6162 start_match < end_subject &&
6163 *start_match == CHAR_NL)
6164 start_match++;
6165 }
6166 }
6167
6168 /* Or to a non-unique first byte after study */
6169
6170 else if (start_bits != NULL)
6171 {
6172 while (start_match < end_subject)
6173 {
6174 register unsigned int c = *start_match;
6175 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6176 {
6177 start_match++;
6178 #ifdef SUPPORT_UTF8
6179 if (utf8)
6180 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6181 start_match++;
6182 #endif
6183 }
6184 else break;
6185 }
6186 }
6187 } /* Starting optimizations */
6188
6189 /* Restore fudged end_subject */
6190
6191 end_subject = save_end_subject;
6192
6193 /* The following two optimizations are disabled for partial matching or if
6194 disabling is explicitly requested. */
6195
6196 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6197 {
6198 /* If the pattern was studied, a minimum subject length may be set. This is
6199 a lower bound; no actual string of that length may actually match the
6200 pattern. Although the value is, strictly, in characters, we treat it as
6201 bytes to avoid spending too much time in this optimization. */
6202
6203 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6204 (pcre_uint32)(end_subject - start_match) < study->minlength)
6205 {
6206 rc = MATCH_NOMATCH;
6207 break;
6208 }
6209
6210 /* If req_byte is set, we know that that character must appear in the
6211 subject for the match to succeed. If the first character is set, req_byte
6212 must be later in the subject; otherwise the test starts at the match point.
6213 This optimization can save a huge amount of backtracking in patterns with
6214 nested unlimited repeats that aren't going to match. Writing separate code
6215 for cased/caseless versions makes it go faster, as does using an
6216 autoincrement and backing off on a match.
6217
6218 HOWEVER: when the subject string is very, very long, searching to its end
6219 can take a long time, and give bad performance on quite ordinary patterns.
6220 This showed up when somebody was matching something like /^\d+C/ on a
6221 32-megabyte string... so we don't do this when the string is sufficiently
6222 long. */
6223
6224 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6225 {
6226 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6227
6228 /* We don't need to repeat the search if we haven't yet reached the
6229 place we found it at last time. */
6230
6231 if (p > req_byte_ptr)
6232 {
6233 if (req_byte_caseless)
6234 {
6235 while (p < end_subject)
6236 {
6237 register int pp = *p++;
6238 if (pp == req_byte || pp == req_byte2) { p--; break; }
6239 }
6240 }
6241 else
6242 {
6243 while (p < end_subject)
6244 {
6245 if (*p++ == req_byte) { p--; break; }
6246 }
6247 }
6248
6249 /* If we can't find the required character, break the matching loop,
6250 forcing a match failure. */
6251
6252 if (p >= end_subject)
6253 {
6254 rc = MATCH_NOMATCH;
6255 break;
6256 }
6257
6258 /* If we have found the required character, save the point where we
6259 found it, so that we don't search again next time round the loop if
6260 the start hasn't passed this character yet. */
6261
6262 req_byte_ptr = p;
6263 }
6264 }
6265 }
6266
6267 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6268 printf(">>>> Match against: ");
6269 pchars(start_match, end_subject - start_match, TRUE, md);
6270 printf("\n");
6271 #endif
6272
6273 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6274 first starting point for which a partial match was found. */
6275
6276 md->start_match_ptr = start_match;
6277 md->start_used_ptr = start_match;
6278 md->match_call_count = 0;
6279 md->match_function_type = 0;
6280 md->end_offset_top = 0;
6281 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6282 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6283
6284 switch(rc)
6285 {
6286 /* SKIP passes back the next starting point explicitly, but if it is the
6287 same as the match we have just done, treat it as NOMATCH. */
6288
6289 case MATCH_SKIP:
6290 if (md->start_match_ptr != start_match)
6291 {
6292 new_start_match = md->start_match_ptr;
6293 break;
6294 }
6295 /* Fall through */
6296
6297 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6298 the SKIP's arg was not found. We also treat this as NOMATCH. */
6299
6300 case MATCH_SKIP_ARG:
6301 /* Fall through */
6302
6303 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6304 exactly like PRUNE. */
6305
6306 case MATCH_NOMATCH:
6307 case MATCH_PRUNE:
6308 case MATCH_THEN:
6309 new_start_match = start_match + 1;
6310 #ifdef SUPPORT_UTF8
6311 if (utf8)
6312 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6313 new_start_match++;
6314 #endif
6315 break;
6316
6317 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6318
6319 case MATCH_COMMIT:
6320 rc = MATCH_NOMATCH;
6321 goto ENDLOOP;
6322
6323 /* Any other return is either a match, or some kind of error. */
6324
6325 default:
6326 goto ENDLOOP;
6327 }
6328
6329 /* Control reaches here for the various types of "no match at this point"
6330 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6331
6332 rc = MATCH_NOMATCH;
6333
6334 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6335 newline in the subject (though it may continue over the newline). Therefore,
6336 if we have just failed to match, starting at a newline, do not continue. */
6337
6338 if (firstline && IS_NEWLINE(start_match)) break;
6339
6340 /* Advance to new matching position */
6341
6342 start_match = new_start_match;
6343
6344 /* Break the loop if the pattern is anchored or if we have passed the end of
6345 the subject. */
6346
6347 if (anchored || start_match > end_subject) break;
6348
6349 /* If we have just passed a CR and we are now at a LF, and the pattern does
6350 not contain any explicit matches for \r or \n, and the newline option is CRLF
6351 or ANY or ANYCRLF, advance the match position by one more character. */
6352
6353 if (start_match[-1] == CHAR_CR &&
6354 start_match < end_subject &&
6355 *start_match == CHAR_NL &&
6356 (re->flags & PCRE_HASCRORLF) == 0 &&
6357 (md->nltype == NLTYPE_ANY ||
6358 md->nltype == NLTYPE_ANYCRLF ||
6359 md->nllen == 2))
6360 start_match++;
6361
6362 md->mark = NULL; /* Reset for start of next match attempt */
6363 } /* End of for(;;) "bumpalong" loop */
6364
6365 /* ==========================================================================*/
6366
6367 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6368 conditions is true:
6369
6370 (1) The pattern is anchored or the match was failed by (*COMMIT);
6371
6372 (2) We are past the end of the subject;
6373
6374 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6375 this option requests that a match occur at or before the first newline in
6376 the subject.
6377
6378 When we have a match and the offset vector is big enough to deal with any
6379 backreferences, captured substring offsets will already be set up. In the case
6380 where we had to get some local store to hold offsets for backreference
6381 processing, copy those that we can. In this case there need not be overflow if
6382 certain parts of the pattern were not used, even though there are more
6383 capturing parentheses than vector slots. */
6384
6385 ENDLOOP:
6386
6387 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6388 {
6389 if (using_temporary_offsets)
6390 {
6391 if (offsetcount >= 4)
6392 {
6393 memcpy(offsets + 2, md->offset_vector + 2,
6394 (offsetcount - 2) * sizeof(int));
6395 DPRINTF(("Copied offsets from temporary memory\n"));
6396 }
6397 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6398 DPRINTF(("Freeing temporary memory\n"));
6399 (pcre_free)(md->offset_vector);
6400 }
6401
6402 /* Set the return code to the number of captured strings, or 0 if there are
6403 too many to fit into the vector. */
6404
6405 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6406
6407 /* If there is space, set up the whole thing as substring 0. The value of
6408 md->start_match_ptr might be modified if \K was encountered on the successful
6409 matching path. */
6410
6411 if (offsetcount < 2) rc = 0; else
6412 {
6413 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6414 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6415 }
6416
6417 DPRINTF((">>>> returning %d\n", rc));
6418 goto RETURN_MARK;
6419 }
6420
6421 /* Control gets here if there has been an error, or if the overall match
6422 attempt has failed at all permitted starting positions. */
6423
6424 if (using_temporary_offsets)
6425 {
6426 DPRINTF(("Freeing temporary memory\n"));
6427 (pcre_free)(md->offset_vector);
6428 }
6429
6430 /* For anything other than nomatch or partial match, just return the code. */
6431
6432 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6433 {
6434 DPRINTF((">>>> error: returning %d\n", rc));
6435 return rc;
6436 }
6437
6438 /* Handle partial matches - disable any mark data */
6439
6440 if (start_partial != NULL)
6441 {
6442 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6443 md->mark = NULL;
6444 if (offsetcount > 1)
6445 {
6446 offsets[0] = (int)(start_partial - (USPTR)subject);
6447 offsets[1] = (int)(end_subject - (USPTR)subject);
6448 }
6449 rc = PCRE_ERROR_PARTIAL;
6450 }
6451
6452 /* This is the classic nomatch case */
6453
6454 else
6455 {
6456 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6457 rc = PCRE_ERROR_NOMATCH;
6458 }
6459
6460 /* Return the MARK data if it has been requested. */
6461
6462 RETURN_MARK:
6463
6464 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6465 *(extra_data->mark) = (unsigned char *)(md->mark);
6466 return rc;
6467 }
6468
6469 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12