/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 835 - (show annotations) (download)
Wed Dec 28 16:10:09 2011 UTC (2 years, 9 months ago) by ph10
File MIME type: text/plain
File size: 201321 byte(s)
Rolled back trunk to r755 to prepare for merging the 16-bit branch.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* This is a convenience macro for code that occurs many times. */
86
87 #define MRRETURN(ra) \
88 { \
89 md->mark = markptr; \
90 RRETURN(ra); \
91 }
92
93 /* Maximum number of ints of offset to save on the stack for recursive calls.
94 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95 because the offset vector is always a multiple of 3 long. */
96
97 #define REC_STACK_SAVE_MAX 30
98
/* Lower and upper bounds for the common repeats. The two tables are indexed in
parallel; a value of 0 in rep_max means "no upper bound". */

static const char rep_min[] = {
  0, 0,     /* entries 0-1: zero or more */
  1, 1,     /* entries 2-3: one or more */
  0, 0 };   /* entries 4-5: zero or one */

static const char rep_max[] = {
  0, 0,     /* entries 0-1: unlimited */
  0, 0,     /* entries 2-3: unlimited */
  1, 1 };   /* entries 4-5: at most one */
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Print a sequence of chars in printable format, stopping at the end of the
112 subject if the requested.
113
114 Arguments:
115 p points to characters
116 length number to print
117 is_subject TRUE if printing from within md->start_subject
118 md pointer to matching data block, if is_subject is TRUE
119
120 Returns: nothing
121 */
122
123 static void
124 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int c;
127 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128 while (length-- > 0)
129 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 USPTR eptr_start = eptr;
159 register USPTR p = md->start_subject + md->offset_vector[offset];
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF8
185 #ifdef SUPPORT_UCP
186 if (md->utf8)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 USPTR endptr = p + length;
197 while (p < endptr)
198 {
199 int c, d;
200 if (eptr >= md->end_subject) return -1;
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (eptr + length > md->end_subject) return -1;
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (eptr + length > md->end_subject) return -1;
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return eptr - eptr_start;
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
/* Identifying numbers for the individual RMATCH call sites, counting up from
1. In the heap-frame version of RMATCH the number is stored in the frame's
Xwhere field and is also pasted into the name of the L_RMn label that marks the
place to resume. When this list is changed, the code at HEAP_RETURN below must
be updated in sync. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,
  RM7,     RM8,  RM9,  RM10, RM11, RM12,
  RM13,    RM14, RM15, RM16, RM17, RM18,
  RM19,    RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30,
  RM31,    RM32, RM33, RM34, RM35, RM36,
  RM37,    RM38, RM39, RM40, RM41, RM42,
  RM43,    RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54,
  RM55,    RM56, RM57, RM58, RM59, RM60,
  RM61,    RM62, RM63, RM64, RM65, RM66 };
281
/* RMATCH(ra,rb,rc,rd,re,rw) performs a "recursive call" of match() and leaves
the result in rrc; RRETURN(ra) returns from the current level with the value
ra. REGISTER is the storage class used for some of match()'s parameters.

These versions of the macros use the stack, as normal. There are debugging
versions and production versions. Note that the "rw" argument of RMATCH isn't
actually used in this definition. */

#ifndef NO_RECURSE
#define REGISTER register

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra
#endif

#else


/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes. */

#define REGISTER

/* RMATCH gets a new frame from pcre_stack_malloc, records in the current frame
(Xwhere) which call site to come back to, fills in the argument fields of the
new frame, chains it to the current one, and jumps to HEAP_RECURSE. The label
L_<rw> is where execution resumes when the "called" level returns. If no frame
can be obtained, the current level returns PCRE_ERROR_NOMEMORY. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xmarkptr = markptr;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN releases the current frame and steps back to the previous one. If
there is a previous frame, the value is put into rrc and control goes to
HEAP_RETURN; otherwise this was the top level and the value is returned from
match() itself.

match() also uses RRETURN to report failure to obtain its very first frame, at
which point "frame" is NULL. That case is tested for explicitly so that a NULL
pointer is never dereferenced or passed to pcre_stack_free. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  if (oldframe == NULL) return ra;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }


/* Structure for remembering the local variables in a private frame */

typedef struct heapframe {
  struct heapframe *Xprevframe;    /* Frame to go back to; NULL at top level */

  /* Function arguments that may change */

  USPTR Xeptr;
  const uschar *Xecode;
  USPTR Xmstart;
  USPTR Xmarkptr;
  int Xoffset_top;
  eptrblock *Xeptrb;
  unsigned int Xrdepth;            /* One more than in the previous frame */

  /* Function local variables */

  USPTR Xcallpat;
#ifdef SUPPORT_UTF8
  USPTR Xcharptr;
#endif
  USPTR Xdata;
  USPTR Xnext;
  USPTR Xpp;
  USPTR Xprev;
  USPTR Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xprev_is_word;

#ifdef SUPPORT_UCP
  int Xprop_type;
  int Xprop_value;
  int Xprop_fail_result;
  int Xoclength;
  uschar Xocchars[8];
#endif

  int Xcodelink;
  int Xctype;
  unsigned int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  int Xnumber;
  int Xoffset;
  int Xop;
  int Xsave_capture_last;
  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Where to jump back to: the RMn number given to RMATCH */

  int Xwhere;

} heapframe;

#endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
/* These two macros package the partial-matching tests that appear many times
in the code of match(). Both set md->hitend when partial matching is enabled
and the current pointer is past md->start_used_ptr; for hard partial matching
(md->partial > 1) they then return PCRE_ERROR_PARTIAL at once, via MRRETURN.

CHECK_PARTIAL() additionally tests that the pointer has reached the end of the
subject. SCHECK_PARTIAL() omits that test; it is for use where the caller
already knows that the end of the subject has been reached.

Each macro is a single "if" statement that refers to the variables md and eptr
of the enclosing code. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && \
      eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
    }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf8, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 markptr pointer to the most recent MARK name, or NULL
463 offset_top current top pointer
464 md pointer to "static" info for the match
465 eptrb pointer to chain of blocks containing eptr at start of
466 brackets - for testing for empty matches
467 rdepth the recursion depth
468
469 Returns: MATCH_MATCH if matched ) these values are >= 0
470 MATCH_NOMATCH if failed to match )
471 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 (e.g. stopped by repeated call or recursion limit)
474 */
475
476 static int
477 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 unsigned int rdepth)
480 {
481 /* These variables do not need to be preserved over recursion in this function,
482 so they can be ordinary variables in all cases. Mark some of them with
483 "register" because they are used a lot in loops. */
484
485 register int rrc; /* Returns from recursive calls */
486 register int i; /* Used for loops not involving calls to RMATCH() */
487 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489
490 BOOL minimize, possessive; /* Quantifier options */
491 BOOL caseless;
492 int condcode;
493
494 /* When recursion is not being used, all "local" variables that have to be
495 preserved over calls to RMATCH() are part of a "frame" which is obtained from
496 heap storage. Set up the top-level frame here; others are obtained from the
497 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498
499 #ifdef NO_RECURSE
500 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 frame->Xprevframe = NULL; /* Marks the top level */
503
504 /* Copy in the original argument variables */
505
506 frame->Xeptr = eptr;
507 frame->Xecode = ecode;
508 frame->Xmstart = mstart;
509 frame->Xmarkptr = markptr;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is where control jumps back to to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define markptr frame->Xmarkptr
524 #define offset_top frame->Xoffset_top
525 #define eptrb frame->Xeptrb
526 #define rdepth frame->Xrdepth
527
528 /* Ditto for the local variables */
529
530 #ifdef SUPPORT_UTF8
531 #define charptr frame->Xcharptr
532 #endif
533 #define callpat frame->Xcallpat
534 #define codelink frame->Xcodelink
535 #define data frame->Xdata
536 #define next frame->Xnext
537 #define pp frame->Xpp
538 #define prev frame->Xprev
539 #define saved_eptr frame->Xsaved_eptr
540
541 #define new_recursive frame->Xnew_recursive
542
543 #define cur_is_word frame->Xcur_is_word
544 #define condition frame->Xcondition
545 #define prev_is_word frame->Xprev_is_word
546
547 #ifdef SUPPORT_UCP
548 #define prop_type frame->Xprop_type
549 #define prop_value frame->Xprop_value
550 #define prop_fail_result frame->Xprop_fail_result
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580 /* Many of the following variables are used only in small blocks of the code.
581 My normal style of coding would have declared them within each of those blocks.
582 However, in order to accommodate the version of this code that uses an external
583 "stack" implemented on the heap, it is easier to declare them all here, so the
584 declarations can be cut out in a block. The only declarations within blocks
585 below are for variables that do not have to be preserved over a recursive call
586 to RMATCH(). */
587
588 #ifdef SUPPORT_UTF8
589 const uschar *charptr;
590 #endif
591 const uschar *callpat;
592 const uschar *data;
593 const uschar *next;
594 USPTR pp;
595 const uschar *prev;
596 USPTR saved_eptr;
597
598 recursion_info new_recursive;
599
600 BOOL cur_is_word;
601 BOOL condition;
602 BOOL prev_is_word;
603
604 #ifdef SUPPORT_UCP
605 int prop_type;
606 int prop_value;
607 int prop_fail_result;
608 int oclength;
609 uschar occhars[8];
610 #endif
611
612 int codelink;
613 int ctype;
614 int length;
615 int max;
616 int min;
617 int number;
618 int offset;
619 int op;
620 int save_capture_last;
621 int save_offset1, save_offset2, save_offset3;
622 int stacksave[REC_STACK_SAVE_MAX];
623
624 eptrblock newptrb;
625 #endif /* NO_RECURSE */
626
627 /* To save space on the stack and in the heap frame, I have doubled up on some
628 of the local variables that are used only in localised parts of the code, but
629 still need to be preserved over recursive calls of match(). These macros define
630 the alternative names that are used. */
631
632 #define allow_zero cur_is_word
633 #define cbegroup condition
634 #define code_offset codelink
635 #define condassert condition
636 #define matched_once prev_is_word
637
638 /* These statements are here to stop the compiler complaining about unitialized
639 variables. */
640
641 #ifdef SUPPORT_UCP
642 prop_value = 0;
643 prop_fail_result = 0;
644 #endif
645
646
647 /* This label is used for tail recursion, which is used in a few cases even
648 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649 used. Thanks to Ian Taylor for noticing this possibility and sending the
650 original patch. */
651
652 TAIL_RECURSE:
653
654 /* OK, now we can get on with the real code of the function. Recursive calls
655 are specified by the macro RMATCH and RRETURN is used to return. When
656 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 defined). However, RMATCH isn't like a function call because it's quite a
659 complicated macro. It has to be used in one particular way. This shouldn't,
660 however, impact performance when true recursion is being used. */
661
662 #ifdef SUPPORT_UTF8
663 utf8 = md->utf8; /* Local copy of the flag */
664 #else
665 utf8 = FALSE;
666 #endif
667
668 /* First check that we haven't called match() too many times, or that we
669 haven't exceeded the recursive call limit. */
670
671 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673
674 /* At the start of a group with an unlimited repeat that may match an empty
675 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676 done this way to save having to use another function argument, which would take
677 up space on the stack. See also MATCH_CONDASSERT below.
678
679 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680 such remembered pointers, to be checked when we hit the closing ket, in order
681 to break infinite loops that match no characters. When match() is called in
682 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683 NOT be used with tail recursion, because the memory block that is used is on
684 the stack, so a new one may be required for each match(). */
685
686 if (md->match_function_type == MATCH_CBEGROUP)
687 {
688 newptrb.epb_saved_eptr = eptr;
689 newptrb.epb_prev = eptrb;
690 eptrb = &newptrb;
691 md->match_function_type = 0;
692 }
693
694 /* Now start processing the opcodes. */
695
696 for (;;)
697 {
698 minimize = possessive = FALSE;
699 op = *ecode;
700
701 switch(op)
702 {
703 case OP_MARK:
704 markptr = ecode + 2;
705 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 eptrb, RM55);
707
708 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709 argument, and we must check whether that argument matches this MARK's
710 argument. It is passed back in md->start_match_ptr (an overloading of that
711 variable). If it does match, we reset that variable to the current subject
712 position and return MATCH_SKIP. Otherwise, pass back the return code
713 unaltered. */
714
715 if (rrc == MATCH_SKIP_ARG &&
716 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717 {
718 md->start_match_ptr = eptr;
719 RRETURN(MATCH_SKIP);
720 }
721
722 if (md->mark == NULL) md->mark = markptr;
723 RRETURN(rrc);
724
725 case OP_FAIL:
726 MRRETURN(MATCH_NOMATCH);
727
728 /* COMMIT overrides PRUNE, SKIP, and THEN */
729
730 case OP_COMMIT:
731 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 eptrb, RM52);
733 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735 rrc != MATCH_THEN)
736 RRETURN(rrc);
737 MRRETURN(MATCH_COMMIT);
738
739 /* PRUNE overrides THEN */
740
741 case OP_PRUNE:
742 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 eptrb, RM51);
744 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 MRRETURN(MATCH_PRUNE);
746
747 case OP_PRUNE_ARG:
748 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 eptrb, RM56);
750 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 md->mark = ecode + 2;
752 RRETURN(MATCH_PRUNE);
753
754 /* SKIP overrides PRUNE and THEN */
755
756 case OP_SKIP:
757 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 eptrb, RM53);
759 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 RRETURN(rrc);
761 md->start_match_ptr = eptr; /* Pass back current position */
762 MRRETURN(MATCH_SKIP);
763
764 case OP_SKIP_ARG:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 eptrb, RM57);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769
770 /* Pass back the current skip name by overloading md->start_match_ptr and
771 returning the special MATCH_SKIP_ARG return code. This will either be
772 caught by a matching MARK, or get to the top, where it is treated the same
773 as PRUNE. */
774
775 md->start_match_ptr = ecode + 2;
776 RRETURN(MATCH_SKIP_ARG);
777
778 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
779 the branch in which it occurs can be determined. Overload the start of
780 match pointer to do this. */
781
782 case OP_THEN:
783 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
784 eptrb, RM54);
785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 md->start_match_ptr = ecode;
787 MRRETURN(MATCH_THEN);
788
789 case OP_THEN_ARG:
790 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top,
791 md, eptrb, RM58);
792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
793 md->start_match_ptr = ecode;
794 md->mark = ecode + 2;
795 RRETURN(MATCH_THEN);
796
797 /* Handle an atomic group that does not contain any capturing parentheses.
798 This can be handled like an assertion. Prior to 8.13, all atomic groups
799 were handled this way. In 8.13, the code was changed as below for ONCE, so
800 that backups pass through the group and thereby reset captured values.
801 However, this uses a lot more stack, so in 8.20, atomic groups that do not
802 contain any captures generate OP_ONCE_NC, which can be handled in the old,
803 less stack intensive way.
804
805 Check the alternative branches in turn - the matching won't pass the KET
806 for this kind of subpattern. If any one branch matches, we carry on as at
807 the end of a normal bracket, leaving the subject pointer, but resetting
808 the start-of-match value in case it was changed by \K. */
809
810 case OP_ONCE_NC:
811 prev = ecode;
812 saved_eptr = eptr;
813 do
814 {
815 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
816 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
817 {
818 mstart = md->start_match_ptr;
819 markptr = md->mark;
820 break;
821 }
822 if (rrc == MATCH_THEN)
823 {
824 next = ecode + GET(ecode,1);
825 if (md->start_match_ptr < next &&
826 (*ecode == OP_ALT || *next == OP_ALT))
827 rrc = MATCH_NOMATCH;
828 }
829
830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
831 ecode += GET(ecode,1);
832 }
833 while (*ecode == OP_ALT);
834
835 /* If hit the end of the group (which could be repeated), fail */
836
837 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
838
839 /* Continue as from after the group, updating the offsets high water
840 mark, since extracts may have been taken. */
841
842 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
843
844 offset_top = md->end_offset_top;
845 eptr = md->end_match_ptr;
846
847 /* For a non-repeating ket, just continue at this level. This also
848 happens for a repeating ket if no characters were matched in the group.
849 This is the forcible breaking of infinite loops as implemented in Perl
850 5.005. */
851
852 if (*ecode == OP_KET || eptr == saved_eptr)
853 {
854 ecode += 1+LINK_SIZE;
855 break;
856 }
857
858 /* The repeating kets try the rest of the pattern or restart from the
859 preceding bracket, in the appropriate order. The second "call" of match()
860 uses tail recursion, to avoid using another stack frame. */
861
862 if (*ecode == OP_KETRMIN)
863 {
864 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
865 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
866 ecode = prev;
867 goto TAIL_RECURSE;
868 }
869 else /* OP_KETRMAX */
870 {
871 md->match_function_type = MATCH_CBEGROUP;
872 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
873 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
874 ecode += 1 + LINK_SIZE;
875 goto TAIL_RECURSE;
876 }
877 /* Control never gets here */
878
879 /* Handle a capturing bracket, other than those that are possessive with an
880 unlimited repeat. If there is space in the offset vector, save the current
881 subject position in the working slot at the top of the vector. We mustn't
882 change the current values of the data slot, because they may be set from a
883 previous iteration of this group, and be referred to by a reference inside
884 the group. A failure to match might occur after the group has succeeded,
885 if something later on doesn't match. For this reason, we need to restore
886 the working value and also the values of the final offsets, in case they
887 were set by a previous iteration of the same bracket.
888
889 If there isn't enough space in the offset vector, treat this as if it were
890 a non-capturing bracket. Don't worry about setting the flag for the error
891 case here; that is handled in the code for KET. */
892
893 case OP_CBRA:
894 case OP_SCBRA:
895 number = GET2(ecode, 1+LINK_SIZE);
896 offset = number << 1;
897
898 #ifdef PCRE_DEBUG
899 printf("start bracket %d\n", number);
900 printf("subject=");
901 pchars(eptr, 16, TRUE, md);
902 printf("\n");
903 #endif
904
905 if (offset < md->offset_max)
906 {
907 save_offset1 = md->offset_vector[offset];
908 save_offset2 = md->offset_vector[offset+1];
909 save_offset3 = md->offset_vector[md->offset_end - number];
910 save_capture_last = md->capture_last;
911
912 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
913 md->offset_vector[md->offset_end - number] =
914 (int)(eptr - md->start_subject);
915
916 for (;;)
917 {
918 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
919 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
920 eptrb, RM1);
921 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
922
923 /* If we backed up to a THEN, check whether it is within the current
924 branch by comparing the address of the THEN that is passed back with
925 the end of the branch. If it is within the current branch, and the
926 branch is one of two or more alternatives (it either starts or ends
927 with OP_ALT), we have reached the limit of THEN's action, so convert
928 the return code to NOMATCH, which will cause normal backtracking to
929 happen from now on. Otherwise, THEN is passed back to an outer
930 alternative. This implements Perl's treatment of parenthesized groups,
931 where a group not containing | does not affect the current alternative,
932 that is, (X) is NOT the same as (X|(*F)). */
933
934 if (rrc == MATCH_THEN)
935 {
936 next = ecode + GET(ecode,1);
937 if (md->start_match_ptr < next &&
938 (*ecode == OP_ALT || *next == OP_ALT))
939 rrc = MATCH_NOMATCH;
940 }
941
942 /* Anything other than NOMATCH is passed back. */
943
944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
945 md->capture_last = save_capture_last;
946 ecode += GET(ecode, 1);
947 if (*ecode != OP_ALT) break;
948 }
949
950 DPRINTF(("bracket %d failed\n", number));
951 md->offset_vector[offset] = save_offset1;
952 md->offset_vector[offset+1] = save_offset2;
953 md->offset_vector[md->offset_end - number] = save_offset3;
954
955 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
956
957 if (md->mark == NULL) md->mark = markptr;
958 RRETURN(rrc);
959 }
960
961 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
962 as a non-capturing bracket. */
963
964 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
965 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
966
967 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
968
969 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
970 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
971
972 /* Non-capturing or atomic group, except for possessive with unlimited
973 repeat and ONCE group with no captures. Loop for all the alternatives.
974
975 When we get to the final alternative within the brackets, we used to return
976 the result of a recursive call to match() whatever happened so it was
977 possible to reduce stack usage by turning this into a tail recursion,
978 except in the case of a possibly empty group. However, now that there is
979 the possibility of (*THEN) occurring in the final alternative, this
980 optimization is no longer always possible.
981
982 We can optimize if we know there are no (*THEN)s in the pattern; at present
983 this is the best that can be done.
984
985 MATCH_ONCE is returned when the end of an atomic group is successfully
986 reached, but subsequent matching fails. It passes back up the tree (causing
987 captured values to be reset) until the original atomic group level is
988 reached. This is tested by comparing md->once_target with the start of the
989 group. At this point, the return is converted into MATCH_NOMATCH so that
990 previous backup points can be taken. */
991
992 case OP_ONCE:
993 case OP_BRA:
994 case OP_SBRA:
995 DPRINTF(("start non-capturing bracket\n"));
996
997 for (;;)
998 {
999 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1000
1001 /* If this is not a possibly empty group, and there are no (*THEN)s in
1002 the pattern, and this is the final alternative, optimize as described
1003 above. */
1004
1005 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1006 {
1007 ecode += _pcre_OP_lengths[*ecode];
1008 goto TAIL_RECURSE;
1009 }
1010
1011 /* In all other cases, we have to make another call to match(). */
1012
1013 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
1014 RM2);
1015
1016 /* See comment in the code for capturing groups above about handling
1017 THEN. */
1018
1019 if (rrc == MATCH_THEN)
1020 {
1021 next = ecode + GET(ecode,1);
1022 if (md->start_match_ptr < next &&
1023 (*ecode == OP_ALT || *next == OP_ALT))
1024 rrc = MATCH_NOMATCH;
1025 }
1026
1027 if (rrc != MATCH_NOMATCH)
1028 {
1029 if (rrc == MATCH_ONCE)
1030 {
1031 const uschar *scode = ecode;
1032 if (*scode != OP_ONCE) /* If not at start, find it */
1033 {
1034 while (*scode == OP_ALT) scode += GET(scode, 1);
1035 scode -= GET(scode, 1);
1036 }
1037 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1038 }
1039 RRETURN(rrc);
1040 }
1041 ecode += GET(ecode, 1);
1042 if (*ecode != OP_ALT) break;
1043 }
1044
1045 if (md->mark == NULL) md->mark = markptr;
1046 RRETURN(MATCH_NOMATCH);
1047
1048 /* Handle possessive capturing brackets with an unlimited repeat. We come
1049 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1050 handled similarly to the normal case above. However, the matching is
1051 different. The end of these brackets will always be OP_KETRPOS, which
1052 returns MATCH_KETRPOS without going further in the pattern. By this means
1053 we can handle the group by iteration rather than recursion, thereby
1054 reducing the amount of stack needed. */
1055
1056 case OP_CBRAPOS:
1057 case OP_SCBRAPOS:
1058 allow_zero = FALSE;
1059
1060 POSSESSIVE_CAPTURE:
1061 number = GET2(ecode, 1+LINK_SIZE);
1062 offset = number << 1;
1063
1064 #ifdef PCRE_DEBUG
1065 printf("start possessive bracket %d\n", number);
1066 printf("subject=");
1067 pchars(eptr, 16, TRUE, md);
1068 printf("\n");
1069 #endif
1070
1071 if (offset < md->offset_max)
1072 {
1073 matched_once = FALSE;
1074 code_offset = ecode - md->start_code;
1075
1076 save_offset1 = md->offset_vector[offset];
1077 save_offset2 = md->offset_vector[offset+1];
1078 save_offset3 = md->offset_vector[md->offset_end - number];
1079 save_capture_last = md->capture_last;
1080
1081 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1082
1083 /* Each time round the loop, save the current subject position for use
1084 when the group matches. For MATCH_MATCH, the group has matched, so we
1085 restart it with a new subject starting position, remembering that we had
1086 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1087 usual. If we haven't matched any alternatives in any iteration, check to
1088 see if a previous iteration matched. If so, the group has matched;
1089 continue from afterwards. Otherwise it has failed; restore the previous
1090 capture values before returning NOMATCH. */
1091
1092 for (;;)
1093 {
1094 md->offset_vector[md->offset_end - number] =
1095 (int)(eptr - md->start_subject);
1096 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1097 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1098 eptrb, RM63);
1099 if (rrc == MATCH_KETRPOS)
1100 {
1101 offset_top = md->end_offset_top;
1102 eptr = md->end_match_ptr;
1103 ecode = md->start_code + code_offset;
1104 save_capture_last = md->capture_last;
1105 matched_once = TRUE;
1106 continue;
1107 }
1108
1109 /* See comment in the code for capturing groups above about handling
1110 THEN. */
1111
1112 if (rrc == MATCH_THEN)
1113 {
1114 next = ecode + GET(ecode,1);
1115 if (md->start_match_ptr < next &&
1116 (*ecode == OP_ALT || *next == OP_ALT))
1117 rrc = MATCH_NOMATCH;
1118 }
1119
1120 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1121 md->capture_last = save_capture_last;
1122 ecode += GET(ecode, 1);
1123 if (*ecode != OP_ALT) break;
1124 }
1125
1126 if (!matched_once)
1127 {
1128 md->offset_vector[offset] = save_offset1;
1129 md->offset_vector[offset+1] = save_offset2;
1130 md->offset_vector[md->offset_end - number] = save_offset3;
1131 }
1132
1133 if (md->mark == NULL) md->mark = markptr;
1134 if (allow_zero || matched_once)
1135 {
1136 ecode += 1 + LINK_SIZE;
1137 break;
1138 }
1139
1140 RRETURN(MATCH_NOMATCH);
1141 }
1142
1143 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1144 as a non-capturing bracket. */
1145
1146 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1147 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1148
1149 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1150
1151 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1152 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1153
1154 /* Non-capturing possessive bracket with unlimited repeat. We come here
1155 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1156 without the capturing complication. It is written out separately for speed
1157 and cleanliness. */
1158
1159 case OP_BRAPOS:
1160 case OP_SBRAPOS:
1161 allow_zero = FALSE;
1162
1163 POSSESSIVE_NON_CAPTURE:
1164 matched_once = FALSE;
1165 code_offset = ecode - md->start_code;
1166
1167 for (;;)
1168 {
1169 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1170 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1171 eptrb, RM48);
1172 if (rrc == MATCH_KETRPOS)
1173 {
1174 offset_top = md->end_offset_top;
1175 eptr = md->end_match_ptr;
1176 ecode = md->start_code + code_offset;
1177 matched_once = TRUE;
1178 continue;
1179 }
1180
1181 /* See comment in the code for capturing groups above about handling
1182 THEN. */
1183
1184 if (rrc == MATCH_THEN)
1185 {
1186 next = ecode + GET(ecode,1);
1187 if (md->start_match_ptr < next &&
1188 (*ecode == OP_ALT || *next == OP_ALT))
1189 rrc = MATCH_NOMATCH;
1190 }
1191
1192 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1193 ecode += GET(ecode, 1);
1194 if (*ecode != OP_ALT) break;
1195 }
1196
1197 if (matched_once || allow_zero)
1198 {
1199 ecode += 1 + LINK_SIZE;
1200 break;
1201 }
1202 RRETURN(MATCH_NOMATCH);
1203
1204 /* Control never reaches here. */
1205
1206 /* Conditional group: compilation checked that there are no more than
1207 two branches. If the condition is false, skipping the first branch takes us
1208 past the end if there is only one branch, but that's OK because that is
1209 exactly what going to the ket would do. */
1210
1211 case OP_COND:
1212 case OP_SCOND:
1213 codelink = GET(ecode, 1);
1214
1215 /* Because of the way auto-callout works during compile, a callout item is
1216 inserted between OP_COND and an assertion condition. */
1217
1218 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1219 {
1220 if (pcre_callout != NULL)
1221 {
1222 pcre_callout_block cb;
1223 cb.version = 2; /* Version 2 of the callout block */
1224 cb.callout_number = ecode[LINK_SIZE+2];
1225 cb.offset_vector = md->offset_vector;
1226 cb.subject = (PCRE_SPTR)md->start_subject;
1227 cb.subject_length = (int)(md->end_subject - md->start_subject);
1228 cb.start_match = (int)(mstart - md->start_subject);
1229 cb.current_position = (int)(eptr - md->start_subject);
1230 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1231 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1232 cb.capture_top = offset_top/2;
1233 cb.capture_last = md->capture_last;
1234 cb.callout_data = md->callout_data;
1235 cb.mark = markptr;
1236 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1237 if (rrc < 0) RRETURN(rrc);
1238 }
1239 ecode += _pcre_OP_lengths[OP_CALLOUT];
1240 }
1241
1242 condcode = ecode[LINK_SIZE+1];
1243
1244 /* Now see what the actual condition is */
1245
1246 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1247 {
1248 if (md->recursive == NULL) /* Not recursing => FALSE */
1249 {
1250 condition = FALSE;
1251 ecode += GET(ecode, 1);
1252 }
1253 else
1254 {
1255 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1256 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1257
1258 /* If the test is for recursion into a specific subpattern, and it is
1259 false, but the test was set up by name, scan the table to see if the
1260 name refers to any other numbers, and test them. The condition is true
1261 if any one is set. */
1262
1263 if (!condition && condcode == OP_NRREF)
1264 {
1265 uschar *slotA = md->name_table;
1266 for (i = 0; i < md->name_count; i++)
1267 {
1268 if (GET2(slotA, 0) == recno) break;
1269 slotA += md->name_entry_size;
1270 }
1271
1272 /* Found a name for the number - there can be only one; duplicate
1273 names for different numbers are allowed, but not vice versa. First
1274 scan down for duplicates. */
1275
1276 if (i < md->name_count)
1277 {
1278 uschar *slotB = slotA;
1279 while (slotB > md->name_table)
1280 {
1281 slotB -= md->name_entry_size;
1282 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1283 {
1284 condition = GET2(slotB, 0) == md->recursive->group_num;
1285 if (condition) break;
1286 }
1287 else break;
1288 }
1289
1290 /* Scan up for duplicates */
1291
1292 if (!condition)
1293 {
1294 slotB = slotA;
1295 for (i++; i < md->name_count; i++)
1296 {
1297 slotB += md->name_entry_size;
1298 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1299 {
1300 condition = GET2(slotB, 0) == md->recursive->group_num;
1301 if (condition) break;
1302 }
1303 else break;
1304 }
1305 }
1306 }
1307 }
1308
1309 /* Choose branch according to the condition */
1310
1311 ecode += condition? 3 : GET(ecode, 1);
1312 }
1313 }
1314
1315 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1316 {
1317 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1318 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1319
1320 /* If the numbered capture is unset, but the reference was by name,
1321 scan the table to see if the name refers to any other numbers, and test
1322 them. The condition is true if any one is set. This is tediously similar
1323 to the code above, but not close enough to try to amalgamate. */
1324
1325 if (!condition && condcode == OP_NCREF)
1326 {
1327 int refno = offset >> 1;
1328 uschar *slotA = md->name_table;
1329
1330 for (i = 0; i < md->name_count; i++)
1331 {
1332 if (GET2(slotA, 0) == refno) break;
1333 slotA += md->name_entry_size;
1334 }
1335
1336 /* Found a name for the number - there can be only one; duplicate names
1337 for different numbers are allowed, but not vice versa. First scan down
1338 for duplicates. */
1339
1340 if (i < md->name_count)
1341 {
1342 uschar *slotB = slotA;
1343 while (slotB > md->name_table)
1344 {
1345 slotB -= md->name_entry_size;
1346 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1347 {
1348 offset = GET2(slotB, 0) << 1;
1349 condition = offset < offset_top &&
1350 md->offset_vector[offset] >= 0;
1351 if (condition) break;
1352 }
1353 else break;
1354 }
1355
1356 /* Scan up for duplicates */
1357
1358 if (!condition)
1359 {
1360 slotB = slotA;
1361 for (i++; i < md->name_count; i++)
1362 {
1363 slotB += md->name_entry_size;
1364 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1365 {
1366 offset = GET2(slotB, 0) << 1;
1367 condition = offset < offset_top &&
1368 md->offset_vector[offset] >= 0;
1369 if (condition) break;
1370 }
1371 else break;
1372 }
1373 }
1374 }
1375 }
1376
1377 /* Choose branch according to the condition */
1378
1379 ecode += condition? 3 : GET(ecode, 1);
1380 }
1381
1382 else if (condcode == OP_DEF) /* DEFINE - always false */
1383 {
1384 condition = FALSE;
1385 ecode += GET(ecode, 1);
1386 }
1387
1388 /* The condition is an assertion. Call match() to evaluate it - setting
1389 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1390 an assertion. */
1391
1392 else
1393 {
1394 md->match_function_type = MATCH_CONDASSERT;
1395 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1396 if (rrc == MATCH_MATCH)
1397 {
1398 if (md->end_offset_top > offset_top)
1399 offset_top = md->end_offset_top; /* Captures may have happened */
1400 condition = TRUE;
1401 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1402 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1403 }
1404
1405 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1406 assertion; it is therefore treated as NOMATCH. */
1407
1408 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1409 {
1410 RRETURN(rrc); /* Need braces because of following else */
1411 }
1412 else
1413 {
1414 condition = FALSE;
1415 ecode += codelink;
1416 }
1417 }
1418
1419 /* We are now at the branch that is to be obeyed. As there is only one, can
1420 use tail recursion to avoid using another stack frame, except when there is
1421 unlimited repeat of a possibly empty group. In the latter case, a recursive
1422 call to match() is always required, unless the second alternative doesn't
1423 exist, in which case we can just plough on. Note that, for compatibility
1424 with Perl, the | in a conditional group is NOT treated as creating two
1425 alternatives. If a THEN is encountered in the branch, it propagates out to
1426 the enclosing alternative (unless nested in a deeper set of alternatives,
1427 of course). */
1428
1429 if (condition || *ecode == OP_ALT)
1430 {
1431 if (op != OP_SCOND)
1432 {
1433 ecode += 1 + LINK_SIZE;
1434 goto TAIL_RECURSE;
1435 }
1436
1437 md->match_function_type = MATCH_CBEGROUP;
1438 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1439 RRETURN(rrc);
1440 }
1441
1442 /* Condition false & no alternative; continue after the group. */
1443
1444 else
1445 {
1446 ecode += 1 + LINK_SIZE;
1447 }
1448 break;
1449
1450
1451 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1452 to close any currently open capturing brackets. */
1453
1454 case OP_CLOSE:
1455 number = GET2(ecode, 1);
1456 offset = number << 1;
1457
1458 #ifdef PCRE_DEBUG
1459 printf("end bracket %d at *ACCEPT", number);
1460 printf("\n");
1461 #endif
1462
1463 md->capture_last = number;
1464 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1465 {
1466 md->offset_vector[offset] =
1467 md->offset_vector[md->offset_end - number];
1468 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1469 if (offset_top <= offset) offset_top = offset + 2;
1470 }
1471 ecode += 3;
1472 break;
1473
1474
1475 /* End of the pattern, either real or forced. */
1476
1477 case OP_END:
1478 case OP_ACCEPT:
1479 case OP_ASSERT_ACCEPT:
1480
1481 /* If we have matched an empty string, fail if not in an assertion and not
1482 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1483 is set and we have matched at the start of the subject. In both cases,
1484 backtracking will then try other alternatives, if any. */
1485
1486 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1487 md->recursive == NULL &&
1488 (md->notempty ||
1489 (md->notempty_atstart &&
1490 mstart == md->start_subject + md->start_offset)))
1491 MRRETURN(MATCH_NOMATCH);
1492
1493 /* Otherwise, we have a match. */
1494
1495 md->end_match_ptr = eptr; /* Record where we ended */
1496 md->end_offset_top = offset_top; /* and how many extracts were taken */
1497 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1498
1499 /* For some reason, the macros don't work properly if an expression is
1500 given as the argument to MRRETURN when the heap is in use. */
1501
1502 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1503 MRRETURN(rrc);
1504
1505 /* Assertion brackets. Check the alternative branches in turn - the
1506 matching won't pass the KET for an assertion. If any one branch matches,
1507 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1508 start of each branch to move the current point backwards, so the code at
1509 this level is identical to the lookahead case. When the assertion is part
1510 of a condition, we want to return immediately afterwards. The caller of
1511 this incarnation of the match() function will have set MATCH_CONDASSERT in
1512 md->match_function_type, and one of these opcodes will be the first opcode
1513 that is processed. We use a local variable that is preserved over calls to
1514 match() to remember this case. */
1515
1516 case OP_ASSERT:
1517 case OP_ASSERTBACK:
1518 if (md->match_function_type == MATCH_CONDASSERT)
1519 {
1520 condassert = TRUE;
1521 md->match_function_type = 0;
1522 }
1523 else condassert = FALSE;
1524
1525 do
1526 {
1527 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1528 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1529 {
1530 mstart = md->start_match_ptr; /* In case \K reset it */
1531 markptr = md->mark;
1532 break;
1533 }
1534
1535 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1536 as NOMATCH. */
1537
1538 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1539 ecode += GET(ecode, 1);
1540 }
1541 while (*ecode == OP_ALT);
1542
1543 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1544
1545 /* If checking an assertion for a condition, return MATCH_MATCH. */
1546
1547 if (condassert) RRETURN(MATCH_MATCH);
1548
1549 /* Continue from after the assertion, updating the offsets high water
1550 mark, since extracts may have been taken during the assertion. */
1551
1552 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1553 ecode += 1 + LINK_SIZE;
1554 offset_top = md->end_offset_top;
1555 continue;
1556
1557 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1558 PRUNE, or COMMIT means we must assume failure without checking subsequent
1559 branches. */
1560
1561 case OP_ASSERT_NOT:
1562 case OP_ASSERTBACK_NOT:
1563 if (md->match_function_type == MATCH_CONDASSERT)
1564 {
1565 condassert = TRUE;
1566 md->match_function_type = 0;
1567 }
1568 else condassert = FALSE;
1569
1570 do
1571 {
1572 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1573 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1574 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1575 {
1576 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1577 break;
1578 }
1579
1580 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1581 as NOMATCH. */
1582
1583 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1584 ecode += GET(ecode,1);
1585 }
1586 while (*ecode == OP_ALT);
1587
1588 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1589
1590 ecode += 1 + LINK_SIZE;
1591 continue;
1592
1593 /* Move the subject pointer back. This occurs only at the start of
1594 each branch of a lookbehind assertion. If we are too close to the start to
1595 move back, this match function fails. When working with UTF-8 we move
1596 back a number of characters, not bytes. */
1597
1598 case OP_REVERSE:
1599 #ifdef SUPPORT_UTF8
1600 if (utf8)
1601 {
1602 i = GET(ecode, 1);
1603 while (i-- > 0)
1604 {
1605 eptr--;
1606 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1607 BACKCHAR(eptr);
1608 }
1609 }
1610 else
1611 #endif
1612
1613 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1614
1615 {
1616 eptr -= GET(ecode, 1);
1617 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1618 }
1619
1620 /* Save the earliest consulted character, then skip to next op code */
1621
1622 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1623 ecode += 1 + LINK_SIZE;
1624 break;
1625
1626 /* The callout item calls an external function, if one is provided, passing
1627 details of the match so far. This is mainly for debugging, though the
1628 function is able to force a failure. */
1629
1630 case OP_CALLOUT:
1631 if (pcre_callout != NULL)
1632 {
1633 pcre_callout_block cb;
1634 cb.version = 2; /* Version 2 of the callout block */
1635 cb.callout_number = ecode[1];
1636 cb.offset_vector = md->offset_vector;
1637 cb.subject = (PCRE_SPTR)md->start_subject;
1638 cb.subject_length = (int)(md->end_subject - md->start_subject);
1639 cb.start_match = (int)(mstart - md->start_subject);
1640 cb.current_position = (int)(eptr - md->start_subject);
1641 cb.pattern_position = GET(ecode, 2);
1642 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1643 cb.capture_top = offset_top/2;
1644 cb.capture_last = md->capture_last;
1645 cb.callout_data = md->callout_data;
1646 cb.mark = markptr;
1647 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1648 if (rrc < 0) RRETURN(rrc);
1649 }
1650 ecode += 2 + 2*LINK_SIZE;
1651 break;
1652
1653 /* Recursion either matches the current regex, or some subexpression. The
1654 offset data is the offset to the starting bracket from the start of the
1655 whole pattern. (This is so that it works from duplicated subpatterns.)
1656
1657 The state of the capturing groups is preserved over recursion, and
1658 re-instated afterwards. We don't know how many are started and not yet
1659 finished (offset_top records the completed total) so we just have to save
1660 all the potential data. There may be up to 65535 such values, which is too
1661 large to put on the stack, but using malloc for small numbers seems
1662 expensive. As a compromise, the stack is used when there are no more than
1663 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1664
1665 There are also other values that have to be saved. We use a chained
1666 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1667 for the original version of this logic. It has, however, been hacked around
1668 a lot, so he is not to blame for the current way it works. */
1669
1670 case OP_RECURSE:
1671 {
1672 recursion_info *ri;
1673 int recno;
1674
1675 callpat = md->start_code + GET(ecode, 1);
1676 recno = (callpat == md->start_code)? 0 :
1677 GET2(callpat, 1 + LINK_SIZE);
1678
1679 /* Check for repeating a recursion without advancing the subject pointer.
1680 This should catch convoluted mutual recursions. (Some simple cases are
1681 caught at compile time.) */
1682
1683 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1684 if (recno == ri->group_num && eptr == ri->subject_position)
1685 RRETURN(PCRE_ERROR_RECURSELOOP);
1686
1687 /* Add to "recursing stack" */
1688
1689 new_recursive.group_num = recno;
1690 new_recursive.subject_position = eptr;
1691 new_recursive.prevrec = md->recursive;
1692 md->recursive = &new_recursive;
1693
1694 /* Where to continue from afterwards */
1695
1696 ecode += 1 + LINK_SIZE;
1697
1698 /* Now save the offset data */
1699
1700 new_recursive.saved_max = md->offset_end;
1701 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1702 new_recursive.offset_save = stacksave;
1703 else
1704 {
1705 new_recursive.offset_save =
1706 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1707 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1708 }
1709 memcpy(new_recursive.offset_save, md->offset_vector,
1710 new_recursive.saved_max * sizeof(int));
1711
1712 /* OK, now we can do the recursion. After processing each alternative,
1713 restore the offset data. If there were nested recursions, md->recursive
1714 might be changed, so reset it before looping. */
1715
1716 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1717 cbegroup = (*callpat >= OP_SBRA);
1718 do
1719 {
1720 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1721 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1722 md, eptrb, RM6);
1723 memcpy(md->offset_vector, new_recursive.offset_save,
1724 new_recursive.saved_max * sizeof(int));
1725 md->recursive = new_recursive.prevrec;
1726 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1727 {
1728 DPRINTF(("Recursion matched\n"));
1729 if (new_recursive.offset_save != stacksave)
1730 (pcre_free)(new_recursive.offset_save);
1731
1732 /* Set where we got to in the subject, and reset the start in case
1733 it was changed by \K. This *is* propagated back out of a recursion,
1734 for Perl compatibility. */
1735
1736 eptr = md->end_match_ptr;
1737 mstart = md->start_match_ptr;
1738 goto RECURSION_MATCHED; /* Exit loop; end processing */
1739 }
1740
1741 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1742 as NOMATCH. */
1743
1744 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1745 {
1746 DPRINTF(("Recursion gave error %d\n", rrc));
1747 if (new_recursive.offset_save != stacksave)
1748 (pcre_free)(new_recursive.offset_save);
1749 RRETURN(rrc);
1750 }
1751
1752 md->recursive = &new_recursive;
1753 callpat += GET(callpat, 1);
1754 }
1755 while (*callpat == OP_ALT);
1756
1757 DPRINTF(("Recursion didn't match\n"));
1758 md->recursive = new_recursive.prevrec;
1759 if (new_recursive.offset_save != stacksave)
1760 (pcre_free)(new_recursive.offset_save);
1761 MRRETURN(MATCH_NOMATCH);
1762 }
1763
1764 RECURSION_MATCHED:
1765 break;
1766
1767 /* An alternation is the end of a branch; scan along to find the end of the
1768 bracketed group and go to there. */
1769
1770 case OP_ALT:
1771 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1772 break;
1773
1774 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1775 indicating that it may occur zero times. It may repeat infinitely, or not
1776 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1777 with fixed upper repeat limits are compiled as a number of copies, with the
1778 optional ones preceded by BRAZERO or BRAMINZERO. */
1779
1780 case OP_BRAZERO:
1781 next = ecode + 1;
1782 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1783 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1784 do next += GET(next, 1); while (*next == OP_ALT);
1785 ecode = next + 1 + LINK_SIZE;
1786 break;
1787
1788 case OP_BRAMINZERO:
1789 next = ecode + 1;
1790 do next += GET(next, 1); while (*next == OP_ALT);
1791 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1793 ecode++;
1794 break;
1795
1796 case OP_SKIPZERO:
1797 next = ecode+1;
1798 do next += GET(next,1); while (*next == OP_ALT);
1799 ecode = next + 1 + LINK_SIZE;
1800 break;
1801
1802 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1803 here; just jump to the group, with allow_zero set TRUE. */
1804
1805 case OP_BRAPOSZERO:
1806 op = *(++ecode);
1807 allow_zero = TRUE;
1808 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1809 goto POSSESSIVE_NON_CAPTURE;
1810
1811 /* End of a group, repeated or non-repeating. */
1812
1813 case OP_KET:
1814 case OP_KETRMIN:
1815 case OP_KETRMAX:
1816 case OP_KETRPOS:
1817 prev = ecode - GET(ecode, 1);
1818
1819 /* If this was a group that remembered the subject start, in order to break
1820 infinite repeats of empty string matches, retrieve the subject start from
1821 the chain. Otherwise, set it NULL. */
1822
1823 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1824 {
1825 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1826 eptrb = eptrb->epb_prev; /* Backup to previous group */
1827 }
1828 else saved_eptr = NULL;
1829
1830 /* If we are at the end of an assertion group or a non-capturing atomic
1831 group, stop matching and return MATCH_MATCH, but record the current high
1832 water mark for use by positive assertions. We also need to record the match
1833 start in case it was changed by \K. */
1834
1835 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1836 *prev == OP_ONCE_NC)
1837 {
1838 md->end_match_ptr = eptr; /* For ONCE_NC */
1839 md->end_offset_top = offset_top;
1840 md->start_match_ptr = mstart;
1841 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1842 }
1843
1844 /* For capturing groups we have to check the group number back at the start
1845 and if necessary complete handling an extraction by setting the offsets and
1846 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1847 into group 0, so it won't be picked up here. Instead, we catch it when the
1848 OP_END is reached. Other recursion is handled here. We just have to record
1849 the current subject position and start match pointer and give a MATCH
1850 return. */
1851
1852 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1853 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1854 {
1855 number = GET2(prev, 1+LINK_SIZE);
1856 offset = number << 1;
1857
1858 #ifdef PCRE_DEBUG
1859 printf("end bracket %d", number);
1860 printf("\n");
1861 #endif
1862
1863 /* Handle a recursively called group. */
1864
1865 if (md->recursive != NULL && md->recursive->group_num == number)
1866 {
1867 md->end_match_ptr = eptr;
1868 md->start_match_ptr = mstart;
1869 RRETURN(MATCH_MATCH);
1870 }
1871
1872 /* Deal with capturing */
1873
1874 md->capture_last = number;
1875 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1876 {
1877 /* If offset is greater than offset_top, it means that we are
1878 "skipping" a capturing group, and that group's offsets must be marked
1879 unset. In earlier versions of PCRE, all the offsets were unset at the
1880 start of matching, but this doesn't work because atomic groups and
1881 assertions can cause a value to be set that should later be unset.
1882 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1883 part of the atomic group, but this is not on the final matching path,
1884 so must be unset when 2 is set. (If there is no group 2, there is no
1885 problem, because offset_top will then be 2, indicating no capture.) */
1886
1887 if (offset > offset_top)
1888 {
1889 register int *iptr = md->offset_vector + offset_top;
1890 register int *iend = md->offset_vector + offset;
1891 while (iptr < iend) *iptr++ = -1;
1892 }
1893
1894 /* Now make the extraction */
1895
1896 md->offset_vector[offset] =
1897 md->offset_vector[md->offset_end - number];
1898 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1899 if (offset_top <= offset) offset_top = offset + 2;
1900 }
1901 }
1902
1903 /* For an ordinary non-repeating ket, just continue at this level. This
1904 also happens for a repeating ket if no characters were matched in the
1905 group. This is the forcible breaking of infinite loops as implemented in
1906 Perl 5.005. For a non-repeating atomic group that includes captures,
1907 establish a backup point by processing the rest of the pattern at a lower
1908 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1909 original OP_ONCE level, thereby bypassing intermediate backup points, but
1910 resetting any captures that happened along the way. */
1911
1912 if (*ecode == OP_KET || eptr == saved_eptr)
1913 {
1914 if (*prev == OP_ONCE)
1915 {
1916 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1917 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1918 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1919 RRETURN(MATCH_ONCE);
1920 }
1921 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1922 break;
1923 }
1924
1925 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1926 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1927 at a time from the outer level, thus saving stack. */
1928
1929 if (*ecode == OP_KETRPOS)
1930 {
1931 md->end_match_ptr = eptr;
1932 md->end_offset_top = offset_top;
1933 RRETURN(MATCH_KETRPOS);
1934 }
1935
1936 /* The normal repeating kets try the rest of the pattern or restart from
1937 the preceding bracket, in the appropriate order. In the second case, we can
1938 use tail recursion to avoid using another stack frame, unless we have an
1939 atomic group or an unlimited repeat of a group that can match an empty
1940 string. */
1941
1942 if (*ecode == OP_KETRMIN)
1943 {
1944 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1945 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1946 if (*prev == OP_ONCE)
1947 {
1948 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1949 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1950 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1951 RRETURN(MATCH_ONCE);
1952 }
1953 if (*prev >= OP_SBRA) /* Could match an empty string */
1954 {
1955 md->match_function_type = MATCH_CBEGROUP;
1956 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1957 RRETURN(rrc);
1958 }
1959 ecode = prev;
1960 goto TAIL_RECURSE;
1961 }
1962 else /* OP_KETRMAX */
1963 {
1964 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1965 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1966 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1967 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1968 if (*prev == OP_ONCE)
1969 {
1970 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1971 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1972 md->once_target = prev;
1973 RRETURN(MATCH_ONCE);
1974 }
1975 ecode += 1 + LINK_SIZE;
1976 goto TAIL_RECURSE;
1977 }
1978 /* Control never gets here */
1979
1980 /* Not multiline mode: start of subject assertion, unless notbol. */
1981
1982 case OP_CIRC:
1983 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1984
1985 /* Start of subject assertion */
1986
1987 case OP_SOD:
1988 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1989 ecode++;
1990 break;
1991
1992 /* Multiline mode: start of subject unless notbol, or after any newline. */
1993
1994 case OP_CIRCM:
1995 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1996 if (eptr != md->start_subject &&
1997 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1998 MRRETURN(MATCH_NOMATCH);
1999 ecode++;
2000 break;
2001
2002 /* Start of match assertion */
2003
2004 case OP_SOM:
2005 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
2006 ecode++;
2007 break;
2008
2009 /* Reset the start of match point */
2010
2011 case OP_SET_SOM:
2012 mstart = eptr;
2013 ecode++;
2014 break;
2015
2016 /* Multiline mode: assert before any newline, or before end of subject
2017 unless noteol is set. */
2018
2019 case OP_DOLLM:
2020 if (eptr < md->end_subject)
2021 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
2022 else
2023 {
2024 if (md->noteol) MRRETURN(MATCH_NOMATCH);
2025 SCHECK_PARTIAL();
2026 }
2027 ecode++;
2028 break;
2029
2030 /* Not multiline mode: assert before a terminating newline or before end of
2031 subject unless noteol is set. */
2032
2033 case OP_DOLL:
2034 if (md->noteol) MRRETURN(MATCH_NOMATCH);
2035 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2036
2037 /* ... else fall through for endonly */
2038
2039 /* End of subject assertion (\z) */
2040
2041 case OP_EOD:
2042 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
2043 SCHECK_PARTIAL();
2044 ecode++;
2045 break;
2046
2047 /* End of subject or ending \n assertion (\Z) */
2048
2049 case OP_EODN:
2050 ASSERT_NL_OR_EOS:
2051 if (eptr < md->end_subject &&
2052 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2053 MRRETURN(MATCH_NOMATCH);
2054
2055 /* Either at end of string or \n before end. */
2056
2057 SCHECK_PARTIAL();
2058 ecode++;
2059 break;
2060
2061 /* Word boundary assertions */
2062
2063 case OP_NOT_WORD_BOUNDARY:
2064 case OP_WORD_BOUNDARY:
2065 {
2066
2067 /* Find out if the previous and current characters are "word" characters.
2068 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2069 be "non-word" characters. Remember the earliest consulted character for
2070 partial matching. */
2071
2072 #ifdef SUPPORT_UTF8
2073 if (utf8)
2074 {
2075 /* Get status of previous character */
2076
2077 if (eptr == md->start_subject) prev_is_word = FALSE; else
2078 {
2079 USPTR lastptr = eptr - 1;
2080 while((*lastptr & 0xc0) == 0x80) lastptr--;
2081 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2082 GETCHAR(c, lastptr);
2083 #ifdef SUPPORT_UCP
2084 if (md->use_ucp)
2085 {
2086 if (c == '_') prev_is_word = TRUE; else
2087 {
2088 int cat = UCD_CATEGORY(c);
2089 prev_is_word = (cat == ucp_L || cat == ucp_N);
2090 }
2091 }
2092 else
2093 #endif
2094 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2095 }
2096
2097 /* Get status of next character */
2098
2099 if (eptr >= md->end_subject)
2100 {
2101 SCHECK_PARTIAL();
2102 cur_is_word = FALSE;
2103 }
2104 else
2105 {
2106 GETCHAR(c, eptr);
2107 #ifdef SUPPORT_UCP
2108 if (md->use_ucp)
2109 {
2110 if (c == '_') cur_is_word = TRUE; else
2111 {
2112 int cat = UCD_CATEGORY(c);
2113 cur_is_word = (cat == ucp_L || cat == ucp_N);
2114 }
2115 }
2116 else
2117 #endif
2118 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2119 }
2120 }
2121 else
2122 #endif
2123
2124 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2125 consistency with the behaviour of \w we do use it in this case. */
2126
2127 {
2128 /* Get status of previous character */
2129
2130 if (eptr == md->start_subject) prev_is_word = FALSE; else
2131 {
2132 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2133 #ifdef SUPPORT_UCP
2134 if (md->use_ucp)
2135 {
2136 c = eptr[-1];
2137 if (c == '_') prev_is_word = TRUE; else
2138 {
2139 int cat = UCD_CATEGORY(c);
2140 prev_is_word = (cat == ucp_L || cat == ucp_N);
2141 }
2142 }
2143 else
2144 #endif
2145 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2146 }
2147
2148 /* Get status of next character */
2149
2150 if (eptr >= md->end_subject)
2151 {
2152 SCHECK_PARTIAL();
2153 cur_is_word = FALSE;
2154 }
2155 else
2156 #ifdef SUPPORT_UCP
2157 if (md->use_ucp)
2158 {
2159 c = *eptr;
2160 if (c == '_') cur_is_word = TRUE; else
2161 {
2162 int cat = UCD_CATEGORY(c);
2163 cur_is_word = (cat == ucp_L || cat == ucp_N);
2164 }
2165 }
2166 else
2167 #endif
2168 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2169 }
2170
2171 /* Now see if the situation is what we want */
2172
2173 if ((*ecode++ == OP_WORD_BOUNDARY)?
2174 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2175 MRRETURN(MATCH_NOMATCH);
2176 }
2177 break;
2178
2179 /* Match a single character type; inline for speed */
2180
2181 case OP_ANY:
2182 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2183 /* Fall through */
2184
2185 case OP_ALLANY:
2186 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2187 { /* not be updated before SCHECK_PARTIAL. */
2188 SCHECK_PARTIAL();
2189 MRRETURN(MATCH_NOMATCH);
2190 }
2191 eptr++;
2192 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2193 ecode++;
2194 break;
2195
2196 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2197 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2198
2199 case OP_ANYBYTE:
2200 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2201 { /* not be updated before SCHECK_PARTIAL. */
2202 SCHECK_PARTIAL();
2203 MRRETURN(MATCH_NOMATCH);
2204 }
2205 eptr++;
2206 ecode++;
2207 break;
2208
2209 case OP_NOT_DIGIT:
2210 if (eptr >= md->end_subject)
2211 {
2212 SCHECK_PARTIAL();
2213 MRRETURN(MATCH_NOMATCH);
2214 }
2215 GETCHARINCTEST(c, eptr);
2216 if (
2217 #ifdef SUPPORT_UTF8
2218 c < 256 &&
2219 #endif
2220 (md->ctypes[c] & ctype_digit) != 0
2221 )
2222 MRRETURN(MATCH_NOMATCH);
2223 ecode++;
2224 break;
2225
2226 case OP_DIGIT:
2227 if (eptr >= md->end_subject)
2228 {
2229 SCHECK_PARTIAL();
2230 MRRETURN(MATCH_NOMATCH);
2231 }
2232 GETCHARINCTEST(c, eptr);
2233 if (
2234 #ifdef SUPPORT_UTF8
2235 c >= 256 ||
2236 #endif
2237 (md->ctypes[c] & ctype_digit) == 0
2238 )
2239 MRRETURN(MATCH_NOMATCH);
2240 ecode++;
2241 break;
2242
2243 case OP_NOT_WHITESPACE:
2244 if (eptr >= md->end_subject)
2245 {
2246 SCHECK_PARTIAL();
2247 MRRETURN(MATCH_NOMATCH);
2248 }
2249 GETCHARINCTEST(c, eptr);
2250 if (
2251 #ifdef SUPPORT_UTF8
2252 c < 256 &&
2253 #endif
2254 (md->ctypes[c] & ctype_space) != 0
2255 )
2256 MRRETURN(MATCH_NOMATCH);
2257 ecode++;
2258 break;
2259
2260 case OP_WHITESPACE:
2261 if (eptr >= md->end_subject)
2262 {
2263 SCHECK_PARTIAL();
2264 MRRETURN(MATCH_NOMATCH);
2265 }
2266 GETCHARINCTEST(c, eptr);
2267 if (
2268 #ifdef SUPPORT_UTF8
2269 c >= 256 ||
2270 #endif
2271 (md->ctypes[c] & ctype_space) == 0
2272 )
2273 MRRETURN(MATCH_NOMATCH);
2274 ecode++;
2275 break;
2276
2277 case OP_NOT_WORDCHAR:
2278 if (eptr >= md->end_subject)
2279 {
2280 SCHECK_PARTIAL();
2281 MRRETURN(MATCH_NOMATCH);
2282 }
2283 GETCHARINCTEST(c, eptr);
2284 if (
2285 #ifdef SUPPORT_UTF8
2286 c < 256 &&
2287 #endif
2288 (md->ctypes[c] & ctype_word) != 0
2289 )
2290 MRRETURN(MATCH_NOMATCH);
2291 ecode++;
2292 break;
2293
2294 case OP_WORDCHAR:
2295 if (eptr >= md->end_subject)
2296 {
2297 SCHECK_PARTIAL();
2298 MRRETURN(MATCH_NOMATCH);
2299 }
2300 GETCHARINCTEST(c, eptr);
2301 if (
2302 #ifdef SUPPORT_UTF8
2303 c >= 256 ||
2304 #endif
2305 (md->ctypes[c] & ctype_word) == 0
2306 )
2307 MRRETURN(MATCH_NOMATCH);
2308 ecode++;
2309 break;
2310
2311 case OP_ANYNL:
2312 if (eptr >= md->end_subject)
2313 {
2314 SCHECK_PARTIAL();
2315 MRRETURN(MATCH_NOMATCH);
2316 }
2317 GETCHARINCTEST(c, eptr);
2318 switch(c)
2319 {
2320 default: MRRETURN(MATCH_NOMATCH);
2321
2322 case 0x000d:
2323 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2324 break;
2325
2326 case 0x000a:
2327 break;
2328
2329 case 0x000b:
2330 case 0x000c:
2331 case 0x0085:
2332 case 0x2028:
2333 case 0x2029:
2334 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2335 break;
2336 }
2337 ecode++;
2338 break;
2339
2340 case OP_NOT_HSPACE:
2341 if (eptr >= md->end_subject)
2342 {
2343 SCHECK_PARTIAL();
2344 MRRETURN(MATCH_NOMATCH);
2345 }
2346 GETCHARINCTEST(c, eptr);
2347 switch(c)
2348 {
2349 default: break;
2350 case 0x09: /* HT */
2351 case 0x20: /* SPACE */
2352 case 0xa0: /* NBSP */
2353 case 0x1680: /* OGHAM SPACE MARK */
2354 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2355 case 0x2000: /* EN QUAD */
2356 case 0x2001: /* EM QUAD */
2357 case 0x2002: /* EN SPACE */
2358 case 0x2003: /* EM SPACE */
2359 case 0x2004: /* THREE-PER-EM SPACE */
2360 case 0x2005: /* FOUR-PER-EM SPACE */
2361 case 0x2006: /* SIX-PER-EM SPACE */
2362 case 0x2007: /* FIGURE SPACE */
2363 case 0x2008: /* PUNCTUATION SPACE */
2364 case 0x2009: /* THIN SPACE */
2365 case 0x200A: /* HAIR SPACE */
2366 case 0x202f: /* NARROW NO-BREAK SPACE */
2367 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2368 case 0x3000: /* IDEOGRAPHIC SPACE */
2369 MRRETURN(MATCH_NOMATCH);
2370 }
2371 ecode++;
2372 break;
2373
2374 case OP_HSPACE:
2375 if (eptr >= md->end_subject)
2376 {
2377 SCHECK_PARTIAL();
2378 MRRETURN(MATCH_NOMATCH);
2379 }
2380 GETCHARINCTEST(c, eptr);
2381 switch(c)
2382 {
2383 default: MRRETURN(MATCH_NOMATCH);
2384 case 0x09: /* HT */
2385 case 0x20: /* SPACE */
2386 case 0xa0: /* NBSP */
2387 case 0x1680: /* OGHAM SPACE MARK */
2388 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2389 case 0x2000: /* EN QUAD */
2390 case 0x2001: /* EM QUAD */
2391 case 0x2002: /* EN SPACE */
2392 case 0x2003: /* EM SPACE */
2393 case 0x2004: /* THREE-PER-EM SPACE */
2394 case 0x2005: /* FOUR-PER-EM SPACE */
2395 case 0x2006: /* SIX-PER-EM SPACE */
2396 case 0x2007: /* FIGURE SPACE */
2397 case 0x2008: /* PUNCTUATION SPACE */
2398 case 0x2009: /* THIN SPACE */
2399 case 0x200A: /* HAIR SPACE */
2400 case 0x202f: /* NARROW NO-BREAK SPACE */
2401 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2402 case 0x3000: /* IDEOGRAPHIC SPACE */
2403 break;
2404 }
2405 ecode++;
2406 break;
2407
2408 case OP_NOT_VSPACE:
2409 if (eptr >= md->end_subject)
2410 {
2411 SCHECK_PARTIAL();
2412 MRRETURN(MATCH_NOMATCH);
2413 }
2414 GETCHARINCTEST(c, eptr);
2415 switch(c)
2416 {
2417 default: break;
2418 case 0x0a: /* LF */
2419 case 0x0b: /* VT */
2420 case 0x0c: /* FF */
2421 case 0x0d: /* CR */
2422 case 0x85: /* NEL */
2423 case 0x2028: /* LINE SEPARATOR */
2424 case 0x2029: /* PARAGRAPH SEPARATOR */
2425 MRRETURN(MATCH_NOMATCH);
2426 }
2427 ecode++;
2428 break;
2429
2430 case OP_VSPACE:
2431 if (eptr >= md->end_subject)
2432 {
2433 SCHECK_PARTIAL();
2434 MRRETURN(MATCH_NOMATCH);
2435 }
2436 GETCHARINCTEST(c, eptr);
2437 switch(c)
2438 {
2439 default: MRRETURN(MATCH_NOMATCH);
2440 case 0x0a: /* LF */
2441 case 0x0b: /* VT */
2442 case 0x0c: /* FF */
2443 case 0x0d: /* CR */
2444 case 0x85: /* NEL */
2445 case 0x2028: /* LINE SEPARATOR */
2446 case 0x2029: /* PARAGRAPH SEPARATOR */
2447 break;
2448 }
2449 ecode++;
2450 break;
2451
2452 #ifdef SUPPORT_UCP
2453 /* Check the next character by Unicode property. We will get here only
2454 if the support is in the binary; otherwise a compile-time error occurs. */
2455
2456 case OP_PROP:
2457 case OP_NOTPROP:
2458 if (eptr >= md->end_subject)
2459 {
2460 SCHECK_PARTIAL();
2461 MRRETURN(MATCH_NOMATCH);
2462 }
2463 GETCHARINCTEST(c, eptr);
2464 {
2465 const ucd_record *prop = GET_UCD(c);
2466
2467 switch(ecode[1])
2468 {
2469 case PT_ANY:
2470 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2471 break;
2472
2473 case PT_LAMP:
2474 if ((prop->chartype == ucp_Lu ||
2475 prop->chartype == ucp_Ll ||
2476 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2477 MRRETURN(MATCH_NOMATCH);
2478 break;
2479
2480 case PT_GC:
2481 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2482 MRRETURN(MATCH_NOMATCH);
2483 break;
2484
2485 case PT_PC:
2486 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2487 MRRETURN(MATCH_NOMATCH);
2488 break;
2489
2490 case PT_SC:
2491 if ((ecode[2] != prop->script) == (op == OP_PROP))
2492 MRRETURN(MATCH_NOMATCH);
2493 break;
2494
2495 /* These are specials */
2496
2497 case PT_ALNUM:
2498 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2499 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2500 MRRETURN(MATCH_NOMATCH);
2501 break;
2502
2503 case PT_SPACE: /* Perl space */
2504 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2505 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2506 == (op == OP_NOTPROP))
2507 MRRETURN(MATCH_NOMATCH);
2508 break;
2509
2510 case PT_PXSPACE: /* POSIX space */
2511 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2512 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2513 c == CHAR_FF || c == CHAR_CR)
2514 == (op == OP_NOTPROP))
2515 MRRETURN(MATCH_NOMATCH);
2516 break;
2517
2518 case PT_WORD:
2519 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2520 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2521 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2522 MRRETURN(MATCH_NOMATCH);
2523 break;
2524
2525 /* This should never occur */
2526
2527 default:
2528 RRETURN(PCRE_ERROR_INTERNAL);
2529 }
2530
2531 ecode += 3;
2532 }
2533 break;
2534
2535 /* Match an extended Unicode sequence. We will get here only if the support
2536 is in the binary; otherwise a compile-time error occurs. */
2537
2538 case OP_EXTUNI:
2539 if (eptr >= md->end_subject)
2540 {
2541 SCHECK_PARTIAL();
2542 MRRETURN(MATCH_NOMATCH);
2543 }
2544 GETCHARINCTEST(c, eptr);
2545 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2546 while (eptr < md->end_subject)
2547 {
2548 int len = 1;
2549 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2550 if (UCD_CATEGORY(c) != ucp_M) break;
2551 eptr += len;
2552 }
2553 ecode++;
2554 break;
2555 #endif
2556
2557
2558 /* Match a back reference, possibly repeatedly. Look past the end of the
2559 item to see if there is repeat information following. The code is similar
2560 to that for character classes, but repeated for efficiency. Then obey
2561 similar code to character type repeats - written out again for speed.
2562 However, if the referenced string is the empty string, always treat
2563 it as matched, any number of times (otherwise there could be infinite
2564 loops). */
2565
2566 case OP_REF:
2567 case OP_REFI:
2568 caseless = op == OP_REFI;
2569 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2570 ecode += 3;
2571
2572 /* If the reference is unset, there are two possibilities:
2573
2574 (a) In the default, Perl-compatible state, set the length negative;
2575 this ensures that every attempt at a match fails. We can't just fail
2576 here, because of the possibility of quantifiers with zero minima.
2577
2578 (b) If the JavaScript compatibility flag is set, set the length to zero
2579 so that the back reference matches an empty string.
2580
2581 Otherwise, set the length to the length of what was matched by the
2582 referenced subpattern. */
2583
2584 if (offset >= offset_top || md->offset_vector[offset] < 0)
2585 length = (md->jscript_compat)? 0 : -1;
2586 else
2587 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2588
2589 /* Set up for repetition, or handle the non-repeated case */
2590
2591 switch (*ecode)
2592 {
2593 case OP_CRSTAR:
2594 case OP_CRMINSTAR:
2595 case OP_CRPLUS:
2596 case OP_CRMINPLUS:
2597 case OP_CRQUERY:
2598 case OP_CRMINQUERY:
2599 c = *ecode++ - OP_CRSTAR;
2600 minimize = (c & 1) != 0;
2601 min = rep_min[c]; /* Pick up values from tables; */
2602 max = rep_max[c]; /* zero for max => infinity */
2603 if (max == 0) max = INT_MAX;
2604 break;
2605
2606 case OP_CRRANGE:
2607 case OP_CRMINRANGE:
2608 minimize = (*ecode == OP_CRMINRANGE);
2609 min = GET2(ecode, 1);
2610 max = GET2(ecode, 3);
2611 if (max == 0) max = INT_MAX;
2612 ecode += 5;
2613 break;
2614
2615 default: /* No repeat follows */
2616 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2617 {
2618 CHECK_PARTIAL();
2619 MRRETURN(MATCH_NOMATCH);
2620 }
2621 eptr += length;
2622 continue; /* With the main loop */
2623 }
2624
2625 /* Handle repeated back references. If the length of the reference is
2626 zero, just continue with the main loop. */
2627
2628 if (length == 0) continue;
2629
2630 /* First, ensure the minimum number of matches are present. We get back
2631 the length of the reference string explicitly rather than passing the
2632 address of eptr, so that eptr can be a register variable. */
2633
2634 for (i = 1; i <= min; i++)
2635 {
2636 int slength;
2637 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2638 {
2639 CHECK_PARTIAL();
2640 MRRETURN(MATCH_NOMATCH);
2641 }
2642 eptr += slength;
2643 }
2644
2645 /* If min = max, continue at the same level without recursion.
2646 They are not both allowed to be zero. */
2647
2648 if (min == max) continue;
2649
2650 /* If minimizing, keep trying and advancing the pointer */
2651
2652 if (minimize)
2653 {
2654 for (fi = min;; fi++)
2655 {
2656 int slength;
2657 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2658 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2659 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2660 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2661 {
2662 CHECK_PARTIAL();
2663 MRRETURN(MATCH_NOMATCH);
2664 }
2665 eptr += slength;
2666 }
2667 /* Control never gets here */
2668 }
2669
2670 /* If maximizing, find the longest string and work backwards */
2671
2672 else
2673 {
2674 pp = eptr;
2675 for (i = min; i < max; i++)
2676 {
2677 int slength;
2678 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2679 {
2680 CHECK_PARTIAL();
2681 break;
2682 }
2683 eptr += slength;
2684 }
2685 while (eptr >= pp)
2686 {
2687 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2688 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2689 eptr -= length;
2690 }
2691 MRRETURN(MATCH_NOMATCH);
2692 }
2693 /* Control never gets here */
2694
2695 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2696 used when all the characters in the class have values in the range 0-255,
2697 and either the matching is caseful, or the characters are in the range
2698 0-127 when UTF-8 processing is enabled. The only difference between
2699 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2700 encountered.
2701
2702 First, look past the end of the item to see if there is repeat information
2703 following. Then obey similar code to character type repeats - written out
2704 again for speed. */
2705
2706 case OP_NCLASS:
2707 case OP_CLASS:
2708 {
2709 data = ecode + 1; /* Save for matching */
2710 ecode += 33; /* Advance past the item */
2711
2712 switch (*ecode)
2713 {
2714 case OP_CRSTAR:
2715 case OP_CRMINSTAR:
2716 case OP_CRPLUS:
2717 case OP_CRMINPLUS:
2718 case OP_CRQUERY:
2719 case OP_CRMINQUERY:
2720 c = *ecode++ - OP_CRSTAR;
2721 minimize = (c & 1) != 0;
2722 min = rep_min[c]; /* Pick up values from tables; */
2723 max = rep_max[c]; /* zero for max => infinity */
2724 if (max == 0) max = INT_MAX;
2725 break;
2726
2727 case OP_CRRANGE:
2728 case OP_CRMINRANGE:
2729 minimize = (*ecode == OP_CRMINRANGE);
2730 min = GET2(ecode, 1);
2731 max = GET2(ecode, 3);
2732 if (max == 0) max = INT_MAX;
2733 ecode += 5;
2734 break;
2735
2736 default: /* No repeat follows */
2737 min = max = 1;
2738 break;
2739 }
2740
2741 /* First, ensure the minimum number of matches are present. */
2742
2743 #ifdef SUPPORT_UTF8
2744 /* UTF-8 mode */
2745 if (utf8)
2746 {
2747 for (i = 1; i <= min; i++)
2748 {
2749 if (eptr >= md->end_subject)
2750 {
2751 SCHECK_PARTIAL();
2752 MRRETURN(MATCH_NOMATCH);
2753 }
2754 GETCHARINC(c, eptr);
2755 if (c > 255)
2756 {
2757 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2758 }
2759 else
2760 {
2761 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2762 }
2763 }
2764 }
2765 else
2766 #endif
2767 /* Not UTF-8 mode */
2768 {
2769 for (i = 1; i <= min; i++)
2770 {
2771 if (eptr >= md->end_subject)
2772 {
2773 SCHECK_PARTIAL();
2774 MRRETURN(MATCH_NOMATCH);
2775 }
2776 c = *eptr++;
2777 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2778 }
2779 }
2780
2781 /* If max == min we can continue with the main loop without the
2782 need to recurse. */
2783
2784 if (min == max) continue;
2785
2786 /* If minimizing, keep testing the rest of the expression and advancing
2787 the pointer while it matches the class. */
2788
2789 if (minimize)
2790 {
2791 #ifdef SUPPORT_UTF8
2792 /* UTF-8 mode */
2793 if (utf8)
2794 {
2795 for (fi = min;; fi++)
2796 {
2797 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2798 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2799 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2800 if (eptr >= md->end_subject)
2801 {
2802 SCHECK_PARTIAL();
2803 MRRETURN(MATCH_NOMATCH);
2804 }
2805 GETCHARINC(c, eptr);
2806 if (c > 255)
2807 {
2808 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2809 }
2810 else
2811 {
2812 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2813 }
2814 }
2815 }
2816 else
2817 #endif
2818 /* Not UTF-8 mode */
2819 {
2820 for (fi = min;; fi++)
2821 {
2822 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2823 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2824 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2825 if (eptr >= md->end_subject)
2826 {
2827 SCHECK_PARTIAL();
2828 MRRETURN(MATCH_NOMATCH);
2829 }
2830 c = *eptr++;
2831 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2832 }
2833 }
2834 /* Control never gets here */
2835 }
2836
2837 /* If maximizing, find the longest possible run, then work backwards. */
2838
2839 else
2840 {
2841 pp = eptr;
2842
2843 #ifdef SUPPORT_UTF8
2844 /* UTF-8 mode */
2845 if (utf8)
2846 {
2847 for (i = min; i < max; i++)
2848 {
2849 int len = 1;
2850 if (eptr >= md->end_subject)
2851 {
2852 SCHECK_PARTIAL();
2853 break;
2854 }
2855 GETCHARLEN(c, eptr, len);
2856 if (c > 255)
2857 {
2858 if (op == OP_CLASS) break;
2859 }
2860 else
2861 {
2862 if ((data[c/8] & (1 << (c&7))) == 0) break;
2863 }
2864 eptr += len;
2865 }
2866 for (;;)
2867 {
2868 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2869 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2870 if (eptr-- == pp) break; /* Stop if tried at original pos */
2871 BACKCHAR(eptr);
2872 }
2873 }
2874 else
2875 #endif
2876 /* Not UTF-8 mode */
2877 {
2878 for (i = min; i < max; i++)
2879 {
2880 if (eptr >= md->end_subject)
2881 {
2882 SCHECK_PARTIAL();
2883 break;
2884 }
2885 c = *eptr;
2886 if ((data[c/8] & (1 << (c&7))) == 0) break;
2887 eptr++;
2888 }
2889 while (eptr >= pp)
2890 {
2891 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2892 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2893 eptr--;
2894 }
2895 }
2896
2897 MRRETURN(MATCH_NOMATCH);
2898 }
2899 }
2900 /* Control never gets here */
2901
2902
2903 /* Match an extended character class. This opcode is encountered only
2904 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2905 mode, because Unicode properties are supported in non-UTF-8 mode. */
2906
2907 #ifdef SUPPORT_UTF8
2908 case OP_XCLASS:
2909 {
2910 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2911 ecode += GET(ecode, 1); /* Advance past the item */
2912
2913 switch (*ecode)
2914 {
2915 case OP_CRSTAR:
2916 case OP_CRMINSTAR:
2917 case OP_CRPLUS:
2918 case OP_CRMINPLUS:
2919 case OP_CRQUERY:
2920 case OP_CRMINQUERY:
2921 c = *ecode++ - OP_CRSTAR;
2922 minimize = (c & 1) != 0;
2923 min = rep_min[c]; /* Pick up values from tables; */
2924 max = rep_max[c]; /* zero for max => infinity */
2925 if (max == 0) max = INT_MAX;
2926 break;
2927
2928 case OP_CRRANGE:
2929 case OP_CRMINRANGE:
2930 minimize = (*ecode == OP_CRMINRANGE);
2931 min = GET2(ecode, 1);
2932 max = GET2(ecode, 3);
2933 if (max == 0) max = INT_MAX;
2934 ecode += 5;
2935 break;
2936
2937 default: /* No repeat follows */
2938 min = max = 1;
2939 break;
2940 }
2941
2942 /* First, ensure the minimum number of matches are present. */
2943
2944 for (i = 1; i <= min; i++)
2945 {
2946 if (eptr >= md->end_subject)
2947 {
2948 SCHECK_PARTIAL();
2949 MRRETURN(MATCH_NOMATCH);
2950 }
2951 GETCHARINCTEST(c, eptr);
2952 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2953 }
2954
2955 /* If max == min we can continue with the main loop without the
2956 need to recurse. */
2957
2958 if (min == max) continue;
2959
2960 /* If minimizing, keep testing the rest of the expression and advancing
2961 the pointer while it matches the class. */
2962
2963 if (minimize)
2964 {
2965 for (fi = min;; fi++)
2966 {
2967 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2968 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2969 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2970 if (eptr >= md->end_subject)
2971 {
2972 SCHECK_PARTIAL();
2973 MRRETURN(MATCH_NOMATCH);
2974 }
2975 GETCHARINCTEST(c, eptr);
2976 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2977 }
2978 /* Control never gets here */
2979 }
2980
2981 /* If maximizing, find the longest possible run, then work backwards. */
2982
2983 else
2984 {
2985 pp = eptr;
2986 for (i = min; i < max; i++)
2987 {
2988 int len = 1;
2989 if (eptr >= md->end_subject)
2990 {
2991 SCHECK_PARTIAL();
2992 break;
2993 }
2994 GETCHARLENTEST(c, eptr, len);
2995 if (!_pcre_xclass(c, data)) break;
2996 eptr += len;
2997 }
2998 for(;;)
2999 {
3000 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3001 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3002 if (eptr-- == pp) break; /* Stop if tried at original pos */
3003 if (utf8) BACKCHAR(eptr);
3004 }
3005 MRRETURN(MATCH_NOMATCH);
3006 }
3007
3008 /* Control never gets here */
3009 }
3010 #endif /* End of XCLASS */
3011
3012 /* Match a single character, casefully */
3013
3014 case OP_CHAR:
3015 #ifdef SUPPORT_UTF8
3016 if (utf8)
3017 {
3018 length = 1;
3019 ecode++;
3020 GETCHARLEN(fc, ecode, length);
3021 if (length > md->end_subject - eptr)
3022 {
3023 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3024 MRRETURN(MATCH_NOMATCH);
3025 }
3026 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
3027 }
3028 else
3029 #endif
3030
3031 /* Non-UTF-8 mode */
3032 {
3033 if (md->end_subject - eptr < 1)
3034 {
3035 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3036 MRRETURN(MATCH_NOMATCH);
3037 }
3038 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
3039 ecode += 2;
3040 }
3041 break;
3042
3043 /* Match a single character, caselessly */
3044
3045 case OP_CHARI:
3046 #ifdef SUPPORT_UTF8
3047 if (utf8)
3048 {
3049 length = 1;
3050 ecode++;
3051 GETCHARLEN(fc, ecode, length);
3052
3053 if (length > md->end_subject - eptr)
3054 {
3055 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3056 MRRETURN(MATCH_NOMATCH);
3057 }
3058
3059 /* If the pattern character's value is < 128, we have only one byte, and
3060 can use the fast lookup table. */
3061
3062 if (fc < 128)
3063 {
3064 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3065 }
3066
3067 /* Otherwise we must pick up the subject character */
3068
3069 else
3070 {
3071 unsigned int dc;
3072 GETCHARINC(dc, eptr);
3073 ecode += length;
3074
3075 /* If we have Unicode property support, we can use it to test the other
3076 case of the character, if there is one. */
3077
3078 if (fc != dc)
3079 {
3080 #ifdef SUPPORT_UCP
3081 if (dc != UCD_OTHERCASE(fc))
3082 #endif
3083 MRRETURN(MATCH_NOMATCH);
3084 }
3085 }
3086 }
3087 else
3088 #endif /* SUPPORT_UTF8 */
3089
3090 /* Non-UTF-8 mode */
3091 {
3092 if (md->end_subject - eptr < 1)
3093 {
3094 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3095 MRRETURN(MATCH_NOMATCH);
3096 }
3097 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3098 ecode += 2;
3099 }
3100 break;
3101
3102 /* Match a single character repeatedly. */
3103
3104 case OP_EXACT:
3105 case OP_EXACTI:
3106 min = max = GET2(ecode, 1);
3107 ecode += 3;
3108 goto REPEATCHAR;
3109
3110 case OP_POSUPTO:
3111 case OP_POSUPTOI:
3112 possessive = TRUE;
3113 /* Fall through */
3114
3115 case OP_UPTO:
3116 case OP_UPTOI:
3117 case OP_MINUPTO:
3118 case OP_MINUPTOI:
3119 min = 0;
3120 max = GET2(ecode, 1);
3121 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3122 ecode += 3;
3123 goto REPEATCHAR;
3124
3125 case OP_POSSTAR:
3126 case OP_POSSTARI:
3127 possessive = TRUE;
3128 min = 0;
3129 max = INT_MAX;
3130 ecode++;
3131 goto REPEATCHAR;
3132
3133 case OP_POSPLUS:
3134 case OP_POSPLUSI:
3135 possessive = TRUE;
3136 min = 1;
3137 max = INT_MAX;
3138 ecode++;
3139 goto REPEATCHAR;
3140
3141 case OP_POSQUERY:
3142 case OP_POSQUERYI:
3143 possessive = TRUE;
3144 min = 0;
3145 max = 1;
3146 ecode++;
3147 goto REPEATCHAR;
3148
3149 case OP_STAR:
3150 case OP_STARI:
3151 case OP_MINSTAR:
3152 case OP_MINSTARI:
3153 case OP_PLUS:
3154 case OP_PLUSI:
3155 case OP_MINPLUS:
3156 case OP_MINPLUSI:
3157 case OP_QUERY:
3158 case OP_QUERYI:
3159 case OP_MINQUERY:
3160 case OP_MINQUERYI:
3161 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3162 minimize = (c & 1) != 0;
3163 min = rep_min[c]; /* Pick up values from tables; */
3164 max = rep_max[c]; /* zero for max => infinity */
3165 if (max == 0) max = INT_MAX;
3166
3167 /* Common code for all repeated single-character matches. */
3168
3169 REPEATCHAR:
3170 #ifdef SUPPORT_UTF8
3171 if (utf8)
3172 {
3173 length = 1;
3174 charptr = ecode;
3175 GETCHARLEN(fc, ecode, length);
3176 ecode += length;
3177
3178 /* Handle multibyte character matching specially here. There is
3179 support for caseless matching if UCP support is present. */
3180
3181 if (length > 1)
3182 {
3183 #ifdef SUPPORT_UCP
3184 unsigned int othercase;
3185 if (op >= OP_STARI && /* Caseless */
3186 (othercase = UCD_OTHERCASE(fc)) != fc)
3187 oclength = _pcre_ord2utf8(othercase, occhars);
3188 else oclength = 0;
3189 #endif /* SUPPORT_UCP */
3190
3191 for (i = 1; i <= min; i++)
3192 {
3193 if (eptr <= md->end_subject - length &&
3194 memcmp(eptr, charptr, length) == 0) eptr += length;
3195 #ifdef SUPPORT_UCP
3196 else if (oclength > 0 &&
3197 eptr <= md->end_subject - oclength &&
3198 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3199 #endif /* SUPPORT_UCP */
3200 else
3201 {
3202 CHECK_PARTIAL();
3203 MRRETURN(MATCH_NOMATCH);
3204 }
3205 }
3206
3207 if (min == max) continue;
3208
3209 if (minimize)
3210 {
3211 for (fi = min;; fi++)
3212 {
3213 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3214 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3215 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3216 if (eptr <= md->end_subject - length &&
3217 memcmp(eptr, charptr, length) == 0) eptr += length;
3218 #ifdef SUPPORT_UCP
3219 else if (oclength > 0 &&
3220 eptr <= md->end_subject - oclength &&
3221 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3222 #endif /* SUPPORT_UCP */
3223 else
3224 {
3225 CHECK_PARTIAL();
3226 MRRETURN(MATCH_NOMATCH);
3227 }
3228 }
3229 /* Control never gets here */
3230 }
3231
3232 else /* Maximize */
3233 {
3234 pp = eptr;
3235 for (i = min; i < max; i++)
3236 {
3237 if (eptr <= md->end_subject - length &&
3238 memcmp(eptr, charptr, length) == 0) eptr += length;
3239 #ifdef SUPPORT_UCP
3240 else if (oclength > 0 &&
3241 eptr <= md->end_subject - oclength &&
3242 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3243 #endif /* SUPPORT_UCP */
3244 else
3245 {
3246 CHECK_PARTIAL();
3247 break;
3248 }
3249 }
3250
3251 if (possessive) continue;
3252
3253 for(;;)
3254 {
3255 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3256 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3257 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3258 #ifdef SUPPORT_UCP
3259 eptr--;
3260 BACKCHAR(eptr);
3261 #else /* without SUPPORT_UCP */
3262 eptr -= length;
3263 #endif /* SUPPORT_UCP */
3264 }
3265 }
3266 /* Control never gets here */
3267 }
3268
3269 /* If the length of a UTF-8 character is 1, we fall through here, and
3270 obey the code as for non-UTF-8 characters below, though in this case the
3271 value of fc will always be < 128. */
3272 }
3273 else
3274 #endif /* SUPPORT_UTF8 */
3275
3276 /* When not in UTF-8 mode, load a single-byte character. */
3277
3278 fc = *ecode++;
3279
3280 /* The value of fc at this point is always less than 256, though we may or
3281 may not be in UTF-8 mode. The code is duplicated for the caseless and
3282 caseful cases, for speed, since matching characters is likely to be quite
3283 common. First, ensure the minimum number of matches are present. If min =
3284 max, continue at the same level without recursing. Otherwise, if
3285 minimizing, keep trying the rest of the expression and advancing one
3286 matching character if failing, up to the maximum. Alternatively, if
3287 maximizing, find the maximum number of characters and work backwards. */
3288
3289 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3290 max, eptr));
3291
3292 if (op >= OP_STARI) /* Caseless */
3293 {
3294 fc = md->lcc[fc];
3295 for (i = 1; i <= min; i++)
3296 {
3297 if (eptr >= md->end_subject)
3298 {
3299 SCHECK_PARTIAL();
3300 MRRETURN(MATCH_NOMATCH);
3301 }
3302 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3303 }
3304 if (min == max) continue;
3305 if (minimize)
3306 {
3307 for (fi = min;; fi++)
3308 {
3309 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3310 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3311 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3312 if (eptr >= md->end_subject)
3313 {
3314 SCHECK_PARTIAL();
3315 MRRETURN(MATCH_NOMATCH);
3316 }
3317 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3318 }
3319 /* Control never gets here */
3320 }
3321 else /* Maximize */
3322 {
3323 pp = eptr;
3324 for (i = min; i < max; i++)
3325 {
3326 if (eptr >= md->end_subject)
3327 {
3328 SCHECK_PARTIAL();
3329 break;
3330 }
3331 if (fc != md->lcc[*eptr]) break;
3332 eptr++;
3333 }
3334
3335 if (possessive) continue;
3336
3337 while (eptr >= pp)
3338 {
3339 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3340 eptr--;
3341 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3342 }
3343 MRRETURN(MATCH_NOMATCH);
3344 }
3345 /* Control never gets here */
3346 }
3347
3348 /* Caseful comparisons (fc is a single byte here; multi-byte UTF-8 characters were fully handled above) */
3349
3350 else
3351 {
3352 for (i = 1; i <= min; i++)
3353 {
3354 if (eptr >= md->end_subject)
3355 {
3356 SCHECK_PARTIAL();
3357 MRRETURN(MATCH_NOMATCH);
3358 }
3359 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3360 }
3361
3362 if (min == max) continue;
3363
3364 if (minimize)
3365 {
3366 for (fi = min;; fi++)
3367 {
3368 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3369 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3370 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3371 if (eptr >= md->end_subject)
3372 {
3373 SCHECK_PARTIAL();
3374 MRRETURN(MATCH_NOMATCH);
3375 }
3376 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3377 }
3378 /* Control never gets here */
3379 }
3380 else /* Maximize */
3381 {
3382 pp = eptr;
3383 for (i = min; i < max; i++)
3384 {
3385 if (eptr >= md->end_subject)
3386 {
3387 SCHECK_PARTIAL();
3388 break;
3389 }
3390 if (fc != *eptr) break;
3391 eptr++;
3392 }
3393 if (possessive) continue;
3394
3395 while (eptr >= pp)
3396 {
3397 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3398 eptr--;
3399 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3400 }
3401 MRRETURN(MATCH_NOMATCH);
3402 }
3403 }
3404 /* Control never gets here */
3405
3406 /* Match a negated single one-byte character. The character we are
3407 checking can be multibyte. */
3408
3409 case OP_NOT:
3410 case OP_NOTI:
3411 if (eptr >= md->end_subject)
3412 {
3413 SCHECK_PARTIAL();
3414 MRRETURN(MATCH_NOMATCH);
3415 }
3416 ecode++;
3417 GETCHARINCTEST(c, eptr);
3418 if (op == OP_NOTI) /* The caseless case */
3419 {
3420 #ifdef SUPPORT_UTF8
3421 if (c < 256)
3422 #endif
3423 c = md->lcc[c];
3424 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3425 }
3426 else /* Caseful */
3427 {
3428 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3429 }
3430 break;
3431
3432 /* Match a negated single one-byte character repeatedly. This is almost a
3433 repeat of the code for a repeated single character, but I haven't found a
3434 nice way of commoning these up that doesn't require a test of the
3435 positive/negative option for each character match. Maybe that wouldn't add
3436 very much to the time taken, but character matching *is* what this is all
3437 about... */
3438
3439 case OP_NOTEXACT:
3440 case OP_NOTEXACTI:
3441 min = max = GET2(ecode, 1);
3442 ecode += 3;
3443 goto REPEATNOTCHAR;
3444
3445 case OP_NOTUPTO:
3446 case OP_NOTUPTOI:
3447 case OP_NOTMINUPTO:
3448 case OP_NOTMINUPTOI:
3449 min = 0;
3450 max = GET2(ecode, 1);
3451 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3452 ecode += 3;
3453 goto REPEATNOTCHAR;
3454
3455 case OP_NOTPOSSTAR:
3456 case OP_NOTPOSSTARI:
3457 possessive = TRUE;
3458 min = 0;
3459 max = INT_MAX;
3460 ecode++;
3461 goto REPEATNOTCHAR;
3462
3463 case OP_NOTPOSPLUS:
3464 case OP_NOTPOSPLUSI:
3465 possessive = TRUE;
3466 min = 1;
3467 max = INT_MAX;
3468 ecode++;
3469 goto REPEATNOTCHAR;
3470
3471 case OP_NOTPOSQUERY:
3472 case OP_NOTPOSQUERYI:
3473 possessive = TRUE;
3474 min = 0;
3475 max = 1;
3476 ecode++;
3477 goto REPEATNOTCHAR;
3478
3479 case OP_NOTPOSUPTO:
3480 case OP_NOTPOSUPTOI:
3481 possessive = TRUE;
3482 min = 0;
3483 max = GET2(ecode, 1);
3484 ecode += 3;
3485 goto REPEATNOTCHAR;
3486
3487 case OP_NOTSTAR:
3488 case OP_NOTSTARI:
3489 case OP_NOTMINSTAR:
3490 case OP_NOTMINSTARI:
3491 case OP_NOTPLUS:
3492 case OP_NOTPLUSI:
3493 case OP_NOTMINPLUS:
3494 case OP_NOTMINPLUSI:
3495 case OP_NOTQUERY:
3496 case OP_NOTQUERYI:
3497 case OP_NOTMINQUERY:
3498 case OP_NOTMINQUERYI:
3499 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3500 minimize = (c & 1) != 0;
3501 min = rep_min[c]; /* Pick up values from tables; */
3502 max = rep_max[c]; /* zero for max => infinity */
3503 if (max == 0) max = INT_MAX;
3504
3505 /* Common code for all repeated single-byte matches. */
3506
3507 REPEATNOTCHAR:
3508 fc = *ecode++;
3509
3510 /* The code is duplicated for the caseless and caseful cases, for speed,
3511 since matching characters is likely to be quite common. First, ensure the
3512 minimum number of matches are present. If min = max, continue at the same
3513 level without recursing. Otherwise, if minimizing, keep trying the rest of
3514 the expression and advancing one matching character if failing, up to the
3515 maximum. Alternatively, if maximizing, find the maximum number of
3516 characters and work backwards. */
3517
3518 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3519 max, eptr));
3520
3521 if (op >= OP_NOTSTARI) /* Caseless */
3522 {
3523 fc = md->lcc[fc];
3524
3525 #ifdef SUPPORT_UTF8
3526 /* UTF-8 mode */
3527 if (utf8)
3528 {
3529 register unsigned int d;
3530 for (i = 1; i <= min; i++)
3531 {
3532 if (eptr >= md->end_subject)
3533 {
3534 SCHECK_PARTIAL();
3535 MRRETURN(MATCH_NOMATCH);
3536 }
3537 GETCHARINC(d, eptr);
3538 if (d < 256) d = md->lcc[d];
3539 if (fc == d) MRRETURN(MATCH_NOMATCH);
3540 }
3541 }
3542 else
3543 #endif
3544
3545 /* Not UTF-8 mode */
3546 {
3547 for (i = 1; i <= min; i++)
3548 {
3549 if (eptr >= md->end_subject)
3550 {
3551 SCHECK_PARTIAL();
3552 MRRETURN(MATCH_NOMATCH);
3553 }
3554 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3555 }
3556 }
3557
3558 if (min == max) continue;
3559
3560 if (minimize)
3561 {
3562 #ifdef SUPPORT_UTF8
3563 /* UTF-8 mode */
3564 if (utf8)
3565 {
3566 register unsigned int d;
3567 for (fi = min;; fi++)
3568 {
3569 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3570 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3571 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3572 if (eptr >= md->end_subject)
3573 {
3574 SCHECK_PARTIAL();
3575 MRRETURN(MATCH_NOMATCH);
3576 }
3577 GETCHARINC(d, eptr);
3578 if (d < 256) d = md->lcc[d];
3579 if (fc == d) MRRETURN(MATCH_NOMATCH);
3580 }
3581 }
3582 else
3583 #endif
3584 /* Not UTF-8 mode */
3585 {
3586 for (fi = min;; fi++)
3587 {
3588 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3589 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3590 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3591 if (eptr >= md->end_subject)
3592 {
3593 SCHECK_PARTIAL();
3594 MRRETURN(MATCH_NOMATCH);
3595 }
3596 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3597 }
3598 }
3599 /* Control never gets here */
3600 }
3601
3602 /* Maximize case */
3603
3604 else
3605 {
3606 pp = eptr;
3607
3608 #ifdef SUPPORT_UTF8
3609 /* UTF-8 mode */
3610 if (utf8)
3611 {
3612 register unsigned int d;
3613 for (i = min; i < max; i++)
3614 {
3615 int len = 1;
3616 if (eptr >= md->end_subject)
3617 {
3618 SCHECK_PARTIAL();
3619 break;
3620 }
3621 GETCHARLEN(d, eptr, len);
3622 if (d < 256) d = md->lcc[d];
3623 if (fc == d) break;
3624 eptr += len;
3625 }
3626 if (possessive) continue;
3627 for(;;)
3628 {
3629 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3630 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3631 if (eptr-- == pp) break; /* Stop if tried at original pos */
3632 BACKCHAR(eptr);
3633 }
3634 }
3635 else
3636 #endif
3637 /* Not UTF-8 mode */
3638 {
3639 for (i = min; i < max; i++)
3640 {
3641 if (eptr >= md->end_subject)
3642 {
3643 SCHECK_PARTIAL();
3644 break;
3645 }
3646 if (fc == md->lcc[*eptr]) break;
3647 eptr++;
3648 }
3649 if (possessive) continue;
3650 while (eptr >= pp)
3651 {
3652 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3653 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3654 eptr--;
3655 }
3656 }
3657
3658 MRRETURN(MATCH_NOMATCH);
3659 }
3660 /* Control never gets here */
3661 }
3662
3663 /* Caseful comparisons */
3664
3665 else
3666 {
3667 #ifdef SUPPORT_UTF8
3668 /* UTF-8 mode */
3669 if (utf8)
3670 {
3671 register unsigned int d;
3672 for (i = 1; i <= min; i++)
3673 {
3674 if (eptr >= md->end_subject)
3675 {
3676 SCHECK_PARTIAL();
3677 MRRETURN(MATCH_NOMATCH);
3678 }
3679 GETCHARINC(d, eptr);
3680 if (fc == d) MRRETURN(MATCH_NOMATCH);
3681 }
3682 }
3683 else
3684 #endif
3685 /* Not UTF-8 mode */
3686 {
3687 for (i = 1; i <= min; i++)
3688 {
3689 if (eptr >= md->end_subject)
3690 {
3691 SCHECK_PARTIAL();
3692 MRRETURN(MATCH_NOMATCH);
3693 }
3694 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3695 }
3696 }
3697
3698 if (min == max) continue;
3699
3700 if (minimize)
3701 {
3702 #ifdef SUPPORT_UTF8
3703 /* UTF-8 mode */
3704 if (utf8)
3705 {
3706 register unsigned int d;
3707 for (fi = min;; fi++)
3708 {
3709 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3710 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3711 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3712 if (eptr >= md->end_subject)
3713 {
3714 SCHECK_PARTIAL();
3715 MRRETURN(MATCH_NOMATCH);
3716 }
3717 GETCHARINC(d, eptr);
3718 if (fc == d) MRRETURN(MATCH_NOMATCH);
3719 }
3720 }
3721 else
3722 #endif
3723 /* Not UTF-8 mode */
3724 {
3725 for (fi = min;; fi++)
3726 {
3727 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3728 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3729 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3730 if (eptr >= md->end_subject)
3731 {
3732 SCHECK_PARTIAL();
3733 MRRETURN(MATCH_NOMATCH);
3734 }
3735 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3736 }
3737 }
3738 /* Control never gets here */
3739 }
3740
3741 /* Maximize case */
3742
3743 else
3744 {
3745 pp = eptr;
3746
3747 #ifdef SUPPORT_UTF8
3748 /* UTF-8 mode */
3749 if (utf8)
3750 {
3751 register unsigned int d;
3752 for (i = min; i < max; i++)
3753 {
3754 int len = 1;
3755 if (eptr >= md->end_subject)
3756 {
3757 SCHECK_PARTIAL();
3758 break;
3759 }
3760 GETCHARLEN(d, eptr, len);
3761 if (fc == d) break;
3762 eptr += len;
3763 }
3764 if (possessive) continue;
3765 for(;;)
3766 {
3767 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3768 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3769 if (eptr-- == pp) break; /* Stop if tried at original pos */
3770 BACKCHAR(eptr);
3771 }
3772 }
3773 else
3774 #endif
3775 /* Not UTF-8 mode */
3776 {
3777 for (i = min; i < max; i++)
3778 {
3779 if (eptr >= md->end_subject)
3780 {
3781 SCHECK_PARTIAL();
3782 break;
3783 }
3784 if (fc == *eptr) break;
3785 eptr++;
3786 }
3787 if (possessive) continue;
3788 while (eptr >= pp)
3789 {
3790 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3792 eptr--;
3793 }
3794 }
3795
3796 MRRETURN(MATCH_NOMATCH);
3797 }
3798 }
3799 /* Control never gets here */
3800
3801 /* Match a single character type repeatedly; several different opcodes
3802 share code. This is very similar to the code for single characters, but we
3803 repeat it in the interests of efficiency. */
3804
3805 case OP_TYPEEXACT:
3806 min = max = GET2(ecode, 1);
3807 minimize = TRUE;
3808 ecode += 3;
3809 goto REPEATTYPE;
3810
3811 case OP_TYPEUPTO:
3812 case OP_TYPEMINUPTO:
3813 min = 0;
3814 max = GET2(ecode, 1);
3815 minimize = *ecode == OP_TYPEMINUPTO;
3816 ecode += 3;
3817 goto REPEATTYPE;
3818
3819 case OP_TYPEPOSSTAR:
3820 possessive = TRUE;
3821 min = 0;
3822 max = INT_MAX;
3823 ecode++;
3824 goto REPEATTYPE;
3825
3826 case OP_TYPEPOSPLUS:
3827 possessive = TRUE;
3828 min = 1;
3829 max = INT_MAX;
3830 ecode++;
3831 goto REPEATTYPE;
3832
3833 case OP_TYPEPOSQUERY:
3834 possessive = TRUE;
3835 min = 0;
3836 max = 1;
3837 ecode++;
3838 goto REPEATTYPE;
3839
3840 case OP_TYPEPOSUPTO:
3841 possessive = TRUE;
3842 min = 0;
3843 max = GET2(ecode, 1);
3844 ecode += 3;
3845 goto REPEATTYPE;
3846
3847 case OP_TYPESTAR:
3848 case OP_TYPEMINSTAR:
3849 case OP_TYPEPLUS:
3850 case OP_TYPEMINPLUS:
3851 case OP_TYPEQUERY:
3852 case OP_TYPEMINQUERY:
3853 c = *ecode++ - OP_TYPESTAR;
3854 minimize = (c & 1) != 0;
3855 min = rep_min[c]; /* Pick up values from tables; */
3856 max = rep_max[c]; /* zero for max => infinity */
3857 if (max == 0) max = INT_MAX;
3858
3859 /* Common code for all repeated single character type matches. Note that
3860 in UTF-8 mode, '.' matches a character of any length, as can OP_ANYNL, OP_HSPACE, OP_VSPACE, OP_EXTUNI and the property types;
3861 for OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR, the valid characters are all one-byte long. */
3862
3863 REPEATTYPE:
3864 ctype = *ecode++; /* Code for the character type */
3865
3866 #ifdef SUPPORT_UCP
3867 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3868 {
3869 prop_fail_result = ctype == OP_NOTPROP;
3870 prop_type = *ecode++;
3871 prop_value = *ecode++;
3872 }
3873 else prop_type = -1;
3874 #endif
3875
3876 /* First, ensure the minimum number of matches are present. Use inline
3877 code for maximizing the speed, and do the type test once at the start
3878 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3879 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3880 and single-bytes. */
3881
3882 if (min > 0)
3883 {
3884 #ifdef SUPPORT_UCP
3885 if (prop_type >= 0)
3886 {
3887 switch(prop_type)
3888 {
3889 case PT_ANY:
3890 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3891 for (i = 1; i <= min; i++)
3892 {
3893 if (eptr >= md->end_subject)
3894 {
3895 SCHECK_PARTIAL();
3896 MRRETURN(MATCH_NOMATCH);
3897 }
3898 GETCHARINCTEST(c, eptr);
3899 }
3900 break;
3901
3902 case PT_LAMP:
3903 for (i = 1; i <= min; i++)
3904 {
3905 int chartype;
3906 if (eptr >= md->end_subject)
3907 {
3908 SCHECK_PARTIAL();
3909 MRRETURN(MATCH_NOMATCH);
3910 }
3911 GETCHARINCTEST(c, eptr);
3912 chartype = UCD_CHARTYPE(c);
3913 if ((chartype == ucp_Lu ||
3914 chartype == ucp_Ll ||
3915 chartype == ucp_Lt) == prop_fail_result)
3916 MRRETURN(MATCH_NOMATCH);
3917 }
3918 break;
3919
3920 case PT_GC:
3921 for (i = 1; i <= min; i++)
3922 {
3923 if (eptr >= md->end_subject)
3924 {
3925 SCHECK_PARTIAL();
3926 MRRETURN(MATCH_NOMATCH);
3927 }
3928 GETCHARINCTEST(c, eptr);
3929 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3930 MRRETURN(MATCH_NOMATCH);
3931 }
3932 break;
3933
3934 case PT_PC:
3935 for (i = 1; i <= min; i++)
3936 {
3937 if (eptr >= md->end_subject)
3938 {
3939 SCHECK_PARTIAL();
3940 MRRETURN(MATCH_NOMATCH);
3941 }
3942 GETCHARINCTEST(c, eptr);
3943 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3944 MRRETURN(MATCH_NOMATCH);
3945 }
3946 break;
3947
3948 case PT_SC:
3949 for (i = 1; i <= min; i++)
3950 {
3951 if (eptr >= md->end_subject)
3952 {
3953 SCHECK_PARTIAL();
3954 MRRETURN(MATCH_NOMATCH);
3955 }
3956 GETCHARINCTEST(c, eptr);
3957 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3958 MRRETURN(MATCH_NOMATCH);
3959 }
3960 break;
3961
3962 case PT_ALNUM:
3963 for (i = 1; i <= min; i++)
3964 {
3965 int category;
3966 if (eptr >= md->end_subject)
3967 {
3968 SCHECK_PARTIAL();
3969 MRRETURN(MATCH_NOMATCH);
3970 }
3971 GETCHARINCTEST(c, eptr);
3972 category = UCD_CATEGORY(c);
3973 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3974 MRRETURN(MATCH_NOMATCH);
3975 }
3976 break;
3977
3978 case PT_SPACE: /* Perl space */
3979 for (i = 1; i <= min; i++)
3980 {
3981 if (eptr >= md->end_subject)
3982 {
3983 SCHECK_PARTIAL();
3984 MRRETURN(MATCH_NOMATCH);
3985 }
3986 GETCHARINCTEST(c, eptr);
3987 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3988 c == CHAR_FF || c == CHAR_CR)
3989 == prop_fail_result)
3990 MRRETURN(MATCH_NOMATCH);
3991 }
3992 break;
3993
3994 case PT_PXSPACE: /* POSIX space */
3995 for (i = 1; i <= min; i++)
3996 {
3997 if (eptr >= md->end_subject)
3998 {
3999 SCHECK_PARTIAL();
4000 MRRETURN(MATCH_NOMATCH);
4001 }
4002 GETCHARINCTEST(c, eptr);
4003 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4004 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4005 == prop_fail_result)
4006 MRRETURN(MATCH_NOMATCH);
4007 }
4008 break;
4009
4010 case PT_WORD:
4011 for (i = 1; i <= min; i++)
4012 {
4013 int category;
4014 if (eptr >= md->end_subject)
4015 {
4016 SCHECK_PARTIAL();
4017 MRRETURN(MATCH_NOMATCH);
4018 }
4019 GETCHARINCTEST(c, eptr);
4020 category = UCD_CATEGORY(c);
4021 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4022 == prop_fail_result)
4023 MRRETURN(MATCH_NOMATCH);
4024 }
4025 break;
4026
4027 /* This should not occur */
4028
4029 default:
4030 RRETURN(PCRE_ERROR_INTERNAL);
4031 }
4032 }
4033
4034 /* Match extended Unicode sequences. We will get here only if the
4035 support is in the binary; otherwise a compile-time error occurs. */
4036
4037 else if (ctype == OP_EXTUNI)
4038 {
4039 for (i = 1; i <= min; i++)
4040 {
4041 if (eptr >= md->end_subject)
4042 {
4043 SCHECK_PARTIAL();
4044 MRRETURN(MATCH_NOMATCH);
4045 }
4046 GETCHARINCTEST(c, eptr);
4047 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4048 while (eptr < md->end_subject)
4049 {
4050 int len = 1;
4051 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4052 if (UCD_CATEGORY(c) != ucp_M) break;
4053 eptr += len;
4054 }
4055 }
4056 }
4057
4058 else
4059 #endif /* SUPPORT_UCP */
4060
4061 /* Handle all other cases when the coding is UTF-8 */
4062
4063 #ifdef SUPPORT_UTF8
4064 if (utf8) switch(ctype)
4065 {
4066 case OP_ANY:
4067 for (i = 1; i <= min; i++)
4068 {
4069 if (eptr >= md->end_subject)
4070 {
4071 SCHECK_PARTIAL();
4072 MRRETURN(MATCH_NOMATCH);
4073 }
4074 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4075 eptr++;
4076 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4077 }
4078 break;
4079
4080 case OP_ALLANY:
4081 for (i = 1; i <= min; i++)
4082 {
4083 if (eptr >= md->end_subject)
4084 {
4085 SCHECK_PARTIAL();
4086 MRRETURN(MATCH_NOMATCH);
4087 }
4088 eptr++;
4089 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4090 }
4091 break;
4092
4093 case OP_ANYBYTE:
4094 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
4095 eptr += min;
4096 break;
4097
4098 case OP_ANYNL:
4099 for (i = 1; i <= min; i++)
4100 {
4101 if (eptr >= md->end_subject)
4102 {
4103 SCHECK_PARTIAL();
4104 MRRETURN(MATCH_NOMATCH);
4105 }
4106 GETCHARINC(c, eptr);
4107 switch(c)
4108 {
4109 default: MRRETURN(MATCH_NOMATCH);
4110
4111 case 0x000d:
4112 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4113 break;
4114
4115 case 0x000a:
4116 break;
4117
4118 case 0x000b:
4119 case 0x000c:
4120 case 0x0085:
4121 case 0x2028:
4122 case 0x2029:
4123 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4124 break;
4125 }
4126 }
4127 break;
4128
4129 case OP_NOT_HSPACE:
4130 for (i = 1; i <= min; i++)
4131 {
4132 if (eptr >= md->end_subject)
4133 {
4134 SCHECK_PARTIAL();
4135 MRRETURN(MATCH_NOMATCH);
4136 }
4137 GETCHARINC(c, eptr);
4138 switch(c)
4139 {
4140 default: break;
4141 case 0x09: /* HT */
4142 case 0x20: /* SPACE */
4143 case 0xa0: /* NBSP */
4144 case 0x1680: /* OGHAM SPACE MARK */
4145 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4146 case 0x2000: /* EN QUAD */
4147 case 0x2001: /* EM QUAD */
4148 case 0x2002: /* EN SPACE */
4149 case 0x2003: /* EM SPACE */
4150 case 0x2004: /* THREE-PER-EM SPACE */
4151 case 0x2005: /* FOUR-PER-EM SPACE */
4152 case 0x2006: /* SIX-PER-EM SPACE */
4153 case 0x2007: /* FIGURE SPACE */
4154 case 0x2008: /* PUNCTUATION SPACE */
4155 case 0x2009: /* THIN SPACE */
4156 case 0x200A: /* HAIR SPACE */
4157 case 0x202f: /* NARROW NO-BREAK SPACE */
4158 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4159 case 0x3000: /* IDEOGRAPHIC SPACE */
4160 MRRETURN(MATCH_NOMATCH);
4161 }
4162 }
4163 break;
4164
4165 case OP_HSPACE:
4166 for (i = 1; i <= min; i++)
4167 {
4168 if (eptr >= md->end_subject)
4169 {
4170 SCHECK_PARTIAL();
4171 MRRETURN(MATCH_NOMATCH);
4172 }
4173 GETCHARINC(c, eptr);
4174 switch(c)
4175 {
4176 default: MRRETURN(MATCH_NOMATCH);
4177 case 0x09: /* HT */
4178 case 0x20: /* SPACE */
4179 case 0xa0: /* NBSP */
4180 case 0x1680: /* OGHAM SPACE MARK */
4181 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4182 case 0x2000: /* EN QUAD */
4183 case 0x2001: /* EM QUAD */
4184 case 0x2002: /* EN SPACE */
4185 case 0x2003: /* EM SPACE */
4186 case 0x2004: /* THREE-PER-EM SPACE */
4187 case 0x2005: /* FOUR-PER-EM SPACE */
4188 case 0x2006: /* SIX-PER-EM SPACE */
4189 case 0x2007: /* FIGURE SPACE */
4190 case 0x2008: /* PUNCTUATION SPACE */
4191 case 0x2009: /* THIN SPACE */
4192 case 0x200A: /* HAIR SPACE */
4193 case 0x202f: /* NARROW NO-BREAK SPACE */
4194 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4195 case 0x3000: /* IDEOGRAPHIC SPACE */
4196 break;
4197 }
4198 }
4199 break;
4200
4201 case OP_NOT_VSPACE:
4202 for (i = 1; i <= min; i++)
4203 {
4204 if (eptr >= md->end_subject)
4205 {
4206 SCHECK_PARTIAL();
4207 MRRETURN(MATCH_NOMATCH);
4208 }
4209 GETCHARINC(c, eptr);
4210 switch(c)
4211 {
4212 default: break;
4213 case 0x0a: /* LF */
4214 case 0x0b: /* VT */
4215 case 0x0c: /* FF */
4216 case 0x0d: /* CR */
4217 case 0x85: /* NEL */
4218 case 0x2028: /* LINE SEPARATOR */
4219 case 0x2029: /* PARAGRAPH SEPARATOR */
4220 MRRETURN(MATCH_NOMATCH);
4221 }
4222 }
4223 break;
4224
4225 case OP_VSPACE:
4226 for (i = 1; i <= min; i++)
4227 {
4228 if (eptr >= md->end_subject)
4229 {
4230 SCHECK_PARTIAL();
4231 MRRETURN(MATCH_NOMATCH);
4232 }
4233 GETCHARINC(c, eptr);
4234 switch(c)
4235 {
4236 default: MRRETURN(MATCH_NOMATCH);
4237 case 0x0a: /* LF */
4238 case 0x0b: /* VT */
4239 case 0x0c: /* FF */
4240 case 0x0d: /* CR */
4241 case 0x85: /* NEL */
4242 case 0x2028: /* LINE SEPARATOR */
4243 case 0x2029: /* PARAGRAPH SEPARATOR */
4244 break;
4245 }
4246 }
4247 break;
4248
4249 case OP_NOT_DIGIT:
4250 for (i = 1; i <= min; i++)
4251 {
4252 if (eptr >= md->end_subject)
4253 {
4254 SCHECK_PARTIAL();
4255 MRRETURN(MATCH_NOMATCH);
4256 }
4257 GETCHARINC(c, eptr);
4258 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4259 MRRETURN(MATCH_NOMATCH);
4260 }
4261 break;
4262
4263 case OP_DIGIT:
4264 for (i = 1; i <= min; i++)
4265 {
4266 if (eptr >= md->end_subject)
4267 {
4268 SCHECK_PARTIAL();
4269 MRRETURN(MATCH_NOMATCH);
4270 }
4271 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4272 MRRETURN(MATCH_NOMATCH);
4273 /* No need to skip more bytes - we know it's a 1-byte character */
4274 }
4275 break;
4276
4277 case OP_NOT_WHITESPACE:
4278 for (i = 1; i <= min; i++)
4279 {
4280 if (eptr >= md->end_subject)
4281 {
4282 SCHECK_PARTIAL();
4283 MRRETURN(MATCH_NOMATCH);
4284 }
4285 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4286 MRRETURN(MATCH_NOMATCH);
4287 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4288 }
4289 break;
4290
4291 case OP_WHITESPACE:
4292 for (i = 1; i <= min; i++)
4293 {
4294 if (eptr >= md->end_subject)
4295 {
4296 SCHECK_PARTIAL();
4297 MRRETURN(MATCH_NOMATCH);
4298 }
4299 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4300 MRRETURN(MATCH_NOMATCH);
4301 /* No need to skip more bytes - we know it's a 1-byte character */
4302 }
4303 break;
4304
4305 case OP_NOT_WORDCHAR:
4306 for (i = 1; i <= min; i++)
4307 {
4308 if (eptr >= md->end_subject)
4309 {
4310 SCHECK_PARTIAL();
4311 MRRETURN(MATCH_NOMATCH);
4312 }
4313 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4314 MRRETURN(MATCH_NOMATCH);
4315 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4316 }
4317 break;
4318
4319 case OP_WORDCHAR:
4320 for (i = 1; i <= min; i++)
4321 {
4322 if (eptr >= md->end_subject)
4323 {
4324 SCHECK_PARTIAL();
4325 MRRETURN(MATCH_NOMATCH);
4326 }
4327 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4328 MRRETURN(MATCH_NOMATCH);
4329 /* No need to skip more bytes - we know it's a 1-byte character */
4330 }
4331 break;
4332
4333 default:
4334 RRETURN(PCRE_ERROR_INTERNAL);
4335 } /* End switch(ctype) */
4336
4337 else
4338 #endif /* SUPPORT_UTF8 */
4339
4340 /* Code for the non-UTF-8 case for minimum matching of operators other
4341 than OP_PROP and OP_NOTPROP. */
4342
4343 switch(ctype)
4344 {
4345 case OP_ANY:
4346 for (i = 1; i <= min; i++)
4347 {
4348 if (eptr >= md->end_subject)
4349 {
4350 SCHECK_PARTIAL();
4351 MRRETURN(MATCH_NOMATCH);
4352 }
4353 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4354 eptr++;
4355 }
4356 break;
4357
4358 case OP_ALLANY:
4359 if (eptr > md->end_subject - min)
4360 {
4361 SCHECK_PARTIAL();
4362 MRRETURN(MATCH_NOMATCH);
4363 }
4364 eptr += min;
4365 break;
4366
4367 case OP_ANYBYTE:
4368 if (eptr > md->end_subject - min)
4369 {
4370 SCHECK_PARTIAL();
4371 MRRETURN(MATCH_NOMATCH);
4372 }
4373 eptr += min;
4374 break;
4375
4376 case OP_ANYNL:
4377 for (i = 1; i <= min; i++)
4378 {
4379 if (eptr >= md->end_subject)
4380 {
4381 SCHECK_PARTIAL();
4382 MRRETURN(MATCH_NOMATCH);
4383 }
4384 switch(*eptr++)
4385 {
4386 default: MRRETURN(MATCH_NOMATCH);
4387
4388 case 0x000d:
4389 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4390 break;
4391
4392 case 0x000a:
4393 break;
4394
4395 case 0x000b:
4396 case 0x000c:
4397 case 0x0085:
4398 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4399 break;
4400 }
4401 }
4402 break;
4403
4404 case OP_NOT_HSPACE:
4405 for (i = 1; i <= min; i++)
4406 {
4407 if (eptr >= md->end_subject)
4408 {
4409 SCHECK_PARTIAL();
4410 MRRETURN(MATCH_NOMATCH);
4411 }
4412 switch(*eptr++)
4413 {
4414 default: break;
4415 case 0x09: /* HT */
4416 case 0x20: /* SPACE */
4417 case 0xa0: /* NBSP */
4418 MRRETURN(MATCH_NOMATCH);
4419 }
4420 }
4421 break;
4422
4423 case OP_HSPACE:
4424 for (i = 1; i <= min; i++)
4425 {
4426 if (eptr >= md->end_subject)
4427 {
4428 SCHECK_PARTIAL();
4429 MRRETURN(MATCH_NOMATCH);
4430 }
4431 switch(*eptr++)
4432 {
4433 default: MRRETURN(MATCH_NOMATCH);
4434 case 0x09: /* HT */
4435 case 0x20: /* SPACE */
4436 case 0xa0: /* NBSP */
4437 break;
4438 }
4439 }
4440 break;
4441
4442 case OP_NOT_VSPACE:
4443 for (i = 1; i <= min; i++)
4444 {
4445 if (eptr >= md->end_subject)
4446 {
4447 SCHECK_PARTIAL();
4448 MRRETURN(MATCH_NOMATCH);
4449 }
4450 switch(*eptr++)
4451 {
4452 default: break;
4453 case 0x0a: /* LF */
4454 case 0x0b: /* VT */
4455 case 0x0c: /* FF */
4456 case 0x0d: /* CR */
4457 case 0x85: /* NEL */
4458 MRRETURN(MATCH_NOMATCH);
4459 }
4460 }
4461 break;
4462
4463 case OP_VSPACE:
4464 for (i = 1; i <= min; i++)
4465 {
4466 if (eptr >= md->end_subject)
4467 {
4468 SCHECK_PARTIAL();
4469 MRRETURN(MATCH_NOMATCH);
4470 }
4471 switch(*eptr++)
4472 {
4473 default: MRRETURN(MATCH_NOMATCH);
4474 case 0x0a: /* LF */
4475 case 0x0b: /* VT */
4476 case 0x0c: /* FF */
4477 case 0x0d: /* CR */
4478 case 0x85: /* NEL */
4479 break;
4480 }
4481 }
4482 break;
4483
4484 case OP_NOT_DIGIT:
4485 for (i = 1; i <= min; i++)
4486 {
4487 if (eptr >= md->end_subject)
4488 {
4489 SCHECK_PARTIAL();
4490 MRRETURN(MATCH_NOMATCH);
4491 }
4492 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4493 }
4494 break;
4495
4496 case OP_DIGIT:
4497 for (i = 1; i <= min; i++)
4498 {
4499 if (eptr >= md->end_subject)
4500 {
4501 SCHECK_PARTIAL();
4502 MRRETURN(MATCH_NOMATCH);
4503 }
4504 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4505 }
4506 break;
4507
4508 case OP_NOT_WHITESPACE:
4509 for (i = 1; i <= min; i++)
4510 {
4511 if (eptr >= md->end_subject)
4512 {
4513 SCHECK_PARTIAL();
4514 MRRETURN(MATCH_NOMATCH);
4515 }
4516 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4517 }
4518 break;
4519
4520 case OP_WHITESPACE:
4521 for (i = 1; i <= min; i++)
4522 {
4523 if (eptr >= md->end_subject)
4524 {
4525 SCHECK_PARTIAL();
4526 MRRETURN(MATCH_NOMATCH);
4527 }
4528 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4529 }
4530 break;
4531
4532 case OP_NOT_WORDCHAR:
4533 for (i = 1; i <= min; i++)
4534 {
4535 if (eptr >= md->end_subject)
4536 {
4537 SCHECK_PARTIAL();
4538 MRRETURN(MATCH_NOMATCH);
4539 }
4540 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4541 MRRETURN(MATCH_NOMATCH);
4542 }
4543 break;
4544
4545 case OP_WORDCHAR:
4546 for (i = 1; i <= min; i++)
4547 {
4548 if (eptr >= md->end_subject)
4549 {
4550 SCHECK_PARTIAL();
4551 MRRETURN(MATCH_NOMATCH);
4552 }
4553 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4554 MRRETURN(MATCH_NOMATCH);
4555 }
4556 break;
4557
4558 default:
4559 RRETURN(PCRE_ERROR_INTERNAL);
4560 }
4561 }
4562
4563 /* If min = max, continue at the same level without recursing */
4564
4565 if (min == max) continue;
4566
4567 /* If minimizing, we have to test the rest of the pattern before each
4568 subsequent match. Again, separate the UTF-8 case for speed, and also
4569 separate the UCP cases. */
4570
4571 if (minimize)
4572 {
4573 #ifdef SUPPORT_UCP
4574 if (prop_type >= 0)
4575 {
4576 switch(prop_type)
4577 {
4578 case PT_ANY:
4579 for (fi = min;; fi++)
4580 {
4581 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4582 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4583 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4584 if (eptr >= md->end_subject)
4585 {
4586 SCHECK_PARTIAL();
4587 MRRETURN(MATCH_NOMATCH);
4588 }
4589 GETCHARINCTEST(c, eptr);
4590 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4591 }
4592 /* Control never gets here */
4593
4594 case PT_LAMP:
4595 for (fi = min;; fi++)
4596 {
4597 int chartype;
4598 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4599 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4600 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4601 if (eptr >= md->end_subject)
4602 {
4603 SCHECK_PARTIAL();
4604 MRRETURN(MATCH_NOMATCH);
4605 }
4606 GETCHARINCTEST(c, eptr);
4607 chartype = UCD_CHARTYPE(c);
4608 if ((chartype == ucp_Lu ||
4609 chartype == ucp_Ll ||
4610 chartype == ucp_Lt) == prop_fail_result)
4611 MRRETURN(MATCH_NOMATCH);
4612 }
4613 /* Control never gets here */
4614
4615 case PT_GC:
4616 for (fi = min;; fi++)
4617 {
4618 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4619 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4620 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4621 if (eptr >= md->end_subject)
4622 {
4623 SCHECK_PARTIAL();
4624 MRRETURN(MATCH_NOMATCH);
4625 }
4626 GETCHARINCTEST(c, eptr);
4627 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4628 MRRETURN(MATCH_NOMATCH);
4629 }
4630 /* Control never gets here */
4631
4632 case PT_PC:
4633 for (fi = min;; fi++)
4634 {
4635 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4636 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4637 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4638 if (eptr >= md->end_subject)
4639 {
4640 SCHECK_PARTIAL();
4641 MRRETURN(MATCH_NOMATCH);
4642 }
4643 GETCHARINCTEST(c, eptr);
4644 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4645 MRRETURN(MATCH_NOMATCH);
4646 }
4647 /* Control never gets here */
4648
4649 case PT_SC:
4650 for (fi = min;; fi++)
4651 {
4652 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4653 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4654 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4655 if (eptr >= md->end_subject)
4656 {
4657 SCHECK_PARTIAL();
4658 MRRETURN(MATCH_NOMATCH);
4659 }
4660 GETCHARINCTEST(c, eptr);
4661 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4662 MRRETURN(MATCH_NOMATCH);
4663 }
4664 /* Control never gets here */
4665
4666 case PT_ALNUM:
4667 for (fi = min;; fi++)
4668 {
4669 int category;
4670 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4671 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4672 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4673 if (eptr >= md->end_subject)
4674 {
4675 SCHECK_PARTIAL();
4676 MRRETURN(MATCH_NOMATCH);
4677 }
4678 GETCHARINCTEST(c, eptr);
4679 category = UCD_CATEGORY(c);
4680 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4681 MRRETURN(MATCH_NOMATCH);
4682 }
4683 /* Control never gets here */
4684
4685 case PT_SPACE: /* Perl space */
4686 for (fi = min;; fi++)
4687 {
4688 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4689 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4690 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4691 if (eptr >= md->end_subject)
4692 {
4693 SCHECK_PARTIAL();
4694 MRRETURN(MATCH_NOMATCH);
4695 }
4696 GETCHARINCTEST(c, eptr);
4697 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4698 c == CHAR_FF || c == CHAR_CR)
4699 == prop_fail_result)
4700 MRRETURN(MATCH_NOMATCH);
4701 }
4702 /* Control never gets here */
4703
4704 case PT_PXSPACE: /* POSIX space */
4705 for (fi = min;; fi++)
4706 {
4707 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4708 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4709 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4710 if (eptr >= md->end_subject)
4711 {
4712 SCHECK_PARTIAL();
4713 MRRETURN(MATCH_NOMATCH);
4714 }
4715 GETCHARINCTEST(c, eptr);
4716 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4717 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4718 == prop_fail_result)
4719 MRRETURN(MATCH_NOMATCH);
4720 }
4721 /* Control never gets here */
4722
4723 case PT_WORD:
4724 for (fi = min;; fi++)
4725 {
4726 int category;
4727 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4728 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4729 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4730 if (eptr >= md->end_subject)
4731 {
4732 SCHECK_PARTIAL();
4733 MRRETURN(MATCH_NOMATCH);
4734 }
4735 GETCHARINCTEST(c, eptr);
4736 category = UCD_CATEGORY(c);
4737 if ((category == ucp_L ||
4738 category == ucp_N ||
4739 c == CHAR_UNDERSCORE)
4740 == prop_fail_result)
4741 MRRETURN(MATCH_NOMATCH);
4742 }
4743 /* Control never gets here */
4744
4745 /* This should never occur */
4746
4747 default:
4748 RRETURN(PCRE_ERROR_INTERNAL);
4749 }
4750 }
4751
4752 /* Match extended Unicode sequences. We will get here only if the
4753 support is in the binary; otherwise a compile-time error occurs. */
4754
4755 else if (ctype == OP_EXTUNI)
4756 {
4757 for (fi = min;; fi++)
4758 {
4759 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4760 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4761 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4762 if (eptr >= md->end_subject)
4763 {
4764 SCHECK_PARTIAL();
4765 MRRETURN(MATCH_NOMATCH);
4766 }
4767 GETCHARINCTEST(c, eptr);
4768 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4769 while (eptr < md->end_subject)
4770 {
4771 int len = 1;
4772 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4773 if (UCD_CATEGORY(c) != ucp_M) break;
4774 eptr += len;
4775 }
4776 }
4777 }
4778 else
4779 #endif /* SUPPORT_UCP */
4780
4781 #ifdef SUPPORT_UTF8
4782 /* UTF-8 mode */
4783 if (utf8)
4784 {
4785 for (fi = min;; fi++)
4786 {
4787 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4788 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4789 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4790 if (eptr >= md->end_subject)
4791 {
4792 SCHECK_PARTIAL();
4793 MRRETURN(MATCH_NOMATCH);
4794 }
4795 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4796 MRRETURN(MATCH_NOMATCH);
4797 GETCHARINC(c, eptr);
4798 switch(ctype)
4799 {
4800 case OP_ANY: /* This is the non-NL case */
4801 case OP_ALLANY:
4802 case OP_ANYBYTE:
4803 break;
4804
4805 case OP_ANYNL:
4806 switch(c)
4807 {
4808 default: MRRETURN(MATCH_NOMATCH);
4809 case 0x000d:
4810 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4811 break;
4812 case 0x000a:
4813 break;
4814
4815 case 0x000b:
4816 case 0x000c:
4817 case 0x0085:
4818 case 0x2028:
4819 case 0x2029:
4820 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4821 break;
4822 }
4823 break;
4824
4825 case OP_NOT_HSPACE:
4826 switch(c)
4827 {
4828 default: break;
4829 case 0x09: /* HT */
4830 case 0x20: /* SPACE */
4831 case 0xa0: /* NBSP */
4832 case 0x1680: /* OGHAM SPACE MARK */
4833 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4834 case 0x2000: /* EN QUAD */
4835 case 0x2001: /* EM QUAD */
4836 case 0x2002: /* EN SPACE */
4837 case 0x2003: /* EM SPACE */
4838 case 0x2004: /* THREE-PER-EM SPACE */
4839 case 0x2005: /* FOUR-PER-EM SPACE */
4840 case 0x2006: /* SIX-PER-EM SPACE */
4841 case 0x2007: /* FIGURE SPACE */
4842 case 0x2008: /* PUNCTUATION SPACE */
4843 case 0x2009: /* THIN SPACE */
4844 case 0x200A: /* HAIR SPACE */
4845 case 0x202f: /* NARROW NO-BREAK SPACE */
4846 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4847 case 0x3000: /* IDEOGRAPHIC SPACE */
4848 MRRETURN(MATCH_NOMATCH);
4849 }
4850 break;
4851
4852 case OP_HSPACE:
4853 switch(c)
4854 {
4855 default: MRRETURN(MATCH_NOMATCH);
4856 case 0x09: /* HT */
4857 case 0x20: /* SPACE */
4858 case 0xa0: /* NBSP */
4859 case 0x1680: /* OGHAM SPACE MARK */
4860 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4861 case 0x2000: /* EN QUAD */
4862 case 0x2001: /* EM QUAD */
4863 case 0x2002: /* EN SPACE */
4864 case 0x2003: /* EM SPACE */
4865 case 0x2004: /* THREE-PER-EM SPACE */
4866 case 0x2005: /* FOUR-PER-EM SPACE */
4867 case 0x2006: /* SIX-PER-EM SPACE */
4868 case 0x2007: /* FIGURE SPACE */
4869 case 0x2008: /* PUNCTUATION SPACE */
4870 case 0x2009: /* THIN SPACE */
4871 case 0x200A: /* HAIR SPACE */
4872 case 0x202f: /* NARROW NO-BREAK SPACE */
4873 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4874 case 0x3000: /* IDEOGRAPHIC SPACE */
4875 break;
4876 }
4877 break;
4878
4879 case OP_NOT_VSPACE:
4880 switch(c)
4881 {
4882 default: break;
4883 case 0x0a: /* LF */
4884 case 0x0b: /* VT */
4885 case 0x0c: /* FF */
4886 case 0x0d: /* CR */
4887 case 0x85: /* NEL */
4888 case 0x2028: /* LINE SEPARATOR */
4889 case 0x2029: /* PARAGRAPH SEPARATOR */
4890 MRRETURN(MATCH_NOMATCH);
4891 }
4892 break;
4893
4894 case OP_VSPACE:
4895 switch(c)
4896 {
4897 default: MRRETURN(MATCH_NOMATCH);
4898 case 0x0a: /* LF */
4899 case 0x0b: /* VT */
4900 case 0x0c: /* FF */
4901 case 0x0d: /* CR */
4902 case 0x85: /* NEL */
4903 case 0x2028: /* LINE SEPARATOR */
4904 case 0x2029: /* PARAGRAPH SEPARATOR */
4905 break;
4906 }
4907 break;
4908
4909 case OP_NOT_DIGIT:
4910 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4911 MRRETURN(MATCH_NOMATCH);
4912 break;
4913
4914 case OP_DIGIT:
4915 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4916 MRRETURN(MATCH_NOMATCH);
4917 break;
4918
4919 case OP_NOT_WHITESPACE:
4920 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4921 MRRETURN(MATCH_NOMATCH);
4922 break;
4923
4924 case OP_WHITESPACE:
4925 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4926 MRRETURN(MATCH_NOMATCH);
4927 break;
4928
4929 case OP_NOT_WORDCHAR:
4930 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4931 MRRETURN(MATCH_NOMATCH);
4932 break;
4933
4934 case OP_WORDCHAR:
4935 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4936 MRRETURN(MATCH_NOMATCH);
4937 break;
4938
4939 default:
4940 RRETURN(PCRE_ERROR_INTERNAL);
4941 }
4942 }
4943 }
4944 else
4945 #endif
4946 /* Not UTF-8 mode */
4947 {
4948 for (fi = min;; fi++)
4949 {
4950 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4951 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4952 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4953 if (eptr >= md->end_subject)
4954 {
4955 SCHECK_PARTIAL();
4956 MRRETURN(MATCH_NOMATCH);
4957 }
4958 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4959 MRRETURN(MATCH_NOMATCH);
4960 c = *eptr++;
4961 switch(ctype)
4962 {
4963 case OP_ANY: /* This is the non-NL case */
4964 case OP_ALLANY:
4965 case OP_ANYBYTE:
4966 break;
4967
4968 case OP_ANYNL:
4969 switch(c)
4970 {
4971 default: MRRETURN(MATCH_NOMATCH);
4972 case 0x000d:
4973 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4974 break;
4975
4976 case 0x000a:
4977 break;
4978
4979 case 0x000b:
4980 case 0x000c:
4981 case 0x0085:
4982 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4983 break;
4984 }
4985 break;
4986
4987 case OP_NOT_HSPACE:
4988 switch(c)
4989 {
4990 default: break;
4991 case 0x09: /* HT */
4992 case 0x20: /* SPACE */
4993 case 0xa0: /* NBSP */
4994 MRRETURN(MATCH_NOMATCH);
4995 }
4996 break;
4997
4998 case OP_HSPACE:
4999 switch(c)
5000 {
5001 default: MRRETURN(MATCH_NOMATCH);
5002 case 0x09: /* HT */
5003 case 0x20: /* SPACE */
5004 case 0xa0: /* NBSP */
5005 break;
5006 }
5007 break;
5008
5009 case OP_NOT_VSPACE:
5010 switch(c)
5011 {
5012 default: break;
5013 case 0x0a: /* LF */
5014 case 0x0b: /* VT */
5015 case 0x0c: /* FF */
5016 case 0x0d: /* CR */
5017 case 0x85: /* NEL */
5018 MRRETURN(MATCH_NOMATCH);
5019 }
5020 break;
5021
5022 case OP_VSPACE:
5023 switch(c)
5024 {
5025 default: MRRETURN(MATCH_NOMATCH);
5026 case 0x0a: /* LF */
5027 case 0x0b: /* VT */
5028 case 0x0c: /* FF */
5029 case 0x0d: /* CR */
5030 case 0x85: /* NEL */
5031 break;
5032 }
5033 break;
5034
5035 case OP_NOT_DIGIT:
5036 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
5037 break;
5038
5039 case OP_DIGIT:
5040 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
5041 break;
5042
5043 case OP_NOT_WHITESPACE:
5044 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
5045 break;
5046
5047 case OP_WHITESPACE:
5048 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
5049 break;
5050
5051 case OP_NOT_WORDCHAR:
5052 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
5053 break;
5054
5055 case OP_WORDCHAR:
5056 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
5057 break;
5058
5059 default:
5060 RRETURN(PCRE_ERROR_INTERNAL);
5061 }
5062 }
5063 }
5064 /* Control never gets here */
5065 }
5066
5067 /* If maximizing, it is worth using inline code for speed, doing the type
5068 test once at the start (i.e. keep it out of the loop). Again, keep the
5069 UTF-8 and UCP stuff separate. */
5070
5071 else
5072 {
5073 pp = eptr; /* Remember where we started */
5074
5075 #ifdef SUPPORT_UCP
5076 if (prop_type >= 0)
5077 {
5078 switch(prop_type)
5079 {
5080 case PT_ANY:
5081 for (i = min; i < max; i++)
5082 {
5083 int len = 1;
5084 if (eptr >= md->end_subject)
5085 {
5086 SCHECK_PARTIAL();
5087 break;
5088 }
5089 GETCHARLENTEST(c, eptr, len);
5090 if (prop_fail_result) break;
5091 eptr+= len;
5092 }
5093 break;
5094
5095 case PT_LAMP:
5096 for (i = min; i < max; i++)
5097 {
5098 int chartype;
5099 int len = 1;
5100 if (eptr >= md->end_subject)
5101 {
5102 SCHECK_PARTIAL();
5103 break;
5104 }
5105 GETCHARLENTEST(c, eptr, len);
5106 chartype = UCD_CHARTYPE(c);
5107 if ((chartype == ucp_Lu ||
5108 chartype == ucp_Ll ||
5109 chartype == ucp_Lt) == prop_fail_result)
5110 break;
5111 eptr+= len;
5112 }
5113 break;
5114
5115 case PT_GC:
5116 for (i = min; i < max; i++)
5117 {
5118 int len = 1;
5119 if (eptr >= md->end_subject)
5120 {
5121 SCHECK_PARTIAL();
5122 break;
5123 }
5124 GETCHARLENTEST(c, eptr, len);
5125 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5126 eptr+= len;
5127 }
5128 break;
5129
5130 case PT_PC:
5131 for (i = min; i < max; i++)
5132 {
5133 int len = 1;
5134 if (eptr >= md->end_subject)
5135 {
5136 SCHECK_PARTIAL();
5137 break;
5138 }
5139 GETCHARLENTEST(c, eptr, len);
5140 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5141 eptr+= len;
5142 }
5143 break;
5144
5145 case PT_SC:
5146 for (i = min; i < max; i++)
5147 {
5148 int len = 1;
5149 if (eptr >= md->end_subject)
5150 {
5151 SCHECK_PARTIAL();
5152 break;
5153 }
5154 GETCHARLENTEST(c, eptr, len);
5155 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5156 eptr+= len;
5157 }
5158 break;
5159
5160 case PT_ALNUM:
5161 for (i = min; i < max; i++)
5162 {
5163 int category;
5164 int len = 1;
5165 if (eptr >= md->end_subject)
5166 {
5167 SCHECK_PARTIAL();
5168 break;
5169 }
5170 GETCHARLENTEST(c, eptr, len);
5171 category = UCD_CATEGORY(c);
5172 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5173 break;
5174 eptr+= len;
5175 }
5176 break;
5177
5178 case PT_SPACE: /* Perl space */
5179 for (i = min; i < max; i++)
5180 {
5181 int len = 1;
5182 if (eptr >= md->end_subject)
5183 {
5184 SCHECK_PARTIAL();
5185 break;
5186 }
5187 GETCHARLENTEST(c, eptr, len);
5188 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5189 c == CHAR_FF || c == CHAR_CR)
5190 == prop_fail_result)
5191 break;
5192 eptr+= len;
5193 }
5194 break;
5195
5196 case PT_PXSPACE: /* POSIX space */
5197 for (i = min; i < max; i++)
5198 {
5199 int len = 1;
5200 if (eptr >= md->end_subject)
5201 {
5202 SCHECK_PARTIAL();
5203 break;
5204 }
5205 GETCHARLENTEST(c, eptr, len);
5206 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5207 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5208 == prop_fail_result)
5209 break;
5210 eptr+= len;
5211 }
5212 break;
5213
5214 case PT_WORD:
5215 for (i = min; i < max; i++)
5216 {
5217 int category;
5218 int len = 1;
5219 if (eptr >= md->end_subject)
5220 {
5221 SCHECK_PARTIAL();
5222 break;
5223 }
5224 GETCHARLENTEST(c, eptr, len);
5225 category = UCD_CATEGORY(c);
5226 if ((category == ucp_L || category == ucp_N ||
5227 c == CHAR_UNDERSCORE) == prop_fail_result)
5228 break;
5229 eptr+= len;
5230 }
5231 break;
5232
5233 default:
5234 RRETURN(PCRE_ERROR_INTERNAL);
5235 }
5236
5237 /* eptr is now past the end of the maximum run */
5238
5239 if (possessive) continue;
5240 for(;;)
5241 {
5242 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5243 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5244 if (eptr-- == pp) break; /* Stop if tried at original pos */
5245 if (utf8) BACKCHAR(eptr);
5246 }
5247 }
5248
5249 /* Match extended Unicode sequences. We will get here only if the
5250 support is in the binary; otherwise a compile-time error occurs. */
5251
5252 else if (ctype == OP_EXTUNI)
5253 {
5254 for (i = min; i < max; i++)
5255 {
5256 int len = 1;
5257 if (eptr >= md->end_subject)
5258 {
5259 SCHECK_PARTIAL();
5260 break;
5261 }
5262 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5263 if (UCD_CATEGORY(c) == ucp_M) break;
5264 eptr += len;
5265 while (eptr < md->end_subject)
5266 {
5267 len = 1;
5268 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5269 if (UCD_CATEGORY(c) != ucp_M) break;
5270 eptr += len;
5271 }
5272 }
5273
5274 /* eptr is now past the end of the maximum run */
5275
5276 if (possessive) continue;
5277
5278 for(;;)
5279 {
5280 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5281 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5282 if (eptr-- == pp) break; /* Stop if tried at original pos */
5283 for (;;) /* Move back over one extended */
5284 {
5285 if (!utf8) c = *eptr; else
5286 {
5287 BACKCHAR(eptr);
5288 GETCHAR(c, eptr);
5289 }
5290 if (UCD_CATEGORY(c) != ucp_M) break;
5291 eptr--;
5292 }
5293 }
5294 }
5295
5296 else
5297 #endif /* SUPPORT_UCP */
5298
5299 #ifdef SUPPORT_UTF8
5300 /* UTF-8 mode */
5301
5302 if (utf8)
5303 {
5304 switch(ctype)
5305 {
5306 case OP_ANY:
5307 if (max < INT_MAX)
5308 {
5309 for (i = min; i < max; i++)
5310 {
5311 if (eptr >= md->end_subject)
5312 {
5313 SCHECK_PARTIAL();
5314 break;
5315 }
5316 if (IS_NEWLINE(eptr)) break;
5317 eptr++;
5318 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5319 }
5320 }
5321
5322 /* Handle unlimited UTF-8 repeat */
5323
5324 else
5325 {
5326 for (i = min; i < max; i++)
5327 {
5328 if (eptr >= md->end_subject)
5329 {
5330 SCHECK_PARTIAL();
5331 break;
5332 }
5333 if (IS_NEWLINE(eptr)) break;
5334 eptr++;
5335 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5336 }
5337 }
5338 break;
5339
5340 case OP_ALLANY:
5341 if (max < INT_MAX)
5342 {
5343 for (i = min; i < max; i++)
5344 {
5345 if (eptr >= md->end_subject)
5346 {
5347 SCHECK_PARTIAL();
5348 break;
5349 }
5350 eptr++;
5351 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5352 }
5353 }
5354 else
5355 {
5356 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5357 SCHECK_PARTIAL();
5358 }
5359 break;
5360
5361 /* The byte case is the same as non-UTF8 */
5362
5363 case OP_ANYBYTE:
5364 c = max - min;
5365 if (c > (unsigned int)(md->end_subject - eptr))
5366 {
5367 eptr = md->end_subject;
5368 SCHECK_PARTIAL();
5369 }
5370 else eptr += c;
5371 break;
5372
5373 case OP_ANYNL:
5374 for (i = min; i < max; i++)
5375 {
5376 int len = 1;
5377 if (eptr >= md->end_subject)
5378 {
5379 SCHECK_PARTIAL();
5380 break;
5381 }
5382 GETCHARLEN(c, eptr, len);
5383 if (c == 0x000d)
5384 {
5385 if (++eptr >= md->end_subject) break;
5386 if (*eptr == 0x000a) eptr++;
5387 }
5388 else
5389 {
5390 if (c != 0x000a &&
5391 (md->bsr_anycrlf ||
5392 (c != 0x000b && c != 0x000c &&
5393 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5394 break;
5395 eptr += len;
5396 }
5397 }
5398 break;
5399
5400 case OP_NOT_HSPACE:
5401 case OP_HSPACE:
5402 for (i = min; i < max; i++)
5403 {
5404 BOOL gotspace;
5405 int len = 1;
5406 if (eptr >= md->end_subject)
5407 {
5408 SCHECK_PARTIAL();
5409 break;
5410 }
5411 GETCHARLEN(c, eptr, len);
5412 switch(c)
5413 {
5414 default: gotspace = FALSE; break;
5415 case 0x09: /* HT */
5416 case 0x20: /* SPACE */
5417 case 0xa0: /* NBSP */
5418 case 0x1680: /* OGHAM SPACE MARK */
5419 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5420 case 0x2000: /* EN QUAD */
5421 case 0x2001: /* EM QUAD */
5422 case 0x2002: /* EN SPACE */
5423 case 0x2003: /* EM SPACE */
5424 case 0x2004: /* THREE-PER-EM SPACE */
5425 case 0x2005: /* FOUR-PER-EM SPACE */
5426 case 0x2006: /* SIX-PER-EM SPACE */
5427 case 0x2007: /* FIGURE SPACE */
5428 case 0x2008: /* PUNCTUATION SPACE */
5429 case 0x2009: /* THIN SPACE */
5430 case 0x200A: /* HAIR SPACE */
5431 case 0x202f: /* NARROW NO-BREAK SPACE */
5432 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5433 case 0x3000: /* IDEOGRAPHIC SPACE */
5434 gotspace = TRUE;
5435 break;
5436 }
5437 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5438 eptr += len;
5439 }
5440 break;
5441
5442 case OP_NOT_VSPACE:
5443 case OP_VSPACE:
5444 for (i = min; i < max; i++)
5445 {
5446 BOOL gotspace;
5447 int len = 1;
5448 if (eptr >= md->end_subject)
5449 {
5450 SCHECK_PARTIAL();
5451 break;
5452 }
5453 GETCHARLEN(c, eptr, len);
5454 switch(c)
5455 {
5456 default: gotspace = FALSE; break;
5457 case 0x0a: /* LF */
5458 case 0x0b: /* VT */
5459 case 0x0c: /* FF */
5460 case 0x0d: /* CR */
5461 case 0x85: /* NEL */
5462 case 0x2028: /* LINE SEPARATOR */
5463 case 0x2029: /* PARAGRAPH SEPARATOR */
5464 gotspace = TRUE;
5465 break;
5466 }
5467 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5468 eptr += len;
5469 }
5470 break;
5471
5472 case OP_NOT_DIGIT:
5473 for (i = min; i < max; i++)
5474 {
5475 int len = 1;
5476 if (eptr >= md->end_subject)
5477 {
5478 SCHECK_PARTIAL();
5479 break;
5480 }
5481 GETCHARLEN(c, eptr, len);
5482 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5483 eptr+= len;
5484 }
5485 break;
5486
5487 case OP_DIGIT:
5488 for (i = min; i < max; i++)
5489 {
5490 int len = 1;
5491 if (eptr >= md->end_subject)
5492 {
5493 SCHECK_PARTIAL();
5494 break;
5495 }
5496 GETCHARLEN(c, eptr, len);
5497 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5498 eptr+= len;
5499 }
5500 break;
5501
5502 case OP_NOT_WHITESPACE:
5503 for (i = min; i < max; i++)
5504 {
5505 int len = 1;
5506 if (eptr >= md->end_subject)
5507 {
5508 SCHECK_PARTIAL();
5509 break;
5510 }
5511 GETCHARLEN(c, eptr, len);
5512 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5513 eptr+= len;
5514 }
5515 break;
5516
5517 case OP_WHITESPACE:
5518 for (i = min; i < max; i++)
5519 {
5520 int len = 1;
5521 if (eptr >= md->end_subject)
5522 {
5523 SCHECK_PARTIAL();
5524 break;
5525 }
5526 GETCHARLEN(c, eptr, len);
5527 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5528 eptr+= len;
5529 }
5530 break;
5531
5532 case OP_NOT_WORDCHAR:
5533 for (i = min; i < max; i++)
5534 {
5535 int len = 1;
5536 if (eptr >= md->end_subject)
5537 {
5538 SCHECK_PARTIAL();
5539 break;
5540 }
5541 GETCHARLEN(c, eptr, len);
5542 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5543 eptr+= len;
5544 }
5545 break;
5546
5547 case OP_WORDCHAR:
5548 for (i = min; i < max; i++)
5549 {
5550 int len = 1;
5551 if (eptr >= md->end_subject)
5552 {
5553 SCHECK_PARTIAL();
5554 break;
5555 }
5556 GETCHARLEN(c, eptr, len);
5557 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5558 eptr+= len;
5559 }
5560 break;
5561
5562 default:
5563 RRETURN(PCRE_ERROR_INTERNAL);
5564 }
5565
5566 /* eptr is now past the end of the maximum run. If possessive, we are
5567 done (no backing up). Otherwise, match at this position; anything other
5568 than no match is immediately returned. For nomatch, back up one
5569 character, unless we are matching \R and the last thing matched was
5570 \r\n, in which case, back up two bytes. */
5571
5572 if (possessive) continue;
5573 for(;;)
5574 {
5575 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5576 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5577 if (eptr-- == pp) break; /* Stop if tried at original pos */
5578 BACKCHAR(eptr);
5579 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5580 eptr[-1] == '\r') eptr--;
5581 }
5582 }
5583 else
5584 #endif /* SUPPORT_UTF8 */
5585
5586 /* Not UTF-8 mode */
5587 {
5588 switch(ctype)
5589 {
5590 case OP_ANY:
5591 for (i = min; i < max; i++)
5592 {
5593 if (eptr >= md->end_subject)
5594 {
5595 SCHECK_PARTIAL();
5596 break;
5597 }
5598 if (IS_NEWLINE(eptr)) break;
5599 eptr++;
5600 }
5601 break;
5602
5603 case OP_ALLANY:
5604 case OP_ANYBYTE:
5605 c = max - min;
5606 if (c > (unsigned int)(md->end_subject - eptr))
5607 {
5608 eptr = md->end_subject;
5609 SCHECK_PARTIAL();
5610 }
5611 else eptr += c;
5612 break;
5613
5614 case OP_ANYNL:
5615 for (i = min; i < max; i++)
5616 {
5617 if (eptr >= md->end_subject)
5618 {
5619 SCHECK_PARTIAL();
5620 break;
5621 }
5622 c = *eptr;
5623 if (c == 0x000d)
5624 {
5625 if (++eptr >= md->end_subject) break;
5626 if (*eptr == 0x000a) eptr++;
5627 }
5628 else
5629 {
5630 if (c != 0x000a &&
5631 (md->bsr_anycrlf ||
5632 (c != 0x000b && c != 0x000c && c != 0x0085)))
5633 break;
5634 eptr++;
5635 }
5636 }
5637 break;
5638
5639 case OP_NOT_HSPACE:
5640 for (i = min; i < max; i++)
5641 {
5642 if (eptr >= md->end_subject)
5643 {
5644 SCHECK_PARTIAL();
5645 break;
5646 }
5647 c = *eptr;
5648 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5649 eptr++;
5650 }
5651 break;
5652
5653 case OP_HSPACE:
5654 for (i = min; i < max; i++)
5655 {
5656 if (eptr >= md->end_subject)
5657 {
5658 SCHECK_PARTIAL();
5659 break;
5660 }
5661 c = *eptr;
5662 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5663 eptr++;
5664 }
5665 break;
5666
5667 case OP_NOT_VSPACE:
5668 for (i = min; i < max; i++)
5669 {
5670 if (eptr >= md->end_subject)
5671 {
5672 SCHECK_PARTIAL();
5673 break;
5674 }
5675 c = *eptr;
5676 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5677 break;
5678 eptr++;
5679 }
5680 break;
5681
5682 case OP_VSPACE:
5683 for (i = min; i < max; i++)
5684 {
5685 if (eptr >= md->end_subject)
5686 {
5687 SCHECK_PARTIAL();
5688 break;
5689 }
5690 c = *eptr;
5691 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5692 break;
5693 eptr++;
5694 }
5695 break;
5696
5697 case OP_NOT_DIGIT:
5698 for (i = min; i < max; i++)
5699 {
5700 if (eptr >= md->end_subject)
5701 {
5702 SCHECK_PARTIAL();
5703 break;
5704 }
5705 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5706 eptr++;
5707 }
5708 break;
5709
5710 case OP_DIGIT:
5711 for (i = min; i < max; i++)
5712 {
5713 if (eptr >= md->end_subject)
5714 {
5715 SCHECK_PARTIAL();
5716 break;
5717 }
5718 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5719 eptr++;
5720 }
5721 break;
5722
5723 case OP_NOT_WHITESPACE:
5724 for (i = min; i < max; i++)
5725 {
5726 if (eptr >= md->end_subject)
5727 {
5728 SCHECK_PARTIAL();
5729 break;
5730 }
5731 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5732 eptr++;
5733 }
5734 break;
5735
5736 case OP_WHITESPACE:
5737 for (i = min; i < max; i++)
5738 {
5739 if (eptr >= md->end_subject)
5740 {
5741 SCHECK_PARTIAL();
5742 break;
5743 }
5744 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5745 eptr++;
5746 }
5747 break;
5748
5749 case OP_NOT_WORDCHAR:
5750 for (i = min; i < max; i++)
5751 {
5752 if (eptr >= md->end_subject)
5753 {
5754 SCHECK_PARTIAL();
5755 break;
5756 }
5757 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5758 eptr++;
5759 }
5760 break;
5761
5762 case OP_WORDCHAR:
5763 for (i = min; i < max; i++)
5764 {
5765 if (eptr >= md->end_subject)
5766 {
5767 SCHECK_PARTIAL();
5768 break;
5769 }
5770 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5771 eptr++;
5772 }
5773 break;
5774
5775 default:
5776 RRETURN(PCRE_ERROR_INTERNAL);
5777 }
5778
5779 /* eptr is now past the end of the maximum run. If possessive, we are
5780 done (no backing up). Otherwise, match at this position; anything other
5781 than no match is immediately returned. For nomatch, back up one
5782 character (byte), unless we are matching \R and the last thing matched
5783 was \r\n, in which case, back up two bytes. */
5784
5785 if (possessive) continue;
5786 while (eptr >= pp)
5787 {
5788 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5789 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5790 eptr--;
5791 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5792 eptr[-1] == '\r') eptr--;
5793 }
5794 }
5795
5796 /* Get here if we can't make it match with any permitted repetitions */
5797
5798 MRRETURN(MATCH_NOMATCH);
5799 }
5800 /* Control never gets here */
5801
5802 /* There's been some horrible disaster. Arrival here can only mean there is
5803 something seriously wrong in the code above or the OP_xxx definitions. */
5804
5805 default:
5806 DPRINTF(("Unknown opcode %d\n", *ecode));
5807 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5808 }
5809
5810 /* Do not stick any code in here without much thought; it is assumed
5811 that "continue" in the code above comes out to here to repeat the main
5812 loop. */
5813
5814 } /* End of main loop */
5815 /* Control never reaches here */
5816
5817
5818 /* When compiling to use the heap rather than the stack for recursive calls to
5819 match(), the RRETURN() macro jumps here. The number that is saved in
5820 frame->Xwhere indicates which label we actually want to return to. */
5821
5822 #ifdef NO_RECURSE
5823 #define LBL(val) case val: goto L_RM##val;
5824 HEAP_RETURN:
5825 switch (frame->Xwhere)
5826 {
5827 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5828 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5829 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5830 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5831 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5832 LBL(65) LBL(66)
5833 #ifdef SUPPORT_UTF8
5834 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5835 LBL(32) LBL(34) LBL(42) LBL(46)
5836 #ifdef SUPPORT_UCP
5837 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5838 LBL(59) LBL(60) LBL(61) LBL(62)
5839 #endif /* SUPPORT_UCP */
5840 #endif /* SUPPORT_UTF8 */
5841 default:
5842 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5843 return PCRE_ERROR_INTERNAL;
5844 }
5845 #undef LBL
5846 #endif /* NO_RECURSE */
5847 }
5848
5849
5850 /***************************************************************************
5851 ****************************************************************************
5852 RECURSION IN THE match() FUNCTION
5853
5854 Undefine all the macros that were defined above to handle this. */
5855
/* The macros below were only needed inside match(). Remove them again so that
the plain identifiers can be used as ordinary names in the rest of the file
(for example, "length" is a parameter of pcre_exec()). The order of the
directives is irrelevant; they are listed alphabetically so that an entry is
easy to find. */

#if defined NO_RECURSE

#undef callpat
#undef charptr
#undef condition
#undef ctype
#undef cur_is_word
#undef data
#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef length
#undef max
#undef min
#undef mstart
#undef new_recursive
#undef newptrb
#undef next
#undef number
#undef offset
#undef offset_top
#undef op
#undef pp
#undef prev
#undef prev_is_word
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef saved_eptr
#undef stacksave

#endif  /* NO_RECURSE */

/* These two are macros whether or not NO_RECURSE is defined, so they are
removed unconditionally. */

#undef fi
#undef fc
5899
5900 /***************************************************************************
5901 ***************************************************************************/
5902
5903
5904
5905 /*************************************************
5906 * Execute a Regular Expression *
5907 *************************************************/
5908
5909 /* This function applies a compiled re to a subject string and picks out
5910 portions of the string if it matches. Two elements in the vector are set for
5911 each substring: the offsets to the start and end of the substring.
5912
5913 Arguments:
5914 argument_re points to the compiled expression
5915 extra_data points to extra data or is NULL
5916 subject points to the subject string
5917 length length of subject string (may contain binary zeros)
5918 start_offset where to start in the subject string
5919 options option bits
5920 offsets points to a vector of ints to be filled in with offsets
5921 offsetcount the number of elements in the vector
5922
5923 Returns: > 0 => success; value is the number of elements filled in
5924 = 0 => success, but offsets is not big enough
5925 -1 => failed to match
5926 < -1 => some kind of unexpected problem
5927 */
5928
5929 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5930 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5931 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5932 int offsetcount)
5933 {
5934 int rc, ocount, arg_offset_max;
5935 int first_byte = -1;
5936 int req_byte = -1;
5937 int req_byte2 = -1;
5938 int newline;
5939 BOOL using_temporary_offsets = FALSE;
5940 BOOL anchored;
5941 BOOL startline;
5942 BOOL firstline;
5943 BOOL first_byte_caseless = FALSE;
5944 BOOL req_byte_caseless = FALSE;
5945 BOOL utf8;
5946 match_data match_block;
5947 match_data *md = &match_block;
5948 const uschar *tables;
5949 const uschar *start_bits = NULL;
5950 USPTR start_match = (USPTR)subject + start_offset;
5951 USPTR end_subject;
5952 USPTR start_partial = NULL;
5953 USPTR req_byte_ptr = start_match - 1;
5954
5955 pcre_study_data internal_study;
5956 const pcre_study_data *study;
5957
5958 real_pcre internal_re;
5959 const real_pcre *external_re = (const real_pcre *)argument_re;
5960 const real_pcre *re = external_re;
5961
5962 /* Plausibility checks */
5963
5964 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5965 if (re == NULL || subject == NULL ||
5966 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5967 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5968 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5969
5970 /* These two settings are used in the code for checking a UTF-8 string that
5971 follows immediately afterwards. Other values in the md block are used only
5972 during "normal" pcre_exec() processing, not when the JIT support is in use,
5973 so they are set up later. */
5974
5975 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5976 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5977 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5978
5979 /* Check a UTF-8 string if required. Pass back the character offset and error
5980 code for an invalid string if a results vector is available. */
5981
5982 #ifdef SUPPORT_UTF8
5983 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5984 {
5985 int erroroffset;
5986 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5987 if (errorcode != 0)
5988 {
5989 if (offsetcount >= 2)
5990 {
5991 offsets[0] = erroroffset;
5992 offsets[1] = errorcode;
5993 }
5994 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5995 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5996 }
5997
5998 /* Check that a start_offset points to the start of a UTF-8 character. */
5999 if (start_offset > 0 && start_offset < length &&
6000 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
6001 return PCRE_ERROR_BADUTF8_OFFSET;
6002 }
6003 #endif
6004
6005 /* If the pattern was successfully studied with JIT support, run the JIT
6006 executable instead of the rest of this function. Most options must be set at
6007 compile time for the JIT code to be usable. Fallback to the normal code path if
6008 an unsupported flag is set. In particular, JIT does not support partial
6009 matching. */
6010
6011 #ifdef SUPPORT_JIT
6012 if (extra_data != NULL
6013 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6014 && extra_data->executable_jit != NULL
6015 && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6016 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6017 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6018 return _pcre_jit_exec(re, extra_data->executable_jit, subject, length,
6019 start_offset, options, ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6020 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6021 #endif
6022
6023 /* Carry on with non-JIT matching. This information is for finding all the
6024 numbers associated with a given name, for condition testing. */
6025
6026 md->name_table = (uschar *)re + re->name_table_offset;
6027 md->name_count = re->name_count;
6028 md->name_entry_size = re->name_entry_size;
6029
6030 /* Fish out the optional data from the extra_data structure, first setting
6031 the default values. */
6032
6033 study = NULL;
6034 md->match_limit = MATCH_LIMIT;
6035 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6036 md->callout_data = NULL;
6037
6038 /* The table pointer is always in native byte order. */
6039
6040 tables = external_re->tables;
6041
6042 if (extra_data != NULL)
6043 {
6044 register unsigned int flags = extra_data->flags;
6045 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6046 study = (const pcre_study_data *)extra_data->study_data;
6047 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6048 md->match_limit = extra_data->match_limit;
6049 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6050 md->match_limit_recursion = extra_data->match_limit_recursion;
6051 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6052 md->callout_data = extra_data->callout_data;
6053 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6054 }
6055
6056 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6057 is a feature that makes it possible to save compiled regex and re-use them
6058 in other programs later. */
6059
6060 if (tables == NULL) tables = _pcre_default_tables;
6061
6062 /* Check that the first field in the block is the magic number. If it is not,
6063 test for a regex that was compiled on a host of opposite endianness. If this is
6064 the case, flipped values are put in internal_re and internal_study if there was
6065 study data too. */
6066
6067 if (re->magic_number != MAGIC_NUMBER)
6068 {
6069 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
6070 if (re == NULL) return PCRE_ERROR_BADMAGIC;
6071 if (study != NULL) study = &internal_study;
6072 }
6073
6074 /* Set up other data */
6075
6076 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6077 startline = (re->flags & PCRE_STARTLINE) != 0;
6078 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6079
6080 /* The code starts after the real_pcre block and the capture name table. */
6081
6082 md->start_code = (const uschar *)external_re + re->name_table_offset +
6083 re->name_count * re->name_entry_size;
6084
6085 md->start_subject = (USPTR)subject;
6086 md->start_offset = start_offset;
6087 md->end_subject = md->start_subject + length;
6088 end_subject = md->end_subject;
6089
6090 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6091 md->use_ucp = (re->options & PCRE_UCP) != 0;
6092 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6093
6094 /* Some options are unpacked into BOOL variables in the hope that testing
6095 them will be faster than individual option bits. */
6096
6097 md->notbol = (options & PCRE_NOTBOL) != 0;
6098 md->noteol = (options & PCRE_NOTEOL) != 0;
6099 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6100 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6101
6102 md->hitend = FALSE;
6103 md->mark = NULL; /* In case never set */
6104
6105 md->recursive = NULL; /* No recursion at top level */
6106 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6107
6108 md->lcc = tables + lcc_offset;
6109 md->ctypes = tables + ctypes_offset;
6110
6111 /* Handle different \R options. */
6112
6113 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6114 {
6115 case 0:
6116 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6117 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6118 else
6119 #ifdef BSR_ANYCRLF
6120 md->bsr_anycrlf = TRUE;
6121 #else
6122 md->bsr_anycrlf = FALSE;
6123 #endif
6124 break;
6125
6126 case PCRE_BSR_ANYCRLF:
6127 md->bsr_anycrlf = TRUE;
6128 break;
6129
6130 case PCRE_BSR_UNICODE:
6131 md->bsr_anycrlf = FALSE;
6132 break;
6133
6134 default: return PCRE_ERROR_BADNEWLINE;
6135 }
6136
6137 /* Handle different types of newline. The three bits give eight cases. If
6138 nothing is set at run time, whatever was used at compile time applies. */
6139
6140 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6141 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6142 {
6143 case 0: newline = NEWLINE; break; /* Compile-time default */
6144 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6145 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6146 case PCRE_NEWLINE_CR+
6147 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6148 case PCRE_NEWLINE_ANY: newline = -1; break;
6149 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6150 default: return PCRE_ERROR_BADNEWLINE;
6151 }
6152
6153 if (newline == -2)
6154 {
6155 md->nltype = NLTYPE_ANYCRLF;
6156 }
6157 else if (newline < 0)
6158 {
6159 md->nltype = NLTYPE_ANY;
6160 }
6161 else
6162 {
6163 md->nltype = NLTYPE_FIXED;
6164 if (newline > 255)
6165 {
6166 md->nllen = 2;
6167 md->nl[0] = (newline >> 8) & 255;
6168 md->nl[1] = newline & 255;
6169 }
6170 else
6171 {
6172 md->nllen = 1;
6173 md->nl[0] = newline;
6174 }
6175 }
6176
6177 /* Partial matching was originally supported only for a restricted set of
6178 regexes; from release 8.00 there are no restrictions, but the bits are still
6179 defined (though never set). So there's no harm in leaving this code. */
6180
6181 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6182 return PCRE_ERROR_BADPARTIAL;
6183
6184 /* If the expression has got more back references than the offsets supplied can
6185 hold, we get a temporary chunk of working store to use during the matching.
6186 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6187 of 3. */
6188
6189 ocount = offsetcount - (offsetcount % 3);
6190 arg_offset_max = (2*ocount)/3;
6191
6192 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6193 {
6194 ocount = re->top_backref * 3 + 3;
6195 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6196 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6197 using_temporary_offsets = TRUE;
6198 DPRINTF(("Got memory to hold back references\n"));
6199 }
6200 else md->offset_vector = offsets;
6201
6202 md->offset_end = ocount;
6203 md->offset_max = (2*ocount)/3;
6204 md->offset_overflow = FALSE;
6205 md->capture_last = -1;
6206
6207 /* Reset the working variable associated with each extraction. These should
6208 never be used unless previously set, but they get saved and restored, and so we
6209 initialize them to avoid reading uninitialized locations. Also, unset the
6210 offsets for the matched string. This is really just for tidiness with callouts,
6211 in case they inspect these fields. */
6212
6213 if (md->offset_vector != NULL)
6214 {
6215 register int *iptr = md->offset_vector + ocount;
6216 register int *iend = iptr - re->top_bracket;
6217 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6218 while (--iptr >= iend) *iptr = -1;
6219 md->offset_vector[0] = md->offset_vector[1] = -1;
6220 }
6221
6222 /* Set up the first character to match, if available. The first_byte value is
6223 never set for an anchored regular expression, but the anchoring may be forced
6224 at run time, so we have to test for anchoring. The first char may be unset for
6225 an unanchored pattern, of course. If there's no first char and the pattern was
6226 studied, there may be a bitmap of possible first characters. */
6227
6228 if (!anchored)
6229 {
6230 if ((re->flags & PCRE_FIRSTSET) != 0)
6231 {
6232 first_byte = re->first_byte & 255;
6233 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6234 first_byte = md->lcc[first_byte];
6235 }
6236 else
6237 if (!startline && study != NULL &&
6238 (study->flags & PCRE_STUDY_MAPPED) != 0)
6239 start_bits = study->start_bits;
6240 }
6241
6242 /* For anchored or unanchored matches, there may be a "last known required
6243 character" set. */
6244
6245 if ((re->flags & PCRE_REQCHSET) != 0)
6246 {
6247 req_byte = re->req_byte & 255;
6248 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6249 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6250 }
6251
6252
6253
6254
6255 /* ==========================================================================*/
6256
6257 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6258 the loop runs just once. */
6259
6260 for(;;)
6261 {
6262 USPTR save_end_subject = end_subject;
6263 USPTR new_start_match;
6264
6265 /* If firstline is TRUE, the start of the match is constrained to the first
6266 line of a multiline string. That is, the match must be before or at the first
6267 newline. Implement this by temporarily adjusting end_subject so that we stop
6268 scanning at a newline. If the match fails at the newline, later code breaks
6269 this loop. */
6270
6271 if (firstline)
6272 {
6273 USPTR t = start_match;
6274 #ifdef SUPPORT_UTF8
6275 if (utf8)
6276 {
6277 while (t < md->end_subject && !IS_NEWLINE(t))
6278 {
6279 t++;
6280 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6281 }
6282 }
6283 else
6284 #endif
6285 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6286 end_subject = t;
6287 }
6288
6289 /* There are some optimizations that avoid running the match if a known
6290 starting point is not found, or if a known later character is not present.
6291 However, there is an option that disables these, for testing and for ensuring
6292 that all callouts do actually occur. The option can be set in the regex by
6293 (*NO_START_OPT) or passed in match-time options. */
6294
6295 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6296 {
6297 /* Advance to a unique first byte if there is one. */
6298
6299 if (first_byte >= 0)
6300 {
6301 if (first_byte_caseless)
6302 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6303 start_match++;
6304 else
6305 while (start_match < end_subject && *start_match != first_byte)
6306 start_match++;
6307 }
6308
6309 /* Or to just after a linebreak for a multiline match */
6310
6311 else if (startline)
6312 {
6313 if (start_match > md->start_subject + start_offset)
6314 {
6315 #ifdef SUPPORT_UTF8
6316 if (utf8)
6317 {
6318 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6319 {
6320 start_match++;
6321 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6322 start_match++;
6323 }
6324 }
6325 else
6326 #endif
6327 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6328 start_match++;
6329
6330 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6331 and we are now at a LF, advance the match position by one more character.
6332 */
6333
6334 if (start_match[-1] == CHAR_CR &&
6335 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6336 start_match < end_subject &&
6337 *start_match == CHAR_NL)
6338 start_match++;
6339 }
6340 }
6341
6342 /* Or to a non-unique first byte after study */
6343
6344 else if (start_bits != NULL)
6345 {
6346 while (start_match < end_subject)
6347 {
6348 register unsigned int c = *start_match;
6349 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6350 {
6351 start_match++;
6352 #ifdef SUPPORT_UTF8
6353 if (utf8)
6354 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6355 start_match++;
6356 #endif
6357 }
6358 else break;
6359 }
6360 }
6361 } /* Starting optimizations */
6362
6363 /* Restore fudged end_subject */
6364
6365 end_subject = save_end_subject;
6366
6367 /* The following two optimizations are disabled for partial matching or if
6368 disabling is explicitly requested. */
6369
6370 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6371 {
6372 /* If the pattern was studied, a minimum subject length may be set. This is
6373 a lower bound; no actual string of that length may actually match the
6374 pattern. Although the value is, strictly, in characters, we treat it as
6375 bytes to avoid spending too much time in this optimization. */
6376
6377 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6378 (pcre_uint32)(end_subject - start_match) < study->minlength)
6379 {
6380 rc = MATCH_NOMATCH;
6381 break;
6382 }
6383
6384 /* If req_byte is set, we know that that character must appear in the
6385 subject for the match to succeed. If the first character is set, req_byte
6386 must be later in the subject; otherwise the test starts at the match point.
6387 This optimization can save a huge amount of backtracking in patterns with
6388 nested unlimited repeats that aren't going to match. Writing separate code
6389 for cased/caseless versions makes it go faster, as does using an
6390 autoincrement and backing off on a match.
6391
6392 HOWEVER: when the subject string is very, very long, searching to its end
6393 can take a long time, and give bad performance on quite ordinary patterns.
6394 This showed up when somebody was matching something like /^\d+C/ on a
6395 32-megabyte string... so we don't do this when the string is sufficiently
6396 long. */
6397
6398 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6399 {
6400 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6401
6402 /* We don't need to repeat the search if we haven't yet reached the
6403 place we found it at last time. */
6404
6405 if (p > req_byte_ptr)
6406 {
6407 if (req_byte_caseless)
6408 {
6409 while (p < end_subject)
6410 {
6411 register int pp = *p++;
6412 if (pp == req_byte || pp == req_byte2) { p--; break; }
6413 }
6414 }
6415 else
6416 {
6417 while (p < end_subject)
6418 {
6419 if (*p++ == req_byte) { p--; break; }
6420 }
6421 }
6422
6423 /* If we can't find the required character, break the matching loop,
6424 forcing a match failure. */
6425
6426 if (p >= end_subject)
6427 {
6428 rc = MATCH_NOMATCH;
6429 break;
6430 }
6431
6432 /* If we have found the required character, save the point where we
6433 found it, so that we don't search again next time round the loop if
6434 the start hasn't passed this character yet. */
6435
6436 req_byte_ptr = p;
6437 }
6438 }
6439 }
6440
6441 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6442 printf(">>>> Match against: ");
6443 pchars(start_match, end_subject - start_match, TRUE, md);
6444 printf("\n");
6445 #endif
6446
6447 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6448 first starting point for which a partial match was found. */
6449
6450 md->start_match_ptr = start_match;
6451 md->start_used_ptr = start_match;
6452 md->match_call_count = 0;
6453 md->match_function_type = 0;
6454 md->end_offset_top = 0;
6455 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6456 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6457
6458 switch(rc)
6459 {
6460 /* SKIP passes back the next starting point explicitly, but if it is the
6461 same as the match we have just done, treat it as NOMATCH. */
6462
6463 case MATCH_SKIP:
6464 if (md->start_match_ptr != start_match)
6465 {
6466 new_start_match = md->start_match_ptr;
6467 break;
6468 }
6469 /* Fall through */
6470
6471 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6472 the SKIP's arg was not found. We also treat this as NOMATCH. */
6473
6474 case MATCH_SKIP_ARG:
6475 /* Fall through */
6476
6477 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6478 exactly like PRUNE. */
6479
6480 case MATCH_NOMATCH:
6481 case MATCH_PRUNE:
6482 case MATCH_THEN:
6483 new_start_match = start_match + 1;
6484 #ifdef SUPPORT_UTF8
6485 if (utf8)
6486 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6487 new_start_match++;
6488 #endif
6489 break;
6490
6491 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6492
6493 case MATCH_COMMIT:
6494 rc = MATCH_NOMATCH;
6495 goto ENDLOOP;
6496
6497 /* Any other return is either a match, or some kind of error. */
6498
6499 default:
6500 goto ENDLOOP;
6501 }
6502
6503 /* Control reaches here for the various types of "no match at this point"
6504 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6505
6506 rc = MATCH_NOMATCH;
6507
6508 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6509 newline in the subject (though it may continue over the newline). Therefore,
6510 if we have just failed to match, starting at a newline, do not continue. */
6511
6512 if (firstline && IS_NEWLINE(start_match)) break;
6513
6514 /* Advance to new matching position */
6515
6516 start_match = new_start_match;
6517
6518 /* Break the loop if the pattern is anchored or if we have passed the end of
6519 the subject. */
6520
6521 if (anchored || start_match > end_subject) break;
6522
6523 /* If we have just passed a CR and we are now at a LF, and the pattern does
6524 not contain any explicit matches for \r or \n, and the newline option is CRLF
6525 or ANY or ANYCRLF, advance the match position by one more character. */
6526
6527 if (start_match[-1] == CHAR_CR &&
6528 start_match < end_subject &&
6529 *start_match == CHAR_NL &&
6530 (re->flags & PCRE_HASCRORLF) == 0 &&
6531 (md->nltype == NLTYPE_ANY ||
6532 md->nltype == NLTYPE_ANYCRLF ||
6533 md->nllen == 2))
6534 start_match++;
6535
6536 md->mark = NULL; /* Reset for start of next match attempt */
6537 } /* End of for(;;) "bumpalong" loop */
6538
6539 /* ==========================================================================*/
6540
6541 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6542 conditions is true:
6543
6544 (1) The pattern is anchored or the match was failed by (*COMMIT);
6545
6546 (2) We are past the end of the subject;
6547
6548 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6549 this option requests that a match occur at or before the first newline in
6550 the subject.
6551
6552 When we have a match and the offset vector is big enough to deal with any
6553 backreferences, captured substring offsets will already be set up. In the case
6554 where we had to get some local store to hold offsets for backreference
6555 processing, copy those that we can. In this case there need not be overflow if
6556 certain parts of the pattern were not used, even though there are more
6557 capturing parentheses than vector slots. */
6558
6559 ENDLOOP:
6560
6561 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6562 {
6563 if (using_temporary_offsets)
6564 {
6565 if (arg_offset_max >= 4)
6566 {
6567 memcpy(offsets + 2, md->offset_vector + 2,
6568 (arg_offset_max - 2) * sizeof(int));
6569 DPRINTF(("Copied offsets from temporary memory\n"));
6570 }
6571 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6572 DPRINTF(("Freeing temporary memory\n"));
6573 (pcre_free)(md->offset_vector);
6574 }
6575
6576 /* Set the return code to the number of captured strings, or 0 if there were
6577 too many to fit into the vector. */
6578
6579 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6580 0 : md->end_offset_top/2;
6581
6582 /* If there is space in the offset vector, set any unused pairs at the end of
6583 the pattern to -1 for backwards compatibility. It is documented that this
6584 happens. In earlier versions, the whole set of potential capturing offsets
6585 was set to -1 each time round the loop, but this is handled differently now.
6586 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6587 those at the end that need unsetting here. We can't just unset them all at
6588 the start of the whole thing because they may get set in one branch that is
6589 not the final matching branch. */
6590
6591 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6592 {
6593 register int *iptr, *iend;
6594 int resetcount = 2 + re->top_bracket * 2;
6595 if (resetcount > offsetcount) resetcount = ocount;
6596 iptr = offsets + md->end_offset_top;
6597 iend = offsets + resetcount;
6598 while (iptr < iend) *iptr++ = -1;
6599 }
6600
6601 /* If there is space, set up the whole thing as substring 0. The value of
6602 md->start_match_ptr might be modified if \K was encountered on the success
6603 matching path. */
6604
6605 if (offsetcount < 2) rc = 0; else
6606 {
6607 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6608 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6609 }
6610
6611 DPRINTF((">>>> returning %d\n", rc));
6612 goto RETURN_MARK;
6613 }
6614
6615 /* Control gets here if there has been an error, or if the overall match
6616 attempt has failed at all permitted starting positions. */
6617
6618 if (using_temporary_offsets)
6619 {
6620 DPRINTF(("Freeing temporary memory\n"));
6621 (pcre_free)(md->offset_vector);
6622 }
6623
6624 /* For anything other than nomatch or partial match, just return the code. */
6625
6626 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6627 {
6628 DPRINTF((">>>> error: returning %d\n", rc));
6629 return rc;
6630 }
6631
6632 /* Handle partial matches - disable any mark data */
6633
6634 if (start_partial != NULL)
6635 {
6636 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6637 md->mark = NULL;
6638 if (offsetcount > 1)
6639 {
6640 offsets[0] = (int)(start_partial - (USPTR)subject);
6641 offsets[1] = (int)(end_subject - (USPTR)subject);
6642 }
6643 rc = PCRE_ERROR_PARTIAL;
6644 }
6645
6646 /* This is the classic nomatch case */
6647
6648 else
6649 {
6650 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6651 rc = PCRE_ERROR_NOMATCH;
6652 }
6653
6654 /* Return the MARK data if it has been requested. */
6655
6656 RETURN_MARK:
6657
6658 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6659 *(extra_data->mark) = (unsigned char *)(md->mark);
6660 return rc;
6661 }
6662
6663 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12