/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 642 - (show annotations) (download)
Thu Jul 28 18:59:40 2011 UTC (3 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 194303 byte(s)
Avoid false positive for infinite recursion by not checking conditionals at 
compile time, but add tests at runtime that also catch infinite mutual 
recursion.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1 /* The pattern matched */
71 #define MATCH_NOMATCH 0 /* The pattern failed to match */
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999) /* NOTE(review): presumably the (*ACCEPT) return; no use is visible in this part of the file */
77 #define MATCH_COMMIT (-998) /* Returned by the OP_COMMIT case when what follows it does not match */
78 #define MATCH_KETRPOS (-997) /* Returned at OP_KETRPOS, the end of a possessive unlimited-repeat group */
79 #define MATCH_ONCE (-996) /* End of an atomic group was reached, but subsequent matching failed */
80 #define MATCH_PRUNE (-995) /* Returned by the OP_PRUNE and OP_PRUNE_ARG cases */
81 #define MATCH_SKIP (-994) /* Returned by the OP_SKIP case; md->start_match_ptr holds the subject position */
82 #define MATCH_SKIP_ARG (-993) /* Returned by the OP_SKIP_ARG case; md->start_match_ptr holds the skip name */
83 #define MATCH_THEN (-992) /* Returned by the OP_THEN and OP_THEN_ARG cases; md->start_match_ptr holds the branch start */
84
85 /* This is a convenience macro for code that occurs many times: it copies the
86 current markptr into md->mark and then leaves match() through RRETURN(ra). It
expands to a brace block, and md and markptr must be in scope where it is used. */
87 #define MRRETURN(ra) \
88 { \
89 md->mark = markptr; \
90 RRETURN(ra); \
91 }
92
93 /* Maximum number of ints of offset to save on the stack for recursive calls.
94 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95 because the offset vector is always a multiple of 3 long. */
96
97 #define REC_STACK_SAVE_MAX 30 /* Dimension of the stacksave[] array declared in match() and in the heap frame */
98
99 /* Min and max values for the common repeats; for the maxima, 0 => infinity.
100 The two tables are parallel: entry i of each describes the same repeat type.
NOTE(review): the six slots presumably stand for *, *?, +, +?, ? and ?? in opcode
order -- confirm against the opcode definitions, which are not visible here. */
101 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; /* minimum repeat counts */
102 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; /* maximum repeat counts; 0 means no upper limit */
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Print a sequence of chars in printable format, stopping at the end of the
112 subject if requested.
113
114 Arguments:
115 p points to characters
116 length number to print
117 is_subject TRUE if printing from within md->start_subject
118 md pointer to matching data block; only dereferenced if is_subject is TRUE
119
120 Returns: nothing; the output is written with printf()
121 */
122
123 static void
124 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int c; /* Current byte, also the value passed to isprint() and printf() */
127 if (is_subject && length > md->end_subject - p) length = md->end_subject - p; /* Clamp to what is left of the subject */
128 while (length-- > 0)
129 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); /* Non-printable bytes appear as \xhh */
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector; the entry there is the start of the referenced text
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched (every failure path here returns -1), otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 USPTR eptr_start = eptr; /* Kept so that the number of subject bytes consumed can be returned */
159 register USPTR p = md->start_subject + md->offset_vector[offset]; /* Start of the previously captured text */
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF8
185 #ifdef SUPPORT_UCP
186 if (md->utf8)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 USPTR endptr = p + length; /* End of the reference, which is what bounds the loop */
197 while (p < endptr)
198 {
199 int c, d; /* c is the subject character, d the reference character */
200 if (eptr >= md->end_subject) return -1; /* Subject ran out before the reference did */
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1; /* Neither identical nor the other case of d */
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (eptr + length > md->end_subject) return -1; /* Not enough subject left for a byte-for-byte comparison */
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; } /* Compare through the md->lcc table */
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (eptr + length > md->end_subject) return -1; /* Not enough subject left */
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return eptr - eptr_start; /* Subject bytes consumed; zero when length is zero */
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
271 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272 below must be updated in sync. Each name is passed as the rw argument of RMATCH;
273 the heap-frame version stores it in Xwhere and pastes it into an L_<name> label.
The values run consecutively from RM1 = 1 to RM63. */
274 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
275 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
276 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
277 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
278 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
279 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
280 RM61, RM62, RM63 };
281
282 /* These versions of the macros use the stack, as normal. There are debugging
283 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 actually used in this definition. RMATCH calls match() with rdepth+1 and leaves
285 the result in rrc; mstart, markptr, rdepth and rrc come from the scope of use.
The debugging RRETURN expands "ra" twice (once for printf, once for return). */
286 #ifndef NO_RECURSE
287 #define REGISTER register
288
289 #ifdef PCRE_DEBUG
290 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 { \
292 printf("match() called in line %d\n", __LINE__); \
293 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 printf("to line %d\n", __LINE__); \
295 }
296 #define RRETURN(ra) \
297 { \
298 printf("match() returned %d from line %d ", ra, __LINE__); \
299 return ra; \
300 }
301 #else
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 #define RRETURN(ra) return ra
305 #endif
306
307 #else
308
309
310 /* These versions of the macros manage a private stack on the heap. Note that
311 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312 argument of match(), which never changes.

RMATCH obtains a new frame with pcre_stack_malloc (leaving through
RRETURN(PCRE_ERROR_NOMEMORY) if that fails), records rw in the current frame's
Xwhere, fills in the argument fields of the new frame, links it to the current
frame through Xprevframe, makes it current and jumps to HEAP_RECURSE. The label
L_<rw> that follows the goto is where the "caller" is resumed.

RRETURN frees the current frame and makes the previous one current. If there is
a previous frame it sets rrc and jumps to HEAP_RETURN; otherwise it really
returns. Because "ra" is expanded after the frame has been switched, an
expression that reads frame-held variables would see the previous frame. */
313
314 #define REGISTER
315
316 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 {\
318 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 frame->Xwhere = rw; \
321 newframe->Xeptr = ra;\
322 newframe->Xecode = rb;\
323 newframe->Xmstart = mstart;\
324 newframe->Xmarkptr = markptr;\
325 newframe->Xoffset_top = rc;\
326 newframe->Xeptrb = re;\
327 newframe->Xrdepth = frame->Xrdepth + 1;\
328 newframe->Xprevframe = frame;\
329 frame = newframe;\
330 DPRINTF(("restarting from line %d\n", __LINE__));\
331 goto HEAP_RECURSE;\
332 L_##rw:\
333 DPRINTF(("jumped back to line %d\n", __LINE__));\
334 }
335
336 #define RRETURN(ra)\
337 {\
338 heapframe *oldframe = frame;\
339 frame = oldframe->Xprevframe;\
340 (pcre_stack_free)(oldframe);\
341 if (frame != NULL)\
342 {\
343 rrc = ra;\
344 goto HEAP_RETURN;\
345 }\
346 return ra;\
347 }
348
349
350 /* Structure for remembering the local variables in a private frame. Apart from
351 Xprevframe and Xwhere, each member backs the match() argument or local variable
of the same name without the X, via the #defines at the top of match(). */
352 typedef struct heapframe {
353 struct heapframe *Xprevframe; /* Frame of the "caller"; NULL marks the top level */
354
355 /* Function arguments that may change */
356
357 USPTR Xeptr;
358 const uschar *Xecode;
359 USPTR Xmstart;
360 USPTR Xmarkptr;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth; /* Set to the previous frame's depth plus one by RMATCH */
364
365 /* Function local variables */
366
367 USPTR Xcallpat;
368 #ifdef SUPPORT_UTF8
369 USPTR Xcharptr;
370 #endif
371 USPTR Xdata;
372 USPTR Xnext;
373 USPTR Xpp;
374 USPTR Xprev;
375 USPTR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word; /* Also used under the name allow_zero */
380 BOOL Xcondition; /* Also used under the names cbegroup and condassert */
381 BOOL Xprev_is_word; /* Also used under the name matched_once */
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xoclength;
388 uschar Xocchars[8];
389 #endif
390
391 int Xcodelink; /* Also used under the name code_offset */
392 int Xctype;
393 unsigned int Xfc;
394 int Xfi;
395 int Xlength;
396 int Xmax;
397 int Xmin;
398 int Xnumber;
399 int Xoffset;
400 int Xop;
401 int Xsave_capture_last;
402 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403 int Xstacksave[REC_STACK_SAVE_MAX];
404
405 eptrblock Xnewptrb;
406
407 /* Where to jump back to */
408
409 int Xwhere; /* The rw value (an RMn number) stored by RMATCH before it "recurses" */
410
411 } heapframe;
412
413 #endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
429 /* These macros pack up tests that are used for partial matching, and which
430 appear several times in the code. We set the "hit end" flag if the pointer is
431 at the end of the subject and also past the start of the subject (i.e.
432 something has been matched). For hard partial matching, we then return
433 immediately. The second one is used when we already know we are past the end of
434 the subject. In the code, "past the start" is eptr > md->start_used_ptr, "hard"
435 is md->partial > 1, and the immediate return is MRRETURN(PCRE_ERROR_PARTIAL). */
436 #define CHECK_PARTIAL()\
437 if (md->partial != 0 && eptr >= md->end_subject && \
438 eptr > md->start_used_ptr) \
439 { \
440 md->hitend = TRUE; \
441 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 }
443
444 #define SCHECK_PARTIAL()\
445 if (md->partial != 0 && eptr > md->start_used_ptr) \
446 { \
447 md->hitend = TRUE; \
448 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf8, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 markptr pointer to the most recent MARK name, or NULL
463 offset_top current top pointer
464 md pointer to "static" info for the match
465 eptrb pointer to chain of blocks containing eptr at start of
466 brackets - for testing for empty matches
467 rdepth the recursion depth
468
469 Returns: MATCH_MATCH if matched ) these values are >= 0
470 MATCH_NOMATCH if failed to match )
471 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 (e.g. stopped by repeated call or recursion limit)
474 */
475
476 static int
477 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 unsigned int rdepth)
480 {
481 /* These variables do not need to be preserved over recursion in this function,
482 so they can be ordinary variables in all cases. Mark some of them with
483 "register" because they are used a lot in loops. */
484
485 register int rrc; /* Returns from recursive calls */
486 register int i; /* Used for loops not involving calls to RMATCH() */
487 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489
490 BOOL minimize, possessive; /* Quantifier options */
491 BOOL caseless;
492 int condcode;
493
494 /* When recursion is not being used, all "local" variables that have to be
495 preserved over calls to RMATCH() are part of a "frame" which is obtained from
496 heap storage. Set up the top-level frame here; others are obtained from the
497 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498
499 #ifdef NO_RECURSE
500 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 frame->Xprevframe = NULL; /* Marks the top level */
503
504 /* Copy in the original argument variables */
505
506 frame->Xeptr = eptr;
507 frame->Xecode = ecode;
508 frame->Xmstart = mstart;
509 frame->Xmarkptr = markptr;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is where control jumps back to to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define markptr frame->Xmarkptr
524 #define offset_top frame->Xoffset_top
525 #define eptrb frame->Xeptrb
526 #define rdepth frame->Xrdepth
527
528 /* Ditto for the local variables */
529
530 #ifdef SUPPORT_UTF8
531 #define charptr frame->Xcharptr
532 #endif
533 #define callpat frame->Xcallpat
534 #define codelink frame->Xcodelink
535 #define data frame->Xdata
536 #define next frame->Xnext
537 #define pp frame->Xpp
538 #define prev frame->Xprev
539 #define saved_eptr frame->Xsaved_eptr
540
541 #define new_recursive frame->Xnew_recursive
542
543 #define cur_is_word frame->Xcur_is_word
544 #define condition frame->Xcondition
545 #define prev_is_word frame->Xprev_is_word
546
547 #ifdef SUPPORT_UCP
548 #define prop_type frame->Xprop_type
549 #define prop_value frame->Xprop_value
550 #define prop_fail_result frame->Xprop_fail_result
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580 /* Many of the following variables are used only in small blocks of the code.
581 My normal style of coding would have declared them within each of those blocks.
582 However, in order to accommodate the version of this code that uses an external
583 "stack" implemented on the heap, it is easier to declare them all here, so the
584 declarations can be cut out in a block. The only declarations within blocks
585 below are for variables that do not have to be preserved over a recursive call
586 to RMATCH(). */
587
588 #ifdef SUPPORT_UTF8
589 const uschar *charptr;
590 #endif
591 const uschar *callpat;
592 const uschar *data;
593 const uschar *next;
594 USPTR pp;
595 const uschar *prev;
596 USPTR saved_eptr;
597
598 recursion_info new_recursive;
599
600 BOOL cur_is_word;
601 BOOL condition;
602 BOOL prev_is_word;
603
604 #ifdef SUPPORT_UCP
605 int prop_type;
606 int prop_value;
607 int prop_fail_result;
608 int oclength;
609 uschar occhars[8];
610 #endif
611
612 int codelink;
613 int ctype;
614 int length;
615 int max;
616 int min;
617 int number;
618 int offset;
619 int op;
620 int save_capture_last;
621 int save_offset1, save_offset2, save_offset3;
622 int stacksave[REC_STACK_SAVE_MAX];
623
624 eptrblock newptrb;
625 #endif /* NO_RECURSE */
626
627 /* To save space on the stack and in the heap frame, I have doubled up on some
628 of the local variables that are used only in localised parts of the code, but
629 still need to be preserved over recursive calls of match(). These macros define
630 the alternative names that are used. */
631
632 #define allow_zero cur_is_word
633 #define cbegroup condition
634 #define code_offset codelink
635 #define condassert condition
636 #define matched_once prev_is_word
637
638 /* These statements are here to stop the compiler complaining about uninitialized
639 variables. */
640
641 #ifdef SUPPORT_UCP
642 prop_value = 0;
643 prop_fail_result = 0;
644 #endif
645
646
647 /* This label is used for tail recursion, which is used in a few cases even
648 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649 used. Thanks to Ian Taylor for noticing this possibility and sending the
650 original patch. */
651
652 TAIL_RECURSE:
653
654 /* OK, now we can get on with the real code of the function. Recursive calls
655 are specified by the macro RMATCH and RRETURN is used to return. When
656 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 defined). However, RMATCH isn't like a function call because it's quite a
659 complicated macro. It has to be used in one particular way. This shouldn't,
660 however, impact performance when true recursion is being used. */
661
662 #ifdef SUPPORT_UTF8
663 utf8 = md->utf8; /* Local copy of the flag */
664 #else
665 utf8 = FALSE;
666 #endif
667
668 /* First check that we haven't called match() too many times, or that we
669 haven't exceeded the recursive call limit. */
670
671 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673
674 /* At the start of a group with an unlimited repeat that may match an empty
675 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676 done this way to save having to use another function argument, which would take
677 up space on the stack. See also MATCH_CONDASSERT below.
678
679 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680 such remembered pointers, to be checked when we hit the closing ket, in order
681 to break infinite loops that match no characters. When match() is called in
682 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683 NOT be used with tail recursion, because the memory block that is used is on
684 the stack, so a new one may be required for each match(). */
685
686 if (md->match_function_type == MATCH_CBEGROUP)
687 {
688 newptrb.epb_saved_eptr = eptr;
689 newptrb.epb_prev = eptrb;
690 eptrb = &newptrb;
691 md->match_function_type = 0;
692 }
693
694 /* Now start processing the opcodes. */
695
696 for (;;)
697 {
698 minimize = possessive = FALSE;
699 op = *ecode;
700
701 switch(op)
702 {
703 case OP_MARK:
704 markptr = ecode + 2;
705 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 eptrb, RM55);
707
708 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709 argument, and we must check whether that argument matches this MARK's
710 argument. It is passed back in md->start_match_ptr (an overloading of that
711 variable). If it does match, we reset that variable to the current subject
712 position and return MATCH_SKIP. Otherwise, pass back the return code
713 unaltered. */
714
715 if (rrc == MATCH_SKIP_ARG &&
716 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717 {
718 md->start_match_ptr = eptr;
719 RRETURN(MATCH_SKIP);
720 }
721
722 if (md->mark == NULL) md->mark = markptr;
723 RRETURN(rrc);
724
725 case OP_FAIL:
726 MRRETURN(MATCH_NOMATCH);
727
728 /* COMMIT overrides PRUNE, SKIP, and THEN */
729
730 case OP_COMMIT:
731 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 eptrb, RM52);
733 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735 rrc != MATCH_THEN)
736 RRETURN(rrc);
737 MRRETURN(MATCH_COMMIT);
738
739 /* PRUNE overrides THEN */
740
741 case OP_PRUNE:
742 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 eptrb, RM51);
744 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 MRRETURN(MATCH_PRUNE);
746
747 case OP_PRUNE_ARG:
748 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 eptrb, RM56);
750 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 md->mark = ecode + 2;
752 RRETURN(MATCH_PRUNE);
753
754 /* SKIP overrides PRUNE and THEN */
755
756 case OP_SKIP:
757 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 eptrb, RM53);
759 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 RRETURN(rrc);
761 md->start_match_ptr = eptr; /* Pass back current position */
762 MRRETURN(MATCH_SKIP);
763
764 case OP_SKIP_ARG:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 eptrb, RM57);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769
770 /* Pass back the current skip name by overloading md->start_match_ptr and
771 returning the special MATCH_SKIP_ARG return code. This will either be
772 caught by a matching MARK, or get to the top, where it is treated the same
773 as PRUNE. */
774
775 md->start_match_ptr = ecode + 2;
776 RRETURN(MATCH_SKIP_ARG);
777
778 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
779 the alt that is at the start of the current branch. This makes it possible
780 to skip back past alternatives that precede the THEN within the current
781 branch. */
782
783 case OP_THEN:
784 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
785 eptrb, RM54);
786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
787 md->start_match_ptr = ecode - GET(ecode, 1);
788 MRRETURN(MATCH_THEN);
789
790 case OP_THEN_ARG:
791 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
792 offset_top, md, eptrb, RM58);
793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 md->start_match_ptr = ecode - GET(ecode, 1);
795 md->mark = ecode + LINK_SIZE + 2;
796 RRETURN(MATCH_THEN);
797
798 /* Handle a capturing bracket, other than those that are possessive with an
799 unlimited repeat. If there is space in the offset vector, save the current
800 subject position in the working slot at the top of the vector. We mustn't
801 change the current values of the data slot, because they may be set from a
802 previous iteration of this group, and be referred to by a reference inside
803 the group. A failure to match might occur after the group has succeeded,
804 if something later on doesn't match. For this reason, we need to restore
805 the working value and also the values of the final offsets, in case they
806 were set by a previous iteration of the same bracket.
807
808 If there isn't enough space in the offset vector, treat this as if it were
809 a non-capturing bracket. Don't worry about setting the flag for the error
810 case here; that is handled in the code for KET. */
811
812 case OP_CBRA:
813 case OP_SCBRA:
814 number = GET2(ecode, 1+LINK_SIZE);
815 offset = number << 1;
816
817 #ifdef PCRE_DEBUG
818 printf("start bracket %d\n", number);
819 printf("subject=");
820 pchars(eptr, 16, TRUE, md);
821 printf("\n");
822 #endif
823
824 if (offset < md->offset_max)
825 {
826 save_offset1 = md->offset_vector[offset];
827 save_offset2 = md->offset_vector[offset+1];
828 save_offset3 = md->offset_vector[md->offset_end - number];
829 save_capture_last = md->capture_last;
830
831 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
832 md->offset_vector[md->offset_end - number] =
833 (int)(eptr - md->start_subject);
834
835 for (;;)
836 {
837 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
838 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
839 eptrb, RM1);
840 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
841 if (rrc != MATCH_NOMATCH &&
842 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
843 RRETURN(rrc);
844 md->capture_last = save_capture_last;
845 ecode += GET(ecode, 1);
846 if (*ecode != OP_ALT) break;
847 }
848
849 DPRINTF(("bracket %d failed\n", number));
850 md->offset_vector[offset] = save_offset1;
851 md->offset_vector[offset+1] = save_offset2;
852 md->offset_vector[md->offset_end - number] = save_offset3;
853
854 /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or
855 MATCH_THEN. */
856
857 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
858 RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));
859 }
860
861 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
862 as a non-capturing bracket. */
863
864 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
865 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
866
867 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
868
869 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
871
872 /* Non-capturing or atomic group, except for possessive with unlimited
873 repeat. Loop for all the alternatives. When we get to the final alternative
874 within the brackets, we used to return the result of a recursive call to
875 match() whatever happened so it was possible to reduce stack usage by
876 turning this into a tail recursion, except in the case of a possibly empty
877 group. However, now that there is the possibility of (*THEN) occurring in
878 the final alternative, this optimization is no longer possible.
879
880 MATCH_ONCE is returned when the end of an atomic group is successfully
881 reached, but subsequent matching fails. It passes back up the tree (causing
882 captured values to be reset) until the original atomic group level is
883 reached. This is tested by comparing md->once_target with the start of the
884 group. At this point, the return is converted into MATCH_NOMATCH so that
885 previous backup points can be taken. */
886
887 case OP_ONCE:
888 case OP_BRA:
889 case OP_SBRA:
890 DPRINTF(("start non-capturing bracket\n"));
891
892 for (;;)
893 {
894 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
895 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
896 RM2);
897 if (rrc != MATCH_NOMATCH &&
898 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
899 {
900 if (rrc == MATCH_ONCE)
901 {
902 const uschar *scode = ecode;
903 if (*scode != OP_ONCE) /* If not at start, find it */
904 {
905 while (*scode == OP_ALT) scode += GET(scode, 1);
906 scode -= GET(scode, 1);
907 }
908 if (md->once_target == scode) rrc = MATCH_NOMATCH;
909 }
910 RRETURN(rrc);
911 }
912 ecode += GET(ecode, 1);
913 if (*ecode != OP_ALT) break;
914 }
915 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
916 RRETURN(MATCH_NOMATCH);
917
918 /* Handle possessive capturing brackets with an unlimited repeat. We come
919 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
920 handled similarly to the normal case above. However, the matching is
921 different. The end of these brackets will always be OP_KETRPOS, which
922 returns MATCH_KETRPOS without going further in the pattern. By this means
923 we can handle the group by iteration rather than recursion, thereby
924 reducing the amount of stack needed. */
925
926 case OP_CBRAPOS:
927 case OP_SCBRAPOS:
928 allow_zero = FALSE;
929
930 POSSESSIVE_CAPTURE:
931 number = GET2(ecode, 1+LINK_SIZE);
932 offset = number << 1;
933
934 #ifdef PCRE_DEBUG
935 printf("start possessive bracket %d\n", number);
936 printf("subject=");
937 pchars(eptr, 16, TRUE, md);
938 printf("\n");
939 #endif
940
941 if (offset < md->offset_max)
942 {
943 matched_once = FALSE;
944 code_offset = ecode - md->start_code;
945
946 save_offset1 = md->offset_vector[offset];
947 save_offset2 = md->offset_vector[offset+1];
948 save_offset3 = md->offset_vector[md->offset_end - number];
949 save_capture_last = md->capture_last;
950
951 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
952
953 /* Each time round the loop, save the current subject position for use
954 when the group matches. For MATCH_MATCH, the group has matched, so we
955 restart it with a new subject starting position, remembering that we had
956 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
957 usual. If we haven't matched any alternatives in any iteration, check to
958 see if a previous iteration matched. If so, the group has matched;
959 continue from afterwards. Otherwise it has failed; restore the previous
960 capture values before returning NOMATCH. */
961
962 for (;;)
963 {
964 md->offset_vector[md->offset_end - number] =
965 (int)(eptr - md->start_subject);
966 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
967 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
968 eptrb, RM63);
969 if (rrc == MATCH_KETRPOS)
970 {
971 offset_top = md->end_offset_top;
972 eptr = md->end_match_ptr;
973 ecode = md->start_code + code_offset;
974 save_capture_last = md->capture_last;
975 matched_once = TRUE;
976 continue;
977 }
978 if (rrc != MATCH_NOMATCH &&
979 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
980 RRETURN(rrc);
981 md->capture_last = save_capture_last;
982 ecode += GET(ecode, 1);
983 if (*ecode != OP_ALT) break;
984 }
985
986 if (!matched_once)
987 {
988 md->offset_vector[offset] = save_offset1;
989 md->offset_vector[offset+1] = save_offset2;
990 md->offset_vector[md->offset_end - number] = save_offset3;
991 }
992
993 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
994 if (allow_zero || matched_once)
995 {
996 ecode += 1 + LINK_SIZE;
997 break;
998 }
999
1000 RRETURN(MATCH_NOMATCH);
1001 }
1002
1003 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1004 as a non-capturing bracket. */
1005
1006 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1007 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1008
1009 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1010
1011 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1012 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1013
1014 /* Non-capturing possessive bracket with unlimited repeat. We come here
1015 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1016 without the capturing complication. It is written out separately for speed
1017 and cleanliness. */
1018
1019 case OP_BRAPOS:
1020 case OP_SBRAPOS:
1021 allow_zero = FALSE;
1022
1023 POSSESSIVE_NON_CAPTURE:
1024 matched_once = FALSE;
1025 code_offset = ecode - md->start_code;
1026
1027 for (;;)
1028 {
1029 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1030 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1031 eptrb, RM48);
1032 if (rrc == MATCH_KETRPOS)
1033 {
1034 offset_top = md->end_offset_top;
1035 eptr = md->end_match_ptr;
1036 ecode = md->start_code + code_offset;
1037 matched_once = TRUE;
1038 continue;
1039 }
1040 if (rrc != MATCH_NOMATCH &&
1041 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1042 RRETURN(rrc);
1043 ecode += GET(ecode, 1);
1044 if (*ecode != OP_ALT) break;
1045 }
1046
1047 if (matched_once || allow_zero)
1048 {
1049 ecode += 1 + LINK_SIZE;
1050 break;
1051 }
1052 RRETURN(MATCH_NOMATCH);
1053
1054 /* Control never reaches here. */
1055
1056 /* Conditional group: compilation checked that there are no more than
1057 two branches. If the condition is false, skipping the first branch takes us
1058 past the end if there is only one branch, but that's OK because that is
1059 exactly what going to the ket would do. */
1060
1061 case OP_COND:
1062 case OP_SCOND:
1063 codelink = GET(ecode, 1);
1064
1065 /* Because of the way auto-callout works during compile, a callout item is
1066 inserted between OP_COND and an assertion condition. */
1067
1068 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1069 {
1070 if (pcre_callout != NULL)
1071 {
1072 pcre_callout_block cb;
1073 cb.version = 1; /* Version 1 of the callout block */
1074 cb.callout_number = ecode[LINK_SIZE+2];
1075 cb.offset_vector = md->offset_vector;
1076 cb.subject = (PCRE_SPTR)md->start_subject;
1077 cb.subject_length = (int)(md->end_subject - md->start_subject);
1078 cb.start_match = (int)(mstart - md->start_subject);
1079 cb.current_position = (int)(eptr - md->start_subject);
1080 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1081 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1082 cb.capture_top = offset_top/2;
1083 cb.capture_last = md->capture_last;
1084 cb.callout_data = md->callout_data;
1085 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1086 if (rrc < 0) RRETURN(rrc);
1087 }
1088 ecode += _pcre_OP_lengths[OP_CALLOUT];
1089 }
1090
1091 condcode = ecode[LINK_SIZE+1];
1092
1093 /* Now see what the actual condition is */
1094
1095 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1096 {
1097 if (md->recursive == NULL) /* Not recursing => FALSE */
1098 {
1099 condition = FALSE;
1100 ecode += GET(ecode, 1);
1101 }
1102 else
1103 {
1104 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1105 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1106
1107 /* If the test is for recursion into a specific subpattern, and it is
1108 false, but the test was set up by name, scan the table to see if the
1109 name refers to any other numbers, and test them. The condition is true
1110 if any one is set. */
1111
1112 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1113 {
1114 uschar *slotA = md->name_table;
1115 for (i = 0; i < md->name_count; i++)
1116 {
1117 if (GET2(slotA, 0) == recno) break;
1118 slotA += md->name_entry_size;
1119 }
1120
1121 /* Found a name for the number - there can be only one; duplicate
1122 names for different numbers are allowed, but not vice versa. First
1123 scan down for duplicates. */
1124
1125 if (i < md->name_count)
1126 {
1127 uschar *slotB = slotA;
1128 while (slotB > md->name_table)
1129 {
1130 slotB -= md->name_entry_size;
1131 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1132 {
1133 condition = GET2(slotB, 0) == md->recursive->group_num;
1134 if (condition) break;
1135 }
1136 else break;
1137 }
1138
1139 /* Scan up for duplicates */
1140
1141 if (!condition)
1142 {
1143 slotB = slotA;
1144 for (i++; i < md->name_count; i++)
1145 {
1146 slotB += md->name_entry_size;
1147 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1148 {
1149 condition = GET2(slotB, 0) == md->recursive->group_num;
1150 if (condition) break;
1151 }
1152 else break;
1153 }
1154 }
1155 }
1156 }
1157
1158 /* Choose branch according to the condition */
1159
1160 ecode += condition? 3 : GET(ecode, 1);
1161 }
1162 }
1163
1164 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1165 {
1166 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1167 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1168
1169 /* If the numbered capture is unset, but the reference was by name,
1170 scan the table to see if the name refers to any other numbers, and test
1171 them. The condition is true if any one is set. This is tediously similar
1172 to the code above, but not close enough to try to amalgamate. */
1173
1174 if (!condition && condcode == OP_NCREF)
1175 {
1176 int refno = offset >> 1;
1177 uschar *slotA = md->name_table;
1178
1179 for (i = 0; i < md->name_count; i++)
1180 {
1181 if (GET2(slotA, 0) == refno) break;
1182 slotA += md->name_entry_size;
1183 }
1184
1185 /* Found a name for the number - there can be only one; duplicate names
1186 for different numbers are allowed, but not vice versa. First scan down
1187 for duplicates. */
1188
1189 if (i < md->name_count)
1190 {
1191 uschar *slotB = slotA;
1192 while (slotB > md->name_table)
1193 {
1194 slotB -= md->name_entry_size;
1195 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1196 {
1197 offset = GET2(slotB, 0) << 1;
1198 condition = offset < offset_top &&
1199 md->offset_vector[offset] >= 0;
1200 if (condition) break;
1201 }
1202 else break;
1203 }
1204
1205 /* Scan up for duplicates */
1206
1207 if (!condition)
1208 {
1209 slotB = slotA;
1210 for (i++; i < md->name_count; i++)
1211 {
1212 slotB += md->name_entry_size;
1213 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1214 {
1215 offset = GET2(slotB, 0) << 1;
1216 condition = offset < offset_top &&
1217 md->offset_vector[offset] >= 0;
1218 if (condition) break;
1219 }
1220 else break;
1221 }
1222 }
1223 }
1224 }
1225
1226 /* Choose branch according to the condition */
1227
1228 ecode += condition? 3 : GET(ecode, 1);
1229 }
1230
1231 else if (condcode == OP_DEF) /* DEFINE - always false */
1232 {
1233 condition = FALSE;
1234 ecode += GET(ecode, 1);
1235 }
1236
1237 /* The condition is an assertion. Call match() to evaluate it - setting
1238 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1239 an assertion. */
1240
1241 else
1242 {
1243 md->match_function_type = MATCH_CONDASSERT;
1244 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1245 if (rrc == MATCH_MATCH)
1246 {
1247 if (md->end_offset_top > offset_top)
1248 offset_top = md->end_offset_top; /* Captures may have happened */
1249 condition = TRUE;
1250 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1251 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1252 }
1253 else if (rrc != MATCH_NOMATCH &&
1254 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1255 {
1256 RRETURN(rrc); /* Need braces because of following else */
1257 }
1258 else
1259 {
1260 condition = FALSE;
1261 ecode += codelink;
1262 }
1263 }
1264
1265 /* We are now at the branch that is to be obeyed. As there is only one,
1266 we used to use tail recursion to avoid using another stack frame, except
1267 when there was unlimited repeat of a possibly empty group. However, that
1268 strategy no longer works because of the possibility of (*THEN) being
1269 encountered in the branch. A recursive call to match() is always required,
1270 unless the second alternative doesn't exist, in which case we can just
1271 plough on. */
1272
1273 if (condition || *ecode == OP_ALT)
1274 {
1275 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1276 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1277 if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1278 rrc = MATCH_NOMATCH;
1279 RRETURN(rrc);
1280 }
1281 else /* Condition false & no alternative */
1282 {
1283 ecode += 1 + LINK_SIZE;
1284 }
1285 break;
1286
1287
1288 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1289 to close any currently open capturing brackets. */
1290
1291 case OP_CLOSE:
1292 number = GET2(ecode, 1);
1293 offset = number << 1;
1294
1295 #ifdef PCRE_DEBUG
1296 printf("end bracket %d at *ACCEPT", number);
1297 printf("\n");
1298 #endif
1299
1300 md->capture_last = number;
1301 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1302 {
1303 md->offset_vector[offset] =
1304 md->offset_vector[md->offset_end - number];
1305 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1306 if (offset_top <= offset) offset_top = offset + 2;
1307 }
1308 ecode += 3;
1309 break;
1310
1311
1312 /* End of the pattern, either real or forced. */
1313
1314 case OP_END:
1315 case OP_ACCEPT:
1316 case OP_ASSERT_ACCEPT:
1317
1318 /* If we have matched an empty string, fail if not in an assertion and not
1319 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1320 is set and we have matched at the start of the subject. In both cases,
1321 backtracking will then try other alternatives, if any. */
1322
1323 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1324 md->recursive == NULL &&
1325 (md->notempty ||
1326 (md->notempty_atstart &&
1327 mstart == md->start_subject + md->start_offset)))
1328 MRRETURN(MATCH_NOMATCH);
1329
1330 /* Otherwise, we have a match. */
1331
1332 md->end_match_ptr = eptr; /* Record where we ended */
1333 md->end_offset_top = offset_top; /* and how many extracts were taken */
1334 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1335
1336 /* For some reason, the macros don't work properly if an expression is
1337 given as the argument to MRRETURN when the heap is in use. */
1338
1339 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1340 MRRETURN(rrc);
1341
1342 /* Assertion brackets. Check the alternative branches in turn - the
1343 matching won't pass the KET for an assertion. If any one branch matches,
1344 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1345 start of each branch to move the current point backwards, so the code at
1346 this level is identical to the lookahead case. When the assertion is part
1347 of a condition, we want to return immediately afterwards. The caller of
1348 this incarnation of the match() function will have set MATCH_CONDASSERT in
1349 md->match_function_type, and one of these opcodes will be the first opcode
1350 that is processed. We use a local variable that is preserved over calls to
1351 match() to remember this case. */
1352
1353 case OP_ASSERT:
1354 case OP_ASSERTBACK:
1355 if (md->match_function_type == MATCH_CONDASSERT)
1356 {
1357 condassert = TRUE;
1358 md->match_function_type = 0;
1359 }
1360 else condassert = FALSE;
1361
1362 do
1363 {
1364 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1365 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1366 {
1367 mstart = md->start_match_ptr; /* In case \K reset it */
1368 markptr = md->mark;
1369 break;
1370 }
1371 if (rrc != MATCH_NOMATCH &&
1372 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1373 RRETURN(rrc);
1374 ecode += GET(ecode, 1);
1375 }
1376 while (*ecode == OP_ALT);
1377
1378 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1379
1380 /* If checking an assertion for a condition, return MATCH_MATCH. */
1381
1382 if (condassert) RRETURN(MATCH_MATCH);
1383
1384 /* Continue from after the assertion, updating the offsets high water
1385 mark, since extracts may have been taken during the assertion. */
1386
1387 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1388 ecode += 1 + LINK_SIZE;
1389 offset_top = md->end_offset_top;
1390 continue;
1391
1392 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1393 PRUNE, or COMMIT means we must assume failure without checking subsequent
1394 branches. */
1395
1396 case OP_ASSERT_NOT:
1397 case OP_ASSERTBACK_NOT:
1398 if (md->match_function_type == MATCH_CONDASSERT)
1399 {
1400 condassert = TRUE;
1401 md->match_function_type = 0;
1402 }
1403 else condassert = FALSE;
1404
1405 do
1406 {
1407 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1408 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1409 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1410 {
1411 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1412 break;
1413 }
1414 if (rrc != MATCH_NOMATCH &&
1415 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1416 RRETURN(rrc);
1417 ecode += GET(ecode,1);
1418 }
1419 while (*ecode == OP_ALT);
1420
1421 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1422
1423 ecode += 1 + LINK_SIZE;
1424 continue;
1425
1426 /* Move the subject pointer back. This occurs only at the start of
1427 each branch of a lookbehind assertion. If we are too close to the start to
1428 move back, this match function fails. When working with UTF-8 we move
1429 back a number of characters, not bytes. */
1430
1431 case OP_REVERSE:
1432 #ifdef SUPPORT_UTF8
1433 if (utf8)
1434 {
1435 i = GET(ecode, 1);
1436 while (i-- > 0)
1437 {
1438 eptr--;
1439 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1440 BACKCHAR(eptr);
1441 }
1442 }
1443 else
1444 #endif
1445
1446 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1447
1448 {
1449 eptr -= GET(ecode, 1);
1450 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1451 }
1452
1453 /* Save the earliest consulted character, then skip to next op code */
1454
1455 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1456 ecode += 1 + LINK_SIZE;
1457 break;
1458
1459 /* The callout item calls an external function, if one is provided, passing
1460 details of the match so far. This is mainly for debugging, though the
1461 function is able to force a failure. */
1462
1463 case OP_CALLOUT:
1464 if (pcre_callout != NULL)
1465 {
1466 pcre_callout_block cb;
1467 cb.version = 1; /* Version 1 of the callout block */
1468 cb.callout_number = ecode[1];
1469 cb.offset_vector = md->offset_vector;
1470 cb.subject = (PCRE_SPTR)md->start_subject;
1471 cb.subject_length = (int)(md->end_subject - md->start_subject);
1472 cb.start_match = (int)(mstart - md->start_subject);
1473 cb.current_position = (int)(eptr - md->start_subject);
1474 cb.pattern_position = GET(ecode, 2);
1475 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1476 cb.capture_top = offset_top/2;
1477 cb.capture_last = md->capture_last;
1478 cb.callout_data = md->callout_data;
1479 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1480 if (rrc < 0) RRETURN(rrc);
1481 }
1482 ecode += 2 + 2*LINK_SIZE;
1483 break;
1484
1485 /* Recursion either matches the current regex, or some subexpression. The
1486 offset data is the offset to the starting bracket from the start of the
1487 whole pattern. (This is so that it works from duplicated subpatterns.)
1488
1489 The state of the capturing groups is preserved over recursion, and
1490 re-instated afterwards. We don't know how many are started and not yet
1491 finished (offset_top records the completed total) so we just have to save
1492 all the potential data. There may be up to 65535 such values, which is too
1493 large to put on the stack, but using malloc for small numbers seems
1494 expensive. As a compromise, the stack is used when there are no more than
1495 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1496
1497 There are also other values that have to be saved. We use a chained
1498 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1499 for the original version of this logic. It has, however, been hacked around
1500 a lot, so he is not to blame for the current way it works. */
1501
1502 case OP_RECURSE:
1503 {
1504 recursion_info *ri;
1505 int recno;
1506
1507 callpat = md->start_code + GET(ecode, 1);
1508 recno = (callpat == md->start_code)? 0 :
1509 GET2(callpat, 1 + LINK_SIZE);
1510
1511 /* Check for repeating a recursion without advancing the subject pointer.
1512 This should catch convoluted mutual recursions. (Some simple cases are
1513 caught at compile time.) */
1514
1515 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1516 if (recno == ri->group_num && eptr == ri->subject_position)
1517 RRETURN(PCRE_ERROR_RECURSELOOP);
1518
1519 /* Add to "recursing stack" */
1520
1521 new_recursive.group_num = recno;
1522 new_recursive.subject_position = eptr;
1523 new_recursive.prevrec = md->recursive;
1524 md->recursive = &new_recursive;
1525
1526 /* Where to continue from afterwards */
1527
1528 ecode += 1 + LINK_SIZE;
1529
1530 /* Now save the offset data */
1531
1532 new_recursive.saved_max = md->offset_end;
1533 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1534 new_recursive.offset_save = stacksave;
1535 else
1536 {
1537 new_recursive.offset_save =
1538 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1539 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1540 }
1541 memcpy(new_recursive.offset_save, md->offset_vector,
1542 new_recursive.saved_max * sizeof(int));
1543
1544 /* OK, now we can do the recursion. After processing each alternative,
1545 restore the offset data. If there were nested recursions, md->recursive
1546 might be changed, so reset it before looping. */
1547
1548 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1549 cbegroup = (*callpat >= OP_SBRA);
1550 do
1551 {
1552 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1553 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1554 md, eptrb, RM6);
1555 memcpy(md->offset_vector, new_recursive.offset_save,
1556 new_recursive.saved_max * sizeof(int));
1557 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1558 {
1559 DPRINTF(("Recursion matched\n"));
1560 md->recursive = new_recursive.prevrec;
1561 if (new_recursive.offset_save != stacksave)
1562 (pcre_free)(new_recursive.offset_save);
1563
1564 /* Set where we got to in the subject, and reset the start in case
1565 it was changed by \K. This *is* propagated back out of a recursion,
1566 for Perl compatibility. */
1567
1568 eptr = md->end_match_ptr;
1569 mstart = md->start_match_ptr;
1570 goto RECURSION_MATCHED; /* Exit loop; end processing */
1571 }
1572 else if (rrc != MATCH_NOMATCH &&
1573 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1574 {
1575 DPRINTF(("Recursion gave error %d\n", rrc));
1576 if (new_recursive.offset_save != stacksave)
1577 (pcre_free)(new_recursive.offset_save);
1578 RRETURN(rrc);
1579 }
1580
1581 md->recursive = &new_recursive;
1582 callpat += GET(callpat, 1);
1583 }
1584 while (*callpat == OP_ALT);
1585
1586 DPRINTF(("Recursion didn't match\n"));
1587 md->recursive = new_recursive.prevrec;
1588 if (new_recursive.offset_save != stacksave)
1589 (pcre_free)(new_recursive.offset_save);
1590 MRRETURN(MATCH_NOMATCH);
1591 }
1592
1593 RECURSION_MATCHED:
1594 break;
1595
1596 /* An alternation is the end of a branch; scan along to find the end of the
1597 bracketed group and go to there. */
1598
1599 case OP_ALT:
1600 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1601 break;
1602
1603 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1604 indicating that it may occur zero times. It may repeat infinitely, or not
1605 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1606 with fixed upper repeat limits are compiled as a number of copies, with the
1607 optional ones preceded by BRAZERO or BRAMINZERO. */
1608
1609 case OP_BRAZERO:
1610 next = ecode + 1;
1611 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1612 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1613 do next += GET(next, 1); while (*next == OP_ALT);
1614 ecode = next + 1 + LINK_SIZE;
1615 break;
1616
1617 case OP_BRAMINZERO:
1618 next = ecode + 1;
1619 do next += GET(next, 1); while (*next == OP_ALT);
1620 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1621 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1622 ecode++;
1623 break;
1624
1625 case OP_SKIPZERO:
1626 next = ecode+1;
1627 do next += GET(next,1); while (*next == OP_ALT);
1628 ecode = next + 1 + LINK_SIZE;
1629 break;
1630
1631 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1632 here; just jump to the group, with allow_zero set TRUE. */
1633
1634 case OP_BRAPOSZERO:
1635 op = *(++ecode);
1636 allow_zero = TRUE;
1637 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1638 goto POSSESSIVE_NON_CAPTURE;
1639
1640 /* End of a group, repeated or non-repeating. */
1641
1642 case OP_KET:
1643 case OP_KETRMIN:
1644 case OP_KETRMAX:
1645 case OP_KETRPOS:
1646 prev = ecode - GET(ecode, 1);
1647
1648 /* If this was a group that remembered the subject start, in order to break
1649 infinite repeats of empty string matches, retrieve the subject start from
1650 the chain. Otherwise, set it NULL. */
1651
1652 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1653 {
1654 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1655 eptrb = eptrb->epb_prev; /* Backup to previous group */
1656 }
1657 else saved_eptr = NULL;
1658
1659 /* If we are at the end of an assertion group, stop matching and return
1660 MATCH_MATCH, but record the current high water mark for use by positive
1661 assertions. We also need to record the match start in case it was changed
1662 by \K. */
1663
1664 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1665 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
1666 {
1667 md->end_match_ptr = eptr; /* For ONCE */
1668 md->end_offset_top = offset_top;
1669 md->start_match_ptr = mstart;
1670 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1671 }
1672
1673 /* For capturing groups we have to check the group number back at the start
1674 and if necessary complete handling an extraction by setting the offsets and
1675 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1676 into group 0, so it won't be picked up here. Instead, we catch it when the
1677 OP_END is reached. Other recursion is handled here. We just have to record
1678 the current subject position and start match pointer and give a MATCH
1679 return. */
1680
1681 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1682 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1683 {
1684 number = GET2(prev, 1+LINK_SIZE);
1685 offset = number << 1;
1686
1687 #ifdef PCRE_DEBUG
1688 printf("end bracket %d", number);
1689 printf("\n");
1690 #endif
1691
1692 /* Handle a recursively called group. */
1693
1694 if (md->recursive != NULL && md->recursive->group_num == number)
1695 {
1696 md->end_match_ptr = eptr;
1697 md->start_match_ptr = mstart;
1698 RRETURN(MATCH_MATCH);
1699 }
1700
1701 /* Deal with capturing */
1702
1703 md->capture_last = number;
1704 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1705 {
1706 /* If offset is greater than offset_top, it means that we are
1707 "skipping" a capturing group, and that group's offsets must be marked
1708 unset. In earlier versions of PCRE, all the offsets were unset at the
1709 start of matching, but this doesn't work because atomic groups and
1710 assertions can cause a value to be set that should later be unset.
1711 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1712 part of the atomic group, but this is not on the final matching path,
1713 so must be unset when 2 is set. (If there is no group 2, there is no
1714 problem, because offset_top will then be 2, indicating no capture.) */
1715
1716 if (offset > offset_top)
1717 {
1718 register int *iptr = md->offset_vector + offset_top;
1719 register int *iend = md->offset_vector + offset;
1720 while (iptr < iend) *iptr++ = -1;
1721 }
1722
1723 /* Now make the extraction */
1724
1725 md->offset_vector[offset] =
1726 md->offset_vector[md->offset_end - number];
1727 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1728 if (offset_top <= offset) offset_top = offset + 2;
1729 }
1730 }
1731
1732 /* For an ordinary non-repeating ket, just continue at this level. This
1733 also happens for a repeating ket if no characters were matched in the
1734 group. This is the forcible breaking of infinite loops as implemented in
1735 Perl 5.005. For a non-repeating atomic group, establish a backup point by
1736 processing the rest of the pattern at a lower level. If this results in a
1737 NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1738 bypassing intermediate backup points, but resetting any captures that
1739 happened along the way. */
1740
1741 if (*ecode == OP_KET || eptr == saved_eptr)
1742 {
1743 if (*prev == OP_ONCE)
1744 {
1745 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1746 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1747 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1748 RRETURN(MATCH_ONCE);
1749 }
1750 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1751 break;
1752 }
1753
1754 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1755 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1756 at a time from the outer level, thus saving stack. */
1757
1758 if (*ecode == OP_KETRPOS)
1759 {
1760 md->end_match_ptr = eptr;
1761 md->end_offset_top = offset_top;
1762 RRETURN(MATCH_KETRPOS);
1763 }
1764
1765 /* The normal repeating kets try the rest of the pattern or restart from
1766 the preceding bracket, in the appropriate order. In the second case, we can
1767 use tail recursion to avoid using another stack frame, unless we have an
1768 atomic group or an unlimited repeat of a group that can match an empty
1769 string. */
1770
1771 if (*ecode == OP_KETRMIN)
1772 {
1773 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1774 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1775 if (*prev == OP_ONCE)
1776 {
1777 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1778 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1779 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1780 RRETURN(MATCH_ONCE);
1781 }
1782 if (*prev >= OP_SBRA) /* Could match an empty string */
1783 {
1784 md->match_function_type = MATCH_CBEGROUP;
1785 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1786 RRETURN(rrc);
1787 }
1788 ecode = prev;
1789 goto TAIL_RECURSE;
1790 }
1791 else /* OP_KETRMAX */
1792 {
1793 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1794 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1795 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1796 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1797 if (*prev == OP_ONCE)
1798 {
1799 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1800 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1801 md->once_target = prev;
1802 RRETURN(MATCH_ONCE);
1803 }
1804 ecode += 1 + LINK_SIZE;
1805 goto TAIL_RECURSE;
1806 }
1807 /* Control never gets here */
1808
1809 /* Not multiline mode: start of subject assertion, unless notbol. */
1810
1811 case OP_CIRC:
1812 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1813
1814 /* Start of subject assertion */
1815
1816 case OP_SOD:
1817 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1818 ecode++;
1819 break;
1820
1821 /* Multiline mode: start of subject unless notbol, or after any newline. */
1822
1823 case OP_CIRCM:
1824 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1825 if (eptr != md->start_subject &&
1826 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1827 MRRETURN(MATCH_NOMATCH);
1828 ecode++;
1829 break;
1830
1831 /* Start of match assertion */
1832
1833 case OP_SOM:
1834 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1835 ecode++;
1836 break;
1837
1838 /* Reset the start of match point */
1839
1840 case OP_SET_SOM:
1841 mstart = eptr;
1842 ecode++;
1843 break;
1844
1845 /* Multiline mode: assert before any newline, or before end of subject
1846 unless noteol is set. */
1847
1848 case OP_DOLLM:
1849 if (eptr < md->end_subject)
1850 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1851 else
1852 {
1853 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1854 SCHECK_PARTIAL();
1855 }
1856 ecode++;
1857 break;
1858
1859 /* Not multiline mode: assert before a terminating newline or before end of
1860 subject unless noteol is set. */
1861
1862 case OP_DOLL:
1863 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1864 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1865
1866 /* ... else fall through for endonly */
1867
1868 /* End of subject assertion (\z) */
1869
1870 case OP_EOD:
1871 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1872 SCHECK_PARTIAL();
1873 ecode++;
1874 break;
1875
1876 /* End of subject or ending \n assertion (\Z) */
1877
1878 case OP_EODN:
1879 ASSERT_NL_OR_EOS:
1880 if (eptr < md->end_subject &&
1881 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1882 MRRETURN(MATCH_NOMATCH);
1883
1884 /* Either at end of string or \n before end. */
1885
1886 SCHECK_PARTIAL();
1887 ecode++;
1888 break;
1889
1890 /* Word boundary assertions */
1891
1892 case OP_NOT_WORD_BOUNDARY:
1893 case OP_WORD_BOUNDARY:
1894 {
1895
1896 /* Find out if the previous and current characters are "word" characters.
1897 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1898 be "non-word" characters. Remember the earliest consulted character for
1899 partial matching. */
1900
1901 #ifdef SUPPORT_UTF8
1902 if (utf8)
1903 {
1904 /* Get status of previous character */
1905
1906 if (eptr == md->start_subject) prev_is_word = FALSE; else
1907 {
1908 USPTR lastptr = eptr - 1;
1909 while((*lastptr & 0xc0) == 0x80) lastptr--;
1910 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1911 GETCHAR(c, lastptr);
1912 #ifdef SUPPORT_UCP
1913 if (md->use_ucp)
1914 {
1915 if (c == '_') prev_is_word = TRUE; else
1916 {
1917 int cat = UCD_CATEGORY(c);
1918 prev_is_word = (cat == ucp_L || cat == ucp_N);
1919 }
1920 }
1921 else
1922 #endif
1923 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1924 }
1925
1926 /* Get status of next character */
1927
1928 if (eptr >= md->end_subject)
1929 {
1930 SCHECK_PARTIAL();
1931 cur_is_word = FALSE;
1932 }
1933 else
1934 {
1935 GETCHAR(c, eptr);
1936 #ifdef SUPPORT_UCP
1937 if (md->use_ucp)
1938 {
1939 if (c == '_') cur_is_word = TRUE; else
1940 {
1941 int cat = UCD_CATEGORY(c);
1942 cur_is_word = (cat == ucp_L || cat == ucp_N);
1943 }
1944 }
1945 else
1946 #endif
1947 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1948 }
1949 }
1950 else
1951 #endif
1952
1953 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1954 consistency with the behaviour of \w we do use it in this case. */
1955
1956 {
1957 /* Get status of previous character */
1958
1959 if (eptr == md->start_subject) prev_is_word = FALSE; else
1960 {
1961 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1962 #ifdef SUPPORT_UCP
1963 if (md->use_ucp)
1964 {
1965 c = eptr[-1];
1966 if (c == '_') prev_is_word = TRUE; else
1967 {
1968 int cat = UCD_CATEGORY(c);
1969 prev_is_word = (cat == ucp_L || cat == ucp_N);
1970 }
1971 }
1972 else
1973 #endif
1974 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1975 }
1976
1977 /* Get status of next character */
1978
1979 if (eptr >= md->end_subject)
1980 {
1981 SCHECK_PARTIAL();
1982 cur_is_word = FALSE;
1983 }
1984 else
1985 #ifdef SUPPORT_UCP
1986 if (md->use_ucp)
1987 {
1988 c = *eptr;
1989 if (c == '_') cur_is_word = TRUE; else
1990 {
1991 int cat = UCD_CATEGORY(c);
1992 cur_is_word = (cat == ucp_L || cat == ucp_N);
1993 }
1994 }
1995 else
1996 #endif
1997 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1998 }
1999
2000 /* Now see if the situation is what we want */
2001
2002 if ((*ecode++ == OP_WORD_BOUNDARY)?
2003 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2004 MRRETURN(MATCH_NOMATCH);
2005 }
2006 break;
2007
2008 /* Match a single character type; inline for speed */
2009
2010 case OP_ANY:
2011 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2012 /* Fall through */
2013
2014 case OP_ALLANY:
2015 if (eptr++ >= md->end_subject)
2016 {
2017 SCHECK_PARTIAL();
2018 MRRETURN(MATCH_NOMATCH);
2019 }
2020 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2021 ecode++;
2022 break;
2023
2024 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2025 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2026
2027 case OP_ANYBYTE:
2028 if (eptr++ >= md->end_subject)
2029 {
2030 SCHECK_PARTIAL();
2031 MRRETURN(MATCH_NOMATCH);
2032 }
2033 ecode++;
2034 break;
2035
2036 case OP_NOT_DIGIT:
2037 if (eptr >= md->end_subject)
2038 {
2039 SCHECK_PARTIAL();
2040 MRRETURN(MATCH_NOMATCH);
2041 }
2042 GETCHARINCTEST(c, eptr);
2043 if (
2044 #ifdef SUPPORT_UTF8
2045 c < 256 &&
2046 #endif
2047 (md->ctypes[c] & ctype_digit) != 0
2048 )
2049 MRRETURN(MATCH_NOMATCH);
2050 ecode++;
2051 break;
2052
2053 case OP_DIGIT:
2054 if (eptr >= md->end_subject)
2055 {
2056 SCHECK_PARTIAL();
2057 MRRETURN(MATCH_NOMATCH);
2058 }
2059 GETCHARINCTEST(c, eptr);
2060 if (
2061 #ifdef SUPPORT_UTF8
2062 c >= 256 ||
2063 #endif
2064 (md->ctypes[c] & ctype_digit) == 0
2065 )
2066 MRRETURN(MATCH_NOMATCH);
2067 ecode++;
2068 break;
2069
2070 case OP_NOT_WHITESPACE:
2071 if (eptr >= md->end_subject)
2072 {
2073 SCHECK_PARTIAL();
2074 MRRETURN(MATCH_NOMATCH);
2075 }
2076 GETCHARINCTEST(c, eptr);
2077 if (
2078 #ifdef SUPPORT_UTF8
2079 c < 256 &&
2080 #endif
2081 (md->ctypes[c] & ctype_space) != 0
2082 )
2083 MRRETURN(MATCH_NOMATCH);
2084 ecode++;
2085 break;
2086
2087 case OP_WHITESPACE:
2088 if (eptr >= md->end_subject)
2089 {
2090 SCHECK_PARTIAL();
2091 MRRETURN(MATCH_NOMATCH);
2092 }
2093 GETCHARINCTEST(c, eptr);
2094 if (
2095 #ifdef SUPPORT_UTF8
2096 c >= 256 ||
2097 #endif
2098 (md->ctypes[c] & ctype_space) == 0
2099 )
2100 MRRETURN(MATCH_NOMATCH);
2101 ecode++;
2102 break;
2103
2104 case OP_NOT_WORDCHAR:
2105 if (eptr >= md->end_subject)
2106 {
2107 SCHECK_PARTIAL();
2108 MRRETURN(MATCH_NOMATCH);
2109 }
2110 GETCHARINCTEST(c, eptr);
2111 if (
2112 #ifdef SUPPORT_UTF8
2113 c < 256 &&
2114 #endif
2115 (md->ctypes[c] & ctype_word) != 0
2116 )
2117 MRRETURN(MATCH_NOMATCH);
2118 ecode++;
2119 break;
2120
2121 case OP_WORDCHAR:
2122 if (eptr >= md->end_subject)
2123 {
2124 SCHECK_PARTIAL();
2125 MRRETURN(MATCH_NOMATCH);
2126 }
2127 GETCHARINCTEST(c, eptr);
2128 if (
2129 #ifdef SUPPORT_UTF8
2130 c >= 256 ||
2131 #endif
2132 (md->ctypes[c] & ctype_word) == 0
2133 )
2134 MRRETURN(MATCH_NOMATCH);
2135 ecode++;
2136 break;
2137
2138 case OP_ANYNL:
2139 if (eptr >= md->end_subject)
2140 {
2141 SCHECK_PARTIAL();
2142 MRRETURN(MATCH_NOMATCH);
2143 }
2144 GETCHARINCTEST(c, eptr);
2145 switch(c)
2146 {
2147 default: MRRETURN(MATCH_NOMATCH);
2148
2149 case 0x000d:
2150 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2151 break;
2152
2153 case 0x000a:
2154 break;
2155
2156 case 0x000b:
2157 case 0x000c:
2158 case 0x0085:
2159 case 0x2028:
2160 case 0x2029:
2161 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2162 break;
2163 }
2164 ecode++;
2165 break;
2166
2167 case OP_NOT_HSPACE:
2168 if (eptr >= md->end_subject)
2169 {
2170 SCHECK_PARTIAL();
2171 MRRETURN(MATCH_NOMATCH);
2172 }
2173 GETCHARINCTEST(c, eptr);
2174 switch(c)
2175 {
2176 default: break;
2177 case 0x09: /* HT */
2178 case 0x20: /* SPACE */
2179 case 0xa0: /* NBSP */
2180 case 0x1680: /* OGHAM SPACE MARK */
2181 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2182 case 0x2000: /* EN QUAD */
2183 case 0x2001: /* EM QUAD */
2184 case 0x2002: /* EN SPACE */
2185 case 0x2003: /* EM SPACE */
2186 case 0x2004: /* THREE-PER-EM SPACE */
2187 case 0x2005: /* FOUR-PER-EM SPACE */
2188 case 0x2006: /* SIX-PER-EM SPACE */
2189 case 0x2007: /* FIGURE SPACE */
2190 case 0x2008: /* PUNCTUATION SPACE */
2191 case 0x2009: /* THIN SPACE */
2192 case 0x200A: /* HAIR SPACE */
2193 case 0x202f: /* NARROW NO-BREAK SPACE */
2194 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2195 case 0x3000: /* IDEOGRAPHIC SPACE */
2196 MRRETURN(MATCH_NOMATCH);
2197 }
2198 ecode++;
2199 break;
2200
2201 case OP_HSPACE:
2202 if (eptr >= md->end_subject)
2203 {
2204 SCHECK_PARTIAL();
2205 MRRETURN(MATCH_NOMATCH);
2206 }
2207 GETCHARINCTEST(c, eptr);
2208 switch(c)
2209 {
2210 default: MRRETURN(MATCH_NOMATCH);
2211 case 0x09: /* HT */
2212 case 0x20: /* SPACE */
2213 case 0xa0: /* NBSP */
2214 case 0x1680: /* OGHAM SPACE MARK */
2215 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2216 case 0x2000: /* EN QUAD */
2217 case 0x2001: /* EM QUAD */
2218 case 0x2002: /* EN SPACE */
2219 case 0x2003: /* EM SPACE */
2220 case 0x2004: /* THREE-PER-EM SPACE */
2221 case 0x2005: /* FOUR-PER-EM SPACE */
2222 case 0x2006: /* SIX-PER-EM SPACE */
2223 case 0x2007: /* FIGURE SPACE */
2224 case 0x2008: /* PUNCTUATION SPACE */
2225 case 0x2009: /* THIN SPACE */
2226 case 0x200A: /* HAIR SPACE */
2227 case 0x202f: /* NARROW NO-BREAK SPACE */
2228 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2229 case 0x3000: /* IDEOGRAPHIC SPACE */
2230 break;
2231 }
2232 ecode++;
2233 break;
2234
2235 case OP_NOT_VSPACE:
2236 if (eptr >= md->end_subject)
2237 {
2238 SCHECK_PARTIAL();
2239 MRRETURN(MATCH_NOMATCH);
2240 }
2241 GETCHARINCTEST(c, eptr);
2242 switch(c)
2243 {
2244 default: break;
2245 case 0x0a: /* LF */
2246 case 0x0b: /* VT */
2247 case 0x0c: /* FF */
2248 case 0x0d: /* CR */
2249 case 0x85: /* NEL */
2250 case 0x2028: /* LINE SEPARATOR */
2251 case 0x2029: /* PARAGRAPH SEPARATOR */
2252 MRRETURN(MATCH_NOMATCH);
2253 }
2254 ecode++;
2255 break;
2256
2257 case OP_VSPACE:
2258 if (eptr >= md->end_subject)
2259 {
2260 SCHECK_PARTIAL();
2261 MRRETURN(MATCH_NOMATCH);
2262 }
2263 GETCHARINCTEST(c, eptr);
2264 switch(c)
2265 {
2266 default: MRRETURN(MATCH_NOMATCH);
2267 case 0x0a: /* LF */
2268 case 0x0b: /* VT */
2269 case 0x0c: /* FF */
2270 case 0x0d: /* CR */
2271 case 0x85: /* NEL */
2272 case 0x2028: /* LINE SEPARATOR */
2273 case 0x2029: /* PARAGRAPH SEPARATOR */
2274 break;
2275 }
2276 ecode++;
2277 break;
2278
2279 #ifdef SUPPORT_UCP
2280 /* Check the next character by Unicode property. We will get here only
2281 if the support is in the binary; otherwise a compile-time error occurs. */
2282
2283 case OP_PROP:
2284 case OP_NOTPROP:
2285 if (eptr >= md->end_subject)
2286 {
2287 SCHECK_PARTIAL();
2288 MRRETURN(MATCH_NOMATCH);
2289 }
2290 GETCHARINCTEST(c, eptr);
2291 {
2292 const ucd_record *prop = GET_UCD(c);
2293
2294 switch(ecode[1])
2295 {
2296 case PT_ANY:
2297 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2298 break;
2299
2300 case PT_LAMP:
2301 if ((prop->chartype == ucp_Lu ||
2302 prop->chartype == ucp_Ll ||
2303 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2304 MRRETURN(MATCH_NOMATCH);
2305 break;
2306
2307 case PT_GC:
2308 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2309 MRRETURN(MATCH_NOMATCH);
2310 break;
2311
2312 case PT_PC:
2313 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2314 MRRETURN(MATCH_NOMATCH);
2315 break;
2316
2317 case PT_SC:
2318 if ((ecode[2] != prop->script) == (op == OP_PROP))
2319 MRRETURN(MATCH_NOMATCH);
2320 break;
2321
2322 /* These are specials */
2323
2324 case PT_ALNUM:
2325 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2326 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2327 MRRETURN(MATCH_NOMATCH);
2328 break;
2329
2330 case PT_SPACE: /* Perl space */
2331 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2332 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2333 == (op == OP_NOTPROP))
2334 MRRETURN(MATCH_NOMATCH);
2335 break;
2336
2337 case PT_PXSPACE: /* POSIX space */
2338 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2339 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2340 c == CHAR_FF || c == CHAR_CR)
2341 == (op == OP_NOTPROP))
2342 MRRETURN(MATCH_NOMATCH);
2343 break;
2344
2345 case PT_WORD:
2346 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2347 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2348 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2349 MRRETURN(MATCH_NOMATCH);
2350 break;
2351
2352 /* This should never occur */
2353
2354 default:
2355 RRETURN(PCRE_ERROR_INTERNAL);
2356 }
2357
2358 ecode += 3;
2359 }
2360 break;
2361
2362 /* Match an extended Unicode sequence. We will get here only if the support
2363 is in the binary; otherwise a compile-time error occurs. */
2364
2365 case OP_EXTUNI:
2366 if (eptr >= md->end_subject)
2367 {
2368 SCHECK_PARTIAL();
2369 MRRETURN(MATCH_NOMATCH);
2370 }
2371 GETCHARINCTEST(c, eptr);
2372 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2373 while (eptr < md->end_subject)
2374 {
2375 int len = 1;
2376 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2377 if (UCD_CATEGORY(c) != ucp_M) break;
2378 eptr += len;
2379 }
2380 ecode++;
2381 break;
2382 #endif
2383
2384
2385 /* Match a back reference, possibly repeatedly. Look past the end of the
2386 item to see if there is repeat information following. The code is similar
2387 to that for character classes, but repeated for efficiency. Then obey
2388 similar code to character type repeats - written out again for speed.
2389 However, if the referenced string is the empty string, always treat
2390 it as matched, any number of times (otherwise there could be infinite
2391 loops). */
2392
2393 case OP_REF:
2394 case OP_REFI:
2395 caseless = op == OP_REFI;
2396 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2397 ecode += 3;
2398
2399 /* If the reference is unset, there are two possibilities:
2400
2401 (a) In the default, Perl-compatible state, set the length negative;
2402 this ensures that every attempt at a match fails. We can't just fail
2403 here, because of the possibility of quantifiers with zero minima.
2404
2405 (b) If the JavaScript compatibility flag is set, set the length to zero
2406 so that the back reference matches an empty string.
2407
2408 Otherwise, set the length to the length of what was matched by the
2409 referenced subpattern. */
2410
2411 if (offset >= offset_top || md->offset_vector[offset] < 0)
2412 length = (md->jscript_compat)? 0 : -1;
2413 else
2414 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2415
2416 /* Set up for repetition, or handle the non-repeated case */
2417
2418 switch (*ecode)
2419 {
2420 case OP_CRSTAR:
2421 case OP_CRMINSTAR:
2422 case OP_CRPLUS:
2423 case OP_CRMINPLUS:
2424 case OP_CRQUERY:
2425 case OP_CRMINQUERY:
2426 c = *ecode++ - OP_CRSTAR;
2427 minimize = (c & 1) != 0;
2428 min = rep_min[c]; /* Pick up values from tables; */
2429 max = rep_max[c]; /* zero for max => infinity */
2430 if (max == 0) max = INT_MAX;
2431 break;
2432
2433 case OP_CRRANGE:
2434 case OP_CRMINRANGE:
2435 minimize = (*ecode == OP_CRMINRANGE);
2436 min = GET2(ecode, 1);
2437 max = GET2(ecode, 3);
2438 if (max == 0) max = INT_MAX;
2439 ecode += 5;
2440 break;
2441
2442 default: /* No repeat follows */
2443 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2444 {
2445 CHECK_PARTIAL();
2446 MRRETURN(MATCH_NOMATCH);
2447 }
2448 eptr += length;
2449 continue; /* With the main loop */
2450 }
2451
2452 /* Handle repeated back references. If the length of the reference is
2453 zero, just continue with the main loop. */
2454
2455 if (length == 0) continue;
2456
2457 /* First, ensure the minimum number of matches are present. We get back
2458 the length of the reference string explicitly rather than passing the
2459 address of eptr, so that eptr can be a register variable. */
2460
2461 for (i = 1; i <= min; i++)
2462 {
2463 int slength;
2464 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2465 {
2466 CHECK_PARTIAL();
2467 MRRETURN(MATCH_NOMATCH);
2468 }
2469 eptr += slength;
2470 }
2471
2472 /* If min = max, continue at the same level without recursion.
2473 They are not both allowed to be zero. */
2474
2475 if (min == max) continue;
2476
2477 /* If minimizing, keep trying and advancing the pointer */
2478
2479 if (minimize)
2480 {
2481 for (fi = min;; fi++)
2482 {
2483 int slength;
2484 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2485 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2486 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2487 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2488 {
2489 CHECK_PARTIAL();
2490 MRRETURN(MATCH_NOMATCH);
2491 }
2492 eptr += slength;
2493 }
2494 /* Control never gets here */
2495 }
2496
2497 /* If maximizing, find the longest string and work backwards */
2498
2499 else
2500 {
2501 pp = eptr;
2502 for (i = min; i < max; i++)
2503 {
2504 int slength;
2505 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2506 {
2507 CHECK_PARTIAL();
2508 break;
2509 }
2510 eptr += slength;
2511 }
2512 while (eptr >= pp)
2513 {
2514 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2515 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2516 eptr -= length;
2517 }
2518 MRRETURN(MATCH_NOMATCH);
2519 }
2520 /* Control never gets here */
2521
2522 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2523 used when all the characters in the class have values in the range 0-255,
2524 and either the matching is caseful, or the characters are in the range
2525 0-127 when UTF-8 processing is enabled. The only difference between
2526 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2527 encountered.
2528
2529 First, look past the end of the item to see if there is repeat information
2530 following. Then obey similar code to character type repeats - written out
2531 again for speed. */
2532
2533 case OP_NCLASS:
2534 case OP_CLASS:
2535 {
2536 data = ecode + 1; /* Save for matching */
2537 ecode += 33; /* Advance past the item */
2538
2539 switch (*ecode)
2540 {
2541 case OP_CRSTAR:
2542 case OP_CRMINSTAR:
2543 case OP_CRPLUS:
2544 case OP_CRMINPLUS:
2545 case OP_CRQUERY:
2546 case OP_CRMINQUERY:
2547 c = *ecode++ - OP_CRSTAR;
2548 minimize = (c & 1) != 0;
2549 min = rep_min[c]; /* Pick up values from tables; */
2550 max = rep_max[c]; /* zero for max => infinity */
2551 if (max == 0) max = INT_MAX;
2552 break;
2553
2554 case OP_CRRANGE:
2555 case OP_CRMINRANGE:
2556 minimize = (*ecode == OP_CRMINRANGE);
2557 min = GET2(ecode, 1);
2558 max = GET2(ecode, 3);
2559 if (max == 0) max = INT_MAX;
2560 ecode += 5;
2561 break;
2562
2563 default: /* No repeat follows */
2564 min = max = 1;
2565 break;
2566 }
2567
2568 /* First, ensure the minimum number of matches are present. */
2569
2570 #ifdef SUPPORT_UTF8
2571 /* UTF-8 mode */
2572 if (utf8)
2573 {
2574 for (i = 1; i <= min; i++)
2575 {
2576 if (eptr >= md->end_subject)
2577 {
2578 SCHECK_PARTIAL();
2579 MRRETURN(MATCH_NOMATCH);
2580 }
2581 GETCHARINC(c, eptr);
2582 if (c > 255)
2583 {
2584 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2585 }
2586 else
2587 {
2588 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2589 }
2590 }
2591 }
2592 else
2593 #endif
2594 /* Not UTF-8 mode */
2595 {
2596 for (i = 1; i <= min; i++)
2597 {
2598 if (eptr >= md->end_subject)
2599 {
2600 SCHECK_PARTIAL();
2601 MRRETURN(MATCH_NOMATCH);
2602 }
2603 c = *eptr++;
2604 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2605 }
2606 }
2607
2608 /* If max == min we can continue with the main loop without the
2609 need to recurse. */
2610
2611 if (min == max) continue;
2612
2613 /* If minimizing, keep testing the rest of the expression and advancing
2614 the pointer while it matches the class. */
2615
2616 if (minimize)
2617 {
2618 #ifdef SUPPORT_UTF8
2619 /* UTF-8 mode */
2620 if (utf8)
2621 {
2622 for (fi = min;; fi++)
2623 {
2624 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2625 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2626 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2627 if (eptr >= md->end_subject)
2628 {
2629 SCHECK_PARTIAL();
2630 MRRETURN(MATCH_NOMATCH);
2631 }
2632 GETCHARINC(c, eptr);
2633 if (c > 255)
2634 {
2635 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2636 }
2637 else
2638 {
2639 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2640 }
2641 }
2642 }
2643 else
2644 #endif
2645 /* Not UTF-8 mode */
2646 {
2647 for (fi = min;; fi++)
2648 {
2649 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2650 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2651 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2652 if (eptr >= md->end_subject)
2653 {
2654 SCHECK_PARTIAL();
2655 MRRETURN(MATCH_NOMATCH);
2656 }
2657 c = *eptr++;
2658 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2659 }
2660 }
2661 /* Control never gets here */
2662 }
2663
2664 /* If maximizing, find the longest possible run, then work backwards. */
2665
2666 else
2667 {
2668 pp = eptr;
2669
2670 #ifdef SUPPORT_UTF8
2671 /* UTF-8 mode */
2672 if (utf8)
2673 {
2674 for (i = min; i < max; i++)
2675 {
2676 int len = 1;
2677 if (eptr >= md->end_subject)
2678 {
2679 SCHECK_PARTIAL();
2680 break;
2681 }
2682 GETCHARLEN(c, eptr, len);
2683 if (c > 255)
2684 {
2685 if (op == OP_CLASS) break;
2686 }
2687 else
2688 {
2689 if ((data[c/8] & (1 << (c&7))) == 0) break;
2690 }
2691 eptr += len;
2692 }
2693 for (;;)
2694 {
2695 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2696 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2697 if (eptr-- == pp) break; /* Stop if tried at original pos */
2698 BACKCHAR(eptr);
2699 }
2700 }
2701 else
2702 #endif
2703 /* Not UTF-8 mode */
2704 {
2705 for (i = min; i < max; i++)
2706 {
2707 if (eptr >= md->end_subject)
2708 {
2709 SCHECK_PARTIAL();
2710 break;
2711 }
2712 c = *eptr;
2713 if ((data[c/8] & (1 << (c&7))) == 0) break;
2714 eptr++;
2715 }
2716 while (eptr >= pp)
2717 {
2718 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2719 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2720 eptr--;
2721 }
2722 }
2723
2724 MRRETURN(MATCH_NOMATCH);
2725 }
2726 }
2727 /* Control never gets here */
2728
2729
2730 /* Match an extended character class. This opcode is encountered only
2731 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2732 mode, because Unicode properties are supported in non-UTF-8 mode. */
2733
2734 #ifdef SUPPORT_UTF8
2735 case OP_XCLASS:
2736 {
2737 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2738 ecode += GET(ecode, 1); /* Advance past the item */
2739
2740 switch (*ecode)
2741 {
2742 case OP_CRSTAR:
2743 case OP_CRMINSTAR:
2744 case OP_CRPLUS:
2745 case OP_CRMINPLUS:
2746 case OP_CRQUERY:
2747 case OP_CRMINQUERY:
2748 c = *ecode++ - OP_CRSTAR;
2749 minimize = (c & 1) != 0;
2750 min = rep_min[c]; /* Pick up values from tables; */
2751 max = rep_max[c]; /* zero for max => infinity */
2752 if (max == 0) max = INT_MAX;
2753 break;
2754
2755 case OP_CRRANGE:
2756 case OP_CRMINRANGE:
2757 minimize = (*ecode == OP_CRMINRANGE);
2758 min = GET2(ecode, 1);
2759 max = GET2(ecode, 3);
2760 if (max == 0) max = INT_MAX;
2761 ecode += 5;
2762 break;
2763
2764 default: /* No repeat follows */
2765 min = max = 1;
2766 break;
2767 }
2768
2769 /* First, ensure the minimum number of matches are present. */
2770
2771 for (i = 1; i <= min; i++)
2772 {
2773 if (eptr >= md->end_subject)
2774 {
2775 SCHECK_PARTIAL();
2776 MRRETURN(MATCH_NOMATCH);
2777 }
2778 GETCHARINCTEST(c, eptr);
2779 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2780 }
2781
2782 /* If max == min we can continue with the main loop without the
2783 need to recurse. */
2784
2785 if (min == max) continue;
2786
2787 /* If minimizing, keep testing the rest of the expression and advancing
2788 the pointer while it matches the class. */
2789
2790 if (minimize)
2791 {
2792 for (fi = min;; fi++)
2793 {
2794 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2796 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2797 if (eptr >= md->end_subject)
2798 {
2799 SCHECK_PARTIAL();
2800 MRRETURN(MATCH_NOMATCH);
2801 }
2802 GETCHARINCTEST(c, eptr);
2803 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2804 }
2805 /* Control never gets here */
2806 }
2807
2808 /* If maximizing, find the longest possible run, then work backwards. */
2809
2810 else
2811 {
2812 pp = eptr;
2813 for (i = min; i < max; i++)
2814 {
2815 int len = 1;
2816 if (eptr >= md->end_subject)
2817 {
2818 SCHECK_PARTIAL();
2819 break;
2820 }
2821 GETCHARLENTEST(c, eptr, len);
2822 if (!_pcre_xclass(c, data)) break;
2823 eptr += len;
2824 }
2825 for(;;)
2826 {
2827 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2828 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2829 if (eptr-- == pp) break; /* Stop if tried at original pos */
2830 if (utf8) BACKCHAR(eptr);
2831 }
2832 MRRETURN(MATCH_NOMATCH);
2833 }
2834
2835 /* Control never gets here */
2836 }
2837 #endif /* End of XCLASS */
2838
2839 /* Match a single character, casefully */
2840
2841 case OP_CHAR:
2842 #ifdef SUPPORT_UTF8
2843 if (utf8)
2844 {
2845 length = 1;
2846 ecode++;
2847 GETCHARLEN(fc, ecode, length);
2848 if (length > md->end_subject - eptr)
2849 {
2850 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2851 MRRETURN(MATCH_NOMATCH);
2852 }
2853 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2854 }
2855 else
2856 #endif
2857
2858 /* Non-UTF-8 mode */
2859 {
2860 if (md->end_subject - eptr < 1)
2861 {
2862 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2863 MRRETURN(MATCH_NOMATCH);
2864 }
2865 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2866 ecode += 2;
2867 }
2868 break;
2869
2870 /* Match a single character, caselessly */
2871
2872 case OP_CHARI:
2873 #ifdef SUPPORT_UTF8
2874 if (utf8)
2875 {
2876 length = 1;
2877 ecode++;
2878 GETCHARLEN(fc, ecode, length);
2879
2880 if (length > md->end_subject - eptr)
2881 {
2882 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2883 MRRETURN(MATCH_NOMATCH);
2884 }
2885
2886 /* If the pattern character's value is < 128, we have only one byte, and
2887 can use the fast lookup table. */
2888
2889 if (fc < 128)
2890 {
2891 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2892 }
2893
2894 /* Otherwise we must pick up the subject character */
2895
2896 else
2897 {
2898 unsigned int dc;
2899 GETCHARINC(dc, eptr);
2900 ecode += length;
2901
2902 /* If we have Unicode property support, we can use it to test the other
2903 case of the character, if there is one. */
2904
2905 if (fc != dc)
2906 {
2907 #ifdef SUPPORT_UCP
2908 if (dc != UCD_OTHERCASE(fc))
2909 #endif
2910 MRRETURN(MATCH_NOMATCH);
2911 }
2912 }
2913 }
2914 else
2915 #endif /* SUPPORT_UTF8 */
2916
2917 /* Non-UTF-8 mode */
2918 {
2919 if (md->end_subject - eptr < 1)
2920 {
2921 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2922 MRRETURN(MATCH_NOMATCH);
2923 }
2924 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2925 ecode += 2;
2926 }
2927 break;
2928
2929 /* Match a single character repeatedly. */
2930
2931 case OP_EXACT:
2932 case OP_EXACTI:
2933 min = max = GET2(ecode, 1);
2934 ecode += 3;
2935 goto REPEATCHAR;
2936
2937 case OP_POSUPTO:
2938 case OP_POSUPTOI:
2939 possessive = TRUE;
2940 /* Fall through */
2941
2942 case OP_UPTO:
2943 case OP_UPTOI:
2944 case OP_MINUPTO:
2945 case OP_MINUPTOI:
2946 min = 0;
2947 max = GET2(ecode, 1);
2948 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2949 ecode += 3;
2950 goto REPEATCHAR;
2951
2952 case OP_POSSTAR:
2953 case OP_POSSTARI:
2954 possessive = TRUE;
2955 min = 0;
2956 max = INT_MAX;
2957 ecode++;
2958 goto REPEATCHAR;
2959
2960 case OP_POSPLUS:
2961 case OP_POSPLUSI:
2962 possessive = TRUE;
2963 min = 1;
2964 max = INT_MAX;
2965 ecode++;
2966 goto REPEATCHAR;
2967
2968 case OP_POSQUERY:
2969 case OP_POSQUERYI:
2970 possessive = TRUE;
2971 min = 0;
2972 max = 1;
2973 ecode++;
2974 goto REPEATCHAR;
2975
2976 case OP_STAR:
2977 case OP_STARI:
2978 case OP_MINSTAR:
2979 case OP_MINSTARI:
2980 case OP_PLUS:
2981 case OP_PLUSI:
2982 case OP_MINPLUS:
2983 case OP_MINPLUSI:
2984 case OP_QUERY:
2985 case OP_QUERYI:
2986 case OP_MINQUERY:
2987 case OP_MINQUERYI:
2988 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
2989 minimize = (c & 1) != 0;
2990 min = rep_min[c]; /* Pick up values from tables; */
2991 max = rep_max[c]; /* zero for max => infinity */
2992 if (max == 0) max = INT_MAX;
2993
2994 /* Common code for all repeated single-character matches. */
2995
2996 REPEATCHAR:
2997 #ifdef SUPPORT_UTF8
2998 if (utf8)
2999 {
3000 length = 1;
3001 charptr = ecode;
3002 GETCHARLEN(fc, ecode, length);
3003 ecode += length;
3004
3005 /* Handle multibyte character matching specially here. There is
3006 support for caseless matching if UCP support is present. */
3007
3008 if (length > 1)
3009 {
3010 #ifdef SUPPORT_UCP
3011 unsigned int othercase;
3012 if (op >= OP_STARI && /* Caseless */
3013 (othercase = UCD_OTHERCASE(fc)) != fc)
3014 oclength = _pcre_ord2utf8(othercase, occhars);
3015 else oclength = 0;
3016 #endif /* SUPPORT_UCP */
3017
3018 for (i = 1; i <= min; i++)
3019 {
3020 if (eptr <= md->end_subject - length &&
3021 memcmp(eptr, charptr, length) == 0) eptr += length;
3022 #ifdef SUPPORT_UCP
3023 else if (oclength > 0 &&
3024 eptr <= md->end_subject - oclength &&
3025 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3026 #endif /* SUPPORT_UCP */
3027 else
3028 {
3029 CHECK_PARTIAL();
3030 MRRETURN(MATCH_NOMATCH);
3031 }
3032 }
3033
3034 if (min == max) continue;
3035
3036 if (minimize)
3037 {
3038 for (fi = min;; fi++)
3039 {
3040 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3041 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3042 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3043 if (eptr <= md->end_subject - length &&
3044 memcmp(eptr, charptr, length) == 0) eptr += length;
3045 #ifdef SUPPORT_UCP
3046 else if (oclength > 0 &&
3047 eptr <= md->end_subject - oclength &&
3048 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3049 #endif /* SUPPORT_UCP */
3050 else
3051 {
3052 CHECK_PARTIAL();
3053 MRRETURN(MATCH_NOMATCH);
3054 }
3055 }
3056 /* Control never gets here */
3057 }
3058
3059 else /* Maximize */
3060 {
3061 pp = eptr;
3062 for (i = min; i < max; i++)
3063 {
3064 if (eptr <= md->end_subject - length &&
3065 memcmp(eptr, charptr, length) == 0) eptr += length;
3066 #ifdef SUPPORT_UCP
3067 else if (oclength > 0 &&
3068 eptr <= md->end_subject - oclength &&
3069 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3070 #endif /* SUPPORT_UCP */
3071 else
3072 {
3073 CHECK_PARTIAL();
3074 break;
3075 }
3076 }
3077
3078 if (possessive) continue;
3079
3080 for(;;)
3081 {
3082 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3083 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3084 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3085 #ifdef SUPPORT_UCP
3086 eptr--;
3087 BACKCHAR(eptr);
3088 #else /* without SUPPORT_UCP */
3089 eptr -= length;
3090 #endif /* SUPPORT_UCP */
3091 }
3092 }
3093 /* Control never gets here */
3094 }
3095
3096 /* If the length of a UTF-8 character is 1, we fall through here, and
3097 obey the code as for non-UTF-8 characters below, though in this case the
3098 value of fc will always be < 128. */
3099 }
3100 else
3101 #endif /* SUPPORT_UTF8 */
3102
3103 /* When not in UTF-8 mode, load a single-byte character. */
3104
3105 fc = *ecode++;
3106
3107 /* The value of fc at this point is always less than 256, though we may or
3108 may not be in UTF-8 mode. The code is duplicated for the caseless and
3109 caseful cases, for speed, since matching characters is likely to be quite
3110 common. First, ensure the minimum number of matches are present. If min =
3111 max, continue at the same level without recursing. Otherwise, if
3112 minimizing, keep trying the rest of the expression and advancing one
3113 matching character if failing, up to the maximum. Alternatively, if
3114 maximizing, find the maximum number of characters and work backwards. */
3115
3116 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3117 max, eptr));
3118
3119 if (op >= OP_STARI) /* Caseless */
3120 {
3121 fc = md->lcc[fc];
3122 for (i = 1; i <= min; i++)
3123 {
3124 if (eptr >= md->end_subject)
3125 {
3126 SCHECK_PARTIAL();
3127 MRRETURN(MATCH_NOMATCH);
3128 }
3129 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3130 }
3131 if (min == max) continue;
3132 if (minimize)
3133 {
3134 for (fi = min;; fi++)
3135 {
3136 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3137 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3138 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3139 if (eptr >= md->end_subject)
3140 {
3141 SCHECK_PARTIAL();
3142 MRRETURN(MATCH_NOMATCH);
3143 }
3144 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3145 }
3146 /* Control never gets here */
3147 }
3148 else /* Maximize */
3149 {
3150 pp = eptr;
3151 for (i = min; i < max; i++)
3152 {
3153 if (eptr >= md->end_subject)
3154 {
3155 SCHECK_PARTIAL();
3156 break;
3157 }
3158 if (fc != md->lcc[*eptr]) break;
3159 eptr++;
3160 }
3161
3162 if (possessive) continue;
3163
3164 while (eptr >= pp)
3165 {
3166 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3167 eptr--;
3168 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3169 }
3170 MRRETURN(MATCH_NOMATCH);
3171 }
3172 /* Control never gets here */
3173 }
3174
3175 /* Caseful comparisons (includes all multi-byte characters) */
3176
3177 else
3178 {
3179 for (i = 1; i <= min; i++)
3180 {
3181 if (eptr >= md->end_subject)
3182 {
3183 SCHECK_PARTIAL();
3184 MRRETURN(MATCH_NOMATCH);
3185 }
3186 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3187 }
3188
3189 if (min == max) continue;
3190
3191 if (minimize)
3192 {
3193 for (fi = min;; fi++)
3194 {
3195 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3196 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3197 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3198 if (eptr >= md->end_subject)
3199 {
3200 SCHECK_PARTIAL();
3201 MRRETURN(MATCH_NOMATCH);
3202 }
3203 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3204 }
3205 /* Control never gets here */
3206 }
3207 else /* Maximize */
3208 {
3209 pp = eptr;
3210 for (i = min; i < max; i++)
3211 {
3212 if (eptr >= md->end_subject)
3213 {
3214 SCHECK_PARTIAL();
3215 break;
3216 }
3217 if (fc != *eptr) break;
3218 eptr++;
3219 }
3220 if (possessive) continue;
3221
3222 while (eptr >= pp)
3223 {
3224 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3225 eptr--;
3226 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3227 }
3228 MRRETURN(MATCH_NOMATCH);
3229 }
3230 }
3231 /* Control never gets here */
3232
3233 /* Match a negated single one-byte character. The pattern character is one
3234 byte, but the subject character being checked can be multibyte. */
3235
3236 case OP_NOT:
3237 case OP_NOTI:
3238 if (eptr >= md->end_subject)
3239 {
3240 SCHECK_PARTIAL();
3241 MRRETURN(MATCH_NOMATCH);
3242 }
3243 ecode++;
3244 GETCHARINCTEST(c, eptr);
3245 if (op == OP_NOTI) /* The caseless case */
3246 {
3247 #ifdef SUPPORT_UTF8
3248 if (c < 256)
3249 #endif
3250 c = md->lcc[c];
3251 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3252 }
3253 else /* Caseful */
3254 {
3255 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3256 }
3257 break;
3258
3259 /* Match a negated single one-byte character repeatedly. This is almost a
3260 repeat of the code for a repeated single character, but I haven't found a
3261 nice way of commoning these up that doesn't require a test of the
3262 positive/negative option for each character match. Maybe that wouldn't add
3263 very much to the time taken, but character matching *is* what this is all
3264 about... */
3265
3266 case OP_NOTEXACT:
3267 case OP_NOTEXACTI:
3268 min = max = GET2(ecode, 1);
3269 ecode += 3;
3270 goto REPEATNOTCHAR;
3271
3272 case OP_NOTUPTO:
3273 case OP_NOTUPTOI:
3274 case OP_NOTMINUPTO:
3275 case OP_NOTMINUPTOI:
3276 min = 0;
3277 max = GET2(ecode, 1);
3278 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3279 ecode += 3;
3280 goto REPEATNOTCHAR;
3281
3282 case OP_NOTPOSSTAR:
3283 case OP_NOTPOSSTARI:
3284 possessive = TRUE;
3285 min = 0;
3286 max = INT_MAX;
3287 ecode++;
3288 goto REPEATNOTCHAR;
3289
3290 case OP_NOTPOSPLUS:
3291 case OP_NOTPOSPLUSI:
3292 possessive = TRUE;
3293 min = 1;
3294 max = INT_MAX;
3295 ecode++;
3296 goto REPEATNOTCHAR;
3297
3298 case OP_NOTPOSQUERY:
3299 case OP_NOTPOSQUERYI:
3300 possessive = TRUE;
3301 min = 0;
3302 max = 1;
3303 ecode++;
3304 goto REPEATNOTCHAR;
3305
3306 case OP_NOTPOSUPTO:
3307 case OP_NOTPOSUPTOI:
3308 possessive = TRUE;
3309 min = 0;
3310 max = GET2(ecode, 1);
3311 ecode += 3;
3312 goto REPEATNOTCHAR;
3313
3314 case OP_NOTSTAR:
3315 case OP_NOTSTARI:
3316 case OP_NOTMINSTAR:
3317 case OP_NOTMINSTARI:
3318 case OP_NOTPLUS:
3319 case OP_NOTPLUSI:
3320 case OP_NOTMINPLUS:
3321 case OP_NOTMINPLUSI:
3322 case OP_NOTQUERY:
3323 case OP_NOTQUERYI:
3324 case OP_NOTMINQUERY:
3325 case OP_NOTMINQUERYI:
3326 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3327 minimize = (c & 1) != 0;
3328 min = rep_min[c]; /* Pick up values from tables; */
3329 max = rep_max[c]; /* zero for max => infinity */
3330 if (max == 0) max = INT_MAX;
3331
3332 /* Common code for all repeated negated single-byte matches. */
3333
3334 REPEATNOTCHAR:
3335 fc = *ecode++;
3336
3337 /* The code is duplicated for the caseless and caseful cases, for speed,
3338 since matching characters is likely to be quite common. First, ensure the
3339 minimum number of matches are present. If min = max, continue at the same
3340 level without recursing. Otherwise, if minimizing, keep trying the rest of
3341 the expression and advancing one matching character if failing, up to the
3342 maximum. Alternatively, if maximizing, find the maximum number of
3343 characters and work backwards. */
3344
3345 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3346 max, eptr));
3347
3348 if (op >= OP_NOTSTARI) /* Caseless */
3349 {
3350 fc = md->lcc[fc];
3351
3352 #ifdef SUPPORT_UTF8
3353 /* UTF-8 mode */
3354 if (utf8)
3355 {
3356 register unsigned int d;
3357 for (i = 1; i <= min; i++)
3358 {
3359 if (eptr >= md->end_subject)
3360 {
3361 SCHECK_PARTIAL();
3362 MRRETURN(MATCH_NOMATCH);
3363 }
3364 GETCHARINC(d, eptr);
3365 if (d < 256) d = md->lcc[d];
3366 if (fc == d) MRRETURN(MATCH_NOMATCH);
3367 }
3368 }
3369 else
3370 #endif
3371
3372 /* Not UTF-8 mode */
3373 {
3374 for (i = 1; i <= min; i++)
3375 {
3376 if (eptr >= md->end_subject)
3377 {
3378 SCHECK_PARTIAL();
3379 MRRETURN(MATCH_NOMATCH);
3380 }
3381 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3382 }
3383 }
3384
3385 if (min == max) continue;
3386
3387 if (minimize)
3388 {
3389 #ifdef SUPPORT_UTF8
3390 /* UTF-8 mode */
3391 if (utf8)
3392 {
3393 register unsigned int d;
3394 for (fi = min;; fi++)
3395 {
3396 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3397 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3398 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3399 if (eptr >= md->end_subject)
3400 {
3401 SCHECK_PARTIAL();
3402 MRRETURN(MATCH_NOMATCH);
3403 }
3404 GETCHARINC(d, eptr);
3405 if (d < 256) d = md->lcc[d];
3406 if (fc == d) MRRETURN(MATCH_NOMATCH);
3407 }
3408 }
3409 else
3410 #endif
3411 /* Not UTF-8 mode */
3412 {
3413 for (fi = min;; fi++)
3414 {
3415 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3416 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3417 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3418 if (eptr >= md->end_subject)
3419 {
3420 SCHECK_PARTIAL();
3421 MRRETURN(MATCH_NOMATCH);
3422 }
3423 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3424 }
3425 }
3426 /* Control never gets here */
3427 }
3428
3429 /* Maximize case */
3430
3431 else
3432 {
3433 pp = eptr;
3434
3435 #ifdef SUPPORT_UTF8
3436 /* UTF-8 mode */
3437 if (utf8)
3438 {
3439 register unsigned int d;
3440 for (i = min; i < max; i++)
3441 {
3442 int len = 1;
3443 if (eptr >= md->end_subject)
3444 {
3445 SCHECK_PARTIAL();
3446 break;
3447 }
3448 GETCHARLEN(d, eptr, len);
3449 if (d < 256) d = md->lcc[d];
3450 if (fc == d) break;
3451 eptr += len;
3452 }
3453 if (possessive) continue;
3454 for(;;)
3455 {
3456 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3457 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3458 if (eptr-- == pp) break; /* Stop if tried at original pos */
3459 BACKCHAR(eptr);
3460 }
3461 }
3462 else
3463 #endif
3464 /* Not UTF-8 mode */
3465 {
3466 for (i = min; i < max; i++)
3467 {
3468 if (eptr >= md->end_subject)
3469 {
3470 SCHECK_PARTIAL();
3471 break;
3472 }
3473 if (fc == md->lcc[*eptr]) break;
3474 eptr++;
3475 }
3476 if (possessive) continue;
3477 while (eptr >= pp)
3478 {
3479 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3480 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3481 eptr--;
3482 }
3483 }
3484
3485 MRRETURN(MATCH_NOMATCH);
3486 }
3487 /* Control never gets here */
3488 }
3489
3490 /* Caseful comparisons */
3491
3492 else
3493 {
3494 #ifdef SUPPORT_UTF8
3495 /* UTF-8 mode */
3496 if (utf8)
3497 {
3498 register unsigned int d;
3499 for (i = 1; i <= min; i++)
3500 {
3501 if (eptr >= md->end_subject)
3502 {
3503 SCHECK_PARTIAL();
3504 MRRETURN(MATCH_NOMATCH);
3505 }
3506 GETCHARINC(d, eptr);
3507 if (fc == d) MRRETURN(MATCH_NOMATCH);
3508 }
3509 }
3510 else
3511 #endif
3512 /* Not UTF-8 mode */
3513 {
3514 for (i = 1; i <= min; i++)
3515 {
3516 if (eptr >= md->end_subject)
3517 {
3518 SCHECK_PARTIAL();
3519 MRRETURN(MATCH_NOMATCH);
3520 }
3521 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3522 }
3523 }
3524
3525 if (min == max) continue;
3526
3527 if (minimize)
3528 {
3529 #ifdef SUPPORT_UTF8
3530 /* UTF-8 mode */
3531 if (utf8)
3532 {
3533 register unsigned int d;
3534 for (fi = min;; fi++)
3535 {
3536 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3537 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3538 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3539 if (eptr >= md->end_subject)
3540 {
3541 SCHECK_PARTIAL();
3542 MRRETURN(MATCH_NOMATCH);
3543 }
3544 GETCHARINC(d, eptr);
3545 if (fc == d) MRRETURN(MATCH_NOMATCH);
3546 }
3547 }
3548 else
3549 #endif
3550 /* Not UTF-8 mode */
3551 {
3552 for (fi = min;; fi++)
3553 {
3554 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3555 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3556 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3557 if (eptr >= md->end_subject)
3558 {
3559 SCHECK_PARTIAL();
3560 MRRETURN(MATCH_NOMATCH);
3561 }
3562 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3563 }
3564 }
3565 /* Control never gets here */
3566 }
3567
3568 /* Maximize case */
3569
3570 else
3571 {
3572 pp = eptr;
3573
3574 #ifdef SUPPORT_UTF8
3575 /* UTF-8 mode */
3576 if (utf8)
3577 {
3578 register unsigned int d;
3579 for (i = min; i < max; i++)
3580 {
3581 int len = 1;
3582 if (eptr >= md->end_subject)
3583 {
3584 SCHECK_PARTIAL();
3585 break;
3586 }
3587 GETCHARLEN(d, eptr, len);
3588 if (fc == d) break;
3589 eptr += len;
3590 }
3591 if (possessive) continue;
3592 for(;;)
3593 {
3594 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3596 if (eptr-- == pp) break; /* Stop if tried at original pos */
3597 BACKCHAR(eptr);
3598 }
3599 }
3600 else
3601 #endif
3602 /* Not UTF-8 mode */
3603 {
3604 for (i = min; i < max; i++)
3605 {
3606 if (eptr >= md->end_subject)
3607 {
3608 SCHECK_PARTIAL();
3609 break;
3610 }
3611 if (fc == *eptr) break;
3612 eptr++;
3613 }
3614 if (possessive) continue;
3615 while (eptr >= pp)
3616 {
3617 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3619 eptr--;
3620 }
3621 }
3622
3623 MRRETURN(MATCH_NOMATCH);
3624 }
3625 }
3626 /* Control never gets here */
3627
3628 /* Match a single character type repeatedly; several different opcodes
3629 share code. This is very similar to the code for single characters, but we
3630 repeat it in the interests of efficiency. */
3631
3632 case OP_TYPEEXACT:
3633 min = max = GET2(ecode, 1);
3634 minimize = TRUE;
3635 ecode += 3;
3636 goto REPEATTYPE;
3637
3638 case OP_TYPEUPTO:
3639 case OP_TYPEMINUPTO:
3640 min = 0;
3641 max = GET2(ecode, 1);
3642 minimize = *ecode == OP_TYPEMINUPTO;
3643 ecode += 3;
3644 goto REPEATTYPE;
3645
3646 case OP_TYPEPOSSTAR:
3647 possessive = TRUE;
3648 min = 0;
3649 max = INT_MAX;
3650 ecode++;
3651 goto REPEATTYPE;
3652
3653 case OP_TYPEPOSPLUS:
3654 possessive = TRUE;
3655 min = 1;
3656 max = INT_MAX;
3657 ecode++;
3658 goto REPEATTYPE;
3659
3660 case OP_TYPEPOSQUERY:
3661 possessive = TRUE;
3662 min = 0;
3663 max = 1;
3664 ecode++;
3665 goto REPEATTYPE;
3666
3667 case OP_TYPEPOSUPTO:
3668 possessive = TRUE;
3669 min = 0;
3670 max = GET2(ecode, 1);
3671 ecode += 3;
3672 goto REPEATTYPE;
3673
3674 case OP_TYPESTAR:
3675 case OP_TYPEMINSTAR:
3676 case OP_TYPEPLUS:
3677 case OP_TYPEMINPLUS:
3678 case OP_TYPEQUERY:
3679 case OP_TYPEMINQUERY:
3680 c = *ecode++ - OP_TYPESTAR;
3681 minimize = (c & 1) != 0;
3682 min = rep_min[c]; /* Pick up values from tables; */
3683 max = rep_max[c]; /* zero for max => infinity */
3684 if (max == 0) max = INT_MAX;
3685
3686 /* Common code for all repeated single character type matches. Note that
3687 in UTF-8 mode, '.' matches a character of any length, whereas the positive
3688 \d, \s, and \w types can match only one-byte characters. */
3689
3690 REPEATTYPE:
3691 ctype = *ecode++; /* Code for the character type */
3692
3693 #ifdef SUPPORT_UCP
3694 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3695 {
3696 prop_fail_result = ctype == OP_NOTPROP;
3697 prop_type = *ecode++;
3698 prop_value = *ecode++;
3699 }
3700 else prop_type = -1;
3701 #endif
3702
3703 /* First, ensure the minimum number of matches are present. Use inline
3704 code for maximizing the speed, and do the type test once at the start
3705 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3706 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3707 and single-bytes. */
3708
3709 if (min > 0)
3710 {
3711 #ifdef SUPPORT_UCP
3712 if (prop_type >= 0)
3713 {
3714 switch(prop_type)
3715 {
3716 case PT_ANY:
3717 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3718 for (i = 1; i <= min; i++)
3719 {
3720 if (eptr >= md->end_subject)
3721 {
3722 SCHECK_PARTIAL();
3723 MRRETURN(MATCH_NOMATCH);
3724 }
3725 GETCHARINCTEST(c, eptr);
3726 }
3727 break;
3728
3729 case PT_LAMP:
3730 for (i = 1; i <= min; i++)
3731 {
3732 int chartype;
3733 if (eptr >= md->end_subject)
3734 {
3735 SCHECK_PARTIAL();
3736 MRRETURN(MATCH_NOMATCH);
3737 }
3738 GETCHARINCTEST(c, eptr);
3739 chartype = UCD_CHARTYPE(c);
3740 if ((chartype == ucp_Lu ||
3741 chartype == ucp_Ll ||
3742 chartype == ucp_Lt) == prop_fail_result)
3743 MRRETURN(MATCH_NOMATCH);
3744 }
3745 break;
3746
3747 case PT_GC:
3748 for (i = 1; i <= min; i++)
3749 {
3750 if (eptr >= md->end_subject)
3751 {
3752 SCHECK_PARTIAL();
3753 MRRETURN(MATCH_NOMATCH);
3754 }
3755 GETCHARINCTEST(c, eptr);
3756 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3757 MRRETURN(MATCH_NOMATCH);
3758 }
3759 break;
3760
3761 case PT_PC:
3762 for (i = 1; i <= min; i++)
3763 {
3764 if (eptr >= md->end_subject)
3765 {
3766 SCHECK_PARTIAL();
3767 MRRETURN(MATCH_NOMATCH);
3768 }
3769 GETCHARINCTEST(c, eptr);
3770 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3771 MRRETURN(MATCH_NOMATCH);
3772 }
3773 break;
3774
3775 case PT_SC:
3776 for (i = 1; i <= min; i++)
3777 {
3778 if (eptr >= md->end_subject)
3779 {
3780 SCHECK_PARTIAL();
3781 MRRETURN(MATCH_NOMATCH);
3782 }
3783 GETCHARINCTEST(c, eptr);
3784 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3785 MRRETURN(MATCH_NOMATCH);
3786 }
3787 break;
3788
3789 case PT_ALNUM:
3790 for (i = 1; i <= min; i++)
3791 {
3792 int category;
3793 if (eptr >= md->end_subject)
3794 {
3795 SCHECK_PARTIAL();
3796 MRRETURN(MATCH_NOMATCH);
3797 }
3798 GETCHARINCTEST(c, eptr);
3799 category = UCD_CATEGORY(c);
3800 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3801 MRRETURN(MATCH_NOMATCH);
3802 }
3803 break;
3804
3805 case PT_SPACE: /* Perl space */
3806 for (i = 1; i <= min; i++)
3807 {
3808 if (eptr >= md->end_subject)
3809 {
3810 SCHECK_PARTIAL();
3811 MRRETURN(MATCH_NOMATCH);
3812 }
3813 GETCHARINCTEST(c, eptr);
3814 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3815 c == CHAR_FF || c == CHAR_CR)
3816 == prop_fail_result)
3817 MRRETURN(MATCH_NOMATCH);
3818 }
3819 break;
3820
3821 case PT_PXSPACE: /* POSIX space */
3822 for (i = 1; i <= min; i++)
3823 {
3824 if (eptr >= md->end_subject)
3825 {
3826 SCHECK_PARTIAL();
3827 MRRETURN(MATCH_NOMATCH);
3828 }
3829 GETCHARINCTEST(c, eptr);
3830 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3831 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3832 == prop_fail_result)
3833 MRRETURN(MATCH_NOMATCH);
3834 }
3835 break;
3836
3837 case PT_WORD:
3838 for (i = 1; i <= min; i++)
3839 {
3840 int category;
3841 if (eptr >= md->end_subject)
3842 {
3843 SCHECK_PARTIAL();
3844 MRRETURN(MATCH_NOMATCH);
3845 }
3846 GETCHARINCTEST(c, eptr);
3847 category = UCD_CATEGORY(c);
3848 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
3849 == prop_fail_result)
3850 MRRETURN(MATCH_NOMATCH);
3851 }
3852 break;
3853
3854 /* This should not occur */
3855
3856 default:
3857 RRETURN(PCRE_ERROR_INTERNAL);
3858 }
3859 }
3860
3861 /* Match extended Unicode sequences. We will get here only if the
3862 support is in the binary; otherwise a compile-time error occurs. */
3863
3864 else if (ctype == OP_EXTUNI)
3865 {
3866 for (i = 1; i <= min; i++)
3867 {
3868 if (eptr >= md->end_subject)
3869 {
3870 SCHECK_PARTIAL();
3871 MRRETURN(MATCH_NOMATCH);
3872 }
3873 GETCHARINCTEST(c, eptr);
3874 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
3875 while (eptr < md->end_subject)
3876 {
3877 int len = 1;
3878 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
3879 if (UCD_CATEGORY(c) != ucp_M) break;
3880 eptr += len;
3881 }
3882 }
3883 }
3884
3885 else
3886 #endif /* SUPPORT_UCP */
3887
3888 /* Handle all other cases when the coding is UTF-8 */
3889
3890 #ifdef SUPPORT_UTF8
3891 if (utf8) switch(ctype)
3892 {
3893 case OP_ANY:
3894 for (i = 1; i <= min; i++)
3895 {
3896 if (eptr >= md->end_subject)
3897 {
3898 SCHECK_PARTIAL();
3899 MRRETURN(MATCH_NOMATCH);
3900 }
3901 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3902 eptr++;
3903 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3904 }
3905 break;
3906
3907 case OP_ALLANY:
3908 for (i = 1; i <= min; i++)
3909 {
3910 if (eptr >= md->end_subject)
3911 {
3912 SCHECK_PARTIAL();
3913 MRRETURN(MATCH_NOMATCH);
3914 }
3915 eptr++;
3916 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3917 }
3918 break;
3919
3920 case OP_ANYBYTE:
3921 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3922 eptr += min;
3923 break;
3924
3925 case OP_ANYNL:
3926 for (i = 1; i <= min; i++)
3927 {
3928 if (eptr >= md->end_subject)
3929 {
3930 SCHECK_PARTIAL();
3931 MRRETURN(MATCH_NOMATCH);
3932 }
3933 GETCHARINC(c, eptr);
3934 switch(c)
3935 {
3936 default: MRRETURN(MATCH_NOMATCH);
3937
3938 case 0x000d:
3939 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3940 break;
3941
3942 case 0x000a:
3943 break;
3944
3945 case 0x000b:
3946 case 0x000c:
3947 case 0x0085:
3948 case 0x2028:
3949 case 0x2029:
3950 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3951 break;
3952 }
3953 }
3954 break;
3955
3956 case OP_NOT_HSPACE:
3957 for (i = 1; i <= min; i++)
3958 {
3959 if (eptr >= md->end_subject)
3960 {
3961 SCHECK_PARTIAL();
3962 MRRETURN(MATCH_NOMATCH);
3963 }
3964 GETCHARINC(c, eptr);
3965 switch(c)
3966 {
3967 default: break;
3968 case 0x09: /* HT */
3969 case 0x20: /* SPACE */
3970 case 0xa0: /* NBSP */
3971 case 0x1680: /* OGHAM SPACE MARK */
3972 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3973 case 0x2000: /* EN QUAD */
3974 case 0x2001: /* EM QUAD */
3975 case 0x2002: /* EN SPACE */
3976 case 0x2003: /* EM SPACE */
3977 case 0x2004: /* THREE-PER-EM SPACE */
3978 case 0x2005: /* FOUR-PER-EM SPACE */
3979 case 0x2006: /* SIX-PER-EM SPACE */
3980 case 0x2007: /* FIGURE SPACE */
3981 case 0x2008: /* PUNCTUATION SPACE */
3982 case 0x2009: /* THIN SPACE */
3983 case 0x200A: /* HAIR SPACE */
3984 case 0x202f: /* NARROW NO-BREAK SPACE */
3985 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3986 case 0x3000: /* IDEOGRAPHIC SPACE */
3987 MRRETURN(MATCH_NOMATCH);
3988 }
3989 }
3990 break;
3991
3992 case OP_HSPACE:
3993 for (i = 1; i <= min; i++)
3994 {
3995 if (eptr >= md->end_subject)
3996 {
3997 SCHECK_PARTIAL();
3998 MRRETURN(MATCH_NOMATCH);
3999 }
4000 GETCHARINC(c, eptr);
4001 switch(c)
4002 {
4003 default: MRRETURN(MATCH_NOMATCH);
4004 case 0x09: /* HT */
4005 case 0x20: /* SPACE */
4006 case 0xa0: /* NBSP */
4007 case 0x1680: /* OGHAM SPACE MARK */
4008 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4009 case 0x2000: /* EN QUAD */
4010 case 0x2001: /* EM QUAD */
4011 case 0x2002: /* EN SPACE */
4012 case 0x2003: /* EM SPACE */
4013 case 0x2004: /* THREE-PER-EM SPACE */
4014 case 0x2005: /* FOUR-PER-EM SPACE */
4015 case 0x2006: /* SIX-PER-EM SPACE */
4016 case 0x2007: /* FIGURE SPACE */
4017 case 0x2008: /* PUNCTUATION SPACE */
4018 case 0x2009: /* THIN SPACE */
4019 case 0x200A: /* HAIR SPACE */
4020 case 0x202f: /* NARROW NO-BREAK SPACE */
4021 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4022 case 0x3000: /* IDEOGRAPHIC SPACE */
4023 break;
4024 }
4025 }
4026 break;
4027
4028 case OP_NOT_VSPACE:
4029 for (i = 1; i <= min; i++)
4030 {
4031 if (eptr >= md->end_subject)
4032 {
4033 SCHECK_PARTIAL();
4034 MRRETURN(MATCH_NOMATCH);
4035 }
4036 GETCHARINC(c, eptr);
4037 switch(c)
4038 {
4039 default: break;
4040 case 0x0a: /* LF */
4041 case 0x0b: /* VT */
4042 case 0x0c: /* FF */
4043 case 0x0d: /* CR */
4044 case 0x85: /* NEL */
4045 case 0x2028: /* LINE SEPARATOR */
4046 case 0x2029: /* PARAGRAPH SEPARATOR */
4047 MRRETURN(MATCH_NOMATCH);
4048 }
4049 }
4050 break;
4051
4052 case OP_VSPACE:
4053 for (i = 1; i <= min; i++)
4054 {
4055 if (eptr >= md->end_subject)
4056 {
4057 SCHECK_PARTIAL();
4058 MRRETURN(MATCH_NOMATCH);
4059 }
4060 GETCHARINC(c, eptr);
4061 switch(c)
4062 {
4063 default: MRRETURN(MATCH_NOMATCH);
4064 case 0x0a: /* LF */
4065 case 0x0b: /* VT */
4066 case 0x0c: /* FF */
4067 case 0x0d: /* CR */
4068 case 0x85: /* NEL */
4069 case 0x2028: /* LINE SEPARATOR */
4070 case 0x2029: /* PARAGRAPH SEPARATOR */
4071 break;
4072 }
4073 }
4074 break;
4075
4076 case OP_NOT_DIGIT:
4077 for (i = 1; i <= min; i++)
4078 {
4079 if (eptr >= md->end_subject)
4080 {
4081 SCHECK_PARTIAL();
4082 MRRETURN(MATCH_NOMATCH);
4083 }
4084 GETCHARINC(c, eptr);
4085 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4086 MRRETURN(MATCH_NOMATCH);
4087 }
4088 break;
4089
4090 case OP_DIGIT:
4091 for (i = 1; i <= min; i++)
4092 {
4093 if (eptr >= md->end_subject)
4094 {
4095 SCHECK_PARTIAL();
4096 MRRETURN(MATCH_NOMATCH);
4097 }
4098 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4099 MRRETURN(MATCH_NOMATCH);
4100 /* No need to skip more bytes - we know it's a 1-byte character */
4101 }
4102 break;
4103
4104 case OP_NOT_WHITESPACE:
4105 for (i = 1; i <= min; i++)
4106 {
4107 if (eptr >= md->end_subject)
4108 {
4109 SCHECK_PARTIAL();
4110 MRRETURN(MATCH_NOMATCH);
4111 }
4112 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4113 MRRETURN(MATCH_NOMATCH);
4114 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4115 }
4116 break;
4117
4118 case OP_WHITESPACE:
4119 for (i = 1; i <= min; i++)
4120 {
4121 if (eptr >= md->end_subject)
4122 {
4123 SCHECK_PARTIAL();
4124 MRRETURN(MATCH_NOMATCH);
4125 }
4126 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4127 MRRETURN(MATCH_NOMATCH);
4128 /* No need to skip more bytes - we know it's a 1-byte character */
4129 }
4130 break;
4131
4132 case OP_NOT_WORDCHAR:
4133 for (i = 1; i <= min; i++)
4134 {
4135 if (eptr >= md->end_subject)
4136 {
4137 SCHECK_PARTIAL();
4138 MRRETURN(MATCH_NOMATCH);
4139 }
4140 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4141 MRRETURN(MATCH_NOMATCH);
4142 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4143 }
4144 break;
4145
4146 case OP_WORDCHAR:
4147 for (i = 1; i <= min; i++)
4148 {
4149 if (eptr >= md->end_subject)
4150 {
4151 SCHECK_PARTIAL();
4152 MRRETURN(MATCH_NOMATCH);
4153 }
4154 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4155 MRRETURN(MATCH_NOMATCH);
4156 /* No need to skip more bytes - we know it's a 1-byte character */
4157 }
4158 break;
4159
4160 default:
4161 RRETURN(PCRE_ERROR_INTERNAL);
4162 } /* End switch(ctype) */
4163
4164 else
4165 #endif /* SUPPORT_UTF8 */
4166
4167 /* Code for the non-UTF-8 case for minimum matching of operators other
4168 than OP_PROP and OP_NOTPROP. */
4169
4170 switch(ctype)
4171 {
4172 case OP_ANY:
4173 for (i = 1; i <= min; i++)
4174 {
4175 if (eptr >= md->end_subject)
4176 {
4177 SCHECK_PARTIAL();
4178 MRRETURN(MATCH_NOMATCH);
4179 }
4180 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4181 eptr++;
4182 }
4183 break;
4184
4185 case OP_ALLANY:
4186 if (eptr > md->end_subject - min)
4187 {
4188 SCHECK_PARTIAL();
4189 MRRETURN(MATCH_NOMATCH);
4190 }
4191 eptr += min;
4192 break;
4193
4194 case OP_ANYBYTE:
4195 if (eptr > md->end_subject - min)
4196 {
4197 SCHECK_PARTIAL();
4198 MRRETURN(MATCH_NOMATCH);
4199 }
4200 eptr += min;
4201 break;
4202
4203 case OP_ANYNL:
4204 for (i = 1; i <= min; i++)
4205 {
4206 if (eptr >= md->end_subject)
4207 {
4208 SCHECK_PARTIAL();
4209 MRRETURN(MATCH_NOMATCH);
4210 }
4211 switch(*eptr++)
4212 {
4213 default: MRRETURN(MATCH_NOMATCH);
4214
4215 case 0x000d:
4216 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4217 break;
4218
4219 case 0x000a:
4220 break;
4221
4222 case 0x000b:
4223 case 0x000c:
4224 case 0x0085:
4225 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4226 break;
4227 }
4228 }
4229 break;
4230
4231 case OP_NOT_HSPACE:
4232 for (i = 1; i <= min; i++)
4233 {
4234 if (eptr >= md->end_subject)
4235 {
4236 SCHECK_PARTIAL();
4237 MRRETURN(MATCH_NOMATCH);
4238 }
4239 switch(*eptr++)
4240 {
4241 default: break;
4242 case 0x09: /* HT */
4243 case 0x20: /* SPACE */
4244 case 0xa0: /* NBSP */
4245 MRRETURN(MATCH_NOMATCH);
4246 }
4247 }
4248 break;
4249
4250 case OP_HSPACE:
4251 for (i = 1; i <= min; i++)
4252 {
4253 if (eptr >= md->end_subject)
4254 {
4255 SCHECK_PARTIAL();
4256 MRRETURN(MATCH_NOMATCH);
4257 }
4258 switch(*eptr++)
4259 {
4260 default: MRRETURN(MATCH_NOMATCH);
4261 case 0x09: /* HT */
4262 case 0x20: /* SPACE */
4263 case 0xa0: /* NBSP */
4264 break;
4265 }
4266 }
4267 break;
4268
4269 case OP_NOT_VSPACE:
4270 for (i = 1; i <= min; i++)
4271 {
4272 if (eptr >= md->end_subject)
4273 {
4274 SCHECK_PARTIAL();
4275 MRRETURN(MATCH_NOMATCH);
4276 }
4277 switch(*eptr++)
4278 {
4279 default: break;
4280 case 0x0a: /* LF */
4281 case 0x0b: /* VT */
4282 case 0x0c: /* FF */
4283 case 0x0d: /* CR */
4284 case 0x85: /* NEL */
4285 MRRETURN(MATCH_NOMATCH);
4286 }
4287 }
4288 break;
4289
4290 case OP_VSPACE:
4291 for (i = 1; i <= min; i++)
4292 {
4293 if (eptr >= md->end_subject)
4294 {
4295 SCHECK_PARTIAL();
4296 MRRETURN(MATCH_NOMATCH);
4297 }
4298 switch(*eptr++)
4299 {
4300 default: MRRETURN(MATCH_NOMATCH);
4301 case 0x0a: /* LF */
4302 case 0x0b: /* VT */
4303 case 0x0c: /* FF */
4304 case 0x0d: /* CR */
4305 case 0x85: /* NEL */
4306 break;
4307 }
4308 }
4309 break;
4310
4311 case OP_NOT_DIGIT:
4312 for (i = 1; i <= min; i++)
4313 {
4314 if (eptr >= md->end_subject)
4315 {
4316 SCHECK_PARTIAL();
4317 MRRETURN(MATCH_NOMATCH);
4318 }
4319 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4320 }
4321 break;
4322
4323 case OP_DIGIT:
4324 for (i = 1; i <= min; i++)
4325 {
4326 if (eptr >= md->end_subject)
4327 {
4328 SCHECK_PARTIAL();
4329 MRRETURN(MATCH_NOMATCH);
4330 }
4331 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4332 }
4333 break;
4334
4335 case OP_NOT_WHITESPACE:
4336 for (i = 1; i <= min; i++)
4337 {
4338 if (eptr >= md->end_subject)
4339 {
4340 SCHECK_PARTIAL();
4341 MRRETURN(MATCH_NOMATCH);
4342 }
4343 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4344 }
4345 break;
4346
4347 case OP_WHITESPACE:
4348 for (i = 1; i <= min; i++)
4349 {
4350 if (eptr >= md->end_subject)
4351 {
4352 SCHECK_PARTIAL();
4353 MRRETURN(MATCH_NOMATCH);
4354 }
4355 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4356 }
4357 break;
4358
4359 case OP_NOT_WORDCHAR:
4360 for (i = 1; i <= min; i++)
4361 {
4362 if (eptr >= md->end_subject)
4363 {
4364 SCHECK_PARTIAL();
4365 MRRETURN(MATCH_NOMATCH);
4366 }
4367 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4368 MRRETURN(MATCH_NOMATCH);
4369 }
4370 break;
4371
4372 case OP_WORDCHAR:
4373 for (i = 1; i <= min; i++)
4374 {
4375 if (eptr >= md->end_subject)
4376 {
4377 SCHECK_PARTIAL();
4378 MRRETURN(MATCH_NOMATCH);
4379 }
4380 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4381 MRRETURN(MATCH_NOMATCH);
4382 }
4383 break;
4384
4385 default:
4386 RRETURN(PCRE_ERROR_INTERNAL);
4387 }
4388 }
4389
4390 /* If min = max, continue at the same level without recursing */
4391
4392 if (min == max) continue;
4393
4394 /* If minimizing, we have to test the rest of the pattern before each
4395 subsequent match. Again, separate the UTF-8 case for speed, and also
4396 separate the UCP cases. */
4397
4398 if (minimize)
4399 {
4400 #ifdef SUPPORT_UCP
4401 if (prop_type >= 0)
4402 {
4403 switch(prop_type)
4404 {
4405 case PT_ANY:
4406 for (fi = min;; fi++)
4407 {
4408 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4409 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4410 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4411 if (eptr >= md->end_subject)
4412 {
4413 SCHECK_PARTIAL();
4414 MRRETURN(MATCH_NOMATCH);
4415 }
4416 GETCHARINCTEST(c, eptr);
4417 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4418 }
4419 /* Control never gets here */
4420
4421 case PT_LAMP:
4422 for (fi = min;; fi++)
4423 {
4424 int chartype;
4425 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4427 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4428 if (eptr >= md->end_subject)
4429 {
4430 SCHECK_PARTIAL();
4431 MRRETURN(MATCH_NOMATCH);
4432 }
4433 GETCHARINCTEST(c, eptr);
4434 chartype = UCD_CHARTYPE(c);
4435 if ((chartype == ucp_Lu ||
4436 chartype == ucp_Ll ||
4437 chartype == ucp_Lt) == prop_fail_result)
4438 MRRETURN(MATCH_NOMATCH);
4439 }
4440 /* Control never gets here */
4441
4442 case PT_GC:
4443 for (fi = min;; fi++)
4444 {
4445 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4446 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4447 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4448 if (eptr >= md->end_subject)
4449 {
4450 SCHECK_PARTIAL();
4451 MRRETURN(MATCH_NOMATCH);
4452 }
4453 GETCHARINCTEST(c, eptr);
4454 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4455 MRRETURN(MATCH_NOMATCH);
4456 }
4457 /* Control never gets here */
4458
4459 case PT_PC:
4460 for (fi = min;; fi++)
4461 {
4462 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4463 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4464 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4465 if (eptr >= md->end_subject)
4466 {
4467 SCHECK_PARTIAL();
4468 MRRETURN(MATCH_NOMATCH);
4469 }
4470 GETCHARINCTEST(c, eptr);
4471 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4472 MRRETURN(MATCH_NOMATCH);
4473 }
4474 /* Control never gets here */
4475
4476 case PT_SC:
4477 for (fi = min;; fi++)
4478 {
4479 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4480 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4481 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4482 if (eptr >= md->end_subject)
4483 {
4484 SCHECK_PARTIAL();
4485 MRRETURN(MATCH_NOMATCH);
4486 }
4487 GETCHARINCTEST(c, eptr);
4488 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4489 MRRETURN(MATCH_NOMATCH);
4490 }
4491 /* Control never gets here */
4492
4493 case PT_ALNUM:
4494 for (fi = min;; fi++)
4495 {
4496 int category;
4497 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4498 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4499 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4500 if (eptr >= md->end_subject)
4501 {
4502 SCHECK_PARTIAL();
4503 MRRETURN(MATCH_NOMATCH);
4504 }
4505 GETCHARINCTEST(c, eptr);
4506 category = UCD_CATEGORY(c);
4507 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4508 MRRETURN(MATCH_NOMATCH);
4509 }
4510 /* Control never gets here */
4511
4512 case PT_SPACE: /* Perl space */
4513 for (fi = min;; fi++)
4514 {
4515 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4516 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4517 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4518 if (eptr >= md->end_subject)
4519 {
4520 SCHECK_PARTIAL();
4521 MRRETURN(MATCH_NOMATCH);
4522 }
4523 GETCHARINCTEST(c, eptr);
4524 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4525 c == CHAR_FF || c == CHAR_CR)
4526 == prop_fail_result)
4527 MRRETURN(MATCH_NOMATCH);
4528 }
4529 /* Control never gets here */
4530
4531 case PT_PXSPACE: /* POSIX space */
4532 for (fi = min;; fi++)
4533 {
4534 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4535 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4536 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4537 if (eptr >= md->end_subject)
4538 {
4539 SCHECK_PARTIAL();
4540 MRRETURN(MATCH_NOMATCH);
4541 }
4542 GETCHARINCTEST(c, eptr);
4543 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4544 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4545 == prop_fail_result)
4546 MRRETURN(MATCH_NOMATCH);
4547 }
4548 /* Control never gets here */
4549
4550 case PT_WORD:
4551 for (fi = min;; fi++)
4552 {
4553 int category;
4554 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4555 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4556 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4557 if (eptr >= md->end_subject)
4558 {
4559 SCHECK_PARTIAL();
4560 MRRETURN(MATCH_NOMATCH);
4561 }
4562 GETCHARINCTEST(c, eptr);
4563 category = UCD_CATEGORY(c);
4564 if ((category == ucp_L ||
4565 category == ucp_N ||
4566 c == CHAR_UNDERSCORE)
4567 == prop_fail_result)
4568 MRRETURN(MATCH_NOMATCH);
4569 }
4570 /* Control never gets here */
4571
4572 /* This should never occur */
4573
4574 default:
4575 RRETURN(PCRE_ERROR_INTERNAL);
4576 }
4577 }
4578
4579 /* Match extended Unicode sequences. We will get here only if the
4580 support is in the binary; otherwise a compile-time error occurs. */
4581
4582 else if (ctype == OP_EXTUNI)
4583 {
4584 for (fi = min;; fi++)
4585 {
4586 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4587 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4588 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4589 if (eptr >= md->end_subject)
4590 {
4591 SCHECK_PARTIAL();
4592 MRRETURN(MATCH_NOMATCH);
4593 }
4594 GETCHARINCTEST(c, eptr);
4595 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4596 while (eptr < md->end_subject)
4597 {
4598 int len = 1;
4599 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4600 if (UCD_CATEGORY(c) != ucp_M) break;
4601 eptr += len;
4602 }
4603 }
4604 }
4605 else
4606 #endif /* SUPPORT_UCP */
4607
4608 #ifdef SUPPORT_UTF8
4609 /* UTF-8 mode */
4610 if (utf8)
4611 {
4612 for (fi = min;; fi++)
4613 {
4614 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4615 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4616 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4617 if (eptr >= md->end_subject)
4618 {
4619 SCHECK_PARTIAL();
4620 MRRETURN(MATCH_NOMATCH);
4621 }
4622 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4623 MRRETURN(MATCH_NOMATCH);
4624 GETCHARINC(c, eptr);
4625 switch(ctype)
4626 {
4627 case OP_ANY: /* This is the non-NL case */
4628 case OP_ALLANY:
4629 case OP_ANYBYTE:
4630 break;
4631
4632 case OP_ANYNL:
4633 switch(c)
4634 {
4635 default: MRRETURN(MATCH_NOMATCH);
4636 case 0x000d:
4637 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4638 break;
4639 case 0x000a:
4640 break;
4641
4642 case 0x000b:
4643 case 0x000c:
4644 case 0x0085:
4645 case 0x2028:
4646 case 0x2029:
4647 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4648 break;
4649 }
4650 break;
4651
4652 case OP_NOT_HSPACE:
4653 switch(c)
4654 {
4655 default: break;
4656 case 0x09: /* HT */
4657 case 0x20: /* SPACE */
4658 case 0xa0: /* NBSP */
4659 case 0x1680: /* OGHAM SPACE MARK */
4660 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4661 case 0x2000: /* EN QUAD */
4662 case 0x2001: /* EM QUAD */
4663 case 0x2002: /* EN SPACE */
4664 case 0x2003: /* EM SPACE */
4665 case 0x2004: /* THREE-PER-EM SPACE */
4666 case 0x2005: /* FOUR-PER-EM SPACE */
4667 case 0x2006: /* SIX-PER-EM SPACE */
4668 case 0x2007: /* FIGURE SPACE */
4669 case 0x2008: /* PUNCTUATION SPACE */
4670 case 0x2009: /* THIN SPACE */
4671 case 0x200A: /* HAIR SPACE */
4672 case 0x202f: /* NARROW NO-BREAK SPACE */
4673 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4674 case 0x3000: /* IDEOGRAPHIC SPACE */
4675 MRRETURN(MATCH_NOMATCH);
4676 }
4677 break;
4678
4679 case OP_HSPACE:
4680 switch(c)
4681 {
4682 default: MRRETURN(MATCH_NOMATCH);
4683 case 0x09: /* HT */
4684 case 0x20: /* SPACE */
4685 case 0xa0: /* NBSP */
4686 case 0x1680: /* OGHAM SPACE MARK */
4687 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4688 case 0x2000: /* EN QUAD */
4689 case 0x2001: /* EM QUAD */
4690 case 0x2002: /* EN SPACE */
4691 case 0x2003: /* EM SPACE */
4692 case 0x2004: /* THREE-PER-EM SPACE */
4693 case 0x2005: /* FOUR-PER-EM SPACE */
4694 case 0x2006: /* SIX-PER-EM SPACE */
4695 case 0x2007: /* FIGURE SPACE */
4696 case 0x2008: /* PUNCTUATION SPACE */
4697 case 0x2009: /* THIN SPACE */
4698 case 0x200A: /* HAIR SPACE */
4699 case 0x202f: /* NARROW NO-BREAK SPACE */
4700 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4701 case 0x3000: /* IDEOGRAPHIC SPACE */
4702 break;
4703 }
4704 break;
4705
4706 case OP_NOT_VSPACE:
4707 switch(c)
4708 {
4709 default: break;
4710 case 0x0a: /* LF */
4711 case 0x0b: /* VT */
4712 case 0x0c: /* FF */
4713 case 0x0d: /* CR */
4714 case 0x85: /* NEL */
4715 case 0x2028: /* LINE SEPARATOR */
4716 case 0x2029: /* PARAGRAPH SEPARATOR */
4717 MRRETURN(MATCH_NOMATCH);
4718 }
4719 break;
4720
4721 case OP_VSPACE:
4722 switch(c)
4723 {
4724 default: MRRETURN(MATCH_NOMATCH);
4725 case 0x0a: /* LF */
4726 case 0x0b: /* VT */
4727 case 0x0c: /* FF */
4728 case 0x0d: /* CR */
4729 case 0x85: /* NEL */
4730 case 0x2028: /* LINE SEPARATOR */
4731 case 0x2029: /* PARAGRAPH SEPARATOR */
4732 break;
4733 }
4734 break;
4735
4736 case OP_NOT_DIGIT:
4737 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4738 MRRETURN(MATCH_NOMATCH);
4739 break;
4740
4741 case OP_DIGIT:
4742 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4743 MRRETURN(MATCH_NOMATCH);
4744 break;
4745
4746 case OP_NOT_WHITESPACE:
4747 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4748 MRRETURN(MATCH_NOMATCH);
4749 break;
4750
4751 case OP_WHITESPACE:
4752 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4753 MRRETURN(MATCH_NOMATCH);
4754 break;
4755
4756 case OP_NOT_WORDCHAR:
4757 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4758 MRRETURN(MATCH_NOMATCH);
4759 break;
4760
4761 case OP_WORDCHAR:
4762 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4763 MRRETURN(MATCH_NOMATCH);
4764 break;
4765
4766 default:
4767 RRETURN(PCRE_ERROR_INTERNAL);
4768 }
4769 }
4770 }
4771 else
4772 #endif
4773 /* Not UTF-8 mode */
4774 {
4775 for (fi = min;; fi++)
4776 {
4777 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4778 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4779 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4780 if (eptr >= md->end_subject)
4781 {
4782 SCHECK_PARTIAL();
4783 MRRETURN(MATCH_NOMATCH);
4784 }
4785 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4786 MRRETURN(MATCH_NOMATCH);
4787 c = *eptr++;
4788 switch(ctype)
4789 {
4790 case OP_ANY: /* This is the non-NL case */
4791 case OP_ALLANY:
4792 case OP_ANYBYTE:
4793 break;
4794
4795 case OP_ANYNL:
4796 switch(c)
4797 {
4798 default: MRRETURN(MATCH_NOMATCH);
4799 case 0x000d:
4800 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4801 break;
4802
4803 case 0x000a:
4804 break;
4805
4806 case 0x000b:
4807 case 0x000c:
4808 case 0x0085:
4809 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4810 break;
4811 }
4812 break;
4813
4814 case OP_NOT_HSPACE:
4815 switch(c)
4816 {
4817 default: break;
4818 case 0x09: /* HT */
4819 case 0x20: /* SPACE */
4820 case 0xa0: /* NBSP */
4821 MRRETURN(MATCH_NOMATCH);
4822 }
4823 break;
4824
4825 case OP_HSPACE:
4826 switch(c)
4827 {
4828 default: MRRETURN(MATCH_NOMATCH);
4829 case 0x09: /* HT */
4830 case 0x20: /* SPACE */
4831 case 0xa0: /* NBSP */
4832 break;
4833 }
4834 break;
4835
4836 case OP_NOT_VSPACE:
4837 switch(c)
4838 {
4839 default: break;
4840 case 0x0a: /* LF */
4841 case 0x0b: /* VT */
4842 case 0x0c: /* FF */
4843 case 0x0d: /* CR */
4844 case 0x85: /* NEL */
4845 MRRETURN(MATCH_NOMATCH);
4846 }
4847 break;
4848
4849 case OP_VSPACE:
4850 switch(c)
4851 {
4852 default: MRRETURN(MATCH_NOMATCH);
4853 case 0x0a: /* LF */
4854 case 0x0b: /* VT */
4855 case 0x0c: /* FF */
4856 case 0x0d: /* CR */
4857 case 0x85: /* NEL */
4858 break;
4859 }
4860 break;
4861
4862 case OP_NOT_DIGIT:
4863 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4864 break;
4865
4866 case OP_DIGIT:
4867 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4868 break;
4869
4870 case OP_NOT_WHITESPACE:
4871 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4872 break;
4873
4874 case OP_WHITESPACE:
4875 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4876 break;
4877
4878 case OP_NOT_WORDCHAR:
4879 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4880 break;
4881
4882 case OP_WORDCHAR:
4883 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4884 break;
4885
4886 default:
4887 RRETURN(PCRE_ERROR_INTERNAL);
4888 }
4889 }
4890 }
4891 /* Control never gets here */
4892 }
4893
4894 /* If maximizing, it is worth using inline code for speed, doing the type
4895 test once at the start (i.e. keep it out of the loop). Again, keep the
4896 UTF-8 and UCP stuff separate. */
4897
4898 else
4899 {
4900 pp = eptr; /* Remember where we started */
4901
4902 #ifdef SUPPORT_UCP
4903 if (prop_type >= 0)
4904 {
4905 switch(prop_type)
4906 {
4907 case PT_ANY:
4908 for (i = min; i < max; i++)
4909 {
4910 int len = 1;
4911 if (eptr >= md->end_subject)
4912 {
4913 SCHECK_PARTIAL();
4914 break;
4915 }
4916 GETCHARLENTEST(c, eptr, len);
4917 if (prop_fail_result) break;
4918 eptr+= len;
4919 }
4920 break;
4921
4922 case PT_LAMP:
4923 for (i = min; i < max; i++)
4924 {
4925 int chartype;
4926 int len = 1;
4927 if (eptr >= md->end_subject)
4928 {
4929 SCHECK_PARTIAL();
4930 break;
4931 }
4932 GETCHARLENTEST(c, eptr, len);
4933 chartype = UCD_CHARTYPE(c);
4934 if ((chartype == ucp_Lu ||
4935 chartype == ucp_Ll ||
4936 chartype == ucp_Lt) == prop_fail_result)
4937 break;
4938 eptr+= len;
4939 }
4940 break;
4941
4942 case PT_GC:
4943 for (i = min; i < max; i++)
4944 {
4945 int len = 1;
4946 if (eptr >= md->end_subject)
4947 {
4948 SCHECK_PARTIAL();
4949 break;
4950 }
4951 GETCHARLENTEST(c, eptr, len);
4952 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
4953 eptr+= len;
4954 }
4955 break;
4956
4957 case PT_PC:
4958 for (i = min; i < max; i++)
4959 {
4960 int len = 1;
4961 if (eptr >= md->end_subject)
4962 {
4963 SCHECK_PARTIAL();
4964 break;
4965 }
4966 GETCHARLENTEST(c, eptr, len);
4967 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
4968 eptr+= len;
4969 }
4970 break;
4971
4972 case PT_SC:
4973 for (i = min; i < max; i++)
4974 {
4975 int len = 1;
4976 if (eptr >= md->end_subject)
4977 {
4978 SCHECK_PARTIAL();
4979 break;
4980 }
4981 GETCHARLENTEST(c, eptr, len);
4982 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
4983 eptr+= len;
4984 }
4985 break;
4986
4987 case PT_ALNUM:
4988 for (i = min; i < max; i++)
4989 {
4990 int category;
4991 int len = 1;
4992 if (eptr >= md->end_subject)
4993 {
4994 SCHECK_PARTIAL();
4995 break;
4996 }
4997 GETCHARLENTEST(c, eptr, len);
4998 category = UCD_CATEGORY(c);
4999 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5000 break;
5001 eptr+= len;
5002 }
5003 break;
5004
5005 case PT_SPACE: /* Perl space */
5006 for (i = min; i < max; i++)
5007 {
5008 int len = 1;
5009 if (eptr >= md->end_subject)
5010 {
5011 SCHECK_PARTIAL();
5012 break;
5013 }
5014 GETCHARLENTEST(c, eptr, len);
5015 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5016 c == CHAR_FF || c == CHAR_CR)
5017 == prop_fail_result)
5018 break;
5019 eptr+= len;
5020 }
5021 break;
5022
5023 case PT_PXSPACE: /* POSIX space */
5024 for (i = min; i < max; i++)
5025 {
5026 int len = 1;
5027 if (eptr >= md->end_subject)
5028 {
5029 SCHECK_PARTIAL();
5030 break;
5031 }
5032 GETCHARLENTEST(c, eptr, len);
5033 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5034 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5035 == prop_fail_result)
5036 break;
5037 eptr+= len;
5038 }
5039 break;
5040
5041 case PT_WORD:
5042 for (i = min; i < max; i++)
5043 {
5044 int category;
5045 int len = 1;
5046 if (eptr >= md->end_subject)
5047 {
5048 SCHECK_PARTIAL();
5049 break;
5050 }
5051 GETCHARLENTEST(c, eptr, len);
5052 category = UCD_CATEGORY(c);
5053 if ((category == ucp_L || category == ucp_N ||
5054 c == CHAR_UNDERSCORE) == prop_fail_result)
5055 break;
5056 eptr+= len;
5057 }
5058 break;
5059
5060 default:
5061 RRETURN(PCRE_ERROR_INTERNAL);
5062 }
5063
5064 /* eptr is now past the end of the maximum run */
5065
5066 if (possessive) continue;
5067 for(;;)
5068 {
5069 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5070 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5071 if (eptr-- == pp) break; /* Stop if tried at original pos */
5072 if (utf8) BACKCHAR(eptr);
5073 }
5074 }
5075
5076 /* Match extended Unicode sequences. We will get here only if the
5077 support is in the binary; otherwise a compile-time error occurs. */
5078
5079 else if (ctype == OP_EXTUNI)
5080 {
5081 for (i = min; i < max; i++)
5082 {
5083 int len = 1;
5084 if (eptr >= md->end_subject)
5085 {
5086 SCHECK_PARTIAL();
5087 break;
5088 }
5089 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5090 if (UCD_CATEGORY(c) == ucp_M) break;
5091 eptr += len;
5092 while (eptr < md->end_subject)
5093 {
5094 len = 1;
5095 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5096 if (UCD_CATEGORY(c) != ucp_M) break;
5097 eptr += len;
5098 }
5099 }
5100
5101 /* eptr is now past the end of the maximum run */
5102
5103 if (possessive) continue;
5104
5105 for(;;)
5106 {
5107 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5108 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5109 if (eptr-- == pp) break; /* Stop if tried at original pos */
5110 for (;;) /* Move back over one extended */
5111 {
5112 if (!utf8) c = *eptr; else
5113 {
5114 BACKCHAR(eptr);
5115 GETCHAR(c, eptr);
5116 }
5117 if (UCD_CATEGORY(c) != ucp_M) break;
5118 eptr--;
5119 }
5120 }
5121 }
5122
5123 else
5124 #endif /* SUPPORT_UCP */
5125
5126 #ifdef SUPPORT_UTF8
5127 /* UTF-8 mode */
5128
5129 if (utf8)
5130 {
5131 switch(ctype)
5132 {
5133 case OP_ANY:
5134 if (max < INT_MAX)
5135 {
5136 for (i = min; i < max; i++)
5137 {
5138 if (eptr >= md->end_subject)
5139 {
5140 SCHECK_PARTIAL();
5141 break;
5142 }
5143 if (IS_NEWLINE(eptr)) break;
5144 eptr++;
5145 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5146 }
5147 }
5148
5149 /* Handle unlimited UTF-8 repeat */
5150
5151 else
5152 {
5153 for (i = min; i < max; i++)
5154 {
5155 if (eptr >= md->end_subject)
5156 {
5157 SCHECK_PARTIAL();
5158 break;
5159 }
5160 if (IS_NEWLINE(eptr)) break;
5161 eptr++;
5162 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5163 }
5164 }
5165 break;
5166
5167 case OP_ALLANY:
5168 if (max < INT_MAX)
5169 {
5170 for (i = min; i < max; i++)
5171 {
5172 if (eptr >= md->end_subject)
5173 {
5174 SCHECK_PARTIAL();
5175 break;
5176 }
5177 eptr++;
5178 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5179 }
5180 }
5181 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5182 break;
5183
5184 /* The byte case is the same as non-UTF8 */
5185
5186 case OP_ANYBYTE:
5187 c = max - min;
5188 if (c > (unsigned int)(md->end_subject - eptr))
5189 {
5190 eptr = md->end_subject;
5191 SCHECK_PARTIAL();
5192 }
5193 else eptr += c;
5194 break;
5195
5196 case OP_ANYNL:
5197 for (i = min; i < max; i++)
5198 {
5199 int len = 1;
5200 if (eptr >= md->end_subject)
5201 {
5202 SCHECK_PARTIAL();
5203 break;
5204 }
5205 GETCHARLEN(c, eptr, len);
5206 if (c == 0x000d)
5207 {
5208 if (++eptr >= md->end_subject) break;
5209 if (*eptr == 0x000a) eptr++;
5210 }
5211 else
5212 {
5213 if (c != 0x000a &&
5214 (md->bsr_anycrlf ||
5215 (c != 0x000b && c != 0x000c &&
5216 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5217 break;
5218 eptr += len;
5219 }
5220 }
5221 break;
5222
5223 case OP_NOT_HSPACE:
5224 case OP_HSPACE:
5225 for (i = min; i < max; i++)
5226 {
5227 BOOL gotspace;
5228 int len = 1;
5229 if (eptr >= md->end_subject)
5230 {
5231 SCHECK_PARTIAL();
5232 break;
5233 }
5234 GETCHARLEN(c, eptr, len);
5235 switch(c)
5236 {
5237 default: gotspace = FALSE; break;
5238 case 0x09: /* HT */
5239 case 0x20: /* SPACE */
5240 case 0xa0: /* NBSP */
5241 case 0x1680: /* OGHAM SPACE MARK */
5242 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5243 case 0x2000: /* EN QUAD */
5244 case 0x2001: /* EM QUAD */
5245 case 0x2002: /* EN SPACE */
5246 case 0x2003: /* EM SPACE */
5247 case 0x2004: /* THREE-PER-EM SPACE */
5248 case 0x2005: /* FOUR-PER-EM SPACE */
5249 case 0x2006: /* SIX-PER-EM SPACE */
5250 case 0x2007: /* FIGURE SPACE */
5251 case 0x2008: /* PUNCTUATION SPACE */
5252 case 0x2009: /* THIN SPACE */
5253 case 0x200A: /* HAIR SPACE */
5254 case 0x202f: /* NARROW NO-BREAK SPACE */
5255 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5256 case 0x3000: /* IDEOGRAPHIC SPACE */
5257 gotspace = TRUE;
5258 break;
5259 }
5260 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5261 eptr += len;
5262 }
5263 break;
5264
5265 case OP_NOT_VSPACE:
5266 case OP_VSPACE:
5267 for (i = min; i < max; i++)
5268 {
5269 BOOL gotspace;
5270 int len = 1;
5271 if (eptr >= md->end_subject)
5272 {
5273 SCHECK_PARTIAL();
5274 break;
5275 }
5276 GETCHARLEN(c, eptr, len);
5277 switch(c)
5278 {
5279 default: gotspace = FALSE; break;
5280 case 0x0a: /* LF */
5281 case 0x0b: /* VT */
5282 case 0x0c: /* FF */
5283 case 0x0d: /* CR */
5284 case 0x85: /* NEL */
5285 case 0x2028: /* LINE SEPARATOR */
5286 case 0x2029: /* PARAGRAPH SEPARATOR */
5287 gotspace = TRUE;
5288 break;
5289 }
5290 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5291 eptr += len;
5292 }
5293 break;
5294
5295 case OP_NOT_DIGIT:
5296 for (i = min; i < max; i++)
5297 {
5298 int len = 1;
5299 if (eptr >= md->end_subject)
5300 {
5301 SCHECK_PARTIAL();
5302 break;
5303 }
5304 GETCHARLEN(c, eptr, len);
5305 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5306 eptr+= len;
5307 }
5308 break;
5309
5310 case OP_DIGIT:
5311 for (i = min; i < max; i++)
5312 {
5313 int len = 1;
5314 if (eptr >= md->end_subject)
5315 {
5316 SCHECK_PARTIAL();
5317 break;
5318 }
5319 GETCHARLEN(c, eptr, len);
5320 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5321 eptr+= len;
5322 }
5323 break;
5324
5325 case OP_NOT_WHITESPACE:
5326 for (i = min; i < max; i++)
5327 {
5328 int len = 1;
5329 if (eptr >= md->end_subject)
5330 {
5331 SCHECK_PARTIAL();
5332 break;
5333 }
5334 GETCHARLEN(c, eptr, len);
5335 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5336 eptr+= len;
5337 }
5338 break;
5339
5340 case OP_WHITESPACE:
5341 for (i = min; i < max; i++)
5342 {
5343 int len = 1;
5344 if (eptr >= md->end_subject)
5345 {
5346 SCHECK_PARTIAL();
5347 break;
5348 }
5349 GETCHARLEN(c, eptr, len);
5350 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5351 eptr+= len;
5352 }
5353 break;
5354
5355 case OP_NOT_WORDCHAR:
5356 for (i = min; i < max; i++)
5357 {
5358 int len = 1;
5359 if (eptr >= md->end_subject)
5360 {
5361 SCHECK_PARTIAL();
5362 break;
5363 }
5364 GETCHARLEN(c, eptr, len);
5365 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5366 eptr+= len;
5367 }
5368 break;
5369
5370 case OP_WORDCHAR:
5371 for (i = min; i < max; i++)
5372 {
5373 int len = 1;
5374 if (eptr >= md->end_subject)
5375 {
5376 SCHECK_PARTIAL();
5377 break;
5378 }
5379 GETCHARLEN(c, eptr, len);
5380 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5381 eptr+= len;
5382 }
5383 break;
5384
5385 default:
5386 RRETURN(PCRE_ERROR_INTERNAL);
5387 }
5388
5389 /* eptr is now past the end of the maximum run. If possessive, we are
5390 done (no backing up). Otherwise, match at this position; anything other
5391 than no match is immediately returned. For nomatch, back up one
5392 character, unless we are matching \R and the last thing matched was
5393 \r\n, in which case, back up two bytes. */
5394
5395 if (possessive) continue;
5396 for(;;)
5397 {
5398 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5399 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5400 if (eptr-- == pp) break; /* Stop if tried at original pos */
5401 BACKCHAR(eptr);
5402 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5403 eptr[-1] == '\r') eptr--;
5404 }
5405 }
5406 else
5407 #endif /* SUPPORT_UTF8 */
5408
5409 /* Not UTF-8 mode */
5410 {
5411 switch(ctype)
5412 {
5413 case OP_ANY:
5414 for (i = min; i < max; i++)
5415 {
5416 if (eptr >= md->end_subject)
5417 {
5418 SCHECK_PARTIAL();
5419 break;
5420 }
5421 if (IS_NEWLINE(eptr)) break;
5422 eptr++;
5423 }
5424 break;
5425
5426 case OP_ALLANY:
5427 case OP_ANYBYTE:
5428 c = max - min;
5429 if (c > (unsigned int)(md->end_subject - eptr))
5430 {
5431 eptr = md->end_subject;
5432 SCHECK_PARTIAL();
5433 }
5434 else eptr += c;
5435 break;
5436
5437 case OP_ANYNL:
5438 for (i = min; i < max; i++)
5439 {
5440 if (eptr >= md->end_subject)
5441 {
5442 SCHECK_PARTIAL();
5443 break;
5444 }
5445 c = *eptr;
5446 if (c == 0x000d)
5447 {
5448 if (++eptr >= md->end_subject) break;
5449 if (*eptr == 0x000a) eptr++;
5450 }
5451 else
5452 {
5453 if (c != 0x000a &&
5454 (md->bsr_anycrlf ||
5455 (c != 0x000b && c != 0x000c && c != 0x0085)))
5456 break;
5457 eptr++;
5458 }
5459 }
5460 break;
5461
5462 case OP_NOT_HSPACE:
5463 for (i = min; i < max; i++)
5464 {
5465 if (eptr >= md->end_subject)
5466 {
5467 SCHECK_PARTIAL();
5468 break;
5469 }
5470 c = *eptr;
5471 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5472 eptr++;
5473 }
5474 break;
5475
5476 case OP_HSPACE:
5477 for (i = min; i < max; i++)
5478 {
5479 if (eptr >= md->end_subject)
5480 {
5481 SCHECK_PARTIAL();
5482 break;
5483 }
5484 c = *eptr;
5485 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5486 eptr++;
5487 }
5488 break;
5489
5490 case OP_NOT_VSPACE:
5491 for (i = min; i < max; i++)
5492 {
5493 if (eptr >= md->end_subject)
5494 {
5495 SCHECK_PARTIAL();
5496 break;
5497 }
5498 c = *eptr;
5499 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5500 break;
5501 eptr++;
5502 }
5503 break;
5504
5505 case OP_VSPACE:
5506 for (i = min; i < max; i++)
5507 {
5508 if (eptr >= md->end_subject)
5509 {
5510 SCHECK_PARTIAL();
5511 break;
5512 }
5513 c = *eptr;
5514 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5515 break;
5516 eptr++;
5517 }
5518 break;
5519
5520 case OP_NOT_DIGIT:
5521 for (i = min; i < max; i++)
5522 {
5523 if (eptr >= md->end_subject)
5524 {
5525 SCHECK_PARTIAL();
5526 break;
5527 }
5528 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5529 eptr++;
5530 }
5531 break;
5532
5533 case OP_DIGIT:
5534 for (i = min; i < max; i++)
5535 {
5536 if (eptr >= md->end_subject)
5537 {
5538 SCHECK_PARTIAL();
5539 break;
5540 }
5541 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5542 eptr++;
5543 }
5544 break;
5545
5546 case OP_NOT_WHITESPACE:
5547 for (i = min; i < max; i++)
5548 {
5549 if (eptr >= md->end_subject)
5550 {
5551 SCHECK_PARTIAL();
5552 break;
5553 }
5554 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5555 eptr++;
5556 }
5557 break;
5558
5559 case OP_WHITESPACE:
5560 for (i = min; i < max; i++)
5561 {
5562 if (eptr >= md->end_subject)
5563 {
5564 SCHECK_PARTIAL();
5565 break;
5566 }
5567 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5568 eptr++;
5569 }
5570 break;
5571
5572 case OP_NOT_WORDCHAR:
5573 for (i = min; i < max; i++)
5574 {
5575 if (eptr >= md->end_subject)
5576 {
5577 SCHECK_PARTIAL();
5578 break;
5579 }
5580 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5581 eptr++;
5582 }
5583 break;
5584
5585 case OP_WORDCHAR:
5586 for (i = min; i < max; i++)
5587 {
5588 if (eptr >= md->end_subject)
5589 {
5590 SCHECK_PARTIAL();
5591 break;
5592 }
5593 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5594 eptr++;
5595 }
5596 break;
5597
5598 default:
5599 RRETURN(PCRE_ERROR_INTERNAL);
5600 }
5601
5602 /* eptr is now past the end of the maximum run. If possessive, we are
5603 done (no backing up). Otherwise, match at this position; anything other
5604 than no match is immediately returned. For nomatch, back up one
5605 character (byte), unless we are matching \R and the last thing matched
5606 was \r\n, in which case, back up two bytes. */
5607
5608 if (possessive) continue;
5609 while (eptr >= pp)
5610 {
5611 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5612 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5613 eptr--;
5614 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5615 eptr[-1] == '\r') eptr--;
5616 }
5617 }
5618
5619 /* Get here if we can't make it match with any permitted repetitions */
5620
5621 MRRETURN(MATCH_NOMATCH);
5622 }
5623 /* Control never gets here */
5624
5625 /* There's been some horrible disaster. Arrival here can only mean there is
5626 something seriously wrong in the code above or the OP_xxx definitions. */
5627
5628 default:
5629 DPRINTF(("Unknown opcode %d\n", *ecode));
5630 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5631 }
5632
5633 /* Do not stick any code in here without much thought; it is assumed
5634 that "continue" in the code above comes out to here to repeat the main
5635 loop. */
5636
5637 } /* End of main loop */
5638 /* Control never reaches here */
5639
5640
5641 /* When compiling to use the heap rather than the stack for recursive calls to
5642 match(), the RRETURN() macro jumps here. The number that is saved in
5643 frame->Xwhere indicates which label we actually want to return to. */
5644
5645 #ifdef NO_RECURSE
5646 #define LBL(val) case val: goto L_RM##val;
5647 HEAP_RETURN:
5648 switch (frame->Xwhere)
5649 {
5650 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5651 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5652 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5653 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5654 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
5655 #ifdef SUPPORT_UTF8
5656 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5657 LBL(32) LBL(34) LBL(42) LBL(46)
5658 #ifdef SUPPORT_UCP
5659 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5660 LBL(59) LBL(60) LBL(61) LBL(62)
5661 #endif /* SUPPORT_UCP */
5662 #endif /* SUPPORT_UTF8 */
5663 default:
5664 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5665 return PCRE_ERROR_INTERNAL;
5666 }
5667 #undef LBL
5668 #endif /* NO_RECURSE */
5669 }
5670
5671
5672 /***************************************************************************
5673 ****************************************************************************
5674 RECURSION IN THE match() FUNCTION
5675
5676 Undefine all the macros that were defined above to handle this. */
5677
5678 #ifdef NO_RECURSE /* Heap rather than stack used for match() recursion */
5679 #undef eptr
5680 #undef ecode
5681 #undef mstart
5682 #undef offset_top
5683 #undef eptrb
5684 #undef flags
5685
5686 #undef callpat
5687 #undef charptr
5688 #undef data
5689 #undef next
5690 #undef pp
5691 #undef prev
5692 #undef saved_eptr
5693
5694 #undef new_recursive
5695
5696 #undef cur_is_word
5697 #undef condition
5698 #undef prev_is_word
5699
5700 #undef ctype
5701 #undef length
5702 #undef max
5703 #undef min
5704 #undef number
5705 #undef offset
5706 #undef op
5707 #undef save_capture_last
5708 #undef save_offset1
5709 #undef save_offset2
5710 #undef save_offset3
5711 #undef stacksave
5712
5713 #undef newptrb
5714
5715 #endif /* NO_RECURSE */
5716
5717 /* These two are defined as macros in both cases (whether or not NO_RECURSE is set), so they are undefined outside the conditional block */
5718
5719 #undef fc
5720 #undef fi
5721
5722 /***************************************************************************
5723 ***************************************************************************/
5724
5725
5726
5727 /*************************************************
5728 * Execute a Regular Expression *
5729 *************************************************/
5730
5731 /* This function applies a compiled re to a subject string and picks out
5732 portions of the string if it matches. Two elements in the vector are set for
5733 each substring: the offsets to the start and end of the substring.
5734
5735 Arguments:
5736 argument_re points to the compiled expression
5737 extra_data points to extra data or is NULL
5738 subject points to the subject string
5739 length length of subject string (may contain binary zeros)
5740 start_offset where to start in the subject string
5741 options option bits
5742 offsets points to a vector of ints to be filled in with offsets
5743 offsetcount the number of elements in the vector
5744
5745 Returns: > 0 => success; value is one more than the highest numbered pair of offsets that has been set
5746 = 0 => success, but offsets is not big enough
5747 -1 => failed to match
5748 < -1 => some kind of unexpected problem
5749 */
5750
5751 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5752 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5753 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5754 int offsetcount)
5755 {
5756 int rc, ocount;
5757 int first_byte = -1;
5758 int req_byte = -1;
5759 int req_byte2 = -1;
5760 int newline;
5761 BOOL using_temporary_offsets = FALSE;
5762 BOOL anchored;
5763 BOOL startline;
5764 BOOL firstline;
5765 BOOL first_byte_caseless = FALSE;
5766 BOOL req_byte_caseless = FALSE;
5767 BOOL utf8;
5768 match_data match_block;
5769 match_data *md = &match_block;
5770 const uschar *tables;
5771 const uschar *start_bits = NULL;
5772 USPTR start_match = (USPTR)subject + start_offset;
5773 USPTR end_subject;
5774 USPTR start_partial = NULL;
5775 USPTR req_byte_ptr = start_match - 1;
5776
5777 pcre_study_data internal_study;
5778 const pcre_study_data *study;
5779
5780 real_pcre internal_re;
5781 const real_pcre *external_re = (const real_pcre *)argument_re;
5782 const real_pcre *re = external_re;
5783
5784 /* Plausibility checks */
5785
5786 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5787 if (re == NULL || subject == NULL ||
5788 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5789 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5790 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5791
5792 /* This information is for finding all the numbers associated with a given
5793 name, for condition testing. */
5794
5795 md->name_table = (uschar *)re + re->name_table_offset;
5796 md->name_count = re->name_count;
5797 md->name_entry_size = re->name_entry_size;
5798
5799 /* Fish out the optional data from the extra_data structure, first setting
5800 the default values. */
5801
5802 study = NULL;
5803 md->match_limit = MATCH_LIMIT;
5804 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5805 md->callout_data = NULL;
5806
5807 /* The table pointer is always in native byte order. */
5808
5809 tables = external_re->tables;
5810
5811 if (extra_data != NULL)
5812 {
5813 register unsigned int flags = extra_data->flags;
5814 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5815 study = (const pcre_study_data *)extra_data->study_data;
5816 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5817 md->match_limit = extra_data->match_limit;
5818 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5819 md->match_limit_recursion = extra_data->match_limit_recursion;
5820 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5821 md->callout_data = extra_data->callout_data;
5822 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5823 }
5824
5825 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5826 is a feature that makes it possible to save compiled regex and re-use them
5827 in other programs later. */
5828
5829 if (tables == NULL) tables = _pcre_default_tables;
5830
5831 /* Check that the first field in the block is the magic number. If it is not,
5832 test for a regex that was compiled on a host of opposite endianness. If this is
5833 the case, flipped values are put in internal_re and internal_study if there was
5834 study data too. */
5835
5836 if (re->magic_number != MAGIC_NUMBER)
5837 {
5838 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5839 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5840 if (study != NULL) study = &internal_study;
5841 }
5842
5843 /* Set up other data */
5844
5845 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5846 startline = (re->flags & PCRE_STARTLINE) != 0;
5847 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5848
5849 /* The code starts after the real_pcre block and the capture name table. */
5850
5851 md->start_code = (const uschar *)external_re + re->name_table_offset +
5852 re->name_count * re->name_entry_size;
5853
5854 md->start_subject = (USPTR)subject;
5855 md->start_offset = start_offset;
5856 md->end_subject = md->start_subject + length;
5857 end_subject = md->end_subject;
5858
5859 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5860 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5861 md->use_ucp = (re->options & PCRE_UCP) != 0;
5862 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5863
5864 /* Some options are unpacked into BOOL variables in the hope that testing
5865 them will be faster than individual option bits. */
5866
5867 md->notbol = (options & PCRE_NOTBOL) != 0;
5868 md->noteol = (options & PCRE_NOTEOL) != 0;
5869 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5870 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5871 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5872 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5873
5874
5875 md->hitend = FALSE;
5876 md->mark = NULL; /* In case never set */
5877
5878 md->recursive = NULL; /* No recursion at top level */
5879
5880 md->lcc = tables + lcc_offset;
5881 md->ctypes = tables + ctypes_offset;
5882
5883 /* Handle different \R options. */
5884
5885 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5886 {
5887 case 0:
5888 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5889 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5890 else
5891 #ifdef BSR_ANYCRLF
5892 md->bsr_anycrlf = TRUE;
5893 #else
5894 md->bsr_anycrlf = FALSE;
5895 #endif
5896 break;
5897
5898 case PCRE_BSR_ANYCRLF:
5899 md->bsr_anycrlf = TRUE;
5900 break;
5901
5902 case PCRE_BSR_UNICODE:
5903 md->bsr_anycrlf = FALSE;
5904 break;
5905
5906 default: return PCRE_ERROR_BADNEWLINE;
5907 }
5908
5909 /* Handle different types of newline. The three bits give eight cases. If
5910 nothing is set at run time, whatever was used at compile time applies. */
5911
5912 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5913 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5914 {
5915 case 0: newline = NEWLINE; break; /* Compile-time default */
5916 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5917 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5918 case PCRE_NEWLINE_CR+
5919 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5920 case PCRE_NEWLINE_ANY: newline = -1; break;
5921 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5922 default: return PCRE_ERROR_BADNEWLINE;
5923 }
5924
5925 if (newline == -2)
5926 {
5927 md->nltype = NLTYPE_ANYCRLF;
5928 }
5929 else if (newline < 0)
5930 {
5931 md->nltype = NLTYPE_ANY;
5932 }
5933 else
5934 {
5935 md->nltype = NLTYPE_FIXED;
5936 if (newline > 255)
5937 {
5938 md->nllen = 2;
5939 md->nl[0] = (newline >> 8) & 255;
5940 md->nl[1] = newline & 255;
5941 }
5942 else
5943 {
5944 md->nllen = 1;
5945 md->nl[0] = newline;
5946 }
5947 }
5948
5949 /* Partial matching was originally supported only for a restricted set of
5950 regexes; from release 8.00 there are no restrictions, but the bits are still
5951 defined (though never set). So there's no harm in leaving this code. */
5952
5953 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5954 return PCRE_ERROR_BADPARTIAL;
5955
5956 /* Check a UTF-8 string if required. Pass back the character offset and error
5957 code for an invalid string if a results vector is available. */
5958
5959 #ifdef SUPPORT_UTF8
5960 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5961 {
5962 int erroroffset;
5963 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5964 if (errorcode != 0)
5965 {
5966 if (offsetcount >= 2)
5967 {
5968 offsets[0] = erroroffset;
5969 offsets[1] = errorcode;
5970 }
5971 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5972 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5973 }
5974
5975 /* Check that a start_offset points to the start of a UTF-8 character. */
5976
5977 if (start_offset > 0 && start_offset < length &&
5978 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
5979 return PCRE_ERROR_BADUTF8_OFFSET;
5980 }
5981 #endif
5982
5983 /* If the expression has got more back references than the offsets supplied can
5984 hold, we get a temporary chunk of working store to use during the matching.
5985 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5986 of 3. */
5987
5988 ocount = offsetcount - (offsetcount % 3);
5989
5990 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5991 {
5992 ocount = re->top_backref * 3 + 3;
5993 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5994 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5995 using_temporary_offsets = TRUE;
5996 DPRINTF(("Got memory to hold back references\n"));
5997 }
5998 else md->offset_vector = offsets;
5999
6000 md->offset_end = ocount;
6001 md->offset_max = (2*ocount)/3;
6002 md->offset_overflow = FALSE;
6003 md->capture_last = -1;
6004
6005 /* Reset the working variable associated with each extraction. These should
6006 never be used unless previously set, but they get saved and restored, and so we
6007 initialize them to avoid reading uninitialized locations. Also, unset the
6008 offsets for the matched string. This is really just for tidiness with callouts,
6009 in case they inspect these fields. */
6010
6011 if (md->offset_vector != NULL)
6012 {
6013 register int *iptr = md->offset_vector + ocount;
6014 register int *iend = iptr - re->top_bracket;
6015 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6016 while (--iptr >= iend) *iptr = -1;
6017 md->offset_vector[0] = md->offset_vector[1] = -1;
6018 }
6019
6020 /* Set up the first character to match, if available. The first_byte value is
6021 never set for an anchored regular expression, but the anchoring may be forced
6022 at run time, so we have to test for anchoring. The first char may be unset for
6023 an unanchored pattern, of course. If there's no first char and the pattern was
6024 studied, there may be a bitmap of possible first characters. */
6025
6026 if (!anchored)
6027 {
6028 if ((re->flags & PCRE_FIRSTSET) != 0)
6029 {
6030 first_byte = re->first_byte & 255;
6031 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6032 first_byte = md->lcc[first_byte];
6033 }
6034 else
6035 if (!startline && study != NULL &&
6036 (study->flags & PCRE_STUDY_MAPPED) != 0)
6037 start_bits = study->start_bits;
6038 }
6039
6040 /* For anchored or unanchored matches, there may be a "last known required
6041 character" set. */
6042
6043 if ((re->flags & PCRE_REQCHSET) != 0)
6044 {
6045 req_byte = re->req_byte & 255;
6046 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6047 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6048 }
6049
6050
6051
6052
6053 /* ==========================================================================*/
6054
6055 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6056 the loop runs just once. */
6057
6058 for(;;)
6059 {
6060 USPTR save_end_subject = end_subject;
6061 USPTR new_start_match;
6062
6063 /* If firstline is TRUE, the start of the match is constrained to the first
6064 line of a multiline string. That is, the match must be before or at the first
6065 newline. Implement this by temporarily adjusting end_subject so that we stop
6066 scanning at a newline. If the match fails at the newline, later code breaks
6067 this loop. */
6068
6069 if (firstline)
6070 {
6071 USPTR t = start_match;
6072 #ifdef SUPPORT_UTF8
6073 if (utf8)
6074 {
6075 while (t < md->end_subject && !IS_NEWLINE(t))
6076 {
6077 t++;
6078 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6079 }
6080 }
6081 else
6082 #endif
6083 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6084 end_subject = t;
6085 }
6086
6087 /* There are some optimizations that avoid running the match if a known
6088 starting point is not found, or if a known later character is not present.
6089 However, there is an option that disables these, for testing and for ensuring
6090 that all callouts do actually occur. The option can be set in the regex by
6091 (*NO_START_OPT) or passed in match-time options. */
6092
6093 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6094 {
6095 /* Advance to a unique first byte if there is one. */
6096
6097 if (first_byte >= 0)
6098 {
6099 if (first_byte_caseless)
6100 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6101 start_match++;
6102 else
6103 while (start_match < end_subject && *start_match != first_byte)
6104 start_match++;
6105 }
6106
6107 /* Or to just after a linebreak for a multiline match */
6108
6109 else if (startline)
6110 {
6111 if (start_match > md->start_subject + start_offset)
6112 {
6113 #ifdef SUPPORT_UTF8
6114 if (utf8)
6115 {
6116 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6117 {
6118 start_match++;
6119 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6120 start_match++;
6121 }
6122 }
6123 else
6124 #endif
6125 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6126 start_match++;
6127
6128 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6129 and we are now at a LF, advance the match position by one more character.
6130 */
6131
6132 if (start_match[-1] == CHAR_CR &&
6133 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6134 start_match < end_subject &&
6135 *start_match == CHAR_NL)
6136 start_match++;
6137 }
6138 }
6139
6140 /* Or to a non-unique first byte after study */
6141
6142 else if (start_bits != NULL)
6143 {
6144 while (start_match < end_subject)
6145 {
6146 register unsigned int c = *start_match;
6147 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6148 {
6149 start_match++;
6150 #ifdef SUPPORT_UTF8
6151 if (utf8)
6152 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6153 start_match++;
6154 #endif
6155 }
6156 else break;
6157 }
6158 }
6159 } /* Starting optimizations */
6160
6161 /* Restore fudged end_subject */
6162
6163 end_subject = save_end_subject;
6164
6165 /* The following two optimizations are disabled for partial matching or if
6166 disabling is explicitly requested. */
6167
6168 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6169 {
6170 /* If the pattern was studied, a minimum subject length may be set. This is
6171 a lower bound; no actual string of that length may actually match the
6172 pattern. Although the value is, strictly, in characters, we treat it as
6173 bytes to avoid spending too much time in this optimization. */
6174
6175 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6176 (pcre_uint32)(end_subject - start_match) < study->minlength)
6177 {
6178 rc = MATCH_NOMATCH;
6179 break;
6180 }
6181
6182 /* If req_byte is set, we know that that character must appear in the
6183 subject for the match to succeed. If the first character is set, req_byte
6184 must be later in the subject; otherwise the test starts at the match point.
6185 This optimization can save a huge amount of backtracking in patterns with
6186 nested unlimited repeats that aren't going to match. Writing separate code
6187 for cased/caseless versions makes it go faster, as does using an
6188 autoincrement and backing off on a match.
6189
6190 HOWEVER: when the subject string is very, very long, searching to its end
6191 can take a long time, and give bad performance on quite ordinary patterns.
6192 This showed up when somebody was matching something like /^\d+C/ on a
6193 32-megabyte string... so we don't do this when the string is sufficiently
6194 long. */
6195
6196 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6197 {
6198 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6199
6200 /* We don't need to repeat the search if we haven't yet reached the
6201 place we found it at last time. */
6202
6203 if (p > req_byte_ptr)
6204 {
6205 if (req_byte_caseless)
6206 {
6207 while (p < end_subject)
6208 {
6209 register int pp = *p++;
6210 if (pp == req_byte || pp == req_byte2) { p--; break; }
6211 }
6212 }
6213 else
6214 {
6215 while (p < end_subject)
6216 {
6217 if (*p++ == req_byte) { p--; break; }
6218 }
6219 }
6220
6221 /* If we can't find the required character, break the matching loop,
6222 forcing a match failure. */
6223
6224 if (p >= end_subject)
6225 {
6226 rc = MATCH_NOMATCH;
6227 break;
6228 }
6229
6230 /* If we have found the required character, save the point where we
6231 found it, so that we don't search again next time round the loop if
6232 the start hasn't passed this character yet. */
6233
6234 req_byte_ptr = p;
6235 }
6236 }
6237 }
6238
6239 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6240 printf(">>>> Match against: ");
6241 pchars(start_match, end_subject - start_match, TRUE, md);
6242 printf("\n");
6243 #endif
6244
6245 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6246 first starting point for which a partial match was found. */
6247
6248 md->start_match_ptr = start_match;
6249 md->start_used_ptr = start_match;
6250 md->match_call_count = 0;
6251 md->match_function_type = 0;
6252 md->end_offset_top = 0;
6253 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6254 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6255
6256 switch(rc)
6257 {
6258 /* SKIP passes back the next starting point explicitly, but if it is the
6259 same as the match we have just done, treat it as NOMATCH. */
6260
6261 case MATCH_SKIP:
6262 if (md->start_match_ptr != start_match)
6263 {
6264 new_start_match = md->start_match_ptr;
6265 break;
6266 }
6267 /* Fall through */
6268
6269 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6270 the SKIP's arg was not found. We also treat this as NOMATCH. */
6271
6272 case MATCH_SKIP_ARG:
6273 /* Fall through */
6274
6275 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6276 exactly like PRUNE. */
6277
6278 case MATCH_NOMATCH:
6279 case MATCH_PRUNE:
6280 case MATCH_THEN:
6281 new_start_match = start_match + 1;
6282 #ifdef SUPPORT_UTF8
6283 if (utf8)
6284 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6285 new_start_match++;
6286 #endif
6287 break;
6288
6289 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6290
6291 case MATCH_COMMIT:
6292 rc = MATCH_NOMATCH;
6293 goto ENDLOOP;
6294
6295 /* Any other return is either a match, or some kind of error. */
6296
6297 default:
6298 goto ENDLOOP;
6299 }
6300
6301 /* Control reaches here for the various types of "no match at this point"
6302 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6303
6304 rc = MATCH_NOMATCH;
6305
6306 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6307 newline in the subject (though it may continue over the newline). Therefore,
6308 if we have just failed to match, starting at a newline, do not continue. */
6309
6310 if (firstline && IS_NEWLINE(start_match)) break;
6311
6312 /* Advance to new matching position */
6313
6314 start_match = new_start_match;
6315
6316 /* Break the loop if the pattern is anchored or if we have passed the end of
6317 the subject. */
6318
6319 if (anchored || start_match > end_subject) break;
6320
6321 /* If we have just passed a CR and we are now at a LF, and the pattern does
6322 not contain any explicit matches for \r or \n, and the newline option is CRLF
6323 or ANY or ANYCRLF, advance the match position by one more character. */
6324
6325 if (start_match[-1] == CHAR_CR &&
6326 start_match < end_subject &&
6327 *start_match == CHAR_NL &&
6328 (re->flags & PCRE_HASCRORLF) == 0 &&
6329 (md->nltype == NLTYPE_ANY ||
6330 md->nltype == NLTYPE_ANYCRLF ||
6331 md->nllen == 2))
6332 start_match++;
6333
6334 md->mark = NULL; /* Reset for start of next match attempt */
6335 } /* End of for(;;) "bumpalong" loop */
6336
6337 /* ==========================================================================*/
6338
6339 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6340 conditions is true:
6341
6342 (1) The pattern is anchored or the match was failed by (*COMMIT);
6343
6344 (2) We are past the end of the subject;
6345
6346 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6347 this option requests that a match occur at or before the first newline in
6348 the subject.
6349
6350 When we have a match and the offset vector is big enough to deal with any
6351 backreferences, captured substring offsets will already be set up. In the case
6352 where we had to get some local store to hold offsets for backreference
6353 processing, copy those that we can. In this case there need not be overflow if
6354 certain parts of the pattern were not used, even though there are more
6355 capturing parentheses than vector slots. */
6356
6357 ENDLOOP:
6358
6359 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6360 {
6361 if (using_temporary_offsets)
6362 {
6363 if (offsetcount >= 4)
6364 {
6365 memcpy(offsets + 2, md->offset_vector + 2,
6366 (offsetcount - 2) * sizeof(int));
6367 DPRINTF(("Copied offsets from temporary memory\n"));
6368 }
6369 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6370 DPRINTF(("Freeing temporary memory\n"));
6371 (pcre_free)(md->offset_vector);
6372 }
6373
6374 /* Set the return code to the number of captured strings, or 0 if there are
6375 too many to fit into the vector. */
6376
6377 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6378
6379 /* If there is space in the offset vector, set any unused pairs at the end of
6380 the pattern to -1 for backwards compatibility. It is documented that this
6381 happens. In earlier versions, the whole set of potential capturing offsets
6382 was set to -1 each time round the loop, but this is handled differently now.
6383 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6384 those at the end that need unsetting here. We can't just unset them all at
6385 the start of the whole thing because they may get set in one branch that is
6386 not the final matching branch. */
6387
6388 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6389 {
6390 register int *iptr, *iend;
6391 int resetcount = 2 + re->top_bracket * 2;
6392 if (resetcount > offsetcount) resetcount = ocount;
6393 iptr = offsets + md->end_offset_top;
6394 iend = offsets + resetcount;
6395 while (iptr < iend) *iptr++ = -1;
6396 }
6397
6398 /* If there is space, set up the whole thing as substring 0. The value of
6399 md->start_match_ptr might be modified if \K was encountered on the successful
6400 matching path. */
6401
6402 if (offsetcount < 2) rc = 0; else
6403 {
6404 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6405 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6406 }
6407
6408 DPRINTF((">>>> returning %d\n", rc));
6409 goto RETURN_MARK;
6410 }
6411
6412 /* Control gets here if there has been an error, or if the overall match
6413 attempt has failed at all permitted starting positions. */
6414
6415 if (using_temporary_offsets)
6416 {
6417 DPRINTF(("Freeing temporary memory\n"));
6418 (pcre_free)(md->offset_vector);
6419 }
6420
6421 /* For anything other than nomatch or partial match, just return the code. */
6422
6423 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6424 {
6425 DPRINTF((">>>> error: returning %d\n", rc));
6426 return rc;
6427 }
6428
6429 /* Handle partial matches - disable any mark data */
6430
6431 if (start_partial != NULL)
6432 {
6433 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6434 md->mark = NULL;
6435 if (offsetcount > 1)
6436 {
6437 offsets[0] = (int)(start_partial - (USPTR)subject);
6438 offsets[1] = (int)(end_subject - (USPTR)subject);
6439 }
6440 rc = PCRE_ERROR_PARTIAL;
6441 }
6442
6443 /* This is the classic nomatch case */
6444
6445 else
6446 {
6447 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6448 rc = PCRE_ERROR_NOMATCH;
6449 }
6450
6451 /* Return the MARK data if it has been requested. */
6452
6453 RETURN_MARK:
6454
6455 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6456 *(extra_data->mark) = (unsigned char *)(md->mark);
6457 return rc;
6458 }
6459
6460 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12