/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 654 - (show annotations) (download)
Tue Aug 2 11:00:40 2011 UTC (3 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 194663 byte(s)
Documentation and general text tidies in preparation for test release.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* This is a convenience macro for code that occurs many times. */
86
/* MRRETURN(ra): store the current mark pointer in md->mark, then leave via
RRETURN(ra). The expansion refers to "md" and "markptr" by name, so it can
only be used where both are in scope. It expands to a brace block, not a
single expression. */
87 #define MRRETURN(ra) \
88 { \
89 md->mark = markptr; \
90 RRETURN(ra); \
91 }
92
93 /* Maximum number of ints of offset to save on the stack for recursive calls.
94 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95 because the offset vector is always a multiple of 3 long. */
96
97 #define REC_STACK_SAVE_MAX 30
98
99 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100
/* Lower and upper repeat bounds for the common single-character quantifiers.
Both tables have six entries and are indexed in parallel. An upper bound of
zero stands for "no limit".
NOTE(review): presumably indexed by (opcode - first repeat opcode) -- confirm
against the use sites. */

static const char rep_min[] = {
  0, 0,     /* entries 0,1: minimum 0 */
  1, 1,     /* entries 2,3: minimum 1 */
  0, 0      /* entries 4,5: minimum 0 */
};

static const char rep_max[] = {
  0, 0,     /* entries 0,1: unlimited */
  0, 0,     /* entries 2,3: unlimited */
  1, 1      /* entries 4,5: maximum 1 */
};
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Print a sequence of chars in printable format, stopping at the end of the
112 subject if requested.
113
114 Arguments:
115 p points to characters
116 length number to print
117 is_subject TRUE if printing from within md->start_subject
118 md pointer to matching data block, if is_subject is TRUE
119
120 Returns: nothing
121 */
122
123 static void
124 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int c;
127 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128 while (length-- > 0)
129 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 USPTR eptr_start = eptr;
159 register USPTR p = md->start_subject + md->offset_vector[offset];
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF8
185 #ifdef SUPPORT_UCP
186 if (md->utf8)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 USPTR endptr = p + length;
197 while (p < endptr)
198 {
199 int c, d;
200 if (eptr >= md->end_subject) return -1;
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (eptr + length > md->end_subject) return -1;
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (eptr + length > md->end_subject) return -1;
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return eptr - eptr_start;
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
271 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272 below must be updated in sync. */
273
/* Identifiers for the RMATCH call sites, numbered consecutively from 1 to 63.
The first name of each row is given its value explicitly so that the numbering
is easy to check by eye. */

enum {
  RM1  =  1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
  RM11 = 11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
  RM21 = 21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
  RM31 = 31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41 = 41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
  RM51 = 51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
  RM61 = 61, RM62, RM63
};
281
282 /* These versions of the macros use the stack, as normal. There are debugging
283 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 actually used in this definition. */
285
286 #ifndef NO_RECURSE
287 #define REGISTER register
288
/* Stack-based (truly recursive) forms of RMATCH and RRETURN. Both forms of
RMATCH call match() with rdepth+1 and leave the result in "rrc"; the current
"mstart" and "markptr" are passed through by name, so these macros can only be
used where those variables are in scope. The "rw" argument is ignored here. */
289 #ifdef PCRE_DEBUG
/* Debugging form: traces the source line before and after the call. */
290 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 { \
292 printf("match() called in line %d\n", __LINE__); \
293 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 printf("to line %d\n", __LINE__); \
295 }
/* Debugging form: prints the value and source line, then returns. Note that
"ra" appears twice in the expansion. */
296 #define RRETURN(ra) \
297 { \
298 printf("match() returned %d from line %d ", ra, __LINE__); \
299 return ra; \
300 }
301 #else
/* Production form: a plain recursive call and a plain return. */
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 #define RRETURN(ra) return ra
305 #endif
306
307 #else
308
309
310 /* These versions of the macros manage a private stack on the heap. Note that
311 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312 argument of match(), which never changes. */
313
314 #define REGISTER
315
/* Heap-based RMATCH: simulates a recursive call without using the C stack.
Steps, in order: obtain a new heapframe via pcre_stack_malloc (leaving with
PCRE_ERROR_NOMEMORY through RRETURN if that fails); record the resume
identifier "rw" in the CURRENT frame's Xwhere; fill in the new frame's
argument fields (mstart and markptr are copied from the current frame by
name); set its depth to the current depth plus one; link it to the current
frame; make it the current frame; and jump to HEAP_RECURSE. The label L_<rw>
is the point at which execution resumes afterwards. Everything that reads the
old frame must come before "frame = newframe", so the statement order
matters. The "rd" argument is not used. */
316 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 {\
318 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 frame->Xwhere = rw; \
321 newframe->Xeptr = ra;\
322 newframe->Xecode = rb;\
323 newframe->Xmstart = mstart;\
324 newframe->Xmarkptr = markptr;\
325 newframe->Xoffset_top = rc;\
326 newframe->Xeptrb = re;\
327 newframe->Xrdepth = frame->Xrdepth + 1;\
328 newframe->Xprevframe = frame;\
329 frame = newframe;\
330 DPRINTF(("restarting from line %d\n", __LINE__));\
331 goto HEAP_RECURSE;\
332 L_##rw:\
333 DPRINTF(("jumped back to line %d\n", __LINE__));\
334 }
335
/* Heap-based RRETURN: pops the current frame and frees it with
pcre_stack_free. If a previous frame exists, the value is placed in "rrc" and
control goes to HEAP_RETURN; otherwise the value is returned from the
enclosing function. The macro dereferences "frame" unconditionally, so it
must not be used while "frame" is NULL. */
336 #define RRETURN(ra)\
337 {\
338 heapframe *oldframe = frame;\
339 frame = oldframe->Xprevframe;\
340 (pcre_stack_free)(oldframe);\
341 if (frame != NULL)\
342 {\
343 rrc = ra;\
344 goto HEAP_RETURN;\
345 }\
346 return ra;\
347 }
348
349
350 /* Structure for remembering the local variables in a private frame */
351
352 typedef struct heapframe {
353 struct heapframe *Xprevframe;
354
355 /* Function arguments that may change */
356
357 USPTR Xeptr;
358 const uschar *Xecode;
359 USPTR Xmstart;
360 USPTR Xmarkptr;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth;
364
365 /* Function local variables */
366
367 USPTR Xcallpat;
368 #ifdef SUPPORT_UTF8
369 USPTR Xcharptr;
370 #endif
371 USPTR Xdata;
372 USPTR Xnext;
373 USPTR Xpp;
374 USPTR Xprev;
375 USPTR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word;
380 BOOL Xcondition;
381 BOOL Xprev_is_word;
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xoclength;
388 uschar Xocchars[8];
389 #endif
390
391 int Xcodelink;
392 int Xctype;
393 unsigned int Xfc;
394 int Xfi;
395 int Xlength;
396 int Xmax;
397 int Xmin;
398 int Xnumber;
399 int Xoffset;
400 int Xop;
401 int Xsave_capture_last;
402 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403 int Xstacksave[REC_STACK_SAVE_MAX];
404
405 eptrblock Xnewptrb;
406
407 /* Where to jump back to */
408
409 int Xwhere;
410
411 } heapframe;
412
413 #endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
429 /* These macros pack up tests that are used for partial matching, and which
430 appear several times in the code. We set the "hit end" flag if the pointer is
431 at the end of the subject and also past the start of the subject (i.e.
432 something has been matched). For hard partial matching, we then return
433 immediately. The second one is used when we already know we are past the end of
434 the subject. */
435
/* CHECK_PARTIAL(): when partial matching is enabled (md->partial != 0), eptr
has reached md->end_subject, and eptr is beyond md->start_used_ptr, set
md->hitend. If md->partial is greater than 1, also leave at once with
PCRE_ERROR_PARTIAL via MRRETURN. The expansion uses "md" and "eptr" by name
and is an unbraced "if" statement, so take care when using it as the body of
an outer if/else. */
436 #define CHECK_PARTIAL()\
437 if (md->partial != 0 && eptr >= md->end_subject && \
438 eptr > md->start_used_ptr) \
439 { \
440 md->hitend = TRUE; \
441 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 }
443
/* SCHECK_PARTIAL(): the same, but without the end-of-subject test, for use
where the caller already knows the end of the subject has been passed. */
444 #define SCHECK_PARTIAL()\
445 if (md->partial != 0 && eptr > md->start_used_ptr) \
446 { \
447 md->hitend = TRUE; \
448 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf8, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 markptr pointer to the most recent MARK name, or NULL
463 offset_top current top pointer
464 md pointer to "static" info for the match
465 eptrb pointer to chain of blocks containing eptr at start of
466 brackets - for testing for empty matches
467 rdepth the recursion depth
468
469 Returns: MATCH_MATCH if matched ) these values are >= 0
470 MATCH_NOMATCH if failed to match )
471 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 (e.g. stopped by repeated call or recursion limit)
474 */
475
476 static int
477 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 unsigned int rdepth)
480 {
481 /* These variables do not need to be preserved over recursion in this function,
482 so they can be ordinary variables in all cases. Mark some of them with
483 "register" because they are used a lot in loops. */
484
485 register int rrc; /* Returns from recursive calls */
486 register int i; /* Used for loops not involving calls to RMATCH() */
487 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489
490 BOOL minimize, possessive; /* Quantifier options */
491 BOOL caseless;
492 int condcode;
493
494 /* When recursion is not being used, all "local" variables that have to be
495 preserved over calls to RMATCH() are part of a "frame" which is obtained from
496 heap storage. Set up the top-level frame here; others are obtained from the
497 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498
499 #ifdef NO_RECURSE
500 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 frame->Xprevframe = NULL; /* Marks the top level */
503
504 /* Copy in the original argument variables */
505
506 frame->Xeptr = eptr;
507 frame->Xecode = ecode;
508 frame->Xmstart = mstart;
509 frame->Xmarkptr = markptr;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is the point that control jumps back to in order to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define markptr frame->Xmarkptr
524 #define offset_top frame->Xoffset_top
525 #define eptrb frame->Xeptrb
526 #define rdepth frame->Xrdepth
527
528 /* Ditto for the local variables */
529
530 #ifdef SUPPORT_UTF8
531 #define charptr frame->Xcharptr
532 #endif
533 #define callpat frame->Xcallpat
534 #define codelink frame->Xcodelink
535 #define data frame->Xdata
536 #define next frame->Xnext
537 #define pp frame->Xpp
538 #define prev frame->Xprev
539 #define saved_eptr frame->Xsaved_eptr
540
541 #define new_recursive frame->Xnew_recursive
542
543 #define cur_is_word frame->Xcur_is_word
544 #define condition frame->Xcondition
545 #define prev_is_word frame->Xprev_is_word
546
547 #ifdef SUPPORT_UCP
548 #define prop_type frame->Xprop_type
549 #define prop_value frame->Xprop_value
550 #define prop_fail_result frame->Xprop_fail_result
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580 /* Many of the following variables are used only in small blocks of the code.
581 My normal style of coding would have declared them within each of those blocks.
582 However, in order to accommodate the version of this code that uses an external
583 "stack" implemented on the heap, it is easier to declare them all here, so the
584 declarations can be cut out in a block. The only declarations within blocks
585 below are for variables that do not have to be preserved over a recursive call
586 to RMATCH(). */
587
588 #ifdef SUPPORT_UTF8
589 const uschar *charptr;
590 #endif
591 const uschar *callpat;
592 const uschar *data;
593 const uschar *next;
594 USPTR pp;
595 const uschar *prev;
596 USPTR saved_eptr;
597
598 recursion_info new_recursive;
599
600 BOOL cur_is_word;
601 BOOL condition;
602 BOOL prev_is_word;
603
604 #ifdef SUPPORT_UCP
605 int prop_type;
606 int prop_value;
607 int prop_fail_result;
608 int oclength;
609 uschar occhars[8];
610 #endif
611
612 int codelink;
613 int ctype;
614 int length;
615 int max;
616 int min;
617 int number;
618 int offset;
619 int op;
620 int save_capture_last;
621 int save_offset1, save_offset2, save_offset3;
622 int stacksave[REC_STACK_SAVE_MAX];
623
624 eptrblock newptrb;
625 #endif /* NO_RECURSE */
626
627 /* To save space on the stack and in the heap frame, I have doubled up on some
628 of the local variables that are used only in localised parts of the code, but
629 still need to be preserved over recursive calls of match(). These macros define
630 the alternative names that are used. */
631
632 #define allow_zero cur_is_word
633 #define cbegroup condition
634 #define code_offset codelink
635 #define condassert condition
636 #define matched_once prev_is_word
637
638 /* These statements are here to stop the compiler complaining about uninitialized
639 variables. */
640
641 #ifdef SUPPORT_UCP
642 prop_value = 0;
643 prop_fail_result = 0;
644 #endif
645
646
647 /* This label is used for tail recursion, which is used in a few cases even
648 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649 used. Thanks to Ian Taylor for noticing this possibility and sending the
650 original patch. */
651
652 TAIL_RECURSE:
653
654 /* OK, now we can get on with the real code of the function. Recursive calls
655 are specified by the macro RMATCH and RRETURN is used to return. When
656 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 defined). However, RMATCH isn't like a function call because it's quite a
659 complicated macro. It has to be used in one particular way. This shouldn't,
660 however, impact performance when true recursion is being used. */
661
662 #ifdef SUPPORT_UTF8
663 utf8 = md->utf8; /* Local copy of the flag */
664 #else
665 utf8 = FALSE;
666 #endif
667
668 /* First check that we haven't called match() too many times, or that we
669 haven't exceeded the recursive call limit. */
670
671 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673
674 /* At the start of a group with an unlimited repeat that may match an empty
675 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676 done this way to save having to use another function argument, which would take
677 up space on the stack. See also MATCH_CONDASSERT below.
678
679 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680 such remembered pointers, to be checked when we hit the closing ket, in order
681 to break infinite loops that match no characters. When match() is called in
682 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683 NOT be used with tail recursion, because the memory block that is used is on
684 the stack, so a new one may be required for each match(). */
685
686 if (md->match_function_type == MATCH_CBEGROUP)
687 {
688 newptrb.epb_saved_eptr = eptr;
689 newptrb.epb_prev = eptrb;
690 eptrb = &newptrb;
691 md->match_function_type = 0;
692 }
693
694 /* Now start processing the opcodes. */
695
696 for (;;)
697 {
698 minimize = possessive = FALSE;
699 op = *ecode;
700
701 switch(op)
702 {
703 case OP_MARK:
704 markptr = ecode + 2;
705 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 eptrb, RM55);
707
708 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709 argument, and we must check whether that argument matches this MARK's
710 argument. It is passed back in md->start_match_ptr (an overloading of that
711 variable). If it does match, we reset that variable to the current subject
712 position and return MATCH_SKIP. Otherwise, pass back the return code
713 unaltered. */
714
715 if (rrc == MATCH_SKIP_ARG &&
716 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717 {
718 md->start_match_ptr = eptr;
719 RRETURN(MATCH_SKIP);
720 }
721
722 if (md->mark == NULL) md->mark = markptr;
723 RRETURN(rrc);
724
725 case OP_FAIL:
726 MRRETURN(MATCH_NOMATCH);
727
728 /* COMMIT overrides PRUNE, SKIP, and THEN */
729
730 case OP_COMMIT:
731 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 eptrb, RM52);
733 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735 rrc != MATCH_THEN)
736 RRETURN(rrc);
737 MRRETURN(MATCH_COMMIT);
738
739 /* PRUNE overrides THEN */
740
741 case OP_PRUNE:
742 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 eptrb, RM51);
744 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 MRRETURN(MATCH_PRUNE);
746
747 case OP_PRUNE_ARG:
748 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 eptrb, RM56);
750 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 md->mark = ecode + 2;
752 RRETURN(MATCH_PRUNE);
753
754 /* SKIP overrides PRUNE and THEN */
755
756 case OP_SKIP:
757 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 eptrb, RM53);
759 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 RRETURN(rrc);
761 md->start_match_ptr = eptr; /* Pass back current position */
762 MRRETURN(MATCH_SKIP);
763
764 case OP_SKIP_ARG:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 eptrb, RM57);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769
770 /* Pass back the current skip name by overloading md->start_match_ptr and
771 returning the special MATCH_SKIP_ARG return code. This will either be
772 caught by a matching MARK, or get to the top, where it is treated the same
773 as PRUNE. */
774
775 md->start_match_ptr = ecode + 2;
776 RRETURN(MATCH_SKIP_ARG);
777
778 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
779 the alt that is at the start of the current branch. This makes it possible
780 to skip back past alternatives that precede the THEN within the current
781 branch. */
782
783 case OP_THEN:
784 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
785 eptrb, RM54);
786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
787 md->start_match_ptr = ecode - GET(ecode, 1);
788 MRRETURN(MATCH_THEN);
789
790 case OP_THEN_ARG:
791 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
792 offset_top, md, eptrb, RM58);
793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 md->start_match_ptr = ecode - GET(ecode, 1);
795 md->mark = ecode + LINK_SIZE + 2;
796 RRETURN(MATCH_THEN);
797
798 /* Handle a capturing bracket, other than those that are possessive with an
799 unlimited repeat. If there is space in the offset vector, save the current
800 subject position in the working slot at the top of the vector. We mustn't
801 change the current values of the data slot, because they may be set from a
802 previous iteration of this group, and be referred to by a reference inside
803 the group. A failure to match might occur after the group has succeeded,
804 if something later on doesn't match. For this reason, we need to restore
805 the working value and also the values of the final offsets, in case they
806 were set by a previous iteration of the same bracket.
807
808 If there isn't enough space in the offset vector, treat this as if it were
809 a non-capturing bracket. Don't worry about setting the flag for the error
810 case here; that is handled in the code for KET. */
811
812 case OP_CBRA:
813 case OP_SCBRA:
814 number = GET2(ecode, 1+LINK_SIZE);
815 offset = number << 1;
816
817 #ifdef PCRE_DEBUG
818 printf("start bracket %d\n", number);
819 printf("subject=");
820 pchars(eptr, 16, TRUE, md);
821 printf("\n");
822 #endif
823
824 if (offset < md->offset_max)
825 {
826 save_offset1 = md->offset_vector[offset];
827 save_offset2 = md->offset_vector[offset+1];
828 save_offset3 = md->offset_vector[md->offset_end - number];
829 save_capture_last = md->capture_last;
830
831 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
832 md->offset_vector[md->offset_end - number] =
833 (int)(eptr - md->start_subject);
834
835 for (;;)
836 {
837 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
838 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
839 eptrb, RM1);
840 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
841 if (rrc != MATCH_NOMATCH &&
842 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
843 RRETURN(rrc);
844 md->capture_last = save_capture_last;
845 ecode += GET(ecode, 1);
846 if (*ecode != OP_ALT) break;
847 }
848
849 DPRINTF(("bracket %d failed\n", number));
850 md->offset_vector[offset] = save_offset1;
851 md->offset_vector[offset+1] = save_offset2;
852 md->offset_vector[md->offset_end - number] = save_offset3;
853
854 /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or
855 MATCH_THEN. */
856
857 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
858 RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));
859 }
860
861 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
862 as a non-capturing bracket. */
863
864 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
865 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
866
867 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
868
869 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
871
872 /* Non-capturing or atomic group, except for possessive with unlimited
873 repeat. Loop for all the alternatives. When we get to the final alternative
874 within the brackets, we used to return the result of a recursive call to
875 match() whatever happened so it was possible to reduce stack usage by
876 turning this into a tail recursion, except in the case of a possibly empty
877 group. However, now that there is the possibility of (*THEN) occurring in
878 the final alternative, this optimization is no longer possible.
879
880 MATCH_ONCE is returned when the end of an atomic group is successfully
881 reached, but subsequent matching fails. It passes back up the tree (causing
882 captured values to be reset) until the original atomic group level is
883 reached. This is tested by comparing md->once_target with the start of the
884 group. At this point, the return is converted into MATCH_NOMATCH so that
885 previous backup points can be taken. */
886
887 case OP_ONCE:
888 case OP_BRA:
889 case OP_SBRA:
890 DPRINTF(("start non-capturing bracket\n"));
891
892 for (;;)
893 {
894 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
895 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
896 RM2);
897 if (rrc != MATCH_NOMATCH &&
898 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
899 {
900 if (rrc == MATCH_ONCE)
901 {
902 const uschar *scode = ecode;
903 if (*scode != OP_ONCE) /* If not at start, find it */
904 {
905 while (*scode == OP_ALT) scode += GET(scode, 1);
906 scode -= GET(scode, 1);
907 }
908 if (md->once_target == scode) rrc = MATCH_NOMATCH;
909 }
910 RRETURN(rrc);
911 }
912 ecode += GET(ecode, 1);
913 if (*ecode != OP_ALT) break;
914 }
915 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
916 RRETURN(MATCH_NOMATCH);
917
918 /* Handle possessive capturing brackets with an unlimited repeat. We come
919 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
920 handled similarly to the normal case above. However, the matching is
921 different. The end of these brackets will always be OP_KETRPOS, which
922 returns MATCH_KETRPOS without going further in the pattern. By this means
923 we can handle the group by iteration rather than recursion, thereby
924 reducing the amount of stack needed. */
925
926 case OP_CBRAPOS:
927 case OP_SCBRAPOS:
928 allow_zero = FALSE;
929
930 POSSESSIVE_CAPTURE:
931 number = GET2(ecode, 1+LINK_SIZE);
932 offset = number << 1;
933
934 #ifdef PCRE_DEBUG
935 printf("start possessive bracket %d\n", number);
936 printf("subject=");
937 pchars(eptr, 16, TRUE, md);
938 printf("\n");
939 #endif
940
941 if (offset < md->offset_max)
942 {
943 matched_once = FALSE;
944 code_offset = ecode - md->start_code;
945
946 save_offset1 = md->offset_vector[offset];
947 save_offset2 = md->offset_vector[offset+1];
948 save_offset3 = md->offset_vector[md->offset_end - number];
949 save_capture_last = md->capture_last;
950
951 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
952
953 /* Each time round the loop, save the current subject position for use
954 when the group matches. For MATCH_MATCH, the group has matched, so we
955 restart it with a new subject starting position, remembering that we had
956 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
957 usual. If we haven't matched any alternatives in any iteration, check to
958 see if a previous iteration matched. If so, the group has matched;
959 continue from afterwards. Otherwise it has failed; restore the previous
960 capture values before returning NOMATCH. */
961
962 for (;;)
963 {
964 md->offset_vector[md->offset_end - number] =
965 (int)(eptr - md->start_subject);
966 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
967 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
968 eptrb, RM63);
969 if (rrc == MATCH_KETRPOS)
970 {
971 offset_top = md->end_offset_top;
972 eptr = md->end_match_ptr;
973 ecode = md->start_code + code_offset;
974 save_capture_last = md->capture_last;
975 matched_once = TRUE;
976 continue;
977 }
978 if (rrc != MATCH_NOMATCH &&
979 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
980 RRETURN(rrc);
981 md->capture_last = save_capture_last;
982 ecode += GET(ecode, 1);
983 if (*ecode != OP_ALT) break;
984 }
985
986 if (!matched_once)
987 {
988 md->offset_vector[offset] = save_offset1;
989 md->offset_vector[offset+1] = save_offset2;
990 md->offset_vector[md->offset_end - number] = save_offset3;
991 }
992
993 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
994 if (allow_zero || matched_once)
995 {
996 ecode += 1 + LINK_SIZE;
997 break;
998 }
999
1000 RRETURN(MATCH_NOMATCH);
1001 }
1002
1003 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1004 as a non-capturing bracket. */
1005
1006 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1007 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1008
1009 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1010
1011 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1012 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1013
1014 /* Non-capturing possessive bracket with unlimited repeat. We come here
1015 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1016 without the capturing complication. It is written out separately for speed
1017 and cleanliness. */
1018
1019 case OP_BRAPOS:
1020 case OP_SBRAPOS:
1021 allow_zero = FALSE;
1022
1023 POSSESSIVE_NON_CAPTURE:
1024 matched_once = FALSE;
1025 code_offset = ecode - md->start_code;
1026
1027 for (;;)
1028 {
1029 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1030 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1031 eptrb, RM48);
1032 if (rrc == MATCH_KETRPOS)
1033 {
1034 offset_top = md->end_offset_top;
1035 eptr = md->end_match_ptr;
1036 ecode = md->start_code + code_offset;
1037 matched_once = TRUE;
1038 continue;
1039 }
1040 if (rrc != MATCH_NOMATCH &&
1041 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1042 RRETURN(rrc);
1043 ecode += GET(ecode, 1);
1044 if (*ecode != OP_ALT) break;
1045 }
1046
1047 if (matched_once || allow_zero)
1048 {
1049 ecode += 1 + LINK_SIZE;
1050 break;
1051 }
1052 RRETURN(MATCH_NOMATCH);
1053
1054 /* Control never reaches here. */
1055
1056 /* Conditional group: compilation checked that there are no more than
1057 two branches. If the condition is false, skipping the first branch takes us
1058 past the end if there is only one branch, but that's OK because that is
1059 exactly what going to the ket would do. */
1060
1061 case OP_COND:
1062 case OP_SCOND:
1063 codelink = GET(ecode, 1);
1064
1065 /* Because of the way auto-callout works during compile, a callout item is
1066 inserted between OP_COND and an assertion condition. */
1067
1068 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1069 {
1070 if (pcre_callout != NULL)
1071 {
1072 pcre_callout_block cb;
1073 cb.version = 2; /* Version 2 of the callout block */
1074 cb.callout_number = ecode[LINK_SIZE+2];
1075 cb.offset_vector = md->offset_vector;
1076 cb.subject = (PCRE_SPTR)md->start_subject;
1077 cb.subject_length = (int)(md->end_subject - md->start_subject);
1078 cb.start_match = (int)(mstart - md->start_subject);
1079 cb.current_position = (int)(eptr - md->start_subject);
1080 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1081 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1082 cb.capture_top = offset_top/2;
1083 cb.capture_last = md->capture_last;
1084 cb.callout_data = md->callout_data;
1085 cb.mark = markptr;
1086 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1087 if (rrc < 0) RRETURN(rrc);
1088 }
1089 ecode += _pcre_OP_lengths[OP_CALLOUT];
1090 }
1091
1092 condcode = ecode[LINK_SIZE+1];
1093
1094 /* Now see what the actual condition is */
1095
1096 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1097 {
1098 if (md->recursive == NULL) /* Not recursing => FALSE */
1099 {
1100 condition = FALSE;
1101 ecode += GET(ecode, 1);
1102 }
1103 else
1104 {
1105 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1106 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1107
1108 /* If the test is for recursion into a specific subpattern, and it is
1109 false, but the test was set up by name, scan the table to see if the
1110 name refers to any other numbers, and test them. The condition is true
1111 if any one is set. */
1112
1113 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1114 {
1115 uschar *slotA = md->name_table;
1116 for (i = 0; i < md->name_count; i++)
1117 {
1118 if (GET2(slotA, 0) == recno) break;
1119 slotA += md->name_entry_size;
1120 }
1121
1122 /* Found a name for the number - there can be only one; duplicate
1123 names for different numbers are allowed, but not vice versa. First
1124 scan down for duplicates. */
1125
1126 if (i < md->name_count)
1127 {
1128 uschar *slotB = slotA;
1129 while (slotB > md->name_table)
1130 {
1131 slotB -= md->name_entry_size;
1132 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1133 {
1134 condition = GET2(slotB, 0) == md->recursive->group_num;
1135 if (condition) break;
1136 }
1137 else break;
1138 }
1139
1140 /* Scan up for duplicates */
1141
1142 if (!condition)
1143 {
1144 slotB = slotA;
1145 for (i++; i < md->name_count; i++)
1146 {
1147 slotB += md->name_entry_size;
1148 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1149 {
1150 condition = GET2(slotB, 0) == md->recursive->group_num;
1151 if (condition) break;
1152 }
1153 else break;
1154 }
1155 }
1156 }
1157 }
1158
1159 /* Choose branch according to the condition */
1160
1161 ecode += condition? 3 : GET(ecode, 1);
1162 }
1163 }
1164
1165 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1166 {
1167 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1168 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1169
1170 /* If the numbered capture is unset, but the reference was by name,
1171 scan the table to see if the name refers to any other numbers, and test
1172 them. The condition is true if any one is set. This is tediously similar
1173 to the code above, but not close enough to try to amalgamate. */
1174
1175 if (!condition && condcode == OP_NCREF)
1176 {
1177 int refno = offset >> 1;
1178 uschar *slotA = md->name_table;
1179
1180 for (i = 0; i < md->name_count; i++)
1181 {
1182 if (GET2(slotA, 0) == refno) break;
1183 slotA += md->name_entry_size;
1184 }
1185
1186 /* Found a name for the number - there can be only one; duplicate names
1187 for different numbers are allowed, but not vice versa. First scan down
1188 for duplicates. */
1189
1190 if (i < md->name_count)
1191 {
1192 uschar *slotB = slotA;
1193 while (slotB > md->name_table)
1194 {
1195 slotB -= md->name_entry_size;
1196 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1197 {
1198 offset = GET2(slotB, 0) << 1;
1199 condition = offset < offset_top &&
1200 md->offset_vector[offset] >= 0;
1201 if (condition) break;
1202 }
1203 else break;
1204 }
1205
1206 /* Scan up for duplicates */
1207
1208 if (!condition)
1209 {
1210 slotB = slotA;
1211 for (i++; i < md->name_count; i++)
1212 {
1213 slotB += md->name_entry_size;
1214 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1215 {
1216 offset = GET2(slotB, 0) << 1;
1217 condition = offset < offset_top &&
1218 md->offset_vector[offset] >= 0;
1219 if (condition) break;
1220 }
1221 else break;
1222 }
1223 }
1224 }
1225 }
1226
1227 /* Choose branch according to the condition */
1228
1229 ecode += condition? 3 : GET(ecode, 1);
1230 }
1231
1232 else if (condcode == OP_DEF) /* DEFINE - always false */
1233 {
1234 condition = FALSE;
1235 ecode += GET(ecode, 1);
1236 }
1237
1238 /* The condition is an assertion. Call match() to evaluate it - setting
1239 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1240 an assertion. */
1241
1242 else
1243 {
1244 md->match_function_type = MATCH_CONDASSERT;
1245 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1246 if (rrc == MATCH_MATCH)
1247 {
1248 if (md->end_offset_top > offset_top)
1249 offset_top = md->end_offset_top; /* Captures may have happened */
1250 condition = TRUE;
1251 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1252 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1253 }
1254 else if (rrc != MATCH_NOMATCH &&
1255 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1256 {
1257 RRETURN(rrc); /* Need braces because of following else */
1258 }
1259 else
1260 {
1261 condition = FALSE;
1262 ecode += codelink;
1263 }
1264 }
1265
1266 /* We are now at the branch that is to be obeyed. As there is only one,
1267 we used to use tail recursion to avoid using another stack frame, except
1268 when there was unlimited repeat of a possibly empty group. However, that
1269 strategy no longer works because of the possibility of (*THEN) being
1270 encountered in the branch. A recursive call to match() is always required,
1271 unless the second alternative doesn't exist, in which case we can just
1272 plough on. */
1273
1274 if (condition || *ecode == OP_ALT)
1275 {
1276 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1277 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1278 if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1279 rrc = MATCH_NOMATCH;
1280 RRETURN(rrc);
1281 }
1282 else /* Condition false & no alternative */
1283 {
1284 ecode += 1 + LINK_SIZE;
1285 }
1286 break;
1287
1288
1289 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1290 to close any currently open capturing brackets. */
1291
1292 case OP_CLOSE:
1293 number = GET2(ecode, 1);
1294 offset = number << 1;
1295
1296 #ifdef PCRE_DEBUG
1297 printf("end bracket %d at *ACCEPT", number);
1298 printf("\n");
1299 #endif
1300
1301 md->capture_last = number;
1302 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1303 {
1304 md->offset_vector[offset] =
1305 md->offset_vector[md->offset_end - number];
1306 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1307 if (offset_top <= offset) offset_top = offset + 2;
1308 }
1309 ecode += 3;
1310 break;
1311
1312
1313 /* End of the pattern, either real or forced. */
1314
1315 case OP_END:
1316 case OP_ACCEPT:
1317 case OP_ASSERT_ACCEPT:
1318
1319 /* If we have matched an empty string, fail if not in an assertion and not
1320 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1321 is set and we have matched at the start of the subject. In both cases,
1322 backtracking will then try other alternatives, if any. */
1323
1324 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1325 md->recursive == NULL &&
1326 (md->notempty ||
1327 (md->notempty_atstart &&
1328 mstart == md->start_subject + md->start_offset)))
1329 MRRETURN(MATCH_NOMATCH);
1330
1331 /* Otherwise, we have a match. */
1332
1333 md->end_match_ptr = eptr; /* Record where we ended */
1334 md->end_offset_top = offset_top; /* and how many extracts were taken */
1335 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1336
1337 /* For some reason, the macros don't work properly if an expression is
1338 given as the argument to MRRETURN when the heap is in use. */
1339
1340 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1341 MRRETURN(rrc);
1342
1343 /* Assertion brackets. Check the alternative branches in turn - the
1344 matching won't pass the KET for an assertion. If any one branch matches,
1345 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1346 start of each branch to move the current point backwards, so the code at
1347 this level is identical to the lookahead case. When the assertion is part
1348 of a condition, we want to return immediately afterwards. The caller of
1349 this incarnation of the match() function will have set MATCH_CONDASSERT in
1350 md->match_function_type, and one of these opcodes will be the first opcode
1351 that is processed. We use a local variable that is preserved over calls to
1352 match() to remember this case. */
1353
1354 case OP_ASSERT:
1355 case OP_ASSERTBACK:
1356 if (md->match_function_type == MATCH_CONDASSERT)
1357 {
1358 condassert = TRUE;
1359 md->match_function_type = 0;
1360 }
1361 else condassert = FALSE;
1362
1363 do
1364 {
1365 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1366 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1367 {
1368 mstart = md->start_match_ptr; /* In case \K reset it */
1369 markptr = md->mark;
1370 break;
1371 }
1372 if (rrc != MATCH_NOMATCH &&
1373 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1374 RRETURN(rrc);
1375 ecode += GET(ecode, 1);
1376 }
1377 while (*ecode == OP_ALT);
1378
1379 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1380
1381 /* If checking an assertion for a condition, return MATCH_MATCH. */
1382
1383 if (condassert) RRETURN(MATCH_MATCH);
1384
1385 /* Continue from after the assertion, updating the offsets high water
1386 mark, since extracts may have been taken during the assertion. */
1387
1388 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1389 ecode += 1 + LINK_SIZE;
1390 offset_top = md->end_offset_top;
1391 continue;
1392
1393 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1394 PRUNE, or COMMIT means we must assume failure without checking subsequent
1395 branches. */
1396
1397 case OP_ASSERT_NOT:
1398 case OP_ASSERTBACK_NOT:
1399 if (md->match_function_type == MATCH_CONDASSERT)
1400 {
1401 condassert = TRUE;
1402 md->match_function_type = 0;
1403 }
1404 else condassert = FALSE;
1405
1406 do
1407 {
1408 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1409 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1410 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1411 {
1412 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1413 break;
1414 }
1415 if (rrc != MATCH_NOMATCH &&
1416 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1417 RRETURN(rrc);
1418 ecode += GET(ecode,1);
1419 }
1420 while (*ecode == OP_ALT);
1421
1422 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1423
1424 ecode += 1 + LINK_SIZE;
1425 continue;
1426
1427 /* Move the subject pointer back. This occurs only at the start of
1428 each branch of a lookbehind assertion. If we are too close to the start to
1429 move back, this match function fails. When working with UTF-8 we move
1430 back a number of characters, not bytes. */
1431
1432 case OP_REVERSE:
1433 #ifdef SUPPORT_UTF8
1434 if (utf8)
1435 {
1436 i = GET(ecode, 1);
1437 while (i-- > 0)
1438 {
1439 eptr--;
1440 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1441 BACKCHAR(eptr);
1442 }
1443 }
1444 else
1445 #endif
1446
1447 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1448
1449 {
1450 eptr -= GET(ecode, 1);
1451 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1452 }
1453
1454 /* Save the earliest consulted character, then skip to next op code */
1455
1456 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1457 ecode += 1 + LINK_SIZE;
1458 break;
1459
1460 /* The callout item calls an external function, if one is provided, passing
1461 details of the match so far. This is mainly for debugging, though the
1462 function is able to force a failure. */
1463
1464 case OP_CALLOUT:
1465 if (pcre_callout != NULL)
1466 {
1467 pcre_callout_block cb;
1468 cb.version = 2; /* Version 2 of the callout block */
1469 cb.callout_number = ecode[1];
1470 cb.offset_vector = md->offset_vector;
1471 cb.subject = (PCRE_SPTR)md->start_subject;
1472 cb.subject_length = (int)(md->end_subject - md->start_subject);
1473 cb.start_match = (int)(mstart - md->start_subject);
1474 cb.current_position = (int)(eptr - md->start_subject);
1475 cb.pattern_position = GET(ecode, 2);
1476 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1477 cb.capture_top = offset_top/2;
1478 cb.capture_last = md->capture_last;
1479 cb.callout_data = md->callout_data;
1480 cb.mark = markptr;
1481 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1482 if (rrc < 0) RRETURN(rrc);
1483 }
1484 ecode += 2 + 2*LINK_SIZE;
1485 break;
1486
1487 /* Recursion either matches the current regex, or some subexpression. The
1488 offset data is the offset to the starting bracket from the start of the
1489 whole pattern. (This is so that it works from duplicated subpatterns.)
1490
1491 The state of the capturing groups is preserved over recursion, and
1492 re-instated afterwards. We don't know how many are started and not yet
1493 finished (offset_top records the completed total) so we just have to save
1494 all the potential data. There may be up to 65535 such values, which is too
1495 large to put on the stack, but using malloc for small numbers seems
1496 expensive. As a compromise, the stack is used when there are no more than
1497 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1498
1499 There are also other values that have to be saved. We use a chained
1500 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1501 for the original version of this logic. It has, however, been hacked around
1502 a lot, so he is not to blame for the current way it works. */
1503
1504 case OP_RECURSE:
1505 {
1506 recursion_info *ri;
1507 int recno;
1508
1509 callpat = md->start_code + GET(ecode, 1);
1510 recno = (callpat == md->start_code)? 0 :
1511 GET2(callpat, 1 + LINK_SIZE);
1512
1513 /* Check for repeating a recursion without advancing the subject pointer.
1514 This should catch convoluted mutual recursions. (Some simple cases are
1515 caught at compile time.) */
1516
1517 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1518 if (recno == ri->group_num && eptr == ri->subject_position)
1519 RRETURN(PCRE_ERROR_RECURSELOOP);
1520
1521 /* Add to "recursing stack" */
1522
1523 new_recursive.group_num = recno;
1524 new_recursive.subject_position = eptr;
1525 new_recursive.prevrec = md->recursive;
1526 md->recursive = &new_recursive;
1527
1528 /* Where to continue from afterwards */
1529
1530 ecode += 1 + LINK_SIZE;
1531
1532 /* Now save the offset data */
1533
1534 new_recursive.saved_max = md->offset_end;
1535 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1536 new_recursive.offset_save = stacksave;
1537 else
1538 {
1539 new_recursive.offset_save =
1540 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1541 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1542 }
1543 memcpy(new_recursive.offset_save, md->offset_vector,
1544 new_recursive.saved_max * sizeof(int));
1545
1546 /* OK, now we can do the recursion. After processing each alternative,
1547 restore the offset data. If there were nested recursions, md->recursive
1548 might be changed, so reset it before looping. */
1549
1550 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1551 cbegroup = (*callpat >= OP_SBRA);
1552 do
1553 {
1554 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1555 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1556 md, eptrb, RM6);
1557 memcpy(md->offset_vector, new_recursive.offset_save,
1558 new_recursive.saved_max * sizeof(int));
1559 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1560 {
1561 DPRINTF(("Recursion matched\n"));
1562 md->recursive = new_recursive.prevrec;
1563 if (new_recursive.offset_save != stacksave)
1564 (pcre_free)(new_recursive.offset_save);
1565
1566 /* Set where we got to in the subject, and reset the start in case
1567 it was changed by \K. This *is* propagated back out of a recursion,
1568 for Perl compatibility. */
1569
1570 eptr = md->end_match_ptr;
1571 mstart = md->start_match_ptr;
1572 goto RECURSION_MATCHED; /* Exit loop; end processing */
1573 }
1574 else if (rrc != MATCH_NOMATCH &&
1575 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1576 {
1577 DPRINTF(("Recursion gave error %d\n", rrc));
1578 if (new_recursive.offset_save != stacksave)
1579 (pcre_free)(new_recursive.offset_save);
1580 RRETURN(rrc);
1581 }
1582
1583 md->recursive = &new_recursive;
1584 callpat += GET(callpat, 1);
1585 }
1586 while (*callpat == OP_ALT);
1587
1588 DPRINTF(("Recursion didn't match\n"));
1589 md->recursive = new_recursive.prevrec;
1590 if (new_recursive.offset_save != stacksave)
1591 (pcre_free)(new_recursive.offset_save);
1592 MRRETURN(MATCH_NOMATCH);
1593 }
1594
1595 RECURSION_MATCHED:
1596 break;
1597
1598 /* An alternation is the end of a branch; scan along to find the end of the
1599 bracketed group and go to there. */
1600
1601 case OP_ALT:
1602 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1603 break;
1604
1605 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1606 indicating that it may occur zero times. It may repeat infinitely, or not
1607 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1608 with fixed upper repeat limits are compiled as a number of copies, with the
1609 optional ones preceded by BRAZERO or BRAMINZERO. */
1610
1611 case OP_BRAZERO:
1612 next = ecode + 1;
1613 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1614 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1615 do next += GET(next, 1); while (*next == OP_ALT);
1616 ecode = next + 1 + LINK_SIZE;
1617 break;
1618
1619 case OP_BRAMINZERO:
1620 next = ecode + 1;
1621 do next += GET(next, 1); while (*next == OP_ALT);
1622 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1623 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1624 ecode++;
1625 break;
1626
1627 case OP_SKIPZERO:
1628 next = ecode+1;
1629 do next += GET(next,1); while (*next == OP_ALT);
1630 ecode = next + 1 + LINK_SIZE;
1631 break;
1632
1633 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1634 here; just jump to the group, with allow_zero set TRUE. */
1635
1636 case OP_BRAPOSZERO:
1637 op = *(++ecode);
1638 allow_zero = TRUE;
1639 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1640 goto POSSESSIVE_NON_CAPTURE;
1641
1642 /* End of a group, repeated or non-repeating. */
1643
1644 case OP_KET:
1645 case OP_KETRMIN:
1646 case OP_KETRMAX:
1647 case OP_KETRPOS:
1648 prev = ecode - GET(ecode, 1);
1649
1650 /* If this was a group that remembered the subject start, in order to break
1651 infinite repeats of empty string matches, retrieve the subject start from
1652 the chain. Otherwise, set it NULL. */
1653
1654 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1655 {
1656 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1657 eptrb = eptrb->epb_prev; /* Backup to previous group */
1658 }
1659 else saved_eptr = NULL;
1660
1661 /* If we are at the end of an assertion group, stop matching and return
1662 MATCH_MATCH, but record the current high water mark for use by positive
1663 assertions. We also need to record the match start in case it was changed
1664 by \K. */
1665
1666 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1667 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
1668 {
1669 md->end_match_ptr = eptr; /* For ONCE */
1670 md->end_offset_top = offset_top;
1671 md->start_match_ptr = mstart;
1672 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1673 }
1674
1675 /* For capturing groups we have to check the group number back at the start
1676 and if necessary complete handling an extraction by setting the offsets and
1677 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1678 into group 0, so it won't be picked up here. Instead, we catch it when the
1679 OP_END is reached. Other recursion is handled here. We just have to record
1680 the current subject position and start match pointer and give a MATCH
1681 return. */
1682
1683 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1684 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1685 {
1686 number = GET2(prev, 1+LINK_SIZE);
1687 offset = number << 1;
1688
1689 #ifdef PCRE_DEBUG
1690 printf("end bracket %d", number);
1691 printf("\n");
1692 #endif
1693
1694 /* Handle a recursively called group. */
1695
1696 if (md->recursive != NULL && md->recursive->group_num == number)
1697 {
1698 md->end_match_ptr = eptr;
1699 md->start_match_ptr = mstart;
1700 RRETURN(MATCH_MATCH);
1701 }
1702
1703 /* Deal with capturing */
1704
1705 md->capture_last = number;
1706 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1707 {
1708 /* If offset is greater than offset_top, it means that we are
1709 "skipping" a capturing group, and that group's offsets must be marked
1710 unset. In earlier versions of PCRE, all the offsets were unset at the
1711 start of matching, but this doesn't work because atomic groups and
1712 assertions can cause a value to be set that should later be unset.
1713 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1714 part of the atomic group, but this is not on the final matching path,
1715 so must be unset when 2 is set. (If there is no group 2, there is no
1716 problem, because offset_top will then be 2, indicating no capture.) */
1717
1718 if (offset > offset_top)
1719 {
1720 register int *iptr = md->offset_vector + offset_top;
1721 register int *iend = md->offset_vector + offset;
1722 while (iptr < iend) *iptr++ = -1;
1723 }
1724
1725 /* Now make the extraction */
1726
1727 md->offset_vector[offset] =
1728 md->offset_vector[md->offset_end - number];
1729 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1730 if (offset_top <= offset) offset_top = offset + 2;
1731 }
1732 }
1733
1734 /* For an ordinary non-repeating ket, just continue at this level. This
1735 also happens for a repeating ket if no characters were matched in the
1736 group. This is the forcible breaking of infinite loops as implemented in
1737 Perl 5.005. For a non-repeating atomic group, establish a backup point by
1738 processing the rest of the pattern at a lower level. If this results in a
1739 NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1740 bypassing intermediate backup points, but resetting any captures that
1741 happened along the way. */
1742
1743 if (*ecode == OP_KET || eptr == saved_eptr)
1744 {
1745 if (*prev == OP_ONCE)
1746 {
1747 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1748 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1749 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1750 RRETURN(MATCH_ONCE);
1751 }
1752 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1753 break;
1754 }
1755
1756 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1757 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1758 at a time from the outer level, thus saving stack. */
1759
1760 if (*ecode == OP_KETRPOS)
1761 {
1762 md->end_match_ptr = eptr;
1763 md->end_offset_top = offset_top;
1764 RRETURN(MATCH_KETRPOS);
1765 }
1766
1767 /* The normal repeating kets try the rest of the pattern or restart from
1768 the preceding bracket, in the appropriate order. In the second case, we can
1769 use tail recursion to avoid using another stack frame, unless we have
1770 an atomic group or an unlimited repeat of a group that can match an empty
1771 string. */
1772
1773 if (*ecode == OP_KETRMIN)
1774 {
1775 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1776 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1777 if (*prev == OP_ONCE)
1778 {
1779 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1780 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1781 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1782 RRETURN(MATCH_ONCE);
1783 }
1784 if (*prev >= OP_SBRA) /* Could match an empty string */
1785 {
1786 md->match_function_type = MATCH_CBEGROUP;
1787 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1788 RRETURN(rrc);
1789 }
1790 ecode = prev;
1791 goto TAIL_RECURSE;
1792 }
1793 else /* OP_KETRMAX */
1794 {
1795 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1796 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1797 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1798 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1799 if (*prev == OP_ONCE)
1800 {
1801 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1802 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1803 md->once_target = prev;
1804 RRETURN(MATCH_ONCE);
1805 }
1806 ecode += 1 + LINK_SIZE;
1807 goto TAIL_RECURSE;
1808 }
1809 /* Control never gets here */
1810
1811 /* Not multiline mode: start of subject assertion, unless notbol. */
1812
1813 case OP_CIRC:
1814 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1815
1816 /* Start of subject assertion */
1817
1818 case OP_SOD:
1819 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1820 ecode++;
1821 break;
1822
1823 /* Multiline mode: start of subject unless notbol, or after any newline. */
1824
1825 case OP_CIRCM:
1826 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1827 if (eptr != md->start_subject &&
1828 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1829 MRRETURN(MATCH_NOMATCH);
1830 ecode++;
1831 break;
1832
1833 /* Start of match assertion */
1834
1835 case OP_SOM:
1836 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1837 ecode++;
1838 break;
1839
1840 /* Reset the start of match point */
1841
1842 case OP_SET_SOM:
1843 mstart = eptr;
1844 ecode++;
1845 break;
1846
1847 /* Multiline mode: assert before any newline, or before end of subject
1848 unless noteol is set. */
1849
1850 case OP_DOLLM:
1851 if (eptr < md->end_subject)
1852 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1853 else
1854 {
1855 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1856 SCHECK_PARTIAL();
1857 }
1858 ecode++;
1859 break;
1860
1861 /* Not multiline mode: assert before a terminating newline or before end of
1862 subject unless noteol is set. */
1863
1864 case OP_DOLL:
1865 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1866 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1867
1868 /* ... else fall through for endonly */
1869
1870 /* End of subject assertion (\z) */
1871
1872 case OP_EOD:
1873 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1874 SCHECK_PARTIAL();
1875 ecode++;
1876 break;
1877
1878 /* End of subject or ending \n assertion (\Z) */
1879
1880 case OP_EODN:
1881 ASSERT_NL_OR_EOS:
1882 if (eptr < md->end_subject &&
1883 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1884 MRRETURN(MATCH_NOMATCH);
1885
1886 /* Either at end of string or \n before end. */
1887
1888 SCHECK_PARTIAL();
1889 ecode++;
1890 break;
1891
1892 /* Word boundary assertions */
1893
1894 case OP_NOT_WORD_BOUNDARY:
1895 case OP_WORD_BOUNDARY:
1896 {
1897
1898 /* Find out if the previous and current characters are "word" characters.
1899 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1900 be "non-word" characters. Remember the earliest consulted character for
1901 partial matching. */
1902
1903 #ifdef SUPPORT_UTF8
1904 if (utf8)
1905 {
1906 /* Get status of previous character */
1907
1908 if (eptr == md->start_subject) prev_is_word = FALSE; else
1909 {
1910 USPTR lastptr = eptr - 1;
1911 while((*lastptr & 0xc0) == 0x80) lastptr--;
1912 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1913 GETCHAR(c, lastptr);
1914 #ifdef SUPPORT_UCP
1915 if (md->use_ucp)
1916 {
1917 if (c == '_') prev_is_word = TRUE; else
1918 {
1919 int cat = UCD_CATEGORY(c);
1920 prev_is_word = (cat == ucp_L || cat == ucp_N);
1921 }
1922 }
1923 else
1924 #endif
1925 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1926 }
1927
1928 /* Get status of next character */
1929
1930 if (eptr >= md->end_subject)
1931 {
1932 SCHECK_PARTIAL();
1933 cur_is_word = FALSE;
1934 }
1935 else
1936 {
1937 GETCHAR(c, eptr);
1938 #ifdef SUPPORT_UCP
1939 if (md->use_ucp)
1940 {
1941 if (c == '_') cur_is_word = TRUE; else
1942 {
1943 int cat = UCD_CATEGORY(c);
1944 cur_is_word = (cat == ucp_L || cat == ucp_N);
1945 }
1946 }
1947 else
1948 #endif
1949 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1950 }
1951 }
1952 else
1953 #endif
1954
1955 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1956 consistency with the behaviour of \w we do use it in this case. */
1957
1958 {
1959 /* Get status of previous character */
1960
1961 if (eptr == md->start_subject) prev_is_word = FALSE; else
1962 {
1963 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1964 #ifdef SUPPORT_UCP
1965 if (md->use_ucp)
1966 {
1967 c = eptr[-1];
1968 if (c == '_') prev_is_word = TRUE; else
1969 {
1970 int cat = UCD_CATEGORY(c);
1971 prev_is_word = (cat == ucp_L || cat == ucp_N);
1972 }
1973 }
1974 else
1975 #endif
1976 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1977 }
1978
1979 /* Get status of next character */
1980
1981 if (eptr >= md->end_subject)
1982 {
1983 SCHECK_PARTIAL();
1984 cur_is_word = FALSE;
1985 }
1986 else
1987 #ifdef SUPPORT_UCP
1988 if (md->use_ucp)
1989 {
1990 c = *eptr;
1991 if (c == '_') cur_is_word = TRUE; else
1992 {
1993 int cat = UCD_CATEGORY(c);
1994 cur_is_word = (cat == ucp_L || cat == ucp_N);
1995 }
1996 }
1997 else
1998 #endif
1999 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2000 }
2001
2002 /* Now see if the situation is what we want */
2003
2004 if ((*ecode++ == OP_WORD_BOUNDARY)?
2005 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2006 MRRETURN(MATCH_NOMATCH);
2007 }
2008 break;
2009
2010 /* Match a single character type; inline for speed */
2011
2012 case OP_ANY:
2013 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2014 /* Fall through */
2015
2016 case OP_ALLANY:
2017 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2018 { /* not be updated before SCHECK_PARTIAL. */
2019 SCHECK_PARTIAL();
2020 MRRETURN(MATCH_NOMATCH);
2021 }
2022 eptr++;
2023 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2024 ecode++;
2025 break;
2026
2027 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2028 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2029
2030 case OP_ANYBYTE:
2031 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2032 { /* not be updated before SCHECK_PARTIAL. */
2033 SCHECK_PARTIAL();
2034 MRRETURN(MATCH_NOMATCH);
2035 }
2036 eptr++;
2037 ecode++;
2038 break;
2039
2040 case OP_NOT_DIGIT:
2041 if (eptr >= md->end_subject)
2042 {
2043 SCHECK_PARTIAL();
2044 MRRETURN(MATCH_NOMATCH);
2045 }
2046 GETCHARINCTEST(c, eptr);
2047 if (
2048 #ifdef SUPPORT_UTF8
2049 c < 256 &&
2050 #endif
2051 (md->ctypes[c] & ctype_digit) != 0
2052 )
2053 MRRETURN(MATCH_NOMATCH);
2054 ecode++;
2055 break;
2056
2057 case OP_DIGIT:
2058 if (eptr >= md->end_subject)
2059 {
2060 SCHECK_PARTIAL();
2061 MRRETURN(MATCH_NOMATCH);
2062 }
2063 GETCHARINCTEST(c, eptr);
2064 if (
2065 #ifdef SUPPORT_UTF8
2066 c >= 256 ||
2067 #endif
2068 (md->ctypes[c] & ctype_digit) == 0
2069 )
2070 MRRETURN(MATCH_NOMATCH);
2071 ecode++;
2072 break;
2073
2074 case OP_NOT_WHITESPACE:
2075 if (eptr >= md->end_subject)
2076 {
2077 SCHECK_PARTIAL();
2078 MRRETURN(MATCH_NOMATCH);
2079 }
2080 GETCHARINCTEST(c, eptr);
2081 if (
2082 #ifdef SUPPORT_UTF8
2083 c < 256 &&
2084 #endif
2085 (md->ctypes[c] & ctype_space) != 0
2086 )
2087 MRRETURN(MATCH_NOMATCH);
2088 ecode++;
2089 break;
2090
2091 case OP_WHITESPACE:
2092 if (eptr >= md->end_subject)
2093 {
2094 SCHECK_PARTIAL();
2095 MRRETURN(MATCH_NOMATCH);
2096 }
2097 GETCHARINCTEST(c, eptr);
2098 if (
2099 #ifdef SUPPORT_UTF8
2100 c >= 256 ||
2101 #endif
2102 (md->ctypes[c] & ctype_space) == 0
2103 )
2104 MRRETURN(MATCH_NOMATCH);
2105 ecode++;
2106 break;
2107
2108 case OP_NOT_WORDCHAR:
2109 if (eptr >= md->end_subject)
2110 {
2111 SCHECK_PARTIAL();
2112 MRRETURN(MATCH_NOMATCH);
2113 }
2114 GETCHARINCTEST(c, eptr);
2115 if (
2116 #ifdef SUPPORT_UTF8
2117 c < 256 &&
2118 #endif
2119 (md->ctypes[c] & ctype_word) != 0
2120 )
2121 MRRETURN(MATCH_NOMATCH);
2122 ecode++;
2123 break;
2124
2125 case OP_WORDCHAR:
2126 if (eptr >= md->end_subject)
2127 {
2128 SCHECK_PARTIAL();
2129 MRRETURN(MATCH_NOMATCH);
2130 }
2131 GETCHARINCTEST(c, eptr);
2132 if (
2133 #ifdef SUPPORT_UTF8
2134 c >= 256 ||
2135 #endif
2136 (md->ctypes[c] & ctype_word) == 0
2137 )
2138 MRRETURN(MATCH_NOMATCH);
2139 ecode++;
2140 break;
2141
2142 case OP_ANYNL:
2143 if (eptr >= md->end_subject)
2144 {
2145 SCHECK_PARTIAL();
2146 MRRETURN(MATCH_NOMATCH);
2147 }
2148 GETCHARINCTEST(c, eptr);
2149 switch(c)
2150 {
2151 default: MRRETURN(MATCH_NOMATCH);
2152
2153 case 0x000d:
2154 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2155 break;
2156
2157 case 0x000a:
2158 break;
2159
2160 case 0x000b:
2161 case 0x000c:
2162 case 0x0085:
2163 case 0x2028:
2164 case 0x2029:
2165 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2166 break;
2167 }
2168 ecode++;
2169 break;
2170
2171 case OP_NOT_HSPACE:
2172 if (eptr >= md->end_subject)
2173 {
2174 SCHECK_PARTIAL();
2175 MRRETURN(MATCH_NOMATCH);
2176 }
2177 GETCHARINCTEST(c, eptr);
2178 switch(c)
2179 {
2180 default: break;
2181 case 0x09: /* HT */
2182 case 0x20: /* SPACE */
2183 case 0xa0: /* NBSP */
2184 case 0x1680: /* OGHAM SPACE MARK */
2185 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2186 case 0x2000: /* EN QUAD */
2187 case 0x2001: /* EM QUAD */
2188 case 0x2002: /* EN SPACE */
2189 case 0x2003: /* EM SPACE */
2190 case 0x2004: /* THREE-PER-EM SPACE */
2191 case 0x2005: /* FOUR-PER-EM SPACE */
2192 case 0x2006: /* SIX-PER-EM SPACE */
2193 case 0x2007: /* FIGURE SPACE */
2194 case 0x2008: /* PUNCTUATION SPACE */
2195 case 0x2009: /* THIN SPACE */
2196 case 0x200A: /* HAIR SPACE */
2197 case 0x202f: /* NARROW NO-BREAK SPACE */
2198 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2199 case 0x3000: /* IDEOGRAPHIC SPACE */
2200 MRRETURN(MATCH_NOMATCH);
2201 }
2202 ecode++;
2203 break;
2204
2205 case OP_HSPACE:
2206 if (eptr >= md->end_subject)
2207 {
2208 SCHECK_PARTIAL();
2209 MRRETURN(MATCH_NOMATCH);
2210 }
2211 GETCHARINCTEST(c, eptr);
2212 switch(c)
2213 {
2214 default: MRRETURN(MATCH_NOMATCH);
2215 case 0x09: /* HT */
2216 case 0x20: /* SPACE */
2217 case 0xa0: /* NBSP */
2218 case 0x1680: /* OGHAM SPACE MARK */
2219 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2220 case 0x2000: /* EN QUAD */
2221 case 0x2001: /* EM QUAD */
2222 case 0x2002: /* EN SPACE */
2223 case 0x2003: /* EM SPACE */
2224 case 0x2004: /* THREE-PER-EM SPACE */
2225 case 0x2005: /* FOUR-PER-EM SPACE */
2226 case 0x2006: /* SIX-PER-EM SPACE */
2227 case 0x2007: /* FIGURE SPACE */
2228 case 0x2008: /* PUNCTUATION SPACE */
2229 case 0x2009: /* THIN SPACE */
2230 case 0x200A: /* HAIR SPACE */
2231 case 0x202f: /* NARROW NO-BREAK SPACE */
2232 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2233 case 0x3000: /* IDEOGRAPHIC SPACE */
2234 break;
2235 }
2236 ecode++;
2237 break;
2238
2239 case OP_NOT_VSPACE:
2240 if (eptr >= md->end_subject)
2241 {
2242 SCHECK_PARTIAL();
2243 MRRETURN(MATCH_NOMATCH);
2244 }
2245 GETCHARINCTEST(c, eptr);
2246 switch(c)
2247 {
2248 default: break;
2249 case 0x0a: /* LF */
2250 case 0x0b: /* VT */
2251 case 0x0c: /* FF */
2252 case 0x0d: /* CR */
2253 case 0x85: /* NEL */
2254 case 0x2028: /* LINE SEPARATOR */
2255 case 0x2029: /* PARAGRAPH SEPARATOR */
2256 MRRETURN(MATCH_NOMATCH);
2257 }
2258 ecode++;
2259 break;
2260
2261 case OP_VSPACE:
2262 if (eptr >= md->end_subject)
2263 {
2264 SCHECK_PARTIAL();
2265 MRRETURN(MATCH_NOMATCH);
2266 }
2267 GETCHARINCTEST(c, eptr);
2268 switch(c)
2269 {
2270 default: MRRETURN(MATCH_NOMATCH);
2271 case 0x0a: /* LF */
2272 case 0x0b: /* VT */
2273 case 0x0c: /* FF */
2274 case 0x0d: /* CR */
2275 case 0x85: /* NEL */
2276 case 0x2028: /* LINE SEPARATOR */
2277 case 0x2029: /* PARAGRAPH SEPARATOR */
2278 break;
2279 }
2280 ecode++;
2281 break;
2282
2283 #ifdef SUPPORT_UCP
2284 /* Check the next character by Unicode property. We will get here only
2285 if the support is in the binary; otherwise a compile-time error occurs. */
2286
2287 case OP_PROP:
2288 case OP_NOTPROP:
2289 if (eptr >= md->end_subject)
2290 {
2291 SCHECK_PARTIAL();
2292 MRRETURN(MATCH_NOMATCH);
2293 }
2294 GETCHARINCTEST(c, eptr);
2295 {
2296 const ucd_record *prop = GET_UCD(c);
2297
2298 switch(ecode[1])
2299 {
2300 case PT_ANY:
2301 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2302 break;
2303
2304 case PT_LAMP:
2305 if ((prop->chartype == ucp_Lu ||
2306 prop->chartype == ucp_Ll ||
2307 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2308 MRRETURN(MATCH_NOMATCH);
2309 break;
2310
2311 case PT_GC:
2312 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2313 MRRETURN(MATCH_NOMATCH);
2314 break;
2315
2316 case PT_PC:
2317 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2318 MRRETURN(MATCH_NOMATCH);
2319 break;
2320
2321 case PT_SC:
2322 if ((ecode[2] != prop->script) == (op == OP_PROP))
2323 MRRETURN(MATCH_NOMATCH);
2324 break;
2325
2326 /* These are specials */
2327
2328 case PT_ALNUM:
2329 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2330 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2331 MRRETURN(MATCH_NOMATCH);
2332 break;
2333
2334 case PT_SPACE: /* Perl space */
2335 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2336 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2337 == (op == OP_NOTPROP))
2338 MRRETURN(MATCH_NOMATCH);
2339 break;
2340
2341 case PT_PXSPACE: /* POSIX space */
2342 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2343 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2344 c == CHAR_FF || c == CHAR_CR)
2345 == (op == OP_NOTPROP))
2346 MRRETURN(MATCH_NOMATCH);
2347 break;
2348
2349 case PT_WORD:
2350 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2351 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2352 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2353 MRRETURN(MATCH_NOMATCH);
2354 break;
2355
2356 /* This should never occur */
2357
2358 default:
2359 RRETURN(PCRE_ERROR_INTERNAL);
2360 }
2361
2362 ecode += 3;
2363 }
2364 break;
2365
2366 /* Match an extended Unicode sequence. We will get here only if the support
2367 is in the binary; otherwise a compile-time error occurs. */
2368
2369 case OP_EXTUNI:
2370 if (eptr >= md->end_subject)
2371 {
2372 SCHECK_PARTIAL();
2373 MRRETURN(MATCH_NOMATCH);
2374 }
2375 GETCHARINCTEST(c, eptr);
2376 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2377 while (eptr < md->end_subject)
2378 {
2379 int len = 1;
2380 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2381 if (UCD_CATEGORY(c) != ucp_M) break;
2382 eptr += len;
2383 }
2384 ecode++;
2385 break;
2386 #endif
2387
2388
2389 /* Match a back reference, possibly repeatedly. Look past the end of the
2390 item to see if there is repeat information following. The code is similar
2391 to that for character classes, but repeated for efficiency. Then obey
2392 similar code to character type repeats - written out again for speed.
2393 However, if the referenced string is the empty string, always treat
2394 it as matched, any number of times (otherwise there could be infinite
2395 loops). */
2396
2397 case OP_REF:
2398 case OP_REFI:
2399 caseless = op == OP_REFI;
2400 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2401 ecode += 3;
2402
2403 /* If the reference is unset, there are two possibilities:
2404
2405 (a) In the default, Perl-compatible state, set the length negative;
2406 this ensures that every attempt at a match fails. We can't just fail
2407 here, because of the possibility of quantifiers with zero minima.
2408
2409 (b) If the JavaScript compatibility flag is set, set the length to zero
2410 so that the back reference matches an empty string.
2411
2412 Otherwise, set the length to the length of what was matched by the
2413 referenced subpattern. */
2414
2415 if (offset >= offset_top || md->offset_vector[offset] < 0)
2416 length = (md->jscript_compat)? 0 : -1;
2417 else
2418 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2419
2420 /* Set up for repetition, or handle the non-repeated case */
2421
2422 switch (*ecode)
2423 {
2424 case OP_CRSTAR:
2425 case OP_CRMINSTAR:
2426 case OP_CRPLUS:
2427 case OP_CRMINPLUS:
2428 case OP_CRQUERY:
2429 case OP_CRMINQUERY:
2430 c = *ecode++ - OP_CRSTAR;
2431 minimize = (c & 1) != 0;
2432 min = rep_min[c]; /* Pick up values from tables; */
2433 max = rep_max[c]; /* zero for max => infinity */
2434 if (max == 0) max = INT_MAX;
2435 break;
2436
2437 case OP_CRRANGE:
2438 case OP_CRMINRANGE:
2439 minimize = (*ecode == OP_CRMINRANGE);
2440 min = GET2(ecode, 1);
2441 max = GET2(ecode, 3);
2442 if (max == 0) max = INT_MAX;
2443 ecode += 5;
2444 break;
2445
2446 default: /* No repeat follows */
2447 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2448 {
2449 CHECK_PARTIAL();
2450 MRRETURN(MATCH_NOMATCH);
2451 }
2452 eptr += length;
2453 continue; /* With the main loop */
2454 }
2455
2456 /* Handle repeated back references. If the length of the reference is
2457 zero, just continue with the main loop. */
2458
2459 if (length == 0) continue;
2460
2461 /* First, ensure the minimum number of matches are present. We get back
2462 the length of the reference string explicitly rather than passing the
2463 address of eptr, so that eptr can be a register variable. */
2464
2465 for (i = 1; i <= min; i++)
2466 {
2467 int slength;
2468 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2469 {
2470 CHECK_PARTIAL();
2471 MRRETURN(MATCH_NOMATCH);
2472 }
2473 eptr += slength;
2474 }
2475
2476 /* If min = max, continue at the same level without recursion.
2477 They are not both allowed to be zero. */
2478
2479 if (min == max) continue;
2480
2481 /* If minimizing, keep trying and advancing the pointer */
2482
2483 if (minimize)
2484 {
2485 for (fi = min;; fi++)
2486 {
2487 int slength;
2488 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2489 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2490 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2491 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2492 {
2493 CHECK_PARTIAL();
2494 MRRETURN(MATCH_NOMATCH);
2495 }
2496 eptr += slength;
2497 }
2498 /* Control never gets here */
2499 }
2500
2501 /* If maximizing, find the longest string and work backwards */
2502
2503 else
2504 {
2505 pp = eptr;
2506 for (i = min; i < max; i++)
2507 {
2508 int slength;
2509 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2510 {
2511 CHECK_PARTIAL();
2512 break;
2513 }
2514 eptr += slength;
2515 }
2516 while (eptr >= pp)
2517 {
2518 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2519 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2520 eptr -= length;
2521 }
2522 MRRETURN(MATCH_NOMATCH);
2523 }
2524 /* Control never gets here */
2525
2526 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2527 used when all the characters in the class have values in the range 0-255,
2528 and either the matching is caseful, or the characters are in the range
2529 0-127 when UTF-8 processing is enabled. The only difference between
2530 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2531 encountered.
2532
2533 First, look past the end of the item to see if there is repeat information
2534 following. Then obey similar code to character type repeats - written out
2535 again for speed. */
2536
2537 case OP_NCLASS:
2538 case OP_CLASS:
2539 {
2540 data = ecode + 1; /* Save for matching */
2541 ecode += 33; /* Advance past the item */
2542
2543 switch (*ecode)
2544 {
2545 case OP_CRSTAR:
2546 case OP_CRMINSTAR:
2547 case OP_CRPLUS:
2548 case OP_CRMINPLUS:
2549 case OP_CRQUERY:
2550 case OP_CRMINQUERY:
2551 c = *ecode++ - OP_CRSTAR;
2552 minimize = (c & 1) != 0;
2553 min = rep_min[c]; /* Pick up values from tables; */
2554 max = rep_max[c]; /* zero for max => infinity */
2555 if (max == 0) max = INT_MAX;
2556 break;
2557
2558 case OP_CRRANGE:
2559 case OP_CRMINRANGE:
2560 minimize = (*ecode == OP_CRMINRANGE);
2561 min = GET2(ecode, 1);
2562 max = GET2(ecode, 3);
2563 if (max == 0) max = INT_MAX;
2564 ecode += 5;
2565 break;
2566
2567 default: /* No repeat follows */
2568 min = max = 1;
2569 break;
2570 }
2571
2572 /* First, ensure the minimum number of matches are present. */
2573
2574 #ifdef SUPPORT_UTF8
2575 /* UTF-8 mode */
2576 if (utf8)
2577 {
2578 for (i = 1; i <= min; i++)
2579 {
2580 if (eptr >= md->end_subject)
2581 {
2582 SCHECK_PARTIAL();
2583 MRRETURN(MATCH_NOMATCH);
2584 }
2585 GETCHARINC(c, eptr);
2586 if (c > 255)
2587 {
2588 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2589 }
2590 else
2591 {
2592 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2593 }
2594 }
2595 }
2596 else
2597 #endif
2598 /* Not UTF-8 mode */
2599 {
2600 for (i = 1; i <= min; i++)
2601 {
2602 if (eptr >= md->end_subject)
2603 {
2604 SCHECK_PARTIAL();
2605 MRRETURN(MATCH_NOMATCH);
2606 }
2607 c = *eptr++;
2608 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2609 }
2610 }
2611
2612 /* If max == min we can continue with the main loop without the
2613 need to recurse. */
2614
2615 if (min == max) continue;
2616
2617 /* If minimizing, keep testing the rest of the expression and advancing
2618 the pointer while it matches the class. */
2619
2620 if (minimize)
2621 {
2622 #ifdef SUPPORT_UTF8
2623 /* UTF-8 mode */
2624 if (utf8)
2625 {
2626 for (fi = min;; fi++)
2627 {
2628 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2629 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2630 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2631 if (eptr >= md->end_subject)
2632 {
2633 SCHECK_PARTIAL();
2634 MRRETURN(MATCH_NOMATCH);
2635 }
2636 GETCHARINC(c, eptr);
2637 if (c > 255)
2638 {
2639 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2640 }
2641 else
2642 {
2643 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2644 }
2645 }
2646 }
2647 else
2648 #endif
2649 /* Not UTF-8 mode */
2650 {
2651 for (fi = min;; fi++)
2652 {
2653 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2654 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2655 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2656 if (eptr >= md->end_subject)
2657 {
2658 SCHECK_PARTIAL();
2659 MRRETURN(MATCH_NOMATCH);
2660 }
2661 c = *eptr++;
2662 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2663 }
2664 }
2665 /* Control never gets here */
2666 }
2667
2668 /* If maximizing, find the longest possible run, then work backwards. */
2669
2670 else
2671 {
2672 pp = eptr;
2673
2674 #ifdef SUPPORT_UTF8
2675 /* UTF-8 mode */
2676 if (utf8)
2677 {
2678 for (i = min; i < max; i++)
2679 {
2680 int len = 1;
2681 if (eptr >= md->end_subject)
2682 {
2683 SCHECK_PARTIAL();
2684 break;
2685 }
2686 GETCHARLEN(c, eptr, len);
2687 if (c > 255)
2688 {
2689 if (op == OP_CLASS) break;
2690 }
2691 else
2692 {
2693 if ((data[c/8] & (1 << (c&7))) == 0) break;
2694 }
2695 eptr += len;
2696 }
2697 for (;;)
2698 {
2699 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2700 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2701 if (eptr-- == pp) break; /* Stop if tried at original pos */
2702 BACKCHAR(eptr);
2703 }
2704 }
2705 else
2706 #endif
2707 /* Not UTF-8 mode */
2708 {
2709 for (i = min; i < max; i++)
2710 {
2711 if (eptr >= md->end_subject)
2712 {
2713 SCHECK_PARTIAL();
2714 break;
2715 }
2716 c = *eptr;
2717 if ((data[c/8] & (1 << (c&7))) == 0) break;
2718 eptr++;
2719 }
2720 while (eptr >= pp)
2721 {
2722 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2723 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2724 eptr--;
2725 }
2726 }
2727
2728 MRRETURN(MATCH_NOMATCH);
2729 }
2730 }
2731 /* Control never gets here */
2732
2733
2734 /* Match an extended character class. This opcode is encountered only
2735 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2736 mode, because Unicode properties are supported in non-UTF-8 mode. */
2737
2738 #ifdef SUPPORT_UTF8
2739 case OP_XCLASS:
2740 {
2741 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2742 ecode += GET(ecode, 1); /* Advance past the item */
2743
2744 switch (*ecode)
2745 {
2746 case OP_CRSTAR:
2747 case OP_CRMINSTAR:
2748 case OP_CRPLUS:
2749 case OP_CRMINPLUS:
2750 case OP_CRQUERY:
2751 case OP_CRMINQUERY:
2752 c = *ecode++ - OP_CRSTAR;
2753 minimize = (c & 1) != 0;
2754 min = rep_min[c]; /* Pick up values from tables; */
2755 max = rep_max[c]; /* zero for max => infinity */
2756 if (max == 0) max = INT_MAX;
2757 break;
2758
2759 case OP_CRRANGE:
2760 case OP_CRMINRANGE:
2761 minimize = (*ecode == OP_CRMINRANGE);
2762 min = GET2(ecode, 1);
2763 max = GET2(ecode, 3);
2764 if (max == 0) max = INT_MAX;
2765 ecode += 5;
2766 break;
2767
2768 default: /* No repeat follows */
2769 min = max = 1;
2770 break;
2771 }
2772
2773 /* First, ensure the minimum number of matches are present. */
2774
2775 for (i = 1; i <= min; i++)
2776 {
2777 if (eptr >= md->end_subject)
2778 {
2779 SCHECK_PARTIAL();
2780 MRRETURN(MATCH_NOMATCH);
2781 }
2782 GETCHARINCTEST(c, eptr);
2783 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2784 }
2785
2786 /* If max == min we can continue with the main loop without the
2787 need to recurse. */
2788
2789 if (min == max) continue;
2790
2791 /* If minimizing, keep testing the rest of the expression and advancing
2792 the pointer while it matches the class. */
2793
2794 if (minimize)
2795 {
2796 for (fi = min;; fi++)
2797 {
2798 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2799 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2800 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2801 if (eptr >= md->end_subject)
2802 {
2803 SCHECK_PARTIAL();
2804 MRRETURN(MATCH_NOMATCH);
2805 }
2806 GETCHARINCTEST(c, eptr);
2807 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2808 }
2809 /* Control never gets here */
2810 }
2811
2812 /* If maximizing, find the longest possible run, then work backwards. */
2813
2814 else
2815 {
2816 pp = eptr;
2817 for (i = min; i < max; i++)
2818 {
2819 int len = 1;
2820 if (eptr >= md->end_subject)
2821 {
2822 SCHECK_PARTIAL();
2823 break;
2824 }
2825 GETCHARLENTEST(c, eptr, len);
2826 if (!_pcre_xclass(c, data)) break;
2827 eptr += len;
2828 }
2829 for(;;)
2830 {
2831 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2832 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2833 if (eptr-- == pp) break; /* Stop if tried at original pos */
2834 if (utf8) BACKCHAR(eptr);
2835 }
2836 MRRETURN(MATCH_NOMATCH);
2837 }
2838
2839 /* Control never gets here */
2840 }
2841 #endif /* End of XCLASS */
2842
2843 /* Match a single character, casefully */
2844
2845 case OP_CHAR:
2846 #ifdef SUPPORT_UTF8
2847 if (utf8)
2848 {
2849 length = 1;
2850 ecode++;
2851 GETCHARLEN(fc, ecode, length);
2852 if (length > md->end_subject - eptr)
2853 {
2854 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2855 MRRETURN(MATCH_NOMATCH);
2856 }
2857 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2858 }
2859 else
2860 #endif
2861
2862 /* Non-UTF-8 mode */
2863 {
2864 if (md->end_subject - eptr < 1)
2865 {
2866 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2867 MRRETURN(MATCH_NOMATCH);
2868 }
2869 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2870 ecode += 2;
2871 }
2872 break;
2873
2874 /* Match a single character, caselessly */
2875
2876 case OP_CHARI:
2877 #ifdef SUPPORT_UTF8
2878 if (utf8)
2879 {
2880 length = 1;
2881 ecode++;
2882 GETCHARLEN(fc, ecode, length);
2883
2884 if (length > md->end_subject - eptr)
2885 {
2886 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2887 MRRETURN(MATCH_NOMATCH);
2888 }
2889
2890 /* If the pattern character's value is < 128, we have only one byte, and
2891 can use the fast lookup table. */
2892
2893 if (fc < 128)
2894 {
2895 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2896 }
2897
2898 /* Otherwise we must pick up the subject character */
2899
2900 else
2901 {
2902 unsigned int dc;
2903 GETCHARINC(dc, eptr);
2904 ecode += length;
2905
2906 /* If we have Unicode property support, we can use it to test the other
2907 case of the character, if there is one. */
2908
2909 if (fc != dc)
2910 {
2911 #ifdef SUPPORT_UCP
2912 if (dc != UCD_OTHERCASE(fc))
2913 #endif
2914 MRRETURN(MATCH_NOMATCH);
2915 }
2916 }
2917 }
2918 else
2919 #endif /* SUPPORT_UTF8 */
2920
2921 /* Non-UTF-8 mode */
2922 {
2923 if (md->end_subject - eptr < 1)
2924 {
2925 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2926 MRRETURN(MATCH_NOMATCH);
2927 }
2928 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2929 ecode += 2;
2930 }
2931 break;
2932
2933 /* Match a single character repeatedly. */
2934
2935 case OP_EXACT:
2936 case OP_EXACTI:
2937 min = max = GET2(ecode, 1);
2938 ecode += 3;
2939 goto REPEATCHAR;
2940
2941 case OP_POSUPTO:
2942 case OP_POSUPTOI:
2943 possessive = TRUE;
2944 /* Fall through */
2945
2946 case OP_UPTO:
2947 case OP_UPTOI:
2948 case OP_MINUPTO:
2949 case OP_MINUPTOI:
2950 min = 0;
2951 max = GET2(ecode, 1);
2952 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2953 ecode += 3;
2954 goto REPEATCHAR;
2955
2956 case OP_POSSTAR:
2957 case OP_POSSTARI:
2958 possessive = TRUE;
2959 min = 0;
2960 max = INT_MAX;
2961 ecode++;
2962 goto REPEATCHAR;
2963
2964 case OP_POSPLUS:
2965 case OP_POSPLUSI:
2966 possessive = TRUE;
2967 min = 1;
2968 max = INT_MAX;
2969 ecode++;
2970 goto REPEATCHAR;
2971
2972 case OP_POSQUERY:
2973 case OP_POSQUERYI:
2974 possessive = TRUE;
2975 min = 0;
2976 max = 1;
2977 ecode++;
2978 goto REPEATCHAR;
2979
2980 case OP_STAR:
2981 case OP_STARI:
2982 case OP_MINSTAR:
2983 case OP_MINSTARI:
2984 case OP_PLUS:
2985 case OP_PLUSI:
2986 case OP_MINPLUS:
2987 case OP_MINPLUSI:
2988 case OP_QUERY:
2989 case OP_QUERYI:
2990 case OP_MINQUERY:
2991 case OP_MINQUERYI:
2992 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
2993 minimize = (c & 1) != 0;
2994 min = rep_min[c]; /* Pick up values from tables; */
2995 max = rep_max[c]; /* zero for max => infinity */
2996 if (max == 0) max = INT_MAX;
2997
2998 /* Common code for all repeated single-character matches. */
2999
3000 REPEATCHAR:
3001 #ifdef SUPPORT_UTF8
3002 if (utf8)
3003 {
3004 length = 1;
3005 charptr = ecode;
3006 GETCHARLEN(fc, ecode, length);
3007 ecode += length;
3008
3009 /* Handle multibyte character matching specially here. There is
3010 support for caseless matching if UCP support is present. */
3011
3012 if (length > 1)
3013 {
3014 #ifdef SUPPORT_UCP
3015 unsigned int othercase;
3016 if (op >= OP_STARI && /* Caseless */
3017 (othercase = UCD_OTHERCASE(fc)) != fc)
3018 oclength = _pcre_ord2utf8(othercase, occhars);
3019 else oclength = 0;
3020 #endif /* SUPPORT_UCP */
3021
3022 for (i = 1; i <= min; i++)
3023 {
3024 if (eptr <= md->end_subject - length &&
3025 memcmp(eptr, charptr, length) == 0) eptr += length;
3026 #ifdef SUPPORT_UCP
3027 else if (oclength > 0 &&
3028 eptr <= md->end_subject - oclength &&
3029 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3030 #endif /* SUPPORT_UCP */
3031 else
3032 {
3033 CHECK_PARTIAL();
3034 MRRETURN(MATCH_NOMATCH);
3035 }
3036 }
3037
3038 if (min == max) continue;
3039
3040 if (minimize)
3041 {
3042 for (fi = min;; fi++)
3043 {
3044 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3045 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3046 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3047 if (eptr <= md->end_subject - length &&
3048 memcmp(eptr, charptr, length) == 0) eptr += length;
3049 #ifdef SUPPORT_UCP
3050 else if (oclength > 0 &&
3051 eptr <= md->end_subject - oclength &&
3052 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3053 #endif /* SUPPORT_UCP */
3054 else
3055 {
3056 CHECK_PARTIAL();
3057 MRRETURN(MATCH_NOMATCH);
3058 }
3059 }
3060 /* Control never gets here */
3061 }
3062
3063 else /* Maximize */
3064 {
3065 pp = eptr;
3066 for (i = min; i < max; i++)
3067 {
3068 if (eptr <= md->end_subject - length &&
3069 memcmp(eptr, charptr, length) == 0) eptr += length;
3070 #ifdef SUPPORT_UCP
3071 else if (oclength > 0 &&
3072 eptr <= md->end_subject - oclength &&
3073 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3074 #endif /* SUPPORT_UCP */
3075 else
3076 {
3077 CHECK_PARTIAL();
3078 break;
3079 }
3080 }
3081
3082 if (possessive) continue;
3083
3084 for(;;)
3085 {
3086 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3087 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3088 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3089 #ifdef SUPPORT_UCP
3090 eptr--;
3091 BACKCHAR(eptr);
3092 #else /* without SUPPORT_UCP */
3093 eptr -= length;
3094 #endif /* SUPPORT_UCP */
3095 }
3096 }
3097 /* Control never gets here */
3098 }
3099
3100 /* If the length of a UTF-8 character is 1, we fall through here, and
3101 obey the code as for non-UTF-8 characters below, though in this case the
3102 value of fc will always be < 128. */
3103 }
3104 else
3105 #endif /* SUPPORT_UTF8 */
3106
3107 /* When not in UTF-8 mode, load a single-byte character. */
3108
3109 fc = *ecode++;
3110
3111 /* The value of fc at this point is always less than 256, though we may or
3112 may not be in UTF-8 mode. The code is duplicated for the caseless and
3113 caseful cases, for speed, since matching characters is likely to be quite
3114 common. First, ensure the minimum number of matches are present. If min =
3115 max, continue at the same level without recursing. Otherwise, if
3116 minimizing, keep trying the rest of the expression and advancing one
3117 matching character if failing, up to the maximum. Alternatively, if
3118 maximizing, find the maximum number of characters and work backwards. */
3119
3120 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3121 max, eptr));
3122
3123 if (op >= OP_STARI) /* Caseless */
3124 {
3125 fc = md->lcc[fc];
3126 for (i = 1; i <= min; i++)
3127 {
3128 if (eptr >= md->end_subject)
3129 {
3130 SCHECK_PARTIAL();
3131 MRRETURN(MATCH_NOMATCH);
3132 }
3133 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3134 }
3135 if (min == max) continue;
3136 if (minimize)
3137 {
3138 for (fi = min;; fi++)
3139 {
3140 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3141 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3142 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3143 if (eptr >= md->end_subject)
3144 {
3145 SCHECK_PARTIAL();
3146 MRRETURN(MATCH_NOMATCH);
3147 }
3148 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3149 }
3150 /* Control never gets here */
3151 }
3152 else /* Maximize */
3153 {
3154 pp = eptr;
3155 for (i = min; i < max; i++)
3156 {
3157 if (eptr >= md->end_subject)
3158 {
3159 SCHECK_PARTIAL();
3160 break;
3161 }
3162 if (fc != md->lcc[*eptr]) break;
3163 eptr++;
3164 }
3165
3166 if (possessive) continue;
3167
3168 while (eptr >= pp)
3169 {
3170 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3171 eptr--;
3172 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3173 }
3174 MRRETURN(MATCH_NOMATCH);
3175 }
3176 /* Control never gets here */
3177 }
3178
3179 /* Caseful comparisons (includes all multi-byte characters) */
3180
3181 else
3182 {
3183 for (i = 1; i <= min; i++)
3184 {
3185 if (eptr >= md->end_subject)
3186 {
3187 SCHECK_PARTIAL();
3188 MRRETURN(MATCH_NOMATCH);
3189 }
3190 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3191 }
3192
3193 if (min == max) continue;
3194
3195 if (minimize)
3196 {
3197 for (fi = min;; fi++)
3198 {
3199 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3200 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3201 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3202 if (eptr >= md->end_subject)
3203 {
3204 SCHECK_PARTIAL();
3205 MRRETURN(MATCH_NOMATCH);
3206 }
3207 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3208 }
3209 /* Control never gets here */
3210 }
3211 else /* Maximize */
3212 {
3213 pp = eptr;
3214 for (i = min; i < max; i++)
3215 {
3216 if (eptr >= md->end_subject)
3217 {
3218 SCHECK_PARTIAL();
3219 break;
3220 }
3221 if (fc != *eptr) break;
3222 eptr++;
3223 }
3224 if (possessive) continue;
3225
3226 while (eptr >= pp)
3227 {
3228 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3229 eptr--;
3230 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3231 }
3232 MRRETURN(MATCH_NOMATCH);
3233 }
3234 }
3235 /* Control never gets here */
3236
3237 /* Match a negated single one-byte character. The character we are
3238 checking can be multibyte. */
3239
3240 case OP_NOT:
3241 case OP_NOTI:
3242 if (eptr >= md->end_subject)
3243 {
3244 SCHECK_PARTIAL();
3245 MRRETURN(MATCH_NOMATCH);
3246 }
3247 ecode++;
3248 GETCHARINCTEST(c, eptr);
3249 if (op == OP_NOTI) /* The caseless case */
3250 {
3251 #ifdef SUPPORT_UTF8
3252 if (c < 256)
3253 #endif
3254 c = md->lcc[c];
3255 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3256 }
3257 else /* Caseful */
3258 {
3259 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3260 }
3261 break;
3262
3263 /* Match a negated single one-byte character repeatedly. This is almost a
3264 repeat of the code for a repeated single character, but I haven't found a
3265 nice way of commoning these up that doesn't require a test of the
3266 positive/negative option for each character match. Maybe that wouldn't add
3267 very much to the time taken, but character matching *is* what this is all
3268 about... */
3269
3270 case OP_NOTEXACT:
3271 case OP_NOTEXACTI:
3272 min = max = GET2(ecode, 1);
3273 ecode += 3;
3274 goto REPEATNOTCHAR;
3275
3276 case OP_NOTUPTO:
3277 case OP_NOTUPTOI:
3278 case OP_NOTMINUPTO:
3279 case OP_NOTMINUPTOI:
3280 min = 0;
3281 max = GET2(ecode, 1);
3282 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3283 ecode += 3;
3284 goto REPEATNOTCHAR;
3285
3286 case OP_NOTPOSSTAR:
3287 case OP_NOTPOSSTARI:
3288 possessive = TRUE;
3289 min = 0;
3290 max = INT_MAX;
3291 ecode++;
3292 goto REPEATNOTCHAR;
3293
3294 case OP_NOTPOSPLUS:
3295 case OP_NOTPOSPLUSI:
3296 possessive = TRUE;
3297 min = 1;
3298 max = INT_MAX;
3299 ecode++;
3300 goto REPEATNOTCHAR;
3301
3302 case OP_NOTPOSQUERY:
3303 case OP_NOTPOSQUERYI:
3304 possessive = TRUE;
3305 min = 0;
3306 max = 1;
3307 ecode++;
3308 goto REPEATNOTCHAR;
3309
3310 case OP_NOTPOSUPTO:
3311 case OP_NOTPOSUPTOI:
3312 possessive = TRUE;
3313 min = 0;
3314 max = GET2(ecode, 1);
3315 ecode += 3;
3316 goto REPEATNOTCHAR;
3317
3318 case OP_NOTSTAR:
3319 case OP_NOTSTARI:
3320 case OP_NOTMINSTAR:
3321 case OP_NOTMINSTARI:
3322 case OP_NOTPLUS:
3323 case OP_NOTPLUSI:
3324 case OP_NOTMINPLUS:
3325 case OP_NOTMINPLUSI:
3326 case OP_NOTQUERY:
3327 case OP_NOTQUERYI:
3328 case OP_NOTMINQUERY:
3329 case OP_NOTMINQUERYI:
3330 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3331 minimize = (c & 1) != 0;
3332 min = rep_min[c]; /* Pick up values from tables; */
3333 max = rep_max[c]; /* zero for max => infinity */
3334 if (max == 0) max = INT_MAX;
3335
3336 /* Common code for all repeated single-byte matches. */
3337
3338 REPEATNOTCHAR:
3339 fc = *ecode++;
3340
3341 /* The code is duplicated for the caseless and caseful cases, for speed,
3342 since matching characters is likely to be quite common. First, ensure the
3343 minimum number of matches are present. If min = max, continue at the same
3344 level without recursing. Otherwise, if minimizing, keep trying the rest of
3345 the expression and advancing one matching character if failing, up to the
3346 maximum. Alternatively, if maximizing, find the maximum number of
3347 characters and work backwards. */
3348
3349 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3350 max, eptr));
3351
3352 if (op >= OP_NOTSTARI) /* Caseless */
3353 {
3354 fc = md->lcc[fc];
3355
3356 #ifdef SUPPORT_UTF8
3357 /* UTF-8 mode */
3358 if (utf8)
3359 {
3360 register unsigned int d;
3361 for (i = 1; i <= min; i++)
3362 {
3363 if (eptr >= md->end_subject)
3364 {
3365 SCHECK_PARTIAL();
3366 MRRETURN(MATCH_NOMATCH);
3367 }
3368 GETCHARINC(d, eptr);
3369 if (d < 256) d = md->lcc[d];
3370 if (fc == d) MRRETURN(MATCH_NOMATCH);
3371 }
3372 }
3373 else
3374 #endif
3375
3376 /* Not UTF-8 mode */
3377 {
3378 for (i = 1; i <= min; i++)
3379 {
3380 if (eptr >= md->end_subject)
3381 {
3382 SCHECK_PARTIAL();
3383 MRRETURN(MATCH_NOMATCH);
3384 }
3385 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3386 }
3387 }
3388
3389 if (min == max) continue;
3390
3391 if (minimize)
3392 {
3393 #ifdef SUPPORT_UTF8
3394 /* UTF-8 mode */
3395 if (utf8)
3396 {
3397 register unsigned int d;
3398 for (fi = min;; fi++)
3399 {
3400 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3401 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3402 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3403 if (eptr >= md->end_subject)
3404 {
3405 SCHECK_PARTIAL();
3406 MRRETURN(MATCH_NOMATCH);
3407 }
3408 GETCHARINC(d, eptr);
3409 if (d < 256) d = md->lcc[d];
3410 if (fc == d) MRRETURN(MATCH_NOMATCH);
3411 }
3412 }
3413 else
3414 #endif
3415 /* Not UTF-8 mode */
3416 {
3417 for (fi = min;; fi++)
3418 {
3419 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3420 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3421 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3422 if (eptr >= md->end_subject)
3423 {
3424 SCHECK_PARTIAL();
3425 MRRETURN(MATCH_NOMATCH);
3426 }
3427 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3428 }
3429 }
3430 /* Control never gets here */
3431 }
3432
3433 /* Maximize case */
3434
3435 else
3436 {
3437 pp = eptr;
3438
3439 #ifdef SUPPORT_UTF8
3440 /* UTF-8 mode */
3441 if (utf8)
3442 {
3443 register unsigned int d;
3444 for (i = min; i < max; i++)
3445 {
3446 int len = 1;
3447 if (eptr >= md->end_subject)
3448 {
3449 SCHECK_PARTIAL();
3450 break;
3451 }
3452 GETCHARLEN(d, eptr, len);
3453 if (d < 256) d = md->lcc[d];
3454 if (fc == d) break;
3455 eptr += len;
3456 }
3457 if (possessive) continue;
3458 for(;;)
3459 {
3460 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3461 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3462 if (eptr-- == pp) break; /* Stop if tried at original pos */
3463 BACKCHAR(eptr);
3464 }
3465 }
3466 else
3467 #endif
3468 /* Not UTF-8 mode */
3469 {
3470 for (i = min; i < max; i++)
3471 {
3472 if (eptr >= md->end_subject)
3473 {
3474 SCHECK_PARTIAL();
3475 break;
3476 }
3477 if (fc == md->lcc[*eptr]) break;
3478 eptr++;
3479 }
3480 if (possessive) continue;
3481 while (eptr >= pp)
3482 {
3483 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3484 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3485 eptr--;
3486 }
3487 }
3488
3489 MRRETURN(MATCH_NOMATCH);
3490 }
3491 /* Control never gets here */
3492 }
3493
3494 /* Caseful comparisons */
3495
3496 else
3497 {
3498 #ifdef SUPPORT_UTF8
3499 /* UTF-8 mode */
3500 if (utf8)
3501 {
3502 register unsigned int d;
3503 for (i = 1; i <= min; i++)
3504 {
3505 if (eptr >= md->end_subject)
3506 {
3507 SCHECK_PARTIAL();
3508 MRRETURN(MATCH_NOMATCH);
3509 }
3510 GETCHARINC(d, eptr);
3511 if (fc == d) MRRETURN(MATCH_NOMATCH);
3512 }
3513 }
3514 else
3515 #endif
3516 /* Not UTF-8 mode */
3517 {
3518 for (i = 1; i <= min; i++)
3519 {
3520 if (eptr >= md->end_subject)
3521 {
3522 SCHECK_PARTIAL();
3523 MRRETURN(MATCH_NOMATCH);
3524 }
3525 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3526 }
3527 }
3528
3529 if (min == max) continue;
3530
3531 if (minimize)
3532 {
3533 #ifdef SUPPORT_UTF8
3534 /* UTF-8 mode */
3535 if (utf8)
3536 {
3537 register unsigned int d;
3538 for (fi = min;; fi++)
3539 {
3540 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3541 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3542 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3543 if (eptr >= md->end_subject)
3544 {
3545 SCHECK_PARTIAL();
3546 MRRETURN(MATCH_NOMATCH);
3547 }
3548 GETCHARINC(d, eptr);
3549 if (fc == d) MRRETURN(MATCH_NOMATCH);
3550 }
3551 }
3552 else
3553 #endif
3554 /* Not UTF-8 mode */
3555 {
3556 for (fi = min;; fi++)
3557 {
3558 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3559 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3560 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3561 if (eptr >= md->end_subject)
3562 {
3563 SCHECK_PARTIAL();
3564 MRRETURN(MATCH_NOMATCH);
3565 }
3566 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3567 }
3568 }
3569 /* Control never gets here */
3570 }
3571
3572 /* Maximize case */
3573
3574 else
3575 {
3576 pp = eptr;
3577
3578 #ifdef SUPPORT_UTF8
3579 /* UTF-8 mode */
3580 if (utf8)
3581 {
3582 register unsigned int d;
3583 for (i = min; i < max; i++)
3584 {
3585 int len = 1;
3586 if (eptr >= md->end_subject)
3587 {
3588 SCHECK_PARTIAL();
3589 break;
3590 }
3591 GETCHARLEN(d, eptr, len);
3592 if (fc == d) break;
3593 eptr += len;
3594 }
3595 if (possessive) continue;
3596 for(;;)
3597 {
3598 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3599 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3600 if (eptr-- == pp) break; /* Stop if tried at original pos */
3601 BACKCHAR(eptr);
3602 }
3603 }
3604 else
3605 #endif
3606 /* Not UTF-8 mode */
3607 {
3608 for (i = min; i < max; i++)
3609 {
3610 if (eptr >= md->end_subject)
3611 {
3612 SCHECK_PARTIAL();
3613 break;
3614 }
3615 if (fc == *eptr) break;
3616 eptr++;
3617 }
3618 if (possessive) continue;
3619 while (eptr >= pp)
3620 {
3621 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3622 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3623 eptr--;
3624 }
3625 }
3626
3627 MRRETURN(MATCH_NOMATCH);
3628 }
3629 }
3630 /* Control never gets here */
3631
3632 /* Match a single character type repeatedly; several different opcodes
3633 share code. This is very similar to the code for single characters, but we
3634 repeat it in the interests of efficiency. */
3635
3636 case OP_TYPEEXACT:
3637 min = max = GET2(ecode, 1);
3638 minimize = TRUE;
3639 ecode += 3;
3640 goto REPEATTYPE;
3641
3642 case OP_TYPEUPTO:
3643 case OP_TYPEMINUPTO:
3644 min = 0;
3645 max = GET2(ecode, 1);
3646 minimize = *ecode == OP_TYPEMINUPTO;
3647 ecode += 3;
3648 goto REPEATTYPE;
3649
3650 case OP_TYPEPOSSTAR:
3651 possessive = TRUE;
3652 min = 0;
3653 max = INT_MAX;
3654 ecode++;
3655 goto REPEATTYPE;
3656
3657 case OP_TYPEPOSPLUS:
3658 possessive = TRUE;
3659 min = 1;
3660 max = INT_MAX;
3661 ecode++;
3662 goto REPEATTYPE;
3663
3664 case OP_TYPEPOSQUERY:
3665 possessive = TRUE;
3666 min = 0;
3667 max = 1;
3668 ecode++;
3669 goto REPEATTYPE;
3670
3671 case OP_TYPEPOSUPTO:
3672 possessive = TRUE;
3673 min = 0;
3674 max = GET2(ecode, 1);
3675 ecode += 3;
3676 goto REPEATTYPE;
3677
3678 case OP_TYPESTAR:
3679 case OP_TYPEMINSTAR:
3680 case OP_TYPEPLUS:
3681 case OP_TYPEMINPLUS:
3682 case OP_TYPEQUERY:
3683 case OP_TYPEMINQUERY:
3684 c = *ecode++ - OP_TYPESTAR;
3685 minimize = (c & 1) != 0;
3686 min = rep_min[c]; /* Pick up values from tables; */
3687 max = rep_max[c]; /* zero for max => infinity */
3688 if (max == 0) max = INT_MAX;
3689
3690 /* Common code for all repeated single character type matches. Note that
3691 in UTF-8 mode, '.' matches a character of any length, but for the other
3692 character types, the valid characters are all one-byte long. */
3693
3694 REPEATTYPE:
3695 ctype = *ecode++; /* Code for the character type */
3696
3697 #ifdef SUPPORT_UCP
3698 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3699 {
3700 prop_fail_result = ctype == OP_NOTPROP;
3701 prop_type = *ecode++;
3702 prop_value = *ecode++;
3703 }
3704 else prop_type = -1;
3705 #endif
3706
3707 /* First, ensure the minimum number of matches are present. Use inline
3708 code for maximizing the speed, and do the type test once at the start
3709 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3710 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3711 and single-bytes. */
3712
3713 if (min > 0)
3714 {
3715 #ifdef SUPPORT_UCP
3716 if (prop_type >= 0)
3717 {
3718 switch(prop_type)
3719 {
3720 case PT_ANY:
3721 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3722 for (i = 1; i <= min; i++)
3723 {
3724 if (eptr >= md->end_subject)
3725 {
3726 SCHECK_PARTIAL();
3727 MRRETURN(MATCH_NOMATCH);
3728 }
3729 GETCHARINCTEST(c, eptr);
3730 }
3731 break;
3732
3733 case PT_LAMP:
3734 for (i = 1; i <= min; i++)
3735 {
3736 int chartype;
3737 if (eptr >= md->end_subject)
3738 {
3739 SCHECK_PARTIAL();
3740 MRRETURN(MATCH_NOMATCH);
3741 }
3742 GETCHARINCTEST(c, eptr);
3743 chartype = UCD_CHARTYPE(c);
3744 if ((chartype == ucp_Lu ||
3745 chartype == ucp_Ll ||
3746 chartype == ucp_Lt) == prop_fail_result)
3747 MRRETURN(MATCH_NOMATCH);
3748 }
3749 break;
3750
3751 case PT_GC:
3752 for (i = 1; i <= min; i++)
3753 {
3754 if (eptr >= md->end_subject)
3755 {
3756 SCHECK_PARTIAL();
3757 MRRETURN(MATCH_NOMATCH);
3758 }
3759 GETCHARINCTEST(c, eptr);
3760 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3761 MRRETURN(MATCH_NOMATCH);
3762 }
3763 break;
3764
3765 case PT_PC:
3766 for (i = 1; i <= min; i++)
3767 {
3768 if (eptr >= md->end_subject)
3769 {
3770 SCHECK_PARTIAL();
3771 MRRETURN(MATCH_NOMATCH);
3772 }
3773 GETCHARINCTEST(c, eptr);
3774 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3775 MRRETURN(MATCH_NOMATCH);
3776 }
3777 break;
3778
3779 case PT_SC:
3780 for (i = 1; i <= min; i++)
3781 {
3782 if (eptr >= md->end_subject)
3783 {
3784 SCHECK_PARTIAL();
3785 MRRETURN(MATCH_NOMATCH);
3786 }
3787 GETCHARINCTEST(c, eptr);
3788 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3789 MRRETURN(MATCH_NOMATCH);
3790 }
3791 break;
3792
3793 case PT_ALNUM:
3794 for (i = 1; i <= min; i++)
3795 {
3796 int category;
3797 if (eptr >= md->end_subject)
3798 {
3799 SCHECK_PARTIAL();
3800 MRRETURN(MATCH_NOMATCH);
3801 }
3802 GETCHARINCTEST(c, eptr);
3803 category = UCD_CATEGORY(c);
3804 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3805 MRRETURN(MATCH_NOMATCH);
3806 }
3807 break;
3808
3809 case PT_SPACE: /* Perl space */
3810 for (i = 1; i <= min; i++)
3811 {
3812 if (eptr >= md->end_subject)
3813 {
3814 SCHECK_PARTIAL();
3815 MRRETURN(MATCH_NOMATCH);
3816 }
3817 GETCHARINCTEST(c, eptr);
3818 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3819 c == CHAR_FF || c == CHAR_CR)
3820 == prop_fail_result)
3821 MRRETURN(MATCH_NOMATCH);
3822 }
3823 break;
3824
3825 case PT_PXSPACE: /* POSIX space */
3826 for (i = 1; i <= min; i++)
3827 {
3828 if (eptr >= md->end_subject)
3829 {
3830 SCHECK_PARTIAL();
3831 MRRETURN(MATCH_NOMATCH);
3832 }
3833 GETCHARINCTEST(c, eptr);
3834 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3835 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3836 == prop_fail_result)
3837 MRRETURN(MATCH_NOMATCH);
3838 }
3839 break;
3840
3841 case PT_WORD:
3842 for (i = 1; i <= min; i++)
3843 {
3844 int category;
3845 if (eptr >= md->end_subject)
3846 {
3847 SCHECK_PARTIAL();
3848 MRRETURN(MATCH_NOMATCH);
3849 }
3850 GETCHARINCTEST(c, eptr);
3851 category = UCD_CATEGORY(c);
3852 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
3853 == prop_fail_result)
3854 MRRETURN(MATCH_NOMATCH);
3855 }
3856 break;
3857
3858 /* This should not occur */
3859
3860 default:
3861 RRETURN(PCRE_ERROR_INTERNAL);
3862 }
3863 }
3864
3865 /* Match extended Unicode sequences. We will get here only if the
3866 support is in the binary; otherwise a compile-time error occurs. */
3867
3868 else if (ctype == OP_EXTUNI)
3869 {
3870 for (i = 1; i <= min; i++)
3871 {
3872 if (eptr >= md->end_subject)
3873 {
3874 SCHECK_PARTIAL();
3875 MRRETURN(MATCH_NOMATCH);
3876 }
3877 GETCHARINCTEST(c, eptr);
3878 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
3879 while (eptr < md->end_subject)
3880 {
3881 int len = 1;
3882 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
3883 if (UCD_CATEGORY(c) != ucp_M) break;
3884 eptr += len;
3885 }
3886 }
3887 }
3888
3889 else
3890 #endif /* SUPPORT_UCP */
3891
3892 /* Handle all other cases when the coding is UTF-8 */
3893
3894 #ifdef SUPPORT_UTF8
3895 if (utf8) switch(ctype)
3896 {
3897 case OP_ANY:
3898 for (i = 1; i <= min; i++)
3899 {
3900 if (eptr >= md->end_subject)
3901 {
3902 SCHECK_PARTIAL();
3903 MRRETURN(MATCH_NOMATCH);
3904 }
3905 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3906 eptr++;
3907 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3908 }
3909 break;
3910
3911 case OP_ALLANY:
3912 for (i = 1; i <= min; i++)
3913 {
3914 if (eptr >= md->end_subject)
3915 {
3916 SCHECK_PARTIAL();
3917 MRRETURN(MATCH_NOMATCH);
3918 }
3919 eptr++;
3920 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3921 }
3922 break;
3923
3924 case OP_ANYBYTE:
3925 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3926 eptr += min;
3927 break;
3928
3929 case OP_ANYNL:
3930 for (i = 1; i <= min; i++)
3931 {
3932 if (eptr >= md->end_subject)
3933 {
3934 SCHECK_PARTIAL();
3935 MRRETURN(MATCH_NOMATCH);
3936 }
3937 GETCHARINC(c, eptr);
3938 switch(c)
3939 {
3940 default: MRRETURN(MATCH_NOMATCH);
3941
3942 case 0x000d:
3943 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3944 break;
3945
3946 case 0x000a:
3947 break;
3948
3949 case 0x000b:
3950 case 0x000c:
3951 case 0x0085:
3952 case 0x2028:
3953 case 0x2029:
3954 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3955 break;
3956 }
3957 }
3958 break;
3959
3960 case OP_NOT_HSPACE:
3961 for (i = 1; i <= min; i++)
3962 {
3963 if (eptr >= md->end_subject)
3964 {
3965 SCHECK_PARTIAL();
3966 MRRETURN(MATCH_NOMATCH);
3967 }
3968 GETCHARINC(c, eptr);
3969 switch(c)
3970 {
3971 default: break;
3972 case 0x09: /* HT */
3973 case 0x20: /* SPACE */
3974 case 0xa0: /* NBSP */
3975 case 0x1680: /* OGHAM SPACE MARK */
3976 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3977 case 0x2000: /* EN QUAD */
3978 case 0x2001: /* EM QUAD */
3979 case 0x2002: /* EN SPACE */
3980 case 0x2003: /* EM SPACE */
3981 case 0x2004: /* THREE-PER-EM SPACE */
3982 case 0x2005: /* FOUR-PER-EM SPACE */
3983 case 0x2006: /* SIX-PER-EM SPACE */
3984 case 0x2007: /* FIGURE SPACE */
3985 case 0x2008: /* PUNCTUATION SPACE */
3986 case 0x2009: /* THIN SPACE */
3987 case 0x200A: /* HAIR SPACE */
3988 case 0x202f: /* NARROW NO-BREAK SPACE */
3989 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3990 case 0x3000: /* IDEOGRAPHIC SPACE */
3991 MRRETURN(MATCH_NOMATCH);
3992 }
3993 }
3994 break;
3995
3996 case OP_HSPACE:
3997 for (i = 1; i <= min; i++)
3998 {
3999 if (eptr >= md->end_subject)
4000 {
4001 SCHECK_PARTIAL();
4002 MRRETURN(MATCH_NOMATCH);
4003 }
4004 GETCHARINC(c, eptr);
4005 switch(c)
4006 {
4007 default: MRRETURN(MATCH_NOMATCH);
4008 case 0x09: /* HT */
4009 case 0x20: /* SPACE */
4010 case 0xa0: /* NBSP */
4011 case 0x1680: /* OGHAM SPACE MARK */
4012 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4013 case 0x2000: /* EN QUAD */
4014 case 0x2001: /* EM QUAD */
4015 case 0x2002: /* EN SPACE */
4016 case 0x2003: /* EM SPACE */
4017 case 0x2004: /* THREE-PER-EM SPACE */
4018 case 0x2005: /* FOUR-PER-EM SPACE */
4019 case 0x2006: /* SIX-PER-EM SPACE */
4020 case 0x2007: /* FIGURE SPACE */
4021 case 0x2008: /* PUNCTUATION SPACE */
4022 case 0x2009: /* THIN SPACE */
4023 case 0x200A: /* HAIR SPACE */
4024 case 0x202f: /* NARROW NO-BREAK SPACE */
4025 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4026 case 0x3000: /* IDEOGRAPHIC SPACE */
4027 break;
4028 }
4029 }
4030 break;
4031
4032 case OP_NOT_VSPACE:
4033 for (i = 1; i <= min; i++)
4034 {
4035 if (eptr >= md->end_subject)
4036 {
4037 SCHECK_PARTIAL();
4038 MRRETURN(MATCH_NOMATCH);
4039 }
4040 GETCHARINC(c, eptr);
4041 switch(c)
4042 {
4043 default: break;
4044 case 0x0a: /* LF */
4045 case 0x0b: /* VT */
4046 case 0x0c: /* FF */
4047 case 0x0d: /* CR */
4048 case 0x85: /* NEL */
4049 case 0x2028: /* LINE SEPARATOR */
4050 case 0x2029: /* PARAGRAPH SEPARATOR */
4051 MRRETURN(MATCH_NOMATCH);
4052 }
4053 }
4054 break;
4055
4056 case OP_VSPACE:
4057 for (i = 1; i <= min; i++)
4058 {
4059 if (eptr >= md->end_subject)
4060 {
4061 SCHECK_PARTIAL();
4062 MRRETURN(MATCH_NOMATCH);
4063 }
4064 GETCHARINC(c, eptr);
4065 switch(c)
4066 {
4067 default: MRRETURN(MATCH_NOMATCH);
4068 case 0x0a: /* LF */
4069 case 0x0b: /* VT */
4070 case 0x0c: /* FF */
4071 case 0x0d: /* CR */
4072 case 0x85: /* NEL */
4073 case 0x2028: /* LINE SEPARATOR */
4074 case 0x2029: /* PARAGRAPH SEPARATOR */
4075 break;
4076 }
4077 }
4078 break;
4079
4080 case OP_NOT_DIGIT:
4081 for (i = 1; i <= min; i++)
4082 {
4083 if (eptr >= md->end_subject)
4084 {
4085 SCHECK_PARTIAL();
4086 MRRETURN(MATCH_NOMATCH);
4087 }
4088 GETCHARINC(c, eptr);
4089 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4090 MRRETURN(MATCH_NOMATCH);
4091 }
4092 break;
4093
4094 case OP_DIGIT:
4095 for (i = 1; i <= min; i++)
4096 {
4097 if (eptr >= md->end_subject)
4098 {
4099 SCHECK_PARTIAL();
4100 MRRETURN(MATCH_NOMATCH);
4101 }
4102 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4103 MRRETURN(MATCH_NOMATCH);
4104 /* No need to skip more bytes - we know it's a 1-byte character */
4105 }
4106 break;
4107
4108 case OP_NOT_WHITESPACE:
4109 for (i = 1; i <= min; i++)
4110 {
4111 if (eptr >= md->end_subject)
4112 {
4113 SCHECK_PARTIAL();
4114 MRRETURN(MATCH_NOMATCH);
4115 }
4116 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4117 MRRETURN(MATCH_NOMATCH);
4118 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4119 }
4120 break;
4121
4122 case OP_WHITESPACE:
4123 for (i = 1; i <= min; i++)
4124 {
4125 if (eptr >= md->end_subject)
4126 {
4127 SCHECK_PARTIAL();
4128 MRRETURN(MATCH_NOMATCH);
4129 }
4130 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4131 MRRETURN(MATCH_NOMATCH);
4132 /* No need to skip more bytes - we know it's a 1-byte character */
4133 }
4134 break;
4135
4136 case OP_NOT_WORDCHAR:
4137 for (i = 1; i <= min; i++)
4138 {
4139 if (eptr >= md->end_subject)
4140 {
4141 SCHECK_PARTIAL();
4142 MRRETURN(MATCH_NOMATCH);
4143 }
4144 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4145 MRRETURN(MATCH_NOMATCH);
4146 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4147 }
4148 break;
4149
4150 case OP_WORDCHAR:
4151 for (i = 1; i <= min; i++)
4152 {
4153 if (eptr >= md->end_subject)
4154 {
4155 SCHECK_PARTIAL();
4156 MRRETURN(MATCH_NOMATCH);
4157 }
4158 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4159 MRRETURN(MATCH_NOMATCH);
4160 /* No need to skip more bytes - we know it's a 1-byte character */
4161 }
4162 break;
4163
4164 default:
4165 RRETURN(PCRE_ERROR_INTERNAL);
4166 } /* End switch(ctype) */
4167
4168 else
4169 #endif /* SUPPORT_UTF8 */
4170
4171 /* Code for the non-UTF-8 case for minimum matching of operators other
4172 than OP_PROP and OP_NOTPROP. */
4173
4174 switch(ctype)
4175 {
4176 case OP_ANY:
4177 for (i = 1; i <= min; i++)
4178 {
4179 if (eptr >= md->end_subject)
4180 {
4181 SCHECK_PARTIAL();
4182 MRRETURN(MATCH_NOMATCH);
4183 }
4184 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4185 eptr++;
4186 }
4187 break;
4188
4189 case OP_ALLANY:
4190 if (eptr > md->end_subject - min)
4191 {
4192 SCHECK_PARTIAL();
4193 MRRETURN(MATCH_NOMATCH);
4194 }
4195 eptr += min;
4196 break;
4197
4198 case OP_ANYBYTE:
4199 if (eptr > md->end_subject - min)
4200 {
4201 SCHECK_PARTIAL();
4202 MRRETURN(MATCH_NOMATCH);
4203 }
4204 eptr += min;
4205 break;
4206
4207 case OP_ANYNL:
4208 for (i = 1; i <= min; i++)
4209 {
4210 if (eptr >= md->end_subject)
4211 {
4212 SCHECK_PARTIAL();
4213 MRRETURN(MATCH_NOMATCH);
4214 }
4215 switch(*eptr++)
4216 {
4217 default: MRRETURN(MATCH_NOMATCH);
4218
4219 case 0x000d:
4220 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4221 break;
4222
4223 case 0x000a:
4224 break;
4225
4226 case 0x000b:
4227 case 0x000c:
4228 case 0x0085:
4229 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4230 break;
4231 }
4232 }
4233 break;
4234
4235 case OP_NOT_HSPACE:
4236 for (i = 1; i <= min; i++)
4237 {
4238 if (eptr >= md->end_subject)
4239 {
4240 SCHECK_PARTIAL();
4241 MRRETURN(MATCH_NOMATCH);
4242 }
4243 switch(*eptr++)
4244 {
4245 default: break;
4246 case 0x09: /* HT */
4247 case 0x20: /* SPACE */
4248 case 0xa0: /* NBSP */
4249 MRRETURN(MATCH_NOMATCH);
4250 }
4251 }
4252 break;
4253
4254 case OP_HSPACE:
4255 for (i = 1; i <= min; i++)
4256 {
4257 if (eptr >= md->end_subject)
4258 {
4259 SCHECK_PARTIAL();
4260 MRRETURN(MATCH_NOMATCH);
4261 }
4262 switch(*eptr++)
4263 {
4264 default: MRRETURN(MATCH_NOMATCH);
4265 case 0x09: /* HT */
4266 case 0x20: /* SPACE */
4267 case 0xa0: /* NBSP */
4268 break;
4269 }
4270 }
4271 break;
4272
4273 case OP_NOT_VSPACE:
4274 for (i = 1; i <= min; i++)
4275 {
4276 if (eptr >= md->end_subject)
4277 {
4278 SCHECK_PARTIAL();
4279 MRRETURN(MATCH_NOMATCH);
4280 }
4281 switch(*eptr++)
4282 {
4283 default: break;
4284 case 0x0a: /* LF */
4285 case 0x0b: /* VT */
4286 case 0x0c: /* FF */
4287 case 0x0d: /* CR */
4288 case 0x85: /* NEL */
4289 MRRETURN(MATCH_NOMATCH);
4290 }
4291 }
4292 break;
4293
4294 case OP_VSPACE:
4295 for (i = 1; i <= min; i++)
4296 {
4297 if (eptr >= md->end_subject)
4298 {
4299 SCHECK_PARTIAL();
4300 MRRETURN(MATCH_NOMATCH);
4301 }
4302 switch(*eptr++)
4303 {
4304 default: MRRETURN(MATCH_NOMATCH);
4305 case 0x0a: /* LF */
4306 case 0x0b: /* VT */
4307 case 0x0c: /* FF */
4308 case 0x0d: /* CR */
4309 case 0x85: /* NEL */
4310 break;
4311 }
4312 }
4313 break;
4314
4315 case OP_NOT_DIGIT:
4316 for (i = 1; i <= min; i++)
4317 {
4318 if (eptr >= md->end_subject)
4319 {
4320 SCHECK_PARTIAL();
4321 MRRETURN(MATCH_NOMATCH);
4322 }
4323 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4324 }
4325 break;
4326
4327 case OP_DIGIT:
4328 for (i = 1; i <= min; i++)
4329 {
4330 if (eptr >= md->end_subject)
4331 {
4332 SCHECK_PARTIAL();
4333 MRRETURN(MATCH_NOMATCH);
4334 }
4335 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4336 }
4337 break;
4338
4339 case OP_NOT_WHITESPACE:
4340 for (i = 1; i <= min; i++)
4341 {
4342 if (eptr >= md->end_subject)
4343 {
4344 SCHECK_PARTIAL();
4345 MRRETURN(MATCH_NOMATCH);
4346 }
4347 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4348 }
4349 break;
4350
4351 case OP_WHITESPACE:
4352 for (i = 1; i <= min; i++)
4353 {
4354 if (eptr >= md->end_subject)
4355 {
4356 SCHECK_PARTIAL();
4357 MRRETURN(MATCH_NOMATCH);
4358 }
4359 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4360 }
4361 break;
4362
4363 case OP_NOT_WORDCHAR:
4364 for (i = 1; i <= min; i++)
4365 {
4366 if (eptr >= md->end_subject)
4367 {
4368 SCHECK_PARTIAL();
4369 MRRETURN(MATCH_NOMATCH);
4370 }
4371 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4372 MRRETURN(MATCH_NOMATCH);
4373 }
4374 break;
4375
4376 case OP_WORDCHAR:
4377 for (i = 1; i <= min; i++)
4378 {
4379 if (eptr >= md->end_subject)
4380 {
4381 SCHECK_PARTIAL();
4382 MRRETURN(MATCH_NOMATCH);
4383 }
4384 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4385 MRRETURN(MATCH_NOMATCH);
4386 }
4387 break;
4388
4389 default:
4390 RRETURN(PCRE_ERROR_INTERNAL);
4391 }
4392 }
4393
4394 /* If min = max, continue at the same level without recursing */
4395
4396 if (min == max) continue;
4397
4398 /* If minimizing, we have to test the rest of the pattern before each
4399 subsequent match. Again, separate the UTF-8 case for speed, and also
4400 separate the UCP cases. */
4401
4402 if (minimize)
4403 {
4404 #ifdef SUPPORT_UCP
4405 if (prop_type >= 0)
4406 {
4407 switch(prop_type)
4408 {
4409 case PT_ANY:
4410 for (fi = min;; fi++)
4411 {
4412 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4413 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4414 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4415 if (eptr >= md->end_subject)
4416 {
4417 SCHECK_PARTIAL();
4418 MRRETURN(MATCH_NOMATCH);
4419 }
4420 GETCHARINCTEST(c, eptr);
4421 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4422 }
4423 /* Control never gets here */
4424
4425 case PT_LAMP:
4426 for (fi = min;; fi++)
4427 {
4428 int chartype;
4429 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4430 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4431 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4432 if (eptr >= md->end_subject)
4433 {
4434 SCHECK_PARTIAL();
4435 MRRETURN(MATCH_NOMATCH);
4436 }
4437 GETCHARINCTEST(c, eptr);
4438 chartype = UCD_CHARTYPE(c);
4439 if ((chartype == ucp_Lu ||
4440 chartype == ucp_Ll ||
4441 chartype == ucp_Lt) == prop_fail_result)
4442 MRRETURN(MATCH_NOMATCH);
4443 }
4444 /* Control never gets here */
4445
4446 case PT_GC:
4447 for (fi = min;; fi++)
4448 {
4449 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4450 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4451 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4452 if (eptr >= md->end_subject)
4453 {
4454 SCHECK_PARTIAL();
4455 MRRETURN(MATCH_NOMATCH);
4456 }
4457 GETCHARINCTEST(c, eptr);
4458 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4459 MRRETURN(MATCH_NOMATCH);
4460 }
4461 /* Control never gets here */
4462
4463 case PT_PC:
4464 for (fi = min;; fi++)
4465 {
4466 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4467 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4468 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4469 if (eptr >= md->end_subject)
4470 {
4471 SCHECK_PARTIAL();
4472 MRRETURN(MATCH_NOMATCH);
4473 }
4474 GETCHARINCTEST(c, eptr);
4475 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4476 MRRETURN(MATCH_NOMATCH);
4477 }
4478 /* Control never gets here */
4479
4480 case PT_SC:
4481 for (fi = min;; fi++)
4482 {
4483 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4484 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4485 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4486 if (eptr >= md->end_subject)
4487 {
4488 SCHECK_PARTIAL();
4489 MRRETURN(MATCH_NOMATCH);
4490 }
4491 GETCHARINCTEST(c, eptr);
4492 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4493 MRRETURN(MATCH_NOMATCH);
4494 }
4495 /* Control never gets here */
4496
4497 case PT_ALNUM:
4498 for (fi = min;; fi++)
4499 {
4500 int category;
4501 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4502 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4503 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4504 if (eptr >= md->end_subject)
4505 {
4506 SCHECK_PARTIAL();
4507 MRRETURN(MATCH_NOMATCH);
4508 }
4509 GETCHARINCTEST(c, eptr);
4510 category = UCD_CATEGORY(c);
4511 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4512 MRRETURN(MATCH_NOMATCH);
4513 }
4514 /* Control never gets here */
4515
4516 case PT_SPACE: /* Perl space */
4517 for (fi = min;; fi++)
4518 {
4519 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4520 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4521 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4522 if (eptr >= md->end_subject)
4523 {
4524 SCHECK_PARTIAL();
4525 MRRETURN(MATCH_NOMATCH);
4526 }
4527 GETCHARINCTEST(c, eptr);
4528 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4529 c == CHAR_FF || c == CHAR_CR)
4530 == prop_fail_result)
4531 MRRETURN(MATCH_NOMATCH);
4532 }
4533 /* Control never gets here */
4534
4535 case PT_PXSPACE: /* POSIX space */
4536 for (fi = min;; fi++)
4537 {
4538 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4539 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4540 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4541 if (eptr >= md->end_subject)
4542 {
4543 SCHECK_PARTIAL();
4544 MRRETURN(MATCH_NOMATCH);
4545 }
4546 GETCHARINCTEST(c, eptr);
4547 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4548 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4549 == prop_fail_result)
4550 MRRETURN(MATCH_NOMATCH);
4551 }
4552 /* Control never gets here */
4553
4554 case PT_WORD:
4555 for (fi = min;; fi++)
4556 {
4557 int category;
4558 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4559 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4560 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4561 if (eptr >= md->end_subject)
4562 {
4563 SCHECK_PARTIAL();
4564 MRRETURN(MATCH_NOMATCH);
4565 }
4566 GETCHARINCTEST(c, eptr);
4567 category = UCD_CATEGORY(c);
4568 if ((category == ucp_L ||
4569 category == ucp_N ||
4570 c == CHAR_UNDERSCORE)
4571 == prop_fail_result)
4572 MRRETURN(MATCH_NOMATCH);
4573 }
4574 /* Control never gets here */
4575
4576 /* This should never occur */
4577
4578 default:
4579 RRETURN(PCRE_ERROR_INTERNAL);
4580 }
4581 }
4582
4583 /* Match extended Unicode sequences. We will get here only if the
4584 support is in the binary; otherwise a compile-time error occurs. */
4585
4586 else if (ctype == OP_EXTUNI)
4587 {
4588 for (fi = min;; fi++)
4589 {
4590 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4591 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4592 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4593 if (eptr >= md->end_subject)
4594 {
4595 SCHECK_PARTIAL();
4596 MRRETURN(MATCH_NOMATCH);
4597 }
4598 GETCHARINCTEST(c, eptr);
4599 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4600 while (eptr < md->end_subject)
4601 {
4602 int len = 1;
4603 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4604 if (UCD_CATEGORY(c) != ucp_M) break;
4605 eptr += len;
4606 }
4607 }
4608 }
4609 else
4610 #endif /* SUPPORT_UCP */
4611
4612 #ifdef SUPPORT_UTF8
4613 /* UTF-8 mode */
4614 if (utf8)
4615 {
4616 for (fi = min;; fi++)
4617 {
4618 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4619 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4620 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4621 if (eptr >= md->end_subject)
4622 {
4623 SCHECK_PARTIAL();
4624 MRRETURN(MATCH_NOMATCH);
4625 }
4626 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4627 MRRETURN(MATCH_NOMATCH);
4628 GETCHARINC(c, eptr);
4629 switch(ctype)
4630 {
4631 case OP_ANY: /* This is the non-NL case */
4632 case OP_ALLANY:
4633 case OP_ANYBYTE:
4634 break;
4635
4636 case OP_ANYNL:
4637 switch(c)
4638 {
4639 default: MRRETURN(MATCH_NOMATCH);
4640 case 0x000d:
4641 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4642 break;
4643 case 0x000a:
4644 break;
4645
4646 case 0x000b:
4647 case 0x000c:
4648 case 0x0085:
4649 case 0x2028:
4650 case 0x2029:
4651 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4652 break;
4653 }
4654 break;
4655
4656 case OP_NOT_HSPACE:
4657 switch(c)
4658 {
4659 default: break;
4660 case 0x09: /* HT */
4661 case 0x20: /* SPACE */
4662 case 0xa0: /* NBSP */
4663 case 0x1680: /* OGHAM SPACE MARK */
4664 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4665 case 0x2000: /* EN QUAD */
4666 case 0x2001: /* EM QUAD */
4667 case 0x2002: /* EN SPACE */
4668 case 0x2003: /* EM SPACE */
4669 case 0x2004: /* THREE-PER-EM SPACE */
4670 case 0x2005: /* FOUR-PER-EM SPACE */
4671 case 0x2006: /* SIX-PER-EM SPACE */
4672 case 0x2007: /* FIGURE SPACE */
4673 case 0x2008: /* PUNCTUATION SPACE */
4674 case 0x2009: /* THIN SPACE */
4675 case 0x200A: /* HAIR SPACE */
4676 case 0x202f: /* NARROW NO-BREAK SPACE */
4677 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4678 case 0x3000: /* IDEOGRAPHIC SPACE */
4679 MRRETURN(MATCH_NOMATCH);
4680 }
4681 break;
4682
4683 case OP_HSPACE:
4684 switch(c)
4685 {
4686 default: MRRETURN(MATCH_NOMATCH);
4687 case 0x09: /* HT */
4688 case 0x20: /* SPACE */
4689 case 0xa0: /* NBSP */
4690 case 0x1680: /* OGHAM SPACE MARK */
4691 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4692 case 0x2000: /* EN QUAD */
4693 case 0x2001: /* EM QUAD */
4694 case 0x2002: /* EN SPACE */
4695 case 0x2003: /* EM SPACE */
4696 case 0x2004: /* THREE-PER-EM SPACE */
4697 case 0x2005: /* FOUR-PER-EM SPACE */
4698 case 0x2006: /* SIX-PER-EM SPACE */
4699 case 0x2007: /* FIGURE SPACE */
4700 case 0x2008: /* PUNCTUATION SPACE */
4701 case 0x2009: /* THIN SPACE */
4702 case 0x200A: /* HAIR SPACE */
4703 case 0x202f: /* NARROW NO-BREAK SPACE */
4704 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4705 case 0x3000: /* IDEOGRAPHIC SPACE */
4706 break;
4707 }
4708 break;
4709
4710 case OP_NOT_VSPACE:
4711 switch(c)
4712 {
4713 default: break;
4714 case 0x0a: /* LF */
4715 case 0x0b: /* VT */
4716 case 0x0c: /* FF */
4717 case 0x0d: /* CR */
4718 case 0x85: /* NEL */
4719 case 0x2028: /* LINE SEPARATOR */
4720 case 0x2029: /* PARAGRAPH SEPARATOR */
4721 MRRETURN(MATCH_NOMATCH);
4722 }
4723 break;
4724
4725 case OP_VSPACE:
4726 switch(c)
4727 {
4728 default: MRRETURN(MATCH_NOMATCH);
4729 case 0x0a: /* LF */
4730 case 0x0b: /* VT */
4731 case 0x0c: /* FF */
4732 case 0x0d: /* CR */
4733 case 0x85: /* NEL */
4734 case 0x2028: /* LINE SEPARATOR */
4735 case 0x2029: /* PARAGRAPH SEPARATOR */
4736 break;
4737 }
4738 break;
4739
4740 case OP_NOT_DIGIT:
4741 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4742 MRRETURN(MATCH_NOMATCH);
4743 break;
4744
4745 case OP_DIGIT:
4746 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4747 MRRETURN(MATCH_NOMATCH);
4748 break;
4749
4750 case OP_NOT_WHITESPACE:
4751 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4752 MRRETURN(MATCH_NOMATCH);
4753 break;
4754
4755 case OP_WHITESPACE:
4756 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4757 MRRETURN(MATCH_NOMATCH);
4758 break;
4759
4760 case OP_NOT_WORDCHAR:
4761 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4762 MRRETURN(MATCH_NOMATCH);
4763 break;
4764
4765 case OP_WORDCHAR:
4766 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4767 MRRETURN(MATCH_NOMATCH);
4768 break;
4769
4770 default:
4771 RRETURN(PCRE_ERROR_INTERNAL);
4772 }
4773 }
4774 }
4775 else
4776 #endif
4777 /* Not UTF-8 mode */
4778 {
4779 for (fi = min;; fi++)
4780 {
4781 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4782 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4783 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4784 if (eptr >= md->end_subject)
4785 {
4786 SCHECK_PARTIAL();
4787 MRRETURN(MATCH_NOMATCH);
4788 }
4789 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4790 MRRETURN(MATCH_NOMATCH);
4791 c = *eptr++;
4792 switch(ctype)
4793 {
4794 case OP_ANY: /* This is the non-NL case */
4795 case OP_ALLANY:
4796 case OP_ANYBYTE:
4797 break;
4798
4799 case OP_ANYNL:
4800 switch(c)
4801 {
4802 default: MRRETURN(MATCH_NOMATCH);
4803 case 0x000d:
4804 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4805 break;
4806
4807 case 0x000a:
4808 break;
4809
4810 case 0x000b:
4811 case 0x000c:
4812 case 0x0085:
4813 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4814 break;
4815 }
4816 break;
4817
4818 case OP_NOT_HSPACE:
4819 switch(c)
4820 {
4821 default: break;
4822 case 0x09: /* HT */
4823 case 0x20: /* SPACE */
4824 case 0xa0: /* NBSP */
4825 MRRETURN(MATCH_NOMATCH);
4826 }
4827 break;
4828
4829 case OP_HSPACE:
4830 switch(c)
4831 {
4832 default: MRRETURN(MATCH_NOMATCH);
4833 case 0x09: /* HT */
4834 case 0x20: /* SPACE */
4835 case 0xa0: /* NBSP */
4836 break;
4837 }
4838 break;
4839
4840 case OP_NOT_VSPACE:
4841 switch(c)
4842 {
4843 default: break;
4844 case 0x0a: /* LF */
4845 case 0x0b: /* VT */
4846 case 0x0c: /* FF */
4847 case 0x0d: /* CR */
4848 case 0x85: /* NEL */
4849 MRRETURN(MATCH_NOMATCH);
4850 }
4851 break;
4852
4853 case OP_VSPACE:
4854 switch(c)
4855 {
4856 default: MRRETURN(MATCH_NOMATCH);
4857 case 0x0a: /* LF */
4858 case 0x0b: /* VT */
4859 case 0x0c: /* FF */
4860 case 0x0d: /* CR */
4861 case 0x85: /* NEL */
4862 break;
4863 }
4864 break;
4865
4866 case OP_NOT_DIGIT:
4867 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4868 break;
4869
4870 case OP_DIGIT:
4871 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4872 break;
4873
4874 case OP_NOT_WHITESPACE:
4875 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4876 break;
4877
4878 case OP_WHITESPACE:
4879 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4880 break;
4881
4882 case OP_NOT_WORDCHAR:
4883 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4884 break;
4885
4886 case OP_WORDCHAR:
4887 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4888 break;
4889
4890 default:
4891 RRETURN(PCRE_ERROR_INTERNAL);
4892 }
4893 }
4894 }
4895 /* Control never gets here */
4896 }
4897
4898 /* If maximizing, it is worth using inline code for speed, doing the type
4899 test once at the start (i.e. keep it out of the loop). Again, keep the
4900 UTF-8 and UCP stuff separate. */
4901
4902 else
4903 {
4904 pp = eptr; /* Remember where we started */
4905
4906 #ifdef SUPPORT_UCP
4907 if (prop_type >= 0)
4908 {
4909 switch(prop_type)
4910 {
4911 case PT_ANY:
4912 for (i = min; i < max; i++)
4913 {
4914 int len = 1;
4915 if (eptr >= md->end_subject)
4916 {
4917 SCHECK_PARTIAL();
4918 break;
4919 }
4920 GETCHARLENTEST(c, eptr, len);
4921 if (prop_fail_result) break;
4922 eptr+= len;
4923 }
4924 break;
4925
4926 case PT_LAMP:
4927 for (i = min; i < max; i++)
4928 {
4929 int chartype;
4930 int len = 1;
4931 if (eptr >= md->end_subject)
4932 {
4933 SCHECK_PARTIAL();
4934 break;
4935 }
4936 GETCHARLENTEST(c, eptr, len);
4937 chartype = UCD_CHARTYPE(c);
4938 if ((chartype == ucp_Lu ||
4939 chartype == ucp_Ll ||
4940 chartype == ucp_Lt) == prop_fail_result)
4941 break;
4942 eptr+= len;
4943 }
4944 break;
4945
4946 case PT_GC:
4947 for (i = min; i < max; i++)
4948 {
4949 int len = 1;
4950 if (eptr >= md->end_subject)
4951 {
4952 SCHECK_PARTIAL();
4953 break;
4954 }
4955 GETCHARLENTEST(c, eptr, len);
4956 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
4957 eptr+= len;
4958 }
4959 break;
4960
4961 case PT_PC:
4962 for (i = min; i < max; i++)
4963 {
4964 int len = 1;
4965 if (eptr >= md->end_subject)
4966 {
4967 SCHECK_PARTIAL();
4968 break;
4969 }
4970 GETCHARLENTEST(c, eptr, len);
4971 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
4972 eptr+= len;
4973 }
4974 break;
4975
4976 case PT_SC:
4977 for (i = min; i < max; i++)
4978 {
4979 int len = 1;
4980 if (eptr >= md->end_subject)
4981 {
4982 SCHECK_PARTIAL();
4983 break;
4984 }
4985 GETCHARLENTEST(c, eptr, len);
4986 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
4987 eptr+= len;
4988 }
4989 break;
4990
4991 case PT_ALNUM:
4992 for (i = min; i < max; i++)
4993 {
4994 int category;
4995 int len = 1;
4996 if (eptr >= md->end_subject)
4997 {
4998 SCHECK_PARTIAL();
4999 break;
5000 }
5001 GETCHARLENTEST(c, eptr, len);
5002 category = UCD_CATEGORY(c);
5003 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5004 break;
5005 eptr+= len;
5006 }
5007 break;
5008
5009 case PT_SPACE: /* Perl space */
5010 for (i = min; i < max; i++)
5011 {
5012 int len = 1;
5013 if (eptr >= md->end_subject)
5014 {
5015 SCHECK_PARTIAL();
5016 break;
5017 }
5018 GETCHARLENTEST(c, eptr, len);
5019 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5020 c == CHAR_FF || c == CHAR_CR)
5021 == prop_fail_result)
5022 break;
5023 eptr+= len;
5024 }
5025 break;
5026
5027 case PT_PXSPACE: /* POSIX space */
5028 for (i = min; i < max; i++)
5029 {
5030 int len = 1;
5031 if (eptr >= md->end_subject)
5032 {
5033 SCHECK_PARTIAL();
5034 break;
5035 }
5036 GETCHARLENTEST(c, eptr, len);
5037 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5038 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5039 == prop_fail_result)
5040 break;
5041 eptr+= len;
5042 }
5043 break;
5044
5045 case PT_WORD:
5046 for (i = min; i < max; i++)
5047 {
5048 int category;
5049 int len = 1;
5050 if (eptr >= md->end_subject)
5051 {
5052 SCHECK_PARTIAL();
5053 break;
5054 }
5055 GETCHARLENTEST(c, eptr, len);
5056 category = UCD_CATEGORY(c);
5057 if ((category == ucp_L || category == ucp_N ||
5058 c == CHAR_UNDERSCORE) == prop_fail_result)
5059 break;
5060 eptr+= len;
5061 }
5062 break;
5063
5064 default:
5065 RRETURN(PCRE_ERROR_INTERNAL);
5066 }
5067
5068 /* eptr is now past the end of the maximum run */
5069
5070 if (possessive) continue;
5071 for(;;)
5072 {
5073 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5074 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5075 if (eptr-- == pp) break; /* Stop if tried at original pos */
5076 if (utf8) BACKCHAR(eptr);
5077 }
5078 }
5079
5080 /* Match extended Unicode sequences. We will get here only if the
5081 support is in the binary; otherwise a compile-time error occurs. */
5082
5083 else if (ctype == OP_EXTUNI)
5084 {
5085 for (i = min; i < max; i++)
5086 {
5087 int len = 1;
5088 if (eptr >= md->end_subject)
5089 {
5090 SCHECK_PARTIAL();
5091 break;
5092 }
5093 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5094 if (UCD_CATEGORY(c) == ucp_M) break;
5095 eptr += len;
5096 while (eptr < md->end_subject)
5097 {
5098 len = 1;
5099 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5100 if (UCD_CATEGORY(c) != ucp_M) break;
5101 eptr += len;
5102 }
5103 }
5104
5105 /* eptr is now past the end of the maximum run */
5106
5107 if (possessive) continue;
5108
5109 for(;;)
5110 {
5111 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5112 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5113 if (eptr-- == pp) break; /* Stop if tried at original pos */
5114 for (;;) /* Move back over one extended */
5115 {
5116 if (!utf8) c = *eptr; else
5117 {
5118 BACKCHAR(eptr);
5119 GETCHAR(c, eptr);
5120 }
5121 if (UCD_CATEGORY(c) != ucp_M) break;
5122 eptr--;
5123 }
5124 }
5125 }
5126
5127 else
5128 #endif /* SUPPORT_UCP */
5129
5130 #ifdef SUPPORT_UTF8
5131 /* UTF-8 mode */
5132
5133 if (utf8)
5134 {
5135 switch(ctype)
5136 {
5137 case OP_ANY:
5138 if (max < INT_MAX)
5139 {
5140 for (i = min; i < max; i++)
5141 {
5142 if (eptr >= md->end_subject)
5143 {
5144 SCHECK_PARTIAL();
5145 break;
5146 }
5147 if (IS_NEWLINE(eptr)) break;
5148 eptr++;
5149 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5150 }
5151 }
5152
5153 /* Handle unlimited UTF-8 repeat */
5154
5155 else
5156 {
5157 for (i = min; i < max; i++)
5158 {
5159 if (eptr >= md->end_subject)
5160 {
5161 SCHECK_PARTIAL();
5162 break;
5163 }
5164 if (IS_NEWLINE(eptr)) break;
5165 eptr++;
5166 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5167 }
5168 }
5169 break;
5170
5171 case OP_ALLANY:
5172 if (max < INT_MAX)
5173 {
5174 for (i = min; i < max; i++)
5175 {
5176 if (eptr >= md->end_subject)
5177 {
5178 SCHECK_PARTIAL();
5179 break;
5180 }
5181 eptr++;
5182 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5183 }
5184 }
5185 else
5186 {
5187 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5188 SCHECK_PARTIAL();
5189 }
5190 break;
5191
5192 /* The byte case is the same as non-UTF8 */
5193
5194 case OP_ANYBYTE:
5195 c = max - min;
5196 if (c > (unsigned int)(md->end_subject - eptr))
5197 {
5198 eptr = md->end_subject;
5199 SCHECK_PARTIAL();
5200 }
5201 else eptr += c;
5202 break;
5203
5204 case OP_ANYNL:
5205 for (i = min; i < max; i++)
5206 {
5207 int len = 1;
5208 if (eptr >= md->end_subject)
5209 {
5210 SCHECK_PARTIAL();
5211 break;
5212 }
5213 GETCHARLEN(c, eptr, len);
5214 if (c == 0x000d)
5215 {
5216 if (++eptr >= md->end_subject) break;
5217 if (*eptr == 0x000a) eptr++;
5218 }
5219 else
5220 {
5221 if (c != 0x000a &&
5222 (md->bsr_anycrlf ||
5223 (c != 0x000b && c != 0x000c &&
5224 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5225 break;
5226 eptr += len;
5227 }
5228 }
5229 break;
5230
5231 case OP_NOT_HSPACE:
5232 case OP_HSPACE:
5233 for (i = min; i < max; i++)
5234 {
5235 BOOL gotspace;
5236 int len = 1;
5237 if (eptr >= md->end_subject)
5238 {
5239 SCHECK_PARTIAL();
5240 break;
5241 }
5242 GETCHARLEN(c, eptr, len);
5243 switch(c)
5244 {
5245 default: gotspace = FALSE; break;
5246 case 0x09: /* HT */
5247 case 0x20: /* SPACE */
5248 case 0xa0: /* NBSP */
5249 case 0x1680: /* OGHAM SPACE MARK */
5250 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5251 case 0x2000: /* EN QUAD */
5252 case 0x2001: /* EM QUAD */
5253 case 0x2002: /* EN SPACE */
5254 case 0x2003: /* EM SPACE */
5255 case 0x2004: /* THREE-PER-EM SPACE */
5256 case 0x2005: /* FOUR-PER-EM SPACE */
5257 case 0x2006: /* SIX-PER-EM SPACE */
5258 case 0x2007: /* FIGURE SPACE */
5259 case 0x2008: /* PUNCTUATION SPACE */
5260 case 0x2009: /* THIN SPACE */
5261 case 0x200A: /* HAIR SPACE */
5262 case 0x202f: /* NARROW NO-BREAK SPACE */
5263 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5264 case 0x3000: /* IDEOGRAPHIC SPACE */
5265 gotspace = TRUE;
5266 break;
5267 }
5268 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5269 eptr += len;
5270 }
5271 break;
5272
5273 case OP_NOT_VSPACE:
5274 case OP_VSPACE:
5275 for (i = min; i < max; i++)
5276 {
5277 BOOL gotspace;
5278 int len = 1;
5279 if (eptr >= md->end_subject)
5280 {
5281 SCHECK_PARTIAL();
5282 break;
5283 }
5284 GETCHARLEN(c, eptr, len);
5285 switch(c)
5286 {
5287 default: gotspace = FALSE; break;
5288 case 0x0a: /* LF */
5289 case 0x0b: /* VT */
5290 case 0x0c: /* FF */
5291 case 0x0d: /* CR */
5292 case 0x85: /* NEL */
5293 case 0x2028: /* LINE SEPARATOR */
5294 case 0x2029: /* PARAGRAPH SEPARATOR */
5295 gotspace = TRUE;
5296 break;
5297 }
5298 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5299 eptr += len;
5300 }
5301 break;
5302
5303 case OP_NOT_DIGIT:
5304 for (i = min; i < max; i++)
5305 {
5306 int len = 1;
5307 if (eptr >= md->end_subject)
5308 {
5309 SCHECK_PARTIAL();
5310 break;
5311 }
5312 GETCHARLEN(c, eptr, len);
5313 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5314 eptr+= len;
5315 }
5316 break;
5317
5318 case OP_DIGIT:
5319 for (i = min; i < max; i++)
5320 {
5321 int len = 1;
5322 if (eptr >= md->end_subject)
5323 {
5324 SCHECK_PARTIAL();
5325 break;
5326 }
5327 GETCHARLEN(c, eptr, len);
5328 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5329 eptr+= len;
5330 }
5331 break;
5332
5333 case OP_NOT_WHITESPACE:
5334 for (i = min; i < max; i++)
5335 {
5336 int len = 1;
5337 if (eptr >= md->end_subject)
5338 {
5339 SCHECK_PARTIAL();
5340 break;
5341 }
5342 GETCHARLEN(c, eptr, len);
5343 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5344 eptr+= len;
5345 }
5346 break;
5347
5348 case OP_WHITESPACE:
5349 for (i = min; i < max; i++)
5350 {
5351 int len = 1;
5352 if (eptr >= md->end_subject)
5353 {
5354 SCHECK_PARTIAL();
5355 break;
5356 }
5357 GETCHARLEN(c, eptr, len);
5358 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5359 eptr+= len;
5360 }
5361 break;
5362
5363 case OP_NOT_WORDCHAR:
5364 for (i = min; i < max; i++)
5365 {
5366 int len = 1;
5367 if (eptr >= md->end_subject)
5368 {
5369 SCHECK_PARTIAL();
5370 break;
5371 }
5372 GETCHARLEN(c, eptr, len);
5373 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5374 eptr+= len;
5375 }
5376 break;
5377
5378 case OP_WORDCHAR:
5379 for (i = min; i < max; i++)
5380 {
5381 int len = 1;
5382 if (eptr >= md->end_subject)
5383 {
5384 SCHECK_PARTIAL();
5385 break;
5386 }
5387 GETCHARLEN(c, eptr, len);
5388 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5389 eptr+= len;
5390 }
5391 break;
5392
5393 default:
5394 RRETURN(PCRE_ERROR_INTERNAL);
5395 }
5396
5397 /* eptr is now past the end of the maximum run. If possessive, we are
5398 done (no backing up). Otherwise, match at this position; anything other
5399 than no match is immediately returned. For nomatch, back up one
5400 character, unless we are matching \R and the last thing matched was
5401 \r\n, in which case, back up two bytes. */
5402
5403 if (possessive) continue;
5404 for(;;)
5405 {
5406 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5407 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5408 if (eptr-- == pp) break; /* Stop if tried at original pos */
5409 BACKCHAR(eptr);
5410 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5411 eptr[-1] == '\r') eptr--;
5412 }
5413 }
5414 else
5415 #endif /* SUPPORT_UTF8 */
5416
5417 /* Not UTF-8 mode */
5418 {
5419 switch(ctype)
5420 {
5421 case OP_ANY:
5422 for (i = min; i < max; i++)
5423 {
5424 if (eptr >= md->end_subject)
5425 {
5426 SCHECK_PARTIAL();
5427 break;
5428 }
5429 if (IS_NEWLINE(eptr)) break;
5430 eptr++;
5431 }
5432 break;
5433
5434 case OP_ALLANY:
5435 case OP_ANYBYTE:
5436 c = max - min;
5437 if (c > (unsigned int)(md->end_subject - eptr))
5438 {
5439 eptr = md->end_subject;
5440 SCHECK_PARTIAL();
5441 }
5442 else eptr += c;
5443 break;
5444
5445 case OP_ANYNL:
5446 for (i = min; i < max; i++)
5447 {
5448 if (eptr >= md->end_subject)
5449 {
5450 SCHECK_PARTIAL();
5451 break;
5452 }
5453 c = *eptr;
5454 if (c == 0x000d)
5455 {
5456 if (++eptr >= md->end_subject) break;
5457 if (*eptr == 0x000a) eptr++;
5458 }
5459 else
5460 {
5461 if (c != 0x000a &&
5462 (md->bsr_anycrlf ||
5463 (c != 0x000b && c != 0x000c && c != 0x0085)))
5464 break;
5465 eptr++;
5466 }
5467 }
5468 break;
5469
5470 case OP_NOT_HSPACE:
5471 for (i = min; i < max; i++)
5472 {
5473 if (eptr >= md->end_subject)
5474 {
5475 SCHECK_PARTIAL();
5476 break;
5477 }
5478 c = *eptr;
5479 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5480 eptr++;
5481 }
5482 break;
5483
5484 case OP_HSPACE:
5485 for (i = min; i < max; i++)
5486 {
5487 if (eptr >= md->end_subject)
5488 {
5489 SCHECK_PARTIAL();
5490 break;
5491 }
5492 c = *eptr;
5493 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5494 eptr++;
5495 }
5496 break;
5497
5498 case OP_NOT_VSPACE:
5499 for (i = min; i < max; i++)
5500 {
5501 if (eptr >= md->end_subject)
5502 {
5503 SCHECK_PARTIAL();
5504 break;
5505 }
5506 c = *eptr;
5507 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5508 break;
5509 eptr++;
5510 }
5511 break;
5512
5513 case OP_VSPACE:
5514 for (i = min; i < max; i++)
5515 {
5516 if (eptr >= md->end_subject)
5517 {
5518 SCHECK_PARTIAL();
5519 break;
5520 }
5521 c = *eptr;
5522 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5523 break;
5524 eptr++;
5525 }
5526 break;
5527
5528 case OP_NOT_DIGIT:
5529 for (i = min; i < max; i++)
5530 {
5531 if (eptr >= md->end_subject)
5532 {
5533 SCHECK_PARTIAL();
5534 break;
5535 }
5536 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5537 eptr++;
5538 }
5539 break;
5540
5541 case OP_DIGIT:
5542 for (i = min; i < max; i++)
5543 {
5544 if (eptr >= md->end_subject)
5545 {
5546 SCHECK_PARTIAL();
5547 break;
5548 }
5549 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5550 eptr++;
5551 }
5552 break;
5553
5554 case OP_NOT_WHITESPACE:
5555 for (i = min; i < max; i++)
5556 {
5557 if (eptr >= md->end_subject)
5558 {
5559 SCHECK_PARTIAL();
5560 break;
5561 }
5562 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5563 eptr++;
5564 }
5565 break;
5566
5567 case OP_WHITESPACE:
5568 for (i = min; i < max; i++)
5569 {
5570 if (eptr >= md->end_subject)
5571 {
5572 SCHECK_PARTIAL();
5573 break;
5574 }
5575 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5576 eptr++;
5577 }
5578 break;
5579
5580 case OP_NOT_WORDCHAR:
5581 for (i = min; i < max; i++)
5582 {
5583 if (eptr >= md->end_subject)
5584 {
5585 SCHECK_PARTIAL();
5586 break;
5587 }
5588 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5589 eptr++;
5590 }
5591 break;
5592
5593 case OP_WORDCHAR:
5594 for (i = min; i < max; i++)
5595 {
5596 if (eptr >= md->end_subject)
5597 {
5598 SCHECK_PARTIAL();
5599 break;
5600 }
5601 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5602 eptr++;
5603 }
5604 break;
5605
5606 default:
5607 RRETURN(PCRE_ERROR_INTERNAL);
5608 }
5609
5610 /* eptr is now past the end of the maximum run. If possessive, we are
5611 done (no backing up). Otherwise, match at this position; anything other
5612 than no match is immediately returned. For nomatch, back up one
5613 character (byte), unless we are matching \R and the last thing matched
5614 was \r\n, in which case, back up two bytes. */
5615
5616 if (possessive) continue;
5617 while (eptr >= pp)
5618 {
5619 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5620 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5621 eptr--;
5622 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5623 eptr[-1] == '\r') eptr--;
5624 }
5625 }
5626
5627 /* Get here if we can't make it match with any permitted repetitions */
5628
5629 MRRETURN(MATCH_NOMATCH);
5630 }
5631 /* Control never gets here */
5632
5633 /* There's been some horrible disaster. Arrival here can only mean there is
5634 something seriously wrong in the code above or the OP_xxx definitions. */
5635
5636 default:
5637 DPRINTF(("Unknown opcode %d\n", *ecode));
5638 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5639 }
5640
5641 /* Do not stick any code in here without much thought; it is assumed
5642 that "continue" in the code above comes out to here to repeat the main
5643 loop. */
5644
5645 } /* End of main loop */
5646 /* Control never reaches here */
5647
5648
5649 /* When compiling to use the heap rather than the stack for recursive calls to
5650 match(), the RRETURN() macro jumps here. The number that is saved in
5651 frame->Xwhere indicates which label we actually want to return to. */
5652
5653 #ifdef NO_RECURSE
5654 #define LBL(val) case val: goto L_RM##val;
5655 HEAP_RETURN:
5656 switch (frame->Xwhere)
5657 {
5658 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5659 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5660 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5661 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5662 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
5663 #ifdef SUPPORT_UTF8
5664 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5665 LBL(32) LBL(34) LBL(42) LBL(46)
5666 #ifdef SUPPORT_UCP
5667 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5668 LBL(59) LBL(60) LBL(61) LBL(62)
5669 #endif /* SUPPORT_UCP */
5670 #endif /* SUPPORT_UTF8 */
5671 default:
5672 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5673 return PCRE_ERROR_INTERNAL;
5674 }
5675 #undef LBL
5676 #endif /* NO_RECURSE */
5677 }
5678
5679
5680 /***************************************************************************
5681 ****************************************************************************
5682 RECURSION IN THE match() FUNCTION
5683
5684 Undefine all the macros that were defined above to handle this. */
5685
5686 #ifdef NO_RECURSE /* Names below are macros only in the heap-frame (NO_RECURSE) build */
5687 #undef eptr /* NOTE(review): presumably each name aliased a field of the current frame - confirm against the #defines earlier in the file */
5688 #undef ecode
5689 #undef mstart
5690 #undef offset_top
5691 #undef eptrb
5692 #undef flags
5693
5694 #undef callpat
5695 #undef charptr
5696 #undef data
5697 #undef next
5698 #undef pp
5699 #undef prev
5700 #undef saved_eptr
5701
5702 #undef new_recursive
5703
5704 #undef cur_is_word
5705 #undef condition
5706 #undef prev_is_word
5707
5708 #undef ctype
5709 #undef length
5710 #undef max
5711 #undef min
5712 #undef number
5713 #undef offset
5714 #undef op
5715 #undef save_capture_last
5716 #undef save_offset1
5717 #undef save_offset2
5718 #undef save_offset3
5719 #undef stacksave
5720
5721 #undef newptrb
5722
5723 #endif /* NO_RECURSE */
5724
5725 /* fc and fi are defined as macros both with and without NO_RECURSE, so they are undefined unconditionally, outside the #ifdef */
5726
5727 #undef fc
5728 #undef fi
5729
5730 /***************************************************************************
5731 ***************************************************************************/
5732
5733
5734
5735 /*************************************************
5736 * Execute a Regular Expression *
5737 *************************************************/
5738
5739 /* This function applies a compiled re to a subject string and picks out
5740 portions of the string if it matches. Two elements in the vector are set for
5741 each substring: the offsets to the start and end of the substring.
5742
5743 Arguments:
5744 argument_re points to the compiled expression
5745 extra_data points to extra data or is NULL
5746 subject points to the subject string
5747 length length of subject string (may contain binary zeros)
5748 start_offset where to start in the subject string
5749 options option bits
5750 offsets points to a vector of ints to be filled in with offsets
5751 offsetcount the number of elements in the vector
5752
5753 Returns: > 0 => success; value is the number of elements filled in
5754 = 0 => success, but offsets is not big enough
5755 -1 => failed to match
5756 < -1 => some kind of unexpected problem
5757 */
5758
5759 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5760 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5761 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5762 int offsetcount)
5763 {
5764 int rc, ocount;
5765 int first_byte = -1;
5766 int req_byte = -1;
5767 int req_byte2 = -1;
5768 int newline;
5769 BOOL using_temporary_offsets = FALSE;
5770 BOOL anchored;
5771 BOOL startline;
5772 BOOL firstline;
5773 BOOL first_byte_caseless = FALSE;
5774 BOOL req_byte_caseless = FALSE;
5775 BOOL utf8;
5776 match_data match_block;
5777 match_data *md = &match_block;
5778 const uschar *tables;
5779 const uschar *start_bits = NULL;
5780 USPTR start_match = (USPTR)subject + start_offset;
5781 USPTR end_subject;
5782 USPTR start_partial = NULL;
5783 USPTR req_byte_ptr = start_match - 1;
5784
5785 pcre_study_data internal_study;
5786 const pcre_study_data *study;
5787
5788 real_pcre internal_re;
5789 const real_pcre *external_re = (const real_pcre *)argument_re;
5790 const real_pcre *re = external_re;
5791
5792 /* Plausibility checks */
5793
5794 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5795 if (re == NULL || subject == NULL ||
5796 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5797 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5798 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5799
5800 /* This information is for finding all the numbers associated with a given
5801 name, for condition testing. */
5802
5803 md->name_table = (uschar *)re + re->name_table_offset;
5804 md->name_count = re->name_count;
5805 md->name_entry_size = re->name_entry_size;
5806
5807 /* Fish out the optional data from the extra_data structure, first setting
5808 the default values. */
5809
5810 study = NULL;
5811 md->match_limit = MATCH_LIMIT;
5812 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5813 md->callout_data = NULL;
5814
5815 /* The table pointer is always in native byte order. */
5816
5817 tables = external_re->tables;
5818
5819 if (extra_data != NULL)
5820 {
5821 register unsigned int flags = extra_data->flags;
5822 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5823 study = (const pcre_study_data *)extra_data->study_data;
5824 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5825 md->match_limit = extra_data->match_limit;
5826 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5827 md->match_limit_recursion = extra_data->match_limit_recursion;
5828 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5829 md->callout_data = extra_data->callout_data;
5830 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5831 }
5832
5833 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5834 is a feature that makes it possible to save compiled regex and re-use them
5835 in other programs later. */
5836
5837 if (tables == NULL) tables = _pcre_default_tables;
5838
5839 /* Check that the first field in the block is the magic number. If it is not,
5840 test for a regex that was compiled on a host of opposite endianness. If this is
5841 the case, flipped values are put in internal_re, and also in internal_study if
5842 study data was supplied. */
5843
5844 if (re->magic_number != MAGIC_NUMBER)
5845 {
5846 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5847 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5848 if (study != NULL) study = &internal_study;
5849 }
5850
5851 /* Set up other data */
5852
5853 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5854 startline = (re->flags & PCRE_STARTLINE) != 0;
5855 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5856
5857 /* The code starts after the real_pcre block and the capture name table. */
5858
5859 md->start_code = (const uschar *)external_re + re->name_table_offset +
5860 re->name_count * re->name_entry_size;
5861
5862 md->start_subject = (USPTR)subject;
5863 md->start_offset = start_offset;
5864 md->end_subject = md->start_subject + length;
5865 end_subject = md->end_subject;
5866
5867 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5868 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5869 md->use_ucp = (re->options & PCRE_UCP) != 0;
5870 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5871
5872 /* Some options are unpacked into BOOL variables in the hope that testing
5873 them will be faster than individual option bits. */
5874
5875 md->notbol = (options & PCRE_NOTBOL) != 0;
5876 md->noteol = (options & PCRE_NOTEOL) != 0;
5877 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5878 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5879 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5880 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5881
5882
5883 md->hitend = FALSE;
5884 md->mark = NULL; /* In case never set */
5885
5886 md->recursive = NULL; /* No recursion at top level */
5887
5888 md->lcc = tables + lcc_offset;
5889 md->ctypes = tables + ctypes_offset;
5890
5891 /* Handle different \R options. */
5892
5893 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5894 {
5895 case 0:
5896 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5897 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5898 else
5899 #ifdef BSR_ANYCRLF
5900 md->bsr_anycrlf = TRUE;
5901 #else
5902 md->bsr_anycrlf = FALSE;
5903 #endif
5904 break;
5905
5906 case PCRE_BSR_ANYCRLF:
5907 md->bsr_anycrlf = TRUE;
5908 break;
5909
5910 case PCRE_BSR_UNICODE:
5911 md->bsr_anycrlf = FALSE;
5912 break;
5913
5914 default: return PCRE_ERROR_BADNEWLINE;
5915 }
5916
5917 /* Handle different types of newline. The three bits give eight cases. If
5918 nothing is set at run time, whatever was used at compile time applies. */
5919
5920 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5921 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5922 {
5923 case 0: newline = NEWLINE; break; /* Compile-time default */
5924 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5925 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5926 case PCRE_NEWLINE_CR+
5927 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5928 case PCRE_NEWLINE_ANY: newline = -1; break;
5929 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5930 default: return PCRE_ERROR_BADNEWLINE;
5931 }
5932
5933 if (newline == -2)
5934 {
5935 md->nltype = NLTYPE_ANYCRLF;
5936 }
5937 else if (newline < 0)
5938 {
5939 md->nltype = NLTYPE_ANY;
5940 }
5941 else
5942 {
5943 md->nltype = NLTYPE_FIXED;
5944 if (newline > 255)
5945 {
5946 md->nllen = 2;
5947 md->nl[0] = (newline >> 8) & 255;
5948 md->nl[1] = newline & 255;
5949 }
5950 else
5951 {
5952 md->nllen = 1;
5953 md->nl[0] = newline;
5954 }
5955 }
5956
5957 /* Partial matching was originally supported only for a restricted set of
5958 regexes; from release 8.00 there are no restrictions, but the bits are still
5959 defined (though never set). So there's no harm in leaving this code. */
5960
5961 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5962 return PCRE_ERROR_BADPARTIAL;
5963
5964 /* Check a UTF-8 string if required. Pass back the character offset and error
5965 code for an invalid string if a results vector is available. */
5966
5967 #ifdef SUPPORT_UTF8
5968 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5969 {
5970 int erroroffset;
5971 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5972 if (errorcode != 0)
5973 {
5974 if (offsetcount >= 2)
5975 {
5976 offsets[0] = erroroffset;
5977 offsets[1] = errorcode;
5978 }
5979 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5980 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5981 }
5982
5983 /* Check that a start_offset points to the start of a UTF-8 character. */
5984
5985 if (start_offset > 0 && start_offset < length &&
5986 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
5987 return PCRE_ERROR_BADUTF8_OFFSET;
5988 }
5989 #endif
5990
5991 /* If the expression has got more back references than the offsets supplied can
5992 hold, we get a temporary chunk of working store to use during the matching.
5993 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5994 of 3. */
5995
5996 ocount = offsetcount - (offsetcount % 3);
5997
5998 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5999 {
6000 ocount = re->top_backref * 3 + 3;
6001 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6002 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6003 using_temporary_offsets = TRUE;
6004 DPRINTF(("Got memory to hold back references\n"));
6005 }
6006 else md->offset_vector = offsets;
6007
6008 md->offset_end = ocount;
6009 md->offset_max = (2*ocount)/3;
6010 md->offset_overflow = FALSE;
6011 md->capture_last = -1;
6012
6013 /* Reset the working variable associated with each extraction. These should
6014 never be used unless previously set, but they get saved and restored, and so we
6015 initialize them to avoid reading uninitialized locations. Also, unset the
6016 offsets for the matched string. This is really just for tidiness with callouts,
6017 in case they inspect these fields. */
6018
6019 if (md->offset_vector != NULL)
6020 {
6021 register int *iptr = md->offset_vector + ocount;
6022 register int *iend = iptr - re->top_bracket;
6023 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6024 while (--iptr >= iend) *iptr = -1;
6025 md->offset_vector[0] = md->offset_vector[1] = -1;
6026 }
6027
6028 /* Set up the first character to match, if available. The first_byte value is
6029 never set for an anchored regular expression, but the anchoring may be forced
6030 at run time, so we have to test for anchoring. The first char may be unset for
6031 an unanchored pattern, of course. If there's no first char and the pattern was
6032 studied, there may be a bitmap of possible first characters. */
6033
6034 if (!anchored)
6035 {
6036 if ((re->flags & PCRE_FIRSTSET) != 0)
6037 {
6038 first_byte = re->first_byte & 255;
6039 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6040 first_byte = md->lcc[first_byte];
6041 }
6042 else
6043 if (!startline && study != NULL &&
6044 (study->flags & PCRE_STUDY_MAPPED) != 0)
6045 start_bits = study->start_bits;
6046 }
6047
6048 /* For anchored or unanchored matches, there may be a "last known required
6049 character" set. */
6050
6051 if ((re->flags & PCRE_REQCHSET) != 0)
6052 {
6053 req_byte = re->req_byte & 255;
6054 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6055 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6056 }
6057
6058
6059
6060
6061 /* ==========================================================================*/
6062
6063 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6064 the loop runs just once. */
6065
6066 for(;;)
6067 {
6068 USPTR save_end_subject = end_subject;
6069 USPTR new_start_match;
6070
6071 /* If firstline is TRUE, the start of the match is constrained to the first
6072 line of a multiline string. That is, the match must be before or at the first
6073 newline. Implement this by temporarily adjusting end_subject so that we stop
6074 scanning at a newline. If the match fails at the newline, later code breaks
6075 this loop. */
6076
6077 if (firstline)
6078 {
6079 USPTR t = start_match;
6080 #ifdef SUPPORT_UTF8
6081 if (utf8)
6082 {
6083 while (t < md->end_subject && !IS_NEWLINE(t))
6084 {
6085 t++;
6086 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6087 }
6088 }
6089 else
6090 #endif
6091 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6092 end_subject = t;
6093 }
6094
6095 /* There are some optimizations that avoid running the match if a known
6096 starting point is not found, or if a known later character is not present.
6097 However, there is an option that disables these, for testing and for ensuring
6098 that all callouts do actually occur. The option can be set in the regex by
6099 (*NO_START_OPT) or passed in match-time options. */
6100
6101 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6102 {
6103 /* Advance to a unique first byte if there is one. */
6104
6105 if (first_byte >= 0)
6106 {
6107 if (first_byte_caseless)
6108 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6109 start_match++;
6110 else
6111 while (start_match < end_subject && *start_match != first_byte)
6112 start_match++;
6113 }
6114
6115 /* Or to just after a linebreak for a multiline match */
6116
6117 else if (startline)
6118 {
6119 if (start_match > md->start_subject + start_offset)
6120 {
6121 #ifdef SUPPORT_UTF8
6122 if (utf8)
6123 {
6124 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6125 {
6126 start_match++;
6127 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6128 start_match++;
6129 }
6130 }
6131 else
6132 #endif
6133 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6134 start_match++;
6135
6136 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6137 and we are now at a LF, advance the match position by one more character.
6138 */
6139
6140 if (start_match[-1] == CHAR_CR &&
6141 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6142 start_match < end_subject &&
6143 *start_match == CHAR_NL)
6144 start_match++;
6145 }
6146 }
6147
6148 /* Or to a non-unique first byte after study */
6149
6150 else if (start_bits != NULL)
6151 {
6152 while (start_match < end_subject)
6153 {
6154 register unsigned int c = *start_match;
6155 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6156 {
6157 start_match++;
6158 #ifdef SUPPORT_UTF8
6159 if (utf8)
6160 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6161 start_match++;
6162 #endif
6163 }
6164 else break;
6165 }
6166 }
6167 } /* Starting optimizations */
6168
6169 /* Restore fudged end_subject */
6170
6171 end_subject = save_end_subject;
6172
6173 /* The following two optimizations are disabled for partial matching or if
6174 disabling is explicitly requested. */
6175
6176 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6177 {
6178 /* If the pattern was studied, a minimum subject length may be set. This is
6179 only a lower bound; there may be no string of exactly that length that
6180 matches the pattern. Although the value is, strictly, in characters, we treat
6181 it as bytes to avoid spending too much time in this optimization. */
6182
6183 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6184 (pcre_uint32)(end_subject - start_match) < study->minlength)
6185 {
6186 rc = MATCH_NOMATCH;
6187 break;
6188 }
6189
6190 /* If req_byte is set, we know that that character must appear in the
6191 subject for the match to succeed. If the first character is set, req_byte
6192 must be later in the subject; otherwise the test starts at the match point.
6193 This optimization can save a huge amount of backtracking in patterns with
6194 nested unlimited repeats that aren't going to match. Writing separate code
6195 for cased/caseless versions makes it go faster, as does using an
6196 autoincrement and backing off on a match.
6197
6198 HOWEVER: when the subject string is very, very long, searching to its end
6199 can take a long time, and give bad performance on quite ordinary patterns.
6200 This showed up when somebody was matching something like /^\d+C/ on a
6201 32-megabyte string... so we don't do this when the string is sufficiently
6202 long. */
6203
6204 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6205 {
6206 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6207
6208 /* We don't need to repeat the search if we haven't yet reached the
6209 place we found it at last time. */
6210
6211 if (p > req_byte_ptr)
6212 {
6213 if (req_byte_caseless)
6214 {
6215 while (p < end_subject)
6216 {
6217 register int pp = *p++;
6218 if (pp == req_byte || pp == req_byte2) { p--; break; }
6219 }
6220 }
6221 else
6222 {
6223 while (p < end_subject)
6224 {
6225 if (*p++ == req_byte) { p--; break; }
6226 }
6227 }
6228
6229 /* If we can't find the required character, break the matching loop,
6230 forcing a match failure. */
6231
6232 if (p >= end_subject)
6233 {
6234 rc = MATCH_NOMATCH;
6235 break;
6236 }
6237
6238 /* If we have found the required character, save the point where we
6239 found it, so that we don't search again next time round the loop if
6240 the start hasn't passed this character yet. */
6241
6242 req_byte_ptr = p;
6243 }
6244 }
6245 }
6246
6247 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6248 printf(">>>> Match against: ");
6249 pchars(start_match, end_subject - start_match, TRUE, md);
6250 printf("\n");
6251 #endif
6252
6253 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6254 first starting point for which a partial match was found. */
6255
6256 md->start_match_ptr = start_match;
6257 md->start_used_ptr = start_match;
6258 md->match_call_count = 0;
6259 md->match_function_type = 0;
6260 md->end_offset_top = 0;
6261 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6262 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6263
6264 switch(rc)
6265 {
6266 /* SKIP passes back the next starting point explicitly, but if it is the
6267 same as the match we have just done, treat it as NOMATCH. */
6268
6269 case MATCH_SKIP:
6270 if (md->start_match_ptr != start_match)
6271 {
6272 new_start_match = md->start_match_ptr;
6273 break;
6274 }
6275 /* Fall through */
6276
6277 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6278 the SKIP's arg was not found. We also treat this as NOMATCH. */
6279
6280 case MATCH_SKIP_ARG:
6281 /* Fall through */
6282
6283 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6284 exactly like PRUNE. */
6285
6286 case MATCH_NOMATCH:
6287 case MATCH_PRUNE:
6288 case MATCH_THEN:
6289 new_start_match = start_match + 1;
6290 #ifdef SUPPORT_UTF8
6291 if (utf8)
6292 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6293 new_start_match++;
6294 #endif
6295 break;
6296
6297 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6298
6299 case MATCH_COMMIT:
6300 rc = MATCH_NOMATCH;
6301 goto ENDLOOP;
6302
6303 /* Any other return is either a match, or some kind of error. */
6304
6305 default:
6306 goto ENDLOOP;
6307 }
6308
6309 /* Control reaches here for the various types of "no match at this point"
6310 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6311
6312 rc = MATCH_NOMATCH;
6313
6314 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6315 newline in the subject (though it may continue over the newline). Therefore,
6316 if we have just failed to match, starting at a newline, do not continue. */
6317
6318 if (firstline && IS_NEWLINE(start_match)) break;
6319
6320 /* Advance to new matching position */
6321
6322 start_match = new_start_match;
6323
6324 /* Break the loop if the pattern is anchored or if we have passed the end of
6325 the subject. */
6326
6327 if (anchored || start_match > end_subject) break;
6328
6329 /* If we have just passed a CR and we are now at a LF, and the pattern does
6330 not contain any explicit matches for \r or \n, and the newline option is CRLF
6331 or ANY or ANYCRLF, advance the match position by one more character. */
6332
6333 if (start_match[-1] == CHAR_CR &&
6334 start_match < end_subject &&
6335 *start_match == CHAR_NL &&
6336 (re->flags & PCRE_HASCRORLF) == 0 &&
6337 (md->nltype == NLTYPE_ANY ||
6338 md->nltype == NLTYPE_ANYCRLF ||
6339 md->nllen == 2))
6340 start_match++;
6341
6342 md->mark = NULL; /* Reset for start of next match attempt */
6343 } /* End of for(;;) "bumpalong" loop */
6344
6345 /* ==========================================================================*/
6346
6347 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6348 conditions is true:
6349
6350 (1) The pattern is anchored or the match was failed by (*COMMIT);
6351
6352 (2) We are past the end of the subject;
6353
6354 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6355 this option requests that a match occur at or before the first newline in
6356 the subject.
6357
6358 When we have a match and the offset vector is big enough to deal with any
6359 backreferences, captured substring offsets will already be set up. In the case
6360 where we had to get some local store to hold offsets for backreference
6361 processing, copy those that we can. In this case there need not be overflow if
6362 certain parts of the pattern were not used, even though there are more
6363 capturing parentheses than vector slots. */
6364
6365 ENDLOOP:
6366
6367 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6368 {
6369 if (using_temporary_offsets)
6370 {
6371 if (offsetcount >= 4)
6372 {
6373 memcpy(offsets + 2, md->offset_vector + 2,
6374 (offsetcount - 2) * sizeof(int));
6375 DPRINTF(("Copied offsets from temporary memory\n"));
6376 }
6377 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6378 DPRINTF(("Freeing temporary memory\n"));
6379 (pcre_free)(md->offset_vector);
6380 }
6381
6382 /* Set the return code to the number of captured strings, or 0 if there are
6383 too many to fit into the vector. */
6384
6385 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6386
6387 /* If there is space in the offset vector, set any unused pairs at the end of
6388 the pattern to -1 for backwards compatibility. It is documented that this
6389 happens. In earlier versions, the whole set of potential capturing offsets
6390 was set to -1 each time round the loop, but this is handled differently now.
6391 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6392 those at the end that need unsetting here. We can't just unset them all at
6393 the start of the whole thing because they may get set in one branch that is
6394 not the final matching branch. */
6395
6396 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6397 {
6398 register int *iptr, *iend;
6399 int resetcount = 2 + re->top_bracket * 2;
6400 if (resetcount > offsetcount) resetcount = ocount;
6401 iptr = offsets + md->end_offset_top;
6402 iend = offsets + resetcount;
6403 while (iptr < iend) *iptr++ = -1;
6404 }
6405
6406 /* If there is space, set up the whole thing as substring 0. The value of
6407 md->start_match_ptr might be modified if \K was encountered on the successful
6408 matching path. */
6409
6410 if (offsetcount < 2) rc = 0; else
6411 {
6412 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6413 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6414 }
6415
6416 DPRINTF((">>>> returning %d\n", rc));
6417 goto RETURN_MARK;
6418 }
6419
6420 /* Control gets here if there has been an error, or if the overall match
6421 attempt has failed at all permitted starting positions. */
6422
6423 if (using_temporary_offsets)
6424 {
6425 DPRINTF(("Freeing temporary memory\n"));
6426 (pcre_free)(md->offset_vector);
6427 }
6428
6429 /* For anything other than nomatch or partial match, just return the code. */
6430
6431 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6432 {
6433 DPRINTF((">>>> error: returning %d\n", rc));
6434 return rc;
6435 }
6436
6437 /* Handle partial matches - disable any mark data */
6438
6439 if (start_partial != NULL)
6440 {
6441 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6442 md->mark = NULL;
6443 if (offsetcount > 1)
6444 {
6445 offsets[0] = (int)(start_partial - (USPTR)subject);
6446 offsets[1] = (int)(end_subject - (USPTR)subject);
6447 }
6448 rc = PCRE_ERROR_PARTIAL;
6449 }
6450
6451 /* This is the classic nomatch case */
6452
6453 else
6454 {
6455 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6456 rc = PCRE_ERROR_NOMATCH;
6457 }
6458
6459 /* Return the MARK data if it has been requested. */
6460
6461 RETURN_MARK:
6462
6463 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6464 *(extra_data->mark) = (unsigned char *)(md->mark);
6465 return rc;
6466 }
6467
6468 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12