/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 626 - (show annotations) (download)
Wed Jul 20 17:51:54 2011 UTC (3 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 193733 byte(s)
Add the /= modifier to pcretest so as to be able to check unset capturing 
parentheses at the ends of patterns.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* This is a convenience macro for code that occurs many times. */
86
87 #define MRRETURN(ra) \
88 { \
89 md->mark = markptr; \
90 RRETURN(ra); \
91 }
92
93 /* Maximum number of ints of offset to save on the stack for recursive calls.
94 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95 because the offset vector is always a multiple of 3 long. */
96
97 #define REC_STACK_SAVE_MAX 30
98
99 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100
101 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Print a sequence of chars in printable format, stopping at the end of the
112 subject if requested.
113
114 Arguments:
115 p points to characters
116 length number to print
117 is_subject TRUE if printing from within md->start_subject
118 md pointer to matching data block, if is_subject is TRUE
119
120 Returns: nothing
121 */
122
123 static void
124 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int c;
127 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128 while (length-- > 0)
129 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 USPTR eptr_start = eptr;
159 register USPTR p = md->start_subject + md->offset_vector[offset];
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF8
185 #ifdef SUPPORT_UCP
186 if (md->utf8)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 USPTR endptr = p + length;
197 while (p < endptr)
198 {
199 int c, d;
200 if (eptr >= md->end_subject) return -1;
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (eptr + length > md->end_subject) return -1;
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (eptr + length > md->end_subject) return -1;
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return eptr - eptr_start;
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
271 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272 below must be updated in sync. */
273
274 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
275 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
276 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
277 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
278 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
279 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
280 RM61, RM62, RM63 };
281
282 /* These versions of the macros use the stack, as normal. There are debugging
283 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 actually used in this definition. */
285
286 #ifndef NO_RECURSE
287 #define REGISTER register
288
289 #ifdef PCRE_DEBUG
290 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 { \
292 printf("match() called in line %d\n", __LINE__); \
293 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 printf("to line %d\n", __LINE__); \
295 }
296 #define RRETURN(ra) \
297 { \
298 printf("match() returned %d from line %d ", ra, __LINE__); \
299 return ra; \
300 }
301 #else
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 #define RRETURN(ra) return ra
305 #endif
306
307 #else
308
309
310 /* These versions of the macros manage a private stack on the heap. Note that
311 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312 argument of match(), which never changes. */
313
314 #define REGISTER
315
316 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 {\
318 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 frame->Xwhere = rw; \
321 newframe->Xeptr = ra;\
322 newframe->Xecode = rb;\
323 newframe->Xmstart = mstart;\
324 newframe->Xmarkptr = markptr;\
325 newframe->Xoffset_top = rc;\
326 newframe->Xeptrb = re;\
327 newframe->Xrdepth = frame->Xrdepth + 1;\
328 newframe->Xprevframe = frame;\
329 frame = newframe;\
330 DPRINTF(("restarting from line %d\n", __LINE__));\
331 goto HEAP_RECURSE;\
332 L_##rw:\
333 DPRINTF(("jumped back to line %d\n", __LINE__));\
334 }
335
336 #define RRETURN(ra)\
337 {\
338 heapframe *oldframe = frame;\
339 frame = oldframe->Xprevframe;\
340 (pcre_stack_free)(oldframe);\
341 if (frame != NULL)\
342 {\
343 rrc = ra;\
344 goto HEAP_RETURN;\
345 }\
346 return ra;\
347 }
348
349
350 /* Structure for remembering the local variables in a private frame */
351
352 typedef struct heapframe {
353 struct heapframe *Xprevframe;
354
355 /* Function arguments that may change */
356
357 USPTR Xeptr;
358 const uschar *Xecode;
359 USPTR Xmstart;
360 USPTR Xmarkptr;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth;
364
365 /* Function local variables */
366
367 USPTR Xcallpat;
368 #ifdef SUPPORT_UTF8
369 USPTR Xcharptr;
370 #endif
371 USPTR Xdata;
372 USPTR Xnext;
373 USPTR Xpp;
374 USPTR Xprev;
375 USPTR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word;
380 BOOL Xcondition;
381 BOOL Xprev_is_word;
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xoclength;
388 uschar Xocchars[8];
389 #endif
390
391 int Xcodelink;
392 int Xctype;
393 unsigned int Xfc;
394 int Xfi;
395 int Xlength;
396 int Xmax;
397 int Xmin;
398 int Xnumber;
399 int Xoffset;
400 int Xop;
401 int Xsave_capture_last;
402 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403 int Xstacksave[REC_STACK_SAVE_MAX];
404
405 eptrblock Xnewptrb;
406
407 /* Where to jump back to */
408
409 int Xwhere;
410
411 } heapframe;
412
413 #endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
429 /* These macros pack up tests that are used for partial matching, and which
430 appear several times in the code. We set the "hit end" flag if the pointer is
431 at the end of the subject and also past the start of the subject (i.e.
432 something has been matched). For hard partial matching, we then return
433 immediately. The second one is used when we already know we are past the end of
434 the subject. */
435
436 #define CHECK_PARTIAL()\
437 if (md->partial != 0 && eptr >= md->end_subject && \
438 eptr > md->start_used_ptr) \
439 { \
440 md->hitend = TRUE; \
441 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 }
443
444 #define SCHECK_PARTIAL()\
445 if (md->partial != 0 && eptr > md->start_used_ptr) \
446 { \
447 md->hitend = TRUE; \
448 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf8, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 markptr pointer to the most recent MARK name, or NULL
463 offset_top current top pointer
464 md pointer to "static" info for the match
465 eptrb pointer to chain of blocks containing eptr at start of
466 brackets - for testing for empty matches
467 rdepth the recursion depth
468
469 Returns: MATCH_MATCH if matched ) these values are >= 0
470 MATCH_NOMATCH if failed to match )
471 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 (e.g. stopped by repeated call or recursion limit)
474 */
475
476 static int
477 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 unsigned int rdepth)
480 {
481 /* These variables do not need to be preserved over recursion in this function,
482 so they can be ordinary variables in all cases. Mark some of them with
483 "register" because they are used a lot in loops. */
484
485 register int rrc; /* Returns from recursive calls */
486 register int i; /* Used for loops not involving calls to RMATCH() */
487 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489
490 BOOL minimize, possessive; /* Quantifier options */
491 BOOL caseless;
492 int condcode;
493
494 /* When recursion is not being used, all "local" variables that have to be
495 preserved over calls to RMATCH() are part of a "frame" which is obtained from
496 heap storage. Set up the top-level frame here; others are obtained from the
497 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498
499 #ifdef NO_RECURSE
500 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 frame->Xprevframe = NULL; /* Marks the top level */
503
504 /* Copy in the original argument variables */
505
506 frame->Xeptr = eptr;
507 frame->Xecode = ecode;
508 frame->Xmstart = mstart;
509 frame->Xmarkptr = markptr;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is where control jumps back to to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define markptr frame->Xmarkptr
524 #define offset_top frame->Xoffset_top
525 #define eptrb frame->Xeptrb
526 #define rdepth frame->Xrdepth
527
528 /* Ditto for the local variables */
529
530 #ifdef SUPPORT_UTF8
531 #define charptr frame->Xcharptr
532 #endif
533 #define callpat frame->Xcallpat
534 #define codelink frame->Xcodelink
535 #define data frame->Xdata
536 #define next frame->Xnext
537 #define pp frame->Xpp
538 #define prev frame->Xprev
539 #define saved_eptr frame->Xsaved_eptr
540
541 #define new_recursive frame->Xnew_recursive
542
543 #define cur_is_word frame->Xcur_is_word
544 #define condition frame->Xcondition
545 #define prev_is_word frame->Xprev_is_word
546
547 #ifdef SUPPORT_UCP
548 #define prop_type frame->Xprop_type
549 #define prop_value frame->Xprop_value
550 #define prop_fail_result frame->Xprop_fail_result
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580 /* Many of the following variables are used only in small blocks of the code.
581 My normal style of coding would have declared them within each of those blocks.
582 However, in order to accommodate the version of this code that uses an external
583 "stack" implemented on the heap, it is easier to declare them all here, so the
584 declarations can be cut out in a block. The only declarations within blocks
585 below are for variables that do not have to be preserved over a recursive call
586 to RMATCH(). */
587
588 #ifdef SUPPORT_UTF8
589 const uschar *charptr;
590 #endif
591 const uschar *callpat;
592 const uschar *data;
593 const uschar *next;
594 USPTR pp;
595 const uschar *prev;
596 USPTR saved_eptr;
597
598 recursion_info new_recursive;
599
600 BOOL cur_is_word;
601 BOOL condition;
602 BOOL prev_is_word;
603
604 #ifdef SUPPORT_UCP
605 int prop_type;
606 int prop_value;
607 int prop_fail_result;
608 int oclength;
609 uschar occhars[8];
610 #endif
611
612 int codelink;
613 int ctype;
614 int length;
615 int max;
616 int min;
617 int number;
618 int offset;
619 int op;
620 int save_capture_last;
621 int save_offset1, save_offset2, save_offset3;
622 int stacksave[REC_STACK_SAVE_MAX];
623
624 eptrblock newptrb;
625 #endif /* NO_RECURSE */
626
627 /* To save space on the stack and in the heap frame, I have doubled up on some
628 of the local variables that are used only in localised parts of the code, but
629 still need to be preserved over recursive calls of match(). These macros define
630 the alternative names that are used. */
631
632 #define allow_zero cur_is_word
633 #define cbegroup condition
634 #define code_offset codelink
635 #define condassert condition
636 #define matched_once prev_is_word
637
638 /* These statements are here to stop the compiler complaining about uninitialized
639 variables. */
640
641 #ifdef SUPPORT_UCP
642 prop_value = 0;
643 prop_fail_result = 0;
644 #endif
645
646
647 /* This label is used for tail recursion, which is used in a few cases even
648 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649 used. Thanks to Ian Taylor for noticing this possibility and sending the
650 original patch. */
651
652 TAIL_RECURSE:
653
654 /* OK, now we can get on with the real code of the function. Recursive calls
655 are specified by the macro RMATCH and RRETURN is used to return. When
656 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 defined). However, RMATCH isn't like a function call because it's quite a
659 complicated macro. It has to be used in one particular way. This shouldn't,
660 however, impact performance when true recursion is being used. */
661
662 #ifdef SUPPORT_UTF8
663 utf8 = md->utf8; /* Local copy of the flag */
664 #else
665 utf8 = FALSE;
666 #endif
667
668 /* First check that we haven't called match() too many times, or that we
669 haven't exceeded the recursive call limit. */
670
671 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673
674 /* At the start of a group with an unlimited repeat that may match an empty
675 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676 done this way to save having to use another function argument, which would take
677 up space on the stack. See also MATCH_CONDASSERT below.
678
679 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680 such remembered pointers, to be checked when we hit the closing ket, in order
681 to break infinite loops that match no characters. When match() is called in
682 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683 NOT be used with tail recursion, because the memory block that is used is on
684 the stack, so a new one may be required for each match(). */
685
686 if (md->match_function_type == MATCH_CBEGROUP)
687 {
688 newptrb.epb_saved_eptr = eptr;
689 newptrb.epb_prev = eptrb;
690 eptrb = &newptrb;
691 md->match_function_type = 0;
692 }
693
694 /* Now start processing the opcodes. */
695
696 for (;;)
697 {
698 minimize = possessive = FALSE;
699 op = *ecode;
700
701 switch(op)
702 {
703 case OP_MARK:
704 markptr = ecode + 2;
705 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 eptrb, RM55);
707
708 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709 argument, and we must check whether that argument matches this MARK's
710 argument. It is passed back in md->start_match_ptr (an overloading of that
711 variable). If it does match, we reset that variable to the current subject
712 position and return MATCH_SKIP. Otherwise, pass back the return code
713 unaltered. */
714
715 if (rrc == MATCH_SKIP_ARG &&
716 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717 {
718 md->start_match_ptr = eptr;
719 RRETURN(MATCH_SKIP);
720 }
721
722 if (md->mark == NULL) md->mark = markptr;
723 RRETURN(rrc);
724
725 case OP_FAIL:
726 MRRETURN(MATCH_NOMATCH);
727
728 /* COMMIT overrides PRUNE, SKIP, and THEN */
729
730 case OP_COMMIT:
731 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 eptrb, RM52);
733 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735 rrc != MATCH_THEN)
736 RRETURN(rrc);
737 MRRETURN(MATCH_COMMIT);
738
739 /* PRUNE overrides THEN */
740
741 case OP_PRUNE:
742 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 eptrb, RM51);
744 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 MRRETURN(MATCH_PRUNE);
746
747 case OP_PRUNE_ARG:
748 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 eptrb, RM56);
750 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 md->mark = ecode + 2;
752 RRETURN(MATCH_PRUNE);
753
754 /* SKIP overrides PRUNE and THEN */
755
756 case OP_SKIP:
757 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 eptrb, RM53);
759 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 RRETURN(rrc);
761 md->start_match_ptr = eptr; /* Pass back current position */
762 MRRETURN(MATCH_SKIP);
763
764 case OP_SKIP_ARG:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 eptrb, RM57);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769
770 /* Pass back the current skip name by overloading md->start_match_ptr and
771 returning the special MATCH_SKIP_ARG return code. This will either be
772 caught by a matching MARK, or get to the top, where it is treated the same
773 as PRUNE. */
774
775 md->start_match_ptr = ecode + 2;
776 RRETURN(MATCH_SKIP_ARG);
777
778 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
779 the alt that is at the start of the current branch. This makes it possible
780 to skip back past alternatives that precede the THEN within the current
781 branch. */
782
783 case OP_THEN:
784 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
785 eptrb, RM54);
786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
787 md->start_match_ptr = ecode - GET(ecode, 1);
788 MRRETURN(MATCH_THEN);
789
790 case OP_THEN_ARG:
791 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
792 offset_top, md, eptrb, RM58);
793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 md->start_match_ptr = ecode - GET(ecode, 1);
795 md->mark = ecode + LINK_SIZE + 2;
796 RRETURN(MATCH_THEN);
797
798 /* Handle a capturing bracket, other than those that are possessive with an
799 unlimited repeat. If there is space in the offset vector, save the current
800 subject position in the working slot at the top of the vector. We mustn't
801 change the current values of the data slot, because they may be set from a
802 previous iteration of this group, and be referred to by a reference inside
803 the group. A failure to match might occur after the group has succeeded,
804 if something later on doesn't match. For this reason, we need to restore
805 the working value and also the values of the final offsets, in case they
806 were set by a previous iteration of the same bracket.
807
808 If there isn't enough space in the offset vector, treat this as if it were
809 a non-capturing bracket. Don't worry about setting the flag for the error
810 case here; that is handled in the code for KET. */
811
812 case OP_CBRA:
813 case OP_SCBRA:
814 number = GET2(ecode, 1+LINK_SIZE);
815 offset = number << 1;
816
817 #ifdef PCRE_DEBUG
818 printf("start bracket %d\n", number);
819 printf("subject=");
820 pchars(eptr, 16, TRUE, md);
821 printf("\n");
822 #endif
823
824 if (offset < md->offset_max)
825 {
826 save_offset1 = md->offset_vector[offset];
827 save_offset2 = md->offset_vector[offset+1];
828 save_offset3 = md->offset_vector[md->offset_end - number];
829 save_capture_last = md->capture_last;
830
831 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
832 md->offset_vector[md->offset_end - number] =
833 (int)(eptr - md->start_subject);
834
835 for (;;)
836 {
837 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
838 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
839 eptrb, RM1);
840 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
841 if (rrc != MATCH_NOMATCH &&
842 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
843 RRETURN(rrc);
844 md->capture_last = save_capture_last;
845 ecode += GET(ecode, 1);
846 if (*ecode != OP_ALT) break;
847 }
848
849 DPRINTF(("bracket %d failed\n", number));
850 md->offset_vector[offset] = save_offset1;
851 md->offset_vector[offset+1] = save_offset2;
852 md->offset_vector[md->offset_end - number] = save_offset3;
853
854 /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or
855 MATCH_THEN. */
856
857 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
858 RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));
859 }
860
861 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
862 as a non-capturing bracket. */
863
864 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
865 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
866
867 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
868
869 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
871
872 /* Non-capturing or atomic group, except for possessive with unlimited
873 repeat. Loop for all the alternatives. When we get to the final alternative
874 within the brackets, we used to return the result of a recursive call to
875 match() whatever happened so it was possible to reduce stack usage by
876 turning this into a tail recursion, except in the case of a possibly empty
877 group. However, now that there is the possibility of (*THEN) occurring in
878 the final alternative, this optimization is no longer possible.
879
880 MATCH_ONCE is returned when the end of an atomic group is successfully
881 reached, but subsequent matching fails. It passes back up the tree (causing
882 captured values to be reset) until the original atomic group level is
883 reached. This is tested by comparing md->once_target with the start of the
884 group. At this point, the return is converted into MATCH_NOMATCH so that
885 previous backup points can be taken. */
886
887 case OP_ONCE:
888 case OP_BRA:
889 case OP_SBRA:
890 DPRINTF(("start non-capturing bracket\n"));
891
892 for (;;)
893 {
894 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
895 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
896 RM2);
897 if (rrc != MATCH_NOMATCH &&
898 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
899 {
900 if (rrc == MATCH_ONCE)
901 {
902 const uschar *scode = ecode;
903 if (*scode != OP_ONCE) /* If not at start, find it */
904 {
905 while (*scode == OP_ALT) scode += GET(scode, 1);
906 scode -= GET(scode, 1);
907 }
908 if (md->once_target == scode) rrc = MATCH_NOMATCH;
909 }
910 RRETURN(rrc);
911 }
912 ecode += GET(ecode, 1);
913 if (*ecode != OP_ALT) break;
914 }
915 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
916 RRETURN(MATCH_NOMATCH);
917
918 /* Handle possessive capturing brackets with an unlimited repeat. We come
919 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
920 handled similarly to the normal case above. However, the matching is
921 different. The end of these brackets will always be OP_KETRPOS, which
922 returns MATCH_KETRPOS without going further in the pattern. By this means
923 we can handle the group by iteration rather than recursion, thereby
924 reducing the amount of stack needed. */
925
926 case OP_CBRAPOS:
927 case OP_SCBRAPOS:
928 allow_zero = FALSE;
929
930 POSSESSIVE_CAPTURE:
931 number = GET2(ecode, 1+LINK_SIZE);
932 offset = number << 1;
933
934 #ifdef PCRE_DEBUG
935 printf("start possessive bracket %d\n", number);
936 printf("subject=");
937 pchars(eptr, 16, TRUE, md);
938 printf("\n");
939 #endif
940
941 if (offset < md->offset_max)
942 {
943 matched_once = FALSE;
944 code_offset = ecode - md->start_code;
945
946 save_offset1 = md->offset_vector[offset];
947 save_offset2 = md->offset_vector[offset+1];
948 save_offset3 = md->offset_vector[md->offset_end - number];
949 save_capture_last = md->capture_last;
950
951 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
952
953 /* Each time round the loop, save the current subject position for use
954 when the group matches. For MATCH_MATCH, the group has matched, so we
955 restart it with a new subject starting position, remembering that we had
956 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
957 usual. If we haven't matched any alternatives in any iteration, check to
958 see if a previous iteration matched. If so, the group has matched;
959 continue from afterwards. Otherwise it has failed; restore the previous
960 capture values before returning NOMATCH. */
961
962 for (;;)
963 {
964 md->offset_vector[md->offset_end - number] =
965 (int)(eptr - md->start_subject);
966 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
967 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
968 eptrb, RM63);
969 if (rrc == MATCH_KETRPOS)
970 {
971 offset_top = md->end_offset_top;
972 eptr = md->end_match_ptr;
973 ecode = md->start_code + code_offset;
974 save_capture_last = md->capture_last;
975 matched_once = TRUE;
976 continue;
977 }
978 if (rrc != MATCH_NOMATCH &&
979 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
980 RRETURN(rrc);
981 md->capture_last = save_capture_last;
982 ecode += GET(ecode, 1);
983 if (*ecode != OP_ALT) break;
984 }
985
986 if (!matched_once)
987 {
988 md->offset_vector[offset] = save_offset1;
989 md->offset_vector[offset+1] = save_offset2;
990 md->offset_vector[md->offset_end - number] = save_offset3;
991 }
992
993 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
994 if (allow_zero || matched_once)
995 {
996 ecode += 1 + LINK_SIZE;
997 break;
998 }
999
1000 RRETURN(MATCH_NOMATCH);
1001 }
1002
1003 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1004 as a non-capturing bracket. */
1005
1006 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1007 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1008
1009 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1010
1011 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1012 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1013
1014 /* Non-capturing possessive bracket with unlimited repeat. We come here
1015 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1016 without the capturing complication. It is written out separately for speed
1017 and cleanliness. */
1018
1019 case OP_BRAPOS:
1020 case OP_SBRAPOS:
1021 allow_zero = FALSE;
1022
1023 POSSESSIVE_NON_CAPTURE:
1024 matched_once = FALSE;
1025 code_offset = ecode - md->start_code;
1026
1027 for (;;)
1028 {
1029 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1030 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1031 eptrb, RM48);
1032 if (rrc == MATCH_KETRPOS)
1033 {
1034 offset_top = md->end_offset_top;
1035 eptr = md->end_match_ptr;
1036 ecode = md->start_code + code_offset;
1037 matched_once = TRUE;
1038 continue;
1039 }
1040 if (rrc != MATCH_NOMATCH &&
1041 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1042 RRETURN(rrc);
1043 ecode += GET(ecode, 1);
1044 if (*ecode != OP_ALT) break;
1045 }
1046
1047 if (matched_once || allow_zero)
1048 {
1049 ecode += 1 + LINK_SIZE;
1050 break;
1051 }
1052 RRETURN(MATCH_NOMATCH);
1053
1054 /* Control never reaches here. */
1055
1056 /* Conditional group: compilation checked that there are no more than
1057 two branches. If the condition is false, skipping the first branch takes us
1058 past the end if there is only one branch, but that's OK because that is
1059 exactly what going to the ket would do. */
1060
1061 case OP_COND:
1062 case OP_SCOND:
1063 codelink = GET(ecode, 1);
1064
1065 /* Because of the way auto-callout works during compile, a callout item is
1066 inserted between OP_COND and an assertion condition. */
1067
1068 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1069 {
1070 if (pcre_callout != NULL)
1071 {
1072 pcre_callout_block cb;
1073 cb.version = 1; /* Version 1 of the callout block */
1074 cb.callout_number = ecode[LINK_SIZE+2];
1075 cb.offset_vector = md->offset_vector;
1076 cb.subject = (PCRE_SPTR)md->start_subject;
1077 cb.subject_length = (int)(md->end_subject - md->start_subject);
1078 cb.start_match = (int)(mstart - md->start_subject);
1079 cb.current_position = (int)(eptr - md->start_subject);
1080 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1081 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1082 cb.capture_top = offset_top/2;
1083 cb.capture_last = md->capture_last;
1084 cb.callout_data = md->callout_data;
1085 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1086 if (rrc < 0) RRETURN(rrc);
1087 }
1088 ecode += _pcre_OP_lengths[OP_CALLOUT];
1089 }
1090
1091 condcode = ecode[LINK_SIZE+1];
1092
1093 /* Now see what the actual condition is */
1094
1095 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1096 {
1097 if (md->recursive == NULL) /* Not recursing => FALSE */
1098 {
1099 condition = FALSE;
1100 ecode += GET(ecode, 1);
1101 }
1102 else
1103 {
1104 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1105 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1106
1107 /* If the test is for recursion into a specific subpattern, and it is
1108 false, but the test was set up by name, scan the table to see if the
1109 name refers to any other numbers, and test them. The condition is true
1110 if any one is set. */
1111
1112 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1113 {
1114 uschar *slotA = md->name_table;
1115 for (i = 0; i < md->name_count; i++)
1116 {
1117 if (GET2(slotA, 0) == recno) break;
1118 slotA += md->name_entry_size;
1119 }
1120
1121 /* Found a name for the number - there can be only one; duplicate
1122 names for different numbers are allowed, but not vice versa. First
1123 scan down for duplicates. */
1124
1125 if (i < md->name_count)
1126 {
1127 uschar *slotB = slotA;
1128 while (slotB > md->name_table)
1129 {
1130 slotB -= md->name_entry_size;
1131 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1132 {
1133 condition = GET2(slotB, 0) == md->recursive->group_num;
1134 if (condition) break;
1135 }
1136 else break;
1137 }
1138
1139 /* Scan up for duplicates */
1140
1141 if (!condition)
1142 {
1143 slotB = slotA;
1144 for (i++; i < md->name_count; i++)
1145 {
1146 slotB += md->name_entry_size;
1147 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1148 {
1149 condition = GET2(slotB, 0) == md->recursive->group_num;
1150 if (condition) break;
1151 }
1152 else break;
1153 }
1154 }
1155 }
1156 }
1157
1158 /* Choose branch according to the condition */
1159
1160 ecode += condition? 3 : GET(ecode, 1);
1161 }
1162 }
1163
1164 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1165 {
1166 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1167 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1168
1169 /* If the numbered capture is unset, but the reference was by name,
1170 scan the table to see if the name refers to any other numbers, and test
1171 them. The condition is true if any one is set. This is tediously similar
1172 to the code above, but not close enough to try to amalgamate. */
1173
1174 if (!condition && condcode == OP_NCREF)
1175 {
1176 int refno = offset >> 1;
1177 uschar *slotA = md->name_table;
1178
1179 for (i = 0; i < md->name_count; i++)
1180 {
1181 if (GET2(slotA, 0) == refno) break;
1182 slotA += md->name_entry_size;
1183 }
1184
1185 /* Found a name for the number - there can be only one; duplicate names
1186 for different numbers are allowed, but not vice versa. First scan down
1187 for duplicates. */
1188
1189 if (i < md->name_count)
1190 {
1191 uschar *slotB = slotA;
1192 while (slotB > md->name_table)
1193 {
1194 slotB -= md->name_entry_size;
1195 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1196 {
1197 offset = GET2(slotB, 0) << 1;
1198 condition = offset < offset_top &&
1199 md->offset_vector[offset] >= 0;
1200 if (condition) break;
1201 }
1202 else break;
1203 }
1204
1205 /* Scan up for duplicates */
1206
1207 if (!condition)
1208 {
1209 slotB = slotA;
1210 for (i++; i < md->name_count; i++)
1211 {
1212 slotB += md->name_entry_size;
1213 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1214 {
1215 offset = GET2(slotB, 0) << 1;
1216 condition = offset < offset_top &&
1217 md->offset_vector[offset] >= 0;
1218 if (condition) break;
1219 }
1220 else break;
1221 }
1222 }
1223 }
1224 }
1225
1226 /* Choose branch according to the condition */
1227
1228 ecode += condition? 3 : GET(ecode, 1);
1229 }
1230
1231 else if (condcode == OP_DEF) /* DEFINE - always false */
1232 {
1233 condition = FALSE;
1234 ecode += GET(ecode, 1);
1235 }
1236
1237 /* The condition is an assertion. Call match() to evaluate it - setting
1238 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1239 an assertion. */
1240
1241 else
1242 {
1243 md->match_function_type = MATCH_CONDASSERT;
1244 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1245 if (rrc == MATCH_MATCH)
1246 {
1247 if (md->end_offset_top > offset_top)
1248 offset_top = md->end_offset_top; /* Captures may have happened */
1249 condition = TRUE;
1250 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1251 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1252 }
1253 else if (rrc != MATCH_NOMATCH &&
1254 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1255 {
1256 RRETURN(rrc); /* Need braces because of following else */
1257 }
1258 else
1259 {
1260 condition = FALSE;
1261 ecode += codelink;
1262 }
1263 }
1264
1265 /* We are now at the branch that is to be obeyed. As there is only one,
1266 we used to use tail recursion to avoid using another stack frame, except
1267 when there was unlimited repeat of a possibly empty group. However, that
1268 strategy no longer works because of the possibility of (*THEN) being
1269 encountered in the branch. A recursive call to match() is always required,
1270 unless the second alternative doesn't exist, in which case we can just
1271 plough on. */
1272
1273 if (condition || *ecode == OP_ALT)
1274 {
1275 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1276 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1277 if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1278 rrc = MATCH_NOMATCH;
1279 RRETURN(rrc);
1280 }
1281 else /* Condition false & no alternative */
1282 {
1283 ecode += 1 + LINK_SIZE;
1284 }
1285 break;
1286
1287
1288 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1289 to close any currently open capturing brackets. */
1290
1291 case OP_CLOSE:
1292 number = GET2(ecode, 1);
1293 offset = number << 1;
1294
1295 #ifdef PCRE_DEBUG
1296 printf("end bracket %d at *ACCEPT", number);
1297 printf("\n");
1298 #endif
1299
1300 md->capture_last = number;
1301 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1302 {
1303 md->offset_vector[offset] =
1304 md->offset_vector[md->offset_end - number];
1305 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1306 if (offset_top <= offset) offset_top = offset + 2;
1307 }
1308 ecode += 3;
1309 break;
1310
1311
1312 /* End of the pattern, either real or forced. */
1313
1314 case OP_END:
1315 case OP_ACCEPT:
1316 case OP_ASSERT_ACCEPT:
1317
1318 /* If we have matched an empty string, fail if not in an assertion and not
1319 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1320 is set and we have matched at the start of the subject. In both cases,
1321 backtracking will then try other alternatives, if any. */
1322
1323 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1324 md->recursive == NULL &&
1325 (md->notempty ||
1326 (md->notempty_atstart &&
1327 mstart == md->start_subject + md->start_offset)))
1328 MRRETURN(MATCH_NOMATCH);
1329
1330 /* Otherwise, we have a match. */
1331
1332 md->end_match_ptr = eptr; /* Record where we ended */
1333 md->end_offset_top = offset_top; /* and how many extracts were taken */
1334 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1335
1336 /* For some reason, the macros don't work properly if an expression is
1337 given as the argument to MRRETURN when the heap is in use. */
1338
1339 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1340 MRRETURN(rrc);
1341
1342 /* Assertion brackets. Check the alternative branches in turn - the
1343 matching won't pass the KET for an assertion. If any one branch matches,
1344 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1345 start of each branch to move the current point backwards, so the code at
1346 this level is identical to the lookahead case. When the assertion is part
1347 of a condition, we want to return immediately afterwards. The caller of
1348 this incarnation of the match() function will have set MATCH_CONDASSERT in
1349 md->match_function_type, and one of these opcodes will be the first opcode
1350 that is processed. We use a local variable that is preserved over calls to
1351 match() to remember this case. */
1352
1353 case OP_ASSERT:
1354 case OP_ASSERTBACK:
1355 if (md->match_function_type == MATCH_CONDASSERT)
1356 {
1357 condassert = TRUE;
1358 md->match_function_type = 0;
1359 }
1360 else condassert = FALSE;
1361
1362 do
1363 {
1364 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1365 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1366 {
1367 mstart = md->start_match_ptr; /* In case \K reset it */
1368 break;
1369 }
1370 if (rrc != MATCH_NOMATCH &&
1371 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1372 RRETURN(rrc);
1373 ecode += GET(ecode, 1);
1374 }
1375 while (*ecode == OP_ALT);
1376
1377 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1378
1379 /* If checking an assertion for a condition, return MATCH_MATCH. */
1380
1381 if (condassert) RRETURN(MATCH_MATCH);
1382
1383 /* Continue from after the assertion, updating the offsets high water
1384 mark, since extracts may have been taken during the assertion. */
1385
1386 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1387 ecode += 1 + LINK_SIZE;
1388 offset_top = md->end_offset_top;
1389 continue;
1390
1391 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1392 PRUNE, or COMMIT means we must assume failure without checking subsequent
1393 branches. */
1394
1395 case OP_ASSERT_NOT:
1396 case OP_ASSERTBACK_NOT:
1397 if (md->match_function_type == MATCH_CONDASSERT)
1398 {
1399 condassert = TRUE;
1400 md->match_function_type = 0;
1401 }
1402 else condassert = FALSE;
1403
1404 do
1405 {
1406 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1407 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1408 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1409 {
1410 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1411 break;
1412 }
1413 if (rrc != MATCH_NOMATCH &&
1414 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1415 RRETURN(rrc);
1416 ecode += GET(ecode,1);
1417 }
1418 while (*ecode == OP_ALT);
1419
1420 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1421
1422 ecode += 1 + LINK_SIZE;
1423 continue;
1424
1425 /* Move the subject pointer back. This occurs only at the start of
1426 each branch of a lookbehind assertion. If we are too close to the start to
1427 move back, this match function fails. When working with UTF-8 we move
1428 back a number of characters, not bytes. */
1429
1430 case OP_REVERSE:
1431 #ifdef SUPPORT_UTF8
1432 if (utf8)
1433 {
1434 i = GET(ecode, 1);
1435 while (i-- > 0)
1436 {
1437 eptr--;
1438 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1439 BACKCHAR(eptr);
1440 }
1441 }
1442 else
1443 #endif
1444
1445 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1446
1447 {
1448 eptr -= GET(ecode, 1);
1449 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1450 }
1451
1452 /* Save the earliest consulted character, then skip to next op code */
1453
1454 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1455 ecode += 1 + LINK_SIZE;
1456 break;
1457
1458 /* The callout item calls an external function, if one is provided, passing
1459 details of the match so far. This is mainly for debugging, though the
1460 function is able to force a failure. */
1461
1462 case OP_CALLOUT:
1463 if (pcre_callout != NULL)
1464 {
1465 pcre_callout_block cb;
1466 cb.version = 1; /* Version 1 of the callout block */
1467 cb.callout_number = ecode[1];
1468 cb.offset_vector = md->offset_vector;
1469 cb.subject = (PCRE_SPTR)md->start_subject;
1470 cb.subject_length = (int)(md->end_subject - md->start_subject);
1471 cb.start_match = (int)(mstart - md->start_subject);
1472 cb.current_position = (int)(eptr - md->start_subject);
1473 cb.pattern_position = GET(ecode, 2);
1474 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1475 cb.capture_top = offset_top/2;
1476 cb.capture_last = md->capture_last;
1477 cb.callout_data = md->callout_data;
1478 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1479 if (rrc < 0) RRETURN(rrc);
1480 }
1481 ecode += 2 + 2*LINK_SIZE;
1482 break;
1483
1484 /* Recursion either matches the current regex, or some subexpression. The
1485 offset data is the offset to the starting bracket from the start of the
1486 whole pattern. (This is so that it works from duplicated subpatterns.)
1487
1488 The state of the capturing groups is preserved over recursion, and
1489 re-instated afterwards. We don't know how many are started and not yet
1490 finished (offset_top records the completed total) so we just have to save
1491 all the potential data. There may be up to 65535 such values, which is too
1492 large to put on the stack, but using malloc for small numbers seems
1493 expensive. As a compromise, the stack is used when there are no more than
1494 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1495
1496 There are also other values that have to be saved. We use a chained
1497 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1498 for the original version of this logic. It has, however, been hacked around
1499 a lot, so he is not to blame for the current way it works. */
1500
1501 case OP_RECURSE:
1502 {
1503 callpat = md->start_code + GET(ecode, 1);
1504 new_recursive.group_num = (callpat == md->start_code)? 0 :
1505 GET2(callpat, 1 + LINK_SIZE);
1506
1507 /* Add to "recursing stack" */
1508
1509 new_recursive.prevrec = md->recursive;
1510 md->recursive = &new_recursive;
1511
1512 /* Where to continue from afterwards */
1513
1514 ecode += 1 + LINK_SIZE;
1515
1516 /* Now save the offset data */
1517
1518 new_recursive.saved_max = md->offset_end;
1519 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1520 new_recursive.offset_save = stacksave;
1521 else
1522 {
1523 new_recursive.offset_save =
1524 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1525 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1526 }
1527 memcpy(new_recursive.offset_save, md->offset_vector,
1528 new_recursive.saved_max * sizeof(int));
1529
1530 /* OK, now we can do the recursion. After processing each alternative,
1531 restore the offset data. If there were nested recursions, md->recursive
1532 might be changed, so reset it before looping. */
1533
1534 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1535 cbegroup = (*callpat >= OP_SBRA);
1536 do
1537 {
1538 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1539 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1540 md, eptrb, RM6);
1541 memcpy(md->offset_vector, new_recursive.offset_save,
1542 new_recursive.saved_max * sizeof(int));
1543 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1544 {
1545 DPRINTF(("Recursion matched\n"));
1546 md->recursive = new_recursive.prevrec;
1547 if (new_recursive.offset_save != stacksave)
1548 (pcre_free)(new_recursive.offset_save);
1549
1550 /* Set where we got to in the subject, and reset the start in case
1551 it was changed by \K. This *is* propagated back out of a recursion,
1552 for Perl compatibility. */
1553
1554 eptr = md->end_match_ptr;
1555 mstart = md->start_match_ptr;
1556 goto RECURSION_MATCHED; /* Exit loop; end processing */
1557 }
1558 else if (rrc != MATCH_NOMATCH &&
1559 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1560 {
1561 DPRINTF(("Recursion gave error %d\n", rrc));
1562 if (new_recursive.offset_save != stacksave)
1563 (pcre_free)(new_recursive.offset_save);
1564 RRETURN(rrc);
1565 }
1566
1567 md->recursive = &new_recursive;
1568 callpat += GET(callpat, 1);
1569 }
1570 while (*callpat == OP_ALT);
1571
1572 DPRINTF(("Recursion didn't match\n"));
1573 md->recursive = new_recursive.prevrec;
1574 if (new_recursive.offset_save != stacksave)
1575 (pcre_free)(new_recursive.offset_save);
1576 MRRETURN(MATCH_NOMATCH);
1577 }
1578
1579 RECURSION_MATCHED:
1580 break;
1581
1582 /* An alternation is the end of a branch; scan along to find the end of the
1583 bracketed group and go to there. */
1584
1585 case OP_ALT:
1586 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1587 break;
1588
1589 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1590 indicating that it may occur zero times. It may repeat infinitely, or not
1591 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1592 with fixed upper repeat limits are compiled as a number of copies, with the
1593 optional ones preceded by BRAZERO or BRAMINZERO. */
1594
1595 case OP_BRAZERO:
1596 next = ecode + 1;
1597 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1598 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1599 do next += GET(next, 1); while (*next == OP_ALT);
1600 ecode = next + 1 + LINK_SIZE;
1601 break;
1602
1603 case OP_BRAMINZERO:
1604 next = ecode + 1;
1605 do next += GET(next, 1); while (*next == OP_ALT);
1606 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1607 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1608 ecode++;
1609 break;
1610
1611 case OP_SKIPZERO:
1612 next = ecode+1;
1613 do next += GET(next,1); while (*next == OP_ALT);
1614 ecode = next + 1 + LINK_SIZE;
1615 break;
1616
1617 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1618 here; just jump to the group, with allow_zero set TRUE. */
1619
1620 case OP_BRAPOSZERO:
1621 op = *(++ecode);
1622 allow_zero = TRUE;
1623 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1624 goto POSSESSIVE_NON_CAPTURE;
1625
1626 /* End of a group, repeated or non-repeating. */
1627
1628 case OP_KET:
1629 case OP_KETRMIN:
1630 case OP_KETRMAX:
1631 case OP_KETRPOS:
1632 prev = ecode - GET(ecode, 1);
1633
1634 /* If this was a group that remembered the subject start, in order to break
1635 infinite repeats of empty string matches, retrieve the subject start from
1636 the chain. Otherwise, set it NULL. */
1637
1638 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1639 {
1640 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1641 eptrb = eptrb->epb_prev; /* Backup to previous group */
1642 }
1643 else saved_eptr = NULL;
1644
1645 /* If we are at the end of an assertion group, stop matching and return
1646 MATCH_MATCH, but record the current high water mark for use by positive
1647 assertions. We also need to record the match start in case it was changed
1648 by \K. */
1649
1650 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1651 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
1652 {
1653 md->end_match_ptr = eptr; /* For ONCE */
1654 md->end_offset_top = offset_top;
1655 md->start_match_ptr = mstart;
1656 MRRETURN(MATCH_MATCH);
1657 }
1658
1659 /* For capturing groups we have to check the group number back at the start
1660 and if necessary complete handling an extraction by setting the offsets and
1661 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1662 into group 0, so it won't be picked up here. Instead, we catch it when the
1663 OP_END is reached. Other recursion is handled here. We just have to record
1664 the current subject position and start match pointer and give a MATCH
1665 return. */
1666
1667 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1668 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1669 {
1670 number = GET2(prev, 1+LINK_SIZE);
1671 offset = number << 1;
1672
1673 #ifdef PCRE_DEBUG
1674 printf("end bracket %d", number);
1675 printf("\n");
1676 #endif
1677
1678 /* Handle a recursively called group. */
1679
1680 if (md->recursive != NULL && md->recursive->group_num == number)
1681 {
1682 md->end_match_ptr = eptr;
1683 md->start_match_ptr = mstart;
1684 RRETURN(MATCH_MATCH);
1685 }
1686
1687 /* Deal with capturing */
1688
1689 md->capture_last = number;
1690 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1691 {
1692 /* If offset is greater than offset_top, it means that we are
1693 "skipping" a capturing group, and that group's offsets must be marked
1694 unset. In earlier versions of PCRE, all the offsets were unset at the
1695 start of matching, but this doesn't work because atomic groups and
1696 assertions can cause a value to be set that should later be unset.
1697 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1698 part of the atomic group, but this is not on the final matching path,
1699 so must be unset when 2 is set. (If there is no group 2, there is no
1700 problem, because offset_top will then be 2, indicating no capture.) */
1701
1702 if (offset > offset_top)
1703 {
1704 register int *iptr = md->offset_vector + offset_top;
1705 register int *iend = md->offset_vector + offset;
1706 while (iptr < iend) *iptr++ = -1;
1707 }
1708
1709 /* Now make the extraction */
1710
1711 md->offset_vector[offset] =
1712 md->offset_vector[md->offset_end - number];
1713 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1714 if (offset_top <= offset) offset_top = offset + 2;
1715 }
1716 }
1717
1718 /* For an ordinary non-repeating ket, just continue at this level. This
1719 also happens for a repeating ket if no characters were matched in the
1720 group. This is the forcible breaking of infinite loops as implemented in
1721 Perl 5.005. For a non-repeating atomic group, establish a backup point by
1722 processing the rest of the pattern at a lower level. If this results in a
1723 NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1724 bypassing intermediate backup points, but resetting any captures that
1725 happened along the way. */
1726
1727 if (*ecode == OP_KET || eptr == saved_eptr)
1728 {
1729 if (*prev == OP_ONCE)
1730 {
1731 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1732 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1733 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1734 RRETURN(MATCH_ONCE);
1735 }
1736 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1737 break;
1738 }
1739
1740 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1741 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1742 at a time from the outer level, thus saving stack. */
1743
1744 if (*ecode == OP_KETRPOS)
1745 {
1746 md->end_match_ptr = eptr;
1747 md->end_offset_top = offset_top;
1748 RRETURN(MATCH_KETRPOS);
1749 }
1750
1751 /* The normal repeating kets try the rest of the pattern or restart from
1752 the preceding bracket, in the appropriate order. In the second case, we can
1753 use tail recursion to avoid using another stack frame, unless we have an
1754 atomic group or an unlimited repeat of a group that can match an empty
1755 string. */
1756
1757 if (*ecode == OP_KETRMIN)
1758 {
1759 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1760 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1761 if (*prev == OP_ONCE)
1762 {
1763 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1764 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1765 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1766 RRETURN(MATCH_ONCE);
1767 }
1768 if (*prev >= OP_SBRA) /* Could match an empty string */
1769 {
1770 md->match_function_type = MATCH_CBEGROUP;
1771 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1772 RRETURN(rrc);
1773 }
1774 ecode = prev;
1775 goto TAIL_RECURSE;
1776 }
1777 else /* OP_KETRMAX */
1778 {
1779 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1780 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1781 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1782 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1783 if (*prev == OP_ONCE)
1784 {
1785 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1787 md->once_target = prev;
1788 RRETURN(MATCH_ONCE);
1789 }
1790 ecode += 1 + LINK_SIZE;
1791 goto TAIL_RECURSE;
1792 }
1793 /* Control never gets here */
1794
1795 /* Not multiline mode: start of subject assertion, unless notbol. */
1796
1797 case OP_CIRC:
1798 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1799
1800 /* Start of subject assertion */
1801
1802 case OP_SOD:
1803 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1804 ecode++;
1805 break;
1806
1807 /* Multiline mode: start of subject unless notbol, or after any newline. */
1808
1809 case OP_CIRCM:
1810 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1811 if (eptr != md->start_subject &&
1812 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1813 MRRETURN(MATCH_NOMATCH);
1814 ecode++;
1815 break;
1816
1817 /* Start of match assertion */
1818
1819 case OP_SOM:
1820 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1821 ecode++;
1822 break;
1823
1824 /* Reset the start of match point */
1825
1826 case OP_SET_SOM:
1827 mstart = eptr;
1828 ecode++;
1829 break;
1830
1831 /* Multiline mode: assert before any newline, or before end of subject
1832 unless noteol is set. */
1833
1834 case OP_DOLLM:
1835 if (eptr < md->end_subject)
1836 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1837 else
1838 {
1839 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1840 SCHECK_PARTIAL();
1841 }
1842 ecode++;
1843 break;
1844
1845 /* Not multiline mode: assert before a terminating newline or before end of
1846 subject unless noteol is set. */
1847
1848 case OP_DOLL:
1849 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1850 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1851
1852 /* ... else fall through for endonly */
1853
1854 /* End of subject assertion (\z) */
1855
1856 case OP_EOD:
1857 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1858 SCHECK_PARTIAL();
1859 ecode++;
1860 break;
1861
1862 /* End of subject or ending \n assertion (\Z) */
1863
1864 case OP_EODN:
1865 ASSERT_NL_OR_EOS:
1866 if (eptr < md->end_subject &&
1867 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1868 MRRETURN(MATCH_NOMATCH);
1869
1870 /* Either at end of string or \n before end. */
1871
1872 SCHECK_PARTIAL();
1873 ecode++;
1874 break;
1875
1876 /* Word boundary assertions */
1877
1878 case OP_NOT_WORD_BOUNDARY:
1879 case OP_WORD_BOUNDARY:
1880 {
1881
1882 /* Find out if the previous and current characters are "word" characters.
1883 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1884 be "non-word" characters. Remember the earliest consulted character for
1885 partial matching. */
1886
1887 #ifdef SUPPORT_UTF8
1888 if (utf8)
1889 {
1890 /* Get status of previous character */
1891
1892 if (eptr == md->start_subject) prev_is_word = FALSE; else
1893 {
1894 USPTR lastptr = eptr - 1;
1895 while((*lastptr & 0xc0) == 0x80) lastptr--;
1896 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1897 GETCHAR(c, lastptr);
1898 #ifdef SUPPORT_UCP
1899 if (md->use_ucp)
1900 {
1901 if (c == '_') prev_is_word = TRUE; else
1902 {
1903 int cat = UCD_CATEGORY(c);
1904 prev_is_word = (cat == ucp_L || cat == ucp_N);
1905 }
1906 }
1907 else
1908 #endif
1909 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1910 }
1911
1912 /* Get status of next character */
1913
1914 if (eptr >= md->end_subject)
1915 {
1916 SCHECK_PARTIAL();
1917 cur_is_word = FALSE;
1918 }
1919 else
1920 {
1921 GETCHAR(c, eptr);
1922 #ifdef SUPPORT_UCP
1923 if (md->use_ucp)
1924 {
1925 if (c == '_') cur_is_word = TRUE; else
1926 {
1927 int cat = UCD_CATEGORY(c);
1928 cur_is_word = (cat == ucp_L || cat == ucp_N);
1929 }
1930 }
1931 else
1932 #endif
1933 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1934 }
1935 }
1936 else
1937 #endif
1938
1939 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1940 consistency with the behaviour of \w we do use it in this case. */
1941
1942 {
1943 /* Get status of previous character */
1944
1945 if (eptr == md->start_subject) prev_is_word = FALSE; else
1946 {
1947 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1948 #ifdef SUPPORT_UCP
1949 if (md->use_ucp)
1950 {
1951 c = eptr[-1];
1952 if (c == '_') prev_is_word = TRUE; else
1953 {
1954 int cat = UCD_CATEGORY(c);
1955 prev_is_word = (cat == ucp_L || cat == ucp_N);
1956 }
1957 }
1958 else
1959 #endif
1960 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1961 }
1962
1963 /* Get status of next character */
1964
1965 if (eptr >= md->end_subject)
1966 {
1967 SCHECK_PARTIAL();
1968 cur_is_word = FALSE;
1969 }
1970 else
1971 #ifdef SUPPORT_UCP
1972 if (md->use_ucp)
1973 {
1974 c = *eptr;
1975 if (c == '_') cur_is_word = TRUE; else
1976 {
1977 int cat = UCD_CATEGORY(c);
1978 cur_is_word = (cat == ucp_L || cat == ucp_N);
1979 }
1980 }
1981 else
1982 #endif
1983 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1984 }
1985
1986 /* Now see if the situation is what we want */
1987
1988 if ((*ecode++ == OP_WORD_BOUNDARY)?
1989 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1990 MRRETURN(MATCH_NOMATCH);
1991 }
1992 break;
1993
1994 /* Match a single character type; inline for speed */
1995
1996 case OP_ANY:
1997 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1998 /* Fall through */
1999
2000 case OP_ALLANY:
2001 if (eptr++ >= md->end_subject)
2002 {
2003 SCHECK_PARTIAL();
2004 MRRETURN(MATCH_NOMATCH);
2005 }
2006 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2007 ecode++;
2008 break;
2009
2010 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2011 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2012
2013 case OP_ANYBYTE:
2014 if (eptr++ >= md->end_subject)
2015 {
2016 SCHECK_PARTIAL();
2017 MRRETURN(MATCH_NOMATCH);
2018 }
2019 ecode++;
2020 break;
2021
2022 case OP_NOT_DIGIT:
2023 if (eptr >= md->end_subject)
2024 {
2025 SCHECK_PARTIAL();
2026 MRRETURN(MATCH_NOMATCH);
2027 }
2028 GETCHARINCTEST(c, eptr);
2029 if (
2030 #ifdef SUPPORT_UTF8
2031 c < 256 &&
2032 #endif
2033 (md->ctypes[c] & ctype_digit) != 0
2034 )
2035 MRRETURN(MATCH_NOMATCH);
2036 ecode++;
2037 break;
2038
2039 case OP_DIGIT:
2040 if (eptr >= md->end_subject)
2041 {
2042 SCHECK_PARTIAL();
2043 MRRETURN(MATCH_NOMATCH);
2044 }
2045 GETCHARINCTEST(c, eptr);
2046 if (
2047 #ifdef SUPPORT_UTF8
2048 c >= 256 ||
2049 #endif
2050 (md->ctypes[c] & ctype_digit) == 0
2051 )
2052 MRRETURN(MATCH_NOMATCH);
2053 ecode++;
2054 break;
2055
2056 case OP_NOT_WHITESPACE:
2057 if (eptr >= md->end_subject)
2058 {
2059 SCHECK_PARTIAL();
2060 MRRETURN(MATCH_NOMATCH);
2061 }
2062 GETCHARINCTEST(c, eptr);
2063 if (
2064 #ifdef SUPPORT_UTF8
2065 c < 256 &&
2066 #endif
2067 (md->ctypes[c] & ctype_space) != 0
2068 )
2069 MRRETURN(MATCH_NOMATCH);
2070 ecode++;
2071 break;
2072
2073 case OP_WHITESPACE:
2074 if (eptr >= md->end_subject)
2075 {
2076 SCHECK_PARTIAL();
2077 MRRETURN(MATCH_NOMATCH);
2078 }
2079 GETCHARINCTEST(c, eptr);
2080 if (
2081 #ifdef SUPPORT_UTF8
2082 c >= 256 ||
2083 #endif
2084 (md->ctypes[c] & ctype_space) == 0
2085 )
2086 MRRETURN(MATCH_NOMATCH);
2087 ecode++;
2088 break;
2089
2090 case OP_NOT_WORDCHAR:
2091 if (eptr >= md->end_subject)
2092 {
2093 SCHECK_PARTIAL();
2094 MRRETURN(MATCH_NOMATCH);
2095 }
2096 GETCHARINCTEST(c, eptr);
2097 if (
2098 #ifdef SUPPORT_UTF8
2099 c < 256 &&
2100 #endif
2101 (md->ctypes[c] & ctype_word) != 0
2102 )
2103 MRRETURN(MATCH_NOMATCH);
2104 ecode++;
2105 break;
2106
2107 case OP_WORDCHAR:
2108 if (eptr >= md->end_subject)
2109 {
2110 SCHECK_PARTIAL();
2111 MRRETURN(MATCH_NOMATCH);
2112 }
2113 GETCHARINCTEST(c, eptr);
2114 if (
2115 #ifdef SUPPORT_UTF8
2116 c >= 256 ||
2117 #endif
2118 (md->ctypes[c] & ctype_word) == 0
2119 )
2120 MRRETURN(MATCH_NOMATCH);
2121 ecode++;
2122 break;
2123
2124 case OP_ANYNL:
2125 if (eptr >= md->end_subject)
2126 {
2127 SCHECK_PARTIAL();
2128 MRRETURN(MATCH_NOMATCH);
2129 }
2130 GETCHARINCTEST(c, eptr);
2131 switch(c)
2132 {
2133 default: MRRETURN(MATCH_NOMATCH);
2134
2135 case 0x000d:
2136 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2137 break;
2138
2139 case 0x000a:
2140 break;
2141
2142 case 0x000b:
2143 case 0x000c:
2144 case 0x0085:
2145 case 0x2028:
2146 case 0x2029:
2147 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2148 break;
2149 }
2150 ecode++;
2151 break;
2152
2153 case OP_NOT_HSPACE:
2154 if (eptr >= md->end_subject)
2155 {
2156 SCHECK_PARTIAL();
2157 MRRETURN(MATCH_NOMATCH);
2158 }
2159 GETCHARINCTEST(c, eptr);
2160 switch(c)
2161 {
2162 default: break;
2163 case 0x09: /* HT */
2164 case 0x20: /* SPACE */
2165 case 0xa0: /* NBSP */
2166 case 0x1680: /* OGHAM SPACE MARK */
2167 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2168 case 0x2000: /* EN QUAD */
2169 case 0x2001: /* EM QUAD */
2170 case 0x2002: /* EN SPACE */
2171 case 0x2003: /* EM SPACE */
2172 case 0x2004: /* THREE-PER-EM SPACE */
2173 case 0x2005: /* FOUR-PER-EM SPACE */
2174 case 0x2006: /* SIX-PER-EM SPACE */
2175 case 0x2007: /* FIGURE SPACE */
2176 case 0x2008: /* PUNCTUATION SPACE */
2177 case 0x2009: /* THIN SPACE */
2178 case 0x200A: /* HAIR SPACE */
2179 case 0x202f: /* NARROW NO-BREAK SPACE */
2180 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2181 case 0x3000: /* IDEOGRAPHIC SPACE */
2182 MRRETURN(MATCH_NOMATCH);
2183 }
2184 ecode++;
2185 break;
2186
2187 case OP_HSPACE:
2188 if (eptr >= md->end_subject)
2189 {
2190 SCHECK_PARTIAL();
2191 MRRETURN(MATCH_NOMATCH);
2192 }
2193 GETCHARINCTEST(c, eptr);
2194 switch(c)
2195 {
2196 default: MRRETURN(MATCH_NOMATCH);
2197 case 0x09: /* HT */
2198 case 0x20: /* SPACE */
2199 case 0xa0: /* NBSP */
2200 case 0x1680: /* OGHAM SPACE MARK */
2201 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2202 case 0x2000: /* EN QUAD */
2203 case 0x2001: /* EM QUAD */
2204 case 0x2002: /* EN SPACE */
2205 case 0x2003: /* EM SPACE */
2206 case 0x2004: /* THREE-PER-EM SPACE */
2207 case 0x2005: /* FOUR-PER-EM SPACE */
2208 case 0x2006: /* SIX-PER-EM SPACE */
2209 case 0x2007: /* FIGURE SPACE */
2210 case 0x2008: /* PUNCTUATION SPACE */
2211 case 0x2009: /* THIN SPACE */
2212 case 0x200A: /* HAIR SPACE */
2213 case 0x202f: /* NARROW NO-BREAK SPACE */
2214 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2215 case 0x3000: /* IDEOGRAPHIC SPACE */
2216 break;
2217 }
2218 ecode++;
2219 break;
2220
2221 case OP_NOT_VSPACE:
2222 if (eptr >= md->end_subject)
2223 {
2224 SCHECK_PARTIAL();
2225 MRRETURN(MATCH_NOMATCH);
2226 }
2227 GETCHARINCTEST(c, eptr);
2228 switch(c)
2229 {
2230 default: break;
2231 case 0x0a: /* LF */
2232 case 0x0b: /* VT */
2233 case 0x0c: /* FF */
2234 case 0x0d: /* CR */
2235 case 0x85: /* NEL */
2236 case 0x2028: /* LINE SEPARATOR */
2237 case 0x2029: /* PARAGRAPH SEPARATOR */
2238 MRRETURN(MATCH_NOMATCH);
2239 }
2240 ecode++;
2241 break;
2242
2243 case OP_VSPACE:
2244 if (eptr >= md->end_subject)
2245 {
2246 SCHECK_PARTIAL();
2247 MRRETURN(MATCH_NOMATCH);
2248 }
2249 GETCHARINCTEST(c, eptr);
2250 switch(c)
2251 {
2252 default: MRRETURN(MATCH_NOMATCH);
2253 case 0x0a: /* LF */
2254 case 0x0b: /* VT */
2255 case 0x0c: /* FF */
2256 case 0x0d: /* CR */
2257 case 0x85: /* NEL */
2258 case 0x2028: /* LINE SEPARATOR */
2259 case 0x2029: /* PARAGRAPH SEPARATOR */
2260 break;
2261 }
2262 ecode++;
2263 break;
2264
2265 #ifdef SUPPORT_UCP
2266 /* Check the next character by Unicode property. We will get here only
2267 if the support is in the binary; otherwise a compile-time error occurs. */
2268
2269 case OP_PROP:
2270 case OP_NOTPROP:
2271 if (eptr >= md->end_subject)
2272 {
2273 SCHECK_PARTIAL();
2274 MRRETURN(MATCH_NOMATCH);
2275 }
2276 GETCHARINCTEST(c, eptr);
2277 {
2278 const ucd_record *prop = GET_UCD(c);
2279
2280 switch(ecode[1])
2281 {
2282 case PT_ANY:
2283 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2284 break;
2285
2286 case PT_LAMP:
2287 if ((prop->chartype == ucp_Lu ||
2288 prop->chartype == ucp_Ll ||
2289 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2290 MRRETURN(MATCH_NOMATCH);
2291 break;
2292
2293 case PT_GC:
2294 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2295 MRRETURN(MATCH_NOMATCH);
2296 break;
2297
2298 case PT_PC:
2299 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2300 MRRETURN(MATCH_NOMATCH);
2301 break;
2302
2303 case PT_SC:
2304 if ((ecode[2] != prop->script) == (op == OP_PROP))
2305 MRRETURN(MATCH_NOMATCH);
2306 break;
2307
2308 /* These are specials */
2309
2310 case PT_ALNUM:
2311 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2312 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2313 MRRETURN(MATCH_NOMATCH);
2314 break;
2315
2316 case PT_SPACE: /* Perl space */
2317 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2318 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2319 == (op == OP_NOTPROP))
2320 MRRETURN(MATCH_NOMATCH);
2321 break;
2322
2323 case PT_PXSPACE: /* POSIX space */
2324 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2325 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2326 c == CHAR_FF || c == CHAR_CR)
2327 == (op == OP_NOTPROP))
2328 MRRETURN(MATCH_NOMATCH);
2329 break;
2330
2331 case PT_WORD:
2332 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2333 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2334 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2335 MRRETURN(MATCH_NOMATCH);
2336 break;
2337
2338 /* This should never occur */
2339
2340 default:
2341 RRETURN(PCRE_ERROR_INTERNAL);
2342 }
2343
2344 ecode += 3;
2345 }
2346 break;
2347
2348 /* Match an extended Unicode sequence. We will get here only if the support
2349 is in the binary; otherwise a compile-time error occurs. */
2350
2351 case OP_EXTUNI:
2352 if (eptr >= md->end_subject)
2353 {
2354 SCHECK_PARTIAL();
2355 MRRETURN(MATCH_NOMATCH);
2356 }
2357 GETCHARINCTEST(c, eptr);
2358 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2359 while (eptr < md->end_subject)
2360 {
2361 int len = 1;
2362 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2363 if (UCD_CATEGORY(c) != ucp_M) break;
2364 eptr += len;
2365 }
2366 ecode++;
2367 break;
2368 #endif
2369
2370
2371 /* Match a back reference, possibly repeatedly. Look past the end of the
2372 item to see if there is repeat information following. The code is similar
2373 to that for character classes, but repeated for efficiency. Then obey
2374 similar code to character type repeats - written out again for speed.
2375 However, if the referenced string is the empty string, always treat
2376 it as matched, any number of times (otherwise there could be infinite
2377 loops). */
2378
2379 case OP_REF:
2380 case OP_REFI:
2381 caseless = op == OP_REFI;
2382 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2383 ecode += 3;
2384
2385 /* If the reference is unset, there are two possibilities:
2386
2387 (a) In the default, Perl-compatible state, set the length negative;
2388 this ensures that every attempt at a match fails. We can't just fail
2389 here, because of the possibility of quantifiers with zero minima.
2390
2391 (b) If the JavaScript compatibility flag is set, set the length to zero
2392 so that the back reference matches an empty string.
2393
2394 Otherwise, set the length to the length of what was matched by the
2395 referenced subpattern. */
2396
2397 if (offset >= offset_top || md->offset_vector[offset] < 0)
2398 length = (md->jscript_compat)? 0 : -1;
2399 else
2400 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2401
2402 /* Set up for repetition, or handle the non-repeated case */
2403
2404 switch (*ecode)
2405 {
2406 case OP_CRSTAR:
2407 case OP_CRMINSTAR:
2408 case OP_CRPLUS:
2409 case OP_CRMINPLUS:
2410 case OP_CRQUERY:
2411 case OP_CRMINQUERY:
2412 c = *ecode++ - OP_CRSTAR;
2413 minimize = (c & 1) != 0;
2414 min = rep_min[c]; /* Pick up values from tables; */
2415 max = rep_max[c]; /* zero for max => infinity */
2416 if (max == 0) max = INT_MAX;
2417 break;
2418
2419 case OP_CRRANGE:
2420 case OP_CRMINRANGE:
2421 minimize = (*ecode == OP_CRMINRANGE);
2422 min = GET2(ecode, 1);
2423 max = GET2(ecode, 3);
2424 if (max == 0) max = INT_MAX;
2425 ecode += 5;
2426 break;
2427
2428 default: /* No repeat follows */
2429 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2430 {
2431 CHECK_PARTIAL();
2432 MRRETURN(MATCH_NOMATCH);
2433 }
2434 eptr += length;
2435 continue; /* With the main loop */
2436 }
2437
2438 /* Handle repeated back references. If the length of the reference is
2439 zero, just continue with the main loop. */
2440
2441 if (length == 0) continue;
2442
2443 /* First, ensure the minimum number of matches are present. We get back
2444 the length of the reference string explicitly rather than passing the
2445 address of eptr, so that eptr can be a register variable. */
2446
2447 for (i = 1; i <= min; i++)
2448 {
2449 int slength;
2450 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2451 {
2452 CHECK_PARTIAL();
2453 MRRETURN(MATCH_NOMATCH);
2454 }
2455 eptr += slength;
2456 }
2457
2458 /* If min = max, continue at the same level without recursion.
2459 They are not both allowed to be zero. */
2460
2461 if (min == max) continue;
2462
2463 /* If minimizing, keep trying and advancing the pointer */
2464
2465 if (minimize)
2466 {
2467 for (fi = min;; fi++)
2468 {
2469 int slength;
2470 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2471 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2472 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2473 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2474 {
2475 CHECK_PARTIAL();
2476 MRRETURN(MATCH_NOMATCH);
2477 }
2478 eptr += slength;
2479 }
2480 /* Control never gets here */
2481 }
2482
2483 /* If maximizing, find the longest string and work backwards */
2484
2485 else
2486 {
2487 pp = eptr;
2488 for (i = min; i < max; i++)
2489 {
2490 int slength;
2491 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2492 {
2493 CHECK_PARTIAL();
2494 break;
2495 }
2496 eptr += slength;
2497 }
2498 while (eptr >= pp)
2499 {
2500 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2501 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2502 eptr -= length;
2503 }
2504 MRRETURN(MATCH_NOMATCH);
2505 }
2506 /* Control never gets here */
2507
2508 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2509 used when all the characters in the class have values in the range 0-255,
2510 and either the matching is caseful, or the characters are in the range
2511 0-127 when UTF-8 processing is enabled. The only difference between
2512 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2513 encountered.
2514
2515 First, look past the end of the item to see if there is repeat information
2516 following. Then obey similar code to character type repeats - written out
2517 again for speed. */
2518
2519 case OP_NCLASS:
2520 case OP_CLASS:
2521 {
2522 data = ecode + 1; /* Save for matching */
2523 ecode += 33; /* Advance past the item */
2524
2525 switch (*ecode)
2526 {
2527 case OP_CRSTAR:
2528 case OP_CRMINSTAR:
2529 case OP_CRPLUS:
2530 case OP_CRMINPLUS:
2531 case OP_CRQUERY:
2532 case OP_CRMINQUERY:
2533 c = *ecode++ - OP_CRSTAR;
2534 minimize = (c & 1) != 0;
2535 min = rep_min[c]; /* Pick up values from tables; */
2536 max = rep_max[c]; /* zero for max => infinity */
2537 if (max == 0) max = INT_MAX;
2538 break;
2539
2540 case OP_CRRANGE:
2541 case OP_CRMINRANGE:
2542 minimize = (*ecode == OP_CRMINRANGE);
2543 min = GET2(ecode, 1);
2544 max = GET2(ecode, 3);
2545 if (max == 0) max = INT_MAX;
2546 ecode += 5;
2547 break;
2548
2549 default: /* No repeat follows */
2550 min = max = 1;
2551 break;
2552 }
2553
2554 /* First, ensure the minimum number of matches are present. */
2555
2556 #ifdef SUPPORT_UTF8
2557 /* UTF-8 mode */
2558 if (utf8)
2559 {
2560 for (i = 1; i <= min; i++)
2561 {
2562 if (eptr >= md->end_subject)
2563 {
2564 SCHECK_PARTIAL();
2565 MRRETURN(MATCH_NOMATCH);
2566 }
2567 GETCHARINC(c, eptr);
2568 if (c > 255)
2569 {
2570 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2571 }
2572 else
2573 {
2574 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2575 }
2576 }
2577 }
2578 else
2579 #endif
2580 /* Not UTF-8 mode */
2581 {
2582 for (i = 1; i <= min; i++)
2583 {
2584 if (eptr >= md->end_subject)
2585 {
2586 SCHECK_PARTIAL();
2587 MRRETURN(MATCH_NOMATCH);
2588 }
2589 c = *eptr++;
2590 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2591 }
2592 }
2593
2594 /* If max == min we can continue with the main loop without the
2595 need to recurse. */
2596
2597 if (min == max) continue;
2598
2599 /* If minimizing, keep testing the rest of the expression and advancing
2600 the pointer while it matches the class. */
2601
2602 if (minimize)
2603 {
2604 #ifdef SUPPORT_UTF8
2605 /* UTF-8 mode */
2606 if (utf8)
2607 {
2608 for (fi = min;; fi++)
2609 {
2610 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2611 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2612 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2613 if (eptr >= md->end_subject)
2614 {
2615 SCHECK_PARTIAL();
2616 MRRETURN(MATCH_NOMATCH);
2617 }
2618 GETCHARINC(c, eptr);
2619 if (c > 255)
2620 {
2621 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2622 }
2623 else
2624 {
2625 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2626 }
2627 }
2628 }
2629 else
2630 #endif
2631 /* Not UTF-8 mode */
2632 {
2633 for (fi = min;; fi++)
2634 {
2635 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2636 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2637 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2638 if (eptr >= md->end_subject)
2639 {
2640 SCHECK_PARTIAL();
2641 MRRETURN(MATCH_NOMATCH);
2642 }
2643 c = *eptr++;
2644 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2645 }
2646 }
2647 /* Control never gets here */
2648 }
2649
2650 /* If maximizing, find the longest possible run, then work backwards. */
2651
2652 else
2653 {
2654 pp = eptr;
2655
2656 #ifdef SUPPORT_UTF8
2657 /* UTF-8 mode */
2658 if (utf8)
2659 {
2660 for (i = min; i < max; i++)
2661 {
2662 int len = 1;
2663 if (eptr >= md->end_subject)
2664 {
2665 SCHECK_PARTIAL();
2666 break;
2667 }
2668 GETCHARLEN(c, eptr, len);
2669 if (c > 255)
2670 {
2671 if (op == OP_CLASS) break;
2672 }
2673 else
2674 {
2675 if ((data[c/8] & (1 << (c&7))) == 0) break;
2676 }
2677 eptr += len;
2678 }
2679 for (;;)
2680 {
2681 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2682 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2683 if (eptr-- == pp) break; /* Stop if tried at original pos */
2684 BACKCHAR(eptr);
2685 }
2686 }
2687 else
2688 #endif
2689 /* Not UTF-8 mode */
2690 {
2691 for (i = min; i < max; i++)
2692 {
2693 if (eptr >= md->end_subject)
2694 {
2695 SCHECK_PARTIAL();
2696 break;
2697 }
2698 c = *eptr;
2699 if ((data[c/8] & (1 << (c&7))) == 0) break;
2700 eptr++;
2701 }
2702 while (eptr >= pp)
2703 {
2704 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2705 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2706 eptr--;
2707 }
2708 }
2709
2710 MRRETURN(MATCH_NOMATCH);
2711 }
2712 }
2713 /* Control never gets here */
2714
2715
2716 /* Match an extended character class. This opcode is encountered only
2717 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2718 mode, because Unicode properties are supported in non-UTF-8 mode. */
2719
2720 #ifdef SUPPORT_UTF8
2721 case OP_XCLASS:
2722 {
2723 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2724 ecode += GET(ecode, 1); /* Advance past the item */
2725
2726 switch (*ecode)
2727 {
2728 case OP_CRSTAR:
2729 case OP_CRMINSTAR:
2730 case OP_CRPLUS:
2731 case OP_CRMINPLUS:
2732 case OP_CRQUERY:
2733 case OP_CRMINQUERY:
2734 c = *ecode++ - OP_CRSTAR;
2735 minimize = (c & 1) != 0;
2736 min = rep_min[c]; /* Pick up values from tables; */
2737 max = rep_max[c]; /* zero for max => infinity */
2738 if (max == 0) max = INT_MAX;
2739 break;
2740
2741 case OP_CRRANGE:
2742 case OP_CRMINRANGE:
2743 minimize = (*ecode == OP_CRMINRANGE);
2744 min = GET2(ecode, 1);
2745 max = GET2(ecode, 3);
2746 if (max == 0) max = INT_MAX;
2747 ecode += 5;
2748 break;
2749
2750 default: /* No repeat follows */
2751 min = max = 1;
2752 break;
2753 }
2754
2755 /* First, ensure the minimum number of matches are present. */
2756
2757 for (i = 1; i <= min; i++)
2758 {
2759 if (eptr >= md->end_subject)
2760 {
2761 SCHECK_PARTIAL();
2762 MRRETURN(MATCH_NOMATCH);
2763 }
2764 GETCHARINCTEST(c, eptr);
2765 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2766 }
2767
2768 /* If max == min we can continue with the main loop without the
2769 need to recurse. */
2770
2771 if (min == max) continue;
2772
2773 /* If minimizing, keep testing the rest of the expression and advancing
2774 the pointer while it matches the class. */
2775
2776 if (minimize)
2777 {
2778 for (fi = min;; fi++)
2779 {
2780 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2781 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2782 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2783 if (eptr >= md->end_subject)
2784 {
2785 SCHECK_PARTIAL();
2786 MRRETURN(MATCH_NOMATCH);
2787 }
2788 GETCHARINCTEST(c, eptr);
2789 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2790 }
2791 /* Control never gets here */
2792 }
2793
2794 /* If maximizing, find the longest possible run, then work backwards. */
2795
2796 else
2797 {
2798 pp = eptr;
2799 for (i = min; i < max; i++)
2800 {
2801 int len = 1;
2802 if (eptr >= md->end_subject)
2803 {
2804 SCHECK_PARTIAL();
2805 break;
2806 }
2807 GETCHARLENTEST(c, eptr, len);
2808 if (!_pcre_xclass(c, data)) break;
2809 eptr += len;
2810 }
2811 for(;;)
2812 {
2813 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2814 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2815 if (eptr-- == pp) break; /* Stop if tried at original pos */
2816 if (utf8) BACKCHAR(eptr);
2817 }
2818 MRRETURN(MATCH_NOMATCH);
2819 }
2820
2821 /* Control never gets here */
2822 }
2823 #endif /* End of XCLASS */
2824
2825 /* Match a single character, casefully */
2826
2827 case OP_CHAR:
2828 #ifdef SUPPORT_UTF8
2829 if (utf8)
2830 {
2831 length = 1;
2832 ecode++;
2833 GETCHARLEN(fc, ecode, length);
2834 if (length > md->end_subject - eptr)
2835 {
2836 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2837 MRRETURN(MATCH_NOMATCH);
2838 }
2839 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2840 }
2841 else
2842 #endif
2843
2844 /* Non-UTF-8 mode */
2845 {
2846 if (md->end_subject - eptr < 1)
2847 {
2848 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2849 MRRETURN(MATCH_NOMATCH);
2850 }
2851 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2852 ecode += 2;
2853 }
2854 break;
2855
2856 /* Match a single character, caselessly */
2857
2858 case OP_CHARI:
2859 #ifdef SUPPORT_UTF8
2860 if (utf8)
2861 {
2862 length = 1;
2863 ecode++;
2864 GETCHARLEN(fc, ecode, length);
2865
2866 if (length > md->end_subject - eptr)
2867 {
2868 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2869 MRRETURN(MATCH_NOMATCH);
2870 }
2871
2872 /* If the pattern character's value is < 128, we have only one byte, and
2873 can use the fast lookup table. */
2874
2875 if (fc < 128)
2876 {
2877 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2878 }
2879
2880 /* Otherwise we must pick up the subject character */
2881
2882 else
2883 {
2884 unsigned int dc;
2885 GETCHARINC(dc, eptr);
2886 ecode += length;
2887
2888 /* If we have Unicode property support, we can use it to test the other
2889 case of the character, if there is one. */
2890
2891 if (fc != dc)
2892 {
2893 #ifdef SUPPORT_UCP
2894 if (dc != UCD_OTHERCASE(fc))
2895 #endif
2896 MRRETURN(MATCH_NOMATCH);
2897 }
2898 }
2899 }
2900 else
2901 #endif /* SUPPORT_UTF8 */
2902
2903 /* Non-UTF-8 mode */
2904 {
2905 if (md->end_subject - eptr < 1)
2906 {
2907 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2908 MRRETURN(MATCH_NOMATCH);
2909 }
2910 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2911 ecode += 2;
2912 }
2913 break;
2914
2915 /* Match a single character repeatedly. */
2916
2917 case OP_EXACT:
2918 case OP_EXACTI:
2919 min = max = GET2(ecode, 1);
2920 ecode += 3;
2921 goto REPEATCHAR;
2922
2923 case OP_POSUPTO:
2924 case OP_POSUPTOI:
2925 possessive = TRUE;
2926 /* Fall through */
2927
2928 case OP_UPTO:
2929 case OP_UPTOI:
2930 case OP_MINUPTO:
2931 case OP_MINUPTOI:
2932 min = 0;
2933 max = GET2(ecode, 1);
2934 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2935 ecode += 3;
2936 goto REPEATCHAR;
2937
2938 case OP_POSSTAR:
2939 case OP_POSSTARI:
2940 possessive = TRUE;
2941 min = 0;
2942 max = INT_MAX;
2943 ecode++;
2944 goto REPEATCHAR;
2945
2946 case OP_POSPLUS:
2947 case OP_POSPLUSI:
2948 possessive = TRUE;
2949 min = 1;
2950 max = INT_MAX;
2951 ecode++;
2952 goto REPEATCHAR;
2953
2954 case OP_POSQUERY:
2955 case OP_POSQUERYI:
2956 possessive = TRUE;
2957 min = 0;
2958 max = 1;
2959 ecode++;
2960 goto REPEATCHAR;
2961
2962 case OP_STAR:
2963 case OP_STARI:
2964 case OP_MINSTAR:
2965 case OP_MINSTARI:
2966 case OP_PLUS:
2967 case OP_PLUSI:
2968 case OP_MINPLUS:
2969 case OP_MINPLUSI:
2970 case OP_QUERY:
2971 case OP_QUERYI:
2972 case OP_MINQUERY:
2973 case OP_MINQUERYI:
2974 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
2975 minimize = (c & 1) != 0;
2976 min = rep_min[c]; /* Pick up values from tables; */
2977 max = rep_max[c]; /* zero for max => infinity */
2978 if (max == 0) max = INT_MAX;
2979
2980 /* Common code for all repeated single-character matches. */
2981
2982 REPEATCHAR:
2983 #ifdef SUPPORT_UTF8
2984 if (utf8)
2985 {
2986 length = 1;
2987 charptr = ecode;
2988 GETCHARLEN(fc, ecode, length);
2989 ecode += length;
2990
2991 /* Handle multibyte character matching specially here. There is
2992 support for caseless matching if UCP support is present. */
2993
2994 if (length > 1)
2995 {
2996 #ifdef SUPPORT_UCP
2997 unsigned int othercase;
2998 if (op >= OP_STARI && /* Caseless */
2999 (othercase = UCD_OTHERCASE(fc)) != fc)
3000 oclength = _pcre_ord2utf8(othercase, occhars);
3001 else oclength = 0;
3002 #endif /* SUPPORT_UCP */
3003
3004 for (i = 1; i <= min; i++)
3005 {
3006 if (eptr <= md->end_subject - length &&
3007 memcmp(eptr, charptr, length) == 0) eptr += length;
3008 #ifdef SUPPORT_UCP
3009 else if (oclength > 0 &&
3010 eptr <= md->end_subject - oclength &&
3011 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3012 #endif /* SUPPORT_UCP */
3013 else
3014 {
3015 CHECK_PARTIAL();
3016 MRRETURN(MATCH_NOMATCH);
3017 }
3018 }
3019
3020 if (min == max) continue;
3021
3022 if (minimize)
3023 {
3024 for (fi = min;; fi++)
3025 {
3026 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3027 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3028 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3029 if (eptr <= md->end_subject - length &&
3030 memcmp(eptr, charptr, length) == 0) eptr += length;
3031 #ifdef SUPPORT_UCP
3032 else if (oclength > 0 &&
3033 eptr <= md->end_subject - oclength &&
3034 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3035 #endif /* SUPPORT_UCP */
3036 else
3037 {
3038 CHECK_PARTIAL();
3039 MRRETURN(MATCH_NOMATCH);
3040 }
3041 }
3042 /* Control never gets here */
3043 }
3044
3045 else /* Maximize */
3046 {
3047 pp = eptr;
3048 for (i = min; i < max; i++)
3049 {
3050 if (eptr <= md->end_subject - length &&
3051 memcmp(eptr, charptr, length) == 0) eptr += length;
3052 #ifdef SUPPORT_UCP
3053 else if (oclength > 0 &&
3054 eptr <= md->end_subject - oclength &&
3055 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3056 #endif /* SUPPORT_UCP */
3057 else
3058 {
3059 CHECK_PARTIAL();
3060 break;
3061 }
3062 }
3063
3064 if (possessive) continue;
3065
3066 for(;;)
3067 {
3068 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3069 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3070 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3071 #ifdef SUPPORT_UCP
3072 eptr--;
3073 BACKCHAR(eptr);
3074 #else /* without SUPPORT_UCP */
3075 eptr -= length;
3076 #endif /* SUPPORT_UCP */
3077 }
3078 }
3079 /* Control never gets here */
3080 }
3081
3082 /* If the length of a UTF-8 character is 1, we fall through here, and
3083 obey the code as for non-UTF-8 characters below, though in this case the
3084 value of fc will always be < 128. */
3085 }
3086 else
3087 #endif /* SUPPORT_UTF8 */
3088
3089 /* When not in UTF-8 mode, load a single-byte character. */
3090
3091 fc = *ecode++;
3092
3093 /* The value of fc at this point is always less than 256, though we may or
3094 may not be in UTF-8 mode. The code is duplicated for the caseless and
3095 caseful cases, for speed, since matching characters is likely to be quite
3096 common. First, ensure the minimum number of matches are present. If min =
3097 max, continue at the same level without recursing. Otherwise, if
3098 minimizing, keep trying the rest of the expression and advancing one
3099 matching character if failing, up to the maximum. Alternatively, if
3100 maximizing, find the maximum number of characters and work backwards. */
3101
3102 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3103 max, eptr));
3104
3105 if (op >= OP_STARI) /* Caseless */
3106 {
3107 fc = md->lcc[fc];
3108 for (i = 1; i <= min; i++)
3109 {
3110 if (eptr >= md->end_subject)
3111 {
3112 SCHECK_PARTIAL();
3113 MRRETURN(MATCH_NOMATCH);
3114 }
3115 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3116 }
3117 if (min == max) continue;
3118 if (minimize)
3119 {
3120 for (fi = min;; fi++)
3121 {
3122 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3123 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3124 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3125 if (eptr >= md->end_subject)
3126 {
3127 SCHECK_PARTIAL();
3128 MRRETURN(MATCH_NOMATCH);
3129 }
3130 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3131 }
3132 /* Control never gets here */
3133 }
3134 else /* Maximize */
3135 {
3136 pp = eptr;
3137 for (i = min; i < max; i++)
3138 {
3139 if (eptr >= md->end_subject)
3140 {
3141 SCHECK_PARTIAL();
3142 break;
3143 }
3144 if (fc != md->lcc[*eptr]) break;
3145 eptr++;
3146 }
3147
3148 if (possessive) continue;
3149
3150 while (eptr >= pp)
3151 {
3152 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3153 eptr--;
3154 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3155 }
3156 MRRETURN(MATCH_NOMATCH);
3157 }
3158 /* Control never gets here */
3159 }
3160
3161 /* Caseful comparisons (includes all multi-byte characters) */
3162
3163 else
3164 {
3165 for (i = 1; i <= min; i++)
3166 {
3167 if (eptr >= md->end_subject)
3168 {
3169 SCHECK_PARTIAL();
3170 MRRETURN(MATCH_NOMATCH);
3171 }
3172 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3173 }
3174
3175 if (min == max) continue;
3176
3177 if (minimize)
3178 {
3179 for (fi = min;; fi++)
3180 {
3181 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3182 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3183 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3184 if (eptr >= md->end_subject)
3185 {
3186 SCHECK_PARTIAL();
3187 MRRETURN(MATCH_NOMATCH);
3188 }
3189 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3190 }
3191 /* Control never gets here */
3192 }
3193 else /* Maximize */
3194 {
3195 pp = eptr;
3196 for (i = min; i < max; i++)
3197 {
3198 if (eptr >= md->end_subject)
3199 {
3200 SCHECK_PARTIAL();
3201 break;
3202 }
3203 if (fc != *eptr) break;
3204 eptr++;
3205 }
3206 if (possessive) continue;
3207
3208 while (eptr >= pp)
3209 {
3210 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3211 eptr--;
3212 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3213 }
3214 MRRETURN(MATCH_NOMATCH);
3215 }
3216 }
3217 /* Control never gets here */
3218
3219 /* Match a negated single one-byte character. The pattern character is one
3220 byte, but the subject character it is checked against can be multibyte. */
3221
3222 case OP_NOT:
3223 case OP_NOTI:
3224 if (eptr >= md->end_subject)
3225 {
3226 SCHECK_PARTIAL();
3227 MRRETURN(MATCH_NOMATCH);
3228 }
3229 ecode++;
3230 GETCHARINCTEST(c, eptr);
3231 if (op == OP_NOTI) /* The caseless case */
3232 {
3233 #ifdef SUPPORT_UTF8
3234 if (c < 256)
3235 #endif
3236 c = md->lcc[c];
3237 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3238 }
3239 else /* Caseful */
3240 {
3241 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3242 }
3243 break;
3244
3245 /* Match a negated single one-byte character repeatedly. This is almost a
3246 repeat of the code for a repeated single character, but I haven't found a
3247 nice way of commoning these up that doesn't require a test of the
3248 positive/negative option for each character match. Maybe that wouldn't add
3249 very much to the time taken, but character matching *is* what this is all
3250 about... */
3251
3252 case OP_NOTEXACT:
3253 case OP_NOTEXACTI:
3254 min = max = GET2(ecode, 1);
3255 ecode += 3;
3256 goto REPEATNOTCHAR;
3257
3258 case OP_NOTUPTO:
3259 case OP_NOTUPTOI:
3260 case OP_NOTMINUPTO:
3261 case OP_NOTMINUPTOI:
3262 min = 0;
3263 max = GET2(ecode, 1);
3264 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3265 ecode += 3;
3266 goto REPEATNOTCHAR;
3267
3268 case OP_NOTPOSSTAR:
3269 case OP_NOTPOSSTARI:
3270 possessive = TRUE;
3271 min = 0;
3272 max = INT_MAX;
3273 ecode++;
3274 goto REPEATNOTCHAR;
3275
3276 case OP_NOTPOSPLUS:
3277 case OP_NOTPOSPLUSI:
3278 possessive = TRUE;
3279 min = 1;
3280 max = INT_MAX;
3281 ecode++;
3282 goto REPEATNOTCHAR;
3283
3284 case OP_NOTPOSQUERY:
3285 case OP_NOTPOSQUERYI:
3286 possessive = TRUE;
3287 min = 0;
3288 max = 1;
3289 ecode++;
3290 goto REPEATNOTCHAR;
3291
3292 case OP_NOTPOSUPTO:
3293 case OP_NOTPOSUPTOI:
3294 possessive = TRUE;
3295 min = 0;
3296 max = GET2(ecode, 1);
3297 ecode += 3;
3298 goto REPEATNOTCHAR;
3299
3300 case OP_NOTSTAR:
3301 case OP_NOTSTARI:
3302 case OP_NOTMINSTAR:
3303 case OP_NOTMINSTARI:
3304 case OP_NOTPLUS:
3305 case OP_NOTPLUSI:
3306 case OP_NOTMINPLUS:
3307 case OP_NOTMINPLUSI:
3308 case OP_NOTQUERY:
3309 case OP_NOTQUERYI:
3310 case OP_NOTMINQUERY:
3311 case OP_NOTMINQUERYI:
3312 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3313 minimize = (c & 1) != 0;
3314 min = rep_min[c]; /* Pick up values from tables; */
3315 max = rep_max[c]; /* zero for max => infinity */
3316 if (max == 0) max = INT_MAX;
3317
3318 /* Common code for all repeated single-byte matches. */
3319
3320 REPEATNOTCHAR:
3321 fc = *ecode++;
3322
3323 /* The code is duplicated for the caseless and caseful cases, for speed,
3324 since matching characters is likely to be quite common. First, ensure the
3325 minimum number of matches are present. If min = max, continue at the same
3326 level without recursing. Otherwise, if minimizing, keep trying the rest of
3327 the expression and advancing one matching character if failing, up to the
3328 maximum. Alternatively, if maximizing, find the maximum number of
3329 characters and work backwards. */
3330
3331 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3332 max, eptr));
3333
3334 if (op >= OP_NOTSTARI) /* Caseless */
3335 {
3336 fc = md->lcc[fc];
3337
3338 #ifdef SUPPORT_UTF8
3339 /* UTF-8 mode */
3340 if (utf8)
3341 {
3342 register unsigned int d;
3343 for (i = 1; i <= min; i++)
3344 {
3345 if (eptr >= md->end_subject)
3346 {
3347 SCHECK_PARTIAL();
3348 MRRETURN(MATCH_NOMATCH);
3349 }
3350 GETCHARINC(d, eptr);
3351 if (d < 256) d = md->lcc[d];
3352 if (fc == d) MRRETURN(MATCH_NOMATCH);
3353 }
3354 }
3355 else
3356 #endif
3357
3358 /* Not UTF-8 mode */
3359 {
3360 for (i = 1; i <= min; i++)
3361 {
3362 if (eptr >= md->end_subject)
3363 {
3364 SCHECK_PARTIAL();
3365 MRRETURN(MATCH_NOMATCH);
3366 }
3367 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3368 }
3369 }
3370
3371 if (min == max) continue;
3372
3373 if (minimize)
3374 {
3375 #ifdef SUPPORT_UTF8
3376 /* UTF-8 mode */
3377 if (utf8)
3378 {
3379 register unsigned int d;
3380 for (fi = min;; fi++)
3381 {
3382 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3383 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3384 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3385 if (eptr >= md->end_subject)
3386 {
3387 SCHECK_PARTIAL();
3388 MRRETURN(MATCH_NOMATCH);
3389 }
3390 GETCHARINC(d, eptr);
3391 if (d < 256) d = md->lcc[d];
3392 if (fc == d) MRRETURN(MATCH_NOMATCH);
3393 }
3394 }
3395 else
3396 #endif
3397 /* Not UTF-8 mode */
3398 {
3399 for (fi = min;; fi++)
3400 {
3401 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3402 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3403 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3404 if (eptr >= md->end_subject)
3405 {
3406 SCHECK_PARTIAL();
3407 MRRETURN(MATCH_NOMATCH);
3408 }
3409 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3410 }
3411 }
3412 /* Control never gets here */
3413 }
3414
3415 /* Maximize case */
3416
3417 else
3418 {
3419 pp = eptr;
3420
3421 #ifdef SUPPORT_UTF8
3422 /* UTF-8 mode */
3423 if (utf8)
3424 {
3425 register unsigned int d;
3426 for (i = min; i < max; i++)
3427 {
3428 int len = 1;
3429 if (eptr >= md->end_subject)
3430 {
3431 SCHECK_PARTIAL();
3432 break;
3433 }
3434 GETCHARLEN(d, eptr, len);
3435 if (d < 256) d = md->lcc[d];
3436 if (fc == d) break;
3437 eptr += len;
3438 }
3439 if (possessive) continue;
3440 for(;;)
3441 {
3442 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3443 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3444 if (eptr-- == pp) break; /* Stop if tried at original pos */
3445 BACKCHAR(eptr);
3446 }
3447 }
3448 else
3449 #endif
3450 /* Not UTF-8 mode */
3451 {
3452 for (i = min; i < max; i++)
3453 {
3454 if (eptr >= md->end_subject)
3455 {
3456 SCHECK_PARTIAL();
3457 break;
3458 }
3459 if (fc == md->lcc[*eptr]) break;
3460 eptr++;
3461 }
3462 if (possessive) continue;
3463 while (eptr >= pp)
3464 {
3465 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3466 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3467 eptr--;
3468 }
3469 }
3470
3471 MRRETURN(MATCH_NOMATCH);
3472 }
3473 /* Control never gets here */
3474 }
3475
3476 /* Caseful comparisons */
3477
3478 else
3479 {
3480 #ifdef SUPPORT_UTF8
3481 /* UTF-8 mode */
3482 if (utf8)
3483 {
3484 register unsigned int d;
3485 for (i = 1; i <= min; i++)
3486 {
3487 if (eptr >= md->end_subject)
3488 {
3489 SCHECK_PARTIAL();
3490 MRRETURN(MATCH_NOMATCH);
3491 }
3492 GETCHARINC(d, eptr);
3493 if (fc == d) MRRETURN(MATCH_NOMATCH);
3494 }
3495 }
3496 else
3497 #endif
3498 /* Not UTF-8 mode */
3499 {
3500 for (i = 1; i <= min; i++)
3501 {
3502 if (eptr >= md->end_subject)
3503 {
3504 SCHECK_PARTIAL();
3505 MRRETURN(MATCH_NOMATCH);
3506 }
3507 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3508 }
3509 }
3510
3511 if (min == max) continue;
3512
3513 if (minimize)
3514 {
3515 #ifdef SUPPORT_UTF8
3516 /* UTF-8 mode */
3517 if (utf8)
3518 {
3519 register unsigned int d;
3520 for (fi = min;; fi++)
3521 {
3522 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3523 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3524 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3525 if (eptr >= md->end_subject)
3526 {
3527 SCHECK_PARTIAL();
3528 MRRETURN(MATCH_NOMATCH);
3529 }
3530 GETCHARINC(d, eptr);
3531 if (fc == d) MRRETURN(MATCH_NOMATCH);
3532 }
3533 }
3534 else
3535 #endif
3536 /* Not UTF-8 mode */
3537 {
3538 for (fi = min;; fi++)
3539 {
3540 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3541 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3542 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3543 if (eptr >= md->end_subject)
3544 {
3545 SCHECK_PARTIAL();
3546 MRRETURN(MATCH_NOMATCH);
3547 }
3548 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3549 }
3550 }
3551 /* Control never gets here */
3552 }
3553
3554 /* Maximize case */
3555
3556 else
3557 {
3558 pp = eptr;
3559
3560 #ifdef SUPPORT_UTF8
3561 /* UTF-8 mode */
3562 if (utf8)
3563 {
3564 register unsigned int d;
3565 for (i = min; i < max; i++)
3566 {
3567 int len = 1;
3568 if (eptr >= md->end_subject)
3569 {
3570 SCHECK_PARTIAL();
3571 break;
3572 }
3573 GETCHARLEN(d, eptr, len);
3574 if (fc == d) break;
3575 eptr += len;
3576 }
3577 if (possessive) continue;
3578 for(;;)
3579 {
3580 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3581 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3582 if (eptr-- == pp) break; /* Stop if tried at original pos */
3583 BACKCHAR(eptr);
3584 }
3585 }
3586 else
3587 #endif
3588 /* Not UTF-8 mode */
3589 {
3590 for (i = min; i < max; i++)
3591 {
3592 if (eptr >= md->end_subject)
3593 {
3594 SCHECK_PARTIAL();
3595 break;
3596 }
3597 if (fc == *eptr) break;
3598 eptr++;
3599 }
3600 if (possessive) continue;
3601 while (eptr >= pp)
3602 {
3603 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3604 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3605 eptr--;
3606 }
3607 }
3608
3609 MRRETURN(MATCH_NOMATCH);
3610 }
3611 }
3612 /* Control never gets here */
3613
3614 /* Match a single character type repeatedly; several different opcodes
3615 share code. This is very similar to the code for single characters, but we
3616 repeat it in the interests of efficiency. */
3617
3618 case OP_TYPEEXACT:
3619 min = max = GET2(ecode, 1);
3620 minimize = TRUE;
3621 ecode += 3;
3622 goto REPEATTYPE;
3623
3624 case OP_TYPEUPTO:
3625 case OP_TYPEMINUPTO:
3626 min = 0;
3627 max = GET2(ecode, 1);
3628 minimize = *ecode == OP_TYPEMINUPTO;
3629 ecode += 3;
3630 goto REPEATTYPE;
3631
3632 case OP_TYPEPOSSTAR:
3633 possessive = TRUE;
3634 min = 0;
3635 max = INT_MAX;
3636 ecode++;
3637 goto REPEATTYPE;
3638
3639 case OP_TYPEPOSPLUS:
3640 possessive = TRUE;
3641 min = 1;
3642 max = INT_MAX;
3643 ecode++;
3644 goto REPEATTYPE;
3645
3646 case OP_TYPEPOSQUERY:
3647 possessive = TRUE;
3648 min = 0;
3649 max = 1;
3650 ecode++;
3651 goto REPEATTYPE;
3652
3653 case OP_TYPEPOSUPTO:
3654 possessive = TRUE;
3655 min = 0;
3656 max = GET2(ecode, 1);
3657 ecode += 3;
3658 goto REPEATTYPE;
3659
3660 case OP_TYPESTAR:
3661 case OP_TYPEMINSTAR:
3662 case OP_TYPEPLUS:
3663 case OP_TYPEMINPLUS:
3664 case OP_TYPEQUERY:
3665 case OP_TYPEMINQUERY:
3666 c = *ecode++ - OP_TYPESTAR;
3667 minimize = (c & 1) != 0;
3668 min = rep_min[c]; /* Pick up values from tables; */
3669 max = rep_max[c]; /* zero for max => infinity */
3670 if (max == 0) max = INT_MAX;
3671
3672 /* Common code for all repeated single character type matches. Note that
3673 in UTF-8 mode, '.' matches a character of any length, whereas OP_DIGIT,
3674 OP_WHITESPACE, and OP_WORDCHAR match only one-byte characters. */
3675
3676 REPEATTYPE:
3677 ctype = *ecode++; /* Code for the character type */
3678
3679 #ifdef SUPPORT_UCP
3680 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3681 {
3682 prop_fail_result = ctype == OP_NOTPROP;
3683 prop_type = *ecode++;
3684 prop_value = *ecode++;
3685 }
3686 else prop_type = -1;
3687 #endif
3688
3689 /* First, ensure the minimum number of matches are present. Use inline
3690 code for maximizing the speed, and do the type test once at the start
3691 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3692 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3693 and single-bytes. */
3694
3695 if (min > 0)
3696 {
3697 #ifdef SUPPORT_UCP
3698 if (prop_type >= 0)
3699 {
3700 switch(prop_type)
3701 {
3702 case PT_ANY:
3703 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3704 for (i = 1; i <= min; i++)
3705 {
3706 if (eptr >= md->end_subject)
3707 {
3708 SCHECK_PARTIAL();
3709 MRRETURN(MATCH_NOMATCH);
3710 }
3711 GETCHARINCTEST(c, eptr);
3712 }
3713 break;
3714
3715 case PT_LAMP:
3716 for (i = 1; i <= min; i++)
3717 {
3718 int chartype;
3719 if (eptr >= md->end_subject)
3720 {
3721 SCHECK_PARTIAL();
3722 MRRETURN(MATCH_NOMATCH);
3723 }
3724 GETCHARINCTEST(c, eptr);
3725 chartype = UCD_CHARTYPE(c);
3726 if ((chartype == ucp_Lu ||
3727 chartype == ucp_Ll ||
3728 chartype == ucp_Lt) == prop_fail_result)
3729 MRRETURN(MATCH_NOMATCH);
3730 }
3731 break;
3732
3733 case PT_GC:
3734 for (i = 1; i <= min; i++)
3735 {
3736 if (eptr >= md->end_subject)
3737 {
3738 SCHECK_PARTIAL();
3739 MRRETURN(MATCH_NOMATCH);
3740 }
3741 GETCHARINCTEST(c, eptr);
3742 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3743 MRRETURN(MATCH_NOMATCH);
3744 }
3745 break;
3746
3747 case PT_PC:
3748 for (i = 1; i <= min; i++)
3749 {
3750 if (eptr >= md->end_subject)
3751 {
3752 SCHECK_PARTIAL();
3753 MRRETURN(MATCH_NOMATCH);
3754 }
3755 GETCHARINCTEST(c, eptr);
3756 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3757 MRRETURN(MATCH_NOMATCH);
3758 }
3759 break;
3760
3761 case PT_SC:
3762 for (i = 1; i <= min; i++)
3763 {
3764 if (eptr >= md->end_subject)
3765 {
3766 SCHECK_PARTIAL();
3767 MRRETURN(MATCH_NOMATCH);
3768 }
3769 GETCHARINCTEST(c, eptr);
3770 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3771 MRRETURN(MATCH_NOMATCH);
3772 }
3773 break;
3774
3775 case PT_ALNUM:
3776 for (i = 1; i <= min; i++)
3777 {
3778 int category;
3779 if (eptr >= md->end_subject)
3780 {
3781 SCHECK_PARTIAL();
3782 MRRETURN(MATCH_NOMATCH);
3783 }
3784 GETCHARINCTEST(c, eptr);
3785 category = UCD_CATEGORY(c);
3786 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3787 MRRETURN(MATCH_NOMATCH);
3788 }
3789 break;
3790
3791 case PT_SPACE: /* Perl space */
3792 for (i = 1; i <= min; i++)
3793 {
3794 if (eptr >= md->end_subject)
3795 {
3796 SCHECK_PARTIAL();
3797 MRRETURN(MATCH_NOMATCH);
3798 }
3799 GETCHARINCTEST(c, eptr);
3800 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3801 c == CHAR_FF || c == CHAR_CR)
3802 == prop_fail_result)
3803 MRRETURN(MATCH_NOMATCH);
3804 }
3805 break;
3806
3807 case PT_PXSPACE: /* POSIX space */
3808 for (i = 1; i <= min; i++)
3809 {
3810 if (eptr >= md->end_subject)
3811 {
3812 SCHECK_PARTIAL();
3813 MRRETURN(MATCH_NOMATCH);
3814 }
3815 GETCHARINCTEST(c, eptr);
3816 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3817 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3818 == prop_fail_result)
3819 MRRETURN(MATCH_NOMATCH);
3820 }
3821 break;
3822
3823 case PT_WORD:
3824 for (i = 1; i <= min; i++)
3825 {
3826 int category;
3827 if (eptr >= md->end_subject)
3828 {
3829 SCHECK_PARTIAL();
3830 MRRETURN(MATCH_NOMATCH);
3831 }
3832 GETCHARINCTEST(c, eptr);
3833 category = UCD_CATEGORY(c);
3834 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
3835 == prop_fail_result)
3836 MRRETURN(MATCH_NOMATCH);
3837 }
3838 break;
3839
3840 /* This should not occur */
3841
3842 default:
3843 RRETURN(PCRE_ERROR_INTERNAL);
3844 }
3845 }
3846
3847 /* Match extended Unicode sequences. We will get here only if the
3848 support is in the binary; otherwise a compile-time error occurs. */
3849
3850 else if (ctype == OP_EXTUNI)
3851 {
3852 for (i = 1; i <= min; i++)
3853 {
3854 if (eptr >= md->end_subject)
3855 {
3856 SCHECK_PARTIAL();
3857 MRRETURN(MATCH_NOMATCH);
3858 }
3859 GETCHARINCTEST(c, eptr);
3860 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
3861 while (eptr < md->end_subject)
3862 {
3863 int len = 1;
3864 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
3865 if (UCD_CATEGORY(c) != ucp_M) break;
3866 eptr += len;
3867 }
3868 }
3869 }
3870
3871 else
3872 #endif /* SUPPORT_UCP */
3873
3874 /* Handle all other cases when the coding is UTF-8 */
3875
3876 #ifdef SUPPORT_UTF8
3877 if (utf8) switch(ctype)
3878 {
3879 case OP_ANY:
3880 for (i = 1; i <= min; i++)
3881 {
3882 if (eptr >= md->end_subject)
3883 {
3884 SCHECK_PARTIAL();
3885 MRRETURN(MATCH_NOMATCH);
3886 }
3887 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3888 eptr++;
3889 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3890 }
3891 break;
3892
3893 case OP_ALLANY:
3894 for (i = 1; i <= min; i++)
3895 {
3896 if (eptr >= md->end_subject)
3897 {
3898 SCHECK_PARTIAL();
3899 MRRETURN(MATCH_NOMATCH);
3900 }
3901 eptr++;
3902 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3903 }
3904 break;
3905
3906 case OP_ANYBYTE:
3907 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3908 eptr += min;
3909 break;
3910
3911 case OP_ANYNL:
3912 for (i = 1; i <= min; i++)
3913 {
3914 if (eptr >= md->end_subject)
3915 {
3916 SCHECK_PARTIAL();
3917 MRRETURN(MATCH_NOMATCH);
3918 }
3919 GETCHARINC(c, eptr);
3920 switch(c)
3921 {
3922 default: MRRETURN(MATCH_NOMATCH);
3923
3924 case 0x000d:
3925 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3926 break;
3927
3928 case 0x000a:
3929 break;
3930
3931 case 0x000b:
3932 case 0x000c:
3933 case 0x0085:
3934 case 0x2028:
3935 case 0x2029:
3936 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3937 break;
3938 }
3939 }
3940 break;
3941
3942 case OP_NOT_HSPACE:
3943 for (i = 1; i <= min; i++)
3944 {
3945 if (eptr >= md->end_subject)
3946 {
3947 SCHECK_PARTIAL();
3948 MRRETURN(MATCH_NOMATCH);
3949 }
3950 GETCHARINC(c, eptr);
3951 switch(c)
3952 {
3953 default: break;
3954 case 0x09: /* HT */
3955 case 0x20: /* SPACE */
3956 case 0xa0: /* NBSP */
3957 case 0x1680: /* OGHAM SPACE MARK */
3958 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3959 case 0x2000: /* EN QUAD */
3960 case 0x2001: /* EM QUAD */
3961 case 0x2002: /* EN SPACE */
3962 case 0x2003: /* EM SPACE */
3963 case 0x2004: /* THREE-PER-EM SPACE */
3964 case 0x2005: /* FOUR-PER-EM SPACE */
3965 case 0x2006: /* SIX-PER-EM SPACE */
3966 case 0x2007: /* FIGURE SPACE */
3967 case 0x2008: /* PUNCTUATION SPACE */
3968 case 0x2009: /* THIN SPACE */
3969 case 0x200A: /* HAIR SPACE */
3970 case 0x202f: /* NARROW NO-BREAK SPACE */
3971 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3972 case 0x3000: /* IDEOGRAPHIC SPACE */
3973 MRRETURN(MATCH_NOMATCH);
3974 }
3975 }
3976 break;
3977
3978 case OP_HSPACE:
3979 for (i = 1; i <= min; i++)
3980 {
3981 if (eptr >= md->end_subject)
3982 {
3983 SCHECK_PARTIAL();
3984 MRRETURN(MATCH_NOMATCH);
3985 }
3986 GETCHARINC(c, eptr);
3987 switch(c)
3988 {
3989 default: MRRETURN(MATCH_NOMATCH);
3990 case 0x09: /* HT */
3991 case 0x20: /* SPACE */
3992 case 0xa0: /* NBSP */
3993 case 0x1680: /* OGHAM SPACE MARK */
3994 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3995 case 0x2000: /* EN QUAD */
3996 case 0x2001: /* EM QUAD */
3997 case 0x2002: /* EN SPACE */
3998 case 0x2003: /* EM SPACE */
3999 case 0x2004: /* THREE-PER-EM SPACE */
4000 case 0x2005: /* FOUR-PER-EM SPACE */
4001 case 0x2006: /* SIX-PER-EM SPACE */
4002 case 0x2007: /* FIGURE SPACE */
4003 case 0x2008: /* PUNCTUATION SPACE */
4004 case 0x2009: /* THIN SPACE */
4005 case 0x200A: /* HAIR SPACE */
4006 case 0x202f: /* NARROW NO-BREAK SPACE */
4007 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4008 case 0x3000: /* IDEOGRAPHIC SPACE */
4009 break;
4010 }
4011 }
4012 break;
4013
4014 case OP_NOT_VSPACE:
4015 for (i = 1; i <= min; i++)
4016 {
4017 if (eptr >= md->end_subject)
4018 {
4019 SCHECK_PARTIAL();
4020 MRRETURN(MATCH_NOMATCH);
4021 }
4022 GETCHARINC(c, eptr);
4023 switch(c)
4024 {
4025 default: break;
4026 case 0x0a: /* LF */
4027 case 0x0b: /* VT */
4028 case 0x0c: /* FF */
4029 case 0x0d: /* CR */
4030 case 0x85: /* NEL */
4031 case 0x2028: /* LINE SEPARATOR */
4032 case 0x2029: /* PARAGRAPH SEPARATOR */
4033 MRRETURN(MATCH_NOMATCH);
4034 }
4035 }
4036 break;
4037
4038 case OP_VSPACE:
4039 for (i = 1; i <= min; i++)
4040 {
4041 if (eptr >= md->end_subject)
4042 {
4043 SCHECK_PARTIAL();
4044 MRRETURN(MATCH_NOMATCH);
4045 }
4046 GETCHARINC(c, eptr);
4047 switch(c)
4048 {
4049 default: MRRETURN(MATCH_NOMATCH);
4050 case 0x0a: /* LF */
4051 case 0x0b: /* VT */
4052 case 0x0c: /* FF */
4053 case 0x0d: /* CR */
4054 case 0x85: /* NEL */
4055 case 0x2028: /* LINE SEPARATOR */
4056 case 0x2029: /* PARAGRAPH SEPARATOR */
4057 break;
4058 }
4059 }
4060 break;
4061
4062 case OP_NOT_DIGIT:
4063 for (i = 1; i <= min; i++)
4064 {
4065 if (eptr >= md->end_subject)
4066 {
4067 SCHECK_PARTIAL();
4068 MRRETURN(MATCH_NOMATCH);
4069 }
4070 GETCHARINC(c, eptr);
4071 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4072 MRRETURN(MATCH_NOMATCH);
4073 }
4074 break;
4075
4076 case OP_DIGIT:
4077 for (i = 1; i <= min; i++)
4078 {
4079 if (eptr >= md->end_subject)
4080 {
4081 SCHECK_PARTIAL();
4082 MRRETURN(MATCH_NOMATCH);
4083 }
4084 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4085 MRRETURN(MATCH_NOMATCH);
4086 /* No need to skip more bytes - we know it's a 1-byte character */
4087 }
4088 break;
4089
4090 case OP_NOT_WHITESPACE:
4091 for (i = 1; i <= min; i++)
4092 {
4093 if (eptr >= md->end_subject)
4094 {
4095 SCHECK_PARTIAL();
4096 MRRETURN(MATCH_NOMATCH);
4097 }
4098 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4099 MRRETURN(MATCH_NOMATCH);
4100 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4101 }
4102 break;
4103
4104 case OP_WHITESPACE:
4105 for (i = 1; i <= min; i++)
4106 {
4107 if (eptr >= md->end_subject)
4108 {
4109 SCHECK_PARTIAL();
4110 MRRETURN(MATCH_NOMATCH);
4111 }
4112 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4113 MRRETURN(MATCH_NOMATCH);
4114 /* No need to skip more bytes - we know it's a 1-byte character */
4115 }
4116 break;
4117
4118 case OP_NOT_WORDCHAR:
4119 for (i = 1; i <= min; i++)
4120 {
4121 if (eptr >= md->end_subject)
4122 {
4123 SCHECK_PARTIAL();
4124 MRRETURN(MATCH_NOMATCH);
4125 }
4126 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4127 MRRETURN(MATCH_NOMATCH);
4128 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4129 }
4130 break;
4131
4132 case OP_WORDCHAR:
4133 for (i = 1; i <= min; i++)
4134 {
4135 if (eptr >= md->end_subject)
4136 {
4137 SCHECK_PARTIAL();
4138 MRRETURN(MATCH_NOMATCH);
4139 }
4140 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4141 MRRETURN(MATCH_NOMATCH);
4142 /* No need to skip more bytes - we know it's a 1-byte character */
4143 }
4144 break;
4145
4146 default:
4147 RRETURN(PCRE_ERROR_INTERNAL);
4148 } /* End switch(ctype) */
4149
4150 else
4151 #endif /* SUPPORT_UTF8 */
4152
4153 /* Code for the non-UTF-8 case for minimum matching of operators other
4154 than OP_PROP and OP_NOTPROP. */
4155
4156 switch(ctype)
4157 {
4158 case OP_ANY:
4159 for (i = 1; i <= min; i++)
4160 {
4161 if (eptr >= md->end_subject)
4162 {
4163 SCHECK_PARTIAL();
4164 MRRETURN(MATCH_NOMATCH);
4165 }
4166 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4167 eptr++;
4168 }
4169 break;
4170
4171 case OP_ALLANY:
4172 if (eptr > md->end_subject - min)
4173 {
4174 SCHECK_PARTIAL();
4175 MRRETURN(MATCH_NOMATCH);
4176 }
4177 eptr += min;
4178 break;
4179
4180 case OP_ANYBYTE:
4181 if (eptr > md->end_subject - min)
4182 {
4183 SCHECK_PARTIAL();
4184 MRRETURN(MATCH_NOMATCH);
4185 }
4186 eptr += min;
4187 break;
4188
4189 case OP_ANYNL:
4190 for (i = 1; i <= min; i++)
4191 {
4192 if (eptr >= md->end_subject)
4193 {
4194 SCHECK_PARTIAL();
4195 MRRETURN(MATCH_NOMATCH);
4196 }
4197 switch(*eptr++)
4198 {
4199 default: MRRETURN(MATCH_NOMATCH);
4200
4201 case 0x000d:
4202 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4203 break;
4204
4205 case 0x000a:
4206 break;
4207
4208 case 0x000b:
4209 case 0x000c:
4210 case 0x0085:
4211 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4212 break;
4213 }
4214 }
4215 break;
4216
4217 case OP_NOT_HSPACE:
4218 for (i = 1; i <= min; i++)
4219 {
4220 if (eptr >= md->end_subject)
4221 {
4222 SCHECK_PARTIAL();
4223 MRRETURN(MATCH_NOMATCH);
4224 }
4225 switch(*eptr++)
4226 {
4227 default: break;
4228 case 0x09: /* HT */
4229 case 0x20: /* SPACE */
4230 case 0xa0: /* NBSP */
4231 MRRETURN(MATCH_NOMATCH);
4232 }
4233 }
4234 break;
4235
4236 case OP_HSPACE:
4237 for (i = 1; i <= min; i++)
4238 {
4239 if (eptr >= md->end_subject)
4240 {
4241 SCHECK_PARTIAL();
4242 MRRETURN(MATCH_NOMATCH);
4243 }
4244 switch(*eptr++)
4245 {
4246 default: MRRETURN(MATCH_NOMATCH);
4247 case 0x09: /* HT */
4248 case 0x20: /* SPACE */
4249 case 0xa0: /* NBSP */
4250 break;
4251 }
4252 }
4253 break;
4254
4255 case OP_NOT_VSPACE:
4256 for (i = 1; i <= min; i++)
4257 {
4258 if (eptr >= md->end_subject)
4259 {
4260 SCHECK_PARTIAL();
4261 MRRETURN(MATCH_NOMATCH);
4262 }
4263 switch(*eptr++)
4264 {
4265 default: break;
4266 case 0x0a: /* LF */
4267 case 0x0b: /* VT */
4268 case 0x0c: /* FF */
4269 case 0x0d: /* CR */
4270 case 0x85: /* NEL */
4271 MRRETURN(MATCH_NOMATCH);
4272 }
4273 }
4274 break;
4275
4276 case OP_VSPACE:
4277 for (i = 1; i <= min; i++)
4278 {
4279 if (eptr >= md->end_subject)
4280 {
4281 SCHECK_PARTIAL();
4282 MRRETURN(MATCH_NOMATCH);
4283 }
4284 switch(*eptr++)
4285 {
4286 default: MRRETURN(MATCH_NOMATCH);
4287 case 0x0a: /* LF */
4288 case 0x0b: /* VT */
4289 case 0x0c: /* FF */
4290 case 0x0d: /* CR */
4291 case 0x85: /* NEL */
4292 break;
4293 }
4294 }
4295 break;
4296
4297 case OP_NOT_DIGIT:
4298 for (i = 1; i <= min; i++)
4299 {
4300 if (eptr >= md->end_subject)
4301 {
4302 SCHECK_PARTIAL();
4303 MRRETURN(MATCH_NOMATCH);
4304 }
4305 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4306 }
4307 break;
4308
4309 case OP_DIGIT:
4310 for (i = 1; i <= min; i++)
4311 {
4312 if (eptr >= md->end_subject)
4313 {
4314 SCHECK_PARTIAL();
4315 MRRETURN(MATCH_NOMATCH);
4316 }
4317 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4318 }
4319 break;
4320
4321 case OP_NOT_WHITESPACE:
4322 for (i = 1; i <= min; i++)
4323 {
4324 if (eptr >= md->end_subject)
4325 {
4326 SCHECK_PARTIAL();
4327 MRRETURN(MATCH_NOMATCH);
4328 }
4329 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4330 }
4331 break;
4332
4333 case OP_WHITESPACE:
4334 for (i = 1; i <= min; i++)
4335 {
4336 if (eptr >= md->end_subject)
4337 {
4338 SCHECK_PARTIAL();
4339 MRRETURN(MATCH_NOMATCH);
4340 }
4341 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4342 }
4343 break;
4344
4345 case OP_NOT_WORDCHAR:
4346 for (i = 1; i <= min; i++)
4347 {
4348 if (eptr >= md->end_subject)
4349 {
4350 SCHECK_PARTIAL();
4351 MRRETURN(MATCH_NOMATCH);
4352 }
4353 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4354 MRRETURN(MATCH_NOMATCH);
4355 }
4356 break;
4357
4358 case OP_WORDCHAR:
4359 for (i = 1; i <= min; i++)
4360 {
4361 if (eptr >= md->end_subject)
4362 {
4363 SCHECK_PARTIAL();
4364 MRRETURN(MATCH_NOMATCH);
4365 }
4366 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4367 MRRETURN(MATCH_NOMATCH);
4368 }
4369 break;
4370
4371 default:
4372 RRETURN(PCRE_ERROR_INTERNAL);
4373 }
4374 }
4375
4376 /* If min = max, continue at the same level without recursing */
4377
4378 if (min == max) continue;
4379
4380 /* If minimizing, we have to test the rest of the pattern before each
4381 subsequent match. Again, separate the UTF-8 case for speed, and also
4382 separate the UCP cases. */
4383
4384 if (minimize)
4385 {
4386 #ifdef SUPPORT_UCP
4387 if (prop_type >= 0)
4388 {
4389 switch(prop_type)
4390 {
4391 case PT_ANY:
4392 for (fi = min;; fi++)
4393 {
4394 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4395 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4396 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4397 if (eptr >= md->end_subject)
4398 {
4399 SCHECK_PARTIAL();
4400 MRRETURN(MATCH_NOMATCH);
4401 }
4402 GETCHARINCTEST(c, eptr);
4403 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4404 }
4405 /* Control never gets here */
4406
4407 case PT_LAMP:
4408 for (fi = min;; fi++)
4409 {
4410 int chartype;
4411 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4412 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4413 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4414 if (eptr >= md->end_subject)
4415 {
4416 SCHECK_PARTIAL();
4417 MRRETURN(MATCH_NOMATCH);
4418 }
4419 GETCHARINCTEST(c, eptr);
4420 chartype = UCD_CHARTYPE(c);
4421 if ((chartype == ucp_Lu ||
4422 chartype == ucp_Ll ||
4423 chartype == ucp_Lt) == prop_fail_result)
4424 MRRETURN(MATCH_NOMATCH);
4425 }
4426 /* Control never gets here */
4427
4428 case PT_GC:
4429 for (fi = min;; fi++)
4430 {
4431 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4432 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4433 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4434 if (eptr >= md->end_subject)
4435 {
4436 SCHECK_PARTIAL();
4437 MRRETURN(MATCH_NOMATCH);
4438 }
4439 GETCHARINCTEST(c, eptr);
4440 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4441 MRRETURN(MATCH_NOMATCH);
4442 }
4443 /* Control never gets here */
4444
4445 case PT_PC:
4446 for (fi = min;; fi++)
4447 {
4448 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4449 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4450 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4451 if (eptr >= md->end_subject)
4452 {
4453 SCHECK_PARTIAL();
4454 MRRETURN(MATCH_NOMATCH);
4455 }
4456 GETCHARINCTEST(c, eptr);
4457 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4458 MRRETURN(MATCH_NOMATCH);
4459 }
4460 /* Control never gets here */
4461
4462 case PT_SC:
4463 for (fi = min;; fi++)
4464 {
4465 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4466 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4467 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4468 if (eptr >= md->end_subject)
4469 {
4470 SCHECK_PARTIAL();
4471 MRRETURN(MATCH_NOMATCH);
4472 }
4473 GETCHARINCTEST(c, eptr);
4474 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4475 MRRETURN(MATCH_NOMATCH);
4476 }
4477 /* Control never gets here */
4478
4479 case PT_ALNUM:
4480 for (fi = min;; fi++)
4481 {
4482 int category;
4483 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4484 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4485 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4486 if (eptr >= md->end_subject)
4487 {
4488 SCHECK_PARTIAL();
4489 MRRETURN(MATCH_NOMATCH);
4490 }
4491 GETCHARINCTEST(c, eptr);
4492 category = UCD_CATEGORY(c);
4493 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4494 MRRETURN(MATCH_NOMATCH);
4495 }
4496 /* Control never gets here */
4497
4498 case PT_SPACE: /* Perl space */
4499 for (fi = min;; fi++)
4500 {
4501 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4502 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4503 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4504 if (eptr >= md->end_subject)
4505 {
4506 SCHECK_PARTIAL();
4507 MRRETURN(MATCH_NOMATCH);
4508 }
4509 GETCHARINCTEST(c, eptr);
4510 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4511 c == CHAR_FF || c == CHAR_CR)
4512 == prop_fail_result)
4513 MRRETURN(MATCH_NOMATCH);
4514 }
4515 /* Control never gets here */
4516
4517 case PT_PXSPACE: /* POSIX space */
4518 for (fi = min;; fi++)
4519 {
4520 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4521 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4522 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4523 if (eptr >= md->end_subject)
4524 {
4525 SCHECK_PARTIAL();
4526 MRRETURN(MATCH_NOMATCH);
4527 }
4528 GETCHARINCTEST(c, eptr);
4529 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4530 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4531 == prop_fail_result)
4532 MRRETURN(MATCH_NOMATCH);
4533 }
4534 /* Control never gets here */
4535
4536 case PT_WORD:
4537 for (fi = min;; fi++)
4538 {
4539 int category;
4540 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4541 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4542 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4543 if (eptr >= md->end_subject)
4544 {
4545 SCHECK_PARTIAL();
4546 MRRETURN(MATCH_NOMATCH);
4547 }
4548 GETCHARINCTEST(c, eptr);
4549 category = UCD_CATEGORY(c);
4550 if ((category == ucp_L ||
4551 category == ucp_N ||
4552 c == CHAR_UNDERSCORE)
4553 == prop_fail_result)
4554 MRRETURN(MATCH_NOMATCH);
4555 }
4556 /* Control never gets here */
4557
4558 /* This should never occur */
4559
4560 default:
4561 RRETURN(PCRE_ERROR_INTERNAL);
4562 }
4563 }
4564
4565 /* Match extended Unicode sequences. We will get here only if the
4566 support is in the binary; otherwise a compile-time error occurs. */
4567
4568 else if (ctype == OP_EXTUNI)
4569 {
4570 for (fi = min;; fi++)
4571 {
4572 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4573 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4574 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4575 if (eptr >= md->end_subject)
4576 {
4577 SCHECK_PARTIAL();
4578 MRRETURN(MATCH_NOMATCH);
4579 }
4580 GETCHARINCTEST(c, eptr);
4581 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4582 while (eptr < md->end_subject)
4583 {
4584 int len = 1;
4585 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4586 if (UCD_CATEGORY(c) != ucp_M) break;
4587 eptr += len;
4588 }
4589 }
4590 }
4591 else
4592 #endif /* SUPPORT_UCP */
4593
4594 #ifdef SUPPORT_UTF8
4595 /* UTF-8 mode */
4596 if (utf8)
4597 {
4598 for (fi = min;; fi++)
4599 {
4600 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4601 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4602 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4603 if (eptr >= md->end_subject)
4604 {
4605 SCHECK_PARTIAL();
4606 MRRETURN(MATCH_NOMATCH);
4607 }
4608 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4609 MRRETURN(MATCH_NOMATCH);
4610 GETCHARINC(c, eptr);
4611 switch(ctype)
4612 {
4613 case OP_ANY: /* This is the non-NL case */
4614 case OP_ALLANY:
4615 case OP_ANYBYTE:
4616 break;
4617
4618 case OP_ANYNL:
4619 switch(c)
4620 {
4621 default: MRRETURN(MATCH_NOMATCH);
4622 case 0x000d:
4623 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4624 break;
4625 case 0x000a:
4626 break;
4627
4628 case 0x000b:
4629 case 0x000c:
4630 case 0x0085:
4631 case 0x2028:
4632 case 0x2029:
4633 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4634 break;
4635 }
4636 break;
4637
4638 case OP_NOT_HSPACE:
4639 switch(c)
4640 {
4641 default: break;
4642 case 0x09: /* HT */
4643 case 0x20: /* SPACE */
4644 case 0xa0: /* NBSP */
4645 case 0x1680: /* OGHAM SPACE MARK */
4646 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4647 case 0x2000: /* EN QUAD */
4648 case 0x2001: /* EM QUAD */
4649 case 0x2002: /* EN SPACE */
4650 case 0x2003: /* EM SPACE */
4651 case 0x2004: /* THREE-PER-EM SPACE */
4652 case 0x2005: /* FOUR-PER-EM SPACE */
4653 case 0x2006: /* SIX-PER-EM SPACE */
4654 case 0x2007: /* FIGURE SPACE */
4655 case 0x2008: /* PUNCTUATION SPACE */
4656 case 0x2009: /* THIN SPACE */
4657 case 0x200A: /* HAIR SPACE */
4658 case 0x202f: /* NARROW NO-BREAK SPACE */
4659 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4660 case 0x3000: /* IDEOGRAPHIC SPACE */
4661 MRRETURN(MATCH_NOMATCH);
4662 }
4663 break;
4664
4665 case OP_HSPACE:
4666 switch(c)
4667 {
4668 default: MRRETURN(MATCH_NOMATCH);
4669 case 0x09: /* HT */
4670 case 0x20: /* SPACE */
4671 case 0xa0: /* NBSP */
4672 case 0x1680: /* OGHAM SPACE MARK */
4673 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4674 case 0x2000: /* EN QUAD */
4675 case 0x2001: /* EM QUAD */
4676 case 0x2002: /* EN SPACE */
4677 case 0x2003: /* EM SPACE */
4678 case 0x2004: /* THREE-PER-EM SPACE */
4679 case 0x2005: /* FOUR-PER-EM SPACE */
4680 case 0x2006: /* SIX-PER-EM SPACE */
4681 case 0x2007: /* FIGURE SPACE */
4682 case 0x2008: /* PUNCTUATION SPACE */
4683 case 0x2009: /* THIN SPACE */
4684 case 0x200A: /* HAIR SPACE */
4685 case 0x202f: /* NARROW NO-BREAK SPACE */
4686 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4687 case 0x3000: /* IDEOGRAPHIC SPACE */
4688 break;
4689 }
4690 break;
4691
4692 case OP_NOT_VSPACE:
4693 switch(c)
4694 {
4695 default: break;
4696 case 0x0a: /* LF */
4697 case 0x0b: /* VT */
4698 case 0x0c: /* FF */
4699 case 0x0d: /* CR */
4700 case 0x85: /* NEL */
4701 case 0x2028: /* LINE SEPARATOR */
4702 case 0x2029: /* PARAGRAPH SEPARATOR */
4703 MRRETURN(MATCH_NOMATCH);
4704 }
4705 break;
4706
4707 case OP_VSPACE:
4708 switch(c)
4709 {
4710 default: MRRETURN(MATCH_NOMATCH);
4711 case 0x0a: /* LF */
4712 case 0x0b: /* VT */
4713 case 0x0c: /* FF */
4714 case 0x0d: /* CR */
4715 case 0x85: /* NEL */
4716 case 0x2028: /* LINE SEPARATOR */
4717 case 0x2029: /* PARAGRAPH SEPARATOR */
4718 break;
4719 }
4720 break;
4721
4722 case OP_NOT_DIGIT:
4723 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4724 MRRETURN(MATCH_NOMATCH);
4725 break;
4726
4727 case OP_DIGIT:
4728 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4729 MRRETURN(MATCH_NOMATCH);
4730 break;
4731
4732 case OP_NOT_WHITESPACE:
4733 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4734 MRRETURN(MATCH_NOMATCH);
4735 break;
4736
4737 case OP_WHITESPACE:
4738 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4739 MRRETURN(MATCH_NOMATCH);
4740 break;
4741
4742 case OP_NOT_WORDCHAR:
4743 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4744 MRRETURN(MATCH_NOMATCH);
4745 break;
4746
4747 case OP_WORDCHAR:
4748 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4749 MRRETURN(MATCH_NOMATCH);
4750 break;
4751
4752 default:
4753 RRETURN(PCRE_ERROR_INTERNAL);
4754 }
4755 }
4756 }
4757 else
4758 #endif
4759 /* Not UTF-8 mode */
4760 {
4761 for (fi = min;; fi++)
4762 {
4763 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4764 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4765 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4766 if (eptr >= md->end_subject)
4767 {
4768 SCHECK_PARTIAL();
4769 MRRETURN(MATCH_NOMATCH);
4770 }
4771 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4772 MRRETURN(MATCH_NOMATCH);
4773 c = *eptr++;
4774 switch(ctype)
4775 {
4776 case OP_ANY: /* This is the non-NL case */
4777 case OP_ALLANY:
4778 case OP_ANYBYTE:
4779 break;
4780
4781 case OP_ANYNL:
4782 switch(c)
4783 {
4784 default: MRRETURN(MATCH_NOMATCH);
4785 case 0x000d:
4786 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4787 break;
4788
4789 case 0x000a:
4790 break;
4791
4792 case 0x000b:
4793 case 0x000c:
4794 case 0x0085:
4795 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4796 break;
4797 }
4798 break;
4799
4800 case OP_NOT_HSPACE:
4801 switch(c)
4802 {
4803 default: break;
4804 case 0x09: /* HT */
4805 case 0x20: /* SPACE */
4806 case 0xa0: /* NBSP */
4807 MRRETURN(MATCH_NOMATCH);
4808 }
4809 break;
4810
4811 case OP_HSPACE:
4812 switch(c)
4813 {
4814 default: MRRETURN(MATCH_NOMATCH);
4815 case 0x09: /* HT */
4816 case 0x20: /* SPACE */
4817 case 0xa0: /* NBSP */
4818 break;
4819 }
4820 break;
4821
4822 case OP_NOT_VSPACE:
4823 switch(c)
4824 {
4825 default: break;
4826 case 0x0a: /* LF */
4827 case 0x0b: /* VT */
4828 case 0x0c: /* FF */
4829 case 0x0d: /* CR */
4830 case 0x85: /* NEL */
4831 MRRETURN(MATCH_NOMATCH);
4832 }
4833 break;
4834
4835 case OP_VSPACE:
4836 switch(c)
4837 {
4838 default: MRRETURN(MATCH_NOMATCH);
4839 case 0x0a: /* LF */
4840 case 0x0b: /* VT */
4841 case 0x0c: /* FF */
4842 case 0x0d: /* CR */
4843 case 0x85: /* NEL */
4844 break;
4845 }
4846 break;
4847
4848 case OP_NOT_DIGIT:
4849 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4850 break;
4851
4852 case OP_DIGIT:
4853 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4854 break;
4855
4856 case OP_NOT_WHITESPACE:
4857 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4858 break;
4859
4860 case OP_WHITESPACE:
4861 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4862 break;
4863
4864 case OP_NOT_WORDCHAR:
4865 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4866 break;
4867
4868 case OP_WORDCHAR:
4869 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4870 break;
4871
4872 default:
4873 RRETURN(PCRE_ERROR_INTERNAL);
4874 }
4875 }
4876 }
4877 /* Control never gets here */
4878 }
4879
4880 /* If maximizing, it is worth using inline code for speed, doing the type
4881 test once at the start (i.e. keep it out of the loop). Again, keep the
4882 UTF-8 and UCP stuff separate. */
4883
4884 else
4885 {
4886 pp = eptr; /* Remember where we started */
4887
4888 #ifdef SUPPORT_UCP
4889 if (prop_type >= 0)
4890 {
4891 switch(prop_type)
4892 {
4893 case PT_ANY:
4894 for (i = min; i < max; i++)
4895 {
4896 int len = 1;
4897 if (eptr >= md->end_subject)
4898 {
4899 SCHECK_PARTIAL();
4900 break;
4901 }
4902 GETCHARLENTEST(c, eptr, len);
4903 if (prop_fail_result) break;
4904 eptr+= len;
4905 }
4906 break;
4907
4908 case PT_LAMP:
4909 for (i = min; i < max; i++)
4910 {
4911 int chartype;
4912 int len = 1;
4913 if (eptr >= md->end_subject)
4914 {
4915 SCHECK_PARTIAL();
4916 break;
4917 }
4918 GETCHARLENTEST(c, eptr, len);
4919 chartype = UCD_CHARTYPE(c);
4920 if ((chartype == ucp_Lu ||
4921 chartype == ucp_Ll ||
4922 chartype == ucp_Lt) == prop_fail_result)
4923 break;
4924 eptr+= len;
4925 }
4926 break;
4927
4928 case PT_GC:
4929 for (i = min; i < max; i++)
4930 {
4931 int len = 1;
4932 if (eptr >= md->end_subject)
4933 {
4934 SCHECK_PARTIAL();
4935 break;
4936 }
4937 GETCHARLENTEST(c, eptr, len);
4938 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
4939 eptr+= len;
4940 }
4941 break;
4942
4943 case PT_PC:
4944 for (i = min; i < max; i++)
4945 {
4946 int len = 1;
4947 if (eptr >= md->end_subject)
4948 {
4949 SCHECK_PARTIAL();
4950 break;
4951 }
4952 GETCHARLENTEST(c, eptr, len);
4953 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
4954 eptr+= len;
4955 }
4956 break;
4957
4958 case PT_SC:
4959 for (i = min; i < max; i++)
4960 {
4961 int len = 1;
4962 if (eptr >= md->end_subject)
4963 {
4964 SCHECK_PARTIAL();
4965 break;
4966 }
4967 GETCHARLENTEST(c, eptr, len);
4968 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
4969 eptr+= len;
4970 }
4971 break;
4972
4973 case PT_ALNUM:
4974 for (i = min; i < max; i++)
4975 {
4976 int category;
4977 int len = 1;
4978 if (eptr >= md->end_subject)
4979 {
4980 SCHECK_PARTIAL();
4981 break;
4982 }
4983 GETCHARLENTEST(c, eptr, len);
4984 category = UCD_CATEGORY(c);
4985 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4986 break;
4987 eptr+= len;
4988 }
4989 break;
4990
4991 case PT_SPACE: /* Perl space */
4992 for (i = min; i < max; i++)
4993 {
4994 int len = 1;
4995 if (eptr >= md->end_subject)
4996 {
4997 SCHECK_PARTIAL();
4998 break;
4999 }
5000 GETCHARLENTEST(c, eptr, len);
5001 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5002 c == CHAR_FF || c == CHAR_CR)
5003 == prop_fail_result)
5004 break;
5005 eptr+= len;
5006 }
5007 break;
5008
5009 case PT_PXSPACE: /* POSIX space */
5010 for (i = min; i < max; i++)
5011 {
5012 int len = 1;
5013 if (eptr >= md->end_subject)
5014 {
5015 SCHECK_PARTIAL();
5016 break;
5017 }
5018 GETCHARLENTEST(c, eptr, len);
5019 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5020 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5021 == prop_fail_result)
5022 break;
5023 eptr+= len;
5024 }
5025 break;
5026
5027 case PT_WORD:
5028 for (i = min; i < max; i++)
5029 {
5030 int category;
5031 int len = 1;
5032 if (eptr >= md->end_subject)
5033 {
5034 SCHECK_PARTIAL();
5035 break;
5036 }
5037 GETCHARLENTEST(c, eptr, len);
5038 category = UCD_CATEGORY(c);
5039 if ((category == ucp_L || category == ucp_N ||
5040 c == CHAR_UNDERSCORE) == prop_fail_result)
5041 break;
5042 eptr+= len;
5043 }
5044 break;
5045
5046 default:
5047 RRETURN(PCRE_ERROR_INTERNAL);
5048 }
5049
5050 /* eptr is now past the end of the maximum run */
5051
5052 if (possessive) continue;
5053 for(;;)
5054 {
5055 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5056 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5057 if (eptr-- == pp) break; /* Stop if tried at original pos */
5058 if (utf8) BACKCHAR(eptr);
5059 }
5060 }
5061
5062 /* Match extended Unicode sequences. We will get here only if the
5063 support is in the binary; otherwise a compile-time error occurs. */
5064
5065 else if (ctype == OP_EXTUNI)
5066 {
5067 for (i = min; i < max; i++)
5068 {
5069 int len = 1;
5070 if (eptr >= md->end_subject)
5071 {
5072 SCHECK_PARTIAL();
5073 break;
5074 }
5075 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5076 if (UCD_CATEGORY(c) == ucp_M) break;
5077 eptr += len;
5078 while (eptr < md->end_subject)
5079 {
5080 len = 1;
5081 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5082 if (UCD_CATEGORY(c) != ucp_M) break;
5083 eptr += len;
5084 }
5085 }
5086
5087 /* eptr is now past the end of the maximum run */
5088
5089 if (possessive) continue;
5090
5091 for(;;)
5092 {
5093 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5094 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5095 if (eptr-- == pp) break; /* Stop if tried at original pos */
5096 for (;;) /* Move back over one extended */
5097 {
5098 if (!utf8) c = *eptr; else
5099 {
5100 BACKCHAR(eptr);
5101 GETCHAR(c, eptr);
5102 }
5103 if (UCD_CATEGORY(c) != ucp_M) break;
5104 eptr--;
5105 }
5106 }
5107 }
5108
5109 else
5110 #endif /* SUPPORT_UCP */
5111
5112 #ifdef SUPPORT_UTF8
5113 /* UTF-8 mode */
5114
5115 if (utf8)
5116 {
5117 switch(ctype)
5118 {
5119 case OP_ANY:
5120 if (max < INT_MAX)
5121 {
5122 for (i = min; i < max; i++)
5123 {
5124 if (eptr >= md->end_subject)
5125 {
5126 SCHECK_PARTIAL();
5127 break;
5128 }
5129 if (IS_NEWLINE(eptr)) break;
5130 eptr++;
5131 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5132 }
5133 }
5134
5135 /* Handle unlimited UTF-8 repeat */
5136
5137 else
5138 {
5139 for (i = min; i < max; i++)
5140 {
5141 if (eptr >= md->end_subject)
5142 {
5143 SCHECK_PARTIAL();
5144 break;
5145 }
5146 if (IS_NEWLINE(eptr)) break;
5147 eptr++;
5148 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5149 }
5150 }
5151 break;
5152
5153 case OP_ALLANY:
5154 if (max < INT_MAX)
5155 {
5156 for (i = min; i < max; i++)
5157 {
5158 if (eptr >= md->end_subject)
5159 {
5160 SCHECK_PARTIAL();
5161 break;
5162 }
5163 eptr++;
5164 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5165 }
5166 }
5167 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5168 break;
5169
5170 /* The byte case is the same as non-UTF8 */
5171
5172 case OP_ANYBYTE:
5173 c = max - min;
5174 if (c > (unsigned int)(md->end_subject - eptr))
5175 {
5176 eptr = md->end_subject;
5177 SCHECK_PARTIAL();
5178 }
5179 else eptr += c;
5180 break;
5181
5182 case OP_ANYNL:
5183 for (i = min; i < max; i++)
5184 {
5185 int len = 1;
5186 if (eptr >= md->end_subject)
5187 {
5188 SCHECK_PARTIAL();
5189 break;
5190 }
5191 GETCHARLEN(c, eptr, len);
5192 if (c == 0x000d)
5193 {
5194 if (++eptr >= md->end_subject) break;
5195 if (*eptr == 0x000a) eptr++;
5196 }
5197 else
5198 {
5199 if (c != 0x000a &&
5200 (md->bsr_anycrlf ||
5201 (c != 0x000b && c != 0x000c &&
5202 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5203 break;
5204 eptr += len;
5205 }
5206 }
5207 break;
5208
5209 case OP_NOT_HSPACE:
5210 case OP_HSPACE:
5211 for (i = min; i < max; i++)
5212 {
5213 BOOL gotspace;
5214 int len = 1;
5215 if (eptr >= md->end_subject)
5216 {
5217 SCHECK_PARTIAL();
5218 break;
5219 }
5220 GETCHARLEN(c, eptr, len);
5221 switch(c)
5222 {
5223 default: gotspace = FALSE; break;
5224 case 0x09: /* HT */
5225 case 0x20: /* SPACE */
5226 case 0xa0: /* NBSP */
5227 case 0x1680: /* OGHAM SPACE MARK */
5228 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5229 case 0x2000: /* EN QUAD */
5230 case 0x2001: /* EM QUAD */
5231 case 0x2002: /* EN SPACE */
5232 case 0x2003: /* EM SPACE */
5233 case 0x2004: /* THREE-PER-EM SPACE */
5234 case 0x2005: /* FOUR-PER-EM SPACE */
5235 case 0x2006: /* SIX-PER-EM SPACE */
5236 case 0x2007: /* FIGURE SPACE */
5237 case 0x2008: /* PUNCTUATION SPACE */
5238 case 0x2009: /* THIN SPACE */
5239 case 0x200A: /* HAIR SPACE */
5240 case 0x202f: /* NARROW NO-BREAK SPACE */
5241 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5242 case 0x3000: /* IDEOGRAPHIC SPACE */
5243 gotspace = TRUE;
5244 break;
5245 }
5246 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5247 eptr += len;
5248 }
5249 break;
5250
5251 case OP_NOT_VSPACE:
5252 case OP_VSPACE:
5253 for (i = min; i < max; i++)
5254 {
5255 BOOL gotspace;
5256 int len = 1;
5257 if (eptr >= md->end_subject)
5258 {
5259 SCHECK_PARTIAL();
5260 break;
5261 }
5262 GETCHARLEN(c, eptr, len);
5263 switch(c)
5264 {
5265 default: gotspace = FALSE; break;
5266 case 0x0a: /* LF */
5267 case 0x0b: /* VT */
5268 case 0x0c: /* FF */
5269 case 0x0d: /* CR */
5270 case 0x85: /* NEL */
5271 case 0x2028: /* LINE SEPARATOR */
5272 case 0x2029: /* PARAGRAPH SEPARATOR */
5273 gotspace = TRUE;
5274 break;
5275 }
5276 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5277 eptr += len;
5278 }
5279 break;
5280
5281 case OP_NOT_DIGIT:
5282 for (i = min; i < max; i++)
5283 {
5284 int len = 1;
5285 if (eptr >= md->end_subject)
5286 {
5287 SCHECK_PARTIAL();
5288 break;
5289 }
5290 GETCHARLEN(c, eptr, len);
5291 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5292 eptr+= len;
5293 }
5294 break;
5295
5296 case OP_DIGIT:
5297 for (i = min; i < max; i++)
5298 {
5299 int len = 1;
5300 if (eptr >= md->end_subject)
5301 {
5302 SCHECK_PARTIAL();
5303 break;
5304 }
5305 GETCHARLEN(c, eptr, len);
5306 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5307 eptr+= len;
5308 }
5309 break;
5310
5311 case OP_NOT_WHITESPACE:
5312 for (i = min; i < max; i++)
5313 {
5314 int len = 1;
5315 if (eptr >= md->end_subject)
5316 {
5317 SCHECK_PARTIAL();
5318 break;
5319 }
5320 GETCHARLEN(c, eptr, len);
5321 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5322 eptr+= len;
5323 }
5324 break;
5325
5326 case OP_WHITESPACE:
5327 for (i = min; i < max; i++)
5328 {
5329 int len = 1;
5330 if (eptr >= md->end_subject)
5331 {
5332 SCHECK_PARTIAL();
5333 break;
5334 }
5335 GETCHARLEN(c, eptr, len);
5336 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5337 eptr+= len;
5338 }
5339 break;
5340
5341 case OP_NOT_WORDCHAR:
5342 for (i = min; i < max; i++)
5343 {
5344 int len = 1;
5345 if (eptr >= md->end_subject)
5346 {
5347 SCHECK_PARTIAL();
5348 break;
5349 }
5350 GETCHARLEN(c, eptr, len);
5351 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5352 eptr+= len;
5353 }
5354 break;
5355
5356 case OP_WORDCHAR:
5357 for (i = min; i < max; i++)
5358 {
5359 int len = 1;
5360 if (eptr >= md->end_subject)
5361 {
5362 SCHECK_PARTIAL();
5363 break;
5364 }
5365 GETCHARLEN(c, eptr, len);
5366 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5367 eptr+= len;
5368 }
5369 break;
5370
5371 default:
5372 RRETURN(PCRE_ERROR_INTERNAL);
5373 }
5374
5375 /* eptr is now past the end of the maximum run. If possessive, we are
5376 done (no backing up). Otherwise, match at this position; anything other
5377 than no match is immediately returned. For nomatch, back up one
5378 character, unless we are matching \R and the last thing matched was
5379 \r\n, in which case, back up two bytes. */
5380
5381 if (possessive) continue;
5382 for(;;)
5383 {
5384 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5385 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5386 if (eptr-- == pp) break; /* Stop if tried at original pos */
5387 BACKCHAR(eptr);
5388 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5389 eptr[-1] == '\r') eptr--;
5390 }
5391 }
5392 else
5393 #endif /* SUPPORT_UTF8 */
5394
5395 /* Not UTF-8 mode */
5396 {
5397 switch(ctype)
5398 {
5399 case OP_ANY:
5400 for (i = min; i < max; i++)
5401 {
5402 if (eptr >= md->end_subject)
5403 {
5404 SCHECK_PARTIAL();
5405 break;
5406 }
5407 if (IS_NEWLINE(eptr)) break;
5408 eptr++;
5409 }
5410 break;
5411
5412 case OP_ALLANY:
5413 case OP_ANYBYTE:
5414 c = max - min;
5415 if (c > (unsigned int)(md->end_subject - eptr))
5416 {
5417 eptr = md->end_subject;
5418 SCHECK_PARTIAL();
5419 }
5420 else eptr += c;
5421 break;
5422
5423 case OP_ANYNL:
5424 for (i = min; i < max; i++)
5425 {
5426 if (eptr >= md->end_subject)
5427 {
5428 SCHECK_PARTIAL();
5429 break;
5430 }
5431 c = *eptr;
5432 if (c == 0x000d)
5433 {
5434 if (++eptr >= md->end_subject) break;
5435 if (*eptr == 0x000a) eptr++;
5436 }
5437 else
5438 {
5439 if (c != 0x000a &&
5440 (md->bsr_anycrlf ||
5441 (c != 0x000b && c != 0x000c && c != 0x0085)))
5442 break;
5443 eptr++;
5444 }
5445 }
5446 break;
5447
5448 case OP_NOT_HSPACE:
5449 for (i = min; i < max; i++)
5450 {
5451 if (eptr >= md->end_subject)
5452 {
5453 SCHECK_PARTIAL();
5454 break;
5455 }
5456 c = *eptr;
5457 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5458 eptr++;
5459 }
5460 break;
5461
5462 case OP_HSPACE:
5463 for (i = min; i < max; i++)
5464 {
5465 if (eptr >= md->end_subject)
5466 {
5467 SCHECK_PARTIAL();
5468 break;
5469 }
5470 c = *eptr;
5471 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5472 eptr++;
5473 }
5474 break;
5475
5476 case OP_NOT_VSPACE:
5477 for (i = min; i < max; i++)
5478 {
5479 if (eptr >= md->end_subject)
5480 {
5481 SCHECK_PARTIAL();
5482 break;
5483 }
5484 c = *eptr;
5485 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5486 break;
5487 eptr++;
5488 }
5489 break;
5490
5491 case OP_VSPACE:
5492 for (i = min; i < max; i++)
5493 {
5494 if (eptr >= md->end_subject)
5495 {
5496 SCHECK_PARTIAL();
5497 break;
5498 }
5499 c = *eptr;
5500 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5501 break;
5502 eptr++;
5503 }
5504 break;
5505
5506 case OP_NOT_DIGIT:
5507 for (i = min; i < max; i++)
5508 {
5509 if (eptr >= md->end_subject)
5510 {
5511 SCHECK_PARTIAL();
5512 break;
5513 }
5514 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5515 eptr++;
5516 }
5517 break;
5518
5519 case OP_DIGIT:
5520 for (i = min; i < max; i++)
5521 {
5522 if (eptr >= md->end_subject)
5523 {
5524 SCHECK_PARTIAL();
5525 break;
5526 }
5527 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5528 eptr++;
5529 }
5530 break;
5531
5532 case OP_NOT_WHITESPACE:
5533 for (i = min; i < max; i++)
5534 {
5535 if (eptr >= md->end_subject)
5536 {
5537 SCHECK_PARTIAL();
5538 break;
5539 }
5540 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5541 eptr++;
5542 }
5543 break;
5544
5545 case OP_WHITESPACE:
5546 for (i = min; i < max; i++)
5547 {
5548 if (eptr >= md->end_subject)
5549 {
5550 SCHECK_PARTIAL();
5551 break;
5552 }
5553 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5554 eptr++;
5555 }
5556 break;
5557
5558 case OP_NOT_WORDCHAR:
5559 for (i = min; i < max; i++)
5560 {
5561 if (eptr >= md->end_subject)
5562 {
5563 SCHECK_PARTIAL();
5564 break;
5565 }
5566 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5567 eptr++;
5568 }
5569 break;
5570
5571 case OP_WORDCHAR:
5572 for (i = min; i < max; i++)
5573 {
5574 if (eptr >= md->end_subject)
5575 {
5576 SCHECK_PARTIAL();
5577 break;
5578 }
5579 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5580 eptr++;
5581 }
5582 break;
5583
5584 default:
5585 RRETURN(PCRE_ERROR_INTERNAL);
5586 }
5587
5588 /* eptr is now past the end of the maximum run. If possessive, we are
5589 done (no backing up). Otherwise, match at this position; anything other
5590 than no match is immediately returned. For nomatch, back up one
5591 character (byte), unless we are matching \R and the last thing matched
5592 was \r\n, in which case, back up two bytes. */
5593
5594 if (possessive) continue;
5595 while (eptr >= pp)
5596 {
5597 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5598 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5599 eptr--;
5600 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5601 eptr[-1] == '\r') eptr--;
5602 }
5603 }
5604
5605 /* Get here if we can't make it match with any permitted repetitions */
5606
5607 MRRETURN(MATCH_NOMATCH);
5608 }
5609 /* Control never gets here */
5610
5611 /* There's been some horrible disaster. Arrival here can only mean there is
5612 something seriously wrong in the code above or the OP_xxx definitions. */
5613
5614 default:
5615 DPRINTF(("Unknown opcode %d\n", *ecode));
5616 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5617 }
5618
5619 /* Do not stick any code in here without much thought; it is assumed
5620 that "continue" in the code above comes out to here to repeat the main
5621 loop. */
5622
5623 } /* End of main loop */
5624 /* Control never reaches here */
5625
5626
5627 /* When compiling to use the heap rather than the stack for recursive calls to
5628 match(), the RRETURN() macro jumps here. The number that is saved in
5629 frame->Xwhere indicates which label we actually want to return to. */
5630
5631 #ifdef NO_RECURSE
5632 #define LBL(val) case val: goto L_RM##val;
5633 HEAP_RETURN:
5634 switch (frame->Xwhere)
5635 {
5636 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5637 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5638 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5639 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5640 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
5641 #ifdef SUPPORT_UTF8
5642 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5643 LBL(32) LBL(34) LBL(42) LBL(46)
5644 #ifdef SUPPORT_UCP
5645 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5646 LBL(59) LBL(60) LBL(61) LBL(62)
5647 #endif /* SUPPORT_UCP */
5648 #endif /* SUPPORT_UTF8 */
5649 default:
5650 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5651 return PCRE_ERROR_INTERNAL;
5652 }
5653 #undef LBL
5654 #endif /* NO_RECURSE */
5655 }
5656
5657
5658 /***************************************************************************
5659 ****************************************************************************
5660 RECURSION IN THE match() FUNCTION
5661
5662 Undefine all the macros that were defined above to handle this. */
5663
5664 #ifdef NO_RECURSE /* The names below are macros only in this build */
5665 #undef eptr /* eptr, ecode, offset_top, eptrb: values handed to RMATCH() */
5666 #undef ecode
5667 #undef mstart
5668 #undef offset_top
5669 #undef eptrb
5670 #undef flags
5671 /* Presumably pointer-valued working variables of match() - TODO confirm */
5672 #undef callpat
5673 #undef charptr
5674 #undef data
5675 #undef next
5676 #undef pp /* match() sets pp = eptr to remember where a repeat started */
5677 #undef prev
5678 #undef saved_eptr
5679 /* Presumably saved state for a recursive pattern call - TODO confirm */
5680 #undef new_recursive
5681 /* Judging by the names, boolean flags used by match() - TODO confirm */
5682 #undef cur_is_word
5683 #undef condition
5684 #undef prev_is_word
5685 /* Working values; min, max and ctype drive the repeat loops in match() */
5686 #undef ctype
5687 #undef length
5688 #undef max
5689 #undef min
5690 #undef number
5691 #undef offset
5692 #undef op
5693 #undef save_capture_last
5694 #undef save_offset1
5695 #undef save_offset2
5696 #undef save_offset3
5697 #undef stacksave
5698 /* NOTE(review): definition not visible here; check what newptrb maps to */
5699 #undef newptrb
5700
5701 #endif /* NO_RECURSE */
5702
5703 /* These two are defined as macros in both cases (with and without
        NO_RECURSE), so they are undefined outside the conditional block. */
5704
5705 #undef fc
5706 #undef fi /* fi is the iteration counter of the minimizing repeat loops */
5707
5708 /***************************************************************************
5709 ***************************************************************************/
5710
5711
5712
5713 /*************************************************
5714 * Execute a Regular Expression *
5715 *************************************************/
5716
5717 /* This function applies a compiled re to a subject string and picks out
5718 portions of the string if it matches. Two elements in the vector are set for
5719 each substring: the offsets to the start and end of the substring.
5720
5721 Arguments:
5722 argument_re points to the compiled expression
5723 extra_data points to extra data or is NULL
5724 subject points to the subject string
5725 length length of subject string (may contain binary zeros)
5726 start_offset where to start in the subject string
5727 options option bits
5728 offsets points to a vector of ints to be filled in with offsets
5729 offsetcount the number of elements in the vector
5730
5731 Returns: > 0 => success; value is the highest numbered pair set, plus one
5732 = 0 => success, but offsets is not big enough
5733 -1 => failed to match
5734 < -1 => some kind of unexpected problem
5735 */
5736
5737 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5738 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5739 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5740 int offsetcount)
5741 {
5742 int rc, ocount;
5743 int first_byte = -1;
5744 int req_byte = -1;
5745 int req_byte2 = -1;
5746 int newline;
5747 BOOL using_temporary_offsets = FALSE;
5748 BOOL anchored;
5749 BOOL startline;
5750 BOOL firstline;
5751 BOOL first_byte_caseless = FALSE;
5752 BOOL req_byte_caseless = FALSE;
5753 BOOL utf8;
5754 match_data match_block;
5755 match_data *md = &match_block;
5756 const uschar *tables;
5757 const uschar *start_bits = NULL;
5758 USPTR start_match = (USPTR)subject + start_offset;
5759 USPTR end_subject;
5760 USPTR start_partial = NULL;
5761 USPTR req_byte_ptr = start_match - 1;
5762
5763 pcre_study_data internal_study;
5764 const pcre_study_data *study;
5765
5766 real_pcre internal_re;
5767 const real_pcre *external_re = (const real_pcre *)argument_re;
5768 const real_pcre *re = external_re;
5769
5770 /* Plausibility checks */
5771
5772 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5773 if (re == NULL || subject == NULL ||
5774 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5775 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5776 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5777
5778 /* This information is for finding all the numbers associated with a given
5779 name, for condition testing. */
5780
5781 md->name_table = (uschar *)re + re->name_table_offset;
5782 md->name_count = re->name_count;
5783 md->name_entry_size = re->name_entry_size;
5784
5785 /* Fish out the optional data from the extra_data structure, first setting
5786 the default values. */
5787
5788 study = NULL;
5789 md->match_limit = MATCH_LIMIT;
5790 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5791 md->callout_data = NULL;
5792
5793 /* The table pointer is always in native byte order. */
5794
5795 tables = external_re->tables;
5796
5797 if (extra_data != NULL)
5798 {
5799 register unsigned int flags = extra_data->flags;
5800 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5801 study = (const pcre_study_data *)extra_data->study_data;
5802 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5803 md->match_limit = extra_data->match_limit;
5804 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5805 md->match_limit_recursion = extra_data->match_limit_recursion;
5806 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5807 md->callout_data = extra_data->callout_data;
5808 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5809 }
5810
5811 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5812 is a feature that makes it possible to save compiled regex and re-use them
5813 in other programs later. */
5814
5815 if (tables == NULL) tables = _pcre_default_tables;
5816
5817 /* Check that the first field in the block is the magic number. If it is not,
5818 test for a regex that was compiled on a host of opposite endianness. If this is
5819 the case, flipped values are put in internal_re and internal_study if there was
5820 study data too. */
5821
5822 if (re->magic_number != MAGIC_NUMBER)
5823 {
5824 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5825 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5826 if (study != NULL) study = &internal_study;
5827 }
5828
5829 /* Set up other data */
5830
5831 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5832 startline = (re->flags & PCRE_STARTLINE) != 0;
5833 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5834
5835 /* The code starts after the real_pcre block and the capture name table. */
5836
5837 md->start_code = (const uschar *)external_re + re->name_table_offset +
5838 re->name_count * re->name_entry_size;
5839
5840 md->start_subject = (USPTR)subject;
5841 md->start_offset = start_offset;
5842 md->end_subject = md->start_subject + length;
5843 end_subject = md->end_subject;
5844
5845 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5846 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5847 md->use_ucp = (re->options & PCRE_UCP) != 0;
5848 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5849
5850 /* Some options are unpacked into BOOL variables in the hope that testing
5851 them will be faster than individual option bits. */
5852
5853 md->notbol = (options & PCRE_NOTBOL) != 0;
5854 md->noteol = (options & PCRE_NOTEOL) != 0;
5855 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5856 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5857 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5858 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5859
5860
5861 md->hitend = FALSE;
5862 md->mark = NULL; /* In case never set */
5863
5864 md->recursive = NULL; /* No recursion at top level */
5865
5866 md->lcc = tables + lcc_offset;
5867 md->ctypes = tables + ctypes_offset;
5868
5869 /* Handle different \R options. */
5870
5871 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5872 {
5873 case 0:
5874 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5875 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5876 else
5877 #ifdef BSR_ANYCRLF
5878 md->bsr_anycrlf = TRUE;
5879 #else
5880 md->bsr_anycrlf = FALSE;
5881 #endif
5882 break;
5883
5884 case PCRE_BSR_ANYCRLF:
5885 md->bsr_anycrlf = TRUE;
5886 break;
5887
5888 case PCRE_BSR_UNICODE:
5889 md->bsr_anycrlf = FALSE;
5890 break;
5891
5892 default: return PCRE_ERROR_BADNEWLINE;
5893 }
5894
5895 /* Handle different types of newline. The three bits give eight cases. If
5896 nothing is set at run time, whatever was used at compile time applies. */
5897
5898 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5899 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5900 {
5901 case 0: newline = NEWLINE; break; /* Compile-time default */
5902 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5903 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5904 case PCRE_NEWLINE_CR+
5905 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5906 case PCRE_NEWLINE_ANY: newline = -1; break;
5907 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5908 default: return PCRE_ERROR_BADNEWLINE;
5909 }
5910
5911 if (newline == -2)
5912 {
5913 md->nltype = NLTYPE_ANYCRLF;
5914 }
5915 else if (newline < 0)
5916 {
5917 md->nltype = NLTYPE_ANY;
5918 }
5919 else
5920 {
5921 md->nltype = NLTYPE_FIXED;
5922 if (newline > 255)
5923 {
5924 md->nllen = 2;
5925 md->nl[0] = (newline >> 8) & 255;
5926 md->nl[1] = newline & 255;
5927 }
5928 else
5929 {
5930 md->nllen = 1;
5931 md->nl[0] = newline;
5932 }
5933 }
5934
5935 /* Partial matching was originally supported only for a restricted set of
5936 regexes; from release 8.00 there are no restrictions, but the bits are still
5937 defined (though never set). So there's no harm in leaving this code. */
5938
5939 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5940 return PCRE_ERROR_BADPARTIAL;
5941
5942 /* Check a UTF-8 string if required. Pass back the character offset and error
5943 code for an invalid string if a results vector is available. */
5944
5945 #ifdef SUPPORT_UTF8
5946 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5947 {
5948 int erroroffset;
5949 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5950 if (errorcode != 0)
5951 {
5952 if (offsetcount >= 2)
5953 {
5954 offsets[0] = erroroffset;
5955 offsets[1] = errorcode;
5956 }
5957 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5958 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5959 }
5960
5961 /* Check that a start_offset points to the start of a UTF-8 character. */
5962
5963 if (start_offset > 0 && start_offset < length &&
5964 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
5965 return PCRE_ERROR_BADUTF8_OFFSET;
5966 }
5967 #endif
5968
5969 /* If the expression has got more back references than the offsets supplied can
5970 hold, we get a temporary chunk of working store to use during the matching.
5971 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5972 of 3. */
5973
5974 ocount = offsetcount - (offsetcount % 3);
5975
5976 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5977 {
5978 ocount = re->top_backref * 3 + 3;
5979 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5980 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5981 using_temporary_offsets = TRUE;
5982 DPRINTF(("Got memory to hold back references\n"));
5983 }
5984 else md->offset_vector = offsets;
5985
5986 md->offset_end = ocount;
5987 md->offset_max = (2*ocount)/3;
5988 md->offset_overflow = FALSE;
5989 md->capture_last = -1;
5990
5991 /* Reset the working variable associated with each extraction. These should
5992 never be used unless previously set, but they get saved and restored, and so we
5993 initialize them to avoid reading uninitialized locations. Also, unset the
5994 offsets for the matched string. This is really just for tidiness with callouts,
5995 in case they inspect these fields. */
5996
5997 if (md->offset_vector != NULL)
5998 {
5999 register int *iptr = md->offset_vector + ocount;
6000 register int *iend = iptr - re->top_bracket;
6001 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6002 while (--iptr >= iend) *iptr = -1;
6003 md->offset_vector[0] = md->offset_vector[1] = -1;
6004 }
6005
6006 /* Set up the first character to match, if available. The first_byte value is
6007 never set for an anchored regular expression, but the anchoring may be forced
6008 at run time, so we have to test for anchoring. The first char may be unset for
6009 an unanchored pattern, of course. If there's no first char and the pattern was
6010 studied, there may be a bitmap of possible first characters. */
6011
6012 if (!anchored)
6013 {
6014 if ((re->flags & PCRE_FIRSTSET) != 0)
6015 {
6016 first_byte = re->first_byte & 255;
6017 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6018 first_byte = md->lcc[first_byte];
6019 }
6020 else
6021 if (!startline && study != NULL &&
6022 (study->flags & PCRE_STUDY_MAPPED) != 0)
6023 start_bits = study->start_bits;
6024 }
6025
6026 /* For anchored or unanchored matches, there may be a "last known required
6027 character" set. */
6028
6029 if ((re->flags & PCRE_REQCHSET) != 0)
6030 {
6031 req_byte = re->req_byte & 255;
6032 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6033 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6034 }
6035
6036
6037
6038
6039 /* ==========================================================================*/
6040
6041 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6042 the loop runs just once. */
6043
6044 for(;;)
6045 {
6046 USPTR save_end_subject = end_subject;
6047 USPTR new_start_match;
6048
6049 /* If firstline is TRUE, the start of the match is constrained to the first
6050 line of a multiline string. That is, the match must be before or at the first
6051 newline. Implement this by temporarily adjusting end_subject so that we stop
6052 scanning at a newline. If the match fails at the newline, later code breaks
6053 this loop. */
6054
6055 if (firstline)
6056 {
6057 USPTR t = start_match;
6058 #ifdef SUPPORT_UTF8
6059 if (utf8)
6060 {
6061 while (t < md->end_subject && !IS_NEWLINE(t))
6062 {
6063 t++;
6064 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6065 }
6066 }
6067 else
6068 #endif
6069 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6070 end_subject = t;
6071 }
6072
6073 /* There are some optimizations that avoid running the match if a known
6074 starting point is not found, or if a known later character is not present.
6075 However, there is an option that disables these, for testing and for ensuring
6076 that all callouts do actually occur. The option can be set in the regex by
6077 (*NO_START_OPT) or passed in match-time options. */
6078
6079 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6080 {
6081 /* Advance to a unique first byte if there is one. */
6082
6083 if (first_byte >= 0)
6084 {
6085 if (first_byte_caseless)
6086 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6087 start_match++;
6088 else
6089 while (start_match < end_subject && *start_match != first_byte)
6090 start_match++;
6091 }
6092
6093 /* Or to just after a linebreak for a multiline match */
6094
6095 else if (startline)
6096 {
6097 if (start_match > md->start_subject + start_offset)
6098 {
6099 #ifdef SUPPORT_UTF8
6100 if (utf8)
6101 {
6102 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6103 {
6104 start_match++;
6105 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6106 start_match++;
6107 }
6108 }
6109 else
6110 #endif
6111 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6112 start_match++;
6113
6114 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6115 and we are now at a LF, advance the match position by one more character.
6116 */
6117
6118 if (start_match[-1] == CHAR_CR &&
6119 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6120 start_match < end_subject &&
6121 *start_match == CHAR_NL)
6122 start_match++;
6123 }
6124 }
6125
6126 /* Or to a non-unique first byte after study */
6127
6128 else if (start_bits != NULL)
6129 {
6130 while (start_match < end_subject)
6131 {
6132 register unsigned int c = *start_match;
6133 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6134 {
6135 start_match++;
6136 #ifdef SUPPORT_UTF8
6137 if (utf8)
6138 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6139 start_match++;
6140 #endif
6141 }
6142 else break;
6143 }
6144 }
6145 } /* Starting optimizations */
6146
6147 /* Restore fudged end_subject */
6148
6149 end_subject = save_end_subject;
6150
6151 /* The following two optimizations are disabled for partial matching or if
6152 disabling is explicitly requested. */
6153
6154 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6155 {
6156 /* If the pattern was studied, a minimum subject length may be set. This is
6157 a lower bound; it is possible that no string of that length actually matches the
6158 pattern. Although the value is, strictly, in characters, we treat it as
6159 bytes to avoid spending too much time in this optimization. */
6160
6161 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6162 (pcre_uint32)(end_subject - start_match) < study->minlength)
6163 {
6164 rc = MATCH_NOMATCH;
6165 break;
6166 }
6167
6168 /* If req_byte is set, we know that that character must appear in the
6169 subject for the match to succeed. If the first character is set, req_byte
6170 must be later in the subject; otherwise the test starts at the match point.
6171 This optimization can save a huge amount of backtracking in patterns with
6172 nested unlimited repeats that aren't going to match. Writing separate code
6173 for cased/caseless versions makes it go faster, as does using an
6174 autoincrement and backing off on a match.
6175
6176 HOWEVER: when the subject string is very, very long, searching to its end
6177 can take a long time, and give bad performance on quite ordinary patterns.
6178 This showed up when somebody was matching something like /^\d+C/ on a
6179 32-megabyte string... so we don't do this when the string is sufficiently
6180 long. */
6181
6182 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6183 {
6184 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6185
6186 /* We don't need to repeat the search if we haven't yet reached the
6187 place we found it at last time. */
6188
6189 if (p > req_byte_ptr)
6190 {
6191 if (req_byte_caseless)
6192 {
6193 while (p < end_subject)
6194 {
6195 register int pp = *p++;
6196 if (pp == req_byte || pp == req_byte2) { p--; break; }
6197 }
6198 }
6199 else
6200 {
6201 while (p < end_subject)
6202 {
6203 if (*p++ == req_byte) { p--; break; }
6204 }
6205 }
6206
6207 /* If we can't find the required character, break the matching loop,
6208 forcing a match failure. */
6209
6210 if (p >= end_subject)
6211 {
6212 rc = MATCH_NOMATCH;
6213 break;
6214 }
6215
6216 /* If we have found the required character, save the point where we
6217 found it, so that we don't search again next time round the loop if
6218 the start hasn't passed this character yet. */
6219
6220 req_byte_ptr = p;
6221 }
6222 }
6223 }
6224
6225 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6226 printf(">>>> Match against: ");
6227 pchars(start_match, end_subject - start_match, TRUE, md);
6228 printf("\n");
6229 #endif
6230
6231 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6232 first starting point for which a partial match was found. */
6233
6234 md->start_match_ptr = start_match;
6235 md->start_used_ptr = start_match;
6236 md->match_call_count = 0;
6237 md->match_function_type = 0;
6238 md->end_offset_top = 0;
6239 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6240 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6241
6242 switch(rc)
6243 {
6244 /* SKIP passes back the next starting point explicitly, but if it is the
6245 same as the match we have just done, treat it as NOMATCH. */
6246
6247 case MATCH_SKIP:
6248 if (md->start_match_ptr != start_match)
6249 {
6250 new_start_match = md->start_match_ptr;
6251 break;
6252 }
6253 /* Fall through */
6254
6255 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6256 the SKIP's arg was not found. We also treat this as NOMATCH. */
6257
6258 case MATCH_SKIP_ARG:
6259 /* Fall through */
6260
6261 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6262 exactly like PRUNE. */
6263
6264 case MATCH_NOMATCH:
6265 case MATCH_PRUNE:
6266 case MATCH_THEN:
6267 new_start_match = start_match + 1;
6268 #ifdef SUPPORT_UTF8
6269 if (utf8)
6270 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6271 new_start_match++;
6272 #endif
6273 break;
6274
6275 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6276
6277 case MATCH_COMMIT:
6278 rc = MATCH_NOMATCH;
6279 goto ENDLOOP;
6280
6281 /* Any other return is either a match, or some kind of error. */
6282
6283 default:
6284 goto ENDLOOP;
6285 }
6286
6287 /* Control reaches here for the various types of "no match at this point"
6288 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6289
6290 rc = MATCH_NOMATCH;
6291
6292 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6293 newline in the subject (though it may continue over the newline). Therefore,
6294 if we have just failed to match, starting at a newline, do not continue. */
6295
6296 if (firstline && IS_NEWLINE(start_match)) break;
6297
6298 /* Advance to new matching position */
6299
6300 start_match = new_start_match;
6301
6302 /* Break the loop if the pattern is anchored or if we have passed the end of
6303 the subject. */
6304
6305 if (anchored || start_match > end_subject) break;
6306
6307 /* If we have just passed a CR and we are now at a LF, and the pattern does
6308 not contain any explicit matches for \r or \n, and the newline option is CRLF
6309 or ANY or ANYCRLF, advance the match position by one more character. */
6310
6311 if (start_match[-1] == CHAR_CR &&
6312 start_match < end_subject &&
6313 *start_match == CHAR_NL &&
6314 (re->flags & PCRE_HASCRORLF) == 0 &&
6315 (md->nltype == NLTYPE_ANY ||
6316 md->nltype == NLTYPE_ANYCRLF ||
6317 md->nllen == 2))
6318 start_match++;
6319
6320 md->mark = NULL; /* Reset for start of next match attempt */
6321 } /* End of for(;;) "bumpalong" loop */
6322
6323 /* ==========================================================================*/
6324
6325 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6326 conditions is true:
6327
6328 (1) The pattern is anchored or the match was failed by (*COMMIT);
6329
6330 (2) We are past the end of the subject;
6331
6332 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6333 this option requests that a match occur at or before the first newline in
6334 the subject.
6335
6336 When we have a match and the offset vector is big enough to deal with any
6337 backreferences, captured substring offsets will already be set up. In the case
6338 where we had to get some local store to hold offsets for backreference
6339 processing, copy those that we can. In this case there need not be overflow if
6340 certain parts of the pattern were not used, even though there are more
6341 capturing parentheses than vector slots. */
6342
6343 ENDLOOP:
6344
6345 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6346 {
6347 if (using_temporary_offsets)
6348 {
6349 if (offsetcount >= 4)
6350 {
6351 memcpy(offsets + 2, md->offset_vector + 2,
6352 (offsetcount - 2) * sizeof(int));
6353 DPRINTF(("Copied offsets from temporary memory\n"));
6354 }
6355 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6356 DPRINTF(("Freeing temporary memory\n"));
6357 (pcre_free)(md->offset_vector);
6358 }
6359
6360 /* Set the return code to the number of captured strings, or 0 if there are
6361 too many to fit into the vector. */
6362
6363 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6364
6365 /* If there is space in the offset vector, set any unused pairs at the end of
6366 the pattern to -1 for backwards compatibility. It is documented that this
6367 happens. In earlier versions, the whole set of potential capturing offsets
6368 was set to -1 each time round the loop, but this is handled differently now.
6369 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6370 those at the end that need unsetting here. We can't just unset them all at
6371 the start of the whole thing because they may get set in one branch that is
6372 not the final matching branch. */
6373
6374 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6375 {
6376 register int *iptr, *iend;
6377 int resetcount = 2 + re->top_bracket * 2;
6378 if (resetcount > offsetcount) resetcount = ocount;
6379 iptr = offsets + md->end_offset_top;
6380 iend = offsets + resetcount;
6381 while (iptr < iend) *iptr++ = -1;
6382 }
6383
6384 /* If there is space, set up the whole thing as substring 0. The value of
6385 md->start_match_ptr might be modified if \K was encountered on the successful
6386 matching path. */
6387
6388 if (offsetcount < 2) rc = 0; else
6389 {
6390 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6391 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6392 }
6393
6394 DPRINTF((">>>> returning %d\n", rc));
6395 goto RETURN_MARK;
6396 }
6397
6398 /* Control gets here if there has been an error, or if the overall match
6399 attempt has failed at all permitted starting positions. */
6400
6401 if (using_temporary_offsets)
6402 {
6403 DPRINTF(("Freeing temporary memory\n"));
6404 (pcre_free)(md->offset_vector);
6405 }
6406
6407 /* For anything other than nomatch or partial match, just return the code. */
6408
6409 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6410 {
6411 DPRINTF((">>>> error: returning %d\n", rc));
6412 return rc;
6413 }
6414
6415 /* Handle partial matches - disable any mark data */
6416
6417 if (start_partial != NULL)
6418 {
6419 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6420 md->mark = NULL;
6421 if (offsetcount > 1)
6422 {
6423 offsets[0] = (int)(start_partial - (USPTR)subject);
6424 offsets[1] = (int)(end_subject - (USPTR)subject);
6425 }
6426 rc = PCRE_ERROR_PARTIAL;
6427 }
6428
6429 /* This is the classic nomatch case */
6430
6431 else
6432 {
6433 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6434 rc = PCRE_ERROR_NOMATCH;
6435 }
6436
6437 /* Return the MARK data if it has been requested. */
6438
6439 RETURN_MARK:
6440
6441 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6442 *(extra_data->mark) = (unsigned char *)(md->mark);
6443 return rc;
6444 }
6445
6446 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12