/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 500 - (show annotations) (download)
Sat Mar 6 19:00:29 2010 UTC (4 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 171227 byte(s)
Fix bugs with \K in atomic groups, subroutines, and assertions.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1 /* The pattern matched */
69 #define MATCH_NOMATCH 0 /* The pattern failed to match */
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73 /* Each is returned by the like-named opcode case in match() (OP_COMMIT, OP_PRUNE, OP_SKIP, OP_THEN) when the rest of the pattern gives MATCH_NOMATCH. */
74 #define MATCH_COMMIT (-999)
75 #define MATCH_PRUNE (-998)
76 #define MATCH_SKIP (-997) /* OP_SKIP also stores the current position in md->start_match_ptr */
77 #define MATCH_THEN (-996) /* Treated like MATCH_NOMATCH by the alternative loops of bracket groups */
78
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. */
82
83 #define REC_STACK_SAVE_MAX 30
84
85 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86 /* NOTE(review): the six columns look like the pairs *, *?, +, +?, ?, ?? (greedy then minimizing) - confirm against the code that indexes these tables. */
87 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
88 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89
90
91
92 #ifdef PCRE_DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if requested. Output is written with printf().
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE (it is not
105 dereferenced when is_subject is FALSE)
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c; /* current byte, reused for the isprint() test and the printf() argument */
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p; /* clamp to what is left before end_subject */
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); /* non-printable bytes are shown as \xhh */
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* Compare "length" bytes of the subject at eptr with the text of a previously
126 captured substring. If a back reference hasn't been set, the length that is
127 passed is greater than the number of characters left, so the match fails.
128 Arguments:
129 offset index into the offset vector (start offset of the captured text)
130 eptr points into the subject
131 length length to be matched, in bytes
132 md points to match data block
133 ims the ims flags; only PCRE_CASELESS is examined here
134
135 Returns: TRUE if matched, FALSE otherwise
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset]; /* start of the captured text */
143
144 #ifdef PCRE_DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md); /* NOTE(review): is_subject is FALSE, so length is not clamped here; with an oversize length this would print past the capture - confirm */
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 { /* NOTE(review): endptr bounds only the subject side; if a character and its other case occupy different numbers of bytes, p and eptr would advance unequally - confirm */
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE; /* accept the same character or the other case of the captured one */
178 }
179 }
180 else /* when this block is compiled in, the else governs the byte-wise loop below */
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. Each byte of both strings is mapped through md->lcc before
186 being compared. */
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
240 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241 below must be updated in sync. */
242 /* Each value is passed as the "rw" argument of RMATCH; the heap version stores it in frame->Xwhere and pastes it into the resume label L_<name>. Numbering starts at 1. */
243 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
244 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
245 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
246 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
247 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
248 RM51, RM52, RM53, RM54 };
249
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
253 /* RMATCH leaves the result of the nested match() call in rrc and always passes the caller's mstart and rdepth+1; RRETURN is a plain return of its argument. */
254 #ifndef NO_RECURSE
255 #define REGISTER register
256
257 #ifdef PCRE_DEBUG
258 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
259 { \
260 printf("match() called in line %d\n", __LINE__); \
261 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
262 printf("to line %d\n", __LINE__); \
263 }
264 #define RRETURN(ra) \
265 { \
266 printf("match() returned %d from line %d ", ra, __LINE__); \
267 return ra; \
268 }
269 #else /* production versions: no tracing output */
270 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
271 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
272 #define RRETURN(ra) return ra
273 #endif
274
275 #else
276
277
278 /* These versions of the macros manage a private stack on the heap. Note that
279 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280 argument of match(), which never changes. */
281 /* RMATCH obtains a new frame, records the resume point (rw) in the current frame, copies the call arguments into the new frame, links it to the current one and jumps to HEAP_RECURSE; execution resumes at the label L_<rw>. */
282 #define REGISTER
283 /* If no memory can be obtained for the new frame, RMATCH unwinds with PCRE_ERROR_NOMEMORY via RRETURN instead of writing through a NULL pointer. */
284 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
285 {\
286 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
287 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY); frame->Xwhere = rw; \
288 newframe->Xeptr = ra;\
289 newframe->Xecode = rb;\
290 newframe->Xmstart = mstart;\
291 newframe->Xoffset_top = rc;\
292 newframe->Xims = re;\
293 newframe->Xeptrb = rf;\
294 newframe->Xflags = rg;\
295 newframe->Xrdepth = frame->Xrdepth + 1;\
296 newframe->Xprevframe = frame;\
297 frame = newframe;\
298 DPRINTF(("restarting from line %d\n", __LINE__));\
299 goto HEAP_RECURSE;\
300 L_##rw:\
301 DPRINTF(("jumped back to line %d\n", __LINE__));\
302 }
303 /* Heap version of RRETURN: pop and free the current frame. If a previous frame exists, pass the value back in rrc and jump to HEAP_RETURN; otherwise this was the top-level frame (Xprevframe is NULL), so really return from match(). NOTE(review): "ra" is expanded after frame has been switched to the previous frame, so it should not depend on the freed frame's variables - confirm at call sites. */
304 #define RRETURN(ra)\
305 {\
306 heapframe *newframe = frame;\
307 frame = newframe->Xprevframe;\
308 (pcre_stack_free)(newframe);\
309 if (frame != NULL)\
310 {\
311 rrc = ra;\
312 goto HEAP_RETURN;\
313 }\
314 return ra;\
315 }
316
317
318 /* Structure for remembering the local variables in a private frame */
319 /* When NO_RECURSE is defined, match() maps each of its argument and local variable names onto the like-named member here (for example, eptr is defined as frame->Xeptr). */
320 typedef struct heapframe {
321 struct heapframe *Xprevframe; /* frame of the calling incarnation; NULL marks the top level */
322
323 /* Function arguments that may change */
324
325 USPTR Xeptr;
326 const uschar *Xecode;
327 USPTR Xmstart;
328 int Xoffset_top;
329 long int Xims; /* NOTE(review): match() declares ims as unsigned long int, but it is stored here as (signed) long int - confirm this is intended */
330 eptrblock *Xeptrb;
331 int Xflags;
332 unsigned int Xrdepth; /* set to the previous frame's Xrdepth + 1 by RMATCH */
333
334 /* Function local variables */
335
336 USPTR Xcallpat;
337 #ifdef SUPPORT_UTF8
338 USPTR Xcharptr;
339 #endif
340 USPTR Xdata;
341 USPTR Xnext;
342 USPTR Xpp;
343 USPTR Xprev;
344 USPTR Xsaved_eptr;
345
346 recursion_info Xnew_recursive;
347
348 BOOL Xcur_is_word;
349 BOOL Xcondition;
350 BOOL Xprev_is_word;
351
352 unsigned long int Xoriginal_ims; /* copy of ims taken on entry to match() */
353
354 #ifdef SUPPORT_UCP
355 int Xprop_type;
356 int Xprop_value;
357 int Xprop_fail_result;
358 int Xprop_category;
359 int Xprop_chartype;
360 int Xprop_script;
361 int Xoclength;
362 uschar Xocchars[8];
363 #endif
364
365 int Xcodelink;
366 int Xctype;
367 unsigned int Xfc; /* in the recursive build fc is simply an alias for c */
368 int Xfi; /* in the recursive build fi is simply an alias for i */
369 int Xlength;
370 int Xmax;
371 int Xmin;
372 int Xnumber;
373 int Xoffset;
374 int Xop;
375 int Xsave_capture_last;
376 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
377 int Xstacksave[REC_STACK_SAVE_MAX];
378
379 eptrblock Xnewptrb; /* chain entry linked into eptrb when match_cbegroup is set */
380
381 /* Where to jump back to */
382
383 int Xwhere; /* the RMn value given as the rw argument of the RMATCH that created the next frame */
384
385 } heapframe;
386
387 #endif
388
389
390 /***************************************************************************
391 ***************************************************************************/
392
393
394
395 /*************************************************
396 * Match from current position *
397 *************************************************/
398
399 /* This function is called recursively in many circumstances. Whenever it
400 returns a negative (error) response, the outer incarnation must also return the
401 same response. */
402
403 /* These macros pack up tests that are used for partial matching, and which
404 appear several times in the code. We set the "hit end" flag if the pointer is
405 at the end of the subject and also past the start of the match, mstart (i.e.
406 something has been matched). For hard partial matching, we then return
407 immediately. The second one is used when we already know we are past the end of
408 the subject. */
409 /* Both rely on md, eptr and mstart being in scope and use RRETURN. Each expands to a bare if statement, so take care when one is followed by an else. md->partial > 1 is the "hard" partial case, which returns PCRE_ERROR_PARTIAL. */
410 #define CHECK_PARTIAL()\
411 if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
412 {\
413 md->hitend = TRUE;\
414 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
415 }
416
417 #define SCHECK_PARTIAL()\
418 if (md->partial != 0 && eptr > mstart)\
419 {\
420 md->hitend = TRUE;\
421 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
422 }
423
424
425 /* Performance note: It might be tempting to extract commonly used fields from
426 the md structure (e.g. utf8, end_subject) into individual variables to improve
427 performance. Tests using gcc on a SPARC disproved this; in the first case, it
428 made performance worse.
429
430 Arguments:
431 eptr pointer to current character in subject
432 ecode pointer to current position in compiled code
433 mstart pointer to the current match start position (can be modified
434 by encountering \K)
435 offset_top current top pointer
436 md pointer to "static" info for the match
437 ims current /i, /m, and /s options
438 eptrb pointer to chain of blocks containing eptr at start of
439 brackets - for testing for empty matches
440 flags can contain
441 match_condassert - this is an assertion condition
442 match_cbegroup - this is the start of an unlimited repeat
443 group that can match an empty string
444 rdepth the recursion depth
445
446 Returns: MATCH_MATCH if matched ) these values are >= 0
447 MATCH_NOMATCH if failed to match )
448 a negative PCRE_ERROR_xxx value if aborted by an error condition
449 (e.g. stopped by repeated call or recursion limit)
450 */
451
452 static int
453 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
454 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
455 int flags, unsigned int rdepth)
456 {
457 /* These variables do not need to be preserved over recursion in this function,
458 so they can be ordinary variables in all cases. Mark some of them with
459 "register" because they are used a lot in loops. */
460
461 register int rrc; /* Returns from recursive calls */
462 register int i; /* Used for loops not involving calls to RMATCH() */
463 register unsigned int c; /* Character values not kept over RMATCH() calls */
464 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
465
466 BOOL minimize, possessive; /* Quantifier options */
467 int condcode;
468
469 /* When recursion is not being used, all "local" variables that have to be
470 preserved over calls to RMATCH() are part of a "frame" which is obtained from
471 heap storage. Set up the top-level frame here; others are obtained from the
472 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
473
474 #ifdef NO_RECURSE
475 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
476 frame->Xprevframe = NULL; /* Marks the top level */
477
478 /* Copy in the original argument variables */
479
480 frame->Xeptr = eptr;
481 frame->Xecode = ecode;
482 frame->Xmstart = mstart;
483 frame->Xoffset_top = offset_top;
484 frame->Xims = ims;
485 frame->Xeptrb = eptrb;
486 frame->Xflags = flags;
487 frame->Xrdepth = rdepth;
488
489 /* This is where control jumps back to in order to effect "recursion" */
490
491 HEAP_RECURSE:
492
493 /* Macros make the argument variables come from the current frame */
494
495 #define eptr frame->Xeptr
496 #define ecode frame->Xecode
497 #define mstart frame->Xmstart
498 #define offset_top frame->Xoffset_top
499 #define ims frame->Xims
500 #define eptrb frame->Xeptrb
501 #define flags frame->Xflags
502 #define rdepth frame->Xrdepth
503
504 /* Ditto for the local variables */
505
506 #ifdef SUPPORT_UTF8
507 #define charptr frame->Xcharptr
508 #endif
509 #define callpat frame->Xcallpat
510 #define codelink frame->Xcodelink
511 #define data frame->Xdata
512 #define next frame->Xnext
513 #define pp frame->Xpp
514 #define prev frame->Xprev
515 #define saved_eptr frame->Xsaved_eptr
516
517 #define new_recursive frame->Xnew_recursive
518
519 #define cur_is_word frame->Xcur_is_word
520 #define condition frame->Xcondition
521 #define prev_is_word frame->Xprev_is_word
522
523 #define original_ims frame->Xoriginal_ims
524
525 #ifdef SUPPORT_UCP
526 #define prop_type frame->Xprop_type
527 #define prop_value frame->Xprop_value
528 #define prop_fail_result frame->Xprop_fail_result
529 #define prop_category frame->Xprop_category
530 #define prop_chartype frame->Xprop_chartype
531 #define prop_script frame->Xprop_script
532 #define oclength frame->Xoclength
533 #define occhars frame->Xocchars
534 #endif
535
536 #define ctype frame->Xctype
537 #define fc frame->Xfc
538 #define fi frame->Xfi
539 #define length frame->Xlength
540 #define max frame->Xmax
541 #define min frame->Xmin
542 #define number frame->Xnumber
543 #define offset frame->Xoffset
544 #define op frame->Xop
545 #define save_capture_last frame->Xsave_capture_last
546 #define save_offset1 frame->Xsave_offset1
547 #define save_offset2 frame->Xsave_offset2
548 #define save_offset3 frame->Xsave_offset3
549 #define stacksave frame->Xstacksave
550
551 #define newptrb frame->Xnewptrb
552
553 /* When recursion is being used, local variables are allocated on the stack and
554 get preserved during recursion in the normal way. In this environment, fi and
555 i, and fc and c, can be the same variables. */
556
557 #else /* NO_RECURSE not defined */
558 #define fi i
559 #define fc c
560
561
562 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
563 const uschar *charptr; /* in small blocks of the code. My normal */
564 #endif /* style of coding would have declared */
565 const uschar *callpat; /* them within each of those blocks. */
566 const uschar *data; /* However, in order to accommodate the */
567 const uschar *next; /* version of this code that uses an */
568 USPTR pp; /* external "stack" implemented on the */
569 const uschar *prev; /* heap, it is easier to declare them all */
570 USPTR saved_eptr; /* here, so the declarations can be cut */
571 /* out in a block. The only declarations */
572 recursion_info new_recursive; /* within blocks below are for variables */
573 /* that do not have to be preserved over */
574 BOOL cur_is_word; /* a recursive call to RMATCH(). */
575 BOOL condition;
576 BOOL prev_is_word;
577
578 unsigned long int original_ims;
579
580 #ifdef SUPPORT_UCP
581 int prop_type;
582 int prop_value;
583 int prop_fail_result;
584 int prop_category;
585 int prop_chartype;
586 int prop_script;
587 int oclength;
588 uschar occhars[8];
589 #endif
590
591 int codelink;
592 int ctype;
593 int length;
594 int max;
595 int min;
596 int number;
597 int offset;
598 int op;
599 int save_capture_last;
600 int save_offset1, save_offset2, save_offset3;
601 int stacksave[REC_STACK_SAVE_MAX];
602
603 eptrblock newptrb;
604 #endif /* NO_RECURSE */
605
606 /* These statements are here to stop the compiler complaining about uninitialized
607 variables. */
608
609 #ifdef SUPPORT_UCP
610 prop_value = 0;
611 prop_fail_result = 0;
612 #endif
613
614
615 /* This label is used for tail recursion, which is used in a few cases even
616 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
617 used. Thanks to Ian Taylor for noticing this possibility and sending the
618 original patch. */
619
620 TAIL_RECURSE:
621
622 /* OK, now we can get on with the real code of the function. Recursive calls
623 are specified by the macro RMATCH and RRETURN is used to return. When
624 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
625 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
626 defined). However, RMATCH isn't like a function call because it's quite a
627 complicated macro. It has to be used in one particular way. This shouldn't,
628 however, impact performance when true recursion is being used. */
629
630 #ifdef SUPPORT_UTF8
631 utf8 = md->utf8; /* Local copy of the flag */
632 #else
633 utf8 = FALSE;
634 #endif
635
636 /* First check that we haven't called match() too many times, or that we
637 haven't exceeded the recursive call limit. */
638
639 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
640 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
641
642 original_ims = ims; /* Save for resetting on ')' */
643
644 /* At the start of a group with an unlimited repeat that may match an empty
645 string, the match_cbegroup flag is set. When this is the case, add the current
646 subject pointer to the chain of such remembered pointers, to be checked when we
647 hit the closing ket, in order to break infinite loops that match no characters.
648 When match() is called in other circumstances, don't add to the chain. The
649 match_cbegroup flag must NOT be used with tail recursion, because the memory
650 block that is used is on the stack, so a new one may be required for each
651 match(). */
652
653 if ((flags & match_cbegroup) != 0)
654 {
655 newptrb.epb_saved_eptr = eptr;
656 newptrb.epb_prev = eptrb;
657 eptrb = &newptrb;
658 }
659
660 /* Now start processing the opcodes. */
661
662 for (;;)
663 {
664 minimize = possessive = FALSE;
665 op = *ecode;
666
667 switch(op)
668 {
669 case OP_FAIL:
670 RRETURN(MATCH_NOMATCH);
671
672 case OP_PRUNE:
673 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
674 ims, eptrb, flags, RM51);
675 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
676 RRETURN(MATCH_PRUNE);
677
678 case OP_COMMIT:
679 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
680 ims, eptrb, flags, RM52);
681 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
682 RRETURN(MATCH_COMMIT);
683
684 case OP_SKIP:
685 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
686 ims, eptrb, flags, RM53);
687 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
688 md->start_match_ptr = eptr; /* Pass back current position */
689 RRETURN(MATCH_SKIP);
690
691 case OP_THEN:
692 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
693 ims, eptrb, flags, RM54);
694 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
695 RRETURN(MATCH_THEN);
696
697 /* Handle a capturing bracket. If there is space in the offset vector, save
698 the current subject position in the working slot at the top of the vector.
699 We mustn't change the current values of the data slot, because they may be
700 set from a previous iteration of this group, and be referred to by a
701 reference inside the group.
702
703 If the bracket fails to match, we need to restore this value and also the
704 values of the final offsets, in case they were set by a previous iteration
705 of the same bracket.
706
707 If there isn't enough space in the offset vector, treat this as if it were
708 a non-capturing bracket. Don't worry about setting the flag for the error
709 case here; that is handled in the code for KET. */
710
711 case OP_CBRA:
712 case OP_SCBRA:
713 number = GET2(ecode, 1+LINK_SIZE);
714 offset = number << 1;
715
716 #ifdef PCRE_DEBUG
717 printf("start bracket %d\n", number);
718 printf("subject=");
719 pchars(eptr, 16, TRUE, md);
720 printf("\n");
721 #endif
722
723 if (offset < md->offset_max)
724 {
725 save_offset1 = md->offset_vector[offset];
726 save_offset2 = md->offset_vector[offset+1];
727 save_offset3 = md->offset_vector[md->offset_end - number];
728 save_capture_last = md->capture_last;
729
730 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
731 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
732
733 flags = (op == OP_SCBRA)? match_cbegroup : 0;
734 do
735 {
736 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
737 ims, eptrb, flags, RM1);
738 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
739 md->capture_last = save_capture_last;
740 ecode += GET(ecode, 1);
741 }
742 while (*ecode == OP_ALT);
743
744 DPRINTF(("bracket %d failed\n", number));
745
746 md->offset_vector[offset] = save_offset1;
747 md->offset_vector[offset+1] = save_offset2;
748 md->offset_vector[md->offset_end - number] = save_offset3;
749
750 RRETURN(MATCH_NOMATCH);
751 }
752
753 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
754 as a non-capturing bracket. */
755
756 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
757 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
758
759 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
760
761 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
762 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
763
764 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
765 final alternative within the brackets, we would return the result of a
766 recursive call to match() whatever happened. We can reduce stack usage by
767 turning this into a tail recursion, except in the case when match_cbegroup
768 is set.*/
769
770 case OP_BRA:
771 case OP_SBRA:
772 DPRINTF(("start non-capturing bracket\n"));
773 flags = (op >= OP_SBRA)? match_cbegroup : 0;
774 for (;;)
775 {
776 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
777 {
778 if (flags == 0) /* Not a possibly empty group */
779 {
780 ecode += _pcre_OP_lengths[*ecode];
781 DPRINTF(("bracket 0 tail recursion\n"));
782 goto TAIL_RECURSE;
783 }
784
785 /* Possibly empty group; can't use tail recursion. */
786
787 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
788 eptrb, flags, RM48);
789 RRETURN(rrc);
790 }
791
792 /* For non-final alternatives, continue the loop for a NOMATCH result;
793 otherwise return. */
794
795 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
796 eptrb, flags, RM2);
797 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
798 ecode += GET(ecode, 1);
799 }
800 /* Control never reaches here. */
801
802 /* Conditional group: compilation checked that there are no more than
803 two branches. If the condition is false, skipping the first branch takes us
804 past the end if there is only one branch, but that's OK because that is
805 exactly what going to the ket would do. As there is only one branch to be
806 obeyed, we can use tail recursion to avoid using another stack frame. */
807
808 case OP_COND:
809 case OP_SCOND:
810 codelink= GET(ecode, 1);
811
812 /* Because of the way auto-callout works during compile, a callout item is
813 inserted between OP_COND and an assertion condition. */
814
815 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
816 {
817 if (pcre_callout != NULL)
818 {
819 pcre_callout_block cb;
820 cb.version = 1; /* Version 1 of the callout block */
821 cb.callout_number = ecode[LINK_SIZE+2];
822 cb.offset_vector = md->offset_vector;
823 cb.subject = (PCRE_SPTR)md->start_subject;
824 cb.subject_length = md->end_subject - md->start_subject;
825 cb.start_match = mstart - md->start_subject;
826 cb.current_position = eptr - md->start_subject;
827 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
828 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
829 cb.capture_top = offset_top/2;
830 cb.capture_last = md->capture_last;
831 cb.callout_data = md->callout_data;
832 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
833 if (rrc < 0) RRETURN(rrc);
834 }
835 ecode += _pcre_OP_lengths[OP_CALLOUT];
836 }
837
838 condcode = ecode[LINK_SIZE+1];
839
840 /* Now see what the actual condition is */
841
842 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
843 {
844 if (md->recursive == NULL) /* Not recursing => FALSE */
845 {
846 condition = FALSE;
847 ecode += GET(ecode, 1);
848 }
849 else
850 {
851 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
852 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
853
854 /* If the test is for recursion into a specific subpattern, and it is
855 false, but the test was set up by name, scan the table to see if the
856 name refers to any other numbers, and test them. The condition is true
857 if any one is set. */
858
859 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
860 {
861 uschar *slotA = md->name_table;
862 for (i = 0; i < md->name_count; i++)
863 {
864 if (GET2(slotA, 0) == recno) break;
865 slotA += md->name_entry_size;
866 }
867
868 /* Found a name for the number - there can be only one; duplicate
869 names for different numbers are allowed, but not vice versa. First
870 scan down for duplicates. */
871
872 if (i < md->name_count)
873 {
874 uschar *slotB = slotA;
875 while (slotB > md->name_table)
876 {
877 slotB -= md->name_entry_size;
878 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
879 {
880 condition = GET2(slotB, 0) == md->recursive->group_num;
881 if (condition) break;
882 }
883 else break;
884 }
885
886 /* Scan up for duplicates */
887
888 if (!condition)
889 {
890 slotB = slotA;
891 for (i++; i < md->name_count; i++)
892 {
893 slotB += md->name_entry_size;
894 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
895 {
896 condition = GET2(slotB, 0) == md->recursive->group_num;
897 if (condition) break;
898 }
899 else break;
900 }
901 }
902 }
903 }
904
905 /* Choose branch according to the condition */
906
907 ecode += condition? 3 : GET(ecode, 1);
908 }
909 }
910
911 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
912 {
913 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
914 condition = offset < offset_top && md->offset_vector[offset] >= 0;
915
916 /* If the numbered capture is unset, but the reference was by name,
917 scan the table to see if the name refers to any other numbers, and test
918 them. The condition is true if any one is set. This is tediously similar
919 to the code above, but not close enough to try to amalgamate. */
920
921 if (!condition && condcode == OP_NCREF)
922 {
923 int refno = offset >> 1;
924 uschar *slotA = md->name_table;
925
926 for (i = 0; i < md->name_count; i++)
927 {
928 if (GET2(slotA, 0) == refno) break;
929 slotA += md->name_entry_size;
930 }
931
932 /* Found a name for the number - there can be only one; duplicate names
933 for different numbers are allowed, but not vice versa. First scan down
934 for duplicates. */
935
936 if (i < md->name_count)
937 {
938 uschar *slotB = slotA;
939 while (slotB > md->name_table)
940 {
941 slotB -= md->name_entry_size;
942 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
943 {
944 offset = GET2(slotB, 0) << 1;
945 condition = offset < offset_top &&
946 md->offset_vector[offset] >= 0;
947 if (condition) break;
948 }
949 else break;
950 }
951
952 /* Scan up for duplicates */
953
954 if (!condition)
955 {
956 slotB = slotA;
957 for (i++; i < md->name_count; i++)
958 {
959 slotB += md->name_entry_size;
960 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
961 {
962 offset = GET2(slotB, 0) << 1;
963 condition = offset < offset_top &&
964 md->offset_vector[offset] >= 0;
965 if (condition) break;
966 }
967 else break;
968 }
969 }
970 }
971 }
972
973 /* Choose branch according to the condition */
974
975 ecode += condition? 3 : GET(ecode, 1);
976 }
977
978 else if (condcode == OP_DEF) /* DEFINE - always false */
979 {
980 condition = FALSE;
981 ecode += GET(ecode, 1);
982 }
983
984 /* The condition is an assertion. Call match() to evaluate it - setting
985 the final argument match_condassert causes it to stop at the end of an
986 assertion. */
987
988 else
989 {
990 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
991 match_condassert, RM3);
992 if (rrc == MATCH_MATCH)
993 {
994 condition = TRUE;
995 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
996 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
997 }
998 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
999 {
1000 RRETURN(rrc); /* Need braces because of following else */
1001 }
1002 else
1003 {
1004 condition = FALSE;
1005 ecode += codelink;
1006 }
1007 }
1008
1009 /* We are now at the branch that is to be obeyed. As there is only one,
1010 we can use tail recursion to avoid using another stack frame, except when
1011 match_cbegroup is required for an unlimited repeat of a possibly empty
1012 group. If the second alternative doesn't exist, we can just plough on. */
1013
1014 if (condition || *ecode == OP_ALT)
1015 {
1016 ecode += 1 + LINK_SIZE;
1017 if (op == OP_SCOND) /* Possibly empty group */
1018 {
1019 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1020 RRETURN(rrc);
1021 }
1022 else /* Group must match something */
1023 {
1024 flags = 0;
1025 goto TAIL_RECURSE;
1026 }
1027 }
1028 else /* Condition false & no alternative */
1029 {
1030 ecode += 1 + LINK_SIZE;
1031 }
1032 break;
1033
1034
1035 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1036 to close any currently open capturing brackets. */
1037
1038 case OP_CLOSE:
1039 number = GET2(ecode, 1);
1040 offset = number << 1;
1041
1042 #ifdef PCRE_DEBUG
1043 printf("end bracket %d at *ACCEPT", number);
1044 printf("\n");
1045 #endif
1046
1047 md->capture_last = number;
1048 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1049 {
1050 md->offset_vector[offset] =
1051 md->offset_vector[md->offset_end - number];
1052 md->offset_vector[offset+1] = eptr - md->start_subject;
1053 if (offset_top <= offset) offset_top = offset + 2;
1054 }
1055 ecode += 3;
1056 break;
1057
1058
1059 /* End of the pattern, either real or forced. If we are in a top-level
1060 recursion, we should restore the offsets appropriately and continue from
1061 after the call. */
1062
1063 case OP_ACCEPT:
1064 case OP_END:
1065 if (md->recursive != NULL && md->recursive->group_num == 0)
1066 {
1067 recursion_info *rec = md->recursive;
1068 DPRINTF(("End of pattern in a (?0) recursion\n"));
1069 md->recursive = rec->prevrec;
1070 memmove(md->offset_vector, rec->offset_save,
1071 rec->saved_max * sizeof(int));
1072 offset_top = rec->save_offset_top;
1073 ims = original_ims;
1074 ecode = rec->after_call;
1075 break;
1076 }
1077
1078 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1079 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1080 the subject. In both cases, backtracking will then try other alternatives,
1081 if any. */
1082
1083 if (eptr == mstart &&
1084 (md->notempty ||
1085 (md->notempty_atstart &&
1086 mstart == md->start_subject + md->start_offset)))
1087 RRETURN(MATCH_NOMATCH);
1088
1089 /* Otherwise, we have a match. */
1090
1091 md->end_match_ptr = eptr; /* Record where we ended */
1092 md->end_offset_top = offset_top; /* and how many extracts were taken */
1093 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1094 RRETURN(MATCH_MATCH);
1095
1096 /* Change option settings */
1097
1098 case OP_OPT:
1099 ims = ecode[1];
1100 ecode += 2;
1101 DPRINTF(("ims set to %02lx\n", ims));
1102 break;
1103
1104 /* Assertion brackets. Check the alternative branches in turn - the
1105 matching won't pass the KET for an assertion. If any one branch matches,
1106 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1107 start of each branch to move the current point backwards, so the code at
1108 this level is identical to the lookahead case. */
1109
1110 case OP_ASSERT:
1111 case OP_ASSERTBACK:
1112 do
1113 {
1114 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1115 RM4);
1116 if (rrc == MATCH_MATCH)
1117 {
1118 mstart = md->start_match_ptr; /* In case \K reset it */
1119 break;
1120 }
1121 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1122 ecode += GET(ecode, 1);
1123 }
1124 while (*ecode == OP_ALT);
1125 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1126
1127 /* If checking an assertion for a condition, return MATCH_MATCH. */
1128
1129 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1130
1131 /* Continue from after the assertion, updating the offsets high water
1132 mark, since extracts may have been taken during the assertion. */
1133
1134 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1135 ecode += 1 + LINK_SIZE;
1136 offset_top = md->end_offset_top;
1137 continue;
1138
1139 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1140 PRUNE, or COMMIT means we must assume failure without checking subsequent
1141 branches. */
1142
1143 case OP_ASSERT_NOT:
1144 case OP_ASSERTBACK_NOT:
1145 do
1146 {
1147 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1148 RM5);
1149 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
1150 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1151 {
1152 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1153 break;
1154 }
1155 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1156 ecode += GET(ecode,1);
1157 }
1158 while (*ecode == OP_ALT);
1159
1160 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1161
1162 ecode += 1 + LINK_SIZE;
1163 continue;
1164
1165 /* Move the subject pointer back. This occurs only at the start of
1166 each branch of a lookbehind assertion. If we are too close to the start to
1167 move back, this match function fails. When working with UTF-8 we move
1168 back a number of characters, not bytes. */
1169
1170 case OP_REVERSE:
1171 #ifdef SUPPORT_UTF8
1172 if (utf8)
1173 {
1174 i = GET(ecode, 1);
1175 while (i-- > 0)
1176 {
1177 eptr--;
1178 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1179 BACKCHAR(eptr);
1180 }
1181 }
1182 else
1183 #endif
1184
1185 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1186
1187 {
1188 eptr -= GET(ecode, 1);
1189 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1190 }
1191
1192 /* Save the earliest consulted character, then skip to next op code */
1193
1194 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1195 ecode += 1 + LINK_SIZE;
1196 break;
1197
1198 /* The callout item calls an external function, if one is provided, passing
1199 details of the match so far. This is mainly for debugging, though the
1200 function is able to force a failure. */
1201
1202 case OP_CALLOUT:
1203 if (pcre_callout != NULL)
1204 {
1205 pcre_callout_block cb;
1206 cb.version = 1; /* Version 1 of the callout block */
1207 cb.callout_number = ecode[1];
1208 cb.offset_vector = md->offset_vector;
1209 cb.subject = (PCRE_SPTR)md->start_subject;
1210 cb.subject_length = md->end_subject - md->start_subject;
1211 cb.start_match = mstart - md->start_subject;
1212 cb.current_position = eptr - md->start_subject;
1213 cb.pattern_position = GET(ecode, 2);
1214 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1215 cb.capture_top = offset_top/2;
1216 cb.capture_last = md->capture_last;
1217 cb.callout_data = md->callout_data;
1218 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1219 if (rrc < 0) RRETURN(rrc);
1220 }
1221 ecode += 2 + 2*LINK_SIZE;
1222 break;
1223
1224 /* Recursion either matches the current regex, or some subexpression. The
1225 offset data is the offset to the starting bracket from the start of the
1226 whole pattern. (This is so that it works from duplicated subpatterns.)
1227
1228 If there are any capturing brackets started but not finished, we have to
1229 save their starting points and reinstate them after the recursion. However,
1230 we don't know how many such there are (offset_top records the completed
1231 total) so we just have to save all the potential data. There may be up to
1232 65535 such values, which is too large to put on the stack, but using malloc
1233 for small numbers seems expensive. As a compromise, the stack is used when
1234 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1235 is used. If the malloc fails, no offsets are saved and the recursion is not
1236 attempted: PCRE_ERROR_NOMEMORY is passed back through the normal return
1237 mechanism, in the same way as any other error code.
1238
1239 There are also other values that have to be saved. We use a chained
1240 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1241 for the original version of this logic. */
1242
1243 case OP_RECURSE:
1244 {
1245 callpat = md->start_code + GET(ecode, 1);
1246 new_recursive.group_num = (callpat == md->start_code)? 0 :
1247 GET2(callpat, 1 + LINK_SIZE);
1248
1249 /* Add to "recursing stack" */
1250
1251 new_recursive.prevrec = md->recursive;
1252 md->recursive = &new_recursive;
1253
1254 /* Find where to continue from afterwards */
1255
1256 ecode += 1 + LINK_SIZE;
1257 new_recursive.after_call = ecode;
1258
1259 /* Now save the offset data. */
1260
1261 new_recursive.saved_max = md->offset_end;
1262 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1263 new_recursive.offset_save = stacksave;
1264 else
1265 {
1266 new_recursive.offset_save =
1267 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1268 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1269 }
1270
1271 memcpy(new_recursive.offset_save, md->offset_vector,
1272 new_recursive.saved_max * sizeof(int));
1273 new_recursive.save_offset_top = offset_top;
1274
1275 /* OK, now we can do the recursion. For each top-level alternative we
1276 restore the offset and recursion data. */
1277
1278 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1279 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1280 do
1281 {
1282 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1283 md, ims, eptrb, flags, RM6);
1284 if (rrc == MATCH_MATCH)
1285 {
1286 DPRINTF(("Recursion matched\n"));
1287 md->recursive = new_recursive.prevrec;
1288 if (new_recursive.offset_save != stacksave)
1289 (pcre_free)(new_recursive.offset_save);
1290 RRETURN(MATCH_MATCH);
1291 }
1292 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1293 {
1294 DPRINTF(("Recursion gave error %d\n", rrc));
1295 if (new_recursive.offset_save != stacksave)
1296 (pcre_free)(new_recursive.offset_save);
1297 RRETURN(rrc);
1298 }
1299
1300 md->recursive = &new_recursive;
1301 memcpy(md->offset_vector, new_recursive.offset_save,
1302 new_recursive.saved_max * sizeof(int));
1303 callpat += GET(callpat, 1);
1304 }
1305 while (*callpat == OP_ALT);
1306
1307 DPRINTF(("Recursion didn't match\n"));
1308 md->recursive = new_recursive.prevrec;
1309 if (new_recursive.offset_save != stacksave)
1310 (pcre_free)(new_recursive.offset_save);
1311 RRETURN(MATCH_NOMATCH);
1312 }
1313 /* Control never reaches here */
1314
1315 /* "Once" brackets are like assertion brackets except that after a match,
1316 the point in the subject string is not moved back. Thus there can never be
1317 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1318 Check the alternative branches in turn - the matching won't pass the KET
1319 for this kind of subpattern. If any one branch matches, we carry on as at
1320 the end of a normal bracket, leaving the subject pointer, but resetting
1321 the start-of-match value in case it was changed by \K. */
1322
1323 case OP_ONCE:
1324 prev = ecode;
1325 saved_eptr = eptr;
1326
1327 do
1328 {
1329 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1330 if (rrc == MATCH_MATCH)
1331 {
1332 mstart = md->start_match_ptr;
1333 break;
1334 }
1335 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1336 ecode += GET(ecode,1);
1337 }
1338 while (*ecode == OP_ALT);
1339
1340 /* If we hit the end of the group (which could be repeated), fail */
1341
1342 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1343
1344 /* Continue from after the atomic group, updating the offsets high water
1345 mark, since extracts may have been taken. */
1346
1347 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1348
1349 offset_top = md->end_offset_top;
1350 eptr = md->end_match_ptr;
1351
1352 /* For a non-repeating ket, just continue at this level. This also
1353 happens for a repeating ket if no characters were matched in the group.
1354 This is the forcible breaking of infinite loops as implemented in Perl
1355 5.005. If there is an options reset, it will get obeyed in the normal
1356 course of events. */
1357
1358 if (*ecode == OP_KET || eptr == saved_eptr)
1359 {
1360 ecode += 1+LINK_SIZE;
1361 break;
1362 }
1363
1364 /* The repeating kets try the rest of the pattern or restart from the
1365 preceding bracket, in the appropriate order. The second "call" of match()
1366 uses tail recursion, to avoid using another stack frame. We need to reset
1367 any options that changed within the bracket before re-running it, so
1368 check the next opcode. */
1369
1370 if (ecode[1+LINK_SIZE] == OP_OPT)
1371 {
1372 ims = (ims & ~PCRE_IMS) | ecode[4];
1373 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1374 }
1375
1376 if (*ecode == OP_KETRMIN)
1377 {
1378 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1379 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1380 ecode = prev;
1381 flags = 0;
1382 goto TAIL_RECURSE;
1383 }
1384 else /* OP_KETRMAX */
1385 {
1386 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1387 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1388 ecode += 1 + LINK_SIZE;
1389 flags = 0;
1390 goto TAIL_RECURSE;
1391 }
1392 /* Control never gets here */
1393
1394 /* An alternation is the end of a branch; scan along to find the end of the
1395 bracketed group and go to there. */
1396
1397 case OP_ALT:
1398 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1399 break;
1400
1401 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1402 indicating that it may occur zero times. It may repeat infinitely, or not
1403 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1404 with fixed upper repeat limits are compiled as a number of copies, with the
1405 optional ones preceded by BRAZERO or BRAMINZERO. */
1406
1407 case OP_BRAZERO:
1408 {
1409 next = ecode+1;
1410 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1411 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1412 do next += GET(next,1); while (*next == OP_ALT);
1413 ecode = next + 1 + LINK_SIZE;
1414 }
1415 break;
1416
1417 case OP_BRAMINZERO:
1418 {
1419 next = ecode+1;
1420 do next += GET(next, 1); while (*next == OP_ALT);
1421 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1422 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1423 ecode++;
1424 }
1425 break;
1426
1427 case OP_SKIPZERO:
1428 {
1429 next = ecode+1;
1430 do next += GET(next,1); while (*next == OP_ALT);
1431 ecode = next + 1 + LINK_SIZE;
1432 }
1433 break;
1434
1435 /* End of a group, repeated or non-repeating. */
1436
1437 case OP_KET:
1438 case OP_KETRMIN:
1439 case OP_KETRMAX:
1440 prev = ecode - GET(ecode, 1);
1441
1442 /* If this was a group that remembered the subject start, in order to break
1443 infinite repeats of empty string matches, retrieve the subject start from
1444 the chain. Otherwise, set it NULL. */
1445
1446 if (*prev >= OP_SBRA)
1447 {
1448 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1449 eptrb = eptrb->epb_prev; /* Backup to previous group */
1450 }
1451 else saved_eptr = NULL;
1452
1453 /* If we are at the end of an assertion group or an atomic group, stop
1454 matching and return MATCH_MATCH, but record the current high water mark for
1455 use by positive assertions. We also need to record the match start in case
1456 it was changed by \K. */
1457
1458 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1459 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1460 *prev == OP_ONCE)
1461 {
1462 md->end_match_ptr = eptr; /* For ONCE */
1463 md->end_offset_top = offset_top;
1464 md->start_match_ptr = mstart;
1465 RRETURN(MATCH_MATCH);
1466 }
1467
1468 /* For capturing groups we have to check the group number back at the start
1469 and if necessary complete handling an extraction by setting the offsets and
1470 bumping the high water mark. Note that whole-pattern recursion is coded as
1471 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1472 when the OP_END is reached. Other recursion is handled here. */
1473
1474 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1475 {
1476 number = GET2(prev, 1+LINK_SIZE);
1477 offset = number << 1;
1478
1479 #ifdef PCRE_DEBUG
1480 printf("end bracket %d", number);
1481 printf("\n");
1482 #endif
1483
1484 md->capture_last = number;
1485 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1486 {
1487 md->offset_vector[offset] =
1488 md->offset_vector[md->offset_end - number];
1489 md->offset_vector[offset+1] = eptr - md->start_subject;
1490 if (offset_top <= offset) offset_top = offset + 2;
1491 }
1492
1493 /* Handle a recursively called group. Restore the offsets
1494 appropriately and continue from after the call. */
1495
1496 if (md->recursive != NULL && md->recursive->group_num == number)
1497 {
1498 recursion_info *rec = md->recursive;
1499 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1500 md->recursive = rec->prevrec;
1501 memcpy(md->offset_vector, rec->offset_save,
1502 rec->saved_max * sizeof(int));
1503 offset_top = rec->save_offset_top;
1504 ecode = rec->after_call;
1505 ims = original_ims;
1506 break;
1507 }
1508 }
1509
1510 /* For both capturing and non-capturing groups, reset the value of the ims
1511 flags, in case they got changed during the group. */
1512
1513 ims = original_ims;
1514 DPRINTF(("ims reset to %02lx\n", ims));
1515
1516 /* For a non-repeating ket, just continue at this level. This also
1517 happens for a repeating ket if no characters were matched in the group.
1518 This is the forcible breaking of infinite loops as implemented in Perl
1519 5.005. If there is an options reset, it will get obeyed in the normal
1520 course of events. */
1521
1522 if (*ecode == OP_KET || eptr == saved_eptr)
1523 {
1524 ecode += 1 + LINK_SIZE;
1525 break;
1526 }
1527
1528 /* The repeating kets try the rest of the pattern or restart from the
1529 preceding bracket, in the appropriate order. In the second case, we can use
1530 tail recursion to avoid using another stack frame, unless we have an
1531 unlimited repeat of a group that can match an empty string. */
1532
1533 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1534
1535 if (*ecode == OP_KETRMIN)
1536 {
1537 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1538 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1539 if (flags != 0) /* Could match an empty string */
1540 {
1541 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1542 RRETURN(rrc);
1543 }
1544 ecode = prev;
1545 goto TAIL_RECURSE;
1546 }
1547 else /* OP_KETRMAX */
1548 {
1549 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1550 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1551 ecode += 1 + LINK_SIZE;
1552 flags = 0;
1553 goto TAIL_RECURSE;
1554 }
1555 /* Control never gets here */
1556
1557 /* Start of subject unless notbol, or after internal newline if multiline */
1558
1559 case OP_CIRC:
1560 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1561 if ((ims & PCRE_MULTILINE) != 0)
1562 {
1563 if (eptr != md->start_subject &&
1564 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1565 RRETURN(MATCH_NOMATCH);
1566 ecode++;
1567 break;
1568 }
1569 /* ... else fall through */
1570
1571 /* Start of subject assertion */
1572
1573 case OP_SOD:
1574 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1575 ecode++;
1576 break;
1577
1578 /* Start of match assertion */
1579
1580 case OP_SOM:
1581 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1582 ecode++;
1583 break;
1584
1585 /* Reset the start of match point */
1586
1587 case OP_SET_SOM:
1588 mstart = eptr;
1589 ecode++;
1590 break;
1591
1592 /* Assert before internal newline if multiline, or before a terminating
1593 newline unless endonly is set, else end of subject unless noteol is set. */
1594
1595 case OP_DOLL:
1596 if ((ims & PCRE_MULTILINE) != 0)
1597 {
1598 if (eptr < md->end_subject)
1599 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1600 else
1601 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1602 ecode++;
1603 break;
1604 }
1605 else
1606 {
1607 if (md->noteol) RRETURN(MATCH_NOMATCH);
1608 if (!md->endonly)
1609 {
1610 if (eptr != md->end_subject &&
1611 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1612 RRETURN(MATCH_NOMATCH);
1613 ecode++;
1614 break;
1615 }
1616 }
1617 /* ... else fall through for endonly */
1618
1619 /* End of subject assertion (\z) */
1620
1621 case OP_EOD:
1622 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1623 ecode++;
1624 break;
1625
1626 /* End of subject or ending \n assertion (\Z) */
1627
1628 case OP_EODN:
1629 if (eptr != md->end_subject &&
1630 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1631 RRETURN(MATCH_NOMATCH);
1632 ecode++;
1633 break;
1634
1635 /* Word boundary assertions */
1636
1637 case OP_NOT_WORD_BOUNDARY:
1638 case OP_WORD_BOUNDARY:
1639 {
1640
1641 /* Find out if the previous and current characters are "word" characters.
1642 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1643 be "non-word" characters. Remember the earliest consulted character for
1644 partial matching. */
1645
1646 #ifdef SUPPORT_UTF8
1647 if (utf8)
1648 {
1649 if (eptr == md->start_subject) prev_is_word = FALSE; else
1650 {
1651 USPTR lastptr = eptr - 1;
1652 while((*lastptr & 0xc0) == 0x80) lastptr--;
1653 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1654 GETCHAR(c, lastptr);
1655 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1656 }
1657 if (eptr >= md->end_subject)
1658 {
1659 SCHECK_PARTIAL();
1660 cur_is_word = FALSE;
1661 }
1662 else
1663 {
1664 GETCHAR(c, eptr);
1665 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1666 }
1667 }
1668 else
1669 #endif
1670
1671 /* Not in UTF-8 mode */
1672
1673 {
1674 if (eptr == md->start_subject) prev_is_word = FALSE; else
1675 {
1676 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1677 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1678 }
1679 if (eptr >= md->end_subject)
1680 {
1681 SCHECK_PARTIAL();
1682 cur_is_word = FALSE;
1683 }
1684 else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1685 }
1686
1687 /* Now see if the situation is what we want */
1688
1689 if ((*ecode++ == OP_WORD_BOUNDARY)?
1690 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1691 RRETURN(MATCH_NOMATCH);
1692 }
1693 break;
1694
1695 /* Match a single character type; inline for speed */
1696
1697 case OP_ANY:
1698 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1699 /* Fall through */
1700
1701 case OP_ALLANY:
1702 if (eptr++ >= md->end_subject)
1703 {
1704 SCHECK_PARTIAL();
1705 RRETURN(MATCH_NOMATCH);
1706 }
1707 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1708 ecode++;
1709 break;
1710
1711 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1712 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1713
1714 case OP_ANYBYTE:
1715 if (eptr++ >= md->end_subject)
1716 {
1717 SCHECK_PARTIAL();
1718 RRETURN(MATCH_NOMATCH);
1719 }
1720 ecode++;
1721 break;
1722
1723 case OP_NOT_DIGIT:
1724 if (eptr >= md->end_subject)
1725 {
1726 SCHECK_PARTIAL();
1727 RRETURN(MATCH_NOMATCH);
1728 }
1729 GETCHARINCTEST(c, eptr);
1730 if (
1731 #ifdef SUPPORT_UTF8
1732 c < 256 &&
1733 #endif
1734 (md->ctypes[c] & ctype_digit) != 0
1735 )
1736 RRETURN(MATCH_NOMATCH);
1737 ecode++;
1738 break;
1739
1740 case OP_DIGIT:
1741 if (eptr >= md->end_subject)
1742 {
1743 SCHECK_PARTIAL();
1744 RRETURN(MATCH_NOMATCH);
1745 }
1746 GETCHARINCTEST(c, eptr);
1747 if (
1748 #ifdef SUPPORT_UTF8
1749 c >= 256 ||
1750 #endif
1751 (md->ctypes[c] & ctype_digit) == 0
1752 )
1753 RRETURN(MATCH_NOMATCH);
1754 ecode++;
1755 break;
1756
1757 case OP_NOT_WHITESPACE:
1758 if (eptr >= md->end_subject)
1759 {
1760 SCHECK_PARTIAL();
1761 RRETURN(MATCH_NOMATCH);
1762 }
1763 GETCHARINCTEST(c, eptr);
1764 if (
1765 #ifdef SUPPORT_UTF8
1766 c < 256 &&
1767 #endif
1768 (md->ctypes[c] & ctype_space) != 0
1769 )
1770 RRETURN(MATCH_NOMATCH);
1771 ecode++;
1772 break;
1773
1774 case OP_WHITESPACE:
1775 if (eptr >= md->end_subject)
1776 {
1777 SCHECK_PARTIAL();
1778 RRETURN(MATCH_NOMATCH);
1779 }
1780 GETCHARINCTEST(c, eptr);
1781 if (
1782 #ifdef SUPPORT_UTF8
1783 c >= 256 ||
1784 #endif
1785 (md->ctypes[c] & ctype_space) == 0
1786 )
1787 RRETURN(MATCH_NOMATCH);
1788 ecode++;
1789 break;
1790
1791 case OP_NOT_WORDCHAR:
1792 if (eptr >= md->end_subject)
1793 {
1794 SCHECK_PARTIAL();
1795 RRETURN(MATCH_NOMATCH);
1796 }
1797 GETCHARINCTEST(c, eptr);
1798 if (
1799 #ifdef SUPPORT_UTF8
1800 c < 256 &&
1801 #endif
1802 (md->ctypes[c] & ctype_word) != 0
1803 )
1804 RRETURN(MATCH_NOMATCH);
1805 ecode++;
1806 break;
1807
1808 case OP_WORDCHAR:
1809 if (eptr >= md->end_subject)
1810 {
1811 SCHECK_PARTIAL();
1812 RRETURN(MATCH_NOMATCH);
1813 }
1814 GETCHARINCTEST(c, eptr);
1815 if (
1816 #ifdef SUPPORT_UTF8
1817 c >= 256 ||
1818 #endif
1819 (md->ctypes[c] & ctype_word) == 0
1820 )
1821 RRETURN(MATCH_NOMATCH);
1822 ecode++;
1823 break;
1824
1825 case OP_ANYNL:
1826 if (eptr >= md->end_subject)
1827 {
1828 SCHECK_PARTIAL();
1829 RRETURN(MATCH_NOMATCH);
1830 }
1831 GETCHARINCTEST(c, eptr);
1832 switch(c)
1833 {
1834 default: RRETURN(MATCH_NOMATCH);
1835 case 0x000d:
1836 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1837 break;
1838
1839 case 0x000a:
1840 break;
1841
1842 case 0x000b:
1843 case 0x000c:
1844 case 0x0085:
1845 case 0x2028:
1846 case 0x2029:
1847 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1848 break;
1849 }
1850 ecode++;
1851 break;
1852
1853 case OP_NOT_HSPACE:
1854 if (eptr >= md->end_subject)
1855 {
1856 SCHECK_PARTIAL();
1857 RRETURN(MATCH_NOMATCH);
1858 }
1859 GETCHARINCTEST(c, eptr);
1860 switch(c)
1861 {
1862 default: break;
1863 case 0x09: /* HT */
1864 case 0x20: /* SPACE */
1865 case 0xa0: /* NBSP */
1866 case 0x1680: /* OGHAM SPACE MARK */
1867 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1868 case 0x2000: /* EN QUAD */
1869 case 0x2001: /* EM QUAD */
1870 case 0x2002: /* EN SPACE */
1871 case 0x2003: /* EM SPACE */
1872 case 0x2004: /* THREE-PER-EM SPACE */
1873 case 0x2005: /* FOUR-PER-EM SPACE */
1874 case 0x2006: /* SIX-PER-EM SPACE */
1875 case 0x2007: /* FIGURE SPACE */
1876 case 0x2008: /* PUNCTUATION SPACE */
1877 case 0x2009: /* THIN SPACE */
1878 case 0x200A: /* HAIR SPACE */
1879 case 0x202f: /* NARROW NO-BREAK SPACE */
1880 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1881 case 0x3000: /* IDEOGRAPHIC SPACE */
1882 RRETURN(MATCH_NOMATCH);
1883 }
1884 ecode++;
1885 break;
1886
1887 case OP_HSPACE:
1888 if (eptr >= md->end_subject)
1889 {
1890 SCHECK_PARTIAL();
1891 RRETURN(MATCH_NOMATCH);
1892 }
1893 GETCHARINCTEST(c, eptr);
1894 switch(c)
1895 {
1896 default: RRETURN(MATCH_NOMATCH);
1897 case 0x09: /* HT */
1898 case 0x20: /* SPACE */
1899 case 0xa0: /* NBSP */
1900 case 0x1680: /* OGHAM SPACE MARK */
1901 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1902 case 0x2000: /* EN QUAD */
1903 case 0x2001: /* EM QUAD */
1904 case 0x2002: /* EN SPACE */
1905 case 0x2003: /* EM SPACE */
1906 case 0x2004: /* THREE-PER-EM SPACE */
1907 case 0x2005: /* FOUR-PER-EM SPACE */
1908 case 0x2006: /* SIX-PER-EM SPACE */
1909 case 0x2007: /* FIGURE SPACE */
1910 case 0x2008: /* PUNCTUATION SPACE */
1911 case 0x2009: /* THIN SPACE */
1912 case 0x200A: /* HAIR SPACE */
1913 case 0x202f: /* NARROW NO-BREAK SPACE */
1914 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1915 case 0x3000: /* IDEOGRAPHIC SPACE */
1916 break;
1917 }
1918 ecode++;
1919 break;
1920
1921 case OP_NOT_VSPACE:
1922 if (eptr >= md->end_subject)
1923 {
1924 SCHECK_PARTIAL();
1925 RRETURN(MATCH_NOMATCH);
1926 }
1927 GETCHARINCTEST(c, eptr);
1928 switch(c)
1929 {
1930 default: break;
1931 case 0x0a: /* LF */
1932 case 0x0b: /* VT */
1933 case 0x0c: /* FF */
1934 case 0x0d: /* CR */
1935 case 0x85: /* NEL */
1936 case 0x2028: /* LINE SEPARATOR */
1937 case 0x2029: /* PARAGRAPH SEPARATOR */
1938 RRETURN(MATCH_NOMATCH);
1939 }
1940 ecode++;
1941 break;
1942
1943 case OP_VSPACE:
1944 if (eptr >= md->end_subject)
1945 {
1946 SCHECK_PARTIAL();
1947 RRETURN(MATCH_NOMATCH);
1948 }
1949 GETCHARINCTEST(c, eptr);
1950 switch(c)
1951 {
1952 default: RRETURN(MATCH_NOMATCH);
1953 case 0x0a: /* LF */
1954 case 0x0b: /* VT */
1955 case 0x0c: /* FF */
1956 case 0x0d: /* CR */
1957 case 0x85: /* NEL */
1958 case 0x2028: /* LINE SEPARATOR */
1959 case 0x2029: /* PARAGRAPH SEPARATOR */
1960 break;
1961 }
1962 ecode++;
1963 break;
1964
1965 #ifdef SUPPORT_UCP
1966 /* Check the next character by Unicode property. We will get here only
1967 if the support is in the binary; otherwise a compile-time error occurs. */
1968
1969 case OP_PROP:
1970 case OP_NOTPROP:
1971 if (eptr >= md->end_subject)
1972 {
1973 SCHECK_PARTIAL();
1974 RRETURN(MATCH_NOMATCH);
1975 }
1976 GETCHARINCTEST(c, eptr);
1977 {
1978 const ucd_record *prop = GET_UCD(c);
1979
1980 switch(ecode[1])
1981 {
1982 case PT_ANY:
1983 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1984 break;
1985
1986 case PT_LAMP:
1987 if ((prop->chartype == ucp_Lu ||
1988 prop->chartype == ucp_Ll ||
1989 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1990 RRETURN(MATCH_NOMATCH);
1991 break;
1992
1993 case PT_GC:
1994 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1995 RRETURN(MATCH_NOMATCH);
1996 break;
1997
1998 case PT_PC:
1999 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2000 RRETURN(MATCH_NOMATCH);
2001 break;
2002
2003 case PT_SC:
2004 if ((ecode[2] != prop->script) == (op == OP_PROP))
2005 RRETURN(MATCH_NOMATCH);
2006 break;
2007
2008 default:
2009 RRETURN(PCRE_ERROR_INTERNAL);
2010 }
2011
2012 ecode += 3;
2013 }
2014 break;
2015
2016 /* Match an extended Unicode sequence. We will get here only if the support
2017 is in the binary; otherwise a compile-time error occurs. */
2018
2019 case OP_EXTUNI:
2020 if (eptr >= md->end_subject)
2021 {
2022 SCHECK_PARTIAL();
2023 RRETURN(MATCH_NOMATCH);
2024 }
2025 GETCHARINCTEST(c, eptr);
2026 {
2027 int category = UCD_CATEGORY(c);
2028 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
2029 while (eptr < md->end_subject)
2030 {
2031 int len = 1;
2032 if (!utf8) c = *eptr; else
2033 {
2034 GETCHARLEN(c, eptr, len);
2035 }
2036 category = UCD_CATEGORY(c);
2037 if (category != ucp_M) break;
2038 eptr += len;
2039 }
2040 }
2041 ecode++;
2042 break;
2043 #endif
2044
2045
2046 /* Match a back reference, possibly repeatedly. Look past the end of the
2047 item to see if there is repeat information following. The code is similar
2048 to that for character classes, but repeated for efficiency. Then obey
2049 similar code to character type repeats - written out again for speed.
2050 However, if the referenced string is the empty string, always treat
2051 it as matched, any number of times (otherwise there could be infinite
2052 loops). */
2053
2054 case OP_REF:
2055 {
2056 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2057 ecode += 3;
2058
2059 /* If the reference is unset, there are two possibilities:
2060
2061 (a) In the default, Perl-compatible state, set the length to be longer
2062 than the amount of subject left; this ensures that every attempt at a
2063 match fails. We can't just fail here, because of the possibility of
2064 quantifiers with zero minima.
2065
2066 (b) If the JavaScript compatibility flag is set, set the length to zero
2067 so that the back reference matches an empty string.
2068
2069 Otherwise, set the length to the length of what was matched by the
2070 referenced subpattern. */
2071
2072 if (offset >= offset_top || md->offset_vector[offset] < 0)
2073 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
2074 else
2075 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2076
2077 /* Set up for repetition, or handle the non-repeated case */
2078
2079 switch (*ecode)
2080 {
2081 case OP_CRSTAR:
2082 case OP_CRMINSTAR:
2083 case OP_CRPLUS:
2084 case OP_CRMINPLUS:
2085 case OP_CRQUERY:
2086 case OP_CRMINQUERY:
2087 c = *ecode++ - OP_CRSTAR;
2088 minimize = (c & 1) != 0;
2089 min = rep_min[c]; /* Pick up values from tables; */
2090 max = rep_max[c]; /* zero for max => infinity */
2091 if (max == 0) max = INT_MAX;
2092 break;
2093
2094 case OP_CRRANGE:
2095 case OP_CRMINRANGE:
2096 minimize = (*ecode == OP_CRMINRANGE);
2097 min = GET2(ecode, 1);
2098 max = GET2(ecode, 3);
2099 if (max == 0) max = INT_MAX;
2100 ecode += 5;
2101 break;
2102
2103 default: /* No repeat follows */
2104 if (!match_ref(offset, eptr, length, md, ims))
2105 {
2106 CHECK_PARTIAL();
2107 RRETURN(MATCH_NOMATCH);
2108 }
2109 eptr += length;
2110 continue; /* With the main loop */
2111 }
2112
2113 /* If the length of the reference is zero, just continue with the
2114 main loop. */
2115
2116 if (length == 0) continue;
2117
2118 /* First, ensure the minimum number of matches are present. We get back
2119 the length of the reference string explicitly rather than passing the
2120 address of eptr, so that eptr can be a register variable. */
2121
2122 for (i = 1; i <= min; i++)
2123 {
2124 if (!match_ref(offset, eptr, length, md, ims))
2125 {
2126 CHECK_PARTIAL();
2127 RRETURN(MATCH_NOMATCH);
2128 }
2129 eptr += length;
2130 }
2131
2132 /* If min = max, continue at the same level without recursion.
2133 They are not both allowed to be zero. */
2134
2135 if (min == max) continue;
2136
2137 /* If minimizing, keep trying and advancing the pointer */
2138
2139 if (minimize)
2140 {
2141 for (fi = min;; fi++)
2142 {
2143 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2144 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2145 if (fi >= max) RRETURN(MATCH_NOMATCH);
2146 if (!match_ref(offset, eptr, length, md, ims))
2147 {
2148 CHECK_PARTIAL();
2149 RRETURN(MATCH_NOMATCH);
2150 }
2151 eptr += length;
2152 }
2153 /* Control never gets here */
2154 }
2155
2156 /* If maximizing, find the longest string and work backwards */
2157
2158 else
2159 {
2160 pp = eptr;
2161 for (i = min; i < max; i++)
2162 {
2163 if (!match_ref(offset, eptr, length, md, ims))
2164 {
2165 CHECK_PARTIAL();
2166 break;
2167 }
2168 eptr += length;
2169 }
2170 while (eptr >= pp)
2171 {
2172 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2173 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2174 eptr -= length;
2175 }
2176 RRETURN(MATCH_NOMATCH);
2177 }
2178 }
2179 /* Control never gets here */
2180
2181 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2182 used when all the characters in the class have values in the range 0-255,
2183 and either the matching is caseful, or the characters are in the range
2184 0-127 when UTF-8 processing is enabled. The only difference between
2185 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2186 encountered.
2187
2188 First, look past the end of the item to see if there is repeat information
2189 following. Then obey similar code to character type repeats - written out
2190 again for speed. */
2191
2192 case OP_NCLASS:
2193 case OP_CLASS:
2194 {
2195 data = ecode + 1; /* Save for matching */
2196 ecode += 33; /* Advance past the item */
2197
2198 switch (*ecode)
2199 {
2200 case OP_CRSTAR:
2201 case OP_CRMINSTAR:
2202 case OP_CRPLUS:
2203 case OP_CRMINPLUS:
2204 case OP_CRQUERY:
2205 case OP_CRMINQUERY:
2206 c = *ecode++ - OP_CRSTAR;
2207 minimize = (c & 1) != 0;
2208 min = rep_min[c]; /* Pick up values from tables; */
2209 max = rep_max[c]; /* zero for max => infinity */
2210 if (max == 0) max = INT_MAX;
2211 break;
2212
2213 case OP_CRRANGE:
2214 case OP_CRMINRANGE:
2215 minimize = (*ecode == OP_CRMINRANGE);
2216 min = GET2(ecode, 1);
2217 max = GET2(ecode, 3);
2218 if (max == 0) max = INT_MAX;
2219 ecode += 5;
2220 break;
2221
2222 default: /* No repeat follows */
2223 min = max = 1;
2224 break;
2225 }
2226
2227 /* First, ensure the minimum number of matches are present. */
2228
2229 #ifdef SUPPORT_UTF8
2230 /* UTF-8 mode */
2231 if (utf8)
2232 {
2233 for (i = 1; i <= min; i++)
2234 {
2235 if (eptr >= md->end_subject)
2236 {
2237 SCHECK_PARTIAL();
2238 RRETURN(MATCH_NOMATCH);
2239 }
2240 GETCHARINC(c, eptr);
2241 if (c > 255)
2242 {
2243 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2244 }
2245 else
2246 {
2247 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2248 }
2249 }
2250 }
2251 else
2252 #endif
2253 /* Not UTF-8 mode */
2254 {
2255 for (i = 1; i <= min; i++)
2256 {
2257 if (eptr >= md->end_subject)
2258 {
2259 SCHECK_PARTIAL();
2260 RRETURN(MATCH_NOMATCH);
2261 }
2262 c = *eptr++;
2263 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2264 }
2265 }
2266
2267 /* If max == min we can continue with the main loop without the
2268 need to recurse. */
2269
2270 if (min == max) continue;
2271
2272 /* If minimizing, keep testing the rest of the expression and advancing
2273 the pointer while it matches the class. */
2274
2275 if (minimize)
2276 {
2277 #ifdef SUPPORT_UTF8
2278 /* UTF-8 mode */
2279 if (utf8)
2280 {
2281 for (fi = min;; fi++)
2282 {
2283 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2284 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2285 if (fi >= max) RRETURN(MATCH_NOMATCH);
2286 if (eptr >= md->end_subject)
2287 {
2288 SCHECK_PARTIAL();
2289 RRETURN(MATCH_NOMATCH);
2290 }
2291 GETCHARINC(c, eptr);
2292 if (c > 255)
2293 {
2294 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2295 }
2296 else
2297 {
2298 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2299 }
2300 }
2301 }
2302 else
2303 #endif
2304 /* Not UTF-8 mode */
2305 {
2306 for (fi = min;; fi++)
2307 {
2308 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2309 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2310 if (fi >= max) RRETURN(MATCH_NOMATCH);
2311 if (eptr >= md->end_subject)
2312 {
2313 SCHECK_PARTIAL();
2314 RRETURN(MATCH_NOMATCH);
2315 }
2316 c = *eptr++;
2317 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2318 }
2319 }
2320 /* Control never gets here */
2321 }
2322
2323 /* If maximizing, find the longest possible run, then work backwards. */
2324
2325 else
2326 {
2327 pp = eptr;
2328
2329 #ifdef SUPPORT_UTF8
2330 /* UTF-8 mode */
2331 if (utf8)
2332 {
2333 for (i = min; i < max; i++)
2334 {
2335 int len = 1;
2336 if (eptr >= md->end_subject)
2337 {
2338 SCHECK_PARTIAL();
2339 break;
2340 }
2341 GETCHARLEN(c, eptr, len);
2342 if (c > 255)
2343 {
2344 if (op == OP_CLASS) break;
2345 }
2346 else
2347 {
2348 if ((data[c/8] & (1 << (c&7))) == 0) break;
2349 }
2350 eptr += len;
2351 }
2352 for (;;)
2353 {
2354 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2355 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2356 if (eptr-- == pp) break; /* Stop if tried at original pos */
2357 BACKCHAR(eptr);
2358 }
2359 }
2360 else
2361 #endif
2362 /* Not UTF-8 mode */
2363 {
2364 for (i = min; i < max; i++)
2365 {
2366 if (eptr >= md->end_subject)
2367 {
2368 SCHECK_PARTIAL();
2369 break;
2370 }
2371 c = *eptr;
2372 if ((data[c/8] & (1 << (c&7))) == 0) break;
2373 eptr++;
2374 }
2375 while (eptr >= pp)
2376 {
2377 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2378 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2379 eptr--;
2380 }
2381 }
2382
2383 RRETURN(MATCH_NOMATCH);
2384 }
2385 }
2386 /* Control never gets here */
2387
2388
2389 /* Match an extended character class. This opcode is encountered only
2390 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2391 mode, because Unicode properties are supported in non-UTF-8 mode. */
2392
2393 #ifdef SUPPORT_UTF8
2394 case OP_XCLASS:
2395 {
2396 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2397 ecode += GET(ecode, 1); /* Advance past the item */
2398
2399 switch (*ecode)
2400 {
2401 case OP_CRSTAR:
2402 case OP_CRMINSTAR:
2403 case OP_CRPLUS:
2404 case OP_CRMINPLUS:
2405 case OP_CRQUERY:
2406 case OP_CRMINQUERY:
2407 c = *ecode++ - OP_CRSTAR;
2408 minimize = (c & 1) != 0;
2409 min = rep_min[c]; /* Pick up values from tables; */
2410 max = rep_max[c]; /* zero for max => infinity */
2411 if (max == 0) max = INT_MAX;
2412 break;
2413
2414 case OP_CRRANGE:
2415 case OP_CRMINRANGE:
2416 minimize = (*ecode == OP_CRMINRANGE);
2417 min = GET2(ecode, 1);
2418 max = GET2(ecode, 3);
2419 if (max == 0) max = INT_MAX;
2420 ecode += 5;
2421 break;
2422
2423 default: /* No repeat follows */
2424 min = max = 1;
2425 break;
2426 }
2427
2428 /* First, ensure the minimum number of matches are present. */
2429
2430 for (i = 1; i <= min; i++)
2431 {
2432 if (eptr >= md->end_subject)
2433 {
2434 SCHECK_PARTIAL();
2435 RRETURN(MATCH_NOMATCH);
2436 }
2437 GETCHARINCTEST(c, eptr);
2438 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2439 }
2440
2441 /* If max == min we can continue with the main loop without the
2442 need to recurse. */
2443
2444 if (min == max) continue;
2445
2446 /* If minimizing, keep testing the rest of the expression and advancing
2447 the pointer while it matches the class. */
2448
2449 if (minimize)
2450 {
2451 for (fi = min;; fi++)
2452 {
2453 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2454 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2455 if (fi >= max) RRETURN(MATCH_NOMATCH);
2456 if (eptr >= md->end_subject)
2457 {
2458 SCHECK_PARTIAL();
2459 RRETURN(MATCH_NOMATCH);
2460 }
2461 GETCHARINCTEST(c, eptr);
2462 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2463 }
2464 /* Control never gets here */
2465 }
2466
2467 /* If maximizing, find the longest possible run, then work backwards. */
2468
2469 else
2470 {
2471 pp = eptr;
2472 for (i = min; i < max; i++)
2473 {
2474 int len = 1;
2475 if (eptr >= md->end_subject)
2476 {
2477 SCHECK_PARTIAL();
2478 break;
2479 }
2480 GETCHARLENTEST(c, eptr, len);
2481 if (!_pcre_xclass(c, data)) break;
2482 eptr += len;
2483 }
2484 for(;;)
2485 {
2486 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2487 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2488 if (eptr-- == pp) break; /* Stop if tried at original pos */
2489 if (utf8) BACKCHAR(eptr);
2490 }
2491 RRETURN(MATCH_NOMATCH);
2492 }
2493
2494 /* Control never gets here */
2495 }
2496 #endif /* End of XCLASS */
2497
2498 /* Match a single character, casefully */
2499
2500 case OP_CHAR:
2501 #ifdef SUPPORT_UTF8
2502 if (utf8)
2503 {
2504 length = 1;
2505 ecode++;
2506 GETCHARLEN(fc, ecode, length);
2507 if (length > md->end_subject - eptr)
2508 {
2509 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2510 RRETURN(MATCH_NOMATCH);
2511 }
2512 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2513 }
2514 else
2515 #endif
2516
2517 /* Non-UTF-8 mode */
2518 {
2519 if (md->end_subject - eptr < 1)
2520 {
2521 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2522 RRETURN(MATCH_NOMATCH);
2523 }
2524 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2525 ecode += 2;
2526 }
2527 break;
2528
2529 /* Match a single character, caselessly */
2530
2531 case OP_CHARNC:
2532 #ifdef SUPPORT_UTF8
2533 if (utf8)
2534 {
2535 length = 1;
2536 ecode++;
2537 GETCHARLEN(fc, ecode, length);
2538
2539 if (length > md->end_subject - eptr)
2540 {
2541 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2542 RRETURN(MATCH_NOMATCH);
2543 }
2544
2545 /* If the pattern character's value is < 128, we have only one byte, and
2546 can use the fast lookup table. */
2547
2548 if (fc < 128)
2549 {
2550 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2551 }
2552
2553 /* Otherwise we must pick up the subject character */
2554
2555 else
2556 {
2557 unsigned int dc;
2558 GETCHARINC(dc, eptr);
2559 ecode += length;
2560
2561 /* If we have Unicode property support, we can use it to test the other
2562 case of the character, if there is one. */
2563
2564 if (fc != dc)
2565 {
2566 #ifdef SUPPORT_UCP
2567 if (dc != UCD_OTHERCASE(fc))
2568 #endif
2569 RRETURN(MATCH_NOMATCH);
2570 }
2571 }
2572 }
2573 else
2574 #endif /* SUPPORT_UTF8 */
2575
2576 /* Non-UTF-8 mode */
2577 {
2578 if (md->end_subject - eptr < 1)
2579 {
2580 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2581 RRETURN(MATCH_NOMATCH);
2582 }
2583 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2584 ecode += 2;
2585 }
2586 break;
2587
2588 /* Match a single character repeatedly. */
2589
2590 case OP_EXACT:
2591 min = max = GET2(ecode, 1);
2592 ecode += 3;
2593 goto REPEATCHAR;
2594
2595 case OP_POSUPTO:
2596 possessive = TRUE;
2597 /* Fall through */
2598
2599 case OP_UPTO:
2600 case OP_MINUPTO:
2601 min = 0;
2602 max = GET2(ecode, 1);
2603 minimize = *ecode == OP_MINUPTO;
2604 ecode += 3;
2605 goto REPEATCHAR;
2606
2607 case OP_POSSTAR:
2608 possessive = TRUE;
2609 min = 0;
2610 max = INT_MAX;
2611 ecode++;
2612 goto REPEATCHAR;
2613
2614 case OP_POSPLUS:
2615 possessive = TRUE;
2616 min = 1;
2617 max = INT_MAX;
2618 ecode++;
2619 goto REPEATCHAR;
2620
2621 case OP_POSQUERY:
2622 possessive = TRUE;
2623 min = 0;
2624 max = 1;
2625 ecode++;
2626 goto REPEATCHAR;
2627
2628 case OP_STAR:
2629 case OP_MINSTAR:
2630 case OP_PLUS:
2631 case OP_MINPLUS:
2632 case OP_QUERY:
2633 case OP_MINQUERY:
2634 c = *ecode++ - OP_STAR;
2635 minimize = (c & 1) != 0;
2636
2637 min = rep_min[c]; /* Pick up values from tables; */
2638 max = rep_max[c]; /* zero for max => infinity */
2639 if (max == 0) max = INT_MAX;
2640
2641 /* Common code for all repeated single-character matches. */
2642
2643 REPEATCHAR:
2644 #ifdef SUPPORT_UTF8
2645 if (utf8)
2646 {
2647 length = 1;
2648 charptr = ecode;
2649 GETCHARLEN(fc, ecode, length);
2650 ecode += length;
2651
2652 /* Handle multibyte character matching specially here. There is
2653 support for caseless matching if UCP support is present. */
2654
2655 if (length > 1)
2656 {
2657 #ifdef SUPPORT_UCP
2658 unsigned int othercase;
2659 if ((ims & PCRE_CASELESS) != 0 &&
2660 (othercase = UCD_OTHERCASE(fc)) != fc)
2661 oclength = _pcre_ord2utf8(othercase, occhars);
2662 else oclength = 0;
2663 #endif /* SUPPORT_UCP */
2664
2665 for (i = 1; i <= min; i++)
2666 {
2667 if (eptr <= md->end_subject - length &&
2668 memcmp(eptr, charptr, length) == 0) eptr += length;
2669 #ifdef SUPPORT_UCP
2670 else if (oclength > 0 &&
2671 eptr <= md->end_subject - oclength &&
2672 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2673 #endif /* SUPPORT_UCP */
2674 else
2675 {
2676 CHECK_PARTIAL();
2677 RRETURN(MATCH_NOMATCH);
2678 }
2679 }
2680
2681 if (min == max) continue;
2682
2683 if (minimize)
2684 {
2685 for (fi = min;; fi++)
2686 {
2687 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2688 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2689 if (fi >= max) RRETURN(MATCH_NOMATCH);
2690 if (eptr <= md->end_subject - length &&
2691 memcmp(eptr, charptr, length) == 0) eptr += length;
2692 #ifdef SUPPORT_UCP
2693 else if (oclength > 0 &&
2694 eptr <= md->end_subject - oclength &&
2695 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2696 #endif /* SUPPORT_UCP */
2697 else
2698 {
2699 CHECK_PARTIAL();
2700 RRETURN(MATCH_NOMATCH);
2701 }
2702 }
2703 /* Control never gets here */
2704 }
2705
2706 else /* Maximize */
2707 {
2708 pp = eptr;
2709 for (i = min; i < max; i++)
2710 {
2711 if (eptr <= md->end_subject - length &&
2712 memcmp(eptr, charptr, length) == 0) eptr += length;
2713 #ifdef SUPPORT_UCP
2714 else if (oclength > 0 &&
2715 eptr <= md->end_subject - oclength &&
2716 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2717 #endif /* SUPPORT_UCP */
2718 else
2719 {
2720 CHECK_PARTIAL();
2721 break;
2722 }
2723 }
2724
2725 if (possessive) continue;
2726
2727 for(;;)
2728 {
2729 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2730 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2731 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
2732 #ifdef SUPPORT_UCP
2733 eptr--;
2734 BACKCHAR(eptr);
2735 #else /* without SUPPORT_UCP */
2736 eptr -= length;
2737 #endif /* SUPPORT_UCP */
2738 }
2739 }
2740 /* Control never gets here */
2741 }
2742
2743 /* If the length of a UTF-8 character is 1, we fall through here, and
2744 obey the code as for non-UTF-8 characters below, though in this case the
2745 value of fc will always be < 128. */
2746 }
2747 else
2748 #endif /* SUPPORT_UTF8 */
2749
2750 /* When not in UTF-8 mode, load a single-byte character. */
2751
2752 fc = *ecode++;
2753
2754 /* The value of fc at this point is always less than 256, though we may or
2755 may not be in UTF-8 mode. The code is duplicated for the caseless and
2756 caseful cases, for speed, since matching characters is likely to be quite
2757 common. First, ensure the minimum number of matches are present. If min =
2758 max, continue at the same level without recursing. Otherwise, if
2759 minimizing, keep trying the rest of the expression and advancing one
2760 matching character if failing, up to the maximum. Alternatively, if
2761 maximizing, find the maximum number of characters and work backwards. */
2762
2763 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2764 max, eptr));
2765
2766 if ((ims & PCRE_CASELESS) != 0)
2767 {
2768 fc = md->lcc[fc];
2769 for (i = 1; i <= min; i++)
2770 {
2771 if (eptr >= md->end_subject)
2772 {
2773 SCHECK_PARTIAL();
2774 RRETURN(MATCH_NOMATCH);
2775 }
2776 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2777 }
2778 if (min == max) continue;
2779 if (minimize)
2780 {
2781 for (fi = min;; fi++)
2782 {
2783 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2784 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2785 if (fi >= max) RRETURN(MATCH_NOMATCH);
2786 if (eptr >= md->end_subject)
2787 {
2788 SCHECK_PARTIAL();
2789 RRETURN(MATCH_NOMATCH);
2790 }
2791 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2792 }
2793 /* Control never gets here */
2794 }
2795 else /* Maximize */
2796 {
2797 pp = eptr;
2798 for (i = min; i < max; i++)
2799 {
2800 if (eptr >= md->end_subject)
2801 {
2802 SCHECK_PARTIAL();
2803 break;
2804 }
2805 if (fc != md->lcc[*eptr]) break;
2806 eptr++;
2807 }
2808
2809 if (possessive) continue;
2810
2811 while (eptr >= pp)
2812 {
2813 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2814 eptr--;
2815 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2816 }
2817 RRETURN(MATCH_NOMATCH);
2818 }
2819 /* Control never gets here */
2820 }
2821
2822 /* Caseful comparisons (includes all multi-byte characters) */
2823
2824 else
2825 {
2826 for (i = 1; i <= min; i++)
2827 {
2828 if (eptr >= md->end_subject)
2829 {
2830 SCHECK_PARTIAL();
2831 RRETURN(MATCH_NOMATCH);
2832 }
2833 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2834 }
2835
2836 if (min == max) continue;
2837
2838 if (minimize)
2839 {
2840 for (fi = min;; fi++)
2841 {
2842 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2843 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2844 if (fi >= max) RRETURN(MATCH_NOMATCH);
2845 if (eptr >= md->end_subject)
2846 {
2847 SCHECK_PARTIAL();
2848 RRETURN(MATCH_NOMATCH);
2849 }
2850 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2851 }
2852 /* Control never gets here */
2853 }
2854 else /* Maximize */
2855 {
2856 pp = eptr;
2857 for (i = min; i < max; i++)
2858 {
2859 if (eptr >= md->end_subject)
2860 {
2861 SCHECK_PARTIAL();
2862 break;
2863 }
2864 if (fc != *eptr) break;
2865 eptr++;
2866 }
2867 if (possessive) continue;
2868
2869 while (eptr >= pp)
2870 {
2871 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2872 eptr--;
2873 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2874 }
2875 RRETURN(MATCH_NOMATCH);
2876 }
2877 }
2878 /* Control never gets here */
2879
2880 /* Match a negated single one-byte character. The character we are
2881 checking can be multibyte. */
2882
2883 case OP_NOT:
2884 if (eptr >= md->end_subject)
2885 {
2886 SCHECK_PARTIAL();
2887 RRETURN(MATCH_NOMATCH);
2888 }
2889 ecode++;
2890 GETCHARINCTEST(c, eptr);
2891 if ((ims & PCRE_CASELESS) != 0)
2892 {
2893 #ifdef SUPPORT_UTF8
2894 if (c < 256)
2895 #endif
2896 c = md->lcc[c];
2897 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2898 }
2899 else
2900 {
2901 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2902 }
2903 break;
2904
2905 /* Match a negated single one-byte character repeatedly. This is almost a
2906 repeat of the code for a repeated single character, but I haven't found a
2907 nice way of commoning these up that doesn't require a test of the
2908 positive/negative option for each character match. Maybe that wouldn't add
2909 very much to the time taken, but character matching *is* what this is all
2910 about... */
2911
2912 case OP_NOTEXACT:
2913 min = max = GET2(ecode, 1);
2914 ecode += 3;
2915 goto REPEATNOTCHAR;
2916
2917 case OP_NOTUPTO:
2918 case OP_NOTMINUPTO:
2919 min = 0;
2920 max = GET2(ecode, 1);
2921 minimize = *ecode == OP_NOTMINUPTO;
2922 ecode += 3;
2923 goto REPEATNOTCHAR;
2924
2925 case OP_NOTPOSSTAR:
2926 possessive = TRUE;
2927 min = 0;
2928 max = INT_MAX;
2929 ecode++;
2930 goto REPEATNOTCHAR;
2931
2932 case OP_NOTPOSPLUS:
2933 possessive = TRUE;
2934 min = 1;
2935 max = INT_MAX;
2936 ecode++;
2937 goto REPEATNOTCHAR;
2938
2939 case OP_NOTPOSQUERY:
2940 possessive = TRUE;
2941 min = 0;
2942 max = 1;
2943 ecode++;
2944 goto REPEATNOTCHAR;
2945
2946 case OP_NOTPOSUPTO:
2947 possessive = TRUE;
2948 min = 0;
2949 max = GET2(ecode, 1);
2950 ecode += 3;
2951 goto REPEATNOTCHAR;
2952
2953 case OP_NOTSTAR:
2954 case OP_NOTMINSTAR:
2955 case OP_NOTPLUS:
2956 case OP_NOTMINPLUS:
2957 case OP_NOTQUERY:
2958 case OP_NOTMINQUERY:
2959 c = *ecode++ - OP_NOTSTAR;
2960 minimize = (c & 1) != 0;
2961 min = rep_min[c]; /* Pick up values from tables; */
2962 max = rep_max[c]; /* zero for max => infinity */
2963 if (max == 0) max = INT_MAX;
2964
2965 /* Common code for all repeated single-byte matches. */
2966
2967 REPEATNOTCHAR:
2968 fc = *ecode++;
2969
2970 /* The code is duplicated for the caseless and caseful cases, for speed,
2971 since matching characters is likely to be quite common. First, ensure the
2972 minimum number of matches are present. If min = max, continue at the same
2973 level without recursing. Otherwise, if minimizing, keep trying the rest of
2974 the expression and advancing one matching character if failing, up to the
2975 maximum. Alternatively, if maximizing, find the maximum number of
2976 characters and work backwards. */
2977
2978 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2979 max, eptr));
2980
2981 if ((ims & PCRE_CASELESS) != 0)
2982 {
2983 fc = md->lcc[fc];
2984
2985 #ifdef SUPPORT_UTF8
2986 /* UTF-8 mode */
2987 if (utf8)
2988 {
2989 register unsigned int d;
2990 for (i = 1; i <= min; i++)
2991 {
2992 if (eptr >= md->end_subject)
2993 {
2994 SCHECK_PARTIAL();
2995 RRETURN(MATCH_NOMATCH);
2996 }
2997 GETCHARINC(d, eptr);
2998 if (d < 256) d = md->lcc[d];
2999 if (fc == d) RRETURN(MATCH_NOMATCH);
3000 }
3001 }
3002 else
3003 #endif
3004
3005 /* Not UTF-8 mode */
3006 {
3007 for (i = 1; i <= min; i++)
3008 {
3009 if (eptr >= md->end_subject)
3010 {
3011 SCHECK_PARTIAL();
3012 RRETURN(MATCH_NOMATCH);
3013 }
3014 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3015 }
3016 }
3017
3018 if (min == max) continue;
3019
3020 if (minimize)
3021 {
3022 #ifdef SUPPORT_UTF8
3023 /* UTF-8 mode */
3024 if (utf8)
3025 {
3026 register unsigned int d;
3027 for (fi = min;; fi++)
3028 {
3029 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3030 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3031 if (fi >= max) RRETURN(MATCH_NOMATCH);
3032 if (eptr >= md->end_subject)
3033 {
3034 SCHECK_PARTIAL();
3035 RRETURN(MATCH_NOMATCH);
3036 }
3037 GETCHARINC(d, eptr);
3038 if (d < 256) d = md->lcc[d];
3039 if (fc == d) RRETURN(MATCH_NOMATCH);
3040 }
3041 }
3042 else
3043 #endif
3044 /* Not UTF-8 mode */
3045 {
3046 for (fi = min;; fi++)
3047 {
3048 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3049 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3050 if (fi >= max) RRETURN(MATCH_NOMATCH);
3051 if (eptr >= md->end_subject)
3052 {
3053 SCHECK_PARTIAL();
3054 RRETURN(MATCH_NOMATCH);
3055 }
3056 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3057 }
3058 }
3059 /* Control never gets here */
3060 }
3061
3062 /* Maximize case */
3063
3064 else
3065 {
3066 pp = eptr;
3067
3068 #ifdef SUPPORT_UTF8
3069 /* UTF-8 mode */
3070 if (utf8)
3071 {
3072 register unsigned int d;
3073 for (i = min; i < max; i++)
3074 {
3075 int len = 1;
3076 if (eptr >= md->end_subject)
3077 {
3078 SCHECK_PARTIAL();
3079 break;
3080 }
3081 GETCHARLEN(d, eptr, len);
3082 if (d < 256) d = md->lcc[d];
3083 if (fc == d) break;
3084 eptr += len;
3085 }
3086 if (possessive) continue;
3087 for(;;)
3088 {
3089 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3090 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3091 if (eptr-- == pp) break; /* Stop if tried at original pos */
3092 BACKCHAR(eptr);
3093 }
3094 }
3095 else
3096 #endif
3097 /* Not UTF-8 mode */
3098 {
3099 for (i = min; i < max; i++)
3100 {
3101 if (eptr >= md->end_subject)
3102 {
3103 SCHECK_PARTIAL();
3104 break;
3105 }
3106 if (fc == md->lcc[*eptr]) break;
3107 eptr++;
3108 }
3109 if (possessive) continue;
3110 while (eptr >= pp)
3111 {
3112 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3113 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3114 eptr--;
3115 }
3116 }
3117
3118 RRETURN(MATCH_NOMATCH);
3119 }
3120 /* Control never gets here */
3121 }
3122
3123 /* Caseful comparisons */
3124
3125 else
3126 {
3127 #ifdef SUPPORT_UTF8
3128 /* UTF-8 mode */
3129 if (utf8)
3130 {
3131 register unsigned int d;
3132 for (i = 1; i <= min; i++)
3133 {
3134 if (eptr >= md->end_subject)
3135 {
3136 SCHECK_PARTIAL();
3137 RRETURN(MATCH_NOMATCH);
3138 }
3139 GETCHARINC(d, eptr);
3140 if (fc == d) RRETURN(MATCH_NOMATCH);
3141 }
3142 }
3143 else
3144 #endif
3145 /* Not UTF-8 mode */
3146 {
3147 for (i = 1; i <= min; i++)
3148 {
3149 if (eptr >= md->end_subject)
3150 {
3151 SCHECK_PARTIAL();
3152 RRETURN(MATCH_NOMATCH);
3153 }
3154 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3155 }
3156 }
3157
3158 if (min == max) continue;
3159
3160 if (minimize)
3161 {
3162 #ifdef SUPPORT_UTF8
3163 /* UTF-8 mode */
3164 if (utf8)
3165 {
3166 register unsigned int d;
3167 for (fi = min;; fi++)
3168 {
3169 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3170 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3171 if (fi >= max) RRETURN(MATCH_NOMATCH);
3172 if (eptr >= md->end_subject)
3173 {
3174 SCHECK_PARTIAL();
3175 RRETURN(MATCH_NOMATCH);
3176 }
3177 GETCHARINC(d, eptr);
3178 if (fc == d) RRETURN(MATCH_NOMATCH);
3179 }
3180 }
3181 else
3182 #endif
3183 /* Not UTF-8 mode */
3184 {
3185 for (fi = min;; fi++)
3186 {
3187 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3188 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3189 if (fi >= max) RRETURN(MATCH_NOMATCH);
3190 if (eptr >= md->end_subject)
3191 {
3192 SCHECK_PARTIAL();
3193 RRETURN(MATCH_NOMATCH);
3194 }
3195 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3196 }
3197 }
3198 /* Control never gets here */
3199 }
3200
3201 /* Maximize case */
3202
3203 else
3204 {
3205 pp = eptr;
3206
3207 #ifdef SUPPORT_UTF8
3208 /* UTF-8 mode */
3209 if (utf8)
3210 {
3211 register unsigned int d;
3212 for (i = min; i < max; i++)
3213 {
3214 int len = 1;
3215 if (eptr >= md->end_subject)
3216 {
3217 SCHECK_PARTIAL();
3218 break;
3219 }
3220 GETCHARLEN(d, eptr, len);
3221 if (fc == d) break;
3222 eptr += len;
3223 }
3224 if (possessive) continue;
3225 for(;;)
3226 {
3227 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3228 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3229 if (eptr-- == pp) break; /* Stop if tried at original pos */
3230 BACKCHAR(eptr);
3231 }
3232 }
3233 else
3234 #endif
3235 /* Not UTF-8 mode */
3236 {
3237 for (i = min; i < max; i++)
3238 {
3239 if (eptr >= md->end_subject)
3240 {
3241 SCHECK_PARTIAL();
3242 break;
3243 }
3244 if (fc == *eptr) break;
3245 eptr++;
3246 }
3247 if (possessive) continue;
3248 while (eptr >= pp)
3249 {
3250 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3251 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3252 eptr--;
3253 }
3254 }
3255
3256 RRETURN(MATCH_NOMATCH);
3257 }
3258 }
3259 /* Control never gets here */
3260
3261 /* Match a single character type repeatedly; several different opcodes
3262 share code. This is very similar to the code for single characters, but we
3263 repeat it in the interests of efficiency. */
3264
3265 case OP_TYPEEXACT:
3266 min = max = GET2(ecode, 1);
3267 minimize = TRUE;
3268 ecode += 3;
3269 goto REPEATTYPE;
3270
3271 case OP_TYPEUPTO:
3272 case OP_TYPEMINUPTO:
3273 min = 0;
3274 max = GET2(ecode, 1);
3275 minimize = *ecode == OP_TYPEMINUPTO;
3276 ecode += 3;
3277 goto REPEATTYPE;
3278
3279 case OP_TYPEPOSSTAR:
3280 possessive = TRUE;
3281 min = 0;
3282 max = INT_MAX;
3283 ecode++;
3284 goto REPEATTYPE;
3285
3286 case OP_TYPEPOSPLUS:
3287 possessive = TRUE;
3288 min = 1;
3289 max = INT_MAX;
3290 ecode++;
3291 goto REPEATTYPE;
3292
3293 case OP_TYPEPOSQUERY:
3294 possessive = TRUE;
3295 min = 0;
3296 max = 1;
3297 ecode++;
3298 goto REPEATTYPE;
3299
3300 case OP_TYPEPOSUPTO:
3301 possessive = TRUE;
3302 min = 0;
3303 max = GET2(ecode, 1);
3304 ecode += 3;
3305 goto REPEATTYPE;
3306
3307 case OP_TYPESTAR:
3308 case OP_TYPEMINSTAR:
3309 case OP_TYPEPLUS:
3310 case OP_TYPEMINPLUS:
3311 case OP_TYPEQUERY:
3312 case OP_TYPEMINQUERY:
3313 c = *ecode++ - OP_TYPESTAR;
3314 minimize = (c & 1) != 0;
3315 min = rep_min[c]; /* Pick up values from tables; */
3316 max = rep_max[c]; /* zero for max => infinity */
3317 if (max == 0) max = INT_MAX;
3318
3319 /* Common code for all repeated single character type matches. Note that
3320 in UTF-8 mode, '.' matches a character of any length, but for the other
3321 character types, the valid characters are all one-byte long. */
3322
3323 REPEATTYPE:
3324 ctype = *ecode++; /* Code for the character type */
3325
3326 #ifdef SUPPORT_UCP
3327 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3328 {
3329 prop_fail_result = ctype == OP_NOTPROP;
3330 prop_type = *ecode++;
3331 prop_value = *ecode++;
3332 }
3333 else prop_type = -1;
3334 #endif
3335
3336 /* First, ensure the minimum number of matches are present. Use inline
3337 code for maximizing the speed, and do the type test once at the start
3338 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3339 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3340 and single-bytes. */
3341
3342 if (min > 0)
3343 {
3344 #ifdef SUPPORT_UCP
3345 if (prop_type >= 0)
3346 {
3347 switch(prop_type)
3348 {
3349 case PT_ANY:
3350 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3351 for (i = 1; i <= min; i++)
3352 {
3353 if (eptr >= md->end_subject)
3354 {
3355 SCHECK_PARTIAL();
3356 RRETURN(MATCH_NOMATCH);
3357 }
3358 GETCHARINCTEST(c, eptr);
3359 }
3360 break;
3361
3362 case PT_LAMP:
3363 for (i = 1; i <= min; i++)
3364 {
3365 if (eptr >= md->end_subject)
3366 {
3367 SCHECK_PARTIAL();
3368 RRETURN(MATCH_NOMATCH);
3369 }
3370 GETCHARINCTEST(c, eptr);
3371 prop_chartype = UCD_CHARTYPE(c);
3372 if ((prop_chartype == ucp_Lu ||
3373 prop_chartype == ucp_Ll ||
3374 prop_chartype == ucp_Lt) == prop_fail_result)
3375 RRETURN(MATCH_NOMATCH);
3376 }
3377 break;
3378
3379 case PT_GC:
3380 for (i = 1; i <= min; i++)
3381 {
3382 if (eptr >= md->end_subject)
3383 {
3384 SCHECK_PARTIAL();
3385 RRETURN(MATCH_NOMATCH);
3386 }
3387 GETCHARINCTEST(c, eptr);
3388 prop_category = UCD_CATEGORY(c);
3389 if ((prop_category == prop_value) == prop_fail_result)
3390 RRETURN(MATCH_NOMATCH);
3391 }
3392 break;
3393
3394 case PT_PC:
3395 for (i = 1; i <= min; i++)
3396 {
3397 if (eptr >= md->end_subject)
3398 {
3399 SCHECK_PARTIAL();
3400 RRETURN(MATCH_NOMATCH);
3401 }
3402 GETCHARINCTEST(c, eptr);
3403 prop_chartype = UCD_CHARTYPE(c);
3404 if ((prop_chartype == prop_value) == prop_fail_result)
3405 RRETURN(MATCH_NOMATCH);
3406 }
3407 break;
3408
3409 case PT_SC:
3410 for (i = 1; i <= min; i++)
3411 {
3412 if (eptr >= md->end_subject)
3413 {
3414 SCHECK_PARTIAL();
3415 RRETURN(MATCH_NOMATCH);
3416 }
3417 GETCHARINCTEST(c, eptr);
3418 prop_script = UCD_SCRIPT(c);
3419 if ((prop_script == prop_value) == prop_fail_result)
3420 RRETURN(MATCH_NOMATCH);
3421 }
3422 break;
3423
3424 default:
3425 RRETURN(PCRE_ERROR_INTERNAL);
3426 }
3427 }
3428
3429 /* Match extended Unicode sequences. We will get here only if the
3430 support is in the binary; otherwise a compile-time error occurs. */
3431
3432 else if (ctype == OP_EXTUNI)
3433 {
3434 for (i = 1; i <= min; i++)
3435 {
3436 if (eptr >= md->end_subject)
3437 {
3438 SCHECK_PARTIAL();
3439 RRETURN(MATCH_NOMATCH);
3440 }
3441 GETCHARINCTEST(c, eptr);
3442 prop_category = UCD_CATEGORY(c);
3443 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3444 while (eptr < md->end_subject)
3445 {
3446 int len = 1;
3447 if (!utf8) c = *eptr;
3448 else { GETCHARLEN(c, eptr, len); }
3449 prop_category = UCD_CATEGORY(c);
3450 if (prop_category != ucp_M) break;
3451 eptr += len;
3452 }
3453 }
3454 }
3455
3456 else
3457 #endif /* SUPPORT_UCP */
3458
3459 /* Handle all other cases when the coding is UTF-8 */
3460
3461 #ifdef SUPPORT_UTF8
3462 if (utf8) switch(ctype)
3463 {
3464 case OP_ANY:
3465 for (i = 1; i <= min; i++)
3466 {
3467 if (eptr >= md->end_subject)
3468 {
3469 SCHECK_PARTIAL();
3470 RRETURN(MATCH_NOMATCH);
3471 }
3472 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3473 eptr++;
3474 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3475 }
3476 break;
3477
3478 case OP_ALLANY:
3479 for (i = 1; i <= min; i++)
3480 {
3481 if (eptr >= md->end_subject)
3482 {
3483 SCHECK_PARTIAL();
3484 RRETURN(MATCH_NOMATCH);
3485 }
3486 eptr++;
3487 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3488 }
3489 break;
3490
3491 case OP_ANYBYTE:
3492 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3493 eptr += min;
3494 break;
3495
3496 case OP_ANYNL:
3497 for (i = 1; i <= min; i++)
3498 {
3499 if (eptr >= md->end_subject)
3500 {
3501 SCHECK_PARTIAL();
3502 RRETURN(MATCH_NOMATCH);
3503 }
3504 GETCHARINC(c, eptr);
3505 switch(c)
3506 {
3507 default: RRETURN(MATCH_NOMATCH);
3508 case 0x000d:
3509 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3510 break;
3511
3512 case 0x000a:
3513 break;
3514
3515 case 0x000b:
3516 case 0x000c:
3517 case 0x0085:
3518 case 0x2028:
3519 case 0x2029:
3520 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3521 break;
3522 }
3523 }
3524 break;
3525
3526 case OP_NOT_HSPACE:
3527 for (i = 1; i <= min; i++)
3528 {
3529 if (eptr >= md->end_subject)
3530 {
3531 SCHECK_PARTIAL();
3532 RRETURN(MATCH_NOMATCH);
3533 }
3534 GETCHARINC(c, eptr);
3535 switch(c)
3536 {
3537 default: break;
3538 case 0x09: /* HT */
3539 case 0x20: /* SPACE */
3540 case 0xa0: /* NBSP */
3541 case 0x1680: /* OGHAM SPACE MARK */
3542 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3543 case 0x2000: /* EN QUAD */
3544 case 0x2001: /* EM QUAD */
3545 case 0x2002: /* EN SPACE */
3546 case 0x2003: /* EM SPACE */
3547 case 0x2004: /* THREE-PER-EM SPACE */
3548 case 0x2005: /* FOUR-PER-EM SPACE */
3549 case 0x2006: /* SIX-PER-EM SPACE */
3550 case 0x2007: /* FIGURE SPACE */
3551 case 0x2008: /* PUNCTUATION SPACE */
3552 case 0x2009: /* THIN SPACE */
3553 case 0x200A: /* HAIR SPACE */
3554 case 0x202f: /* NARROW NO-BREAK SPACE */
3555 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3556 case 0x3000: /* IDEOGRAPHIC SPACE */
3557 RRETURN(MATCH_NOMATCH);
3558 }
3559 }
3560 break;
3561
3562 case OP_HSPACE:
3563 for (i = 1; i <= min; i++)
3564 {
3565 if (eptr >= md->end_subject)
3566 {
3567 SCHECK_PARTIAL();
3568 RRETURN(MATCH_NOMATCH);
3569 }
3570 GETCHARINC(c, eptr);
3571 switch(c)
3572 {
3573 default: RRETURN(MATCH_NOMATCH);
3574 case 0x09: /* HT */
3575 case 0x20: /* SPACE */
3576 case 0xa0: /* NBSP */
3577 case 0x1680: /* OGHAM SPACE MARK */
3578 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3579 case 0x2000: /* EN QUAD */
3580 case 0x2001: /* EM QUAD */
3581 case 0x2002: /* EN SPACE */
3582 case 0x2003: /* EM SPACE */
3583 case 0x2004: /* THREE-PER-EM SPACE */
3584 case 0x2005: /* FOUR-PER-EM SPACE */
3585 case 0x2006: /* SIX-PER-EM SPACE */
3586 case 0x2007: /* FIGURE SPACE */
3587 case 0x2008: /* PUNCTUATION SPACE */
3588 case 0x2009: /* THIN SPACE */
3589 case 0x200A: /* HAIR SPACE */
3590 case 0x202f: /* NARROW NO-BREAK SPACE */
3591 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3592 case 0x3000: /* IDEOGRAPHIC SPACE */
3593 break;
3594 }
3595 }
3596 break;
3597
3598 case OP_NOT_VSPACE:
3599 for (i = 1; i <= min; i++)
3600 {
3601 if (eptr >= md->end_subject)
3602 {
3603 SCHECK_PARTIAL();
3604 RRETURN(MATCH_NOMATCH);
3605 }
3606 GETCHARINC(c, eptr);
3607 switch(c)
3608 {
3609 default: break;
3610 case 0x0a: /* LF */
3611 case 0x0b: /* VT */
3612 case 0x0c: /* FF */
3613 case 0x0d: /* CR */
3614 case 0x85: /* NEL */
3615 case 0x2028: /* LINE SEPARATOR */
3616 case 0x2029: /* PARAGRAPH SEPARATOR */
3617 RRETURN(MATCH_NOMATCH);
3618 }
3619 }
3620 break;
3621
3622 case OP_VSPACE:
3623 for (i = 1; i <= min; i++)
3624 {
3625 if (eptr >= md->end_subject)
3626 {
3627 SCHECK_PARTIAL();
3628 RRETURN(MATCH_NOMATCH);
3629 }
3630 GETCHARINC(c, eptr);
3631 switch(c)
3632 {
3633 default: RRETURN(MATCH_NOMATCH);
3634 case 0x0a: /* LF */
3635 case 0x0b: /* VT */
3636 case 0x0c: /* FF */
3637 case 0x0d: /* CR */
3638 case 0x85: /* NEL */
3639 case 0x2028: /* LINE SEPARATOR */
3640 case 0x2029: /* PARAGRAPH SEPARATOR */
3641 break;
3642 }
3643 }
3644 break;
3645
3646 case OP_NOT_DIGIT:
3647 for (i = 1; i <= min; i++)
3648 {
3649 if (eptr >= md->end_subject)
3650 {
3651 SCHECK_PARTIAL();
3652 RRETURN(MATCH_NOMATCH);
3653 }
3654 GETCHARINC(c, eptr);
3655 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3656 RRETURN(MATCH_NOMATCH);
3657 }
3658 break;
3659
3660 case OP_DIGIT:
3661 for (i = 1; i <= min; i++)
3662 {
3663 if (eptr >= md->end_subject)
3664 {
3665 SCHECK_PARTIAL();
3666 RRETURN(MATCH_NOMATCH);
3667 }
3668 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3669 RRETURN(MATCH_NOMATCH);
3670 /* No need to skip more bytes - we know it's a 1-byte character */
3671 }
3672 break;
3673
3674 case OP_NOT_WHITESPACE:
3675 for (i = 1; i <= min; i++)
3676 {
3677 if (eptr >= md->end_subject)
3678 {
3679 SCHECK_PARTIAL();
3680 RRETURN(MATCH_NOMATCH);
3681 }
3682 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3683 RRETURN(MATCH_NOMATCH);
3684 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3685 }
3686 break;
3687
3688 case OP_WHITESPACE:
3689 for (i = 1; i <= min; i++)
3690 {
3691 if (eptr >= md->end_subject)
3692 {
3693 SCHECK_PARTIAL();
3694 RRETURN(MATCH_NOMATCH);
3695 }
3696 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3697 RRETURN(MATCH_NOMATCH);
3698 /* No need to skip more bytes - we know it's a 1-byte character */
3699 }
3700 break;
3701
3702 case OP_NOT_WORDCHAR:
3703 for (i = 1; i <= min; i++)
3704 {
3705 if (eptr >= md->end_subject)
3706 {
3707 SCHECK_PARTIAL();
3708 RRETURN(MATCH_NOMATCH);
3709 }
3710 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3711 RRETURN(MATCH_NOMATCH);
3712 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3713 }
3714 break;
3715
3716 case OP_WORDCHAR:
3717 for (i = 1; i <= min; i++)
3718 {
3719 if (eptr >= md->end_subject)
3720 {
3721 SCHECK_PARTIAL();
3722 RRETURN(MATCH_NOMATCH);
3723 }
3724 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3725 RRETURN(MATCH_NOMATCH);
3726 /* No need to skip more bytes - we know it's a 1-byte character */
3727 }
3728 break;
3729
3730 default:
3731 RRETURN(PCRE_ERROR_INTERNAL);
3732 } /* End switch(ctype) */
3733
3734 else
3735 #endif /* SUPPORT_UTF8 */
3736
3737 /* Code for the non-UTF-8 case for minimum matching of operators other
3738 than OP_PROP and OP_NOTPROP. */
3739
3740 switch(ctype)
3741 {
3742 case OP_ANY:
3743 for (i = 1; i <= min; i++)
3744 {
3745 if (eptr >= md->end_subject)
3746 {
3747 SCHECK_PARTIAL();
3748 RRETURN(MATCH_NOMATCH);
3749 }
3750 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3751 eptr++;
3752 }
3753 break;
3754
3755 case OP_ALLANY:
3756 if (eptr > md->end_subject - min)
3757 {
3758 SCHECK_PARTIAL();
3759 RRETURN(MATCH_NOMATCH);
3760 }
3761 eptr += min;
3762 break;
3763
3764 case OP_ANYBYTE:
3765 if (eptr > md->end_subject - min)
3766 {
3767 SCHECK_PARTIAL();
3768 RRETURN(MATCH_NOMATCH);
3769 }
3770 eptr += min;
3771 break;
3772
3773 case OP_ANYNL:
3774 for (i = 1; i <= min; i++)
3775 {
3776 if (eptr >= md->end_subject)
3777 {
3778 SCHECK_PARTIAL();
3779 RRETURN(MATCH_NOMATCH);
3780 }
3781 switch(*eptr++)
3782 {
3783 default: RRETURN(MATCH_NOMATCH);
3784 case 0x000d:
3785 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3786 break;
3787 case 0x000a:
3788 break;
3789
3790 case 0x000b:
3791 case 0x000c:
3792 case 0x0085:
3793 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3794 break;
3795 }
3796 }
3797 break;
3798
3799 case OP_NOT_HSPACE:
3800 for (i = 1; i <= min; i++)
3801 {
3802 if (eptr >= md->end_subject)
3803 {
3804 SCHECK_PARTIAL();
3805 RRETURN(MATCH_NOMATCH);
3806 }
3807 switch(*eptr++)
3808 {
3809 default: break;
3810 case 0x09: /* HT */
3811 case 0x20: /* SPACE */
3812 case 0xa0: /* NBSP */
3813 RRETURN(MATCH_NOMATCH);
3814 }
3815 }
3816 break;
3817
3818 case OP_HSPACE:
3819 for (i = 1; i <= min; i++)
3820 {
3821 if (eptr >= md->end_subject)
3822 {
3823 SCHECK_PARTIAL();
3824 RRETURN(MATCH_NOMATCH);
3825 }
3826 switch(*eptr++)
3827 {
3828 default: RRETURN(MATCH_NOMATCH);
3829 case 0x09: /* HT */
3830 case 0x20: /* SPACE */
3831 case 0xa0: /* NBSP */
3832 break;
3833 }
3834 }
3835 break;
3836
3837 case OP_NOT_VSPACE:
3838 for (i = 1; i <= min; i++)
3839 {
3840 if (eptr >= md->end_subject)
3841 {
3842 SCHECK_PARTIAL();
3843 RRETURN(MATCH_NOMATCH);
3844 }
3845 switch(*eptr++)
3846 {
3847 default: break;
3848 case 0x0a: /* LF */
3849 case 0x0b: /* VT */
3850 case 0x0c: /* FF */
3851 case 0x0d: /* CR */
3852 case 0x85: /* NEL */
3853 RRETURN(MATCH_NOMATCH);
3854 }
3855 }
3856 break;
3857
3858 case OP_VSPACE:
3859 for (i = 1; i <= min; i++)
3860 {
3861 if (eptr >= md->end_subject)
3862 {
3863 SCHECK_PARTIAL();
3864 RRETURN(MATCH_NOMATCH);
3865 }
3866 switch(*eptr++)
3867 {
3868 default: RRETURN(MATCH_NOMATCH);
3869 case 0x0a: /* LF */
3870 case 0x0b: /* VT */
3871 case 0x0c: /* FF */
3872 case 0x0d: /* CR */
3873 case 0x85: /* NEL */
3874 break;
3875 }
3876 }
3877 break;
3878
3879 case OP_NOT_DIGIT:
3880 for (i = 1; i <= min; i++)
3881 {
3882 if (eptr >= md->end_subject)
3883 {
3884 SCHECK_PARTIAL();
3885 RRETURN(MATCH_NOMATCH);
3886 }
3887 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3888 }
3889 break;
3890
3891 case OP_DIGIT:
3892 for (i = 1; i <= min; i++)
3893 {
3894 if (eptr >= md->end_subject)
3895 {
3896 SCHECK_PARTIAL();
3897 RRETURN(MATCH_NOMATCH);
3898 }
3899 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3900 }
3901 break;
3902
3903 case OP_NOT_WHITESPACE:
3904 for (i = 1; i <= min; i++)
3905 {
3906 if (eptr >= md->end_subject)
3907 {
3908 SCHECK_PARTIAL();
3909 RRETURN(MATCH_NOMATCH);
3910 }
3911 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3912 }
3913 break;
3914
3915 case OP_WHITESPACE:
3916 for (i = 1; i <= min; i++)
3917 {
3918 if (eptr >= md->end_subject)
3919 {
3920 SCHECK_PARTIAL();
3921 RRETURN(MATCH_NOMATCH);
3922 }
3923 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3924 }
3925 break;
3926
3927 case OP_NOT_WORDCHAR:
3928 for (i = 1; i <= min; i++)
3929 {
3930 if (eptr >= md->end_subject)
3931 {
3932 SCHECK_PARTIAL();
3933 RRETURN(MATCH_NOMATCH);
3934 }
3935 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3936 RRETURN(MATCH_NOMATCH);
3937 }
3938 break;
3939
3940 case OP_WORDCHAR:
3941 for (i = 1; i <= min; i++)
3942 {
3943 if (eptr >= md->end_subject)
3944 {
3945 SCHECK_PARTIAL();
3946 RRETURN(MATCH_NOMATCH);
3947 }
3948 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3949 RRETURN(MATCH_NOMATCH);
3950 }
3951 break;
3952
3953 default:
3954 RRETURN(PCRE_ERROR_INTERNAL);
3955 }
3956 }
3957
3958 /* If min = max, continue at the same level without recursing */
3959
3960 if (min == max) continue;
3961
3962 /* If minimizing, we have to test the rest of the pattern before each
3963 subsequent match. Again, separate the UTF-8 case for speed, and also
3964 separate the UCP cases. */
3965
3966 if (minimize)
3967 {
3968 #ifdef SUPPORT_UCP
3969 if (prop_type >= 0)
3970 {
3971 switch(prop_type)
3972 {
3973 case PT_ANY:
3974 for (fi = min;; fi++)
3975 {
3976 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3977 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3978 if (fi >= max) RRETURN(MATCH_NOMATCH);
3979 if (eptr >= md->end_subject)
3980 {
3981 SCHECK_PARTIAL();
3982 RRETURN(MATCH_NOMATCH);
3983 }
3984 GETCHARINC(c, eptr);
3985 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3986 }
3987 /* Control never gets here */
3988
3989 case PT_LAMP:
3990 for (fi = min;; fi++)
3991 {
3992 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3993 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3994 if (fi >= max) RRETURN(MATCH_NOMATCH);
3995 if (eptr >= md->end_subject)
3996 {
3997 SCHECK_PARTIAL();
3998 RRETURN(MATCH_NOMATCH);
3999 }
4000 GETCHARINC(c, eptr);
4001 prop_chartype = UCD_CHARTYPE(c);
4002 if ((prop_chartype == ucp_Lu ||
4003 prop_chartype == ucp_Ll ||
4004 prop_chartype == ucp_Lt) == prop_fail_result)
4005 RRETURN(MATCH_NOMATCH);
4006 }
4007 /* Control never gets here */
4008
4009 case PT_GC:
4010 for (fi = min;; fi++)
4011 {
4012 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4013 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4014 if (fi >= max) RRETURN(MATCH_NOMATCH);
4015 if (eptr >= md->end_subject)
4016 {
4017 SCHECK_PARTIAL();
4018 RRETURN(MATCH_NOMATCH);
4019 }
4020 GETCHARINC(c, eptr);
4021 prop_category = UCD_CATEGORY(c);
4022 if ((prop_category == prop_value) == prop_fail_result)
4023 RRETURN(MATCH_NOMATCH);
4024 }
4025 /* Control never gets here */
4026
4027 case PT_PC:
4028 for (fi = min;; fi++)
4029 {
4030 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4031 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4032 if (fi >= max) RRETURN(MATCH_NOMATCH);
4033 if (eptr >= md->end_subject)
4034 {
4035 SCHECK_PARTIAL();
4036 RRETURN(MATCH_NOMATCH);
4037 }
4038 GETCHARINC(c, eptr);
4039 prop_chartype = UCD_CHARTYPE(c);
4040 if ((prop_chartype == prop_value) == prop_fail_result)
4041 RRETURN(MATCH_NOMATCH);
4042 }
4043 /* Control never gets here */
4044
4045 case PT_SC:
4046 for (fi = min;; fi++)
4047 {
4048 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4049 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4050 if (fi >= max) RRETURN(MATCH_NOMATCH);
4051 if (eptr >= md->end_subject)
4052 {
4053 SCHECK_PARTIAL();
4054 RRETURN(MATCH_NOMATCH);
4055 }
4056 GETCHARINC(c, eptr);
4057 prop_script = UCD_SCRIPT(c);
4058 if ((prop_script == prop_value) == prop_fail_result)
4059 RRETURN(MATCH_NOMATCH);
4060 }
4061 /* Control never gets here */
4062
4063 default:
4064 RRETURN(PCRE_ERROR_INTERNAL);
4065 }
4066 }
4067
4068 /* Match extended Unicode sequences. We will get here only if the
4069 support is in the binary; otherwise a compile-time error occurs. */
4070
4071 else if (ctype == OP_EXTUNI)
4072 {
4073 for (fi = min;; fi++)
4074 {
4075 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4076 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4077 if (fi >= max) RRETURN(MATCH_NOMATCH);
4078 if (eptr >= md->end_subject)
4079 {
4080 SCHECK_PARTIAL();
4081 RRETURN(MATCH_NOMATCH);
4082 }
4083 GETCHARINCTEST(c, eptr);
4084 prop_category = UCD_CATEGORY(c);
4085 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
4086 while (eptr < md->end_subject)
4087 {
4088 int len = 1;
4089 if (!utf8) c = *eptr;
4090 else { GETCHARLEN(c, eptr, len); }
4091 prop_category = UCD_CATEGORY(c);
4092 if (prop_category != ucp_M) break;
4093 eptr += len;
4094 }
4095 }
4096 }
4097
4098 else
4099 #endif /* SUPPORT_UCP */
4100
4101 #ifdef SUPPORT_UTF8
4102 /* UTF-8 mode */
4103 if (utf8)
4104 {
4105 for (fi = min;; fi++)
4106 {
4107 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4108 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4109 if (fi >= max) RRETURN(MATCH_NOMATCH);
4110 if (eptr >= md->end_subject)
4111 {
4112 SCHECK_PARTIAL();
4113 RRETURN(MATCH_NOMATCH);
4114 }
4115 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4116 RRETURN(MATCH_NOMATCH);
4117 GETCHARINC(c, eptr);
4118 switch(ctype)
4119 {
4120 case OP_ANY: /* This is the non-NL case */
4121 case OP_ALLANY:
4122 case OP_ANYBYTE:
4123 break;
4124
4125 case OP_ANYNL:
4126 switch(c)
4127 {
4128 default: RRETURN(MATCH_NOMATCH);
4129 case 0x000d:
4130 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4131 break;
4132 case 0x000a:
4133 break;
4134
4135 case 0x000b:
4136 case 0x000c:
4137 case 0x0085:
4138 case 0x2028:
4139 case 0x2029:
4140 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4141 break;
4142 }
4143 break;
4144
4145 case OP_NOT_HSPACE:
4146 switch(c)
4147 {
4148 default: break;
4149 case 0x09: /* HT */
4150 case 0x20: /* SPACE */
4151 case 0xa0: /* NBSP */
4152 case 0x1680: /* OGHAM SPACE MARK */
4153 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4154 case 0x2000: /* EN QUAD */
4155 case 0x2001: /* EM QUAD */
4156 case 0x2002: /* EN SPACE */
4157 case 0x2003: /* EM SPACE */
4158 case 0x2004: /* THREE-PER-EM SPACE */
4159 case 0x2005: /* FOUR-PER-EM SPACE */
4160 case 0x2006: /* SIX-PER-EM SPACE */
4161 case 0x2007: /* FIGURE SPACE */
4162 case 0x2008: /* PUNCTUATION SPACE */
4163 case 0x2009: /* THIN SPACE */
4164 case 0x200A: /* HAIR SPACE */
4165 case 0x202f: /* NARROW NO-BREAK SPACE */
4166 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4167 case 0x3000: /* IDEOGRAPHIC SPACE */
4168 RRETURN(MATCH_NOMATCH);
4169 }
4170 break;
4171
4172 case OP_HSPACE:
4173 switch(c)
4174 {
4175 default: RRETURN(MATCH_NOMATCH);
4176 case 0x09: /* HT */
4177 case 0x20: /* SPACE */
4178 case 0xa0: /* NBSP */
4179 case 0x1680: /* OGHAM SPACE MARK */
4180 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4181 case 0x2000: /* EN QUAD */
4182 case 0x2001: /* EM QUAD */
4183 case 0x2002: /* EN SPACE */
4184 case 0x2003: /* EM SPACE */
4185 case 0x2004: /* THREE-PER-EM SPACE */
4186 case 0x2005: /* FOUR-PER-EM SPACE */
4187 case 0x2006: /* SIX-PER-EM SPACE */
4188 case 0x2007: /* FIGURE SPACE */
4189 case 0x2008: /* PUNCTUATION SPACE */
4190 case 0x2009: /* THIN SPACE */
4191 case 0x200A: /* HAIR SPACE */
4192 case 0x202f: /* NARROW NO-BREAK SPACE */
4193 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4194 case 0x3000: /* IDEOGRAPHIC SPACE */
4195 break;
4196 }
4197 break;
4198
4199 case OP_NOT_VSPACE:
4200 switch(c)
4201 {
4202 default: break;
4203 case 0x0a: /* LF */
4204 case 0x0b: /* VT */
4205 case 0x0c: /* FF */
4206 case 0x0d: /* CR */
4207 case 0x85: /* NEL */
4208 case 0x2028: /* LINE SEPARATOR */
4209 case 0x2029: /* PARAGRAPH SEPARATOR */
4210 RRETURN(MATCH_NOMATCH);
4211 }
4212 break;
4213
4214 case OP_VSPACE:
4215 switch(c)
4216 {
4217 default: RRETURN(MATCH_NOMATCH);
4218 case 0x0a: /* LF */
4219 case 0x0b: /* VT */
4220 case 0x0c: /* FF */
4221 case 0x0d: /* CR */
4222 case 0x85: /* NEL */
4223 case 0x2028: /* LINE SEPARATOR */
4224 case 0x2029: /* PARAGRAPH SEPARATOR */
4225 break;
4226 }
4227 break;
4228
4229 case OP_NOT_DIGIT:
4230 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4231 RRETURN(MATCH_NOMATCH);
4232 break;
4233
4234 case OP_DIGIT:
4235 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4236 RRETURN(MATCH_NOMATCH);
4237 break;
4238
4239 case OP_NOT_WHITESPACE:
4240 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4241 RRETURN(MATCH_NOMATCH);
4242 break;
4243
4244 case OP_WHITESPACE:
4245 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4246 RRETURN(MATCH_NOMATCH);
4247 break;
4248
4249 case OP_NOT_WORDCHAR:
4250 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4251 RRETURN(MATCH_NOMATCH);
4252 break;
4253
4254 case OP_WORDCHAR:
4255 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4256 RRETURN(MATCH_NOMATCH);
4257 break;
4258
4259 default:
4260 RRETURN(PCRE_ERROR_INTERNAL);
4261 }
4262 }
4263 }
4264 else
4265 #endif
4266 /* Not UTF-8 mode */
4267 {
4268 for (fi = min;; fi++)
4269 {
4270 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4271 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4272 if (fi >= max) RRETURN(MATCH_NOMATCH);
4273 if (eptr >= md->end_subject)
4274 {
4275 SCHECK_PARTIAL();
4276 RRETURN(MATCH_NOMATCH);
4277 }
4278 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4279 RRETURN(MATCH_NOMATCH);
4280 c = *eptr++;
4281 switch(ctype)
4282 {
4283 case OP_ANY: /* This is the non-NL case */
4284 case OP_ALLANY:
4285 case OP_ANYBYTE:
4286 break;
4287
4288 case OP_ANYNL:
4289 switch(c)
4290 {
4291 default: RRETURN(MATCH_NOMATCH);
4292 case 0x000d:
4293 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4294 break;
4295
4296 case 0x000a:
4297 break;
4298
4299 case 0x000b:
4300 case 0x000c:
4301 case 0x0085:
4302 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4303 break;
4304 }
4305 break;
4306
4307 case OP_NOT_HSPACE:
4308 switch(c)
4309 {
4310 default: break;
4311 case 0x09: /* HT */
4312 case 0x20: /* SPACE */
4313 case 0xa0: /* NBSP */
4314 RRETURN(MATCH_NOMATCH);
4315 }
4316 break;
4317
4318 case OP_HSPACE:
4319 switch(c)
4320 {
4321 default: RRETURN(MATCH_NOMATCH);
4322 case 0x09: /* HT */
4323 case 0x20: /* SPACE */
4324 case 0xa0: /* NBSP */
4325 break;
4326 }
4327 break;
4328
4329 case OP_NOT_VSPACE:
4330 switch(c)
4331 {
4332 default: break;
4333 case 0x0a: /* LF */
4334 case 0x0b: /* VT */
4335 case 0x0c: /* FF */
4336 case 0x0d: /* CR */
4337 case 0x85: /* NEL */
4338 RRETURN(MATCH_NOMATCH);
4339 }
4340 break;
4341
4342 case OP_VSPACE:
4343 switch(c)
4344 {
4345 default: RRETURN(MATCH_NOMATCH);
4346 case 0x0a: /* LF */
4347 case 0x0b: /* VT */
4348 case 0x0c: /* FF */
4349 case 0x0d: /* CR */
4350 case 0x85: /* NEL */
4351 break;
4352 }
4353 break;
4354
4355 case OP_NOT_DIGIT:
4356 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
4357 break;
4358
4359 case OP_DIGIT:
4360 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
4361 break;
4362
4363 case OP_NOT_WHITESPACE:
4364 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
4365 break;
4366
4367 case OP_WHITESPACE:
4368 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
4369 break;
4370
4371 case OP_NOT_WORDCHAR:
4372 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
4373 break;
4374
4375 case OP_WORDCHAR:
4376 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
4377 break;
4378
4379 default:
4380 RRETURN(PCRE_ERROR_INTERNAL);
4381 }
4382 }
4383 }
4384 /* Control never gets here */
4385 }
4386
4387 /* If maximizing, it is worth using inline code for speed, doing the type
4388 test once at the start (i.e. keep it out of the loop). Again, keep the
4389 UTF-8 and UCP stuff separate. */
4390
4391 else
4392 {
4393 pp = eptr; /* Remember where we started */
4394
4395 #ifdef SUPPORT_UCP
4396 if (prop_type >= 0)
4397 {
4398 switch(prop_type)
4399 {
4400 case PT_ANY:
4401 for (i = min; i < max; i++)
4402 {
4403 int len = 1;
4404 if (eptr >= md->end_subject)
4405 {
4406 SCHECK_PARTIAL();
4407 break;
4408 }
4409 GETCHARLEN(c, eptr, len);
4410 if (prop_fail_result) break;
4411 eptr+= len;
4412 }
4413 break;
4414
4415 case PT_LAMP:
4416 for (i = min; i < max; i++)
4417 {
4418 int len = 1;
4419 if (eptr >= md->end_subject)
4420 {
4421 SCHECK_PARTIAL();
4422 break;
4423 }
4424 GETCHARLEN(c, eptr, len);
4425 prop_chartype = UCD_CHARTYPE(c);
4426 if ((prop_chartype == ucp_Lu ||
4427 prop_chartype == ucp_Ll ||
4428 prop_chartype == ucp_Lt) == prop_fail_result)
4429 break;
4430 eptr+= len;
4431 }
4432 break;
4433
4434 case PT_GC:
4435 for (i = min; i < max; i++)
4436 {
4437 int len = 1;
4438 if (eptr >= md->end_subject)
4439 {
4440 SCHECK_PARTIAL();
4441 break;
4442 }
4443 GETCHARLEN(c, eptr, len);
4444 prop_category = UCD_CATEGORY(c);
4445 if ((prop_category == prop_value) == prop_fail_result)
4446 break;
4447 eptr+= len;
4448 }
4449 break;
4450
4451 case PT_PC:
4452 for (i = min; i < max; i++)
4453 {
4454 int len = 1;
4455 if (eptr >= md->end_subject)
4456 {
4457 SCHECK_PARTIAL();
4458 break;
4459 }
4460 GETCHARLEN(c, eptr, len);
4461 prop_chartype = UCD_CHARTYPE(c);
4462 if ((prop_chartype == prop_value) == prop_fail_result)
4463 break;
4464 eptr+= len;
4465 }
4466 break;
4467
4468 case PT_SC:
4469 for (i = min; i < max; i++)
4470 {
4471 int len = 1;
4472 if (eptr >= md->end_subject)
4473 {
4474 SCHECK_PARTIAL();
4475 break;
4476 }
4477 GETCHARLEN(c, eptr, len);
4478 prop_script = UCD_SCRIPT(c);
4479 if ((prop_script == prop_value) == prop_fail_result)
4480 break;
4481 eptr+= len;
4482 }
4483 break;
4484 }
4485
4486 /* eptr is now past the end of the maximum run */
4487
4488 if (possessive) continue;
4489 for(;;)
4490 {
4491 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4492 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4493 if (eptr-- == pp) break; /* Stop if tried at original pos */
4494 if (utf8) BACKCHAR(eptr);
4495 }
4496 }
4497
4498 /* Match extended Unicode sequences. We will get here only if the
4499 support is in the binary; otherwise a compile-time error occurs. */
4500
4501 else if (ctype == OP_EXTUNI)
4502 {
4503 for (i = min; i < max; i++)
4504 {
4505 if (eptr >= md->end_subject)
4506 {
4507 SCHECK_PARTIAL();
4508 break;
4509 }
4510 GETCHARINCTEST(c, eptr);
4511 prop_category = UCD_CATEGORY(c);
4512 if (prop_category == ucp_M) break;
4513 while (eptr < md->end_subject)
4514 {
4515 int len = 1;
4516 if (!utf8) c = *eptr; else
4517 {
4518 GETCHARLEN(c, eptr, len);
4519 }
4520 prop_category = UCD_CATEGORY(c);
4521 if (prop_category != ucp_M) break;
4522 eptr += len;
4523 }
4524 }
4525
4526 /* eptr is now past the end of the maximum run */
4527
4528 if (possessive) continue;
4529
4530 for(;;)
4531 {
4532 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4533 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4534 if (eptr-- == pp) break; /* Stop if tried at original pos */
4535 for (;;) /* Move back over one extended */
4536 {
4537 int len = 1;
4538 if (!utf8) c = *eptr; else
4539 {
4540 BACKCHAR(eptr);
4541 GETCHARLEN(c, eptr, len);
4542 }
4543 prop_category = UCD_CATEGORY(c);
4544 if (prop_category != ucp_M) break;
4545 eptr--;
4546 }
4547 }
4548 }
4549
4550 else
4551 #endif /* SUPPORT_UCP */
4552
4553 #ifdef SUPPORT_UTF8
4554 /* UTF-8 mode */
4555
4556 if (utf8)
4557 {
4558 switch(ctype)
4559 {
4560 case OP_ANY:
4561 if (max < INT_MAX)
4562 {
4563 for (i = min; i < max; i++)
4564 {
4565 if (eptr >= md->end_subject)
4566 {
4567 SCHECK_PARTIAL();
4568 break;
4569 }
4570 if (IS_NEWLINE(eptr)) break;
4571 eptr++;
4572 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4573 }
4574 }
4575
4576 /* Handle unlimited UTF-8 repeat */
4577
4578 else
4579 {
4580 for (i = min; i < max; i++)
4581 {
4582 if (eptr >= md->end_subject)
4583 {
4584 SCHECK_PARTIAL();
4585 break;
4586 }
4587 if (IS_NEWLINE(eptr)) break;
4588 eptr++;
4589 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4590 }
4591 }
4592 break;
4593
4594 case OP_ALLANY:
4595 if (max < INT_MAX)
4596 {
4597 for (i = min; i < max; i++)
4598 {
4599 if (eptr >= md->end_subject)
4600 {
4601 SCHECK_PARTIAL();
4602 break;
4603 }
4604 eptr++;
4605 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4606 }
4607 }
4608 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
4609 break;
4610
4611 /* The byte case is the same as non-UTF8 */
4612
4613 case OP_ANYBYTE:
4614 c = max - min;
4615 if (c > (unsigned int)(md->end_subject - eptr))
4616 {
4617 eptr = md->end_subject;
4618 SCHECK_PARTIAL();
4619 }
4620 else eptr += c;
4621 break;
4622
4623 case OP_ANYNL:
4624 for (i = min; i < max; i++)
4625 {
4626 int len = 1;
4627 if (eptr >= md->end_subject)
4628 {
4629 SCHECK_PARTIAL();
4630 break;
4631 }
4632 GETCHARLEN(c, eptr, len);
4633 if (c == 0x000d)
4634 {
4635 if (++eptr >= md->end_subject) break;
4636 if (*eptr == 0x000a) eptr++;
4637 }
4638 else
4639 {
4640 if (c != 0x000a &&
4641 (md->bsr_anycrlf ||
4642 (c != 0x000b && c != 0x000c &&
4643 c != 0x0085 && c != 0x2028 && c != 0x2029)))
4644 break;
4645 eptr += len;
4646 }
4647 }
4648 break;
4649
4650 case OP_NOT_HSPACE:
4651 case OP_HSPACE:
4652 for (i = min; i < max; i++)
4653 {
4654 BOOL gotspace;
4655 int len = 1;
4656 if (eptr >= md->end_subject)
4657 {
4658 SCHECK_PARTIAL();
4659 break;
4660 }
4661 GETCHARLEN(c, eptr, len);
4662 switch(c)
4663 {
4664 default: gotspace = FALSE; break;
4665 case 0x09: /* HT */
4666 case 0x20: /* SPACE */
4667 case 0xa0: /* NBSP */
4668 case 0x1680: /* OGHAM SPACE MARK */
4669 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4670 case 0x2000: /* EN QUAD */
4671 case 0x2001: /* EM QUAD */
4672 case 0x2002: /* EN SPACE */
4673 case 0x2003: /* EM SPACE */
4674 case 0x2004: /* THREE-PER-EM SPACE */
4675 case 0x2005: /* FOUR-PER-EM SPACE */
4676 case 0x2006: /* SIX-PER-EM SPACE */
4677 case 0x2007: /* FIGURE SPACE */
4678 case 0x2008: /* PUNCTUATION SPACE */
4679 case 0x2009: /* THIN SPACE */
4680 case 0x200A: /* HAIR SPACE */
4681 case 0x202f: /* NARROW NO-BREAK SPACE */
4682 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4683 case 0x3000: /* IDEOGRAPHIC SPACE */
4684 gotspace = TRUE;
4685 break;
4686 }
4687 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4688 eptr += len;
4689 }
4690 break;
4691
4692 case OP_NOT_VSPACE:
4693 case OP_VSPACE:
4694 for (i = min; i < max; i++)
4695 {
4696 BOOL gotspace;
4697 int len = 1;
4698 if (eptr >= md->end_subject)
4699 {
4700 SCHECK_PARTIAL();
4701 break;
4702 }
4703 GETCHARLEN(c, eptr, len);
4704 switch(c)
4705 {
4706 default: gotspace = FALSE; break;
4707 case 0x0a: /* LF */
4708 case 0x0b: /* VT */
4709 case 0x0c: /* FF */
4710 case 0x0d: /* CR */
4711 case 0x85: /* NEL */
4712 case 0x2028: /* LINE SEPARATOR */
4713 case 0x2029: /* PARAGRAPH SEPARATOR */
4714 gotspace = TRUE;
4715 break;
4716 }
4717 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4718 eptr += len;
4719 }
4720 break;
4721
4722 case OP_NOT_DIGIT:
4723 for (i = min; i < max; i++)
4724 {
4725 int len = 1;
4726 if (eptr >= md->end_subject)
4727 {
4728 SCHECK_PARTIAL();
4729 break;
4730 }
4731 GETCHARLEN(c, eptr, len);
4732 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4733 eptr+= len;
4734 }
4735 break;
4736
4737 case OP_DIGIT:
4738 for (i = min; i < max; i++)
4739 {
4740 int len = 1;
4741 if (eptr >= md->end_subject)
4742 {
4743 SCHECK_PARTIAL();
4744 break;
4745 }
4746 GETCHARLEN(c, eptr, len);
4747 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4748 eptr+= len;
4749 }
4750 break;
4751
4752 case OP_NOT_WHITESPACE:
4753 for (i = min; i < max; i++)
4754 {
4755 int len = 1;
4756 if (eptr >= md->end_subject)
4757 {
4758 SCHECK_PARTIAL();
4759 break;
4760 }
4761 GETCHARLEN(c, eptr, len);
4762 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4763 eptr+= len;
4764 }
4765 break;
4766
4767 case OP_WHITESPACE:
4768 for (i = min; i < max; i++)
4769 {
4770 int len = 1;
4771 if (eptr >= md->end_subject)
4772 {
4773 SCHECK_PARTIAL();
4774 break;
4775 }
4776 GETCHARLEN(c, eptr, len);
4777 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4778 eptr+= len;
4779 }
4780 break;
4781
4782 case OP_NOT_WORDCHAR:
4783 for (i = min; i < max; i++)
4784 {
4785 int len = 1;
4786 if (eptr >= md->end_subject)
4787 {
4788 SCHECK_PARTIAL();
4789 break;
4790 }
4791 GETCHARLEN(c, eptr, len);
4792 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4793 eptr+= len;
4794 }
4795 break;
4796
4797 case OP_WORDCHAR:
4798 for (i = min; i < max; i++)
4799 {
4800 int len = 1;
4801 if (eptr >= md->end_subject)
4802 {
4803 SCHECK_PARTIAL();
4804 break;
4805 }
4806 GETCHARLEN(c, eptr, len);
4807 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4808 eptr+= len;
4809 }
4810 break;
4811
4812 default:
4813 RRETURN(PCRE_ERROR_INTERNAL);
4814 }
4815
4816 /* eptr is now past the end of the maximum run */
4817
4818 if (possessive) continue;
4819 for(;;)
4820 {
4821 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4822 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4823 if (eptr-- == pp) break; /* Stop if tried at original pos */
4824 BACKCHAR(eptr);
4825 }
4826 }
4827 else
4828 #endif /* SUPPORT_UTF8 */
4829
4830 /* Not UTF-8 mode */
4831 {
4832 switch(ctype)
4833 {
4834 case OP_ANY:
4835 for (i = min; i < max; i++)
4836 {
4837 if (eptr >= md->end_subject)
4838 {
4839 SCHECK_PARTIAL();
4840 break;
4841 }
4842 if (IS_NEWLINE(eptr)) break;
4843 eptr++;
4844 }
4845 break;
4846
4847 case OP_ALLANY:
4848 case OP_ANYBYTE:
4849 c = max - min;
4850 if (c > (unsigned int)(md->end_subject - eptr))
4851 {
4852 eptr = md->end_subject;
4853 SCHECK_PARTIAL();
4854 }
4855 else eptr += c;
4856 break;
4857
4858 case OP_ANYNL:
4859 for (i = min; i < max; i++)
4860 {
4861 if (eptr >= md->end_subject)
4862 {
4863 SCHECK_PARTIAL();
4864 break;
4865 }
4866 c = *eptr;
4867 if (c == 0x000d)
4868 {
4869 if (++eptr >= md->end_subject) break;
4870 if (*eptr == 0x000a) eptr++;
4871 }
4872 else
4873 {
4874 if (c != 0x000a &&
4875 (md->bsr_anycrlf ||
4876 (c != 0x000b && c != 0x000c && c != 0x0085)))
4877 break;
4878 eptr++;
4879 }
4880 }
4881 break;
4882
4883 case OP_NOT_HSPACE:
4884 for (i = min; i < max; i++)
4885 {
4886 if (eptr >= md->end_subject)
4887 {
4888 SCHECK_PARTIAL();
4889 break;
4890 }
4891 c = *eptr;
4892 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4893 eptr++;
4894 }
4895 break;
4896
4897 case OP_HSPACE:
4898 for (i = min; i < max; i++)
4899 {
4900 if (eptr >= md->end_subject)
4901 {
4902 SCHECK_PARTIAL();
4903 break;
4904 }
4905 c = *eptr;
4906 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4907 eptr++;
4908 }
4909 break;
4910
4911 case OP_NOT_VSPACE:
4912 for (i = min; i < max; i++)
4913 {
4914 if (eptr >= md->end_subject)
4915 {
4916 SCHECK_PARTIAL();
4917 break;
4918 }
4919 c = *eptr;
4920 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4921 break;
4922 eptr++;
4923 }
4924 break;
4925
4926 case OP_VSPACE:
4927 for (i = min; i < max; i++)
4928 {
4929 if (eptr >= md->end_subject)
4930 {
4931 SCHECK_PARTIAL();
4932 break;
4933 }
4934 c = *eptr;
4935 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4936 break;
4937 eptr++;
4938 }
4939 break;
4940
4941 case OP_NOT_DIGIT:
4942 for (i = min; i < max; i++)
4943 {
4944 if (eptr >= md->end_subject)
4945 {
4946 SCHECK_PARTIAL();
4947 break;
4948 }
4949 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
4950 eptr++;
4951 }
4952 break;
4953
4954 case OP_DIGIT:
4955 for (i = min; i < max; i++)
4956 {
4957 if (eptr >= md->end_subject)
4958 {
4959 SCHECK_PARTIAL();
4960 break;
4961 }
4962 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
4963 eptr++;
4964 }
4965 break;
4966
4967 case OP_NOT_WHITESPACE:
4968 for (i = min; i < max; i++)
4969 {
4970 if (eptr >= md->end_subject)
4971 {
4972 SCHECK_PARTIAL();
4973 break;
4974 }
4975 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
4976 eptr++;
4977 }
4978 break;
4979
4980 case OP_WHITESPACE:
4981 for (i = min; i < max; i++)
4982 {
4983 if (eptr >= md->end_subject)
4984 {
4985 SCHECK_PARTIAL();
4986 break;
4987 }
4988 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
4989 eptr++;
4990 }
4991 break;
4992
4993 case OP_NOT_WORDCHAR:
4994 for (i = min; i < max; i++)
4995 {
4996 if (eptr >= md->end_subject)
4997 {
4998 SCHECK_PARTIAL();
4999 break;
5000 }
5001 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5002 eptr++;
5003 }
5004 break;
5005
5006 case OP_WORDCHAR:
5007 for (i = min; i < max; i++)
5008 {
5009 if (eptr >= md->end_subject)
5010 {
5011 SCHECK_PARTIAL();
5012 break;
5013 }
5014 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5015 eptr++;
5016 }
5017 break;
5018
5019 default:
5020 RRETURN(PCRE_ERROR_INTERNAL);
5021 }
5022
5023 /* eptr is now past the end of the maximum run */
5024
5025 if (possessive) continue;
5026 while (eptr >= pp)
5027 {
5028 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5029 eptr--;
5030 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5031 }
5032 }
5033
5034 /* Get here if we can't make it match with any permitted repetitions */
5035
5036 RRETURN(MATCH_NOMATCH);
5037 }
5038 /* Control never gets here */
5039
5040 /* There's been some horrible disaster. Arrival here can only mean there is
5041 something seriously wrong in the code above or the OP_xxx definitions. */
5042
5043 default:
5044 DPRINTF(("Unknown opcode %d\n", *ecode));
5045 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5046 }
5047
5048 /* Do not stick any code in here without much thought; it is assumed
5049 that "continue" in the code above comes out to here to repeat the main
5050 loop. */
5051
5052 } /* End of main loop */
5053 /* Control never reaches here */
5054
5055
5056 /* When compiling to use the heap rather than the stack for recursive calls to
5057 match(), the RRETURN() macro jumps here. The number that is saved in
5058 frame->Xwhere indicates which label we actually want to return to. */
5059
5060 #ifdef NO_RECURSE
5061 #define LBL(val) case val: goto L_RM##val;
5062 HEAP_RETURN:
5063 switch (frame->Xwhere)
5064 {
5065 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5066 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5067 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5068 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5069 LBL(53) LBL(54)
5070 #ifdef SUPPORT_UTF8
5071 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5072 LBL(32) LBL(34) LBL(42) LBL(46)
5073 #ifdef SUPPORT_UCP
5074 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5075 #endif /* SUPPORT_UCP */
5076 #endif /* SUPPORT_UTF8 */
5077 default:
5078 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5079 return PCRE_ERROR_INTERNAL;
5080 }
5081 #undef LBL
5082 #endif /* NO_RECURSE */
5083 }
5084
5085
5086 /***************************************************************************
5087 ****************************************************************************
5088 RECURSION IN THE match() FUNCTION
5089
5090 Undefine all the macros that were defined above to handle this. */
5091
/* Undefine the names that were defined as macros above for use inside match().
The first group is removed only when NO_RECURSE is defined, the same symbol that
guards the heap-based return dispatch at the end of match().
NOTE(review): the matching #define lines are not visible from here; presumably
each name below has a definition under the same condition - confirm. */

5092 #ifdef NO_RECURSE
5093 #undef eptr
5094 #undef ecode
5095 #undef mstart
5096 #undef offset_top
5097 #undef ims
5098 #undef eptrb
5099 #undef flags
5100
5101 #undef callpat
5102 #undef charptr
5103 #undef data
5104 #undef next
5105 #undef pp
5106 #undef prev
5107 #undef saved_eptr
5108
5109 #undef new_recursive
5110
5111 #undef cur_is_word
5112 #undef condition
5113 #undef prev_is_word
5114
5115 #undef original_ims
5116
5117 #undef ctype
5118 #undef length
5119 #undef max
5120 #undef min
5121 #undef number
5122 #undef offset
5123 #undef op
5124 #undef save_capture_last
5125 #undef save_offset1
5126 #undef save_offset2
5127 #undef save_offset3
5128 #undef stacksave
5129
5130 #undef newptrb
5131
5132 #endif /* NO_RECURSE */
5133
5134 /* These two are defined as macros in both cases (that is, whether or not
NO_RECURSE is defined), so they are undefined unconditionally. */
5135
5136 #undef fc
5137 #undef fi
5138
5139 /***************************************************************************
5140 ***************************************************************************/
5141
5142
5143
5144 /*************************************************
5145 * Execute a Regular Expression *
5146 *************************************************/
5147
5148 /* This function applies a compiled re to a subject string and picks out
5149 portions of the string if it matches. Two elements in the vector are set for
5150 each substring: the offsets to the start and end of the substring.
5151
5152 Arguments:
5153 argument_re points to the compiled expression
5154 extra_data points to extra data or is NULL
5155 subject points to the subject string
5156 length length of subject string (may contain binary zeros)
5157 start_offset where to start in the subject string
5158 options option bits
5159 offsets points to a vector of ints to be filled in with offsets
5160 offsetcount the number of elements in the vector
5161
5162 Returns: > 0 => success; value is the number of elements filled in
5163 = 0 => success, but offsets is not big enough
5164 -1 => failed to match
5165 < -1 => some kind of unexpected problem
5166 */
5167
5168 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5169 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5170 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5171 int offsetcount)
5172 {
5173 int rc, resetcount, ocount;
5174 int first_byte = -1;
5175 int req_byte = -1;
5176 int req_byte2 = -1;
5177 int newline;
5178 unsigned long int ims;
5179 BOOL using_temporary_offsets = FALSE;
5180 BOOL anchored;
5181 BOOL startline;
5182 BOOL firstline;
5183 BOOL first_byte_caseless = FALSE;
5184 BOOL req_byte_caseless = FALSE;
5185 BOOL utf8;
5186 match_data match_block;
5187 match_data *md = &match_block;
5188 const uschar *tables;
5189 const uschar *start_bits = NULL;
5190 USPTR start_match = (USPTR)subject + start_offset;
5191 USPTR end_subject;
5192 USPTR start_partial = NULL;
5193 USPTR req_byte_ptr = start_match - 1;
5194
5195 pcre_study_data internal_study;
5196 const pcre_study_data *study;
5197
5198 real_pcre internal_re;
5199 const real_pcre *external_re = (const real_pcre *)argument_re;
5200 const real_pcre *re = external_re;
5201
5202 /* Plausibility checks */
5203
5204 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5205 if (re == NULL || subject == NULL ||
5206 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5207 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5208
5209 /* This information is for finding all the numbers associated with a given
5210 name, for condition testing. */
5211
5212 md->name_table = (uschar *)re + re->name_table_offset;
5213 md->name_count = re->name_count;
5214 md->name_entry_size = re->name_entry_size;
5215
5216 /* Fish out the optional data from the extra_data structure, first setting
5217 the default values. */
5218
5219 study = NULL;
5220 md->match_limit = MATCH_LIMIT;
5221 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5222 md->callout_data = NULL;
5223
5224 /* The table pointer is always in native byte order. */
5225
5226 tables = external_re->tables;
5227
5228 if (extra_data != NULL)
5229 {
5230 register unsigned int flags = extra_data->flags;
5231 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5232 study = (const pcre_study_data *)extra_data->study_data;
5233 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5234 md->match_limit = extra_data->match_limit;
5235 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5236 md->match_limit_recursion = extra_data->match_limit_recursion;
5237 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5238 md->callout_data = extra_data->callout_data;
5239 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5240 }
5241
5242 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5243 is a feature that makes it possible to save compiled regex and re-use them
5244 in other programs later. */
5245
5246 if (tables == NULL) tables = _pcre_default_tables;
5247
5248 /* Check that the first field in the block is the magic number. If it is not,
5249 test for a regex that was compiled on a host of opposite endianness. If this is
5250 the case, flipped values are put in internal_re and internal_study if there was
5251 study data too. */
5252
5253 if (re->magic_number != MAGIC_NUMBER)
5254 {
5255 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5256 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5257 if (study != NULL) study = &internal_study;
5258 }
5259
5260 /* Set up other data */
5261
5262 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5263 startline = (re->flags & PCRE_STARTLINE) != 0;
5264 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5265
5266 /* The code starts after the real_pcre block and the capture name table. */
5267
5268 md->start_code = (const uschar *)external_re + re->name_table_offset +
5269 re->name_count * re->name_entry_size;
5270
5271 md->start_subject = (USPTR)subject;
5272 md->start_offset = start_offset;
5273 md->end_subject = md->start_subject + length;
5274 end_subject = md->end_subject;
5275
5276 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5277 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5278 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5279
5280 md->notbol = (options & PCRE_NOTBOL) != 0;
5281 md->noteol = (options & PCRE_NOTEOL) != 0;
5282 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5283 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5284 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5285 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5286 md->hitend = FALSE;
5287
5288 md->recursive = NULL; /* No recursion at top level */
5289
5290 md->lcc = tables + lcc_offset;
5291 md->ctypes = tables + ctypes_offset;
5292
5293 /* Handle different \R options. */
5294
5295 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5296 {
5297 case 0:
5298 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5299 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5300 else
5301 #ifdef BSR_ANYCRLF
5302 md->bsr_anycrlf = TRUE;
5303 #else
5304 md->bsr_anycrlf = FALSE;
5305 #endif
5306 break;
5307
5308 case PCRE_BSR_ANYCRLF:
5309 md->bsr_anycrlf = TRUE;
5310 break;
5311
5312 case PCRE_BSR_UNICODE:
5313 md->bsr_anycrlf = FALSE;
5314 break;
5315
5316 default: return PCRE_ERROR_BADNEWLINE;
5317 }
5318
5319 /* Handle different types of newline. The three bits give eight cases. If
5320 nothing is set at run time, whatever was used at compile time applies. */
5321
5322 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5323 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5324 {
5325 case 0: newline = NEWLINE; break; /* Compile-time default */
5326 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5327 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5328 case PCRE_NEWLINE_CR+
5329 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5330 case PCRE_NEWLINE_ANY: newline = -1; break;
5331 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5332 default: return PCRE_ERROR_BADNEWLINE;
5333 }
5334
5335 if (newline == -2)
5336 {
5337 md->nltype = NLTYPE_ANYCRLF;
5338 }
5339 else if (newline < 0)
5340 {
5341 md->nltype = NLTYPE_ANY;
5342 }
5343 else
5344 {
5345 md->nltype = NLTYPE_FIXED;
5346 if (newline > 255)
5347 {
5348 md->nllen = 2;
5349 md->nl[0] = (newline >> 8) & 255;
5350 md->nl[1] = newline & 255;
5351 }
5352 else
5353 {
5354 md->nllen = 1;
5355 md->nl[0] = newline;
5356 }
5357 }
5358
5359 /* Partial matching was originally supported only for a restricted set of
5360 regexes; from release 8.00 there are no restrictions, but the bits are still
5361 defined (though never set). So there's no harm in leaving this code. */
5362
5363 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5364 return PCRE_ERROR_BADPARTIAL;
5365
5366 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5367 back the character offset. */
5368
5369 #ifdef SUPPORT_UTF8
5370 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5371 {
5372 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5373 return PCRE_ERROR_BADUTF8;
5374 if (start_offset > 0 && start_offset < length)
5375 {
5376 int tb = ((USPTR)subject)[start_offset];
5377 if (tb > 127)
5378 {
5379 tb &= 0xc0;
5380 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5381 }
5382 }
5383 }
5384 #endif
5385
5386 /* The ims options can vary during the matching as a result of the presence
5387 of (?ims) items in the pattern. They are kept in a local variable so that
5388 restoring at the exit of a group is easy. */
5389
5390 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5391
5392 /* If the expression has got more back references than the offsets supplied can
5393 hold, we get a temporary chunk of working store to use during the matching.
5394 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5395 of 3. */
5396
5397 ocount = offsetcount - (offsetcount % 3);
5398
5399 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5400 {
5401 ocount = re->top_backref * 3 + 3;
5402 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5403 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5404 using_temporary_offsets = TRUE;
5405 DPRINTF(("Got memory to hold back references\n"));
5406 }
5407 else md->offset_vector = offsets;
5408
5409 md->offset_end = ocount;
5410 md->offset_max = (2*ocount)/3;
5411 md->offset_overflow = FALSE;
5412 md->capture_last = -1;
5413
5414 /* Compute the minimum number of offsets that we need to reset each time. Doing
5415 this makes a huge difference to execution time when there aren't many brackets
5416 in the pattern. */
5417
5418 resetcount = 2 + re->top_bracket * 2;
5419 if (resetcount > offsetcount) resetcount = ocount;
5420
5421 /* Reset the working variable associated with each extraction. These should
5422 never be used unless previously set, but they get saved and restored, and so we
5423 initialize them to avoid reading uninitialized locations. */
5424
5425 if (md->offset_vector != NULL)
5426 {
5427 register int *iptr = md->offset_vector + ocount;
5428 register int *iend = iptr - resetcount/2 + 1;
5429 while (--iptr >= iend) *iptr = -1;
5430 }
5431
5432 /* Set up the first character to match, if available. The first_byte value is
5433 never set for an anchored regular expression, but the anchoring may be forced
5434 at run time, so we have to test for anchoring. The first char may be unset for
5435 an unanchored pattern, of course. If there's no first char and the pattern was
5436 studied, there may be a bitmap of possible first characters. */
5437
5438 if (!anchored)
5439 {
5440 if ((re->flags & PCRE_FIRSTSET) != 0)
5441 {
5442 first_byte = re->first_byte & 255;
5443 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5444 first_byte = md->lcc[first_byte];
5445 }
5446 else
5447 if (!startline && study != NULL &&
5448 (study->flags & PCRE_STUDY_MAPPED) != 0)
5449 start_bits = study->start_bits;
5450 }
5451
5452 /* For anchored or unanchored matches, there may be a "last known required
5453 character" set. */
5454
5455 if ((re->flags & PCRE_REQCHSET) != 0)
5456 {
5457 req_byte = re->req_byte & 255;
5458 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5459 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5460 }
5461
5462
5463 /* ==========================================================================*/
5464
5465 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5466 the loop runs just once. */
5467
5468 for(;;)
5469 {
5470 USPTR save_end_subject = end_subject;
5471 USPTR new_start_match;
5472
5473 /* Reset the maximum number of extractions we might see. */
5474
5475 if (md->offset_vector != NULL)
5476 {
5477 register int *iptr = md->offset_vector;
5478 register int *iend = iptr + resetcount;
5479 while (iptr < iend) *iptr++ = -1;
5480 }
5481
5482 /* If firstline is TRUE, the start of the match is constrained to the first
5483 line of a multiline string. That is, the match must be before or at the first
5484 newline. Implement this by temporarily adjusting end_subject so that we stop
5485 scanning at a newline. If the match fails at the newline, later code breaks
5486 this loop. */
5487
5488 if (firstline)
5489 {
5490 USPTR t = start_match;
5491 #ifdef SUPPORT_UTF8
5492 if (utf8)
5493 {
5494 while (t < md->end_subject && !IS_NEWLINE(t))
5495 {
5496 t++;
5497 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5498 }
5499 }
5500 else
5501 #endif
5502 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5503 end_subject = t;
5504 }
5505
5506 /* There are some optimizations that avoid running the match if a known
5507 starting point is not found, or if a known later character is not present.
5508 However, there is an option that disables these, for testing and for ensuring
5509 that all callouts do actually occur. */
5510
5511 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5512 {
5513 /* Advance to a unique first byte if there is one. */
5514
5515 if (first_byte >= 0)
5516 {
5517 if (first_byte_caseless)
5518 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5519 start_match++;
5520 else
5521 while (start_match < end_subject && *start_match != first_byte)
5522 start_match++;
5523 }
5524
5525 /* Or to just after a linebreak for a multiline match */
5526
5527 else if (startline)
5528 {
5529 if (start_match > md->start_subject + start_offset)
5530 {
5531 #ifdef SUPPORT_UTF8
5532 if (utf8)
5533 {
5534 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5535 {
5536 start_match++;
5537 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5538 start_match++;
5539 }
5540 }
5541 else
5542 #endif
5543 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5544 start_match++;
5545
5546 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5547 and we are now at a LF, advance the match position by one more character.
5548 */
5549
5550 if (start_match[-1] == CHAR_CR &&
5551 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5552 start_match < end_subject &&
5553 *start_match == CHAR_NL)
5554 start_match++;
5555 }
5556 }
5557
5558 /* Or to a non-unique first byte after study */
5559
5560 else if (start_bits != NULL)
5561 {
5562 while (start_match < end_subject)
5563 {
5564 register unsigned int c = *start_match;
5565 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5566 else break;
5567 }
5568 }
5569 } /* Starting optimizations */
5570
5571 /* Restore fudged end_subject */
5572
5573 end_subject = save_end_subject;
5574
5575 /* The following two optimizations are disabled for partial matching or if
5576 disabling is explicitly requested. */
5577
5578 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
5579 {
5580 /* If the pattern was studied, a minimum subject length may be set. This is
5581 a lower bound; no actual string of that length may actually match the
5582 pattern. Although the value is, strictly, in characters, we treat it as
5583 bytes to avoid spending too much time in this optimization. */
5584
5585 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
5586 (pcre_uint32)(end_subject - start_match) < study->minlength)
5587 {
5588 rc = MATCH_NOMATCH;
5589 break;
5590 }
5591
5592 /* If req_byte is set, we know that that character must appear in the
5593 subject for the match to succeed. If the first character is set, req_byte
5594 must be later in the subject; otherwise the test starts at the match point.
5595 This optimization can save a huge amount of backtracking in patterns with
5596 nested unlimited repeats that aren't going to match. Writing separate code
5597 for cased/caseless versions makes it go faster, as does using an
5598 autoincrement and backing off on a match.
5599
5600 HOWEVER: when the subject string is very, very long, searching to its end
5601 can take a long time, and give bad performance on quite ordinary patterns.
5602 This showed up when somebody was matching something like /^\d+C/ on a
5603 32-megabyte string... so we don't do this when the string is sufficiently
5604 long. */
5605
5606 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
5607 {
5608 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
5609
5610 /* We don't need to repeat the search if we haven't yet reached the
5611 place we found it at last time. */
5612
5613 if (p > req_byte_ptr)
5614 {
5615 if (req_byte_caseless)
5616 {
5617 while (p < end_subject)
5618 {
5619 register int pp = *p++;
5620 if (pp == req_byte || pp == req_byte2) { p--; break; }
5621 }
5622 }
5623 else
5624 {
5625 while (p < end_subject)
5626 {
5627 if (*p++ == req_byte) { p--; break; }
5628 }
5629 }
5630
5631 /* If we can't find the required character, break the matching loop,
5632 forcing a match failure. */
5633
5634 if (p >= end_subject)
5635 {
5636 rc = MATCH_NOMATCH;
5637 break;
5638 }
5639
5640 /* If we have found the required character, save the point where we
5641 found it, so that we don't search again next time round the loop if
5642 the start hasn't passed this character yet. */
5643
5644 req_byte_ptr = p;
5645 }
5646 }
5647 }
5648
5649 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
5650 printf(">>>> Match against: ");
5651 pchars(start_match, end_subject - start_match, TRUE, md);
5652 printf("\n");
5653 #endif
5654
5655 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
5656 first starting point for which a partial match was found. */
5657
5658 md->start_match_ptr = start_match;
5659 md->start_used_ptr = start_match;
5660 md->match_call_count = 0;
5661 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
5662 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
5663
5664 switch(rc)
5665 {
5666 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
5667 exactly like PRUNE. */
5668
5669 case MATCH_NOMATCH:
5670 case MATCH_PRUNE:
5671 case MATCH_THEN:
5672 new_start_match = start_match + 1;
5673 #ifdef SUPPORT_UTF8
5674 if (utf8)
5675 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
5676 new_start_match++;
5677 #endif
5678 break;
5679
5680 /* SKIP passes back the next starting point explicitly. */
5681
5682 case MATCH_SKIP:
5683 new_start_match = md->start_match_ptr;
5684 break;
5685
5686 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
5687
5688 case MATCH_COMMIT:
5689 rc = MATCH_NOMATCH;
5690 goto ENDLOOP;
5691
5692 /* Any other return is either a match, or some kind of error. */
5693
5694 default:
5695 goto ENDLOOP;
5696 }
5697
5698 /* Control reaches here for the various types of "no match at this point"
5699 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
5700
5701 rc = MATCH_NOMATCH;
5702
5703 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
5704 newline in the subject (though it may continue over the newline). Therefore,
5705 if we have just failed to match, starting at a newline, do not continue. */
5706
5707 if (firstline && IS_NEWLINE(start_match)) break;
5708
5709 /* Advance to new matching position */
5710
5711 start_match = new_start_match;
5712
5713 /* Break the loop if the pattern is anchored or if we have passed the end of
5714 the subject. */
5715
5716 if (anchored || start_match > end_subject) break;
5717
5718 /* If we have just passed a CR and we are now at a LF, and the pattern does
5719 not contain any explicit matches for \r or \n, and the newline option is CRLF
5720 or ANY or ANYCRLF, advance the match position by one more character. */
5721
5722 if (start_match[-1] == CHAR_CR &&
5723 start_match < end_subject &&
5724 *start_match == CHAR_NL &&
5725 (re->flags & PCRE_HASCRORLF) == 0 &&
5726 (md->nltype == NLTYPE_ANY ||
5727 md->nltype == NLTYPE_ANYCRLF ||
5728 md->nllen == 2))
5729 start_match++;
5730
5731 } /* End of for(;;) "bumpalong" loop */
5732
5733 /* ==========================================================================*/
5734
5735 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
5736 conditions is true:
5737
5738 (1) The pattern is anchored or the match was failed by (*COMMIT);
5739
5740 (2) We are past the end of the subject;
5741
5742 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
5743 this option requests that a match occur at or before the first newline in
5744 the subject.
5745
5746 When we have a match and the offset vector is big enough to deal with any
5747 backreferences, captured substring offsets will already be set up. In the case
5748 where we had to get some local store to hold offsets for backreference
5749 processing, copy those that we can. In this case there need not be overflow if
5750 certain parts of the pattern were not used, even though there are more
5751 capturing parentheses than vector slots. */
5752
5753 ENDLOOP:
5754
5755 if (rc == MATCH_MATCH)
5756 {
5757 if (using_temporary_offsets)
5758 {
5759 if (offsetcount >= 4)
5760 {
5761 memcpy(offsets + 2, md->offset_vector + 2,
5762 (offsetcount - 2) * sizeof(int));
5763 DPRINTF(("Copied offsets from temporary memory\n"));
5764 }
5765 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5766 DPRINTF(("Freeing temporary memory\n"));
5767 (pcre_free)(md->offset_vector);
5768 }
5769
5770 /* Set the return code to the number of captured strings, or 0 if there are
5771 too many to fit into the vector. */
5772
5773 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5774
5775 /* If there is space, set up the whole thing as substring 0. The value of
5776 md->start_match_ptr might be modified if \K was encountered on the success
5777 matching path. */
5778
5779 if (offsetcount < 2) rc = 0; else
5780 {
5781 offsets[0] = md->start_match_ptr - md->start_subject;
5782 offsets[1] = md->end_match_ptr - md->start_subject;
5783 }
5784
5785 DPRINTF((">>>> returning %d\n", rc));
5786 return rc;
5787 }
5788
5789 /* Control gets here if there has been an error, or if the overall match
5790 attempt has failed at all permitted starting positions. */
5791
5792 if (using_temporary_offsets)
5793 {
5794 DPRINTF(("Freeing temporary memory\n"));
5795 (pcre_free)(md->offset_vector);
5796 }
5797
5798 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
5799 {
5800 DPRINTF((">>>> error: returning %d\n", rc));
5801 return rc;
5802 }
5803 else if (start_partial != NULL)
5804 {
5805 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5806 if (offsetcount > 1)
5807 {
5808 offsets[0] = start_partial - (USPTR)subject;
5809 offsets[1] = end_subject - (USPTR)subject;
5810 }
5811 return PCRE_ERROR_PARTIAL;
5812 }
5813 else
5814 {
5815 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5816 return PCRE_ERROR_NOMATCH;
5817 }
5818 }
5819
5820 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12