/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 482 - (show annotations) (download)
Mon Jan 4 15:55:46 2010 UTC (4 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 171035 byte(s)
Fix partial match bug (code omitted) for \W.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Flag bits for the match() function. They are passed in its "flags"
argument. */

#define match_condassert     0x01  /* Called to check a condition assertion */
#define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes. Each one is returned by the
handler for the correspondingly named opcode (OP_COMMIT, OP_PRUNE, OP_SKIP,
OP_THEN) when the rest of the pattern fails to match. */

#define MATCH_COMMIT       (-999)
#define MATCH_PRUNE        (-998)
#define MATCH_SKIP         (-997)
#define MATCH_THEN         (-996)

/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX 30
84
/* Min and max values for the common repeats; for the maxima, 0 => infinity.
The two tables are parallel: entry n of rep_min and entry n of rep_max
describe the same repeat.
NOTE(review): presumably indexed by a repeat opcode's offset from the first
of the six simple repeat opcodes - confirm at the use sites. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89
90
91
#ifdef PCRE_DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested. Bytes for which isprint() is true are written as
themselves; every other byte is written as a backslash, an "x", and two hex
digits.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject; the count is
                then clamped so that nothing beyond md->end_subject is read
  md          pointer to matching data block, if is_subject is TRUE (it is
                not dereferenced otherwise)

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
unsigned int c;

/* Clamp to what is left of the subject. If p is already beyond the end the
clamped count is negative and nothing is printed. */

if (is_subject && length > md->end_subject - p)
  length = (int)(md->end_subject - p);

while (length-- > 0)
  {
  c = *p++;
  if (isprint(c)) printf("%c", c); else printf("\\x%02x", c);
  }
}
#endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef PCRE_DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 {
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 }
179 }
180 else
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. */
186
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
/* Numbers for RMATCH calls. Each RMATCH invocation is given a distinct value,
which the heap-based (NO_RECURSE) version of RMATCH stores in the frame and
also uses to form the label it resumes at. When this list is changed, the code
at HEAP_RETURN below must be updated in sync. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54 };
249
/* These versions of the macros use the stack, as normal. There are debugging
versions and production versions. Note that the "rw" argument of RMATCH isn't
actually used in this definition. Both macros rely on names that are in scope
inside match(): rrc receives the result of the call, and mstart and rdepth are
passed on (rdepth incremented by one). */

#ifndef NO_RECURSE
#define REGISTER register

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra
#endif

#else


/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes.

RMATCH obtains a new frame, records in the current frame where to resume (rw),
fills in the new frame's argument fields, links it to the current frame, and
jumps to HEAP_RECURSE. The label L_<rw> that it defines is where control
resumes afterwards. If the frame cannot be obtained, the current incarnation
gives up with PCRE_ERROR_NOMEMORY instead of writing through a NULL pointer.

RRETURN frees the current frame and steps back to the previous one. If there
is a previous frame, the result is left in rrc and control goes to HEAP_RETURN;
otherwise this was the top-level frame and match() really returns. */

#define REGISTER

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }


/* Structure for remembering the local variables in a private frame. The
field names are the names of match()'s arguments and local variables with an
"X" prefix. */

typedef struct heapframe {
  struct heapframe *Xprevframe;   /* Frame to go back to; NULL at top level */

  /* Function arguments that may change */

  USPTR Xeptr;
  const uschar *Xecode;
  USPTR Xmstart;
  int Xoffset_top;
  unsigned long int Xims;         /* Same type as the ims argument of match() */
  eptrblock *Xeptrb;
  int Xflags;
  unsigned int Xrdepth;

  /* Function local variables */

  USPTR Xcallpat;
#ifdef SUPPORT_UTF8
  USPTR Xcharptr;
#endif
  USPTR Xdata;
  USPTR Xnext;
  USPTR Xpp;
  USPTR Xprev;
  USPTR Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xprev_is_word;

  unsigned long int Xoriginal_ims;

#ifdef SUPPORT_UCP
  int Xprop_type;
  int Xprop_value;
  int Xprop_fail_result;
  int Xprop_category;
  int Xprop_chartype;
  int Xprop_script;
  int Xoclength;
  uschar Xocchars[8];
#endif

  int Xcodelink;
  int Xctype;
  unsigned int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  int Xnumber;
  int Xoffset;
  int Xop;
  int Xsave_capture_last;
  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Where to jump back to */

  int Xwhere;

} heapframe;

#endif
388
389
390 /***************************************************************************
391 ***************************************************************************/
392
393
394
395 /*************************************************
396 * Match from current position *
397 *************************************************/
398
399 /* This function is called recursively in many circumstances. Whenever it
400 returns a negative (error) response, the outer incarnation must also return the
401 same response. */
402
/* These macros pack up tests that are used for partial matching, and which
appear several times in the code. We set the "hit end" flag if the pointer is
at the end of the subject and also past the start of the match (i.e.
something has been matched). For hard partial matching (md->partial greater
than 1), we then return PCRE_ERROR_PARTIAL immediately. The second one is used
when we already know we are past the end of the subject, so it omits the
end-of-subject test.

Both macros expand to a bare "if" statement and use RRETURN, so they can be
used only inside match(), where md, eptr, and mstart are in scope. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > mstart)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
    }
423
424
425 /* Performance note: It might be tempting to extract commonly used fields from
426 the md structure (e.g. utf8, end_subject) into individual variables to improve
427 performance. Tests using gcc on a SPARC disproved this; in the first case, it
428 made performance worse.
429
430 Arguments:
431 eptr pointer to current character in subject
432 ecode pointer to current position in compiled code
433 mstart pointer to the current match start position (can be modified
434 by encountering \K)
435 offset_top current top pointer
436 md pointer to "static" info for the match
437 ims current /i, /m, and /s options
438 eptrb pointer to chain of blocks containing eptr at start of
439 brackets - for testing for empty matches
440 flags can contain
441 match_condassert - this is an assertion condition
442 match_cbegroup - this is the start of an unlimited repeat
443 group that can match an empty string
444 rdepth the recursion depth
445
446 Returns: MATCH_MATCH if matched ) these values are >= 0
447 MATCH_NOMATCH if failed to match )
448 a negative PCRE_ERROR_xxx value if aborted by an error condition
449 (e.g. stopped by repeated call or recursion limit)
450 */
451
452 static int
453 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
454 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
455 int flags, unsigned int rdepth)
456 {
457 /* These variables do not need to be preserved over recursion in this function,
458 so they can be ordinary variables in all cases. Mark some of them with
459 "register" because they are used a lot in loops. */
460
461 register int rrc; /* Returns from recursive calls */
462 register int i; /* Used for loops not involving calls to RMATCH() */
463 register unsigned int c; /* Character values not kept over RMATCH() calls */
464 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
465
466 BOOL minimize, possessive; /* Quantifier options */
467 int condcode;
468
469 /* When recursion is not being used, all "local" variables that have to be
470 preserved over calls to RMATCH() are part of a "frame" which is obtained from
471 heap storage. Set up the top-level frame here; others are obtained from the
472 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
473
474 #ifdef NO_RECURSE
475 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
476 frame->Xprevframe = NULL; /* Marks the top level */
477
478 /* Copy in the original argument variables */
479
480 frame->Xeptr = eptr;
481 frame->Xecode = ecode;
482 frame->Xmstart = mstart;
483 frame->Xoffset_top = offset_top;
484 frame->Xims = ims;
485 frame->Xeptrb = eptrb;
486 frame->Xflags = flags;
487 frame->Xrdepth = rdepth;
488
489 /* This is where control jumps back to to effect "recursion" */
490
491 HEAP_RECURSE:
492
493 /* Macros make the argument variables come from the current frame */
494
495 #define eptr frame->Xeptr
496 #define ecode frame->Xecode
497 #define mstart frame->Xmstart
498 #define offset_top frame->Xoffset_top
499 #define ims frame->Xims
500 #define eptrb frame->Xeptrb
501 #define flags frame->Xflags
502 #define rdepth frame->Xrdepth
503
504 /* Ditto for the local variables */
505
506 #ifdef SUPPORT_UTF8
507 #define charptr frame->Xcharptr
508 #endif
509 #define callpat frame->Xcallpat
510 #define codelink frame->Xcodelink
511 #define data frame->Xdata
512 #define next frame->Xnext
513 #define pp frame->Xpp
514 #define prev frame->Xprev
515 #define saved_eptr frame->Xsaved_eptr
516
517 #define new_recursive frame->Xnew_recursive
518
519 #define cur_is_word frame->Xcur_is_word
520 #define condition frame->Xcondition
521 #define prev_is_word frame->Xprev_is_word
522
523 #define original_ims frame->Xoriginal_ims
524
525 #ifdef SUPPORT_UCP
526 #define prop_type frame->Xprop_type
527 #define prop_value frame->Xprop_value
528 #define prop_fail_result frame->Xprop_fail_result
529 #define prop_category frame->Xprop_category
530 #define prop_chartype frame->Xprop_chartype
531 #define prop_script frame->Xprop_script
532 #define oclength frame->Xoclength
533 #define occhars frame->Xocchars
534 #endif
535
536 #define ctype frame->Xctype
537 #define fc frame->Xfc
538 #define fi frame->Xfi
539 #define length frame->Xlength
540 #define max frame->Xmax
541 #define min frame->Xmin
542 #define number frame->Xnumber
543 #define offset frame->Xoffset
544 #define op frame->Xop
545 #define save_capture_last frame->Xsave_capture_last
546 #define save_offset1 frame->Xsave_offset1
547 #define save_offset2 frame->Xsave_offset2
548 #define save_offset3 frame->Xsave_offset3
549 #define stacksave frame->Xstacksave
550
551 #define newptrb frame->Xnewptrb
552
553 /* When recursion is being used, local variables are allocated on the stack and
554 get preserved during recursion in the normal way. In this environment, fi and
555 i, and fc and c, can be the same variables. */
556
557 #else /* NO_RECURSE not defined */
558 #define fi i
559 #define fc c
560
561
562 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
563 const uschar *charptr; /* in small blocks of the code. My normal */
564 #endif /* style of coding would have declared */
565 const uschar *callpat; /* them within each of those blocks. */
566 const uschar *data; /* However, in order to accommodate the */
567 const uschar *next; /* version of this code that uses an */
568 USPTR pp; /* external "stack" implemented on the */
569 const uschar *prev; /* heap, it is easier to declare them all */
570 USPTR saved_eptr; /* here, so the declarations can be cut */
571 /* out in a block. The only declarations */
572 recursion_info new_recursive; /* within blocks below are for variables */
573 /* that do not have to be preserved over */
574 BOOL cur_is_word; /* a recursive call to RMATCH(). */
575 BOOL condition;
576 BOOL prev_is_word;
577
578 unsigned long int original_ims;
579
580 #ifdef SUPPORT_UCP
581 int prop_type;
582 int prop_value;
583 int prop_fail_result;
584 int prop_category;
585 int prop_chartype;
586 int prop_script;
587 int oclength;
588 uschar occhars[8];
589 #endif
590
591 int codelink;
592 int ctype;
593 int length;
594 int max;
595 int min;
596 int number;
597 int offset;
598 int op;
599 int save_capture_last;
600 int save_offset1, save_offset2, save_offset3;
601 int stacksave[REC_STACK_SAVE_MAX];
602
603 eptrblock newptrb;
604 #endif /* NO_RECURSE */
605
606 /* These statements are here to stop the compiler complaining about uninitialized
607 variables. */
608
609 #ifdef SUPPORT_UCP
610 prop_value = 0;
611 prop_fail_result = 0;
612 #endif
613
614
615 /* This label is used for tail recursion, which is used in a few cases even
616 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
617 used. Thanks to Ian Taylor for noticing this possibility and sending the
618 original patch. */
619
620 TAIL_RECURSE:
621
622 /* OK, now we can get on with the real code of the function. Recursive calls
623 are specified by the macro RMATCH and RRETURN is used to return. When
624 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
625 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
626 defined). However, RMATCH isn't like a function call because it's quite a
627 complicated macro. It has to be used in one particular way. This shouldn't,
628 however, impact performance when true recursion is being used. */
629
630 #ifdef SUPPORT_UTF8
631 utf8 = md->utf8; /* Local copy of the flag */
632 #else
633 utf8 = FALSE;
634 #endif
635
636 /* First check that we haven't called match() too many times, or that we
637 haven't exceeded the recursive call limit. */
638
639 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
640 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
641
642 original_ims = ims; /* Save for resetting on ')' */
643
644 /* At the start of a group with an unlimited repeat that may match an empty
645 string, the match_cbegroup flag is set. When this is the case, add the current
646 subject pointer to the chain of such remembered pointers, to be checked when we
647 hit the closing ket, in order to break infinite loops that match no characters.
648 When match() is called in other circumstances, don't add to the chain. The
649 match_cbegroup flag must NOT be used with tail recursion, because the memory
650 block that is used is on the stack, so a new one may be required for each
651 match(). */
652
653 if ((flags & match_cbegroup) != 0)
654 {
655 newptrb.epb_saved_eptr = eptr;
656 newptrb.epb_prev = eptrb;
657 eptrb = &newptrb;
658 }
659
660 /* Now start processing the opcodes. */
661
662 for (;;)
663 {
664 minimize = possessive = FALSE;
665 op = *ecode;
666
667 switch(op)
668 {
669 case OP_FAIL:
670 RRETURN(MATCH_NOMATCH);
671
672 case OP_PRUNE:
673 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
674 ims, eptrb, flags, RM51);
675 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
676 RRETURN(MATCH_PRUNE);
677
678 case OP_COMMIT:
679 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
680 ims, eptrb, flags, RM52);
681 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
682 RRETURN(MATCH_COMMIT);
683
684 case OP_SKIP:
685 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
686 ims, eptrb, flags, RM53);
687 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
688 md->start_match_ptr = eptr; /* Pass back current position */
689 RRETURN(MATCH_SKIP);
690
691 case OP_THEN:
692 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
693 ims, eptrb, flags, RM54);
694 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
695 RRETURN(MATCH_THEN);
696
697 /* Handle a capturing bracket. If there is space in the offset vector, save
698 the current subject position in the working slot at the top of the vector.
699 We mustn't change the current values of the data slot, because they may be
700 set from a previous iteration of this group, and be referred to by a
701 reference inside the group.
702
703 If the bracket fails to match, we need to restore this value and also the
704 values of the final offsets, in case they were set by a previous iteration
705 of the same bracket.
706
707 If there isn't enough space in the offset vector, treat this as if it were
708 a non-capturing bracket. Don't worry about setting the flag for the error
709 case here; that is handled in the code for KET. */
710
711 case OP_CBRA:
712 case OP_SCBRA:
713 number = GET2(ecode, 1+LINK_SIZE);
714 offset = number << 1;
715
716 #ifdef PCRE_DEBUG
717 printf("start bracket %d\n", number);
718 printf("subject=");
719 pchars(eptr, 16, TRUE, md);
720 printf("\n");
721 #endif
722
723 if (offset < md->offset_max)
724 {
725 save_offset1 = md->offset_vector[offset];
726 save_offset2 = md->offset_vector[offset+1];
727 save_offset3 = md->offset_vector[md->offset_end - number];
728 save_capture_last = md->capture_last;
729
730 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
731 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
732
733 flags = (op == OP_SCBRA)? match_cbegroup : 0;
734 do
735 {
736 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
737 ims, eptrb, flags, RM1);
738 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
739 md->capture_last = save_capture_last;
740 ecode += GET(ecode, 1);
741 }
742 while (*ecode == OP_ALT);
743
744 DPRINTF(("bracket %d failed\n", number));
745
746 md->offset_vector[offset] = save_offset1;
747 md->offset_vector[offset+1] = save_offset2;
748 md->offset_vector[md->offset_end - number] = save_offset3;
749
750 RRETURN(MATCH_NOMATCH);
751 }
752
753 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
754 as a non-capturing bracket. */
755
756 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
757 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
758
759 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
760
761 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
762 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
763
764 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
765 final alternative within the brackets, we would return the result of a
766 recursive call to match() whatever happened. We can reduce stack usage by
767 turning this into a tail recursion, except in the case when match_cbegroup
768 is set.*/
769
770 case OP_BRA:
771 case OP_SBRA:
772 DPRINTF(("start non-capturing bracket\n"));
773 flags = (op >= OP_SBRA)? match_cbegroup : 0;
774 for (;;)
775 {
776 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
777 {
778 if (flags == 0) /* Not a possibly empty group */
779 {
780 ecode += _pcre_OP_lengths[*ecode];
781 DPRINTF(("bracket 0 tail recursion\n"));
782 goto TAIL_RECURSE;
783 }
784
785 /* Possibly empty group; can't use tail recursion. */
786
787 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
788 eptrb, flags, RM48);
789 RRETURN(rrc);
790 }
791
792 /* For non-final alternatives, continue the loop for a NOMATCH result;
793 otherwise return. */
794
795 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
796 eptrb, flags, RM2);
797 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
798 ecode += GET(ecode, 1);
799 }
800 /* Control never reaches here. */
801
802 /* Conditional group: compilation checked that there are no more than
803 two branches. If the condition is false, skipping the first branch takes us
804 past the end if there is only one branch, but that's OK because that is
805 exactly what going to the ket would do. As there is only one branch to be
806 obeyed, we can use tail recursion to avoid using another stack frame. */
807
808 case OP_COND:
809 case OP_SCOND:
810 codelink= GET(ecode, 1);
811
812 /* Because of the way auto-callout works during compile, a callout item is
813 inserted between OP_COND and an assertion condition. */
814
815 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
816 {
817 if (pcre_callout != NULL)
818 {
819 pcre_callout_block cb;
820 cb.version = 1; /* Version 1 of the callout block */
821 cb.callout_number = ecode[LINK_SIZE+2];
822 cb.offset_vector = md->offset_vector;
823 cb.subject = (PCRE_SPTR)md->start_subject;
824 cb.subject_length = md->end_subject - md->start_subject;
825 cb.start_match = mstart - md->start_subject;
826 cb.current_position = eptr - md->start_subject;
827 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
828 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
829 cb.capture_top = offset_top/2;
830 cb.capture_last = md->capture_last;
831 cb.callout_data = md->callout_data;
832 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
833 if (rrc < 0) RRETURN(rrc);
834 }
835 ecode += _pcre_OP_lengths[OP_CALLOUT];
836 }
837
838 condcode = ecode[LINK_SIZE+1];
839
840 /* Now see what the actual condition is */
841
842 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
843 {
844 if (md->recursive == NULL) /* Not recursing => FALSE */
845 {
846 condition = FALSE;
847 ecode += GET(ecode, 1);
848 }
849 else
850 {
851 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
852 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
853
854 /* If the test is for recursion into a specific subpattern, and it is
855 false, but the test was set up by name, scan the table to see if the
856 name refers to any other numbers, and test them. The condition is true
857 if any one is set. */
858
859 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
860 {
861 uschar *slotA = md->name_table;
862 for (i = 0; i < md->name_count; i++)
863 {
864 if (GET2(slotA, 0) == recno) break;
865 slotA += md->name_entry_size;
866 }
867
868 /* Found a name for the number - there can be only one; duplicate
869 names for different numbers are allowed, but not vice versa. First
870 scan down for duplicates. */
871
872 if (i < md->name_count)
873 {
874 uschar *slotB = slotA;
875 while (slotB > md->name_table)
876 {
877 slotB -= md->name_entry_size;
878 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
879 {
880 condition = GET2(slotB, 0) == md->recursive->group_num;
881 if (condition) break;
882 }
883 else break;
884 }
885
886 /* Scan up for duplicates */
887
888 if (!condition)
889 {
890 slotB = slotA;
891 for (i++; i < md->name_count; i++)
892 {
893 slotB += md->name_entry_size;
894 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
895 {
896 condition = GET2(slotB, 0) == md->recursive->group_num;
897 if (condition) break;
898 }
899 else break;
900 }
901 }
902 }
903 }
904
905 /* Choose branch according to the condition */
906
907 ecode += condition? 3 : GET(ecode, 1);
908 }
909 }
910
911 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
912 {
913 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
914 condition = offset < offset_top && md->offset_vector[offset] >= 0;
915
916 /* If the numbered capture is unset, but the reference was by name,
917 scan the table to see if the name refers to any other numbers, and test
918 them. The condition is true if any one is set. This is tediously similar
919 to the code above, but not close enough to try to amalgamate. */
920
921 if (!condition && condcode == OP_NCREF)
922 {
923 int refno = offset >> 1;
924 uschar *slotA = md->name_table;
925
926 for (i = 0; i < md->name_count; i++)
927 {
928 if (GET2(slotA, 0) == refno) break;
929 slotA += md->name_entry_size;
930 }
931
932 /* Found a name for the number - there can be only one; duplicate names
933 for different numbers are allowed, but not vice versa. First scan down
934 for duplicates. */
935
936 if (i < md->name_count)
937 {
938 uschar *slotB = slotA;
939 while (slotB > md->name_table)
940 {
941 slotB -= md->name_entry_size;
942 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
943 {
944 offset = GET2(slotB, 0) << 1;
945 condition = offset < offset_top &&
946 md->offset_vector[offset] >= 0;
947 if (condition) break;
948 }
949 else break;
950 }
951
952 /* Scan up for duplicates */
953
954 if (!condition)
955 {
956 slotB = slotA;
957 for (i++; i < md->name_count; i++)
958 {
959 slotB += md->name_entry_size;
960 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
961 {
962 offset = GET2(slotB, 0) << 1;
963 condition = offset < offset_top &&
964 md->offset_vector[offset] >= 0;
965 if (condition) break;
966 }
967 else break;
968 }
969 }
970 }
971 }
972
973 /* Choose branch according to the condition */
974
975 ecode += condition? 3 : GET(ecode, 1);
976 }
977
978 else if (condcode == OP_DEF) /* DEFINE - always false */
979 {
980 condition = FALSE;
981 ecode += GET(ecode, 1);
982 }
983
984 /* The condition is an assertion. Call match() to evaluate it - setting
985 the final argument match_condassert causes it to stop at the end of an
986 assertion. */
987
988 else
989 {
990 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
991 match_condassert, RM3);
992 if (rrc == MATCH_MATCH)
993 {
994 condition = TRUE;
995 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
996 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
997 }
998 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
999 {
1000 RRETURN(rrc); /* Need braces because of following else */
1001 }
1002 else
1003 {
1004 condition = FALSE;
1005 ecode += codelink;
1006 }
1007 }
1008
1009 /* We are now at the branch that is to be obeyed. As there is only one,
1010 we can use tail recursion to avoid using another stack frame, except when
1011 match_cbegroup is required for an unlimited repeat of a possibly empty
1012 group. If the second alternative doesn't exist, we can just plough on. */
1013
1014 if (condition || *ecode == OP_ALT)
1015 {
1016 ecode += 1 + LINK_SIZE;
1017 if (op == OP_SCOND) /* Possibly empty group */
1018 {
1019 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1020 RRETURN(rrc);
1021 }
1022 else /* Group must match something */
1023 {
1024 flags = 0;
1025 goto TAIL_RECURSE;
1026 }
1027 }
1028 else /* Condition false & no alternative */
1029 {
1030 ecode += 1 + LINK_SIZE;
1031 }
1032 break;
1033
1034
1035 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1036 to close any currently open capturing brackets. */
1037
1038 case OP_CLOSE:
1039 number = GET2(ecode, 1);
1040 offset = number << 1;
1041
1042 #ifdef PCRE_DEBUG
1043 printf("end bracket %d at *ACCEPT", number);
1044 printf("\n");
1045 #endif
1046
1047 md->capture_last = number;
1048 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1049 {
1050 md->offset_vector[offset] =
1051 md->offset_vector[md->offset_end - number];
1052 md->offset_vector[offset+1] = eptr - md->start_subject;
1053 if (offset_top <= offset) offset_top = offset + 2;
1054 }
1055 ecode += 3;
1056 break;
1057
1058
1059 /* End of the pattern, either real or forced. If we are in a top-level
1060 recursion, we should restore the offsets appropriately and continue from
1061 after the call. */
1062
1063 case OP_ACCEPT:
1064 case OP_END:
1065 if (md->recursive != NULL && md->recursive->group_num == 0)
1066 {
1067 recursion_info *rec = md->recursive;
1068 DPRINTF(("End of pattern in a (?0) recursion\n"));
1069 md->recursive = rec->prevrec;
1070 memmove(md->offset_vector, rec->offset_save,
1071 rec->saved_max * sizeof(int));
1072 offset_top = rec->save_offset_top;
1073 mstart = rec->save_start;
1074 ims = original_ims;
1075 ecode = rec->after_call;
1076 break;
1077 }
1078
1079 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1080 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1081 the subject. In both cases, backtracking will then try other alternatives,
1082 if any. */
1083
1084 if (eptr == mstart &&
1085 (md->notempty ||
1086 (md->notempty_atstart &&
1087 mstart == md->start_subject + md->start_offset)))
1088 RRETURN(MATCH_NOMATCH);
1089
1090 /* Otherwise, we have a match. */
1091
1092 md->end_match_ptr = eptr; /* Record where we ended */
1093 md->end_offset_top = offset_top; /* and how many extracts were taken */
1094 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1095 RRETURN(MATCH_MATCH);
1096
1097 /* Change option settings */
1098
1099 case OP_OPT:
1100 ims = ecode[1];
1101 ecode += 2;
1102 DPRINTF(("ims set to %02lx\n", ims));
1103 break;
1104
1105 /* Assertion brackets. Check the alternative branches in turn - the
1106 matching won't pass the KET for an assertion. If any one branch matches,
1107 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1108 start of each branch to move the current point backwards, so the code at
1109 this level is identical to the lookahead case. */
1110
1111 case OP_ASSERT:
1112 case OP_ASSERTBACK:
1113 do
1114 {
1115 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1116 RM4);
1117 if (rrc == MATCH_MATCH) break;
1118 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1119 ecode += GET(ecode, 1);
1120 }
1121 while (*ecode == OP_ALT);
1122 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1123
1124 /* If checking an assertion for a condition, return MATCH_MATCH. */
1125
1126 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1127
1128 /* Continue from after the assertion, updating the offsets high water
1129 mark, since extracts may have been taken during the assertion. */
1130
1131 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1132 ecode += 1 + LINK_SIZE;
1133 offset_top = md->end_offset_top;
1134 continue;
1135
1136 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1137 PRUNE, or COMMIT means we must assume failure without checking subsequent
1138 branches. */
1139
1140 case OP_ASSERT_NOT:
1141 case OP_ASSERTBACK_NOT:
1142 do
1143 {
1144 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1145 RM5);
1146 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
1147 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1148 {
1149 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1150 break;
1151 }
1152 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1153 ecode += GET(ecode,1);
1154 }
1155 while (*ecode == OP_ALT);
1156
1157 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1158
1159 ecode += 1 + LINK_SIZE;
1160 continue;
1161
1162 /* Move the subject pointer back. This occurs only at the start of
1163 each branch of a lookbehind assertion. If we are too close to the start to
1164 move back, this match function fails. When working with UTF-8 we move
1165 back a number of characters, not bytes. */
1166
1167 case OP_REVERSE:
1168 #ifdef SUPPORT_UTF8
1169 if (utf8)
1170 {
1171 i = GET(ecode, 1);
1172 while (i-- > 0)
1173 {
1174 eptr--;
1175 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1176 BACKCHAR(eptr);
1177 }
1178 }
1179 else
1180 #endif
1181
1182 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1183
1184 {
1185 eptr -= GET(ecode, 1);
1186 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1187 }
1188
1189 /* Save the earliest consulted character, then skip to next op code */
1190
1191 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1192 ecode += 1 + LINK_SIZE;
1193 break;
1194
1195 /* The callout item calls an external function, if one is provided, passing
1196 details of the match so far. This is mainly for debugging, though the
1197 function is able to force a failure. */
1198
1199 case OP_CALLOUT:
1200 if (pcre_callout != NULL)
1201 {
1202 pcre_callout_block cb;
1203 cb.version = 1; /* Version 1 of the callout block */
1204 cb.callout_number = ecode[1];
1205 cb.offset_vector = md->offset_vector;
1206 cb.subject = (PCRE_SPTR)md->start_subject;
1207 cb.subject_length = md->end_subject - md->start_subject;
1208 cb.start_match = mstart - md->start_subject;
1209 cb.current_position = eptr - md->start_subject;
1210 cb.pattern_position = GET(ecode, 2);
1211 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1212 cb.capture_top = offset_top/2;
1213 cb.capture_last = md->capture_last;
1214 cb.callout_data = md->callout_data;
1215 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1216 if (rrc < 0) RRETURN(rrc);
1217 }
1218 ecode += 2 + 2*LINK_SIZE;
1219 break;
1220
1221 /* Recursion either matches the current regex, or some subexpression. The
1222 offset data is the offset to the starting bracket from the start of the
1223 whole pattern. (This is so that it works from duplicated subpatterns.)
1224
1225 If there are any capturing brackets started but not finished, we have to
1226 save their starting points and reinstate them after the recursion. However,
1227 we don't know how many such there are (offset_top records the completed
1228 total) so we just have to save all the potential data. There may be up to
1229 65535 such values, which is too large to put on the stack, but using malloc
1230 for small numbers seems expensive. As a compromise, the stack is used when
1231 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1232 is used. If the malloc fails, no attempt is made to carry on with partial
1233 data: the recursion is abandoned and PCRE_ERROR_NOMEMORY is returned by
1234 way of RRETURN.
1235
1236 There are also other values that have to be saved. We use a chained
1237 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1238 for the original version of this logic. */
1239
1240 case OP_RECURSE:
1241 {
1242 callpat = md->start_code + GET(ecode, 1);
1243 new_recursive.group_num = (callpat == md->start_code)? 0 :
1244 GET2(callpat, 1 + LINK_SIZE);
1245
1246 /* Add to "recursing stack" */
1247
1248 new_recursive.prevrec = md->recursive;
1249 md->recursive = &new_recursive;
1250
1251 /* Find where to continue from afterwards */
1252
1253 ecode += 1 + LINK_SIZE;
1254 new_recursive.after_call = ecode;
1255
1256 /* Now save the offset data. */
1257
1258 new_recursive.saved_max = md->offset_end;
1259 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1260 new_recursive.offset_save = stacksave;
1261 else
1262 {
1263 new_recursive.offset_save =
1264 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1265 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1266 }
1267
1268 memcpy(new_recursive.offset_save, md->offset_vector,
1269 new_recursive.saved_max * sizeof(int));
1270 new_recursive.save_start = mstart;
1271 new_recursive.save_offset_top = offset_top;
1272 mstart = eptr;
1273
1274 /* OK, now we can do the recursion. For each top-level alternative we
1275 restore the offset and recursion data. */
1276
1277 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1278 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1279 do
1280 {
1281 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1282 md, ims, eptrb, flags, RM6);
1283 if (rrc == MATCH_MATCH)
1284 {
1285 DPRINTF(("Recursion matched\n"));
1286 md->recursive = new_recursive.prevrec;
1287 if (new_recursive.offset_save != stacksave)
1288 (pcre_free)(new_recursive.offset_save);
1289 RRETURN(MATCH_MATCH);
1290 }
1291 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1292 {
1293 DPRINTF(("Recursion gave error %d\n", rrc));
1294 if (new_recursive.offset_save != stacksave)
1295 (pcre_free)(new_recursive.offset_save);
1296 RRETURN(rrc);
1297 }
1298
1299 md->recursive = &new_recursive;
1300 memcpy(md->offset_vector, new_recursive.offset_save,
1301 new_recursive.saved_max * sizeof(int));
1302 callpat += GET(callpat, 1);
1303 }
1304 while (*callpat == OP_ALT);
1305
1306 DPRINTF(("Recursion didn't match\n"));
1307 md->recursive = new_recursive.prevrec;
1308 if (new_recursive.offset_save != stacksave)
1309 (pcre_free)(new_recursive.offset_save);
1310 RRETURN(MATCH_NOMATCH);
1311 }
1312 /* Control never reaches here */
1313
1314 /* "Once" brackets are like assertion brackets except that after a match,
1315 the point in the subject string is not moved back. Thus there can never be
1316 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1317 Check the alternative branches in turn - the matching won't pass the KET
1318 for this kind of subpattern. If any one branch matches, we carry on as at
1319 the end of a normal bracket, leaving the subject pointer. */
1320
1321 case OP_ONCE:
1322 prev = ecode;
1323 saved_eptr = eptr;
1324
1325 do
1326 {
1327 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1328 if (rrc == MATCH_MATCH) break;
1329 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1330 ecode += GET(ecode,1);
1331 }
1332 while (*ecode == OP_ALT);
1333
1334 /* If hit the end of the group (which could be repeated), fail */
1335
1336 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1337
1338 /* Continue from after the atomic group, updating the offsets high water
1339 mark, since extracts may have been taken. */
1340
1341 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1342
1343 offset_top = md->end_offset_top;
1344 eptr = md->end_match_ptr;
1345
1346 /* For a non-repeating ket, just continue at this level. This also
1347 happens for a repeating ket if no characters were matched in the group.
1348 This is the forcible breaking of infinite loops as implemented in Perl
1349 5.005. If there is an options reset, it will get obeyed in the normal
1350 course of events. */
1351
1352 if (*ecode == OP_KET || eptr == saved_eptr)
1353 {
1354 ecode += 1+LINK_SIZE;
1355 break;
1356 }
1357
1358 /* The repeating kets try the rest of the pattern or restart from the
1359 preceding bracket, in the appropriate order. The second "call" of match()
1360 uses tail recursion, to avoid using another stack frame. We need to reset
1361 any options that changed within the bracket before re-running it, so
1362 check the next opcode. */
1363
1364 if (ecode[1+LINK_SIZE] == OP_OPT)
1365 {
1366 ims = (ims & ~PCRE_IMS) | ecode[4];
1367 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1368 }
1369
1370 if (*ecode == OP_KETRMIN)
1371 {
1372 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1373 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1374 ecode = prev;
1375 flags = 0;
1376 goto TAIL_RECURSE;
1377 }
1378 else /* OP_KETRMAX */
1379 {
1380 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1381 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1382 ecode += 1 + LINK_SIZE;
1383 flags = 0;
1384 goto TAIL_RECURSE;
1385 }
1386 /* Control never gets here */
1387
1388 /* An alternation is the end of a branch; scan along to find the end of the
1389 bracketed group and go to there. */
1390
1391 case OP_ALT:
1392 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1393 break;
1394
1395 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1396 indicating that it may occur zero times. It may repeat infinitely, or not
1397 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1398 with fixed upper repeat limits are compiled as a number of copies, with the
1399 optional ones preceded by BRAZERO or BRAMINZERO. */
1400
1401 case OP_BRAZERO:
1402 {
1403 next = ecode+1;
1404 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1405 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1406 do next += GET(next,1); while (*next == OP_ALT);
1407 ecode = next + 1 + LINK_SIZE;
1408 }
1409 break;
1410
1411 case OP_BRAMINZERO:
1412 {
1413 next = ecode+1;
1414 do next += GET(next, 1); while (*next == OP_ALT);
1415 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1416 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1417 ecode++;
1418 }
1419 break;
1420
1421 case OP_SKIPZERO:
1422 {
1423 next = ecode+1;
1424 do next += GET(next,1); while (*next == OP_ALT);
1425 ecode = next + 1 + LINK_SIZE;
1426 }
1427 break;
1428
1429 /* End of a group, repeated or non-repeating. */
1430
1431 case OP_KET:
1432 case OP_KETRMIN:
1433 case OP_KETRMAX:
1434 prev = ecode - GET(ecode, 1);
1435
1436 /* If this was a group that remembered the subject start, in order to break
1437 infinite repeats of empty string matches, retrieve the subject start from
1438 the chain. Otherwise, set it NULL. */
1439
1440 if (*prev >= OP_SBRA)
1441 {
1442 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1443 eptrb = eptrb->epb_prev; /* Backup to previous group */
1444 }
1445 else saved_eptr = NULL;
1446
1447 /* If we are at the end of an assertion group, stop matching and return
1448 MATCH_MATCH, but record the current high water mark for use by positive
1449 assertions. Do this also for the "once" (atomic) groups. */
1450
1451 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1452 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1453 *prev == OP_ONCE)
1454 {
1455 md->end_match_ptr = eptr; /* For ONCE */
1456 md->end_offset_top = offset_top;
1457 RRETURN(MATCH_MATCH);
1458 }
1459
1460 /* For capturing groups we have to check the group number back at the start
1461 and if necessary complete handling an extraction by setting the offsets and
1462 bumping the high water mark. Note that whole-pattern recursion is coded as
1463 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1464 when the OP_END is reached. Other recursion is handled here. */
1465
1466 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1467 {
1468 number = GET2(prev, 1+LINK_SIZE);
1469 offset = number << 1;
1470
1471 #ifdef PCRE_DEBUG
1472 printf("end bracket %d", number);
1473 printf("\n");
1474 #endif
1475
1476 md->capture_last = number;
1477 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1478 {
1479 md->offset_vector[offset] =
1480 md->offset_vector[md->offset_end - number];
1481 md->offset_vector[offset+1] = eptr - md->start_subject;
1482 if (offset_top <= offset) offset_top = offset + 2;
1483 }
1484
1485 /* Handle a recursively called group. Restore the offsets
1486 appropriately and continue from after the call. */
1487
1488 if (md->recursive != NULL && md->recursive->group_num == number)
1489 {
1490 recursion_info *rec = md->recursive;
1491 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1492 md->recursive = rec->prevrec;
1493 mstart = rec->save_start;
1494 memcpy(md->offset_vector, rec->offset_save,
1495 rec->saved_max * sizeof(int));
1496 offset_top = rec->save_offset_top;
1497 ecode = rec->after_call;
1498 ims = original_ims;
1499 break;
1500 }
1501 }
1502
1503 /* For both capturing and non-capturing groups, reset the value of the ims
1504 flags, in case they got changed during the group. */
1505
1506 ims = original_ims;
1507 DPRINTF(("ims reset to %02lx\n", ims));
1508
1509 /* For a non-repeating ket, just continue at this level. This also
1510 happens for a repeating ket if no characters were matched in the group.
1511 This is the forcible breaking of infinite loops as implemented in Perl
1512 5.005. If there is an options reset, it will get obeyed in the normal
1513 course of events. */
1514
1515 if (*ecode == OP_KET || eptr == saved_eptr)
1516 {
1517 ecode += 1 + LINK_SIZE;
1518 break;
1519 }
1520
1521 /* The repeating kets try the rest of the pattern or restart from the
1522 preceding bracket, in the appropriate order. In the second case, we can use
1523 tail recursion to avoid using another stack frame, unless we have an
1524 unlimited repeat of a group that can match an empty string. */
1525
1526 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1527
1528 if (*ecode == OP_KETRMIN)
1529 {
1530 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1531 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1532 if (flags != 0) /* Could match an empty string */
1533 {
1534 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1535 RRETURN(rrc);
1536 }
1537 ecode = prev;
1538 goto TAIL_RECURSE;
1539 }
1540 else /* OP_KETRMAX */
1541 {
1542 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1543 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1544 ecode += 1 + LINK_SIZE;
1545 flags = 0;
1546 goto TAIL_RECURSE;
1547 }
1548 /* Control never gets here */
1549
1550 /* Start of subject unless notbol, or after internal newline if multiline */
1551
1552 case OP_CIRC:
1553 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1554 if ((ims & PCRE_MULTILINE) != 0)
1555 {
1556 if (eptr != md->start_subject &&
1557 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1558 RRETURN(MATCH_NOMATCH);
1559 ecode++;
1560 break;
1561 }
1562 /* ... else fall through */
1563
1564 /* Start of subject assertion */
1565
1566 case OP_SOD:
1567 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1568 ecode++;
1569 break;
1570
1571 /* Start of match assertion */
1572
1573 case OP_SOM:
1574 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1575 ecode++;
1576 break;
1577
1578 /* Reset the start of match point */
1579
1580 case OP_SET_SOM:
1581 mstart = eptr;
1582 ecode++;
1583 break;
1584
1585 /* Assert before internal newline if multiline, or before a terminating
1586 newline unless endonly is set, else end of subject unless noteol is set. */
1587
1588 case OP_DOLL:
1589 if ((ims & PCRE_MULTILINE) != 0)
1590 {
1591 if (eptr < md->end_subject)
1592 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1593 else
1594 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1595 ecode++;
1596 break;
1597 }
1598 else
1599 {
1600 if (md->noteol) RRETURN(MATCH_NOMATCH);
1601 if (!md->endonly)
1602 {
1603 if (eptr != md->end_subject &&
1604 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1605 RRETURN(MATCH_NOMATCH);
1606 ecode++;
1607 break;
1608 }
1609 }
1610 /* ... else fall through for endonly */
1611
1612 /* End of subject assertion (\z) */
1613
1614 case OP_EOD:
1615 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1616 ecode++;
1617 break;
1618
1619 /* End of subject or ending \n assertion (\Z) */
1620
1621 case OP_EODN:
1622 if (eptr != md->end_subject &&
1623 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1624 RRETURN(MATCH_NOMATCH);
1625 ecode++;
1626 break;
1627
1628 /* Word boundary assertions */
1629
1630 case OP_NOT_WORD_BOUNDARY:
1631 case OP_WORD_BOUNDARY:
1632 {
1633
1634 /* Find out if the previous and current characters are "word" characters.
1635 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1636 be "non-word" characters. Remember the earliest consulted character for
1637 partial matching. */
1638
1639 #ifdef SUPPORT_UTF8
1640 if (utf8)
1641 {
1642 if (eptr == md->start_subject) prev_is_word = FALSE; else
1643 {
1644 USPTR lastptr = eptr - 1;
1645 while((*lastptr & 0xc0) == 0x80) lastptr--;
1646 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1647 GETCHAR(c, lastptr);
1648 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1649 }
1650 if (eptr >= md->end_subject)
1651 {
1652 SCHECK_PARTIAL();
1653 cur_is_word = FALSE;
1654 }
1655 else
1656 {
1657 GETCHAR(c, eptr);
1658 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1659 }
1660 }
1661 else
1662 #endif
1663
1664 /* Not in UTF-8 mode */
1665
1666 {
1667 if (eptr == md->start_subject) prev_is_word = FALSE; else
1668 {
1669 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1670 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1671 }
1672 if (eptr >= md->end_subject)
1673 {
1674 SCHECK_PARTIAL();
1675 cur_is_word = FALSE;
1676 }
1677 else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1678 }
1679
1680 /* Now see if the situation is what we want */
1681
1682 if ((*ecode++ == OP_WORD_BOUNDARY)?
1683 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1684 RRETURN(MATCH_NOMATCH);
1685 }
1686 break;
1687
1688 /* Match a single character type; inline for speed */
1689
1690 case OP_ANY:
1691 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1692 /* Fall through */
1693
1694 case OP_ALLANY:
1695 if (eptr++ >= md->end_subject)
1696 {
1697 SCHECK_PARTIAL();
1698 RRETURN(MATCH_NOMATCH);
1699 }
1700 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1701 ecode++;
1702 break;
1703
1704 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1705 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1706
1707 case OP_ANYBYTE:
1708 if (eptr++ >= md->end_subject)
1709 {
1710 SCHECK_PARTIAL();
1711 RRETURN(MATCH_NOMATCH);
1712 }
1713 ecode++;
1714 break;
1715
1716 case OP_NOT_DIGIT:
1717 if (eptr >= md->end_subject)
1718 {
1719 SCHECK_PARTIAL();
1720 RRETURN(MATCH_NOMATCH);
1721 }
1722 GETCHARINCTEST(c, eptr);
1723 if (
1724 #ifdef SUPPORT_UTF8
1725 c < 256 &&
1726 #endif
1727 (md->ctypes[c] & ctype_digit) != 0
1728 )
1729 RRETURN(MATCH_NOMATCH);
1730 ecode++;
1731 break;
1732
1733 case OP_DIGIT:
1734 if (eptr >= md->end_subject)
1735 {
1736 SCHECK_PARTIAL();
1737 RRETURN(MATCH_NOMATCH);
1738 }
1739 GETCHARINCTEST(c, eptr);
1740 if (
1741 #ifdef SUPPORT_UTF8
1742 c >= 256 ||
1743 #endif
1744 (md->ctypes[c] & ctype_digit) == 0
1745 )
1746 RRETURN(MATCH_NOMATCH);
1747 ecode++;
1748 break;
1749
1750 case OP_NOT_WHITESPACE:
1751 if (eptr >= md->end_subject)
1752 {
1753 SCHECK_PARTIAL();
1754 RRETURN(MATCH_NOMATCH);
1755 }
1756 GETCHARINCTEST(c, eptr);
1757 if (
1758 #ifdef SUPPORT_UTF8
1759 c < 256 &&
1760 #endif
1761 (md->ctypes[c] & ctype_space) != 0
1762 )
1763 RRETURN(MATCH_NOMATCH);
1764 ecode++;
1765 break;
1766
1767 case OP_WHITESPACE:
1768 if (eptr >= md->end_subject)
1769 {
1770 SCHECK_PARTIAL();
1771 RRETURN(MATCH_NOMATCH);
1772 }
1773 GETCHARINCTEST(c, eptr);
1774 if (
1775 #ifdef SUPPORT_UTF8
1776 c >= 256 ||
1777 #endif
1778 (md->ctypes[c] & ctype_space) == 0
1779 )
1780 RRETURN(MATCH_NOMATCH);
1781 ecode++;
1782 break;
1783
1784 case OP_NOT_WORDCHAR:
1785 if (eptr >= md->end_subject)
1786 {
1787 SCHECK_PARTIAL();
1788 RRETURN(MATCH_NOMATCH);
1789 }
1790 GETCHARINCTEST(c, eptr);
1791 if (
1792 #ifdef SUPPORT_UTF8
1793 c < 256 &&
1794 #endif
1795 (md->ctypes[c] & ctype_word) != 0
1796 )
1797 RRETURN(MATCH_NOMATCH);
1798 ecode++;
1799 break;
1800
1801 case OP_WORDCHAR:
1802 if (eptr >= md->end_subject)
1803 {
1804 SCHECK_PARTIAL();
1805 RRETURN(MATCH_NOMATCH);
1806 }
1807 GETCHARINCTEST(c, eptr);
1808 if (
1809 #ifdef SUPPORT_UTF8
1810 c >= 256 ||
1811 #endif
1812 (md->ctypes[c] & ctype_word) == 0
1813 )
1814 RRETURN(MATCH_NOMATCH);
1815 ecode++;
1816 break;
1817
1818 case OP_ANYNL:
1819 if (eptr >= md->end_subject)
1820 {
1821 SCHECK_PARTIAL();
1822 RRETURN(MATCH_NOMATCH);
1823 }
1824 GETCHARINCTEST(c, eptr);
1825 switch(c)
1826 {
1827 default: RRETURN(MATCH_NOMATCH);
1828 case 0x000d:
1829 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1830 break;
1831
1832 case 0x000a:
1833 break;
1834
1835 case 0x000b:
1836 case 0x000c:
1837 case 0x0085:
1838 case 0x2028:
1839 case 0x2029:
1840 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1841 break;
1842 }
1843 ecode++;
1844 break;
1845
1846 case OP_NOT_HSPACE:
1847 if (eptr >= md->end_subject)
1848 {
1849 SCHECK_PARTIAL();
1850 RRETURN(MATCH_NOMATCH);
1851 }
1852 GETCHARINCTEST(c, eptr);
1853 switch(c)
1854 {
1855 default: break;
1856 case 0x09: /* HT */
1857 case 0x20: /* SPACE */
1858 case 0xa0: /* NBSP */
1859 case 0x1680: /* OGHAM SPACE MARK */
1860 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1861 case 0x2000: /* EN QUAD */
1862 case 0x2001: /* EM QUAD */
1863 case 0x2002: /* EN SPACE */
1864 case 0x2003: /* EM SPACE */
1865 case 0x2004: /* THREE-PER-EM SPACE */
1866 case 0x2005: /* FOUR-PER-EM SPACE */
1867 case 0x2006: /* SIX-PER-EM SPACE */
1868 case 0x2007: /* FIGURE SPACE */
1869 case 0x2008: /* PUNCTUATION SPACE */
1870 case 0x2009: /* THIN SPACE */
1871 case 0x200A: /* HAIR SPACE */
1872 case 0x202f: /* NARROW NO-BREAK SPACE */
1873 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1874 case 0x3000: /* IDEOGRAPHIC SPACE */
1875 RRETURN(MATCH_NOMATCH);
1876 }
1877 ecode++;
1878 break;
1879
1880 case OP_HSPACE:
1881 if (eptr >= md->end_subject)
1882 {
1883 SCHECK_PARTIAL();
1884 RRETURN(MATCH_NOMATCH);
1885 }
1886 GETCHARINCTEST(c, eptr);
1887 switch(c)
1888 {
1889 default: RRETURN(MATCH_NOMATCH);
1890 case 0x09: /* HT */
1891 case 0x20: /* SPACE */
1892 case 0xa0: /* NBSP */
1893 case 0x1680: /* OGHAM SPACE MARK */
1894 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1895 case 0x2000: /* EN QUAD */
1896 case 0x2001: /* EM QUAD */
1897 case 0x2002: /* EN SPACE */
1898 case 0x2003: /* EM SPACE */
1899 case 0x2004: /* THREE-PER-EM SPACE */
1900 case 0x2005: /* FOUR-PER-EM SPACE */
1901 case 0x2006: /* SIX-PER-EM SPACE */
1902 case 0x2007: /* FIGURE SPACE */
1903 case 0x2008: /* PUNCTUATION SPACE */
1904 case 0x2009: /* THIN SPACE */
1905 case 0x200A: /* HAIR SPACE */
1906 case 0x202f: /* NARROW NO-BREAK SPACE */
1907 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1908 case 0x3000: /* IDEOGRAPHIC SPACE */
1909 break;
1910 }
1911 ecode++;
1912 break;
1913
1914 case OP_NOT_VSPACE:
1915 if (eptr >= md->end_subject)
1916 {
1917 SCHECK_PARTIAL();
1918 RRETURN(MATCH_NOMATCH);
1919 }
1920 GETCHARINCTEST(c, eptr);
1921 switch(c)
1922 {
1923 default: break;
1924 case 0x0a: /* LF */
1925 case 0x0b: /* VT */
1926 case 0x0c: /* FF */
1927 case 0x0d: /* CR */
1928 case 0x85: /* NEL */
1929 case 0x2028: /* LINE SEPARATOR */
1930 case 0x2029: /* PARAGRAPH SEPARATOR */
1931 RRETURN(MATCH_NOMATCH);
1932 }
1933 ecode++;
1934 break;
1935
1936 case OP_VSPACE:
1937 if (eptr >= md->end_subject)
1938 {
1939 SCHECK_PARTIAL();
1940 RRETURN(MATCH_NOMATCH);
1941 }
1942 GETCHARINCTEST(c, eptr);
1943 switch(c)
1944 {
1945 default: RRETURN(MATCH_NOMATCH);
1946 case 0x0a: /* LF */
1947 case 0x0b: /* VT */
1948 case 0x0c: /* FF */
1949 case 0x0d: /* CR */
1950 case 0x85: /* NEL */
1951 case 0x2028: /* LINE SEPARATOR */
1952 case 0x2029: /* PARAGRAPH SEPARATOR */
1953 break;
1954 }
1955 ecode++;
1956 break;
1957
1958 #ifdef SUPPORT_UCP
1959 /* Check the next character by Unicode property. We will get here only
1960 if the support is in the binary; otherwise a compile-time error occurs. */
1961
1962 case OP_PROP:
1963 case OP_NOTPROP:
1964 if (eptr >= md->end_subject)
1965 {
1966 SCHECK_PARTIAL();
1967 RRETURN(MATCH_NOMATCH);
1968 }
1969 GETCHARINCTEST(c, eptr);
1970 {
1971 const ucd_record *prop = GET_UCD(c);
1972
1973 switch(ecode[1])
1974 {
1975 case PT_ANY:
1976 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1977 break;
1978
1979 case PT_LAMP:
1980 if ((prop->chartype == ucp_Lu ||
1981 prop->chartype == ucp_Ll ||
1982 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1983 RRETURN(MATCH_NOMATCH);
1984 break;
1985
1986 case PT_GC:
1987 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1988 RRETURN(MATCH_NOMATCH);
1989 break;
1990
1991 case PT_PC:
1992 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1993 RRETURN(MATCH_NOMATCH);
1994 break;
1995
1996 case PT_SC:
1997 if ((ecode[2] != prop->script) == (op == OP_PROP))
1998 RRETURN(MATCH_NOMATCH);
1999 break;
2000
2001 default:
2002 RRETURN(PCRE_ERROR_INTERNAL);
2003 }
2004
2005 ecode += 3;
2006 }
2007 break;
2008
2009 /* Match an extended Unicode sequence. We will get here only if the support
2010 is in the binary; otherwise a compile-time error occurs. */
2011
2012 case OP_EXTUNI:
2013 if (eptr >= md->end_subject)
2014 {
2015 SCHECK_PARTIAL();
2016 RRETURN(MATCH_NOMATCH);
2017 }
2018 GETCHARINCTEST(c, eptr);
2019 {
2020 int category = UCD_CATEGORY(c);
2021 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
2022 while (eptr < md->end_subject)
2023 {
2024 int len = 1;
2025 if (!utf8) c = *eptr; else
2026 {
2027 GETCHARLEN(c, eptr, len);
2028 }
2029 category = UCD_CATEGORY(c);
2030 if (category != ucp_M) break;
2031 eptr += len;
2032 }
2033 }
2034 ecode++;
2035 break;
2036 #endif
2037
2038
2039 /* Match a back reference, possibly repeatedly. Look past the end of the
2040 item to see if there is repeat information following. The code is similar
2041 to that for character classes, but repeated for efficiency. Then obey
2042 similar code to character type repeats - written out again for speed.
2043 However, if the referenced string is the empty string, always treat
2044 it as matched, any number of times (otherwise there could be infinite
2045 loops). */
2046
2047 case OP_REF:
2048 {
2049 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2050 ecode += 3;
2051
2052 /* If the reference is unset, there are two possibilities:
2053
2054 (a) In the default, Perl-compatible state, set the length to be longer
2055 than the amount of subject left; this ensures that every attempt at a
2056 match fails. We can't just fail here, because of the possibility of
2057 quantifiers with zero minima.
2058
2059 (b) If the JavaScript compatibility flag is set, set the length to zero
2060 so that the back reference matches an empty string.
2061
2062 Otherwise, set the length to the length of what was matched by the
2063 referenced subpattern. */
2064
2065 if (offset >= offset_top || md->offset_vector[offset] < 0)
2066 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
2067 else
2068 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2069
2070 /* Set up for repetition, or handle the non-repeated case */
2071
2072 switch (*ecode)
2073 {
2074 case OP_CRSTAR:
2075 case OP_CRMINSTAR:
2076 case OP_CRPLUS:
2077 case OP_CRMINPLUS:
2078 case OP_CRQUERY:
2079 case OP_CRMINQUERY:
2080 c = *ecode++ - OP_CRSTAR;
2081 minimize = (c & 1) != 0;
2082 min = rep_min[c]; /* Pick up values from tables; */
2083 max = rep_max[c]; /* zero for max => infinity */
2084 if (max == 0) max = INT_MAX;
2085 break;
2086
2087 case OP_CRRANGE:
2088 case OP_CRMINRANGE:
2089 minimize = (*ecode == OP_CRMINRANGE);
2090 min = GET2(ecode, 1);
2091 max = GET2(ecode, 3);
2092 if (max == 0) max = INT_MAX;
2093 ecode += 5;
2094 break;
2095
2096 default: /* No repeat follows */
2097 if (!match_ref(offset, eptr, length, md, ims))
2098 {
2099 CHECK_PARTIAL();
2100 RRETURN(MATCH_NOMATCH);
2101 }
2102 eptr += length;
2103 continue; /* With the main loop */
2104 }
2105
2106 /* If the length of the reference is zero, just continue with the
2107 main loop. */
2108
2109 if (length == 0) continue;
2110
2111 /* First, ensure the minimum number of matches are present. We get back
2112 the length of the reference string explicitly rather than passing the
2113 address of eptr, so that eptr can be a register variable. */
2114
2115 for (i = 1; i <= min; i++)
2116 {
2117 if (!match_ref(offset, eptr, length, md, ims))
2118 {
2119 CHECK_PARTIAL();
2120 RRETURN(MATCH_NOMATCH);
2121 }
2122 eptr += length;
2123 }
2124
2125 /* If min = max, continue at the same level without recursion.
2126 They are not both allowed to be zero. */
2127
2128 if (min == max) continue;
2129
2130 /* If minimizing, keep trying and advancing the pointer */
2131
2132 if (minimize)
2133 {
2134 for (fi = min;; fi++)
2135 {
2136 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2137 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2138 if (fi >= max) RRETURN(MATCH_NOMATCH);
2139 if (!match_ref(offset, eptr, length, md, ims))
2140 {
2141 CHECK_PARTIAL();
2142 RRETURN(MATCH_NOMATCH);
2143 }
2144 eptr += length;
2145 }
2146 /* Control never gets here */
2147 }
2148
2149 /* If maximizing, find the longest string and work backwards */
2150
2151 else
2152 {
2153 pp = eptr;
2154 for (i = min; i < max; i++)
2155 {
2156 if (!match_ref(offset, eptr, length, md, ims))
2157 {
2158 CHECK_PARTIAL();
2159 break;
2160 }
2161 eptr += length;
2162 }
2163 while (eptr >= pp)
2164 {
2165 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2166 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2167 eptr -= length;
2168 }
2169 RRETURN(MATCH_NOMATCH);
2170 }
2171 }
2172 /* Control never gets here */
2173
2174 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2175 used when all the characters in the class have values in the range 0-255,
2176 and either the matching is caseful, or the characters are in the range
2177 0-127 when UTF-8 processing is enabled. The only difference between
2178 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2179 encountered.
2180
2181 First, look past the end of the item to see if there is repeat information
2182 following. Then obey similar code to character type repeats - written out
2183 again for speed. */
2184
2185 case OP_NCLASS:
2186 case OP_CLASS:
2187 {
2188 data = ecode + 1; /* Save for matching */
2189 ecode += 33; /* Advance past the item */
2190
2191 switch (*ecode)
2192 {
2193 case OP_CRSTAR:
2194 case OP_CRMINSTAR:
2195 case OP_CRPLUS:
2196 case OP_CRMINPLUS:
2197 case OP_CRQUERY:
2198 case OP_CRMINQUERY:
2199 c = *ecode++ - OP_CRSTAR;
2200 minimize = (c & 1) != 0;
2201 min = rep_min[c]; /* Pick up values from tables; */
2202 max = rep_max[c]; /* zero for max => infinity */
2203 if (max == 0) max = INT_MAX;
2204 break;
2205
2206 case OP_CRRANGE:
2207 case OP_CRMINRANGE:
2208 minimize = (*ecode == OP_CRMINRANGE);
2209 min = GET2(ecode, 1);
2210 max = GET2(ecode, 3);
2211 if (max == 0) max = INT_MAX;
2212 ecode += 5;
2213 break;
2214
2215 default: /* No repeat follows */
2216 min = max = 1;
2217 break;
2218 }
2219
2220 /* First, ensure the minimum number of matches are present. */
2221
2222 #ifdef SUPPORT_UTF8
2223 /* UTF-8 mode */
2224 if (utf8)
2225 {
2226 for (i = 1; i <= min; i++)
2227 {
2228 if (eptr >= md->end_subject)
2229 {
2230 SCHECK_PARTIAL();
2231 RRETURN(MATCH_NOMATCH);
2232 }
2233 GETCHARINC(c, eptr);
2234 if (c > 255)
2235 {
2236 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2237 }
2238 else
2239 {
2240 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2241 }
2242 }
2243 }
2244 else
2245 #endif
2246 /* Not UTF-8 mode */
2247 {
2248 for (i = 1; i <= min; i++)
2249 {
2250 if (eptr >= md->end_subject)
2251 {
2252 SCHECK_PARTIAL();
2253 RRETURN(MATCH_NOMATCH);
2254 }
2255 c = *eptr++;
2256 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2257 }
2258 }
2259
2260 /* If max == min we can continue with the main loop without the
2261 need to recurse. */
2262
2263 if (min == max) continue;
2264
2265 /* If minimizing, keep testing the rest of the expression and advancing
2266 the pointer while it matches the class. */
2267
2268 if (minimize)
2269 {
2270 #ifdef SUPPORT_UTF8
2271 /* UTF-8 mode */
2272 if (utf8)
2273 {
2274 for (fi = min;; fi++)
2275 {
2276 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2277 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2278 if (fi >= max) RRETURN(MATCH_NOMATCH);
2279 if (eptr >= md->end_subject)
2280 {
2281 SCHECK_PARTIAL();
2282 RRETURN(MATCH_NOMATCH);
2283 }
2284 GETCHARINC(c, eptr);
2285 if (c > 255)
2286 {
2287 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2288 }
2289 else
2290 {
2291 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2292 }
2293 }
2294 }
2295 else
2296 #endif
2297 /* Not UTF-8 mode */
2298 {
2299 for (fi = min;; fi++)
2300 {
2301 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2302 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2303 if (fi >= max) RRETURN(MATCH_NOMATCH);
2304 if (eptr >= md->end_subject)
2305 {
2306 SCHECK_PARTIAL();
2307 RRETURN(MATCH_NOMATCH);
2308 }
2309 c = *eptr++;
2310 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2311 }
2312 }
2313 /* Control never gets here */
2314 }
2315
2316 /* If maximizing, find the longest possible run, then work backwards. */
2317
2318 else
2319 {
2320 pp = eptr;
2321
2322 #ifdef SUPPORT_UTF8
2323 /* UTF-8 mode */
2324 if (utf8)
2325 {
2326 for (i = min; i < max; i++)
2327 {
2328 int len = 1;
2329 if (eptr >= md->end_subject)
2330 {
2331 SCHECK_PARTIAL();
2332 break;
2333 }
2334 GETCHARLEN(c, eptr, len);
2335 if (c > 255)
2336 {
2337 if (op == OP_CLASS) break;
2338 }
2339 else
2340 {
2341 if ((data[c/8] & (1 << (c&7))) == 0) break;
2342 }
2343 eptr += len;
2344 }
2345 for (;;)
2346 {
2347 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2348 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2349 if (eptr-- == pp) break; /* Stop if tried at original pos */
2350 BACKCHAR(eptr);
2351 }
2352 }
2353 else
2354 #endif
2355 /* Not UTF-8 mode */
2356 {
2357 for (i = min; i < max; i++)
2358 {
2359 if (eptr >= md->end_subject)
2360 {
2361 SCHECK_PARTIAL();
2362 break;
2363 }
2364 c = *eptr;
2365 if ((data[c/8] & (1 << (c&7))) == 0) break;
2366 eptr++;
2367 }
2368 while (eptr >= pp)
2369 {
2370 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2371 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2372 eptr--;
2373 }
2374 }
2375
2376 RRETURN(MATCH_NOMATCH);
2377 }
2378 }
2379 /* Control never gets here */
2380
2381
2382 /* Match an extended character class. This opcode is encountered only
2383 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2384 mode, because Unicode properties are supported in non-UTF-8 mode. */
2385
2386 #ifdef SUPPORT_UTF8
2387 case OP_XCLASS:
2388 {
2389 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2390 ecode += GET(ecode, 1); /* Advance past the item */
2391
2392 switch (*ecode)
2393 {
2394 case OP_CRSTAR:
2395 case OP_CRMINSTAR:
2396 case OP_CRPLUS:
2397 case OP_CRMINPLUS:
2398 case OP_CRQUERY:
2399 case OP_CRMINQUERY:
2400 c = *ecode++ - OP_CRSTAR;
2401 minimize = (c & 1) != 0;
2402 min = rep_min[c]; /* Pick up values from tables; */
2403 max = rep_max[c]; /* zero for max => infinity */
2404 if (max == 0) max = INT_MAX;
2405 break;
2406
2407 case OP_CRRANGE:
2408 case OP_CRMINRANGE:
2409 minimize = (*ecode == OP_CRMINRANGE);
2410 min = GET2(ecode, 1);
2411 max = GET2(ecode, 3);
2412 if (max == 0) max = INT_MAX;
2413 ecode += 5;
2414 break;
2415
2416 default: /* No repeat follows */
2417 min = max = 1;
2418 break;
2419 }
2420
2421 /* First, ensure the minimum number of matches are present. */
2422
2423 for (i = 1; i <= min; i++)
2424 {
2425 if (eptr >= md->end_subject)
2426 {
2427 SCHECK_PARTIAL();
2428 RRETURN(MATCH_NOMATCH);
2429 }
2430 GETCHARINCTEST(c, eptr);
2431 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2432 }
2433
2434 /* If max == min we can continue with the main loop without the
2435 need to recurse. */
2436
2437 if (min == max) continue;
2438
2439 /* If minimizing, keep testing the rest of the expression and advancing
2440 the pointer while it matches the class. */
2441
2442 if (minimize)
2443 {
2444 for (fi = min;; fi++)
2445 {
2446 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2447 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2448 if (fi >= max) RRETURN(MATCH_NOMATCH);
2449 if (eptr >= md->end_subject)
2450 {
2451 SCHECK_PARTIAL();
2452 RRETURN(MATCH_NOMATCH);
2453 }
2454 GETCHARINCTEST(c, eptr);
2455 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2456 }
2457 /* Control never gets here */
2458 }
2459
2460 /* If maximizing, find the longest possible run, then work backwards. */
2461
2462 else
2463 {
2464 pp = eptr;
2465 for (i = min; i < max; i++)
2466 {
2467 int len = 1;
2468 if (eptr >= md->end_subject)
2469 {
2470 SCHECK_PARTIAL();
2471 break;
2472 }
2473 GETCHARLENTEST(c, eptr, len);
2474 if (!_pcre_xclass(c, data)) break;
2475 eptr += len;
2476 }
2477 for(;;)
2478 {
2479 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2480 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2481 if (eptr-- == pp) break; /* Stop if tried at original pos */
2482 if (utf8) BACKCHAR(eptr);
2483 }
2484 RRETURN(MATCH_NOMATCH);
2485 }
2486
2487 /* Control never gets here */
2488 }
2489 #endif /* End of XCLASS */
2490
2491 /* Match a single character, casefully */
2492
2493 case OP_CHAR:
2494 #ifdef SUPPORT_UTF8
2495 if (utf8)
2496 {
2497 length = 1;
2498 ecode++;
2499 GETCHARLEN(fc, ecode, length);
2500 if (length > md->end_subject - eptr)
2501 {
2502 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2503 RRETURN(MATCH_NOMATCH);
2504 }
2505 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2506 }
2507 else
2508 #endif
2509
2510 /* Non-UTF-8 mode */
2511 {
2512 if (md->end_subject - eptr < 1)
2513 {
2514 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2515 RRETURN(MATCH_NOMATCH);
2516 }
2517 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2518 ecode += 2;
2519 }
2520 break;
2521
2522 /* Match a single character, caselessly */
2523
2524 case OP_CHARNC:
2525 #ifdef SUPPORT_UTF8
2526 if (utf8)
2527 {
2528 length = 1;
2529 ecode++;
2530 GETCHARLEN(fc, ecode, length);
2531
2532 if (length > md->end_subject - eptr)
2533 {
2534 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2535 RRETURN(MATCH_NOMATCH);
2536 }
2537
2538 /* If the pattern character's value is < 128, we have only one byte, and
2539 can use the fast lookup table. */
2540
2541 if (fc < 128)
2542 {
2543 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2544 }
2545
2546 /* Otherwise we must pick up the subject character */
2547
2548 else
2549 {
2550 unsigned int dc;
2551 GETCHARINC(dc, eptr);
2552 ecode += length;
2553
2554 /* If we have Unicode property support, we can use it to test the other
2555 case of the character, if there is one. */
2556
2557 if (fc != dc)
2558 {
2559 #ifdef SUPPORT_UCP
2560 if (dc != UCD_OTHERCASE(fc))
2561 #endif
2562 RRETURN(MATCH_NOMATCH);
2563 }
2564 }
2565 }
2566 else
2567 #endif /* SUPPORT_UTF8 */
2568
2569 /* Non-UTF-8 mode */
2570 {
2571 if (md->end_subject - eptr < 1)
2572 {
2573 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2574 RRETURN(MATCH_NOMATCH);
2575 }
2576 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2577 ecode += 2;
2578 }
2579 break;
2580
2581 /* Match a single character repeatedly. */
2582
2583 case OP_EXACT:
2584 min = max = GET2(ecode, 1);
2585 ecode += 3;
2586 goto REPEATCHAR;
2587
2588 case OP_POSUPTO:
2589 possessive = TRUE;
2590 /* Fall through */
2591
2592 case OP_UPTO:
2593 case OP_MINUPTO:
2594 min = 0;
2595 max = GET2(ecode, 1);
2596 minimize = *ecode == OP_MINUPTO;
2597 ecode += 3;
2598 goto REPEATCHAR;
2599
2600 case OP_POSSTAR:
2601 possessive = TRUE;
2602 min = 0;
2603 max = INT_MAX;
2604 ecode++;
2605 goto REPEATCHAR;
2606
2607 case OP_POSPLUS:
2608 possessive = TRUE;
2609 min = 1;
2610 max = INT_MAX;
2611 ecode++;
2612 goto REPEATCHAR;
2613
2614 case OP_POSQUERY:
2615 possessive = TRUE;
2616 min = 0;
2617 max = 1;
2618 ecode++;
2619 goto REPEATCHAR;
2620
2621 case OP_STAR:
2622 case OP_MINSTAR:
2623 case OP_PLUS:
2624 case OP_MINPLUS:
2625 case OP_QUERY:
2626 case OP_MINQUERY:
2627 c = *ecode++ - OP_STAR;
2628 minimize = (c & 1) != 0;
2629
2630 min = rep_min[c]; /* Pick up values from tables; */
2631 max = rep_max[c]; /* zero for max => infinity */
2632 if (max == 0) max = INT_MAX;
2633
2634 /* Common code for all repeated single-character matches. */
2635
2636 REPEATCHAR:
2637 #ifdef SUPPORT_UTF8
2638 if (utf8)
2639 {
2640 length = 1;
2641 charptr = ecode;
2642 GETCHARLEN(fc, ecode, length);
2643 ecode += length;
2644
2645 /* Handle multibyte character matching specially here. There is
2646 support for caseless matching if UCP support is present. */
2647
2648 if (length > 1)
2649 {
2650 #ifdef SUPPORT_UCP
2651 unsigned int othercase;
2652 if ((ims & PCRE_CASELESS) != 0 &&
2653 (othercase = UCD_OTHERCASE(fc)) != fc)
2654 oclength = _pcre_ord2utf8(othercase, occhars);
2655 else oclength = 0;
2656 #endif /* SUPPORT_UCP */
2657
2658 for (i = 1; i <= min; i++)
2659 {
2660 if (eptr <= md->end_subject - length &&
2661 memcmp(eptr, charptr, length) == 0) eptr += length;
2662 #ifdef SUPPORT_UCP
2663 else if (oclength > 0 &&
2664 eptr <= md->end_subject - oclength &&
2665 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2666 #endif /* SUPPORT_UCP */
2667 else
2668 {
2669 CHECK_PARTIAL();
2670 RRETURN(MATCH_NOMATCH);
2671 }
2672 }
2673
2674 if (min == max) continue;
2675
2676 if (minimize)
2677 {
2678 for (fi = min;; fi++)
2679 {
2680 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2681 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2682 if (fi >= max) RRETURN(MATCH_NOMATCH);
2683 if (eptr <= md->end_subject - length &&
2684 memcmp(eptr, charptr, length) == 0) eptr += length;
2685 #ifdef SUPPORT_UCP
2686 else if (oclength > 0 &&
2687 eptr <= md->end_subject - oclength &&
2688 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2689 #endif /* SUPPORT_UCP */
2690 else
2691 {
2692 CHECK_PARTIAL();
2693 RRETURN(MATCH_NOMATCH);
2694 }
2695 }
2696 /* Control never gets here */
2697 }
2698
2699 else /* Maximize */
2700 {
2701 pp = eptr;
2702 for (i = min; i < max; i++)
2703 {
2704 if (eptr <= md->end_subject - length &&
2705 memcmp(eptr, charptr, length) == 0) eptr += length;
2706 #ifdef SUPPORT_UCP
2707 else if (oclength > 0 &&
2708 eptr <= md->end_subject - oclength &&
2709 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2710 #endif /* SUPPORT_UCP */
2711 else
2712 {
2713 CHECK_PARTIAL();
2714 break;
2715 }
2716 }
2717
2718 if (possessive) continue;
2719
2720 for(;;)
2721 {
2722 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2723 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2724 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
2725 #ifdef SUPPORT_UCP
2726 eptr--;
2727 BACKCHAR(eptr);
2728 #else /* without SUPPORT_UCP */
2729 eptr -= length;
2730 #endif /* SUPPORT_UCP */
2731 }
2732 }
2733 /* Control never gets here */
2734 }
2735
2736 /* If the length of a UTF-8 character is 1, we fall through here, and
2737 obey the code as for non-UTF-8 characters below, though in this case the
2738 value of fc will always be < 128. */
2739 }
2740 else
2741 #endif /* SUPPORT_UTF8 */
2742
2743 /* When not in UTF-8 mode, load a single-byte character. */
2744
2745 fc = *ecode++;
2746
2747 /* The value of fc at this point is always less than 256, though we may or
2748 may not be in UTF-8 mode. The code is duplicated for the caseless and
2749 caseful cases, for speed, since matching characters is likely to be quite
2750 common. First, ensure the minimum number of matches are present. If min =
2751 max, continue at the same level without recursing. Otherwise, if
2752 minimizing, keep trying the rest of the expression and advancing one
2753 matching character if failing, up to the maximum. Alternatively, if
2754 maximizing, find the maximum number of characters and work backwards. */
2755
2756 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2757 max, eptr));
2758
2759 if ((ims & PCRE_CASELESS) != 0)
2760 {
2761 fc = md->lcc[fc];
2762 for (i = 1; i <= min; i++)
2763 {
2764 if (eptr >= md->end_subject)
2765 {
2766 SCHECK_PARTIAL();
2767 RRETURN(MATCH_NOMATCH);
2768 }
2769 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2770 }
2771 if (min == max) continue;
2772 if (minimize)
2773 {
2774 for (fi = min;; fi++)
2775 {
2776 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2777 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2778 if (fi >= max) RRETURN(MATCH_NOMATCH);
2779 if (eptr >= md->end_subject)
2780 {
2781 SCHECK_PARTIAL();
2782 RRETURN(MATCH_NOMATCH);
2783 }
2784 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2785 }
2786 /* Control never gets here */
2787 }
2788 else /* Maximize */
2789 {
2790 pp = eptr;
2791 for (i = min; i < max; i++)
2792 {
2793 if (eptr >= md->end_subject)
2794 {
2795 SCHECK_PARTIAL();
2796 break;
2797 }
2798 if (fc != md->lcc[*eptr]) break;
2799 eptr++;
2800 }
2801
2802 if (possessive) continue;
2803
2804 while (eptr >= pp)
2805 {
2806 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2807 eptr--;
2808 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2809 }
2810 RRETURN(MATCH_NOMATCH);
2811 }
2812 /* Control never gets here */
2813 }
2814
2815 /* Caseful comparisons (includes all multi-byte characters) */
2816
2817 else
2818 {
2819 for (i = 1; i <= min; i++)
2820 {
2821 if (eptr >= md->end_subject)
2822 {
2823 SCHECK_PARTIAL();
2824 RRETURN(MATCH_NOMATCH);
2825 }
2826 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2827 }
2828
2829 if (min == max) continue;
2830
2831 if (minimize)
2832 {
2833 for (fi = min;; fi++)
2834 {
2835 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2836 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2837 if (fi >= max) RRETURN(MATCH_NOMATCH);
2838 if (eptr >= md->end_subject)
2839 {
2840 SCHECK_PARTIAL();
2841 RRETURN(MATCH_NOMATCH);
2842 }
2843 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2844 }
2845 /* Control never gets here */
2846 }
2847 else /* Maximize */
2848 {
2849 pp = eptr;
2850 for (i = min; i < max; i++)
2851 {
2852 if (eptr >= md->end_subject)
2853 {
2854 SCHECK_PARTIAL();
2855 break;
2856 }
2857 if (fc != *eptr) break;
2858 eptr++;
2859 }
2860 if (possessive) continue;
2861
2862 while (eptr >= pp)
2863 {
2864 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2865 eptr--;
2866 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2867 }
2868 RRETURN(MATCH_NOMATCH);
2869 }
2870 }
2871 /* Control never gets here */
2872
2873 /* Match a negated single one-byte character. The character we are
2874 checking can be multibyte. */
2875
2876 case OP_NOT:
2877 if (eptr >= md->end_subject)
2878 {
2879 SCHECK_PARTIAL();
2880 RRETURN(MATCH_NOMATCH);
2881 }
2882 ecode++;
2883 GETCHARINCTEST(c, eptr);
2884 if ((ims & PCRE_CASELESS) != 0)
2885 {
2886 #ifdef SUPPORT_UTF8
2887 if (c < 256)
2888 #endif
2889 c = md->lcc[c];
2890 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2891 }
2892 else
2893 {
2894 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2895 }
2896 break;
2897
2898 /* Match a negated single one-byte character repeatedly. This is almost a
2899 repeat of the code for a repeated single character, but I haven't found a
2900 nice way of commoning these up that doesn't require a test of the
2901 positive/negative option for each character match. Maybe that wouldn't add
2902 very much to the time taken, but character matching *is* what this is all
2903 about... */
2904
2905 case OP_NOTEXACT:
2906 min = max = GET2(ecode, 1);
2907 ecode += 3;
2908 goto REPEATNOTCHAR;
2909
2910 case OP_NOTUPTO:
2911 case OP_NOTMINUPTO:
2912 min = 0;
2913 max = GET2(ecode, 1);
2914 minimize = *ecode == OP_NOTMINUPTO;
2915 ecode += 3;
2916 goto REPEATNOTCHAR;
2917
2918 case OP_NOTPOSSTAR:
2919 possessive = TRUE;
2920 min = 0;
2921 max = INT_MAX;
2922 ecode++;
2923 goto REPEATNOTCHAR;
2924
2925 case OP_NOTPOSPLUS:
2926 possessive = TRUE;
2927 min = 1;
2928 max = INT_MAX;
2929 ecode++;
2930 goto REPEATNOTCHAR;
2931
2932 case OP_NOTPOSQUERY:
2933 possessive = TRUE;
2934 min = 0;
2935 max = 1;
2936 ecode++;
2937 goto REPEATNOTCHAR;
2938
2939 case OP_NOTPOSUPTO:
2940 possessive = TRUE;
2941 min = 0;
2942 max = GET2(ecode, 1);
2943 ecode += 3;
2944 goto REPEATNOTCHAR;
2945
2946 case OP_NOTSTAR:
2947 case OP_NOTMINSTAR:
2948 case OP_NOTPLUS:
2949 case OP_NOTMINPLUS:
2950 case OP_NOTQUERY:
2951 case OP_NOTMINQUERY:
2952 c = *ecode++ - OP_NOTSTAR;
2953 minimize = (c & 1) != 0;
2954 min = rep_min[c]; /* Pick up values from tables; */
2955 max = rep_max[c]; /* zero for max => infinity */
2956 if (max == 0) max = INT_MAX;
2957
2958 /* Common code for all repeated single-byte matches. */
2959
2960 REPEATNOTCHAR:
2961 fc = *ecode++;
2962
2963 /* The code is duplicated for the caseless and caseful cases, for speed,
2964 since matching characters is likely to be quite common. First, ensure the
2965 minimum number of matches are present. If min = max, continue at the same
2966 level without recursing. Otherwise, if minimizing, keep trying the rest of
2967 the expression and advancing one matching character if failing, up to the
2968 maximum. Alternatively, if maximizing, find the maximum number of
2969 characters and work backwards. */
2970
2971 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2972 max, eptr));
2973
2974 if ((ims & PCRE_CASELESS) != 0)
2975 {
2976 fc = md->lcc[fc];
2977
2978 #ifdef SUPPORT_UTF8
2979 /* UTF-8 mode */
2980 if (utf8)
2981 {
2982 register unsigned int d;
2983 for (i = 1; i <= min; i++)
2984 {
2985 if (eptr >= md->end_subject)
2986 {
2987 SCHECK_PARTIAL();
2988 RRETURN(MATCH_NOMATCH);
2989 }
2990 GETCHARINC(d, eptr);
2991 if (d < 256) d = md->lcc[d];
2992 if (fc == d) RRETURN(MATCH_NOMATCH);
2993 }
2994 }
2995 else
2996 #endif
2997
2998 /* Not UTF-8 mode */
2999 {
3000 for (i = 1; i <= min; i++)
3001 {
3002 if (eptr >= md->end_subject)
3003 {
3004 SCHECK_PARTIAL();
3005 RRETURN(MATCH_NOMATCH);
3006 }
3007 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3008 }
3009 }
3010
3011 if (min == max) continue;
3012
3013 if (minimize)
3014 {
3015 #ifdef SUPPORT_UTF8
3016 /* UTF-8 mode */
3017 if (utf8)
3018 {
3019 register unsigned int d;
3020 for (fi = min;; fi++)
3021 {
3022 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3023 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3024 if (fi >= max) RRETURN(MATCH_NOMATCH);
3025 if (eptr >= md->end_subject)
3026 {
3027 SCHECK_PARTIAL();
3028 RRETURN(MATCH_NOMATCH);
3029 }
3030 GETCHARINC(d, eptr);
3031 if (d < 256) d = md->lcc[d];
3032 if (fc == d) RRETURN(MATCH_NOMATCH);
3033 }
3034 }
3035 else
3036 #endif
3037 /* Not UTF-8 mode */
3038 {
3039 for (fi = min;; fi++)
3040 {
3041 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3042 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3043 if (fi >= max) RRETURN(MATCH_NOMATCH);
3044 if (eptr >= md->end_subject)
3045 {
3046 SCHECK_PARTIAL();
3047 RRETURN(MATCH_NOMATCH);
3048 }
3049 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3050 }
3051 }
3052 /* Control never gets here */
3053 }
3054
3055 /* Maximize case */
3056
3057 else
3058 {
3059 pp = eptr;
3060
3061 #ifdef SUPPORT_UTF8
3062 /* UTF-8 mode */
3063 if (utf8)
3064 {
3065 register unsigned int d;
3066 for (i = min; i < max; i++)
3067 {
3068 int len = 1;
3069 if (eptr >= md->end_subject)
3070 {
3071 SCHECK_PARTIAL();
3072 break;
3073 }
3074 GETCHARLEN(d, eptr, len);
3075 if (d < 256) d = md->lcc[d];
3076 if (fc == d) break;
3077 eptr += len;
3078 }
3079 if (possessive) continue;
3080 for(;;)
3081 {
3082 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3083 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3084 if (eptr-- == pp) break; /* Stop if tried at original pos */
3085 BACKCHAR(eptr);
3086 }
3087 }
3088 else
3089 #endif
3090 /* Not UTF-8 mode */
3091 {
3092 for (i = min; i < max; i++)
3093 {
3094 if (eptr >= md->end_subject)
3095 {
3096 SCHECK_PARTIAL();
3097 break;
3098 }
3099 if (fc == md->lcc[*eptr]) break;
3100 eptr++;
3101 }
3102 if (possessive) continue;
3103 while (eptr >= pp)
3104 {
3105 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3106 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3107 eptr--;
3108 }
3109 }
3110
3111 RRETURN(MATCH_NOMATCH);
3112 }
3113 /* Control never gets here */
3114 }
3115
3116 /* Caseful comparisons */
3117
3118 else
3119 {
3120 #ifdef SUPPORT_UTF8
3121 /* UTF-8 mode */
3122 if (utf8)
3123 {
3124 register unsigned int d;
3125 for (i = 1; i <= min; i++)
3126 {
3127 if (eptr >= md->end_subject)
3128 {
3129 SCHECK_PARTIAL();
3130 RRETURN(MATCH_NOMATCH);
3131 }
3132 GETCHARINC(d, eptr);
3133 if (fc == d) RRETURN(MATCH_NOMATCH);
3134 }
3135 }
3136 else
3137 #endif
3138 /* Not UTF-8 mode */
3139 {
3140 for (i = 1; i <= min; i++)
3141 {
3142 if (eptr >= md->end_subject)
3143 {
3144 SCHECK_PARTIAL();
3145 RRETURN(MATCH_NOMATCH);
3146 }
3147 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3148 }
3149 }
3150
3151 if (min == max) continue;
3152
3153 if (minimize)
3154 {
3155 #ifdef SUPPORT_UTF8
3156 /* UTF-8 mode */
3157 if (utf8)
3158 {
3159 register unsigned int d;
3160 for (fi = min;; fi++)
3161 {
3162 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3163 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3164 if (fi >= max) RRETURN(MATCH_NOMATCH);
3165 if (eptr >= md->end_subject)
3166 {
3167 SCHECK_PARTIAL();
3168 RRETURN(MATCH_NOMATCH);
3169 }
3170 GETCHARINC(d, eptr);
3171 if (fc == d) RRETURN(MATCH_NOMATCH);
3172 }
3173 }
3174 else
3175 #endif
3176 /* Not UTF-8 mode */
3177 {
3178 for (fi = min;; fi++)
3179 {
3180 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3181 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3182 if (fi >= max) RRETURN(MATCH_NOMATCH);
3183 if (eptr >= md->end_subject)
3184 {
3185 SCHECK_PARTIAL();
3186 RRETURN(MATCH_NOMATCH);
3187 }
3188 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3189 }
3190 }
3191 /* Control never gets here */
3192 }
3193
3194 /* Maximize case */
3195
3196 else
3197 {
3198 pp = eptr;
3199
3200 #ifdef SUPPORT_UTF8
3201 /* UTF-8 mode */
3202 if (utf8)
3203 {
3204 register unsigned int d;
3205 for (i = min; i < max; i++)
3206 {
3207 int len = 1;
3208 if (eptr >= md->end_subject)
3209 {
3210 SCHECK_PARTIAL();
3211 break;
3212 }
3213 GETCHARLEN(d, eptr, len);
3214 if (fc == d) break;
3215 eptr += len;
3216 }
3217 if (possessive) continue;
3218 for(;;)
3219 {
3220 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3221 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3222 if (eptr-- == pp) break; /* Stop if tried at original pos */
3223 BACKCHAR(eptr);
3224 }
3225 }
3226 else
3227 #endif
3228 /* Not UTF-8 mode */
3229 {
3230 for (i = min; i < max; i++)
3231 {
3232 if (eptr >= md->end_subject)
3233 {
3234 SCHECK_PARTIAL();
3235 break;
3236 }
3237 if (fc == *eptr) break;
3238 eptr++;
3239 }
3240 if (possessive) continue;
3241 while (eptr >= pp)
3242 {
3243 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3244 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3245 eptr--;
3246 }
3247 }
3248
3249 RRETURN(MATCH_NOMATCH);
3250 }
3251 }
3252 /* Control never gets here */
3253
3254 /* Match a single character type repeatedly; several different opcodes
3255 share code. This is very similar to the code for single characters, but we
3256 repeat it in the interests of efficiency. */
3257
3258 case OP_TYPEEXACT:
3259 min = max = GET2(ecode, 1);
3260 minimize = TRUE;
3261 ecode += 3;
3262 goto REPEATTYPE;
3263
3264 case OP_TYPEUPTO:
3265 case OP_TYPEMINUPTO:
3266 min = 0;
3267 max = GET2(ecode, 1);
3268 minimize = *ecode == OP_TYPEMINUPTO;
3269 ecode += 3;
3270 goto REPEATTYPE;
3271
3272 case OP_TYPEPOSSTAR:
3273 possessive = TRUE;
3274 min = 0;
3275 max = INT_MAX;
3276 ecode++;
3277 goto REPEATTYPE;
3278
3279 case OP_TYPEPOSPLUS:
3280 possessive = TRUE;
3281 min = 1;
3282 max = INT_MAX;
3283 ecode++;
3284 goto REPEATTYPE;
3285
3286 case OP_TYPEPOSQUERY:
3287 possessive = TRUE;
3288 min = 0;
3289 max = 1;
3290 ecode++;
3291 goto REPEATTYPE;
3292
3293 case OP_TYPEPOSUPTO:
3294 possessive = TRUE;
3295 min = 0;
3296 max = GET2(ecode, 1);
3297 ecode += 3;
3298 goto REPEATTYPE;
3299
3300 case OP_TYPESTAR:
3301 case OP_TYPEMINSTAR:
3302 case OP_TYPEPLUS:
3303 case OP_TYPEMINPLUS:
3304 case OP_TYPEQUERY:
3305 case OP_TYPEMINQUERY:
3306 c = *ecode++ - OP_TYPESTAR;
3307 minimize = (c & 1) != 0;
3308 min = rep_min[c]; /* Pick up values from tables; */
3309 max = rep_max[c]; /* zero for max => infinity */
3310 if (max == 0) max = INT_MAX;
3311
3312 /* Common code for all repeated single character type matches. Note that
3313 in UTF-8 mode, '.' matches a character of any length, but for the other
3314 character types, the valid characters are all one-byte long. */
3315
3316 REPEATTYPE:
3317 ctype = *ecode++; /* Code for the character type */
3318
3319 #ifdef SUPPORT_UCP
3320 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3321 {
3322 prop_fail_result = ctype == OP_NOTPROP;
3323 prop_type = *ecode++;
3324 prop_value = *ecode++;
3325 }
3326 else prop_type = -1;
3327 #endif
3328
3329 /* First, ensure the minimum number of matches are present. Use inline
3330 code for maximizing the speed, and do the type test once at the start
3331 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3332 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3333 and single-bytes. */
3334
3335 if (min > 0)
3336 {
3337 #ifdef SUPPORT_UCP
3338 if (prop_type >= 0)
3339 {
3340 switch(prop_type)
3341 {
3342 case PT_ANY:
3343 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3344 for (i = 1; i <= min; i++)
3345 {
3346 if (eptr >= md->end_subject)
3347 {
3348 SCHECK_PARTIAL();
3349 RRETURN(MATCH_NOMATCH);
3350 }
3351 GETCHARINCTEST(c, eptr);
3352 }
3353 break;
3354
3355 case PT_LAMP:
3356 for (i = 1; i <= min; i++)
3357 {
3358 if (eptr >= md->end_subject)
3359 {
3360 SCHECK_PARTIAL();
3361 RRETURN(MATCH_NOMATCH);
3362 }
3363 GETCHARINCTEST(c, eptr);
3364 prop_chartype = UCD_CHARTYPE(c);
3365 if ((prop_chartype == ucp_Lu ||
3366 prop_chartype == ucp_Ll ||
3367 prop_chartype == ucp_Lt) == prop_fail_result)
3368 RRETURN(MATCH_NOMATCH);
3369 }
3370 break;
3371
3372 case PT_GC:
3373 for (i = 1; i <= min; i++)
3374 {
3375 if (eptr >= md->end_subject)
3376 {
3377 SCHECK_PARTIAL();
3378 RRETURN(MATCH_NOMATCH);
3379 }
3380 GETCHARINCTEST(c, eptr);
3381 prop_category = UCD_CATEGORY(c);
3382 if ((prop_category == prop_value) == prop_fail_result)
3383 RRETURN(MATCH_NOMATCH);
3384 }
3385 break;
3386
3387 case PT_PC:
3388 for (i = 1; i <= min; i++)
3389 {
3390 if (eptr >= md->end_subject)
3391 {
3392 SCHECK_PARTIAL();
3393 RRETURN(MATCH_NOMATCH);
3394 }
3395 GETCHARINCTEST(c, eptr);
3396 prop_chartype = UCD_CHARTYPE(c);
3397 if ((prop_chartype == prop_value) == prop_fail_result)
3398 RRETURN(MATCH_NOMATCH);
3399 }
3400 break;
3401
3402 case PT_SC:
3403 for (i = 1; i <= min; i++)
3404 {
3405 if (eptr >= md->end_subject)
3406 {
3407 SCHECK_PARTIAL();
3408 RRETURN(MATCH_NOMATCH);
3409 }
3410 GETCHARINCTEST(c, eptr);
3411 prop_script = UCD_SCRIPT(c);
3412 if ((prop_script == prop_value) == prop_fail_result)
3413 RRETURN(MATCH_NOMATCH);
3414 }
3415 break;
3416
3417 default:
3418 RRETURN(PCRE_ERROR_INTERNAL);
3419 }
3420 }
3421
3422 /* Match extended Unicode sequences. We will get here only if the
3423 support is in the binary; otherwise a compile-time error occurs. */
3424
3425 else if (ctype == OP_EXTUNI)
3426 {
3427 for (i = 1; i <= min; i++)
3428 {
3429 if (eptr >= md->end_subject)
3430 {
3431 SCHECK_PARTIAL();
3432 RRETURN(MATCH_NOMATCH);
3433 }
3434 GETCHARINCTEST(c, eptr);
3435 prop_category = UCD_CATEGORY(c);
3436 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3437 while (eptr < md->end_subject)
3438 {
3439 int len = 1;
3440 if (!utf8) c = *eptr;
3441 else { GETCHARLEN(c, eptr, len); }
3442 prop_category = UCD_CATEGORY(c);
3443 if (prop_category != ucp_M) break;
3444 eptr += len;
3445 }
3446 }
3447 }
3448
3449 else
3450 #endif /* SUPPORT_UCP */
3451
3452 /* Handle all other cases when the coding is UTF-8 */
3453
3454 #ifdef SUPPORT_UTF8
3455 if (utf8) switch(ctype)
3456 {
3457 case OP_ANY:
3458 for (i = 1; i <= min; i++)
3459 {
3460 if (eptr >= md->end_subject)
3461 {
3462 SCHECK_PARTIAL();
3463 RRETURN(MATCH_NOMATCH);
3464 }
3465 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3466 eptr++;
3467 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3468 }
3469 break;
3470
3471 case OP_ALLANY:
3472 for (i = 1; i <= min; i++)
3473 {
3474 if (eptr >= md->end_subject)
3475 {
3476 SCHECK_PARTIAL();
3477 RRETURN(MATCH_NOMATCH);
3478 }
3479 eptr++;
3480 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3481 }
3482 break;
3483
3484 case OP_ANYBYTE:
3485 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3486 eptr += min;
3487 break;
3488
3489 case OP_ANYNL:
3490 for (i = 1; i <= min; i++)
3491 {
3492 if (eptr >= md->end_subject)
3493 {
3494 SCHECK_PARTIAL();
3495 RRETURN(MATCH_NOMATCH);
3496 }
3497 GETCHARINC(c, eptr);
3498 switch(c)
3499 {
3500 default: RRETURN(MATCH_NOMATCH);
3501 case 0x000d:
3502 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3503 break;
3504
3505 case 0x000a:
3506 break;
3507
3508 case 0x000b:
3509 case 0x000c:
3510 case 0x0085:
3511 case 0x2028:
3512 case 0x2029:
3513 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3514 break;
3515 }
3516 }
3517 break;
3518
3519 case OP_NOT_HSPACE:
3520 for (i = 1; i <= min; i++)
3521 {
3522 if (eptr >= md->end_subject)
3523 {
3524 SCHECK_PARTIAL();
3525 RRETURN(MATCH_NOMATCH);
3526 }
3527 GETCHARINC(c, eptr);
3528 switch(c)
3529 {
3530 default: break;
3531 case 0x09: /* HT */
3532 case 0x20: /* SPACE */
3533 case 0xa0: /* NBSP */
3534 case 0x1680: /* OGHAM SPACE MARK */
3535 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3536 case 0x2000: /* EN QUAD */
3537 case 0x2001: /* EM QUAD */
3538 case 0x2002: /* EN SPACE */
3539 case 0x2003: /* EM SPACE */
3540 case 0x2004: /* THREE-PER-EM SPACE */
3541 case 0x2005: /* FOUR-PER-EM SPACE */
3542 case 0x2006: /* SIX-PER-EM SPACE */
3543 case 0x2007: /* FIGURE SPACE */
3544 case 0x2008: /* PUNCTUATION SPACE */
3545 case 0x2009: /* THIN SPACE */
3546 case 0x200A: /* HAIR SPACE */
3547 case 0x202f: /* NARROW NO-BREAK SPACE */
3548 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3549 case 0x3000: /* IDEOGRAPHIC SPACE */
3550 RRETURN(MATCH_NOMATCH);
3551 }
3552 }
3553 break;
3554
3555 case OP_HSPACE:
3556 for (i = 1; i <= min; i++)
3557 {
3558 if (eptr >= md->end_subject)
3559 {
3560 SCHECK_PARTIAL();
3561 RRETURN(MATCH_NOMATCH);
3562 }
3563 GETCHARINC(c, eptr);
3564 switch(c)
3565 {
3566 default: RRETURN(MATCH_NOMATCH);
3567 case 0x09: /* HT */
3568 case 0x20: /* SPACE */
3569 case 0xa0: /* NBSP */
3570 case 0x1680: /* OGHAM SPACE MARK */
3571 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3572 case 0x2000: /* EN QUAD */
3573 case 0x2001: /* EM QUAD */
3574 case 0x2002: /* EN SPACE */
3575 case 0x2003: /* EM SPACE */
3576 case 0x2004: /* THREE-PER-EM SPACE */
3577 case 0x2005: /* FOUR-PER-EM SPACE */
3578 case 0x2006: /* SIX-PER-EM SPACE */
3579 case 0x2007: /* FIGURE SPACE */
3580 case 0x2008: /* PUNCTUATION SPACE */
3581 case 0x2009: /* THIN SPACE */
3582 case 0x200A: /* HAIR SPACE */
3583 case 0x202f: /* NARROW NO-BREAK SPACE */
3584 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3585 case 0x3000: /* IDEOGRAPHIC SPACE */
3586 break;
3587 }
3588 }
3589 break;
3590
3591 case OP_NOT_VSPACE:
3592 for (i = 1; i <= min; i++)
3593 {
3594 if (eptr >= md->end_subject)
3595 {
3596 SCHECK_PARTIAL();
3597 RRETURN(MATCH_NOMATCH);
3598 }
3599 GETCHARINC(c, eptr);
3600 switch(c)
3601 {
3602 default: break;
3603 case 0x0a: /* LF */
3604 case 0x0b: /* VT */
3605 case 0x0c: /* FF */
3606 case 0x0d: /* CR */
3607 case 0x85: /* NEL */
3608 case 0x2028: /* LINE SEPARATOR */
3609 case 0x2029: /* PARAGRAPH SEPARATOR */
3610 RRETURN(MATCH_NOMATCH);
3611 }
3612 }
3613 break;
3614
3615 case OP_VSPACE:
3616 for (i = 1; i <= min; i++)
3617 {
3618 if (eptr >= md->end_subject)
3619 {
3620 SCHECK_PARTIAL();
3621 RRETURN(MATCH_NOMATCH);
3622 }
3623 GETCHARINC(c, eptr);
3624 switch(c)
3625 {
3626 default: RRETURN(MATCH_NOMATCH);
3627 case 0x0a: /* LF */
3628 case 0x0b: /* VT */
3629 case 0x0c: /* FF */
3630 case 0x0d: /* CR */
3631 case 0x85: /* NEL */
3632 case 0x2028: /* LINE SEPARATOR */
3633 case 0x2029: /* PARAGRAPH SEPARATOR */
3634 break;
3635 }
3636 }
3637 break;
3638
3639 case OP_NOT_DIGIT:
3640 for (i = 1; i <= min; i++)
3641 {
3642 if (eptr >= md->end_subject)
3643 {
3644 SCHECK_PARTIAL();
3645 RRETURN(MATCH_NOMATCH);
3646 }
3647 GETCHARINC(c, eptr);
3648 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3649 RRETURN(MATCH_NOMATCH);
3650 }
3651 break;
3652
3653 case OP_DIGIT:
3654 for (i = 1; i <= min; i++)
3655 {
3656 if (eptr >= md->end_subject)
3657 {
3658 SCHECK_PARTIAL();
3659 RRETURN(MATCH_NOMATCH);
3660 }
3661 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3662 RRETURN(MATCH_NOMATCH);
3663 /* No need to skip more bytes - we know it's a 1-byte character */
3664 }
3665 break;
3666
3667 case OP_NOT_WHITESPACE:
3668 for (i = 1; i <= min; i++)
3669 {
3670 if (eptr >= md->end_subject)
3671 {
3672 SCHECK_PARTIAL();
3673 RRETURN(MATCH_NOMATCH);
3674 }
3675 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3676 RRETURN(MATCH_NOMATCH);
3677 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3678 }
3679 break;
3680
3681 case OP_WHITESPACE:
3682 for (i = 1; i <= min; i++)
3683 {
3684 if (eptr >= md->end_subject)
3685 {
3686 SCHECK_PARTIAL();
3687 RRETURN(MATCH_NOMATCH);
3688 }
3689 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3690 RRETURN(MATCH_NOMATCH);
3691 /* No need to skip more bytes - we know it's a 1-byte character */
3692 }
3693 break;
3694
3695 case OP_NOT_WORDCHAR:
3696 for (i = 1; i <= min; i++)
3697 {
3698 if (eptr >= md->end_subject)
3699 {
3700 SCHECK_PARTIAL();
3701 RRETURN(MATCH_NOMATCH);
3702 }
3703 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3704 RRETURN(MATCH_NOMATCH);
3705 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3706 }
3707 break;
3708
3709 case OP_WORDCHAR:
3710 for (i = 1; i <= min; i++)
3711 {
3712 if (eptr >= md->end_subject)
3713 {
3714 SCHECK_PARTIAL();
3715 RRETURN(MATCH_NOMATCH);
3716 }
3717 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3718 RRETURN(MATCH_NOMATCH);
3719 /* No need to skip more bytes - we know it's a 1-byte character */
3720 }
3721 break;
3722
3723 default:
3724 RRETURN(PCRE_ERROR_INTERNAL);
3725 } /* End switch(ctype) */
3726
3727 else
3728 #endif /* SUPPORT_UTF8 */
3729
3730 /* Code for the non-UTF-8 case for minimum matching of operators other
3731 than OP_PROP and OP_NOTPROP. */
3732
3733 switch(ctype)
3734 {
3735 case OP_ANY:
3736 for (i = 1; i <= min; i++)
3737 {
3738 if (eptr >= md->end_subject)
3739 {
3740 SCHECK_PARTIAL();
3741 RRETURN(MATCH_NOMATCH);
3742 }
3743 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3744 eptr++;
3745 }
3746 break;
3747
3748 case OP_ALLANY:
3749 if (eptr > md->end_subject - min)
3750 {
3751 SCHECK_PARTIAL();
3752 RRETURN(MATCH_NOMATCH);
3753 }
3754 eptr += min;
3755 break;
3756
3757 case OP_ANYBYTE:
3758 if (eptr > md->end_subject - min)
3759 {
3760 SCHECK_PARTIAL();
3761 RRETURN(MATCH_NOMATCH);
3762 }
3763 eptr += min;
3764 break;
3765
3766 case OP_ANYNL:
3767 for (i = 1; i <= min; i++)
3768 {
3769 if (eptr >= md->end_subject)
3770 {
3771 SCHECK_PARTIAL();
3772 RRETURN(MATCH_NOMATCH);
3773 }
3774 switch(*eptr++)
3775 {
3776 default: RRETURN(MATCH_NOMATCH);
3777 case 0x000d:
3778 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3779 break;
3780 case 0x000a:
3781 break;
3782
3783 case 0x000b:
3784 case 0x000c:
3785 case 0x0085:
3786 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3787 break;
3788 }
3789 }
3790 break;
3791
3792 case OP_NOT_HSPACE:
3793 for (i = 1; i <= min; i++)
3794 {
3795 if (eptr >= md->end_subject)
3796 {
3797 SCHECK_PARTIAL();
3798 RRETURN(MATCH_NOMATCH);
3799 }
3800 switch(*eptr++)
3801 {
3802 default: break;
3803 case 0x09: /* HT */
3804 case 0x20: /* SPACE */
3805 case 0xa0: /* NBSP */
3806 RRETURN(MATCH_NOMATCH);
3807 }
3808 }
3809 break;
3810
3811 case OP_HSPACE:
3812 for (i = 1; i <= min; i++)
3813 {
3814 if (eptr >= md->end_subject)
3815 {
3816 SCHECK_PARTIAL();
3817 RRETURN(MATCH_NOMATCH);
3818 }
3819 switch(*eptr++)
3820 {
3821 default: RRETURN(MATCH_NOMATCH);
3822 case 0x09: /* HT */
3823 case 0x20: /* SPACE */
3824 case 0xa0: /* NBSP */
3825 break;
3826 }
3827 }
3828 break;
3829
3830 case OP_NOT_VSPACE:
3831 for (i = 1; i <= min; i++)
3832 {
3833 if (eptr >= md->end_subject)
3834 {
3835 SCHECK_PARTIAL();
3836 RRETURN(MATCH_NOMATCH);
3837 }
3838 switch(*eptr++)
3839 {
3840 default: break;
3841 case 0x0a: /* LF */
3842 case 0x0b: /* VT */
3843 case 0x0c: /* FF */
3844 case 0x0d: /* CR */
3845 case 0x85: /* NEL */
3846 RRETURN(MATCH_NOMATCH);
3847 }
3848 }
3849 break;
3850
3851 case OP_VSPACE:
3852 for (i = 1; i <= min; i++)
3853 {
3854 if (eptr >= md->end_subject)
3855 {
3856 SCHECK_PARTIAL();
3857 RRETURN(MATCH_NOMATCH);
3858 }
3859 switch(*eptr++)
3860 {
3861 default: RRETURN(MATCH_NOMATCH);
3862 case 0x0a: /* LF */
3863 case 0x0b: /* VT */
3864 case 0x0c: /* FF */
3865 case 0x0d: /* CR */
3866 case 0x85: /* NEL */
3867 break;
3868 }
3869 }
3870 break;
3871
3872 case OP_NOT_DIGIT:
3873 for (i = 1; i <= min; i++)
3874 {
3875 if (eptr >= md->end_subject)
3876 {
3877 SCHECK_PARTIAL();
3878 RRETURN(MATCH_NOMATCH);
3879 }
3880 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3881 }
3882 break;
3883
3884 case OP_DIGIT:
3885 for (i = 1; i <= min; i++)
3886 {
3887 if (eptr >= md->end_subject)
3888 {
3889 SCHECK_PARTIAL();
3890 RRETURN(MATCH_NOMATCH);
3891 }
3892 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3893 }
3894 break;
3895
3896 case OP_NOT_WHITESPACE:
3897 for (i = 1; i <= min; i++)
3898 {
3899 if (eptr >= md->end_subject)
3900 {
3901 SCHECK_PARTIAL();
3902 RRETURN(MATCH_NOMATCH);
3903 }
3904 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3905 }
3906 break;
3907
3908 case OP_WHITESPACE:
3909 for (i = 1; i <= min; i++)
3910 {
3911 if (eptr >= md->end_subject)
3912 {
3913 SCHECK_PARTIAL();
3914 RRETURN(MATCH_NOMATCH);
3915 }
3916 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3917 }
3918 break;
3919
3920 case OP_NOT_WORDCHAR:
3921 for (i = 1; i <= min; i++)
3922 {
3923 if (eptr >= md->end_subject)
3924 {
3925 SCHECK_PARTIAL();
3926 RRETURN(MATCH_NOMATCH);
3927 }
3928 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3929 RRETURN(MATCH_NOMATCH);
3930 }
3931 break;
3932
3933 case OP_WORDCHAR:
3934 for (i = 1; i <= min; i++)
3935 {
3936 if (eptr >= md->end_subject)
3937 {
3938 SCHECK_PARTIAL();
3939 RRETURN(MATCH_NOMATCH);
3940 }
3941 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3942 RRETURN(MATCH_NOMATCH);
3943 }
3944 break;
3945
3946 default:
3947 RRETURN(PCRE_ERROR_INTERNAL);
3948 }
3949 }
3950
3951 /* If min = max, continue at the same level without recursing */
3952
3953 if (min == max) continue;
3954
3955 /* If minimizing, we have to test the rest of the pattern before each
3956 subsequent match. Again, separate the UTF-8 case for speed, and also
3957 separate the UCP cases. */
3958
3959 if (minimize)
3960 {
3961 #ifdef SUPPORT_UCP
3962 if (prop_type >= 0)
3963 {
3964 switch(prop_type)
3965 {
3966 case PT_ANY:
3967 for (fi = min;; fi++)
3968 {
3969 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3970 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3971 if (fi >= max) RRETURN(MATCH_NOMATCH);
3972 if (eptr >= md->end_subject)
3973 {
3974 SCHECK_PARTIAL();
3975 RRETURN(MATCH_NOMATCH);
3976 }
3977 GETCHARINC(c, eptr);
3978 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3979 }
3980 /* Control never gets here */
3981
3982 case PT_LAMP:
3983 for (fi = min;; fi++)
3984 {
3985 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3986 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3987 if (fi >= max) RRETURN(MATCH_NOMATCH);
3988 if (eptr >= md->end_subject)
3989 {
3990 SCHECK_PARTIAL();
3991 RRETURN(MATCH_NOMATCH);
3992 }
3993 GETCHARINC(c, eptr);
3994 prop_chartype = UCD_CHARTYPE(c);
3995 if ((prop_chartype == ucp_Lu ||
3996 prop_chartype == ucp_Ll ||
3997 prop_chartype == ucp_Lt) == prop_fail_result)
3998 RRETURN(MATCH_NOMATCH);
3999 }
4000 /* Control never gets here */
4001
4002 case PT_GC:
4003 for (fi = min;; fi++)
4004 {
4005 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4006 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4007 if (fi >= max) RRETURN(MATCH_NOMATCH);
4008 if (eptr >= md->end_subject)
4009 {
4010 SCHECK_PARTIAL();
4011 RRETURN(MATCH_NOMATCH);
4012 }
4013 GETCHARINC(c, eptr);
4014 prop_category = UCD_CATEGORY(c);
4015 if ((prop_category == prop_value) == prop_fail_result)
4016 RRETURN(MATCH_NOMATCH);
4017 }
4018 /* Control never gets here */
4019
4020 case PT_PC:
4021 for (fi = min;; fi++)
4022 {
4023 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4024 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4025 if (fi >= max) RRETURN(MATCH_NOMATCH);
4026 if (eptr >= md->end_subject)
4027 {
4028 SCHECK_PARTIAL();
4029 RRETURN(MATCH_NOMATCH);
4030 }
4031 GETCHARINC(c, eptr);
4032 prop_chartype = UCD_CHARTYPE(c);
4033 if ((prop_chartype == prop_value) == prop_fail_result)
4034 RRETURN(MATCH_NOMATCH);
4035 }
4036 /* Control never gets here */
4037
4038 case PT_SC:
4039 for (fi = min;; fi++)
4040 {
4041 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4042 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4043 if (fi >= max) RRETURN(MATCH_NOMATCH);
4044 if (eptr >= md->end_subject)
4045 {
4046 SCHECK_PARTIAL();
4047 RRETURN(MATCH_NOMATCH);
4048 }
4049 GETCHARINC(c, eptr);
4050 prop_script = UCD_SCRIPT(c);
4051 if ((prop_script == prop_value) == prop_fail_result)
4052 RRETURN(MATCH_NOMATCH);
4053 }
4054 /* Control never gets here */
4055
4056 default:
4057 RRETURN(PCRE_ERROR_INTERNAL);
4058 }
4059 }
4060
4061 /* Match extended Unicode sequences. We will get here only if the
4062 support is in the binary; otherwise a compile-time error occurs. */
4063
4064 else if (ctype == OP_EXTUNI)
4065 {
4066 for (fi = min;; fi++)
4067 {
4068 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4069 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4070 if (fi >= max) RRETURN(MATCH_NOMATCH);
4071 if (eptr >= md->end_subject)
4072 {
4073 SCHECK_PARTIAL();
4074 RRETURN(MATCH_NOMATCH);
4075 }
4076 GETCHARINCTEST(c, eptr);
4077 prop_category = UCD_CATEGORY(c);
4078 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
4079 while (eptr < md->end_subject)
4080 {
4081 int len = 1;
4082 if (!utf8) c = *eptr;
4083 else { GETCHARLEN(c, eptr, len); }
4084 prop_category = UCD_CATEGORY(c);
4085 if (prop_category != ucp_M) break;
4086 eptr += len;
4087 }
4088 }
4089 }
4090
4091 else
4092 #endif /* SUPPORT_UCP */
4093
4094 #ifdef SUPPORT_UTF8
4095 /* UTF-8 mode */
4096 if (utf8)
4097 {
4098 for (fi = min;; fi++)
4099 {
4100 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4101 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4102 if (fi >= max) RRETURN(MATCH_NOMATCH);
4103 if (eptr >= md->end_subject)
4104 {
4105 SCHECK_PARTIAL();
4106 RRETURN(MATCH_NOMATCH);
4107 }
4108 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4109 RRETURN(MATCH_NOMATCH);
4110 GETCHARINC(c, eptr);
4111 switch(ctype)
4112 {
4113 case OP_ANY: /* This is the non-NL case */
4114 case OP_ALLANY:
4115 case OP_ANYBYTE:
4116 break;
4117
4118 case OP_ANYNL:
4119 switch(c)
4120 {
4121 default: RRETURN(MATCH_NOMATCH);
4122 case 0x000d:
4123 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4124 break;
4125 case 0x000a:
4126 break;
4127
4128 case 0x000b:
4129 case 0x000c:
4130 case 0x0085:
4131 case 0x2028:
4132 case 0x2029:
4133 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4134 break;
4135 }
4136 break;
4137
4138 case OP_NOT_HSPACE:
4139 switch(c)
4140 {
4141 default: break;
4142 case 0x09: /* HT */
4143 case 0x20: /* SPACE */
4144 case 0xa0: /* NBSP */
4145 case 0x1680: /* OGHAM SPACE MARK */
4146 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4147 case 0x2000: /* EN QUAD */
4148 case 0x2001: /* EM QUAD */
4149 case 0x2002: /* EN SPACE */
4150 case 0x2003: /* EM SPACE */
4151 case 0x2004: /* THREE-PER-EM SPACE */
4152 case 0x2005: /* FOUR-PER-EM SPACE */
4153 case 0x2006: /* SIX-PER-EM SPACE */
4154 case 0x2007: /* FIGURE SPACE */
4155 case 0x2008: /* PUNCTUATION SPACE */
4156 case 0x2009: /* THIN SPACE */
4157 case 0x200A: /* HAIR SPACE */
4158 case 0x202f: /* NARROW NO-BREAK SPACE */
4159 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4160 case 0x3000: /* IDEOGRAPHIC SPACE */
4161 RRETURN(MATCH_NOMATCH);
4162 }
4163 break;
4164
4165 case OP_HSPACE:
4166 switch(c)
4167 {
4168 default: RRETURN(MATCH_NOMATCH);
4169 case 0x09: /* HT */
4170 case 0x20: /* SPACE */
4171 case 0xa0: /* NBSP */
4172 case 0x1680: /* OGHAM SPACE MARK */
4173 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4174 case 0x2000: /* EN QUAD */
4175 case 0x2001: /* EM QUAD */
4176 case 0x2002: /* EN SPACE */
4177 case 0x2003: /* EM SPACE */
4178 case 0x2004: /* THREE-PER-EM SPACE */
4179 case 0x2005: /* FOUR-PER-EM SPACE */
4180 case 0x2006: /* SIX-PER-EM SPACE */
4181 case 0x2007: /* FIGURE SPACE */
4182 case 0x2008: /* PUNCTUATION SPACE */
4183 case 0x2009: /* THIN SPACE */
4184 case 0x200A: /* HAIR SPACE */
4185 case 0x202f: /* NARROW NO-BREAK SPACE */
4186 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4187 case 0x3000: /* IDEOGRAPHIC SPACE */
4188 break;
4189 }
4190 break;
4191
4192 case OP_NOT_VSPACE:
4193 switch(c)
4194 {
4195 default: break;
4196 case 0x0a: /* LF */
4197 case 0x0b: /* VT */
4198 case 0x0c: /* FF */
4199 case 0x0d: /* CR */
4200 case 0x85: /* NEL */
4201 case 0x2028: /* LINE SEPARATOR */
4202 case 0x2029: /* PARAGRAPH SEPARATOR */
4203 RRETURN(MATCH_NOMATCH);
4204 }
4205 break;
4206
4207 case OP_VSPACE:
4208 switch(c)
4209 {
4210 default: RRETURN(MATCH_NOMATCH);
4211 case 0x0a: /* LF */
4212 case 0x0b: /* VT */
4213 case 0x0c: /* FF */
4214 case 0x0d: /* CR */
4215 case 0x85: /* NEL */
4216 case 0x2028: /* LINE SEPARATOR */
4217 case 0x2029: /* PARAGRAPH SEPARATOR */
4218 break;
4219 }
4220 break;
4221
4222 case OP_NOT_DIGIT:
4223 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4224 RRETURN(MATCH_NOMATCH);
4225 break;
4226
4227 case OP_DIGIT:
4228 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4229 RRETURN(MATCH_NOMATCH);
4230 break;
4231
4232 case OP_NOT_WHITESPACE:
4233 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4234 RRETURN(MATCH_NOMATCH);
4235 break;
4236
4237 case OP_WHITESPACE:
4238 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4239 RRETURN(MATCH_NOMATCH);
4240 break;
4241
4242 case OP_NOT_WORDCHAR:
4243 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4244 RRETURN(MATCH_NOMATCH);
4245 break;
4246
4247 case OP_WORDCHAR:
4248 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4249 RRETURN(MATCH_NOMATCH);
4250 break;
4251
4252 default:
4253 RRETURN(PCRE_ERROR_INTERNAL);
4254 }
4255 }
4256 }
4257 else
4258 #endif
4259 /* Not UTF-8 mode */
4260 {
4261 for (fi = min;; fi++)
4262 {
4263 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4264 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4265 if (fi >= max) RRETURN(MATCH_NOMATCH);
4266 if (eptr >= md->end_subject)
4267 {
4268 SCHECK_PARTIAL();
4269 RRETURN(MATCH_NOMATCH);
4270 }
4271 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4272 RRETURN(MATCH_NOMATCH);
4273 c = *eptr++;
4274 switch(ctype)
4275 {
4276 case OP_ANY: /* This is the non-NL case */
4277 case OP_ALLANY:
4278 case OP_ANYBYTE:
4279 break;
4280
4281 case OP_ANYNL:
4282 switch(c)
4283 {
4284 default: RRETURN(MATCH_NOMATCH);
4285 case 0x000d:
4286 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4287 break;
4288
4289 case 0x000a:
4290 break;
4291
4292 case 0x000b:
4293 case 0x000c:
4294 case 0x0085:
4295 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4296 break;
4297 }
4298 break;
4299
4300 case OP_NOT_HSPACE:
4301 switch(c)
4302 {
4303 default: break;
4304 case 0x09: /* HT */
4305 case 0x20: /* SPACE */
4306 case 0xa0: /* NBSP */
4307 RRETURN(MATCH_NOMATCH);
4308 }
4309 break;
4310
4311 case OP_HSPACE:
4312 switch(c)
4313 {
4314 default: RRETURN(MATCH_NOMATCH);
4315 case 0x09: /* HT */
4316 case 0x20: /* SPACE */
4317 case 0xa0: /* NBSP */
4318 break;
4319 }
4320 break;
4321
4322 case OP_NOT_VSPACE:
4323 switch(c)
4324 {
4325 default: break;
4326 case 0x0a: /* LF */
4327 case 0x0b: /* VT */
4328 case 0x0c: /* FF */
4329 case 0x0d: /* CR */
4330 case 0x85: /* NEL */
4331 RRETURN(MATCH_NOMATCH);
4332 }
4333 break;
4334
4335 case OP_VSPACE:
4336 switch(c)
4337 {
4338 default: RRETURN(MATCH_NOMATCH);
4339 case 0x0a: /* LF */
4340 case 0x0b: /* VT */
4341 case 0x0c: /* FF */
4342 case 0x0d: /* CR */
4343 case 0x85: /* NEL */
4344 break;
4345 }
4346 break;
4347
4348 case OP_NOT_DIGIT:
4349 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
4350 break;
4351
4352 case OP_DIGIT:
4353 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
4354 break;
4355
4356 case OP_NOT_WHITESPACE:
4357 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
4358 break;
4359
4360 case OP_WHITESPACE:
4361 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
4362 break;
4363
4364 case OP_NOT_WORDCHAR:
4365 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
4366 break;
4367
4368 case OP_WORDCHAR:
4369 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
4370 break;
4371
4372 default:
4373 RRETURN(PCRE_ERROR_INTERNAL);
4374 }
4375 }
4376 }
4377 /* Control never gets here */
4378 }
4379
4380 /* If maximizing, it is worth using inline code for speed, doing the type
4381 test once at the start (i.e. keep it out of the loop). Again, keep the
4382 UTF-8 and UCP stuff separate. */
4383
4384 else
4385 {
4386 pp = eptr; /* Remember where we started */
4387
4388 #ifdef SUPPORT_UCP
4389 if (prop_type >= 0)
4390 {
4391 switch(prop_type)
4392 {
4393 case PT_ANY:
4394 for (i = min; i < max; i++)
4395 {
4396 int len = 1;
4397 if (eptr >= md->end_subject)
4398 {
4399 SCHECK_PARTIAL();
4400 break;
4401 }
4402 GETCHARLEN(c, eptr, len);
4403 if (prop_fail_result) break;
4404 eptr+= len;
4405 }
4406 break;
4407
4408 case PT_LAMP:
4409 for (i = min; i < max; i++)
4410 {
4411 int len = 1;
4412 if (eptr >= md->end_subject)
4413 {
4414 SCHECK_PARTIAL();
4415 break;
4416 }
4417 GETCHARLEN(c, eptr, len);
4418 prop_chartype = UCD_CHARTYPE(c);
4419 if ((prop_chartype == ucp_Lu ||
4420 prop_chartype == ucp_Ll ||
4421 prop_chartype == ucp_Lt) == prop_fail_result)
4422 break;
4423 eptr+= len;
4424 }
4425 break;
4426
4427 case PT_GC:
4428 for (i = min; i < max; i++)
4429 {
4430 int len = 1;
4431 if (eptr >= md->end_subject)
4432 {
4433 SCHECK_PARTIAL();
4434 break;
4435 }
4436 GETCHARLEN(c, eptr, len);
4437 prop_category = UCD_CATEGORY(c);
4438 if ((prop_category == prop_value) == prop_fail_result)
4439 break;
4440 eptr+= len;
4441 }
4442 break;
4443
4444 case PT_PC:
4445 for (i = min; i < max; i++)
4446 {
4447 int len = 1;
4448 if (eptr >= md->end_subject)
4449 {
4450 SCHECK_PARTIAL();
4451 break;
4452 }
4453 GETCHARLEN(c, eptr, len);
4454 prop_chartype = UCD_CHARTYPE(c);
4455 if ((prop_chartype == prop_value) == prop_fail_result)
4456 break;
4457 eptr+= len;
4458 }
4459 break;
4460
4461 case PT_SC:
4462 for (i = min; i < max; i++)
4463 {
4464 int len = 1;
4465 if (eptr >= md->end_subject)
4466 {
4467 SCHECK_PARTIAL();
4468 break;
4469 }
4470 GETCHARLEN(c, eptr, len);
4471 prop_script = UCD_SCRIPT(c);
4472 if ((prop_script == prop_value) == prop_fail_result)
4473 break;
4474 eptr+= len;
4475 }
4476 break;
4477 }
4478
4479 /* eptr is now past the end of the maximum run */
4480
4481 if (possessive) continue;
4482 for(;;)
4483 {
4484 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4485 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4486 if (eptr-- == pp) break; /* Stop if tried at original pos */
4487 if (utf8) BACKCHAR(eptr);
4488 }
4489 }
4490
4491 /* Match extended Unicode sequences. We will get here only if the
4492 support is in the binary; otherwise a compile-time error occurs. */
4493
4494 else if (ctype == OP_EXTUNI)
4495 {
4496 for (i = min; i < max; i++)
4497 {
4498 if (eptr >= md->end_subject)
4499 {
4500 SCHECK_PARTIAL();
4501 break;
4502 }
4503 GETCHARINCTEST(c, eptr);
4504 prop_category = UCD_CATEGORY(c);
4505 if (prop_category == ucp_M) break;
4506 while (eptr < md->end_subject)
4507 {
4508 int len = 1;
4509 if (!utf8) c = *eptr; else
4510 {
4511 GETCHARLEN(c, eptr, len);
4512 }
4513 prop_category = UCD_CATEGORY(c);
4514 if (prop_category != ucp_M) break;
4515 eptr += len;
4516 }
4517 }
4518
4519 /* eptr is now past the end of the maximum run */
4520
4521 if (possessive) continue;
4522
4523 for(;;)
4524 {
4525 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4526 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4527 if (eptr-- == pp) break; /* Stop if tried at original pos */
4528 for (;;) /* Move back over one extended */
4529 {
4530 int len = 1;
4531 if (!utf8) c = *eptr; else
4532 {
4533 BACKCHAR(eptr);
4534 GETCHARLEN(c, eptr, len);
4535 }
4536 prop_category = UCD_CATEGORY(c);
4537 if (prop_category != ucp_M) break;
4538 eptr--;
4539 }
4540 }
4541 }
4542
4543 else
4544 #endif /* SUPPORT_UCP */
4545
4546 #ifdef SUPPORT_UTF8
4547 /* UTF-8 mode */
4548
4549 if (utf8)
4550 {
4551 switch(ctype)
4552 {
4553 case OP_ANY:
4554 if (max < INT_MAX)
4555 {
4556 for (i = min; i < max; i++)
4557 {
4558 if (eptr >= md->end_subject)
4559 {
4560 SCHECK_PARTIAL();
4561 break;
4562 }
4563 if (IS_NEWLINE(eptr)) break;
4564 eptr++;
4565 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4566 }
4567 }
4568
4569 /* Handle unlimited UTF-8 repeat */
4570
4571 else
4572 {
4573 for (i = min; i < max; i++)
4574 {
4575 if (eptr >= md->end_subject)
4576 {
4577 SCHECK_PARTIAL();
4578 break;
4579 }
4580 if (IS_NEWLINE(eptr)) break;
4581 eptr++;
4582 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4583 }
4584 }
4585 break;
4586
4587 case OP_ALLANY:
4588 if (max < INT_MAX)
4589 {
4590 for (i = min; i < max; i++)
4591 {
4592 if (eptr >= md->end_subject)
4593 {
4594 SCHECK_PARTIAL();
4595 break;
4596 }
4597 eptr++;
4598 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4599 }
4600 }
4601 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
4602 break;
4603
4604 /* The byte case is the same as non-UTF8 */
4605
4606 case OP_ANYBYTE:
4607 c = max - min;
4608 if (c > (unsigned int)(md->end_subject - eptr))
4609 {
4610 eptr = md->end_subject;
4611 SCHECK_PARTIAL();
4612 }
4613 else eptr += c;
4614 break;
4615
4616 case OP_ANYNL:
4617 for (i = min; i < max; i++)
4618 {
4619 int len = 1;
4620 if (eptr >= md->end_subject)
4621 {
4622 SCHECK_PARTIAL();
4623 break;
4624 }
4625 GETCHARLEN(c, eptr, len);
4626 if (c == 0x000d)
4627 {
4628 if (++eptr >= md->end_subject) break;
4629 if (*eptr == 0x000a) eptr++;
4630 }
4631 else
4632 {
4633 if (c != 0x000a &&
4634 (md->bsr_anycrlf ||
4635 (c != 0x000b && c != 0x000c &&
4636 c != 0x0085 && c != 0x2028 && c != 0x2029)))
4637 break;
4638 eptr += len;
4639 }
4640 }
4641 break;
4642
4643 case OP_NOT_HSPACE:
4644 case OP_HSPACE:
4645 for (i = min; i < max; i++)
4646 {
4647 BOOL gotspace;
4648 int len = 1;
4649 if (eptr >= md->end_subject)
4650 {
4651 SCHECK_PARTIAL();
4652 break;
4653 }
4654 GETCHARLEN(c, eptr, len);
4655 switch(c)
4656 {
4657 default: gotspace = FALSE; break;
4658 case 0x09: /* HT */
4659 case 0x20: /* SPACE */
4660 case 0xa0: /* NBSP */
4661 case 0x1680: /* OGHAM SPACE MARK */
4662 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4663 case 0x2000: /* EN QUAD */
4664 case 0x2001: /* EM QUAD */
4665 case 0x2002: /* EN SPACE */
4666 case 0x2003: /* EM SPACE */
4667 case 0x2004: /* THREE-PER-EM SPACE */
4668 case 0x2005: /* FOUR-PER-EM SPACE */
4669 case 0x2006: /* SIX-PER-EM SPACE */
4670 case 0x2007: /* FIGURE SPACE */
4671 case 0x2008: /* PUNCTUATION SPACE */
4672 case 0x2009: /* THIN SPACE */
4673 case 0x200A: /* HAIR SPACE */
4674 case 0x202f: /* NARROW NO-BREAK SPACE */
4675 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4676 case 0x3000: /* IDEOGRAPHIC SPACE */
4677 gotspace = TRUE;
4678 break;
4679 }
4680 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4681 eptr += len;
4682 }
4683 break;
4684
4685 case OP_NOT_VSPACE:
4686 case OP_VSPACE:
4687 for (i = min; i < max; i++)
4688 {
4689 BOOL gotspace;
4690 int len = 1;
4691 if (eptr >= md->end_subject)
4692 {
4693 SCHECK_PARTIAL();
4694 break;
4695 }
4696 GETCHARLEN(c, eptr, len);
4697 switch(c)
4698 {
4699 default: gotspace = FALSE; break;
4700 case 0x0a: /* LF */
4701 case 0x0b: /* VT */
4702 case 0x0c: /* FF */
4703 case 0x0d: /* CR */
4704 case 0x85: /* NEL */
4705 case 0x2028: /* LINE SEPARATOR */
4706 case 0x2029: /* PARAGRAPH SEPARATOR */
4707 gotspace = TRUE;
4708 break;
4709 }
4710 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4711 eptr += len;
4712 }
4713 break;
4714
4715 case OP_NOT_DIGIT:
4716 for (i = min; i < max; i++)
4717 {
4718 int len = 1;
4719 if (eptr >= md->end_subject)
4720 {
4721 SCHECK_PARTIAL();
4722 break;
4723 }
4724 GETCHARLEN(c, eptr, len);
4725 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4726 eptr+= len;
4727 }
4728 break;
4729
4730 case OP_DIGIT:
4731 for (i = min; i < max; i++)
4732 {
4733 int len = 1;
4734 if (eptr >= md->end_subject)
4735 {
4736 SCHECK_PARTIAL();
4737 break;
4738 }
4739 GETCHARLEN(c, eptr, len);
4740 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4741 eptr+= len;
4742 }
4743 break;
4744
4745 case OP_NOT_WHITESPACE:
4746 for (i = min; i < max; i++)
4747 {
4748 int len = 1;
4749 if (eptr >= md->end_subject)
4750 {
4751 SCHECK_PARTIAL();
4752 break;
4753 }
4754 GETCHARLEN(c, eptr, len);
4755 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4756 eptr+= len;
4757 }
4758 break;
4759
4760 case OP_WHITESPACE:
4761 for (i = min; i < max; i++)
4762 {
4763 int len = 1;
4764 if (eptr >= md->end_subject)
4765 {
4766 SCHECK_PARTIAL();
4767 break;
4768 }
4769 GETCHARLEN(c, eptr, len);
4770 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4771 eptr+= len;
4772 }
4773 break;
4774
4775 case OP_NOT_WORDCHAR:
4776 for (i = min; i < max; i++)
4777 {
4778 int len = 1;
4779 if (eptr >= md->end_subject)
4780 {
4781 SCHECK_PARTIAL();
4782 break;
4783 }
4784 GETCHARLEN(c, eptr, len);
4785 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4786 eptr+= len;
4787 }
4788 break;
4789
4790 case OP_WORDCHAR:
4791 for (i = min; i < max; i++)
4792 {
4793 int len = 1;
4794 if (eptr >= md->end_subject)
4795 {
4796 SCHECK_PARTIAL();
4797 break;
4798 }
4799 GETCHARLEN(c, eptr, len);
4800 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4801 eptr+= len;
4802 }
4803 break;
4804
4805 default:
4806 RRETURN(PCRE_ERROR_INTERNAL);
4807 }
4808
4809 /* eptr is now past the end of the maximum run */
4810
4811 if (possessive) continue;
4812 for(;;)
4813 {
4814 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4815 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4816 if (eptr-- == pp) break; /* Stop if tried at original pos */
4817 BACKCHAR(eptr);
4818 }
4819 }
4820 else
4821 #endif /* SUPPORT_UTF8 */
4822
4823 /* Not UTF-8 mode */
4824 {
4825 switch(ctype)
4826 {
4827 case OP_ANY:
4828 for (i = min; i < max; i++)
4829 {
4830 if (eptr >= md->end_subject)
4831 {
4832 SCHECK_PARTIAL();
4833 break;
4834 }
4835 if (IS_NEWLINE(eptr)) break;
4836 eptr++;
4837 }
4838 break;
4839
4840 case OP_ALLANY:
4841 case OP_ANYBYTE:
4842 c = max - min;
4843 if (c > (unsigned int)(md->end_subject - eptr))
4844 {
4845 eptr = md->end_subject;
4846 SCHECK_PARTIAL();
4847 }
4848 else eptr += c;
4849 break;
4850
4851 case OP_ANYNL:
4852 for (i = min; i < max; i++)
4853 {
4854 if (eptr >= md->end_subject)
4855 {
4856 SCHECK_PARTIAL();
4857 break;
4858 }
4859 c = *eptr;
4860 if (c == 0x000d)
4861 {
4862 if (++eptr >= md->end_subject) break;
4863 if (*eptr == 0x000a) eptr++;
4864 }
4865 else
4866 {
4867 if (c != 0x000a &&
4868 (md->bsr_anycrlf ||
4869 (c != 0x000b && c != 0x000c && c != 0x0085)))
4870 break;
4871 eptr++;
4872 }
4873 }
4874 break;
4875
4876 case OP_NOT_HSPACE:
4877 for (i = min; i < max; i++)
4878 {
4879 if (eptr >= md->end_subject)
4880 {
4881 SCHECK_PARTIAL();
4882 break;
4883 }
4884 c = *eptr;
4885 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4886 eptr++;
4887 }
4888 break;
4889
4890 case OP_HSPACE:
4891 for (i = min; i < max; i++)
4892 {
4893 if (eptr >= md->end_subject)
4894 {
4895 SCHECK_PARTIAL();
4896 break;
4897 }
4898 c = *eptr;
4899 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4900 eptr++;
4901 }
4902 break;
4903
4904 case OP_NOT_VSPACE:
4905 for (i = min; i < max; i++)
4906 {
4907 if (eptr >= md->end_subject)
4908 {
4909 SCHECK_PARTIAL();
4910 break;
4911 }
4912 c = *eptr;
4913 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4914 break;
4915 eptr++;
4916 }
4917 break;
4918
4919 case OP_VSPACE:
4920 for (i = min; i < max; i++)
4921 {
4922 if (eptr >= md->end_subject)
4923 {
4924 SCHECK_PARTIAL();
4925 break;
4926 }
4927 c = *eptr;
4928 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4929 break;
4930 eptr++;
4931 }
4932 break;
4933
4934 case OP_NOT_DIGIT:
4935 for (i = min; i < max; i++)
4936 {
4937 if (eptr >= md->end_subject)
4938 {
4939 SCHECK_PARTIAL();
4940 break;
4941 }
4942 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
4943 eptr++;
4944 }
4945 break;
4946
4947 case OP_DIGIT:
4948 for (i = min; i < max; i++)
4949 {
4950 if (eptr >= md->end_subject)
4951 {
4952 SCHECK_PARTIAL();
4953 break;
4954 }
4955 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
4956 eptr++;
4957 }
4958 break;
4959
4960 case OP_NOT_WHITESPACE:
4961 for (i = min; i < max; i++)
4962 {
4963 if (eptr >= md->end_subject)
4964 {
4965 SCHECK_PARTIAL();
4966 break;
4967 }
4968 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
4969 eptr++;
4970 }
4971 break;
4972
4973 case OP_WHITESPACE:
4974 for (i = min; i < max; i++)
4975 {
4976 if (eptr >= md->end_subject)
4977 {
4978 SCHECK_PARTIAL();
4979 break;
4980 }
4981 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
4982 eptr++;
4983 }
4984 break;
4985
4986 case OP_NOT_WORDCHAR:
4987 for (i = min; i < max; i++)
4988 {
4989 if (eptr >= md->end_subject)
4990 {
4991 SCHECK_PARTIAL();
4992 break;
4993 }
4994 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
4995 eptr++;
4996 }
4997 break;
4998
4999 case OP_WORDCHAR:
5000 for (i = min; i < max; i++)
5001 {
5002 if (eptr >= md->end_subject)
5003 {
5004 SCHECK_PARTIAL();
5005 break;
5006 }
5007 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5008 eptr++;
5009 }
5010 break;
5011
5012 default:
5013 RRETURN(PCRE_ERROR_INTERNAL);
5014 }
5015
5016 /* eptr is now past the end of the maximum run */
5017
5018 if (possessive) continue;
5019 while (eptr >= pp)
5020 {
5021 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5022 eptr--;
5023 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5024 }
5025 }
5026
5027 /* Get here if we can't make it match with any permitted repetitions */
5028
5029 RRETURN(MATCH_NOMATCH);
5030 }
5031 /* Control never gets here */
5032
5033 /* There's been some horrible disaster. Arrival here can only mean there is
5034 something seriously wrong in the code above or the OP_xxx definitions. */
5035
5036 default:
5037 DPRINTF(("Unknown opcode %d\n", *ecode));
5038 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5039 }
5040
5041 /* Do not stick any code in here without much thought; it is assumed
5042 that "continue" in the code above comes out to here to repeat the main
5043 loop. */
5044
5045 } /* End of main loop */
5046 /* Control never reaches here */
5047
5048
5049 /* When compiling to use the heap rather than the stack for recursive calls to
5050 match(), the RRETURN() macro jumps here. The number that is saved in
5051 frame->Xwhere indicates which label we actually want to return to. */
5052
5053 #ifdef NO_RECURSE
5054 #define LBL(val) case val: goto L_RM##val;
5055 HEAP_RETURN:
5056 switch (frame->Xwhere)
5057 {
5058 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5059 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5060 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5061 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5062 LBL(53) LBL(54)
5063 #ifdef SUPPORT_UTF8
5064 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5065 LBL(32) LBL(34) LBL(42) LBL(46)
5066 #ifdef SUPPORT_UCP
5067 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5068 #endif /* SUPPORT_UCP */
5069 #endif /* SUPPORT_UTF8 */
5070 default:
5071 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5072 return PCRE_ERROR_INTERNAL;
5073 }
5074 #undef LBL
5075 #endif /* NO_RECURSE */
5076 }
5077
5078
5079 /***************************************************************************
5080 ****************************************************************************
5081 RECURSION IN THE match() FUNCTION
5082
5083 Undefine all the macros that were defined above to handle this. */
5084
5085 #ifdef NO_RECURSE
5086 #undef eptr
5087 #undef ecode
5088 #undef mstart
5089 #undef offset_top
5090 #undef ims
5091 #undef eptrb
5092 #undef flags
5093
5094 #undef callpat
5095 #undef charptr
5096 #undef data
5097 #undef next
5098 #undef pp
5099 #undef prev
5100 #undef saved_eptr
5101
5102 #undef new_recursive
5103
5104 #undef cur_is_word
5105 #undef condition
5106 #undef prev_is_word
5107
5108 #undef original_ims
5109
5110 #undef ctype
5111 #undef length
5112 #undef max
5113 #undef min
5114 #undef number
5115 #undef offset
5116 #undef op
5117 #undef save_capture_last
5118 #undef save_offset1
5119 #undef save_offset2
5120 #undef save_offset3
5121 #undef stacksave
5122
5123 #undef newptrb
5124
5125 #endif
5126
5127 /* These two are defined as macros in both cases */
5128
5129 #undef fc
5130 #undef fi
5131
5132 /***************************************************************************
5133 ***************************************************************************/
5134
5135
5136
5137 /*************************************************
5138 * Execute a Regular Expression *
5139 *************************************************/
5140
5141 /* This function applies a compiled re to a subject string and picks out
5142 portions of the string if it matches. Two elements in the vector are set for
5143 each substring: the offsets to the start and end of the substring.
5144
5145 Arguments:
5146 argument_re points to the compiled expression
5147 extra_data points to extra data or is NULL
5148 subject points to the subject string
5149 length length of subject string (may contain binary zeros)
5150 start_offset where to start in the subject string
5151 options option bits
5152 offsets points to a vector of ints to be filled in with offsets
5153 offsetcount the number of elements in the vector
5154
5155 Returns: > 0 => success; value is the number of elements filled in
5156 = 0 => success, but offsets is not big enough
5157 -1 => failed to match
5158 < -1 => some kind of unexpected problem
5159 */
5160
5161 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5162 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5163 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5164 int offsetcount)
5165 {
5166 int rc, resetcount, ocount;
5167 int first_byte = -1;
5168 int req_byte = -1;
5169 int req_byte2 = -1;
5170 int newline;
5171 unsigned long int ims;
5172 BOOL using_temporary_offsets = FALSE;
5173 BOOL anchored;
5174 BOOL startline;
5175 BOOL firstline;
5176 BOOL first_byte_caseless = FALSE;
5177 BOOL req_byte_caseless = FALSE;
5178 BOOL utf8;
5179 match_data match_block;
5180 match_data *md = &match_block;
5181 const uschar *tables;
5182 const uschar *start_bits = NULL;
5183 USPTR start_match = (USPTR)subject + start_offset;
5184 USPTR end_subject;
5185 USPTR start_partial = NULL;
5186 USPTR req_byte_ptr = start_match - 1;
5187
5188 pcre_study_data internal_study;
5189 const pcre_study_data *study;
5190
5191 real_pcre internal_re;
5192 const real_pcre *external_re = (const real_pcre *)argument_re;
5193 const real_pcre *re = external_re;
5194
5195 /* Plausibility checks */
5196
5197 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5198 if (re == NULL || subject == NULL ||
5199 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5200 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5201
5202 /* This information is for finding all the numbers associated with a given
5203 name, for condition testing. */
5204
5205 md->name_table = (uschar *)re + re->name_table_offset;
5206 md->name_count = re->name_count;
5207 md->name_entry_size = re->name_entry_size;
5208
5209 /* Fish out the optional data from the extra_data structure, first setting
5210 the default values. */
5211
5212 study = NULL;
5213 md->match_limit = MATCH_LIMIT;
5214 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5215 md->callout_data = NULL;
5216
5217 /* The table pointer is always in native byte order. */
5218
5219 tables = external_re->tables;
5220
5221 if (extra_data != NULL)
5222 {
5223 register unsigned int flags = extra_data->flags;
5224 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5225 study = (const pcre_study_data *)extra_data->study_data;
5226 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5227 md->match_limit = extra_data->match_limit;
5228 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5229 md->match_limit_recursion = extra_data->match_limit_recursion;
5230 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5231 md->callout_data = extra_data->callout_data;
5232 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5233 }
5234
5235 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5236 is a feature that makes it possible to save compiled regex and re-use them
5237 in other programs later. */
5238
5239 if (tables == NULL) tables = _pcre_default_tables;
5240
5241 /* Check that the first field in the block is the magic number. If it is not,
5242 test for a regex that was compiled on a host of opposite endianness. If this is
5243 the case, flipped values are put in internal_re and internal_study if there was
5244 study data too. */
5245
5246 if (re->magic_number != MAGIC_NUMBER)
5247 {
5248 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5249 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5250 if (study != NULL) study = &internal_study;
5251 }
5252
5253 /* Set up other data */
5254
5255 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5256 startline = (re->flags & PCRE_STARTLINE) != 0;
5257 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5258
5259 /* The code starts after the real_pcre block and the capture name table. */
5260
5261 md->start_code = (const uschar *)external_re + re->name_table_offset +
5262 re->name_count * re->name_entry_size;
5263
5264 md->start_subject = (USPTR)subject;
5265 md->start_offset = start_offset;
5266 md->end_subject = md->start_subject + length;
5267 end_subject = md->end_subject;
5268
5269 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5270 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5271 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5272
5273 md->notbol = (options & PCRE_NOTBOL) != 0;
5274 md->noteol = (options & PCRE_NOTEOL) != 0;
5275 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5276 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5277 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5278 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5279 md->hitend = FALSE;
5280
5281 md->recursive = NULL; /* No recursion at top level */
5282
5283 md->lcc = tables + lcc_offset;
5284 md->ctypes = tables + ctypes_offset;
5285
5286 /* Handle different \R options. */
5287
5288 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5289 {
5290 case 0:
5291 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5292 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5293 else
5294 #ifdef BSR_ANYCRLF
5295 md->bsr_anycrlf = TRUE;
5296 #else
5297 md->bsr_anycrlf = FALSE;
5298 #endif
5299 break;
5300
5301 case PCRE_BSR_ANYCRLF:
5302 md->bsr_anycrlf = TRUE;
5303 break;
5304
5305 case PCRE_BSR_UNICODE:
5306 md->bsr_anycrlf = FALSE;
5307 break;
5308
5309 default: return PCRE_ERROR_BADNEWLINE;
5310 }
5311
5312 /* Handle different types of newline. The three bits give eight cases. If
5313 nothing is set at run time, whatever was used at compile time applies. */
5314
5315 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5316 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5317 {
5318 case 0: newline = NEWLINE; break; /* Compile-time default */
5319 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5320 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5321 case PCRE_NEWLINE_CR+
5322 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5323 case PCRE_NEWLINE_ANY: newline = -1; break;
5324 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5325 default: return PCRE_ERROR_BADNEWLINE;
5326 }
5327
5328 if (newline == -2)
5329 {
5330 md->nltype = NLTYPE_ANYCRLF;
5331 }
5332 else if (newline < 0)
5333 {
5334 md->nltype = NLTYPE_ANY;
5335 }
5336 else
5337 {
5338 md->nltype = NLTYPE_FIXED;
5339 if (newline > 255)
5340 {
5341 md->nllen = 2;
5342 md->nl[0] = (newline >> 8) & 255;
5343 md->nl[1] = newline & 255;
5344 }
5345 else
5346 {
5347 md->nllen = 1;
5348 md->nl[0] = newline;
5349 }
5350 }
5351
5352 /* Partial matching was originally supported only for a restricted set of
5353 regexes; from release 8.00 there are no restrictions, but the bits are still
5354 defined (though never set). So there's no harm in leaving this code. */
5355
5356 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5357 return PCRE_ERROR_BADPARTIAL;
5358
5359 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5360 back the character offset. */
5361
5362 #ifdef SUPPORT_UTF8
5363 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5364 {
5365 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5366 return PCRE_ERROR_BADUTF8;
5367 if (start_offset > 0 && start_offset < length)
5368 {
5369 int tb = ((USPTR)subject)[start_offset];
5370 if (tb > 127)
5371 {
5372 tb &= 0xc0;
5373 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5374 }
5375 }
5376 }
5377 #endif
5378
5379 /* The ims options can vary during the matching as a result of the presence
5380 of (?ims) items in the pattern. They are kept in a local variable so that
5381 restoring at the exit of a group is easy. */
5382
5383 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5384
5385 /* If the expression has got more back references than the offsets supplied can
5386 hold, we get a temporary chunk of working store to use during the matching.
5387 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5388 of 3. */
5389
5390 ocount = offsetcount - (offsetcount % 3);
5391
5392 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5393 {
5394 ocount = re->top_backref * 3 + 3;
5395 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5396 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5397 using_temporary_offsets = TRUE;
5398 DPRINTF(("Got memory to hold back references\n"));
5399 }
5400 else md->offset_vector = offsets;
5401
5402 md->offset_end = ocount;
5403 md->offset_max = (2*ocount)/3;
5404 md->offset_overflow = FALSE;
5405 md->capture_last = -1;
5406
5407 /* Compute the minimum number of offsets that we need to reset each time. Doing
5408 this makes a huge difference to execution time when there aren't many brackets
5409 in the pattern. */
5410
5411 resetcount = 2 + re->top_bracket * 2;
5412 if (resetcount > offsetcount) resetcount = ocount;
5413
5414 /* Reset the working variable associated with each extraction. These should
5415 never be used unless previously set, but they get saved and restored, and so we
5416 initialize them to avoid reading uninitialized locations. */
5417
5418 if (md->offset_vector != NULL)
5419 {
5420 register int *iptr = md->offset_vector + ocount;
5421 register int *iend = iptr - resetcount/2 + 1;
5422 while (--iptr >= iend) *iptr = -1;
5423 }
5424
5425 /* Set up the first character to match, if available. The first_byte value is
5426 never set for an anchored regular expression, but the anchoring may be forced
5427 at run time, so we have to test for anchoring. The first char may be unset for
5428 an unanchored pattern, of course. If there's no first char and the pattern was
5429 studied, there may be a bitmap of possible first characters. */
5430
5431 if (!anchored)
5432 {
5433 if ((re->flags & PCRE_FIRSTSET) != 0)
5434 {
5435 first_byte = re->first_byte & 255;
5436 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5437 first_byte = md->lcc[first_byte];
5438 }
5439 else
5440 if (!startline && study != NULL &&
5441 (study->flags & PCRE_STUDY_MAPPED) != 0)
5442 start_bits = study->start_bits;
5443 }
5444
5445 /* For anchored or unanchored matches, there may be a "last known required
5446 character" set. */
5447
5448 if ((re->flags & PCRE_REQCHSET) != 0)
5449 {
5450 req_byte = re->req_byte & 255;
5451 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5452 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5453 }
5454
5455
5456 /* ==========================================================================*/
5457
5458 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5459 the loop runs just once. */
5460
5461 for(;;)
5462 {
5463 USPTR save_end_subject = end_subject;
5464 USPTR new_start_match;
5465
5466 /* Reset the maximum number of extractions we might see. */
5467
5468 if (md->offset_vector != NULL)
5469 {
5470 register int *iptr = md->offset_vector;
5471 register int *iend = iptr + resetcount;
5472 while (iptr < iend) *iptr++ = -1;
5473 }
5474
5475 /* If firstline is TRUE, the start of the match is constrained to the first
5476 line of a multiline string. That is, the match must be before or at the first
5477 newline. Implement this by temporarily adjusting end_subject so that we stop
5478 scanning at a newline. If the match fails at the newline, later code breaks
5479 this loop. */
5480
5481 if (firstline)
5482 {
5483 USPTR t = start_match;
5484 #ifdef SUPPORT_UTF8
5485 if (utf8)
5486 {
5487 while (t < md->end_subject && !IS_NEWLINE(t))
5488 {
5489 t++;
5490 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5491 }
5492 }
5493 else
5494 #endif
5495 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5496 end_subject = t;
5497 }
5498
5499 /* There are some optimizations that avoid running the match if a known
5500 starting point is not found, or if a known later character is not present.
5501 However, there is an option that disables these, for testing and for ensuring
5502 that all callouts do actually occur. */
5503
5504 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5505 {
5506 /* Advance to a unique first byte if there is one. */
5507
5508 if (first_byte >= 0)
5509 {
5510 if (first_byte_caseless)
5511 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5512 start_match++;
5513 else
5514 while (start_match < end_subject && *start_match != first_byte)
5515 start_match++;
5516 }
5517
5518 /* Or to just after a linebreak for a multiline match */
5519
5520 else if (startline)
5521 {
5522 if (start_match > md->start_subject + start_offset)
5523 {
5524 #ifdef SUPPORT_UTF8
5525 if (utf8)
5526 {
5527 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5528 {
5529 start_match++;
5530 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5531 start_match++;
5532 }
5533 }
5534 else
5535 #endif
5536 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5537 start_match++;
5538
5539 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5540 and we are now at a LF, advance the match position by one more character.
5541 */
5542
5543 if (start_match[-1] == CHAR_CR &&
5544 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5545 start_match < end_subject &&
5546 *start_match == CHAR_NL)
5547 start_match++;
5548 }
5549 }
5550
5551 /* Or to a non-unique first byte after study */
5552
5553 else if (start_bits != NULL)
5554 {
5555 while (start_match < end_subject)
5556 {
5557 register unsigned int c = *start_match;
5558 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5559 else break;
5560 }
5561 }
5562 } /* Starting optimizations */
5563
5564 /* Restore fudged end_subject */
5565
5566 end_subject = save_end_subject;
5567
5568 /* The following two optimizations are disabled for partial matching or if
5569 disabling is explicitly requested. */
5570
5571 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
5572 {
5573 /* If the pattern was studied, a minimum subject length may be set. This is
5574 a lower bound; no actual string of that length may actually match the
5575 pattern. Although the value is, strictly, in characters, we treat it as
5576 bytes to avoid spending too much time in this optimization. */
5577
5578 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
5579 (pcre_uint32)(end_subject - start_match) < study->minlength)
5580 {
5581 rc = MATCH_NOMATCH;
5582 break;
5583 }
5584
5585 /* If req_byte is set, we know that that character must appear in the
5586 subject for the match to succeed. If the first character is set, req_byte
5587 must be later in the subject; otherwise the test starts at the match point.
5588 This optimization can save a huge amount of backtracking in patterns with
5589 nested unlimited repeats that aren't going to match. Writing separate code
5590 for cased/caseless versions makes it go faster, as does using an
5591 autoincrement and backing off on a match.
5592
5593 HOWEVER: when the subject string is very, very long, searching to its end
5594 can take a long time, and give bad performance on quite ordinary patterns.
5595 This showed up when somebody was matching something like /^\d+C/ on a
5596 32-megabyte string... so we don't do this when the string is sufficiently
5597 long. */
5598
5599 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
5600 {
5601 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
5602
5603 /* We don't need to repeat the search if we haven't yet reached the
5604 place we found it at last time. */
5605
5606 if (p > req_byte_ptr)
5607 {
5608 if (req_byte_caseless)
5609 {
5610 while (p < end_subject)
5611 {
5612 register int pp = *p++;
5613 if (pp == req_byte || pp == req_byte2) { p--; break; }
5614 }
5615 }
5616 else
5617 {
5618 while (p < end_subject)
5619 {
5620 if (*p++ == req_byte) { p--; break; }
5621 }
5622 }
5623
5624 /* If we can't find the required character, break the matching loop,
5625 forcing a match failure. */
5626
5627 if (p >= end_subject)
5628 {
5629 rc = MATCH_NOMATCH;
5630 break;
5631 }
5632
5633 /* If we have found the required character, save the point where we
5634 found it, so that we don't search again next time round the loop if
5635 the start hasn't passed this character yet. */
5636
5637 req_byte_ptr = p;
5638 }
5639 }
5640 }
5641
5642 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
5643 printf(">>>> Match against: ");
5644 pchars(start_match, end_subject - start_match, TRUE, md);
5645 printf("\n");
5646 #endif
5647
5648 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
5649 first starting point for which a partial match was found. */
5650
5651 md->start_match_ptr = start_match;
5652 md->start_used_ptr = start_match;
5653 md->match_call_count = 0;
5654 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
5655 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
5656
5657 switch(rc)
5658 {
5659 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
5660 exactly like PRUNE. */
5661
5662 case MATCH_NOMATCH:
5663 case MATCH_PRUNE:
5664 case MATCH_THEN:
5665 new_start_match = start_match + 1;
5666 #ifdef SUPPORT_UTF8
5667 if (utf8)
5668 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
5669 new_start_match++;
5670 #endif
5671 break;
5672
5673 /* SKIP passes back the next starting point explicitly. */
5674
5675 case MATCH_SKIP:
5676 new_start_match = md->start_match_ptr;
5677 break;
5678
5679 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
5680
5681 case MATCH_COMMIT:
5682 rc = MATCH_NOMATCH;
5683 goto ENDLOOP;
5684
5685 /* Any other return is either a match, or some kind of error. */
5686
5687 default:
5688 goto ENDLOOP;
5689 }
5690
5691 /* Control reaches here for the various types of "no match at this point"
5692 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
5693
5694 rc = MATCH_NOMATCH;
5695
5696 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
5697 newline in the subject (though it may continue over the newline). Therefore,
5698 if we have just failed to match, starting at a newline, do not continue. */
5699
5700 if (firstline && IS_NEWLINE(start_match)) break;
5701
5702 /* Advance to new matching position */
5703
5704 start_match = new_start_match;
5705
5706 /* Break the loop if the pattern is anchored or if we have passed the end of
5707 the subject. */
5708
5709 if (anchored || start_match > end_subject) break;
5710
5711 /* If we have just passed a CR and we are now at a LF, and the pattern does
5712 not contain any explicit matches for \r or \n, and the newline option is CRLF
5713 or ANY or ANYCRLF, advance the match position by one more character. */
5714
5715 if (start_match[-1] == CHAR_CR &&
5716 start_match < end_subject &&
5717 *start_match == CHAR_NL &&
5718 (re->flags & PCRE_HASCRORLF) == 0 &&
5719 (md->nltype == NLTYPE_ANY ||
5720 md->nltype == NLTYPE_ANYCRLF ||
5721 md->nllen == 2))
5722 start_match++;
5723
5724 } /* End of for(;;) "bumpalong" loop */
5725
5726 /* ==========================================================================*/
5727
5728 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
5729 conditions is true:
5730
5731 (1) The pattern is anchored or the match was failed by (*COMMIT);
5732
5733 (2) We are past the end of the subject;
5734
5735 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
5736 this option requests that a match occur at or before the first newline in
5737 the subject.
5738
5739 When we have a match and the offset vector is big enough to deal with any
5740 backreferences, captured substring offsets will already be set up. In the case
5741 where we had to get some local store to hold offsets for backreference
5742 processing, copy those that we can. In this case there need not be overflow if
5743 certain parts of the pattern were not used, even though there are more
5744 capturing parentheses than vector slots. */
5745
5746 ENDLOOP:
5747
5748 if (rc == MATCH_MATCH)
5749 {
5750 if (using_temporary_offsets)
5751 {
5752 if (offsetcount >= 4)
5753 {
5754 memcpy(offsets + 2, md->offset_vector + 2,
5755 (offsetcount - 2) * sizeof(int));
5756 DPRINTF(("Copied offsets from temporary memory\n"));
5757 }
5758 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5759 DPRINTF(("Freeing temporary memory\n"));
5760 (pcre_free)(md->offset_vector);
5761 }
5762
5763 /* Set the return code to the number of captured strings, or 0 if there are
5764 too many to fit into the vector. */
5765
5766 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5767
5768 /* If there is space, set up the whole thing as substring 0. The value of
5769 md->start_match_ptr might be modified if \K was encountered on the success
5770 matching path. */
5771
5772 if (offsetcount < 2) rc = 0; else
5773 {
5774 offsets[0] = md->start_match_ptr - md->start_subject;
5775 offsets[1] = md->end_match_ptr - md->start_subject;
5776 }
5777
5778 DPRINTF((">>>> returning %d\n", rc));
5779 return rc;
5780 }
5781
5782 /* Control gets here if there has been an error, or if the overall match
5783 attempt has failed at all permitted starting positions. */
5784
5785 if (using_temporary_offsets)
5786 {
5787 DPRINTF(("Freeing temporary memory\n"));
5788 (pcre_free)(md->offset_vector);
5789 }
5790
5791 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
5792 {
5793 DPRINTF((">>>> error: returning %d\n", rc));
5794 return rc;
5795 }
5796 else if (start_partial != NULL)
5797 {
5798 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5799 if (offsetcount > 1)
5800 {
5801 offsets[0] = start_partial - (USPTR)subject;
5802 offsets[1] = end_subject - (USPTR)subject;
5803 }
5804 return PCRE_ERROR_PARTIAL;
5805 }
5806 else
5807 {
5808 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5809 return PCRE_ERROR_NOMATCH;
5810 }
5811 }
5812
5813 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12