/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 406 - (show annotations) (download)
Mon Mar 23 12:05:43 2009 UTC (5 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 153106 byte(s)
Trailing space tidies

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
#define match_condassert  (1 << 0)  /* Evaluating an assertion used as a condition */
#define match_cbegroup    (1 << 1)  /* Unlimited repeat group that could match empty */

/* Ordinary (non-error) results of match(). Errors are reported using the
externally defined PCRE_ERROR_xxx codes, every one of which is negative. */

#define MATCH_NOMATCH  0
#define MATCH_MATCH    1

/* Results that are private to match(). They are kept a long way below zero so
that they cannot be confused with any external error code. */

#define MATCH_THEN    (-996)
#define MATCH_SKIP    (-997)
#define MATCH_PRUNE   (-998)
#define MATCH_COMMIT  (-999)

/* Upper bound on the number of offset-vector ints that are saved on the stack
for a recursive call; malloc is used when the offset vector is larger. Keep
this a multiple of 3, because the offset vector length always is one. */

#define REC_STACK_SAVE_MAX (3 * 10)
84
85 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86
/* Six-entry lookup tables giving the minimum and maximum repeat counts; a
maximum of 0 stands for "no upper limit". NOTE(review): the values are
consistent with the order *, *?, +, +?, ?, ?? - confirm against the code that
indexes these tables. */

static const char rep_min[] = {
  0, 0,     /* entries 0-1: minimum 0 */
  1, 1,     /* entries 2-3: minimum 1 */
  0, 0      /* entries 4-5: minimum 0 */
};

static const char rep_max[] = {
  0, 0,     /* entries 0-1: unlimited */
  0, 0,     /* entries 2-3: unlimited */
  1, 1      /* entries 4-5: at most 1 */
};
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if so requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 {
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 }
179 }
180 else
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. */
186
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
240 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241 below must be updated in sync. */
242
/* One distinct, consecutive value (1..54) for every RMATCH call site. */

enum {
  RM1  =  1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
  RM11 = 11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
  RM21 = 21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
  RM31 = 31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41 = 41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
  RM51 = 51, RM52, RM53, RM54
};
249
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
253
254 #ifndef NO_RECURSE
255 #define REGISTER register
256
#ifdef DEBUG

/* Tracing versions: report each recursive call to match(), and each return
from it, on stdout. The result of the call is left in rrc. The r_where
argument is accepted but not used. */

#define RMATCH(r_eptr,r_ecode,r_top,r_md,r_ims,r_eptrb,r_flags,r_where) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(r_eptr,r_ecode,mstart,r_top,r_md,r_ims,r_eptrb,r_flags,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

#define RRETURN(r_value) \
  { \
  printf("match() returned %d from line %d ", r_value, __LINE__); \
  return r_value; \
  }

#else

/* Production versions: a plain recursive call, one level deeper, with the
result left in rrc, and a plain return. */

#define RMATCH(r_eptr,r_ecode,r_top,r_md,r_ims,r_eptrb,r_flags,r_where) \
  rrc = match(r_eptr,r_ecode,mstart,r_top,r_md,r_ims,r_eptrb,r_flags,rdepth+1)

#define RRETURN(r_value) return r_value

#endif
274
275 #else
276
277
278 /* These versions of the macros manage a private stack on the heap. Note that
279 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280 argument of match(), which never changes. */
281
282 #define REGISTER
283
/* RMATCH simulates a recursive call to match(). It obtains a new frame from
pcre_stack_malloc, records in the current frame (Xwhere) the number of this
call site so that execution can later resume at the label L_<rw>, fills in the
argument fields of the new frame, links it to the current frame, and jumps to
HEAP_RECURSE. If the frame cannot be allocated, the current incarnation is
abandoned with PCRE_ERROR_NOMEMORY via RRETURN, so the error propagates like
any other negative return instead of a NULL pointer being dereferenced. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN simulates a return from match(). The current frame is released with
pcre_stack_free and its predecessor becomes current. If there is a
predecessor, the result is handed back in rrc and control goes to HEAP_RETURN;
otherwise this was the top-level frame and the value is really returned. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
316
317
318 /* Structure for remembering the local variables in a private frame */
319
320 typedef struct heapframe {
321 struct heapframe *Xprevframe;
322
323 /* Function arguments that may change */
324
325 const uschar *Xeptr;
326 const uschar *Xecode;
327 const uschar *Xmstart;
328 int Xoffset_top;
329 long int Xims;
330 eptrblock *Xeptrb;
331 int Xflags;
332 unsigned int Xrdepth;
333
334 /* Function local variables */
335
336 const uschar *Xcallpat;
337 #ifdef SUPPORT_UTF8
338 const uschar *Xcharptr;
339 #endif
340 const uschar *Xdata;
341 const uschar *Xnext;
342 const uschar *Xpp;
343 const uschar *Xprev;
344 const uschar *Xsaved_eptr;
345
346 recursion_info Xnew_recursive;
347
348 BOOL Xcur_is_word;
349 BOOL Xcondition;
350 BOOL Xprev_is_word;
351
352 unsigned long int Xoriginal_ims;
353
354 #ifdef SUPPORT_UCP
355 int Xprop_type;
356 int Xprop_value;
357 int Xprop_fail_result;
358 int Xprop_category;
359 int Xprop_chartype;
360 int Xprop_script;
361 int Xoclength;
362 uschar Xocchars[8];
363 #endif
364
365 int Xcodelink;
366 int Xctype;
367 unsigned int Xfc;
368 int Xfi;
369 int Xlength;
370 int Xmax;
371 int Xmin;
372 int Xnumber;
373 int Xoffset;
374 int Xop;
375 int Xsave_capture_last;
376 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
377 int Xstacksave[REC_STACK_SAVE_MAX];
378
379 eptrblock Xnewptrb;
380
381 /* Where to jump back to */
382
383 int Xwhere;
384
385 } heapframe;
386
387 #endif
388
389
390 /***************************************************************************
391 ***************************************************************************/
392
393
394
395 /*************************************************
396 * Match from current position *
397 *************************************************/
398
399 /* This function is called recursively in many circumstances. Whenever it
400 returns a negative (error) response, the outer incarnation must also return the
401 same response.
402
403 Performance note: It might be tempting to extract commonly used fields from the
404 md structure (e.g. utf8, end_subject) into individual variables to improve
405 performance. Tests using gcc on a SPARC disproved this; in the first case, it
406 made performance worse.
407
408 Arguments:
409 eptr pointer to current character in subject
410 ecode pointer to current position in compiled code
411 mstart pointer to the current match start position (can be modified
412 by encountering \K)
413 offset_top current top pointer
414 md pointer to "static" info for the match
415 ims current /i, /m, and /s options
416 eptrb pointer to chain of blocks containing eptr at start of
417 brackets - for testing for empty matches
418 flags can contain
419 match_condassert - this is an assertion condition
420 match_cbegroup - this is the start of an unlimited repeat
421 group that can match an empty string
422 rdepth the recursion depth
423
424 Returns: MATCH_MATCH if matched ) these values are >= 0
425 MATCH_NOMATCH if failed to match )
426 a negative PCRE_ERROR_xxx value if aborted by an error condition
427 (e.g. stopped by repeated call or recursion limit)
428 */
429
430 static int
431 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
432 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
433 int flags, unsigned int rdepth)
434 {
435 /* These variables do not need to be preserved over recursion in this function,
436 so they can be ordinary variables in all cases. Mark some of them with
437 "register" because they are used a lot in loops. */
438
439 register int rrc; /* Returns from recursive calls */
440 register int i; /* Used for loops not involving calls to RMATCH() */
441 register unsigned int c; /* Character values not kept over RMATCH() calls */
442 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
443
444 BOOL minimize, possessive; /* Quantifier options */
445 int condcode;
446
447 /* When recursion is not being used, all "local" variables that have to be
448 preserved over calls to RMATCH() are part of a "frame" which is obtained from
449 heap storage. Set up the top-level frame here; others are obtained from the
450 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
451
452 #ifdef NO_RECURSE
453 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
454 frame->Xprevframe = NULL; /* Marks the top level */
455
456 /* Copy in the original argument variables */
457
458 frame->Xeptr = eptr;
459 frame->Xecode = ecode;
460 frame->Xmstart = mstart;
461 frame->Xoffset_top = offset_top;
462 frame->Xims = ims;
463 frame->Xeptrb = eptrb;
464 frame->Xflags = flags;
465 frame->Xrdepth = rdepth;
466
467 /* This is where control jumps back to in order to effect "recursion" */
468
469 HEAP_RECURSE:
470
471 /* Macros make the argument variables come from the current frame */
472
473 #define eptr frame->Xeptr
474 #define ecode frame->Xecode
475 #define mstart frame->Xmstart
476 #define offset_top frame->Xoffset_top
477 #define ims frame->Xims
478 #define eptrb frame->Xeptrb
479 #define flags frame->Xflags
480 #define rdepth frame->Xrdepth
481
482 /* Ditto for the local variables */
483
484 #ifdef SUPPORT_UTF8
485 #define charptr frame->Xcharptr
486 #endif
487 #define callpat frame->Xcallpat
488 #define codelink frame->Xcodelink
489 #define data frame->Xdata
490 #define next frame->Xnext
491 #define pp frame->Xpp
492 #define prev frame->Xprev
493 #define saved_eptr frame->Xsaved_eptr
494
495 #define new_recursive frame->Xnew_recursive
496
497 #define cur_is_word frame->Xcur_is_word
498 #define condition frame->Xcondition
499 #define prev_is_word frame->Xprev_is_word
500
501 #define original_ims frame->Xoriginal_ims
502
503 #ifdef SUPPORT_UCP
504 #define prop_type frame->Xprop_type
505 #define prop_value frame->Xprop_value
506 #define prop_fail_result frame->Xprop_fail_result
507 #define prop_category frame->Xprop_category
508 #define prop_chartype frame->Xprop_chartype
509 #define prop_script frame->Xprop_script
510 #define oclength frame->Xoclength
511 #define occhars frame->Xocchars
512 #endif
513
514 #define ctype frame->Xctype
515 #define fc frame->Xfc
516 #define fi frame->Xfi
517 #define length frame->Xlength
518 #define max frame->Xmax
519 #define min frame->Xmin
520 #define number frame->Xnumber
521 #define offset frame->Xoffset
522 #define op frame->Xop
523 #define save_capture_last frame->Xsave_capture_last
524 #define save_offset1 frame->Xsave_offset1
525 #define save_offset2 frame->Xsave_offset2
526 #define save_offset3 frame->Xsave_offset3
527 #define stacksave frame->Xstacksave
528
529 #define newptrb frame->Xnewptrb
530
531 /* When recursion is being used, local variables are allocated on the stack and
532 get preserved during recursion in the normal way. In this environment, fi and
533 i, and fc and c, can be the same variables. */
534
535 #else /* NO_RECURSE not defined */
536 #define fi i
537 #define fc c
538
539
540 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
541 const uschar *charptr; /* in small blocks of the code. My normal */
542 #endif /* style of coding would have declared */
543 const uschar *callpat; /* them within each of those blocks. */
544 const uschar *data; /* However, in order to accommodate the */
545 const uschar *next; /* version of this code that uses an */
546 USPTR pp; /* external "stack" implemented on the */
547 const uschar *prev; /* heap, it is easier to declare them all */
548 USPTR saved_eptr; /* here, so the declarations can be cut */
549 /* out in a block. The only declarations */
550 recursion_info new_recursive; /* within blocks below are for variables */
551 /* that do not have to be preserved over */
552 BOOL cur_is_word; /* a recursive call to RMATCH(). */
553 BOOL condition;
554 BOOL prev_is_word;
555
556 unsigned long int original_ims;
557
558 #ifdef SUPPORT_UCP
559 int prop_type;
560 int prop_value;
561 int prop_fail_result;
562 int prop_category;
563 int prop_chartype;
564 int prop_script;
565 int oclength;
566 uschar occhars[8];
567 #endif
568
569 int codelink;
570 int ctype;
571 int length;
572 int max;
573 int min;
574 int number;
575 int offset;
576 int op;
577 int save_capture_last;
578 int save_offset1, save_offset2, save_offset3;
579 int stacksave[REC_STACK_SAVE_MAX];
580
581 eptrblock newptrb;
582 #endif /* NO_RECURSE */
583
584 /* These statements are here to stop the compiler complaining about uninitialized
585 variables. */
586
587 #ifdef SUPPORT_UCP
588 prop_value = 0;
589 prop_fail_result = 0;
590 #endif
591
592
593 /* This label is used for tail recursion, which is used in a few cases even
594 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
595 used. Thanks to Ian Taylor for noticing this possibility and sending the
596 original patch. */
597
598 TAIL_RECURSE:
599
600 /* OK, now we can get on with the real code of the function. Recursive calls
601 are specified by the macro RMATCH and RRETURN is used to return. When
602 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
603 and a "return", respectively (possibly with some debugging if DEBUG is
604 defined). However, RMATCH isn't like a function call because it's quite a
605 complicated macro. It has to be used in one particular way. This shouldn't,
606 however, impact performance when true recursion is being used. */
607
608 #ifdef SUPPORT_UTF8
609 utf8 = md->utf8; /* Local copy of the flag */
610 #else
611 utf8 = FALSE;
612 #endif
613
614 /* First check that we haven't called match() too many times, or that we
615 haven't exceeded the recursive call limit. */
616
617 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
618 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
619
620 original_ims = ims; /* Save for resetting on ')' */
621
622 /* At the start of a group with an unlimited repeat that may match an empty
623 string, the match_cbegroup flag is set. When this is the case, add the current
624 subject pointer to the chain of such remembered pointers, to be checked when we
625 hit the closing ket, in order to break infinite loops that match no characters.
626 When match() is called in other circumstances, don't add to the chain. The
627 match_cbegroup flag must NOT be used with tail recursion, because the memory
628 block that is used is on the stack, so a new one may be required for each
629 match(). */
630
631 if ((flags & match_cbegroup) != 0)
632 {
633 newptrb.epb_saved_eptr = eptr;
634 newptrb.epb_prev = eptrb;
635 eptrb = &newptrb;
636 }
637
638 /* Now start processing the opcodes. */
639
640 for (;;)
641 {
642 minimize = possessive = FALSE;
643 op = *ecode;
644
645 /* For partial matching, remember if we ever hit the end of the subject after
646 matching at least one subject character. */
647
648 if (md->partial &&
649 eptr >= md->end_subject &&
650 eptr > mstart)
651 md->hitend = TRUE;
652
653 switch(op)
654 {
655 case OP_FAIL:
656 RRETURN(MATCH_NOMATCH);
657
658 case OP_PRUNE:
659 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
660 ims, eptrb, flags, RM51);
661 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
662 RRETURN(MATCH_PRUNE);
663
664 case OP_COMMIT:
665 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
666 ims, eptrb, flags, RM52);
667 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
668 RRETURN(MATCH_COMMIT);
669
670 case OP_SKIP:
671 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
672 ims, eptrb, flags, RM53);
673 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
674 md->start_match_ptr = eptr; /* Pass back current position */
675 RRETURN(MATCH_SKIP);
676
677 case OP_THEN:
678 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
679 ims, eptrb, flags, RM54);
680 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
681 RRETURN(MATCH_THEN);
682
683 /* Handle a capturing bracket. If there is space in the offset vector, save
684 the current subject position in the working slot at the top of the vector.
685 We mustn't change the current values of the data slot, because they may be
686 set from a previous iteration of this group, and be referred to by a
687 reference inside the group.
688
689 If the bracket fails to match, we need to restore this value and also the
690 values of the final offsets, in case they were set by a previous iteration
691 of the same bracket.
692
693 If there isn't enough space in the offset vector, treat this as if it were
694 a non-capturing bracket. Don't worry about setting the flag for the error
695 case here; that is handled in the code for KET. */
696
697 case OP_CBRA:
698 case OP_SCBRA:
699 number = GET2(ecode, 1+LINK_SIZE);
700 offset = number << 1;
701
702 #ifdef DEBUG
703 printf("start bracket %d\n", number);
704 printf("subject=");
705 pchars(eptr, 16, TRUE, md);
706 printf("\n");
707 #endif
708
709 if (offset < md->offset_max)
710 {
711 save_offset1 = md->offset_vector[offset];
712 save_offset2 = md->offset_vector[offset+1];
713 save_offset3 = md->offset_vector[md->offset_end - number];
714 save_capture_last = md->capture_last;
715
716 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
717 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
718
719 flags = (op == OP_SCBRA)? match_cbegroup : 0;
720 do
721 {
722 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
723 ims, eptrb, flags, RM1);
724 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
725 md->capture_last = save_capture_last;
726 ecode += GET(ecode, 1);
727 }
728 while (*ecode == OP_ALT);
729
730 DPRINTF(("bracket %d failed\n", number));
731
732 md->offset_vector[offset] = save_offset1;
733 md->offset_vector[offset+1] = save_offset2;
734 md->offset_vector[md->offset_end - number] = save_offset3;
735
736 RRETURN(MATCH_NOMATCH);
737 }
738
739 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
740 as a non-capturing bracket. */
741
742 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
743 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
744
745 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
746
747 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
748 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
749
750 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
751 final alternative within the brackets, we would return the result of a
752 recursive call to match() whatever happened. We can reduce stack usage by
753 turning this into a tail recursion, except in the case when match_cbegroup
754 is set.*/
755
756 case OP_BRA:
757 case OP_SBRA:
758 DPRINTF(("start non-capturing bracket\n"));
759 flags = (op >= OP_SBRA)? match_cbegroup : 0;
760 for (;;)
761 {
762 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
763 {
764 if (flags == 0) /* Not a possibly empty group */
765 {
766 ecode += _pcre_OP_lengths[*ecode];
767 DPRINTF(("bracket 0 tail recursion\n"));
768 goto TAIL_RECURSE;
769 }
770
771 /* Possibly empty group; can't use tail recursion. */
772
773 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
774 eptrb, flags, RM48);
775 RRETURN(rrc);
776 }
777
778 /* For non-final alternatives, continue the loop for a NOMATCH result;
779 otherwise return. */
780
781 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
782 eptrb, flags, RM2);
783 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
784 ecode += GET(ecode, 1);
785 }
786 /* Control never reaches here. */
787
788 /* Conditional group: compilation checked that there are no more than
789 two branches. If the condition is false, skipping the first branch takes us
790 past the end if there is only one branch, but that's OK because that is
791 exactly what going to the ket would do. As there is only one branch to be
792 obeyed, we can use tail recursion to avoid using another stack frame. */
793
794 case OP_COND:
795 case OP_SCOND:
796 codelink= GET(ecode, 1);
797
798 /* Because of the way auto-callout works during compile, a callout item is
799 inserted between OP_COND and an assertion condition. */
800
801 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
802 {
803 if (pcre_callout != NULL)
804 {
805 pcre_callout_block cb;
806 cb.version = 1; /* Version 1 of the callout block */
807 cb.callout_number = ecode[LINK_SIZE+2];
808 cb.offset_vector = md->offset_vector;
809 cb.subject = (PCRE_SPTR)md->start_subject;
810 cb.subject_length = md->end_subject - md->start_subject;
811 cb.start_match = mstart - md->start_subject;
812 cb.current_position = eptr - md->start_subject;
813 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
814 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
815 cb.capture_top = offset_top/2;
816 cb.capture_last = md->capture_last;
817 cb.callout_data = md->callout_data;
818 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
819 if (rrc < 0) RRETURN(rrc);
820 }
821 ecode += _pcre_OP_lengths[OP_CALLOUT];
822 }
823
824 condcode = ecode[LINK_SIZE+1];
825
826 /* Now see what the actual condition is */
827
828 if (condcode == OP_RREF) /* Recursion test */
829 {
830 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
831 condition = md->recursive != NULL &&
832 (offset == RREF_ANY || offset == md->recursive->group_num);
833 ecode += condition? 3 : GET(ecode, 1);
834 }
835
836 else if (condcode == OP_CREF) /* Group used test */
837 {
838 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
839 condition = offset < offset_top && md->offset_vector[offset] >= 0;
840 ecode += condition? 3 : GET(ecode, 1);
841 }
842
843 else if (condcode == OP_DEF) /* DEFINE - always false */
844 {
845 condition = FALSE;
846 ecode += GET(ecode, 1);
847 }
848
849 /* The condition is an assertion. Call match() to evaluate it - setting
850 the final argument match_condassert causes it to stop at the end of an
851 assertion. */
852
853 else
854 {
855 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
856 match_condassert, RM3);
857 if (rrc == MATCH_MATCH)
858 {
859 condition = TRUE;
860 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
861 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
862 }
863 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
864 {
865 RRETURN(rrc); /* Need braces because of following else */
866 }
867 else
868 {
869 condition = FALSE;
870 ecode += codelink;
871 }
872 }
873
874 /* We are now at the branch that is to be obeyed. As there is only one,
875 we can use tail recursion to avoid using another stack frame, except when
876 match_cbegroup is required for an unlimited repeat of a possibly empty
877 group. If the second alternative doesn't exist, we can just plough on. */
878
879 if (condition || *ecode == OP_ALT)
880 {
881 ecode += 1 + LINK_SIZE;
882 if (op == OP_SCOND) /* Possibly empty group */
883 {
884 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
885 RRETURN(rrc);
886 }
887 else /* Group must match something */
888 {
889 flags = 0;
890 goto TAIL_RECURSE;
891 }
892 }
893 else /* Condition false & no alternative */
894 {
895 ecode += 1 + LINK_SIZE;
896 }
897 break;
898
899
900 /* End of the pattern, either real or forced. If we are in a top-level
901 recursion, we should restore the offsets appropriately and continue from
902 after the call. */
903
904 case OP_ACCEPT:
905 case OP_END:
906 if (md->recursive != NULL && md->recursive->group_num == 0)
907 {
908 recursion_info *rec = md->recursive;
909 DPRINTF(("End of pattern in a (?0) recursion\n"));
910 md->recursive = rec->prevrec;
911 memmove(md->offset_vector, rec->offset_save,
912 rec->saved_max * sizeof(int));
913 mstart = rec->save_start;
914 ims = original_ims;
915 ecode = rec->after_call;
916 break;
917 }
918
919 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
920 string - backtracking will then try other alternatives, if any. */
921
922 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
923 md->end_match_ptr = eptr; /* Record where we ended */
924 md->end_offset_top = offset_top; /* and how many extracts were taken */
925 md->start_match_ptr = mstart; /* and the start (\K can modify) */
926 RRETURN(MATCH_MATCH);
927
928 /* Change option settings */
929
930 case OP_OPT:
931 ims = ecode[1];
932 ecode += 2;
933 DPRINTF(("ims set to %02lx\n", ims));
934 break;
935
936 /* Assertion brackets. Check the alternative branches in turn - the
937 matching won't pass the KET for an assertion. If any one branch matches,
938 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
939 start of each branch to move the current point backwards, so the code at
940 this level is identical to the lookahead case. */
941
942 case OP_ASSERT:
943 case OP_ASSERTBACK:
944 do
945 {
946 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
947 RM4);
948 if (rrc == MATCH_MATCH) break;
949 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
950 ecode += GET(ecode, 1);
951 }
952 while (*ecode == OP_ALT);
953 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
954
955 /* If checking an assertion for a condition, return MATCH_MATCH. */
956
957 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
958
959 /* Continue from after the assertion, updating the offsets high water
960 mark, since extracts may have been taken during the assertion. */
961
962 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
963 ecode += 1 + LINK_SIZE;
964 offset_top = md->end_offset_top;
965 continue;
966
967 /* Negative assertion: all branches must fail to match */
968
969 case OP_ASSERT_NOT:
970 case OP_ASSERTBACK_NOT:
971 do
972 {
973 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
974 RM5);
975 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
976 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
977 ecode += GET(ecode,1);
978 }
979 while (*ecode == OP_ALT);
980
981 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
982
983 ecode += 1 + LINK_SIZE;
984 continue;
985
986 /* Move the subject pointer back. This occurs only at the start of
987 each branch of a lookbehind assertion. If we are too close to the start to
988 move back, this match function fails. When working with UTF-8 we move
989 back a number of characters, not bytes. */
990
991 case OP_REVERSE:
992 #ifdef SUPPORT_UTF8
993 if (utf8)
994 {
995 i = GET(ecode, 1);
996 while (i-- > 0)
997 {
998 eptr--;
999 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1000 BACKCHAR(eptr);
1001 }
1002 }
1003 else
1004 #endif
1005
1006 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1007
1008 {
1009 eptr -= GET(ecode, 1);
1010 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1011 }
1012
1013 /* Skip to next op code */
1014
1015 ecode += 1 + LINK_SIZE;
1016 break;
1017
1018 /* The callout item calls an external function, if one is provided, passing
1019 details of the match so far. This is mainly for debugging, though the
1020 function is able to force a failure. */
1021
1022 case OP_CALLOUT:
1023 if (pcre_callout != NULL)
1024 {
1025 pcre_callout_block cb;
1026 cb.version = 1; /* Version 1 of the callout block */
1027 cb.callout_number = ecode[1];
1028 cb.offset_vector = md->offset_vector;
1029 cb.subject = (PCRE_SPTR)md->start_subject;
1030 cb.subject_length = md->end_subject - md->start_subject;
1031 cb.start_match = mstart - md->start_subject;
1032 cb.current_position = eptr - md->start_subject;
1033 cb.pattern_position = GET(ecode, 2);
1034 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1035 cb.capture_top = offset_top/2;
1036 cb.capture_last = md->capture_last;
1037 cb.callout_data = md->callout_data;
1038 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1039 if (rrc < 0) RRETURN(rrc);
1040 }
1041 ecode += 2 + 2*LINK_SIZE;
1042 break;
1043
1044 /* Recursion either matches the current regex, or some subexpression. The
1045 offset data is the offset to the starting bracket from the start of the
1046 whole pattern. (This is so that it works from duplicated subpatterns.)
1047
1048 If there are any capturing brackets started but not finished, we have to
1049 save their starting points and reinstate them after the recursion. However,
1050 we don't know how many such there are (offset_top records the completed
1051 total) so we just have to save all the potential data. There may be up to
1052 65535 such values, which is too large to put on the stack, but using malloc
1053 for small numbers seems expensive. As a compromise, the stack is used when
1054 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1055 is used. If the malloc fails, the code below gives up at once by returning
1056 PCRE_ERROR_NOMEMORY; it does not fall back to saving only the top
1057 REC_STACK_SAVE_MAX values on the stack.
1058
1059 There are also other values that have to be saved. We use a chained
1060 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1061 for the original version of this logic. */
1062
1063 case OP_RECURSE:
1064 {
1065 callpat = md->start_code + GET(ecode, 1);
1066 new_recursive.group_num = (callpat == md->start_code)? 0 :
1067 GET2(callpat, 1 + LINK_SIZE);
1068
1069 /* Add to "recursing stack" */
1070
1071 new_recursive.prevrec = md->recursive;
1072 md->recursive = &new_recursive;
1073
1074 /* Find where to continue from afterwards */
1075
1076 ecode += 1 + LINK_SIZE;
1077 new_recursive.after_call = ecode;
1078
1079 /* Now save the offset data. */
1080
1081 new_recursive.saved_max = md->offset_end;
1082 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1083 new_recursive.offset_save = stacksave;
1084 else
1085 {
1086 new_recursive.offset_save =
1087 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1088 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1089 }
1090
1091 memcpy(new_recursive.offset_save, md->offset_vector,
1092 new_recursive.saved_max * sizeof(int));
1093 new_recursive.save_start = mstart;
1094 mstart = eptr;
1095
1096 /* OK, now we can do the recursion. For each top-level alternative we
1097 restore the offset and recursion data. */
1098
1099 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1100 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1101 do
1102 {
1103 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1104 md, ims, eptrb, flags, RM6);
1105 if (rrc == MATCH_MATCH)
1106 {
1107 DPRINTF(("Recursion matched\n"));
1108 md->recursive = new_recursive.prevrec;
1109 if (new_recursive.offset_save != stacksave)
1110 (pcre_free)(new_recursive.offset_save);
1111 RRETURN(MATCH_MATCH);
1112 }
1113 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1114 {
1115 DPRINTF(("Recursion gave error %d\n", rrc));
1116 if (new_recursive.offset_save != stacksave)
1117 (pcre_free)(new_recursive.offset_save);
1118 RRETURN(rrc);
1119 }
1120
1121 md->recursive = &new_recursive;
1122 memcpy(md->offset_vector, new_recursive.offset_save,
1123 new_recursive.saved_max * sizeof(int));
1124 callpat += GET(callpat, 1);
1125 }
1126 while (*callpat == OP_ALT);
1127
1128 DPRINTF(("Recursion didn't match\n"));
1129 md->recursive = new_recursive.prevrec;
1130 if (new_recursive.offset_save != stacksave)
1131 (pcre_free)(new_recursive.offset_save);
1132 RRETURN(MATCH_NOMATCH);
1133 }
1134 /* Control never reaches here */
1135
1136 /* "Once" brackets are like assertion brackets except that after a match,
1137 the point in the subject string is not moved back. Thus there can never be
1138 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1139 Check the alternative branches in turn - the matching won't pass the KET
1140 for this kind of subpattern. If any one branch matches, we carry on as at
1141 the end of a normal bracket, leaving the subject pointer. */
1142
1143 case OP_ONCE:
1144 prev = ecode;
1145 saved_eptr = eptr;
1146
1147 do
1148 {
1149 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1150 if (rrc == MATCH_MATCH) break;
1151 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1152 ecode += GET(ecode,1);
1153 }
1154 while (*ecode == OP_ALT);
1155
1156 /* If hit the end of the group (which could be repeated), fail */
1157
1158 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1159
1160 /* Continue as from after the assertion, updating the offsets high water
1161 mark, since extracts may have been taken. */
1162
1163 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1164
1165 offset_top = md->end_offset_top;
1166 eptr = md->end_match_ptr;
1167
1168 /* For a non-repeating ket, just continue at this level. This also
1169 happens for a repeating ket if no characters were matched in the group.
1170 This is the forcible breaking of infinite loops as implemented in Perl
1171 5.005. If there is an options reset, it will get obeyed in the normal
1172 course of events. */
1173
1174 if (*ecode == OP_KET || eptr == saved_eptr)
1175 {
1176 ecode += 1+LINK_SIZE;
1177 break;
1178 }
1179
1180 /* The repeating kets try the rest of the pattern or restart from the
1181 preceding bracket, in the appropriate order. The second "call" of match()
1182 uses tail recursion, to avoid using another stack frame. We need to reset
1183 any options that changed within the bracket before re-running it, so check the next opcode.
1184 NOTE: the option byte follows OP_OPT at ecode[2+LINK_SIZE]; the literal index 4 used below is that byte only when LINK_SIZE is 2. */
1185
1186 if (ecode[1+LINK_SIZE] == OP_OPT)
1187 {
1188 ims = (ims & ~PCRE_IMS) | ecode[4];
1189 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1190 }
1191
1192 if (*ecode == OP_KETRMIN)
1193 {
1194 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1195 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1196 ecode = prev;
1197 flags = 0;
1198 goto TAIL_RECURSE;
1199 }
1200 else /* OP_KETRMAX */
1201 {
1202 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1203 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1204 ecode += 1 + LINK_SIZE;
1205 flags = 0;
1206 goto TAIL_RECURSE;
1207 }
1208 /* Control never gets here */
1209
1210 /* An alternation is the end of a branch; scan along to find the end of the
1211 bracketed group and go to there. */
1212
1213 case OP_ALT:
1214 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1215 break;
1216
1217 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1218 indicating that it may occur zero times. It may repeat infinitely, or not
1219 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1220 with fixed upper repeat limits are compiled as a number of copies, with the
1221 optional ones preceded by BRAZERO or BRAMINZERO. */
1222
1223 case OP_BRAZERO:
1224 {
1225 next = ecode+1;
1226 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1227 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1228 do next += GET(next,1); while (*next == OP_ALT);
1229 ecode = next + 1 + LINK_SIZE;
1230 }
1231 break;
1232
1233 case OP_BRAMINZERO:
1234 {
1235 next = ecode+1;
1236 do next += GET(next, 1); while (*next == OP_ALT);
1237 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1238 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1239 ecode++;
1240 }
1241 break;
1242
1243 case OP_SKIPZERO:
1244 {
1245 next = ecode+1;
1246 do next += GET(next,1); while (*next == OP_ALT);
1247 ecode = next + 1 + LINK_SIZE;
1248 }
1249 break;
1250
1251 /* End of a group, repeated or non-repeating. */
1252
1253 case OP_KET:
1254 case OP_KETRMIN:
1255 case OP_KETRMAX:
1256 prev = ecode - GET(ecode, 1);
1257
1258 /* If this was a group that remembered the subject start, in order to break
1259 infinite repeats of empty string matches, retrieve the subject start from
1260 the chain. Otherwise, set it NULL. */
1261
1262 if (*prev >= OP_SBRA)
1263 {
1264 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1265 eptrb = eptrb->epb_prev; /* Backup to previous group */
1266 }
1267 else saved_eptr = NULL;
1268
1269 /* If we are at the end of an assertion group, stop matching and return
1270 MATCH_MATCH, but record the current high water mark for use by positive
1271 assertions. Do this also for the "once" (atomic) groups. */
1272
1273 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1274 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1275 *prev == OP_ONCE)
1276 {
1277 md->end_match_ptr = eptr; /* For ONCE */
1278 md->end_offset_top = offset_top;
1279 RRETURN(MATCH_MATCH);
1280 }
1281
1282 /* For capturing groups we have to check the group number back at the start
1283 and if necessary complete handling an extraction by setting the offsets and
1284 bumping the high water mark. Note that whole-pattern recursion is coded as
1285 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1286 when the OP_END is reached. Other recursion is handled here. */
1287
1288 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1289 {
1290 number = GET2(prev, 1+LINK_SIZE);
1291 offset = number << 1;
1292
1293 #ifdef DEBUG
1294 printf("end bracket %d", number);
1295 printf("\n");
1296 #endif
1297
1298 md->capture_last = number;
1299 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1300 {
1301 md->offset_vector[offset] =
1302 md->offset_vector[md->offset_end - number];
1303 md->offset_vector[offset+1] = eptr - md->start_subject;
1304 if (offset_top <= offset) offset_top = offset + 2;
1305 }
1306
1307 /* Handle a recursively called group. Restore the offsets
1308 appropriately and continue from after the call. */
1309
1310 if (md->recursive != NULL && md->recursive->group_num == number)
1311 {
1312 recursion_info *rec = md->recursive;
1313 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1314 md->recursive = rec->prevrec;
1315 mstart = rec->save_start;
1316 memcpy(md->offset_vector, rec->offset_save,
1317 rec->saved_max * sizeof(int));
1318 ecode = rec->after_call;
1319 ims = original_ims;
1320 break;
1321 }
1322 }
1323
1324 /* For both capturing and non-capturing groups, reset the value of the ims
1325 flags, in case they got changed during the group. */
1326
1327 ims = original_ims;
1328 DPRINTF(("ims reset to %02lx\n", ims));
1329
1330 /* For a non-repeating ket, just continue at this level. This also
1331 happens for a repeating ket if no characters were matched in the group.
1332 This is the forcible breaking of infinite loops as implemented in Perl
1333 5.005. If there is an options reset, it will get obeyed in the normal
1334 course of events. */
1335
1336 if (*ecode == OP_KET || eptr == saved_eptr)
1337 {
1338 ecode += 1 + LINK_SIZE;
1339 break;
1340 }
1341
1342 /* The repeating kets try the rest of the pattern or restart from the
1343 preceding bracket, in the appropriate order. In the second case, we can use
1344 tail recursion to avoid using another stack frame, unless we have an
1345 unlimited repeat of a group that can match an empty string. */
1346
1347 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1348
1349 if (*ecode == OP_KETRMIN)
1350 {
1351 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1352 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1353 if (flags != 0) /* Could match an empty string */
1354 {
1355 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1356 RRETURN(rrc);
1357 }
1358 ecode = prev;
1359 goto TAIL_RECURSE;
1360 }
1361 else /* OP_KETRMAX */
1362 {
1363 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1364 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1365 ecode += 1 + LINK_SIZE;
1366 flags = 0;
1367 goto TAIL_RECURSE;
1368 }
1369 /* Control never gets here */
1370
1371 /* Start of subject unless notbol, or after internal newline if multiline */
1372
1373 case OP_CIRC:
1374 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1375 if ((ims & PCRE_MULTILINE) != 0)
1376 {
1377 if (eptr != md->start_subject &&
1378 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1379 RRETURN(MATCH_NOMATCH);
1380 ecode++;
1381 break;
1382 }
1383 /* ... else fall through */
1384
1385 /* Start of subject assertion */
1386
1387 case OP_SOD:
1388 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1389 ecode++;
1390 break;
1391
1392 /* Start of match assertion */
1393
1394 case OP_SOM:
1395 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1396 ecode++;
1397 break;
1398
1399 /* Reset the start of match point */
1400
1401 case OP_SET_SOM:
1402 mstart = eptr;
1403 ecode++;
1404 break;
1405
1406 /* Assert before internal newline if multiline, or before a terminating
1407 newline unless endonly is set, else end of subject unless noteol is set. */
1408
1409 case OP_DOLL:
1410 if ((ims & PCRE_MULTILINE) != 0)
1411 {
1412 if (eptr < md->end_subject)
1413 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1414 else
1415 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1416 ecode++;
1417 break;
1418 }
1419 else
1420 {
1421 if (md->noteol) RRETURN(MATCH_NOMATCH);
1422 if (!md->endonly)
1423 {
1424 if (eptr != md->end_subject &&
1425 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1426 RRETURN(MATCH_NOMATCH);
1427 ecode++;
1428 break;
1429 }
1430 }
1431 /* ... else fall through for endonly */
1432
1433 /* End of subject assertion (\z) */
1434
1435 case OP_EOD:
1436 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1437 ecode++;
1438 break;
1439
1440 /* End of subject or ending \n assertion (\Z) */
1441
1442 case OP_EODN:
1443 if (eptr != md->end_subject &&
1444 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1445 RRETURN(MATCH_NOMATCH);
1446 ecode++;
1447 break;
1448
1449 /* Word boundary assertions */
1450
1451 case OP_NOT_WORD_BOUNDARY:
1452 case OP_WORD_BOUNDARY:
1453 {
1454
1455 /* Find out if the previous and current characters are "word" characters.
1456 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1457 be "non-word" characters. */
1458
1459 #ifdef SUPPORT_UTF8
1460 if (utf8)
1461 {
1462 if (eptr == md->start_subject) prev_is_word = FALSE; else
1463 {
1464 const uschar *lastptr = eptr - 1;
1465 while((*lastptr & 0xc0) == 0x80) lastptr--;
1466 GETCHAR(c, lastptr);
1467 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1468 }
1469 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1470 {
1471 GETCHAR(c, eptr);
1472 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1473 }
1474 }
1475 else
1476 #endif
1477
1478 /* More streamlined when not in UTF-8 mode */
1479
1480 {
1481 prev_is_word = (eptr != md->start_subject) &&
1482 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1483 cur_is_word = (eptr < md->end_subject) &&
1484 ((md->ctypes[*eptr] & ctype_word) != 0);
1485 }
1486
1487 /* Now see if the situation is what we want */
1488
1489 if ((*ecode++ == OP_WORD_BOUNDARY)?
1490 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1491 RRETURN(MATCH_NOMATCH);
1492 }
1493 break;
1494
1495 /* Match a single character type; inline for speed */
1496
1497 case OP_ANY:
1498 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1499 /* Fall through */
1500
1501 case OP_ALLANY:
1502 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1503 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1504 ecode++;
1505 break;
1506
1507 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1508 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1509
1510 case OP_ANYBYTE:
1511 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1512 ecode++;
1513 break;
1514
1515 case OP_NOT_DIGIT:
1516 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1517 GETCHARINCTEST(c, eptr);
1518 if (
1519 #ifdef SUPPORT_UTF8
1520 c < 256 &&
1521 #endif
1522 (md->ctypes[c] & ctype_digit) != 0
1523 )
1524 RRETURN(MATCH_NOMATCH);
1525 ecode++;
1526 break;
1527
1528 case OP_DIGIT:
1529 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1530 GETCHARINCTEST(c, eptr);
1531 if (
1532 #ifdef SUPPORT_UTF8
1533 c >= 256 ||
1534 #endif
1535 (md->ctypes[c] & ctype_digit) == 0
1536 )
1537 RRETURN(MATCH_NOMATCH);
1538 ecode++;
1539 break;
1540
1541 case OP_NOT_WHITESPACE:
1542 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1543 GETCHARINCTEST(c, eptr);
1544 if (
1545 #ifdef SUPPORT_UTF8
1546 c < 256 &&
1547 #endif
1548 (md->ctypes[c] & ctype_space) != 0
1549 )
1550 RRETURN(MATCH_NOMATCH);
1551 ecode++;
1552 break;
1553
1554 case OP_WHITESPACE:
1555 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1556 GETCHARINCTEST(c, eptr);
1557 if (
1558 #ifdef SUPPORT_UTF8
1559 c >= 256 ||
1560 #endif
1561 (md->ctypes[c] & ctype_space) == 0
1562 )
1563 RRETURN(MATCH_NOMATCH);
1564 ecode++;
1565 break;
1566
1567 case OP_NOT_WORDCHAR:
1568 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1569 GETCHARINCTEST(c, eptr);
1570 if (
1571 #ifdef SUPPORT_UTF8
1572 c < 256 &&
1573 #endif
1574 (md->ctypes[c] & ctype_word) != 0
1575 )
1576 RRETURN(MATCH_NOMATCH);
1577 ecode++;
1578 break;
1579
1580 case OP_WORDCHAR:
1581 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1582 GETCHARINCTEST(c, eptr);
1583 if (
1584 #ifdef SUPPORT_UTF8
1585 c >= 256 ||
1586 #endif
1587 (md->ctypes[c] & ctype_word) == 0
1588 )
1589 RRETURN(MATCH_NOMATCH);
1590 ecode++;
1591 break;
1592
1593 case OP_ANYNL:
1594 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1595 GETCHARINCTEST(c, eptr);
1596 switch(c)
1597 {
1598 default: RRETURN(MATCH_NOMATCH);
1599 case 0x000d:
1600 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1601 break;
1602
1603 case 0x000a:
1604 break;
1605
1606 case 0x000b:
1607 case 0x000c:
1608 case 0x0085:
1609 case 0x2028:
1610 case 0x2029:
1611 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1612 break;
1613 }
1614 ecode++;
1615 break;
1616
1617 case OP_NOT_HSPACE:
1618 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1619 GETCHARINCTEST(c, eptr);
1620 switch(c)
1621 {
1622 default: break;
1623 case 0x09: /* HT */
1624 case 0x20: /* SPACE */
1625 case 0xa0: /* NBSP */
1626 case 0x1680: /* OGHAM SPACE MARK */
1627 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1628 case 0x2000: /* EN QUAD */
1629 case 0x2001: /* EM QUAD */
1630 case 0x2002: /* EN SPACE */
1631 case 0x2003: /* EM SPACE */
1632 case 0x2004: /* THREE-PER-EM SPACE */
1633 case 0x2005: /* FOUR-PER-EM SPACE */
1634 case 0x2006: /* SIX-PER-EM SPACE */
1635 case 0x2007: /* FIGURE SPACE */
1636 case 0x2008: /* PUNCTUATION SPACE */
1637 case 0x2009: /* THIN SPACE */
1638 case 0x200A: /* HAIR SPACE */
1639 case 0x202f: /* NARROW NO-BREAK SPACE */
1640 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1641 case 0x3000: /* IDEOGRAPHIC SPACE */
1642 RRETURN(MATCH_NOMATCH);
1643 }
1644 ecode++;
1645 break;
1646
1647 case OP_HSPACE:
1648 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1649 GETCHARINCTEST(c, eptr);
1650 switch(c)
1651 {
1652 default: RRETURN(MATCH_NOMATCH);
1653 case 0x09: /* HT */
1654 case 0x20: /* SPACE */
1655 case 0xa0: /* NBSP */
1656 case 0x1680: /* OGHAM SPACE MARK */
1657 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1658 case 0x2000: /* EN QUAD */
1659 case 0x2001: /* EM QUAD */
1660 case 0x2002: /* EN SPACE */
1661 case 0x2003: /* EM SPACE */
1662 case 0x2004: /* THREE-PER-EM SPACE */
1663 case 0x2005: /* FOUR-PER-EM SPACE */
1664 case 0x2006: /* SIX-PER-EM SPACE */
1665 case 0x2007: /* FIGURE SPACE */
1666 case 0x2008: /* PUNCTUATION SPACE */
1667 case 0x2009: /* THIN SPACE */
1668 case 0x200A: /* HAIR SPACE */
1669 case 0x202f: /* NARROW NO-BREAK SPACE */
1670 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1671 case 0x3000: /* IDEOGRAPHIC SPACE */
1672 break;
1673 }
1674 ecode++;
1675 break;
1676
1677 case OP_NOT_VSPACE:
1678 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1679 GETCHARINCTEST(c, eptr);
1680 switch(c)
1681 {
1682 default: break;
1683 case 0x0a: /* LF */
1684 case 0x0b: /* VT */
1685 case 0x0c: /* FF */
1686 case 0x0d: /* CR */
1687 case 0x85: /* NEL */
1688 case 0x2028: /* LINE SEPARATOR */
1689 case 0x2029: /* PARAGRAPH SEPARATOR */
1690 RRETURN(MATCH_NOMATCH);
1691 }
1692 ecode++;
1693 break;
1694
1695 case OP_VSPACE:
1696 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1697 GETCHARINCTEST(c, eptr);
1698 switch(c)
1699 {
1700 default: RRETURN(MATCH_NOMATCH);
1701 case 0x0a: /* LF */
1702 case 0x0b: /* VT */
1703 case 0x0c: /* FF */
1704 case 0x0d: /* CR */
1705 case 0x85: /* NEL */
1706 case 0x2028: /* LINE SEPARATOR */
1707 case 0x2029: /* PARAGRAPH SEPARATOR */
1708 break;
1709 }
1710 ecode++;
1711 break;
1712
1713 #ifdef SUPPORT_UCP
1714 /* Check the next character by Unicode property. We will get here only
1715 if the support is in the binary; otherwise a compile-time error occurs. */
1716
1717 case OP_PROP:
1718 case OP_NOTPROP:
1719 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1720 GETCHARINCTEST(c, eptr);
1721 {
1722 const ucd_record *prop = GET_UCD(c);
1723
1724 switch(ecode[1])
1725 {
1726 case PT_ANY:
1727 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1728 break;
1729
1730 case PT_LAMP:
1731 if ((prop->chartype == ucp_Lu ||
1732 prop->chartype == ucp_Ll ||
1733 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1734 RRETURN(MATCH_NOMATCH);
1735 break;
1736
1737 case PT_GC:
1738 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1739 RRETURN(MATCH_NOMATCH);
1740 break;
1741
1742 case PT_PC:
1743 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1744 RRETURN(MATCH_NOMATCH);
1745 break;
1746
1747 case PT_SC:
1748 if ((ecode[2] != prop->script) == (op == OP_PROP))
1749 RRETURN(MATCH_NOMATCH);
1750 break;
1751
1752 default:
1753 RRETURN(PCRE_ERROR_INTERNAL);
1754 }
1755
1756 ecode += 3;
1757 }
1758 break;
1759
1760 /* Match an extended Unicode sequence. We will get here only if the support
1761 is in the binary; otherwise a compile-time error occurs. */
1762
1763 case OP_EXTUNI:
1764 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1765 GETCHARINCTEST(c, eptr);
1766 {
1767 int category = UCD_CATEGORY(c);
1768 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1769 while (eptr < md->end_subject)
1770 {
1771 int len = 1;
1772 if (!utf8) c = *eptr; else
1773 {
1774 GETCHARLEN(c, eptr, len);
1775 }
1776 category = UCD_CATEGORY(c);
1777 if (category != ucp_M) break;
1778 eptr += len;
1779 }
1780 }
1781 ecode++;
1782 break;
1783 #endif
1784
1785
1786 /* Match a back reference, possibly repeatedly. Look past the end of the
1787 item to see if there is repeat information following. The code is similar
1788 to that for character classes, but repeated for efficiency. Then obey
1789 similar code to character type repeats - written out again for speed.
1790 However, if the referenced string is the empty string, always treat
1791 it as matched, any number of times (otherwise there could be infinite
1792 loops). */
1793
1794 case OP_REF:
1795 {
1796 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1797 ecode += 3;
1798
1799 /* If the reference is unset, there are two possibilities:
1800
1801 (a) In the default, Perl-compatible state, set the length to be longer
1802 than the amount of subject left; this ensures that every attempt at a
1803 match fails. We can't just fail here, because of the possibility of
1804 quantifiers with zero minima.
1805
1806 (b) If the JavaScript compatibility flag is set, set the length to zero
1807 so that the back reference matches an empty string.
1808
1809 Otherwise, set the length to the length of what was matched by the
1810 referenced subpattern. */
1811
1812 if (offset >= offset_top || md->offset_vector[offset] < 0)
1813 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1814 else
1815 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1816
1817 /* Set up for repetition, or handle the non-repeated case */
1818
1819 switch (*ecode)
1820 {
1821 case OP_CRSTAR:
1822 case OP_CRMINSTAR:
1823 case OP_CRPLUS:
1824 case OP_CRMINPLUS:
1825 case OP_CRQUERY:
1826 case OP_CRMINQUERY:
1827 c = *ecode++ - OP_CRSTAR;
1828 minimize = (c & 1) != 0;
1829 min = rep_min[c]; /* Pick up values from tables; */
1830 max = rep_max[c]; /* zero for max => infinity */
1831 if (max == 0) max = INT_MAX;
1832 break;
1833
1834 case OP_CRRANGE:
1835 case OP_CRMINRANGE:
1836 minimize = (*ecode == OP_CRMINRANGE);
1837 min = GET2(ecode, 1);
1838 max = GET2(ecode, 3);
1839 if (max == 0) max = INT_MAX;
1840 ecode += 5;
1841 break;
1842
1843 default: /* No repeat follows */
1844 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1845 eptr += length;
1846 continue; /* With the main loop */
1847 }
1848
1849 /* If the length of the reference is zero, just continue with the
1850 main loop. */
1851
1852 if (length == 0) continue;
1853
1854 /* First, ensure the minimum number of matches are present. We get back
1855 the length of the reference string explicitly rather than passing the
1856 address of eptr, so that eptr can be a register variable. */
1857
1858 for (i = 1; i <= min; i++)
1859 {
1860 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1861 eptr += length;
1862 }
1863
1864 /* If min = max, continue at the same level without recursion.
1865 They are not both allowed to be zero. */
1866
1867 if (min == max) continue;
1868
1869 /* If minimizing, keep trying and advancing the pointer */
1870
1871 if (minimize)
1872 {
1873 for (fi = min;; fi++)
1874 {
1875 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1876 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1877 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1878 RRETURN(MATCH_NOMATCH);
1879 eptr += length;
1880 }
1881 /* Control never gets here */
1882 }
1883
1884 /* If maximizing, find the longest string and work backwards */
1885
1886 else
1887 {
1888 pp = eptr;
1889 for (i = min; i < max; i++)
1890 {
1891 if (!match_ref(offset, eptr, length, md, ims)) break;
1892 eptr += length;
1893 }
1894 while (eptr >= pp)
1895 {
1896 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1897 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1898 eptr -= length;
1899 }
1900 RRETURN(MATCH_NOMATCH);
1901 }
1902 }
1903 /* Control never gets here */
1904
1905
1906
1907 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1908 used when all the characters in the class have values in the range 0-255,
1909 and either the matching is caseful, or the characters are in the range
1910 0-127 when UTF-8 processing is enabled. The only difference between
1911 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1912 encountered.
1913
1914 First, look past the end of the item to see if there is repeat information
1915 following. Then obey similar code to character type repeats - written out
1916 again for speed. */
1917
1918 case OP_NCLASS:
1919 case OP_CLASS:
1920 {
1921 data = ecode + 1; /* Save for matching */
1922 ecode += 33; /* Advance past the item */
1923
1924 switch (*ecode)
1925 {
1926 case OP_CRSTAR:
1927 case OP_CRMINSTAR:
1928 case OP_CRPLUS:
1929 case OP_CRMINPLUS:
1930 case OP_CRQUERY:
1931 case OP_CRMINQUERY:
1932 c = *ecode++ - OP_CRSTAR;
1933 minimize = (c & 1) != 0;
1934 min = rep_min[c]; /* Pick up values from tables; */
1935 max = rep_max[c]; /* zero for max => infinity */
1936 if (max == 0) max = INT_MAX;
1937 break;
1938
1939 case OP_CRRANGE:
1940 case OP_CRMINRANGE:
1941 minimize = (*ecode == OP_CRMINRANGE);
1942 min = GET2(ecode, 1);
1943 max = GET2(ecode, 3);
1944 if (max == 0) max = INT_MAX;
1945 ecode += 5;
1946 break;
1947
1948 default: /* No repeat follows */
1949 min = max = 1;
1950 break;
1951 }
1952
1953 /* First, ensure the minimum number of matches are present. */
1954
1955 #ifdef SUPPORT_UTF8
1956 /* UTF-8 mode */
1957 if (utf8)
1958 {
1959 for (i = 1; i <= min; i++)
1960 {
1961 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1962 GETCHARINC(c, eptr);
1963 if (c > 255)
1964 {
1965 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1966 }
1967 else
1968 {
1969 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1970 }
1971 }
1972 }
1973 else
1974 #endif
1975 /* Not UTF-8 mode */
1976 {
1977 for (i = 1; i <= min; i++)
1978 {
1979 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1980 c = *eptr++;
1981 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1982 }
1983 }
1984
1985 /* If max == min we can continue with the main loop without the
1986 need to recurse. */
1987
1988 if (min == max) continue;
1989
1990 /* If minimizing, keep testing the rest of the expression and advancing
1991 the pointer while it matches the class. */
1992
1993 if (minimize)
1994 {
1995 #ifdef SUPPORT_UTF8
1996 /* UTF-8 mode */
1997 if (utf8)
1998 {
1999 for (fi = min;; fi++)
2000 {
2001 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2002 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2003 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2004 GETCHARINC(c, eptr);
2005 if (c > 255)
2006 {
2007 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2008 }
2009 else
2010 {
2011 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2012 }
2013 }
2014 }
2015 else
2016 #endif
2017 /* Not UTF-8 mode */
2018 {
2019 for (fi = min;; fi++)
2020 {
2021 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2022 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2023 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2024 c = *eptr++;
2025 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2026 }
2027 }
2028 /* Control never gets here */
2029 }
2030
2031 /* If maximizing, find the longest possible run, then work backwards. */
2032
2033 else
2034 {
2035 pp = eptr;
2036
2037 #ifdef SUPPORT_UTF8
2038 /* UTF-8 mode */
2039 if (utf8)
2040 {
2041 for (i = min; i < max; i++)
2042 {
2043 int len = 1;
2044 if (eptr >= md->end_subject) break;
2045 GETCHARLEN(c, eptr, len);
2046 if (c > 255)
2047 {
2048 if (op == OP_CLASS) break;
2049 }
2050 else
2051 {
2052 if ((data[c/8] & (1 << (c&7))) == 0) break;
2053 }
2054 eptr += len;
2055 }
2056 for (;;)
2057 {
2058 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2059 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2060 if (eptr-- == pp) break; /* Stop if tried at original pos */
2061 BACKCHAR(eptr);
2062 }
2063 }
2064 else
2065 #endif
2066 /* Not UTF-8 mode */
2067 {
2068 for (i = min; i < max; i++)
2069 {
2070 if (eptr >= md->end_subject) break;
2071 c = *eptr;
2072 if ((data[c/8] & (1 << (c&7))) == 0) break;
2073 eptr++;
2074 }
2075 while (eptr >= pp)
2076 {
2077 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2078 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2079 eptr--;
2080 }
2081 }
2082
2083 RRETURN(MATCH_NOMATCH);
2084 }
2085 }
2086 /* Control never gets here */
2087
2088
2089 /* Match an extended character class. This opcode is encountered only
2090 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2091 mode, because Unicode properties are supported in non-UTF-8 mode. */
2092
2093 #ifdef SUPPORT_UTF8
2094 case OP_XCLASS:
2095 {
2096 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2097 ecode += GET(ecode, 1); /* Advance past the item */
2098
2099 switch (*ecode)
2100 {
2101 case OP_CRSTAR:
2102 case OP_CRMINSTAR:
2103 case OP_CRPLUS:
2104 case OP_CRMINPLUS:
2105 case OP_CRQUERY:
2106 case OP_CRMINQUERY:
2107 c = *ecode++ - OP_CRSTAR;
2108 minimize = (c & 1) != 0;
2109 min = rep_min[c]; /* Pick up values from tables; */
2110 max = rep_max[c]; /* zero for max => infinity */
2111 if (max == 0) max = INT_MAX;
2112 break;
2113
2114 case OP_CRRANGE:
2115 case OP_CRMINRANGE:
2116 minimize = (*ecode == OP_CRMINRANGE);
2117 min = GET2(ecode, 1);
2118 max = GET2(ecode, 3);
2119 if (max == 0) max = INT_MAX;
2120 ecode += 5;
2121 break;
2122
2123 default: /* No repeat follows */
2124 min = max = 1;
2125 break;
2126 }
2127
2128 /* First, ensure the minimum number of matches are present. */
2129
2130 for (i = 1; i <= min; i++)
2131 {
2132 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2133 GETCHARINCTEST(c, eptr);
2134 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2135 }
2136
2137 /* If max == min we can continue with the main loop without the
2138 need to recurse. */
2139
2140 if (min == max) continue;
2141
2142 /* If minimizing, keep testing the rest of the expression and advancing
2143 the pointer while it matches the class. */
2144
2145 if (minimize)
2146 {
2147 for (fi = min;; fi++)
2148 {
2149 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2150 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2151 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2152 GETCHARINCTEST(c, eptr);
2153 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2154 }
2155 /* Control never gets here */
2156 }
2157
2158 /* If maximizing, find the longest possible run, then work backwards. */
2159
2160 else
2161 {
2162 pp = eptr;
2163 for (i = min; i < max; i++)
2164 {
2165 int len = 1;
2166 if (eptr >= md->end_subject) break;
2167 GETCHARLENTEST(c, eptr, len);
2168 if (!_pcre_xclass(c, data)) break;
2169 eptr += len;
2170 }
2171 for(;;)
2172 {
2173 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2174 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2175 if (eptr-- == pp) break; /* Stop if tried at original pos */
2176 if (utf8) BACKCHAR(eptr);
2177 }
2178 RRETURN(MATCH_NOMATCH);
2179 }
2180
2181 /* Control never gets here */
2182 }
2183 #endif /* End of XCLASS */
2184
2185 /* Match a single character, casefully */
2186
2187 case OP_CHAR:
2188 #ifdef SUPPORT_UTF8
2189 if (utf8)
2190 {
2191 length = 1;
2192 ecode++;
2193 GETCHARLEN(fc, ecode, length);
2194 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2195 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2196 }
2197 else
2198 #endif
2199
2200 /* Non-UTF-8 mode */
2201 {
2202 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2203 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2204 ecode += 2;
2205 }
2206 break;
2207
2208 /* Match a single character, caselessly */
2209
2210 case OP_CHARNC:
2211 #ifdef SUPPORT_UTF8
2212 if (utf8)
2213 {
2214 length = 1;
2215 ecode++;
2216 GETCHARLEN(fc, ecode, length);
2217
2218 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2219
2220 /* If the pattern character's value is < 128, we have only one byte, and
2221 can use the fast lookup table. */
2222
2223 if (fc < 128)
2224 {
2225 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2226 }
2227
2228 /* Otherwise we must pick up the subject character */
2229
2230 else
2231 {
2232 unsigned int dc;
2233 GETCHARINC(dc, eptr);
2234 ecode += length;
2235
2236 /* If we have Unicode property support, we can use it to test the other
2237 case of the character, if there is one. */
2238
2239 if (fc != dc)
2240 {
2241 #ifdef SUPPORT_UCP
2242 if (dc != UCD_OTHERCASE(fc))
2243 #endif
2244 RRETURN(MATCH_NOMATCH);
2245 }
2246 }
2247 }
2248 else
2249 #endif /* SUPPORT_UTF8 */
2250
2251 /* Non-UTF-8 mode */
2252 {
2253 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2254 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2255 ecode += 2;
2256 }
2257 break;
2258
2259 /* Match a single character repeatedly. */
2260
2261 case OP_EXACT:
2262 min = max = GET2(ecode, 1);
2263 ecode += 3;
2264 goto REPEATCHAR;
2265
2266 case OP_POSUPTO:
2267 possessive = TRUE;
2268 /* Fall through */
2269
2270 case OP_UPTO:
2271 case OP_MINUPTO:
2272 min = 0;
2273 max = GET2(ecode, 1);
2274 minimize = *ecode == OP_MINUPTO;
2275 ecode += 3;
2276 goto REPEATCHAR;
2277
2278 case OP_POSSTAR:
2279 possessive = TRUE;
2280 min = 0;
2281 max = INT_MAX;
2282 ecode++;
2283 goto REPEATCHAR;
2284
2285 case OP_POSPLUS:
2286 possessive = TRUE;
2287 min = 1;
2288 max = INT_MAX;
2289 ecode++;
2290 goto REPEATCHAR;
2291
2292 case OP_POSQUERY:
2293 possessive = TRUE;
2294 min = 0;
2295 max = 1;
2296 ecode++;
2297 goto REPEATCHAR;
2298
2299 case OP_STAR:
2300 case OP_MINSTAR:
2301 case OP_PLUS:
2302 case OP_MINPLUS:
2303 case OP_QUERY:
2304 case OP_MINQUERY:
2305 c = *ecode++ - OP_STAR;
2306 minimize = (c & 1) != 0;
2307 min = rep_min[c]; /* Pick up values from tables; */
2308 max = rep_max[c]; /* zero for max => infinity */
2309 if (max == 0) max = INT_MAX;
2310
2311 /* Common code for all repeated single-character matches. We can give
2312 up quickly if there are fewer than the minimum number of characters left in
2313 the subject. */
2314
2315 REPEATCHAR:
2316 #ifdef SUPPORT_UTF8
2317 if (utf8)
2318 {
2319 length = 1;
2320 charptr = ecode;
2321 GETCHARLEN(fc, ecode, length);
2322 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2323 ecode += length;
2324
2325 /* Handle multibyte character matching specially here. There is
2326 support for caseless matching if UCP support is present. */
2327
2328 if (length > 1)
2329 {
2330 #ifdef SUPPORT_UCP
2331 unsigned int othercase;
2332 if ((ims & PCRE_CASELESS) != 0 &&
2333 (othercase = UCD_OTHERCASE(fc)) != fc)
2334 oclength = _pcre_ord2utf8(othercase, occhars);
2335 else oclength = 0;
2336 #endif /* SUPPORT_UCP */
2337
2338 for (i = 1; i <= min; i++)
2339 {
2340 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2341 #ifdef SUPPORT_UCP
2342 /* Need braces because of following else */
2343 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2344 else
2345 {
2346 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2347 eptr += oclength;
2348 }
2349 #else /* without SUPPORT_UCP */
2350 else { RRETURN(MATCH_NOMATCH); }
2351 #endif /* SUPPORT_UCP */
2352 }
2353
2354 if (min == max) continue;
2355
2356 if (minimize)
2357 {
2358 for (fi = min;; fi++)
2359 {
2360 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2361 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2362 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2363 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2364 #ifdef SUPPORT_UCP
2365 /* Need braces because of following else */
2366 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2367 else
2368 {
2369 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2370 eptr += oclength;
2371 }
2372 #else /* without SUPPORT_UCP */
2373 else { RRETURN (MATCH_NOMATCH); }
2374 #endif /* SUPPORT_UCP */
2375 }
2376 /* Control never gets here */
2377 }
2378
2379 else /* Maximize */
2380 {
2381 pp = eptr;
2382 for (i = min; i < max; i++)
2383 {
2384 if (eptr > md->end_subject - length) break;
2385 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2386 #ifdef SUPPORT_UCP
2387 else if (oclength == 0) break;
2388 else
2389 {
2390 if (memcmp(eptr, occhars, oclength) != 0) break;
2391 eptr += oclength;
2392 }
2393 #else /* without SUPPORT_UCP */
2394 else break;
2395 #endif /* SUPPORT_UCP */
2396 }
2397
2398 if (possessive) continue;
2399 for(;;)
2400 {
2401 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2402 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2403 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2404 #ifdef SUPPORT_UCP
2405 eptr--;
2406 BACKCHAR(eptr);
2407 #else /* without SUPPORT_UCP */
2408 eptr -= length;
2409 #endif /* SUPPORT_UCP */
2410 }
2411 }
2412 /* Control never gets here */
2413 }
2414
2415 /* If the length of a UTF-8 character is 1, we fall through here, and
2416 obey the code as for non-UTF-8 characters below, though in this case the
2417 value of fc will always be < 128. */
2418 }
2419 else
2420 #endif /* SUPPORT_UTF8 */
2421
2422 /* When not in UTF-8 mode, load a single-byte character. */
2423 {
2424 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2425 fc = *ecode++;
2426 }
2427
2428 /* The value of fc at this point is always less than 256, though we may or
2429 may not be in UTF-8 mode. The code is duplicated for the caseless and
2430 caseful cases, for speed, since matching characters is likely to be quite
2431 common. First, ensure the minimum number of matches are present. If min =
2432 max, continue at the same level without recursing. Otherwise, if
2433 minimizing, keep trying the rest of the expression and advancing one
2434 matching character if failing, up to the maximum. Alternatively, if
2435 maximizing, find the maximum number of characters and work backwards. */
2436
2437 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2438 max, eptr));
2439
2440 if ((ims & PCRE_CASELESS) != 0)
2441 {
2442 fc = md->lcc[fc];
2443 for (i = 1; i <= min; i++)
2444 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2445 if (min == max) continue;
2446 if (minimize)
2447 {
2448 for (fi = min;; fi++)
2449 {
2450 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2451 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2452 if (fi >= max || eptr >= md->end_subject ||
2453 fc != md->lcc[*eptr++])
2454 RRETURN(MATCH_NOMATCH);
2455 }
2456 /* Control never gets here */
2457 }
2458 else /* Maximize */
2459 {
2460 pp = eptr;
2461 for (i = min; i < max; i++)
2462 {
2463 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2464 eptr++;
2465 }
2466 if (possessive) continue;
2467 while (eptr >= pp)
2468 {
2469 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2470 eptr--;
2471 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2472 }
2473 RRETURN(MATCH_NOMATCH);
2474 }
2475 /* Control never gets here */
2476 }
2477
2478 /* Caseful comparisons (includes all multi-byte characters) */
2479
2480 else
2481 {
2482 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2483 if (min == max) continue;
2484 if (minimize)
2485 {
2486 for (fi = min;; fi++)
2487 {
2488 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2489 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2490 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2491 RRETURN(MATCH_NOMATCH);
2492 }
2493 /* Control never gets here */
2494 }
2495 else /* Maximize */
2496 {
2497 pp = eptr;
2498 for (i = min; i < max; i++)
2499 {
2500 if (eptr >= md->end_subject || fc != *eptr) break;
2501 eptr++;
2502 }
2503 if (possessive) continue;
2504 while (eptr >= pp)
2505 {
2506 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2507 eptr--;
2508 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2509 }
2510 RRETURN(MATCH_NOMATCH);
2511 }
2512 }
2513 /* Control never gets here */
2514
2515 /* Match a negated single one-byte character. The character we are
2516 checking can be multibyte. */
2517
2518 case OP_NOT:
2519 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2520 ecode++;
2521 GETCHARINCTEST(c, eptr);
2522 if ((ims & PCRE_CASELESS) != 0)
2523 {
2524 #ifdef SUPPORT_UTF8
2525 if (c < 256)
2526 #endif
2527 c = md->lcc[c];
2528 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2529 }
2530 else
2531 {
2532 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2533 }
2534 break;
2535
2536 /* Match a negated single one-byte character repeatedly. This is almost a
2537 repeat of the code for a repeated single character, but I haven't found a
2538 nice way of commoning these up that doesn't require a test of the
2539 positive/negative option for each character match. Maybe that wouldn't add
2540 very much to the time taken, but character matching *is* what this is all
2541 about... */
2542
2543 case OP_NOTEXACT:
2544 min = max = GET2(ecode, 1);
2545 ecode += 3;
2546 goto REPEATNOTCHAR;
2547
2548 case OP_NOTUPTO:
2549 case OP_NOTMINUPTO:
2550 min = 0;
2551 max = GET2(ecode, 1);
2552 minimize = *ecode == OP_NOTMINUPTO;
2553 ecode += 3;
2554 goto REPEATNOTCHAR;
2555
2556 case OP_NOTPOSSTAR:
2557 possessive = TRUE;
2558 min = 0;
2559 max = INT_MAX;
2560 ecode++;
2561 goto REPEATNOTCHAR;
2562
2563 case OP_NOTPOSPLUS:
2564 possessive = TRUE;
2565 min = 1;
2566 max = INT_MAX;
2567 ecode++;
2568 goto REPEATNOTCHAR;
2569
2570 case OP_NOTPOSQUERY:
2571 possessive = TRUE;
2572 min = 0;
2573 max = 1;
2574 ecode++;
2575 goto REPEATNOTCHAR;
2576
2577 case OP_NOTPOSUPTO:
2578 possessive = TRUE;
2579 min = 0;
2580 max = GET2(ecode, 1);
2581 ecode += 3;
2582 goto REPEATNOTCHAR;
2583
2584 case OP_NOTSTAR:
2585 case OP_NOTMINSTAR:
2586 case OP_NOTPLUS:
2587 case OP_NOTMINPLUS:
2588 case OP_NOTQUERY:
2589 case OP_NOTMINQUERY:
2590 c = *ecode++ - OP_NOTSTAR;
2591 minimize = (c & 1) != 0;
2592 min = rep_min[c]; /* Pick up values from tables; */
2593 max = rep_max[c]; /* zero for max => infinity */
2594 if (max == 0) max = INT_MAX;
2595
2596 /* Common code for all repeated single-byte matches. We can give up quickly
2597 if there are fewer than the minimum number of bytes left in the
2598 subject. */
2599
2600 REPEATNOTCHAR:
2601 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2602 fc = *ecode++;
2603
2604 /* The code is duplicated for the caseless and caseful cases, for speed,
2605 since matching characters is likely to be quite common. First, ensure the
2606 minimum number of matches are present. If min = max, continue at the same
2607 level without recursing. Otherwise, if minimizing, keep trying the rest of
2608 the expression and advancing one matching character if failing, up to the
2609 maximum. Alternatively, if maximizing, find the maximum number of
2610 characters and work backwards. */
2611
2612 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2613 max, eptr));
2614
2615 if ((ims & PCRE_CASELESS) != 0)
2616 {
2617 fc = md->lcc[fc];
2618
2619 #ifdef SUPPORT_UTF8
2620 /* UTF-8 mode */
2621 if (utf8)
2622 {
2623 register unsigned int d;
2624 for (i = 1; i <= min; i++)
2625 {
2626 GETCHARINC(d, eptr);
2627 if (d < 256) d = md->lcc[d];
2628 if (fc == d) RRETURN(MATCH_NOMATCH);
2629 }
2630 }
2631 else
2632 #endif
2633
2634 /* Not UTF-8 mode */
2635 {
2636 for (i = 1; i <= min; i++)
2637 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2638 }
2639
2640 if (min == max) continue;
2641
2642 if (minimize)
2643 {
2644 #ifdef SUPPORT_UTF8
2645 /* UTF-8 mode */
2646 if (utf8)
2647 {
2648 register unsigned int d;
2649 for (fi = min;; fi++)
2650 {
2651 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2652 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2653 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2654 GETCHARINC(d, eptr);
2655 if (d < 256) d = md->lcc[d];
2656 if (fc == d) RRETURN(MATCH_NOMATCH);
2657
2658 }
2659 }
2660 else
2661 #endif
2662 /* Not UTF-8 mode */
2663 {
2664 for (fi = min;; fi++)
2665 {
2666 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2667 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2668 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2669 RRETURN(MATCH_NOMATCH);
2670 }
2671 }
2672 /* Control never gets here */
2673 }
2674
2675 /* Maximize case */
2676
2677 else
2678 {
2679 pp = eptr;
2680
2681 #ifdef SUPPORT_UTF8
2682 /* UTF-8 mode */
2683 if (utf8)
2684 {
2685 register unsigned int d;
2686 for (i = min; i < max; i++)
2687 {
2688 int len = 1;
2689 if (eptr >= md->end_subject) break;
2690 GETCHARLEN(d, eptr, len);
2691 if (d < 256) d = md->lcc[d];
2692 if (fc == d) break;
2693 eptr += len;
2694 }
2695 if (possessive) continue;
2696 for(;;)
2697 {
2698 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2699 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2700 if (eptr-- == pp) break; /* Stop if tried at original pos */
2701 BACKCHAR(eptr);
2702 }
2703 }
2704 else
2705 #endif
2706 /* Not UTF-8 mode */
2707 {
2708 for (i = min; i < max; i++)
2709 {
2710 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2711 eptr++;
2712 }
2713 if (possessive) continue;
2714 while (eptr >= pp)
2715 {
2716 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2717 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2718 eptr--;
2719 }
2720 }
2721
2722 RRETURN(MATCH_NOMATCH);
2723 }
2724 /* Control never gets here */
2725 }
2726
2727 /* Caseful comparisons */
2728
2729 else
2730 {
2731 #ifdef SUPPORT_UTF8
2732 /* UTF-8 mode */
2733 if (utf8)
2734 {
2735 register unsigned int d;
2736 for (i = 1; i <= min; i++)
2737 {
2738 GETCHARINC(d, eptr);
2739 if (fc == d) RRETURN(MATCH_NOMATCH);
2740 }
2741 }
2742 else
2743 #endif
2744 /* Not UTF-8 mode */
2745 {
2746 for (i = 1; i <= min; i++)
2747 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2748 }
2749
2750 if (min == max) continue;
2751
2752 if (minimize)
2753 {
2754 #ifdef SUPPORT_UTF8
2755 /* UTF-8 mode */
2756 if (utf8)
2757 {
2758 register unsigned int d;
2759 for (fi = min;; fi++)
2760 {
2761 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2762 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2763 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2764 GETCHARINC(d, eptr);
2765 if (fc == d) RRETURN(MATCH_NOMATCH);
2766 }
2767 }
2768 else
2769 #endif
2770 /* Not UTF-8 mode */
2771 {
2772 for (fi = min;; fi++)
2773 {
2774 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2775 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2776 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2777 RRETURN(MATCH_NOMATCH);
2778 }
2779 }
2780 /* Control never gets here */
2781 }
2782
2783 /* Maximize case */
2784
2785 else
2786 {
2787 pp = eptr;
2788
2789 #ifdef SUPPORT_UTF8
2790 /* UTF-8 mode */
2791 if (utf8)
2792 {
2793 register unsigned int d;
2794 for (i = min; i < max; i++)
2795 {
2796 int len = 1;
2797 if (eptr >= md->end_subject) break;
2798 GETCHARLEN(d, eptr, len);
2799 if (fc == d) break;
2800 eptr += len;
2801 }
2802 if (possessive) continue;
2803 for(;;)
2804 {
2805 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2806 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2807 if (eptr-- == pp) break; /* Stop if tried at original pos */
2808 BACKCHAR(eptr);
2809 }
2810 }
2811 else
2812 #endif
2813 /* Not UTF-8 mode */
2814 {
2815 for (i = min; i < max; i++)
2816 {
2817 if (eptr >= md->end_subject || fc == *eptr) break;
2818 eptr++;
2819 }
2820 if (possessive) continue;
2821 while (eptr >= pp)
2822 {
2823 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2824 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2825 eptr--;
2826 }
2827 }
2828
2829 RRETURN(MATCH_NOMATCH);
2830 }
2831 }
2832 /* Control never gets here */
2833
2834 /* Match a single character type repeatedly; several different opcodes
2835 share code. This is very similar to the code for single characters, but we
2836 repeat it in the interests of efficiency. */
2837
2838 case OP_TYPEEXACT:
2839 min = max = GET2(ecode, 1);
2840 minimize = TRUE;
2841 ecode += 3;
2842 goto REPEATTYPE;
2843
2844 case OP_TYPEUPTO:
2845 case OP_TYPEMINUPTO:
2846 min = 0;
2847 max = GET2(ecode, 1);
2848 minimize = *ecode == OP_TYPEMINUPTO;
2849 ecode += 3;
2850 goto REPEATTYPE;
2851
2852 case OP_TYPEPOSSTAR:
2853 possessive = TRUE;
2854 min = 0;
2855 max = INT_MAX;
2856 ecode++;
2857 goto REPEATTYPE;
2858
2859 case OP_TYPEPOSPLUS:
2860 possessive = TRUE;
2861 min = 1;
2862 max = INT_MAX;
2863 ecode++;
2864 goto REPEATTYPE;
2865
2866 case OP_TYPEPOSQUERY:
2867 possessive = TRUE;
2868 min = 0;
2869 max = 1;
2870 ecode++;
2871 goto REPEATTYPE;
2872
2873 case OP_TYPEPOSUPTO:
2874 possessive = TRUE;
2875 min = 0;
2876 max = GET2(ecode, 1);
2877 ecode += 3;
2878 goto REPEATTYPE;
2879
2880 case OP_TYPESTAR:
2881 case OP_TYPEMINSTAR:
2882 case OP_TYPEPLUS:
2883 case OP_TYPEMINPLUS:
2884 case OP_TYPEQUERY:
2885 case OP_TYPEMINQUERY:
2886 c = *ecode++ - OP_TYPESTAR;
2887 minimize = (c & 1) != 0;
2888 min = rep_min[c]; /* Pick up values from tables; */
2889 max = rep_max[c]; /* zero for max => infinity */
2890 if (max == 0) max = INT_MAX;
2891
2892 /* Common code for all repeated single character type matches. Note that
2893 in UTF-8 mode, '.' matches a character of any length, but for the other
2894 character types, the valid characters are all one-byte long. */
2895
2896 REPEATTYPE:
2897 ctype = *ecode++; /* Code for the character type */
2898
2899 #ifdef SUPPORT_UCP
2900 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2901 {
2902 prop_fail_result = ctype == OP_NOTPROP;
2903 prop_type = *ecode++;
2904 prop_value = *ecode++;
2905 }
2906 else prop_type = -1;
2907 #endif
2908
2909 /* First, ensure the minimum number of matches are present. Use inline
2910 code for maximizing the speed, and do the type test once at the start
2911 (i.e. keep it out of the loop). Also we can test that there are at least
2912 the minimum number of bytes before we start. This isn't as effective in
2913 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2914 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2915 and single-bytes. */
2916
2917 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2918 if (min > 0)
2919 {
2920 #ifdef SUPPORT_UCP
2921 if (prop_type >= 0)
2922 {
2923 switch(prop_type)
2924 {
2925 case PT_ANY:
2926 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2927 for (i = 1; i <= min; i++)
2928 {
2929 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2930 GETCHARINCTEST(c, eptr);
2931 }
2932 break;
2933
2934 case PT_LAMP:
2935 for (i = 1; i <= min; i++)
2936 {
2937 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2938 GETCHARINCTEST(c, eptr);
2939 prop_chartype = UCD_CHARTYPE(c);
2940 if ((prop_chartype == ucp_Lu ||
2941 prop_chartype == ucp_Ll ||
2942 prop_chartype == ucp_Lt) == prop_fail_result)
2943 RRETURN(MATCH_NOMATCH);
2944 }
2945 break;
2946
2947 case PT_GC:
2948 for (i = 1; i <= min; i++)
2949 {
2950 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2951 GETCHARINCTEST(c, eptr);
2952 prop_category = UCD_CATEGORY(c);
2953 if ((prop_category == prop_value) == prop_fail_result)
2954 RRETURN(MATCH_NOMATCH);
2955 }
2956 break;
2957
2958 case PT_PC:
2959 for (i = 1; i <= min; i++)
2960 {
2961 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2962 GETCHARINCTEST(c, eptr);
2963 prop_chartype = UCD_CHARTYPE(c);
2964 if ((prop_chartype == prop_value) == prop_fail_result)
2965 RRETURN(MATCH_NOMATCH);
2966 }
2967 break;
2968
2969 case PT_SC:
2970 for (i = 1; i <= min; i++)
2971 {
2972 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2973 GETCHARINCTEST(c, eptr);
2974 prop_script = UCD_SCRIPT(c);
2975 if ((prop_script == prop_value) == prop_fail_result)
2976 RRETURN(MATCH_NOMATCH);
2977 }
2978 break;
2979
2980 default:
2981 RRETURN(PCRE_ERROR_INTERNAL);
2982 }
2983 }
2984
2985 /* Match extended Unicode sequences. We will get here only if the
2986 support is in the binary; otherwise a compile-time error occurs. */
2987
2988 else if (ctype == OP_EXTUNI)
2989 {
2990 for (i = 1; i <= min; i++)
2991 {
2992 GETCHARINCTEST(c, eptr);
2993 prop_category = UCD_CATEGORY(c);
2994 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2995 while (eptr < md->end_subject)
2996 {
2997 int len = 1;
2998 if (!utf8) c = *eptr; else
2999 {
3000 GETCHARLEN(c, eptr, len);
3001 }
3002 prop_category = UCD_CATEGORY(c);
3003 if (prop_category != ucp_M) break;
3004 eptr += len;
3005 }
3006 }
3007 }
3008
3009 else
3010 #endif /* SUPPORT_UCP */
3011
3012 /* Handle all other cases when the coding is UTF-8 */
3013
3014 #ifdef SUPPORT_UTF8
3015 if (utf8) switch(ctype)
3016 {
3017 case OP_ANY:
3018 for (i = 1; i <= min; i++)
3019 {
3020 if (eptr >= md->end_subject || IS_NEWLINE(eptr))
3021 RRETURN(MATCH_NOMATCH);
3022 eptr++;
3023 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3024 }
3025 break;
3026
3027 case OP_ALLANY:
3028 for (i = 1; i <= min; i++)
3029 {
3030 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3031 eptr++;
3032 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3033 }
3034 break;
3035
3036 case OP_ANYBYTE:
3037 eptr += min;
3038 break;
3039
3040 case OP_ANYNL:
3041 for (i = 1; i <= min; i++)
3042 {
3043 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3044 GETCHARINC(c, eptr);
3045 switch(c)
3046 {
3047 default: RRETURN(MATCH_NOMATCH);
3048 case 0x000d:
3049 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3050 break;
3051
3052 case 0x000a:
3053 break;
3054
3055 case 0x000b:
3056 case 0x000c:
3057 case 0x0085:
3058 case 0x2028:
3059 case 0x2029:
3060 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3061 break;
3062 }
3063 }
3064 break;
3065
3066 case OP_NOT_HSPACE:
3067 for (i = 1; i <= min; i++)
3068 {
3069 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3070 GETCHARINC(c, eptr);
3071 switch(c)
3072 {
3073 default: break;
3074 case 0x09: /* HT */
3075 case 0x20: /* SPACE */
3076 case 0xa0: /* NBSP */
3077 case 0x1680: /* OGHAM SPACE MARK */
3078 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3079 case 0x2000: /* EN QUAD */
3080 case 0x2001: /* EM QUAD */
3081 case 0x2002: /* EN SPACE */
3082 case 0x2003: /* EM SPACE */
3083 case 0x2004: /* THREE-PER-EM SPACE */
3084 case 0x2005: /* FOUR-PER-EM SPACE */
3085 case 0x2006: /* SIX-PER-EM SPACE */
3086 case 0x2007: /* FIGURE SPACE */
3087 case 0x2008: /* PUNCTUATION SPACE */
3088 case 0x2009: /* THIN SPACE */
3089 case 0x200A: /* HAIR SPACE */
3090 case 0x202f: /* NARROW NO-BREAK SPACE */
3091 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3092 case 0x3000: /* IDEOGRAPHIC SPACE */
3093 RRETURN(MATCH_NOMATCH);
3094 }
3095 }
3096 break;
3097
3098 case OP_HSPACE:
3099 for (i = 1; i <= min; i++)
3100 {
3101 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3102 GETCHARINC(c, eptr);
3103 switch(c)
3104 {
3105 default: RRETURN(MATCH_NOMATCH);
3106 case 0x09: /* HT */
3107 case 0x20: /* SPACE */
3108 case 0xa0: /* NBSP */
3109 case 0x1680: /* OGHAM SPACE MARK */
3110 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3111 case 0x2000: /* EN QUAD */
3112 case 0x2001: /* EM QUAD */
3113 case 0x2002: /* EN SPACE */
3114 case 0x2003: /* EM SPACE */
3115 case 0x2004: /* THREE-PER-EM SPACE */
3116 case 0x2005: /* FOUR-PER-EM SPACE */
3117 case 0x2006: /* SIX-PER-EM SPACE */
3118 case 0x2007: /* FIGURE SPACE */
3119 case 0x2008: /* PUNCTUATION SPACE */
3120 case 0x2009: /* THIN SPACE */
3121 case 0x200A: /* HAIR SPACE */
3122 case 0x202f: /* NARROW NO-BREAK SPACE */
3123 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3124 case 0x3000: /* IDEOGRAPHIC SPACE */
3125 break;
3126 }
3127 }
3128 break;
3129
3130 case OP_NOT_VSPACE:
3131 for (i = 1; i <= min; i++)
3132 {
3133 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3134 GETCHARINC(c, eptr);
3135 switch(c)
3136 {
3137 default: break;
3138 case 0x0a: /* LF */
3139 case 0x0b: /* VT */
3140 case 0x0c: /* FF */
3141 case 0x0d: /* CR */
3142 case 0x85: /* NEL */
3143 case 0x2028: /* LINE SEPARATOR */
3144 case 0x2029: /* PARAGRAPH SEPARATOR */
3145 RRETURN(MATCH_NOMATCH);
3146 }
3147 }
3148 break;
3149
3150 case OP_VSPACE:
3151 for (i = 1; i <= min; i++)
3152 {
3153 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3154 GETCHARINC(c, eptr);
3155 switch(c)
3156 {
3157 default: RRETURN(MATCH_NOMATCH);
3158 case 0x0a: /* LF */
3159 case 0x0b: /* VT */
3160 case 0x0c: /* FF */
3161 case 0x0d: /* CR */
3162 case 0x85: /* NEL */
3163 case 0x2028: /* LINE SEPARATOR */
3164 case 0x2029: /* PARAGRAPH SEPARATOR */
3165 break;
3166 }
3167 }
3168 break;
3169
3170 case OP_NOT_DIGIT:
3171 for (i = 1; i <= min; i++)
3172 {
3173 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3174 GETCHARINC(c, eptr);
3175 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3176 RRETURN(MATCH_NOMATCH);
3177 }
3178 break;
3179
3180 case OP_DIGIT:
3181 for (i = 1; i <= min; i++)
3182 {
3183 if (eptr >= md->end_subject ||
3184 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3185 RRETURN(MATCH_NOMATCH);
3186 /* No need to skip more bytes - we know it's a 1-byte character */
3187 }
3188 break;
3189
3190 case OP_NOT_WHITESPACE:
3191 for (i = 1; i <= min; i++)
3192 {
3193 if (eptr >= md->end_subject ||
3194 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3195 RRETURN(MATCH_NOMATCH);
3196 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3197 }
3198 break;
3199
3200 case OP_WHITESPACE:
3201 for (i = 1; i <= min; i++)
3202 {
3203 if (eptr >= md->end_subject ||
3204 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3205 RRETURN(MATCH_NOMATCH);
3206 /* No need to skip more bytes - we know it's a 1-byte character */
3207 }
3208 break;
3209
3210 case OP_NOT_WORDCHAR:
3211 for (i = 1; i <= min; i++)
3212 {
3213 if (eptr >= md->end_subject ||
3214 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3215 RRETURN(MATCH_NOMATCH);
3216 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3217 }
3218 break;
3219
3220 case OP_WORDCHAR:
3221 for (i = 1; i <= min; i++)
3222 {
3223 if (eptr >= md->end_subject ||
3224 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3225 RRETURN(MATCH_NOMATCH);
3226 /* No need to skip more bytes - we know it's a 1-byte character */
3227 }
3228 break;
3229
3230 default:
3231 RRETURN(PCRE_ERROR_INTERNAL);
3232 } /* End switch(ctype) */
3233
3234 else
3235 #endif /* SUPPORT_UTF8 */
3236
3237 /* Code for the non-UTF-8 case for minimum matching of operators other
3238 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3239 number of bytes present, as this was tested above. */
3240
3241 switch(ctype)
3242 {
3243 case OP_ANY:
3244 for (i = 1; i <= min; i++)
3245 {
3246 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3247 eptr++;
3248 }
3249 break;
3250
3251 case OP_ALLANY:
3252 eptr += min;
3253 break;
3254
3255 case OP_ANYBYTE:
3256 eptr += min;
3257 break;
3258
3259 /* Because of the CRLF case, we can't assume the minimum number of
3260 bytes are present in this case. */
3261
3262 case OP_ANYNL:
3263 for (i = 1; i <= min; i++)
3264 {
3265 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3266 switch(*eptr++)
3267 {
3268 default: RRETURN(MATCH_NOMATCH);
3269 case 0x000d:
3270 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3271 break;
3272 case 0x000a:
3273 break;
3274
3275 case 0x000b:
3276 case 0x000c:
3277 case 0x0085:
3278 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3279 break;
3280 }
3281 }
3282 break;
3283
3284 case OP_NOT_HSPACE:
3285 for (i = 1; i <= min; i++)
3286 {
3287 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3288 switch(*eptr++)
3289 {
3290 default: break;
3291 case 0x09: /* HT */
3292 case 0x20: /* SPACE */
3293 case 0xa0: /* NBSP */
3294 RRETURN(MATCH_NOMATCH);
3295 }
3296 }
3297 break;
3298
3299 case OP_HSPACE:
3300 for (i = 1; i <= min; i++)
3301 {
3302 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3303 switch(*eptr++)
3304 {
3305 default: RRETURN(MATCH_NOMATCH);
3306 case 0x09: /* HT */
3307 case 0x20: /* SPACE */
3308 case 0xa0: /* NBSP */
3309 break;
3310 }
3311 }
3312 break;
3313
3314 case OP_NOT_VSPACE:
3315 for (i = 1; i <= min; i++)
3316 {
3317 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3318 switch(*eptr++)
3319 {
3320 default: break;
3321 case 0x0a: /* LF */
3322 case 0x0b: /* VT */
3323 case 0x0c: /* FF */
3324 case 0x0d: /* CR */
3325 case 0x85: /* NEL */
3326 RRETURN(MATCH_NOMATCH);
3327 }
3328 }
3329 break;
3330
3331 case OP_VSPACE:
3332 for (i = 1; i <= min; i++)
3333 {
3334 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3335 switch(*eptr++)
3336 {
3337 default: RRETURN(MATCH_NOMATCH);
3338 case 0x0a: /* LF */
3339 case 0x0b: /* VT */
3340 case 0x0c: /* FF */
3341 case 0x0d: /* CR */
3342 case 0x85: /* NEL */
3343 break;
3344 }
3345 }
3346 break;
3347
3348 case OP_NOT_DIGIT:
3349 for (i = 1; i <= min; i++)
3350 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3351 break;
3352
3353 case OP_DIGIT:
3354 for (i = 1; i <= min; i++)
3355 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3356 break;
3357
3358 case OP_NOT_WHITESPACE:
3359 for (i = 1; i <= min; i++)
3360 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3361 break;
3362
3363 case OP_WHITESPACE:
3364 for (i = 1; i <= min; i++)
3365 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3366 break;
3367
3368 case OP_NOT_WORDCHAR:
3369 for (i = 1; i <= min; i++)
3370 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3371 RRETURN(MATCH_NOMATCH);
3372 break;
3373
3374 case OP_WORDCHAR:
3375 for (i = 1; i <= min; i++)
3376 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3377 RRETURN(MATCH_NOMATCH);
3378 break;
3379
3380 default:
3381 RRETURN(PCRE_ERROR_INTERNAL);
3382 }
3383 }
3384
3385 /* If min = max, continue at the same level without recursing */
3386
3387 if (min == max) continue;
3388
3389 /* If minimizing, we have to test the rest of the pattern before each
3390 subsequent match. Again, separate the UTF-8 case for speed, and also
3391 separate the UCP cases. */
3392
3393 if (minimize)
3394 {
3395 #ifdef SUPPORT_UCP
3396 if (prop_type >= 0)
3397 {
3398 switch(prop_type)
3399 {
3400 case PT_ANY:
3401 for (fi = min;; fi++)
3402 {
3403 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3404 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3405 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3406 GETCHARINC(c, eptr);
3407 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3408 }
3409 /* Control never gets here */
3410
3411 case PT_LAMP:
3412 for (fi = min;; fi++)
3413 {
3414 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3415 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3416 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3417 GETCHARINC(c, eptr);
3418 prop_chartype = UCD_CHARTYPE(c);
3419 if ((prop_chartype == ucp_Lu ||
3420 prop_chartype == ucp_Ll ||
3421 prop_chartype == ucp_Lt) == prop_fail_result)
3422 RRETURN(MATCH_NOMATCH);
3423 }
3424 /* Control never gets here */
3425
3426 case PT_GC:
3427 for (fi = min;; fi++)
3428 {
3429 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3430 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3431 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3432 GETCHARINC(c, eptr);
3433 prop_category = UCD_CATEGORY(c);
3434 if ((prop_category == prop_value) == prop_fail_result)
3435 RRETURN(MATCH_NOMATCH);
3436 }
3437 /* Control never gets here */
3438
3439 case PT_PC:
3440 for (fi = min;; fi++)
3441 {
3442 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3443 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3444 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3445 GETCHARINC(c, eptr);
3446 prop_chartype = UCD_CHARTYPE(c);
3447 if ((prop_chartype == prop_value) == prop_fail_result)
3448 RRETURN(MATCH_NOMATCH);
3449 }
3450 /* Control never gets here */
3451
3452 case PT_SC:
3453 for (fi = min;; fi++)
3454 {
3455 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3456 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3457 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3458 GETCHARINC(c, eptr);
3459 prop_script = UCD_SCRIPT(c);
3460 if ((prop_script == prop_value) == prop_fail_result)
3461 RRETURN(MATCH_NOMATCH);
3462 }
3463 /* Control never gets here */
3464
3465 default:
3466 RRETURN(PCRE_ERROR_INTERNAL);
3467 }
3468 }
3469
3470 /* Match extended Unicode sequences. We will get here only if the
3471 support is in the binary; otherwise a compile-time error occurs. */
3472
3473 else if (ctype == OP_EXTUNI)
3474 {
3475 for (fi = min;; fi++)
3476 {
3477 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3478 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3479 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3480 GETCHARINCTEST(c, eptr);
3481 prop_category = UCD_CATEGORY(c);
3482 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3483 while (eptr < md->end_subject)
3484 {
3485 int len = 1;
3486 if (!utf8) c = *eptr; else
3487 {
3488 GETCHARLEN(c, eptr, len);
3489 }
3490 prop_category = UCD_CATEGORY(c);
3491 if (prop_category != ucp_M) break;
3492 eptr += len;
3493 }
3494 }
3495 }
3496
3497 else
3498 #endif /* SUPPORT_UCP */
3499
3500 #ifdef SUPPORT_UTF8
3501 /* UTF-8 mode */
3502 if (utf8)
3503 {
3504 for (fi = min;; fi++)
3505 {
3506 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3507 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3508 if (fi >= max || eptr >= md->end_subject ||
3509 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3510 RRETURN(MATCH_NOMATCH);
3511
3512 GETCHARINC(c, eptr);
3513 switch(ctype)
3514 {
3515 case OP_ANY: /* This is the non-NL case */
3516 case OP_ALLANY:
3517 case OP_ANYBYTE:
3518 break;
3519
3520 case OP_ANYNL:
3521 switch(c)
3522 {
3523 default: RRETURN(MATCH_NOMATCH);
3524 case 0x000d:
3525 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3526 break;
3527 case 0x000a:
3528 break;
3529
3530 case 0x000b:
3531 case 0x000c:
3532 case 0x0085:
3533 case 0x2028:
3534 case 0x2029:
3535 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3536 break;
3537 }
3538 break;
3539
3540 case OP_NOT_HSPACE:
3541 switch(c)
3542 {
3543 default: break;
3544 case 0x09: /* HT */
3545 case 0x20: /* SPACE */
3546 case 0xa0: /* NBSP */
3547 case 0x1680: /* OGHAM SPACE MARK */
3548 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3549 case 0x2000: /* EN QUAD */
3550 case 0x2001: /* EM QUAD */
3551 case 0x2002: /* EN SPACE */
3552 case 0x2003: /* EM SPACE */
3553 case 0x2004: /* THREE-PER-EM SPACE */
3554 case 0x2005: /* FOUR-PER-EM SPACE */
3555 case 0x2006: /* SIX-PER-EM SPACE */
3556 case 0x2007: /* FIGURE SPACE */
3557 case 0x2008: /* PUNCTUATION SPACE */
3558 case 0x2009: /* THIN SPACE */
3559 case 0x200A: /* HAIR SPACE */
3560 case 0x202f: /* NARROW NO-BREAK SPACE */
3561 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3562 case 0x3000: /* IDEOGRAPHIC SPACE */
3563 RRETURN(MATCH_NOMATCH);
3564 }
3565 break;
3566
3567 case OP_HSPACE:
3568 switch(c)
3569 {
3570 default: RRETURN(MATCH_NOMATCH);
3571 case 0x09: /* HT */
3572 case 0x20: /* SPACE */
3573 case 0xa0: /* NBSP */
3574 case 0x1680: /* OGHAM SPACE MARK */
3575 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3576 case 0x2000: /* EN QUAD */
3577 case 0x2001: /* EM QUAD */
3578 case 0x2002: /* EN SPACE */
3579 case 0x2003: /* EM SPACE */
3580 case 0x2004: /* THREE-PER-EM SPACE */
3581 case 0x2005: /* FOUR-PER-EM SPACE */
3582 case 0x2006: /* SIX-PER-EM SPACE */
3583 case 0x2007: /* FIGURE SPACE */
3584 case 0x2008: /* PUNCTUATION SPACE */
3585 case 0x2009: /* THIN SPACE */
3586 case 0x200A: /* HAIR SPACE */
3587 case 0x202f: /* NARROW NO-BREAK SPACE */
3588 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3589 case 0x3000: /* IDEOGRAPHIC SPACE */
3590 break;
3591 }
3592 break;
3593
3594 case OP_NOT_VSPACE:
3595 switch(c)
3596 {
3597 default: break;
3598 case 0x0a: /* LF */
3599 case 0x0b: /* VT */
3600 case 0x0c: /* FF */
3601 case 0x0d: /* CR */
3602 case 0x85: /* NEL */
3603 case 0x2028: /* LINE SEPARATOR */
3604 case 0x2029: /* PARAGRAPH SEPARATOR */
3605 RRETURN(MATCH_NOMATCH);
3606 }
3607 break;
3608
3609 case OP_VSPACE:
3610 switch(c)
3611 {
3612 default: RRETURN(MATCH_NOMATCH);
3613 case 0x0a: /* LF */
3614 case 0x0b: /* VT */
3615 case 0x0c: /* FF */
3616 case 0x0d: /* CR */
3617 case 0x85: /* NEL */
3618 case 0x2028: /* LINE SEPARATOR */
3619 case 0x2029: /* PARAGRAPH SEPARATOR */
3620 break;
3621 }
3622 break;
3623
3624 case OP_NOT_DIGIT:
3625 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3626 RRETURN(MATCH_NOMATCH);
3627 break;
3628
3629 case OP_DIGIT:
3630 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3631 RRETURN(MATCH_NOMATCH);
3632 break;
3633
3634 case OP_NOT_WHITESPACE:
3635 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3636 RRETURN(MATCH_NOMATCH);
3637 break;
3638
3639 case OP_WHITESPACE:
3640 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3641 RRETURN(MATCH_NOMATCH);
3642 break;
3643
3644 case OP_NOT_WORDCHAR:
3645 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3646 RRETURN(MATCH_NOMATCH);
3647 break;
3648
3649 case OP_WORDCHAR:
3650 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3651 RRETURN(MATCH_NOMATCH);
3652 break;
3653
3654 default:
3655 RRETURN(PCRE_ERROR_INTERNAL);
3656 }
3657 }
3658 }
3659 else
3660 #endif
3661 /* Not UTF-8 mode */
3662 {
3663 for (fi = min;; fi++)
3664 {
3665 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3666 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3667 if (fi >= max || eptr >= md->end_subject ||
3668 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3669 RRETURN(MATCH_NOMATCH);
3670
3671 c = *eptr++;
3672 switch(ctype)
3673 {
3674 case OP_ANY: /* This is the non-NL case */
3675 case OP_ALLANY:
3676 case OP_ANYBYTE:
3677 break;
3678
3679 case OP_ANYNL:
3680 switch(c)
3681 {
3682 default: RRETURN(MATCH_NOMATCH);
3683 case 0x000d:
3684 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3685 break;
3686
3687 case 0x000a:
3688 break;
3689
3690 case 0x000b:
3691 case 0x000c:
3692 case 0x0085:
3693 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3694 break;
3695 }
3696 break;
3697
3698 case OP_NOT_HSPACE:
3699 switch(c)
3700 {
3701 default: break;
3702 case 0x09: /* HT */
3703 case 0x20: /* SPACE */
3704 case 0xa0: /* NBSP */
3705 RRETURN(MATCH_NOMATCH);
3706 }
3707 break;
3708
3709 case OP_HSPACE:
3710 switch(c)
3711 {
3712 default: RRETURN(MATCH_NOMATCH);
3713 case 0x09: /* HT */
3714 case 0x20: /* SPACE */
3715 case 0xa0: /* NBSP */
3716 break;
3717 }
3718 break;
3719
3720 case OP_NOT_VSPACE:
3721 switch(c)
3722 {
3723 default: break;
3724 case 0x0a: /* LF */
3725 case 0x0b: /* VT */
3726 case 0x0c: /* FF */
3727 case 0x0d: /* CR */
3728 case 0x85: /* NEL */
3729 RRETURN(MATCH_NOMATCH);
3730 }
3731 break;
3732
3733 case OP_VSPACE:
3734 switch(c)
3735 {
3736 default: RRETURN(MATCH_NOMATCH);
3737 case 0x0a: /* LF */
3738 case 0x0b: /* VT */
3739 case 0x0c: /* FF */
3740 case 0x0d: /* CR */
3741 case 0x85: /* NEL */
3742 break;
3743 }
3744 break;
3745
3746 case OP_NOT_DIGIT:
3747 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3748 break;
3749
3750 case OP_DIGIT:
3751 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3752 break;
3753
3754 case OP_NOT_WHITESPACE:
3755 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3756 break;
3757
3758 case OP_WHITESPACE:
3759 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3760 break;
3761
3762 case OP_NOT_WORDCHAR:
3763 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3764 break;
3765
3766 case OP_WORDCHAR:
3767 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3768 break;
3769
3770 default:
3771 RRETURN(PCRE_ERROR_INTERNAL);
3772 }
3773 }
3774 }
3775 /* Control never gets here */
3776 }
3777
3778 /* If maximizing, it is worth using inline code for speed, doing the type
3779 test once at the start (i.e. keep it out of the loop). Again, keep the
3780 UTF-8 and UCP stuff separate. */
3781
3782 else
3783 {
3784 pp = eptr; /* Remember where we started */
3785
3786 #ifdef SUPPORT_UCP
3787 if (prop_type >= 0)
3788 {
3789 switch(prop_type)
3790 {
3791 case PT_ANY:
3792 for (i = min; i < max; i++)
3793 {
3794 int len = 1;
3795 if (eptr >= md->end_subject) break;
3796 GETCHARLEN(c, eptr, len);
3797 if (prop_fail_result) break;
3798 eptr+= len;
3799 }
3800 break;
3801
3802 case PT_LAMP:
3803 for (i = min; i < max; i++)
3804 {
3805 int len = 1;
3806 if (eptr >= md->end_subject) break;
3807 GETCHARLEN(c, eptr, len);
3808 prop_chartype = UCD_CHARTYPE(c);
3809 if ((prop_chartype == ucp_Lu ||
3810 prop_chartype == ucp_Ll ||
3811 prop_chartype == ucp_Lt) == prop_fail_result)
3812 break;
3813 eptr+= len;
3814 }
3815 break;
3816
3817 case PT_GC:
3818 for (i = min; i < max; i++)
3819 {
3820 int len = 1;
3821 if (eptr >= md->end_subject) break;
3822 GETCHARLEN(c, eptr, len);
3823 prop_category = UCD_CATEGORY(c);
3824 if ((prop_category == prop_value) == prop_fail_result)
3825 break;
3826 eptr+= len;
3827 }
3828 break;
3829
3830 case PT_PC:
3831 for (i = min; i < max; i++)
3832 {
3833 int len = 1;
3834 if (eptr >= md->end_subject) break;
3835 GETCHARLEN(c, eptr, len);
3836 prop_chartype = UCD_CHARTYPE(c);
3837 if ((prop_chartype == prop_value) == prop_fail_result)
3838 break;
3839 eptr+= len;
3840 }
3841 break;
3842
3843 case PT_SC:
3844 for (i = min; i < max; i++)
3845 {
3846 int len = 1;
3847 if (eptr >= md->end_subject) break;
3848 GETCHARLEN(c, eptr, len);
3849 prop_script = UCD_SCRIPT(c);
3850 if ((prop_script == prop_value) == prop_fail_result)
3851 break;
3852 eptr+= len;
3853 }
3854 break;
3855 }
3856
3857 /* eptr is now past the end of the maximum run */
3858
3859 if (possessive) continue;
3860 for(;;)
3861 {
3862 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3863 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3864 if (eptr-- == pp) break; /* Stop if tried at original pos */
3865 if (utf8) BACKCHAR(eptr);
3866 }
3867 }
3868
3869 /* Match extended Unicode sequences. We will get here only if the
3870 support is in the binary; otherwise a compile-time error occurs. */
3871
3872 else if (ctype == OP_EXTUNI)
3873 {
3874 for (i = min; i < max; i++)
3875 {
3876 if (eptr >= md->end_subject) break;
3877 GETCHARINCTEST(c, eptr);
3878 prop_category = UCD_CATEGORY(c);
3879 if (prop_category == ucp_M) break;
3880 while (eptr < md->end_subject)
3881 {
3882 int len = 1;
3883 if (!utf8) c = *eptr; else
3884 {
3885 GETCHARLEN(c, eptr, len);
3886 }
3887 prop_category = UCD_CATEGORY(c);
3888 if (prop_category != ucp_M) break;
3889 eptr += len;
3890 }
3891 }
3892
3893 /* eptr is now past the end of the maximum run */
3894
3895 if (possessive) continue;
3896 for(;;)
3897 {
3898 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3899 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3900 if (eptr-- == pp) break; /* Stop if tried at original pos */
3901 for (;;) /* Move back over one extended */
3902 {
3903 int len = 1;
3904 if (!utf8) c = *eptr; else
3905 {
3906 BACKCHAR(eptr);
3907 GETCHARLEN(c, eptr, len);
3908 }
3909 prop_category = UCD_CATEGORY(c);
3910 if (prop_category != ucp_M) break;
3911 eptr--;
3912 }
3913 }
3914 }
3915
3916 else
3917 #endif /* SUPPORT_UCP */
3918
3919 #ifdef SUPPORT_UTF8
3920 /* UTF-8 mode */
3921
3922 if (utf8)
3923 {
3924 switch(ctype)
3925 {
3926 case OP_ANY:
3927 if (max < INT_MAX)
3928 {
3929 for (i = min; i < max; i++)
3930 {
3931 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3932 eptr++;
3933 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3934 }
3935 }
3936
3937 /* Handle unlimited UTF-8 repeat */
3938
3939 else
3940 {
3941 for (i = min; i < max; i++)
3942 {
3943 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3944 eptr++;
3945 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3946 }
3947 }
3948 break;
3949
3950 case OP_ALLANY:
3951 if (max < INT_MAX)
3952 {
3953 for (i = min; i < max; i++)
3954 {
3955 if (eptr >= md->end_subject) break;
3956 eptr++;
3957 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3958 }
3959 }
3960 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3961 break;
3962
3963 /* The byte case is the same as non-UTF8 */
3964
3965 case OP_ANYBYTE:
3966 c = max - min;
3967 if (c > (unsigned int)(md->end_subject - eptr))
3968 c = md->end_subject - eptr;
3969 eptr += c;
3970 break;
3971
3972 case OP_ANYNL:
3973 for (i = min; i < max; i++)
3974 {
3975 int len = 1;
3976 if (eptr >= md->end_subject) break;
3977 GETCHARLEN(c, eptr, len);
3978 if (c == 0x000d)
3979 {
3980 if (++eptr >= md->end_subject) break;
3981 if (*eptr == 0x000a) eptr++;
3982 }
3983 else
3984 {
3985 if (c != 0x000a &&
3986 (md->bsr_anycrlf ||
3987 (c != 0x000b && c != 0x000c &&
3988 c != 0x0085 && c != 0x2028 && c != 0x2029)))
3989 break;
3990 eptr += len;
3991 }
3992 }
3993 break;
3994
3995 case OP_NOT_HSPACE:
3996 case OP_HSPACE:
3997 for (i = min; i < max; i++)
3998 {
3999 BOOL gotspace;
4000 int len = 1;
4001 if (eptr >= md->end_subject) break;
4002 GETCHARLEN(c, eptr, len);
4003 switch(c)
4004 {
4005 default: gotspace = FALSE; break;
4006 case 0x09: /* HT */
4007 case 0x20: /* SPACE */
4008 case 0xa0: /* NBSP */
4009 case 0x1680: /* OGHAM SPACE MARK */
4010 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4011 case 0x2000: /* EN QUAD */
4012 case 0x2001: /* EM QUAD */
4013 case 0x2002: /* EN SPACE */
4014 case 0x2003: /* EM SPACE */
4015 case 0x2004: /* THREE-PER-EM SPACE */
4016 case 0x2005: /* FOUR-PER-EM SPACE */
4017 case 0x2006: /* SIX-PER-EM SPACE */
4018 case 0x2007: /* FIGURE SPACE */
4019 case 0x2008: /* PUNCTUATION SPACE */
4020 case 0x2009: /* THIN SPACE */
4021 case 0x200A: /* HAIR SPACE */
4022 case 0x202f: /* NARROW NO-BREAK SPACE */
4023 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4024 case 0x3000: /* IDEOGRAPHIC SPACE */
4025 gotspace = TRUE;
4026 break;
4027 }
4028 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4029 eptr += len;
4030 }
4031 break;
4032
4033 case OP_NOT_VSPACE:
4034 case OP_VSPACE:
4035 for (i = min; i < max; i++)
4036 {
4037 BOOL gotspace;
4038 int len = 1;
4039 if (eptr >= md->end_subject) break;
4040 GETCHARLEN(c, eptr, len);
4041 switch(c)
4042 {
4043 default: gotspace = FALSE; break;
4044 case 0x0a: /* LF */
4045 case 0x0b: /* VT */
4046 case 0x0c: /* FF */
4047 case 0x0d: /* CR */
4048 case 0x85: /* NEL */
4049 case 0x2028: /* LINE SEPARATOR */
4050 case 0x2029: /* PARAGRAPH SEPARATOR */
4051 gotspace = TRUE;
4052 break;
4053 }
4054 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4055 eptr += len;
4056 }
4057 break;
4058
4059 case OP_NOT_DIGIT:
4060 for (i = min; i < max; i++)
4061 {
4062 int len = 1;
4063 if (eptr >= md->end_subject) break;
4064 GETCHARLEN(c, eptr, len);
4065 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4066 eptr+= len;
4067 }
4068 break;
4069
4070 case OP_DIGIT:
4071 for (i = min; i < max; i++)
4072 {
4073 int len = 1;
4074 if (eptr >= md->end_subject) break;
4075 GETCHARLEN(c, eptr, len);
4076 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4077 eptr+= len;
4078 }
4079 break;
4080
4081 case OP_NOT_WHITESPACE:
4082 for (i = min; i < max; i++)
4083 {
4084 int len = 1;
4085 if (eptr >= md->end_subject) break;
4086 GETCHARLEN(c, eptr, len);
4087 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4088 eptr+= len;
4089 }
4090 break;
4091
4092 case OP_WHITESPACE:
4093 for (i = min; i < max; i++)
4094 {
4095 int len = 1;
4096 if (eptr >= md->end_subject) break;
4097 GETCHARLEN(c, eptr, len);
4098 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4099 eptr+= len;
4100 }
4101 break;
4102
4103 case OP_NOT_WORDCHAR:
4104 for (i = min; i < max; i++)
4105 {
4106 int len = 1;
4107 if (eptr >= md->end_subject) break;
4108 GETCHARLEN(c, eptr, len);
4109 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4110 eptr+= len;
4111 }
4112 break;
4113
4114 case OP_WORDCHAR:
4115 for (i = min; i < max; i++)
4116 {
4117 int len = 1;
4118 if (eptr >= md->end_subject) break;
4119 GETCHARLEN(c, eptr, len);
4120 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4121 eptr+= len;
4122 }
4123 break;
4124
4125 default:
4126 RRETURN(PCRE_ERROR_INTERNAL);
4127 }
4128
4129 /* eptr is now past the end of the maximum run */
4130
4131 if (possessive) continue;
4132 for(;;)
4133 {
4134 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4135 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4136 if (eptr-- == pp) break; /* Stop if tried at original pos */
4137 BACKCHAR(eptr);
4138 }
4139 }
4140 else
4141 #endif /* SUPPORT_UTF8 */
4142
4143 /* Not UTF-8 mode */
4144 {
4145 switch(ctype)
4146 {
4147 case OP_ANY:
4148 for (i = min; i < max; i++)
4149 {
4150 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4151 eptr++;
4152 }
4153 break;
4154
4155 case OP_ALLANY:
4156 case OP_ANYBYTE:
4157 c = max - min;
4158 if (c > (unsigned int)(md->end_subject - eptr))
4159 c = md->end_subject - eptr;
4160 eptr += c;
4161 break;
4162
4163 case OP_ANYNL:
4164 for (i = min; i < max; i++)
4165 {
4166 if (eptr >= md->end_subject) break;
4167 c = *eptr;
4168 if (c == 0x000d)
4169 {
4170 if (++eptr >= md->end_subject) break;
4171 if (*eptr == 0x000a) eptr++;
4172 }
4173 else
4174 {
4175 if (c != 0x000a &&
4176 (md->bsr_anycrlf ||
4177 (c != 0x000b && c != 0x000c && c != 0x0085)))
4178 break;
4179 eptr++;
4180 }
4181 }
4182 break;
4183
4184 case OP_NOT_HSPACE:
4185 for (i = min; i < max; i++)
4186 {
4187 if (eptr >= md->end_subject) break;
4188 c = *eptr;
4189 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4190 eptr++;
4191 }
4192 break;
4193
4194 case OP_HSPACE:
4195 for (i = min; i < max; i++)
4196 {
4197 if (eptr >= md->end_subject) break;
4198 c = *eptr;
4199 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4200 eptr++;
4201 }
4202 break;
4203
4204 case OP_NOT_VSPACE:
4205 for (i = min; i < max; i++)
4206 {
4207 if (eptr >= md->end_subject) break;
4208 c = *eptr;
4209 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4210 break;
4211 eptr++;
4212 }
4213 break;
4214
4215 case OP_VSPACE:
4216 for (i = min; i < max; i++)
4217 {
4218 if (eptr >= md->end_subject) break;
4219 c = *eptr;
4220 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4221 break;
4222 eptr++;
4223 }
4224 break;
4225
4226 case OP_NOT_DIGIT:
4227 for (i = min; i < max; i++)
4228 {
4229 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4230 break;
4231 eptr++;
4232 }
4233 break;
4234
4235 case OP_DIGIT:
4236 for (i = min; i < max; i++)
4237 {
4238 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4239 break;
4240 eptr++;
4241 }
4242 break;
4243
4244 case OP_NOT_WHITESPACE:
4245 for (i = min; i < max; i++)
4246 {
4247 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4248 break;
4249 eptr++;
4250 }
4251 break;
4252
4253 case OP_WHITESPACE:
4254 for (i = min; i < max; i++)
4255 {
4256 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4257 break;
4258 eptr++;
4259 }
4260 break;
4261
4262 case OP_NOT_WORDCHAR:
4263 for (i = min; i < max; i++)
4264 {
4265 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4266 break;
4267 eptr++;
4268 }
4269 break;
4270
4271 case OP_WORDCHAR:
4272 for (i = min; i < max; i++)
4273 {
4274 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4275 break;
4276 eptr++;
4277 }
4278 break;
4279
4280 default:
4281 RRETURN(PCRE_ERROR_INTERNAL);
4282 }
4283
4284 /* eptr is now past the end of the maximum run */
4285
4286 if (possessive) continue;
4287 while (eptr >= pp)
4288 {
4289 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4290 eptr--;
4291 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4292 }
4293 }
4294
4295 /* Get here if we can't make it match with any permitted repetitions */
4296
4297 RRETURN(MATCH_NOMATCH);
4298 }
4299 /* Control never gets here */
4300
4301 /* There's been some horrible disaster. Arrival here can only mean there is
4302 something seriously wrong in the code above or the OP_xxx definitions. */
4303
4304 default:
4305 DPRINTF(("Unknown opcode %d\n", *ecode));
4306 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4307 }
4308
4309 /* Do not stick any code in here without much thought; it is assumed
4310 that "continue" in the code above comes out to here to repeat the main
4311 loop. */
4312
4313 } /* End of main loop */
4314 /* Control never reaches here */
4315
4316
4317 /* When compiling to use the heap rather than the stack for recursive calls to
4318 match(), the RRETURN() macro jumps here. The number that is saved in
4319 frame->Xwhere indicates which label we actually want to return to. */
4320
4321 #ifdef NO_RECURSE
4322 #define LBL(val) case val: goto L_RM##val;
4323 HEAP_RETURN:
4324 switch (frame->Xwhere)
4325 {
4326 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4327 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4328 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4329 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4330 LBL(53) LBL(54)
4331 #ifdef SUPPORT_UTF8
4332 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4333 LBL(32) LBL(34) LBL(42) LBL(46)
4334 #ifdef SUPPORT_UCP
4335 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4336 #endif /* SUPPORT_UCP */
4337 #endif /* SUPPORT_UTF8 */
4338 default:
4339 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4340 return PCRE_ERROR_INTERNAL;
4341 }
4342 #undef LBL
4343 #endif /* NO_RECURSE */
4344 }
4345
4346
4347 /***************************************************************************
4348 ****************************************************************************
4349 RECURSION IN THE match() FUNCTION
4350
4351 Undefine all the macros that were defined above to handle this. */
4352
4353 #ifdef NO_RECURSE /* the names below are macros only in the heap-frame (NO_RECURSE) build */
4354 #undef eptr
4355 #undef ecode
4356 #undef mstart
4357 #undef offset_top
4358 #undef ims
4359 #undef eptrb
4360 #undef flags
4361
4362 #undef callpat
4363 #undef charptr
4364 #undef data
4365 #undef next
4366 #undef pp
4367 #undef prev
4368 #undef saved_eptr
4369
4370 #undef new_recursive
4371
4372 #undef cur_is_word
4373 #undef condition
4374 #undef prev_is_word
4375
4376 #undef original_ims
4377
4378 #undef ctype
4379 #undef length
4380 #undef max
4381 #undef min
4382 #undef number
4383 #undef offset
4384 #undef op
4385 #undef save_capture_last
4386 #undef save_offset1
4387 #undef save_offset2
4388 #undef save_offset3
4389 #undef stacksave
4390
4391 #undef newptrb
4392
4393 #endif /* NO_RECURSE */
4394
4395 /* fc and fi are defined as macros with or without NO_RECURSE, so they are undefined unconditionally */
4396
4397 #undef fc
4398 #undef fi
4399
4400 /***************************************************************************
4401 ***************************************************************************/
4402
4403
4404
4405 /*************************************************
4406 * Execute a Regular Expression *
4407 *************************************************/
4408
4409 /* This function applies a compiled re to a subject string and picks out
4410 portions of the string if it matches. Two elements in the vector are set for
4411 each substring: the offsets to the start and end of the substring.
4412
4413 Arguments:
4414 argument_re points to the compiled expression
4415 extra_data points to extra data or is NULL
4416 subject points to the subject string
4417 length length of subject string (may contain binary zeros)
4418 start_offset where to start in the subject string
4419 options option bits
4420 offsets points to a vector of ints to be filled in with offsets
4421 offsetcount the number of elements in the vector
4422
4423 Returns: > 0 => success; value is one more than the highest numbered offset pair set
4424 = 0 => success, but offsets is not big enough
4425 -1 => failed to match
4426 < -1 => some kind of unexpected problem
4427 */
4428
4429 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4430 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4431 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4432 int offsetcount)
4433 {
4434 int rc, resetcount, ocount;
4435 int first_byte = -1;
4436 int req_byte = -1;
4437 int req_byte2 = -1;
4438 int newline;
4439 unsigned long int ims;
4440 BOOL using_temporary_offsets = FALSE;
4441 BOOL anchored;
4442 BOOL startline;
4443 BOOL firstline;
4444 BOOL first_byte_caseless = FALSE;
4445 BOOL req_byte_caseless = FALSE;
4446 BOOL utf8;
4447 match_data match_block;
4448 match_data *md = &match_block;
4449 const uschar *tables;
4450 const uschar *start_bits = NULL;
4451 USPTR start_match = (USPTR)subject + start_offset;
4452 USPTR end_subject;
4453 USPTR req_byte_ptr = start_match - 1;
4454
4455 pcre_study_data internal_study;
4456 const pcre_study_data *study;
4457
4458 real_pcre internal_re;
4459 const real_pcre *external_re = (const real_pcre *)argument_re;
4460 const real_pcre *re = external_re;
4461
4462 /* Plausibility checks */
4463
4464 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4465 if (re == NULL || subject == NULL ||
4466 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4467 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4468
4469 /* Fish out the optional data from the extra_data structure, first setting
4470 the default values. */
4471
4472 study = NULL;
4473 md->match_limit = MATCH_LIMIT;
4474 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4475 md->callout_data = NULL;
4476
4477 /* The table pointer is always in native byte order. */
4478
4479 tables = external_re->tables;
4480
4481 if (extra_data != NULL)
4482 {
4483 register unsigned int flags = extra_data->flags;
4484 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4485 study = (const pcre_study_data *)extra_data->study_data;
4486 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4487 md->match_limit = extra_data->match_limit;
4488 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4489 md->match_limit_recursion = extra_data->match_limit_recursion;
4490 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4491 md->callout_data = extra_data->callout_data;
4492 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4493 }
4494
4495 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4496 is a feature that makes it possible to save compiled regex and re-use them
4497 in other programs later. */
4498
4499 if (tables == NULL) tables = _pcre_default_tables;
4500
4501 /* Check that the first field in the block is the magic number. If it is not,
4502 test for a regex that was compiled on a host of opposite endianness. If this is
4503 the case, flipped values are put in internal_re and internal_study if there was
4504 study data too. */
4505
4506 if (re->magic_number != MAGIC_NUMBER)
4507 {
4508 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4509 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4510 if (study != NULL) study = &internal_study;
4511 }
4512
4513 /* Set up other data */
4514
4515 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4516 startline = (re->flags & PCRE_STARTLINE) != 0;
4517 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4518
4519 /* The code starts after the real_pcre block and the capture name table. */
4520
4521 md->start_code = (const uschar *)external_re + re->name_table_offset +
4522 re->name_count * re->name_entry_size;
4523
4524 md->start_subject = (USPTR)subject;
4525 md->start_offset = start_offset;
4526 md->end_subject = md->start_subject + length;
4527 end_subject = md->end_subject;
4528
4529 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4530 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4531 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4532
4533 md->notbol = (options & PCRE_NOTBOL) != 0;
4534 md->noteol = (options & PCRE_NOTEOL) != 0;
4535 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4536 md->partial = (options & PCRE_PARTIAL) != 0;
4537 md->hitend = FALSE;
4538
4539 md->recursive = NULL; /* No recursion at top level */
4540
4541 md->lcc = tables + lcc_offset;
4542 md->ctypes = tables + ctypes_offset;
4543
4544 /* Handle different \R options. */
4545
4546 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4547 {
4548 case 0:
4549 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4550 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4551 else
4552 #ifdef BSR_ANYCRLF
4553 md->bsr_anycrlf = TRUE;
4554 #else
4555 md->bsr_anycrlf = FALSE;
4556 #endif
4557 break;
4558
4559 case PCRE_BSR_ANYCRLF:
4560 md->bsr_anycrlf = TRUE;
4561 break;
4562
4563 case PCRE_BSR_UNICODE:
4564 md->bsr_anycrlf = FALSE;
4565 break;
4566
4567 default: return PCRE_ERROR_BADNEWLINE;
4568 }
4569
4570 /* Handle different types of newline. The three bits give eight cases. If
4571 nothing is set at run time, whatever was used at compile time applies. */
4572
4573 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4574 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4575 {
4576 case 0: newline = NEWLINE; break; /* Compile-time default */
4577 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4578 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4579 case PCRE_NEWLINE_CR+
4580 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4581 case PCRE_NEWLINE_ANY: newline = -1; break;
4582 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4583 default: return PCRE_ERROR_BADNEWLINE;
4584 }
4585
4586 if (newline == -2)
4587 {
4588 md->nltype = NLTYPE_ANYCRLF;
4589 }
4590 else if (newline < 0)
4591 {
4592 md->nltype = NLTYPE_ANY;
4593 }
4594 else
4595 {
4596 md->nltype = NLTYPE_FIXED;
4597 if (newline > 255)
4598 {
4599 md->nllen = 2;
4600 md->nl[0] = (newline >> 8) & 255;
4601 md->nl[1] = newline & 255;
4602 }
4603 else
4604 {
4605 md->nllen = 1;
4606 md->nl[0] = newline;
4607 }
4608 }
4609
4610 /* Partial matching is supported only for a restricted set of regexes at the
4611 moment. */
4612
4613 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4614 return PCRE_ERROR_BADPARTIAL;
4615
4616 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4617 back the character offset. */
4618
4619 #ifdef SUPPORT_UTF8
4620 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4621 {
4622 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4623 return PCRE_ERROR_BADUTF8;
4624 if (start_offset > 0 && start_offset < length)
4625 {
4626 int tb = ((uschar *)subject)[start_offset];
4627 if (tb > 127)
4628 {
4629 tb &= 0xc0;
4630 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4631 }
4632 }
4633 }
4634 #endif
4635
4636 /* The ims options can vary during the matching as a result of the presence
4637 of (?ims) items in the pattern. They are kept in a local variable so that
4638 restoring at the exit of a group is easy. */
4639
4640 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4641
4642 /* If the expression has got more back references than the offsets supplied can
4643 hold, we get a temporary chunk of working store to use during the matching.
4644 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4645 of 3. */
4646
4647 ocount = offsetcount - (offsetcount % 3);
4648
4649 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4650 {
4651 ocount = re->top_backref * 3 + 3;
4652 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4653 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4654 using_temporary_offsets = TRUE;
4655 DPRINTF(("Got memory to hold back references\n"));
4656 }
4657 else md->offset_vector = offsets;
4658
4659 md->offset_end = ocount;
4660 md->offset_max = (2*ocount)/3;
4661 md->offset_overflow = FALSE;
4662 md->capture_last = -1;
4663
4664 /* Compute the minimum number of offsets that we need to reset each time. Doing
4665 this makes a huge difference to execution time when there aren't many brackets
4666 in the pattern. */
4667
4668 resetcount = 2 + re->top_bracket * 2;
4669 if (resetcount > offsetcount) resetcount = ocount;
4670
4671 /* Reset the working variable associated with each extraction. These should
4672 never be used unless previously set, but they get saved and restored, and so we
4673 initialize them to avoid reading uninitialized locations. */
4674
4675 if (md->offset_vector != NULL)
4676 {
4677 register int *iptr = md->offset_vector + ocount;
4678 register int *iend = iptr - resetcount/2 + 1;
4679 while (--iptr >= iend) *iptr = -1;
4680 }
4681
4682 /* Set up the first character to match, if available. The first_byte value is
4683 never set for an anchored regular expression, but the anchoring may be forced
4684 at run time, so we have to test for anchoring. The first char may be unset for
4685 an unanchored pattern, of course. If there's no first char and the pattern was
4686 studied, there may be a bitmap of possible first characters. */
4687
4688 if (!anchored)
4689 {
4690 if ((re->flags & PCRE_FIRSTSET) != 0)
4691 {
4692 first_byte = re->first_byte & 255;
4693 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4694 first_byte = md->lcc[first_byte];
4695 }
4696 else
4697 if (!startline && study != NULL &&
4698 (study->options & PCRE_STUDY_MAPPED) != 0)
4699 start_bits = study->start_bits;
4700 }
4701
4702 /* For anchored or unanchored matches, there may be a "last known required
4703 character" set. */
4704
4705 if ((re->flags & PCRE_REQCHSET) != 0)
4706 {
4707 req_byte = re->req_byte & 255;
4708 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4709 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4710 }
4711
4712
4713 /* ==========================================================================*/
4714
4715 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4716 the loop runs just once. */
4717
4718 for(;;)
4719 {
4720 USPTR save_end_subject = end_subject;
4721 USPTR new_start_match;
4722
4723 /* Reset the maximum number of extractions we might see. */
4724
4725 if (md->offset_vector != NULL)
4726 {
4727 register int *iptr = md->offset_vector;
4728 register int *iend = iptr + resetcount;
4729 while (iptr < iend) *iptr++ = -1;
4730 }
4731
4732 /* If firstline is TRUE, the start of the match is constrained to the first
4733 line of a multiline string. That is, the match must be before or at the first
4734 newline. Implement this by temporarily adjusting end_subject so that we stop
4735 scanning at a newline. If the match fails at the newline, later code breaks
4736 this loop. */
4737
4738 if (firstline)
4739 {
4740 USPTR t = start_match;
4741 #ifdef SUPPORT_UTF8
4742 if (utf8)
4743 {
4744 while (t < md->end_subject && !IS_NEWLINE(t))
4745 {
4746 t++;
4747 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
4748 }
4749 }
4750 else
4751 #endif
4752 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4753 end_subject = t;
4754 }
4755
4756 /* There are some optimizations that avoid running the match if a known
4757 starting point is not found, or if a known later character is not present.
4758 However, there is an option that disables these, for testing and for ensuring
4759 that all callouts do actually occur. */
4760
4761 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
4762 {
4763 /* Advance to a unique first byte if there is one. */
4764
4765 if (first_byte >= 0)
4766 {
4767 if (first_byte_caseless)
4768 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
4769 start_match++;
4770 else
4771 while (start_match < end_subject && *start_match != first_byte)
4772 start_match++;
4773 }
4774
4775 /* Or to just after a linebreak for a multiline match */
4776
4777 else if (startline)
4778 {
4779 if (start_match > md->start_subject + start_offset)
4780 {
4781 #ifdef SUPPORT_UTF8
4782 if (utf8)
4783 {
4784 while (start_match < end_subject && !WAS_NEWLINE(start_match))
4785 {
4786 start_match++;
4787 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4788 start_match++;
4789 }
4790 }
4791 else
4792 #endif
4793 while (start_match < end_subject && !WAS_NEWLINE(start_match))
4794 start_match++;
4795
4796 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4797 and we are now at a LF, advance the match position by one more character.
4798 */
4799
4800 if (start_match[-1] == CHAR_CR &&
4801 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4802 start_match < end_subject &&
4803 *start_match == CHAR_NL)
4804 start_match++;
4805 }
4806 }
4807
4808 /* Or to a non-unique first byte after study */
4809
4810 else if (start_bits != NULL)
4811 {
4812 while (start_match < end_subject)
4813 {
4814 register unsigned int c = *start_match;
4815 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
4816 else break;
4817 }
4818 }
4819 } /* Starting optimizations */
4820
4821 /* Restore fudged end_subject */
4822
4823 end_subject = save_end_subject;
4824
4825 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4826 printf(">>>> Match against: ");
4827 pchars(start_match, end_subject - start_match, TRUE, md);
4828 printf("\n");
4829 #endif
4830
4831 /* If req_byte is set, we know that that character must appear in the
4832 subject for the match to succeed. If the first character is set, req_byte
4833 must be later in the subject; otherwise the test starts at the match point.
4834 This optimization can save a huge amount of backtracking in patterns with
4835 nested unlimited repeats that aren't going to match. Writing separate code
4836 for cased/caseless versions makes it go faster, as does using an
4837 autoincrement and backing off on a match.
4838
4839 HOWEVER: when the subject string is very, very long, searching to its end
4840 can take a long time, and give bad performance on quite ordinary patterns.
4841 This showed up when somebody was matching something like /^\d+C/ on a
4842 32-megabyte string... so we don't do this when the string is sufficiently
4843 long.
4844
4845 ALSO: this processing is disabled when partial matching is requested, or if
4846 disabling is explicitly requested. */
4847
4848 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
4849 req_byte >= 0 &&
4850 end_subject - start_match < REQ_BYTE_MAX &&
4851 !md->partial)
4852 {
4853 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4854
4855 /* We don't need to repeat the search if we haven't yet reached the
4856 place we found it at last time. */
4857
4858 if (p > req_byte_ptr)
4859 {
4860 if (req_byte_caseless)
4861 {
4862 while (p < end_subject)
4863 {
4864 register int pp = *p++;
4865 if (pp == req_byte || pp == req_byte2) { p--; break; }
4866 }
4867 }
4868 else
4869 {
4870 while (p < end_subject)
4871 {
4872 if (*p++ == req_byte) { p--; break; }
4873 }
4874 }
4875
4876 /* If we can't find the required character, break the matching loop,
4877 forcing a match failure. */
4878
4879 if (p >= end_subject)
4880 {
4881 rc = MATCH_NOMATCH;
4882 break;
4883 }
4884
4885 /* If we have found the required character, save the point where we
4886 found it, so that we don't search again next time round the loop if
4887 the start hasn't passed this character yet. */
4888
4889 req_byte_ptr = p;
4890 }
4891 }
4892
4893 /* OK, we can now run the match. */
4894
4895 md->start_match_ptr = start_match;
4896 md->match_call_count = 0;
4897 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4898
4899 switch(rc)
4900 {
4901 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4902 exactly like PRUNE. */
4903
4904 case MATCH_NOMATCH:
4905 case MATCH_PRUNE:
4906 case MATCH_THEN:
4907 new_start_match = start_match + 1;
4908 #ifdef SUPPORT_UTF8
4909 if (utf8)
4910 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4911 new_start_match++;
4912 #endif
4913 break;
4914
4915 /* SKIP passes back the next starting point explicitly. */
4916
4917 case MATCH_SKIP:
4918 new_start_match = md->start_match_ptr;
4919 break;
4920
4921 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4922
4923 case MATCH_COMMIT:
4924 rc = MATCH_NOMATCH;
4925 goto ENDLOOP;
4926
4927 /* Any other return is some kind of error. */
4928
4929 default:
4930 goto ENDLOOP;
4931 }
4932
4933 /* Control reaches here for the various types of "no match at this point"
4934 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4935
4936 rc = MATCH_NOMATCH;
4937
4938 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4939 newline in the subject (though it may continue over the newline). Therefore,
4940 if we have just failed to match, starting at a newline, do not continue. */
4941
4942 if (firstline && IS_NEWLINE(start_match)) break;
4943
4944 /* Advance to new matching position */
4945
4946 start_match = new_start_match;
4947
4948 /* Break the loop if the pattern is anchored or if we have passed the end of
4949 the subject. */
4950
4951 if (anchored || start_match > end_subject) break;
4952
4953 /* If we have just passed a CR and we are now at a LF, and the pattern does
4954 not contain any explicit matches for \r or \n, and the newline option is CRLF
4955 or ANY or ANYCRLF, advance the match position by one more character. */
4956
4957 if (start_match[-1] == CHAR_CR &&
4958 start_match < end_subject &&
4959 *start_match == CHAR_NL &&
4960 (re->flags & PCRE_HASCRORLF) == 0 &&
4961 (md->nltype == NLTYPE_ANY ||
4962 md->nltype == NLTYPE_ANYCRLF ||
4963 md->nllen == 2))
4964 start_match++;
4965
4966 } /* End of for(;;) "bumpalong" loop */
4967
4968 /* ==========================================================================*/
4969
4970 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4971 conditions is true:
4972
4973 (1) The pattern is anchored or the match was failed by (*COMMIT);
4974
4975 (2) We are past the end of the subject;
4976
4977 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4978 this option requests that a match occur at or before the first newline in
4979 the subject.
4980
4981 When we have a match and the offset vector is big enough to deal with any
4982 backreferences, captured substring offsets will already be set up. In the case
4983 where we had to get some local store to hold offsets for backreference
4984 processing, copy those that we can. In this case there need not be overflow if
4985 certain parts of the pattern were not used, even though there are more
4986 capturing parentheses than vector slots. */
4987
4988 ENDLOOP:
4989
4990 if (rc == MATCH_MATCH)
4991 {
4992 if (using_temporary_offsets)
4993 {
4994 if (offsetcount >= 4)
4995 {
4996 memcpy(offsets + 2, md->offset_vector + 2,
4997 (offsetcount - 2) * sizeof(int));
4998 DPRINTF(("Copied offsets from temporary memory\n"));
4999 }
5000 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5001 DPRINTF(("Freeing temporary memory\n"));
5002 (pcre_free)(md->offset_vector);
5003 }
5004
5005 /* Set the return code to the number of captured strings, or 0 if there are
5006 too many to fit into the vector. */
5007
5008 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5009
5010 /* If there is space, set up the whole thing as substring 0. The value of
5011 md->start_match_ptr might be modified if \K was encountered on the successful
5012 matching path. */
5013
5014 if (offsetcount < 2) rc = 0; else
5015 {
5016 offsets[0] = md->start_match_ptr - md->start_subject;
5017 offsets[1] = md->end_match_ptr - md->start_subject;
5018 }
5019
5020 DPRINTF((">>>> returning %d\n", rc));
5021 return rc;
5022 }
5023
5024 /* Control gets here if there has been an error, or if the overall match
5025 attempt has failed at all permitted starting positions. */
5026
5027 if (using_temporary_offsets)
5028 {
5029 DPRINTF(("Freeing temporary memory\n"));
5030 (pcre_free)(md->offset_vector);
5031 }
5032
5033 if (rc != MATCH_NOMATCH)
5034 {
5035 DPRINTF((">>>> error: returning %d\n", rc));
5036 return rc;
5037 }
5038 else if (md->partial && md->hitend)
5039 {
5040 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5041 return PCRE_ERROR_PARTIAL;
5042 }
5043 else
5044 {
5045 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5046 return PCRE_ERROR_NOMATCH;
5047 }
5048 }
5049
5050 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12