/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 168 - (show annotations) (download)
Tue May 29 15:18:18 2007 UTC (6 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 130368 byte(s)
Add support for the Perl 5.10 \K facility.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #define NLBLOCK md /* Block containing newline information */
46 #define PSSTART start_subject /* Field containing processed string start */
47 #define PSEND end_subject /* Field containing processed string end */
48
49 #include "pcre_internal.h"
50
51 /* Undefine some potentially clashing cpp symbols */
52
53 #undef min
54 #undef max
55
56 /* The chain of eptrblocks for tail recursions uses memory in stack workspace,
57 obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
58
#define EPTR_WORK_SIZE (1000)

/* Bits that can be set in the "flags" argument of match() */

#define match_condassert     (1 << 0)  /* Called to check a condition assertion */
#define match_cbegroup       (1 << 1)  /* Could-be-empty unlimited repeat group */
#define match_tail_recursed  (1 << 2)  /* Tail recursive call */

/* The two non-error results of match(). Errors are reported with the
externally defined PCRE_ERROR_xxx codes, every one of which is negative, so
they can never be confused with these. */

#define MATCH_NOMATCH 0
#define MATCH_MATCH   1

/* Recursive calls save up to this many ints of the offset vector on the
stack; a larger vector is saved in memory obtained from malloc instead. The
offset vector is always a multiple of 3 long, so this should be one too. */

#define REC_STACK_SAVE_MAX (3 * 10)

/* Minimum and maximum counts for the common repeats. In the table of maxima,
0 stands for "no upper limit". NOTE(review): presumably indexed by the repeat
opcode relative to the first one (* *? + +? ? ??) - confirm at point of use. */

static const char rep_min[6] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[6] = { 0, 0, 0, 0, 1, 1 };
83
84
85
86 #ifdef DEBUG
87 /*************************************************
88 * Debugging function to print chars *
89 *************************************************/
90
91 /* Print a sequence of chars in printable format, stopping at the end of the
92 subject if requested.
93
94 Arguments:
95 p points to characters
96 length number to print
97 is_subject TRUE if printing from within md->start_subject
98 md pointer to matching data block, if is_subject is TRUE
99
100 Returns: nothing
101 */
102
103 static void
104 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
105 {
106 unsigned int c;
107 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
108 while (length-- > 0)
109 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
110 }
111 #endif
112
113
114
115 /*************************************************
116 * Match a back-reference *
117 *************************************************/
118
119 /* If a back reference hasn't been set, the length that is passed is greater
120 than the number of characters left in the string, so the match fails.
121
122 Arguments:
123 offset index into the offset vector
124 eptr points into the subject
125 length length to be matched
126 md points to match data block
127 ims the ims flags
128
129 Returns: TRUE if matched
130 */
131
132 static BOOL
133 match_ref(int offset, register USPTR eptr, int length, match_data *md,
134 unsigned long int ims)
135 {
136 USPTR p = md->start_subject + md->offset_vector[offset];
137
138 #ifdef DEBUG
139 if (eptr >= md->end_subject)
140 printf("matching subject <null>");
141 else
142 {
143 printf("matching subject ");
144 pchars(eptr, length, TRUE, md);
145 }
146 printf(" against backref ");
147 pchars(p, length, FALSE, md);
148 printf("\n");
149 #endif
150
151 /* Always fail if not enough characters left */
152
153 if (length > md->end_subject - eptr) return FALSE;
154
155 /* Separate the caselesss case for speed */
156
157 if ((ims & PCRE_CASELESS) != 0)
158 {
159 while (length-- > 0)
160 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
161 }
162 else
163 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
164
165 return TRUE;
166 }
167
168
169
170 /***************************************************************************
171 ****************************************************************************
172 RECURSION IN THE match() FUNCTION
173
174 The match() function is highly recursive, though not every recursive call
175 increases the recursive depth. Nevertheless, some regular expressions can cause
176 it to recurse to a great depth. I was writing for Unix, so I just let it call
177 itself recursively. This uses the stack for saving everything that has to be
178 saved for a recursive call. On Unix, the stack can be large, and this works
179 fine.
180
181 It turns out that on some non-Unix-like systems there are problems with
182 programs that use a lot of stack. (This despite the fact that every last chip
183 has oodles of memory these days, and techniques for extending the stack have
184 been known for decades.) So....
185
186 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
187 calls by keeping local variables that need to be preserved in blocks of memory
188 obtained from malloc() instead of on the stack. Macros are used to
189 achieve this so that the actual code doesn't look very different to what it
190 always used to.
191
192 The original heap-recursive code used longjmp(). However, it seems that this
193 can be very slow on some operating systems. Following a suggestion from Stan
194 Switzer, the use of longjmp() has been abolished, at the cost of having to
195 provide a unique number for each call to RMATCH. There is no way of generating
196 a sequence of numbers at compile time in C. I have given them names, to make
197 them stand out more clearly.
198
199 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
200 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
201 tests. Furthermore, not using longjmp() means that local dynamic variables
202 don't have indeterminate values; this has meant that the frame size can be
203 reduced because the result can be "passed back" by straight setting of the
204 variable instead of being passed in the frame.
205 ****************************************************************************
206 ***************************************************************************/
207
208
209 /* Numbers for RMATCH calls */
210
/* One distinct, consecutive number (starting at 1) for every RMATCH call site;
it is passed as the last argument of RMATCH. */

enum {
  RM1 = 1,
  RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,
  RM9,  RM10, RM11, RM12, RM13, RM14, RM15, RM16,
  RM17, RM18, RM19, RM20, RM21, RM22, RM23, RM24,
  RM25, RM26, RM27, RM28, RM29, RM30, RM31, RM32,
  RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41, RM42, RM43, RM44, RM45, RM46, RM47
};
216
217
218 /* These versions of the macros use the stack, as normal. There are debugging
219 versions and production versions. Note that the "rw" argument of RMATCH isn't
220 actually used in this definition. */
221
#ifndef NO_RECURSE

/* In this configuration REGISTER is the "register" keyword; it qualifies the
eptr and ecode parameters in the definition of match(). */

#define REGISTER register

#ifdef DEBUG

/* Debugging RMATCH: a braced block that reports the calling line, makes a
genuine recursive call to match() and leaves its result in rrc. The caller's
mstart is inserted as the third argument and the depth is passed as rdepth+1;
the rw argument does not appear in the expansion. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

/* Debugging RRETURN: reports the value and line, then returns. Note that ra
appears twice in the expansion, so it is evaluated twice. */

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else

/* Production RMATCH: a single assignment statement (no enclosing braces) that
stores the result of the recursive call in rrc. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)

/* Production RRETURN: a plain return of ra. */

#define RRETURN(ra) return ra
#endif
242
243 #else
244
245
246 /* These versions of the macros manage a private stack on the heap. Note that
247 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
248 argument of match(), which never changes. */
249
250 #define REGISTER
251
/* RMATCH for the heap-frame build. It obtains a new frame from
pcre_stack_malloc, records in the current frame (Xwhere) the number rw of this
call site, fills in the new frame's copies of the match() arguments, links it
to the current frame, makes it current, and jumps to HEAP_RECURSE to restart
the function body. The label L_<rw> that is planted here is where control
comes back to when the "call" returns, with the result in rrc.

If the frame cannot be obtained, the "call" is not made: the current
incarnation gives up with PCRE_ERROR_NOMEMORY by means of RRETURN, so the
error travels outwards like any other negative result and a NULL frame is
never written through. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
271
/* RRETURN for the heap-frame build. The current frame is unlinked and handed
to pcre_stack_free. If there is a previous frame, this was a simulated call:
the result is put in rrc and control goes to the HEAP_RETURN label (defined
later in match(), outside this part of the file). If there is no previous
frame (Xprevframe was NULL), this was the top level, and the function really
returns.

Note that ra is expanded only after "frame" has been moved back to the
previous frame, so it must not name anything that lives in the frame being
discarded. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
284
285
286 /* Structure for remembering the local variables in a private frame */
287
288 typedef struct heapframe {
289 struct heapframe *Xprevframe;
290
291 /* Function arguments that may change */
292
293 const uschar *Xeptr;
294 const uschar *Xecode;
295 const uschar *Xmstart;
296 int Xoffset_top;
297 long int Xims;
298 eptrblock *Xeptrb;
299 int Xflags;
300 unsigned int Xrdepth;
301
302 /* Function local variables */
303
304 const uschar *Xcallpat;
305 const uschar *Xcharptr;
306 const uschar *Xdata;
307 const uschar *Xnext;
308 const uschar *Xpp;
309 const uschar *Xprev;
310 const uschar *Xsaved_eptr;
311
312 recursion_info Xnew_recursive;
313
314 BOOL Xcur_is_word;
315 BOOL Xcondition;
316 BOOL Xprev_is_word;
317
318 unsigned long int Xoriginal_ims;
319
320 #ifdef SUPPORT_UCP
321 int Xprop_type;
322 int Xprop_value;
323 int Xprop_fail_result;
324 int Xprop_category;
325 int Xprop_chartype;
326 int Xprop_script;
327 int Xoclength;
328 uschar Xocchars[8];
329 #endif
330
331 int Xctype;
332 unsigned int Xfc;
333 int Xfi;
334 int Xlength;
335 int Xmax;
336 int Xmin;
337 int Xnumber;
338 int Xoffset;
339 int Xop;
340 int Xsave_capture_last;
341 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
342 int Xstacksave[REC_STACK_SAVE_MAX];
343
344 eptrblock Xnewptrb;
345
346 /* Where to jump back to */
347
348 int Xwhere;
349
350 } heapframe;
351
352 #endif
353
354
355 /***************************************************************************
356 ***************************************************************************/
357
358
359
360 /*************************************************
361 * Match from current position *
362 *************************************************/
363
364 /* This function is called recursively in many circumstances. Whenever it
365 returns a negative (error) response, the outer incarnation must also return the
366 same response.
367
368 Performance note: It might be tempting to extract commonly used fields from the
369 md structure (e.g. utf8, end_subject) into individual variables to improve
370 performance. Tests using gcc on a SPARC disproved this; in the first case, it
371 made performance worse.
372
373 Arguments:
374 eptr pointer to current character in subject
375 ecode pointer to current position in compiled code
376 mstart pointer to the current match start position (can be modified
377 by encountering \K)
378 offset_top current top pointer
379 md pointer to "static" info for the match
380 ims current /i, /m, and /s options
381 eptrb pointer to chain of blocks containing eptr at start of
382 brackets - for testing for empty matches
383 flags can contain
384 match_condassert - this is an assertion condition
385 match_cbegroup - this is the start of an unlimited repeat
386 group that can match an empty string
387 match_tail_recursed - this is a tail_recursed group
388 rdepth the recursion depth
389
390 Returns: MATCH_MATCH if matched ) these values are >= 0
391 MATCH_NOMATCH if failed to match )
392 a negative PCRE_ERROR_xxx value if aborted by an error condition
393 (e.g. stopped by repeated call or recursion limit)
394 */
395
396 static int
397 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
398 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
399 int flags, unsigned int rdepth)
400 {
401 /* These variables do not need to be preserved over recursion in this function,
402 so they can be ordinary variables in all cases. Mark some of them with
403 "register" because they are used a lot in loops. */
404
405 register int rrc; /* Returns from recursive calls */
406 register int i; /* Used for loops not involving calls to RMATCH() */
407 register unsigned int c; /* Character values not kept over RMATCH() calls */
408 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
409
410 BOOL minimize, possessive; /* Quantifier options */
411
412 /* When recursion is not being used, all "local" variables that have to be
413 preserved over calls to RMATCH() are part of a "frame" which is obtained from
414 heap storage. Set up the top-level frame here; others are obtained from the
415 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
416
417 #ifdef NO_RECURSE
418 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
419 frame->Xprevframe = NULL; /* Marks the top level */
420
421 /* Copy in the original argument variables */
422
423 frame->Xeptr = eptr;
424 frame->Xecode = ecode;
425 frame->Xmstart = mstart;
426 frame->Xoffset_top = offset_top;
427 frame->Xims = ims;
428 frame->Xeptrb = eptrb;
429 frame->Xflags = flags;
430 frame->Xrdepth = rdepth;
431
432 /* This is where control jumps back to in order to effect "recursion" */
433
434 HEAP_RECURSE:
435
436 /* Macros make the argument variables come from the current frame */
437
438 #define eptr frame->Xeptr
439 #define ecode frame->Xecode
440 #define mstart frame->Xmstart
441 #define offset_top frame->Xoffset_top
442 #define ims frame->Xims
443 #define eptrb frame->Xeptrb
444 #define flags frame->Xflags
445 #define rdepth frame->Xrdepth
446
447 /* Ditto for the local variables */
448
449 #ifdef SUPPORT_UTF8
450 #define charptr frame->Xcharptr
451 #endif
452 #define callpat frame->Xcallpat
453 #define data frame->Xdata
454 #define next frame->Xnext
455 #define pp frame->Xpp
456 #define prev frame->Xprev
457 #define saved_eptr frame->Xsaved_eptr
458
459 #define new_recursive frame->Xnew_recursive
460
461 #define cur_is_word frame->Xcur_is_word
462 #define condition frame->Xcondition
463 #define prev_is_word frame->Xprev_is_word
464
465 #define original_ims frame->Xoriginal_ims
466
467 #ifdef SUPPORT_UCP
468 #define prop_type frame->Xprop_type
469 #define prop_value frame->Xprop_value
470 #define prop_fail_result frame->Xprop_fail_result
471 #define prop_category frame->Xprop_category
472 #define prop_chartype frame->Xprop_chartype
473 #define prop_script frame->Xprop_script
474 #define oclength frame->Xoclength
475 #define occhars frame->Xocchars
476 #endif
477
478 #define ctype frame->Xctype
479 #define fc frame->Xfc
480 #define fi frame->Xfi
481 #define length frame->Xlength
482 #define max frame->Xmax
483 #define min frame->Xmin
484 #define number frame->Xnumber
485 #define offset frame->Xoffset
486 #define op frame->Xop
487 #define save_capture_last frame->Xsave_capture_last
488 #define save_offset1 frame->Xsave_offset1
489 #define save_offset2 frame->Xsave_offset2
490 #define save_offset3 frame->Xsave_offset3
491 #define stacksave frame->Xstacksave
492
493 #define newptrb frame->Xnewptrb
494
495 /* When recursion is being used, local variables are allocated on the stack and
496 get preserved during recursion in the normal way. In this environment, fi and
497 i, and fc and c, can be the same variables. */
498
499 #else /* NO_RECURSE not defined */
500 #define fi i
501 #define fc c
502
503
504 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
505 const uschar *charptr; /* in small blocks of the code. My normal */
506 #endif /* style of coding would have declared */
507 const uschar *callpat; /* them within each of those blocks. */
508 const uschar *data; /* However, in order to accommodate the */
509 const uschar *next; /* version of this code that uses an */
510 USPTR pp; /* external "stack" implemented on the */
511 const uschar *prev; /* heap, it is easier to declare them all */
512 USPTR saved_eptr; /* here, so the declarations can be cut */
513 /* out in a block. The only declarations */
514 recursion_info new_recursive; /* within blocks below are for variables */
515 /* that do not have to be preserved over */
516 BOOL cur_is_word; /* a recursive call to RMATCH(). */
517 BOOL condition;
518 BOOL prev_is_word;
519
520 unsigned long int original_ims;
521
522 #ifdef SUPPORT_UCP
523 int prop_type;
524 int prop_value;
525 int prop_fail_result;
526 int prop_category;
527 int prop_chartype;
528 int prop_script;
529 int oclength;
530 uschar occhars[8];
531 #endif
532
533 int ctype;
534 int length;
535 int max;
536 int min;
537 int number;
538 int offset;
539 int op;
540 int save_capture_last;
541 int save_offset1, save_offset2, save_offset3;
542 int stacksave[REC_STACK_SAVE_MAX];
543
544 eptrblock newptrb;
545 #endif /* NO_RECURSE */
546
547 /* These statements are here to stop the compiler complaining about uninitialized
548 variables. */
549
550 #ifdef SUPPORT_UCP
551 prop_value = 0;
552 prop_fail_result = 0;
553 #endif
554
555
556 /* This label is used for tail recursion, which is used in a few cases even
557 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
558 used. Thanks to Ian Taylor for noticing this possibility and sending the
559 original patch. */
560
561 TAIL_RECURSE:
562
563 /* OK, now we can get on with the real code of the function. Recursive calls
564 are specified by the macro RMATCH and RRETURN is used to return. When
565 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
566 and a "return", respectively (possibly with some debugging if DEBUG is
567 defined). However, RMATCH isn't like a function call because it's quite a
568 complicated macro. It has to be used in one particular way. This shouldn't,
569 however, impact performance when true recursion is being used. */
570
571 #ifdef SUPPORT_UTF8
572 utf8 = md->utf8; /* Local copy of the flag */
573 #else
574 utf8 = FALSE;
575 #endif
576
577 /* First check that we haven't called match() too many times, or that we
578 haven't exceeded the recursive call limit. */
579
580 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
581 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
582
583 original_ims = ims; /* Save for resetting on ')' */
584
585 /* At the start of a group with an unlimited repeat that may match an empty
586 string, the match_cbegroup flag is set. When this is the case, add the current
587 subject pointer to the chain of such remembered pointers, to be checked when we
588 hit the closing ket, in order to break infinite loops that match no characters.
589 When match() is called in other circumstances, don't add to the chain. If this
590 is a tail recursion, use a block from the workspace, as the one on the stack is
591 already used. */
592
593 if ((flags & match_cbegroup) != 0)
594 {
595 eptrblock *p;
596 if ((flags & match_tail_recursed) != 0)
597 {
598 if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
599 p = md->eptrchain + md->eptrn++;
600 }
601 else p = &newptrb;
602 p->epb_saved_eptr = eptr;
603 p->epb_prev = eptrb;
604 eptrb = p;
605 }
606
607 /* Now start processing the opcodes. */
608
609 for (;;)
610 {
611 minimize = possessive = FALSE;
612 op = *ecode;
613
614 /* For partial matching, remember if we ever hit the end of the subject after
615 matching at least one subject character. */
616
617 if (md->partial &&
618 eptr >= md->end_subject &&
619 eptr > mstart)
620 md->hitend = TRUE;
621
622 switch(op)
623 {
624 /* Handle a capturing bracket. If there is space in the offset vector, save
625 the current subject position in the working slot at the top of the vector.
626 We mustn't change the current values of the data slot, because they may be
627 set from a previous iteration of this group, and be referred to by a
628 reference inside the group.
629
630 If the bracket fails to match, we need to restore this value and also the
631 values of the final offsets, in case they were set by a previous iteration
632 of the same bracket.
633
634 If there isn't enough space in the offset vector, treat this as if it were
635 a non-capturing bracket. Don't worry about setting the flag for the error
636 case here; that is handled in the code for KET. */
637
638 case OP_CBRA:
639 case OP_SCBRA:
640 number = GET2(ecode, 1+LINK_SIZE);
641 offset = number << 1;
642
643 #ifdef DEBUG
644 printf("start bracket %d\n", number);
645 printf("subject=");
646 pchars(eptr, 16, TRUE, md);
647 printf("\n");
648 #endif
649
650 if (offset < md->offset_max)
651 {
652 save_offset1 = md->offset_vector[offset];
653 save_offset2 = md->offset_vector[offset+1];
654 save_offset3 = md->offset_vector[md->offset_end - number];
655 save_capture_last = md->capture_last;
656
657 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
658 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
659
660 flags = (op == OP_SCBRA)? match_cbegroup : 0;
661 do
662 {
663 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
664 ims, eptrb, flags, RM1);
665 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
666 md->capture_last = save_capture_last;
667 ecode += GET(ecode, 1);
668 }
669 while (*ecode == OP_ALT);
670
671 DPRINTF(("bracket %d failed\n", number));
672
673 md->offset_vector[offset] = save_offset1;
674 md->offset_vector[offset+1] = save_offset2;
675 md->offset_vector[md->offset_end - number] = save_offset3;
676
677 RRETURN(MATCH_NOMATCH);
678 }
679
680 /* Insufficient room for saving captured contents. Treat as a non-capturing
681 bracket. */
682
683 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
684
685 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
686 final alternative within the brackets, we would return the result of a
687 recursive call to match() whatever happened. We can reduce stack usage by
688 turning this into a tail recursion. */
689
690 case OP_BRA:
691 case OP_SBRA:
692 DPRINTF(("start non-capturing bracket\n"));
693 flags = (op >= OP_SBRA)? match_cbegroup : 0;
694 for (;;)
695 {
696 if (ecode[GET(ecode, 1)] != OP_ALT)
697 {
698 ecode += _pcre_OP_lengths[*ecode];
699 flags |= match_tail_recursed;
700 DPRINTF(("bracket 0 tail recursion\n"));
701 goto TAIL_RECURSE;
702 }
703
704 /* For non-final alternatives, continue the loop for a NOMATCH result;
705 otherwise return. */
706
707 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
708 eptrb, flags, RM2);
709 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
710 ecode += GET(ecode, 1);
711 }
712 /* Control never reaches here. */
713
714 /* Conditional group: compilation checked that there are no more than
715 two branches. If the condition is false, skipping the first branch takes us
716 past the end if there is only one branch, but that's OK because that is
717 exactly what going to the ket would do. As there is only one branch to be
718 obeyed, we can use tail recursion to avoid using another stack frame. */
719
720 case OP_COND:
721 case OP_SCOND:
722 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
723 {
724 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
725 condition = md->recursive != NULL &&
726 (offset == RREF_ANY || offset == md->recursive->group_num);
727 ecode += condition? 3 : GET(ecode, 1);
728 }
729
730 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
731 {
732 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
733 condition = offset < offset_top && md->offset_vector[offset] >= 0;
734 ecode += condition? 3 : GET(ecode, 1);
735 }
736
737 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
738 {
739 condition = FALSE;
740 ecode += GET(ecode, 1);
741 }
742
743 /* The condition is an assertion. Call match() to evaluate it - setting
744 the final argument match_condassert causes it to stop at the end of an
745 assertion. */
746
747 else
748 {
749 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
750 match_condassert, RM3);
751 if (rrc == MATCH_MATCH)
752 {
753 condition = TRUE;
754 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
755 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
756 }
757 else if (rrc != MATCH_NOMATCH)
758 {
759 RRETURN(rrc); /* Need braces because of following else */
760 }
761 else
762 {
763 condition = FALSE;
764 ecode += GET(ecode, 1);
765 }
766 }
767
768 /* We are now at the branch that is to be obeyed. As there is only one,
769 we can use tail recursion to avoid using another stack frame. If the second
770 alternative doesn't exist, we can just plough on. */
771
772 if (condition || *ecode == OP_ALT)
773 {
774 ecode += 1 + LINK_SIZE;
775 flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
776 goto TAIL_RECURSE;
777 }
778 else
779 {
780 ecode += 1 + LINK_SIZE;
781 }
782 break;
783
784
785 /* End of the pattern. If we are in a top-level recursion, we should
786 restore the offsets appropriately and continue from after the call. */
787
788 case OP_END:
789 if (md->recursive != NULL && md->recursive->group_num == 0)
790 {
791 recursion_info *rec = md->recursive;
792 DPRINTF(("End of pattern in a (?0) recursion\n"));
793 md->recursive = rec->prevrec;
794 memmove(md->offset_vector, rec->offset_save,
795 rec->saved_max * sizeof(int));
796 mstart = rec->save_start;
797 ims = original_ims;
798 ecode = rec->after_call;
799 break;
800 }
801
802 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
803 string - backtracking will then try other alternatives, if any. */
804
805 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
806 md->end_match_ptr = eptr; /* Record where we ended */
807 md->end_offset_top = offset_top; /* and how many extracts were taken */
808 md->start_match_ptr = mstart; /* and the start (\K can modify) */
809 RRETURN(MATCH_MATCH);
810
811 /* Change option settings */
812
813 case OP_OPT:
814 ims = ecode[1];
815 ecode += 2;
816 DPRINTF(("ims set to %02lx\n", ims));
817 break;
818
819 /* Assertion brackets. Check the alternative branches in turn - the
820 matching won't pass the KET for an assertion. If any one branch matches,
821 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
822 start of each branch to move the current point backwards, so the code at
823 this level is identical to the lookahead case. */
824
825 case OP_ASSERT:
826 case OP_ASSERTBACK:
827 do
828 {
829 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
830 RM4);
831 if (rrc == MATCH_MATCH) break;
832 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
833 ecode += GET(ecode, 1);
834 }
835 while (*ecode == OP_ALT);
836 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
837
838 /* If checking an assertion for a condition, return MATCH_MATCH. */
839
840 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
841
842 /* Continue from after the assertion, updating the offsets high water
843 mark, since extracts may have been taken during the assertion. */
844
845 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
846 ecode += 1 + LINK_SIZE;
847 offset_top = md->end_offset_top;
848 continue;
849
850 /* Negative assertion: all branches must fail to match */
851
852 case OP_ASSERT_NOT:
853 case OP_ASSERTBACK_NOT:
854 do
855 {
856 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
857 RM5);
858 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
859 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
860 ecode += GET(ecode,1);
861 }
862 while (*ecode == OP_ALT);
863
864 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
865
866 ecode += 1 + LINK_SIZE;
867 continue;
868
869 /* Move the subject pointer back. This occurs only at the start of
870 each branch of a lookbehind assertion. If we are too close to the start to
871 move back, this match function fails. When working with UTF-8 we move
872 back a number of characters, not bytes. */
873
874 case OP_REVERSE:
875 #ifdef SUPPORT_UTF8
876 if (utf8)
877 {
878 i = GET(ecode, 1);
879 while (i-- > 0)
880 {
881 eptr--;
882 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
883 BACKCHAR(eptr)
884 }
885 }
886 else
887 #endif
888
889 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
890
891 {
892 eptr -= GET(ecode, 1);
893 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
894 }
895
896 /* Skip to next op code */
897
898 ecode += 1 + LINK_SIZE;
899 break;
900
901 /* The callout item calls an external function, if one is provided, passing
902 details of the match so far. This is mainly for debugging, though the
903 function is able to force a failure. */
904
905 case OP_CALLOUT:
906 if (pcre_callout != NULL)
907 {
908 pcre_callout_block cb;
909 cb.version = 1; /* Version 1 of the callout block */
910 cb.callout_number = ecode[1];
911 cb.offset_vector = md->offset_vector;
912 cb.subject = (PCRE_SPTR)md->start_subject;
913 cb.subject_length = md->end_subject - md->start_subject;
914 cb.start_match = mstart - md->start_subject;
915 cb.current_position = eptr - md->start_subject;
916 cb.pattern_position = GET(ecode, 2);
917 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
918 cb.capture_top = offset_top/2;
919 cb.capture_last = md->capture_last;
920 cb.callout_data = md->callout_data;
921 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
922 if (rrc < 0) RRETURN(rrc);
923 }
924 ecode += 2 + 2*LINK_SIZE;
925 break;
926
927 /* Recursion either matches the current regex, or some subexpression. The
928 offset data is the offset to the starting bracket from the start of the
929 whole pattern. (This is so that it works from duplicated subpatterns.)
930
931 If there are any capturing brackets started but not finished, we have to
932 save their starting points and reinstate them after the recursion. However,
933 we don't know how many such there are (offset_top records the completed
934 total) so we just have to save all the potential data. There may be up to
935 65535 such values, which is too large to put on the stack, but using malloc
936 for small numbers seems expensive. As a compromise, the stack is used when
937 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
938 is used. If the malloc fails, the saved values cannot be stored, so the
939 recursion is not attempted: this call of match() gives up straight away by
940 returning PCRE_ERROR_NOMEMORY.
941
942 There are also other values that have to be saved. We use a chained
943 sequence of blocks that actually live on the stack. Thanks to Robin Houston
944 for the original version of this logic. */
945
946 case OP_RECURSE:
947 {
948 callpat = md->start_code + GET(ecode, 1);
949 new_recursive.group_num = (callpat == md->start_code)? 0 :
950 GET2(callpat, 1 + LINK_SIZE);
951
952 /* Add to "recursing stack" */
953
954 new_recursive.prevrec = md->recursive;
955 md->recursive = &new_recursive;
956
957 /* Find where to continue from afterwards */
958
959 ecode += 1 + LINK_SIZE;
960 new_recursive.after_call = ecode;
961
962 /* Now save the offset data. */
963
964 new_recursive.saved_max = md->offset_end;
965 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
966 new_recursive.offset_save = stacksave;
967 else
968 {
969 new_recursive.offset_save =
970 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
971 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
972 }
973
974 memcpy(new_recursive.offset_save, md->offset_vector,
975 new_recursive.saved_max * sizeof(int));
976 new_recursive.save_start = mstart;
977 mstart = eptr;
978
979 /* OK, now we can do the recursion. For each top-level alternative we
980 restore the offset and recursion data. */
981
982 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
983 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
984 do
985 {
986 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
987 md, ims, eptrb, flags, RM6);
988 if (rrc == MATCH_MATCH)
989 {
990 DPRINTF(("Recursion matched\n"));
991 md->recursive = new_recursive.prevrec;
992 if (new_recursive.offset_save != stacksave)
993 (pcre_free)(new_recursive.offset_save);
994 RRETURN(MATCH_MATCH);
995 }
996 else if (rrc != MATCH_NOMATCH)
997 {
998 DPRINTF(("Recursion gave error %d\n", rrc));
999 RRETURN(rrc);
1000 }
1001
1002 md->recursive = &new_recursive;
1003 memcpy(md->offset_vector, new_recursive.offset_save,
1004 new_recursive.saved_max * sizeof(int));
1005 callpat += GET(callpat, 1);
1006 }
1007 while (*callpat == OP_ALT);
1008
1009 DPRINTF(("Recursion didn't match\n"));
1010 md->recursive = new_recursive.prevrec;
1011 if (new_recursive.offset_save != stacksave)
1012 (pcre_free)(new_recursive.offset_save);
1013 RRETURN(MATCH_NOMATCH);
1014 }
1015 /* Control never reaches here */
1016
1017 /* "Once" brackets are like assertion brackets except that after a match,
1018 the point in the subject string is not moved back. Thus there can never be
1019 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1020 Check the alternative branches in turn - the matching won't pass the KET
1021 for this kind of subpattern. If any one branch matches, we carry on as at
1022 the end of a normal bracket, with eptr left where the branch match ended. */
1023
1024 case OP_ONCE:
1025 prev = ecode;
1026 saved_eptr = eptr;
1027
1028 do
1029 {
1030 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
1031 eptrb, 0, RM7);
1032 if (rrc == MATCH_MATCH) break;
1033 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1034 ecode += GET(ecode,1);
1035 }
1036 while (*ecode == OP_ALT);
1037
1038 /* If we hit the end of the group (which could be repeated), fail */
1039
1040 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1041
1042 /* Continue as from after the assertion, updating the offsets high water
1043 mark, since extracts may have been taken. */
1044
1045 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1046
1047 offset_top = md->end_offset_top;
1048 eptr = md->end_match_ptr;
1049
1050 /* For a non-repeating ket, just continue at this level. This also
1051 happens for a repeating ket if no characters were matched in the group.
1052 This is the forcible breaking of infinite loops as implemented in Perl
1053 5.005. If there is an options reset, it will get obeyed in the normal
1054 course of events. */
1055
1056 if (*ecode == OP_KET || eptr == saved_eptr)
1057 {
1058 ecode += 1+LINK_SIZE;
1059 break;
1060 }
1061
1062 /* The repeating kets try the rest of the pattern or restart from the
1063 preceding bracket, in the appropriate order. The second "call" of match()
1064 uses tail recursion, to avoid using another stack frame. We need to reset
1065 any options that changed within the bracket before re-running it, so
1066 check the next opcode. */
1067
1068 if (ecode[1+LINK_SIZE] == OP_OPT)
1069 {
1070 ims = (ims & ~PCRE_IMS) | ecode[4];
1071 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1072 }
1073
1074 if (*ecode == OP_KETRMIN)
1075 {
1076 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0,
1077 RM8);
1078 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1079 ecode = prev;
1080 flags = match_tail_recursed;
1081 goto TAIL_RECURSE;
1082 }
1083 else /* OP_KETRMAX */
1084 {
1085 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1086 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1087 ecode += 1 + LINK_SIZE;
1088 flags = match_tail_recursed;
1089 goto TAIL_RECURSE;
1090 }
1091 /* Control never gets here */
1092
1093 /* An alternation is the end of a branch; scan along to find the end of the
1094 bracketed group and go to there. */
1095
1096 case OP_ALT:
1097 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1098 break;
1099
1100 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1101 that it may occur zero times. It may repeat infinitely, or not at all -
1102 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1103 repeat limits are compiled as a number of copies, with the optional ones
1104 preceded by BRAZERO or BRAMINZERO. */
1105
1106 case OP_BRAZERO:
1107 {
1108 next = ecode+1;
1109 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1110 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1111 do next += GET(next,1); while (*next == OP_ALT);
1112 ecode = next + 1 + LINK_SIZE;
1113 }
1114 break;
1115
1116 case OP_BRAMINZERO:
1117 {
1118 next = ecode+1;
1119 do next += GET(next, 1); while (*next == OP_ALT);
1120 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1121 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1122 ecode++;
1123 }
1124 break;
1125
1126 /* End of a group, repeated or non-repeating. */
1127
1128 case OP_KET:
1129 case OP_KETRMIN:
1130 case OP_KETRMAX:
1131 prev = ecode - GET(ecode, 1);
1132
1133 /* If this was a group that remembered the subject start, in order to break
1134 infinite repeats of empty string matches, retrieve the subject start from
1135 the chain. Otherwise, set it NULL. */
1136
1137 if (*prev >= OP_SBRA)
1138 {
1139 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1140 eptrb = eptrb->epb_prev; /* Backup to previous group */
1141 }
1142 else saved_eptr = NULL;
1143
1144 /* If we are at the end of an assertion group, stop matching and return
1145 MATCH_MATCH, but record the current high water mark for use by positive
1146 assertions. Do this also for the "once" (atomic) groups. */
1147
1148 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1149 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1150 *prev == OP_ONCE)
1151 {
1152 md->end_match_ptr = eptr; /* For ONCE */
1153 md->end_offset_top = offset_top;
1154 RRETURN(MATCH_MATCH);
1155 }
1156
1157 /* For capturing groups we have to check the group number back at the start
1158 and if necessary complete handling an extraction by setting the offsets and
1159 bumping the high water mark. Note that whole-pattern recursion is coded as
1160 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1161 when the OP_END is reached. Other recursion is handled here. */
1162
1163 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1164 {
1165 number = GET2(prev, 1+LINK_SIZE);
1166 offset = number << 1;
1167
1168 #ifdef DEBUG
1169 printf("end bracket %d", number);
1170 printf("\n");
1171 #endif
1172
1173 md->capture_last = number;
1174 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1175 {
1176 md->offset_vector[offset] =
1177 md->offset_vector[md->offset_end - number];
1178 md->offset_vector[offset+1] = eptr - md->start_subject;
1179 if (offset_top <= offset) offset_top = offset + 2;
1180 }
1181
1182 /* Handle a recursively called group. Restore the offsets
1183 appropriately and continue from after the call. */
1184
1185 if (md->recursive != NULL && md->recursive->group_num == number)
1186 {
1187 recursion_info *rec = md->recursive;
1188 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1189 md->recursive = rec->prevrec;
1190 mstart = rec->save_start;
1191 memcpy(md->offset_vector, rec->offset_save,
1192 rec->saved_max * sizeof(int));
1193 ecode = rec->after_call;
1194 ims = original_ims;
1195 break;
1196 }
1197 }
1198
1199 /* For both capturing and non-capturing groups, reset the value of the ims
1200 flags, in case they got changed during the group. */
1201
1202 ims = original_ims;
1203 DPRINTF(("ims reset to %02lx\n", ims));
1204
1205 /* For a non-repeating ket, just continue at this level. This also
1206 happens for a repeating ket if no characters were matched in the group.
1207 This is the forcible breaking of infinite loops as implemented in Perl
1208 5.005. If there is an options reset, it will get obeyed in the normal
1209 course of events. */
1210
1211 if (*ecode == OP_KET || eptr == saved_eptr)
1212 {
1213 ecode += 1 + LINK_SIZE;
1214 break;
1215 }
1216
1217 /* The repeating kets try the rest of the pattern or restart from the
1218 preceding bracket, in the appropriate order. In the second case, we can use
1219 tail recursion to avoid using another stack frame. */
1220
1221 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1222
1223 if (*ecode == OP_KETRMIN)
1224 {
1225 RMATCH(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0,
1226 RM12);
1227 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1228 ecode = prev;
1229 flags |= match_tail_recursed;
1230 goto TAIL_RECURSE;
1231 }
1232 else /* OP_KETRMAX */
1233 {
1234 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1235 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1236 ecode += 1 + LINK_SIZE;
1237 flags = match_tail_recursed;
1238 goto TAIL_RECURSE;
1239 }
1240 /* Control never gets here */
1241
1242 /* Start of subject unless notbol, or after internal newline if multiline */
1243
1244 case OP_CIRC:
1245 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1246 if ((ims & PCRE_MULTILINE) != 0)
1247 {
1248 if (eptr != md->start_subject &&
1249 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1250 RRETURN(MATCH_NOMATCH);
1251 ecode++;
1252 break;
1253 }
1254 /* ... else fall through */
1255
1256 /* Start of subject assertion */
1257
1258 case OP_SOD:
1259 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1260 ecode++;
1261 break;
1262
1263 /* Start of match assertion */
1264
1265 case OP_SOM:
1266 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1267 ecode++;
1268 break;
1269
1270 /* Reset the start of match point */
1271
1272 case OP_SET_SOM:
1273 mstart = eptr;
1274 ecode++;
1275 break;
1276
1277 /* Assert before internal newline if multiline, or before a terminating
1278 newline unless endonly is set, else end of subject unless noteol is set. */
1279
1280 case OP_DOLL:
1281 if ((ims & PCRE_MULTILINE) != 0)
1282 {
1283 if (eptr < md->end_subject)
1284 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1285 else
1286 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1287 ecode++;
1288 break;
1289 }
1290 else
1291 {
1292 if (md->noteol) RRETURN(MATCH_NOMATCH);
1293 if (!md->endonly)
1294 {
1295 if (eptr != md->end_subject &&
1296 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1297 RRETURN(MATCH_NOMATCH);
1298 ecode++;
1299 break;
1300 }
1301 }
1302 /* ... else fall through for endonly */
1303
1304 /* End of subject assertion (\z) */
1305
1306 case OP_EOD:
1307 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1308 ecode++;
1309 break;
1310
1311 /* End of subject or ending \n assertion (\Z) */
1312
1313 case OP_EODN:
1314 if (eptr != md->end_subject &&
1315 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1316 RRETURN(MATCH_NOMATCH);
1317 ecode++;
1318 break;
1319
1320 /* Word boundary assertions */
1321
1322 case OP_NOT_WORD_BOUNDARY:
1323 case OP_WORD_BOUNDARY:
1324 {
1325
1326 /* Find out if the previous and current characters are "word" characters.
1327 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1328 be "non-word" characters. */
1329
1330 #ifdef SUPPORT_UTF8
1331 if (utf8)
1332 {
1333 if (eptr == md->start_subject) prev_is_word = FALSE; else
1334 {
1335 const uschar *lastptr = eptr - 1;
1336 while((*lastptr & 0xc0) == 0x80) lastptr--;
1337 GETCHAR(c, lastptr);
1338 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1339 }
1340 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1341 {
1342 GETCHAR(c, eptr);
1343 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1344 }
1345 }
1346 else
1347 #endif
1348
1349 /* More streamlined when not in UTF-8 mode */
1350
1351 {
1352 prev_is_word = (eptr != md->start_subject) &&
1353 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1354 cur_is_word = (eptr < md->end_subject) &&
1355 ((md->ctypes[*eptr] & ctype_word) != 0);
1356 }
1357
1358 /* Now see if the situation is what we want */
1359
1360 if ((*ecode++ == OP_WORD_BOUNDARY)?
1361 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1362 RRETURN(MATCH_NOMATCH);
1363 }
1364 break;
1365
1366 /* Match a single character type; inline for speed */
1367
1368 case OP_ANY:
1369 if ((ims & PCRE_DOTALL) == 0)
1370 {
1371 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1372 }
1373 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1374 if (utf8)
1375 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1376 ecode++;
1377 break;
1378
1379 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1380 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1381
1382 case OP_ANYBYTE:
1383 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1384 ecode++;
1385 break;
1386
1387 case OP_NOT_DIGIT:
1388 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1389 GETCHARINCTEST(c, eptr);
1390 if (
1391 #ifdef SUPPORT_UTF8
1392 c < 256 &&
1393 #endif
1394 (md->ctypes[c] & ctype_digit) != 0
1395 )
1396 RRETURN(MATCH_NOMATCH);
1397 ecode++;
1398 break;
1399
1400 case OP_DIGIT:
1401 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1402 GETCHARINCTEST(c, eptr);
1403 if (
1404 #ifdef SUPPORT_UTF8
1405 c >= 256 ||
1406 #endif
1407 (md->ctypes[c] & ctype_digit) == 0
1408 )
1409 RRETURN(MATCH_NOMATCH);
1410 ecode++;
1411 break;
1412
1413 case OP_NOT_WHITESPACE:
1414 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1415 GETCHARINCTEST(c, eptr);
1416 if (
1417 #ifdef SUPPORT_UTF8
1418 c < 256 &&
1419 #endif
1420 (md->ctypes[c] & ctype_space) != 0
1421 )
1422 RRETURN(MATCH_NOMATCH);
1423 ecode++;
1424 break;
1425
1426 case OP_WHITESPACE:
1427 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1428 GETCHARINCTEST(c, eptr);
1429 if (
1430 #ifdef SUPPORT_UTF8
1431 c >= 256 ||
1432 #endif
1433 (md->ctypes[c] & ctype_space) == 0
1434 )
1435 RRETURN(MATCH_NOMATCH);
1436 ecode++;
1437 break;
1438
1439 case OP_NOT_WORDCHAR:
1440 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1441 GETCHARINCTEST(c, eptr);
1442 if (
1443 #ifdef SUPPORT_UTF8
1444 c < 256 &&
1445 #endif
1446 (md->ctypes[c] & ctype_word) != 0
1447 )
1448 RRETURN(MATCH_NOMATCH);
1449 ecode++;
1450 break;
1451
1452 case OP_WORDCHAR:
1453 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1454 GETCHARINCTEST(c, eptr);
1455 if (
1456 #ifdef SUPPORT_UTF8
1457 c >= 256 ||
1458 #endif
1459 (md->ctypes[c] & ctype_word) == 0
1460 )
1461 RRETURN(MATCH_NOMATCH);
1462 ecode++;
1463 break;
1464
1465 case OP_ANYNL:
1466 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1467 GETCHARINCTEST(c, eptr);
1468 switch(c)
1469 {
1470 default: RRETURN(MATCH_NOMATCH);
1471 case 0x000d:
1472 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1473 break;
1474 case 0x000a:
1475 case 0x000b:
1476 case 0x000c:
1477 case 0x0085:
1478 case 0x2028:
1479 case 0x2029:
1480 break;
1481 }
1482 ecode++;
1483 break;
1484
1485 #ifdef SUPPORT_UCP
1486 /* Check the next character by Unicode property. We will get here only
1487 if the support is in the binary; otherwise a compile-time error occurs. */
1488
1489 case OP_PROP:
1490 case OP_NOTPROP:
1491 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1492 GETCHARINCTEST(c, eptr);
1493 {
1494 int chartype, script;
1495 int category = _pcre_ucp_findprop(c, &chartype, &script);
1496
1497 switch(ecode[1])
1498 {
1499 case PT_ANY:
1500 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1501 break;
1502
1503 case PT_LAMP:
1504 if ((chartype == ucp_Lu ||
1505 chartype == ucp_Ll ||
1506 chartype == ucp_Lt) == (op == OP_NOTPROP))
1507 RRETURN(MATCH_NOMATCH);
1508 break;
1509
1510 case PT_GC:
1511 if ((ecode[2] != category) == (op == OP_PROP))
1512 RRETURN(MATCH_NOMATCH);
1513 break;
1514
1515 case PT_PC:
1516 if ((ecode[2] != chartype) == (op == OP_PROP))
1517 RRETURN(MATCH_NOMATCH);
1518 break;
1519
1520 case PT_SC:
1521 if ((ecode[2] != script) == (op == OP_PROP))
1522 RRETURN(MATCH_NOMATCH);
1523 break;
1524
1525 default:
1526 RRETURN(PCRE_ERROR_INTERNAL);
1527 }
1528
1529 ecode += 3;
1530 }
1531 break;
1532
1533 /* Match an extended Unicode sequence. We will get here only if the support
1534 is in the binary; otherwise a compile-time error occurs. */
1535
1536 case OP_EXTUNI:
1537 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1538 GETCHARINCTEST(c, eptr);
1539 {
1540 int chartype, script;
1541 int category = _pcre_ucp_findprop(c, &chartype, &script);
1542 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1543 while (eptr < md->end_subject)
1544 {
1545 int len = 1;
1546 if (!utf8) c = *eptr; else
1547 {
1548 GETCHARLEN(c, eptr, len);
1549 }
1550 category = _pcre_ucp_findprop(c, &chartype, &script);
1551 if (category != ucp_M) break;
1552 eptr += len;
1553 }
1554 }
1555 ecode++;
1556 break;
1557 #endif
1558
1559
1560 /* Match a back reference, possibly repeatedly. Look past the end of the
1561 item to see if there is repeat information following. The code is similar
1562 to that for character classes, but repeated for efficiency. Then obey
1563 similar code to character type repeats - written out again for speed.
1564 However, if the referenced string is the empty string, always treat
1565 it as matched, any number of times (otherwise there could be infinite
1566 loops). */
1567
1568 case OP_REF:
1569 {
1570 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1571 ecode += 3; /* Advance past item */
1572
1573 /* If the reference is unset, set the length to be longer than the amount
1574 of subject left; this ensures that every attempt at a match fails. We
1575 can't just fail here, because of the possibility of quantifiers with zero
1576 minima. */
1577
1578 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1579 md->end_subject - eptr + 1 :
1580 md->offset_vector[offset+1] - md->offset_vector[offset];
1581
1582 /* Set up for repetition, or handle the non-repeated case */
1583
1584 switch (*ecode)
1585 {
1586 case OP_CRSTAR:
1587 case OP_CRMINSTAR:
1588 case OP_CRPLUS:
1589 case OP_CRMINPLUS:
1590 case OP_CRQUERY:
1591 case OP_CRMINQUERY:
1592 c = *ecode++ - OP_CRSTAR;
1593 minimize = (c & 1) != 0;
1594 min = rep_min[c]; /* Pick up values from tables; */
1595 max = rep_max[c]; /* zero for max => infinity */
1596 if (max == 0) max = INT_MAX;
1597 break;
1598
1599 case OP_CRRANGE:
1600 case OP_CRMINRANGE:
1601 minimize = (*ecode == OP_CRMINRANGE);
1602 min = GET2(ecode, 1);
1603 max = GET2(ecode, 3);
1604 if (max == 0) max = INT_MAX;
1605 ecode += 5;
1606 break;
1607
1608 default: /* No repeat follows */
1609 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1610 eptr += length;
1611 continue; /* With the main loop */
1612 }
1613
1614 /* If the length of the reference is zero, just continue with the
1615 main loop. */
1616
1617 if (length == 0) continue;
1618
1619 /* First, ensure the minimum number of matches are present. We get back
1620 the length of the reference string explicitly rather than passing the
1621 address of eptr, so that eptr can be a register variable. */
1622
1623 for (i = 1; i <= min; i++)
1624 {
1625 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1626 eptr += length;
1627 }
1628
1629 /* If min = max, continue at the same level without recursion.
1630 They are not both allowed to be zero. */
1631
1632 if (min == max) continue;
1633
1634 /* If minimizing, keep trying and advancing the pointer */
1635
1636 if (minimize)
1637 {
1638 for (fi = min;; fi++)
1639 {
1640 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1641 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1642 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1643 RRETURN(MATCH_NOMATCH);
1644 eptr += length;
1645 }
1646 /* Control never gets here */
1647 }
1648
1649 /* If maximizing, find the longest string and work backwards */
1650
1651 else
1652 {
1653 pp = eptr;
1654 for (i = min; i < max; i++)
1655 {
1656 if (!match_ref(offset, eptr, length, md, ims)) break;
1657 eptr += length;
1658 }
1659 while (eptr >= pp)
1660 {
1661 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1662 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1663 eptr -= length;
1664 }
1665 RRETURN(MATCH_NOMATCH);
1666 }
1667 }
1668 /* Control never gets here */
1669
1670
1671
1672 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1673 used when all the characters in the class have values in the range 0-255,
1674 and either the matching is caseful, or the characters are in the range
1675 0-127 when UTF-8 processing is enabled. The only difference between
1676 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1677 encountered.
1678
1679 First, look past the end of the item to see if there is repeat information
1680 following. Then obey similar code to character type repeats - written out
1681 again for speed. */
1682
1683 case OP_NCLASS:
1684 case OP_CLASS:
1685 {
1686 data = ecode + 1; /* Save for matching */
1687 ecode += 33; /* Advance past the item */
1688
1689 switch (*ecode)
1690 {
1691 case OP_CRSTAR:
1692 case OP_CRMINSTAR:
1693 case OP_CRPLUS:
1694 case OP_CRMINPLUS:
1695 case OP_CRQUERY:
1696 case OP_CRMINQUERY:
1697 c = *ecode++ - OP_CRSTAR;
1698 minimize = (c & 1) != 0;
1699 min = rep_min[c]; /* Pick up values from tables; */
1700 max = rep_max[c]; /* zero for max => infinity */
1701 if (max == 0) max = INT_MAX;
1702 break;
1703
1704 case OP_CRRANGE:
1705 case OP_CRMINRANGE:
1706 minimize = (*ecode == OP_CRMINRANGE);
1707 min = GET2(ecode, 1);
1708 max = GET2(ecode, 3);
1709 if (max == 0) max = INT_MAX;
1710 ecode += 5;
1711 break;
1712
1713 default: /* No repeat follows */
1714 min = max = 1;
1715 break;
1716 }
1717
1718 /* First, ensure the minimum number of matches are present. */
1719
1720 #ifdef SUPPORT_UTF8
1721 /* UTF-8 mode */
1722 if (utf8)
1723 {
1724 for (i = 1; i <= min; i++)
1725 {
1726 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1727 GETCHARINC(c, eptr);
1728 if (c > 255)
1729 {
1730 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1731 }
1732 else
1733 {
1734 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1735 }
1736 }
1737 }
1738 else
1739 #endif
1740 /* Not UTF-8 mode */
1741 {
1742 for (i = 1; i <= min; i++)
1743 {
1744 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1745 c = *eptr++;
1746 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1747 }
1748 }
1749
1750 /* If max == min we can continue with the main loop without the
1751 need to recurse. */
1752
1753 if (min == max) continue;
1754
1755 /* If minimizing, keep testing the rest of the expression and advancing
1756 the pointer while it matches the class. */
1757
1758 if (minimize)
1759 {
1760 #ifdef SUPPORT_UTF8
1761 /* UTF-8 mode */
1762 if (utf8)
1763 {
1764 for (fi = min;; fi++)
1765 {
1766 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1767 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1768 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1769 GETCHARINC(c, eptr);
1770 if (c > 255)
1771 {
1772 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1773 }
1774 else
1775 {
1776 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1777 }
1778 }
1779 }
1780 else
1781 #endif
1782 /* Not UTF-8 mode */
1783 {
1784 for (fi = min;; fi++)
1785 {
1786 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1787 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1788 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1789 c = *eptr++;
1790 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1791 }
1792 }
1793 /* Control never gets here */
1794 }
1795
1796 /* If maximizing, find the longest possible run, then work backwards. */
1797
1798 else
1799 {
1800 pp = eptr;
1801
1802 #ifdef SUPPORT_UTF8
1803 /* UTF-8 mode */
1804 if (utf8)
1805 {
1806 for (i = min; i < max; i++)
1807 {
1808 int len = 1;
1809 if (eptr >= md->end_subject) break;
1810 GETCHARLEN(c, eptr, len);
1811 if (c > 255)
1812 {
1813 if (op == OP_CLASS) break;
1814 }
1815 else
1816 {
1817 if ((data[c/8] & (1 << (c&7))) == 0) break;
1818 }
1819 eptr += len;
1820 }
1821 for (;;)
1822 {
1823 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1824 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1825 if (eptr-- == pp) break; /* Stop if tried at original pos */
1826 BACKCHAR(eptr);
1827 }
1828 }
1829 else
1830 #endif
1831 /* Not UTF-8 mode */
1832 {
1833 for (i = min; i < max; i++)
1834 {
1835 if (eptr >= md->end_subject) break;
1836 c = *eptr;
1837 if ((data[c/8] & (1 << (c&7))) == 0) break;
1838 eptr++;
1839 }
1840 while (eptr >= pp)
1841 {
1842 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
1843 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1844 eptr--;
1845 }
1846 }
1847
1848 RRETURN(MATCH_NOMATCH);
1849 }
1850 }
1851 /* Control never gets here */
1852
1853
1854 /* Match an extended character class. This opcode is encountered only
1855 in UTF-8 mode, because that's the only time it is compiled. */
1856
1857 #ifdef SUPPORT_UTF8
1858 case OP_XCLASS:
1859 {
1860 data = ecode + 1 + LINK_SIZE; /* Save for matching */
1861 ecode += GET(ecode, 1); /* Advance past the item */
1862
1863 switch (*ecode)
1864 {
1865 case OP_CRSTAR:
1866 case OP_CRMINSTAR:
1867 case OP_CRPLUS:
1868 case OP_CRMINPLUS:
1869 case OP_CRQUERY:
1870 case OP_CRMINQUERY:
1871 c = *ecode++ - OP_CRSTAR;
1872 minimize = (c & 1) != 0;
1873 min = rep_min[c]; /* Pick up values from tables; */
1874 max = rep_max[c]; /* zero for max => infinity */
1875 if (max == 0) max = INT_MAX;
1876 break;
1877
1878 case OP_CRRANGE:
1879 case OP_CRMINRANGE:
1880 minimize = (*ecode == OP_CRMINRANGE);
1881 min = GET2(ecode, 1);
1882 max = GET2(ecode, 3);
1883 if (max == 0) max = INT_MAX;
1884 ecode += 5;
1885 break;
1886
1887 default: /* No repeat follows */
1888 min = max = 1;
1889 break;
1890 }
1891
1892 /* First, ensure the minimum number of matches are present. */
1893
1894 for (i = 1; i <= min; i++)
1895 {
1896 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1897 GETCHARINC(c, eptr);
1898 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1899 }
1900
1901 /* If max == min we can continue with the main loop without the
1902 need to recurse. */
1903
1904 if (min == max) continue;
1905
1906 /* If minimizing, keep testing the rest of the expression and advancing
1907 the pointer while it matches the class. */
1908
1909 if (minimize)
1910 {
1911 for (fi = min;; fi++)
1912 {
1913 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
1914 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1915 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1916 GETCHARINC(c, eptr);
1917 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1918 }
1919 /* Control never gets here */
1920 }
1921
1922 /* If maximizing, find the longest possible run, then work backwards. */
1923
1924 else
1925 {
1926 pp = eptr;
1927 for (i = min; i < max; i++)
1928 {
1929 int len = 1;
1930 if (eptr >= md->end_subject) break;
1931 GETCHARLEN(c, eptr, len);
1932 if (!_pcre_xclass(c, data)) break;
1933 eptr += len;
1934 }
1935 for(;;)
1936 {
1937 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
1938 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1939 if (eptr-- == pp) break; /* Stop if tried at original pos */
1940 BACKCHAR(eptr)
1941 }
1942 RRETURN(MATCH_NOMATCH);
1943 }
1944
1945 /* Control never gets here */
1946 }
1947 #endif /* End of XCLASS */
1948
1949 /* Match a single character, casefully */
1950
1951 case OP_CHAR:
1952 #ifdef SUPPORT_UTF8
1953 if (utf8)
1954 {
1955 length = 1;
1956 ecode++;
1957 GETCHARLEN(fc, ecode, length);
1958 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1959 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1960 }
1961 else
1962 #endif
1963
1964 /* Non-UTF-8 mode */
1965 {
1966 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1967 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1968 ecode += 2;
1969 }
1970 break;
1971
1972 /* Match a single character, caselessly */
1973
1974 case OP_CHARNC:
1975 #ifdef SUPPORT_UTF8
1976 if (utf8)
1977 {
1978 length = 1;
1979 ecode++;
1980 GETCHARLEN(fc, ecode, length);
1981
1982 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1983
1984 /* If the pattern character's value is < 128, we have only one byte, and
1985 can use the fast lookup table. */
1986
1987 if (fc < 128)
1988 {
1989 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1990 }
1991
1992 /* Otherwise we must pick up the subject character */
1993
1994 else
1995 {
1996 unsigned int dc;
1997 GETCHARINC(dc, eptr);
1998 ecode += length;
1999
2000 /* If we have Unicode property support, we can use it to test the other
2001 case of the character, if there is one. */
2002
2003 if (fc != dc)
2004 {
2005 #ifdef SUPPORT_UCP
2006 if (dc != _pcre_ucp_othercase(fc))
2007 #endif
2008 RRETURN(MATCH_NOMATCH);
2009 }
2010 }
2011 }
2012 else
2013 #endif /* SUPPORT_UTF8 */
2014
2015 /* Non-UTF-8 mode */
2016 {
2017 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2018 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2019 ecode += 2;
2020 }
2021 break;
2022
2023 /* Match a single character repeatedly. */
2024
2025 case OP_EXACT:
2026 min = max = GET2(ecode, 1);
2027 ecode += 3;
2028 goto REPEATCHAR;
2029
2030 case OP_POSUPTO:
2031 possessive = TRUE;
2032 /* Fall through */
2033
2034 case OP_UPTO:
2035 case OP_MINUPTO:
2036 min = 0;
2037 max = GET2(ecode, 1);
2038 minimize = *ecode == OP_MINUPTO;
2039 ecode += 3;
2040 goto REPEATCHAR;
2041
2042 case OP_POSSTAR:
2043 possessive = TRUE;
2044 min = 0;
2045 max = INT_MAX;
2046 ecode++;
2047 goto REPEATCHAR;
2048
2049 case OP_POSPLUS:
2050 possessive = TRUE;
2051 min = 1;
2052 max = INT_MAX;
2053 ecode++;
2054 goto REPEATCHAR;
2055
2056 case OP_POSQUERY:
2057 possessive = TRUE;
2058 min = 0;
2059 max = 1;
2060 ecode++;
2061 goto REPEATCHAR;
2062
2063 case OP_STAR:
2064 case OP_MINSTAR:
2065 case OP_PLUS:
2066 case OP_MINPLUS:
2067 case OP_QUERY:
2068 case OP_MINQUERY:
2069 c = *ecode++ - OP_STAR;
2070 minimize = (c & 1) != 0;
2071 min = rep_min[c]; /* Pick up values from tables; */
2072 max = rep_max[c]; /* zero for max => infinity */
2073 if (max == 0) max = INT_MAX;
2074
2075 /* Common code for all repeated single-character matches. We can give
2076 up quickly if there are fewer than the minimum number of characters left in
2077 the subject. */
2078
2079 REPEATCHAR:
2080 #ifdef SUPPORT_UTF8
2081 if (utf8)
2082 {
2083 length = 1;
2084 charptr = ecode;
2085 GETCHARLEN(fc, ecode, length);
2086 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2087 ecode += length;
2088
2089 /* Handle multibyte character matching specially here. There is
2090 support for caseless matching if UCP support is present. */
2091
2092 if (length > 1)
2093 {
2094 #ifdef SUPPORT_UCP
2095 unsigned int othercase;
2096 if ((ims & PCRE_CASELESS) != 0 &&
2097 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2098 oclength = _pcre_ord2utf8(othercase, occhars);
2099 else oclength = 0;
2100 #endif /* SUPPORT_UCP */
2101
2102 for (i = 1; i <= min; i++)
2103 {
2104 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2105 #ifdef SUPPORT_UCP
2106 /* Need braces because of following else */
2107 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2108 else
2109 {
2110 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2111 eptr += oclength;
2112 }
2113 #else /* without SUPPORT_UCP */
2114 else { RRETURN(MATCH_NOMATCH); }
2115 #endif /* SUPPORT_UCP */
2116 }
2117
2118 if (min == max) continue;
2119
2120 if (minimize)
2121 {
2122 for (fi = min;; fi++)
2123 {
2124 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2125 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2126 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2127 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2128 #ifdef SUPPORT_UCP
2129 /* Need braces because of following else */
2130 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2131 else
2132 {
2133 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2134 eptr += oclength;
2135 }
2136 #else /* without SUPPORT_UCP */
2137 else { RRETURN (MATCH_NOMATCH); }
2138 #endif /* SUPPORT_UCP */
2139 }
2140 /* Control never gets here */
2141 }
2142
2143 else /* Maximize */
2144 {
2145 pp = eptr;
2146 for (i = min; i < max; i++)
2147 {
2148 if (eptr > md->end_subject - length) break;
2149 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2150 #ifdef SUPPORT_UCP
2151 else if (oclength == 0) break;
2152 else
2153 {
2154 if (memcmp(eptr, occhars, oclength) != 0) break;
2155 eptr += oclength;
2156 }
2157 #else /* without SUPPORT_UCP */
2158 else break;
2159 #endif /* SUPPORT_UCP */
2160 }
2161
2162 if (possessive) continue;
2163 for(;;)
2164 {
2165 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2166 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2167 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2168 #ifdef SUPPORT_UCP
2169 eptr--;
2170 BACKCHAR(eptr);
2171 #else /* without SUPPORT_UCP */
2172 eptr -= length;
2173 #endif /* SUPPORT_UCP */
2174 }
2175 }
2176 /* Control never gets here */
2177 }
2178
2179 /* If the length of a UTF-8 character is 1, we fall through here, and
2180 obey the code as for non-UTF-8 characters below, though in this case the
2181 value of fc will always be < 128. */
2182 }
2183 else
2184 #endif /* SUPPORT_UTF8 */
2185
2186 /* When not in UTF-8 mode, load a single-byte character. */
2187 {
2188 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2189 fc = *ecode++;
2190 }
2191
2192 /* The value of fc at this point is always less than 256, though we may or
2193 may not be in UTF-8 mode. The code is duplicated for the caseless and
2194 caseful cases, for speed, since matching characters is likely to be quite
2195 common. First, ensure the minimum number of matches are present. If min =
2196 max, continue at the same level without recursing. Otherwise, if
2197 minimizing, keep trying the rest of the expression and advancing one
2198 matching character if failing, up to the maximum. Alternatively, if
2199 maximizing, find the maximum number of characters and work backwards. */
2200
2201 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2202 max, eptr));
2203
2204 if ((ims & PCRE_CASELESS) != 0)
2205 {
2206 fc = md->lcc[fc];
2207 for (i = 1; i <= min; i++)
2208 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2209 if (min == max) continue;
2210 if (minimize)
2211 {
2212 for (fi = min;; fi++)
2213 {
2214 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2215 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2216 if (fi >= max || eptr >= md->end_subject ||
2217 fc != md->lcc[*eptr++])
2218 RRETURN(MATCH_NOMATCH);
2219 }
2220 /* Control never gets here */
2221 }
2222 else /* Maximize */
2223 {
2224 pp = eptr;
2225 for (i = min; i < max; i++)
2226 {
2227 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2228 eptr++;
2229 }
2230 if (possessive) continue;
2231 while (eptr >= pp)
2232 {
2233 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2234 eptr--;
2235 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2236 }
2237 RRETURN(MATCH_NOMATCH);
2238 }
2239 /* Control never gets here */
2240 }
2241
2242 /* Caseful comparisons (includes all multi-byte characters) */
2243
2244 else
2245 {
2246 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2247 if (min == max) continue;
2248 if (minimize)
2249 {
2250 for (fi = min;; fi++)
2251 {
2252 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2253 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2254 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2255 RRETURN(MATCH_NOMATCH);
2256 }
2257 /* Control never gets here */
2258 }
2259 else /* Maximize */
2260 {
2261 pp = eptr;
2262 for (i = min; i < max; i++)
2263 {
2264 if (eptr >= md->end_subject || fc != *eptr) break;
2265 eptr++;
2266 }
2267 if (possessive) continue;
2268 while (eptr >= pp)
2269 {
2270 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2271 eptr--;
2272 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2273 }
2274 RRETURN(MATCH_NOMATCH);
2275 }
2276 }
2277 /* Control never gets here */
2278
2279 /* Match a negated single one-byte character. The character we are
2280 checking can be multibyte. */
2281
2282 case OP_NOT:
2283 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2284 ecode++;
2285 GETCHARINCTEST(c, eptr);
2286 if ((ims & PCRE_CASELESS) != 0)
2287 {
2288 #ifdef SUPPORT_UTF8
2289 if (c < 256)
2290 #endif
2291 c = md->lcc[c];
2292 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2293 }
2294 else
2295 {
2296 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2297 }
2298 break;
2299
2300 /* Match a negated single one-byte character repeatedly. This is almost a
2301 repeat of the code for a repeated single character, but I haven't found a
2302 nice way of commoning these up that doesn't require a test of the
2303 positive/negative option for each character match. Maybe that wouldn't add
2304 very much to the time taken, but character matching *is* what this is all
2305 about... */
2306
2307 case OP_NOTEXACT:
2308 min = max = GET2(ecode, 1);
2309 ecode += 3;
2310 goto REPEATNOTCHAR;
2311
2312 case OP_NOTUPTO:
2313 case OP_NOTMINUPTO:
2314 min = 0;
2315 max = GET2(ecode, 1);
2316 minimize = *ecode == OP_NOTMINUPTO;
2317 ecode += 3;
2318 goto REPEATNOTCHAR;
2319
2320 case OP_NOTPOSSTAR:
2321 possessive = TRUE;
2322 min = 0;
2323 max = INT_MAX;
2324 ecode++;
2325 goto REPEATNOTCHAR;
2326
2327 case OP_NOTPOSPLUS:
2328 possessive = TRUE;
2329 min = 1;
2330 max = INT_MAX;
2331 ecode++;
2332 goto REPEATNOTCHAR;
2333
2334 case OP_NOTPOSQUERY:
2335 possessive = TRUE;
2336 min = 0;
2337 max = 1;
2338 ecode++;
2339 goto REPEATNOTCHAR;
2340
2341 case OP_NOTPOSUPTO:
2342 possessive = TRUE;
2343 min = 0;
2344 max = GET2(ecode, 1);
2345 ecode += 3;
2346 goto REPEATNOTCHAR;
2347
2348 case OP_NOTSTAR:
2349 case OP_NOTMINSTAR:
2350 case OP_NOTPLUS:
2351 case OP_NOTMINPLUS:
2352 case OP_NOTQUERY:
2353 case OP_NOTMINQUERY:
2354 c = *ecode++ - OP_NOTSTAR;
2355 minimize = (c & 1) != 0;
2356 min = rep_min[c]; /* Pick up values from tables; */
2357 max = rep_max[c]; /* zero for max => infinity */
2358 if (max == 0) max = INT_MAX;
2359
2360 /* Common code for all repeated single-byte matches. We can give up quickly
2361 if there are fewer than the minimum number of bytes left in the
2362 subject. */
2363
2364 REPEATNOTCHAR:
2365 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2366 fc = *ecode++;
2367
2368 /* The code is duplicated for the caseless and caseful cases, for speed,
2369 since matching characters is likely to be quite common. First, ensure the
2370 minimum number of matches are present. If min = max, continue at the same
2371 level without recursing. Otherwise, if minimizing, keep trying the rest of
2372 the expression and advancing one matching character if failing, up to the
2373 maximum. Alternatively, if maximizing, find the maximum number of
2374 characters and work backwards. */
2375
2376 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2377 max, eptr));
2378
2379 if ((ims & PCRE_CASELESS) != 0)
2380 {
2381 fc = md->lcc[fc];
2382
2383 #ifdef SUPPORT_UTF8
2384 /* UTF-8 mode */
2385 if (utf8)
2386 {
2387 register unsigned int d;
2388 for (i = 1; i <= min; i++)
2389 {
2390 GETCHARINC(d, eptr);
2391 if (d < 256) d = md->lcc[d];
2392 if (fc == d) RRETURN(MATCH_NOMATCH);
2393 }
2394 }
2395 else
2396 #endif
2397
2398 /* Not UTF-8 mode */
2399 {
2400 for (i = 1; i <= min; i++)
2401 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2402 }
2403
2404 if (min == max) continue;
2405
2406 if (minimize)
2407 {
2408 #ifdef SUPPORT_UTF8
2409 /* UTF-8 mode */
2410 if (utf8)
2411 {
2412 register unsigned int d;
2413 for (fi = min;; fi++)
2414 {
2415 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2416 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2417 GETCHARINC(d, eptr);
2418 if (d < 256) d = md->lcc[d];
2419 if (fi >= max || eptr >= md->end_subject || fc == d)
2420 RRETURN(MATCH_NOMATCH);
2421 }
2422 }
2423 else
2424 #endif
2425 /* Not UTF-8 mode */
2426 {
2427 for (fi = min;; fi++)
2428 {
2429 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2430 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2431 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2432 RRETURN(MATCH_NOMATCH);
2433 }
2434 }
2435 /* Control never gets here */
2436 }
2437
2438 /* Maximize case */
2439
2440 else
2441 {
2442 pp = eptr;
2443
2444 #ifdef SUPPORT_UTF8
2445 /* UTF-8 mode */
2446 if (utf8)
2447 {
2448 register unsigned int d;
2449 for (i = min; i < max; i++)
2450 {
2451 int len = 1;
2452 if (eptr >= md->end_subject) break;
2453 GETCHARLEN(d, eptr, len);
2454 if (d < 256) d = md->lcc[d];
2455 if (fc == d) break;
2456 eptr += len;
2457 }
2458 if (possessive) continue;
2459 for(;;)
2460 {
2461 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2462 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2463 if (eptr-- == pp) break; /* Stop if tried at original pos */
2464 BACKCHAR(eptr);
2465 }
2466 }
2467 else
2468 #endif
2469 /* Not UTF-8 mode */
2470 {
2471 for (i = min; i < max; i++)
2472 {
2473 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2474 eptr++;
2475 }
2476 if (possessive) continue;
2477 while (eptr >= pp)
2478 {
2479 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2480 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2481 eptr--;
2482 }
2483 }
2484
2485 RRETURN(MATCH_NOMATCH);
2486 }
2487 /* Control never gets here */
2488 }
2489
2490 /* Caseful comparisons */
2491
2492 else
2493 {
2494 #ifdef SUPPORT_UTF8
2495 /* UTF-8 mode */
2496 if (utf8)
2497 {
2498 register unsigned int d;
2499 for (i = 1; i <= min; i++)
2500 {
2501 GETCHARINC(d, eptr);
2502 if (fc == d) RRETURN(MATCH_NOMATCH);
2503 }
2504 }
2505 else
2506 #endif
2507 /* Not UTF-8 mode */
2508 {
2509 for (i = 1; i <= min; i++)
2510 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2511 }
2512
2513 if (min == max) continue;
2514
2515 if (minimize)
2516 {
2517 #ifdef SUPPORT_UTF8
2518 /* UTF-8 mode */
2519 if (utf8)
2520 {
2521 register unsigned int d;
2522 for (fi = min;; fi++)
2523 {
2524 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2525 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2526 GETCHARINC(d, eptr);
2527 if (fi >= max || eptr >= md->end_subject || fc == d)
2528 RRETURN(MATCH_NOMATCH);
2529 }
2530 }
2531 else
2532 #endif
2533 /* Not UTF-8 mode */
2534 {
2535 for (fi = min;; fi++)
2536 {
2537 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2538 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2539 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2540 RRETURN(MATCH_NOMATCH);
2541 }
2542 }
2543 /* Control never gets here */
2544 }
2545
2546 /* Maximize case */
2547
2548 else
2549 {
2550 pp = eptr;
2551
2552 #ifdef SUPPORT_UTF8
2553 /* UTF-8 mode */
2554 if (utf8)
2555 {
2556 register unsigned int d;
2557 for (i = min; i < max; i++)
2558 {
2559 int len = 1;
2560 if (eptr >= md->end_subject) break;
2561 GETCHARLEN(d, eptr, len);
2562 if (fc == d) break;
2563 eptr += len;
2564 }
2565 if (possessive) continue;
2566 for(;;)
2567 {
2568 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2569 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2570 if (eptr-- == pp) break; /* Stop if tried at original pos */
2571 BACKCHAR(eptr);
2572 }
2573 }
2574 else
2575 #endif
2576 /* Not UTF-8 mode */
2577 {
2578 for (i = min; i < max; i++)
2579 {
2580 if (eptr >= md->end_subject || fc == *eptr) break;
2581 eptr++;
2582 }
2583 if (possessive) continue;
2584 while (eptr >= pp)
2585 {
2586 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2587 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2588 eptr--;
2589 }
2590 }
2591
2592 RRETURN(MATCH_NOMATCH);
2593 }
2594 }
2595 /* Control never gets here */
2596
2597 /* Match a single character type repeatedly; several different opcodes
2598 share code. This is very similar to the code for single characters, but we
2599 repeat it in the interests of efficiency. */
2600
2601 case OP_TYPEEXACT:
2602 min = max = GET2(ecode, 1);
2603 minimize = TRUE;
2604 ecode += 3;
2605 goto REPEATTYPE;
2606
2607 case OP_TYPEUPTO:
2608 case OP_TYPEMINUPTO:
2609 min = 0;
2610 max = GET2(ecode, 1);
2611 minimize = *ecode == OP_TYPEMINUPTO;
2612 ecode += 3;
2613 goto REPEATTYPE;
2614
2615 case OP_TYPEPOSSTAR:
2616 possessive = TRUE;
2617 min = 0;
2618 max = INT_MAX;
2619 ecode++;
2620 goto REPEATTYPE;
2621
2622 case OP_TYPEPOSPLUS:
2623 possessive = TRUE;
2624 min = 1;
2625 max = INT_MAX;
2626 ecode++;
2627 goto REPEATTYPE;
2628
2629 case OP_TYPEPOSQUERY:
2630 possessive = TRUE;
2631 min = 0;
2632 max = 1;
2633 ecode++;
2634 goto REPEATTYPE;
2635
2636 case OP_TYPEPOSUPTO:
2637 possessive = TRUE;
2638 min = 0;
2639 max = GET2(ecode, 1);
2640 ecode += 3;
2641 goto REPEATTYPE;
2642
2643 case OP_TYPESTAR:
2644 case OP_TYPEMINSTAR:
2645 case OP_TYPEPLUS:
2646 case OP_TYPEMINPLUS:
2647 case OP_TYPEQUERY:
2648 case OP_TYPEMINQUERY:
2649 c = *ecode++ - OP_TYPESTAR;
2650 minimize = (c & 1) != 0;
2651 min = rep_min[c]; /* Pick up values from tables; */
2652 max = rep_max[c]; /* zero for max => infinity */
2653 if (max == 0) max = INT_MAX;
2654
2655 /* Common code for all repeated single character type matches. Note that
2656 in UTF-8 mode, '.' matches a character of any length, but for the other
2657 character types, the valid characters are all one-byte long. */
2658
2659 REPEATTYPE:
2660 ctype = *ecode++; /* Code for the character type */
2661
2662 #ifdef SUPPORT_UCP
2663 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2664 {
2665 prop_fail_result = ctype == OP_NOTPROP;
2666 prop_type = *ecode++;
2667 prop_value = *ecode++;
2668 }
2669 else prop_type = -1;
2670 #endif
2671
2672 /* First, ensure the minimum number of matches are present. Use inline
2673 code for maximizing the speed, and do the type test once at the start
2674 (i.e. keep it out of the loop). Also we can test that there are at least
2675 the minimum number of bytes before we start. This isn't as effective in
2676 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2677 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2678 and single-bytes. */
2679
2680 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2681 if (min > 0)
2682 {
2683 #ifdef SUPPORT_UCP
2684 if (prop_type >= 0)
2685 {
2686 switch(prop_type)
2687 {
2688 case PT_ANY:
2689 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2690 for (i = 1; i <= min; i++)
2691 {
2692 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2693 GETCHARINC(c, eptr);
2694 }
2695 break;
2696
2697 case PT_LAMP:
2698 for (i = 1; i <= min; i++)
2699 {
2700 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2701 GETCHARINC(c, eptr);
2702 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2703 if ((prop_chartype == ucp_Lu ||
2704 prop_chartype == ucp_Ll ||
2705 prop_chartype == ucp_Lt) == prop_fail_result)
2706 RRETURN(MATCH_NOMATCH);
2707 }
2708 break;
2709
2710 case PT_GC:
2711 for (i = 1; i <= min; i++)
2712 {
2713 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2714 GETCHARINC(c, eptr);
2715 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2716 if ((prop_category == prop_value) == prop_fail_result)
2717 RRETURN(MATCH_NOMATCH);
2718 }
2719 break;
2720
2721 case PT_PC:
2722 for (i = 1; i <= min; i++)
2723 {
2724 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2725 GETCHARINC(c, eptr);
2726 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2727 if ((prop_chartype == prop_value) == prop_fail_result)
2728 RRETURN(MATCH_NOMATCH);
2729 }
2730 break;
2731
2732 case PT_SC:
2733 for (i = 1; i <= min; i++)
2734 {
2735 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2736 GETCHARINC(c, eptr);
2737 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2738 if ((prop_script == prop_value) == prop_fail_result)
2739 RRETURN(MATCH_NOMATCH);
2740 }
2741 break;
2742
2743 default:
2744 RRETURN(PCRE_ERROR_INTERNAL);
2745 }
2746 }
2747
2748 /* Match extended Unicode sequences. We will get here only if the
2749 support is in the binary; otherwise a compile-time error occurs. */
2750
2751 else if (ctype == OP_EXTUNI)
2752 {
2753 for (i = 1; i <= min; i++)
2754 {
2755 GETCHARINCTEST(c, eptr);
2756 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2757 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2758 while (eptr < md->end_subject)
2759 {
2760 int len = 1;
2761 if (!utf8) c = *eptr; else
2762 {
2763 GETCHARLEN(c, eptr, len);
2764 }
2765 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2766 if (prop_category != ucp_M) break;
2767 eptr += len;
2768 }
2769 }
2770 }
2771
2772 else
2773 #endif /* SUPPORT_UCP */
2774
2775 /* Handle all other cases when the coding is UTF-8 */
2776
2777 #ifdef SUPPORT_UTF8
2778 if (utf8) switch(ctype)
2779 {
2780 case OP_ANY:
2781 for (i = 1; i <= min; i++)
2782 {
2783 if (eptr >= md->end_subject ||
2784 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2785 RRETURN(MATCH_NOMATCH);
2786 eptr++;
2787 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2788 }
2789 break;
2790
2791 case OP_ANYBYTE:
2792 eptr += min;
2793 break;
2794
2795 case OP_ANYNL:
2796 for (i = 1; i <= min; i++)
2797 {
2798 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2799 GETCHARINC(c, eptr);
2800 switch(c)
2801 {
2802 default: RRETURN(MATCH_NOMATCH);
2803 case 0x000d:
2804 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2805 break;
2806 case 0x000a:
2807 case 0x000b:
2808 case 0x000c:
2809 case 0x0085:
2810 case 0x2028:
2811 case 0x2029:
2812 break;
2813 }
2814 }
2815 break;
2816
2817 case OP_NOT_DIGIT:
2818 for (i = 1; i <= min; i++)
2819 {
2820 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2821 GETCHARINC(c, eptr);
2822 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2823 RRETURN(MATCH_NOMATCH);
2824 }
2825 break;
2826
2827 case OP_DIGIT:
2828 for (i = 1; i <= min; i++)
2829 {
2830 if (eptr >= md->end_subject ||
2831 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2832 RRETURN(MATCH_NOMATCH);
2833 /* No need to skip more bytes - we know it's a 1-byte character */
2834 }
2835 break;
2836
2837 case OP_NOT_WHITESPACE:
2838 for (i = 1; i <= min; i++)
2839 {
2840 if (eptr >= md->end_subject ||
2841 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2842 RRETURN(MATCH_NOMATCH);
2843 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2844 }
2845 break;
2846
2847 case OP_WHITESPACE:
2848 for (i = 1; i <= min; i++)
2849 {
2850 if (eptr >= md->end_subject ||
2851 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2852 RRETURN(MATCH_NOMATCH);
2853 /* No need to skip more bytes - we know it's a 1-byte character */
2854 }
2855 break;
2856
2857 case OP_NOT_WORDCHAR:
2858 for (i = 1; i <= min; i++)
2859 {
2860 if (eptr >= md->end_subject ||
2861 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2862 RRETURN(MATCH_NOMATCH);
2863 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2864 }
2865 break;
2866
2867 case OP_WORDCHAR:
2868 for (i = 1; i <= min; i++)
2869 {
2870 if (eptr >= md->end_subject ||
2871 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2872 RRETURN(MATCH_NOMATCH);
2873 /* No need to skip more bytes - we know it's a 1-byte character */
2874 }
2875 break;
2876
2877 default:
2878 RRETURN(PCRE_ERROR_INTERNAL);
2879 } /* End switch(ctype) */
2880
2881 else
2882 #endif /* SUPPORT_UTF8 */
2883
2884 /* Code for the non-UTF-8 case for minimum matching of operators other
2885 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
2886 number of bytes present, as this was tested above. */
2887
2888 switch(ctype)
2889 {
2890 case OP_ANY:
2891 if ((ims & PCRE_DOTALL) == 0)
2892 {
2893 for (i = 1; i <= min; i++)
2894 {
2895 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2896 eptr++;
2897 }
2898 }
2899 else eptr += min;
2900 break;
2901
2902 case OP_ANYBYTE:
2903 eptr += min;
2904 break;
2905
2906 /* Because of the CRLF case, we can't assume the minimum number of
2907 bytes are present in this case. */
2908
2909 case OP_ANYNL:
2910 for (i = 1; i <= min; i++)
2911 {
2912 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2913 switch(*eptr++)
2914 {
2915 default: RRETURN(MATCH_NOMATCH);
2916 case 0x000d:
2917 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2918 break;
2919 case 0x000a:
2920 case 0x000b:
2921 case 0x000c:
2922 case 0x0085:
2923 break;
2924 }
2925 }
2926 break;
2927
2928 case OP_NOT_DIGIT:
2929 for (i = 1; i <= min; i++)
2930 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2931 break;
2932
2933 case OP_DIGIT:
2934 for (i = 1; i <= min; i++)
2935 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2936 break;
2937
2938 case OP_NOT_WHITESPACE:
2939 for (i = 1; i <= min; i++)
2940 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2941 break;
2942
2943 case OP_WHITESPACE:
2944 for (i = 1; i <= min; i++)
2945 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2946 break;
2947
2948 case OP_NOT_WORDCHAR:
2949 for (i = 1; i <= min; i++)
2950 if ((md->ctypes[*eptr++] & ctype_word) != 0)
2951 RRETURN(MATCH_NOMATCH);
2952 break;
2953
2954 case OP_WORDCHAR:
2955 for (i = 1; i <= min; i++)
2956 if ((md->ctypes[*eptr++] & ctype_word) == 0)
2957 RRETURN(MATCH_NOMATCH);
2958 break;
2959
2960 default:
2961 RRETURN(PCRE_ERROR_INTERNAL);
2962 }
2963 }
2964
2965 /* If min = max, continue at the same level without recursing */
2966
2967 if (min == max) continue;
2968
2969 /* If minimizing, we have to test the rest of the pattern before each
2970 subsequent match. Again, separate the UTF-8 case for speed, and also
2971 separate the UCP cases. */
2972
2973 if (minimize)
2974 {
2975 #ifdef SUPPORT_UCP
2976 if (prop_type >= 0)
2977 {
2978 switch(prop_type)
2979 {
2980 case PT_ANY:
2981 for (fi = min;; fi++)
2982 {
2983 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
2984 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2985 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2986 GETCHARINC(c, eptr);
2987 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2988 }
2989 /* Control never gets here */
2990
2991 case PT_LAMP:
2992 for (fi = min;; fi++)
2993 {
2994 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
2995 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2996 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2997 GETCHARINC(c, eptr);
2998 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2999 if ((prop_chartype == ucp_Lu ||
3000 prop_chartype == ucp_Ll ||
3001 prop_chartype == ucp_Lt) == prop_fail_result)
3002 RRETURN(MATCH_NOMATCH);
3003 }
3004 /* Control never gets here */
3005
3006 case PT_GC:
3007 for (fi = min;; fi++)
3008 {
3009 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3011 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3012 GETCHARINC(c, eptr);
3013 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3014 if ((prop_category == prop_value) == prop_fail_result)
3015 RRETURN(MATCH_NOMATCH);
3016 }
3017 /* Control never gets here */
3018
3019 case PT_PC:
3020 for (fi = min;; fi++)
3021 {
3022 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3023 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3024 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3025 GETCHARINC(c, eptr);
3026 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3027 if ((prop_chartype == prop_value) == prop_fail_result)
3028 RRETURN(MATCH_NOMATCH);
3029 }
3030 /* Control never gets here */
3031
3032 case PT_SC:
3033 for (fi = min;; fi++)
3034 {
3035 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3036 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3037 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3038 GETCHARINC(c, eptr);
3039 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3040 if ((prop_script == prop_value) == prop_fail_result)
3041 RRETURN(MATCH_NOMATCH);
3042 }
3043 /* Control never gets here */
3044
3045 default:
3046 RRETURN(PCRE_ERROR_INTERNAL);
3047 }
3048 }
3049
3050 /* Match extended Unicode sequences. We will get here only if the
3051 support is in the binary; otherwise a compile-time error occurs. */
3052
3053 else if (ctype == OP_EXTUNI)
3054 {
3055 for (fi = min;; fi++)
3056 {
3057 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3058 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3059 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3060 GETCHARINCTEST(c, eptr);
3061 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3062 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3063 while (eptr < md->end_subject)
3064 {
3065 int len = 1;
3066 if (!utf8) c = *eptr; else
3067 {
3068 GETCHARLEN(c, eptr, len);
3069 }
3070 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3071 if (prop_category != ucp_M) break;
3072 eptr += len;
3073 }
3074 }
3075 }
3076
3077 else
3078 #endif /* SUPPORT_UCP */
3079
3080 #ifdef SUPPORT_UTF8
3081 /* UTF-8 mode */
3082 if (utf8)
3083 {
3084 for (fi = min;; fi++)
3085 {
3086 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3087 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3088 if (fi >= max || eptr >= md->end_subject ||
3089 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3090 IS_NEWLINE(eptr)))
3091 RRETURN(MATCH_NOMATCH);
3092
3093 GETCHARINC(c, eptr);
3094 switch(ctype)
3095 {
3096 case OP_ANY: /* This is the DOTALL case */
3097 break;
3098
3099 case OP_ANYBYTE:
3100 break;
3101
3102 case OP_ANYNL:
3103 switch(c)
3104 {
3105 default: RRETURN(MATCH_NOMATCH);
3106 case 0x000d:
3107 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3108 break;
3109 case 0x000a:
3110 case 0x000b:
3111 case 0x000c:
3112 case 0x0085:
3113 case 0x2028:
3114 case 0x2029:
3115 break;
3116 }
3117 break;
3118
3119 case OP_NOT_DIGIT:
3120 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3121 RRETURN(MATCH_NOMATCH);
3122 break;
3123
3124 case OP_DIGIT:
3125 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3126 RRETURN(MATCH_NOMATCH);
3127 break;
3128
3129 case OP_NOT_WHITESPACE:
3130 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3131 RRETURN(MATCH_NOMATCH);
3132 break;
3133
3134 case OP_WHITESPACE:
3135 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3136 RRETURN(MATCH_NOMATCH);
3137 break;
3138
3139 case OP_NOT_WORDCHAR:
3140 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3141 RRETURN(MATCH_NOMATCH);
3142 break;
3143
3144 case OP_WORDCHAR:
3145 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3146 RRETURN(MATCH_NOMATCH);
3147 break;
3148
3149 default:
3150 RRETURN(PCRE_ERROR_INTERNAL);
3151 }
3152 }
3153 }
3154 else
3155 #endif
3156 /* Not UTF-8 mode */
3157 {
3158 for (fi = min;; fi++)
3159 {
3160 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3161 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3162 if (fi >= max || eptr >= md->end_subject ||
3163 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3164 RRETURN(MATCH_NOMATCH);
3165
3166 c = *eptr++;
3167 switch(ctype)
3168 {
3169 case OP_ANY: /* This is the DOTALL case */
3170 break;
3171
3172 case OP_ANYBYTE:
3173 break;
3174
3175 case OP_ANYNL:
3176 switch(c)
3177 {
3178 default: RRETURN(MATCH_NOMATCH);
3179 case 0x000d:
3180 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3181 break;
3182 case 0x000a:
3183 case 0x000b:
3184 case 0x000c:
3185 case 0x0085:
3186 break;
3187 }
3188 break;
3189
3190 case OP_NOT_DIGIT:
3191 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3192 break;
3193
3194 case OP_DIGIT:
3195 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3196 break;
3197
3198 case OP_NOT_WHITESPACE:
3199 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3200 break;
3201
3202 case OP_WHITESPACE:
3203 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3204 break;
3205
3206 case OP_NOT_WORDCHAR:
3207 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3208 break;
3209
3210 case OP_WORDCHAR:
3211 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3212 break;
3213
3214 default:
3215 RRETURN(PCRE_ERROR_INTERNAL);
3216 }
3217 }
3218 }
3219 /* Control never gets here */
3220 }
3221
3222 /* If maximizing, it is worth using inline code for speed, doing the type
3223 test once at the start (i.e. keep it out of the loop). Again, keep the
3224 UTF-8 and UCP stuff separate. */
3225
3226 else
3227 {
3228 pp = eptr; /* Remember where we started */
3229
3230 #ifdef SUPPORT_UCP
3231 if (prop_type >= 0)
3232 {
3233 switch(prop_type)
3234 {
3235 case PT_ANY:
3236 for (i = min; i < max; i++)
3237 {
3238 int len = 1;
3239 if (eptr >= md->end_subject) break;
3240 GETCHARLEN(c, eptr, len);
3241 if (prop_fail_result) break;
3242 eptr+= len;
3243 }
3244 break;
3245
3246 case PT_LAMP:
3247 for (i = min; i < max; i++)
3248 {
3249 int len = 1;
3250 if (eptr >= md->end_subject) break;
3251 GETCHARLEN(c, eptr, len);
3252 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3253 if ((prop_chartype == ucp_Lu ||
3254 prop_chartype == ucp_Ll ||
3255 prop_chartype == ucp_Lt) == prop_fail_result)
3256 break;
3257 eptr+= len;
3258 }
3259 break;
3260
3261 case PT_GC:
3262 for (i = min; i < max; i++)
3263 {
3264 int len = 1;
3265 if (eptr >= md->end_subject) break;
3266 GETCHARLEN(c, eptr, len);
3267 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3268 if ((prop_category == prop_value) == prop_fail_result)
3269 break;
3270 eptr+= len;
3271 }
3272 break;
3273
3274 case PT_PC:
3275 for (i = min; i < max; i++)
3276 {
3277 int len = 1;
3278 if (eptr >= md->end_subject) break;
3279 GETCHARLEN(c, eptr, len);
3280 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3281 if ((prop_chartype == prop_value) == prop_fail_result)
3282 break;
3283 eptr+= len;
3284 }
3285 break;
3286
3287 case PT_SC:
3288 for (i = min; i < max; i++)
3289 {
3290 int len = 1;
3291 if (eptr >= md->end_subject) break;
3292 GETCHARLEN(c, eptr, len);
3293 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3294 if ((prop_script == prop_value) == prop_fail_result)
3295 break;
3296 eptr+= len;
3297 }
3298 break;
3299 }
3300
3301 /* eptr is now past the end of the maximum run */
3302
3303 if (possessive) continue;
3304 for(;;)
3305 {
3306 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3307 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3308 if (eptr-- == pp) break; /* Stop if tried at original pos */
3309 BACKCHAR(eptr);
3310 }
3311 }
3312
3313 /* Match extended Unicode sequences. We will get here only if the
3314 support is in the binary; otherwise a compile-time error occurs. */
3315
3316 else if (ctype == OP_EXTUNI)
3317 {
3318 for (i = min; i < max; i++)
3319 {
3320 if (eptr >= md->end_subject) break;
3321 GETCHARINCTEST(c, eptr);
3322 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3323 if (prop_category == ucp_M) break;
3324 while (eptr < md->end_subject)
3325 {
3326 int len = 1;
3327 if (!utf8) c = *eptr; else
3328 {
3329 GETCHARLEN(c, eptr, len);
3330 }
3331 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3332 if (prop_category != ucp_M) break;
3333 eptr += len;
3334 }
3335 }
3336
3337 /* eptr is now past the end of the maximum run */
3338
3339 if (possessive) continue;
3340 for(;;)
3341 {
3342 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3343 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3344 if (eptr-- == pp) break; /* Stop if tried at original pos */
3345 for (;;) /* Move back over one extended */
3346 {
3347 int len = 1;
3348 BACKCHAR(eptr);
3349 if (!utf8) c = *eptr; else
3350 {
3351 GETCHARLEN(c, eptr, len);
3352 }
3353 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3354 if (prop_category != ucp_M) break;
3355 eptr--;
3356 }
3357 }
3358 }
3359
3360 else
3361 #endif /* SUPPORT_UCP */
3362
3363 #ifdef SUPPORT_UTF8
3364 /* UTF-8 mode */
3365
3366 if (utf8)
3367 {
3368 switch(ctype)
3369 {
3370 case OP_ANY:
3371
3372 /* Special code is required for UTF8, but when the maximum is
3373 unlimited we don't need it, so we repeat the non-UTF8 code. This is
3374 probably worth it, because .* is quite a common idiom. */
3375
3376 if (max < INT_MAX)
3377 {
3378 if ((ims & PCRE_DOTALL) == 0)
3379 {
3380 for (i = min; i < max; i++)
3381 {
3382 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3383 eptr++;
3384 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3385 }
3386 }
3387 else
3388 {
3389 for (i = min; i < max; i++)
3390 {
3391 if (eptr >= md->end_subject) break;
3392 eptr++;
3393 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3394 }
3395 }
3396 }
3397
3398 /* Handle unlimited UTF-8 repeat */
3399
3400 else
3401 {
3402 if ((ims & PCRE_DOTALL) == 0)
3403 {
3404 for (i = min; i < max; i++)
3405 {
3406 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3407 eptr++;
3408 }
3409 break;
3410 }
3411 else
3412 {
3413 c = max - min;
3414 if (c > (unsigned int)(md->end_subject - eptr))
3415 c = md->end_subject - eptr;
3416 eptr += c;
3417 }
3418 }
3419 break;
3420
3421 /* The byte case is the same as non-UTF8 */
3422
3423 case OP_ANYBYTE:
3424 c = max - min;
3425 if (c > (unsigned int)(md->end_subject - eptr))
3426 c = md->end_subject - eptr;
3427 eptr += c;
3428 break;
3429
3430 case OP_ANYNL:
3431 for (i = min; i < max; i++)
3432 {
3433 int len = 1;
3434 if (eptr >= md->end_subject) break;
3435 GETCHARLEN(c, eptr, len);
3436 if (c == 0x000d)
3437 {
3438 if (++eptr >= md->end_subject) break;
3439 if (*eptr == 0x000a) eptr++;
3440 }
3441 else
3442 {
3443 if (c != 0x000a && c != 0x000b && c != 0x000c &&
3444 c != 0x0085 && c != 0x2028 && c != 0x2029)
3445 break;
3446 eptr += len;
3447 }
3448 }
3449 break;
3450
3451 case OP_NOT_DIGIT:
3452 for (i = min; i < max; i++)
3453 {
3454 int len = 1;
3455 if (eptr >= md->end_subject) break;
3456 GETCHARLEN(c, eptr, len);
3457 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3458 eptr+= len;
3459 }
3460 break;
3461
3462 case OP_DIGIT:
3463 for (i = min; i < max; i++)
3464 {
3465 int len = 1;
3466 if (eptr >= md->end_subject) break;
3467 GETCHARLEN(c, eptr, len);
3468 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3469 eptr+= len;
3470 }
3471 break;
3472
3473 case OP_NOT_WHITESPACE:
3474 for (i = min; i < max; i++)
3475 {
3476 int len = 1;
3477 if (eptr >= md->end_subject) break;
3478 GETCHARLEN(c, eptr, len);
3479 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3480 eptr+= len;
3481 }
3482 break;
3483
3484 case OP_WHITESPACE:
3485 for (i = min; i < max; i++)
3486 {
3487 int len = 1;
3488 if (eptr >= md->end_subject) break;
3489 GETCHARLEN(c, eptr, len);
3490 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3491 eptr+= len;
3492 }
3493 break;
3494
3495 case OP_NOT_WORDCHAR:
3496 for (i = min; i < max; i++)
3497 {
3498 int len = 1;
3499 if (eptr >= md->end_subject) break;
3500 GETCHARLEN(c, eptr, len);
3501 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3502 eptr+= len;
3503 }
3504 break;
3505
3506 case OP_WORDCHAR:
3507 for (i = min; i < max; i++)
3508 {
3509 int len = 1;
3510 if (eptr >= md->end_subject) break;
3511 GETCHARLEN(c, eptr, len);
3512 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3513 eptr+= len;
3514 }
3515 break;
3516
3517 default:
3518 RRETURN(PCRE_ERROR_INTERNAL);
3519 }
3520
3521 /* eptr is now past the end of the maximum run */
3522
3523 if (possessive) continue;
3524 for(;;)
3525 {
3526 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
3527 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3528 if (eptr-- == pp) break; /* Stop if tried at original pos */
3529 BACKCHAR(eptr);
3530 }
3531 }
3532 else
3533 #endif
3534
3535 /* Not UTF-8 mode */
3536 {
3537 switch(ctype)
3538 {
3539 case OP_ANY:
3540 if ((ims & PCRE_DOTALL) == 0)
3541 {
3542 for (i = min; i < max; i++)
3543 {
3544 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3545 eptr++;
3546 }
3547 break;
3548 }
3549 /* For DOTALL case, fall through and treat as \C */
3550
3551 case OP_ANYBYTE:
3552 c = max - min;
3553 if (c > (unsigned int)(md->end_subject - eptr))
3554 c = md->end_subject - eptr;
3555 eptr += c;
3556 break;
3557
3558 case OP_ANYNL:
3559 for (i = min; i < max; i++)
3560 {
3561 if (eptr >= md->end_subject) break;
3562 c = *eptr;
3563 if (c == 0x000d)
3564 {
3565 if (++eptr >= md->end_subject) break;
3566 if (*eptr == 0x000a) eptr++;
3567 }
3568 else
3569 {
3570 if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
3571 break;
3572 eptr++;
3573 }
3574 }
3575 break;
3576
3577 case OP_NOT_DIGIT:
3578 for (i = min; i < max; i++)
3579 {
3580 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3581 break;
3582 eptr++;
3583 }
3584 break;
3585
3586 case OP_DIGIT:
3587 for (i = min; i < max; i++)
3588 {
3589 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3590 break;
3591 eptr++;
3592 }
3593 break;
3594
3595 case OP_NOT_WHITESPACE:
3596 for (i = min; i < max; i++)
3597 {
3598 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3599 break;
3600 eptr++;
3601 }
3602 break;
3603
3604 case OP_WHITESPACE:
3605 for (i = min; i < max; i++)
3606 {
3607 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3608 break;
3609 eptr++;
3610 }
3611 break;
3612
3613 case OP_NOT_WORDCHAR:
3614 for (i = min; i < max; i++)
3615 {
3616 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3617 break;
3618 eptr++;
3619 }
3620 break;
3621
3622 case OP_WORDCHAR:
3623 for (i = min; i < max; i++)
3624 {
3625 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3626 break;
3627 eptr++;
3628 }
3629 break;
3630
3631 default:
3632 RRETURN(PCRE_ERROR_INTERNAL);
3633 }
3634
3635 /* eptr is now past the end of the maximum run */
3636
3637 if (possessive) continue;
3638 while (eptr >= pp)
3639 {
3640 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
3641 eptr--;
3642 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3643 }
3644 }
3645
3646 /* Get here if we can't make it match with any permitted repetitions */
3647
3648 RRETURN(MATCH_NOMATCH);
3649 }
3650 /* Control never gets here */
3651
3652 /* There's been some horrible disaster. Arrival here can only mean there is
3653 something seriously wrong in the code above or the OP_xxx definitions. */
3654
3655 default:
3656 DPRINTF(("Unknown opcode %d\n", *ecode));
3657 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
3658 }
3659
3660 /* Do not stick any code in here without much thought; it is assumed
3661 that "continue" in the code above comes out to here to repeat the main
3662 loop. */
3663
3664 } /* End of main loop */
3665 /* Control never reaches here */
3666
3667
3668 /* When compiling to use the heap rather than the stack for recursive calls to
3669 match(), the RRETURN() macro jumps here. The number that is saved in
3670 frame->Xwhere indicates which label we actually want to return to. */
3671
3672 #ifdef NO_RECURSE
3673 #define LBL(val) case val: goto L_RM##val;
3674 HEAP_RETURN:
3675 switch (frame->Xwhere)
3676 {
3677 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
3678 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
3679 LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
3680 LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
3681 LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39) LBL(40)
3682 LBL(41) LBL(42) LBL(43) LBL(44) LBL(45) LBL(46) LBL(47)
3683 default:
3684 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
3685 return PCRE_ERROR_INTERNAL;
3686 }
3687 #undef LBL
3688 #endif /* NO_RECURSE */
3689 }
3690
3691
3692 /***************************************************************************
3693 ****************************************************************************
3694 RECURSION IN THE match() FUNCTION
3695
3696 Undefine all the macros that were defined above to handle this. */
3697
3698 #ifdef NO_RECURSE
3699 #undef eptr
3700 #undef ecode
3701 #undef mstart
3702 #undef offset_top
3703 #undef ims
3704 #undef eptrb
3705 #undef flags
3706
3707 #undef callpat
3708 #undef charptr
3709 #undef data
3710 #undef next
3711 #undef pp
3712 #undef prev
3713 #undef saved_eptr
3714
3715 #undef new_recursive
3716
3717 #undef cur_is_word
3718 #undef condition
3719 #undef prev_is_word
3720
3721 #undef original_ims
3722
3723 #undef ctype
3724 #undef length
3725 #undef max
3726 #undef min
3727 #undef number
3728 #undef offset
3729 #undef op
3730 #undef save_capture_last
3731 #undef save_offset1
3732 #undef save_offset2
3733 #undef save_offset3
3734 #undef stacksave
3735
3736 #undef newptrb
3737
3738 #endif
3739
3740 /* These two are defined as macros in both cases */
3741
3742 #undef fc
3743 #undef fi
3744
3745 /***************************************************************************
3746 ***************************************************************************/
3747
3748
3749
3750 /*************************************************
3751 * Execute a Regular Expression *
3752 *************************************************/
3753
3754 /* This function applies a compiled re to a subject string and picks out
3755 portions of the string if it matches. Two elements in the vector are set for
3756 each substring: the offsets to the start and end of the substring.
3757
3758 Arguments:
3759 argument_re points to the compiled expression
3760 extra_data points to extra data or is NULL
3761 subject points to the subject string
3762 length length of subject string (may contain binary zeros)
3763 start_offset where to start in the subject string
3764 options option bits
3765 offsets points to a vector of ints to be filled in with offsets
3766 offsetcount the number of elements in the vector
3767
3768 Returns: > 0 => success; value is the number of elements filled in
3769 = 0 => success, but offsets is not big enough
3770 -1 => failed to match
3771 < -1 => some kind of unexpected problem
3772 */
3773
3774 PCRE_EXP_DEFN int
3775 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3776 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
3777 int offsetcount)
3778 {
3779 int rc, resetcount, ocount;
3780 int first_byte = -1;
3781 int req_byte = -1;
3782 int req_byte2 = -1;
3783 int newline;
3784 unsigned long int ims;
3785 BOOL using_temporary_offsets = FALSE;
3786 BOOL anchored;
3787 BOOL startline;
3788 BOOL firstline;
3789 BOOL first_byte_caseless = FALSE;
3790 BOOL req_byte_caseless = FALSE;
3791 BOOL utf8;
3792 match_data match_block;
3793 match_data *md = &match_block;
3794 const uschar *tables;
3795 const uschar *start_bits = NULL;
3796 USPTR start_match = (USPTR)subject + start_offset;
3797 USPTR end_subject;
3798 USPTR req_byte_ptr = start_match - 1;
3799 eptrblock eptrchain[EPTR_WORK_SIZE];
3800
3801 pcre_study_data internal_study;
3802 const pcre_study_data *study;
3803
3804 real_pcre internal_re;
3805 const real_pcre *external_re = (const real_pcre *)argument_re;
3806 const real_pcre *re = external_re;
3807
3808 /* Plausibility checks */
3809
3810 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3811 if (re == NULL || subject == NULL ||
3812 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3813 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3814
3815 /* Fish out the optional data from the extra_data structure, first setting
3816 the default values. */
3817
3818 study = NULL;
3819 md->match_limit = MATCH_LIMIT;
3820 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
3821 md->callout_data = NULL;
3822
3823 /* The table pointer is always in native byte order. */
3824
3825 tables = external_re->tables;
3826
3827 if (extra_data != NULL)
3828 {
3829 register unsigned int flags = extra_data->flags;
3830 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3831 study = (const pcre_study_data *)extra_data->study_data;
3832 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3833 md->match_limit = extra_data->match_limit;
3834 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3835 md->match_limit_recursion = extra_data->match_limit_recursion;
3836 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3837 md->callout_data = extra_data->callout_data;
3838 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3839 }
3840
3841 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3842 is a feature that makes it possible to save compiled regex and re-use them
3843 in other programs later. */
3844
3845 if (tables == NULL) tables = _pcre_default_tables;
3846
3847 /* Check that the first field in the block is the magic number. If it is not,
3848 test for a regex that was compiled on a host of opposite endianness. If this is
3849 the case, flipped values are put in internal_re and internal_study if there was
3850 study data too. */
3851
3852 if (re->magic_number != MAGIC_NUMBER)
3853 {
3854 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3855 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3856 if (study != NULL) study = &internal_study;
3857 }
3858
3859 /* Set up other data */
3860
3861 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3862 startline = (re->options & PCRE_STARTLINE) != 0;
3863 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3864
3865 /* The code starts after the real_pcre block and the capture name table. */
3866
3867 md->start_code = (const uschar *)external_re + re->name_table_offset +
3868 re->name_count * re->name_entry_size;
3869
3870 md->start_subject = (USPTR)subject;
3871 md->start_offset = start_offset;
3872 md->end_subject = md->start_subject + length;
3873 end_subject = md->end_subject;
3874
3875 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3876 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
3877
3878 md->notbol = (options & PCRE_NOTBOL) != 0;
3879 md->noteol = (options & PCRE_NOTEOL) != 0;
3880 md->notempty = (options & PCRE_NOTEMPTY) != 0;
3881 md->partial = (options & PCRE_PARTIAL) != 0;
3882 md->hitend = FALSE;
3883
3884 md->recursive = NULL; /* No recursion at top level */
3885 md->eptrchain = eptrchain; /* Make workspace generally available */
3886
3887 md->lcc = tables + lcc_offset;
3888 md->ctypes = tables + ctypes_offset;
3889
3890 /* Handle different types of newline. The three bits give eight cases. If
3891 nothing is set at run time, whatever was used at compile time applies. */
3892
3893 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3894 PCRE_NEWLINE_BITS)
3895 {
3896 case 0: newline = NEWLINE; break; /* Compile-time default */
3897 case PCRE_NEWLINE_CR: newline = '\r'; break;
3898 case PCRE_NEWLINE_LF: newline = '\n'; break;
3899 case PCRE_NEWLINE_CR+
3900 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
3901 case PCRE_NEWLINE_ANY: newline = -1; break;
3902 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3903 default: return PCRE_ERROR_BADNEWLINE;
3904 }
3905
3906 if (newline == -2)
3907 {
3908 md->nltype = NLTYPE_ANYCRLF;
3909 }
3910 else if (newline < 0)
3911 {
3912 md->nltype = NLTYPE_ANY;
3913 }
3914 else
3915 {
3916 md->nltype = NLTYPE_FIXED;
3917 if (newline > 255)
3918 {
3919 md->nllen = 2;
3920 md->nl[0] = (newline >> 8) & 255;
3921 md->nl[1] = newline & 255;
3922 }
3923 else
3924 {
3925 md->nllen = 1;
3926 md->nl[0] = newline;
3927 }
3928 }
3929
3930 /* Partial matching is supported only for a restricted set of regexes at the
3931 moment. */
3932
3933 if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
3934 return PCRE_ERROR_BADPARTIAL;
3935
3936 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3937 back the character offset. */
3938
3939 #ifdef SUPPORT_UTF8
3940 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3941 {
3942 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3943 return PCRE_ERROR_BADUTF8;
3944 if (start_offset > 0 && start_offset < length)
3945 {
3946 int tb = ((uschar *)subject)[start_offset];
3947 if (tb > 127)
3948 {
3949 tb &= 0xc0;
3950 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3951 }
3952 }
3953 }
3954 #endif
3955
3956 /* The ims options can vary during the matching as a result of the presence
3957 of (?ims) items in the pattern. They are kept in a local variable so that
3958 restoring at the exit of a group is easy. */
3959
3960 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3961
3962 /* If the expression has got more back references than the offsets supplied can
3963 hold, we get a temporary chunk of working store to use during the matching.
3964 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3965 of 3. */
3966
3967 ocount = offsetcount - (offsetcount % 3);
3968
3969 if (re->top_backref > 0 && re->top_backref >= ocount/3)
3970 {
3971 ocount = re->top_backref * 3 + 3;
3972 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3973 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3974 using_temporary_offsets = TRUE;
3975 DPRINTF(("Got memory to hold back references\n"));
3976 }
3977 else md->offset_vector = offsets;
3978
3979 md->offset_end = ocount;
3980 md->offset_max = (2*ocount)/3;
3981 md->offset_overflow = FALSE;
3982 md->capture_last = -1;
3983
3984 /* Compute the minimum number of offsets that we need to reset each time. Doing
3985 this makes a huge difference to execution time when there aren't many brackets
3986 in the pattern. */
3987
3988 resetcount = 2 + re->top_bracket * 2;
3989 if (resetcount > offsetcount) resetcount = ocount;
3990
3991 /* Reset the working variable associated with each extraction. These should
3992 never be used unless previously set, but they get saved and restored, and so we
3993 initialize them to avoid reading uninitialized locations. */
3994
3995 if (md->offset_vector != NULL)
3996 {
3997 register int *iptr = md->offset_vector + ocount;
3998 register int *iend = iptr - resetcount/2 + 1;
3999 while (--iptr >= iend) *iptr = -1;
4000 }
4001
4002 /* Set up the first character to match, if available. The first_byte value is
4003 never set for an anchored regular expression, but the anchoring may be forced
4004 at run time, so we have to test for anchoring. The first char may be unset for
4005 an unanchored pattern, of course. If there's no first char and the pattern was
4006 studied, there may be a bitmap of possible first characters. */
4007
4008 if (!anchored)
4009 {
4010 if ((re->options & PCRE_FIRSTSET) != 0)
4011 {
4012 first_byte = re->first_byte & 255;
4013 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4014 first_byte = md->lcc[first_byte];
4015 }
4016 else
4017 if (!startline && study != NULL &&
4018 (study->options & PCRE_STUDY_MAPPED) != 0)
4019 start_bits = study->start_bits;
4020 }
4021
4022 /* For anchored or unanchored matches, there may be a "last known required
4023 character" set. */
4024
4025 if ((re->options & PCRE_REQCHSET) != 0)
4026 {
4027 req_byte = re->req_byte & 255;
4028 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4029 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4030 }
4031
4032
4033 /* ==========================================================================*/
4034
4035 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
4036 the loop runs just once. */
4037
4038 for(;;)
4039 {
4040 USPTR save_end_subject = end_subject;
4041
4042 /* Reset the maximum number of extractions we might see. */
4043
4044 if (md->offset_vector != NULL)
4045 {
4046 register int *iptr = md->offset_vector;
4047 register int *iend = iptr + resetcount;
4048 while (iptr < iend) *iptr++ = -1;
4049 }
4050
4051 /* Advance to a unique first char if possible. If firstline is TRUE, the
4052 start of the match is constrained to the first line of a multiline string.
4053 That is, the match must be before or at the first newline. Implement this by
4054 temporarily adjusting end_subject so that we stop scanning at a newline. If
4055 the match fails at the newline, later code breaks this loop. */
4056
4057 if (firstline)
4058 {
4059 USPTR t = start_match;
4060 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4061 end_subject = t;
4062 }
4063
4064 /* Now test for a unique first byte */
4065
4066 if (first_byte >= 0)
4067 {
4068 if (first_byte_caseless)
4069 while (start_match < end_subject &&
4070 md->lcc[*start_match] != first_byte)
4071 start_match++;
4072 else
4073 while (start_match < end_subject && *start_match != first_byte)
4074 start_match++;
4075 }
4076
4077 /* Or to just after a linebreak for a multiline match if possible */
4078
4079 else if (startline)
4080 {
4081 if (start_match > md->start_subject + start_offset)
4082 {
4083 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4084 start_match++;
4085
4086 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4087 and we are now at a LF, advance the match position by one more character.
4088 */
4089
4090 if (start_match[-1] == '\r' &&
4091 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4092 start_match < end_subject &&
4093 *start_match == '\n')
4094 start_match++;
4095 }
4096 }
4097
4098 /* Or to a non-unique first char after study */
4099
4100 else if (start_bits != NULL)
4101 {
4102 while (start_match < end_subject)
4103 {
4104 register unsigned int c = *start_match;
4105 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4106 }
4107 }
4108
4109 /* Restore fudged end_subject */
4110
4111 end_subject = save_end_subject;
4112
4113 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4114 printf(">>>> Match against: ");
4115 pchars(start_match, end_subject - start_match, TRUE, md);
4116 printf("\n");
4117 #endif
4118
4119 /* If req_byte is set, we know that that character must appear in the subject
4120 for the match to succeed. If the first character is set, req_byte must be
4121 later in the subject; otherwise the test starts at the match point. This
4122 optimization can save a huge amount of backtracking in patterns with nested
4123 unlimited repeats that aren't going to match. Writing separate code for
4124 cased/caseless versions makes it go faster, as does using an autoincrement
4125 and backing off on a match.
4126
4127 HOWEVER: when the subject string is very, very long, searching to its end can
4128 take a long time, and give bad performance on quite ordinary patterns. This
4129 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4130 string... so we don't do this when the string is sufficiently long.
4131
4132 ALSO: this processing is disabled when partial matching is requested.
4133 */
4134
4135 if (req_byte >= 0 &&
4136 end_subject - start_match < REQ_BYTE_MAX &&
4137 !md->partial)
4138 {
4139 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4140
4141 /* We don't need to repeat the search if we haven't yet reached the
4142 place we found it at last time. */
4143
4144 if (p > req_byte_ptr)
4145 {
4146 if (req_byte_caseless)
4147 {
4148 while (p < end_subject)
4149 {
4150 register int pp = *p++;
4151 if (pp == req_byte || pp == req_byte2) { p--; break; }
4152 }
4153 }
4154 else
4155 {
4156 while (p < end_subject)
4157 {
4158 if (*p++ == req_byte) { p--; break; }
4159 }
4160 }
4161
4162 /* If we can't find the required character, break the matching loop,
4163 forcing a match failure. */
4164
4165 if (p >= end_subject)
4166 {
4167 rc = MATCH_NOMATCH;
4168 break;
4169 }
4170
4171 /* If we have found the required character, save the point where we
4172 found it, so that we don't search again next time round the loop if
4173 the start hasn't passed this character yet. */
4174
4175 req_byte_ptr = p;
4176 }
4177 }
4178
4179 /* OK, we can now run the match. */
4180
4181 md->start_match_ptr = start_match; /* Insurance */
4182 md->match_call_count = 0;
4183 md->eptrn = 0; /* Next free eptrchain slot */
4184 rc = match(start_match, md->start_code, start_match, 2, md,
4185 ims, NULL, 0, 0);
4186
4187 /* Any return other than MATCH_NOMATCH breaks the loop. */
4188
4189 if (rc != MATCH_NOMATCH) break;
4190
4191 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4192 newline in the subject (though it may continue over the newline). Therefore,
4193 if we have just failed to match, starting at a newline, do not continue. */
4194
4195 if (firstline && IS_NEWLINE(start_match)) break;
4196
4197 /* Advance the match position by one character. */
4198
4199 start_match++;
4200 #ifdef SUPPORT_UTF8
4201 if (utf8)
4202 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4203 start_match++;
4204 #endif
4205
4206 /* Break the loop if the pattern is anchored or if we have passed the end of
4207 the subject. */
4208
4209 if (anchored || start_match > end_subject) break;
4210
4211 /* If we have just passed a CR and the newline option is CRLF or ANY or
4212 ANYCRLF, and we are now at a LF, advance the match position by one more
4213 character. */
4214
4215 if (start_match[-1] == '\r' &&
4216 (md->nltype == NLTYPE_ANY ||
4217 md->nltype == NLTYPE_ANYCRLF ||
4218 md->nllen == 2) &&
4219 start_match < end_subject &&
4220 *start_match == '\n')
4221 start_match++;
4222
4223 } /* End of for(;;) "bumpalong" loop */
4224
4225 /* ==========================================================================*/
4226
4227 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4228 conditions is true:
4229
4230 (1) The pattern is anchored;
4231
4232 (2) We are past the end of the subject;
4233
4234 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4235 this option requests that a match occur at or before the first newline in
4236 the subject.
4237
4238 When we have a match and the offset vector is big enough to deal with any
4239 backreferences, captured substring offsets will already be set up. In the case
4240 where we had to get some local store to hold offsets for backreference
4241 processing, copy those that we can. In this case there need not be overflow if
4242 certain parts of the pattern were not used, even though there are more
4243 capturing parentheses than vector slots. */
4244
4245 if (rc == MATCH_MATCH)
4246 {
4247 if (using_temporary_offsets)
4248 {
4249 if (offsetcount >= 4)
4250 {
4251 memcpy(offsets + 2, md->offset_vector + 2,
4252 (offsetcount - 2) * sizeof(int));
4253 DPRINTF(("Copied offsets from temporary memory\n"));
4254 }
4255 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4256 DPRINTF(("Freeing temporary memory\n"));
4257 (pcre_free)(md->offset_vector);
4258 }
4259
4260 /* Set the return code to the number of captured strings, or 0 if there are
4261 too many to fit into the vector. */
4262
4263 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4264
4265 /* If there is space, set up the whole thing as substring 0. The value of
4266 md->start_match_ptr might be modified if \K was encountered on the success
4267 matching path. */
4268
4269 if (offsetcount < 2) rc = 0; else
4270 {
4271 offsets[0] = md->start_match_ptr - md->start_subject;
4272 offsets[1] = md->end_match_ptr - md->start_subject;
4273 }
4274
4275 DPRINTF((">>>> returning %d\n", rc));
4276 return rc;
4277 }
4278
4279 /* Control gets here if there has been an error, or if the overall match
4280 attempt has failed at all permitted starting positions. */
4281
4282 if (using_temporary_offsets)
4283 {
4284 DPRINTF(("Freeing temporary memory\n"));
4285 (pcre_free)(md->offset_vector);
4286 }
4287
4288 if (rc != MATCH_NOMATCH)
4289 {
4290 DPRINTF((">>>> error: returning %d\n", rc));
4291 return rc;
4292 }
4293 else if (md->partial && md->hitend)
4294 {
4295 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4296 return PCRE_ERROR_PARTIAL;
4297 }
4298 else
4299 {
4300 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4301 return PCRE_ERROR_NOMATCH;
4302 }
4303 }
4304
4305 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12