/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 391 - (show annotations) (download)
Tue Mar 17 21:16:01 2009 UTC (5 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 152899 byte(s)
Add support for UTF-8 in EBCDIC environments.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73
74 #define MATCH_COMMIT (-999)
75 #define MATCH_PRUNE (-998)
76 #define MATCH_SKIP (-997)
77 #define MATCH_THEN (-996)
78
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. */
82
83 #define REC_STACK_SAVE_MAX 30
84
85 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86
/* Parallel six-entry tables giving the minimum and maximum repeat count for
each of the common single-character quantifiers. A maximum of 0 stands for
"no upper limit". NOTE(review): the entries look like they correspond to
*, *?, +, +?, ?, ?? in that order - confirm against the code that indexes
these tables. */

static const char rep_min[] = {
  0,
  0,
  1,
  1,
  0,
  0
};

static const char rep_max[] = {
  0,
  0,
  0,
  0,
  1,
  1
};
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 {
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 }
179 }
180 else
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. */
186
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
240 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241 below must be updated in sync. */
242
/* One identifier per RMATCH call site, numbered consecutively from 1 to 54.
The first value on each row is written out explicitly as a cross-check. */

enum {
  RM1  = 1,  RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
  RM11 = 11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
  RM21 = 21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
  RM31 = 31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41 = 41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
  RM51 = 51, RM52, RM53, RM54
};
249
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
253
254 #ifndef NO_RECURSE
#define REGISTER register

/* RMATCH makes a genuine recursive call to match() one level deeper and
leaves the result in rrc; RRETURN is a plain return. The rw argument is
accepted only so that call sites read the same in both builds. When DEBUG is
defined, each macro additionally traces to stdout. */

#ifndef DEBUG

#define RRETURN(ra) return ra
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)

#else  /* DEBUG */

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

#endif  /* DEBUG */
274
275 #else
276
277
278 /* These versions of the macros manage a private stack on the heap. Note that
279 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280 argument of match(), which never changes. */
281
282 #define REGISTER
283
/* Heap-based stand-in for a recursive call to match(). A new frame is
obtained via pcre_stack_malloc, the return point rw is recorded in the
current frame, the new frame is loaded with the "arguments", and control
jumps to HEAP_RECURSE. The label L_<rw> is where execution resumes when the
simulated call returns.

If the frame cannot be allocated, the macro unwinds with
PCRE_ERROR_NOMEMORY through RRETURN instead of dereferencing a NULL pointer.
The test is made before anything in the current frame is altered. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
303
/* Heap-based stand-in for "return" from match(). The current frame is
unlinked and released with pcre_stack_free. If it was the top-level frame
(no previous frame), this is a real return from the function; otherwise the
result is placed in rrc and control jumps to HEAP_RETURN. Note that ra is
evaluated only after the frame has been released. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
316
317
318 /* Structure for remembering the local variables in a private frame */
319
320 typedef struct heapframe {
321 struct heapframe *Xprevframe;
322
323 /* Function arguments that may change */
324
325 const uschar *Xeptr;
326 const uschar *Xecode;
327 const uschar *Xmstart;
328 int Xoffset_top;
329 long int Xims;
330 eptrblock *Xeptrb;
331 int Xflags;
332 unsigned int Xrdepth;
333
334 /* Function local variables */
335
336 const uschar *Xcallpat;
337 const uschar *Xcharptr;
338 const uschar *Xdata;
339 const uschar *Xnext;
340 const uschar *Xpp;
341 const uschar *Xprev;
342 const uschar *Xsaved_eptr;
343
344 recursion_info Xnew_recursive;
345
346 BOOL Xcur_is_word;
347 BOOL Xcondition;
348 BOOL Xprev_is_word;
349
350 unsigned long int Xoriginal_ims;
351
352 #ifdef SUPPORT_UCP
353 int Xprop_type;
354 int Xprop_value;
355 int Xprop_fail_result;
356 int Xprop_category;
357 int Xprop_chartype;
358 int Xprop_script;
359 int Xoclength;
360 uschar Xocchars[8];
361 #endif
362
363 int Xctype;
364 unsigned int Xfc;
365 int Xfi;
366 int Xlength;
367 int Xmax;
368 int Xmin;
369 int Xnumber;
370 int Xoffset;
371 int Xop;
372 int Xsave_capture_last;
373 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
374 int Xstacksave[REC_STACK_SAVE_MAX];
375
376 eptrblock Xnewptrb;
377
378 /* Where to jump back to */
379
380 int Xwhere;
381
382 } heapframe;
383
384 #endif
385
386
387 /***************************************************************************
388 ***************************************************************************/
389
390
391
392 /*************************************************
393 * Match from current position *
394 *************************************************/
395
396 /* This function is called recursively in many circumstances. Whenever it
397 returns a negative (error) response, the outer incarnation must also return the
398 same response.
399
400 Performance note: It might be tempting to extract commonly used fields from the
401 md structure (e.g. utf8, end_subject) into individual variables to improve
402 performance. Tests using gcc on a SPARC disproved this; in the first case, it
403 made performance worse.
404
405 Arguments:
406 eptr pointer to current character in subject
407 ecode pointer to current position in compiled code
408 mstart pointer to the current match start position (can be modified
409 by encountering \K)
410 offset_top current top pointer
411 md pointer to "static" info for the match
412 ims current /i, /m, and /s options
413 eptrb pointer to chain of blocks containing eptr at start of
414 brackets - for testing for empty matches
415 flags can contain
416 match_condassert - this is an assertion condition
417 match_cbegroup - this is the start of an unlimited repeat
418 group that can match an empty string
419 rdepth the recursion depth
420
421 Returns: MATCH_MATCH if matched ) these values are >= 0
422 MATCH_NOMATCH if failed to match )
423 a negative PCRE_ERROR_xxx value if aborted by an error condition
424 (e.g. stopped by repeated call or recursion limit)
425 */
426
427 static int
428 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
429 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
430 int flags, unsigned int rdepth)
431 {
432 /* These variables do not need to be preserved over recursion in this function,
433 so they can be ordinary variables in all cases. Mark some of them with
434 "register" because they are used a lot in loops. */
435
436 register int rrc; /* Returns from recursive calls */
437 register int i; /* Used for loops not involving calls to RMATCH() */
438 register unsigned int c; /* Character values not kept over RMATCH() calls */
439 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
440
441 BOOL minimize, possessive; /* Quantifier options */
442
443 /* When recursion is not being used, all "local" variables that have to be
444 preserved over calls to RMATCH() are part of a "frame" which is obtained from
445 heap storage. Set up the top-level frame here; others are obtained from the
446 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
447
448 #ifdef NO_RECURSE
449 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
450 frame->Xprevframe = NULL; /* Marks the top level */
451
452 /* Copy in the original argument variables */
453
454 frame->Xeptr = eptr;
455 frame->Xecode = ecode;
456 frame->Xmstart = mstart;
457 frame->Xoffset_top = offset_top;
458 frame->Xims = ims;
459 frame->Xeptrb = eptrb;
460 frame->Xflags = flags;
461 frame->Xrdepth = rdepth;
462
463 /* This is where control jumps back to to effect "recursion" */
464
465 HEAP_RECURSE:
466
467 /* Macros make the argument variables come from the current frame */
468
469 #define eptr frame->Xeptr
470 #define ecode frame->Xecode
471 #define mstart frame->Xmstart
472 #define offset_top frame->Xoffset_top
473 #define ims frame->Xims
474 #define eptrb frame->Xeptrb
475 #define flags frame->Xflags
476 #define rdepth frame->Xrdepth
477
478 /* Ditto for the local variables */
479
480 #ifdef SUPPORT_UTF8
481 #define charptr frame->Xcharptr
482 #endif
483 #define callpat frame->Xcallpat
484 #define data frame->Xdata
485 #define next frame->Xnext
486 #define pp frame->Xpp
487 #define prev frame->Xprev
488 #define saved_eptr frame->Xsaved_eptr
489
490 #define new_recursive frame->Xnew_recursive
491
492 #define cur_is_word frame->Xcur_is_word
493 #define condition frame->Xcondition
494 #define prev_is_word frame->Xprev_is_word
495
496 #define original_ims frame->Xoriginal_ims
497
498 #ifdef SUPPORT_UCP
499 #define prop_type frame->Xprop_type
500 #define prop_value frame->Xprop_value
501 #define prop_fail_result frame->Xprop_fail_result
502 #define prop_category frame->Xprop_category
503 #define prop_chartype frame->Xprop_chartype
504 #define prop_script frame->Xprop_script
505 #define oclength frame->Xoclength
506 #define occhars frame->Xocchars
507 #endif
508
509 #define ctype frame->Xctype
510 #define fc frame->Xfc
511 #define fi frame->Xfi
512 #define length frame->Xlength
513 #define max frame->Xmax
514 #define min frame->Xmin
515 #define number frame->Xnumber
516 #define offset frame->Xoffset
517 #define op frame->Xop
518 #define save_capture_last frame->Xsave_capture_last
519 #define save_offset1 frame->Xsave_offset1
520 #define save_offset2 frame->Xsave_offset2
521 #define save_offset3 frame->Xsave_offset3
522 #define stacksave frame->Xstacksave
523
524 #define newptrb frame->Xnewptrb
525
526 /* When recursion is being used, local variables are allocated on the stack and
527 get preserved during recursion in the normal way. In this environment, fi and
528 i, and fc and c, can be the same variables. */
529
530 #else /* NO_RECURSE not defined */
531 #define fi i
532 #define fc c
533
534
535 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
536 const uschar *charptr; /* in small blocks of the code. My normal */
537 #endif /* style of coding would have declared */
538 const uschar *callpat; /* them within each of those blocks. */
539 const uschar *data; /* However, in order to accommodate the */
540 const uschar *next; /* version of this code that uses an */
541 USPTR pp; /* external "stack" implemented on the */
542 const uschar *prev; /* heap, it is easier to declare them all */
543 USPTR saved_eptr; /* here, so the declarations can be cut */
544 /* out in a block. The only declarations */
545 recursion_info new_recursive; /* within blocks below are for variables */
546 /* that do not have to be preserved over */
547 BOOL cur_is_word; /* a recursive call to RMATCH(). */
548 BOOL condition;
549 BOOL prev_is_word;
550
551 unsigned long int original_ims;
552
553 #ifdef SUPPORT_UCP
554 int prop_type;
555 int prop_value;
556 int prop_fail_result;
557 int prop_category;
558 int prop_chartype;
559 int prop_script;
560 int oclength;
561 uschar occhars[8];
562 #endif
563
564 int ctype;
565 int length;
566 int max;
567 int min;
568 int number;
569 int offset;
570 int op;
571 int save_capture_last;
572 int save_offset1, save_offset2, save_offset3;
573 int stacksave[REC_STACK_SAVE_MAX];
574
575 eptrblock newptrb;
576 #endif /* NO_RECURSE */
577
578 /* These statements are here to stop the compiler complaining about unitialized
579 variables. */
580
581 #ifdef SUPPORT_UCP
582 prop_value = 0;
583 prop_fail_result = 0;
584 #endif
585
586
587 /* This label is used for tail recursion, which is used in a few cases even
588 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
589 used. Thanks to Ian Taylor for noticing this possibility and sending the
590 original patch. */
591
592 TAIL_RECURSE:
593
594 /* OK, now we can get on with the real code of the function. Recursive calls
595 are specified by the macro RMATCH and RRETURN is used to return. When
596 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
597 and a "return", respectively (possibly with some debugging if DEBUG is
598 defined). However, RMATCH isn't like a function call because it's quite a
599 complicated macro. It has to be used in one particular way. This shouldn't,
600 however, impact performance when true recursion is being used. */
601
602 #ifdef SUPPORT_UTF8
603 utf8 = md->utf8; /* Local copy of the flag */
604 #else
605 utf8 = FALSE;
606 #endif
607
608 /* First check that we haven't called match() too many times, or that we
609 haven't exceeded the recursive call limit. */
610
611 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
612 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
613
614 original_ims = ims; /* Save for resetting on ')' */
615
616 /* At the start of a group with an unlimited repeat that may match an empty
617 string, the match_cbegroup flag is set. When this is the case, add the current
618 subject pointer to the chain of such remembered pointers, to be checked when we
619 hit the closing ket, in order to break infinite loops that match no characters.
620 When match() is called in other circumstances, don't add to the chain. The
621 match_cbegroup flag must NOT be used with tail recursion, because the memory
622 block that is used is on the stack, so a new one may be required for each
623 match(). */
624
625 if ((flags & match_cbegroup) != 0)
626 {
627 newptrb.epb_saved_eptr = eptr;
628 newptrb.epb_prev = eptrb;
629 eptrb = &newptrb;
630 }
631
632 /* Now start processing the opcodes. */
633
634 for (;;)
635 {
636 minimize = possessive = FALSE;
637 op = *ecode;
638
639 /* For partial matching, remember if we ever hit the end of the subject after
640 matching at least one subject character. */
641
642 if (md->partial &&
643 eptr >= md->end_subject &&
644 eptr > mstart)
645 md->hitend = TRUE;
646
647 switch(op)
648 {
649 case OP_FAIL:
650 RRETURN(MATCH_NOMATCH);
651
652 case OP_PRUNE:
653 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
654 ims, eptrb, flags, RM51);
655 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
656 RRETURN(MATCH_PRUNE);
657
658 case OP_COMMIT:
659 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
660 ims, eptrb, flags, RM52);
661 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
662 RRETURN(MATCH_COMMIT);
663
664 case OP_SKIP:
665 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
666 ims, eptrb, flags, RM53);
667 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
668 md->start_match_ptr = eptr; /* Pass back current position */
669 RRETURN(MATCH_SKIP);
670
671 case OP_THEN:
672 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
673 ims, eptrb, flags, RM54);
674 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
675 RRETURN(MATCH_THEN);
676
677 /* Handle a capturing bracket. If there is space in the offset vector, save
678 the current subject position in the working slot at the top of the vector.
679 We mustn't change the current values of the data slot, because they may be
680 set from a previous iteration of this group, and be referred to by a
681 reference inside the group.
682
683 If the bracket fails to match, we need to restore this value and also the
684 values of the final offsets, in case they were set by a previous iteration
685 of the same bracket.
686
687 If there isn't enough space in the offset vector, treat this as if it were
688 a non-capturing bracket. Don't worry about setting the flag for the error
689 case here; that is handled in the code for KET. */
690
691 case OP_CBRA:
692 case OP_SCBRA:
693 number = GET2(ecode, 1+LINK_SIZE);
694 offset = number << 1;
695
696 #ifdef DEBUG
697 printf("start bracket %d\n", number);
698 printf("subject=");
699 pchars(eptr, 16, TRUE, md);
700 printf("\n");
701 #endif
702
703 if (offset < md->offset_max)
704 {
705 save_offset1 = md->offset_vector[offset];
706 save_offset2 = md->offset_vector[offset+1];
707 save_offset3 = md->offset_vector[md->offset_end - number];
708 save_capture_last = md->capture_last;
709
710 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
711 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
712
713 flags = (op == OP_SCBRA)? match_cbegroup : 0;
714 do
715 {
716 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
717 ims, eptrb, flags, RM1);
718 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
719 md->capture_last = save_capture_last;
720 ecode += GET(ecode, 1);
721 }
722 while (*ecode == OP_ALT);
723
724 DPRINTF(("bracket %d failed\n", number));
725
726 md->offset_vector[offset] = save_offset1;
727 md->offset_vector[offset+1] = save_offset2;
728 md->offset_vector[md->offset_end - number] = save_offset3;
729
730 RRETURN(MATCH_NOMATCH);
731 }
732
733 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
734 as a non-capturing bracket. */
735
736 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
737 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
738
739 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
740
741 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
742 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
743
744 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
745 final alternative within the brackets, we would return the result of a
746 recursive call to match() whatever happened. We can reduce stack usage by
747 turning this into a tail recursion, except in the case when match_cbegroup
748 is set.*/
749
750 case OP_BRA:
751 case OP_SBRA:
752 DPRINTF(("start non-capturing bracket\n"));
753 flags = (op >= OP_SBRA)? match_cbegroup : 0;
754 for (;;)
755 {
756 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
757 {
758 if (flags == 0) /* Not a possibly empty group */
759 {
760 ecode += _pcre_OP_lengths[*ecode];
761 DPRINTF(("bracket 0 tail recursion\n"));
762 goto TAIL_RECURSE;
763 }
764
765 /* Possibly empty group; can't use tail recursion. */
766
767 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
768 eptrb, flags, RM48);
769 RRETURN(rrc);
770 }
771
772 /* For non-final alternatives, continue the loop for a NOMATCH result;
773 otherwise return. */
774
775 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
776 eptrb, flags, RM2);
777 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
778 ecode += GET(ecode, 1);
779 }
780 /* Control never reaches here. */
781
782 /* Conditional group: compilation checked that there are no more than
783 two branches. If the condition is false, skipping the first branch takes us
784 past the end if there is only one branch, but that's OK because that is
785 exactly what going to the ket would do. As there is only one branch to be
786 obeyed, we can use tail recursion to avoid using another stack frame. */
787
788 case OP_COND:
789 case OP_SCOND:
790 /* Because of the way auto-callout works during compile, a callout item is
791 inserted between OP_COND and an assertion condition. */
792
793 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
794 {
795 if (pcre_callout != NULL)
796 {
797 pcre_callout_block cb;
798 cb.version = 1; /* Version 1 of the callout block */
799 cb.callout_number = ecode[LINK_SIZE+2];
800 cb.offset_vector = md->offset_vector;
801 cb.subject = (PCRE_SPTR)md->start_subject;
802 cb.subject_length = md->end_subject - md->start_subject;
803 cb.start_match = mstart - md->start_subject;
804 cb.current_position = eptr - md->start_subject;
805 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
806 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
807 cb.capture_top = offset_top/2;
808 cb.capture_last = md->capture_last;
809 cb.callout_data = md->callout_data;
810 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
811 if (rrc < 0) RRETURN(rrc);
812 }
813 ecode += _pcre_OP_lengths[OP_CALLOUT];
814 }
815
816 /* Now see what the actual condition is */
817
818 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
819 {
820 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
821 condition = md->recursive != NULL &&
822 (offset == RREF_ANY || offset == md->recursive->group_num);
823 ecode += condition? 3 : GET(ecode, 1);
824 }
825
826 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
827 {
828 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
829 condition = offset < offset_top && md->offset_vector[offset] >= 0;
830 ecode += condition? 3 : GET(ecode, 1);
831 }
832
833 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
834 {
835 condition = FALSE;
836 ecode += GET(ecode, 1);
837 }
838
839 /* The condition is an assertion. Call match() to evaluate it - setting
840 the final argument match_condassert causes it to stop at the end of an
841 assertion. */
842
843 else
844 {
845 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
846 match_condassert, RM3);
847 if (rrc == MATCH_MATCH)
848 {
849 condition = TRUE;
850 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
851 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
852 }
853 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
854 {
855 RRETURN(rrc); /* Need braces because of following else */
856 }
857 else
858 {
859 condition = FALSE;
860 ecode += GET(ecode, 1);
861 }
862 }
863
864 /* We are now at the branch that is to be obeyed. As there is only one,
865 we can use tail recursion to avoid using another stack frame, except when
866 match_cbegroup is required for an unlimited repeat of a possibly empty
867 group. If the second alternative doesn't exist, we can just plough on. */
868
869 if (condition || *ecode == OP_ALT)
870 {
871 ecode += 1 + LINK_SIZE;
872 if (op == OP_SCOND) /* Possibly empty group */
873 {
874 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
875 RRETURN(rrc);
876 }
877 else /* Group must match something */
878 {
879 flags = 0;
880 goto TAIL_RECURSE;
881 }
882 }
883 else /* Condition false & no 2nd alternative */
884 {
885 ecode += 1 + LINK_SIZE;
886 }
887 break;
888
889
890 /* End of the pattern, either real or forced. If we are in a top-level
891 recursion, we should restore the offsets appropriately and continue from
892 after the call. */
893
894 case OP_ACCEPT:
895 case OP_END:
896 if (md->recursive != NULL && md->recursive->group_num == 0)
897 {
898 recursion_info *rec = md->recursive;
899 DPRINTF(("End of pattern in a (?0) recursion\n"));
900 md->recursive = rec->prevrec;
901 memmove(md->offset_vector, rec->offset_save,
902 rec->saved_max * sizeof(int));
903 mstart = rec->save_start;
904 ims = original_ims;
905 ecode = rec->after_call;
906 break;
907 }
908
909 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
910 string - backtracking will then try other alternatives, if any. */
911
912 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
913 md->end_match_ptr = eptr; /* Record where we ended */
914 md->end_offset_top = offset_top; /* and how many extracts were taken */
915 md->start_match_ptr = mstart; /* and the start (\K can modify) */
916 RRETURN(MATCH_MATCH);
917
918 /* Change option settings */
919
920 case OP_OPT:
921 ims = ecode[1];
922 ecode += 2;
923 DPRINTF(("ims set to %02lx\n", ims));
924 break;
925
926 /* Assertion brackets. Check the alternative branches in turn - the
927 matching won't pass the KET for an assertion. If any one branch matches,
928 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
929 start of each branch to move the current point backwards, so the code at
930 this level is identical to the lookahead case. */
931
932 case OP_ASSERT:
933 case OP_ASSERTBACK:
934 do
935 {
936 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
937 RM4);
938 if (rrc == MATCH_MATCH) break;
939 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
940 ecode += GET(ecode, 1);
941 }
942 while (*ecode == OP_ALT);
943 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
944
945 /* If checking an assertion for a condition, return MATCH_MATCH. */
946
947 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
948
949 /* Continue from after the assertion, updating the offsets high water
950 mark, since extracts may have been taken during the assertion. */
951
952 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
953 ecode += 1 + LINK_SIZE;
954 offset_top = md->end_offset_top;
955 continue;
956
957 /* Negative assertion: all branches must fail to match */
958
959 case OP_ASSERT_NOT:
960 case OP_ASSERTBACK_NOT:
961 do
962 {
963 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
964 RM5);
965 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
966 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
967 ecode += GET(ecode,1);
968 }
969 while (*ecode == OP_ALT);
970
971 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
972
973 ecode += 1 + LINK_SIZE;
974 continue;
975
976 /* Move the subject pointer back. This occurs only at the start of
977 each branch of a lookbehind assertion. If we are too close to the start to
978 move back, this match function fails. When working with UTF-8 we move
979 back a number of characters, not bytes. */
980
981 case OP_REVERSE:
982 #ifdef SUPPORT_UTF8
983 if (utf8)
984 {
985 i = GET(ecode, 1);
986 while (i-- > 0)
987 {
988 eptr--;
989 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
990 BACKCHAR(eptr);
991 }
992 }
993 else
994 #endif
995
996 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
997
998 {
999 eptr -= GET(ecode, 1);
1000 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1001 }
1002
1003 /* Skip to next op code */
1004
1005 ecode += 1 + LINK_SIZE;
1006 break;
1007
1008 /* The callout item calls an external function, if one is provided, passing
1009 details of the match so far. This is mainly for debugging, though the
1010 function is able to force a failure. */
1011
1012 case OP_CALLOUT:
1013 if (pcre_callout != NULL)
1014 {
1015 pcre_callout_block cb;
1016 cb.version = 1; /* Version 1 of the callout block */
1017 cb.callout_number = ecode[1];
1018 cb.offset_vector = md->offset_vector;
1019 cb.subject = (PCRE_SPTR)md->start_subject;
1020 cb.subject_length = md->end_subject - md->start_subject;
1021 cb.start_match = mstart - md->start_subject;
1022 cb.current_position = eptr - md->start_subject;
1023 cb.pattern_position = GET(ecode, 2);
1024 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1025 cb.capture_top = offset_top/2;
1026 cb.capture_last = md->capture_last;
1027 cb.callout_data = md->callout_data;
1028 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1029 if (rrc < 0) RRETURN(rrc);
1030 }
1031 ecode += 2 + 2*LINK_SIZE;
1032 break;
1033
1034 /* Recursion either matches the current regex, or some subexpression. The
1035 offset data is the offset to the starting bracket from the start of the
1036 whole pattern. (This is so that it works from duplicated subpatterns.)
1037
1038 If there are any capturing brackets started but not finished, we have to
1039 save their starting points and reinstate them after the recursion. However,
1040 we don't know how many such there are (offset_top records the completed
1041 total) so we just have to save all the potential data. There may be up to
1042 65535 such values, which is too large to put on the stack, but using malloc
1043 for small numbers seems expensive. As a compromise, the stack is used when
1044 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1045 is used. If the malloc fails, the offsets cannot be saved, so the
1046 recursion is not attempted and PCRE_ERROR_NOMEMORY is returned from this
1047 level of match().
1048
1049 There are also other values that have to be saved. We use a chained
1050 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1051 for the original version of this logic. */
1052
1053 case OP_RECURSE:
1054 {
1055 callpat = md->start_code + GET(ecode, 1);
1056 new_recursive.group_num = (callpat == md->start_code)? 0 :
1057 GET2(callpat, 1 + LINK_SIZE);
1058
1059 /* Add to "recursing stack" */
1060
1061 new_recursive.prevrec = md->recursive;
1062 md->recursive = &new_recursive;
1063
1064 /* Find where to continue from afterwards */
1065
1066 ecode += 1 + LINK_SIZE;
1067 new_recursive.after_call = ecode;
1068
1069 /* Now save the offset data. */
1070
1071 new_recursive.saved_max = md->offset_end;
1072 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1073 new_recursive.offset_save = stacksave;
1074 else
1075 {
1076 new_recursive.offset_save =
1077 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1078 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1079 }
1080
1081 memcpy(new_recursive.offset_save, md->offset_vector,
1082 new_recursive.saved_max * sizeof(int));
1083 new_recursive.save_start = mstart;
1084 mstart = eptr;
1085
1086 /* OK, now we can do the recursion. For each top-level alternative we
1087 restore the offset and recursion data. */
1088
1089 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1090 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1091 do
1092 {
1093 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1094 md, ims, eptrb, flags, RM6);
1095 if (rrc == MATCH_MATCH)
1096 {
1097 DPRINTF(("Recursion matched\n"));
1098 md->recursive = new_recursive.prevrec;
1099 if (new_recursive.offset_save != stacksave)
1100 (pcre_free)(new_recursive.offset_save);
1101 RRETURN(MATCH_MATCH);
1102 }
1103 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1104 {
1105 DPRINTF(("Recursion gave error %d\n", rrc));
1106 RRETURN(rrc);
1107 }
1108
1109 md->recursive = &new_recursive;
1110 memcpy(md->offset_vector, new_recursive.offset_save,
1111 new_recursive.saved_max * sizeof(int));
1112 callpat += GET(callpat, 1);
1113 }
1114 while (*callpat == OP_ALT);
1115
1116 DPRINTF(("Recursion didn't match\n"));
1117 md->recursive = new_recursive.prevrec;
1118 if (new_recursive.offset_save != stacksave)
1119 (pcre_free)(new_recursive.offset_save);
1120 RRETURN(MATCH_NOMATCH);
1121 }
1122 /* Control never reaches here */
1123
1124 /* "Once" brackets are like assertion brackets except that after a match,
1125 the point in the subject string is not moved back. Thus there can never be
1126 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1127 Check the alternative branches in turn - the matching won't pass the KET
1128 for this kind of subpattern. If any one branch matches, we carry on as at
1129 the end of a normal bracket, leaving the subject pointer. */
1130
1131 case OP_ONCE:
1132 prev = ecode;
1133 saved_eptr = eptr;
1134
1135 do
1136 {
1137 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1138 if (rrc == MATCH_MATCH) break;
1139 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1140 ecode += GET(ecode,1);
1141 }
1142 while (*ecode == OP_ALT);
1143
1144 /* If we hit the end of the group (which could be repeated), fail */
1145
1146 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1147
1148 /* Continue as from after the assertion, updating the offsets high water
1149 mark, since extracts may have been taken. */
1150
1151 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1152
1153 offset_top = md->end_offset_top;
1154 eptr = md->end_match_ptr;
1155
1156 /* For a non-repeating ket, just continue at this level. This also
1157 happens for a repeating ket if no characters were matched in the group.
1158 This is the forcible breaking of infinite loops as implemented in Perl
1159 5.005. If there is an options reset, it will get obeyed in the normal
1160 course of events. */
1161
1162 if (*ecode == OP_KET || eptr == saved_eptr)
1163 {
1164 ecode += 1+LINK_SIZE;
1165 break;
1166 }
1167
1168 /* The repeating kets try the rest of the pattern or restart from the
1169 preceding bracket, in the appropriate order. The second "call" of match()
1170 uses tail recursion, to avoid using another stack frame. We need to reset
1171 any options that changed within the bracket before re-running it, so
1172 check the next opcode. */
1173
1174 if (ecode[1+LINK_SIZE] == OP_OPT)
1175 {
1176 ims = (ims & ~PCRE_IMS) | ecode[4];
1177 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1178 }
1179
1180 if (*ecode == OP_KETRMIN)
1181 {
1182 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1183 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1184 ecode = prev;
1185 flags = 0;
1186 goto TAIL_RECURSE;
1187 }
1188 else /* OP_KETRMAX */
1189 {
1190 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1191 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1192 ecode += 1 + LINK_SIZE;
1193 flags = 0;
1194 goto TAIL_RECURSE;
1195 }
1196 /* Control never gets here */
1197
1198 /* An alternation is the end of a branch; scan along to find the end of the
1199 bracketed group and go to there. */
1200
1201 case OP_ALT:
1202 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1203 break;
1204
1205 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1206 indicating that it may occur zero times. It may repeat infinitely, or not
1207 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1208 with fixed upper repeat limits are compiled as a number of copies, with the
1209 optional ones preceded by BRAZERO or BRAMINZERO. */
1210
1211 case OP_BRAZERO:
1212 {
1213 next = ecode+1;
1214 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1215 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1216 do next += GET(next,1); while (*next == OP_ALT);
1217 ecode = next + 1 + LINK_SIZE;
1218 }
1219 break;
1220
1221 case OP_BRAMINZERO:
1222 {
1223 next = ecode+1;
1224 do next += GET(next, 1); while (*next == OP_ALT);
1225 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1226 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1227 ecode++;
1228 }
1229 break;
1230
1231 case OP_SKIPZERO:
1232 {
1233 next = ecode+1;
1234 do next += GET(next,1); while (*next == OP_ALT);
1235 ecode = next + 1 + LINK_SIZE;
1236 }
1237 break;
1238
1239 /* End of a group, repeated or non-repeating. */
1240
1241 case OP_KET:
1242 case OP_KETRMIN:
1243 case OP_KETRMAX:
1244 prev = ecode - GET(ecode, 1);
1245
1246 /* If this was a group that remembered the subject start, in order to break
1247 infinite repeats of empty string matches, retrieve the subject start from
1248 the chain. Otherwise, set it to NULL. */
1249
1250 if (*prev >= OP_SBRA)
1251 {
1252 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1253 eptrb = eptrb->epb_prev; /* Backup to previous group */
1254 }
1255 else saved_eptr = NULL;
1256
1257 /* If we are at the end of an assertion group, stop matching and return
1258 MATCH_MATCH, but record the current high water mark for use by positive
1259 assertions. Do this also for the "once" (atomic) groups. */
1260
1261 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1262 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1263 *prev == OP_ONCE)
1264 {
1265 md->end_match_ptr = eptr; /* For ONCE */
1266 md->end_offset_top = offset_top;
1267 RRETURN(MATCH_MATCH);
1268 }
1269
1270 /* For capturing groups we have to check the group number back at the start
1271 and if necessary complete handling an extraction by setting the offsets and
1272 bumping the high water mark. Note that whole-pattern recursion is coded as
1273 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1274 when the OP_END is reached. Other recursion is handled here. */
1275
1276 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1277 {
1278 number = GET2(prev, 1+LINK_SIZE);
1279 offset = number << 1;
1280
1281 #ifdef DEBUG
1282 printf("end bracket %d", number);
1283 printf("\n");
1284 #endif
1285
1286 md->capture_last = number;
1287 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1288 {
1289 md->offset_vector[offset] =
1290 md->offset_vector[md->offset_end - number];
1291 md->offset_vector[offset+1] = eptr - md->start_subject;
1292 if (offset_top <= offset) offset_top = offset + 2;
1293 }
1294
1295 /* Handle a recursively called group. Restore the offsets
1296 appropriately and continue from after the call. */
1297
1298 if (md->recursive != NULL && md->recursive->group_num == number)
1299 {
1300 recursion_info *rec = md->recursive;
1301 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1302 md->recursive = rec->prevrec;
1303 mstart = rec->save_start;
1304 memcpy(md->offset_vector, rec->offset_save,
1305 rec->saved_max * sizeof(int));
1306 ecode = rec->after_call;
1307 ims = original_ims;
1308 break;
1309 }
1310 }
1311
1312 /* For both capturing and non-capturing groups, reset the value of the ims
1313 flags, in case they got changed during the group. */
1314
1315 ims = original_ims;
1316 DPRINTF(("ims reset to %02lx\n", ims));
1317
1318 /* For a non-repeating ket, just continue at this level. This also
1319 happens for a repeating ket if no characters were matched in the group.
1320 This is the forcible breaking of infinite loops as implemented in Perl
1321 5.005. If there is an options reset, it will get obeyed in the normal
1322 course of events. */
1323
1324 if (*ecode == OP_KET || eptr == saved_eptr)
1325 {
1326 ecode += 1 + LINK_SIZE;
1327 break;
1328 }
1329
1330 /* The repeating kets try the rest of the pattern or restart from the
1331 preceding bracket, in the appropriate order. In the second case, we can use
1332 tail recursion to avoid using another stack frame, unless we have an
1333 unlimited repeat of a group that can match an empty string. */
1334
1335 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1336
1337 if (*ecode == OP_KETRMIN)
1338 {
1339 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1340 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1341 if (flags != 0) /* Could match an empty string */
1342 {
1343 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1344 RRETURN(rrc);
1345 }
1346 ecode = prev;
1347 goto TAIL_RECURSE;
1348 }
1349 else /* OP_KETRMAX */
1350 {
1351 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1352 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1353 ecode += 1 + LINK_SIZE;
1354 flags = 0;
1355 goto TAIL_RECURSE;
1356 }
1357 /* Control never gets here */
1358
1359 /* Start of subject unless notbol, or after internal newline if multiline */
1360
1361 case OP_CIRC:
1362 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1363 if ((ims & PCRE_MULTILINE) != 0)
1364 {
1365 if (eptr != md->start_subject &&
1366 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1367 RRETURN(MATCH_NOMATCH);
1368 ecode++;
1369 break;
1370 }
1371 /* ... else fall through */
1372
1373 /* Start of subject assertion */
1374
1375 case OP_SOD:
1376 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1377 ecode++;
1378 break;
1379
1380 /* Start of match assertion */
1381
1382 case OP_SOM:
1383 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1384 ecode++;
1385 break;
1386
1387 /* Reset the start of match point */
1388
1389 case OP_SET_SOM:
1390 mstart = eptr;
1391 ecode++;
1392 break;
1393
1394 /* Assert before internal newline if multiline, or before a terminating
1395 newline unless endonly is set, else end of subject unless noteol is set. */
1396
1397 case OP_DOLL:
1398 if ((ims & PCRE_MULTILINE) != 0)
1399 {
1400 if (eptr < md->end_subject)
1401 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1402 else
1403 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1404 ecode++;
1405 break;
1406 }
1407 else
1408 {
1409 if (md->noteol) RRETURN(MATCH_NOMATCH);
1410 if (!md->endonly)
1411 {
1412 if (eptr != md->end_subject &&
1413 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1414 RRETURN(MATCH_NOMATCH);
1415 ecode++;
1416 break;
1417 }
1418 }
1419 /* ... else fall through for endonly */
1420
1421 /* End of subject assertion (\z) */
1422
1423 case OP_EOD:
1424 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1425 ecode++;
1426 break;
1427
1428 /* End of subject or ending \n assertion (\Z) */
1429
1430 case OP_EODN:
1431 if (eptr != md->end_subject &&
1432 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1433 RRETURN(MATCH_NOMATCH);
1434 ecode++;
1435 break;
1436
1437 /* Word boundary assertions */
1438
1439 case OP_NOT_WORD_BOUNDARY:
1440 case OP_WORD_BOUNDARY:
1441 {
1442
1443 /* Find out if the previous and current characters are "word" characters.
1444 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1445 be "non-word" characters. */
1446
1447 #ifdef SUPPORT_UTF8
1448 if (utf8)
1449 {
1450 if (eptr == md->start_subject) prev_is_word = FALSE; else
1451 {
1452 const uschar *lastptr = eptr - 1;
1453 while((*lastptr & 0xc0) == 0x80) lastptr--;
1454 GETCHAR(c, lastptr);
1455 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1456 }
1457 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1458 {
1459 GETCHAR(c, eptr);
1460 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1461 }
1462 }
1463 else
1464 #endif
1465
1466 /* More streamlined when not in UTF-8 mode */
1467
1468 {
1469 prev_is_word = (eptr != md->start_subject) &&
1470 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1471 cur_is_word = (eptr < md->end_subject) &&
1472 ((md->ctypes[*eptr] & ctype_word) != 0);
1473 }
1474
1475 /* Now see if the situation is what we want */
1476
1477 if ((*ecode++ == OP_WORD_BOUNDARY)?
1478 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1479 RRETURN(MATCH_NOMATCH);
1480 }
1481 break;
1482
1483 /* Match a single character type; inline for speed */
1484
1485 case OP_ANY:
1486 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1487 /* Fall through */
1488
1489 case OP_ALLANY:
1490 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1491 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1492 ecode++;
1493 break;
1494
1495 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1496 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1497
1498 case OP_ANYBYTE:
1499 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1500 ecode++;
1501 break;
1502
1503 case OP_NOT_DIGIT:
1504 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1505 GETCHARINCTEST(c, eptr);
1506 if (
1507 #ifdef SUPPORT_UTF8
1508 c < 256 &&
1509 #endif
1510 (md->ctypes[c] & ctype_digit) != 0
1511 )
1512 RRETURN(MATCH_NOMATCH);
1513 ecode++;
1514 break;
1515
1516 case OP_DIGIT:
1517 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1518 GETCHARINCTEST(c, eptr);
1519 if (
1520 #ifdef SUPPORT_UTF8
1521 c >= 256 ||
1522 #endif
1523 (md->ctypes[c] & ctype_digit) == 0
1524 )
1525 RRETURN(MATCH_NOMATCH);
1526 ecode++;
1527 break;
1528
1529 case OP_NOT_WHITESPACE:
1530 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1531 GETCHARINCTEST(c, eptr);
1532 if (
1533 #ifdef SUPPORT_UTF8
1534 c < 256 &&
1535 #endif
1536 (md->ctypes[c] & ctype_space) != 0
1537 )
1538 RRETURN(MATCH_NOMATCH);
1539 ecode++;
1540 break;
1541
1542 case OP_WHITESPACE:
1543 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1544 GETCHARINCTEST(c, eptr);
1545 if (
1546 #ifdef SUPPORT_UTF8
1547 c >= 256 ||
1548 #endif
1549 (md->ctypes[c] & ctype_space) == 0
1550 )
1551 RRETURN(MATCH_NOMATCH);
1552 ecode++;
1553 break;
1554
1555 case OP_NOT_WORDCHAR:
1556 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1557 GETCHARINCTEST(c, eptr);
1558 if (
1559 #ifdef SUPPORT_UTF8
1560 c < 256 &&
1561 #endif
1562 (md->ctypes[c] & ctype_word) != 0
1563 )
1564 RRETURN(MATCH_NOMATCH);
1565 ecode++;
1566 break;
1567
1568 case OP_WORDCHAR:
1569 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1570 GETCHARINCTEST(c, eptr);
1571 if (
1572 #ifdef SUPPORT_UTF8
1573 c >= 256 ||
1574 #endif
1575 (md->ctypes[c] & ctype_word) == 0
1576 )
1577 RRETURN(MATCH_NOMATCH);
1578 ecode++;
1579 break;
1580
1581 case OP_ANYNL:
1582 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1583 GETCHARINCTEST(c, eptr);
1584 switch(c)
1585 {
1586 default: RRETURN(MATCH_NOMATCH);
1587 case 0x000d:
1588 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1589 break;
1590
1591 case 0x000a:
1592 break;
1593
1594 case 0x000b:
1595 case 0x000c:
1596 case 0x0085:
1597 case 0x2028:
1598 case 0x2029:
1599 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1600 break;
1601 }
1602 ecode++;
1603 break;
1604
1605 case OP_NOT_HSPACE:
1606 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1607 GETCHARINCTEST(c, eptr);
1608 switch(c)
1609 {
1610 default: break;
1611 case 0x09: /* HT */
1612 case 0x20: /* SPACE */
1613 case 0xa0: /* NBSP */
1614 case 0x1680: /* OGHAM SPACE MARK */
1615 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1616 case 0x2000: /* EN QUAD */
1617 case 0x2001: /* EM QUAD */
1618 case 0x2002: /* EN SPACE */
1619 case 0x2003: /* EM SPACE */
1620 case 0x2004: /* THREE-PER-EM SPACE */
1621 case 0x2005: /* FOUR-PER-EM SPACE */
1622 case 0x2006: /* SIX-PER-EM SPACE */
1623 case 0x2007: /* FIGURE SPACE */
1624 case 0x2008: /* PUNCTUATION SPACE */
1625 case 0x2009: /* THIN SPACE */
1626 case 0x200A: /* HAIR SPACE */
1627 case 0x202f: /* NARROW NO-BREAK SPACE */
1628 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1629 case 0x3000: /* IDEOGRAPHIC SPACE */
1630 RRETURN(MATCH_NOMATCH);
1631 }
1632 ecode++;
1633 break;
1634
1635 case OP_HSPACE:
1636 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1637 GETCHARINCTEST(c, eptr);
1638 switch(c)
1639 {
1640 default: RRETURN(MATCH_NOMATCH);
1641 case 0x09: /* HT */
1642 case 0x20: /* SPACE */
1643 case 0xa0: /* NBSP */
1644 case 0x1680: /* OGHAM SPACE MARK */
1645 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1646 case 0x2000: /* EN QUAD */
1647 case 0x2001: /* EM QUAD */
1648 case 0x2002: /* EN SPACE */
1649 case 0x2003: /* EM SPACE */
1650 case 0x2004: /* THREE-PER-EM SPACE */
1651 case 0x2005: /* FOUR-PER-EM SPACE */
1652 case 0x2006: /* SIX-PER-EM SPACE */
1653 case 0x2007: /* FIGURE SPACE */
1654 case 0x2008: /* PUNCTUATION SPACE */
1655 case 0x2009: /* THIN SPACE */
1656 case 0x200A: /* HAIR SPACE */
1657 case 0x202f: /* NARROW NO-BREAK SPACE */
1658 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1659 case 0x3000: /* IDEOGRAPHIC SPACE */
1660 break;
1661 }
1662 ecode++;
1663 break;
1664
1665 case OP_NOT_VSPACE:
1666 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1667 GETCHARINCTEST(c, eptr);
1668 switch(c)
1669 {
1670 default: break;
1671 case 0x0a: /* LF */
1672 case 0x0b: /* VT */
1673 case 0x0c: /* FF */
1674 case 0x0d: /* CR */
1675 case 0x85: /* NEL */
1676 case 0x2028: /* LINE SEPARATOR */
1677 case 0x2029: /* PARAGRAPH SEPARATOR */
1678 RRETURN(MATCH_NOMATCH);
1679 }
1680 ecode++;
1681 break;
1682
1683 case OP_VSPACE:
1684 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1685 GETCHARINCTEST(c, eptr);
1686 switch(c)
1687 {
1688 default: RRETURN(MATCH_NOMATCH);
1689 case 0x0a: /* LF */
1690 case 0x0b: /* VT */
1691 case 0x0c: /* FF */
1692 case 0x0d: /* CR */
1693 case 0x85: /* NEL */
1694 case 0x2028: /* LINE SEPARATOR */
1695 case 0x2029: /* PARAGRAPH SEPARATOR */
1696 break;
1697 }
1698 ecode++;
1699 break;
1700
1701 #ifdef SUPPORT_UCP
1702 /* Check the next character by Unicode property. We will get here only
1703 if the support is in the binary; otherwise a compile-time error occurs. */
1704
1705 case OP_PROP:
1706 case OP_NOTPROP:
1707 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1708 GETCHARINCTEST(c, eptr);
1709 {
1710 const ucd_record *prop = GET_UCD(c);
1711
1712 switch(ecode[1])
1713 {
1714 case PT_ANY:
1715 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1716 break;
1717
1718 case PT_LAMP:
1719 if ((prop->chartype == ucp_Lu ||
1720 prop->chartype == ucp_Ll ||
1721 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1722 RRETURN(MATCH_NOMATCH);
1723 break;
1724
1725 case PT_GC:
1726 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1727 RRETURN(MATCH_NOMATCH);
1728 break;
1729
1730 case PT_PC:
1731 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1732 RRETURN(MATCH_NOMATCH);
1733 break;
1734
1735 case PT_SC:
1736 if ((ecode[2] != prop->script) == (op == OP_PROP))
1737 RRETURN(MATCH_NOMATCH);
1738 break;
1739
1740 default:
1741 RRETURN(PCRE_ERROR_INTERNAL);
1742 }
1743
1744 ecode += 3;
1745 }
1746 break;
1747
1748 /* Match an extended Unicode sequence. We will get here only if the support
1749 is in the binary; otherwise a compile-time error occurs. */
1750
1751 case OP_EXTUNI:
1752 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1753 GETCHARINCTEST(c, eptr);
1754 {
1755 int category = UCD_CATEGORY(c);
1756 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1757 while (eptr < md->end_subject)
1758 {
1759 int len = 1;
1760 if (!utf8) c = *eptr; else
1761 {
1762 GETCHARLEN(c, eptr, len);
1763 }
1764 category = UCD_CATEGORY(c);
1765 if (category != ucp_M) break;
1766 eptr += len;
1767 }
1768 }
1769 ecode++;
1770 break;
1771 #endif
1772
1773
1774 /* Match a back reference, possibly repeatedly. Look past the end of the
1775 item to see if there is repeat information following. The code is similar
1776 to that for character classes, but repeated for efficiency. Then obey
1777 similar code to character type repeats - written out again for speed.
1778 However, if the referenced string is the empty string, always treat
1779 it as matched, any number of times (otherwise there could be infinite
1780 loops). */
1781
1782 case OP_REF:
1783 {
1784 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1785 ecode += 3;
1786
1787 /* If the reference is unset, there are two possibilities:
1788
1789 (a) In the default, Perl-compatible state, set the length to be longer
1790 than the amount of subject left; this ensures that every attempt at a
1791 match fails. We can't just fail here, because of the possibility of
1792 quantifiers with zero minima.
1793
1794 (b) If the JavaScript compatibility flag is set, set the length to zero
1795 so that the back reference matches an empty string.
1796
1797 Otherwise, set the length to the length of what was matched by the
1798 referenced subpattern. */
1799
1800 if (offset >= offset_top || md->offset_vector[offset] < 0)
1801 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1802 else
1803 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1804
1805 /* Set up for repetition, or handle the non-repeated case */
1806
1807 switch (*ecode)
1808 {
1809 case OP_CRSTAR:
1810 case OP_CRMINSTAR:
1811 case OP_CRPLUS:
1812 case OP_CRMINPLUS:
1813 case OP_CRQUERY:
1814 case OP_CRMINQUERY:
1815 c = *ecode++ - OP_CRSTAR;
1816 minimize = (c & 1) != 0;
1817 min = rep_min[c]; /* Pick up values from tables; */
1818 max = rep_max[c]; /* zero for max => infinity */
1819 if (max == 0) max = INT_MAX;
1820 break;
1821
1822 case OP_CRRANGE:
1823 case OP_CRMINRANGE:
1824 minimize = (*ecode == OP_CRMINRANGE);
1825 min = GET2(ecode, 1);
1826 max = GET2(ecode, 3);
1827 if (max == 0) max = INT_MAX;
1828 ecode += 5;
1829 break;
1830
1831 default: /* No repeat follows */
1832 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1833 eptr += length;
1834 continue; /* With the main loop */
1835 }
1836
1837 /* If the length of the reference is zero, just continue with the
1838 main loop. */
1839
1840 if (length == 0) continue;
1841
1842 /* First, ensure the minimum number of matches are present. We get back
1843 the length of the reference string explicitly rather than passing the
1844 address of eptr, so that eptr can be a register variable. */
1845
1846 for (i = 1; i <= min; i++)
1847 {
1848 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1849 eptr += length;
1850 }
1851
1852 /* If min = max, continue at the same level without recursion.
1853 They are not both allowed to be zero. */
1854
1855 if (min == max) continue;
1856
1857 /* If minimizing, keep trying and advancing the pointer */
1858
1859 if (minimize)
1860 {
1861 for (fi = min;; fi++)
1862 {
1863 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1864 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1865 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1866 RRETURN(MATCH_NOMATCH);
1867 eptr += length;
1868 }
1869 /* Control never gets here */
1870 }
1871
1872 /* If maximizing, find the longest string and work backwards */
1873
1874 else
1875 {
1876 pp = eptr;
1877 for (i = min; i < max; i++)
1878 {
1879 if (!match_ref(offset, eptr, length, md, ims)) break;
1880 eptr += length;
1881 }
1882 while (eptr >= pp)
1883 {
1884 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1885 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1886 eptr -= length;
1887 }
1888 RRETURN(MATCH_NOMATCH);
1889 }
1890 }
1891 /* Control never gets here */
1892
1893
1894
1895 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1896 used when all the characters in the class have values in the range 0-255,
1897 and either the matching is caseful, or the characters are in the range
1898 0-127 when UTF-8 processing is enabled. The only difference between
1899 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1900 encountered.
1901
1902 First, look past the end of the item to see if there is repeat information
1903 following. Then obey similar code to character type repeats - written out
1904 again for speed. */
1905
1906 case OP_NCLASS:
1907 case OP_CLASS:
1908 {
1909 data = ecode + 1; /* Save for matching */
1910 ecode += 33; /* Advance past the item */
1911
1912 switch (*ecode)
1913 {
1914 case OP_CRSTAR:
1915 case OP_CRMINSTAR:
1916 case OP_CRPLUS:
1917 case OP_CRMINPLUS:
1918 case OP_CRQUERY:
1919 case OP_CRMINQUERY:
1920 c = *ecode++ - OP_CRSTAR;
1921 minimize = (c & 1) != 0;
1922 min = rep_min[c]; /* Pick up values from tables; */
1923 max = rep_max[c]; /* zero for max => infinity */
1924 if (max == 0) max = INT_MAX;
1925 break;
1926
1927 case OP_CRRANGE:
1928 case OP_CRMINRANGE:
1929 minimize = (*ecode == OP_CRMINRANGE);
1930 min = GET2(ecode, 1);
1931 max = GET2(ecode, 3);
1932 if (max == 0) max = INT_MAX;
1933 ecode += 5;
1934 break;
1935
1936 default: /* No repeat follows */
1937 min = max = 1;
1938 break;
1939 }
1940
1941 /* First, ensure the minimum number of matches are present. */
1942
1943 #ifdef SUPPORT_UTF8
1944 /* UTF-8 mode */
1945 if (utf8)
1946 {
1947 for (i = 1; i <= min; i++)
1948 {
1949 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1950 GETCHARINC(c, eptr);
1951 if (c > 255)
1952 {
1953 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1954 }
1955 else
1956 {
1957 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1958 }
1959 }
1960 }
1961 else
1962 #endif
1963 /* Not UTF-8 mode */
1964 {
1965 for (i = 1; i <= min; i++)
1966 {
1967 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1968 c = *eptr++;
1969 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1970 }
1971 }
1972
1973 /* If max == min we can continue with the main loop without the
1974 need to recurse. */
1975
1976 if (min == max) continue;
1977
1978 /* If minimizing, keep testing the rest of the expression and advancing
1979 the pointer while it matches the class. */
1980
1981 if (minimize)
1982 {
1983 #ifdef SUPPORT_UTF8
1984 /* UTF-8 mode */
1985 if (utf8)
1986 {
1987 for (fi = min;; fi++)
1988 {
1989 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1990 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1991 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1992 GETCHARINC(c, eptr);
1993 if (c > 255)
1994 {
1995 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1996 }
1997 else
1998 {
1999 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2000 }
2001 }
2002 }
2003 else
2004 #endif
2005 /* Not UTF-8 mode */
2006 {
2007 for (fi = min;; fi++)
2008 {
2009 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2011 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2012 c = *eptr++;
2013 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2014 }
2015 }
2016 /* Control never gets here */
2017 }
2018
2019 /* If maximizing, find the longest possible run, then work backwards. */
2020
2021 else
2022 {
2023 pp = eptr;
2024
2025 #ifdef SUPPORT_UTF8
2026 /* UTF-8 mode */
2027 if (utf8)
2028 {
2029 for (i = min; i < max; i++)
2030 {
2031 int len = 1;
2032 if (eptr >= md->end_subject) break;
2033 GETCHARLEN(c, eptr, len);
2034 if (c > 255)
2035 {
2036 if (op == OP_CLASS) break;
2037 }
2038 else
2039 {
2040 if ((data[c/8] & (1 << (c&7))) == 0) break;
2041 }
2042 eptr += len;
2043 }
2044 for (;;)
2045 {
2046 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2047 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2048 if (eptr-- == pp) break; /* Stop if tried at original pos */
2049 BACKCHAR(eptr);
2050 }
2051 }
2052 else
2053 #endif
2054 /* Not UTF-8 mode */
2055 {
2056 for (i = min; i < max; i++)
2057 {
2058 if (eptr >= md->end_subject) break;
2059 c = *eptr;
2060 if ((data[c/8] & (1 << (c&7))) == 0) break;
2061 eptr++;
2062 }
2063 while (eptr >= pp)
2064 {
2065 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2066 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2067 eptr--;
2068 }
2069 }
2070
2071 RRETURN(MATCH_NOMATCH);
2072 }
2073 }
2074 /* Control never gets here */
2075
2076
2077 /* Match an extended character class. This opcode is encountered only
2078 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2079 mode, because Unicode properties are supported in non-UTF-8 mode. */
2080
2081 #ifdef SUPPORT_UTF8
2082 case OP_XCLASS:
2083 {
2084 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2085 ecode += GET(ecode, 1); /* Advance past the item */
2086
2087 switch (*ecode)
2088 {
2089 case OP_CRSTAR:
2090 case OP_CRMINSTAR:
2091 case OP_CRPLUS:
2092 case OP_CRMINPLUS:
2093 case OP_CRQUERY:
2094 case OP_CRMINQUERY:
2095 c = *ecode++ - OP_CRSTAR;
2096 minimize = (c & 1) != 0;
2097 min = rep_min[c]; /* Pick up values from tables; */
2098 max = rep_max[c]; /* zero for max => infinity */
2099 if (max == 0) max = INT_MAX;
2100 break;
2101
2102 case OP_CRRANGE:
2103 case OP_CRMINRANGE:
2104 minimize = (*ecode == OP_CRMINRANGE);
2105 min = GET2(ecode, 1);
2106 max = GET2(ecode, 3);
2107 if (max == 0) max = INT_MAX;
2108 ecode += 5;
2109 break;
2110
2111 default: /* No repeat follows */
2112 min = max = 1;
2113 break;
2114 }
2115
2116 /* First, ensure the minimum number of matches are present. */
2117
2118 for (i = 1; i <= min; i++)
2119 {
2120 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2121 GETCHARINCTEST(c, eptr);
2122 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2123 }
2124
2125 /* If max == min we can continue with the main loop without the
2126 need to recurse. */
2127
2128 if (min == max) continue;
2129
2130 /* If minimizing, keep testing the rest of the expression and advancing
2131 the pointer while it matches the class. */
2132
2133 if (minimize)
2134 {
2135 for (fi = min;; fi++)
2136 {
2137 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2138 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2139 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2140 GETCHARINCTEST(c, eptr);
2141 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2142 }
2143 /* Control never gets here */
2144 }
2145
2146 /* If maximizing, find the longest possible run, then work backwards. */
2147
2148 else
2149 {
2150 pp = eptr;
2151 for (i = min; i < max; i++)
2152 {
2153 int len = 1;
2154 if (eptr >= md->end_subject) break;
2155 GETCHARLENTEST(c, eptr, len);
2156 if (!_pcre_xclass(c, data)) break;
2157 eptr += len;
2158 }
2159 for(;;)
2160 {
2161 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2162 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2163 if (eptr-- == pp) break; /* Stop if tried at original pos */
2164 if (utf8) BACKCHAR(eptr);
2165 }
2166 RRETURN(MATCH_NOMATCH);
2167 }
2168
2169 /* Control never gets here */
2170 }
2171 #endif /* End of XCLASS */
2172
2173 /* Match a single character, casefully */
2174
2175 case OP_CHAR:
2176 #ifdef SUPPORT_UTF8
2177 if (utf8)
2178 {
2179 length = 1;
2180 ecode++;
2181 GETCHARLEN(fc, ecode, length);
2182 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2183 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2184 }
2185 else
2186 #endif
2187
2188 /* Non-UTF-8 mode */
2189 {
2190 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2191 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2192 ecode += 2;
2193 }
2194 break;
2195
2196 /* Match a single character, caselessly */
2197
2198 case OP_CHARNC:
2199 #ifdef SUPPORT_UTF8
2200 if (utf8)
2201 {
2202 length = 1;
2203 ecode++;
2204 GETCHARLEN(fc, ecode, length);
2205
2206 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2207
2208 /* If the pattern character's value is < 128, we have only one byte, and
2209 can use the fast lookup table. */
2210
2211 if (fc < 128)
2212 {
2213 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2214 }
2215
2216 /* Otherwise we must pick up the subject character */
2217
2218 else
2219 {
2220 unsigned int dc;
2221 GETCHARINC(dc, eptr);
2222 ecode += length;
2223
2224 /* If we have Unicode property support, we can use it to test the other
2225 case of the character, if there is one. */
2226
2227 if (fc != dc)
2228 {
2229 #ifdef SUPPORT_UCP
2230 if (dc != UCD_OTHERCASE(fc))
2231 #endif
2232 RRETURN(MATCH_NOMATCH);
2233 }
2234 }
2235 }
2236 else
2237 #endif /* SUPPORT_UTF8 */
2238
2239 /* Non-UTF-8 mode */
2240 {
2241 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2242 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2243 ecode += 2;
2244 }
2245 break;
2246
2247 /* Match a single character repeatedly. */
2248
2249 case OP_EXACT:
2250 min = max = GET2(ecode, 1);
2251 ecode += 3;
2252 goto REPEATCHAR;
2253
2254 case OP_POSUPTO:
2255 possessive = TRUE;
2256 /* Fall through */
2257
2258 case OP_UPTO:
2259 case OP_MINUPTO:
2260 min = 0;
2261 max = GET2(ecode, 1);
2262 minimize = *ecode == OP_MINUPTO;
2263 ecode += 3;
2264 goto REPEATCHAR;
2265
2266 case OP_POSSTAR:
2267 possessive = TRUE;
2268 min = 0;
2269 max = INT_MAX;
2270 ecode++;
2271 goto REPEATCHAR;
2272
2273 case OP_POSPLUS:
2274 possessive = TRUE;
2275 min = 1;
2276 max = INT_MAX;
2277 ecode++;
2278 goto REPEATCHAR;
2279
2280 case OP_POSQUERY:
2281 possessive = TRUE;
2282 min = 0;
2283 max = 1;
2284 ecode++;
2285 goto REPEATCHAR;
2286
2287 case OP_STAR:
2288 case OP_MINSTAR:
2289 case OP_PLUS:
2290 case OP_MINPLUS:
2291 case OP_QUERY:
2292 case OP_MINQUERY:
2293 c = *ecode++ - OP_STAR;
2294 minimize = (c & 1) != 0;
2295 min = rep_min[c]; /* Pick up values from tables; */
2296 max = rep_max[c]; /* zero for max => infinity */
2297 if (max == 0) max = INT_MAX;
2298
2299 /* Common code for all repeated single-character matches. We can give
2300 up quickly if there are fewer than the minimum number of characters left in
2301 the subject. */
2302
2303 REPEATCHAR:
2304 #ifdef SUPPORT_UTF8
2305 if (utf8)
2306 {
2307 length = 1;
2308 charptr = ecode;
2309 GETCHARLEN(fc, ecode, length);
2310 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2311 ecode += length;
2312
2313 /* Handle multibyte character matching specially here. There is
2314 support for caseless matching if UCP support is present. */
2315
2316 if (length > 1)
2317 {
2318 #ifdef SUPPORT_UCP
2319 unsigned int othercase;
2320 if ((ims & PCRE_CASELESS) != 0 &&
2321 (othercase = UCD_OTHERCASE(fc)) != fc)
2322 oclength = _pcre_ord2utf8(othercase, occhars);
2323 else oclength = 0;
2324 #endif /* SUPPORT_UCP */
2325
2326 for (i = 1; i <= min; i++)
2327 {
2328 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2329 #ifdef SUPPORT_UCP
2330 /* Need braces because of following else */
2331 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2332 else
2333 {
2334 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2335 eptr += oclength;
2336 }
2337 #else /* without SUPPORT_UCP */
2338 else { RRETURN(MATCH_NOMATCH); }
2339 #endif /* SUPPORT_UCP */
2340 }
2341
2342 if (min == max) continue;
2343
2344 if (minimize)
2345 {
2346 for (fi = min;; fi++)
2347 {
2348 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2349 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2350 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2351 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2352 #ifdef SUPPORT_UCP
2353 /* Need braces because of following else */
2354 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2355 else
2356 {
2357 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2358 eptr += oclength;
2359 }
2360 #else /* without SUPPORT_UCP */
2361 else { RRETURN (MATCH_NOMATCH); }
2362 #endif /* SUPPORT_UCP */
2363 }
2364 /* Control never gets here */
2365 }
2366
2367 else /* Maximize */
2368 {
2369 pp = eptr;
2370 for (i = min; i < max; i++)
2371 {
2372 if (eptr > md->end_subject - length) break;
2373 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2374 #ifdef SUPPORT_UCP
2375 else if (oclength == 0) break;
2376 else
2377 {
2378 if (memcmp(eptr, occhars, oclength) != 0) break;
2379 eptr += oclength;
2380 }
2381 #else /* without SUPPORT_UCP */
2382 else break;
2383 #endif /* SUPPORT_UCP */
2384 }
2385
2386 if (possessive) continue;
2387 for(;;)
2388 {
2389 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2390 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2391 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2392 #ifdef SUPPORT_UCP
2393 eptr--;
2394 BACKCHAR(eptr);
2395 #else /* without SUPPORT_UCP */
2396 eptr -= length;
2397 #endif /* SUPPORT_UCP */
2398 }
2399 }
2400 /* Control never gets here */
2401 }
2402
2403 /* If the length of a UTF-8 character is 1, we fall through here, and
2404 obey the code as for non-UTF-8 characters below, though in this case the
2405 value of fc will always be < 128. */
2406 }
2407 else
2408 #endif /* SUPPORT_UTF8 */
2409
2410 /* When not in UTF-8 mode, load a single-byte character. */
2411 {
2412 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2413 fc = *ecode++;
2414 }
2415
2416 /* The value of fc at this point is always less than 256, though we may or
2417 may not be in UTF-8 mode. The code is duplicated for the caseless and
2418 caseful cases, for speed, since matching characters is likely to be quite
2419 common. First, ensure the minimum number of matches are present. If min =
2420 max, continue at the same level without recursing. Otherwise, if
2421 minimizing, keep trying the rest of the expression and advancing one
2422 matching character if failing, up to the maximum. Alternatively, if
2423 maximizing, find the maximum number of characters and work backwards. */
2424
2425 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2426 max, eptr));
2427
2428 if ((ims & PCRE_CASELESS) != 0)
2429 {
2430 fc = md->lcc[fc];
2431 for (i = 1; i <= min; i++)
2432 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2433 if (min == max) continue;
2434 if (minimize)
2435 {
2436 for (fi = min;; fi++)
2437 {
2438 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2439 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2440 if (fi >= max || eptr >= md->end_subject ||
2441 fc != md->lcc[*eptr++])
2442 RRETURN(MATCH_NOMATCH);
2443 }
2444 /* Control never gets here */
2445 }
2446 else /* Maximize */
2447 {
2448 pp = eptr;
2449 for (i = min; i < max; i++)
2450 {
2451 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2452 eptr++;
2453 }
2454 if (possessive) continue;
2455 while (eptr >= pp)
2456 {
2457 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2458 eptr--;
2459 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2460 }
2461 RRETURN(MATCH_NOMATCH);
2462 }
2463 /* Control never gets here */
2464 }
2465
2466 /* Caseful comparisons (includes all multi-byte characters) */
2467
2468 else
2469 {
2470 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2471 if (min == max) continue;
2472 if (minimize)
2473 {
2474 for (fi = min;; fi++)
2475 {
2476 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2477 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2478 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2479 RRETURN(MATCH_NOMATCH);
2480 }
2481 /* Control never gets here */
2482 }
2483 else /* Maximize */
2484 {
2485 pp = eptr;
2486 for (i = min; i < max; i++)
2487 {
2488 if (eptr >= md->end_subject || fc != *eptr) break;
2489 eptr++;
2490 }
2491 if (possessive) continue;
2492 while (eptr >= pp)
2493 {
2494 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2495 eptr--;
2496 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2497 }
2498 RRETURN(MATCH_NOMATCH);
2499 }
2500 }
2501 /* Control never gets here */
2502
2503 /* Match a negated single one-byte character. The character we are
2504 checking can be multibyte. */
2505
2506 case OP_NOT:
2507 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2508 ecode++;
2509 GETCHARINCTEST(c, eptr);
2510 if ((ims & PCRE_CASELESS) != 0)
2511 {
2512 #ifdef SUPPORT_UTF8
2513 if (c < 256)
2514 #endif
2515 c = md->lcc[c];
2516 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2517 }
2518 else
2519 {
2520 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2521 }
2522 break;
2523
2524 /* Match a negated single one-byte character repeatedly. This is almost a
2525 repeat of the code for a repeated single character, but I haven't found a
2526 nice way of commoning these up that doesn't require a test of the
2527 positive/negative option for each character match. Maybe that wouldn't add
2528 very much to the time taken, but character matching *is* what this is all
2529 about... */
2530
2531 case OP_NOTEXACT:
2532 min = max = GET2(ecode, 1);
2533 ecode += 3;
2534 goto REPEATNOTCHAR;
2535
2536 case OP_NOTUPTO:
2537 case OP_NOTMINUPTO:
2538 min = 0;
2539 max = GET2(ecode, 1);
2540 minimize = *ecode == OP_NOTMINUPTO;
2541 ecode += 3;
2542 goto REPEATNOTCHAR;
2543
2544 case OP_NOTPOSSTAR:
2545 possessive = TRUE;
2546 min = 0;
2547 max = INT_MAX;
2548 ecode++;
2549 goto REPEATNOTCHAR;
2550
2551 case OP_NOTPOSPLUS:
2552 possessive = TRUE;
2553 min = 1;
2554 max = INT_MAX;
2555 ecode++;
2556 goto REPEATNOTCHAR;
2557
2558 case OP_NOTPOSQUERY:
2559 possessive = TRUE;
2560 min = 0;
2561 max = 1;
2562 ecode++;
2563 goto REPEATNOTCHAR;
2564
2565 case OP_NOTPOSUPTO:
2566 possessive = TRUE;
2567 min = 0;
2568 max = GET2(ecode, 1);
2569 ecode += 3;
2570 goto REPEATNOTCHAR;
2571
2572 case OP_NOTSTAR:
2573 case OP_NOTMINSTAR:
2574 case OP_NOTPLUS:
2575 case OP_NOTMINPLUS:
2576 case OP_NOTQUERY:
2577 case OP_NOTMINQUERY:
2578 c = *ecode++ - OP_NOTSTAR;
2579 minimize = (c & 1) != 0;
2580 min = rep_min[c]; /* Pick up values from tables; */
2581 max = rep_max[c]; /* zero for max => infinity */
2582 if (max == 0) max = INT_MAX;
2583
2584 /* Common code for all repeated single-byte matches. We can give up quickly
2585 if there are fewer than the minimum number of bytes left in the
2586 subject. */
2587
2588 REPEATNOTCHAR:
2589 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2590 fc = *ecode++;
2591
2592 /* The code is duplicated for the caseless and caseful cases, for speed,
2593 since matching characters is likely to be quite common. First, ensure the
2594 minimum number of matches are present. If min = max, continue at the same
2595 level without recursing. Otherwise, if minimizing, keep trying the rest of
2596 the expression and advancing one matching character if failing, up to the
2597 maximum. Alternatively, if maximizing, find the maximum number of
2598 characters and work backwards. */
2599
2600 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2601 max, eptr));
2602
2603 if ((ims & PCRE_CASELESS) != 0)
2604 {
2605 fc = md->lcc[fc];
2606
2607 #ifdef SUPPORT_UTF8
2608 /* UTF-8 mode */
2609 if (utf8)
2610 {
2611 register unsigned int d;
2612 for (i = 1; i <= min; i++)
2613 {
2614 GETCHARINC(d, eptr);
2615 if (d < 256) d = md->lcc[d];
2616 if (fc == d) RRETURN(MATCH_NOMATCH);
2617 }
2618 }
2619 else
2620 #endif
2621
2622 /* Not UTF-8 mode */
2623 {
2624 for (i = 1; i <= min; i++)
2625 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2626 }
2627
2628 if (min == max) continue;
2629
2630 if (minimize)
2631 {
2632 #ifdef SUPPORT_UTF8
2633 /* UTF-8 mode */
2634 if (utf8)
2635 {
2636 register unsigned int d;
2637 for (fi = min;; fi++)
2638 {
2639 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2640 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2641 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2642 GETCHARINC(d, eptr);
2643 if (d < 256) d = md->lcc[d];
2644 if (fc == d) RRETURN(MATCH_NOMATCH);
2645
2646 }
2647 }
2648 else
2649 #endif
2650 /* Not UTF-8 mode */
2651 {
2652 for (fi = min;; fi++)
2653 {
2654 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2655 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2656 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2657 RRETURN(MATCH_NOMATCH);
2658 }
2659 }
2660 /* Control never gets here */
2661 }
2662
2663 /* Maximize case */
2664
2665 else
2666 {
2667 pp = eptr;
2668
2669 #ifdef SUPPORT_UTF8
2670 /* UTF-8 mode */
2671 if (utf8)
2672 {
2673 register unsigned int d;
2674 for (i = min; i < max; i++)
2675 {
2676 int len = 1;
2677 if (eptr >= md->end_subject) break;
2678 GETCHARLEN(d, eptr, len);
2679 if (d < 256) d = md->lcc[d];
2680 if (fc == d) break;
2681 eptr += len;
2682 }
2683 if (possessive) continue;
2684 for(;;)
2685 {
2686 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2687 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2688 if (eptr-- == pp) break; /* Stop if tried at original pos */
2689 BACKCHAR(eptr);
2690 }
2691 }
2692 else
2693 #endif
2694 /* Not UTF-8 mode */
2695 {
2696 for (i = min; i < max; i++)
2697 {
2698 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2699 eptr++;
2700 }
2701 if (possessive) continue;
2702 while (eptr >= pp)
2703 {
2704 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2705 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2706 eptr--;
2707 }
2708 }
2709
2710 RRETURN(MATCH_NOMATCH);
2711 }
2712 /* Control never gets here */
2713 }
2714
2715 /* Caseful comparisons */
2716
2717 else
2718 {
2719 #ifdef SUPPORT_UTF8
2720 /* UTF-8 mode */
2721 if (utf8)
2722 {
2723 register unsigned int d;
2724 for (i = 1; i <= min; i++)
2725 {
2726 GETCHARINC(d, eptr);
2727 if (fc == d) RRETURN(MATCH_NOMATCH);
2728 }
2729 }
2730 else
2731 #endif
2732 /* Not UTF-8 mode */
2733 {
2734 for (i = 1; i <= min; i++)
2735 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2736 }
2737
2738 if (min == max) continue;
2739
2740 if (minimize)
2741 {
2742 #ifdef SUPPORT_UTF8
2743 /* UTF-8 mode */
2744 if (utf8)
2745 {
2746 register unsigned int d;
2747 for (fi = min;; fi++)
2748 {
2749 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2750 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2751 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2752 GETCHARINC(d, eptr);
2753 if (fc == d) RRETURN(MATCH_NOMATCH);
2754 }
2755 }
2756 else
2757 #endif
2758 /* Not UTF-8 mode */
2759 {
2760 for (fi = min;; fi++)
2761 {
2762 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2763 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2764 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2765 RRETURN(MATCH_NOMATCH);
2766 }
2767 }
2768 /* Control never gets here */
2769 }
2770
2771 /* Maximize case */
2772
2773 else
2774 {
2775 pp = eptr;
2776
2777 #ifdef SUPPORT_UTF8
2778 /* UTF-8 mode */
2779 if (utf8)
2780 {
2781 register unsigned int d;
2782 for (i = min; i < max; i++)
2783 {
2784 int len = 1;
2785 if (eptr >= md->end_subject) break;
2786 GETCHARLEN(d, eptr, len);
2787 if (fc == d) break;
2788 eptr += len;
2789 }
2790 if (possessive) continue;
2791 for(;;)
2792 {
2793 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2794 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2795 if (eptr-- == pp) break; /* Stop if tried at original pos */
2796 BACKCHAR(eptr);
2797 }
2798 }
2799 else
2800 #endif
2801 /* Not UTF-8 mode */
2802 {
2803 for (i = min; i < max; i++)
2804 {
2805 if (eptr >= md->end_subject || fc == *eptr) break;
2806 eptr++;
2807 }
2808 if (possessive) continue;
2809 while (eptr >= pp)
2810 {
2811 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2812 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2813 eptr--;
2814 }
2815 }
2816
2817 RRETURN(MATCH_NOMATCH);
2818 }
2819 }
2820 /* Control never gets here */
2821
2822 /* Match a single character type repeatedly; several different opcodes
2823 share code. This is very similar to the code for single characters, but we
2824 repeat it in the interests of efficiency. */
2825
2826 case OP_TYPEEXACT:
2827 min = max = GET2(ecode, 1);
2828 minimize = TRUE;
2829 ecode += 3;
2830 goto REPEATTYPE;
2831
2832 case OP_TYPEUPTO:
2833 case OP_TYPEMINUPTO:
2834 min = 0;
2835 max = GET2(ecode, 1);
2836 minimize = *ecode == OP_TYPEMINUPTO;
2837 ecode += 3;
2838 goto REPEATTYPE;
2839
2840 case OP_TYPEPOSSTAR:
2841 possessive = TRUE;
2842 min = 0;
2843 max = INT_MAX;
2844 ecode++;
2845 goto REPEATTYPE;
2846
2847 case OP_TYPEPOSPLUS:
2848 possessive = TRUE;
2849 min = 1;
2850 max = INT_MAX;
2851 ecode++;
2852 goto REPEATTYPE;
2853
2854 case OP_TYPEPOSQUERY:
2855 possessive = TRUE;
2856 min = 0;
2857 max = 1;
2858 ecode++;
2859 goto REPEATTYPE;
2860
2861 case OP_TYPEPOSUPTO:
2862 possessive = TRUE;
2863 min = 0;
2864 max = GET2(ecode, 1);
2865 ecode += 3;
2866 goto REPEATTYPE;
2867
2868 case OP_TYPESTAR:
2869 case OP_TYPEMINSTAR:
2870 case OP_TYPEPLUS:
2871 case OP_TYPEMINPLUS:
2872 case OP_TYPEQUERY:
2873 case OP_TYPEMINQUERY:
2874 c = *ecode++ - OP_TYPESTAR;
2875 minimize = (c & 1) != 0;
2876 min = rep_min[c]; /* Pick up values from tables; */
2877 max = rep_max[c]; /* zero for max => infinity */
2878 if (max == 0) max = INT_MAX;
2879
2880 /* Common code for all repeated single character type matches. Note that
2881 in UTF-8 mode, '.' matches a character of any length, but for the other
2882 character types, the valid characters are all one-byte long. */
2883
2884 REPEATTYPE:
2885 ctype = *ecode++; /* Code for the character type */
2886
2887 #ifdef SUPPORT_UCP
2888 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2889 {
2890 prop_fail_result = ctype == OP_NOTPROP;
2891 prop_type = *ecode++;
2892 prop_value = *ecode++;
2893 }
2894 else prop_type = -1;
2895 #endif
2896
2897 /* First, ensure the minimum number of matches are present. Use inline
2898 code for maximizing the speed, and do the type test once at the start
2899 (i.e. keep it out of the loop). Also we can test that there are at least
2900 the minimum number of bytes before we start. This isn't as effective in
2901 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2902 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2903 and single-bytes. */
2904
2905 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2906 if (min > 0)
2907 {
2908 #ifdef SUPPORT_UCP
2909 if (prop_type >= 0)
2910 {
2911 switch(prop_type)
2912 {
2913 case PT_ANY:
2914 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2915 for (i = 1; i <= min; i++)
2916 {
2917 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2918 GETCHARINCTEST(c, eptr);
2919 }
2920 break;
2921
2922 case PT_LAMP:
2923 for (i = 1; i <= min; i++)
2924 {
2925 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2926 GETCHARINCTEST(c, eptr);
2927 prop_chartype = UCD_CHARTYPE(c);
2928 if ((prop_chartype == ucp_Lu ||
2929 prop_chartype == ucp_Ll ||
2930 prop_chartype == ucp_Lt) == prop_fail_result)
2931 RRETURN(MATCH_NOMATCH);
2932 }
2933 break;
2934
2935 case PT_GC:
2936 for (i = 1; i <= min; i++)
2937 {
2938 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2939 GETCHARINCTEST(c, eptr);
2940 prop_category = UCD_CATEGORY(c);
2941 if ((prop_category == prop_value) == prop_fail_result)
2942 RRETURN(MATCH_NOMATCH);
2943 }
2944 break;
2945
2946 case PT_PC:
2947 for (i = 1; i <= min; i++)
2948 {
2949 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2950 GETCHARINCTEST(c, eptr);
2951 prop_chartype = UCD_CHARTYPE(c);
2952 if ((prop_chartype == prop_value) == prop_fail_result)
2953 RRETURN(MATCH_NOMATCH);
2954 }
2955 break;
2956
2957 case PT_SC:
2958 for (i = 1; i <= min; i++)
2959 {
2960 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2961 GETCHARINCTEST(c, eptr);
2962 prop_script = UCD_SCRIPT(c);
2963 if ((prop_script == prop_value) == prop_fail_result)
2964 RRETURN(MATCH_NOMATCH);
2965 }
2966 break;
2967
2968 default:
2969 RRETURN(PCRE_ERROR_INTERNAL);
2970 }
2971 }
2972
2973 /* Match extended Unicode sequences. We will get here only if the
2974 support is in the binary; otherwise a compile-time error occurs. */
2975
2976 else if (ctype == OP_EXTUNI)
2977 {
2978 for (i = 1; i <= min; i++)
2979 {
2980 GETCHARINCTEST(c, eptr);
2981 prop_category = UCD_CATEGORY(c);
2982 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2983 while (eptr < md->end_subject)
2984 {
2985 int len = 1;
2986 if (!utf8) c = *eptr; else
2987 {
2988 GETCHARLEN(c, eptr, len);
2989 }
2990 prop_category = UCD_CATEGORY(c);
2991 if (prop_category != ucp_M) break;
2992 eptr += len;
2993 }
2994 }
2995 }
2996
2997 else
2998 #endif /* SUPPORT_UCP */
2999
3000 /* Handle all other cases when the coding is UTF-8 */
3001
3002 #ifdef SUPPORT_UTF8
3003 if (utf8) switch(ctype)
3004 {
3005 case OP_ANY:
3006 for (i = 1; i <= min; i++)
3007 {
3008 if (eptr >= md->end_subject || IS_NEWLINE(eptr))
3009 RRETURN(MATCH_NOMATCH);
3010 eptr++;
3011 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3012 }
3013 break;
3014
3015 case OP_ALLANY:
3016 for (i = 1; i <= min; i++)
3017 {
3018 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3019 eptr++;
3020 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3021 }
3022 break;
3023
3024 case OP_ANYBYTE:
3025 eptr += min;
3026 break;
3027
3028 case OP_ANYNL:
3029 for (i = 1; i <= min; i++)
3030 {
3031 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3032 GETCHARINC(c, eptr);
3033 switch(c)
3034 {
3035 default: RRETURN(MATCH_NOMATCH);
3036 case 0x000d:
3037 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3038 break;
3039
3040 case 0x000a:
3041 break;
3042
3043 case 0x000b:
3044 case 0x000c:
3045 case 0x0085:
3046 case 0x2028:
3047 case 0x2029:
3048 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3049 break;
3050 }
3051 }
3052 break;
3053
3054 case OP_NOT_HSPACE:
3055 for (i = 1; i <= min; i++)
3056 {
3057 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3058 GETCHARINC(c, eptr);
3059 switch(c)
3060 {
3061 default: break;
3062 case 0x09: /* HT */
3063 case 0x20: /* SPACE */
3064 case 0xa0: /* NBSP */
3065 case 0x1680: /* OGHAM SPACE MARK */
3066 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3067 case 0x2000: /* EN QUAD */
3068 case 0x2001: /* EM QUAD */
3069 case 0x2002: /* EN SPACE */
3070 case 0x2003: /* EM SPACE */
3071 case 0x2004: /* THREE-PER-EM SPACE */
3072 case 0x2005: /* FOUR-PER-EM SPACE */
3073 case 0x2006: /* SIX-PER-EM SPACE */
3074 case 0x2007: /* FIGURE SPACE */
3075 case 0x2008: /* PUNCTUATION SPACE */
3076 case 0x2009: /* THIN SPACE */
3077 case 0x200A: /* HAIR SPACE */
3078 case 0x202f: /* NARROW NO-BREAK SPACE */
3079 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3080 case 0x3000: /* IDEOGRAPHIC SPACE */
3081 RRETURN(MATCH_NOMATCH);
3082 }
3083 }
3084 break;
3085
3086 case OP_HSPACE:
3087 for (i = 1; i <= min; i++)
3088 {
3089 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3090 GETCHARINC(c, eptr);
3091 switch(c)
3092 {
3093 default: RRETURN(MATCH_NOMATCH);
3094 case 0x09: /* HT */
3095 case 0x20: /* SPACE */
3096 case 0xa0: /* NBSP */
3097 case 0x1680: /* OGHAM SPACE MARK */
3098 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3099 case 0x2000: /* EN QUAD */
3100 case 0x2001: /* EM QUAD */
3101 case 0x2002: /* EN SPACE */
3102 case 0x2003: /* EM SPACE */
3103 case 0x2004: /* THREE-PER-EM SPACE */
3104 case 0x2005: /* FOUR-PER-EM SPACE */
3105 case 0x2006: /* SIX-PER-EM SPACE */
3106 case 0x2007: /* FIGURE SPACE */
3107 case 0x2008: /* PUNCTUATION SPACE */
3108 case 0x2009: /* THIN SPACE */
3109 case 0x200A: /* HAIR SPACE */
3110 case 0x202f: /* NARROW NO-BREAK SPACE */
3111 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3112 case 0x3000: /* IDEOGRAPHIC SPACE */
3113 break;
3114 }
3115 }
3116 break;
3117
3118 case OP_NOT_VSPACE:
3119 for (i = 1; i <= min; i++)
3120 {
3121 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3122 GETCHARINC(c, eptr);
3123 switch(c)
3124 {
3125 default: break;
3126 case 0x0a: /* LF */
3127 case 0x0b: /* VT */
3128 case 0x0c: /* FF */
3129 case 0x0d: /* CR */
3130 case 0x85: /* NEL */
3131 case 0x2028: /* LINE SEPARATOR */
3132 case 0x2029: /* PARAGRAPH SEPARATOR */
3133 RRETURN(MATCH_NOMATCH);
3134 }
3135 }
3136 break;
3137
3138 case OP_VSPACE:
3139 for (i = 1; i <= min; i++)
3140 {
3141 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3142 GETCHARINC(c, eptr);
3143 switch(c)
3144 {
3145 default: RRETURN(MATCH_NOMATCH);
3146 case 0x0a: /* LF */
3147 case 0x0b: /* VT */
3148 case 0x0c: /* FF */
3149 case 0x0d: /* CR */
3150 case 0x85: /* NEL */
3151 case 0x2028: /* LINE SEPARATOR */
3152 case 0x2029: /* PARAGRAPH SEPARATOR */
3153 break;
3154 }
3155 }
3156 break;
3157
3158 case OP_NOT_DIGIT:
3159 for (i = 1; i <= min; i++)
3160 {
3161 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3162 GETCHARINC(c, eptr);
3163 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3164 RRETURN(MATCH_NOMATCH);
3165 }
3166 break;
3167
3168 case OP_DIGIT:
3169 for (i = 1; i <= min; i++)
3170 {
3171 if (eptr >= md->end_subject ||
3172 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3173 RRETURN(MATCH_NOMATCH);
3174 /* No need to skip more bytes - we know it's a 1-byte character */
3175 }
3176 break;
3177
3178 case OP_NOT_WHITESPACE:
3179 for (i = 1; i <= min; i++)
3180 {
3181 if (eptr >= md->end_subject ||
3182 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3183 RRETURN(MATCH_NOMATCH);
3184 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3185 }
3186 break;
3187
3188 case OP_WHITESPACE:
3189 for (i = 1; i <= min; i++)
3190 {
3191 if (eptr >= md->end_subject ||
3192 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3193 RRETURN(MATCH_NOMATCH);
3194 /* No need to skip more bytes - we know it's a 1-byte character */
3195 }
3196 break;
3197
3198 case OP_NOT_WORDCHAR:
3199 for (i = 1; i <= min; i++)
3200 {
3201 if (eptr >= md->end_subject ||
3202 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3203 RRETURN(MATCH_NOMATCH);
3204 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3205 }
3206 break;
3207
3208 case OP_WORDCHAR:
3209 for (i = 1; i <= min; i++)
3210 {
3211 if (eptr >= md->end_subject ||
3212 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3213 RRETURN(MATCH_NOMATCH);
3214 /* No need to skip more bytes - we know it's a 1-byte character */
3215 }
3216 break;
3217
3218 default:
3219 RRETURN(PCRE_ERROR_INTERNAL);
3220 } /* End switch(ctype) */
3221
3222 else
3223 #endif /* SUPPORT_UTF8 */
3224
3225 /* Code for the non-UTF-8 case for minimum matching of operators other
3226 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3227 number of bytes present, as this was tested above. */
3228
3229 switch(ctype)
3230 {
3231 case OP_ANY:
3232 for (i = 1; i <= min; i++)
3233 {
3234 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3235 eptr++;
3236 }
3237 break;
3238
3239 case OP_ALLANY:
3240 eptr += min;
3241 break;
3242
3243 case OP_ANYBYTE:
3244 eptr += min;
3245 break;
3246
3247 /* Because of the CRLF case, we can't assume the minimum number of
3248 bytes are present in this case. */
3249
3250 case OP_ANYNL:
3251 for (i = 1; i <= min; i++)
3252 {
3253 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3254 switch(*eptr++)
3255 {
3256 default: RRETURN(MATCH_NOMATCH);
3257 case 0x000d:
3258 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3259 break;
3260 case 0x000a:
3261 break;
3262
3263 case 0x000b:
3264 case 0x000c:
3265 case 0x0085:
3266 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3267 break;
3268 }
3269 }
3270 break;
3271
3272 case OP_NOT_HSPACE:
3273 for (i = 1; i <= min; i++)
3274 {
3275 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3276 switch(*eptr++)
3277 {
3278 default: break;
3279 case 0x09: /* HT */
3280 case 0x20: /* SPACE */
3281 case 0xa0: /* NBSP */
3282 RRETURN(MATCH_NOMATCH);
3283 }
3284 }
3285 break;
3286
3287 case OP_HSPACE:
3288 for (i = 1; i <= min; i++)
3289 {
3290 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3291 switch(*eptr++)
3292 {
3293 default: RRETURN(MATCH_NOMATCH);
3294 case 0x09: /* HT */
3295 case 0x20: /* SPACE */
3296 case 0xa0: /* NBSP */
3297 break;
3298 }
3299 }
3300 break;
3301
3302 case OP_NOT_VSPACE:
3303 for (i = 1; i <= min; i++)
3304 {
3305 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3306 switch(*eptr++)
3307 {
3308 default: break;
3309 case 0x0a: /* LF */
3310 case 0x0b: /* VT */
3311 case 0x0c: /* FF */
3312 case 0x0d: /* CR */
3313 case 0x85: /* NEL */
3314 RRETURN(MATCH_NOMATCH);
3315 }
3316 }
3317 break;
3318
3319 case OP_VSPACE:
3320 for (i = 1; i <= min; i++)
3321 {
3322 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3323 switch(*eptr++)
3324 {
3325 default: RRETURN(MATCH_NOMATCH);
3326 case 0x0a: /* LF */
3327 case 0x0b: /* VT */
3328 case 0x0c: /* FF */
3329 case 0x0d: /* CR */
3330 case 0x85: /* NEL */
3331 break;
3332 }
3333 }
3334 break;
3335
3336 case OP_NOT_DIGIT:
3337 for (i = 1; i <= min; i++)
3338 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3339 break;
3340
3341 case OP_DIGIT:
3342 for (i = 1; i <= min; i++)
3343 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3344 break;
3345
3346 case OP_NOT_WHITESPACE:
3347 for (i = 1; i <= min; i++)
3348 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3349 break;
3350
3351 case OP_WHITESPACE:
3352 for (i = 1; i <= min; i++)
3353 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3354 break;
3355
3356 case OP_NOT_WORDCHAR:
3357 for (i = 1; i <= min; i++)
3358 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3359 RRETURN(MATCH_NOMATCH);
3360 break;
3361
3362 case OP_WORDCHAR:
3363 for (i = 1; i <= min; i++)
3364 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3365 RRETURN(MATCH_NOMATCH);
3366 break;
3367
3368 default:
3369 RRETURN(PCRE_ERROR_INTERNAL);
3370 }
3371 }
3372
3373 /* If min = max, continue at the same level without recursing */
3374
3375 if (min == max) continue;
3376
3377 /* If minimizing, we have to test the rest of the pattern before each
3378 subsequent match. Again, separate the UTF-8 case for speed, and also
3379 separate the UCP cases. */
3380
3381 if (minimize)
3382 {
3383 #ifdef SUPPORT_UCP
3384 if (prop_type >= 0)
3385 {
3386 switch(prop_type)
3387 {
3388 case PT_ANY:
3389 for (fi = min;; fi++)
3390 {
3391 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3392 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3393 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3394 GETCHARINC(c, eptr);
3395 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3396 }
3397 /* Control never gets here */
3398
3399 case PT_LAMP:
3400 for (fi = min;; fi++)
3401 {
3402 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3403 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3404 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3405 GETCHARINC(c, eptr);
3406 prop_chartype = UCD_CHARTYPE(c);
3407 if ((prop_chartype == ucp_Lu ||
3408 prop_chartype == ucp_Ll ||
3409 prop_chartype == ucp_Lt) == prop_fail_result)
3410 RRETURN(MATCH_NOMATCH);
3411 }
3412 /* Control never gets here */
3413
3414 case PT_GC:
3415 for (fi = min;; fi++)
3416 {
3417 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3418 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3419 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3420 GETCHARINC(c, eptr);
3421 prop_category = UCD_CATEGORY(c);
3422 if ((prop_category == prop_value) == prop_fail_result)
3423 RRETURN(MATCH_NOMATCH);
3424 }
3425 /* Control never gets here */
3426
3427 case PT_PC:
3428 for (fi = min;; fi++)
3429 {
3430 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3431 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3432 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3433 GETCHARINC(c, eptr);
3434 prop_chartype = UCD_CHARTYPE(c);
3435 if ((prop_chartype == prop_value) == prop_fail_result)
3436 RRETURN(MATCH_NOMATCH);
3437 }
3438 /* Control never gets here */
3439
3440 case PT_SC:
3441 for (fi = min;; fi++)
3442 {
3443 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3444 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3445 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3446 GETCHARINC(c, eptr);
3447 prop_script = UCD_SCRIPT(c);
3448 if ((prop_script == prop_value) == prop_fail_result)
3449 RRETURN(MATCH_NOMATCH);
3450 }
3451 /* Control never gets here */
3452
3453 default:
3454 RRETURN(PCRE_ERROR_INTERNAL);
3455 }
3456 }
3457
3458 /* Match extended Unicode sequences. We will get here only if the
3459 support is in the binary; otherwise a compile-time error occurs. */
3460
3461 else if (ctype == OP_EXTUNI)
3462 {
3463 for (fi = min;; fi++)
3464 {
3465 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3466 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3467 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3468 GETCHARINCTEST(c, eptr);
3469 prop_category = UCD_CATEGORY(c);
3470 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3471 while (eptr < md->end_subject)
3472 {
3473 int len = 1;
3474 if (!utf8) c = *eptr; else
3475 {
3476 GETCHARLEN(c, eptr, len);
3477 }
3478 prop_category = UCD_CATEGORY(c);
3479 if (prop_category != ucp_M) break;
3480 eptr += len;
3481 }
3482 }
3483 }
3484
3485 else
3486 #endif /* SUPPORT_UCP */
3487
3488 #ifdef SUPPORT_UTF8
3489 /* UTF-8 mode */
3490 if (utf8)
3491 {
3492 for (fi = min;; fi++)
3493 {
3494 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3495 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3496 if (fi >= max || eptr >= md->end_subject ||
3497 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3498 RRETURN(MATCH_NOMATCH);
3499
3500 GETCHARINC(c, eptr);
3501 switch(ctype)
3502 {
3503 case OP_ANY: /* This is the non-NL case */
3504 case OP_ALLANY:
3505 case OP_ANYBYTE:
3506 break;
3507
3508 case OP_ANYNL:
3509 switch(c)
3510 {
3511 default: RRETURN(MATCH_NOMATCH);
3512 case 0x000d:
3513 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3514 break;
3515 case 0x000a:
3516 break;
3517
3518 case 0x000b:
3519 case 0x000c:
3520 case 0x0085:
3521 case 0x2028:
3522 case 0x2029:
3523 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3524 break;
3525 }
3526 break;
3527
3528 case OP_NOT_HSPACE:
3529 switch(c)
3530 {
3531 default: break;
3532 case 0x09: /* HT */
3533 case 0x20: /* SPACE */
3534 case 0xa0: /* NBSP */
3535 case 0x1680: /* OGHAM SPACE MARK */
3536 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3537 case 0x2000: /* EN QUAD */
3538 case 0x2001: /* EM QUAD */
3539 case 0x2002: /* EN SPACE */
3540 case 0x2003: /* EM SPACE */
3541 case 0x2004: /* THREE-PER-EM SPACE */
3542 case 0x2005: /* FOUR-PER-EM SPACE */
3543 case 0x2006: /* SIX-PER-EM SPACE */
3544 case 0x2007: /* FIGURE SPACE */
3545 case 0x2008: /* PUNCTUATION SPACE */
3546 case 0x2009: /* THIN SPACE */
3547 case 0x200A: /* HAIR SPACE */
3548 case 0x202f: /* NARROW NO-BREAK SPACE */
3549 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3550 case 0x3000: /* IDEOGRAPHIC SPACE */
3551 RRETURN(MATCH_NOMATCH);
3552 }
3553 break;
3554
3555 case OP_HSPACE:
3556 switch(c)
3557 {
3558 default: RRETURN(MATCH_NOMATCH);
3559 case 0x09: /* HT */
3560 case 0x20: /* SPACE */
3561 case 0xa0: /* NBSP */
3562 case 0x1680: /* OGHAM SPACE MARK */
3563 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3564 case 0x2000: /* EN QUAD */
3565 case 0x2001: /* EM QUAD */
3566 case 0x2002: /* EN SPACE */
3567 case 0x2003: /* EM SPACE */
3568 case 0x2004: /* THREE-PER-EM SPACE */
3569 case 0x2005: /* FOUR-PER-EM SPACE */
3570 case 0x2006: /* SIX-PER-EM SPACE */
3571 case 0x2007: /* FIGURE SPACE */
3572 case 0x2008: /* PUNCTUATION SPACE */
3573 case 0x2009: /* THIN SPACE */
3574 case 0x200A: /* HAIR SPACE */
3575 case 0x202f: /* NARROW NO-BREAK SPACE */
3576 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3577 case 0x3000: /* IDEOGRAPHIC SPACE */
3578 break;
3579 }
3580 break;
3581
3582 case OP_NOT_VSPACE:
3583 switch(c)
3584 {
3585 default: break;
3586 case 0x0a: /* LF */
3587 case 0x0b: /* VT */
3588 case 0x0c: /* FF */
3589 case 0x0d: /* CR */
3590 case 0x85: /* NEL */
3591 case 0x2028: /* LINE SEPARATOR */
3592 case 0x2029: /* PARAGRAPH SEPARATOR */
3593 RRETURN(MATCH_NOMATCH);
3594 }
3595 break;
3596
3597 case OP_VSPACE:
3598 switch(c)
3599 {
3600 default: RRETURN(MATCH_NOMATCH);
3601 case 0x0a: /* LF */
3602 case 0x0b: /* VT */
3603 case 0x0c: /* FF */
3604 case 0x0d: /* CR */
3605 case 0x85: /* NEL */
3606 case 0x2028: /* LINE SEPARATOR */
3607 case 0x2029: /* PARAGRAPH SEPARATOR */
3608 break;
3609 }
3610 break;
3611
3612 case OP_NOT_DIGIT:
3613 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3614 RRETURN(MATCH_NOMATCH);
3615 break;
3616
3617 case OP_DIGIT:
3618 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3619 RRETURN(MATCH_NOMATCH);
3620 break;
3621
3622 case OP_NOT_WHITESPACE:
3623 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3624 RRETURN(MATCH_NOMATCH);
3625 break;
3626
3627 case OP_WHITESPACE:
3628 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3629 RRETURN(MATCH_NOMATCH);
3630 break;
3631
3632 case OP_NOT_WORDCHAR:
3633 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3634 RRETURN(MATCH_NOMATCH);
3635 break;
3636
3637 case OP_WORDCHAR:
3638 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3639 RRETURN(MATCH_NOMATCH);
3640 break;
3641
3642 default:
3643 RRETURN(PCRE_ERROR_INTERNAL);
3644 }
3645 }
3646 }
3647 else
3648 #endif
3649 /* Not UTF-8 mode */
3650 {
3651 for (fi = min;; fi++)
3652 {
3653 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3654 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3655 if (fi >= max || eptr >= md->end_subject ||
3656 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3657 RRETURN(MATCH_NOMATCH);
3658
3659 c = *eptr++;
3660 switch(ctype)
3661 {
3662 case OP_ANY: /* This is the non-NL case */
3663 case OP_ALLANY:
3664 case OP_ANYBYTE:
3665 break;
3666
3667 case OP_ANYNL:
3668 switch(c)
3669 {
3670 default: RRETURN(MATCH_NOMATCH);
3671 case 0x000d:
3672 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3673 break;
3674
3675 case 0x000a:
3676 break;
3677
3678 case 0x000b:
3679 case 0x000c:
3680 case 0x0085:
3681 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3682 break;
3683 }
3684 break;
3685
3686 case OP_NOT_HSPACE:
3687 switch(c)
3688 {
3689 default: break;
3690 case 0x09: /* HT */
3691 case 0x20: /* SPACE */
3692 case 0xa0: /* NBSP */
3693 RRETURN(MATCH_NOMATCH);
3694 }
3695 break;
3696
3697 case OP_HSPACE:
3698 switch(c)
3699 {
3700 default: RRETURN(MATCH_NOMATCH);
3701 case 0x09: /* HT */
3702 case 0x20: /* SPACE */
3703 case 0xa0: /* NBSP */
3704 break;
3705 }
3706 break;
3707
3708 case OP_NOT_VSPACE:
3709 switch(c)
3710 {
3711 default: break;
3712 case 0x0a: /* LF */
3713 case 0x0b: /* VT */
3714 case 0x0c: /* FF */
3715 case 0x0d: /* CR */
3716 case 0x85: /* NEL */
3717 RRETURN(MATCH_NOMATCH);
3718 }
3719 break;
3720
3721 case OP_VSPACE:
3722 switch(c)
3723 {
3724 default: RRETURN(MATCH_NOMATCH);
3725 case 0x0a: /* LF */
3726 case 0x0b: /* VT */
3727 case 0x0c: /* FF */
3728 case 0x0d: /* CR */
3729 case 0x85: /* NEL */
3730 break;
3731 }
3732 break;
3733
3734 case OP_NOT_DIGIT:
3735 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3736 break;
3737
3738 case OP_DIGIT:
3739 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3740 break;
3741
3742 case OP_NOT_WHITESPACE:
3743 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3744 break;
3745
3746 case OP_WHITESPACE:
3747 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3748 break;
3749
3750 case OP_NOT_WORDCHAR:
3751 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3752 break;
3753
3754 case OP_WORDCHAR:
3755 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3756 break;
3757
3758 default:
3759 RRETURN(PCRE_ERROR_INTERNAL);
3760 }
3761 }
3762 }
3763 /* Control never gets here */
3764 }
3765
3766 /* If maximizing, it is worth using inline code for speed, doing the type
3767 test once at the start (i.e. keep it out of the loop). Again, keep the
3768 UTF-8 and UCP stuff separate. */
3769
3770 else
3771 {
3772 pp = eptr; /* Remember where we started */
3773
3774 #ifdef SUPPORT_UCP
3775 if (prop_type >= 0)
3776 {
3777 switch(prop_type)
3778 {
3779 case PT_ANY:
3780 for (i = min; i < max; i++)
3781 {
3782 int len = 1;
3783 if (eptr >= md->end_subject) break;
3784 GETCHARLEN(c, eptr, len);
3785 if (prop_fail_result) break;
3786 eptr+= len;
3787 }
3788 break;
3789
3790 case PT_LAMP:
3791 for (i = min; i < max; i++)
3792 {
3793 int len = 1;
3794 if (eptr >= md->end_subject) break;
3795 GETCHARLEN(c, eptr, len);
3796 prop_chartype = UCD_CHARTYPE(c);
3797 if ((prop_chartype == ucp_Lu ||
3798 prop_chartype == ucp_Ll ||
3799 prop_chartype == ucp_Lt) == prop_fail_result)
3800 break;
3801 eptr+= len;
3802 }
3803 break;
3804
3805 case PT_GC:
3806 for (i = min; i < max; i++)
3807 {
3808 int len = 1;
3809 if (eptr >= md->end_subject) break;
3810 GETCHARLEN(c, eptr, len);
3811 prop_category = UCD_CATEGORY(c);
3812 if ((prop_category == prop_value) == prop_fail_result)
3813 break;
3814 eptr+= len;
3815 }
3816 break;
3817
3818 case PT_PC:
3819 for (i = min; i < max; i++)
3820 {
3821 int len = 1;
3822 if (eptr >= md->end_subject) break;
3823 GETCHARLEN(c, eptr, len);
3824 prop_chartype = UCD_CHARTYPE(c);
3825 if ((prop_chartype == prop_value) == prop_fail_result)
3826 break;
3827 eptr+= len;
3828 }
3829 break;
3830
3831 case PT_SC:
3832 for (i = min; i < max; i++)
3833 {
3834 int len = 1;
3835 if (eptr >= md->end_subject) break;
3836 GETCHARLEN(c, eptr, len);
3837 prop_script = UCD_SCRIPT(c);
3838 if ((prop_script == prop_value) == prop_fail_result)
3839 break;
3840 eptr+= len;
3841 }
3842 break;
3843 }
3844
3845 /* eptr is now past the end of the maximum run */
3846
3847 if (possessive) continue;
3848 for(;;)
3849 {
3850 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3851 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3852 if (eptr-- == pp) break; /* Stop if tried at original pos */
3853 if (utf8) BACKCHAR(eptr);
3854 }
3855 }
3856
3857 /* Match extended Unicode sequences. We will get here only if the
3858 support is in the binary; otherwise a compile-time error occurs. */
3859
3860 else if (ctype == OP_EXTUNI)
3861 {
3862 for (i = min; i < max; i++)
3863 {
3864 if (eptr >= md->end_subject) break;
3865 GETCHARINCTEST(c, eptr);
3866 prop_category = UCD_CATEGORY(c);
3867 if (prop_category == ucp_M) break;
3868 while (eptr < md->end_subject)
3869 {
3870 int len = 1;
3871 if (!utf8) c = *eptr; else
3872 {
3873 GETCHARLEN(c, eptr, len);
3874 }
3875 prop_category = UCD_CATEGORY(c);
3876 if (prop_category != ucp_M) break;
3877 eptr += len;
3878 }
3879 }
3880
3881 /* eptr is now past the end of the maximum run */
3882
3883 if (possessive) continue;
3884 for(;;)
3885 {
3886 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3887 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3888 if (eptr-- == pp) break; /* Stop if tried at original pos */
3889 for (;;) /* Move back over one extended */
3890 {
3891 int len = 1;
3892 if (!utf8) c = *eptr; else
3893 {
3894 BACKCHAR(eptr);
3895 GETCHARLEN(c, eptr, len);
3896 }
3897 prop_category = UCD_CATEGORY(c);
3898 if (prop_category != ucp_M) break;
3899 eptr--;
3900 }
3901 }
3902 }
3903
3904 else
3905 #endif /* SUPPORT_UCP */
3906
3907 #ifdef SUPPORT_UTF8
3908 /* UTF-8 mode */
3909
3910 if (utf8)
3911 {
3912 switch(ctype)
3913 {
3914 case OP_ANY:
3915 if (max < INT_MAX)
3916 {
3917 for (i = min; i < max; i++)
3918 {
3919 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3920 eptr++;
3921 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3922 }
3923 }
3924
3925 /* Handle unlimited UTF-8 repeat */
3926
3927 else
3928 {
3929 for (i = min; i < max; i++)
3930 {
3931 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3932 eptr++;
3933 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3934 }
3935 }
3936 break;
3937
3938 case OP_ALLANY:
3939 if (max < INT_MAX)
3940 {
3941 for (i = min; i < max; i++)
3942 {
3943 if (eptr >= md->end_subject) break;
3944 eptr++;
3945 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3946 }
3947 }
3948 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3949 break;
3950
3951 /* The byte case is the same as non-UTF8 */
3952
3953 case OP_ANYBYTE:
3954 c = max - min;
3955 if (c > (unsigned int)(md->end_subject - eptr))
3956 c = md->end_subject - eptr;
3957 eptr += c;
3958 break;
3959
3960 case OP_ANYNL:
3961 for (i = min; i < max; i++)
3962 {
3963 int len = 1;
3964 if (eptr >= md->end_subject) break;
3965 GETCHARLEN(c, eptr, len);
3966 if (c == 0x000d)
3967 {
3968 if (++eptr >= md->end_subject) break;
3969 if (*eptr == 0x000a) eptr++;
3970 }
3971 else
3972 {
3973 if (c != 0x000a &&
3974 (md->bsr_anycrlf ||
3975 (c != 0x000b && c != 0x000c &&
3976 c != 0x0085 && c != 0x2028 && c != 0x2029)))
3977 break;
3978 eptr += len;
3979 }
3980 }
3981 break;
3982
3983 case OP_NOT_HSPACE:
3984 case OP_HSPACE:
3985 for (i = min; i < max; i++)
3986 {
3987 BOOL gotspace;
3988 int len = 1;
3989 if (eptr >= md->end_subject) break;
3990 GETCHARLEN(c, eptr, len);
3991 switch(c)
3992 {
3993 default: gotspace = FALSE; break;
3994 case 0x09: /* HT */
3995 case 0x20: /* SPACE */
3996 case 0xa0: /* NBSP */
3997 case 0x1680: /* OGHAM SPACE MARK */
3998 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3999 case 0x2000: /* EN QUAD */
4000 case 0x2001: /* EM QUAD */
4001 case 0x2002: /* EN SPACE */
4002 case 0x2003: /* EM SPACE */
4003 case 0x2004: /* THREE-PER-EM SPACE */
4004 case 0x2005: /* FOUR-PER-EM SPACE */
4005 case 0x2006: /* SIX-PER-EM SPACE */
4006 case 0x2007: /* FIGURE SPACE */
4007 case 0x2008: /* PUNCTUATION SPACE */
4008 case 0x2009: /* THIN SPACE */
4009 case 0x200A: /* HAIR SPACE */
4010 case 0x202f: /* NARROW NO-BREAK SPACE */
4011 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4012 case 0x3000: /* IDEOGRAPHIC SPACE */
4013 gotspace = TRUE;
4014 break;
4015 }
4016 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4017 eptr += len;
4018 }
4019 break;
4020
4021 case OP_NOT_VSPACE:
4022 case OP_VSPACE:
4023 for (i = min; i < max; i++)
4024 {
4025 BOOL gotspace;
4026 int len = 1;
4027 if (eptr >= md->end_subject) break;
4028 GETCHARLEN(c, eptr, len);
4029 switch(c)
4030 {
4031 default: gotspace = FALSE; break;
4032 case 0x0a: /* LF */
4033 case 0x0b: /* VT */
4034 case 0x0c: /* FF */
4035 case 0x0d: /* CR */
4036 case 0x85: /* NEL */
4037 case 0x2028: /* LINE SEPARATOR */
4038 case 0x2029: /* PARAGRAPH SEPARATOR */
4039 gotspace = TRUE;
4040 break;
4041 }
4042 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4043 eptr += len;
4044 }
4045 break;
4046
4047 case OP_NOT_DIGIT:
4048 for (i = min; i < max; i++)
4049 {
4050 int len = 1;
4051 if (eptr >= md->end_subject) break;
4052 GETCHARLEN(c, eptr, len);
4053 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4054 eptr+= len;
4055 }
4056 break;
4057
4058 case OP_DIGIT:
4059 for (i = min; i < max; i++)
4060 {
4061 int len = 1;
4062 if (eptr >= md->end_subject) break;
4063 GETCHARLEN(c, eptr, len);
4064 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4065 eptr+= len;
4066 }
4067 break;
4068
4069 case OP_NOT_WHITESPACE:
4070 for (i = min; i < max; i++)
4071 {
4072 int len = 1;
4073 if (eptr >= md->end_subject) break;
4074 GETCHARLEN(c, eptr, len);
4075 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4076 eptr+= len;
4077 }
4078 break;
4079
4080 case OP_WHITESPACE:
4081 for (i = min; i < max; i++)
4082 {
4083 int len = 1;
4084 if (eptr >= md->end_subject) break;
4085 GETCHARLEN(c, eptr, len);
4086 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4087 eptr+= len;
4088 }
4089 break;
4090
4091 case OP_NOT_WORDCHAR:
4092 for (i = min; i < max; i++)
4093 {
4094 int len = 1;
4095 if (eptr >= md->end_subject) break;
4096 GETCHARLEN(c, eptr, len);
4097 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4098 eptr+= len;
4099 }
4100 break;
4101
4102 case OP_WORDCHAR:
4103 for (i = min; i < max; i++)
4104 {
4105 int len = 1;
4106 if (eptr >= md->end_subject) break;
4107 GETCHARLEN(c, eptr, len);
4108 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4109 eptr+= len;
4110 }
4111 break;
4112
4113 default:
4114 RRETURN(PCRE_ERROR_INTERNAL);
4115 }
4116
4117 /* eptr is now past the end of the maximum run */
4118
4119 if (possessive) continue;
4120 for(;;)
4121 {
4122 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4123 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4124 if (eptr-- == pp) break; /* Stop if tried at original pos */
4125 BACKCHAR(eptr);
4126 }
4127 }
4128 else
4129 #endif /* SUPPORT_UTF8 */
4130
4131 /* Not UTF-8 mode */
4132 {
4133 switch(ctype)
4134 {
4135 case OP_ANY:
4136 for (i = min; i < max; i++)
4137 {
4138 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4139 eptr++;
4140 }
4141 break;
4142
4143 case OP_ALLANY:
4144 case OP_ANYBYTE:
4145 c = max - min;
4146 if (c > (unsigned int)(md->end_subject - eptr))
4147 c = md->end_subject - eptr;
4148 eptr += c;
4149 break;
4150
4151 case OP_ANYNL:
4152 for (i = min; i < max; i++)
4153 {
4154 if (eptr >= md->end_subject) break;
4155 c = *eptr;
4156 if (c == 0x000d)
4157 {
4158 if (++eptr >= md->end_subject) break;
4159 if (*eptr == 0x000a) eptr++;
4160 }
4161 else
4162 {
4163 if (c != 0x000a &&
4164 (md->bsr_anycrlf ||
4165 (c != 0x000b && c != 0x000c && c != 0x0085)))
4166 break;
4167 eptr++;
4168 }
4169 }
4170 break;
4171
4172 case OP_NOT_HSPACE:
4173 for (i = min; i < max; i++)
4174 {
4175 if (eptr >= md->end_subject) break;
4176 c = *eptr;
4177 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4178 eptr++;
4179 }
4180 break;
4181
4182 case OP_HSPACE:
4183 for (i = min; i < max; i++)
4184 {
4185 if (eptr >= md->end_subject) break;
4186 c = *eptr;
4187 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4188 eptr++;
4189 }
4190 break;
4191
4192 case OP_NOT_VSPACE:
4193 for (i = min; i < max; i++)
4194 {
4195 if (eptr >= md->end_subject) break;
4196 c = *eptr;
4197 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4198 break;
4199 eptr++;
4200 }
4201 break;
4202
4203 case OP_VSPACE:
4204 for (i = min; i < max; i++)
4205 {
4206 if (eptr >= md->end_subject) break;
4207 c = *eptr;
4208 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4209 break;
4210 eptr++;
4211 }
4212 break;
4213
4214 case OP_NOT_DIGIT:
4215 for (i = min; i < max; i++)
4216 {
4217 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4218 break;
4219 eptr++;
4220 }
4221 break;
4222
4223 case OP_DIGIT:
4224 for (i = min; i < max; i++)
4225 {
4226 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4227 break;
4228 eptr++;
4229 }
4230 break;
4231
4232 case OP_NOT_WHITESPACE:
4233 for (i = min; i < max; i++)
4234 {
4235 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4236 break;
4237 eptr++;
4238 }
4239 break;
4240
4241 case OP_WHITESPACE:
4242 for (i = min; i < max; i++)
4243 {
4244 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4245 break;
4246 eptr++;
4247 }
4248 break;
4249
4250 case OP_NOT_WORDCHAR:
4251 for (i = min; i < max; i++)
4252 {
4253 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4254 break;
4255 eptr++;
4256 }
4257 break;
4258
4259 case OP_WORDCHAR:
4260 for (i = min; i < max; i++)
4261 {
4262 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4263 break;
4264 eptr++;
4265 }
4266 break;
4267
4268 default:
4269 RRETURN(PCRE_ERROR_INTERNAL);
4270 }
4271
4272 /* eptr is now past the end of the maximum run */
4273
4274 if (possessive) continue;
4275 while (eptr >= pp)
4276 {
4277 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4278 eptr--;
4279 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4280 }
4281 }
4282
4283 /* Get here if we can't make it match with any permitted repetitions */
4284
4285 RRETURN(MATCH_NOMATCH);
4286 }
4287 /* Control never gets here */
4288
4289 /* There's been some horrible disaster. Arrival here can only mean there is
4290 something seriously wrong in the code above or the OP_xxx definitions. */
4291
4292 default:
4293 DPRINTF(("Unknown opcode %d\n", *ecode));
4294 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4295 }
4296
4297 /* Do not stick any code in here without much thought; it is assumed
4298 that "continue" in the code above comes out to here to repeat the main
4299 loop. */
4300
4301 } /* End of main loop */
4302 /* Control never reaches here */
4303
4304
4305 /* When compiling to use the heap rather than the stack for recursive calls to
4306 match(), the RRETURN() macro jumps here. The number that is saved in
4307 frame->Xwhere indicates which label we actually want to return to. */
4308
4309 #ifdef NO_RECURSE
4310 #define LBL(val) case val: goto L_RM##val;
4311 HEAP_RETURN:
4312 switch (frame->Xwhere)
4313 {
4314 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4315 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4316 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4317 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4318 LBL(53) LBL(54)
4319 #ifdef SUPPORT_UTF8
4320 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4321 LBL(32) LBL(34) LBL(42) LBL(46)
4322 #ifdef SUPPORT_UCP
4323 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4324 #endif /* SUPPORT_UCP */
4325 #endif /* SUPPORT_UTF8 */
4326 default:
4327 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4328 return PCRE_ERROR_INTERNAL;
4329 }
4330 #undef LBL
4331 #endif /* NO_RECURSE */
4332 }
4333
4334
4335 /***************************************************************************
4336 ****************************************************************************
4337 RECURSION IN THE match() FUNCTION
4338
4339 Undefine all the macros that were defined above to handle this. */
4340
4341 #ifdef NO_RECURSE /* the names below are macros only when match() is compiled to use the heap instead of the stack */
4342 #undef eptr
4343 #undef ecode
4344 #undef mstart
4345 #undef offset_top
4346 #undef ims
4347 #undef eptrb
4348 #undef flags
4349
4350 #undef callpat
4351 #undef charptr
4352 #undef data
4353 #undef next
4354 #undef pp
4355 #undef prev
4356 #undef saved_eptr
4357
4358 #undef new_recursive
4359
4360 #undef cur_is_word
4361 #undef condition
4362 #undef prev_is_word
4363
4364 #undef original_ims
4365
4366 #undef ctype
4367 #undef length
4368 #undef max
4369 #undef min
4370 #undef number
4371 #undef offset
4372 #undef op
4373 #undef save_capture_last
4374 #undef save_offset1
4375 #undef save_offset2
4376 #undef save_offset3
4377 #undef stacksave
4378
4379 #undef newptrb
4380
4381 #endif /* NO_RECURSE */
4382
4383 /* fc and fi are defined as macros in both cases (with and without NO_RECURSE), so they are undefined outside the conditional block above. */
4384
4385 #undef fc
4386 #undef fi
4387
4388 /***************************************************************************
4389 ***************************************************************************/
4390
4391
4392
4393 /*************************************************
4394 * Execute a Regular Expression *
4395 *************************************************/
4396
4397 /* This function applies a compiled re to a subject string and picks out
4398 portions of the string if it matches. Two elements in the vector are set for
4399 each substring: the offsets to the start and end of the substring.
4400
4401 Arguments:
4402 argument_re points to the compiled expression
4403 extra_data points to extra data or is NULL
4404 subject points to the subject string
4405 length length of subject string (may contain binary zeros)
4406 start_offset where to start in the subject string
4407 options option bits
4408 offsets points to a vector of ints to be filled in with offsets
4409 offsetcount the number of elements in the vector
4410
4411 Returns: > 0 => success; value is the number of elements filled in
4412 = 0 => success, but offsets is not big enough
4413 -1 => failed to match
4414 < -1 => some kind of unexpected problem
4415 */
4416
4417 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4418 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4419 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4420 int offsetcount)
4421 {
4422 int rc, resetcount, ocount;
4423 int first_byte = -1;
4424 int req_byte = -1;
4425 int req_byte2 = -1;
4426 int newline;
4427 unsigned long int ims;
4428 BOOL using_temporary_offsets = FALSE;
4429 BOOL anchored;
4430 BOOL startline;
4431 BOOL firstline;
4432 BOOL first_byte_caseless = FALSE;
4433 BOOL req_byte_caseless = FALSE;
4434 BOOL utf8;
4435 match_data match_block;
4436 match_data *md = &match_block;
4437 const uschar *tables;
4438 const uschar *start_bits = NULL;
4439 USPTR start_match = (USPTR)subject + start_offset;
4440 USPTR end_subject;
4441 USPTR req_byte_ptr = start_match - 1;
4442
4443 pcre_study_data internal_study;
4444 const pcre_study_data *study;
4445
4446 real_pcre internal_re;
4447 const real_pcre *external_re = (const real_pcre *)argument_re;
4448 const real_pcre *re = external_re;
4449
4450 /* Plausibility checks */
4451
4452 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4453 if (re == NULL || subject == NULL ||
4454 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4455 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4456
4457 /* Fish out the optional data from the extra_data structure, first setting
4458 the default values. */
4459
4460 study = NULL;
4461 md->match_limit = MATCH_LIMIT;
4462 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4463 md->callout_data = NULL;
4464
4465 /* The table pointer is always in native byte order. */
4466
4467 tables = external_re->tables;
4468
4469 if (extra_data != NULL)
4470 {
4471 register unsigned int flags = extra_data->flags;
4472 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4473 study = (const pcre_study_data *)extra_data->study_data;
4474 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4475 md->match_limit = extra_data->match_limit;
4476 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4477 md->match_limit_recursion = extra_data->match_limit_recursion;
4478 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4479 md->callout_data = extra_data->callout_data;
4480 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4481 }
4482
4483 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4484 is a feature that makes it possible to save compiled regex and re-use them
4485 in other programs later. */
4486
4487 if (tables == NULL) tables = _pcre_default_tables;
4488
4489 /* Check that the first field in the block is the magic number. If it is not,
4490 test for a regex that was compiled on a host of opposite endianness. If this is
4491 the case, flipped values are put in internal_re and internal_study if there was
4492 study data too. */
4493
4494 if (re->magic_number != MAGIC_NUMBER)
4495 {
4496 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4497 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4498 if (study != NULL) study = &internal_study;
4499 }
4500
4501 /* Set up other data */
4502
4503 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4504 startline = (re->flags & PCRE_STARTLINE) != 0;
4505 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4506
4507 /* The code starts after the real_pcre block and the capture name table. */
4508
4509 md->start_code = (const uschar *)external_re + re->name_table_offset +
4510 re->name_count * re->name_entry_size;
4511
4512 md->start_subject = (USPTR)subject;
4513 md->start_offset = start_offset;
4514 md->end_subject = md->start_subject + length;
4515 end_subject = md->end_subject;
4516
4517 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4518 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4519 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4520
4521 md->notbol = (options & PCRE_NOTBOL) != 0;
4522 md->noteol = (options & PCRE_NOTEOL) != 0;
4523 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4524 md->partial = (options & PCRE_PARTIAL) != 0;
4525 md->hitend = FALSE;
4526
4527 md->recursive = NULL; /* No recursion at top level */
4528
4529 md->lcc = tables + lcc_offset;
4530 md->ctypes = tables + ctypes_offset;
4531
4532 /* Handle different \R options. */
4533
4534 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4535 {
4536 case 0:
4537 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4538 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4539 else
4540 #ifdef BSR_ANYCRLF
4541 md->bsr_anycrlf = TRUE;
4542 #else
4543 md->bsr_anycrlf = FALSE;
4544 #endif
4545 break;
4546
4547 case PCRE_BSR_ANYCRLF:
4548 md->bsr_anycrlf = TRUE;
4549 break;
4550
4551 case PCRE_BSR_UNICODE:
4552 md->bsr_anycrlf = FALSE;
4553 break;
4554
4555 default: return PCRE_ERROR_BADNEWLINE;
4556 }
4557
4558 /* Handle different types of newline. The three bits give eight cases. If
4559 nothing is set at run time, whatever was used at compile time applies. */
4560
4561 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4562 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4563 {
4564 case 0: newline = NEWLINE; break; /* Compile-time default */
4565 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4566 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4567 case PCRE_NEWLINE_CR+
4568 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4569 case PCRE_NEWLINE_ANY: newline = -1; break;
4570 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4571 default: return PCRE_ERROR_BADNEWLINE;
4572 }
4573
4574 if (newline == -2)
4575 {
4576 md->nltype = NLTYPE_ANYCRLF;
4577 }
4578 else if (newline < 0)
4579 {
4580 md->nltype = NLTYPE_ANY;
4581 }
4582 else
4583 {
4584 md->nltype = NLTYPE_FIXED;
4585 if (newline > 255)
4586 {
4587 md->nllen = 2;
4588 md->nl[0] = (newline >> 8) & 255;
4589 md->nl[1] = newline & 255;
4590 }
4591 else
4592 {
4593 md->nllen = 1;
4594 md->nl[0] = newline;
4595 }
4596 }
4597
4598 /* Partial matching is supported only for a restricted set of regexes at the
4599 moment. */
4600
4601 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4602 return PCRE_ERROR_BADPARTIAL;
4603
4604 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4605 back the character offset. */
4606
4607 #ifdef SUPPORT_UTF8
4608 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4609 {
4610 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4611 return PCRE_ERROR_BADUTF8;
4612 if (start_offset > 0 && start_offset < length)
4613 {
4614 int tb = ((uschar *)subject)[start_offset];
4615 if (tb > 127)
4616 {
4617 tb &= 0xc0;
4618 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4619 }
4620 }
4621 }
4622 #endif
4623
4624 /* The ims options can vary during the matching as a result of the presence
4625 of (?ims) items in the pattern. They are kept in a local variable so that
4626 restoring at the exit of a group is easy. */
4627
4628 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4629
4630 /* If the expression has got more back references than the offsets supplied can
4631 hold, we get a temporary chunk of working store to use during the matching.
4632 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4633 of 3. */
4634
4635 ocount = offsetcount - (offsetcount % 3);
4636
4637 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4638 {
4639 ocount = re->top_backref * 3 + 3;
4640 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4641 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4642 using_temporary_offsets = TRUE;
4643 DPRINTF(("Got memory to hold back references\n"));
4644 }
4645 else md->offset_vector = offsets;
4646
4647 md->offset_end = ocount;
4648 md->offset_max = (2*ocount)/3;
4649 md->offset_overflow = FALSE;
4650 md->capture_last = -1;
4651
4652 /* Compute the minimum number of offsets that we need to reset each time. Doing
4653 this makes a huge difference to execution time when there aren't many brackets
4654 in the pattern. */
4655
4656 resetcount = 2 + re->top_bracket * 2;
4657 if (resetcount > offsetcount) resetcount = ocount;
4658
4659 /* Reset the working variables associated with each extraction. These should
4660 never be used unless previously set, but they get saved and restored, and so we
4661 initialize them to avoid reading uninitialized locations. */
4662
4663 if (md->offset_vector != NULL)
4664 {
4665 register int *iptr = md->offset_vector + ocount;
4666 register int *iend = iptr - resetcount/2 + 1;
4667 while (--iptr >= iend) *iptr = -1;
4668 }
4669
4670 /* Set up the first character to match, if available. The first_byte value is
4671 never set for an anchored regular expression, but the anchoring may be forced
4672 at run time, so we have to test for anchoring. The first char may be unset for
4673 an unanchored pattern, of course. If there's no first char and the pattern was
4674 studied, there may be a bitmap of possible first characters. */
4675
4676 if (!anchored)
4677 {
4678 if ((re->flags & PCRE_FIRSTSET) != 0)
4679 {
4680 first_byte = re->first_byte & 255;
4681 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4682 first_byte = md->lcc[first_byte];
4683 }
4684 else
4685 if (!startline && study != NULL &&
4686 (study->options & PCRE_STUDY_MAPPED) != 0)
4687 start_bits = study->start_bits;
4688 }
4689
4690 /* For anchored or unanchored matches, there may be a "last known required
4691 character" set. */
4692
4693 if ((re->flags & PCRE_REQCHSET) != 0)
4694 {
4695 req_byte = re->req_byte & 255;
4696 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4697 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4698 }
4699
4700
4701 /* ==========================================================================*/
4702
4703 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4704 the loop runs just once. */
4705
4706 for(;;)
4707 {
4708 USPTR save_end_subject = end_subject;
4709 USPTR new_start_match;
4710
4711 /* Reset the maximum number of extractions we might see. */
4712
4713 if (md->offset_vector != NULL)
4714 {
4715 register int *iptr = md->offset_vector;
4716 register int *iend = iptr + resetcount;
4717 while (iptr < iend) *iptr++ = -1;
4718 }
4719
4720 /* If firstline is TRUE, the start of the match is constrained to the first
4721 line of a multiline string. That is, the match must be before or at the first
4722 newline. Implement this by temporarily adjusting end_subject so that we stop
4723 scanning at a newline. If the match fails at the newline, later code breaks
4724 this loop. */
4725
4726 if (firstline)
4727 {
4728 USPTR t = start_match;
4729 #ifdef SUPPORT_UTF8
4730 if (utf8)
4731 {
4732 while (t < md->end_subject && !IS_NEWLINE(t))
4733 {
4734 t++;
4735 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
4736 }
4737 }
4738 else
4739 #endif
4740 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4741 end_subject = t;
4742 }
4743
4744 /* There are some optimizations that avoid running the match if a known
4745 starting point is not found, or if a known later character is not present.
4746 However, there is an option that disables these, for testing and for ensuring
4747 that all callouts do actually occur. */
4748
4749 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
4750 {
4751 /* Advance to a unique first byte if there is one. */
4752
4753 if (first_byte >= 0)
4754 {
4755 if (first_byte_caseless)
4756 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
4757 start_match++;
4758 else
4759 while (start_match < end_subject && *start_match != first_byte)
4760 start_match++;
4761 }
4762
4763 /* Or to just after a linebreak for a multiline match */
4764
4765 else if (startline)
4766 {
4767 if (start_match > md->start_subject + start_offset)
4768 {
4769 #ifdef SUPPORT_UTF8
4770 if (utf8)
4771 {
4772 while (start_match < end_subject && !WAS_NEWLINE(start_match))
4773 {
4774 start_match++;
4775 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4776 start_match++;
4777 }
4778 }
4779 else
4780 #endif
4781 while (start_match < end_subject && !WAS_NEWLINE(start_match))
4782 start_match++;
4783
4784 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4785 and we are now at a LF, advance the match position by one more character.
4786 */
4787
4788 if (start_match[-1] == CHAR_CR &&
4789 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4790 start_match < end_subject &&
4791 *start_match == CHAR_NL)
4792 start_match++;
4793 }
4794 }
4795
4796 /* Or to a non-unique first byte after study */
4797
4798 else if (start_bits != NULL)
4799 {
4800 while (start_match < end_subject)
4801 {
4802 register unsigned int c = *start_match;
4803 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
4804 else break;
4805 }
4806 }
4807 } /* Starting optimizations */
4808
4809 /* Restore fudged end_subject */
4810
4811 end_subject = save_end_subject;
4812
4813 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4814 printf(">>>> Match against: ");
4815 pchars(start_match, end_subject - start_match, TRUE, md);
4816 printf("\n");
4817 #endif
4818
4819 /* If req_byte is set, we know that that character must appear in the
4820 subject for the match to succeed. If the first character is set, req_byte
4821 must be later in the subject; otherwise the test starts at the match point.
4822 This optimization can save a huge amount of backtracking in patterns with
4823 nested unlimited repeats that aren't going to match. Writing separate code
4824 for cased/caseless versions makes it go faster, as does using an
4825 autoincrement and backing off on a match.
4826
4827 HOWEVER: when the subject string is very, very long, searching to its end
4828 can take a long time, and give bad performance on quite ordinary patterns.
4829 This showed up when somebody was matching something like /^\d+C/ on a
4830 32-megabyte string... so we don't do this when the string is sufficiently
4831 long.
4832
4833 ALSO: this processing is disabled when partial matching is requested, or if
4834 disabling is explicitly requested. */
4835
4836 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
4837 req_byte >= 0 &&
4838 end_subject - start_match < REQ_BYTE_MAX &&
4839 !md->partial)
4840 {
4841 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4842
4843 /* We don't need to repeat the search if we haven't yet reached the
4844 place we found it at last time. */
4845
4846 if (p > req_byte_ptr)
4847 {
4848 if (req_byte_caseless)
4849 {
4850 while (p < end_subject)
4851 {
4852 register int pp = *p++;
4853 if (pp == req_byte || pp == req_byte2) { p--; break; }
4854 }
4855 }
4856 else
4857 {
4858 while (p < end_subject)
4859 {
4860 if (*p++ == req_byte) { p--; break; }
4861 }
4862 }
4863
4864 /* If we can't find the required character, break the matching loop,
4865 forcing a match failure. */
4866
4867 if (p >= end_subject)
4868 {
4869 rc = MATCH_NOMATCH;
4870 break;
4871 }
4872
4873 /* If we have found the required character, save the point where we
4874 found it, so that we don't search again next time round the loop if
4875 the start hasn't passed this character yet. */
4876
4877 req_byte_ptr = p;
4878 }
4879 }
4880
4881 /* OK, we can now run the match. */
4882
4883 md->start_match_ptr = start_match;
4884 md->match_call_count = 0;
4885 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4886
4887 switch(rc)
4888 {
4889 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4890 exactly like PRUNE. */
4891
4892 case MATCH_NOMATCH:
4893 case MATCH_PRUNE:
4894 case MATCH_THEN:
4895 new_start_match = start_match + 1;
4896 #ifdef SUPPORT_UTF8
4897 if (utf8)
4898 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4899 new_start_match++;
4900 #endif
4901 break;
4902
4903 /* SKIP passes back the next starting point explicitly. */
4904
4905 case MATCH_SKIP:
4906 new_start_match = md->start_match_ptr;
4907 break;
4908
4909 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4910
4911 case MATCH_COMMIT:
4912 rc = MATCH_NOMATCH;
4913 goto ENDLOOP;
4914
4915 /* Any other return is some kind of error. */
4916
4917 default:
4918 goto ENDLOOP;
4919 }
4920
4921 /* Control reaches here for the various types of "no match at this point"
4922 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4923
4924 rc = MATCH_NOMATCH;
4925
4926 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4927 newline in the subject (though it may continue over the newline). Therefore,
4928 if we have just failed to match, starting at a newline, do not continue. */
4929
4930 if (firstline && IS_NEWLINE(start_match)) break;
4931
4932 /* Advance to new matching position */
4933
4934 start_match = new_start_match;
4935
4936 /* Break the loop if the pattern is anchored or if we have passed the end of
4937 the subject. */
4938
4939 if (anchored || start_match > end_subject) break;
4940
4941 /* If we have just passed a CR and we are now at a LF, and the pattern does
4942 not contain any explicit matches for \r or \n, and the newline option is CRLF
4943 or ANY or ANYCRLF, advance the match position by one more character. */
4944
4945 if (start_match[-1] == CHAR_CR &&
4946 start_match < end_subject &&
4947 *start_match == CHAR_NL &&
4948 (re->flags & PCRE_HASCRORLF) == 0 &&
4949 (md->nltype == NLTYPE_ANY ||
4950 md->nltype == NLTYPE_ANYCRLF ||
4951 md->nllen == 2))
4952 start_match++;
4953
4954 } /* End of for(;;) "bumpalong" loop */
4955
4956 /* ==========================================================================*/
4957
4958 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4959 conditions is true:
4960
4961 (1) The pattern is anchored or the match was failed by (*COMMIT);
4962
4963 (2) We are past the end of the subject;
4964
4965 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4966 this option requests that a match occur at or before the first newline in
4967 the subject.
4968
4969 When we have a match and the offset vector is big enough to deal with any
4970 backreferences, captured substring offsets will already be set up. In the case
4971 where we had to get some local store to hold offsets for backreference
4972 processing, copy those that we can. In this case there need not be overflow if
4973 certain parts of the pattern were not used, even though there are more
4974 capturing parentheses than vector slots. */
4975
4976 ENDLOOP:
4977
4978 if (rc == MATCH_MATCH)
4979 {
4980 if (using_temporary_offsets)
4981 {
4982 if (offsetcount >= 4)
4983 {
4984 memcpy(offsets + 2, md->offset_vector + 2,
4985 (offsetcount - 2) * sizeof(int));
4986 DPRINTF(("Copied offsets from temporary memory\n"));
4987 }
4988 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4989 DPRINTF(("Freeing temporary memory\n"));
4990 (pcre_free)(md->offset_vector);
4991 }
4992
4993 /* Set the return code to the number of captured strings, or 0 if there are
4994 too many to fit into the vector. */
4995
4996 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4997
4998 /* If there is space, set up the whole thing as substring 0. The value of
4999 md->start_match_ptr might be modified if \K was encountered on the successful
5000 matching path. */
5001
5002 if (offsetcount < 2) rc = 0; else
5003 {
5004 offsets[0] = md->start_match_ptr - md->start_subject;
5005 offsets[1] = md->end_match_ptr - md->start_subject;
5006 }
5007
5008 DPRINTF((">>>> returning %d\n", rc));
5009 return rc;
5010 }
5011
5012 /* Control gets here if there has been an error, or if the overall match
5013 attempt has failed at all permitted starting positions. */
5014
5015 if (using_temporary_offsets)
5016 {
5017 DPRINTF(("Freeing temporary memory\n"));
5018 (pcre_free)(md->offset_vector);
5019 }
5020
5021 if (rc != MATCH_NOMATCH)
5022 {
5023 DPRINTF((">>>> error: returning %d\n", rc));
5024 return rc;
5025 }
5026 else if (md->partial && md->hitend)
5027 {
5028 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5029 return PCRE_ERROR_PARTIAL;
5030 }
5031 else
5032 {
5033 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5034 return PCRE_ERROR_NOMATCH;
5035 }
5036 }
5037
5038 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12