/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 399 - (show annotations) (download)
Sat Mar 21 12:34:15 2009 UTC (5 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 152925 byte(s)
Further fix to auto-callout with conditional groups whose condition is an 
assertion.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Bits that may be set in the "flags" argument of match(). */

#define match_condassert     0x01  /* Evaluating an assertion used as a condition */
#define match_cbegroup       0x02  /* Unlimited repeat group that could match empty */

/* Ordinary (non-error) results of match(). Genuine errors are reported with
the externally defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_MATCH          1
#define MATCH_NOMATCH        0

/* Private results of match(), used internally only. They are chosen far
enough below zero that they stay clear of the external error codes. */

#define MATCH_COMMIT         (-999)
#define MATCH_PRUNE          (-998)
#define MATCH_SKIP           (-997)
#define MATCH_THEN           (-996)

/* Largest number of offset-vector ints that a recursive call saves on the
stack; a bigger offset vector is saved using malloc instead. Keep this a
multiple of 3, because the length of the offset vector always is one. */

#define REC_STACK_SAVE_MAX   30

/* Minimum and maximum counts for the common repeats. In rep_max a value of
zero stands for "no upper limit". */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if the requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 {
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 }
179 }
180 else
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. */
186
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
/* Identifiers for the individual RMATCH call sites; each one is passed as the
"rw" argument of an RMATCH call. Whenever this list is altered, the code at
HEAP_RETURN further down must be altered to match. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18,
  RM19, RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27,
  RM28, RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36,
  RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45,
  RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54
};
249
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
253
254 #ifndef NO_RECURSE
255 #define REGISTER register
256
#ifdef DEBUG

/* Tracing form of the recursive call: the line number of the call site is
printed before match() is entered and again once it has come back. The result
is left in rrc. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

/* Tracing form of the return: the value and the line number are printed
before returning. Note that "ra" is expanded twice. */

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#else   /* DEBUG not defined */

/* Production forms: an ordinary recursive call of match() whose result is
left in rrc, and an ordinary return. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)

#define RRETURN(ra) return ra

#endif  /* DEBUG */
274
275 #else
276
277
278 /* These versions of the macros manage a private stack on the heap. Note that
279 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280 argument of match(), which never changes. */
281
282 #define REGISTER
283
/* Heap-based replacement for a recursive call of match(). A new frame is
obtained from pcre_stack_malloc, loaded with the arguments of the nested
invocation (mstart is copied from the current frame, and the recursion depth is
one more than the current frame's), and chained to the current frame through
Xprevframe. Control then jumps to HEAP_RECURSE. The call-site identifier "rw"
is stored in the current frame's Xwhere field and is also pasted into the label
L_<rw> that follows the jump, which is where the code is expected to resume
when the nested invocation is finished. The "rd" argument is not used.

If the new frame cannot be allocated, the current invocation is abandoned by
means of RRETURN(PCRE_ERROR_NOMEMORY) rather than writing through a NULL
pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
303
/* Heap-based replacement for "return" inside match(). The current frame is
unlinked and handed to pcre_stack_free. If that was the outermost frame (its
Xprevframe was NULL) the function really returns "ra"; otherwise "ra" is
placed in rrc and control jumps to HEAP_RETURN. Note that "ra" is evaluated
only after the frame pointer has been moved back to the previous frame. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
316
317
318 /* Structure for remembering the local variables in a private frame */
319
320 typedef struct heapframe {
321 struct heapframe *Xprevframe;
322
323 /* Function arguments that may change */
324
325 const uschar *Xeptr;
326 const uschar *Xecode;
327 const uschar *Xmstart;
328 int Xoffset_top;
329 long int Xims;
330 eptrblock *Xeptrb;
331 int Xflags;
332 unsigned int Xrdepth;
333
334 /* Function local variables */
335
336 const uschar *Xcallpat;
337 const uschar *Xcharptr;
338 const uschar *Xdata;
339 const uschar *Xnext;
340 const uschar *Xpp;
341 const uschar *Xprev;
342 const uschar *Xsaved_eptr;
343
344 recursion_info Xnew_recursive;
345
346 BOOL Xcur_is_word;
347 BOOL Xcondition;
348 BOOL Xprev_is_word;
349
350 unsigned long int Xoriginal_ims;
351
352 #ifdef SUPPORT_UCP
353 int Xprop_type;
354 int Xprop_value;
355 int Xprop_fail_result;
356 int Xprop_category;
357 int Xprop_chartype;
358 int Xprop_script;
359 int Xoclength;
360 uschar Xocchars[8];
361 #endif
362
363 int Xctype;
364 unsigned int Xfc;
365 int Xfi;
366 int Xlength;
367 int Xmax;
368 int Xmin;
369 int Xnumber;
370 int Xoffset;
371 int Xop;
372 int Xsave_capture_last;
373 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
374 int Xstacksave[REC_STACK_SAVE_MAX];
375
376 eptrblock Xnewptrb;
377
378 /* Where to jump back to */
379
380 int Xwhere;
381
382 } heapframe;
383
384 #endif
385
386
387 /***************************************************************************
388 ***************************************************************************/
389
390
391
392 /*************************************************
393 * Match from current position *
394 *************************************************/
395
396 /* This function is called recursively in many circumstances. Whenever it
397 returns a negative (error) response, the outer incarnation must also return the
398 same response.
399
400 Performance note: It might be tempting to extract commonly used fields from the
401 md structure (e.g. utf8, end_subject) into individual variables to improve
402 performance. Tests using gcc on a SPARC disproved this; in the first case, it
403 made performance worse.
404
405 Arguments:
406 eptr pointer to current character in subject
407 ecode pointer to current position in compiled code
408 mstart pointer to the current match start position (can be modified
409 by encountering \K)
410 offset_top current top pointer
411 md pointer to "static" info for the match
412 ims current /i, /m, and /s options
413 eptrb pointer to chain of blocks containing eptr at start of
414 brackets - for testing for empty matches
415 flags can contain
416 match_condassert - this is an assertion condition
417 match_cbegroup - this is the start of an unlimited repeat
418 group that can match an empty string
419 rdepth the recursion depth
420
421 Returns: MATCH_MATCH if matched ) these values are >= 0
422 MATCH_NOMATCH if failed to match )
423 a negative PCRE_ERROR_xxx value if aborted by an error condition
424 (e.g. stopped by repeated call or recursion limit)
425 */
426
427 static int
428 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
429 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
430 int flags, unsigned int rdepth)
431 {
432 /* These variables do not need to be preserved over recursion in this function,
433 so they can be ordinary variables in all cases. Mark some of them with
434 "register" because they are used a lot in loops. */
435
436 register int rrc; /* Returns from recursive calls */
437 register int i; /* Used for loops not involving calls to RMATCH() */
438 register unsigned int c; /* Character values not kept over RMATCH() calls */
439 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
440
441 BOOL minimize, possessive; /* Quantifier options */
442
443 /* When recursion is not being used, all "local" variables that have to be
444 preserved over calls to RMATCH() are part of a "frame" which is obtained from
445 heap storage. Set up the top-level frame here; others are obtained from the
446 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
447
448 #ifdef NO_RECURSE
449 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
450 frame->Xprevframe = NULL; /* Marks the top level */
451
452 /* Copy in the original argument variables */
453
454 frame->Xeptr = eptr;
455 frame->Xecode = ecode;
456 frame->Xmstart = mstart;
457 frame->Xoffset_top = offset_top;
458 frame->Xims = ims;
459 frame->Xeptrb = eptrb;
460 frame->Xflags = flags;
461 frame->Xrdepth = rdepth;
462
463 /* This is where control jumps back to to effect "recursion" */
464
465 HEAP_RECURSE:
466
467 /* Macros make the argument variables come from the current frame */
468
469 #define eptr frame->Xeptr
470 #define ecode frame->Xecode
471 #define mstart frame->Xmstart
472 #define offset_top frame->Xoffset_top
473 #define ims frame->Xims
474 #define eptrb frame->Xeptrb
475 #define flags frame->Xflags
476 #define rdepth frame->Xrdepth
477
478 /* Ditto for the local variables */
479
480 #ifdef SUPPORT_UTF8
481 #define charptr frame->Xcharptr
482 #endif
483 #define callpat frame->Xcallpat
484 #define data frame->Xdata
485 #define next frame->Xnext
486 #define pp frame->Xpp
487 #define prev frame->Xprev
488 #define saved_eptr frame->Xsaved_eptr
489
490 #define new_recursive frame->Xnew_recursive
491
492 #define cur_is_word frame->Xcur_is_word
493 #define condition frame->Xcondition
494 #define prev_is_word frame->Xprev_is_word
495
496 #define original_ims frame->Xoriginal_ims
497
498 #ifdef SUPPORT_UCP
499 #define prop_type frame->Xprop_type
500 #define prop_value frame->Xprop_value
501 #define prop_fail_result frame->Xprop_fail_result
502 #define prop_category frame->Xprop_category
503 #define prop_chartype frame->Xprop_chartype
504 #define prop_script frame->Xprop_script
505 #define oclength frame->Xoclength
506 #define occhars frame->Xocchars
507 #endif
508
509 #define ctype frame->Xctype
510 #define fc frame->Xfc
511 #define fi frame->Xfi
512 #define length frame->Xlength
513 #define max frame->Xmax
514 #define min frame->Xmin
515 #define number frame->Xnumber
516 #define offset frame->Xoffset
517 #define op frame->Xop
518 #define save_capture_last frame->Xsave_capture_last
519 #define save_offset1 frame->Xsave_offset1
520 #define save_offset2 frame->Xsave_offset2
521 #define save_offset3 frame->Xsave_offset3
522 #define stacksave frame->Xstacksave
523
524 #define newptrb frame->Xnewptrb
525
526 /* When recursion is being used, local variables are allocated on the stack and
527 get preserved during recursion in the normal way. In this environment, fi and
528 i, and fc and c, can be the same variables. */
529
530 #else /* NO_RECURSE not defined */
531 #define fi i
532 #define fc c
533
534
535 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
536 const uschar *charptr; /* in small blocks of the code. My normal */
537 #endif /* style of coding would have declared */
538 const uschar *callpat; /* them within each of those blocks. */
539 const uschar *data; /* However, in order to accommodate the */
540 const uschar *next; /* version of this code that uses an */
541 USPTR pp; /* external "stack" implemented on the */
542 const uschar *prev; /* heap, it is easier to declare them all */
543 USPTR saved_eptr; /* here, so the declarations can be cut */
544 /* out in a block. The only declarations */
545 recursion_info new_recursive; /* within blocks below are for variables */
546 /* that do not have to be preserved over */
547 BOOL cur_is_word; /* a recursive call to RMATCH(). */
548 BOOL condition;
549 BOOL prev_is_word;
550
551 unsigned long int original_ims;
552
553 #ifdef SUPPORT_UCP
554 int prop_type;
555 int prop_value;
556 int prop_fail_result;
557 int prop_category;
558 int prop_chartype;
559 int prop_script;
560 int oclength;
561 uschar occhars[8];
562 #endif
563
564 int codelink;
565 int condcode;
566 int ctype;
567 int length;
568 int max;
569 int min;
570 int number;
571 int offset;
572 int op;
573 int save_capture_last;
574 int save_offset1, save_offset2, save_offset3;
575 int stacksave[REC_STACK_SAVE_MAX];
576
577 eptrblock newptrb;
578 #endif /* NO_RECURSE */
579
580 /* These statements are here to stop the compiler complaining about uninitialized
581 variables. */
582
583 #ifdef SUPPORT_UCP
584 prop_value = 0;
585 prop_fail_result = 0;
586 #endif
587
588
589 /* This label is used for tail recursion, which is used in a few cases even
590 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
591 used. Thanks to Ian Taylor for noticing this possibility and sending the
592 original patch. */
593
594 TAIL_RECURSE:
595
596 /* OK, now we can get on with the real code of the function. Recursive calls
597 are specified by the macro RMATCH and RRETURN is used to return. When
598 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
599 and a "return", respectively (possibly with some debugging if DEBUG is
600 defined). However, RMATCH isn't like a function call because it's quite a
601 complicated macro. It has to be used in one particular way. This shouldn't,
602 however, impact performance when true recursion is being used. */
603
604 #ifdef SUPPORT_UTF8
605 utf8 = md->utf8; /* Local copy of the flag */
606 #else
607 utf8 = FALSE;
608 #endif
609
610 /* First check that we haven't called match() too many times, or that we
611 haven't exceeded the recursive call limit. */
612
613 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
614 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
615
616 original_ims = ims; /* Save for resetting on ')' */
617
618 /* At the start of a group with an unlimited repeat that may match an empty
619 string, the match_cbegroup flag is set. When this is the case, add the current
620 subject pointer to the chain of such remembered pointers, to be checked when we
621 hit the closing ket, in order to break infinite loops that match no characters.
622 When match() is called in other circumstances, don't add to the chain. The
623 match_cbegroup flag must NOT be used with tail recursion, because the memory
624 block that is used is on the stack, so a new one may be required for each
625 match(). */
626
627 if ((flags & match_cbegroup) != 0)
628 {
629 newptrb.epb_saved_eptr = eptr;
630 newptrb.epb_prev = eptrb;
631 eptrb = &newptrb;
632 }
633
634 /* Now start processing the opcodes. */
635
636 for (;;)
637 {
638 minimize = possessive = FALSE;
639 op = *ecode;
640
641 /* For partial matching, remember if we ever hit the end of the subject after
642 matching at least one subject character. */
643
644 if (md->partial &&
645 eptr >= md->end_subject &&
646 eptr > mstart)
647 md->hitend = TRUE;
648
649 switch(op)
650 {
651 case OP_FAIL:
652 RRETURN(MATCH_NOMATCH);
653
654 case OP_PRUNE:
655 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
656 ims, eptrb, flags, RM51);
657 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
658 RRETURN(MATCH_PRUNE);
659
660 case OP_COMMIT:
661 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
662 ims, eptrb, flags, RM52);
663 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
664 RRETURN(MATCH_COMMIT);
665
666 case OP_SKIP:
667 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
668 ims, eptrb, flags, RM53);
669 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
670 md->start_match_ptr = eptr; /* Pass back current position */
671 RRETURN(MATCH_SKIP);
672
673 case OP_THEN:
674 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
675 ims, eptrb, flags, RM54);
676 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
677 RRETURN(MATCH_THEN);
678
679 /* Handle a capturing bracket. If there is space in the offset vector, save
680 the current subject position in the working slot at the top of the vector.
681 We mustn't change the current values of the data slot, because they may be
682 set from a previous iteration of this group, and be referred to by a
683 reference inside the group.
684
685 If the bracket fails to match, we need to restore this value and also the
686 values of the final offsets, in case they were set by a previous iteration
687 of the same bracket.
688
689 If there isn't enough space in the offset vector, treat this as if it were
690 a non-capturing bracket. Don't worry about setting the flag for the error
691 case here; that is handled in the code for KET. */
692
693 case OP_CBRA:
694 case OP_SCBRA:
695 number = GET2(ecode, 1+LINK_SIZE);
696 offset = number << 1;
697
698 #ifdef DEBUG
699 printf("start bracket %d\n", number);
700 printf("subject=");
701 pchars(eptr, 16, TRUE, md);
702 printf("\n");
703 #endif
704
705 if (offset < md->offset_max)
706 {
707 save_offset1 = md->offset_vector[offset];
708 save_offset2 = md->offset_vector[offset+1];
709 save_offset3 = md->offset_vector[md->offset_end - number];
710 save_capture_last = md->capture_last;
711
712 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
713 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
714
715 flags = (op == OP_SCBRA)? match_cbegroup : 0;
716 do
717 {
718 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
719 ims, eptrb, flags, RM1);
720 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
721 md->capture_last = save_capture_last;
722 ecode += GET(ecode, 1);
723 }
724 while (*ecode == OP_ALT);
725
726 DPRINTF(("bracket %d failed\n", number));
727
728 md->offset_vector[offset] = save_offset1;
729 md->offset_vector[offset+1] = save_offset2;
730 md->offset_vector[md->offset_end - number] = save_offset3;
731
732 RRETURN(MATCH_NOMATCH);
733 }
734
735 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
736 as a non-capturing bracket. */
737
738 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
739 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
740
741 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
742
743 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
744 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
745
746 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
747 final alternative within the brackets, we would return the result of a
748 recursive call to match() whatever happened. We can reduce stack usage by
749 turning this into a tail recursion, except in the case when match_cbegroup
750 is set.*/
751
752 case OP_BRA:
753 case OP_SBRA:
754 DPRINTF(("start non-capturing bracket\n"));
755 flags = (op >= OP_SBRA)? match_cbegroup : 0;
756 for (;;)
757 {
758 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
759 {
760 if (flags == 0) /* Not a possibly empty group */
761 {
762 ecode += _pcre_OP_lengths[*ecode];
763 DPRINTF(("bracket 0 tail recursion\n"));
764 goto TAIL_RECURSE;
765 }
766
767 /* Possibly empty group; can't use tail recursion. */
768
769 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
770 eptrb, flags, RM48);
771 RRETURN(rrc);
772 }
773
774 /* For non-final alternatives, continue the loop for a NOMATCH result;
775 otherwise return. */
776
777 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
778 eptrb, flags, RM2);
779 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
780 ecode += GET(ecode, 1);
781 }
782 /* Control never reaches here. */
783
784 /* Conditional group: compilation checked that there are no more than
785 two branches. If the condition is false, skipping the first branch takes us
786 past the end if there is only one branch, but that's OK because that is
787 exactly what going to the ket would do. As there is only one branch to be
788 obeyed, we can use tail recursion to avoid using another stack frame. */
789
790 case OP_COND:
791 case OP_SCOND:
792 codelink= GET(ecode, 1);
793
794 /* Because of the way auto-callout works during compile, a callout item is
795 inserted between OP_COND and an assertion condition. */
796
797 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
798 {
799 if (pcre_callout != NULL)
800 {
801 pcre_callout_block cb;
802 cb.version = 1; /* Version 1 of the callout block */
803 cb.callout_number = ecode[LINK_SIZE+2];
804 cb.offset_vector = md->offset_vector;
805 cb.subject = (PCRE_SPTR)md->start_subject;
806 cb.subject_length = md->end_subject - md->start_subject;
807 cb.start_match = mstart - md->start_subject;
808 cb.current_position = eptr - md->start_subject;
809 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
810 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
811 cb.capture_top = offset_top/2;
812 cb.capture_last = md->capture_last;
813 cb.callout_data = md->callout_data;
814 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
815 if (rrc < 0) RRETURN(rrc);
816 }
817 ecode += _pcre_OP_lengths[OP_CALLOUT];
818 }
819
820 condcode = ecode[LINK_SIZE+1];
821
822 /* Now see what the actual condition is */
823
824 if (condcode == OP_RREF) /* Recursion test */
825 {
826 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
827 condition = md->recursive != NULL &&
828 (offset == RREF_ANY || offset == md->recursive->group_num);
829 ecode += condition? 3 : GET(ecode, 1);
830 }
831
832 else if (condcode == OP_CREF) /* Group used test */
833 {
834 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
835 condition = offset < offset_top && md->offset_vector[offset] >= 0;
836 ecode += condition? 3 : GET(ecode, 1);
837 }
838
839 else if (condcode == OP_DEF) /* DEFINE - always false */
840 {
841 condition = FALSE;
842 ecode += GET(ecode, 1);
843 }
844
845 /* The condition is an assertion. Call match() to evaluate it - setting
846 the final argument match_condassert causes it to stop at the end of an
847 assertion. */
848
849 else
850 {
851 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
852 match_condassert, RM3);
853 if (rrc == MATCH_MATCH)
854 {
855 condition = TRUE;
856 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
857 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
858 }
859 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
860 {
861 RRETURN(rrc); /* Need braces because of following else */
862 }
863 else
864 {
865 condition = FALSE;
866 ecode += codelink;
867 }
868 }
869
870 /* We are now at the branch that is to be obeyed. As there is only one,
871 we can use tail recursion to avoid using another stack frame, except when
872 match_cbegroup is required for an unlimited repeat of a possibly empty
873 group. If the second alternative doesn't exist, we can just plough on. */
874
875 if (condition || *ecode == OP_ALT)
876 {
877 ecode += 1 + LINK_SIZE;
878 if (op == OP_SCOND) /* Possibly empty group */
879 {
880 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
881 RRETURN(rrc);
882 }
883 else /* Group must match something */
884 {
885 flags = 0;
886 goto TAIL_RECURSE;
887 }
888 }
889 else /* Condition false & no alternative */
890 {
891 ecode += 1 + LINK_SIZE;
892 }
893 break;
894
895
896 /* End of the pattern, either real or forced. If we are in a top-level
897 recursion, we should restore the offsets appropriately and continue from
898 after the call. */
899
900 case OP_ACCEPT:
901 case OP_END:
902 if (md->recursive != NULL && md->recursive->group_num == 0)
903 {
904 recursion_info *rec = md->recursive;
905 DPRINTF(("End of pattern in a (?0) recursion\n"));
906 md->recursive = rec->prevrec;
907 memmove(md->offset_vector, rec->offset_save,
908 rec->saved_max * sizeof(int));
909 mstart = rec->save_start;
910 ims = original_ims;
911 ecode = rec->after_call;
912 break;
913 }
914
915 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
916 string - backtracking will then try other alternatives, if any. */
917
918 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
919 md->end_match_ptr = eptr; /* Record where we ended */
920 md->end_offset_top = offset_top; /* and how many extracts were taken */
921 md->start_match_ptr = mstart; /* and the start (\K can modify) */
922 RRETURN(MATCH_MATCH);
923
924 /* Change option settings */
925
926 case OP_OPT:
927 ims = ecode[1];
928 ecode += 2;
929 DPRINTF(("ims set to %02lx\n", ims));
930 break;
931
932 /* Assertion brackets. Check the alternative branches in turn - the
933 matching won't pass the KET for an assertion. If any one branch matches,
934 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
935 start of each branch to move the current point backwards, so the code at
936 this level is identical to the lookahead case. */
937
938 case OP_ASSERT:
939 case OP_ASSERTBACK:
940 do
941 {
942 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
943 RM4);
944 if (rrc == MATCH_MATCH) break;
945 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
946 ecode += GET(ecode, 1);
947 }
948 while (*ecode == OP_ALT);
949 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
950
951 /* If checking an assertion for a condition, return MATCH_MATCH. */
952
953 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
954
955 /* Continue from after the assertion, updating the offsets high water
956 mark, since extracts may have been taken during the assertion. */
957
958 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
959 ecode += 1 + LINK_SIZE;
960 offset_top = md->end_offset_top;
961 continue;
962
963 /* Negative assertion: all branches must fail to match */
964
965 case OP_ASSERT_NOT:
966 case OP_ASSERTBACK_NOT:
967 do
968 {
969 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
970 RM5);
971 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
972 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
973 ecode += GET(ecode,1);
974 }
975 while (*ecode == OP_ALT);
976
977 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
978
979 ecode += 1 + LINK_SIZE;
980 continue;
981
982 /* Move the subject pointer back. This occurs only at the start of
983 each branch of a lookbehind assertion. If we are too close to the start to
984 move back, this match function fails. When working with UTF-8 we move
985 back a number of characters, not bytes. */
986
987 case OP_REVERSE:
988 #ifdef SUPPORT_UTF8
989 if (utf8)
990 {
991 i = GET(ecode, 1);
992 while (i-- > 0)
993 {
994 eptr--;
995 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
996 BACKCHAR(eptr);
997 }
998 }
999 else
1000 #endif
1001
1002 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1003
1004 {
1005 eptr -= GET(ecode, 1);
1006 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1007 }
1008
1009 /* Skip to next op code */
1010
1011 ecode += 1 + LINK_SIZE;
1012 break;
1013
1014 /* The callout item calls an external function, if one is provided, passing
1015 details of the match so far. This is mainly for debugging, though the
1016 function is able to force a failure. */
1017
1018 case OP_CALLOUT:
1019 if (pcre_callout != NULL)
1020 {
1021 pcre_callout_block cb;
1022 cb.version = 1; /* Version 1 of the callout block */
1023 cb.callout_number = ecode[1];
1024 cb.offset_vector = md->offset_vector;
1025 cb.subject = (PCRE_SPTR)md->start_subject;
1026 cb.subject_length = md->end_subject - md->start_subject;
1027 cb.start_match = mstart - md->start_subject;
1028 cb.current_position = eptr - md->start_subject;
1029 cb.pattern_position = GET(ecode, 2);
1030 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1031 cb.capture_top = offset_top/2;
1032 cb.capture_last = md->capture_last;
1033 cb.callout_data = md->callout_data;
1034 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1035 if (rrc < 0) RRETURN(rrc);
1036 }
1037 ecode += 2 + 2*LINK_SIZE;
1038 break;
1039
1040 /* Recursion either matches the current regex, or some subexpression. The
1041 offset data is the offset to the starting bracket from the start of the
1042 whole pattern. (This is so that it works from duplicated subpatterns.)
1043
1044 If there are any capturing brackets started but not finished, we have to
1045 save their starting points and reinstate them after the recursion. However,
1046 we don't know how many such there are (offset_top records the completed
1047 total) so we just have to save all the potential data. There may be up to
1048 65535 such values, which is too large to put on the stack, but using malloc
1049 for small numbers seems expensive. As a compromise, the stack is used when
1050 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1051 is used. If the malloc fails, the recursion cannot be set up, so match()
1052 gives up at that point and returns PCRE_ERROR_NOMEMORY instead of trying to
1053 carry on with only part of the offset data saved.
1054
1055 There are also other values that have to be saved. We use a chained
1056 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1057 for the original version of this logic. */
1058
1059 case OP_RECURSE:
1060 {
1061 callpat = md->start_code + GET(ecode, 1);
1062 new_recursive.group_num = (callpat == md->start_code)? 0 :
1063 GET2(callpat, 1 + LINK_SIZE);
1064
1065 /* Add to "recursing stack" */
1066
1067 new_recursive.prevrec = md->recursive;
1068 md->recursive = &new_recursive;
1069
1070 /* Find where to continue from afterwards */
1071
1072 ecode += 1 + LINK_SIZE;
1073 new_recursive.after_call = ecode;
1074
1075 /* Now save the offset data. */
1076
1077 new_recursive.saved_max = md->offset_end;
1078 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1079 new_recursive.offset_save = stacksave;
1080 else
1081 {
1082 new_recursive.offset_save =
1083 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1084 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1085 }
1086
1087 memcpy(new_recursive.offset_save, md->offset_vector,
1088 new_recursive.saved_max * sizeof(int));
1089 new_recursive.save_start = mstart;
1090 mstart = eptr;
1091
1092 /* OK, now we can do the recursion. For each top-level alternative we
1093 restore the offset and recursion data. */
1094
1095 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1096 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1097 do
1098 {
1099 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1100 md, ims, eptrb, flags, RM6);
1101 if (rrc == MATCH_MATCH)
1102 {
1103 DPRINTF(("Recursion matched\n"));
1104 md->recursive = new_recursive.prevrec;
1105 if (new_recursive.offset_save != stacksave)
1106 (pcre_free)(new_recursive.offset_save);
1107 RRETURN(MATCH_MATCH);
1108 }
1109 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1110 {
1111 DPRINTF(("Recursion gave error %d\n", rrc));
1112 RRETURN(rrc);
1113 }
1114
1115 md->recursive = &new_recursive;
1116 memcpy(md->offset_vector, new_recursive.offset_save,
1117 new_recursive.saved_max * sizeof(int));
1118 callpat += GET(callpat, 1);
1119 }
1120 while (*callpat == OP_ALT);
1121
1122 DPRINTF(("Recursion didn't match\n"));
1123 md->recursive = new_recursive.prevrec;
1124 if (new_recursive.offset_save != stacksave)
1125 (pcre_free)(new_recursive.offset_save);
1126 RRETURN(MATCH_NOMATCH);
1127 }
1128 /* Control never reaches here */
1129
1130 /* "Once" brackets are like assertion brackets except that after a match,
1131 the point in the subject string is not moved back. Thus there can never be
1132 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1133 Check the alternative branches in turn - the matching won't pass the KET
1134 for this kind of subpattern. If any one branch matches, we carry on as at
1135 the end of a normal bracket, leaving the subject pointer. */
1136
1137 case OP_ONCE:
1138 prev = ecode;
1139 saved_eptr = eptr;
1140
1141 do
1142 {
1143 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1144 if (rrc == MATCH_MATCH) break;
1145 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1146 ecode += GET(ecode,1);
1147 }
1148 while (*ecode == OP_ALT);
1149
1150 /* If hit the end of the group (which could be repeated), fail */
1151
1152 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1153
1154 /* Continue as from after the assertion, updating the offsets high water
1155 mark, since extracts may have been taken. */
1156
1157 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1158
1159 offset_top = md->end_offset_top;
1160 eptr = md->end_match_ptr;
1161
1162 /* For a non-repeating ket, just continue at this level. This also
1163 happens for a repeating ket if no characters were matched in the group.
1164 This is the forcible breaking of infinite loops as implemented in Perl
1165 5.005. If there is an options reset, it will get obeyed in the normal
1166 course of events. */
1167
1168 if (*ecode == OP_KET || eptr == saved_eptr)
1169 {
1170 ecode += 1+LINK_SIZE;
1171 break;
1172 }
1173
1174 /* The repeating kets try the rest of the pattern or restart from the
1175 preceding bracket, in the appropriate order. The second "call" of match()
1176 uses tail recursion, to avoid using another stack frame. We need to reset
1177 any options that changed within the bracket before re-running it, so
1178 check the next opcode. */
1179
1180 if (ecode[1+LINK_SIZE] == OP_OPT)
1181 {
1182 ims = (ims & ~PCRE_IMS) | ecode[4];
1183 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1184 }
1185
1186 if (*ecode == OP_KETRMIN)
1187 {
1188 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1189 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1190 ecode = prev;
1191 flags = 0;
1192 goto TAIL_RECURSE;
1193 }
1194 else /* OP_KETRMAX */
1195 {
1196 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1197 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1198 ecode += 1 + LINK_SIZE;
1199 flags = 0;
1200 goto TAIL_RECURSE;
1201 }
1202 /* Control never gets here */
1203
1204 /* An alternation is the end of a branch; scan along to find the end of the
1205 bracketed group and go to there. */
1206
1207 case OP_ALT:
1208 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1209 break;
1210
1211 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1212 indicating that it may occur zero times. It may repeat infinitely, or not
1213 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1214 with fixed upper repeat limits are compiled as a number of copies, with the
1215 optional ones preceded by BRAZERO or BRAMINZERO. */
1216
1217 case OP_BRAZERO:
1218 {
1219 next = ecode+1;
1220 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1221 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1222 do next += GET(next,1); while (*next == OP_ALT);
1223 ecode = next + 1 + LINK_SIZE;
1224 }
1225 break;
1226
1227 case OP_BRAMINZERO:
1228 {
1229 next = ecode+1;
1230 do next += GET(next, 1); while (*next == OP_ALT);
1231 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1232 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1233 ecode++;
1234 }
1235 break;
1236
1237 case OP_SKIPZERO:
1238 {
1239 next = ecode+1;
1240 do next += GET(next,1); while (*next == OP_ALT);
1241 ecode = next + 1 + LINK_SIZE;
1242 }
1243 break;
1244
1245 /* End of a group, repeated or non-repeating. */
1246
1247 case OP_KET:
1248 case OP_KETRMIN:
1249 case OP_KETRMAX:
1250 prev = ecode - GET(ecode, 1);
1251
1252 /* If this was a group that remembered the subject start, in order to break
1253 infinite repeats of empty string matches, retrieve the subject start from
1254 the chain. Otherwise, set it NULL. */
1255
1256 if (*prev >= OP_SBRA)
1257 {
1258 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1259 eptrb = eptrb->epb_prev; /* Backup to previous group */
1260 }
1261 else saved_eptr = NULL;
1262
1263 /* If we are at the end of an assertion group, stop matching and return
1264 MATCH_MATCH, but record the current high water mark for use by positive
1265 assertions. Do this also for the "once" (atomic) groups. */
1266
1267 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1268 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1269 *prev == OP_ONCE)
1270 {
1271 md->end_match_ptr = eptr; /* For ONCE */
1272 md->end_offset_top = offset_top;
1273 RRETURN(MATCH_MATCH);
1274 }
1275
1276 /* For capturing groups we have to check the group number back at the start
1277 and if necessary complete handling an extraction by setting the offsets and
1278 bumping the high water mark. Note that whole-pattern recursion is coded as
1279 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1280 when the OP_END is reached. Other recursion is handled here. */
1281
1282 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1283 {
1284 number = GET2(prev, 1+LINK_SIZE);
1285 offset = number << 1;
1286
1287 #ifdef DEBUG
1288 printf("end bracket %d", number);
1289 printf("\n");
1290 #endif
1291
1292 md->capture_last = number;
1293 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1294 {
1295 md->offset_vector[offset] =
1296 md->offset_vector[md->offset_end - number];
1297 md->offset_vector[offset+1] = eptr - md->start_subject;
1298 if (offset_top <= offset) offset_top = offset + 2;
1299 }
1300
1301 /* Handle a recursively called group. Restore the offsets
1302 appropriately and continue from after the call. */
1303
1304 if (md->recursive != NULL && md->recursive->group_num == number)
1305 {
1306 recursion_info *rec = md->recursive;
1307 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1308 md->recursive = rec->prevrec;
1309 mstart = rec->save_start;
1310 memcpy(md->offset_vector, rec->offset_save,
1311 rec->saved_max * sizeof(int));
1312 ecode = rec->after_call;
1313 ims = original_ims;
1314 break;
1315 }
1316 }
1317
1318 /* For both capturing and non-capturing groups, reset the value of the ims
1319 flags, in case they got changed during the group. */
1320
1321 ims = original_ims;
1322 DPRINTF(("ims reset to %02lx\n", ims));
1323
1324 /* For a non-repeating ket, just continue at this level. This also
1325 happens for a repeating ket if no characters were matched in the group.
1326 This is the forcible breaking of infinite loops as implemented in Perl
1327 5.005. If there is an options reset, it will get obeyed in the normal
1328 course of events. */
1329
1330 if (*ecode == OP_KET || eptr == saved_eptr)
1331 {
1332 ecode += 1 + LINK_SIZE;
1333 break;
1334 }
1335
1336 /* The repeating kets try the rest of the pattern or restart from the
1337 preceding bracket, in the appropriate order. In the second case, we can use
1338 tail recursion to avoid using another stack frame, unless we have an
1339 unlimited repeat of a group that can match an empty string. */
1340
1341 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1342
1343 if (*ecode == OP_KETRMIN)
1344 {
1345 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1346 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1347 if (flags != 0) /* Could match an empty string */
1348 {
1349 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1350 RRETURN(rrc);
1351 }
1352 ecode = prev;
1353 goto TAIL_RECURSE;
1354 }
1355 else /* OP_KETRMAX */
1356 {
1357 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1358 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1359 ecode += 1 + LINK_SIZE;
1360 flags = 0;
1361 goto TAIL_RECURSE;
1362 }
1363 /* Control never gets here */
1364
1365 /* Start of subject unless notbol, or after internal newline if multiline */
1366
1367 case OP_CIRC:
1368 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1369 if ((ims & PCRE_MULTILINE) != 0)
1370 {
1371 if (eptr != md->start_subject &&
1372 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1373 RRETURN(MATCH_NOMATCH);
1374 ecode++;
1375 break;
1376 }
1377 /* ... else fall through */
1378
1379 /* Start of subject assertion */
1380
1381 case OP_SOD:
1382 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1383 ecode++;
1384 break;
1385
1386 /* Start of match assertion */
1387
1388 case OP_SOM:
1389 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1390 ecode++;
1391 break;
1392
1393 /* Reset the start of match point */
1394
1395 case OP_SET_SOM:
1396 mstart = eptr;
1397 ecode++;
1398 break;
1399
1400 /* Assert before internal newline if multiline, or before a terminating
1401 newline unless endonly is set, else end of subject unless noteol is set. */
1402
1403 case OP_DOLL:
1404 if ((ims & PCRE_MULTILINE) != 0)
1405 {
1406 if (eptr < md->end_subject)
1407 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1408 else
1409 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1410 ecode++;
1411 break;
1412 }
1413 else
1414 {
1415 if (md->noteol) RRETURN(MATCH_NOMATCH);
1416 if (!md->endonly)
1417 {
1418 if (eptr != md->end_subject &&
1419 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1420 RRETURN(MATCH_NOMATCH);
1421 ecode++;
1422 break;
1423 }
1424 }
1425 /* ... else fall through for endonly */
1426
1427 /* End of subject assertion (\z) */
1428
1429 case OP_EOD:
1430 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1431 ecode++;
1432 break;
1433
1434 /* End of subject or ending \n assertion (\Z) */
1435
1436 case OP_EODN:
1437 if (eptr != md->end_subject &&
1438 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1439 RRETURN(MATCH_NOMATCH);
1440 ecode++;
1441 break;
1442
1443 /* Word boundary assertions */
1444
1445 case OP_NOT_WORD_BOUNDARY:
1446 case OP_WORD_BOUNDARY:
1447 {
1448
1449 /* Find out if the previous and current characters are "word" characters.
1450 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1451 be "non-word" characters. */
1452
1453 #ifdef SUPPORT_UTF8
1454 if (utf8)
1455 {
1456 if (eptr == md->start_subject) prev_is_word = FALSE; else
1457 {
1458 const uschar *lastptr = eptr - 1;
1459 while((*lastptr & 0xc0) == 0x80) lastptr--;
1460 GETCHAR(c, lastptr);
1461 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1462 }
1463 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1464 {
1465 GETCHAR(c, eptr);
1466 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1467 }
1468 }
1469 else
1470 #endif
1471
1472 /* More streamlined when not in UTF-8 mode */
1473
1474 {
1475 prev_is_word = (eptr != md->start_subject) &&
1476 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1477 cur_is_word = (eptr < md->end_subject) &&
1478 ((md->ctypes[*eptr] & ctype_word) != 0);
1479 }
1480
1481 /* Now see if the situation is what we want */
1482
1483 if ((*ecode++ == OP_WORD_BOUNDARY)?
1484 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1485 RRETURN(MATCH_NOMATCH);
1486 }
1487 break;
1488
1489 /* Match a single character type; inline for speed */
1490
1491 case OP_ANY:
1492 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1493 /* Fall through */
1494
1495 case OP_ALLANY:
1496 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1497 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1498 ecode++;
1499 break;
1500
1501 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1502 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1503
1504 case OP_ANYBYTE:
1505 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1506 ecode++;
1507 break;
1508
1509 case OP_NOT_DIGIT:
1510 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1511 GETCHARINCTEST(c, eptr);
1512 if (
1513 #ifdef SUPPORT_UTF8
1514 c < 256 &&
1515 #endif
1516 (md->ctypes[c] & ctype_digit) != 0
1517 )
1518 RRETURN(MATCH_NOMATCH);
1519 ecode++;
1520 break;
1521
1522 case OP_DIGIT:
1523 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1524 GETCHARINCTEST(c, eptr);
1525 if (
1526 #ifdef SUPPORT_UTF8
1527 c >= 256 ||
1528 #endif
1529 (md->ctypes[c] & ctype_digit) == 0
1530 )
1531 RRETURN(MATCH_NOMATCH);
1532 ecode++;
1533 break;
1534
1535 case OP_NOT_WHITESPACE:
1536 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1537 GETCHARINCTEST(c, eptr);
1538 if (
1539 #ifdef SUPPORT_UTF8
1540 c < 256 &&
1541 #endif
1542 (md->ctypes[c] & ctype_space) != 0
1543 )
1544 RRETURN(MATCH_NOMATCH);
1545 ecode++;
1546 break;
1547
1548 case OP_WHITESPACE:
1549 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1550 GETCHARINCTEST(c, eptr);
1551 if (
1552 #ifdef SUPPORT_UTF8
1553 c >= 256 ||
1554 #endif
1555 (md->ctypes[c] & ctype_space) == 0
1556 )
1557 RRETURN(MATCH_NOMATCH);
1558 ecode++;
1559 break;
1560
1561 case OP_NOT_WORDCHAR:
1562 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1563 GETCHARINCTEST(c, eptr);
1564 if (
1565 #ifdef SUPPORT_UTF8
1566 c < 256 &&
1567 #endif
1568 (md->ctypes[c] & ctype_word) != 0
1569 )
1570 RRETURN(MATCH_NOMATCH);
1571 ecode++;
1572 break;
1573
1574 case OP_WORDCHAR:
1575 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1576 GETCHARINCTEST(c, eptr);
1577 if (
1578 #ifdef SUPPORT_UTF8
1579 c >= 256 ||
1580 #endif
1581 (md->ctypes[c] & ctype_word) == 0
1582 )
1583 RRETURN(MATCH_NOMATCH);
1584 ecode++;
1585 break;
1586
1587 case OP_ANYNL:
1588 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1589 GETCHARINCTEST(c, eptr);
1590 switch(c)
1591 {
1592 default: RRETURN(MATCH_NOMATCH);
1593 case 0x000d:
1594 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1595 break;
1596
1597 case 0x000a:
1598 break;
1599
1600 case 0x000b:
1601 case 0x000c:
1602 case 0x0085:
1603 case 0x2028:
1604 case 0x2029:
1605 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1606 break;
1607 }
1608 ecode++;
1609 break;
1610
1611 case OP_NOT_HSPACE:
1612 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1613 GETCHARINCTEST(c, eptr);
1614 switch(c)
1615 {
1616 default: break;
1617 case 0x09: /* HT */
1618 case 0x20: /* SPACE */
1619 case 0xa0: /* NBSP */
1620 case 0x1680: /* OGHAM SPACE MARK */
1621 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1622 case 0x2000: /* EN QUAD */
1623 case 0x2001: /* EM QUAD */
1624 case 0x2002: /* EN SPACE */
1625 case 0x2003: /* EM SPACE */
1626 case 0x2004: /* THREE-PER-EM SPACE */
1627 case 0x2005: /* FOUR-PER-EM SPACE */
1628 case 0x2006: /* SIX-PER-EM SPACE */
1629 case 0x2007: /* FIGURE SPACE */
1630 case 0x2008: /* PUNCTUATION SPACE */
1631 case 0x2009: /* THIN SPACE */
1632 case 0x200A: /* HAIR SPACE */
1633 case 0x202f: /* NARROW NO-BREAK SPACE */
1634 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1635 case 0x3000: /* IDEOGRAPHIC SPACE */
1636 RRETURN(MATCH_NOMATCH);
1637 }
1638 ecode++;
1639 break;
1640
1641 case OP_HSPACE:
1642 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1643 GETCHARINCTEST(c, eptr);
1644 switch(c)
1645 {
1646 default: RRETURN(MATCH_NOMATCH);
1647 case 0x09: /* HT */
1648 case 0x20: /* SPACE */
1649 case 0xa0: /* NBSP */
1650 case 0x1680: /* OGHAM SPACE MARK */
1651 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1652 case 0x2000: /* EN QUAD */
1653 case 0x2001: /* EM QUAD */
1654 case 0x2002: /* EN SPACE */
1655 case 0x2003: /* EM SPACE */
1656 case 0x2004: /* THREE-PER-EM SPACE */
1657 case 0x2005: /* FOUR-PER-EM SPACE */
1658 case 0x2006: /* SIX-PER-EM SPACE */
1659 case 0x2007: /* FIGURE SPACE */
1660 case 0x2008: /* PUNCTUATION SPACE */
1661 case 0x2009: /* THIN SPACE */
1662 case 0x200A: /* HAIR SPACE */
1663 case 0x202f: /* NARROW NO-BREAK SPACE */
1664 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1665 case 0x3000: /* IDEOGRAPHIC SPACE */
1666 break;
1667 }
1668 ecode++;
1669 break;
1670
1671 case OP_NOT_VSPACE:
1672 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1673 GETCHARINCTEST(c, eptr);
1674 switch(c)
1675 {
1676 default: break;
1677 case 0x0a: /* LF */
1678 case 0x0b: /* VT */
1679 case 0x0c: /* FF */
1680 case 0x0d: /* CR */
1681 case 0x85: /* NEL */
1682 case 0x2028: /* LINE SEPARATOR */
1683 case 0x2029: /* PARAGRAPH SEPARATOR */
1684 RRETURN(MATCH_NOMATCH);
1685 }
1686 ecode++;
1687 break;
1688
1689 case OP_VSPACE:
1690 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1691 GETCHARINCTEST(c, eptr);
1692 switch(c)
1693 {
1694 default: RRETURN(MATCH_NOMATCH);
1695 case 0x0a: /* LF */
1696 case 0x0b: /* VT */
1697 case 0x0c: /* FF */
1698 case 0x0d: /* CR */
1699 case 0x85: /* NEL */
1700 case 0x2028: /* LINE SEPARATOR */
1701 case 0x2029: /* PARAGRAPH SEPARATOR */
1702 break;
1703 }
1704 ecode++;
1705 break;
1706
1707 #ifdef SUPPORT_UCP
1708 /* Check the next character by Unicode property. We will get here only
1709 if the support is in the binary; otherwise a compile-time error occurs. */
1710
1711 case OP_PROP:
1712 case OP_NOTPROP:
1713 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1714 GETCHARINCTEST(c, eptr);
1715 {
1716 const ucd_record *prop = GET_UCD(c);
1717
1718 switch(ecode[1])
1719 {
1720 case PT_ANY:
1721 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1722 break;
1723
1724 case PT_LAMP:
1725 if ((prop->chartype == ucp_Lu ||
1726 prop->chartype == ucp_Ll ||
1727 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1728 RRETURN(MATCH_NOMATCH);
1729 break;
1730
1731 case PT_GC:
1732 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1733 RRETURN(MATCH_NOMATCH);
1734 break;
1735
1736 case PT_PC:
1737 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1738 RRETURN(MATCH_NOMATCH);
1739 break;
1740
1741 case PT_SC:
1742 if ((ecode[2] != prop->script) == (op == OP_PROP))
1743 RRETURN(MATCH_NOMATCH);
1744 break;
1745
1746 default:
1747 RRETURN(PCRE_ERROR_INTERNAL);
1748 }
1749
1750 ecode += 3;
1751 }
1752 break;
1753
1754 /* Match an extended Unicode sequence. We will get here only if the support
1755 is in the binary; otherwise a compile-time error occurs. */
1756
1757 case OP_EXTUNI:
1758 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1759 GETCHARINCTEST(c, eptr);
1760 {
1761 int category = UCD_CATEGORY(c);
1762 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1763 while (eptr < md->end_subject)
1764 {
1765 int len = 1;
1766 if (!utf8) c = *eptr; else
1767 {
1768 GETCHARLEN(c, eptr, len);
1769 }
1770 category = UCD_CATEGORY(c);
1771 if (category != ucp_M) break;
1772 eptr += len;
1773 }
1774 }
1775 ecode++;
1776 break;
1777 #endif
1778
1779
1780 /* Match a back reference, possibly repeatedly. Look past the end of the
1781 item to see if there is repeat information following. The code is similar
1782 to that for character classes, but repeated for efficiency. Then obey
1783 similar code to character type repeats - written out again for speed.
1784 However, if the referenced string is the empty string, always treat
1785 it as matched, any number of times (otherwise there could be infinite
1786 loops). */
1787
1788 case OP_REF:
1789 {
1790 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1791 ecode += 3;
1792
1793 /* If the reference is unset, there are two possibilities:
1794
1795 (a) In the default, Perl-compatible state, set the length to be longer
1796 than the amount of subject left; this ensures that every attempt at a
1797 match fails. We can't just fail here, because of the possibility of
1798 quantifiers with zero minima.
1799
1800 (b) If the JavaScript compatibility flag is set, set the length to zero
1801 so that the back reference matches an empty string.
1802
1803 Otherwise, set the length to the length of what was matched by the
1804 referenced subpattern. */
1805
1806 if (offset >= offset_top || md->offset_vector[offset] < 0)
1807 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1808 else
1809 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1810
1811 /* Set up for repetition, or handle the non-repeated case */
1812
1813 switch (*ecode)
1814 {
1815 case OP_CRSTAR:
1816 case OP_CRMINSTAR:
1817 case OP_CRPLUS:
1818 case OP_CRMINPLUS:
1819 case OP_CRQUERY:
1820 case OP_CRMINQUERY:
1821 c = *ecode++ - OP_CRSTAR;
1822 minimize = (c & 1) != 0;
1823 min = rep_min[c]; /* Pick up values from tables; */
1824 max = rep_max[c]; /* zero for max => infinity */
1825 if (max == 0) max = INT_MAX;
1826 break;
1827
1828 case OP_CRRANGE:
1829 case OP_CRMINRANGE:
1830 minimize = (*ecode == OP_CRMINRANGE);
1831 min = GET2(ecode, 1);
1832 max = GET2(ecode, 3);
1833 if (max == 0) max = INT_MAX;
1834 ecode += 5;
1835 break;
1836
1837 default: /* No repeat follows */
1838 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1839 eptr += length;
1840 continue; /* With the main loop */
1841 }
1842
1843 /* If the length of the reference is zero, just continue with the
1844 main loop. */
1845
1846 if (length == 0) continue;
1847
1848 /* First, ensure the minimum number of matches are present. We get back
1849 the length of the reference string explicitly rather than passing the
1850 address of eptr, so that eptr can be a register variable. */
1851
1852 for (i = 1; i <= min; i++)
1853 {
1854 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1855 eptr += length;
1856 }
1857
1858 /* If min = max, continue at the same level without recursion.
1859 They are not both allowed to be zero. */
1860
1861 if (min == max) continue;
1862
1863 /* If minimizing, keep trying and advancing the pointer */
1864
1865 if (minimize)
1866 {
1867 for (fi = min;; fi++)
1868 {
1869 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1870 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1871 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1872 RRETURN(MATCH_NOMATCH);
1873 eptr += length;
1874 }
1875 /* Control never gets here */
1876 }
1877
1878 /* If maximizing, find the longest string and work backwards */
1879
1880 else
1881 {
1882 pp = eptr;
1883 for (i = min; i < max; i++)
1884 {
1885 if (!match_ref(offset, eptr, length, md, ims)) break;
1886 eptr += length;
1887 }
1888 while (eptr >= pp)
1889 {
1890 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1891 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1892 eptr -= length;
1893 }
1894 RRETURN(MATCH_NOMATCH);
1895 }
1896 }
1897 /* Control never gets here */
1898
1899
1900
1901 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1902 used when all the characters in the class have values in the range 0-255,
1903 and either the matching is caseful, or the characters are in the range
1904 0-127 when UTF-8 processing is enabled. The only difference between
1905 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1906 encountered.
1907
1908 First, look past the end of the item to see if there is repeat information
1909 following. Then obey similar code to character type repeats - written out
1910 again for speed. */
1911
1912 case OP_NCLASS:
1913 case OP_CLASS:
1914 {
1915 data = ecode + 1; /* Save for matching */
1916 ecode += 33; /* Advance past the item */
1917
1918 switch (*ecode)
1919 {
1920 case OP_CRSTAR:
1921 case OP_CRMINSTAR:
1922 case OP_CRPLUS:
1923 case OP_CRMINPLUS:
1924 case OP_CRQUERY:
1925 case OP_CRMINQUERY:
1926 c = *ecode++ - OP_CRSTAR;
1927 minimize = (c & 1) != 0;
1928 min = rep_min[c]; /* Pick up values from tables; */
1929 max = rep_max[c]; /* zero for max => infinity */
1930 if (max == 0) max = INT_MAX;
1931 break;
1932
1933 case OP_CRRANGE:
1934 case OP_CRMINRANGE:
1935 minimize = (*ecode == OP_CRMINRANGE);
1936 min = GET2(ecode, 1);
1937 max = GET2(ecode, 3);
1938 if (max == 0) max = INT_MAX;
1939 ecode += 5;
1940 break;
1941
1942 default: /* No repeat follows */
1943 min = max = 1;
1944 break;
1945 }
1946
1947 /* First, ensure the minimum number of matches are present. */
1948
1949 #ifdef SUPPORT_UTF8
1950 /* UTF-8 mode */
1951 if (utf8)
1952 {
1953 for (i = 1; i <= min; i++)
1954 {
1955 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1956 GETCHARINC(c, eptr);
1957 if (c > 255)
1958 {
1959 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1960 }
1961 else
1962 {
1963 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1964 }
1965 }
1966 }
1967 else
1968 #endif
1969 /* Not UTF-8 mode */
1970 {
1971 for (i = 1; i <= min; i++)
1972 {
1973 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1974 c = *eptr++;
1975 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1976 }
1977 }
1978
1979 /* If max == min we can continue with the main loop without the
1980 need to recurse. */
1981
1982 if (min == max) continue;
1983
1984 /* If minimizing, keep testing the rest of the expression and advancing
1985 the pointer while it matches the class. */
1986
1987 if (minimize)
1988 {
1989 #ifdef SUPPORT_UTF8
1990 /* UTF-8 mode */
1991 if (utf8)
1992 {
1993 for (fi = min;; fi++)
1994 {
1995 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1996 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1997 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1998 GETCHARINC(c, eptr);
1999 if (c > 255)
2000 {
2001 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2002 }
2003 else
2004 {
2005 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2006 }
2007 }
2008 }
2009 else
2010 #endif
2011 /* Not UTF-8 mode */
2012 {
2013 for (fi = min;; fi++)
2014 {
2015 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2016 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2017 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2018 c = *eptr++;
2019 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2020 }
2021 }
2022 /* Control never gets here */
2023 }
2024
2025 /* If maximizing, find the longest possible run, then work backwards. */
2026
2027 else
2028 {
2029 pp = eptr;
2030
2031 #ifdef SUPPORT_UTF8
2032 /* UTF-8 mode */
2033 if (utf8)
2034 {
2035 for (i = min; i < max; i++)
2036 {
2037 int len = 1;
2038 if (eptr >= md->end_subject) break;
2039 GETCHARLEN(c, eptr, len);
2040 if (c > 255)
2041 {
2042 if (op == OP_CLASS) break;
2043 }
2044 else
2045 {
2046 if ((data[c/8] & (1 << (c&7))) == 0) break;
2047 }
2048 eptr += len;
2049 }
2050 for (;;)
2051 {
2052 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2053 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2054 if (eptr-- == pp) break; /* Stop if tried at original pos */
2055 BACKCHAR(eptr);
2056 }
2057 }
2058 else
2059 #endif
2060 /* Not UTF-8 mode */
2061 {
2062 for (i = min; i < max; i++)
2063 {
2064 if (eptr >= md->end_subject) break;
2065 c = *eptr;
2066 if ((data[c/8] & (1 << (c&7))) == 0) break;
2067 eptr++;
2068 }
2069 while (eptr >= pp)
2070 {
2071 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2072 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2073 eptr--;
2074 }
2075 }
2076
2077 RRETURN(MATCH_NOMATCH);
2078 }
2079 }
2080 /* Control never gets here */
2081
2082
2083 /* Match an extended character class. This opcode is encountered only
2084 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2085 mode, because Unicode properties are supported in non-UTF-8 mode. */
2086
2087 #ifdef SUPPORT_UTF8
2088 case OP_XCLASS:
2089 {
2090 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2091 ecode += GET(ecode, 1); /* Advance past the item */
2092
2093 switch (*ecode)
2094 {
2095 case OP_CRSTAR:
2096 case OP_CRMINSTAR:
2097 case OP_CRPLUS:
2098 case OP_CRMINPLUS:
2099 case OP_CRQUERY:
2100 case OP_CRMINQUERY:
2101 c = *ecode++ - OP_CRSTAR;
2102 minimize = (c & 1) != 0;
2103 min = rep_min[c]; /* Pick up values from tables; */
2104 max = rep_max[c]; /* zero for max => infinity */
2105 if (max == 0) max = INT_MAX;
2106 break;
2107
2108 case OP_CRRANGE:
2109 case OP_CRMINRANGE:
2110 minimize = (*ecode == OP_CRMINRANGE);
2111 min = GET2(ecode, 1);
2112 max = GET2(ecode, 3);
2113 if (max == 0) max = INT_MAX;
2114 ecode += 5;
2115 break;
2116
2117 default: /* No repeat follows */
2118 min = max = 1;
2119 break;
2120 }
2121
2122 /* First, ensure the minimum number of matches are present. */
2123
2124 for (i = 1; i <= min; i++)
2125 {
2126 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2127 GETCHARINCTEST(c, eptr);
2128 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2129 }
2130
2131 /* If max == min we can continue with the main loop without the
2132 need to recurse. */
2133
2134 if (min == max) continue;
2135
2136 /* If minimizing, keep testing the rest of the expression and advancing
2137 the pointer while it matches the class. */
2138
2139 if (minimize)
2140 {
2141 for (fi = min;; fi++)
2142 {
2143 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2144 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2145 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2146 GETCHARINCTEST(c, eptr);
2147 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2148 }
2149 /* Control never gets here */
2150 }
2151
2152 /* If maximizing, find the longest possible run, then work backwards. */
2153
2154 else
2155 {
2156 pp = eptr;
2157 for (i = min; i < max; i++)
2158 {
2159 int len = 1;
2160 if (eptr >= md->end_subject) break;
2161 GETCHARLENTEST(c, eptr, len);
2162 if (!_pcre_xclass(c, data)) break;
2163 eptr += len;
2164 }
2165 for(;;)
2166 {
2167 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2168 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2169 if (eptr-- == pp) break; /* Stop if tried at original pos */
2170 if (utf8) BACKCHAR(eptr);
2171 }
2172 RRETURN(MATCH_NOMATCH);
2173 }
2174
2175 /* Control never gets here */
2176 }
2177 #endif /* End of XCLASS */
2178
2179 /* Match a single character, casefully */
2180
2181 case OP_CHAR:
2182 #ifdef SUPPORT_UTF8
2183 if (utf8)
2184 {
2185 length = 1;
2186 ecode++;
2187 GETCHARLEN(fc, ecode, length);
2188 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2189 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2190 }
2191 else
2192 #endif
2193
2194 /* Non-UTF-8 mode */
2195 {
2196 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2197 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2198 ecode += 2;
2199 }
2200 break;
2201
2202 /* Match a single character, caselessly */
2203
2204 case OP_CHARNC:
2205 #ifdef SUPPORT_UTF8
2206 if (utf8)
2207 {
2208 length = 1;
2209 ecode++;
2210 GETCHARLEN(fc, ecode, length);
2211
2212 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2213
2214 /* If the pattern character's value is < 128, we have only one byte, and
2215 can use the fast lookup table. */
2216
2217 if (fc < 128)
2218 {
2219 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2220 }
2221
2222 /* Otherwise we must pick up the subject character */
2223
2224 else
2225 {
2226 unsigned int dc;
2227 GETCHARINC(dc, eptr);
2228 ecode += length;
2229
2230 /* If we have Unicode property support, we can use it to test the other
2231 case of the character, if there is one. */
2232
2233 if (fc != dc)
2234 {
2235 #ifdef SUPPORT_UCP
2236 if (dc != UCD_OTHERCASE(fc))
2237 #endif
2238 RRETURN(MATCH_NOMATCH);
2239 }
2240 }
2241 }
2242 else
2243 #endif /* SUPPORT_UTF8 */
2244
2245 /* Non-UTF-8 mode */
2246 {
2247 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2248 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2249 ecode += 2;
2250 }
2251 break;
2252
2253 /* Match a single character repeatedly. */
2254
2255 case OP_EXACT:
2256 min = max = GET2(ecode, 1);
2257 ecode += 3;
2258 goto REPEATCHAR;
2259
2260 case OP_POSUPTO:
2261 possessive = TRUE;
2262 /* Fall through */
2263
2264 case OP_UPTO:
2265 case OP_MINUPTO:
2266 min = 0;
2267 max = GET2(ecode, 1);
2268 minimize = *ecode == OP_MINUPTO;
2269 ecode += 3;
2270 goto REPEATCHAR;
2271
2272 case OP_POSSTAR:
2273 possessive = TRUE;
2274 min = 0;
2275 max = INT_MAX;
2276 ecode++;
2277 goto REPEATCHAR;
2278
2279 case OP_POSPLUS:
2280 possessive = TRUE;
2281 min = 1;
2282 max = INT_MAX;
2283 ecode++;
2284 goto REPEATCHAR;
2285
2286 case OP_POSQUERY:
2287 possessive = TRUE;
2288 min = 0;
2289 max = 1;
2290 ecode++;
2291 goto REPEATCHAR;
2292
2293 case OP_STAR:
2294 case OP_MINSTAR:
2295 case OP_PLUS:
2296 case OP_MINPLUS:
2297 case OP_QUERY:
2298 case OP_MINQUERY:
2299 c = *ecode++ - OP_STAR;
2300 minimize = (c & 1) != 0;
2301 min = rep_min[c]; /* Pick up values from tables; */
2302 max = rep_max[c]; /* zero for max => infinity */
2303 if (max == 0) max = INT_MAX;
2304
2305 /* Common code for all repeated single-character matches. We can give
2306 up quickly if there are fewer than the minimum number of characters left in
2307 the subject. */
2308
2309 REPEATCHAR:
2310 #ifdef SUPPORT_UTF8
2311 if (utf8)
2312 {
2313 length = 1;
2314 charptr = ecode;
2315 GETCHARLEN(fc, ecode, length);
2316 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2317 ecode += length;
2318
2319 /* Handle multibyte character matching specially here. There is
2320 support for caseless matching if UCP support is present. */
2321
2322 if (length > 1)
2323 {
2324 #ifdef SUPPORT_UCP
2325 unsigned int othercase;
2326 if ((ims & PCRE_CASELESS) != 0 &&
2327 (othercase = UCD_OTHERCASE(fc)) != fc)
2328 oclength = _pcre_ord2utf8(othercase, occhars);
2329 else oclength = 0;
2330 #endif /* SUPPORT_UCP */
2331
2332 for (i = 1; i <= min; i++)
2333 {
2334 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2335 #ifdef SUPPORT_UCP
2336 /* Need braces because of following else */
2337 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2338 else
2339 {
2340 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2341 eptr += oclength;
2342 }
2343 #else /* without SUPPORT_UCP */
2344 else { RRETURN(MATCH_NOMATCH); }
2345 #endif /* SUPPORT_UCP */
2346 }
2347
2348 if (min == max) continue;
2349
2350 if (minimize)
2351 {
2352 for (fi = min;; fi++)
2353 {
2354 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2355 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2356 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2357 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2358 #ifdef SUPPORT_UCP
2359 /* Need braces because of following else */
2360 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2361 else
2362 {
2363 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2364 eptr += oclength;
2365 }
2366 #else /* without SUPPORT_UCP */
2367 else { RRETURN (MATCH_NOMATCH); }
2368 #endif /* SUPPORT_UCP */
2369 }
2370 /* Control never gets here */
2371 }
2372
2373 else /* Maximize */
2374 {
2375 pp = eptr;
2376 for (i = min; i < max; i++)
2377 {
2378 if (eptr > md->end_subject - length) break;
2379 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2380 #ifdef SUPPORT_UCP
2381 else if (oclength == 0) break;
2382 else
2383 {
2384 if (memcmp(eptr, occhars, oclength) != 0) break;
2385 eptr += oclength;
2386 }
2387 #else /* without SUPPORT_UCP */
2388 else break;
2389 #endif /* SUPPORT_UCP */
2390 }
2391
2392 if (possessive) continue;
2393 for(;;)
2394 {
2395 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2396 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2397 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2398 #ifdef SUPPORT_UCP
2399 eptr--;
2400 BACKCHAR(eptr);
2401 #else /* without SUPPORT_UCP */
2402 eptr -= length;
2403 #endif /* SUPPORT_UCP */
2404 }
2405 }
2406 /* Control never gets here */
2407 }
2408
2409 /* If the length of a UTF-8 character is 1, we fall through here, and
2410 obey the code as for non-UTF-8 characters below, though in this case the
2411 value of fc will always be < 128. */
2412 }
2413 else
2414 #endif /* SUPPORT_UTF8 */
2415
2416 /* When not in UTF-8 mode, load a single-byte character. */
2417 {
2418 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2419 fc = *ecode++;
2420 }
2421
2422 /* The value of fc at this point is always less than 256, though we may or
2423 may not be in UTF-8 mode. The code is duplicated for the caseless and
2424 caseful cases, for speed, since matching characters is likely to be quite
2425 common. First, ensure the minimum number of matches are present. If min =
2426 max, continue at the same level without recursing. Otherwise, if
2427 minimizing, keep trying the rest of the expression and advancing one
2428 matching character if failing, up to the maximum. Alternatively, if
2429 maximizing, find the maximum number of characters and work backwards. */
2430
2431 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2432 max, eptr));
2433
2434 if ((ims & PCRE_CASELESS) != 0)
2435 {
2436 fc = md->lcc[fc];
2437 for (i = 1; i <= min; i++)
2438 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2439 if (min == max) continue;
2440 if (minimize)
2441 {
2442 for (fi = min;; fi++)
2443 {
2444 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2445 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2446 if (fi >= max || eptr >= md->end_subject ||
2447 fc != md->lcc[*eptr++])
2448 RRETURN(MATCH_NOMATCH);
2449 }
2450 /* Control never gets here */
2451 }
2452 else /* Maximize */
2453 {
2454 pp = eptr;
2455 for (i = min; i < max; i++)
2456 {
2457 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2458 eptr++;
2459 }
2460 if (possessive) continue;
2461 while (eptr >= pp)
2462 {
2463 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2464 eptr--;
2465 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2466 }
2467 RRETURN(MATCH_NOMATCH);
2468 }
2469 /* Control never gets here */
2470 }
2471
2472 /* Caseful comparisons (includes all multi-byte characters) */
2473
2474 else
2475 {
2476 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2477 if (min == max) continue;
2478 if (minimize)
2479 {
2480 for (fi = min;; fi++)
2481 {
2482 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2483 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2484 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2485 RRETURN(MATCH_NOMATCH);
2486 }
2487 /* Control never gets here */
2488 }
2489 else /* Maximize */
2490 {
2491 pp = eptr;
2492 for (i = min; i < max; i++)
2493 {
2494 if (eptr >= md->end_subject || fc != *eptr) break;
2495 eptr++;
2496 }
2497 if (possessive) continue;
2498 while (eptr >= pp)
2499 {
2500 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2501 eptr--;
2502 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2503 }
2504 RRETURN(MATCH_NOMATCH);
2505 }
2506 }
2507 /* Control never gets here */
2508
2509 /* Match a negated single one-byte character. The character we are
2510 checking can be multibyte. */
2511
2512 case OP_NOT:
2513 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2514 ecode++;
2515 GETCHARINCTEST(c, eptr);
2516 if ((ims & PCRE_CASELESS) != 0)
2517 {
2518 #ifdef SUPPORT_UTF8
2519 if (c < 256)
2520 #endif
2521 c = md->lcc[c];
2522 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2523 }
2524 else
2525 {
2526 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2527 }
2528 break;
2529
2530 /* Match a negated single one-byte character repeatedly. This is almost a
2531 repeat of the code for a repeated single character, but I haven't found a
2532 nice way of commoning these up that doesn't require a test of the
2533 positive/negative option for each character match. Maybe that wouldn't add
2534 very much to the time taken, but character matching *is* what this is all
2535 about... */
2536
2537 case OP_NOTEXACT:
2538 min = max = GET2(ecode, 1);
2539 ecode += 3;
2540 goto REPEATNOTCHAR;
2541
2542 case OP_NOTUPTO:
2543 case OP_NOTMINUPTO:
2544 min = 0;
2545 max = GET2(ecode, 1);
2546 minimize = *ecode == OP_NOTMINUPTO;
2547 ecode += 3;
2548 goto REPEATNOTCHAR;
2549
2550 case OP_NOTPOSSTAR:
2551 possessive = TRUE;
2552 min = 0;
2553 max = INT_MAX;
2554 ecode++;
2555 goto REPEATNOTCHAR;
2556
2557 case OP_NOTPOSPLUS:
2558 possessive = TRUE;
2559 min = 1;
2560 max = INT_MAX;
2561 ecode++;
2562 goto REPEATNOTCHAR;
2563
2564 case OP_NOTPOSQUERY:
2565 possessive = TRUE;
2566 min = 0;
2567 max = 1;
2568 ecode++;
2569 goto REPEATNOTCHAR;
2570
2571 case OP_NOTPOSUPTO:
2572 possessive = TRUE;
2573 min = 0;
2574 max = GET2(ecode, 1);
2575 ecode += 3;
2576 goto REPEATNOTCHAR;
2577
2578 case OP_NOTSTAR:
2579 case OP_NOTMINSTAR:
2580 case OP_NOTPLUS:
2581 case OP_NOTMINPLUS:
2582 case OP_NOTQUERY:
2583 case OP_NOTMINQUERY:
2584 c = *ecode++ - OP_NOTSTAR;
2585 minimize = (c & 1) != 0;
2586 min = rep_min[c]; /* Pick up values from tables; */
2587 max = rep_max[c]; /* zero for max => infinity */
2588 if (max == 0) max = INT_MAX;
2589
2590 /* Common code for all repeated single-byte matches. We can give up quickly
2591 if there are fewer than the minimum number of bytes left in the
2592 subject. */
2593
2594 REPEATNOTCHAR:
2595 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2596 fc = *ecode++;
2597
2598 /* The code is duplicated for the caseless and caseful cases, for speed,
2599 since matching characters is likely to be quite common. First, ensure the
2600 minimum number of matches are present. If min = max, continue at the same
2601 level without recursing. Otherwise, if minimizing, keep trying the rest of
2602 the expression and advancing one matching character if failing, up to the
2603 maximum. Alternatively, if maximizing, find the maximum number of
2604 characters and work backwards. */
2605
2606 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2607 max, eptr));
2608
2609 if ((ims & PCRE_CASELESS) != 0)
2610 {
2611 fc = md->lcc[fc];
2612
2613 #ifdef SUPPORT_UTF8
2614 /* UTF-8 mode */
2615 if (utf8)
2616 {
2617 register unsigned int d;
2618 for (i = 1; i <= min; i++)
2619 {
2620 GETCHARINC(d, eptr);
2621 if (d < 256) d = md->lcc[d];
2622 if (fc == d) RRETURN(MATCH_NOMATCH);
2623 }
2624 }
2625 else
2626 #endif
2627
2628 /* Not UTF-8 mode */
2629 {
2630 for (i = 1; i <= min; i++)
2631 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2632 }
2633
2634 if (min == max) continue;
2635
2636 if (minimize)
2637 {
2638 #ifdef SUPPORT_UTF8
2639 /* UTF-8 mode */
2640 if (utf8)
2641 {
2642 register unsigned int d;
2643 for (fi = min;; fi++)
2644 {
2645 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2646 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2647 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2648 GETCHARINC(d, eptr);
2649 if (d < 256) d = md->lcc[d];
2650 if (fc == d) RRETURN(MATCH_NOMATCH);
2651
2652 }
2653 }
2654 else
2655 #endif
2656 /* Not UTF-8 mode */
2657 {
2658 for (fi = min;; fi++)
2659 {
2660 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2661 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2662 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2663 RRETURN(MATCH_NOMATCH);
2664 }
2665 }
2666 /* Control never gets here */
2667 }
2668
2669 /* Maximize case */
2670
2671 else
2672 {
2673 pp = eptr;
2674
2675 #ifdef SUPPORT_UTF8
2676 /* UTF-8 mode */
2677 if (utf8)
2678 {
2679 register unsigned int d;
2680 for (i = min; i < max; i++)
2681 {
2682 int len = 1;
2683 if (eptr >= md->end_subject) break;
2684 GETCHARLEN(d, eptr, len);
2685 if (d < 256) d = md->lcc[d];
2686 if (fc == d) break;
2687 eptr += len;
2688 }
2689 if (possessive) continue;
2690 for(;;)
2691 {
2692 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2693 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2694 if (eptr-- == pp) break; /* Stop if tried at original pos */
2695 BACKCHAR(eptr);
2696 }
2697 }
2698 else
2699 #endif
2700 /* Not UTF-8 mode */
2701 {
2702 for (i = min; i < max; i++)
2703 {
2704 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2705 eptr++;
2706 }
2707 if (possessive) continue;
2708 while (eptr >= pp)
2709 {
2710 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2711 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2712 eptr--;
2713 }
2714 }
2715
2716 RRETURN(MATCH_NOMATCH);
2717 }
2718 /* Control never gets here */
2719 }
2720
2721 /* Caseful comparisons */
2722
2723 else
2724 {
2725 #ifdef SUPPORT_UTF8
2726 /* UTF-8 mode */
2727 if (utf8)
2728 {
2729 register unsigned int d;
2730 for (i = 1; i <= min; i++)
2731 {
2732 GETCHARINC(d, eptr);
2733 if (fc == d) RRETURN(MATCH_NOMATCH);
2734 }
2735 }
2736 else
2737 #endif
2738 /* Not UTF-8 mode */
2739 {
2740 for (i = 1; i <= min; i++)
2741 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2742 }
2743
2744 if (min == max) continue;
2745
2746 if (minimize)
2747 {
2748 #ifdef SUPPORT_UTF8
2749 /* UTF-8 mode */
2750 if (utf8)
2751 {
2752 register unsigned int d;
2753 for (fi = min;; fi++)
2754 {
2755 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2756 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2757 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2758 GETCHARINC(d, eptr);
2759 if (fc == d) RRETURN(MATCH_NOMATCH);
2760 }
2761 }
2762 else
2763 #endif
2764 /* Not UTF-8 mode */
2765 {
2766 for (fi = min;; fi++)
2767 {
2768 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2769 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2770 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2771 RRETURN(MATCH_NOMATCH);
2772 }
2773 }
2774 /* Control never gets here */
2775 }
2776
2777 /* Maximize case */
2778
2779 else
2780 {
2781 pp = eptr;
2782
2783 #ifdef SUPPORT_UTF8
2784 /* UTF-8 mode */
2785 if (utf8)
2786 {
2787 register unsigned int d;
2788 for (i = min; i < max; i++)
2789 {
2790 int len = 1;
2791 if (eptr >= md->end_subject) break;
2792 GETCHARLEN(d, eptr, len);
2793 if (fc == d) break;
2794 eptr += len;
2795 }
2796 if (possessive) continue;
2797 for(;;)
2798 {
2799 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2800 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2801 if (eptr-- == pp) break; /* Stop if tried at original pos */
2802 BACKCHAR(eptr);
2803 }
2804 }
2805 else
2806 #endif
2807 /* Not UTF-8 mode */
2808 {
2809 for (i = min; i < max; i++)
2810 {
2811 if (eptr >= md->end_subject || fc == *eptr) break;
2812 eptr++;
2813 }
2814 if (possessive) continue;
2815 while (eptr >= pp)
2816 {
2817 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2818 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2819 eptr--;
2820 }
2821 }
2822
2823 RRETURN(MATCH_NOMATCH);
2824 }
2825 }
2826 /* Control never gets here */
2827
2828 /* Match a single character type repeatedly; several different opcodes
2829 share code. This is very similar to the code for single characters, but we
2830 repeat it in the interests of efficiency. */
2831
2832 case OP_TYPEEXACT:
2833 min = max = GET2(ecode, 1);
2834 minimize = TRUE;
2835 ecode += 3;
2836 goto REPEATTYPE;
2837
2838 case OP_TYPEUPTO:
2839 case OP_TYPEMINUPTO:
2840 min = 0;
2841 max = GET2(ecode, 1);
2842 minimize = *ecode == OP_TYPEMINUPTO;
2843 ecode += 3;
2844 goto REPEATTYPE;
2845
2846 case OP_TYPEPOSSTAR:
2847 possessive = TRUE;
2848 min = 0;
2849 max = INT_MAX;
2850 ecode++;
2851 goto REPEATTYPE;
2852
2853 case OP_TYPEPOSPLUS:
2854 possessive = TRUE;
2855 min = 1;
2856 max = INT_MAX;
2857 ecode++;
2858 goto REPEATTYPE;
2859
2860 case OP_TYPEPOSQUERY:
2861 possessive = TRUE;
2862 min = 0;
2863 max = 1;
2864 ecode++;
2865 goto REPEATTYPE;
2866
2867 case OP_TYPEPOSUPTO:
2868 possessive = TRUE;
2869 min = 0;
2870 max = GET2(ecode, 1);
2871 ecode += 3;
2872 goto REPEATTYPE;
2873
2874 case OP_TYPESTAR:
2875 case OP_TYPEMINSTAR:
2876 case OP_TYPEPLUS:
2877 case OP_TYPEMINPLUS:
2878 case OP_TYPEQUERY:
2879 case OP_TYPEMINQUERY:
2880 c = *ecode++ - OP_TYPESTAR;
2881 minimize = (c & 1) != 0;
2882 min = rep_min[c]; /* Pick up values from tables; */
2883 max = rep_max[c]; /* zero for max => infinity */
2884 if (max == 0) max = INT_MAX;
2885
2886 /* Common code for all repeated single character type matches. Note that
2887 in UTF-8 mode, '.' matches a character of any length, but for the other
2888 character types, the valid characters are all one-byte long. */
2889
2890 REPEATTYPE:
2891 ctype = *ecode++; /* Code for the character type */
2892
2893 #ifdef SUPPORT_UCP
2894 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2895 {
2896 prop_fail_result = ctype == OP_NOTPROP;
2897 prop_type = *ecode++;
2898 prop_value = *ecode++;
2899 }
2900 else prop_type = -1;
2901 #endif
2902
2903 /* First, ensure the minimum number of matches are present. Use inline
2904 code for maximizing the speed, and do the type test once at the start
2905 (i.e. keep it out of the loop). Also we can test that there are at least
2906 the minimum number of bytes before we start. This isn't as effective in
2907 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2908 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2909 and single-bytes. */
2910
2911 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2912 if (min > 0)
2913 {
2914 #ifdef SUPPORT_UCP
2915 if (prop_type >= 0)
2916 {
2917 switch(prop_type)
2918 {
2919 case PT_ANY:
2920 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2921 for (i = 1; i <= min; i++)
2922 {
2923 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2924 GETCHARINCTEST(c, eptr);
2925 }
2926 break;
2927
2928 case PT_LAMP:
2929 for (i = 1; i <= min; i++)
2930 {
2931 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2932 GETCHARINCTEST(c, eptr);
2933 prop_chartype = UCD_CHARTYPE(c);
2934 if ((prop_chartype == ucp_Lu ||
2935 prop_chartype == ucp_Ll ||
2936 prop_chartype == ucp_Lt) == prop_fail_result)
2937 RRETURN(MATCH_NOMATCH);
2938 }
2939 break;
2940
2941 case PT_GC:
2942 for (i = 1; i <= min; i++)
2943 {
2944 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2945 GETCHARINCTEST(c, eptr);
2946 prop_category = UCD_CATEGORY(c);
2947 if ((prop_category == prop_value) == prop_fail_result)
2948 RRETURN(MATCH_NOMATCH);
2949 }
2950 break;
2951
2952 case PT_PC:
2953 for (i = 1; i <= min; i++)
2954 {
2955 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2956 GETCHARINCTEST(c, eptr);
2957 prop_chartype = UCD_CHARTYPE(c);
2958 if ((prop_chartype == prop_value) == prop_fail_result)
2959 RRETURN(MATCH_NOMATCH);
2960 }
2961 break;
2962
2963 case PT_SC:
2964 for (i = 1; i <= min; i++)
2965 {
2966 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2967 GETCHARINCTEST(c, eptr);
2968 prop_script = UCD_SCRIPT(c);
2969 if ((prop_script == prop_value) == prop_fail_result)
2970 RRETURN(MATCH_NOMATCH);
2971 }
2972 break;
2973
2974 default:
2975 RRETURN(PCRE_ERROR_INTERNAL);
2976 }
2977 }
2978
2979 /* Match extended Unicode sequences. We will get here only if the
2980 support is in the binary; otherwise a compile-time error occurs. */
2981
2982 else if (ctype == OP_EXTUNI)
2983 {
2984 for (i = 1; i <= min; i++)
2985 {
2986 GETCHARINCTEST(c, eptr);
2987 prop_category = UCD_CATEGORY(c);
2988 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2989 while (eptr < md->end_subject)
2990 {
2991 int len = 1;
2992 if (!utf8) c = *eptr; else
2993 {
2994 GETCHARLEN(c, eptr, len);
2995 }
2996 prop_category = UCD_CATEGORY(c);
2997 if (prop_category != ucp_M) break;
2998 eptr += len;
2999 }
3000 }
3001 }
3002
3003 else
3004 #endif /* SUPPORT_UCP */
3005
3006 /* Handle all other cases when the coding is UTF-8 */
3007
3008 #ifdef SUPPORT_UTF8
3009 if (utf8) switch(ctype)
3010 {
3011 case OP_ANY:
3012 for (i = 1; i <= min; i++)
3013 {
3014 if (eptr >= md->end_subject || IS_NEWLINE(eptr))
3015 RRETURN(MATCH_NOMATCH);
3016 eptr++;
3017 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3018 }
3019 break;
3020
3021 case OP_ALLANY:
3022 for (i = 1; i <= min; i++)
3023 {
3024 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3025 eptr++;
3026 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3027 }
3028 break;
3029
3030 case OP_ANYBYTE:
3031 eptr += min;
3032 break;
3033
3034 case OP_ANYNL:
3035 for (i = 1; i <= min; i++)
3036 {
3037 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3038 GETCHARINC(c, eptr);
3039 switch(c)
3040 {
3041 default: RRETURN(MATCH_NOMATCH);
3042 case 0x000d:
3043 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3044 break;
3045
3046 case 0x000a:
3047 break;
3048
3049 case 0x000b:
3050 case 0x000c:
3051 case 0x0085:
3052 case 0x2028:
3053 case 0x2029:
3054 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3055 break;
3056 }
3057 }
3058 break;
3059
3060 case OP_NOT_HSPACE:
3061 for (i = 1; i <= min; i++)
3062 {
3063 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3064 GETCHARINC(c, eptr);
3065 switch(c)
3066 {
3067 default: break;
3068 case 0x09: /* HT */
3069 case 0x20: /* SPACE */
3070 case 0xa0: /* NBSP */
3071 case 0x1680: /* OGHAM SPACE MARK */
3072 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3073 case 0x2000: /* EN QUAD */
3074 case 0x2001: /* EM QUAD */
3075 case 0x2002: /* EN SPACE */
3076 case 0x2003: /* EM SPACE */
3077 case 0x2004: /* THREE-PER-EM SPACE */
3078 case 0x2005: /* FOUR-PER-EM SPACE */
3079 case 0x2006: /* SIX-PER-EM SPACE */
3080 case 0x2007: /* FIGURE SPACE */
3081 case 0x2008: /* PUNCTUATION SPACE */
3082 case 0x2009: /* THIN SPACE */
3083 case 0x200A: /* HAIR SPACE */
3084 case 0x202f: /* NARROW NO-BREAK SPACE */
3085 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3086 case 0x3000: /* IDEOGRAPHIC SPACE */
3087 RRETURN(MATCH_NOMATCH);
3088 }
3089 }
3090 break;
3091
3092 case OP_HSPACE:
3093 for (i = 1; i <= min; i++)
3094 {
3095 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3096 GETCHARINC(c, eptr);
3097 switch(c)
3098 {
3099 default: RRETURN(MATCH_NOMATCH);
3100 case 0x09: /* HT */
3101 case 0x20: /* SPACE */
3102 case 0xa0: /* NBSP */
3103 case 0x1680: /* OGHAM SPACE MARK */
3104 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3105 case 0x2000: /* EN QUAD */
3106 case 0x2001: /* EM QUAD */
3107 case 0x2002: /* EN SPACE */
3108 case 0x2003: /* EM SPACE */
3109 case 0x2004: /* THREE-PER-EM SPACE */
3110 case 0x2005: /* FOUR-PER-EM SPACE */
3111 case 0x2006: /* SIX-PER-EM SPACE */
3112 case 0x2007: /* FIGURE SPACE */
3113 case 0x2008: /* PUNCTUATION SPACE */
3114 case 0x2009: /* THIN SPACE */
3115 case 0x200A: /* HAIR SPACE */
3116 case 0x202f: /* NARROW NO-BREAK SPACE */
3117 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3118 case 0x3000: /* IDEOGRAPHIC SPACE */
3119 break;
3120 }
3121 }
3122 break;
3123
3124 case OP_NOT_VSPACE:
3125 for (i = 1; i <= min; i++)
3126 {
3127 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3128 GETCHARINC(c, eptr);
3129 switch(c)
3130 {
3131 default: break;
3132 case 0x0a: /* LF */
3133 case 0x0b: /* VT */
3134 case 0x0c: /* FF */
3135 case 0x0d: /* CR */
3136 case 0x85: /* NEL */
3137 case 0x2028: /* LINE SEPARATOR */
3138 case 0x2029: /* PARAGRAPH SEPARATOR */
3139 RRETURN(MATCH_NOMATCH);
3140 }
3141 }
3142 break;
3143
3144 case OP_VSPACE:
3145 for (i = 1; i <= min; i++)
3146 {
3147 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3148 GETCHARINC(c, eptr);
3149 switch(c)
3150 {
3151 default: RRETURN(MATCH_NOMATCH);
3152 case 0x0a: /* LF */
3153 case 0x0b: /* VT */
3154 case 0x0c: /* FF */
3155 case 0x0d: /* CR */
3156 case 0x85: /* NEL */
3157 case 0x2028: /* LINE SEPARATOR */
3158 case 0x2029: /* PARAGRAPH SEPARATOR */
3159 break;
3160 }
3161 }
3162 break;
3163
3164 case OP_NOT_DIGIT:
3165 for (i = 1; i <= min; i++)
3166 {
3167 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3168 GETCHARINC(c, eptr);
3169 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3170 RRETURN(MATCH_NOMATCH);
3171 }
3172 break;
3173
3174 case OP_DIGIT:
3175 for (i = 1; i <= min; i++)
3176 {
3177 if (eptr >= md->end_subject ||
3178 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3179 RRETURN(MATCH_NOMATCH);
3180 /* No need to skip more bytes - we know it's a 1-byte character */
3181 }
3182 break;
3183
3184 case OP_NOT_WHITESPACE:
3185 for (i = 1; i <= min; i++)
3186 {
3187 if (eptr >= md->end_subject ||
3188 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3189 RRETURN(MATCH_NOMATCH);
3190 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3191 }
3192 break;
3193
3194 case OP_WHITESPACE:
3195 for (i = 1; i <= min; i++)
3196 {
3197 if (eptr >= md->end_subject ||
3198 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3199 RRETURN(MATCH_NOMATCH);
3200 /* No need to skip more bytes - we know it's a 1-byte character */
3201 }
3202 break;
3203
3204 case OP_NOT_WORDCHAR:
3205 for (i = 1; i <= min; i++)
3206 {
3207 if (eptr >= md->end_subject ||
3208 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3209 RRETURN(MATCH_NOMATCH);
3210 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3211 }
3212 break;
3213
3214 case OP_WORDCHAR:
3215 for (i = 1; i <= min; i++)
3216 {
3217 if (eptr >= md->end_subject ||
3218 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3219 RRETURN(MATCH_NOMATCH);
3220 /* No need to skip more bytes - we know it's a 1-byte character */
3221 }
3222 break;
3223
3224 default:
3225 RRETURN(PCRE_ERROR_INTERNAL);
3226 } /* End switch(ctype) */
3227
3228 else
3229 #endif /* SUPPORT_UTF8 */
3230
3231 /* Code for the non-UTF-8 case for minimum matching of operators other
3232 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3233 number of bytes present, as this was tested above. */
3234
3235 switch(ctype)
3236 {
3237 case OP_ANY:
3238 for (i = 1; i <= min; i++)
3239 {
3240 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3241 eptr++;
3242 }
3243 break;
3244
3245 case OP_ALLANY:
3246 eptr += min;
3247 break;
3248
3249 case OP_ANYBYTE:
3250 eptr += min;
3251 break;
3252
3253 /* Because of the CRLF case, we can't assume the minimum number of
3254 bytes are present in this case. */
3255
3256 case OP_ANYNL:
3257 for (i = 1; i <= min; i++)
3258 {
3259 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3260 switch(*eptr++)
3261 {
3262 default: RRETURN(MATCH_NOMATCH);
3263 case 0x000d:
3264 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3265 break;
3266 case 0x000a:
3267 break;
3268
3269 case 0x000b:
3270 case 0x000c:
3271 case 0x0085:
3272 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3273 break;
3274 }
3275 }
3276 break;
3277
3278 case OP_NOT_HSPACE:
3279 for (i = 1; i <= min; i++)
3280 {
3281 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3282 switch(*eptr++)
3283 {
3284 default: break;
3285 case 0x09: /* HT */
3286 case 0x20: /* SPACE */
3287 case 0xa0: /* NBSP */
3288 RRETURN(MATCH_NOMATCH);
3289 }
3290 }
3291 break;
3292
3293 case OP_HSPACE:
3294 for (i = 1; i <= min; i++)
3295 {
3296 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3297 switch(*eptr++)
3298 {
3299 default: RRETURN(MATCH_NOMATCH);
3300 case 0x09: /* HT */
3301 case 0x20: /* SPACE */
3302 case 0xa0: /* NBSP */
3303 break;
3304 }
3305 }
3306 break;
3307
3308 case OP_NOT_VSPACE:
3309 for (i = 1; i <= min; i++)
3310 {
3311 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3312 switch(*eptr++)
3313 {
3314 default: break;
3315 case 0x0a: /* LF */
3316 case 0x0b: /* VT */
3317 case 0x0c: /* FF */
3318 case 0x0d: /* CR */
3319 case 0x85: /* NEL */
3320 RRETURN(MATCH_NOMATCH);
3321 }
3322 }
3323 break;
3324
3325 case OP_VSPACE:
3326 for (i = 1; i <= min; i++)
3327 {
3328 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3329 switch(*eptr++)
3330 {
3331 default: RRETURN(MATCH_NOMATCH);
3332 case 0x0a: /* LF */
3333 case 0x0b: /* VT */
3334 case 0x0c: /* FF */
3335 case 0x0d: /* CR */
3336 case 0x85: /* NEL */
3337 break;
3338 }
3339 }
3340 break;
3341
3342 case OP_NOT_DIGIT:
3343 for (i = 1; i <= min; i++)
3344 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3345 break;
3346
3347 case OP_DIGIT:
3348 for (i = 1; i <= min; i++)
3349 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3350 break;
3351
3352 case OP_NOT_WHITESPACE:
3353 for (i = 1; i <= min; i++)
3354 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3355 break;
3356
3357 case OP_WHITESPACE:
3358 for (i = 1; i <= min; i++)
3359 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3360 break;
3361
3362 case OP_NOT_WORDCHAR:
3363 for (i = 1; i <= min; i++)
3364 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3365 RRETURN(MATCH_NOMATCH);
3366 break;
3367
3368 case OP_WORDCHAR:
3369 for (i = 1; i <= min; i++)
3370 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3371 RRETURN(MATCH_NOMATCH);
3372 break;
3373
3374 default:
3375 RRETURN(PCRE_ERROR_INTERNAL);
3376 }
3377 }
3378
3379 /* If min = max, continue at the same level without recursing */
3380
3381 if (min == max) continue;
3382
3383 /* If minimizing, we have to test the rest of the pattern before each
3384 subsequent match. Again, separate the UTF-8 case for speed, and also
3385 separate the UCP cases. */
3386
3387 if (minimize)
3388 {
3389 #ifdef SUPPORT_UCP
3390 if (prop_type >= 0)
3391 {
3392 switch(prop_type)
3393 {
3394 case PT_ANY:
3395 for (fi = min;; fi++)
3396 {
3397 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3398 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3399 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3400 GETCHARINC(c, eptr);
3401 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3402 }
3403 /* Control never gets here */
3404
3405 case PT_LAMP:
3406 for (fi = min;; fi++)
3407 {
3408 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3409 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3410 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3411 GETCHARINC(c, eptr);
3412 prop_chartype = UCD_CHARTYPE(c);
3413 if ((prop_chartype == ucp_Lu ||
3414 prop_chartype == ucp_Ll ||
3415 prop_chartype == ucp_Lt) == prop_fail_result)
3416 RRETURN(MATCH_NOMATCH);
3417 }
3418 /* Control never gets here */
3419
3420 case PT_GC:
3421 for (fi = min;; fi++)
3422 {
3423 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3424 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3425 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3426 GETCHARINC(c, eptr);
3427 prop_category = UCD_CATEGORY(c);
3428 if ((prop_category == prop_value) == prop_fail_result)
3429 RRETURN(MATCH_NOMATCH);
3430 }
3431 /* Control never gets here */
3432
3433 case PT_PC:
3434 for (fi = min;; fi++)
3435 {
3436 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3437 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3438 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3439 GETCHARINC(c, eptr);
3440 prop_chartype = UCD_CHARTYPE(c);
3441 if ((prop_chartype == prop_value) == prop_fail_result)
3442 RRETURN(MATCH_NOMATCH);
3443 }
3444 /* Control never gets here */
3445
3446 case PT_SC:
3447 for (fi = min;; fi++)
3448 {
3449 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3450 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3451 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3452 GETCHARINC(c, eptr);
3453 prop_script = UCD_SCRIPT(c);
3454 if ((prop_script == prop_value) == prop_fail_result)
3455 RRETURN(MATCH_NOMATCH);
3456 }
3457 /* Control never gets here */
3458
3459 default:
3460 RRETURN(PCRE_ERROR_INTERNAL);
3461 }
3462 }
3463
3464 /* Match extended Unicode sequences. We will get here only if the
3465 support is in the binary; otherwise a compile-time error occurs. */
3466
3467 else if (ctype == OP_EXTUNI)
3468 {
3469 for (fi = min;; fi++)
3470 {
3471 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3472 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3473 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3474 GETCHARINCTEST(c, eptr);
3475 prop_category = UCD_CATEGORY(c);
3476 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3477 while (eptr < md->end_subject)
3478 {
3479 int len = 1;
3480 if (!utf8) c = *eptr; else
3481 {
3482 GETCHARLEN(c, eptr, len);
3483 }
3484 prop_category = UCD_CATEGORY(c);
3485 if (prop_category != ucp_M) break;
3486 eptr += len;
3487 }
3488 }
3489 }
3490
3491 else
3492 #endif /* SUPPORT_UCP */
3493
3494 #ifdef SUPPORT_UTF8
3495 /* UTF-8 mode */
3496 if (utf8)
3497 {
3498 for (fi = min;; fi++)
3499 {
3500 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3501 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3502 if (fi >= max || eptr >= md->end_subject ||
3503 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3504 RRETURN(MATCH_NOMATCH);
3505
3506 GETCHARINC(c, eptr);
3507 switch(ctype)
3508 {
3509 case OP_ANY: /* This is the non-NL case */
3510 case OP_ALLANY:
3511 case OP_ANYBYTE:
3512 break;
3513
3514 case OP_ANYNL:
3515 switch(c)
3516 {
3517 default: RRETURN(MATCH_NOMATCH);
3518 case 0x000d:
3519 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3520 break;
3521 case 0x000a:
3522 break;
3523
3524 case 0x000b:
3525 case 0x000c:
3526 case 0x0085:
3527 case 0x2028:
3528 case 0x2029:
3529 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3530 break;
3531 }
3532 break;
3533
3534 case OP_NOT_HSPACE:
3535 switch(c)
3536 {
3537 default: break;
3538 case 0x09: /* HT */
3539 case 0x20: /* SPACE */
3540 case 0xa0: /* NBSP */
3541 case 0x1680: /* OGHAM SPACE MARK */
3542 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3543 case 0x2000: /* EN QUAD */
3544 case 0x2001: /* EM QUAD */
3545 case 0x2002: /* EN SPACE */
3546 case 0x2003: /* EM SPACE */
3547 case 0x2004: /* THREE-PER-EM SPACE */
3548 case 0x2005: /* FOUR-PER-EM SPACE */
3549 case 0x2006: /* SIX-PER-EM SPACE */
3550 case 0x2007: /* FIGURE SPACE */
3551 case 0x2008: /* PUNCTUATION SPACE */
3552 case 0x2009: /* THIN SPACE */
3553 case 0x200A: /* HAIR SPACE */
3554 case 0x202f: /* NARROW NO-BREAK SPACE */
3555 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3556 case 0x3000: /* IDEOGRAPHIC SPACE */
3557 RRETURN(MATCH_NOMATCH);
3558 }
3559 break;
3560
3561 case OP_HSPACE:
3562 switch(c)
3563 {
3564 default: RRETURN(MATCH_NOMATCH);
3565 case 0x09: /* HT */
3566 case 0x20: /* SPACE */
3567 case 0xa0: /* NBSP */
3568 case 0x1680: /* OGHAM SPACE MARK */
3569 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3570 case 0x2000: /* EN QUAD */
3571 case 0x2001: /* EM QUAD */
3572 case 0x2002: /* EN SPACE */
3573 case 0x2003: /* EM SPACE */
3574 case 0x2004: /* THREE-PER-EM SPACE */
3575 case 0x2005: /* FOUR-PER-EM SPACE */
3576 case 0x2006: /* SIX-PER-EM SPACE */
3577 case 0x2007: /* FIGURE SPACE */
3578 case 0x2008: /* PUNCTUATION SPACE */
3579 case 0x2009: /* THIN SPACE */
3580 case 0x200A: /* HAIR SPACE */
3581 case 0x202f: /* NARROW NO-BREAK SPACE */
3582 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3583 case 0x3000: /* IDEOGRAPHIC SPACE */
3584 break;
3585 }
3586 break;
3587
3588 case OP_NOT_VSPACE:
3589 switch(c)
3590 {
3591 default: break;
3592 case 0x0a: /* LF */
3593 case 0x0b: /* VT */
3594 case 0x0c: /* FF */
3595 case 0x0d: /* CR */
3596 case 0x85: /* NEL */
3597 case 0x2028: /* LINE SEPARATOR */
3598 case 0x2029: /* PARAGRAPH SEPARATOR */
3599 RRETURN(MATCH_NOMATCH);
3600 }
3601 break;
3602
3603 case OP_VSPACE:
3604 switch(c)
3605 {
3606 default: RRETURN(MATCH_NOMATCH);
3607 case 0x0a: /* LF */
3608 case 0x0b: /* VT */
3609 case 0x0c: /* FF */
3610 case 0x0d: /* CR */
3611 case 0x85: /* NEL */
3612 case 0x2028: /* LINE SEPARATOR */
3613 case 0x2029: /* PARAGRAPH SEPARATOR */
3614 break;
3615 }
3616 break;
3617
3618 case OP_NOT_DIGIT:
3619 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3620 RRETURN(MATCH_NOMATCH);
3621 break;
3622
3623 case OP_DIGIT:
3624 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3625 RRETURN(MATCH_NOMATCH);
3626 break;
3627
3628 case OP_NOT_WHITESPACE:
3629 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3630 RRETURN(MATCH_NOMATCH);
3631 break;
3632
3633 case OP_WHITESPACE:
3634 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3635 RRETURN(MATCH_NOMATCH);
3636 break;
3637
3638 case OP_NOT_WORDCHAR:
3639 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3640 RRETURN(MATCH_NOMATCH);
3641 break;
3642
3643 case OP_WORDCHAR:
3644 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3645 RRETURN(MATCH_NOMATCH);
3646 break;
3647
3648 default:
3649 RRETURN(PCRE_ERROR_INTERNAL);
3650 }
3651 }
3652 }
3653 else
3654 #endif
3655 /* Not UTF-8 mode */
3656 {
3657 for (fi = min;; fi++)
3658 {
3659 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3660 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3661 if (fi >= max || eptr >= md->end_subject ||
3662 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3663 RRETURN(MATCH_NOMATCH);
3664
3665 c = *eptr++;
3666 switch(ctype)
3667 {
3668 case OP_ANY: /* This is the non-NL case */
3669 case OP_ALLANY:
3670 case OP_ANYBYTE:
3671 break;
3672
3673 case OP_ANYNL:
3674 switch(c)
3675 {
3676 default: RRETURN(MATCH_NOMATCH);
3677 case 0x000d:
3678 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3679 break;
3680
3681 case 0x000a:
3682 break;
3683
3684 case 0x000b:
3685 case 0x000c:
3686 case 0x0085:
3687 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3688 break;
3689 }
3690 break;
3691
3692 case OP_NOT_HSPACE:
3693 switch(c)
3694 {
3695 default: break;
3696 case 0x09: /* HT */
3697 case 0x20: /* SPACE */
3698 case 0xa0: /* NBSP */
3699 RRETURN(MATCH_NOMATCH);
3700 }
3701 break;
3702
3703 case OP_HSPACE:
3704 switch(c)
3705 {
3706 default: RRETURN(MATCH_NOMATCH);
3707 case 0x09: /* HT */
3708 case 0x20: /* SPACE */
3709 case 0xa0: /* NBSP */
3710 break;
3711 }
3712 break;
3713
3714 case OP_NOT_VSPACE:
3715 switch(c)
3716 {
3717 default: break;
3718 case 0x0a: /* LF */
3719 case 0x0b: /* VT */
3720 case 0x0c: /* FF */
3721 case 0x0d: /* CR */
3722 case 0x85: /* NEL */
3723 RRETURN(MATCH_NOMATCH);
3724 }
3725 break;
3726
3727 case OP_VSPACE:
3728 switch(c)
3729 {
3730 default: RRETURN(MATCH_NOMATCH);
3731 case 0x0a: /* LF */
3732 case 0x0b: /* VT */
3733 case 0x0c: /* FF */
3734 case 0x0d: /* CR */
3735 case 0x85: /* NEL */
3736 break;
3737 }
3738 break;
3739
3740 case OP_NOT_DIGIT:
3741 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3742 break;
3743
3744 case OP_DIGIT:
3745 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3746 break;
3747
3748 case OP_NOT_WHITESPACE:
3749 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3750 break;
3751
3752 case OP_WHITESPACE:
3753 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3754 break;
3755
3756 case OP_NOT_WORDCHAR:
3757 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3758 break;
3759
3760 case OP_WORDCHAR:
3761 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3762 break;
3763
3764 default:
3765 RRETURN(PCRE_ERROR_INTERNAL);
3766 }
3767 }
3768 }
3769 /* Control never gets here */
3770 }
3771
3772 /* If maximizing, it is worth using inline code for speed, doing the type
3773 test once at the start (i.e. keep it out of the loop). Again, keep the
3774 UTF-8 and UCP stuff separate. */
3775
3776 else
3777 {
3778 pp = eptr; /* Remember where we started */
3779
3780 #ifdef SUPPORT_UCP
3781 if (prop_type >= 0)
3782 {
3783 switch(prop_type)
3784 {
3785 case PT_ANY:
3786 for (i = min; i < max; i++)
3787 {
3788 int len = 1;
3789 if (eptr >= md->end_subject) break;
3790 GETCHARLEN(c, eptr, len);
3791 if (prop_fail_result) break;
3792 eptr+= len;
3793 }
3794 break;
3795
3796 case PT_LAMP:
3797 for (i = min; i < max; i++)
3798 {
3799 int len = 1;
3800 if (eptr >= md->end_subject) break;
3801 GETCHARLEN(c, eptr, len);
3802 prop_chartype = UCD_CHARTYPE(c);
3803 if ((prop_chartype == ucp_Lu ||
3804 prop_chartype == ucp_Ll ||
3805 prop_chartype == ucp_Lt) == prop_fail_result)
3806 break;
3807 eptr+= len;
3808 }
3809 break;
3810
3811 case PT_GC:
3812 for (i = min; i < max; i++)
3813 {
3814 int len = 1;
3815 if (eptr >= md->end_subject) break;
3816 GETCHARLEN(c, eptr, len);
3817 prop_category = UCD_CATEGORY(c);
3818 if ((prop_category == prop_value) == prop_fail_result)
3819 break;
3820 eptr+= len;
3821 }
3822 break;
3823
3824 case PT_PC:
3825 for (i = min; i < max; i++)
3826 {
3827 int len = 1;
3828 if (eptr >= md->end_subject) break;
3829 GETCHARLEN(c, eptr, len);
3830 prop_chartype = UCD_CHARTYPE(c);
3831 if ((prop_chartype == prop_value) == prop_fail_result)
3832 break;
3833 eptr+= len;
3834 }
3835 break;
3836
3837 case PT_SC:
3838 for (i = min; i < max; i++)
3839 {
3840 int len = 1;
3841 if (eptr >= md->end_subject) break;
3842 GETCHARLEN(c, eptr, len);
3843 prop_script = UCD_SCRIPT(c);
3844 if ((prop_script == prop_value) == prop_fail_result)
3845 break;
3846 eptr+= len;
3847 }
3848 break;
3849 }
3850
3851 /* eptr is now past the end of the maximum run */
3852
3853 if (possessive) continue;
3854 for(;;)
3855 {
3856 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3857 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3858 if (eptr-- == pp) break; /* Stop if tried at original pos */
3859 if (utf8) BACKCHAR(eptr);
3860 }
3861 }
3862
3863 /* Match extended Unicode sequences. We will get here only if the
3864 support is in the binary; otherwise a compile-time error occurs. */
3865
3866 else if (ctype == OP_EXTUNI)
3867 {
3868 for (i = min; i < max; i++)
3869 {
3870 if (eptr >= md->end_subject) break;
3871 GETCHARINCTEST(c, eptr);
3872 prop_category = UCD_CATEGORY(c);
3873 if (prop_category == ucp_M) break;
3874 while (eptr < md->end_subject)
3875 {
3876 int len = 1;
3877 if (!utf8) c = *eptr; else
3878 {
3879 GETCHARLEN(c, eptr, len);
3880 }
3881 prop_category = UCD_CATEGORY(c);
3882 if (prop_category != ucp_M) break;
3883 eptr += len;
3884 }
3885 }
3886
3887 /* eptr is now past the end of the maximum run */
3888
3889 if (possessive) continue;
3890 for(;;)
3891 {
3892 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3894 if (eptr-- == pp) break; /* Stop if tried at original pos */
3895 for (;;) /* Move back over one extended */
3896 {
3897 int len = 1;
3898 if (!utf8) c = *eptr; else
3899 {
3900 BACKCHAR(eptr);
3901 GETCHARLEN(c, eptr, len);
3902 }
3903 prop_category = UCD_CATEGORY(c);
3904 if (prop_category != ucp_M) break;
3905 eptr--;
3906 }
3907 }
3908 }
3909
3910 else
3911 #endif /* SUPPORT_UCP */
3912
3913 #ifdef SUPPORT_UTF8
3914 /* UTF-8 mode */
3915
3916 if (utf8)
3917 {
3918 switch(ctype)
3919 {
3920 case OP_ANY:
3921 if (max < INT_MAX)
3922 {
3923 for (i = min; i < max; i++)
3924 {
3925 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3926 eptr++;
3927 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3928 }
3929 }
3930
3931 /* Handle unlimited UTF-8 repeat */
3932
3933 else
3934 {
3935 for (i = min; i < max; i++)
3936 {
3937 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3938 eptr++;
3939 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3940 }
3941 }
3942 break;
3943
3944 case OP_ALLANY:
3945 if (max < INT_MAX)
3946 {
3947 for (i = min; i < max; i++)
3948 {
3949 if (eptr >= md->end_subject) break;
3950 eptr++;
3951 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3952 }
3953 }
3954 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3955 break;
3956
3957 /* The byte case is the same as non-UTF8 */
3958
3959 case OP_ANYBYTE:
3960 c = max - min;
3961 if (c > (unsigned int)(md->end_subject - eptr))
3962 c = md->end_subject - eptr;
3963 eptr += c;
3964 break;
3965
3966 case OP_ANYNL:
3967 for (i = min; i < max; i++)
3968 {
3969 int len = 1;
3970 if (eptr >= md->end_subject) break;
3971 GETCHARLEN(c, eptr, len);
3972 if (c == 0x000d)
3973 {
3974 if (++eptr >= md->end_subject) break;
3975 if (*eptr == 0x000a) eptr++;
3976 }
3977 else
3978 {
3979 if (c != 0x000a &&
3980 (md->bsr_anycrlf ||
3981 (c != 0x000b && c != 0x000c &&
3982 c != 0x0085 && c != 0x2028 && c != 0x2029)))
3983 break;
3984 eptr += len;
3985 }
3986 }
3987 break;
3988
3989 case OP_NOT_HSPACE:
3990 case OP_HSPACE:
3991 for (i = min; i < max; i++)
3992 {
3993 BOOL gotspace;
3994 int len = 1;
3995 if (eptr >= md->end_subject) break;
3996 GETCHARLEN(c, eptr, len);
3997 switch(c)
3998 {
3999 default: gotspace = FALSE; break;
4000 case 0x09: /* HT */
4001 case 0x20: /* SPACE */
4002 case 0xa0: /* NBSP */
4003 case 0x1680: /* OGHAM SPACE MARK */
4004 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4005 case 0x2000: /* EN QUAD */
4006 case 0x2001: /* EM QUAD */
4007 case 0x2002: /* EN SPACE */
4008 case 0x2003: /* EM SPACE */
4009 case 0x2004: /* THREE-PER-EM SPACE */
4010 case 0x2005: /* FOUR-PER-EM SPACE */
4011 case 0x2006: /* SIX-PER-EM SPACE */
4012 case 0x2007: /* FIGURE SPACE */
4013 case 0x2008: /* PUNCTUATION SPACE */
4014 case 0x2009: /* THIN SPACE */
4015 case 0x200A: /* HAIR SPACE */
4016 case 0x202f: /* NARROW NO-BREAK SPACE */
4017 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4018 case 0x3000: /* IDEOGRAPHIC SPACE */
4019 gotspace = TRUE;
4020 break;
4021 }
4022 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4023 eptr += len;
4024 }
4025 break;
4026
4027 case OP_NOT_VSPACE:
4028 case OP_VSPACE:
4029 for (i = min; i < max; i++)
4030 {
4031 BOOL gotspace;
4032 int len = 1;
4033 if (eptr >= md->end_subject) break;
4034 GETCHARLEN(c, eptr, len);
4035 switch(c)
4036 {
4037 default: gotspace = FALSE; break;
4038 case 0x0a: /* LF */
4039 case 0x0b: /* VT */
4040 case 0x0c: /* FF */
4041 case 0x0d: /* CR */
4042 case 0x85: /* NEL */
4043 case 0x2028: /* LINE SEPARATOR */
4044 case 0x2029: /* PARAGRAPH SEPARATOR */
4045 gotspace = TRUE;
4046 break;
4047 }
4048 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4049 eptr += len;
4050 }
4051 break;
4052
4053 case OP_NOT_DIGIT:
4054 for (i = min; i < max; i++)
4055 {
4056 int len = 1;
4057 if (eptr >= md->end_subject) break;
4058 GETCHARLEN(c, eptr, len);
4059 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4060 eptr+= len;
4061 }
4062 break;
4063
4064 case OP_DIGIT:
4065 for (i = min; i < max; i++)
4066 {
4067 int len = 1;
4068 if (eptr >= md->end_subject) break;
4069 GETCHARLEN(c, eptr, len);
4070 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4071 eptr+= len;
4072 }
4073 break;
4074
4075 case OP_NOT_WHITESPACE:
4076 for (i = min; i < max; i++)
4077 {
4078 int len = 1;
4079 if (eptr >= md->end_subject) break;
4080 GETCHARLEN(c, eptr, len);
4081 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4082 eptr+= len;
4083 }
4084 break;
4085
4086 case OP_WHITESPACE:
4087 for (i = min; i < max; i++)
4088 {
4089 int len = 1;
4090 if (eptr >= md->end_subject) break;
4091 GETCHARLEN(c, eptr, len);
4092 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4093 eptr+= len;
4094 }
4095 break;
4096
4097 case OP_NOT_WORDCHAR:
4098 for (i = min; i < max; i++)
4099 {
4100 int len = 1;
4101 if (eptr >= md->end_subject) break;
4102 GETCHARLEN(c, eptr, len);
4103 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4104 eptr+= len;
4105 }
4106 break;
4107
4108 case OP_WORDCHAR:
4109 for (i = min; i < max; i++)
4110 {
4111 int len = 1;
4112 if (eptr >= md->end_subject) break;
4113 GETCHARLEN(c, eptr, len);
4114 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4115 eptr+= len;
4116 }
4117 break;
4118
4119 default:
4120 RRETURN(PCRE_ERROR_INTERNAL);
4121 }
4122
4123 /* eptr is now past the end of the maximum run */
4124
4125 if (possessive) continue;
4126 for(;;)
4127 {
4128 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4129 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4130 if (eptr-- == pp) break; /* Stop if tried at original pos */
4131 BACKCHAR(eptr);
4132 }
4133 }
4134 else
4135 #endif /* SUPPORT_UTF8 */
4136
4137 /* Not UTF-8 mode */
4138 {
4139 switch(ctype)
4140 {
4141 case OP_ANY:
4142 for (i = min; i < max; i++)
4143 {
4144 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4145 eptr++;
4146 }
4147 break;
4148
4149 case OP_ALLANY:
4150 case OP_ANYBYTE:
4151 c = max - min;
4152 if (c > (unsigned int)(md->end_subject - eptr))
4153 c = md->end_subject - eptr;
4154 eptr += c;
4155 break;
4156
4157 case OP_ANYNL:
4158 for (i = min; i < max; i++)
4159 {
4160 if (eptr >= md->end_subject) break;
4161 c = *eptr;
4162 if (c == 0x000d)
4163 {
4164 if (++eptr >= md->end_subject) break;
4165 if (*eptr == 0x000a) eptr++;
4166 }
4167 else
4168 {
4169 if (c != 0x000a &&
4170 (md->bsr_anycrlf ||
4171 (c != 0x000b && c != 0x000c && c != 0x0085)))
4172 break;
4173 eptr++;
4174 }
4175 }
4176 break;
4177
4178 case OP_NOT_HSPACE:
4179 for (i = min; i < max; i++)
4180 {
4181 if (eptr >= md->end_subject) break;
4182 c = *eptr;
4183 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4184 eptr++;
4185 }
4186 break;
4187
4188 case OP_HSPACE:
4189 for (i = min; i < max; i++)
4190 {
4191 if (eptr >= md->end_subject) break;
4192 c = *eptr;
4193 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4194 eptr++;
4195 }
4196 break;
4197
4198 case OP_NOT_VSPACE:
4199 for (i = min; i < max; i++)
4200 {
4201 if (eptr >= md->end_subject) break;
4202 c = *eptr;
4203 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4204 break;
4205 eptr++;
4206 }
4207 break;
4208
4209 case OP_VSPACE:
4210 for (i = min; i < max; i++)
4211 {
4212 if (eptr >= md->end_subject) break;
4213 c = *eptr;
4214 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4215 break;
4216 eptr++;
4217 }
4218 break;
4219
4220 case OP_NOT_DIGIT:
4221 for (i = min; i < max; i++)
4222 {
4223 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4224 break;
4225 eptr++;
4226 }
4227 break;
4228
4229 case OP_DIGIT:
4230 for (i = min; i < max; i++)
4231 {
4232 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4233 break;
4234 eptr++;
4235 }
4236 break;
4237
4238 case OP_NOT_WHITESPACE:
4239 for (i = min; i < max; i++)
4240 {
4241 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4242 break;
4243 eptr++;
4244 }
4245 break;
4246
4247 case OP_WHITESPACE:
4248 for (i = min; i < max; i++)
4249 {
4250 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4251 break;
4252 eptr++;
4253 }
4254 break;
4255
4256 case OP_NOT_WORDCHAR:
4257 for (i = min; i < max; i++)
4258 {
4259 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4260 break;
4261 eptr++;
4262 }
4263 break;
4264
4265 case OP_WORDCHAR:
4266 for (i = min; i < max; i++)
4267 {
4268 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4269 break;
4270 eptr++;
4271 }
4272 break;
4273
4274 default:
4275 RRETURN(PCRE_ERROR_INTERNAL);
4276 }
4277
4278 /* eptr is now past the end of the maximum run */
4279
4280 if (possessive) continue;
4281 while (eptr >= pp)
4282 {
4283 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4284 eptr--;
4285 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4286 }
4287 }
4288
4289 /* Get here if we can't make it match with any permitted repetitions */
4290
4291 RRETURN(MATCH_NOMATCH);
4292 }
4293 /* Control never gets here */
4294
4295 /* There's been some horrible disaster. Arrival here can only mean there is
4296 something seriously wrong in the code above or the OP_xxx definitions. */
4297
4298 default:
4299 DPRINTF(("Unknown opcode %d\n", *ecode));
4300 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4301 }
4302
4303 /* Do not stick any code in here without much thought; it is assumed
4304 that "continue" in the code above comes out to here to repeat the main
4305 loop. */
4306
4307 } /* End of main loop */
4308 /* Control never reaches here */
4309
4310
4311 /* When compiling to use the heap rather than the stack for recursive calls to
4312 match(), the RRETURN() macro jumps here. The number that is saved in
4313 frame->Xwhere indicates which label we actually want to return to. */
4314
4315 #ifdef NO_RECURSE
4316 #define LBL(val) case val: goto L_RM##val;
4317 HEAP_RETURN:
4318 switch (frame->Xwhere)
4319 {
4320 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4321 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4322 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4323 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4324 LBL(53) LBL(54)
4325 #ifdef SUPPORT_UTF8
4326 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4327 LBL(32) LBL(34) LBL(42) LBL(46)
4328 #ifdef SUPPORT_UCP
4329 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4330 #endif /* SUPPORT_UCP */
4331 #endif /* SUPPORT_UTF8 */
4332 default:
4333 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4334 return PCRE_ERROR_INTERNAL;
4335 }
4336 #undef LBL
4337 #endif /* NO_RECURSE */
4338 }
4339
4340
4341 /***************************************************************************
4342 ****************************************************************************
4343 RECURSION IN THE match() FUNCTION
4344
4345 Undefine all the macros that were defined above to handle this. */
4346
4347 #ifdef NO_RECURSE /* heap-frame build only: retire the macro names that were set up earlier for match() */
4348 #undef eptr
4349 #undef ecode
4350 #undef mstart
4351 #undef offset_top
4352 #undef ims
4353 #undef eptrb
4354 #undef flags
4355 /* NOTE(review): the matching #defines are outside this excerpt; presumably each name aliased a slot in the heap frame (cf. frame->Xwhere in match()) - confirm */
4356 #undef callpat
4357 #undef charptr
4358 #undef data
4359 #undef next
4360 #undef pp
4361 #undef prev
4362 #undef saved_eptr
4363 /* NOTE(review): the blank-line grouping presumably mirrors the grouping of the original definitions - confirm before reordering */
4364 #undef new_recursive
4365
4366 #undef cur_is_word
4367 #undef condition
4368 #undef prev_is_word
4369
4370 #undef original_ims
4371
4372 #undef ctype
4373 #undef length
4374 #undef max
4375 #undef min
4376 #undef number
4377 #undef offset
4378 #undef op
4379 #undef save_capture_last
4380 #undef save_offset1
4381 #undef save_offset2
4382 #undef save_offset3
4383 #undef stacksave
4384 /* Several of these are very common identifiers (length, max, min, offset, ...); undefining them keeps them usable as ordinary names in pcre_exec() below */
4385 #undef newptrb
4386
4387 #endif /* NO_RECURSE */
4388
4389 /* fc and fi are defined as macros in both cases (with and without NO_RECURSE), so they are undefined outside the conditional */
4390
4391 #undef fc
4392 #undef fi
4393
4394 /***************************************************************************
4395 ***************************************************************************/
4396
4397
4398
4399 /*************************************************
4400 * Execute a Regular Expression *
4401 *************************************************/
4402
4403 /* This function applies a compiled re to a subject string and picks out
4404 portions of the string if it matches. Two elements in the vector are set for
4405 each substring: the offsets to the start and end of the substring.
4406
4407 Arguments:
4408 argument_re points to the compiled expression
4409 extra_data points to extra data or is NULL
4410 subject points to the subject string
4411 length length of subject string (may contain binary zeros)
4412 start_offset where to start in the subject string
4413 options option bits
4414 offsets points to a vector of ints to be filled in with offsets
4415 offsetcount the number of elements in the vector
4416
4417 Returns: > 0 => success; value is the number of elements filled in
4418 = 0 => success, but offsets is not big enough
4419 -1 => failed to match
4420 < -1 => some kind of unexpected problem
4421 */
4422
4423 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4424 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4425 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4426 int offsetcount)
4427 {
4428 int rc, resetcount, ocount;
4429 int first_byte = -1;
4430 int req_byte = -1;
4431 int req_byte2 = -1;
4432 int newline;
4433 unsigned long int ims;
4434 BOOL using_temporary_offsets = FALSE;
4435 BOOL anchored;
4436 BOOL startline;
4437 BOOL firstline;
4438 BOOL first_byte_caseless = FALSE;
4439 BOOL req_byte_caseless = FALSE;
4440 BOOL utf8;
4441 match_data match_block;
4442 match_data *md = &match_block;
4443 const uschar *tables;
4444 const uschar *start_bits = NULL;
4445 USPTR start_match = (USPTR)subject + start_offset;
4446 USPTR end_subject;
4447 USPTR req_byte_ptr = start_match - 1;
4448
4449 pcre_study_data internal_study;
4450 const pcre_study_data *study;
4451
4452 real_pcre internal_re;
4453 const real_pcre *external_re = (const real_pcre *)argument_re;
4454 const real_pcre *re = external_re;
4455
4456 /* Plausibility checks */
4457
4458 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4459 if (re == NULL || subject == NULL ||
4460 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4461 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4462
4463 /* Fish out the optional data from the extra_data structure, first setting
4464 the default values. */
4465
4466 study = NULL;
4467 md->match_limit = MATCH_LIMIT;
4468 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4469 md->callout_data = NULL;
4470
4471 /* The table pointer is always in native byte order. */
4472
4473 tables = external_re->tables;
4474
4475 if (extra_data != NULL)
4476 {
4477 register unsigned int flags = extra_data->flags;
4478 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4479 study = (const pcre_study_data *)extra_data->study_data;
4480 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4481 md->match_limit = extra_data->match_limit;
4482 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4483 md->match_limit_recursion = extra_data->match_limit_recursion;
4484 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4485 md->callout_data = extra_data->callout_data;
4486 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4487 }
4488
4489 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4490 is a feature that makes it possible to save compiled regex and re-use them
4491 in other programs later. */
4492
4493 if (tables == NULL) tables = _pcre_default_tables;
4494
4495 /* Check that the first field in the block is the magic number. If it is not,
4496 test for a regex that was compiled on a host of opposite endianness. If this is
4497 the case, flipped values are put in internal_re and internal_study if there was
4498 study data too. */
4499
4500 if (re->magic_number != MAGIC_NUMBER)
4501 {
4502 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4503 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4504 if (study != NULL) study = &internal_study;
4505 }
4506
4507 /* Set up other data */
4508
4509 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4510 startline = (re->flags & PCRE_STARTLINE) != 0;
4511 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4512
4513 /* The code starts after the real_pcre block and the capture name table. */
4514
4515 md->start_code = (const uschar *)external_re + re->name_table_offset +
4516 re->name_count * re->name_entry_size;
4517
4518 md->start_subject = (USPTR)subject;
4519 md->start_offset = start_offset;
4520 md->end_subject = md->start_subject + length;
4521 end_subject = md->end_subject;
4522
4523 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4524 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4525 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4526
4527 md->notbol = (options & PCRE_NOTBOL) != 0;
4528 md->noteol = (options & PCRE_NOTEOL) != 0;
4529 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4530 md->partial = (options & PCRE_PARTIAL) != 0;
4531 md->hitend = FALSE;
4532
4533 md->recursive = NULL; /* No recursion at top level */
4534
4535 md->lcc = tables + lcc_offset;
4536 md->ctypes = tables + ctypes_offset;
4537
4538 /* Handle different \R options. */
4539
4540 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4541 {
4542 case 0:
4543 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4544 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4545 else
4546 #ifdef BSR_ANYCRLF
4547 md->bsr_anycrlf = TRUE;
4548 #else
4549 md->bsr_anycrlf = FALSE;
4550 #endif
4551 break;
4552
4553 case PCRE_BSR_ANYCRLF:
4554 md->bsr_anycrlf = TRUE;
4555 break;
4556
4557 case PCRE_BSR_UNICODE:
4558 md->bsr_anycrlf = FALSE;
4559 break;
4560
4561 default: return PCRE_ERROR_BADNEWLINE;
4562 }
4563
4564 /* Handle different types of newline. The three bits give eight cases. If
4565 nothing is set at run time, whatever was used at compile time applies. */
4566
4567 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4568 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4569 {
4570 case 0: newline = NEWLINE; break; /* Compile-time default */
4571 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4572 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4573 case PCRE_NEWLINE_CR+
4574 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4575 case PCRE_NEWLINE_ANY: newline = -1; break;
4576 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4577 default: return PCRE_ERROR_BADNEWLINE;
4578 }
4579
4580 if (newline == -2)
4581 {
4582 md->nltype = NLTYPE_ANYCRLF;
4583 }
4584 else if (newline < 0)
4585 {
4586 md->nltype = NLTYPE_ANY;
4587 }
4588 else
4589 {
4590 md->nltype = NLTYPE_FIXED;
4591 if (newline > 255)
4592 {
4593 md->nllen = 2;
4594 md->nl[0] = (newline >> 8) & 255;
4595 md->nl[1] = newline & 255;
4596 }
4597 else
4598 {
4599 md->nllen = 1;
4600 md->nl[0] = newline;
4601 }
4602 }
4603
4604 /* Partial matching is supported only for a restricted set of regexes at the
4605 moment. */
4606
4607 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4608 return PCRE_ERROR_BADPARTIAL;
4609
4610 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4611 back the character offset. */
4612
4613 #ifdef SUPPORT_UTF8
4614 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4615 {
4616 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4617 return PCRE_ERROR_BADUTF8;
4618 if (start_offset > 0 && start_offset < length)
4619 {
4620 int tb = ((uschar *)subject)[start_offset];
4621 if (tb > 127)
4622 {
4623 tb &= 0xc0;
4624 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4625 }
4626 }
4627 }
4628 #endif
4629
4630 /* The ims options can vary during the matching as a result of the presence
4631 of (?ims) items in the pattern. They are kept in a local variable so that
4632 restoring at the exit of a group is easy. */
4633
4634 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4635
4636 /* If the expression has got more back references than the offsets supplied can
4637 hold, we get a temporary chunk of working store to use during the matching.
4638 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4639 of 3. */
4640
4641 ocount = offsetcount - (offsetcount % 3);
4642
4643 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4644 {
4645 ocount = re->top_backref * 3 + 3;
4646 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4647 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4648 using_temporary_offsets = TRUE;
4649 DPRINTF(("Got memory to hold back references\n"));
4650 }
4651 else md->offset_vector = offsets;
4652
4653 md->offset_end = ocount;
4654 md->offset_max = (2*ocount)/3;
4655 md->offset_overflow = FALSE;
4656 md->capture_last = -1;
4657
4658 /* Compute the minimum number of offsets that we need to reset each time. Doing
4659 this makes a huge difference to execution time when there aren't many brackets
4660 in the pattern. */
4661
4662 resetcount = 2 + re->top_bracket * 2;
4663 if (resetcount > offsetcount) resetcount = ocount;
4664
4665 /* Reset the working variable associated with each extraction. These should
4666 never be used unless previously set, but they get saved and restored, and so we
4667 initialize them to avoid reading uninitialized locations. */
4668
4669 if (md->offset_vector != NULL)
4670 {
4671 register int *iptr = md->offset_vector + ocount;
4672 register int *iend = iptr - resetcount/2 + 1;
4673 while (--iptr >= iend) *iptr = -1;
4674 }
4675
4676 /* Set up the first character to match, if available. The first_byte value is
4677 never set for an anchored regular expression, but the anchoring may be forced
4678 at run time, so we have to test for anchoring. The first char may be unset for
4679 an unanchored pattern, of course. If there's no first char and the pattern was
4680 studied, there may be a bitmap of possible first characters. */
4681
4682 if (!anchored)
4683 {
4684 if ((re->flags & PCRE_FIRSTSET) != 0)
4685 {
4686 first_byte = re->first_byte & 255;
4687 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4688 first_byte = md->lcc[first_byte];
4689 }
4690 else
4691 if (!startline && study != NULL &&
4692 (study->options & PCRE_STUDY_MAPPED) != 0)
4693 start_bits = study->start_bits;
4694 }
4695
4696 /* For anchored or unanchored matches, there may be a "last known required
4697 character" set. */
4698
4699 if ((re->flags & PCRE_REQCHSET) != 0)
4700 {
4701 req_byte = re->req_byte & 255;
4702 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4703 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4704 }
4705
4706
4707 /* ==========================================================================*/
4708
4709 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4710 the loop runs just once. */
4711
4712 for(;;)
4713 {
4714 USPTR save_end_subject = end_subject;
4715 USPTR new_start_match;
4716
4717 /* Reset the maximum number of extractions we might see. */
4718
4719 if (md->offset_vector != NULL)
4720 {
4721 register int *iptr = md->offset_vector;
4722 register int *iend = iptr + resetcount;
4723 while (iptr < iend) *iptr++ = -1;
4724 }
4725
4726 /* If firstline is TRUE, the start of the match is constrained to the first
4727 line of a multiline string. That is, the match must be before or at the first
4728 newline. Implement this by temporarily adjusting end_subject so that we stop
4729 scanning at a newline. If the match fails at the newline, later code breaks
4730 this loop. */
4731
4732 if (firstline)
4733 {
4734 USPTR t = start_match;
4735 #ifdef SUPPORT_UTF8
4736 if (utf8)
4737 {
4738 while (t < md->end_subject && !IS_NEWLINE(t))
4739 {
4740 t++;
4741 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
4742 }
4743 }
4744 else
4745 #endif
4746 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4747 end_subject = t;
4748 }
4749
4750 /* There are some optimizations that avoid running the match if a known
4751 starting point is not found, or if a known later character is not present.
4752 However, there is an option that disables these, for testing and for ensuring
4753 that all callouts do actually occur. */
4754
4755 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
4756 {
4757 /* Advance to a unique first byte if there is one. */
4758
4759 if (first_byte >= 0)
4760 {
4761 if (first_byte_caseless)
4762 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
4763 start_match++;
4764 else
4765 while (start_match < end_subject && *start_match != first_byte)
4766 start_match++;
4767 }
4768
4769 /* Or to just after a linebreak for a multiline match */
4770
4771 else if (startline)
4772 {
4773 if (start_match > md->start_subject + start_offset)
4774 {
4775 #ifdef SUPPORT_UTF8
4776 if (utf8)
4777 {
4778 while (start_match < end_subject && !WAS_NEWLINE(start_match))
4779 {
4780 start_match++;
4781 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4782 start_match++;
4783 }
4784 }
4785 else
4786 #endif
4787 while (start_match < end_subject && !WAS_NEWLINE(start_match))
4788 start_match++;
4789
4790 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4791 and we are now at a LF, advance the match position by one more character.
4792 */
4793
4794 if (start_match[-1] == CHAR_CR &&
4795 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4796 start_match < end_subject &&
4797 *start_match == CHAR_NL)
4798 start_match++;
4799 }
4800 }
4801
4802 /* Or to a non-unique first byte after study */
4803
4804 else if (start_bits != NULL)
4805 {
4806 while (start_match < end_subject)
4807 {
4808 register unsigned int c = *start_match;
4809 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
4810 else break;
4811 }
4812 }
4813 } /* Starting optimizations */
4814
4815 /* Restore fudged end_subject */
4816
4817 end_subject = save_end_subject;
4818
4819 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4820 printf(">>>> Match against: ");
4821 pchars(start_match, end_subject - start_match, TRUE, md);
4822 printf("\n");
4823 #endif
4824
4825 /* If req_byte is set, we know that that character must appear in the
4826 subject for the match to succeed. If the first character is set, req_byte
4827 must be later in the subject; otherwise the test starts at the match point.
4828 This optimization can save a huge amount of backtracking in patterns with
4829 nested unlimited repeats that aren't going to match. Writing separate code
4830 for cased/caseless versions makes it go faster, as does using an
4831 autoincrement and backing off on a match.
4832
4833 HOWEVER: when the subject string is very, very long, searching to its end
4834 can take a long time, and give bad performance on quite ordinary patterns.
4835 This showed up when somebody was matching something like /^\d+C/ on a
4836 32-megabyte string... so we don't do this when the string is sufficiently
4837 long.
4838
4839 ALSO: this processing is disabled when partial matching is requested, or if
4840 disabling is explicitly requested. */
4841
4842 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
4843 req_byte >= 0 &&
4844 end_subject - start_match < REQ_BYTE_MAX &&
4845 !md->partial)
4846 {
4847 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4848
4849 /* We don't need to repeat the search if we haven't yet reached the
4850 place we found it at last time. */
4851
4852 if (p > req_byte_ptr)
4853 {
4854 if (req_byte_caseless)
4855 {
4856 while (p < end_subject)
4857 {
4858 register int pp = *p++;
4859 if (pp == req_byte || pp == req_byte2) { p--; break; }
4860 }
4861 }
4862 else
4863 {
4864 while (p < end_subject)
4865 {
4866 if (*p++ == req_byte) { p--; break; }
4867 }
4868 }
4869
4870 /* If we can't find the required character, break the matching loop,
4871 forcing a match failure. */
4872
4873 if (p >= end_subject)
4874 {
4875 rc = MATCH_NOMATCH;
4876 break;
4877 }
4878
4879 /* If we have found the required character, save the point where we
4880 found it, so that we don't search again next time round the loop if
4881 the start hasn't passed this character yet. */
4882
4883 req_byte_ptr = p;
4884 }
4885 }
4886
4887 /* OK, we can now run the match. */
4888
4889 md->start_match_ptr = start_match;
4890 md->match_call_count = 0;
4891 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4892
4893 switch(rc)
4894 {
4895 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4896 exactly like PRUNE. */
4897
4898 case MATCH_NOMATCH:
4899 case MATCH_PRUNE:
4900 case MATCH_THEN:
4901 new_start_match = start_match + 1;
4902 #ifdef SUPPORT_UTF8
4903 if (utf8)
4904 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4905 new_start_match++;
4906 #endif
4907 break;
4908
4909 /* SKIP passes back the next starting point explicitly. */
4910
4911 case MATCH_SKIP:
4912 new_start_match = md->start_match_ptr;
4913 break;
4914
4915 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4916
4917 case MATCH_COMMIT:
4918 rc = MATCH_NOMATCH;
4919 goto ENDLOOP;
4920
4921 /* Any other return is some kind of error. */
4922
4923 default:
4924 goto ENDLOOP;
4925 }
4926
4927 /* Control reaches here for the various types of "no match at this point"
4928 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4929
4930 rc = MATCH_NOMATCH;
4931
4932 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4933 newline in the subject (though it may continue over the newline). Therefore,
4934 if we have just failed to match, starting at a newline, do not continue. */
4935
4936 if (firstline && IS_NEWLINE(start_match)) break;
4937
4938 /* Advance to new matching position */
4939
4940 start_match = new_start_match;
4941
4942 /* Break the loop if the pattern is anchored or if we have passed the end of
4943 the subject. */
4944
4945 if (anchored || start_match > end_subject) break;
4946
4947 /* If we have just passed a CR and we are now at a LF, and the pattern does
4948 not contain any explicit matches for \r or \n, and the newline option is CRLF
4949 or ANY or ANYCRLF, advance the match position by one more character. */
4950
4951 if (start_match[-1] == CHAR_CR &&
4952 start_match < end_subject &&
4953 *start_match == CHAR_NL &&
4954 (re->flags & PCRE_HASCRORLF) == 0 &&
4955 (md->nltype == NLTYPE_ANY ||
4956 md->nltype == NLTYPE_ANYCRLF ||
4957 md->nllen == 2))
4958 start_match++;
4959
4960 } /* End of for(;;) "bumpalong" loop */
4961
4962 /* ==========================================================================*/
4963
4964 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4965 conditions is true:
4966
4967 (1) The pattern is anchored or the match was failed by (*COMMIT);
4968
4969 (2) We are past the end of the subject;
4970
4971 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4972 this option requests that a match occur at or before the first newline in
4973 the subject.
4974
4975 When we have a match and the offset vector is big enough to deal with any
4976 backreferences, captured substring offsets will already be set up. In the case
4977 where we had to get some local store to hold offsets for backreference
4978 processing, copy those that we can. In this case there need not be overflow if
4979 certain parts of the pattern were not used, even though there are more
4980 capturing parentheses than vector slots. */
4981
4982 ENDLOOP:
4983
4984 if (rc == MATCH_MATCH)
4985 {
4986 if (using_temporary_offsets)
4987 {
4988 if (offsetcount >= 4)
4989 {
4990 memcpy(offsets + 2, md->offset_vector + 2,
4991 (offsetcount - 2) * sizeof(int));
4992 DPRINTF(("Copied offsets from temporary memory\n"));
4993 }
4994 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4995 DPRINTF(("Freeing temporary memory\n"));
4996 (pcre_free)(md->offset_vector);
4997 }
4998
4999 /* Set the return code to the number of captured strings, or 0 if there are
5000 too many to fit into the vector. */
5001
5002 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5003
5004 /* If there is space, set up the whole thing as substring 0. The value of
5005 md->start_match_ptr might be modified if \K was encountered on the successful
5006 matching path. */
5007
5008 if (offsetcount < 2) rc = 0; else
5009 {
5010 offsets[0] = md->start_match_ptr - md->start_subject;
5011 offsets[1] = md->end_match_ptr - md->start_subject;
5012 }
5013
5014 DPRINTF((">>>> returning %d\n", rc));
5015 return rc;
5016 }
5017
5018 /* Control gets here if there has been an error, or if the overall match
5019 attempt has failed at all permitted starting positions. */
5020
5021 if (using_temporary_offsets)
5022 {
5023 DPRINTF(("Freeing temporary memory\n"));
5024 (pcre_free)(md->offset_vector);
5025 }
5026
5027 if (rc != MATCH_NOMATCH)
5028 {
5029 DPRINTF((">>>> error: returning %d\n", rc));
5030 return rc;
5031 }
5032 else if (md->partial && md->hitend)
5033 {
5034 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5035 return PCRE_ERROR_PARTIAL;
5036 }
5037 else
5038 {
5039 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5040 return PCRE_ERROR_NOMATCH;
5041 }
5042 }
5043
5044 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12