/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 341 - (show annotations) (download)
Sat Apr 19 16:41:04 2008 UTC (6 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 151712 byte(s)
Fix DFA (?!) bug; add support for JavaScript empty classes.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Bits that may be set in the "flags" argument of match(). */

#define match_condassert   0x01   /* Evaluating the assertion of a condition */
#define match_cbegroup     0x02   /* Unlimited repeat group that may be empty */

/* Ordinary (non-error) results of match(). The externally defined
PCRE_ERROR_xxx codes are all negative, so these two cannot collide with them. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Internal-only results of match(). They are chosen far enough below zero
that they stay clear of the external error codes. */

#define MATCH_COMMIT       (-999)
#define MATCH_PRUNE        (-998)
#define MATCH_SKIP         (-997)
#define MATCH_THEN         (-996)

/* Upper bound on the number of offset-vector ints saved on the stack across a
recursive call; a larger offset vector is saved via malloc instead. Keep this a
multiple of 3, because the offset vector length is always a multiple of 3. */

#define REC_STACK_SAVE_MAX 30
84
/* Parallel tables giving the minimum and maximum repeat counts for the common
repeats. A maximum of 0 stands for "no upper limit". */

static const char rep_min[6] = { 0, 0,  1, 1,  0, 0 };
static const char rep_max[6] = { 0, 0,  0, 0,  1, 1 };
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caselesss case for speed */
162
163 if ((ims & PCRE_CASELESS) != 0)
164 {
165 while (length-- > 0)
166 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
167 }
168 else
169 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
170
171 return TRUE;
172 }
173
174
175
176 /***************************************************************************
177 ****************************************************************************
178 RECURSION IN THE match() FUNCTION
179
180 The match() function is highly recursive, though not every recursive call
181 increases the recursive depth. Nevertheless, some regular expressions can cause
182 it to recurse to a great depth. I was writing for Unix, so I just let it call
183 itself recursively. This uses the stack for saving everything that has to be
184 saved for a recursive call. On Unix, the stack can be large, and this works
185 fine.
186
187 It turns out that on some non-Unix-like systems there are problems with
188 programs that use a lot of stack. (This despite the fact that every last chip
189 has oodles of memory these days, and techniques for extending the stack have
190 been known for decades.) So....
191
192 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
193 calls by keeping local variables that need to be preserved in blocks of memory
194 obtained from malloc() instead of on the stack. Macros are used to
195 achieve this so that the actual code doesn't look very different to what it
196 always used to.
197
198 The original heap-recursive code used longjmp(). However, it seems that this
199 can be very slow on some operating systems. Following a suggestion from Stan
200 Switzer, the use of longjmp() has been abolished, at the cost of having to
201 provide a unique number for each call to RMATCH. There is no way of generating
202 a sequence of numbers at compile time in C. I have given them names, to make
203 them stand out more clearly.
204
205 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
206 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
207 tests. Furthermore, not using longjmp() means that local dynamic variables
208 don't have indeterminate values; this has meant that the frame size can be
209 reduced because the result can be "passed back" by straight setting of the
210 variable instead of being passed in the frame.
211 ****************************************************************************
212 ***************************************************************************/
213
214 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
215 below must be updated in sync. */
216
/* One distinct return-point number per RMATCH call site, counting up from 1. */

enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18,
  RM19, RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27,
  RM28, RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36,
  RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45,
  RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54
};
223
224 /* These versions of the macros use the stack, as normal. There are debugging
225 versions and production versions. Note that the "rw" argument of RMATCH isn't
226 actually used in this definition. */
227
228 #ifndef NO_RECURSE
229 #define REGISTER register
230
#ifndef DEBUG

/* Production versions. RMATCH is a direct recursive call to match() whose
result is left in rrc; the current mstart is passed through and the depth is
rdepth+1. The rw argument is accepted but ignored. RRETURN is a plain return. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)

#define RRETURN(ra) return ra

#else  /* DEBUG */

/* Debugging versions. Same effect as the production versions, with trace
output written around the call and before the return. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#endif  /* DEBUG */
248
249 #else
250
251
252 /* These versions of the macros manage a private stack on the heap. Note that
253 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
254 argument of match(), which never changes. */
255
256 #define REGISTER
257
/* RMATCH: simulate a recursive call of match() using a heap frame.

  ra   subject pointer for the new frame (Xeptr)
  rb   code pointer for the new frame (Xecode)
  rc   offset_top for the new frame
  rd   the md argument; not used here
  re   ims for the new frame
  rf   eptrb for the new frame
  rg   flags for the new frame
  rw   one of the RMn numbers; stored in the calling frame's Xwhere and used to
       form the label L_<rw> at which execution resumes after the "call"

The new frame is obtained from pcre_stack_malloc(). If that fails, the macro
does not touch the NULL pointer: it leaves via RRETURN(PCRE_ERROR_NOMEMORY),
which frees the current frame and passes the negative error code back to the
calling frame (or out of match() if this is the top-level frame). */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL)\
    {\
    RRETURN(PCRE_ERROR_NOMEMORY);\
    }\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN: simulate a return from match(). The current frame is released with
pcre_stack_free() and its predecessor becomes current. If there is a
predecessor, the result is left in rrc and control goes to HEAP_RETURN;
otherwise this was the top-level frame and the value is really returned. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
290
291
292 /* Structure for remembering the local variables in a private frame */
293
294 typedef struct heapframe {
295 struct heapframe *Xprevframe;
296
297 /* Function arguments that may change */
298
299 const uschar *Xeptr;
300 const uschar *Xecode;
301 const uschar *Xmstart;
302 int Xoffset_top;
303 long int Xims;
304 eptrblock *Xeptrb;
305 int Xflags;
306 unsigned int Xrdepth;
307
308 /* Function local variables */
309
310 const uschar *Xcallpat;
311 const uschar *Xcharptr;
312 const uschar *Xdata;
313 const uschar *Xnext;
314 const uschar *Xpp;
315 const uschar *Xprev;
316 const uschar *Xsaved_eptr;
317
318 recursion_info Xnew_recursive;
319
320 BOOL Xcur_is_word;
321 BOOL Xcondition;
322 BOOL Xprev_is_word;
323
324 unsigned long int Xoriginal_ims;
325
326 #ifdef SUPPORT_UCP
327 int Xprop_type;
328 int Xprop_value;
329 int Xprop_fail_result;
330 int Xprop_category;
331 int Xprop_chartype;
332 int Xprop_script;
333 int Xoclength;
334 uschar Xocchars[8];
335 #endif
336
337 int Xctype;
338 unsigned int Xfc;
339 int Xfi;
340 int Xlength;
341 int Xmax;
342 int Xmin;
343 int Xnumber;
344 int Xoffset;
345 int Xop;
346 int Xsave_capture_last;
347 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
348 int Xstacksave[REC_STACK_SAVE_MAX];
349
350 eptrblock Xnewptrb;
351
352 /* Where to jump back to */
353
354 int Xwhere;
355
356 } heapframe;
357
358 #endif
359
360
361 /***************************************************************************
362 ***************************************************************************/
363
364
365
366 /*************************************************
367 * Match from current position *
368 *************************************************/
369
370 /* This function is called recursively in many circumstances. Whenever it
371 returns a negative (error) response, the outer incarnation must also return the
372 same response.
373
374 Performance note: It might be tempting to extract commonly used fields from the
375 md structure (e.g. utf8, end_subject) into individual variables to improve
376 performance. Tests using gcc on a SPARC disproved this; in the first case, it
377 made performance worse.
378
379 Arguments:
380 eptr pointer to current character in subject
381 ecode pointer to current position in compiled code
382 mstart pointer to the current match start position (can be modified
383 by encountering \K)
384 offset_top current top pointer
385 md pointer to "static" info for the match
386 ims current /i, /m, and /s options
387 eptrb pointer to chain of blocks containing eptr at start of
388 brackets - for testing for empty matches
389 flags can contain
390 match_condassert - this is an assertion condition
391 match_cbegroup - this is the start of an unlimited repeat
392 group that can match an empty string
393 rdepth the recursion depth
394
395 Returns: MATCH_MATCH if matched ) these values are >= 0
396 MATCH_NOMATCH if failed to match )
397 a negative PCRE_ERROR_xxx value if aborted by an error condition
398 (e.g. stopped by repeated call or recursion limit)
399 */
400
401 static int
402 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
403 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
404 int flags, unsigned int rdepth)
405 {
406 /* These variables do not need to be preserved over recursion in this function,
407 so they can be ordinary variables in all cases. Mark some of them with
408 "register" because they are used a lot in loops. */
409
410 register int rrc; /* Returns from recursive calls */
411 register int i; /* Used for loops not involving calls to RMATCH() */
412 register unsigned int c; /* Character values not kept over RMATCH() calls */
413 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
414
415 BOOL minimize, possessive; /* Quantifier options */
416
417 /* When recursion is not being used, all "local" variables that have to be
418 preserved over calls to RMATCH() are part of a "frame" which is obtained from
419 heap storage. Set up the top-level frame here; others are obtained from the
420 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
421
422 #ifdef NO_RECURSE
423 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
424 frame->Xprevframe = NULL; /* Marks the top level */
425
426 /* Copy in the original argument variables */
427
428 frame->Xeptr = eptr;
429 frame->Xecode = ecode;
430 frame->Xmstart = mstart;
431 frame->Xoffset_top = offset_top;
432 frame->Xims = ims;
433 frame->Xeptrb = eptrb;
434 frame->Xflags = flags;
435 frame->Xrdepth = rdepth;
436
437 /* This is the point that control jumps back to in order to effect "recursion" */
438
439 HEAP_RECURSE:
440
441 /* Macros make the argument variables come from the current frame */
442
443 #define eptr frame->Xeptr
444 #define ecode frame->Xecode
445 #define mstart frame->Xmstart
446 #define offset_top frame->Xoffset_top
447 #define ims frame->Xims
448 #define eptrb frame->Xeptrb
449 #define flags frame->Xflags
450 #define rdepth frame->Xrdepth
451
452 /* Ditto for the local variables */
453
454 #ifdef SUPPORT_UTF8
455 #define charptr frame->Xcharptr
456 #endif
457 #define callpat frame->Xcallpat
458 #define data frame->Xdata
459 #define next frame->Xnext
460 #define pp frame->Xpp
461 #define prev frame->Xprev
462 #define saved_eptr frame->Xsaved_eptr
463
464 #define new_recursive frame->Xnew_recursive
465
466 #define cur_is_word frame->Xcur_is_word
467 #define condition frame->Xcondition
468 #define prev_is_word frame->Xprev_is_word
469
470 #define original_ims frame->Xoriginal_ims
471
472 #ifdef SUPPORT_UCP
473 #define prop_type frame->Xprop_type
474 #define prop_value frame->Xprop_value
475 #define prop_fail_result frame->Xprop_fail_result
476 #define prop_category frame->Xprop_category
477 #define prop_chartype frame->Xprop_chartype
478 #define prop_script frame->Xprop_script
479 #define oclength frame->Xoclength
480 #define occhars frame->Xocchars
481 #endif
482
483 #define ctype frame->Xctype
484 #define fc frame->Xfc
485 #define fi frame->Xfi
486 #define length frame->Xlength
487 #define max frame->Xmax
488 #define min frame->Xmin
489 #define number frame->Xnumber
490 #define offset frame->Xoffset
491 #define op frame->Xop
492 #define save_capture_last frame->Xsave_capture_last
493 #define save_offset1 frame->Xsave_offset1
494 #define save_offset2 frame->Xsave_offset2
495 #define save_offset3 frame->Xsave_offset3
496 #define stacksave frame->Xstacksave
497
498 #define newptrb frame->Xnewptrb
499
500 /* When recursion is being used, local variables are allocated on the stack and
501 get preserved during recursion in the normal way. In this environment, fi and
502 i, and fc and c, can be the same variables. */
503
504 #else /* NO_RECURSE not defined */
505 #define fi i
506 #define fc c
507
508
509 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
510 const uschar *charptr; /* in small blocks of the code. My normal */
511 #endif /* style of coding would have declared */
512 const uschar *callpat; /* them within each of those blocks. */
513 const uschar *data; /* However, in order to accommodate the */
514 const uschar *next; /* version of this code that uses an */
515 USPTR pp; /* external "stack" implemented on the */
516 const uschar *prev; /* heap, it is easier to declare them all */
517 USPTR saved_eptr; /* here, so the declarations can be cut */
518 /* out in a block. The only declarations */
519 recursion_info new_recursive; /* within blocks below are for variables */
520 /* that do not have to be preserved over */
521 BOOL cur_is_word; /* a recursive call to RMATCH(). */
522 BOOL condition;
523 BOOL prev_is_word;
524
525 unsigned long int original_ims;
526
527 #ifdef SUPPORT_UCP
528 int prop_type;
529 int prop_value;
530 int prop_fail_result;
531 int prop_category;
532 int prop_chartype;
533 int prop_script;
534 int oclength;
535 uschar occhars[8];
536 #endif
537
538 int ctype;
539 int length;
540 int max;
541 int min;
542 int number;
543 int offset;
544 int op;
545 int save_capture_last;
546 int save_offset1, save_offset2, save_offset3;
547 int stacksave[REC_STACK_SAVE_MAX];
548
549 eptrblock newptrb;
550 #endif /* NO_RECURSE */
551
552 /* These statements are here to stop the compiler complaining about uninitialized
553 variables. */
554
555 #ifdef SUPPORT_UCP
556 prop_value = 0;
557 prop_fail_result = 0;
558 #endif
559
560
561 /* This label is used for tail recursion, which is used in a few cases even
562 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
563 used. Thanks to Ian Taylor for noticing this possibility and sending the
564 original patch. */
565
566 TAIL_RECURSE:
567
568 /* OK, now we can get on with the real code of the function. Recursive calls
569 are specified by the macro RMATCH and RRETURN is used to return. When
570 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
571 and a "return", respectively (possibly with some debugging if DEBUG is
572 defined). However, RMATCH isn't like a function call because it's quite a
573 complicated macro. It has to be used in one particular way. This shouldn't,
574 however, impact performance when true recursion is being used. */
575
576 #ifdef SUPPORT_UTF8
577 utf8 = md->utf8; /* Local copy of the flag */
578 #else
579 utf8 = FALSE;
580 #endif
581
582 /* First check that we haven't called match() too many times, or that we
583 haven't exceeded the recursive call limit. */
584
585 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
586 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
587
588 original_ims = ims; /* Save for resetting on ')' */
589
590 /* At the start of a group with an unlimited repeat that may match an empty
591 string, the match_cbegroup flag is set. When this is the case, add the current
592 subject pointer to the chain of such remembered pointers, to be checked when we
593 hit the closing ket, in order to break infinite loops that match no characters.
594 When match() is called in other circumstances, don't add to the chain. The
595 match_cbegroup flag must NOT be used with tail recursion, because the memory
596 block that is used is on the stack, so a new one may be required for each
597 match(). */
598
599 if ((flags & match_cbegroup) != 0)
600 {
601 newptrb.epb_saved_eptr = eptr;
602 newptrb.epb_prev = eptrb;
603 eptrb = &newptrb;
604 }
605
606 /* Now start processing the opcodes. */
607
608 for (;;)
609 {
610 minimize = possessive = FALSE;
611 op = *ecode;
612
613 /* For partial matching, remember if we ever hit the end of the subject after
614 matching at least one subject character. */
615
616 if (md->partial &&
617 eptr >= md->end_subject &&
618 eptr > mstart)
619 md->hitend = TRUE;
620
621 switch(op)
622 {
623 case OP_FAIL:
624 RRETURN(MATCH_NOMATCH);
625
626 case OP_PRUNE:
627 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
628 ims, eptrb, flags, RM51);
629 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
630 RRETURN(MATCH_PRUNE);
631
632 case OP_COMMIT:
633 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
634 ims, eptrb, flags, RM52);
635 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
636 RRETURN(MATCH_COMMIT);
637
638 case OP_SKIP:
639 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
640 ims, eptrb, flags, RM53);
641 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
642 md->start_match_ptr = eptr; /* Pass back current position */
643 RRETURN(MATCH_SKIP);
644
645 case OP_THEN:
646 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
647 ims, eptrb, flags, RM54);
648 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
649 RRETURN(MATCH_THEN);
650
651 /* Handle a capturing bracket. If there is space in the offset vector, save
652 the current subject position in the working slot at the top of the vector.
653 We mustn't change the current values of the data slot, because they may be
654 set from a previous iteration of this group, and be referred to by a
655 reference inside the group.
656
657 If the bracket fails to match, we need to restore this value and also the
658 values of the final offsets, in case they were set by a previous iteration
659 of the same bracket.
660
661 If there isn't enough space in the offset vector, treat this as if it were
662 a non-capturing bracket. Don't worry about setting the flag for the error
663 case here; that is handled in the code for KET. */
664
665 case OP_CBRA:
666 case OP_SCBRA:
667 number = GET2(ecode, 1+LINK_SIZE);
668 offset = number << 1;
669
670 #ifdef DEBUG
671 printf("start bracket %d\n", number);
672 printf("subject=");
673 pchars(eptr, 16, TRUE, md);
674 printf("\n");
675 #endif
676
677 if (offset < md->offset_max)
678 {
679 save_offset1 = md->offset_vector[offset];
680 save_offset2 = md->offset_vector[offset+1];
681 save_offset3 = md->offset_vector[md->offset_end - number];
682 save_capture_last = md->capture_last;
683
684 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
685 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
686
687 flags = (op == OP_SCBRA)? match_cbegroup : 0;
688 do
689 {
690 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
691 ims, eptrb, flags, RM1);
692 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
693 md->capture_last = save_capture_last;
694 ecode += GET(ecode, 1);
695 }
696 while (*ecode == OP_ALT);
697
698 DPRINTF(("bracket %d failed\n", number));
699
700 md->offset_vector[offset] = save_offset1;
701 md->offset_vector[offset+1] = save_offset2;
702 md->offset_vector[md->offset_end - number] = save_offset3;
703
704 RRETURN(MATCH_NOMATCH);
705 }
706
707 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
708 as a non-capturing bracket. */
709
710 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
711 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
712
713 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
714
715 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
716 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
717
718 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
719 final alternative within the brackets, we would return the result of a
720 recursive call to match() whatever happened. We can reduce stack usage by
721 turning this into a tail recursion, except in the case when match_cbegroup
722 is set.*/
723
724 case OP_BRA:
725 case OP_SBRA:
726 DPRINTF(("start non-capturing bracket\n"));
727 flags = (op >= OP_SBRA)? match_cbegroup : 0;
728 for (;;)
729 {
730 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
731 {
732 if (flags == 0) /* Not a possibly empty group */
733 {
734 ecode += _pcre_OP_lengths[*ecode];
735 DPRINTF(("bracket 0 tail recursion\n"));
736 goto TAIL_RECURSE;
737 }
738
739 /* Possibly empty group; can't use tail recursion. */
740
741 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
742 eptrb, flags, RM48);
743 RRETURN(rrc);
744 }
745
746 /* For non-final alternatives, continue the loop for a NOMATCH result;
747 otherwise return. */
748
749 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
750 eptrb, flags, RM2);
751 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
752 ecode += GET(ecode, 1);
753 }
754 /* Control never reaches here. */
755
756 /* Conditional group: compilation checked that there are no more than
757 two branches. If the condition is false, skipping the first branch takes us
758 past the end if there is only one branch, but that's OK because that is
759 exactly what going to the ket would do. As there is only one branch to be
760 obeyed, we can use tail recursion to avoid using another stack frame. */
761
762 case OP_COND:
763 case OP_SCOND:
764 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
765 {
766 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
767 condition = md->recursive != NULL &&
768 (offset == RREF_ANY || offset == md->recursive->group_num);
769 ecode += condition? 3 : GET(ecode, 1);
770 }
771
772 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
773 {
774 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
775 condition = offset < offset_top && md->offset_vector[offset] >= 0;
776 ecode += condition? 3 : GET(ecode, 1);
777 }
778
779 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
780 {
781 condition = FALSE;
782 ecode += GET(ecode, 1);
783 }
784
785 /* The condition is an assertion. Call match() to evaluate it - setting
786 the final argument match_condassert causes it to stop at the end of an
787 assertion. */
788
789 else
790 {
791 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
792 match_condassert, RM3);
793 if (rrc == MATCH_MATCH)
794 {
795 condition = TRUE;
796 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
797 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
798 }
799 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
800 {
801 RRETURN(rrc); /* Need braces because of following else */
802 }
803 else
804 {
805 condition = FALSE;
806 ecode += GET(ecode, 1);
807 }
808 }
809
810 /* We are now at the branch that is to be obeyed. As there is only one,
811 we can use tail recursion to avoid using another stack frame, except when
812 match_cbegroup is required for an unlimited repeat of a possibly empty
813 group. If the second alternative doesn't exist, we can just plough on. */
814
815 if (condition || *ecode == OP_ALT)
816 {
817 ecode += 1 + LINK_SIZE;
818 if (op == OP_SCOND) /* Possibly empty group */
819 {
820 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
821 RRETURN(rrc);
822 }
823 else /* Group must match something */
824 {
825 flags = 0;
826 goto TAIL_RECURSE;
827 }
828 }
829 else /* Condition false & no 2nd alternative */
830 {
831 ecode += 1 + LINK_SIZE;
832 }
833 break;
834
835
836 /* End of the pattern, either real or forced. If we are in a top-level
837 recursion, we should restore the offsets appropriately and continue from
838 after the call. */
839
840 case OP_ACCEPT:
841 case OP_END:
842 if (md->recursive != NULL && md->recursive->group_num == 0)
843 {
844 recursion_info *rec = md->recursive;
845 DPRINTF(("End of pattern in a (?0) recursion\n"));
846 md->recursive = rec->prevrec;
847 memmove(md->offset_vector, rec->offset_save,
848 rec->saved_max * sizeof(int));
849 mstart = rec->save_start;
850 ims = original_ims;
851 ecode = rec->after_call;
852 break;
853 }
854
855 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
856 string - backtracking will then try other alternatives, if any. */
857
858 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
859 md->end_match_ptr = eptr; /* Record where we ended */
860 md->end_offset_top = offset_top; /* and how many extracts were taken */
861 md->start_match_ptr = mstart; /* and the start (\K can modify) */
862 RRETURN(MATCH_MATCH);
863
864 /* Change option settings */
865
866 case OP_OPT:
867 ims = ecode[1];
868 ecode += 2;
869 DPRINTF(("ims set to %02lx\n", ims));
870 break;
871
872 /* Assertion brackets. Check the alternative branches in turn - the
873 matching won't pass the KET for an assertion. If any one branch matches,
874 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
875 start of each branch to move the current point backwards, so the code at
876 this level is identical to the lookahead case. */
877
878 case OP_ASSERT:
879 case OP_ASSERTBACK:
880 do
881 {
882 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
883 RM4);
884 if (rrc == MATCH_MATCH) break;
885 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
886 ecode += GET(ecode, 1);
887 }
888 while (*ecode == OP_ALT);
889 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
890
891 /* If checking an assertion for a condition, return MATCH_MATCH. */
892
893 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
894
895 /* Continue from after the assertion, updating the offsets high water
896 mark, since extracts may have been taken during the assertion. */
897
898 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
899 ecode += 1 + LINK_SIZE;
900 offset_top = md->end_offset_top;
901 continue;
902
903 /* Negative assertion: all branches must fail to match */
904
905 case OP_ASSERT_NOT:
906 case OP_ASSERTBACK_NOT:
907 do
908 {
909 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
910 RM5);
911 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
912 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
913 ecode += GET(ecode,1);
914 }
915 while (*ecode == OP_ALT);
916
917 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
918
919 ecode += 1 + LINK_SIZE;
920 continue;
921
922 /* Move the subject pointer back. This occurs only at the start of
923 each branch of a lookbehind assertion. If we are too close to the start to
924 move back, this match function fails. When working with UTF-8 we move
925 back a number of characters, not bytes. */
926
927 case OP_REVERSE:
928 #ifdef SUPPORT_UTF8
929 if (utf8)
930 {
931 i = GET(ecode, 1);
932 while (i-- > 0)
933 {
934 eptr--;
935 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
936 BACKCHAR(eptr);
937 }
938 }
939 else
940 #endif
941
942 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
943
944 {
945 eptr -= GET(ecode, 1);
946 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
947 }
948
949 /* Skip to next op code */
950
951 ecode += 1 + LINK_SIZE;
952 break;
953
954 /* The callout item calls an external function, if one is provided, passing
955 details of the match so far. This is mainly for debugging, though the
956 function is able to force a failure. */
957
958 case OP_CALLOUT:
959 if (pcre_callout != NULL)
960 {
961 pcre_callout_block cb;
962 cb.version = 1; /* Version 1 of the callout block */
963 cb.callout_number = ecode[1];
964 cb.offset_vector = md->offset_vector;
965 cb.subject = (PCRE_SPTR)md->start_subject;
966 cb.subject_length = md->end_subject - md->start_subject;
967 cb.start_match = mstart - md->start_subject;
968 cb.current_position = eptr - md->start_subject;
969 cb.pattern_position = GET(ecode, 2);
970 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
971 cb.capture_top = offset_top/2;
972 cb.capture_last = md->capture_last;
973 cb.callout_data = md->callout_data;
974 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
975 if (rrc < 0) RRETURN(rrc);
976 }
977 ecode += 2 + 2*LINK_SIZE;
978 break;
979
980 /* Recursion either matches the current regex, or some subexpression. The
981 offset data is the offset to the starting bracket from the start of the
982 whole pattern. (This is so that it works from duplicated subpatterns.)
983
984 If there are any capturing brackets started but not finished, we have to
985 save their starting points and reinstate them after the recursion. However,
986 we don't know how many such there are (offset_top records the completed
987 total) so we just have to save all the potential data. There may be up to
988 65535 such values, which is too large to put on the stack, but using malloc
989 for small numbers seems expensive. As a compromise, the stack is used when
990 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
991 is used. A problem is what to do if the malloc fails ... there is no way of
992 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
993 values on the stack, and accept that the rest may be wrong.
994
995 There are also other values that have to be saved. We use a chained
996 sequence of blocks that actually live on the stack. Thanks to Robin Houston
997 for the original version of this logic. */
998
999 case OP_RECURSE:
1000 {
1001 callpat = md->start_code + GET(ecode, 1);
1002 new_recursive.group_num = (callpat == md->start_code)? 0 :
1003 GET2(callpat, 1 + LINK_SIZE);
1004
1005 /* Add to "recursing stack" */
1006
1007 new_recursive.prevrec = md->recursive;
1008 md->recursive = &new_recursive;
1009
1010 /* Find where to continue from afterwards */
1011
1012 ecode += 1 + LINK_SIZE;
1013 new_recursive.after_call = ecode;
1014
1015 /* Now save the offset data. */
1016
1017 new_recursive.saved_max = md->offset_end;
1018 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1019 new_recursive.offset_save = stacksave;
1020 else
1021 {
1022 new_recursive.offset_save =
1023 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1024 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1025 }
1026
1027 memcpy(new_recursive.offset_save, md->offset_vector,
1028 new_recursive.saved_max * sizeof(int));
1029 new_recursive.save_start = mstart;
1030 mstart = eptr;
1031
1032 /* OK, now we can do the recursion. For each top-level alternative we
1033 restore the offset and recursion data. */
1034
1035 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1036 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1037 do
1038 {
1039 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1040 md, ims, eptrb, flags, RM6);
1041 if (rrc == MATCH_MATCH)
1042 {
1043 DPRINTF(("Recursion matched\n"));
1044 md->recursive = new_recursive.prevrec;
1045 if (new_recursive.offset_save != stacksave)
1046 (pcre_free)(new_recursive.offset_save);
1047 RRETURN(MATCH_MATCH);
1048 }
1049 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1050 {
1051 DPRINTF(("Recursion gave error %d\n", rrc));
1052 RRETURN(rrc);
1053 }
1054
1055 md->recursive = &new_recursive;
1056 memcpy(md->offset_vector, new_recursive.offset_save,
1057 new_recursive.saved_max * sizeof(int));
1058 callpat += GET(callpat, 1);
1059 }
1060 while (*callpat == OP_ALT);
1061
1062 DPRINTF(("Recursion didn't match\n"));
1063 md->recursive = new_recursive.prevrec;
1064 if (new_recursive.offset_save != stacksave)
1065 (pcre_free)(new_recursive.offset_save);
1066 RRETURN(MATCH_NOMATCH);
1067 }
1068 /* Control never reaches here */
1069
1070 /* "Once" brackets are like assertion brackets except that after a match,
1071 the point in the subject string is not moved back. Thus there can never be
1072 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1073 Check the alternative branches in turn - the matching won't pass the KET
1074 for this kind of subpattern. If any one branch matches, we carry on as at
1075 the end of a normal bracket, leaving the subject pointer. */
1076
1077 case OP_ONCE:
1078 prev = ecode;
1079 saved_eptr = eptr;
1080
1081 do
1082 {
1083 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1084 if (rrc == MATCH_MATCH) break;
1085 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1086 ecode += GET(ecode,1);
1087 }
1088 while (*ecode == OP_ALT);
1089
1090 /* If hit the end of the group (which could be repeated), fail */
1091
1092 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1093
1094 /* Continue as from after the assertion, updating the offsets high water
1095 mark, since extracts may have been taken. */
1096
1097 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1098
1099 offset_top = md->end_offset_top;
1100 eptr = md->end_match_ptr;
1101
1102 /* For a non-repeating ket, just continue at this level. This also
1103 happens for a repeating ket if no characters were matched in the group.
1104 This is the forcible breaking of infinite loops as implemented in Perl
1105 5.005. If there is an options reset, it will get obeyed in the normal
1106 course of events. */
1107
1108 if (*ecode == OP_KET || eptr == saved_eptr)
1109 {
1110 ecode += 1+LINK_SIZE;
1111 break;
1112 }
1113
1114 /* The repeating kets try the rest of the pattern or restart from the
1115 preceding bracket, in the appropriate order. The second "call" of match()
1116 uses tail recursion, to avoid using another stack frame. We need to reset
1117 any options that changed within the bracket before re-running it, so
1118 check the next opcode. */
1119
1120 if (ecode[1+LINK_SIZE] == OP_OPT)
1121 {
1122 ims = (ims & ~PCRE_IMS) | ecode[4];
1123 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1124 }
1125
1126 if (*ecode == OP_KETRMIN)
1127 {
1128 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1129 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1130 ecode = prev;
1131 flags = 0;
1132 goto TAIL_RECURSE;
1133 }
1134 else /* OP_KETRMAX */
1135 {
1136 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1137 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1138 ecode += 1 + LINK_SIZE;
1139 flags = 0;
1140 goto TAIL_RECURSE;
1141 }
1142 /* Control never gets here */
1143
1144 /* An alternation is the end of a branch; scan along to find the end of the
1145 bracketed group and go to there. */
1146
1147 case OP_ALT:
1148 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1149 break;
1150
1151 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1152 indicating that it may occur zero times. It may repeat infinitely, or not
1153 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1154 with fixed upper repeat limits are compiled as a number of copies, with the
1155 optional ones preceded by BRAZERO or BRAMINZERO. */
1156
1157 case OP_BRAZERO:
1158 {
1159 next = ecode+1;
1160 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1161 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1162 do next += GET(next,1); while (*next == OP_ALT);
1163 ecode = next + 1 + LINK_SIZE;
1164 }
1165 break;
1166
1167 case OP_BRAMINZERO:
1168 {
1169 next = ecode+1;
1170 do next += GET(next, 1); while (*next == OP_ALT);
1171 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1172 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1173 ecode++;
1174 }
1175 break;
1176
1177 case OP_SKIPZERO:
1178 {
1179 next = ecode+1;
1180 do next += GET(next,1); while (*next == OP_ALT);
1181 ecode = next + 1 + LINK_SIZE;
1182 }
1183 break;
1184
1185 /* End of a group, repeated or non-repeating. */
1186
1187 case OP_KET:
1188 case OP_KETRMIN:
1189 case OP_KETRMAX:
1190 prev = ecode - GET(ecode, 1);
1191
1192 /* If this was a group that remembered the subject start, in order to break
1193 infinite repeats of empty string matches, retrieve the subject start from
1194 the chain. Otherwise, set it NULL. */
1195
1196 if (*prev >= OP_SBRA)
1197 {
1198 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1199 eptrb = eptrb->epb_prev; /* Backup to previous group */
1200 }
1201 else saved_eptr = NULL;
1202
1203 /* If we are at the end of an assertion group, stop matching and return
1204 MATCH_MATCH, but record the current high water mark for use by positive
1205 assertions. Do this also for the "once" (atomic) groups. */
1206
1207 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1208 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1209 *prev == OP_ONCE)
1210 {
1211 md->end_match_ptr = eptr; /* For ONCE */
1212 md->end_offset_top = offset_top;
1213 RRETURN(MATCH_MATCH);
1214 }
1215
1216 /* For capturing groups we have to check the group number back at the start
1217 and if necessary complete handling an extraction by setting the offsets and
1218 bumping the high water mark. Note that whole-pattern recursion is coded as
1219 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1220 when the OP_END is reached. Other recursion is handled here. */
1221
1222 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1223 {
1224 number = GET2(prev, 1+LINK_SIZE);
1225 offset = number << 1;
1226
1227 #ifdef DEBUG
1228 printf("end bracket %d", number);
1229 printf("\n");
1230 #endif
1231
1232 md->capture_last = number;
1233 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1234 {
1235 md->offset_vector[offset] =
1236 md->offset_vector[md->offset_end - number];
1237 md->offset_vector[offset+1] = eptr - md->start_subject;
1238 if (offset_top <= offset) offset_top = offset + 2;
1239 }
1240
1241 /* Handle a recursively called group. Restore the offsets
1242 appropriately and continue from after the call. */
1243
1244 if (md->recursive != NULL && md->recursive->group_num == number)
1245 {
1246 recursion_info *rec = md->recursive;
1247 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1248 md->recursive = rec->prevrec;
1249 mstart = rec->save_start;
1250 memcpy(md->offset_vector, rec->offset_save,
1251 rec->saved_max * sizeof(int));
1252 ecode = rec->after_call;
1253 ims = original_ims;
1254 break;
1255 }
1256 }
1257
1258 /* For both capturing and non-capturing groups, reset the value of the ims
1259 flags, in case they got changed during the group. */
1260
1261 ims = original_ims;
1262 DPRINTF(("ims reset to %02lx\n", ims));
1263
1264 /* For a non-repeating ket, just continue at this level. This also
1265 happens for a repeating ket if no characters were matched in the group.
1266 This is the forcible breaking of infinite loops as implemented in Perl
1267 5.005. If there is an options reset, it will get obeyed in the normal
1268 course of events. */
1269
1270 if (*ecode == OP_KET || eptr == saved_eptr)
1271 {
1272 ecode += 1 + LINK_SIZE;
1273 break;
1274 }
1275
1276 /* The repeating kets try the rest of the pattern or restart from the
1277 preceding bracket, in the appropriate order. In the second case, we can use
1278 tail recursion to avoid using another stack frame, unless we have an
1279 unlimited repeat of a group that can match an empty string. */
1280
1281 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1282
1283 if (*ecode == OP_KETRMIN)
1284 {
1285 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1286 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1287 if (flags != 0) /* Could match an empty string */
1288 {
1289 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1290 RRETURN(rrc);
1291 }
1292 ecode = prev;
1293 goto TAIL_RECURSE;
1294 }
1295 else /* OP_KETRMAX */
1296 {
1297 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1298 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1299 ecode += 1 + LINK_SIZE;
1300 flags = 0;
1301 goto TAIL_RECURSE;
1302 }
1303 /* Control never gets here */
1304
1305 /* Start of subject unless notbol, or after internal newline if multiline */
1306
1307 case OP_CIRC:
1308 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1309 if ((ims & PCRE_MULTILINE) != 0)
1310 {
1311 if (eptr != md->start_subject &&
1312 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1313 RRETURN(MATCH_NOMATCH);
1314 ecode++;
1315 break;
1316 }
1317 /* ... else fall through */
1318
1319 /* Start of subject assertion */
1320
1321 case OP_SOD:
1322 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1323 ecode++;
1324 break;
1325
1326 /* Start of match assertion */
1327
1328 case OP_SOM:
1329 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1330 ecode++;
1331 break;
1332
1333 /* Reset the start of match point */
1334
1335 case OP_SET_SOM:
1336 mstart = eptr;
1337 ecode++;
1338 break;
1339
1340 /* Assert before internal newline if multiline, or before a terminating
1341 newline unless endonly is set, else end of subject unless noteol is set. */
1342
1343 case OP_DOLL:
1344 if ((ims & PCRE_MULTILINE) != 0)
1345 {
1346 if (eptr < md->end_subject)
1347 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1348 else
1349 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1350 ecode++;
1351 break;
1352 }
1353 else
1354 {
1355 if (md->noteol) RRETURN(MATCH_NOMATCH);
1356 if (!md->endonly)
1357 {
1358 if (eptr != md->end_subject &&
1359 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1360 RRETURN(MATCH_NOMATCH);
1361 ecode++;
1362 break;
1363 }
1364 }
1365 /* ... else fall through for endonly */
1366
1367 /* End of subject assertion (\z) */
1368
1369 case OP_EOD:
1370 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1371 ecode++;
1372 break;
1373
1374 /* End of subject or ending \n assertion (\Z) */
1375
1376 case OP_EODN:
1377 if (eptr != md->end_subject &&
1378 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1379 RRETURN(MATCH_NOMATCH);
1380 ecode++;
1381 break;
1382
1383 /* Word boundary assertions */
1384
1385 case OP_NOT_WORD_BOUNDARY:
1386 case OP_WORD_BOUNDARY:
1387 {
1388
1389 /* Find out if the previous and current characters are "word" characters.
1390 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1391 be "non-word" characters. */
1392
1393 #ifdef SUPPORT_UTF8
1394 if (utf8)
1395 {
1396 if (eptr == md->start_subject) prev_is_word = FALSE; else
1397 {
1398 const uschar *lastptr = eptr - 1;
1399 while((*lastptr & 0xc0) == 0x80) lastptr--;
1400 GETCHAR(c, lastptr);
1401 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1402 }
1403 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1404 {
1405 GETCHAR(c, eptr);
1406 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1407 }
1408 }
1409 else
1410 #endif
1411
1412 /* More streamlined when not in UTF-8 mode */
1413
1414 {
1415 prev_is_word = (eptr != md->start_subject) &&
1416 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1417 cur_is_word = (eptr < md->end_subject) &&
1418 ((md->ctypes[*eptr] & ctype_word) != 0);
1419 }
1420
1421 /* Now see if the situation is what we want */
1422
1423 if ((*ecode++ == OP_WORD_BOUNDARY)?
1424 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1425 RRETURN(MATCH_NOMATCH);
1426 }
1427 break;
1428
1429 /* Match a single character type; inline for speed */
1430
1431 case OP_ANY:
1432 if ((ims & PCRE_DOTALL) == 0)
1433 {
1434 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1435 }
1436 /* Fall through */
1437
1438 case OP_ALLANY:
1439 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1440 if (utf8)
1441 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1442 ecode++;
1443 break;
1444
1445 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1446 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1447
1448 case OP_ANYBYTE:
1449 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1450 ecode++;
1451 break;
1452
1453 case OP_NOT_DIGIT:
1454 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1455 GETCHARINCTEST(c, eptr);
1456 if (
1457 #ifdef SUPPORT_UTF8
1458 c < 256 &&
1459 #endif
1460 (md->ctypes[c] & ctype_digit) != 0
1461 )
1462 RRETURN(MATCH_NOMATCH);
1463 ecode++;
1464 break;
1465
1466 case OP_DIGIT:
1467 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1468 GETCHARINCTEST(c, eptr);
1469 if (
1470 #ifdef SUPPORT_UTF8
1471 c >= 256 ||
1472 #endif
1473 (md->ctypes[c] & ctype_digit) == 0
1474 )
1475 RRETURN(MATCH_NOMATCH);
1476 ecode++;
1477 break;
1478
1479 case OP_NOT_WHITESPACE:
1480 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1481 GETCHARINCTEST(c, eptr);
1482 if (
1483 #ifdef SUPPORT_UTF8
1484 c < 256 &&
1485 #endif
1486 (md->ctypes[c] & ctype_space) != 0
1487 )
1488 RRETURN(MATCH_NOMATCH);
1489 ecode++;
1490 break;
1491
1492 case OP_WHITESPACE:
1493 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1494 GETCHARINCTEST(c, eptr);
1495 if (
1496 #ifdef SUPPORT_UTF8
1497 c >= 256 ||
1498 #endif
1499 (md->ctypes[c] & ctype_space) == 0
1500 )
1501 RRETURN(MATCH_NOMATCH);
1502 ecode++;
1503 break;
1504
1505 case OP_NOT_WORDCHAR:
1506 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1507 GETCHARINCTEST(c, eptr);
1508 if (
1509 #ifdef SUPPORT_UTF8
1510 c < 256 &&
1511 #endif
1512 (md->ctypes[c] & ctype_word) != 0
1513 )
1514 RRETURN(MATCH_NOMATCH);
1515 ecode++;
1516 break;
1517
1518 case OP_WORDCHAR:
1519 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1520 GETCHARINCTEST(c, eptr);
1521 if (
1522 #ifdef SUPPORT_UTF8
1523 c >= 256 ||
1524 #endif
1525 (md->ctypes[c] & ctype_word) == 0
1526 )
1527 RRETURN(MATCH_NOMATCH);
1528 ecode++;
1529 break;
1530
/* Match one newline sequence at the current position. One character is
consumed and must be CR, LF, VT, FF, NEL, LS or PS; a CR that is directly
followed by LF consumes both. When md->bsr_anycrlf is set, only CR, LF and
CRLF are accepted. Any other character, or end of subject, fails. */
1531 case OP_ANYNL:
1532 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1533 GETCHARINCTEST(c, eptr);
1534 switch(c)
1535 {
1536 default: RRETURN(MATCH_NOMATCH);
1537 case 0x000d: /* CR: also swallow an immediately following LF */
1538 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1539 break;
1540
1541 case 0x000a: /* LF */
1542 break;
1543
1544 case 0x000b: /* VT */
1545 case 0x000c: /* FF */
1546 case 0x0085: /* NEL */
1547 case 0x2028: /* LINE SEPARATOR */
1548 case 0x2029: /* PARAGRAPH SEPARATOR */
1549 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); /* Not accepted in this mode */
1550 break;
1551 }
1552 ecode++;
1553 break;
1554
1555 case OP_NOT_HSPACE:
1556 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1557 GETCHARINCTEST(c, eptr);
1558 switch(c)
1559 {
1560 default: break;
1561 case 0x09: /* HT */
1562 case 0x20: /* SPACE */
1563 case 0xa0: /* NBSP */
1564 case 0x1680: /* OGHAM SPACE MARK */
1565 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1566 case 0x2000: /* EN QUAD */
1567 case 0x2001: /* EM QUAD */
1568 case 0x2002: /* EN SPACE */
1569 case 0x2003: /* EM SPACE */
1570 case 0x2004: /* THREE-PER-EM SPACE */
1571 case 0x2005: /* FOUR-PER-EM SPACE */
1572 case 0x2006: /* SIX-PER-EM SPACE */
1573 case 0x2007: /* FIGURE SPACE */
1574 case 0x2008: /* PUNCTUATION SPACE */
1575 case 0x2009: /* THIN SPACE */
1576 case 0x200A: /* HAIR SPACE */
1577 case 0x202f: /* NARROW NO-BREAK SPACE */
1578 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1579 case 0x3000: /* IDEOGRAPHIC SPACE */
1580 RRETURN(MATCH_NOMATCH);
1581 }
1582 ecode++;
1583 break;
1584
1585 case OP_HSPACE:
1586 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1587 GETCHARINCTEST(c, eptr);
1588 switch(c)
1589 {
1590 default: RRETURN(MATCH_NOMATCH);
1591 case 0x09: /* HT */
1592 case 0x20: /* SPACE */
1593 case 0xa0: /* NBSP */
1594 case 0x1680: /* OGHAM SPACE MARK */
1595 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1596 case 0x2000: /* EN QUAD */
1597 case 0x2001: /* EM QUAD */
1598 case 0x2002: /* EN SPACE */
1599 case 0x2003: /* EM SPACE */
1600 case 0x2004: /* THREE-PER-EM SPACE */
1601 case 0x2005: /* FOUR-PER-EM SPACE */
1602 case 0x2006: /* SIX-PER-EM SPACE */
1603 case 0x2007: /* FIGURE SPACE */
1604 case 0x2008: /* PUNCTUATION SPACE */
1605 case 0x2009: /* THIN SPACE */
1606 case 0x200A: /* HAIR SPACE */
1607 case 0x202f: /* NARROW NO-BREAK SPACE */
1608 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1609 case 0x3000: /* IDEOGRAPHIC SPACE */
1610 break;
1611 }
1612 ecode++;
1613 break;
1614
1615 case OP_NOT_VSPACE:
1616 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1617 GETCHARINCTEST(c, eptr);
1618 switch(c)
1619 {
1620 default: break;
1621 case 0x0a: /* LF */
1622 case 0x0b: /* VT */
1623 case 0x0c: /* FF */
1624 case 0x0d: /* CR */
1625 case 0x85: /* NEL */
1626 case 0x2028: /* LINE SEPARATOR */
1627 case 0x2029: /* PARAGRAPH SEPARATOR */
1628 RRETURN(MATCH_NOMATCH);
1629 }
1630 ecode++;
1631 break;
1632
1633 case OP_VSPACE:
1634 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1635 GETCHARINCTEST(c, eptr);
1636 switch(c)
1637 {
1638 default: RRETURN(MATCH_NOMATCH);
1639 case 0x0a: /* LF */
1640 case 0x0b: /* VT */
1641 case 0x0c: /* FF */
1642 case 0x0d: /* CR */
1643 case 0x85: /* NEL */
1644 case 0x2028: /* LINE SEPARATOR */
1645 case 0x2029: /* PARAGRAPH SEPARATOR */
1646 break;
1647 }
1648 ecode++;
1649 break;
1650
1651 #ifdef SUPPORT_UCP
1652 /* Check the next character by Unicode property. We will get here only
1653 if the support is in the binary; otherwise a compile-time error occurs. */
1654
1655 case OP_PROP:
1656 case OP_NOTPROP:
1657 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1658 GETCHARINCTEST(c, eptr);
1659 {
1660 int chartype, script;
1661 int category = _pcre_ucp_findprop(c, &chartype, &script);
1662
1663 switch(ecode[1])
1664 {
1665 case PT_ANY:
1666 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1667 break;
1668
1669 case PT_LAMP:
1670 if ((chartype == ucp_Lu ||
1671 chartype == ucp_Ll ||
1672 chartype == ucp_Lt) == (op == OP_NOTPROP))
1673 RRETURN(MATCH_NOMATCH);
1674 break;
1675
1676 case PT_GC:
1677 if ((ecode[2] != category) == (op == OP_PROP))
1678 RRETURN(MATCH_NOMATCH);
1679 break;
1680
1681 case PT_PC:
1682 if ((ecode[2] != chartype) == (op == OP_PROP))
1683 RRETURN(MATCH_NOMATCH);
1684 break;
1685
1686 case PT_SC:
1687 if ((ecode[2] != script) == (op == OP_PROP))
1688 RRETURN(MATCH_NOMATCH);
1689 break;
1690
1691 default:
1692 RRETURN(PCRE_ERROR_INTERNAL);
1693 }
1694
1695 ecode += 3;
1696 }
1697 break;
1698
1699 /* Match an extended Unicode sequence. We will get here only if the support
1700 is in the binary; otherwise a compile-time error occurs. */
1701
1702 case OP_EXTUNI:
1703 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1704 GETCHARINCTEST(c, eptr);
1705 {
1706 int chartype, script;
1707 int category = _pcre_ucp_findprop(c, &chartype, &script);
1708 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1709 while (eptr < md->end_subject)
1710 {
1711 int len = 1;
1712 if (!utf8) c = *eptr; else
1713 {
1714 GETCHARLEN(c, eptr, len);
1715 }
1716 category = _pcre_ucp_findprop(c, &chartype, &script);
1717 if (category != ucp_M) break;
1718 eptr += len;
1719 }
1720 }
1721 ecode++;
1722 break;
1723 #endif
1724
1725
1726 /* Match a back reference, possibly repeatedly. Look past the end of the
1727 item to see if there is repeat information following. The code is similar
1728 to that for character classes, but repeated for efficiency. Then obey
1729 similar code to character type repeats - written out again for speed.
1730 However, if the referenced string is the empty string, always treat
1731 it as matched, any number of times (otherwise there could be infinite
1732 loops). */
1733
1734 case OP_REF:
1735 {
1736 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1737 ecode += 3;
1738
1739 /* If the reference is unset, there are two possibilities:
1740
1741 (a) In the default, Perl-compatible state, set the length to be longer
1742 than the amount of subject left; this ensures that every attempt at a
1743 match fails. We can't just fail here, because of the possibility of
1744 quantifiers with zero minima.
1745
1746 (b) If the JavaScript compatibility flag is set, set the length to zero
1747 so that the back reference matches an empty string.
1748
1749 Otherwise, set the length to the length of what was matched by the
1750 referenced subpattern. */
1751
1752 if (offset >= offset_top || md->offset_vector[offset] < 0)
1753 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1754 else
1755 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1756
1757 /* Set up for repetition, or handle the non-repeated case */
1758
1759 switch (*ecode)
1760 {
1761 case OP_CRSTAR:
1762 case OP_CRMINSTAR:
1763 case OP_CRPLUS:
1764 case OP_CRMINPLUS:
1765 case OP_CRQUERY:
1766 case OP_CRMINQUERY:
1767 c = *ecode++ - OP_CRSTAR;
1768 minimize = (c & 1) != 0;
1769 min = rep_min[c]; /* Pick up values from tables; */
1770 max = rep_max[c]; /* zero for max => infinity */
1771 if (max == 0) max = INT_MAX;
1772 break;
1773
1774 case OP_CRRANGE:
1775 case OP_CRMINRANGE:
1776 minimize = (*ecode == OP_CRMINRANGE);
1777 min = GET2(ecode, 1);
1778 max = GET2(ecode, 3);
1779 if (max == 0) max = INT_MAX;
1780 ecode += 5;
1781 break;
1782
1783 default: /* No repeat follows */
1784 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1785 eptr += length;
1786 continue; /* With the main loop */
1787 }
1788
1789 /* If the length of the reference is zero, just continue with the
1790 main loop. */
1791
1792 if (length == 0) continue;
1793
1794 /* First, ensure the minimum number of matches are present. We get back
1795 the length of the reference string explicitly rather than passing the
1796 address of eptr, so that eptr can be a register variable. */
1797
1798 for (i = 1; i <= min; i++)
1799 {
1800 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1801 eptr += length;
1802 }
1803
1804 /* If min = max, continue at the same level without recursion.
1805 They are not both allowed to be zero. */
1806
1807 if (min == max) continue;
1808
1809 /* If minimizing, keep trying and advancing the pointer */
1810
1811 if (minimize)
1812 {
1813 for (fi = min;; fi++)
1814 {
1815 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1816 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1817 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1818 RRETURN(MATCH_NOMATCH);
1819 eptr += length;
1820 }
1821 /* Control never gets here */
1822 }
1823
1824 /* If maximizing, find the longest string and work backwards */
1825
1826 else
1827 {
1828 pp = eptr;
1829 for (i = min; i < max; i++)
1830 {
1831 if (!match_ref(offset, eptr, length, md, ims)) break;
1832 eptr += length;
1833 }
1834 while (eptr >= pp)
1835 {
1836 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1837 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1838 eptr -= length;
1839 }
1840 RRETURN(MATCH_NOMATCH);
1841 }
1842 }
1843 /* Control never gets here */
1844
1845
1846
1847 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1848 used when all the characters in the class have values in the range 0-255,
1849 and either the matching is caseful, or the characters are in the range
1850 0-127 when UTF-8 processing is enabled. The only difference between
1851 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1852 encountered.
1853
1854 First, look past the end of the item to see if there is repeat information
1855 following. Then obey similar code to character type repeats - written out
1856 again for speed. */
1857
1858 case OP_NCLASS:
1859 case OP_CLASS:
1860 {
1861 data = ecode + 1; /* Save for matching */
1862 ecode += 33; /* Advance past the item */
1863
1864 switch (*ecode)
1865 {
1866 case OP_CRSTAR:
1867 case OP_CRMINSTAR:
1868 case OP_CRPLUS:
1869 case OP_CRMINPLUS:
1870 case OP_CRQUERY:
1871 case OP_CRMINQUERY:
1872 c = *ecode++ - OP_CRSTAR;
1873 minimize = (c & 1) != 0;
1874 min = rep_min[c]; /* Pick up values from tables; */
1875 max = rep_max[c]; /* zero for max => infinity */
1876 if (max == 0) max = INT_MAX;
1877 break;
1878
1879 case OP_CRRANGE:
1880 case OP_CRMINRANGE:
1881 minimize = (*ecode == OP_CRMINRANGE);
1882 min = GET2(ecode, 1);
1883 max = GET2(ecode, 3);
1884 if (max == 0) max = INT_MAX;
1885 ecode += 5;
1886 break;
1887
1888 default: /* No repeat follows */
1889 min = max = 1;
1890 break;
1891 }
1892
1893 /* First, ensure the minimum number of matches are present. */
1894
1895 #ifdef SUPPORT_UTF8
1896 /* UTF-8 mode */
1897 if (utf8)
1898 {
1899 for (i = 1; i <= min; i++)
1900 {
1901 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1902 GETCHARINC(c, eptr);
1903 if (c > 255)
1904 {
1905 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1906 }
1907 else
1908 {
1909 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1910 }
1911 }
1912 }
1913 else
1914 #endif
1915 /* Not UTF-8 mode */
1916 {
1917 for (i = 1; i <= min; i++)
1918 {
1919 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1920 c = *eptr++;
1921 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1922 }
1923 }
1924
1925 /* If max == min we can continue with the main loop without the
1926 need to recurse. */
1927
1928 if (min == max) continue;
1929
1930 /* If minimizing, keep testing the rest of the expression and advancing
1931 the pointer while it matches the class. */
1932
1933 if (minimize)
1934 {
1935 #ifdef SUPPORT_UTF8
1936 /* UTF-8 mode */
1937 if (utf8)
1938 {
1939 for (fi = min;; fi++)
1940 {
1941 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1942 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1943 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1944 GETCHARINC(c, eptr);
1945 if (c > 255)
1946 {
1947 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1948 }
1949 else
1950 {
1951 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1952 }
1953 }
1954 }
1955 else
1956 #endif
1957 /* Not UTF-8 mode */
1958 {
1959 for (fi = min;; fi++)
1960 {
1961 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1962 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1963 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1964 c = *eptr++;
1965 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1966 }
1967 }
1968 /* Control never gets here */
1969 }
1970
1971 /* If maximizing, find the longest possible run, then work backwards. */
1972
1973 else
1974 {
1975 pp = eptr;
1976
1977 #ifdef SUPPORT_UTF8
1978 /* UTF-8 mode */
1979 if (utf8)
1980 {
1981 for (i = min; i < max; i++)
1982 {
1983 int len = 1;
1984 if (eptr >= md->end_subject) break;
1985 GETCHARLEN(c, eptr, len);
1986 if (c > 255)
1987 {
1988 if (op == OP_CLASS) break;
1989 }
1990 else
1991 {
1992 if ((data[c/8] & (1 << (c&7))) == 0) break;
1993 }
1994 eptr += len;
1995 }
1996 for (;;)
1997 {
1998 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1999 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2000 if (eptr-- == pp) break; /* Stop if tried at original pos */
2001 BACKCHAR(eptr);
2002 }
2003 }
2004 else
2005 #endif
2006 /* Not UTF-8 mode */
2007 {
2008 for (i = min; i < max; i++)
2009 {
2010 if (eptr >= md->end_subject) break;
2011 c = *eptr;
2012 if ((data[c/8] & (1 << (c&7))) == 0) break;
2013 eptr++;
2014 }
2015 while (eptr >= pp)
2016 {
2017 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2018 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2019 eptr--;
2020 }
2021 }
2022
2023 RRETURN(MATCH_NOMATCH);
2024 }
2025 }
2026 /* Control never gets here */
2027
2028
2029 /* Match an extended character class. This opcode is encountered only
2030 in UTF-8 mode, because that's the only time it is compiled. */
2031
2032 #ifdef SUPPORT_UTF8
2033 case OP_XCLASS:
2034 {
2035 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2036 ecode += GET(ecode, 1); /* Advance past the item */
2037
2038 switch (*ecode)
2039 {
2040 case OP_CRSTAR:
2041 case OP_CRMINSTAR:
2042 case OP_CRPLUS:
2043 case OP_CRMINPLUS:
2044 case OP_CRQUERY:
2045 case OP_CRMINQUERY:
2046 c = *ecode++ - OP_CRSTAR;
2047 minimize = (c & 1) != 0;
2048 min = rep_min[c]; /* Pick up values from tables; */
2049 max = rep_max[c]; /* zero for max => infinity */
2050 if (max == 0) max = INT_MAX;
2051 break;
2052
2053 case OP_CRRANGE:
2054 case OP_CRMINRANGE:
2055 minimize = (*ecode == OP_CRMINRANGE);
2056 min = GET2(ecode, 1);
2057 max = GET2(ecode, 3);
2058 if (max == 0) max = INT_MAX;
2059 ecode += 5;
2060 break;
2061
2062 default: /* No repeat follows */
2063 min = max = 1;
2064 break;
2065 }
2066
2067 /* First, ensure the minimum number of matches are present. */
2068
2069 for (i = 1; i <= min; i++)
2070 {
2071 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2072 GETCHARINC(c, eptr);
2073 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2074 }
2075
2076 /* If max == min we can continue with the main loop without the
2077 need to recurse. */
2078
2079 if (min == max) continue;
2080
2081 /* If minimizing, keep testing the rest of the expression and advancing
2082 the pointer while it matches the class. */
2083
2084 if (minimize)
2085 {
2086 for (fi = min;; fi++)
2087 {
2088 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2089 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2090 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2091 GETCHARINC(c, eptr);
2092 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2093 }
2094 /* Control never gets here */
2095 }
2096
2097 /* If maximizing, find the longest possible run, then work backwards. */
2098
2099 else
2100 {
2101 pp = eptr;
2102 for (i = min; i < max; i++)
2103 {
2104 int len = 1;
2105 if (eptr >= md->end_subject) break;
2106 GETCHARLEN(c, eptr, len);
2107 if (!_pcre_xclass(c, data)) break;
2108 eptr += len;
2109 }
2110 for(;;)
2111 {
2112 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2113 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2114 if (eptr-- == pp) break; /* Stop if tried at original pos */
2115 if (utf8) BACKCHAR(eptr);
2116 }
2117 RRETURN(MATCH_NOMATCH);
2118 }
2119
2120 /* Control never gets here */
2121 }
2122 #endif /* End of XCLASS */
2123
2124 /* Match a single character, casefully */
2125
2126 case OP_CHAR:
2127 #ifdef SUPPORT_UTF8
2128 if (utf8)
2129 {
2130 length = 1;
2131 ecode++;
2132 GETCHARLEN(fc, ecode, length);
2133 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2134 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2135 }
2136 else
2137 #endif
2138
2139 /* Non-UTF-8 mode */
2140 {
2141 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2142 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2143 ecode += 2;
2144 }
2145 break;
2146
2147 /* Match a single character, caselessly */
2148
2149 case OP_CHARNC:
2150 #ifdef SUPPORT_UTF8
2151 if (utf8)
2152 {
2153 length = 1;
2154 ecode++;
2155 GETCHARLEN(fc, ecode, length);
2156
2157 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2158
2159 /* If the pattern character's value is < 128, we have only one byte, and
2160 can use the fast lookup table. */
2161
2162 if (fc < 128)
2163 {
2164 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2165 }
2166
2167 /* Otherwise we must pick up the subject character */
2168
2169 else
2170 {
2171 unsigned int dc;
2172 GETCHARINC(dc, eptr);
2173 ecode += length;
2174
2175 /* If we have Unicode property support, we can use it to test the other
2176 case of the character, if there is one. */
2177
2178 if (fc != dc)
2179 {
2180 #ifdef SUPPORT_UCP
2181 if (dc != _pcre_ucp_othercase(fc))
2182 #endif
2183 RRETURN(MATCH_NOMATCH);
2184 }
2185 }
2186 }
2187 else
2188 #endif /* SUPPORT_UTF8 */
2189
2190 /* Non-UTF-8 mode */
2191 {
2192 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2193 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2194 ecode += 2;
2195 }
2196 break;
2197
2198 /* Match a single character repeatedly. */
2199
2200 case OP_EXACT:
2201 min = max = GET2(ecode, 1);
2202 ecode += 3;
2203 goto REPEATCHAR;
2204
2205 case OP_POSUPTO:
2206 possessive = TRUE;
2207 /* Fall through */
2208
2209 case OP_UPTO:
2210 case OP_MINUPTO:
2211 min = 0;
2212 max = GET2(ecode, 1);
2213 minimize = *ecode == OP_MINUPTO;
2214 ecode += 3;
2215 goto REPEATCHAR;
2216
2217 case OP_POSSTAR:
2218 possessive = TRUE;
2219 min = 0;
2220 max = INT_MAX;
2221 ecode++;
2222 goto REPEATCHAR;
2223
2224 case OP_POSPLUS:
2225 possessive = TRUE;
2226 min = 1;
2227 max = INT_MAX;
2228 ecode++;
2229 goto REPEATCHAR;
2230
2231 case OP_POSQUERY:
2232 possessive = TRUE;
2233 min = 0;
2234 max = 1;
2235 ecode++;
2236 goto REPEATCHAR;
2237
2238 case OP_STAR:
2239 case OP_MINSTAR:
2240 case OP_PLUS:
2241 case OP_MINPLUS:
2242 case OP_QUERY:
2243 case OP_MINQUERY:
2244 c = *ecode++ - OP_STAR;
2245 minimize = (c & 1) != 0;
2246 min = rep_min[c]; /* Pick up values from tables; */
2247 max = rep_max[c]; /* zero for max => infinity */
2248 if (max == 0) max = INT_MAX;
2249
2250 /* Common code for all repeated single-character matches. We can give
2251 up quickly if there are fewer than the minimum number of characters left in
2252 the subject. */
2253
2254 REPEATCHAR:
2255 #ifdef SUPPORT_UTF8
2256 if (utf8)
2257 {
2258 length = 1;
2259 charptr = ecode;
2260 GETCHARLEN(fc, ecode, length);
2261 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2262 ecode += length;
2263
2264 /* Handle multibyte character matching specially here. There is
2265 support for caseless matching if UCP support is present. */
2266
2267 if (length > 1)
2268 {
2269 #ifdef SUPPORT_UCP
2270 unsigned int othercase;
2271 if ((ims & PCRE_CASELESS) != 0 &&
2272 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2273 oclength = _pcre_ord2utf8(othercase, occhars);
2274 else oclength = 0;
2275 #endif /* SUPPORT_UCP */
2276
2277 for (i = 1; i <= min; i++)
2278 {
2279 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2280 #ifdef SUPPORT_UCP
2281 /* Need braces because of following else */
2282 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2283 else
2284 {
2285 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2286 eptr += oclength;
2287 }
2288 #else /* without SUPPORT_UCP */
2289 else { RRETURN(MATCH_NOMATCH); }
2290 #endif /* SUPPORT_UCP */
2291 }
2292
2293 if (min == max) continue;
2294
2295 if (minimize)
2296 {
2297 for (fi = min;; fi++)
2298 {
2299 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2300 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2301 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2302 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2303 #ifdef SUPPORT_UCP
2304 /* Need braces because of following else */
2305 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2306 else
2307 {
2308 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2309 eptr += oclength;
2310 }
2311 #else /* without SUPPORT_UCP */
2312 else { RRETURN (MATCH_NOMATCH); }
2313 #endif /* SUPPORT_UCP */
2314 }
2315 /* Control never gets here */
2316 }
2317
2318 else /* Maximize */
2319 {
2320 pp = eptr;
2321 for (i = min; i < max; i++)
2322 {
2323 if (eptr > md->end_subject - length) break;
2324 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2325 #ifdef SUPPORT_UCP
2326 else if (oclength == 0) break;
2327 else
2328 {
2329 if (memcmp(eptr, occhars, oclength) != 0) break;
2330 eptr += oclength;
2331 }
2332 #else /* without SUPPORT_UCP */
2333 else break;
2334 #endif /* SUPPORT_UCP */
2335 }
2336
2337 if (possessive) continue;
2338 for(;;)
2339 {
2340 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2341 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2342 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2343 #ifdef SUPPORT_UCP
2344 eptr--;
2345 BACKCHAR(eptr);
2346 #else /* without SUPPORT_UCP */
2347 eptr -= length;
2348 #endif /* SUPPORT_UCP */
2349 }
2350 }
2351 /* Control never gets here */
2352 }
2353
2354 /* If the length of a UTF-8 character is 1, we fall through here, and
2355 obey the code as for non-UTF-8 characters below, though in this case the
2356 value of fc will always be < 128. */
2357 }
2358 else
2359 #endif /* SUPPORT_UTF8 */
2360
2361 /* When not in UTF-8 mode, load a single-byte character. */
2362 {
2363 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2364 fc = *ecode++;
2365 }
2366
2367 /* The value of fc at this point is always less than 256, though we may or
2368 may not be in UTF-8 mode. The code is duplicated for the caseless and
2369 caseful cases, for speed, since matching characters is likely to be quite
2370 common. First, ensure the minimum number of matches are present. If min =
2371 max, continue at the same level without recursing. Otherwise, if
2372 minimizing, keep trying the rest of the expression and advancing one
2373 matching character if failing, up to the maximum. Alternatively, if
2374 maximizing, find the maximum number of characters and work backwards. */
2375
2376 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2377 max, eptr));
2378
2379 if ((ims & PCRE_CASELESS) != 0)
2380 {
2381 fc = md->lcc[fc];
2382 for (i = 1; i <= min; i++)
2383 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2384 if (min == max) continue;
2385 if (minimize)
2386 {
2387 for (fi = min;; fi++)
2388 {
2389 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2390 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2391 if (fi >= max || eptr >= md->end_subject ||
2392 fc != md->lcc[*eptr++])
2393 RRETURN(MATCH_NOMATCH);
2394 }
2395 /* Control never gets here */
2396 }
2397 else /* Maximize */
2398 {
2399 pp = eptr;
2400 for (i = min; i < max; i++)
2401 {
2402 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2403 eptr++;
2404 }
2405 if (possessive) continue;
2406 while (eptr >= pp)
2407 {
2408 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2409 eptr--;
2410 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2411 }
2412 RRETURN(MATCH_NOMATCH);
2413 }
2414 /* Control never gets here */
2415 }
2416
2417 /* Caseful comparisons (single-byte characters only; multibyte UTF-8 characters have already been fully handled above) */
2418
2419 else
2420 {
2421 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2422 if (min == max) continue;
2423 if (minimize)
2424 {
2425 for (fi = min;; fi++)
2426 {
2427 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2428 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2429 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2430 RRETURN(MATCH_NOMATCH);
2431 }
2432 /* Control never gets here */
2433 }
2434 else /* Maximize */
2435 {
2436 pp = eptr;
2437 for (i = min; i < max; i++)
2438 {
2439 if (eptr >= md->end_subject || fc != *eptr) break;
2440 eptr++;
2441 }
2442 if (possessive) continue;
2443 while (eptr >= pp)
2444 {
2445 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2446 eptr--;
2447 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2448 }
2449 RRETURN(MATCH_NOMATCH);
2450 }
2451 }
2452 /* Control never gets here */
2453
2454 /* Match a negated single character. The pattern character is always one
2455 byte, but the subject character we are checking can be multibyte. */
2456
2457 case OP_NOT:
2458 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2459 ecode++;
2460 GETCHARINCTEST(c, eptr);
2461 if ((ims & PCRE_CASELESS) != 0)
2462 {
2463 #ifdef SUPPORT_UTF8
2464 if (c < 256)
2465 #endif
2466 c = md->lcc[c];
2467 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2468 }
2469 else
2470 {
2471 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2472 }
2473 break;
2474
2475 /* Match a negated single one-byte character repeatedly. This is almost a
2476 repeat of the code for a repeated single character, but I haven't found a
2477 nice way of commoning these up that doesn't require a test of the
2478 positive/negative option for each character match. Maybe that wouldn't add
2479 very much to the time taken, but character matching *is* what this is all
2480 about... */
2481
2482 case OP_NOTEXACT:
2483 min = max = GET2(ecode, 1);
2484 ecode += 3;
2485 goto REPEATNOTCHAR;
2486
2487 case OP_NOTUPTO:
2488 case OP_NOTMINUPTO:
2489 min = 0;
2490 max = GET2(ecode, 1);
2491 minimize = *ecode == OP_NOTMINUPTO;
2492 ecode += 3;
2493 goto REPEATNOTCHAR;
2494
2495 case OP_NOTPOSSTAR:
2496 possessive = TRUE;
2497 min = 0;
2498 max = INT_MAX;
2499 ecode++;
2500 goto REPEATNOTCHAR;
2501
2502 case OP_NOTPOSPLUS:
2503 possessive = TRUE;
2504 min = 1;
2505 max = INT_MAX;
2506 ecode++;
2507 goto REPEATNOTCHAR;
2508
2509 case OP_NOTPOSQUERY:
2510 possessive = TRUE;
2511 min = 0;
2512 max = 1;
2513 ecode++;
2514 goto REPEATNOTCHAR;
2515
2516 case OP_NOTPOSUPTO:
2517 possessive = TRUE;
2518 min = 0;
2519 max = GET2(ecode, 1);
2520 ecode += 3;
2521 goto REPEATNOTCHAR;
2522
2523 case OP_NOTSTAR:
2524 case OP_NOTMINSTAR:
2525 case OP_NOTPLUS:
2526 case OP_NOTMINPLUS:
2527 case OP_NOTQUERY:
2528 case OP_NOTMINQUERY:
2529 c = *ecode++ - OP_NOTSTAR;
2530 minimize = (c & 1) != 0;
2531 min = rep_min[c]; /* Pick up values from tables; */
2532 max = rep_max[c]; /* zero for max => infinity */
2533 if (max == 0) max = INT_MAX;
2534
2535 /* Common code for all repeated single-byte matches. We can give up quickly
2536 if there are fewer than the minimum number of bytes left in the
2537 subject. */
2538
2539 REPEATNOTCHAR:
2540 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2541 fc = *ecode++;
2542
2543 /* The code is duplicated for the caseless and caseful cases, for speed,
2544 since matching characters is likely to be quite common. First, ensure the
2545 minimum number of matches are present. If min = max, continue at the same
2546 level without recursing. Otherwise, if minimizing, keep trying the rest of
2547 the expression and advancing one matching character if failing, up to the
2548 maximum. Alternatively, if maximizing, find the maximum number of
2549 characters and work backwards. */
2550
2551 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2552 max, eptr));
2553
2554 if ((ims & PCRE_CASELESS) != 0)
2555 {
2556 fc = md->lcc[fc];
2557
2558 #ifdef SUPPORT_UTF8
2559 /* UTF-8 mode */
2560 if (utf8)
2561 {
2562 register unsigned int d;
2563 for (i = 1; i <= min; i++)
2564 {
2565 GETCHARINC(d, eptr);
2566 if (d < 256) d = md->lcc[d];
2567 if (fc == d) RRETURN(MATCH_NOMATCH);
2568 }
2569 }
2570 else
2571 #endif
2572
2573 /* Not UTF-8 mode */
2574 {
2575 for (i = 1; i <= min; i++)
2576 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2577 }
2578
2579 if (min == max) continue;
2580
2581 if (minimize)
2582 {
2583 #ifdef SUPPORT_UTF8
2584 /* UTF-8 mode */
2585 if (utf8)
2586 {
2587 register unsigned int d;
2588 for (fi = min;; fi++)
2589 {
2590 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2591 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2592 GETCHARINC(d, eptr);
2593 if (d < 256) d = md->lcc[d];
2594 if (fi >= max || eptr >= md->end_subject || fc == d)
2595 RRETURN(MATCH_NOMATCH);
2596 }
2597 }
2598 else
2599 #endif
2600 /* Not UTF-8 mode */
2601 {
2602 for (fi = min;; fi++)
2603 {
2604 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2605 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2606 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2607 RRETURN(MATCH_NOMATCH);
2608 }
2609 }
2610 /* Control never gets here */
2611 }
2612
2613 /* Maximize case */
2614
2615 else
2616 {
2617 pp = eptr;
2618
2619 #ifdef SUPPORT_UTF8
2620 /* UTF-8 mode */
2621 if (utf8)
2622 {
2623 register unsigned int d;
2624 for (i = min; i < max; i++)
2625 {
2626 int len = 1;
2627 if (eptr >= md->end_subject) break;
2628 GETCHARLEN(d, eptr, len);
2629 if (d < 256) d = md->lcc[d];
2630 if (fc == d) break;
2631 eptr += len;
2632 }
2633 if (possessive) continue;
2634 for(;;)
2635 {
2636 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2637 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2638 if (eptr-- == pp) break; /* Stop if tried at original pos */
2639 BACKCHAR(eptr);
2640 }
2641 }
2642 else
2643 #endif
2644 /* Not UTF-8 mode */
2645 {
2646 for (i = min; i < max; i++)
2647 {
2648 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2649 eptr++;
2650 }
2651 if (possessive) continue;
2652 while (eptr >= pp)
2653 {
2654 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2655 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2656 eptr--;
2657 }
2658 }
2659
2660 RRETURN(MATCH_NOMATCH);
2661 }
2662 /* Control never gets here */
2663 }
2664
2665 /* Caseful comparisons */
2666
2667 else
2668 {
2669 #ifdef SUPPORT_UTF8
2670 /* UTF-8 mode */
2671 if (utf8)
2672 {
2673 register unsigned int d;
2674 for (i = 1; i <= min; i++)
2675 {
2676 GETCHARINC(d, eptr);
2677 if (fc == d) RRETURN(MATCH_NOMATCH);
2678 }
2679 }
2680 else
2681 #endif
2682 /* Not UTF-8 mode */
2683 {
2684 for (i = 1; i <= min; i++)
2685 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2686 }
2687
2688 if (min == max) continue;
2689
2690 if (minimize)
2691 {
2692 #ifdef SUPPORT_UTF8
2693 /* UTF-8 mode */
2694 if (utf8)
2695 {
2696 register unsigned int d;
2697 for (fi = min;; fi++)
2698 {
2699 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2700 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2701 GETCHARINC(d, eptr);
2702 if (fi >= max || eptr >= md->end_subject || fc == d)
2703 RRETURN(MATCH_NOMATCH);
2704 }
2705 }
2706 else
2707 #endif
2708 /* Not UTF-8 mode */
2709 {
2710 for (fi = min;; fi++)
2711 {
2712 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2713 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2714 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2715 RRETURN(MATCH_NOMATCH);
2716 }
2717 }
2718 /* Control never gets here */
2719 }
2720
2721 /* Maximize case */
2722
2723 else
2724 {
2725 pp = eptr;
2726
2727 #ifdef SUPPORT_UTF8
2728 /* UTF-8 mode */
2729 if (utf8)
2730 {
2731 register unsigned int d;
2732 for (i = min; i < max; i++)
2733 {
2734 int len = 1;
2735 if (eptr >= md->end_subject) break;
2736 GETCHARLEN(d, eptr, len);
2737 if (fc == d) break;
2738 eptr += len;
2739 }
2740 if (possessive) continue;
2741 for(;;)
2742 {
2743 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2744 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2745 if (eptr-- == pp) break; /* Stop if tried at original pos */
2746 BACKCHAR(eptr);
2747 }
2748 }
2749 else
2750 #endif
2751 /* Not UTF-8 mode */
2752 {
2753 for (i = min; i < max; i++)
2754 {
2755 if (eptr >= md->end_subject || fc == *eptr) break;
2756 eptr++;
2757 }
2758 if (possessive) continue;
2759 while (eptr >= pp)
2760 {
2761 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2762 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2763 eptr--;
2764 }
2765 }
2766
2767 RRETURN(MATCH_NOMATCH);
2768 }
2769 }
2770 /* Control never gets here */
2771
2772 /* Match a single character type repeatedly; several different opcodes
2773 share code. This is very similar to the code for single characters, but we
2774 repeat it in the interests of efficiency. */
2775
2776 case OP_TYPEEXACT:
2777 min = max = GET2(ecode, 1);
2778 minimize = TRUE;
2779 ecode += 3;
2780 goto REPEATTYPE;
2781
2782 case OP_TYPEUPTO:
2783 case OP_TYPEMINUPTO:
2784 min = 0;
2785 max = GET2(ecode, 1);
2786 minimize = *ecode == OP_TYPEMINUPTO;
2787 ecode += 3;
2788 goto REPEATTYPE;
2789
2790 case OP_TYPEPOSSTAR:
2791 possessive = TRUE;
2792 min = 0;
2793 max = INT_MAX;
2794 ecode++;
2795 goto REPEATTYPE;
2796
2797 case OP_TYPEPOSPLUS:
2798 possessive = TRUE;
2799 min = 1;
2800 max = INT_MAX;
2801 ecode++;
2802 goto REPEATTYPE;
2803
2804 case OP_TYPEPOSQUERY:
2805 possessive = TRUE;
2806 min = 0;
2807 max = 1;
2808 ecode++;
2809 goto REPEATTYPE;
2810
2811 case OP_TYPEPOSUPTO:
2812 possessive = TRUE;
2813 min = 0;
2814 max = GET2(ecode, 1);
2815 ecode += 3;
2816 goto REPEATTYPE;
2817
2818 case OP_TYPESTAR:
2819 case OP_TYPEMINSTAR:
2820 case OP_TYPEPLUS:
2821 case OP_TYPEMINPLUS:
2822 case OP_TYPEQUERY:
2823 case OP_TYPEMINQUERY:
2824 c = *ecode++ - OP_TYPESTAR;
2825 minimize = (c & 1) != 0;
2826 min = rep_min[c]; /* Pick up values from tables; */
2827 max = rep_max[c]; /* zero for max => infinity */
2828 if (max == 0) max = INT_MAX;
2829
2830 /* Common code for all repeated single character type matches. Note that
2831 in UTF-8 mode, '.' and several other types (OP_ANYNL, OP_HSPACE, OP_VSPACE and
2832 the negated types) can match a multibyte character, whereas OP_DIGIT,
2832 OP_WHITESPACE and OP_WORDCHAR match only one-byte characters. */
2833
2834 REPEATTYPE:
2835 ctype = *ecode++; /* Code for the character type */
2836
2837 #ifdef SUPPORT_UCP
2838 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2839 {
2840 prop_fail_result = ctype == OP_NOTPROP;
2841 prop_type = *ecode++;
2842 prop_value = *ecode++;
2843 }
2844 else prop_type = -1;
2845 #endif
2846
2847 /* First, ensure the minimum number of matches are present. Use inline
2848 code for maximizing the speed, and do the type test once at the start
2849 (i.e. keep it out of the loop). Also we can test that there are at least
2850 the minimum number of bytes before we start. This isn't as effective in
2851 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2852 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2853 and single-bytes. */
2854
2855 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2856 if (min > 0)
2857 {
2858 #ifdef SUPPORT_UCP
2859 if (prop_type >= 0)
2860 {
2861 switch(prop_type)
2862 {
2863 case PT_ANY:
2864 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2865 for (i = 1; i <= min; i++)
2866 {
2867 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2868 GETCHARINCTEST(c, eptr);
2869 }
2870 break;
2871
2872 case PT_LAMP:
2873 for (i = 1; i <= min; i++)
2874 {
2875 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2876 GETCHARINCTEST(c, eptr);
2877 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2878 if ((prop_chartype == ucp_Lu ||
2879 prop_chartype == ucp_Ll ||
2880 prop_chartype == ucp_Lt) == prop_fail_result)
2881 RRETURN(MATCH_NOMATCH);
2882 }
2883 break;
2884
2885 case PT_GC:
2886 for (i = 1; i <= min; i++)
2887 {
2888 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2889 GETCHARINCTEST(c, eptr);
2890 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2891 if ((prop_category == prop_value) == prop_fail_result)
2892 RRETURN(MATCH_NOMATCH);
2893 }
2894 break;
2895
2896 case PT_PC:
2897 for (i = 1; i <= min; i++)
2898 {
2899 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2900 GETCHARINCTEST(c, eptr);
2901 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2902 if ((prop_chartype == prop_value) == prop_fail_result)
2903 RRETURN(MATCH_NOMATCH);
2904 }
2905 break;
2906
2907 case PT_SC:
2908 for (i = 1; i <= min; i++)
2909 {
2910 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2911 GETCHARINCTEST(c, eptr);
2912 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2913 if ((prop_script == prop_value) == prop_fail_result)
2914 RRETURN(MATCH_NOMATCH);
2915 }
2916 break;
2917
2918 default:
2919 RRETURN(PCRE_ERROR_INTERNAL);
2920 }
2921 }
2922
2923 /* Match extended Unicode sequences. We will get here only if the
2924 support is in the binary; otherwise a compile-time error occurs. */
2925
2926 else if (ctype == OP_EXTUNI)
2927 {
2928 for (i = 1; i <= min; i++)
2929 {
2930 GETCHARINCTEST(c, eptr);
2931 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2932 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2933 while (eptr < md->end_subject)
2934 {
2935 int len = 1;
2936 if (!utf8) c = *eptr; else
2937 {
2938 GETCHARLEN(c, eptr, len);
2939 }
2940 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2941 if (prop_category != ucp_M) break;
2942 eptr += len;
2943 }
2944 }
2945 }
2946
2947 else
2948 #endif /* SUPPORT_UCP */
2949
2950 /* Handle all other cases when the coding is UTF-8 */
2951
2952 #ifdef SUPPORT_UTF8
2953 if (utf8) switch(ctype)
2954 {
2955 case OP_ANY:
2956 for (i = 1; i <= min; i++)
2957 {
2958 if (eptr >= md->end_subject ||
2959 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2960 RRETURN(MATCH_NOMATCH);
2961 eptr++;
2962 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2963 }
2964 break;
2965
2966 case OP_ALLANY:
2967 for (i = 1; i <= min; i++)
2968 {
2969 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2970 eptr++;
2971 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2972 }
2973 break;
2974
2975 case OP_ANYBYTE:
2976 eptr += min;
2977 break;
2978
2979 case OP_ANYNL:
2980 for (i = 1; i <= min; i++)
2981 {
2982 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2983 GETCHARINC(c, eptr);
2984 switch(c)
2985 {
2986 default: RRETURN(MATCH_NOMATCH);
2987 case 0x000d:
2988 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2989 break;
2990
2991 case 0x000a:
2992 break;
2993
2994 case 0x000b:
2995 case 0x000c:
2996 case 0x0085:
2997 case 0x2028:
2998 case 0x2029:
2999 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3000 break;
3001 }
3002 }
3003 break;
3004
3005 case OP_NOT_HSPACE:
3006 for (i = 1; i <= min; i++)
3007 {
3008 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3009 GETCHARINC(c, eptr);
3010 switch(c)
3011 {
3012 default: break;
3013 case 0x09: /* HT */
3014 case 0x20: /* SPACE */
3015 case 0xa0: /* NBSP */
3016 case 0x1680: /* OGHAM SPACE MARK */
3017 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3018 case 0x2000: /* EN QUAD */
3019 case 0x2001: /* EM QUAD */
3020 case 0x2002: /* EN SPACE */
3021 case 0x2003: /* EM SPACE */
3022 case 0x2004: /* THREE-PER-EM SPACE */
3023 case 0x2005: /* FOUR-PER-EM SPACE */
3024 case 0x2006: /* SIX-PER-EM SPACE */
3025 case 0x2007: /* FIGURE SPACE */
3026 case 0x2008: /* PUNCTUATION SPACE */
3027 case 0x2009: /* THIN SPACE */
3028 case 0x200A: /* HAIR SPACE */
3029 case 0x202f: /* NARROW NO-BREAK SPACE */
3030 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3031 case 0x3000: /* IDEOGRAPHIC SPACE */
3032 RRETURN(MATCH_NOMATCH);
3033 }
3034 }
3035 break;
3036
3037 case OP_HSPACE:
3038 for (i = 1; i <= min; i++)
3039 {
3040 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3041 GETCHARINC(c, eptr);
3042 switch(c)
3043 {
3044 default: RRETURN(MATCH_NOMATCH);
3045 case 0x09: /* HT */
3046 case 0x20: /* SPACE */
3047 case 0xa0: /* NBSP */
3048 case 0x1680: /* OGHAM SPACE MARK */
3049 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3050 case 0x2000: /* EN QUAD */
3051 case 0x2001: /* EM QUAD */
3052 case 0x2002: /* EN SPACE */
3053 case 0x2003: /* EM SPACE */
3054 case 0x2004: /* THREE-PER-EM SPACE */
3055 case 0x2005: /* FOUR-PER-EM SPACE */
3056 case 0x2006: /* SIX-PER-EM SPACE */
3057 case 0x2007: /* FIGURE SPACE */
3058 case 0x2008: /* PUNCTUATION SPACE */
3059 case 0x2009: /* THIN SPACE */
3060 case 0x200A: /* HAIR SPACE */
3061 case 0x202f: /* NARROW NO-BREAK SPACE */
3062 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3063 case 0x3000: /* IDEOGRAPHIC SPACE */
3064 break;
3065 }
3066 }
3067 break;
3068
3069 case OP_NOT_VSPACE:
3070 for (i = 1; i <= min; i++)
3071 {
3072 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3073 GETCHARINC(c, eptr);
3074 switch(c)
3075 {
3076 default: break;
3077 case 0x0a: /* LF */
3078 case 0x0b: /* VT */
3079 case 0x0c: /* FF */
3080 case 0x0d: /* CR */
3081 case 0x85: /* NEL */
3082 case 0x2028: /* LINE SEPARATOR */
3083 case 0x2029: /* PARAGRAPH SEPARATOR */
3084 RRETURN(MATCH_NOMATCH);
3085 }
3086 }
3087 break;
3088
3089 case OP_VSPACE:
3090 for (i = 1; i <= min; i++)
3091 {
3092 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3093 GETCHARINC(c, eptr);
3094 switch(c)
3095 {
3096 default: RRETURN(MATCH_NOMATCH);
3097 case 0x0a: /* LF */
3098 case 0x0b: /* VT */
3099 case 0x0c: /* FF */
3100 case 0x0d: /* CR */
3101 case 0x85: /* NEL */
3102 case 0x2028: /* LINE SEPARATOR */
3103 case 0x2029: /* PARAGRAPH SEPARATOR */
3104 break;
3105 }
3106 }
3107 break;
3108
3109 case OP_NOT_DIGIT:
3110 for (i = 1; i <= min; i++)
3111 {
3112 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3113 GETCHARINC(c, eptr);
3114 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3115 RRETURN(MATCH_NOMATCH);
3116 }
3117 break;
3118
3119 case OP_DIGIT:
3120 for (i = 1; i <= min; i++)
3121 {
3122 if (eptr >= md->end_subject ||
3123 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3124 RRETURN(MATCH_NOMATCH);
3125 /* No need to skip more bytes - we know it's a 1-byte character */
3126 }
3127 break;
3128
3129 case OP_NOT_WHITESPACE:
3130 for (i = 1; i <= min; i++)
3131 {
3132 if (eptr >= md->end_subject ||
3133 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3134 RRETURN(MATCH_NOMATCH);
3135 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3136 }
3137 break;
3138
3139 case OP_WHITESPACE:
3140 for (i = 1; i <= min; i++)
3141 {
3142 if (eptr >= md->end_subject ||
3143 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3144 RRETURN(MATCH_NOMATCH);
3145 /* No need to skip more bytes - we know it's a 1-byte character */
3146 }
3147 break;
3148
3149 case OP_NOT_WORDCHAR:
3150 for (i = 1; i <= min; i++)
3151 {
3152 if (eptr >= md->end_subject ||
3153 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3154 RRETURN(MATCH_NOMATCH);
3155 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3156 }
3157 break;
3158
3159 case OP_WORDCHAR:
3160 for (i = 1; i <= min; i++)
3161 {
3162 if (eptr >= md->end_subject ||
3163 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3164 RRETURN(MATCH_NOMATCH);
3165 /* No need to skip more bytes - we know it's a 1-byte character */
3166 }
3167 break;
3168
3169 default:
3170 RRETURN(PCRE_ERROR_INTERNAL);
3171 } /* End switch(ctype) */
3172
3173 else
3174 #endif /* SUPPORT_UTF8 */
3175
3176 /* Code for the non-UTF-8 case for minimum matching of operators other
3177 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3178 number of bytes present, as this was tested above. */
3179
3180 switch(ctype)
3181 {
3182 case OP_ANY:
3183 if ((ims & PCRE_DOTALL) == 0)
3184 {
3185 for (i = 1; i <= min; i++)
3186 {
3187 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3188 eptr++;
3189 }
3190 }
3191 else eptr += min;
3192 break;
3193
3194 case OP_ALLANY:
3195 eptr += min;
3196 break;
3197
3198 case OP_ANYBYTE:
3199 eptr += min;
3200 break;
3201
3202 /* Because of the CRLF case, we can't assume the minimum number of
3203 bytes are present in this case. */
3204
3205 case OP_ANYNL:
3206 for (i = 1; i <= min; i++)
3207 {
3208 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3209 switch(*eptr++)
3210 {
3211 default: RRETURN(MATCH_NOMATCH);
3212 case 0x000d:
3213 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3214 break;
3215 case 0x000a:
3216 break;
3217
3218 case 0x000b:
3219 case 0x000c:
3220 case 0x0085:
3221 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3222 break;
3223 }
3224 }
3225 break;
3226
3227 case OP_NOT_HSPACE:
3228 for (i = 1; i <= min; i++)
3229 {
3230 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3231 switch(*eptr++)
3232 {
3233 default: break;
3234 case 0x09: /* HT */
3235 case 0x20: /* SPACE */
3236 case 0xa0: /* NBSP */
3237 RRETURN(MATCH_NOMATCH);
3238 }
3239 }
3240 break;
3241
3242 case OP_HSPACE:
3243 for (i = 1; i <= min; i++)
3244 {
3245 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3246 switch(*eptr++)
3247 {
3248 default: RRETURN(MATCH_NOMATCH);
3249 case 0x09: /* HT */
3250 case 0x20: /* SPACE */
3251 case 0xa0: /* NBSP */
3252 break;
3253 }
3254 }
3255 break;
3256
3257 case OP_NOT_VSPACE:
3258 for (i = 1; i <= min; i++)
3259 {
3260 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3261 switch(*eptr++)
3262 {
3263 default: break;
3264 case 0x0a: /* LF */
3265 case 0x0b: /* VT */
3266 case 0x0c: /* FF */
3267 case 0x0d: /* CR */
3268 case 0x85: /* NEL */
3269 RRETURN(MATCH_NOMATCH);
3270 }
3271 }
3272 break;
3273
3274 case OP_VSPACE:
3275 for (i = 1; i <= min; i++)
3276 {
3277 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3278 switch(*eptr++)
3279 {
3280 default: RRETURN(MATCH_NOMATCH);
3281 case 0x0a: /* LF */
3282 case 0x0b: /* VT */
3283 case 0x0c: /* FF */
3284 case 0x0d: /* CR */
3285 case 0x85: /* NEL */
3286 break;
3287 }
3288 }
3289 break;
3290
3291 case OP_NOT_DIGIT:
3292 for (i = 1; i <= min; i++)
3293 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3294 break;
3295
3296 case OP_DIGIT:
3297 for (i = 1; i <= min; i++)
3298 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3299 break;
3300
3301 case OP_NOT_WHITESPACE:
3302 for (i = 1; i <= min; i++)
3303 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3304 break;
3305
3306 case OP_WHITESPACE:
3307 for (i = 1; i <= min; i++)
3308 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3309 break;
3310
3311 case OP_NOT_WORDCHAR:
3312 for (i = 1; i <= min; i++)
3313 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3314 RRETURN(MATCH_NOMATCH);
3315 break;
3316
3317 case OP_WORDCHAR:
3318 for (i = 1; i <= min; i++)
3319 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3320 RRETURN(MATCH_NOMATCH);
3321 break;
3322
3323 default:
3324 RRETURN(PCRE_ERROR_INTERNAL);
3325 }
3326 }
3327
3328 /* If min = max, continue at the same level without recursing */
3329
3330 if (min == max) continue;
3331
3332 /* If minimizing, we have to test the rest of the pattern before each
3333 subsequent match. Again, separate the UTF-8 case for speed, and also
3334 separate the UCP cases. */
3335
3336 if (minimize)
3337 {
3338 #ifdef SUPPORT_UCP
3339 if (prop_type >= 0)
3340 {
3341 switch(prop_type)
3342 {
3343 case PT_ANY:
3344 for (fi = min;; fi++)
3345 {
3346 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3347 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3348 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3349 GETCHARINC(c, eptr);
3350 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3351 }
3352 /* Control never gets here */
3353
3354 case PT_LAMP:
3355 for (fi = min;; fi++)
3356 {
3357 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3358 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3359 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3360 GETCHARINC(c, eptr);
3361 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3362 if ((prop_chartype == ucp_Lu ||
3363 prop_chartype == ucp_Ll ||
3364 prop_chartype == ucp_Lt) == prop_fail_result)
3365 RRETURN(MATCH_NOMATCH);
3366 }
3367 /* Control never gets here */
3368
3369 case PT_GC:
3370 for (fi = min;; fi++)
3371 {
3372 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3373 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3374 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3375 GETCHARINC(c, eptr);
3376 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3377 if ((prop_category == prop_value) == prop_fail_result)
3378 RRETURN(MATCH_NOMATCH);
3379 }
3380 /* Control never gets here */
3381
3382 case PT_PC:
3383 for (fi = min;; fi++)
3384 {
3385 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3386 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3387 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3388 GETCHARINC(c, eptr);
3389 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3390 if ((prop_chartype == prop_value) == prop_fail_result)
3391 RRETURN(MATCH_NOMATCH);
3392 }
3393 /* Control never gets here */
3394
3395 case PT_SC:
3396 for (fi = min;; fi++)
3397 {
3398 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3399 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3400 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3401 GETCHARINC(c, eptr);
3402 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3403 if ((prop_script == prop_value) == prop_fail_result)
3404 RRETURN(MATCH_NOMATCH);
3405 }
3406 /* Control never gets here */
3407
3408 default:
3409 RRETURN(PCRE_ERROR_INTERNAL);
3410 }
3411 }
3412
3413 /* Match extended Unicode sequences. We will get here only if the
3414 support is in the binary; otherwise a compile-time error occurs. */
3415
3416 else if (ctype == OP_EXTUNI)
3417 {
3418 for (fi = min;; fi++)
3419 {
3420 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3421 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3422 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3423 GETCHARINCTEST(c, eptr);
3424 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3425 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3426 while (eptr < md->end_subject)
3427 {
3428 int len = 1;
3429 if (!utf8) c = *eptr; else
3430 {
3431 GETCHARLEN(c, eptr, len);
3432 }
3433 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3434 if (prop_category != ucp_M) break;
3435 eptr += len;
3436 }
3437 }
3438 }
3439
3440 else
3441 #endif /* SUPPORT_UCP */
3442
3443 #ifdef SUPPORT_UTF8
3444 /* UTF-8 mode */
3445 if (utf8)
3446 {
3447 for (fi = min;; fi++)
3448 {
3449 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3450 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3451 if (fi >= max || eptr >= md->end_subject ||
3452 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3453 IS_NEWLINE(eptr)))
3454 RRETURN(MATCH_NOMATCH);
3455
3456 GETCHARINC(c, eptr);
3457 switch(ctype)
3458 {
3459 case OP_ANY: /* This is the DOTALL case */
3460 case OP_ALLANY:
3461 case OP_ANYBYTE:
3462 break;
3463
3464 case OP_ANYNL:
3465 switch(c)
3466 {
3467 default: RRETURN(MATCH_NOMATCH);
3468 case 0x000d:
3469 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3470 break;
3471 case 0x000a:
3472 break;
3473
3474 case 0x000b:
3475 case 0x000c:
3476 case 0x0085:
3477 case 0x2028:
3478 case 0x2029:
3479 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3480 break;
3481 }
3482 break;
3483
3484 case OP_NOT_HSPACE:
3485 switch(c)
3486 {
3487 default: break;
3488 case 0x09: /* HT */
3489 case 0x20: /* SPACE */
3490 case 0xa0: /* NBSP */
3491 case 0x1680: /* OGHAM SPACE MARK */
3492 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3493 case 0x2000: /* EN QUAD */
3494 case 0x2001: /* EM QUAD */
3495 case 0x2002: /* EN SPACE */
3496 case 0x2003: /* EM SPACE */
3497 case 0x2004: /* THREE-PER-EM SPACE */
3498 case 0x2005: /* FOUR-PER-EM SPACE */
3499 case 0x2006: /* SIX-PER-EM SPACE */
3500 case 0x2007: /* FIGURE SPACE */
3501 case 0x2008: /* PUNCTUATION SPACE */
3502 case 0x2009: /* THIN SPACE */
3503 case 0x200A: /* HAIR SPACE */
3504 case 0x202f: /* NARROW NO-BREAK SPACE */
3505 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3506 case 0x3000: /* IDEOGRAPHIC SPACE */
3507 RRETURN(MATCH_NOMATCH);
3508 }
3509 break;
3510
3511 case OP_HSPACE:
3512 switch(c)
3513 {
3514 default: RRETURN(MATCH_NOMATCH);
3515 case 0x09: /* HT */
3516 case 0x20: /* SPACE */
3517 case 0xa0: /* NBSP */
3518 case 0x1680: /* OGHAM SPACE MARK */
3519 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3520 case 0x2000: /* EN QUAD */
3521 case 0x2001: /* EM QUAD */
3522 case 0x2002: /* EN SPACE */
3523 case 0x2003: /* EM SPACE */
3524 case 0x2004: /* THREE-PER-EM SPACE */
3525 case 0x2005: /* FOUR-PER-EM SPACE */
3526 case 0x2006: /* SIX-PER-EM SPACE */
3527 case 0x2007: /* FIGURE SPACE */
3528 case 0x2008: /* PUNCTUATION SPACE */
3529 case 0x2009: /* THIN SPACE */
3530 case 0x200A: /* HAIR SPACE */
3531 case 0x202f: /* NARROW NO-BREAK SPACE */
3532 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3533 case 0x3000: /* IDEOGRAPHIC SPACE */
3534 break;
3535 }
3536 break;
3537
3538 case OP_NOT_VSPACE:
3539 switch(c)
3540 {
3541 default: break;
3542 case 0x0a: /* LF */
3543 case 0x0b: /* VT */
3544 case 0x0c: /* FF */
3545 case 0x0d: /* CR */
3546 case 0x85: /* NEL */
3547 case 0x2028: /* LINE SEPARATOR */
3548 case 0x2029: /* PARAGRAPH SEPARATOR */
3549 RRETURN(MATCH_NOMATCH);
3550 }
3551 break;
3552
3553 case OP_VSPACE:
3554 switch(c)
3555 {
3556 default: RRETURN(MATCH_NOMATCH);
3557 case 0x0a: /* LF */
3558 case 0x0b: /* VT */
3559 case 0x0c: /* FF */
3560 case 0x0d: /* CR */
3561 case 0x85: /* NEL */
3562 case 0x2028: /* LINE SEPARATOR */
3563 case 0x2029: /* PARAGRAPH SEPARATOR */
3564 break;
3565 }
3566 break;
3567
3568 case OP_NOT_DIGIT:
3569 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3570 RRETURN(MATCH_NOMATCH);
3571 break;
3572
3573 case OP_DIGIT:
3574 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3575 RRETURN(MATCH_NOMATCH);
3576 break;
3577
3578 case OP_NOT_WHITESPACE:
3579 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3580 RRETURN(MATCH_NOMATCH);
3581 break;
3582
3583 case OP_WHITESPACE:
3584 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3585 RRETURN(MATCH_NOMATCH);
3586 break;
3587
3588 case OP_NOT_WORDCHAR:
3589 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3590 RRETURN(MATCH_NOMATCH);
3591 break;
3592
3593 case OP_WORDCHAR:
3594 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3595 RRETURN(MATCH_NOMATCH);
3596 break;
3597
3598 default:
3599 RRETURN(PCRE_ERROR_INTERNAL);
3600 }
3601 }
3602 }
3603 else
3604 #endif
3605 /* Not UTF-8 mode */
3606 {
3607 for (fi = min;; fi++)
3608 {
3609 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3610 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3611 if (fi >= max || eptr >= md->end_subject ||
3612 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3613 RRETURN(MATCH_NOMATCH);
3614
3615 c = *eptr++;
3616 switch(ctype)
3617 {
3618 case OP_ANY: /* This is the DOTALL case */
3619 case OP_ALLANY:
3620 case OP_ANYBYTE:
3621 break;
3622
3623 case OP_ANYNL:
3624 switch(c)
3625 {
3626 default: RRETURN(MATCH_NOMATCH);
3627 case 0x000d:
3628 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3629 break;
3630
3631 case 0x000a:
3632 break;
3633
3634 case 0x000b:
3635 case 0x000c:
3636 case 0x0085:
3637 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3638 break;
3639 }
3640 break;
3641
3642 case OP_NOT_HSPACE:
3643 switch(c)
3644 {
3645 default: break;
3646 case 0x09: /* HT */
3647 case 0x20: /* SPACE */
3648 case 0xa0: /* NBSP */
3649 RRETURN(MATCH_NOMATCH);
3650 }
3651 break;
3652
3653 case OP_HSPACE:
3654 switch(c)
3655 {
3656 default: RRETURN(MATCH_NOMATCH);
3657 case 0x09: /* HT */
3658 case 0x20: /* SPACE */
3659 case 0xa0: /* NBSP */
3660 break;
3661 }
3662 break;
3663
3664 case OP_NOT_VSPACE:
3665 switch(c)
3666 {
3667 default: break;
3668 case 0x0a: /* LF */
3669 case 0x0b: /* VT */
3670 case 0x0c: /* FF */
3671 case 0x0d: /* CR */
3672 case 0x85: /* NEL */
3673 RRETURN(MATCH_NOMATCH);
3674 }
3675 break;
3676
3677 case OP_VSPACE:
3678 switch(c)
3679 {
3680 default: RRETURN(MATCH_NOMATCH);
3681 case 0x0a: /* LF */
3682 case 0x0b: /* VT */
3683 case 0x0c: /* FF */
3684 case 0x0d: /* CR */
3685 case 0x85: /* NEL */
3686 break;
3687 }
3688 break;
3689
3690 case OP_NOT_DIGIT:
3691 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3692 break;
3693
3694 case OP_DIGIT:
3695 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3696 break;
3697
3698 case OP_NOT_WHITESPACE:
3699 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3700 break;
3701
3702 case OP_WHITESPACE:
3703 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3704 break;
3705
3706 case OP_NOT_WORDCHAR:
3707 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3708 break;
3709
3710 case OP_WORDCHAR:
3711 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3712 break;
3713
3714 default:
3715 RRETURN(PCRE_ERROR_INTERNAL);
3716 }
3717 }
3718 }
3719 /* Control never gets here */
3720 }
3721
3722 /* If maximizing, it is worth using inline code for speed, doing the type
3723 test once at the start (i.e. keep it out of the loop). Again, keep the
3724 UTF-8 and UCP stuff separate. */
3725
3726 else
3727 {
3728 pp = eptr; /* Remember where we started */
3729
3730 #ifdef SUPPORT_UCP
3731 if (prop_type >= 0)
3732 {
3733 switch(prop_type)
3734 {
3735 case PT_ANY:
3736 for (i = min; i < max; i++)
3737 {
3738 int len = 1;
3739 if (eptr >= md->end_subject) break;
3740 GETCHARLEN(c, eptr, len);
3741 if (prop_fail_result) break;
3742 eptr+= len;
3743 }
3744 break;
3745
3746 case PT_LAMP:
3747 for (i = min; i < max; i++)
3748 {
3749 int len = 1;
3750 if (eptr >= md->end_subject) break;
3751 GETCHARLEN(c, eptr, len);
3752 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3753 if ((prop_chartype == ucp_Lu ||
3754 prop_chartype == ucp_Ll ||
3755 prop_chartype == ucp_Lt) == prop_fail_result)
3756 break;
3757 eptr+= len;
3758 }
3759 break;
3760
3761 case PT_GC:
3762 for (i = min; i < max; i++)
3763 {
3764 int len = 1;
3765 if (eptr >= md->end_subject) break;
3766 GETCHARLEN(c, eptr, len);
3767 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3768 if ((prop_category == prop_value) == prop_fail_result)
3769 break;
3770 eptr+= len;
3771 }
3772 break;
3773
3774 case PT_PC:
3775 for (i = min; i < max; i++)
3776 {
3777 int len = 1;
3778 if (eptr >= md->end_subject) break;
3779 GETCHARLEN(c, eptr, len);
3780 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3781 if ((prop_chartype == prop_value) == prop_fail_result)
3782 break;
3783 eptr+= len;
3784 }
3785 break;
3786
3787 case PT_SC:
3788 for (i = min; i < max; i++)
3789 {
3790 int len = 1;
3791 if (eptr >= md->end_subject) break;
3792 GETCHARLEN(c, eptr, len);
3793 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3794 if ((prop_script == prop_value) == prop_fail_result)
3795 break;
3796 eptr+= len;
3797 }
3798 break;
3799 }
3800
3801 /* eptr is now past the end of the maximum run */
3802
3803 if (possessive) continue;
3804 for(;;)
3805 {
3806 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3808 if (eptr-- == pp) break; /* Stop if tried at original pos */
3809 if (utf8) BACKCHAR(eptr);
3810 }
3811 }
3812
3813 /* Match extended Unicode sequences. We will get here only if the
3814 support is in the binary; otherwise a compile-time error occurs. */
3815
3816 else if (ctype == OP_EXTUNI)
3817 {
3818 for (i = min; i < max; i++)
3819 {
3820 if (eptr >= md->end_subject) break;
3821 GETCHARINCTEST(c, eptr);
3822 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3823 if (prop_category == ucp_M) break;
3824 while (eptr < md->end_subject)
3825 {
3826 int len = 1;
3827 if (!utf8) c = *eptr; else
3828 {
3829 GETCHARLEN(c, eptr, len);
3830 }
3831 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3832 if (prop_category != ucp_M) break;
3833 eptr += len;
3834 }
3835 }
3836
3837 /* eptr is now past the end of the maximum run */
3838
3839 if (possessive) continue;
3840 for(;;)
3841 {
3842 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3843 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3844 if (eptr-- == pp) break; /* Stop if tried at original pos */
3845 for (;;) /* Move back over one extended */
3846 {
3847 int len = 1;
3848 if (!utf8) c = *eptr; else
3849 {
3850 BACKCHAR(eptr);
3851 GETCHARLEN(c, eptr, len);
3852 }
3853 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3854 if (prop_category != ucp_M) break;
3855 eptr--;
3856 }
3857 }
3858 }
3859
3860 else
3861 #endif /* SUPPORT_UCP */
3862
3863 #ifdef SUPPORT_UTF8
3864 /* UTF-8 mode */
3865
3866 if (utf8)
3867 {
3868 switch(ctype)
3869 {
3870 case OP_ANY:
3871 if (max < INT_MAX)
3872 {
3873 if ((ims & PCRE_DOTALL) == 0)
3874 {
3875 for (i = min; i < max; i++)
3876 {
3877 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3878 eptr++;
3879 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3880 }
3881 }
3882 else
3883 {
3884 for (i = min; i < max; i++)
3885 {
3886 if (eptr >= md->end_subject) break;
3887 eptr++;
3888 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3889 }
3890 }
3891 }
3892
3893 /* Handle unlimited UTF-8 repeat */
3894
3895 else
3896 {
3897 if ((ims & PCRE_DOTALL) == 0)
3898 {
3899 for (i = min; i < max; i++)
3900 {
3901 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3902 eptr++;
3903 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3904 }
3905 }
3906 else
3907 {
3908 eptr = md->end_subject;
3909 }
3910 }
3911 break;
3912
3913 case OP_ALLANY:
3914 if (max < INT_MAX)
3915 {
3916 for (i = min; i < max; i++)
3917 {
3918 if (eptr >= md->end_subject) break;
3919 eptr++;
3920 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3921 }
3922 }
3923 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3924 break;
3925
3926 /* The byte case is the same as non-UTF8 */
3927
3928 case OP_ANYBYTE:
3929 c = max - min;
3930 if (c > (unsigned int)(md->end_subject - eptr))
3931 c = md->end_subject - eptr;
3932 eptr += c;
3933 break;
3934
3935 case OP_ANYNL:
3936 for (i = min; i < max; i++)
3937 {
3938 int len = 1;
3939 if (eptr >= md->end_subject) break;
3940 GETCHARLEN(c, eptr, len);
3941 if (c == 0x000d)
3942 {
3943 if (++eptr >= md->end_subject) break;
3944 if (*eptr == 0x000a) eptr++;
3945 }
3946 else
3947 {
3948 if (c != 0x000a &&
3949 (md->bsr_anycrlf ||
3950 (c != 0x000b && c != 0x000c &&
3951 c != 0x0085 && c != 0x2028 && c != 0x2029)))
3952 break;
3953 eptr += len;
3954 }
3955 }
3956 break;
3957
3958 case OP_NOT_HSPACE:
3959 case OP_HSPACE:
3960 for (i = min; i < max; i++)
3961 {
3962 BOOL gotspace;
3963 int len = 1;
3964 if (eptr >= md->end_subject) break;
3965 GETCHARLEN(c, eptr, len);
3966 switch(c)
3967 {
3968 default: gotspace = FALSE; break;
3969 case 0x09: /* HT */
3970 case 0x20: /* SPACE */
3971 case 0xa0: /* NBSP */
3972 case 0x1680: /* OGHAM SPACE MARK */
3973 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3974 case 0x2000: /* EN QUAD */
3975 case 0x2001: /* EM QUAD */
3976 case 0x2002: /* EN SPACE */
3977 case 0x2003: /* EM SPACE */
3978 case 0x2004: /* THREE-PER-EM SPACE */
3979 case 0x2005: /* FOUR-PER-EM SPACE */
3980 case 0x2006: /* SIX-PER-EM SPACE */
3981 case 0x2007: /* FIGURE SPACE */
3982 case 0x2008: /* PUNCTUATION SPACE */
3983 case 0x2009: /* THIN SPACE */
3984 case 0x200A: /* HAIR SPACE */
3985 case 0x202f: /* NARROW NO-BREAK SPACE */
3986 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3987 case 0x3000: /* IDEOGRAPHIC SPACE */
3988 gotspace = TRUE;
3989 break;
3990 }
3991 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3992 eptr += len;
3993 }
3994 break;
3995
3996 case OP_NOT_VSPACE:
3997 case OP_VSPACE:
3998 for (i = min; i < max; i++)
3999 {
4000 BOOL gotspace;
4001 int len = 1;
4002 if (eptr >= md->end_subject) break;
4003 GETCHARLEN(c, eptr, len);
4004 switch(c)
4005 {
4006 default: gotspace = FALSE; break;
4007 case 0x0a: /* LF */
4008 case 0x0b: /* VT */
4009 case 0x0c: /* FF */
4010 case 0x0d: /* CR */
4011 case 0x85: /* NEL */
4012 case 0x2028: /* LINE SEPARATOR */
4013 case 0x2029: /* PARAGRAPH SEPARATOR */
4014 gotspace = TRUE;
4015 break;
4016 }
4017 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4018 eptr += len;
4019 }
4020 break;
4021
4022 case OP_NOT_DIGIT:
4023 for (i = min; i < max; i++)
4024 {
4025 int len = 1;
4026 if (eptr >= md->end_subject) break;
4027 GETCHARLEN(c, eptr, len);
4028 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4029 eptr+= len;
4030 }
4031 break;
4032
4033 case OP_DIGIT:
4034 for (i = min; i < max; i++)
4035 {
4036 int len = 1;
4037 if (eptr >= md->end_subject) break;
4038 GETCHARLEN(c, eptr, len);
4039 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4040 eptr+= len;
4041 }
4042 break;
4043
4044 case OP_NOT_WHITESPACE:
4045 for (i = min; i < max; i++)
4046 {
4047 int len = 1;
4048 if (eptr >= md->end_subject) break;
4049 GETCHARLEN(c, eptr, len);
4050 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4051 eptr+= len;
4052 }
4053 break;
4054
4055 case OP_WHITESPACE:
4056 for (i = min; i < max; i++)
4057 {
4058 int len = 1;
4059 if (eptr >= md->end_subject) break;
4060 GETCHARLEN(c, eptr, len);
4061 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4062 eptr+= len;
4063 }
4064 break;
4065
4066 case OP_NOT_WORDCHAR:
4067 for (i = min; i < max; i++)
4068 {
4069 int len = 1;
4070 if (eptr >= md->end_subject) break;
4071 GETCHARLEN(c, eptr, len);
4072 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4073 eptr+= len;
4074 }
4075 break;
4076
4077 case OP_WORDCHAR:
4078 for (i = min; i < max; i++)
4079 {
4080 int len = 1;
4081 if (eptr >= md->end_subject) break;
4082 GETCHARLEN(c, eptr, len);
4083 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4084 eptr+= len;
4085 }
4086 break;
4087
4088 default:
4089 RRETURN(PCRE_ERROR_INTERNAL);
4090 }
4091
4092 /* eptr is now past the end of the maximum run */
4093
4094 if (possessive) continue;
4095 for(;;)
4096 {
4097 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4098 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4099 if (eptr-- == pp) break; /* Stop if tried at original pos */
4100 BACKCHAR(eptr);
4101 }
4102 }
4103 else
4104 #endif /* SUPPORT_UTF8 */
4105
4106 /* Not UTF-8 mode */
4107 {
4108 switch(ctype)
4109 {
4110 case OP_ANY:
4111 if ((ims & PCRE_DOTALL) == 0)
4112 {
4113 for (i = min; i < max; i++)
4114 {
4115 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4116 eptr++;
4117 }
4118 break;
4119 }
4120 /* For DOTALL case, fall through */
4121
4122 case OP_ALLANY:
4123 case OP_ANYBYTE:
4124 c = max - min;
4125 if (c > (unsigned int)(md->end_subject - eptr))
4126 c = md->end_subject - eptr;
4127 eptr += c;
4128 break;
4129
4130 case OP_ANYNL:
4131 for (i = min; i < max; i++)
4132 {
4133 if (eptr >= md->end_subject) break;
4134 c = *eptr;
4135 if (c == 0x000d)
4136 {
4137 if (++eptr >= md->end_subject) break;
4138 if (*eptr == 0x000a) eptr++;
4139 }
4140 else
4141 {
4142 if (c != 0x000a &&
4143 (md->bsr_anycrlf ||
4144 (c != 0x000b && c != 0x000c && c != 0x0085)))
4145 break;
4146 eptr++;
4147 }
4148 }
4149 break;
4150
4151 case OP_NOT_HSPACE:
4152 for (i = min; i < max; i++)
4153 {
4154 if (eptr >= md->end_subject) break;
4155 c = *eptr;
4156 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4157 eptr++;
4158 }
4159 break;
4160
4161 case OP_HSPACE:
4162 for (i = min; i < max; i++)
4163 {
4164 if (eptr >= md->end_subject) break;
4165 c = *eptr;
4166 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4167 eptr++;
4168 }
4169 break;
4170
4171 case OP_NOT_VSPACE:
4172 for (i = min; i < max; i++)
4173 {
4174 if (eptr >= md->end_subject) break;
4175 c = *eptr;
4176 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4177 break;
4178 eptr++;
4179 }
4180 break;
4181
4182 case OP_VSPACE:
4183 for (i = min; i < max; i++)
4184 {
4185 if (eptr >= md->end_subject) break;
4186 c = *eptr;
4187 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4188 break;
4189 eptr++;
4190 }
4191 break;
4192
4193 case OP_NOT_DIGIT:
4194 for (i = min; i < max; i++)
4195 {
4196 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4197 break;
4198 eptr++;
4199 }
4200 break;
4201
4202 case OP_DIGIT:
4203 for (i = min; i < max; i++)
4204 {
4205 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4206 break;
4207 eptr++;
4208 }
4209 break;
4210
4211 case OP_NOT_WHITESPACE:
4212 for (i = min; i < max; i++)
4213 {
4214 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4215 break;
4216 eptr++;
4217 }
4218 break;
4219
4220 case OP_WHITESPACE:
4221 for (i = min; i < max; i++)
4222 {
4223 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4224 break;
4225 eptr++;
4226 }
4227 break;
4228
4229 case OP_NOT_WORDCHAR:
4230 for (i = min; i < max; i++)
4231 {
4232 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4233 break;
4234 eptr++;
4235 }
4236 break;
4237
4238 case OP_WORDCHAR:
4239 for (i = min; i < max; i++)
4240 {
4241 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4242 break;
4243 eptr++;
4244 }
4245 break;
4246
4247 default:
4248 RRETURN(PCRE_ERROR_INTERNAL);
4249 }
4250
4251 /* eptr is now past the end of the maximum run */
4252
4253 if (possessive) continue;
4254 while (eptr >= pp)
4255 {
4256 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4257 eptr--;
4258 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4259 }
4260 }
4261
4262 /* Get here if we can't make it match with any permitted repetitions */
4263
4264 RRETURN(MATCH_NOMATCH);
4265 }
4266 /* Control never gets here */
4267
4268 /* There's been some horrible disaster. Arrival here can only mean there is
4269 something seriously wrong in the code above or the OP_xxx definitions. */
4270
4271 default:
4272 DPRINTF(("Unknown opcode %d\n", *ecode));
4273 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4274 }
4275
4276 /* Do not stick any code in here without much thought; it is assumed
4277 that "continue" in the code above comes out to here to repeat the main
4278 loop. */
4279
4280 } /* End of main loop */
4281 /* Control never reaches here */
4282
4283
4284 /* When compiling to use the heap rather than the stack for recursive calls to
4285 match(), the RRETURN() macro jumps here. The number that is saved in
4286 frame->Xwhere indicates which label we actually want to return to. */
4287
4288 #ifdef NO_RECURSE
4289 #define LBL(val) case val: goto L_RM##val;
4290 HEAP_RETURN:
4291 switch (frame->Xwhere)
4292 {
4293 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4294 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4295 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4296 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4297 LBL(53) LBL(54)
4298 #ifdef SUPPORT_UTF8
4299 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4300 LBL(32) LBL(34) LBL(42) LBL(46)
4301 #ifdef SUPPORT_UCP
4302 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4303 #endif /* SUPPORT_UCP */
4304 #endif /* SUPPORT_UTF8 */
4305 default:
4306 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4307 return PCRE_ERROR_INTERNAL;
4308 }
4309 #undef LBL
4310 #endif /* NO_RECURSE */
4311 }
4312
4313
4314 /***************************************************************************
4315 ****************************************************************************
4316 RECURSION IN THE match() FUNCTION
4317
4318 Undefine all the macros that were defined above to handle this. */
4319
4320 #ifdef NO_RECURSE /* the names below are macros only in the NO_RECURSE build */
4321 #undef eptr
4322 #undef ecode
4323 #undef mstart
4324 #undef offset_top
4325 #undef ims
4326 #undef eptrb
4327 #undef flags
4328
4329 #undef callpat
4330 #undef charptr
4331 #undef data
4332 #undef next
4333 #undef pp
4334 #undef prev
4335 #undef saved_eptr
4336
4337 #undef new_recursive
4338
4339 #undef cur_is_word
4340 #undef condition
4341 #undef prev_is_word
4342
4343 #undef original_ims
4344
4345 #undef ctype
4346 #undef length
4347 #undef max
4348 #undef min
4349 #undef number
4350 #undef offset
4351 #undef op
4352 #undef save_capture_last
4353 #undef save_offset1
4354 #undef save_offset2
4355 #undef save_offset3
4356 #undef stacksave
4357
4358 #undef newptrb
4359
4360 #endif /* NO_RECURSE */
4361
4362 /* fc and fi are defined as macros whether or not NO_RECURSE is set, so they are undefined unconditionally */
4363
4364 #undef fc
4365 #undef fi
4366
4367 /***************************************************************************
4368 ***************************************************************************/
4369
4370
4371
4372 /*************************************************
4373 * Execute a Regular Expression *
4374 *************************************************/
4375
4376 /* This function applies a compiled re to a subject string and picks out
4377 portions of the string if it matches. Two elements in the vector are set for
4378 each substring: the offsets to the start and end of the substring.
4379
4380 Arguments:
4381 argument_re points to the compiled expression
4382 extra_data points to extra data or is NULL
4383 subject points to the subject string
4384 length length of subject string (may contain binary zeros)
4385 start_offset where to start in the subject string
4386 options option bits
4387 offsets points to a vector of ints to be filled in with offsets
4388 offsetcount the number of elements in the vector
4389
4390 Returns: > 0 => success; value is one more than the highest numbered pair of offsets set
4391 = 0 => success, but offsets is not big enough
4392 -1 => failed to match
4393 < -1 => some kind of unexpected problem
4394 */
4395
4396 PCRE_EXP_DEFN int
4397 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4398 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4399 int offsetcount)
4400 {
4401 int rc, resetcount, ocount;
4402 int first_byte = -1;
4403 int req_byte = -1;
4404 int req_byte2 = -1;
4405 int newline;
4406 unsigned long int ims;
4407 BOOL using_temporary_offsets = FALSE;
4408 BOOL anchored;
4409 BOOL startline;
4410 BOOL firstline;
4411 BOOL first_byte_caseless = FALSE;
4412 BOOL req_byte_caseless = FALSE;
4413 BOOL utf8;
4414 match_data match_block;
4415 match_data *md = &match_block;
4416 const uschar *tables;
4417 const uschar *start_bits = NULL;
4418 USPTR start_match = (USPTR)subject + start_offset;
4419 USPTR end_subject;
4420 USPTR req_byte_ptr = start_match - 1;
4421
4422 pcre_study_data internal_study;
4423 const pcre_study_data *study;
4424
4425 real_pcre internal_re;
4426 const real_pcre *external_re = (const real_pcre *)argument_re;
4427 const real_pcre *re = external_re;
4428
4429 /* Plausibility checks */
4430
4431 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4432 if (re == NULL || subject == NULL ||
4433 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4434 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4435
4436 /* Fish out the optional data from the extra_data structure, first setting
4437 the default values. */
4438
4439 study = NULL;
4440 md->match_limit = MATCH_LIMIT;
4441 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4442 md->callout_data = NULL;
4443
4444 /* The table pointer is always in native byte order. */
4445
4446 tables = external_re->tables;
4447
4448 if (extra_data != NULL)
4449 {
4450 register unsigned int flags = extra_data->flags;
4451 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4452 study = (const pcre_study_data *)extra_data->study_data;
4453 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4454 md->match_limit = extra_data->match_limit;
4455 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4456 md->match_limit_recursion = extra_data->match_limit_recursion;
4457 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4458 md->callout_data = extra_data->callout_data;
4459 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4460 }
4461
4462 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4463 is a feature that makes it possible to save compiled regex and re-use them
4464 in other programs later. */
4465
4466 if (tables == NULL) tables = _pcre_default_tables;
4467
4468 /* Check that the first field in the block is the magic number. If it is not,
4469 test for a regex that was compiled on a host of opposite endianness. If this is
4470 the case, flipped values are put in internal_re and internal_study if there was
4471 study data too. */
4472
4473 if (re->magic_number != MAGIC_NUMBER)
4474 {
4475 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4476 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4477 if (study != NULL) study = &internal_study;
4478 }
4479
4480 /* Set up other data */
4481
4482 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4483 startline = (re->flags & PCRE_STARTLINE) != 0;
4484 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4485
4486 /* The code starts after the real_pcre block and the capture name table. */
4487
4488 md->start_code = (const uschar *)external_re + re->name_table_offset +
4489 re->name_count * re->name_entry_size;
4490
4491 md->start_subject = (USPTR)subject;
4492 md->start_offset = start_offset;
4493 md->end_subject = md->start_subject + length;
4494 end_subject = md->end_subject;
4495
4496 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4497 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4498 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4499
4500 md->notbol = (options & PCRE_NOTBOL) != 0;
4501 md->noteol = (options & PCRE_NOTEOL) != 0;
4502 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4503 md->partial = (options & PCRE_PARTIAL) != 0;
4504 md->hitend = FALSE;
4505
4506 md->recursive = NULL; /* No recursion at top level */
4507
4508 md->lcc = tables + lcc_offset;
4509 md->ctypes = tables + ctypes_offset;
4510
4511 /* Handle different \R options. */
4512
4513 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4514 {
4515 case 0:
4516 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4517 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4518 else
4519 #ifdef BSR_ANYCRLF
4520 md->bsr_anycrlf = TRUE;
4521 #else
4522 md->bsr_anycrlf = FALSE;
4523 #endif
4524 break;
4525
4526 case PCRE_BSR_ANYCRLF:
4527 md->bsr_anycrlf = TRUE;
4528 break;
4529
4530 case PCRE_BSR_UNICODE:
4531 md->bsr_anycrlf = FALSE;
4532 break;
4533
4534 default: return PCRE_ERROR_BADNEWLINE;
4535 }
4536
4537 /* Handle different types of newline. The three bits give eight cases. If
4538 nothing is set at run time, whatever was used at compile time applies. */
4539
4540 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4541 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4542 {
4543 case 0: newline = NEWLINE; break; /* Compile-time default */
4544 case PCRE_NEWLINE_CR: newline = '\r'; break;
4545 case PCRE_NEWLINE_LF: newline = '\n'; break;
4546 case PCRE_NEWLINE_CR+
4547 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4548 case PCRE_NEWLINE_ANY: newline = -1; break;
4549 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4550 default: return PCRE_ERROR_BADNEWLINE;
4551 }
4552
4553 if (newline == -2)
4554 {
4555 md->nltype = NLTYPE_ANYCRLF;
4556 }
4557 else if (newline < 0)
4558 {
4559 md->nltype = NLTYPE_ANY;
4560 }
4561 else
4562 {
4563 md->nltype = NLTYPE_FIXED;
4564 if (newline > 255)
4565 {
4566 md->nllen = 2;
4567 md->nl[0] = (newline >> 8) & 255;
4568 md->nl[1] = newline & 255;
4569 }
4570 else
4571 {
4572 md->nllen = 1;
4573 md->nl[0] = newline;
4574 }
4575 }
4576
4577 /* Partial matching is supported only for a restricted set of regexes at the
4578 moment. */
4579
4580 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4581 return PCRE_ERROR_BADPARTIAL;
4582
4583 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4584 back the character offset. */
4585
4586 #ifdef SUPPORT_UTF8
4587 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4588 {
4589 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4590 return PCRE_ERROR_BADUTF8;
4591 if (start_offset > 0 && start_offset < length)
4592 {
4593 int tb = ((uschar *)subject)[start_offset];
4594 if (tb > 127)
4595 {
4596 tb &= 0xc0;
4597 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4598 }
4599 }
4600 }
4601 #endif
4602
4603 /* The ims options can vary during the matching as a result of the presence
4604 of (?ims) items in the pattern. They are kept in a local variable so that
4605 restoring at the exit of a group is easy. */
4606
4607 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4608
4609 /* If the expression has got more back references than the offsets supplied can
4610 hold, we get a temporary chunk of working store to use during the matching.
4611 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4612 of 3. */
4613
4614 ocount = offsetcount - (offsetcount % 3);
4615
4616 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4617 {
4618 ocount = re->top_backref * 3 + 3;
4619 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4620 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4621 using_temporary_offsets = TRUE;
4622 DPRINTF(("Got memory to hold back references\n"));
4623 }
4624 else md->offset_vector = offsets;
4625
4626 md->offset_end = ocount;
4627 md->offset_max = (2*ocount)/3;
4628 md->offset_overflow = FALSE;
4629 md->capture_last = -1;
4630
4631 /* Compute the minimum number of offsets that we need to reset each time. Doing
4632 this makes a huge difference to execution time when there aren't many brackets
4633 in the pattern. */
4634
4635 resetcount = 2 + re->top_bracket * 2;
4636 if (resetcount > offsetcount) resetcount = ocount;
4637
4638 /* Reset the working variable associated with each extraction. These should
4639 never be used unless previously set, but they get saved and restored, and so we
4640 initialize them to avoid reading uninitialized locations. */
4641
4642 if (md->offset_vector != NULL)
4643 {
4644 register int *iptr = md->offset_vector + ocount;
4645 register int *iend = iptr - resetcount/2 + 1;
4646 while (--iptr >= iend) *iptr = -1;
4647 }
4648
4649 /* Set up the first character to match, if available. The first_byte value is
4650 never set for an anchored regular expression, but the anchoring may be forced
4651 at run time, so we have to test for anchoring. The first char may be unset for
4652 an unanchored pattern, of course. If there's no first char and the pattern was
4653 studied, there may be a bitmap of possible first characters. */
4654
4655 if (!anchored)
4656 {
4657 if ((re->flags & PCRE_FIRSTSET) != 0)
4658 {
4659 first_byte = re->first_byte & 255;
4660 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4661 first_byte = md->lcc[first_byte];
4662 }
4663 else
4664 if (!startline && study != NULL &&
4665 (study->options & PCRE_STUDY_MAPPED) != 0)
4666 start_bits = study->start_bits;
4667 }
4668
4669 /* For anchored or unanchored matches, there may be a "last known required
4670 character" set. */
4671
4672 if ((re->flags & PCRE_REQCHSET) != 0)
4673 {
4674 req_byte = re->req_byte & 255;
4675 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4676 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4677 }
4678
4679
4680 /* ==========================================================================*/
4681
4682 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4683 the loop runs just once. */
4684
4685 for(;;)
4686 {
4687 USPTR save_end_subject = end_subject;
4688 USPTR new_start_match;
4689
4690 /* Reset the maximum number of extractions we might see. */
4691
4692 if (md->offset_vector != NULL)
4693 {
4694 register int *iptr = md->offset_vector;
4695 register int *iend = iptr + resetcount;
4696 while (iptr < iend) *iptr++ = -1;
4697 }
4698
4699 /* Advance to a unique first char if possible. If firstline is TRUE, the
4700 start of the match is constrained to the first line of a multiline string.
4701 That is, the match must be before or at the first newline. Implement this by
4702 temporarily adjusting end_subject so that we stop scanning at a newline. If
4703 the match fails at the newline, later code breaks this loop. */
4704
4705 if (firstline)
4706 {
4707 USPTR t = start_match;
4708 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4709 end_subject = t;
4710 }
4711
4712 /* Now test for a unique first byte */
4713
4714 if (first_byte >= 0)
4715 {
4716 if (first_byte_caseless)
4717 while (start_match < end_subject &&
4718 md->lcc[*start_match] != first_byte)
4719 { NEXTCHAR(start_match); }
4720 else
4721 while (start_match < end_subject && *start_match != first_byte)
4722 { NEXTCHAR(start_match); }
4723 }
4724
4725 /* Or to just after a linebreak for a multiline match if possible */
4726
4727 else if (startline)
4728 {
4729 if (start_match > md->start_subject + start_offset)
4730 {
4731 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4732 { NEXTCHAR(start_match); }
4733
4734 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4735 and we are now at a LF, advance the match position by one more character.
4736 */
4737
4738 if (start_match[-1] == '\r' &&
4739 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4740 start_match < end_subject &&
4741 *start_match == '\n')
4742 start_match++;
4743 }
4744 }
4745
4746 /* Or to a non-unique first char after study */
4747
4748 else if (start_bits != NULL)
4749 {
4750 while (start_match < end_subject)
4751 {
4752 register unsigned int c = *start_match;
4753 if ((start_bits[c/8] & (1 << (c&7))) == 0)
4754 { NEXTCHAR(start_match); }
4755 else break;
4756 }
4757 }
4758
4759 /* Restore fudged end_subject */
4760
4761 end_subject = save_end_subject;
4762
4763 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4764 printf(">>>> Match against: ");
4765 pchars(start_match, end_subject - start_match, TRUE, md);
4766 printf("\n");
4767 #endif
4768
4769 /* If req_byte is set, we know that that character must appear in the subject
4770 for the match to succeed. If the first character is set, req_byte must be
4771 later in the subject; otherwise the test starts at the match point. This
4772 optimization can save a huge amount of backtracking in patterns with nested
4773 unlimited repeats that aren't going to match. Writing separate code for
4774 cased/caseless versions makes it go faster, as does using an autoincrement
4775 and backing off on a match.
4776
4777 HOWEVER: when the subject string is very, very long, searching to its end can
4778 take a long time, and give bad performance on quite ordinary patterns. This
4779 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4780 string... so we don't do this when the string is sufficiently long.
4781
4782 ALSO: this processing is disabled when partial matching is requested.
4783 */
4784
4785 if (req_byte >= 0 &&
4786 end_subject - start_match < REQ_BYTE_MAX &&
4787 !md->partial)
4788 {
4789 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4790
4791 /* We don't need to repeat the search if we haven't yet reached the
4792 place we found it at last time. */
4793
4794 if (p > req_byte_ptr)
4795 {
4796 if (req_byte_caseless)
4797 {
4798 while (p < end_subject)
4799 {
4800 register int pp = *p++;
4801 if (pp == req_byte || pp == req_byte2) { p--; break; }
4802 }
4803 }
4804 else
4805 {
4806 while (p < end_subject)
4807 {
4808 if (*p++ == req_byte) { p--; break; }
4809 }
4810 }
4811
4812 /* If we can't find the required character, break the matching loop,
4813 forcing a match failure. */
4814
4815 if (p >= end_subject)
4816 {
4817 rc = MATCH_NOMATCH;
4818 break;
4819 }
4820
4821 /* If we have found the required character, save the point where we
4822 found it, so that we don't search again next time round the loop if
4823 the start hasn't passed this character yet. */
4824
4825 req_byte_ptr = p;
4826 }
4827 }
4828
4829 /* OK, we can now run the match. */
4830
4831 md->start_match_ptr = start_match;
4832 md->match_call_count = 0;
4833 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4834
4835 switch(rc)
4836 {
4837 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4838 exactly like PRUNE. */
4839
4840 case MATCH_NOMATCH:
4841 case MATCH_PRUNE:
4842 case MATCH_THEN:
4843 new_start_match = start_match + 1;
4844 #ifdef SUPPORT_UTF8
4845 if (utf8)
4846 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4847 new_start_match++;
4848 #endif
4849 break;
4850
4851 /* SKIP passes back the next starting point explicitly. */
4852
4853 case MATCH_SKIP:
4854 new_start_match = md->start_match_ptr;
4855 break;
4856
4857 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4858
4859 case MATCH_COMMIT:
4860 rc = MATCH_NOMATCH;
4861 goto ENDLOOP;
4862
4863 /* Any other return is some kind of error. */
4864
4865 default:
4866 goto ENDLOOP;
4867 }
4868
4869 /* Control reaches here for the various types of "no match at this point"
4870 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4871
4872 rc = MATCH_NOMATCH;
4873
4874 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4875 newline in the subject (though it may continue over the newline). Therefore,
4876 if we have just failed to match, starting at a newline, do not continue. */
4877
4878 if (firstline && IS_NEWLINE(start_match)) break;
4879
4880 /* Advance to new matching position */
4881
4882 start_match = new_start_match;
4883
4884 /* Break the loop if the pattern is anchored or if we have passed the end of
4885 the subject. */
4886
4887 if (anchored || start_match > end_subject) break;
4888
4889 /* If we have just passed a CR and we are now at a LF, and the pattern does
4890 not contain any explicit matches for \r or \n, and the newline option is CRLF
4891 or ANY or ANYCRLF, advance the match position by one more character. */
4892
4893 if (start_match[-1] == '\r' &&
4894 start_match < end_subject &&
4895 *start_match == '\n' &&
4896 (re->flags & PCRE_HASCRORLF) == 0 &&
4897 (md->nltype == NLTYPE_ANY ||
4898 md->nltype == NLTYPE_ANYCRLF ||
4899 md->nllen == 2))
4900 start_match++;
4901
4902 } /* End of for(;;) "bumpalong" loop */
4903
4904 /* ==========================================================================*/
4905
4906 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4907 conditions is true:
4908
4909 (1) The pattern is anchored or the match was failed by (*COMMIT);
4910
4911 (2) We are past the end of the subject;
4912
4913 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4914 this option requests that a match occur at or before the first newline in
4915 the subject.
4916
4917 When we have a match and the offset vector is big enough to deal with any
4918 backreferences, captured substring offsets will already be set up. In the case
4919 where we had to get some local store to hold offsets for backreference
4920 processing, copy those that we can. In this case there need not be overflow if
4921 certain parts of the pattern were not used, even though there are more
4922 capturing parentheses than vector slots. */
4923
4924 ENDLOOP:
4925
4926 if (rc == MATCH_MATCH)
4927 {
4928 if (using_temporary_offsets)
4929 {
4930 if (offsetcount >= 4)
4931 {
4932 memcpy(offsets + 2, md->offset_vector + 2,
4933 (offsetcount - 2) * sizeof(int));
4934 DPRINTF(("Copied offsets from temporary memory\n"));
4935 }
4936 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4937 DPRINTF(("Freeing temporary memory\n"));
4938 (pcre_free)(md->offset_vector);
4939 }
4940
4941 /* Set the return code to the number of captured strings, or 0 if there are
4942 too many to fit into the vector. */
4943
4944 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4945
4946 /* If there is space, set up the whole thing as substring 0. The value of
4947 md->start_match_ptr might be modified if \K was encountered on the successful
4948 matching path. */
4949
4950 if (offsetcount < 2) rc = 0; else
4951 {
4952 offsets[0] = md->start_match_ptr - md->start_subject;
4953 offsets[1] = md->end_match_ptr - md->start_subject;
4954 }
4955
4956 DPRINTF((">>>> returning %d\n", rc));
4957 return rc;
4958 }
4959
4960 /* Control gets here if there has been an error, or if the overall match
4961 attempt has failed at all permitted starting positions. */
4962
4963 if (using_temporary_offsets)
4964 {
4965 DPRINTF(("Freeing temporary memory\n"));
4966 (pcre_free)(md->offset_vector);
4967 }
4968
4969 if (rc != MATCH_NOMATCH)
4970 {
4971 DPRINTF((">>>> error: returning %d\n", rc));
4972 return rc;
4973 }
4974 else if (md->partial && md->hitend)
4975 {
4976 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4977 return PCRE_ERROR_PARTIAL;
4978 }
4979 else
4980 {
4981 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4982 return PCRE_ERROR_NOMATCH;
4983 }
4984 }
4985
4986 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12