/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 335 - (show annotations) (download)
Sat Apr 12 14:36:14 2008 UTC (6 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 150506 byte(s)
Do not discard subpatterns with {0} quantifiers, as they may be called as 
subroutines.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Bits that may be set in the "flags" argument of match(). */

#define match_condassert     0x01  /* Evaluating an assertion used as a condition */
#define match_cbegroup       0x02  /* Unlimited repeat group that could match "" */

/* Ordinary (non-error) results of match(). Errors are reported with the
externally defined PCRE_ERROR_xxx codes, every one of which is negative. */

#define MATCH_MATCH          1
#define MATCH_NOMATCH        0

/* Results that are only ever seen inside this module. They are chosen to be
far enough below zero that they cannot collide with the external error
codes. */

#define MATCH_COMMIT         (-999)
#define MATCH_PRUNE          (-998)
#define MATCH_SKIP           (-997)
#define MATCH_THEN           (-996)

/* Upper bound on the number of offset-vector ints that are saved on the stack
for a recursive call; a larger offset vector is saved in malloc'ed memory
instead. Keep this a multiple of 3, because the offset vector's length is
always a multiple of 3. */

#define REC_STACK_SAVE_MAX   30
84
/* Lower and upper repeat counts for the common repeat operators. The two
tables are indexed in parallel; in rep_max an entry of 0 stands for "no upper
limit". NOTE(review): the index is presumably the repeat opcode's offset from
the first repeat opcode (*, *?, +, +?, ?, ??) - confirm against the users of
these tables. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caselesss case for speed */
162
163 if ((ims & PCRE_CASELESS) != 0)
164 {
165 while (length-- > 0)
166 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
167 }
168 else
169 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
170
171 return TRUE;
172 }
173
174
175
176 /***************************************************************************
177 ****************************************************************************
178 RECURSION IN THE match() FUNCTION
179
180 The match() function is highly recursive, though not every recursive call
181 increases the recursive depth. Nevertheless, some regular expressions can cause
182 it to recurse to a great depth. I was writing for Unix, so I just let it call
183 itself recursively. This uses the stack for saving everything that has to be
184 saved for a recursive call. On Unix, the stack can be large, and this works
185 fine.
186
187 It turns out that on some non-Unix-like systems there are problems with
188 programs that use a lot of stack. (This despite the fact that every last chip
189 has oodles of memory these days, and techniques for extending the stack have
190 been known for decades.) So....
191
192 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
193 calls by keeping local variables that need to be preserved in blocks of memory
194 obtained from malloc() instead of on the stack. Macros are used to
195 achieve this so that the actual code doesn't look very different to what it
196 always used to.
197
198 The original heap-recursive code used longjmp(). However, it seems that this
199 can be very slow on some operating systems. Following a suggestion from Stan
200 Switzer, the use of longjmp() has been abolished, at the cost of having to
201 provide a unique number for each call to RMATCH. There is no way of generating
202 a sequence of numbers at compile time in C. I have given them names, to make
203 them stand out more clearly.
204
205 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
206 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
207 tests. Furthermore, not using longjmp() means that local dynamic variables
208 don't have indeterminate values; this has meant that the frame size can be
209 reduced because the result can be "passed back" by straight setting of the
210 variable instead of being passed in the frame.
211 ****************************************************************************
212 ***************************************************************************/
213
214 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
215 below must be updated in sync. */
216
enum {
  RM1 = 1,                           /* numbering starts at 1, not 0 */
         RM2,  RM3,  RM4,  RM5,
  RM6,   RM7,  RM8,  RM9,  RM10,
  RM11,  RM12, RM13, RM14, RM15,
  RM16,  RM17, RM18, RM19, RM20,
  RM21,  RM22, RM23, RM24, RM25,
  RM26,  RM27, RM28, RM29, RM30,
  RM31,  RM32, RM33, RM34, RM35,
  RM36,  RM37, RM38, RM39, RM40,
  RM41,  RM42, RM43, RM44, RM45,
  RM46,  RM47, RM48, RM49, RM50,
  RM51,  RM52, RM53, RM54
};
223
224 /* These versions of the macros use the stack, as normal. There are debugging
225 versions and production versions. Note that the "rw" argument of RMATCH isn't
226 actually used in this definition. */
227
228 #ifndef NO_RECURSE
#define REGISTER register

/* Production build: RMATCH is a plain recursive call to match(), one level
deeper, whose result is left in rrc, and RRETURN is a plain return. The rw
argument is accepted but not used. */

#ifndef DEBUG
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra

/* Debugging build: the same call and return, with a trace of the source line
written to stdout around each of them. */

#else
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#endif
248
249 #else
250
251
252 /* These versions of the macros manage a private stack on the heap. Note that
253 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
254 argument of match(), which never changes. */
255
#define REGISTER

/* Simulate a recursive call of match(): obtain a new frame, record in the
current frame which call site (rw) to resume at, copy the "arguments" into the
new frame, make it current, and jump to HEAP_RECURSE. Control comes back to
the L_<rw> label with the result in rrc.

If the frame cannot be obtained, the current invocation gives up with
PCRE_ERROR_NOMEMORY by way of RRETURN. Being negative, that value is passed
outwards by every enclosing invocation like any other error, instead of the
NULL pointer being dereferenced. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
277
/* Simulate a return from match(): release the current frame and step back to
the one that "called" it. If there is none, this was the outermost
invocation, so really return; otherwise hand the result over in rrc and jump
to HEAP_RETURN. Note that ra is evaluated only after the frame has been
released, so it must not refer to anything held in that frame. */

#define RRETURN(ra)\
  {\
  heapframe *doneframe = frame;\
  frame = doneframe->Xprevframe;\
  (pcre_stack_free)(doneframe);\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
290
291
292 /* Structure for remembering the local variables in a private frame */
293
294 typedef struct heapframe {
295 struct heapframe *Xprevframe;
296
297 /* Function arguments that may change */
298
299 const uschar *Xeptr;
300 const uschar *Xecode;
301 const uschar *Xmstart;
302 int Xoffset_top;
303 long int Xims;
304 eptrblock *Xeptrb;
305 int Xflags;
306 unsigned int Xrdepth;
307
308 /* Function local variables */
309
310 const uschar *Xcallpat;
311 const uschar *Xcharptr;
312 const uschar *Xdata;
313 const uschar *Xnext;
314 const uschar *Xpp;
315 const uschar *Xprev;
316 const uschar *Xsaved_eptr;
317
318 recursion_info Xnew_recursive;
319
320 BOOL Xcur_is_word;
321 BOOL Xcondition;
322 BOOL Xprev_is_word;
323
324 unsigned long int Xoriginal_ims;
325
326 #ifdef SUPPORT_UCP
327 int Xprop_type;
328 int Xprop_value;
329 int Xprop_fail_result;
330 int Xprop_category;
331 int Xprop_chartype;
332 int Xprop_script;
333 int Xoclength;
334 uschar Xocchars[8];
335 #endif
336
337 int Xctype;
338 unsigned int Xfc;
339 int Xfi;
340 int Xlength;
341 int Xmax;
342 int Xmin;
343 int Xnumber;
344 int Xoffset;
345 int Xop;
346 int Xsave_capture_last;
347 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
348 int Xstacksave[REC_STACK_SAVE_MAX];
349
350 eptrblock Xnewptrb;
351
352 /* Where to jump back to */
353
354 int Xwhere;
355
356 } heapframe;
357
358 #endif
359
360
361 /***************************************************************************
362 ***************************************************************************/
363
364
365
366 /*************************************************
367 * Match from current position *
368 *************************************************/
369
370 /* This function is called recursively in many circumstances. Whenever it
371 returns a negative (error) response, the outer incarnation must also return the
372 same response.
373
374 Performance note: It might be tempting to extract commonly used fields from the
375 md structure (e.g. utf8, end_subject) into individual variables to improve
376 performance. Tests using gcc on a SPARC disproved this; in the first case, it
377 made performance worse.
378
379 Arguments:
380 eptr pointer to current character in subject
381 ecode pointer to current position in compiled code
382 mstart pointer to the current match start position (can be modified
383 by encountering \K)
384 offset_top current top pointer
385 md pointer to "static" info for the match
386 ims current /i, /m, and /s options
387 eptrb pointer to chain of blocks containing eptr at start of
388 brackets - for testing for empty matches
389 flags can contain
390 match_condassert - this is an assertion condition
391 match_cbegroup - this is the start of an unlimited repeat
392 group that can match an empty string
393 rdepth the recursion depth
394
395 Returns: MATCH_MATCH if matched ) these values are >= 0
396 MATCH_NOMATCH if failed to match )
397 a negative PCRE_ERROR_xxx value if aborted by an error condition
398 (e.g. stopped by repeated call or recursion limit)
399 */
400
401 static int
402 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
403 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
404 int flags, unsigned int rdepth)
405 {
406 /* These variables do not need to be preserved over recursion in this function,
407 so they can be ordinary variables in all cases. Mark some of them with
408 "register" because they are used a lot in loops. */
409
410 register int rrc; /* Returns from recursive calls */
411 register int i; /* Used for loops not involving calls to RMATCH() */
412 register unsigned int c; /* Character values not kept over RMATCH() calls */
413 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
414
415 BOOL minimize, possessive; /* Quantifier options */
416
417 /* When recursion is not being used, all "local" variables that have to be
418 preserved over calls to RMATCH() are part of a "frame" which is obtained from
419 heap storage. Set up the top-level frame here; others are obtained from the
420 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
421
422 #ifdef NO_RECURSE
423 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
424 frame->Xprevframe = NULL; /* Marks the top level */
425
426 /* Copy in the original argument variables */
427
428 frame->Xeptr = eptr;
429 frame->Xecode = ecode;
430 frame->Xmstart = mstart;
431 frame->Xoffset_top = offset_top;
432 frame->Xims = ims;
433 frame->Xeptrb = eptrb;
434 frame->Xflags = flags;
435 frame->Xrdepth = rdepth;
436
437 /* This is where control jumps back to to effect "recursion" */
438
439 HEAP_RECURSE:
440
441 /* Macros make the argument variables come from the current frame */
442
443 #define eptr frame->Xeptr
444 #define ecode frame->Xecode
445 #define mstart frame->Xmstart
446 #define offset_top frame->Xoffset_top
447 #define ims frame->Xims
448 #define eptrb frame->Xeptrb
449 #define flags frame->Xflags
450 #define rdepth frame->Xrdepth
451
452 /* Ditto for the local variables */
453
454 #ifdef SUPPORT_UTF8
455 #define charptr frame->Xcharptr
456 #endif
457 #define callpat frame->Xcallpat
458 #define data frame->Xdata
459 #define next frame->Xnext
460 #define pp frame->Xpp
461 #define prev frame->Xprev
462 #define saved_eptr frame->Xsaved_eptr
463
464 #define new_recursive frame->Xnew_recursive
465
466 #define cur_is_word frame->Xcur_is_word
467 #define condition frame->Xcondition
468 #define prev_is_word frame->Xprev_is_word
469
470 #define original_ims frame->Xoriginal_ims
471
472 #ifdef SUPPORT_UCP
473 #define prop_type frame->Xprop_type
474 #define prop_value frame->Xprop_value
475 #define prop_fail_result frame->Xprop_fail_result
476 #define prop_category frame->Xprop_category
477 #define prop_chartype frame->Xprop_chartype
478 #define prop_script frame->Xprop_script
479 #define oclength frame->Xoclength
480 #define occhars frame->Xocchars
481 #endif
482
483 #define ctype frame->Xctype
484 #define fc frame->Xfc
485 #define fi frame->Xfi
486 #define length frame->Xlength
487 #define max frame->Xmax
488 #define min frame->Xmin
489 #define number frame->Xnumber
490 #define offset frame->Xoffset
491 #define op frame->Xop
492 #define save_capture_last frame->Xsave_capture_last
493 #define save_offset1 frame->Xsave_offset1
494 #define save_offset2 frame->Xsave_offset2
495 #define save_offset3 frame->Xsave_offset3
496 #define stacksave frame->Xstacksave
497
498 #define newptrb frame->Xnewptrb
499
500 /* When recursion is being used, local variables are allocated on the stack and
501 get preserved during recursion in the normal way. In this environment, fi and
502 i, and fc and c, can be the same variables. */
503
504 #else /* NO_RECURSE not defined */
505 #define fi i
506 #define fc c
507
508
509 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
510 const uschar *charptr; /* in small blocks of the code. My normal */
511 #endif /* style of coding would have declared */
512 const uschar *callpat; /* them within each of those blocks. */
513 const uschar *data; /* However, in order to accommodate the */
514 const uschar *next; /* version of this code that uses an */
515 USPTR pp; /* external "stack" implemented on the */
516 const uschar *prev; /* heap, it is easier to declare them all */
517 USPTR saved_eptr; /* here, so the declarations can be cut */
518 /* out in a block. The only declarations */
519 recursion_info new_recursive; /* within blocks below are for variables */
520 /* that do not have to be preserved over */
521 BOOL cur_is_word; /* a recursive call to RMATCH(). */
522 BOOL condition;
523 BOOL prev_is_word;
524
525 unsigned long int original_ims;
526
527 #ifdef SUPPORT_UCP
528 int prop_type;
529 int prop_value;
530 int prop_fail_result;
531 int prop_category;
532 int prop_chartype;
533 int prop_script;
534 int oclength;
535 uschar occhars[8];
536 #endif
537
538 int ctype;
539 int length;
540 int max;
541 int min;
542 int number;
543 int offset;
544 int op;
545 int save_capture_last;
546 int save_offset1, save_offset2, save_offset3;
547 int stacksave[REC_STACK_SAVE_MAX];
548
549 eptrblock newptrb;
550 #endif /* NO_RECURSE */
551
552 /* These statements are here to stop the compiler complaining about uninitialized
553 variables. */
554
555 #ifdef SUPPORT_UCP
556 prop_value = 0;
557 prop_fail_result = 0;
558 #endif
559
560
561 /* This label is used for tail recursion, which is used in a few cases even
562 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
563 used. Thanks to Ian Taylor for noticing this possibility and sending the
564 original patch. */
565
566 TAIL_RECURSE:
567
568 /* OK, now we can get on with the real code of the function. Recursive calls
569 are specified by the macro RMATCH and RRETURN is used to return. When
570 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
571 and a "return", respectively (possibly with some debugging if DEBUG is
572 defined). However, RMATCH isn't like a function call because it's quite a
573 complicated macro. It has to be used in one particular way. This shouldn't,
574 however, impact performance when true recursion is being used. */
575
576 #ifdef SUPPORT_UTF8
577 utf8 = md->utf8; /* Local copy of the flag */
578 #else
579 utf8 = FALSE;
580 #endif
581
582 /* First check that we haven't called match() too many times, or that we
583 haven't exceeded the recursive call limit. */
584
585 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
586 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
587
588 original_ims = ims; /* Save for resetting on ')' */
589
590 /* At the start of a group with an unlimited repeat that may match an empty
591 string, the match_cbegroup flag is set. When this is the case, add the current
592 subject pointer to the chain of such remembered pointers, to be checked when we
593 hit the closing ket, in order to break infinite loops that match no characters.
594 When match() is called in other circumstances, don't add to the chain. The
595 match_cbegroup flag must NOT be used with tail recursion, because the memory
596 block that is used is on the stack, so a new one may be required for each
597 match(). */
598
599 if ((flags & match_cbegroup) != 0)
600 {
601 newptrb.epb_saved_eptr = eptr;
602 newptrb.epb_prev = eptrb;
603 eptrb = &newptrb;
604 }
605
606 /* Now start processing the opcodes. */
607
608 for (;;)
609 {
610 minimize = possessive = FALSE;
611 op = *ecode;
612
613 /* For partial matching, remember if we ever hit the end of the subject after
614 matching at least one subject character. */
615
616 if (md->partial &&
617 eptr >= md->end_subject &&
618 eptr > mstart)
619 md->hitend = TRUE;
620
621 switch(op)
622 {
623 case OP_FAIL:
624 RRETURN(MATCH_NOMATCH);
625
626 case OP_PRUNE:
627 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
628 ims, eptrb, flags, RM51);
629 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
630 RRETURN(MATCH_PRUNE);
631
632 case OP_COMMIT:
633 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
634 ims, eptrb, flags, RM52);
635 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
636 RRETURN(MATCH_COMMIT);
637
638 case OP_SKIP:
639 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
640 ims, eptrb, flags, RM53);
641 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
642 md->start_match_ptr = eptr; /* Pass back current position */
643 RRETURN(MATCH_SKIP);
644
645 case OP_THEN:
646 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
647 ims, eptrb, flags, RM54);
648 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
649 RRETURN(MATCH_THEN);
650
651 /* Handle a capturing bracket. If there is space in the offset vector, save
652 the current subject position in the working slot at the top of the vector.
653 We mustn't change the current values of the data slot, because they may be
654 set from a previous iteration of this group, and be referred to by a
655 reference inside the group.
656
657 If the bracket fails to match, we need to restore this value and also the
658 values of the final offsets, in case they were set by a previous iteration
659 of the same bracket.
660
661 If there isn't enough space in the offset vector, treat this as if it were
662 a non-capturing bracket. Don't worry about setting the flag for the error
663 case here; that is handled in the code for KET. */
664
665 case OP_CBRA:
666 case OP_SCBRA:
667 number = GET2(ecode, 1+LINK_SIZE);
668 offset = number << 1;
669
670 #ifdef DEBUG
671 printf("start bracket %d\n", number);
672 printf("subject=");
673 pchars(eptr, 16, TRUE, md);
674 printf("\n");
675 #endif
676
677 if (offset < md->offset_max)
678 {
679 save_offset1 = md->offset_vector[offset];
680 save_offset2 = md->offset_vector[offset+1];
681 save_offset3 = md->offset_vector[md->offset_end - number];
682 save_capture_last = md->capture_last;
683
684 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
685 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
686
687 flags = (op == OP_SCBRA)? match_cbegroup : 0;
688 do
689 {
690 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
691 ims, eptrb, flags, RM1);
692 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
693 md->capture_last = save_capture_last;
694 ecode += GET(ecode, 1);
695 }
696 while (*ecode == OP_ALT);
697
698 DPRINTF(("bracket %d failed\n", number));
699
700 md->offset_vector[offset] = save_offset1;
701 md->offset_vector[offset+1] = save_offset2;
702 md->offset_vector[md->offset_end - number] = save_offset3;
703
704 RRETURN(MATCH_NOMATCH);
705 }
706
707 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
708 as a non-capturing bracket. */
709
710 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
711 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
712
713 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
714
715 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
716 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
717
718 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
719 final alternative within the brackets, we would return the result of a
720 recursive call to match() whatever happened. We can reduce stack usage by
721 turning this into a tail recursion, except in the case when match_cbegroup
722 is set.*/
723
724 case OP_BRA:
725 case OP_SBRA:
726 DPRINTF(("start non-capturing bracket\n"));
727 flags = (op >= OP_SBRA)? match_cbegroup : 0;
728 for (;;)
729 {
730 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
731 {
732 if (flags == 0) /* Not a possibly empty group */
733 {
734 ecode += _pcre_OP_lengths[*ecode];
735 DPRINTF(("bracket 0 tail recursion\n"));
736 goto TAIL_RECURSE;
737 }
738
739 /* Possibly empty group; can't use tail recursion. */
740
741 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
742 eptrb, flags, RM48);
743 RRETURN(rrc);
744 }
745
746 /* For non-final alternatives, continue the loop for a NOMATCH result;
747 otherwise return. */
748
749 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
750 eptrb, flags, RM2);
751 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
752 ecode += GET(ecode, 1);
753 }
754 /* Control never reaches here. */
755
756 /* Conditional group: compilation checked that there are no more than
757 two branches. If the condition is false, skipping the first branch takes us
758 past the end if there is only one branch, but that's OK because that is
759 exactly what going to the ket would do. As there is only one branch to be
760 obeyed, we can use tail recursion to avoid using another stack frame. */
761
762 case OP_COND:
763 case OP_SCOND:
764 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
765 {
766 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
767 condition = md->recursive != NULL &&
768 (offset == RREF_ANY || offset == md->recursive->group_num);
769 ecode += condition? 3 : GET(ecode, 1);
770 }
771
772 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
773 {
774 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
775 condition = offset < offset_top && md->offset_vector[offset] >= 0;
776 ecode += condition? 3 : GET(ecode, 1);
777 }
778
779 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
780 {
781 condition = FALSE;
782 ecode += GET(ecode, 1);
783 }
784
785 /* The condition is an assertion. Call match() to evaluate it - setting
786 the final argument match_condassert causes it to stop at the end of an
787 assertion. */
788
789 else
790 {
791 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
792 match_condassert, RM3);
793 if (rrc == MATCH_MATCH)
794 {
795 condition = TRUE;
796 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
797 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
798 }
799 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
800 {
801 RRETURN(rrc); /* Need braces because of following else */
802 }
803 else
804 {
805 condition = FALSE;
806 ecode += GET(ecode, 1);
807 }
808 }
809
810 /* We are now at the branch that is to be obeyed. As there is only one,
811 we can use tail recursion to avoid using another stack frame, except when
812 match_cbegroup is required for an unlimited repeat of a possibly empty
813 group. If the second alternative doesn't exist, we can just plough on. */
814
815 if (condition || *ecode == OP_ALT)
816 {
817 ecode += 1 + LINK_SIZE;
818 if (op == OP_SCOND) /* Possibly empty group */
819 {
820 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
821 RRETURN(rrc);
822 }
823 else /* Group must match something */
824 {
825 flags = 0;
826 goto TAIL_RECURSE;
827 }
828 }
829 else /* Condition false & no 2nd alternative */
830 {
831 ecode += 1 + LINK_SIZE;
832 }
833 break;
834
835
836 /* End of the pattern, either real or forced. If we are in a top-level
837 recursion, we should restore the offsets appropriately and continue from
838 after the call. */
839
840 case OP_ACCEPT:
841 case OP_END:
842 if (md->recursive != NULL && md->recursive->group_num == 0)
843 {
844 recursion_info *rec = md->recursive;
845 DPRINTF(("End of pattern in a (?0) recursion\n"));
846 md->recursive = rec->prevrec;
847 memmove(md->offset_vector, rec->offset_save,
848 rec->saved_max * sizeof(int));
849 mstart = rec->save_start;
850 ims = original_ims;
851 ecode = rec->after_call;
852 break;
853 }
854
855 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
856 string - backtracking will then try other alternatives, if any. */
857
858 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
859 md->end_match_ptr = eptr; /* Record where we ended */
860 md->end_offset_top = offset_top; /* and how many extracts were taken */
861 md->start_match_ptr = mstart; /* and the start (\K can modify) */
862 RRETURN(MATCH_MATCH);
863
864 /* Change option settings */
865
866 case OP_OPT:
867 ims = ecode[1];
868 ecode += 2;
869 DPRINTF(("ims set to %02lx\n", ims));
870 break;
871
872 /* Assertion brackets. Check the alternative branches in turn - the
873 matching won't pass the KET for an assertion. If any one branch matches,
874 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
875 start of each branch to move the current point backwards, so the code at
876 this level is identical to the lookahead case. */
877
878 case OP_ASSERT:
879 case OP_ASSERTBACK:
880 do
881 {
882 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
883 RM4);
884 if (rrc == MATCH_MATCH) break;
885 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
886 ecode += GET(ecode, 1);
887 }
888 while (*ecode == OP_ALT);
889 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
890
891 /* If checking an assertion for a condition, return MATCH_MATCH. */
892
893 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
894
895 /* Continue from after the assertion, updating the offsets high water
896 mark, since extracts may have been taken during the assertion. */
897
898 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
899 ecode += 1 + LINK_SIZE;
900 offset_top = md->end_offset_top;
901 continue;
902
903 /* Negative assertion: all branches must fail to match */
904
905 case OP_ASSERT_NOT:
906 case OP_ASSERTBACK_NOT:
907 do
908 {
909 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
910 RM5);
911 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
912 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
913 ecode += GET(ecode,1);
914 }
915 while (*ecode == OP_ALT);
916
917 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
918
919 ecode += 1 + LINK_SIZE;
920 continue;
921
922 /* Move the subject pointer back. This occurs only at the start of
923 each branch of a lookbehind assertion. If we are too close to the start to
924 move back, this match function fails. When working with UTF-8 we move
925 back a number of characters, not bytes. */
926
927 case OP_REVERSE:
928 #ifdef SUPPORT_UTF8
929 if (utf8)
930 {
931 i = GET(ecode, 1);
932 while (i-- > 0)
933 {
934 eptr--;
935 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
936 BACKCHAR(eptr);
937 }
938 }
939 else
940 #endif
941
942 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
943
944 {
945 eptr -= GET(ecode, 1);
946 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
947 }
948
949 /* Skip to next op code */
950
951 ecode += 1 + LINK_SIZE;
952 break;
953
954 /* The callout item calls an external function, if one is provided, passing
955 details of the match so far. This is mainly for debugging, though the
956 function is able to force a failure. */
957
958 case OP_CALLOUT:
959 if (pcre_callout != NULL)
960 {
961 pcre_callout_block cb;
962 cb.version = 1; /* Version 1 of the callout block */
963 cb.callout_number = ecode[1];
964 cb.offset_vector = md->offset_vector;
965 cb.subject = (PCRE_SPTR)md->start_subject;
966 cb.subject_length = md->end_subject - md->start_subject;
967 cb.start_match = mstart - md->start_subject;
968 cb.current_position = eptr - md->start_subject;
969 cb.pattern_position = GET(ecode, 2);
970 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
971 cb.capture_top = offset_top/2;
972 cb.capture_last = md->capture_last;
973 cb.callout_data = md->callout_data;
974 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
975 if (rrc < 0) RRETURN(rrc);
976 }
977 ecode += 2 + 2*LINK_SIZE;
978 break;
979
980 /* Recursion either matches the current regex, or some subexpression. The
981 offset data is the offset to the starting bracket from the start of the
982 whole pattern. (This is so that it works from duplicated subpatterns.)
983
984 If there are any capturing brackets started but not finished, we have to
985 save their starting points and reinstate them after the recursion. However,
986 we don't know how many such there are (offset_top records the completed
987 total) so we just have to save all the potential data. There may be up to
988 65535 such values, which is too large to put on the stack, but using malloc
989 for small numbers seems expensive. As a compromise, the stack is used when
990 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
991 is used. If the malloc fails, the match is abandoned: the error
992 PCRE_ERROR_NOMEMORY is returned from this level and passed back up
993 to the caller.
994
995 There are also other values that have to be saved. We use a chained
996 sequence of blocks that actually live on the stack. Thanks to Robin Houston
997 for the original version of this logic. */
998
999 case OP_RECURSE:
1000 {
1001 callpat = md->start_code + GET(ecode, 1);
1002 new_recursive.group_num = (callpat == md->start_code)? 0 :
1003 GET2(callpat, 1 + LINK_SIZE);
1004
1005 /* Add to "recursing stack" */
1006
1007 new_recursive.prevrec = md->recursive;
1008 md->recursive = &new_recursive;
1009
1010 /* Find where to continue from afterwards */
1011
1012 ecode += 1 + LINK_SIZE;
1013 new_recursive.after_call = ecode;
1014
1015 /* Now save the offset data. */
1016
1017 new_recursive.saved_max = md->offset_end;
1018 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1019 new_recursive.offset_save = stacksave;
1020 else
1021 {
1022 new_recursive.offset_save =
1023 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1024 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1025 }
1026
1027 memcpy(new_recursive.offset_save, md->offset_vector,
1028 new_recursive.saved_max * sizeof(int));
1029 new_recursive.save_start = mstart;
1030 mstart = eptr;
1031
1032 /* OK, now we can do the recursion. For each top-level alternative we
1033 restore the offset and recursion data. */
1034
1035 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1036 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1037 do
1038 {
1039 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1040 md, ims, eptrb, flags, RM6);
1041 if (rrc == MATCH_MATCH)
1042 {
1043 DPRINTF(("Recursion matched\n"));
1044 md->recursive = new_recursive.prevrec;
1045 if (new_recursive.offset_save != stacksave)
1046 (pcre_free)(new_recursive.offset_save);
1047 RRETURN(MATCH_MATCH);
1048 }
1049 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1050 {
1051 DPRINTF(("Recursion gave error %d\n", rrc));
1052 RRETURN(rrc);
1053 }
1054
1055 md->recursive = &new_recursive;
1056 memcpy(md->offset_vector, new_recursive.offset_save,
1057 new_recursive.saved_max * sizeof(int));
1058 callpat += GET(callpat, 1);
1059 }
1060 while (*callpat == OP_ALT);
1061
1062 DPRINTF(("Recursion didn't match\n"));
1063 md->recursive = new_recursive.prevrec;
1064 if (new_recursive.offset_save != stacksave)
1065 (pcre_free)(new_recursive.offset_save);
1066 RRETURN(MATCH_NOMATCH);
1067 }
1068 /* Control never reaches here */
1069
1070 /* "Once" brackets are like assertion brackets except that after a match,
1071 the point in the subject string is not moved back. Thus there can never be
1072 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1073 Check the alternative branches in turn - the matching won't pass the KET
1074 for this kind of subpattern. If any one branch matches, we carry on as at
1075 the end of a normal bracket, leaving the subject pointer. */
1076
1077 case OP_ONCE:
1078 prev = ecode;
1079 saved_eptr = eptr;
1080
1081 do
1082 {
1083 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1084 if (rrc == MATCH_MATCH) break;
1085 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1086 ecode += GET(ecode,1);
1087 }
1088 while (*ecode == OP_ALT);
1089
1090 /* If we hit the end of the group (whose ket could be a repeating one), fail */
1091
1092 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1093
1094 /* Skip to the ket that ends the atomic group, updating the offsets high water
1095 mark, since extracts may have been taken within the group. */
1096
1097 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1098
1099 offset_top = md->end_offset_top;
1100 eptr = md->end_match_ptr;
1101
1102 /* For a non-repeating ket, just continue at this level. This also
1103 happens for a repeating ket if no characters were matched in the group.
1104 This is the forcible breaking of infinite loops as implemented in Perl
1105 5.005. If there is an options reset, it will get obeyed in the normal
1106 course of events. */
1107
1108 if (*ecode == OP_KET || eptr == saved_eptr)
1109 {
1110 ecode += 1+LINK_SIZE;
1111 break;
1112 }
1113
1114 /* The repeating kets try the rest of the pattern or restart from the
1115 preceding bracket, in the appropriate order. The second "call" of match()
1116 uses tail recursion, to avoid using another stack frame. We need to reset
1117 any options that changed within the bracket before re-running it, so
1118 check the next opcode. */
1119
1120 if (ecode[1+LINK_SIZE] == OP_OPT)
1121 {
1122 ims = (ims & ~PCRE_IMS) | ecode[4];
1123 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1124 }
1125
1126 if (*ecode == OP_KETRMIN)
1127 {
1128 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1129 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1130 ecode = prev;
1131 flags = 0;
1132 goto TAIL_RECURSE;
1133 }
1134 else /* OP_KETRMAX */
1135 {
1136 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1137 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1138 ecode += 1 + LINK_SIZE;
1139 flags = 0;
1140 goto TAIL_RECURSE;
1141 }
1142 /* Control never gets here */
1143
1144 /* An alternation is the end of a branch; scan along to find the end of the
1145 bracketed group and go to there. */
1146
1147 case OP_ALT:
1148 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1149 break;
1150
1151 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1152 indicating that it may occur zero times. It may repeat infinitely, or not
1153 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1154 with fixed upper repeat limits are compiled as a number of copies, with the
1155 optional ones preceded by BRAZERO or BRAMINZERO. */
1156
1157 case OP_BRAZERO:
1158 {
1159 next = ecode+1;
1160 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1161 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1162 do next += GET(next,1); while (*next == OP_ALT);
1163 ecode = next + 1 + LINK_SIZE;
1164 }
1165 break;
1166
1167 case OP_BRAMINZERO:
1168 {
1169 next = ecode+1;
1170 do next += GET(next, 1); while (*next == OP_ALT);
1171 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1172 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1173 ecode++;
1174 }
1175 break;
1176
1177 case OP_SKIPZERO:
1178 {
1179 next = ecode+1;
1180 do next += GET(next,1); while (*next == OP_ALT);
1181 ecode = next + 1 + LINK_SIZE;
1182 }
1183 break;
1184
1185 /* End of a group, repeated or non-repeating. */
1186
1187 case OP_KET:
1188 case OP_KETRMIN:
1189 case OP_KETRMAX:
1190 prev = ecode - GET(ecode, 1);
1191
1192 /* If this was a group that remembered the subject start, in order to break
1193 infinite repeats of empty string matches, retrieve the subject start from
1194 the chain. Otherwise, set it NULL. */
1195
1196 if (*prev >= OP_SBRA)
1197 {
1198 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1199 eptrb = eptrb->epb_prev; /* Backup to previous group */
1200 }
1201 else saved_eptr = NULL;
1202
1203 /* If we are at the end of an assertion group, stop matching and return
1204 MATCH_MATCH, but record the current high water mark for use by positive
1205 assertions. Do this also for the "once" (atomic) groups. */
1206
1207 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1208 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1209 *prev == OP_ONCE)
1210 {
1211 md->end_match_ptr = eptr; /* For ONCE */
1212 md->end_offset_top = offset_top;
1213 RRETURN(MATCH_MATCH);
1214 }
1215
1216 /* For capturing groups we have to check the group number back at the start
1217 and if necessary complete handling an extraction by setting the offsets and
1218 bumping the high water mark. Note that whole-pattern recursion is coded as
1219 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1220 when the OP_END is reached. Other recursion is handled here. */
1221
1222 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1223 {
1224 number = GET2(prev, 1+LINK_SIZE);
1225 offset = number << 1;
1226
1227 #ifdef DEBUG
1228 printf("end bracket %d", number);
1229 printf("\n");
1230 #endif
1231
1232 md->capture_last = number;
1233 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1234 {
1235 md->offset_vector[offset] =
1236 md->offset_vector[md->offset_end - number];
1237 md->offset_vector[offset+1] = eptr - md->start_subject;
1238 if (offset_top <= offset) offset_top = offset + 2;
1239 }
1240
1241 /* Handle a recursively called group. Restore the offsets
1242 appropriately and continue from after the call. */
1243
1244 if (md->recursive != NULL && md->recursive->group_num == number)
1245 {
1246 recursion_info *rec = md->recursive;
1247 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1248 md->recursive = rec->prevrec;
1249 mstart = rec->save_start;
1250 memcpy(md->offset_vector, rec->offset_save,
1251 rec->saved_max * sizeof(int));
1252 ecode = rec->after_call;
1253 ims = original_ims;
1254 break;
1255 }
1256 }
1257
1258 /* For both capturing and non-capturing groups, reset the value of the ims
1259 flags, in case they got changed during the group. */
1260
1261 ims = original_ims;
1262 DPRINTF(("ims reset to %02lx\n", ims));
1263
1264 /* For a non-repeating ket, just continue at this level. This also
1265 happens for a repeating ket if no characters were matched in the group.
1266 This is the forcible breaking of infinite loops as implemented in Perl
1267 5.005. If there is an options reset, it will get obeyed in the normal
1268 course of events. */
1269
1270 if (*ecode == OP_KET || eptr == saved_eptr)
1271 {
1272 ecode += 1 + LINK_SIZE;
1273 break;
1274 }
1275
1276 /* The repeating kets try the rest of the pattern or restart from the
1277 preceding bracket, in the appropriate order. In the second case, we can use
1278 tail recursion to avoid using another stack frame, unless we have an
1279 unlimited repeat of a group that can match an empty string. */
1280
1281 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1282
1283 if (*ecode == OP_KETRMIN)
1284 {
1285 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1286 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1287 if (flags != 0) /* Could match an empty string */
1288 {
1289 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1290 RRETURN(rrc);
1291 }
1292 ecode = prev;
1293 goto TAIL_RECURSE;
1294 }
1295 else /* OP_KETRMAX */
1296 {
1297 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1298 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1299 ecode += 1 + LINK_SIZE;
1300 flags = 0;
1301 goto TAIL_RECURSE;
1302 }
1303 /* Control never gets here */
1304
1305 /* Start of subject unless notbol, or after internal newline if multiline */
1306
1307 case OP_CIRC:
1308 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1309 if ((ims & PCRE_MULTILINE) != 0)
1310 {
1311 if (eptr != md->start_subject &&
1312 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1313 RRETURN(MATCH_NOMATCH);
1314 ecode++;
1315 break;
1316 }
1317 /* ... else fall through */
1318
1319 /* Start of subject assertion */
1320
1321 case OP_SOD:
1322 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1323 ecode++;
1324 break;
1325
1326 /* Start of match assertion */
1327
1328 case OP_SOM:
1329 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1330 ecode++;
1331 break;
1332
1333 /* Reset the start of match point */
1334
1335 case OP_SET_SOM:
1336 mstart = eptr;
1337 ecode++;
1338 break;
1339
1340 /* Assert before internal newline if multiline, or before a terminating
1341 newline unless endonly is set, else end of subject unless noteol is set. */
1342
1343 case OP_DOLL:
1344 if ((ims & PCRE_MULTILINE) != 0)
1345 {
1346 if (eptr < md->end_subject)
1347 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1348 else
1349 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1350 ecode++;
1351 break;
1352 }
1353 else
1354 {
1355 if (md->noteol) RRETURN(MATCH_NOMATCH);
1356 if (!md->endonly)
1357 {
1358 if (eptr != md->end_subject &&
1359 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1360 RRETURN(MATCH_NOMATCH);
1361 ecode++;
1362 break;
1363 }
1364 }
1365 /* ... else fall through for endonly */
1366
1367 /* End of subject assertion (\z) */
1368
1369 case OP_EOD:
1370 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1371 ecode++;
1372 break;
1373
1374 /* End of subject or ending \n assertion (\Z) */
1375
1376 case OP_EODN:
1377 if (eptr != md->end_subject &&
1378 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1379 RRETURN(MATCH_NOMATCH);
1380 ecode++;
1381 break;
1382
1383 /* Word boundary assertions */
1384
1385 case OP_NOT_WORD_BOUNDARY:
1386 case OP_WORD_BOUNDARY:
1387 {
1388
1389 /* Find out if the previous and current characters are "word" characters.
1390 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1391 be "non-word" characters. */
1392
1393 #ifdef SUPPORT_UTF8
1394 if (utf8)
1395 {
1396 if (eptr == md->start_subject) prev_is_word = FALSE; else
1397 {
1398 const uschar *lastptr = eptr - 1;
1399 while((*lastptr & 0xc0) == 0x80) lastptr--;
1400 GETCHAR(c, lastptr);
1401 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1402 }
1403 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1404 {
1405 GETCHAR(c, eptr);
1406 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1407 }
1408 }
1409 else
1410 #endif
1411
1412 /* More streamlined when not in UTF-8 mode */
1413
1414 {
1415 prev_is_word = (eptr != md->start_subject) &&
1416 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1417 cur_is_word = (eptr < md->end_subject) &&
1418 ((md->ctypes[*eptr] & ctype_word) != 0);
1419 }
1420
1421 /* Now see if the situation is what we want */
1422
1423 if ((*ecode++ == OP_WORD_BOUNDARY)?
1424 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1425 RRETURN(MATCH_NOMATCH);
1426 }
1427 break;
1428
1429 /* Match a single character type; inline for speed */
1430
1431 case OP_ANY:
1432 if ((ims & PCRE_DOTALL) == 0)
1433 {
1434 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1435 }
1436 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1437 if (utf8)
1438 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1439 ecode++;
1440 break;
1441
1442 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1443 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1444
1445 case OP_ANYBYTE:
1446 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1447 ecode++;
1448 break;
1449
1450 case OP_NOT_DIGIT:
1451 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1452 GETCHARINCTEST(c, eptr);
1453 if (
1454 #ifdef SUPPORT_UTF8
1455 c < 256 &&
1456 #endif
1457 (md->ctypes[c] & ctype_digit) != 0
1458 )
1459 RRETURN(MATCH_NOMATCH);
1460 ecode++;
1461 break;
1462
1463 case OP_DIGIT:
1464 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1465 GETCHARINCTEST(c, eptr);
1466 if (
1467 #ifdef SUPPORT_UTF8
1468 c >= 256 ||
1469 #endif
1470 (md->ctypes[c] & ctype_digit) == 0
1471 )
1472 RRETURN(MATCH_NOMATCH);
1473 ecode++;
1474 break;
1475
1476 case OP_NOT_WHITESPACE:
1477 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1478 GETCHARINCTEST(c, eptr);
1479 if (
1480 #ifdef SUPPORT_UTF8
1481 c < 256 &&
1482 #endif
1483 (md->ctypes[c] & ctype_space) != 0
1484 )
1485 RRETURN(MATCH_NOMATCH);
1486 ecode++;
1487 break;
1488
1489 case OP_WHITESPACE:
1490 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1491 GETCHARINCTEST(c, eptr);
1492 if (
1493 #ifdef SUPPORT_UTF8
1494 c >= 256 ||
1495 #endif
1496 (md->ctypes[c] & ctype_space) == 0
1497 )
1498 RRETURN(MATCH_NOMATCH);
1499 ecode++;
1500 break;
1501
1502 case OP_NOT_WORDCHAR:
1503 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1504 GETCHARINCTEST(c, eptr);
1505 if (
1506 #ifdef SUPPORT_UTF8
1507 c < 256 &&
1508 #endif
1509 (md->ctypes[c] & ctype_word) != 0
1510 )
1511 RRETURN(MATCH_NOMATCH);
1512 ecode++;
1513 break;
1514
1515 case OP_WORDCHAR:
1516 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1517 GETCHARINCTEST(c, eptr);
1518 if (
1519 #ifdef SUPPORT_UTF8
1520 c >= 256 ||
1521 #endif
1522 (md->ctypes[c] & ctype_word) == 0
1523 )
1524 RRETURN(MATCH_NOMATCH);
1525 ecode++;
1526 break;
1527
1528 case OP_ANYNL:
1529 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1530 GETCHARINCTEST(c, eptr);
1531 switch(c)
1532 {
1533 default: RRETURN(MATCH_NOMATCH);
1534 case 0x000d:
1535 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1536 break;
1537
1538 case 0x000a:
1539 break;
1540
1541 case 0x000b:
1542 case 0x000c:
1543 case 0x0085:
1544 case 0x2028:
1545 case 0x2029:
1546 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1547 break;
1548 }
1549 ecode++;
1550 break;
1551
1552 case OP_NOT_HSPACE:
1553 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1554 GETCHARINCTEST(c, eptr);
1555 switch(c)
1556 {
1557 default: break;
1558 case 0x09: /* HT */
1559 case 0x20: /* SPACE */
1560 case 0xa0: /* NBSP */
1561 case 0x1680: /* OGHAM SPACE MARK */
1562 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1563 case 0x2000: /* EN QUAD */
1564 case 0x2001: /* EM QUAD */
1565 case 0x2002: /* EN SPACE */
1566 case 0x2003: /* EM SPACE */
1567 case 0x2004: /* THREE-PER-EM SPACE */
1568 case 0x2005: /* FOUR-PER-EM SPACE */
1569 case 0x2006: /* SIX-PER-EM SPACE */
1570 case 0x2007: /* FIGURE SPACE */
1571 case 0x2008: /* PUNCTUATION SPACE */
1572 case 0x2009: /* THIN SPACE */
1573 case 0x200A: /* HAIR SPACE */
1574 case 0x202f: /* NARROW NO-BREAK SPACE */
1575 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1576 case 0x3000: /* IDEOGRAPHIC SPACE */
1577 RRETURN(MATCH_NOMATCH);
1578 }
1579 ecode++;
1580 break;
1581
1582 case OP_HSPACE:
1583 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1584 GETCHARINCTEST(c, eptr);
1585 switch(c)
1586 {
1587 default: RRETURN(MATCH_NOMATCH);
1588 case 0x09: /* HT */
1589 case 0x20: /* SPACE */
1590 case 0xa0: /* NBSP */
1591 case 0x1680: /* OGHAM SPACE MARK */
1592 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1593 case 0x2000: /* EN QUAD */
1594 case 0x2001: /* EM QUAD */
1595 case 0x2002: /* EN SPACE */
1596 case 0x2003: /* EM SPACE */
1597 case 0x2004: /* THREE-PER-EM SPACE */
1598 case 0x2005: /* FOUR-PER-EM SPACE */
1599 case 0x2006: /* SIX-PER-EM SPACE */
1600 case 0x2007: /* FIGURE SPACE */
1601 case 0x2008: /* PUNCTUATION SPACE */
1602 case 0x2009: /* THIN SPACE */
1603 case 0x200A: /* HAIR SPACE */
1604 case 0x202f: /* NARROW NO-BREAK SPACE */
1605 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1606 case 0x3000: /* IDEOGRAPHIC SPACE */
1607 break;
1608 }
1609 ecode++;
1610 break;
1611
1612 case OP_NOT_VSPACE:
1613 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1614 GETCHARINCTEST(c, eptr);
1615 switch(c)
1616 {
1617 default: break;
1618 case 0x0a: /* LF */
1619 case 0x0b: /* VT */
1620 case 0x0c: /* FF */
1621 case 0x0d: /* CR */
1622 case 0x85: /* NEL */
1623 case 0x2028: /* LINE SEPARATOR */
1624 case 0x2029: /* PARAGRAPH SEPARATOR */
1625 RRETURN(MATCH_NOMATCH);
1626 }
1627 ecode++;
1628 break;
1629
1630 case OP_VSPACE:
1631 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1632 GETCHARINCTEST(c, eptr);
1633 switch(c)
1634 {
1635 default: RRETURN(MATCH_NOMATCH);
1636 case 0x0a: /* LF */
1637 case 0x0b: /* VT */
1638 case 0x0c: /* FF */
1639 case 0x0d: /* CR */
1640 case 0x85: /* NEL */
1641 case 0x2028: /* LINE SEPARATOR */
1642 case 0x2029: /* PARAGRAPH SEPARATOR */
1643 break;
1644 }
1645 ecode++;
1646 break;
1647
1648 #ifdef SUPPORT_UCP
1649 /* Check the next character by Unicode property. We will get here only
1650 if the support is in the binary; otherwise a compile-time error occurs. */
1651
1652 case OP_PROP:
1653 case OP_NOTPROP:
1654 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1655 GETCHARINCTEST(c, eptr);
1656 {
1657 int chartype, script;
1658 int category = _pcre_ucp_findprop(c, &chartype, &script);
1659
1660 switch(ecode[1])
1661 {
1662 case PT_ANY:
1663 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1664 break;
1665
1666 case PT_LAMP:
1667 if ((chartype == ucp_Lu ||
1668 chartype == ucp_Ll ||
1669 chartype == ucp_Lt) == (op == OP_NOTPROP))
1670 RRETURN(MATCH_NOMATCH);
1671 break;
1672
1673 case PT_GC:
1674 if ((ecode[2] != category) == (op == OP_PROP))
1675 RRETURN(MATCH_NOMATCH);
1676 break;
1677
1678 case PT_PC:
1679 if ((ecode[2] != chartype) == (op == OP_PROP))
1680 RRETURN(MATCH_NOMATCH);
1681 break;
1682
1683 case PT_SC:
1684 if ((ecode[2] != script) == (op == OP_PROP))
1685 RRETURN(MATCH_NOMATCH);
1686 break;
1687
1688 default:
1689 RRETURN(PCRE_ERROR_INTERNAL);
1690 }
1691
1692 ecode += 3;
1693 }
1694 break;
1695
1696 /* Match an extended Unicode sequence. We will get here only if the support
1697 is in the binary; otherwise a compile-time error occurs. */
1698
1699 case OP_EXTUNI:
1700 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1701 GETCHARINCTEST(c, eptr);
1702 {
1703 int chartype, script;
1704 int category = _pcre_ucp_findprop(c, &chartype, &script);
1705 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1706 while (eptr < md->end_subject)
1707 {
1708 int len = 1;
1709 if (!utf8) c = *eptr; else
1710 {
1711 GETCHARLEN(c, eptr, len);
1712 }
1713 category = _pcre_ucp_findprop(c, &chartype, &script);
1714 if (category != ucp_M) break;
1715 eptr += len;
1716 }
1717 }
1718 ecode++;
1719 break;
1720 #endif
1721
1722
1723 /* Match a back reference, possibly repeatedly. Look past the end of the
1724 item to see if there is repeat information following. The code is similar
1725 to that for character classes, but repeated for efficiency. Then obey
1726 similar code to character type repeats - written out again for speed.
1727 However, if the referenced string is the empty string, always treat
1728 it as matched, any number of times (otherwise there could be infinite
1729 loops). */
1730
1731 case OP_REF:
1732 {
1733 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1734 ecode += 3; /* Advance past item */
1735
1736 /* If the reference is unset, set the length to be longer than the amount
1737 of subject left; this ensures that every attempt at a match fails. We
1738 can't just fail here, because of the possibility of quantifiers with zero
1739 minima. */
1740
1741 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1742 md->end_subject - eptr + 1 :
1743 md->offset_vector[offset+1] - md->offset_vector[offset];
1744
1745 /* Set up for repetition, or handle the non-repeated case */
1746
1747 switch (*ecode)
1748 {
1749 case OP_CRSTAR:
1750 case OP_CRMINSTAR:
1751 case OP_CRPLUS:
1752 case OP_CRMINPLUS:
1753 case OP_CRQUERY:
1754 case OP_CRMINQUERY:
1755 c = *ecode++ - OP_CRSTAR;
1756 minimize = (c & 1) != 0;
1757 min = rep_min[c]; /* Pick up values from tables; */
1758 max = rep_max[c]; /* zero for max => infinity */
1759 if (max == 0) max = INT_MAX;
1760 break;
1761
1762 case OP_CRRANGE:
1763 case OP_CRMINRANGE:
1764 minimize = (*ecode == OP_CRMINRANGE);
1765 min = GET2(ecode, 1);
1766 max = GET2(ecode, 3);
1767 if (max == 0) max = INT_MAX;
1768 ecode += 5;
1769 break;
1770
1771 default: /* No repeat follows */
1772 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1773 eptr += length;
1774 continue; /* With the main loop */
1775 }
1776
1777 /* If the length of the reference is zero, just continue with the
1778 main loop. */
1779
1780 if (length == 0) continue;
1781
1782 /* First, ensure the minimum number of matches are present. We get back
1783 the length of the reference string explicitly rather than passing the
1784 address of eptr, so that eptr can be a register variable. */
1785
1786 for (i = 1; i <= min; i++)
1787 {
1788 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1789 eptr += length;
1790 }
1791
1792 /* If min = max, continue at the same level without recursion.
1793 They are not both allowed to be zero. */
1794
1795 if (min == max) continue;
1796
1797 /* If minimizing, keep trying and advancing the pointer */
1798
1799 if (minimize)
1800 {
1801 for (fi = min;; fi++)
1802 {
1803 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1804 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1805 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1806 RRETURN(MATCH_NOMATCH);
1807 eptr += length;
1808 }
1809 /* Control never gets here */
1810 }
1811
1812 /* If maximizing, find the longest string and work backwards */
1813
1814 else
1815 {
1816 pp = eptr;
1817 for (i = min; i < max; i++)
1818 {
1819 if (!match_ref(offset, eptr, length, md, ims)) break;
1820 eptr += length;
1821 }
1822 while (eptr >= pp)
1823 {
1824 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1825 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1826 eptr -= length;
1827 }
1828 RRETURN(MATCH_NOMATCH);
1829 }
1830 }
1831 /* Control never gets here */
1832
1833
1834
1835 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1836 used when all the characters in the class have values in the range 0-255,
1837 and either the matching is caseful, or the characters are in the range
1838 0-127 when UTF-8 processing is enabled. The only difference between
1839 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1840 encountered.
1841
1842 First, look past the end of the item to see if there is repeat information
1843 following. Then obey similar code to character type repeats - written out
1844 again for speed. */
1845
1846 case OP_NCLASS:
1847 case OP_CLASS:
1848 {
1849 data = ecode + 1; /* Save for matching */
1850 ecode += 33; /* Advance past the item */
1851
1852 switch (*ecode)
1853 {
1854 case OP_CRSTAR:
1855 case OP_CRMINSTAR:
1856 case OP_CRPLUS:
1857 case OP_CRMINPLUS:
1858 case OP_CRQUERY:
1859 case OP_CRMINQUERY:
1860 c = *ecode++ - OP_CRSTAR;
1861 minimize = (c & 1) != 0;
1862 min = rep_min[c]; /* Pick up values from tables; */
1863 max = rep_max[c]; /* zero for max => infinity */
1864 if (max == 0) max = INT_MAX;
1865 break;
1866
1867 case OP_CRRANGE:
1868 case OP_CRMINRANGE:
1869 minimize = (*ecode == OP_CRMINRANGE);
1870 min = GET2(ecode, 1);
1871 max = GET2(ecode, 3);
1872 if (max == 0) max = INT_MAX;
1873 ecode += 5;
1874 break;
1875
1876 default: /* No repeat follows */
1877 min = max = 1;
1878 break;
1879 }
1880
1881 /* First, ensure the minimum number of matches are present. */
1882
1883 #ifdef SUPPORT_UTF8
1884 /* UTF-8 mode */
1885 if (utf8)
1886 {
1887 for (i = 1; i <= min; i++)
1888 {
1889 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1890 GETCHARINC(c, eptr);
1891 if (c > 255)
1892 {
1893 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1894 }
1895 else
1896 {
1897 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1898 }
1899 }
1900 }
1901 else
1902 #endif
1903 /* Not UTF-8 mode */
1904 {
1905 for (i = 1; i <= min; i++)
1906 {
1907 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1908 c = *eptr++;
1909 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1910 }
1911 }
1912
1913 /* If max == min we can continue with the main loop without the
1914 need to recurse. */
1915
1916 if (min == max) continue;
1917
1918 /* If minimizing, keep testing the rest of the expression and advancing
1919 the pointer while it matches the class. */
1920
1921 if (minimize)
1922 {
1923 #ifdef SUPPORT_UTF8
1924 /* UTF-8 mode */
1925 if (utf8)
1926 {
1927 for (fi = min;; fi++)
1928 {
1929 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1930 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1931 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1932 GETCHARINC(c, eptr);
1933 if (c > 255)
1934 {
1935 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1936 }
1937 else
1938 {
1939 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1940 }
1941 }
1942 }
1943 else
1944 #endif
1945 /* Not UTF-8 mode */
1946 {
1947 for (fi = min;; fi++)
1948 {
1949 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1950 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1951 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1952 c = *eptr++;
1953 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1954 }
1955 }
1956 /* Control never gets here */
1957 }
1958
1959 /* If maximizing, find the longest possible run, then work backwards. */
1960
1961 else
1962 {
1963 pp = eptr;
1964
1965 #ifdef SUPPORT_UTF8
1966 /* UTF-8 mode */
1967 if (utf8)
1968 {
1969 for (i = min; i < max; i++)
1970 {
1971 int len = 1;
1972 if (eptr >= md->end_subject) break;
1973 GETCHARLEN(c, eptr, len);
1974 if (c > 255)
1975 {
1976 if (op == OP_CLASS) break;
1977 }
1978 else
1979 {
1980 if ((data[c/8] & (1 << (c&7))) == 0) break;
1981 }
1982 eptr += len;
1983 }
1984 for (;;)
1985 {
1986 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1987 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1988 if (eptr-- == pp) break; /* Stop if tried at original pos */
1989 BACKCHAR(eptr);
1990 }
1991 }
1992 else
1993 #endif
1994 /* Not UTF-8 mode */
1995 {
1996 for (i = min; i < max; i++)
1997 {
1998 if (eptr >= md->end_subject) break;
1999 c = *eptr;
2000 if ((data[c/8] & (1 << (c&7))) == 0) break;
2001 eptr++;
2002 }
2003 while (eptr >= pp)
2004 {
2005 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2006 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2007 eptr--;
2008 }
2009 }
2010
2011 RRETURN(MATCH_NOMATCH);
2012 }
2013 }
2014 /* Control never gets here */
2015
2016
/* OP_XCLASS: match an extended character class, optionally followed by a
repeat item. This opcode is encountered only in UTF-8 mode, because that's the
only time it is compiled. */

#ifdef SUPPORT_UTF8
case OP_XCLASS:
  {
  data = ecode + 1 + LINK_SIZE;          /* Class data, kept for matching */
  ecode += GET(ecode, 1);                /* Step over the class item */

  /* Work out the repeat limits from whatever follows the class. */

  switch (*ecode)
    {
    case OP_CRSTAR:
    case OP_CRMINSTAR:
    case OP_CRPLUS:
    case OP_CRMINPLUS:
    case OP_CRQUERY:
    case OP_CRMINQUERY:
    c = *ecode++ - OP_CRSTAR;
    min = rep_min[c];                    /* Limits come from the tables */
    max = rep_max[c];
    minimize = (c & 1) != 0;             /* Odd offsets are the "MIN" opcodes */
    break;

    case OP_CRRANGE:
    case OP_CRMINRANGE:
    min = GET2(ecode, 1);
    max = GET2(ecode, 3);
    minimize = (*ecode == OP_CRMINRANGE);
    ecode += 5;
    break;

    default:                             /* No repeat follows */
    min = max = 1;
    break;
    }

  /* A maximum of zero stands for "no upper limit". The default case sets max
  to 1, so this test can only fire for the explicit repeat items. */

  if (max == 0) max = INT_MAX;

  /* The first min characters are compulsory. */

  for (i = 1; i <= min; i++)
    {
    if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
    GETCHARINC(c, eptr);
    if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
    }

  /* With a fixed count there is nothing to backtrack over, so carry on in the
  main loop without recursing. */

  if (min == max) continue;

  /* Minimizing: try the rest of the pattern first, and accept one more class
  character each time that fails. The loop is left only through RRETURN. */

  if (minimize)
    {
    for (fi = min;; fi++)
      {
      RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
      if (fi >= max) RRETURN(MATCH_NOMATCH);
      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
      GETCHARINC(c, eptr);
      if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
      }
    /* Control never gets here */
    }

  /* Maximizing: take the longest possible run of class characters, then try
  the rest of the pattern at each position while stepping back towards the
  place where the run started. */

  else
    {
    pp = eptr;
    for (i = min; i < max; i++)
      {
      int len = 1;
      if (eptr >= md->end_subject) break;
      GETCHARLEN(c, eptr, len);
      if (!_pcre_xclass(c, data)) break;
      eptr += len;
      }

    for (;;)
      {
      RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
      if (eptr-- == pp) RRETURN(MATCH_NOMATCH);  /* Tried at original pos */
      if (utf8) BACKCHAR(eptr);
      }
    }

  /* Control never gets here */
  }
#endif    /* End of XCLASS */
2111
2112 /* Match a single character, casefully */
2113
2114 case OP_CHAR:
2115 #ifdef SUPPORT_UTF8
2116 if (utf8)
2117 {
2118 length = 1;
2119 ecode++;
2120 GETCHARLEN(fc, ecode, length);
2121 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2122 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2123 }
2124 else
2125 #endif
2126
2127 /* Non-UTF-8 mode */
2128 {
2129 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2130 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2131 ecode += 2;
2132 }
2133 break;
2134
2135 /* Match a single character, caselessly */
2136
2137 case OP_CHARNC:
2138 #ifdef SUPPORT_UTF8
2139 if (utf8)
2140 {
2141 length = 1;
2142 ecode++;
2143 GETCHARLEN(fc, ecode, length);
2144
2145 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2146
2147 /* If the pattern character's value is < 128, we have only one byte, and
2148 can use the fast lookup table. */
2149
2150 if (fc < 128)
2151 {
2152 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2153 }
2154
2155 /* Otherwise we must pick up the subject character */
2156
2157 else
2158 {
2159 unsigned int dc;
2160 GETCHARINC(dc, eptr);
2161 ecode += length;
2162
2163 /* If we have Unicode property support, we can use it to test the other
2164 case of the character, if there is one. */
2165
2166 if (fc != dc)
2167 {
2168 #ifdef SUPPORT_UCP
2169 if (dc != _pcre_ucp_othercase(fc))
2170 #endif
2171 RRETURN(MATCH_NOMATCH);
2172 }
2173 }
2174 }
2175 else
2176 #endif /* SUPPORT_UTF8 */
2177
2178 /* Non-UTF-8 mode */
2179 {
2180 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2181 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2182 ecode += 2;
2183 }
2184 break;
2185
2186 /* Match a single character repeatedly. */
2187
2188 case OP_EXACT:
2189 min = max = GET2(ecode, 1);
2190 ecode += 3;
2191 goto REPEATCHAR;
2192
2193 case OP_POSUPTO:
2194 possessive = TRUE;
2195 /* Fall through */
2196
2197 case OP_UPTO:
2198 case OP_MINUPTO:
2199 min = 0;
2200 max = GET2(ecode, 1);
2201 minimize = *ecode == OP_MINUPTO;
2202 ecode += 3;
2203 goto REPEATCHAR;
2204
2205 case OP_POSSTAR:
2206 possessive = TRUE;
2207 min = 0;
2208 max = INT_MAX;
2209 ecode++;
2210 goto REPEATCHAR;
2211
2212 case OP_POSPLUS:
2213 possessive = TRUE;
2214 min = 1;
2215 max = INT_MAX;
2216 ecode++;
2217 goto REPEATCHAR;
2218
2219 case OP_POSQUERY:
2220 possessive = TRUE;
2221 min = 0;
2222 max = 1;
2223 ecode++;
2224 goto REPEATCHAR;
2225
2226 case OP_STAR:
2227 case OP_MINSTAR:
2228 case OP_PLUS:
2229 case OP_MINPLUS:
2230 case OP_QUERY:
2231 case OP_MINQUERY:
2232 c = *ecode++ - OP_STAR;
2233 minimize = (c & 1) != 0;
2234 min = rep_min[c]; /* Pick up values from tables; */
2235 max = rep_max[c]; /* zero for max => infinity */
2236 if (max == 0) max = INT_MAX;
2237
2238 /* Common code for all repeated single-character matches. We can give
2239 up quickly if there are fewer than the minimum number of characters left in
2240 the subject. */
2241
2242 REPEATCHAR:
2243 #ifdef SUPPORT_UTF8
2244 if (utf8)
2245 {
2246 length = 1;
2247 charptr = ecode;
2248 GETCHARLEN(fc, ecode, length);
2249 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2250 ecode += length;
2251
2252 /* Handle multibyte character matching specially here. There is
2253 support for caseless matching if UCP support is present. */
2254
2255 if (length > 1)
2256 {
2257 #ifdef SUPPORT_UCP
2258 unsigned int othercase;
2259 if ((ims & PCRE_CASELESS) != 0 &&
2260 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2261 oclength = _pcre_ord2utf8(othercase, occhars);
2262 else oclength = 0;
2263 #endif /* SUPPORT_UCP */
2264
2265 for (i = 1; i <= min; i++)
2266 {
2267 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2268 #ifdef SUPPORT_UCP
2269 /* Need braces because of following else */
2270 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2271 else
2272 {
2273 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2274 eptr += oclength;
2275 }
2276 #else /* without SUPPORT_UCP */
2277 else { RRETURN(MATCH_NOMATCH); }
2278 #endif /* SUPPORT_UCP */
2279 }
2280
2281 if (min == max) continue;
2282
2283 if (minimize)
2284 {
2285 for (fi = min;; fi++)
2286 {
2287 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2288 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2289 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2290 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2291 #ifdef SUPPORT_UCP
2292 /* Need braces because of following else */
2293 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2294 else
2295 {
2296 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2297 eptr += oclength;
2298 }
2299 #else /* without SUPPORT_UCP */
2300 else { RRETURN (MATCH_NOMATCH); }
2301 #endif /* SUPPORT_UCP */
2302 }
2303 /* Control never gets here */
2304 }
2305
2306 else /* Maximize */
2307 {
2308 pp = eptr;
2309 for (i = min; i < max; i++)
2310 {
2311 if (eptr > md->end_subject - length) break;
2312 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2313 #ifdef SUPPORT_UCP
2314 else if (oclength == 0) break;
2315 else
2316 {
2317 if (memcmp(eptr, occhars, oclength) != 0) break;
2318 eptr += oclength;
2319 }
2320 #else /* without SUPPORT_UCP */
2321 else break;
2322 #endif /* SUPPORT_UCP */
2323 }
2324
2325 if (possessive) continue;
2326 for(;;)
2327 {
2328 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2329 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2330 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2331 #ifdef SUPPORT_UCP
2332 eptr--;
2333 BACKCHAR(eptr);
2334 #else /* without SUPPORT_UCP */
2335 eptr -= length;
2336 #endif /* SUPPORT_UCP */
2337 }
2338 }
2339 /* Control never gets here */
2340 }
2341
2342 /* If the length of a UTF-8 character is 1, we fall through here, and
2343 obey the code as for non-UTF-8 characters below, though in this case the
2344 value of fc will always be < 128. */
2345 }
2346 else
2347 #endif /* SUPPORT_UTF8 */
2348
2349 /* When not in UTF-8 mode, load a single-byte character. */
2350 {
2351 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2352 fc = *ecode++;
2353 }
2354
2355 /* The value of fc at this point is always less than 256, though we may or
2356 may not be in UTF-8 mode. The code is duplicated for the caseless and
2357 caseful cases, for speed, since matching characters is likely to be quite
2358 common. First, ensure the minimum number of matches are present. If min =
2359 max, continue at the same level without recursing. Otherwise, if
2360 minimizing, keep trying the rest of the expression and advancing one
2361 matching character if failing, up to the maximum. Alternatively, if
2362 maximizing, find the maximum number of characters and work backwards. */
2363
2364 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2365 max, eptr));
2366
2367 if ((ims & PCRE_CASELESS) != 0)
2368 {
2369 fc = md->lcc[fc];
2370 for (i = 1; i <= min; i++)
2371 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2372 if (min == max) continue;
2373 if (minimize)
2374 {
2375 for (fi = min;; fi++)
2376 {
2377 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2378 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2379 if (fi >= max || eptr >= md->end_subject ||
2380 fc != md->lcc[*eptr++])
2381 RRETURN(MATCH_NOMATCH);
2382 }
2383 /* Control never gets here */
2384 }
2385 else /* Maximize */
2386 {
2387 pp = eptr;
2388 for (i = min; i < max; i++)
2389 {
2390 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2391 eptr++;
2392 }
2393 if (possessive) continue;
2394 while (eptr >= pp)
2395 {
2396 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2397 eptr--;
2398 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2399 }
2400 RRETURN(MATCH_NOMATCH);
2401 }
2402 /* Control never gets here */
2403 }
2404
2405 /* Caseful comparisons (includes all multi-byte characters) */
2406
2407 else
2408 {
2409 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2410 if (min == max) continue;
2411 if (minimize)
2412 {
2413 for (fi = min;; fi++)
2414 {
2415 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2416 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2417 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2418 RRETURN(MATCH_NOMATCH);
2419 }
2420 /* Control never gets here */
2421 }
2422 else /* Maximize */
2423 {
2424 pp = eptr;
2425 for (i = min; i < max; i++)
2426 {
2427 if (eptr >= md->end_subject || fc != *eptr) break;
2428 eptr++;
2429 }
2430 if (possessive) continue;
2431 while (eptr >= pp)
2432 {
2433 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2434 eptr--;
2435 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2436 }
2437 RRETURN(MATCH_NOMATCH);
2438 }
2439 }
2440 /* Control never gets here */
2441
2442 /* Match a negated single one-byte character. The character we are
2443 checking can be multibyte. */
2444
2445 case OP_NOT:
2446 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2447 ecode++;
2448 GETCHARINCTEST(c, eptr);
2449 if ((ims & PCRE_CASELESS) != 0)
2450 {
2451 #ifdef SUPPORT_UTF8
2452 if (c < 256)
2453 #endif
2454 c = md->lcc[c];
2455 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2456 }
2457 else
2458 {
2459 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2460 }
2461 break;
2462
2463 /* Match a negated single one-byte character repeatedly. This is almost a
2464 repeat of the code for a repeated single character, but I haven't found a
2465 nice way of commoning these up that doesn't require a test of the
2466 positive/negative option for each character match. Maybe that wouldn't add
2467 very much to the time taken, but character matching *is* what this is all
2468 about... */
2469
2470 case OP_NOTEXACT:
2471 min = max = GET2(ecode, 1);
2472 ecode += 3;
2473 goto REPEATNOTCHAR;
2474
2475 case OP_NOTUPTO:
2476 case OP_NOTMINUPTO:
2477 min = 0;
2478 max = GET2(ecode, 1);
2479 minimize = *ecode == OP_NOTMINUPTO;
2480 ecode += 3;
2481 goto REPEATNOTCHAR;
2482
2483 case OP_NOTPOSSTAR:
2484 possessive = TRUE;
2485 min = 0;
2486 max = INT_MAX;
2487 ecode++;
2488 goto REPEATNOTCHAR;
2489
2490 case OP_NOTPOSPLUS:
2491 possessive = TRUE;
2492 min = 1;
2493 max = INT_MAX;
2494 ecode++;
2495 goto REPEATNOTCHAR;
2496
2497 case OP_NOTPOSQUERY:
2498 possessive = TRUE;
2499 min = 0;
2500 max = 1;
2501 ecode++;
2502 goto REPEATNOTCHAR;
2503
2504 case OP_NOTPOSUPTO:
2505 possessive = TRUE;
2506 min = 0;
2507 max = GET2(ecode, 1);
2508 ecode += 3;
2509 goto REPEATNOTCHAR;
2510
2511 case OP_NOTSTAR:
2512 case OP_NOTMINSTAR:
2513 case OP_NOTPLUS:
2514 case OP_NOTMINPLUS:
2515 case OP_NOTQUERY:
2516 case OP_NOTMINQUERY:
2517 c = *ecode++ - OP_NOTSTAR;
2518 minimize = (c & 1) != 0;
2519 min = rep_min[c]; /* Pick up values from tables; */
2520 max = rep_max[c]; /* zero for max => infinity */
2521 if (max == 0) max = INT_MAX;
2522
2523 /* Common code for all repeated single-byte matches. We can give up quickly
2524 if there are fewer than the minimum number of bytes left in the
2525 subject. */
2526
2527 REPEATNOTCHAR:
2528 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2529 fc = *ecode++;
2530
2531 /* The code is duplicated for the caseless and caseful cases, for speed,
2532 since matching characters is likely to be quite common. First, ensure the
2533 minimum number of matches are present. If min = max, continue at the same
2534 level without recursing. Otherwise, if minimizing, keep trying the rest of
2535 the expression and advancing one matching character if failing, up to the
2536 maximum. Alternatively, if maximizing, find the maximum number of
2537 characters and work backwards. */
2538
2539 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2540 max, eptr));
2541
2542 if ((ims & PCRE_CASELESS) != 0)
2543 {
2544 fc = md->lcc[fc];
2545
2546 #ifdef SUPPORT_UTF8
2547 /* UTF-8 mode */
2548 if (utf8)
2549 {
2550 register unsigned int d;
2551 for (i = 1; i <= min; i++)
2552 {
2553 GETCHARINC(d, eptr);
2554 if (d < 256) d = md->lcc[d];
2555 if (fc == d) RRETURN(MATCH_NOMATCH);
2556 }
2557 }
2558 else
2559 #endif
2560
2561 /* Not UTF-8 mode */
2562 {
2563 for (i = 1; i <= min; i++)
2564 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2565 }
2566
2567 if (min == max) continue;
2568
2569 if (minimize)
2570 {
2571 #ifdef SUPPORT_UTF8
2572 /* UTF-8 mode */
2573 if (utf8)
2574 {
2575 register unsigned int d;
2576 for (fi = min;; fi++)
2577 {
2578 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2579 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2580 GETCHARINC(d, eptr);
2581 if (d < 256) d = md->lcc[d];
2582 if (fi >= max || eptr >= md->end_subject || fc == d)
2583 RRETURN(MATCH_NOMATCH);
2584 }
2585 }
2586 else
2587 #endif
2588 /* Not UTF-8 mode */
2589 {
2590 for (fi = min;; fi++)
2591 {
2592 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2593 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2594 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2595 RRETURN(MATCH_NOMATCH);
2596 }
2597 }
2598 /* Control never gets here */
2599 }
2600
2601 /* Maximize case */
2602
2603 else
2604 {
2605 pp = eptr;
2606
2607 #ifdef SUPPORT_UTF8
2608 /* UTF-8 mode */
2609 if (utf8)
2610 {
2611 register unsigned int d;
2612 for (i = min; i < max; i++)
2613 {
2614 int len = 1;
2615 if (eptr >= md->end_subject) break;
2616 GETCHARLEN(d, eptr, len);
2617 if (d < 256) d = md->lcc[d];
2618 if (fc == d) break;
2619 eptr += len;
2620 }
2621 if (possessive) continue;
2622 for(;;)
2623 {
2624 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2625 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2626 if (eptr-- == pp) break; /* Stop if tried at original pos */
2627 BACKCHAR(eptr);
2628 }
2629 }
2630 else
2631 #endif
2632 /* Not UTF-8 mode */
2633 {
2634 for (i = min; i < max; i++)
2635 {
2636 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2637 eptr++;
2638 }
2639 if (possessive) continue;
2640 while (eptr >= pp)
2641 {
2642 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2643 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2644 eptr--;
2645 }
2646 }
2647
2648 RRETURN(MATCH_NOMATCH);
2649 }
2650 /* Control never gets here */
2651 }
2652
2653 /* Caseful comparisons */
2654
2655 else
2656 {
2657 #ifdef SUPPORT_UTF8
2658 /* UTF-8 mode */
2659 if (utf8)
2660 {
2661 register unsigned int d;
2662 for (i = 1; i <= min; i++)
2663 {
2664 GETCHARINC(d, eptr);
2665 if (fc == d) RRETURN(MATCH_NOMATCH);
2666 }
2667 }
2668 else
2669 #endif
2670 /* Not UTF-8 mode */
2671 {
2672 for (i = 1; i <= min; i++)
2673 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2674 }
2675
2676 if (min == max) continue;
2677
2678 if (minimize)
2679 {
2680 #ifdef SUPPORT_UTF8
2681 /* UTF-8 mode */
2682 if (utf8)
2683 {
2684 register unsigned int d;
2685 for (fi = min;; fi++)
2686 {
2687 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2688 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2689 GETCHARINC(d, eptr);
2690 if (fi >= max || eptr >= md->end_subject || fc == d)
2691 RRETURN(MATCH_NOMATCH);
2692 }
2693 }
2694 else
2695 #endif
2696 /* Not UTF-8 mode */
2697 {
2698 for (fi = min;; fi++)
2699 {
2700 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2701 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2702 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2703 RRETURN(MATCH_NOMATCH);
2704 }
2705 }
2706 /* Control never gets here */
2707 }
2708
2709 /* Maximize case */
2710
2711 else
2712 {
2713 pp = eptr;
2714
2715 #ifdef SUPPORT_UTF8
2716 /* UTF-8 mode */
2717 if (utf8)
2718 {
2719 register unsigned int d;
2720 for (i = min; i < max; i++)
2721 {
2722 int len = 1;
2723 if (eptr >= md->end_subject) break;
2724 GETCHARLEN(d, eptr, len);
2725 if (fc == d) break;
2726 eptr += len;
2727 }
2728 if (possessive) continue;
2729 for(;;)
2730 {
2731 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2732 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2733 if (eptr-- == pp) break; /* Stop if tried at original pos */
2734 BACKCHAR(eptr);
2735 }
2736 }
2737 else
2738 #endif
2739 /* Not UTF-8 mode */
2740 {
2741 for (i = min; i < max; i++)
2742 {
2743 if (eptr >= md->end_subject || fc == *eptr) break;
2744 eptr++;
2745 }
2746 if (possessive) continue;
2747 while (eptr >= pp)
2748 {
2749 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2750 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2751 eptr--;
2752 }
2753 }
2754
2755 RRETURN(MATCH_NOMATCH);
2756 }
2757 }
2758 /* Control never gets here */
2759
2760 /* Match a single character type repeatedly; several different opcodes
2761 share code. This is very similar to the code for single characters, but we
2762 repeat it in the interests of efficiency. */
2763
2764 case OP_TYPEEXACT:
2765 min = max = GET2(ecode, 1);
2766 minimize = TRUE;
2767 ecode += 3;
2768 goto REPEATTYPE;
2769
2770 case OP_TYPEUPTO:
2771 case OP_TYPEMINUPTO:
2772 min = 0;
2773 max = GET2(ecode, 1);
2774 minimize = *ecode == OP_TYPEMINUPTO;
2775 ecode += 3;
2776 goto REPEATTYPE;
2777
2778 case OP_TYPEPOSSTAR:
2779 possessive = TRUE;
2780 min = 0;
2781 max = INT_MAX;
2782 ecode++;
2783 goto REPEATTYPE;
2784
2785 case OP_TYPEPOSPLUS:
2786 possessive = TRUE;
2787 min = 1;
2788 max = INT_MAX;
2789 ecode++;
2790 goto REPEATTYPE;
2791
2792 case OP_TYPEPOSQUERY:
2793 possessive = TRUE;
2794 min = 0;
2795 max = 1;
2796 ecode++;
2797 goto REPEATTYPE;
2798
2799 case OP_TYPEPOSUPTO:
2800 possessive = TRUE;
2801 min = 0;
2802 max = GET2(ecode, 1);
2803 ecode += 3;
2804 goto REPEATTYPE;
2805
2806 case OP_TYPESTAR:
2807 case OP_TYPEMINSTAR:
2808 case OP_TYPEPLUS:
2809 case OP_TYPEMINPLUS:
2810 case OP_TYPEQUERY:
2811 case OP_TYPEMINQUERY:
2812 c = *ecode++ - OP_TYPESTAR;
2813 minimize = (c & 1) != 0;
2814 min = rep_min[c]; /* Pick up values from tables; */
2815 max = rep_max[c]; /* zero for max => infinity */
2816 if (max == 0) max = INT_MAX;
2817
2818 /* Common code for all repeated single character type matches. Note that
2819 in UTF-8 mode, '.' matches a character of any length, but for the other
2820 character types, the valid characters are all one-byte long. */
2821
2822 REPEATTYPE:
2823 ctype = *ecode++; /* Code for the character type */
2824
2825 #ifdef SUPPORT_UCP
2826 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2827 {
2828 prop_fail_result = ctype == OP_NOTPROP;
2829 prop_type = *ecode++;
2830 prop_value = *ecode++;
2831 }
2832 else prop_type = -1;
2833 #endif
2834
2835 /* First, ensure the minimum number of matches are present. Use inline
2836 code for maximizing the speed, and do the type test once at the start
2837 (i.e. keep it out of the loop). Also we can test that there are at least
2838 the minimum number of bytes before we start. This isn't as effective in
2839 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2840 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2841 and single-bytes. */
2842
2843 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2844 if (min > 0)
2845 {
2846 #ifdef SUPPORT_UCP
2847 if (prop_type >= 0)
2848 {
2849 switch(prop_type)
2850 {
2851 case PT_ANY:
2852 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2853 for (i = 1; i <= min; i++)
2854 {
2855 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2856 GETCHARINCTEST(c, eptr);
2857 }
2858 break;
2859
2860 case PT_LAMP:
2861 for (i = 1; i <= min; i++)
2862 {
2863 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2864 GETCHARINCTEST(c, eptr);
2865 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2866 if ((prop_chartype == ucp_Lu ||
2867 prop_chartype == ucp_Ll ||
2868 prop_chartype == ucp_Lt) == prop_fail_result)
2869 RRETURN(MATCH_NOMATCH);
2870 }
2871 break;
2872
2873 case PT_GC:
2874 for (i = 1; i <= min; i++)
2875 {
2876 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2877 GETCHARINCTEST(c, eptr);
2878 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2879 if ((prop_category == prop_value) == prop_fail_result)
2880 RRETURN(MATCH_NOMATCH);
2881 }
2882 break;
2883
2884 case PT_PC:
2885 for (i = 1; i <= min; i++)
2886 {
2887 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2888 GETCHARINCTEST(c, eptr);
2889 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2890 if ((prop_chartype == prop_value) == prop_fail_result)
2891 RRETURN(MATCH_NOMATCH);
2892 }
2893 break;
2894
2895 case PT_SC:
2896 for (i = 1; i <= min; i++)
2897 {
2898 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2899 GETCHARINCTEST(c, eptr);
2900 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2901 if ((prop_script == prop_value) == prop_fail_result)
2902 RRETURN(MATCH_NOMATCH);
2903 }
2904 break;
2905
2906 default:
2907 RRETURN(PCRE_ERROR_INTERNAL);
2908 }
2909 }
2910
2911 /* Match extended Unicode sequences. We will get here only if the
2912 support is in the binary; otherwise a compile-time error occurs. */
2913
2914 else if (ctype == OP_EXTUNI)
2915 {
2916 for (i = 1; i <= min; i++)
2917 {
2918 GETCHARINCTEST(c, eptr);
2919 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2920 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2921 while (eptr < md->end_subject)
2922 {
2923 int len = 1;
2924 if (!utf8) c = *eptr; else
2925 {
2926 GETCHARLEN(c, eptr, len);
2927 }
2928 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2929 if (prop_category != ucp_M) break;
2930 eptr += len;
2931 }
2932 }
2933 }
2934
2935 else
2936 #endif /* SUPPORT_UCP */
2937
2938 /* Handle all other cases when the coding is UTF-8 */
2939
2940 #ifdef SUPPORT_UTF8
2941 if (utf8) switch(ctype)
2942 {
2943 case OP_ANY:
2944 for (i = 1; i <= min; i++)
2945 {
2946 if (eptr >= md->end_subject ||
2947 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2948 RRETURN(MATCH_NOMATCH);
2949 eptr++;
2950 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2951 }
2952 break;
2953
2954 case OP_ANYBYTE:
2955 eptr += min;
2956 break;
2957
2958 case OP_ANYNL:
2959 for (i = 1; i <= min; i++)
2960 {
2961 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2962 GETCHARINC(c, eptr);
2963 switch(c)
2964 {
2965 default: RRETURN(MATCH_NOMATCH);
2966 case 0x000d:
2967 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2968 break;
2969
2970 case 0x000a:
2971 break;
2972
2973 case 0x000b:
2974 case 0x000c:
2975 case 0x0085:
2976 case 0x2028:
2977 case 0x2029:
2978 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2979 break;
2980 }
2981 }
2982 break;
2983
2984 case OP_NOT_HSPACE:
2985 for (i = 1; i <= min; i++)
2986 {
2987 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2988 GETCHARINC(c, eptr);
2989 switch(c)
2990 {
2991 default: break;
2992 case 0x09: /* HT */
2993 case 0x20: /* SPACE */
2994 case 0xa0: /* NBSP */
2995 case 0x1680: /* OGHAM SPACE MARK */
2996 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2997 case 0x2000: /* EN QUAD */
2998 case 0x2001: /* EM QUAD */
2999 case 0x2002: /* EN SPACE */
3000 case 0x2003: /* EM SPACE */
3001 case 0x2004: /* THREE-PER-EM SPACE */
3002 case 0x2005: /* FOUR-PER-EM SPACE */
3003 case 0x2006: /* SIX-PER-EM SPACE */
3004 case 0x2007: /* FIGURE SPACE */
3005 case 0x2008: /* PUNCTUATION SPACE */
3006 case 0x2009: /* THIN SPACE */
3007 case 0x200A: /* HAIR SPACE */
3008 case 0x202f: /* NARROW NO-BREAK SPACE */
3009 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3010 case 0x3000: /* IDEOGRAPHIC SPACE */
3011 RRETURN(MATCH_NOMATCH);
3012 }
3013 }
3014 break;
3015
3016 case OP_HSPACE:
3017 for (i = 1; i <= min; i++)
3018 {
3019 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3020 GETCHARINC(c, eptr);
3021 switch(c)
3022 {
3023 default: RRETURN(MATCH_NOMATCH);
3024 case 0x09: /* HT */
3025 case 0x20: /* SPACE */
3026 case 0xa0: /* NBSP */
3027 case 0x1680: /* OGHAM SPACE MARK */
3028 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3029 case 0x2000: /* EN QUAD */
3030 case 0x2001: /* EM QUAD */
3031 case 0x2002: /* EN SPACE */
3032 case 0x2003: /* EM SPACE */
3033 case 0x2004: /* THREE-PER-EM SPACE */
3034 case 0x2005: /* FOUR-PER-EM SPACE */
3035 case 0x2006: /* SIX-PER-EM SPACE */
3036 case 0x2007: /* FIGURE SPACE */
3037 case 0x2008: /* PUNCTUATION SPACE */
3038 case 0x2009: /* THIN SPACE */
3039 case 0x200A: /* HAIR SPACE */
3040 case 0x202f: /* NARROW NO-BREAK SPACE */
3041 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3042 case 0x3000: /* IDEOGRAPHIC SPACE */
3043 break;
3044 }
3045 }
3046 break;
3047
3048 case OP_NOT_VSPACE:
3049 for (i = 1; i <= min; i++)
3050 {
3051 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3052 GETCHARINC(c, eptr);
3053 switch(c)
3054 {
3055 default: break;
3056 case 0x0a: /* LF */
3057 case 0x0b: /* VT */
3058 case 0x0c: /* FF */
3059 case 0x0d: /* CR */
3060 case 0x85: /* NEL */
3061 case 0x2028: /* LINE SEPARATOR */
3062 case 0x2029: /* PARAGRAPH SEPARATOR */
3063 RRETURN(MATCH_NOMATCH);
3064 }
3065 }
3066 break;
3067
3068 case OP_VSPACE:
3069 for (i = 1; i <= min; i++)
3070 {
3071 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3072 GETCHARINC(c, eptr);
3073 switch(c)
3074 {
3075 default: RRETURN(MATCH_NOMATCH);
3076 case 0x0a: /* LF */
3077 case 0x0b: /* VT */
3078 case 0x0c: /* FF */
3079 case 0x0d: /* CR */
3080 case 0x85: /* NEL */
3081 case 0x2028: /* LINE SEPARATOR */
3082 case 0x2029: /* PARAGRAPH SEPARATOR */
3083 break;
3084 }
3085 }
3086 break;
3087
3088 case OP_NOT_DIGIT:
3089 for (i = 1; i <= min; i++)
3090 {
3091 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3092 GETCHARINC(c, eptr);
3093 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3094 RRETURN(MATCH_NOMATCH);
3095 }
3096 break;
3097
3098 case OP_DIGIT:
3099 for (i = 1; i <= min; i++)
3100 {
3101 if (eptr >= md->end_subject ||
3102 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3103 RRETURN(MATCH_NOMATCH);
3104 /* No need to skip more bytes - we know it's a 1-byte character */
3105 }
3106 break;
3107
3108 case OP_NOT_WHITESPACE:
3109 for (i = 1; i <= min; i++)
3110 {
3111 if (eptr >= md->end_subject ||
3112 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3113 RRETURN(MATCH_NOMATCH);
3114 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3115 }
3116 break;
3117
3118 case OP_WHITESPACE:
3119 for (i = 1; i <= min; i++)
3120 {
3121 if (eptr >= md->end_subject ||
3122 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3123 RRETURN(MATCH_NOMATCH);
3124 /* No need to skip more bytes - we know it's a 1-byte character */
3125 }
3126 break;
3127
3128 case OP_NOT_WORDCHAR:
3129 for (i = 1; i <= min; i++)
3130 {
3131 if (eptr >= md->end_subject ||
3132 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3133 RRETURN(MATCH_NOMATCH);
3134 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3135 }
3136 break;
3137
3138 case OP_WORDCHAR:
3139 for (i = 1; i <= min; i++)
3140 {
3141 if (eptr >= md->end_subject ||
3142 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3143 RRETURN(MATCH_NOMATCH);
3144 /* No need to skip more bytes - we know it's a 1-byte character */
3145 }
3146 break;
3147
3148 default:
3149 RRETURN(PCRE_ERROR_INTERNAL);
3150 } /* End switch(ctype) */
3151
3152 else
3153 #endif /* SUPPORT_UTF8 */
3154
3155 /* Code for the non-UTF-8 case for minimum matching of operators other
3156 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3157 number of bytes present, as this was tested above. */
3158
3159 switch(ctype)
3160 {
3161 case OP_ANY:
3162 if ((ims & PCRE_DOTALL) == 0)
3163 {
3164 for (i = 1; i <= min; i++)
3165 {
3166 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3167 eptr++;
3168 }
3169 }
3170 else eptr += min;
3171 break;
3172
3173 case OP_ANYBYTE:
3174 eptr += min;
3175 break;
3176
3177 /* Because of the CRLF case, we can't assume the minimum number of
3178 bytes are present in this case. */
3179
3180 case OP_ANYNL:
3181 for (i = 1; i <= min; i++)
3182 {
3183 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3184 switch(*eptr++)
3185 {
3186 default: RRETURN(MATCH_NOMATCH);
3187 case 0x000d:
3188 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3189 break;
3190 case 0x000a:
3191 break;
3192
3193 case 0x000b:
3194 case 0x000c:
3195 case 0x0085:
3196 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3197 break;
3198 }
3199 }
3200 break;
3201
3202 case OP_NOT_HSPACE:
3203 for (i = 1; i <= min; i++)
3204 {
3205 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3206 switch(*eptr++)
3207 {
3208 default: break;
3209 case 0x09: /* HT */
3210 case 0x20: /* SPACE */
3211 case 0xa0: /* NBSP */
3212 RRETURN(MATCH_NOMATCH);
3213 }
3214 }
3215 break;
3216
3217 case OP_HSPACE:
3218 for (i = 1; i <= min; i++)
3219 {
3220 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3221 switch(*eptr++)
3222 {
3223 default: RRETURN(MATCH_NOMATCH);
3224 case 0x09: /* HT */
3225 case 0x20: /* SPACE */
3226 case 0xa0: /* NBSP */
3227 break;
3228 }
3229 }
3230 break;
3231
3232 case OP_NOT_VSPACE:
3233 for (i = 1; i <= min; i++)
3234 {
3235 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3236 switch(*eptr++)
3237 {
3238 default: break;
3239 case 0x0a: /* LF */
3240 case 0x0b: /* VT */
3241 case 0x0c: /* FF */
3242 case 0x0d: /* CR */
3243 case 0x85: /* NEL */
3244 RRETURN(MATCH_NOMATCH);
3245 }
3246 }
3247 break;
3248
3249 case OP_VSPACE:
3250 for (i = 1; i <= min; i++)
3251 {
3252 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3253 switch(*eptr++)
3254 {
3255 default: RRETURN(MATCH_NOMATCH);
3256 case 0x0a: /* LF */
3257 case 0x0b: /* VT */
3258 case 0x0c: /* FF */
3259 case 0x0d: /* CR */
3260 case 0x85: /* NEL */
3261 break;
3262 }
3263 }
3264 break;
3265
3266 case OP_NOT_DIGIT:
3267 for (i = 1; i <= min; i++)
3268 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3269 break;
3270
3271 case OP_DIGIT:
3272 for (i = 1; i <= min; i++)
3273 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3274 break;
3275
3276 case OP_NOT_WHITESPACE:
3277 for (i = 1; i <= min; i++)
3278 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3279 break;
3280
3281 case OP_WHITESPACE:
3282 for (i = 1; i <= min; i++)
3283 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3284 break;
3285
3286 case OP_NOT_WORDCHAR:
3287 for (i = 1; i <= min; i++)
3288 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3289 RRETURN(MATCH_NOMATCH);
3290 break;
3291
3292 case OP_WORDCHAR:
3293 for (i = 1; i <= min; i++)
3294 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3295 RRETURN(MATCH_NOMATCH);
3296 break;
3297
3298 default:
3299 RRETURN(PCRE_ERROR_INTERNAL);
3300 }
3301 }
3302
3303 /* If min = max, continue at the same level without recursing */
3304
3305 if (min == max) continue;
3306
3307 /* If minimizing, we have to test the rest of the pattern before each
3308 subsequent match. Again, separate the UTF-8 case for speed, and also
3309 separate the UCP cases. */
3310
3311 if (minimize)
3312 {
3313 #ifdef SUPPORT_UCP
3314 if (prop_type >= 0)
3315 {
3316 switch(prop_type)
3317 {
3318 case PT_ANY:
3319 for (fi = min;; fi++)
3320 {
3321 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3322 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3323 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3324 GETCHARINC(c, eptr);
3325 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3326 }
3327 /* Control never gets here */
3328
3329 case PT_LAMP:
3330 for (fi = min;; fi++)
3331 {
3332 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3333 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3334 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3335 GETCHARINC(c, eptr);
3336 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3337 if ((prop_chartype == ucp_Lu ||
3338 prop_chartype == ucp_Ll ||
3339 prop_chartype == ucp_Lt) == prop_fail_result)
3340 RRETURN(MATCH_NOMATCH);
3341 }
3342 /* Control never gets here */
3343
3344 case PT_GC:
3345 for (fi = min;; fi++)
3346 {
3347 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3348 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3349 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3350 GETCHARINC(c, eptr);
3351 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3352 if ((prop_category == prop_value) == prop_fail_result)
3353 RRETURN(MATCH_NOMATCH);
3354 }
3355 /* Control never gets here */
3356
3357 case PT_PC:
3358 for (fi = min;; fi++)
3359 {
3360 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3361 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3362 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3363 GETCHARINC(c, eptr);
3364 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3365 if ((prop_chartype == prop_value) == prop_fail_result)
3366 RRETURN(MATCH_NOMATCH);
3367 }
3368 /* Control never gets here */
3369
3370 case PT_SC:
3371 for (fi = min;; fi++)
3372 {
3373 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3374 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3375 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3376 GETCHARINC(c, eptr);
3377 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3378 if ((prop_script == prop_value) == prop_fail_result)
3379 RRETURN(MATCH_NOMATCH);
3380 }
3381 /* Control never gets here */
3382
3383 default:
3384 RRETURN(PCRE_ERROR_INTERNAL);
3385 }
3386 }
3387
3388 /* Match extended Unicode sequences. We will get here only if the
3389 support is in the binary; otherwise a compile-time error occurs. */
3390
3391 else if (ctype == OP_EXTUNI)
3392 {
3393 for (fi = min;; fi++)
3394 {
3395 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3396 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3397 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3398 GETCHARINCTEST(c, eptr);
3399 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3400 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3401 while (eptr < md->end_subject)
3402 {
3403 int len = 1;
3404 if (!utf8) c = *eptr; else
3405 {
3406 GETCHARLEN(c, eptr, len);
3407 }
3408 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3409 if (prop_category != ucp_M) break;
3410 eptr += len;
3411 }
3412 }
3413 }
3414
3415 else
3416 #endif /* SUPPORT_UCP */
3417
3418 #ifdef SUPPORT_UTF8
3419 /* UTF-8 mode */
3420 if (utf8)
3421 {
3422 for (fi = min;; fi++)
3423 {
3424 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3425 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3426 if (fi >= max || eptr >= md->end_subject ||
3427 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3428 IS_NEWLINE(eptr)))
3429 RRETURN(MATCH_NOMATCH);
3430
3431 GETCHARINC(c, eptr);
3432 switch(ctype)
3433 {
3434 case OP_ANY: /* This is the DOTALL case */
3435 break;
3436
3437 case OP_ANYBYTE:
3438 break;
3439
3440 case OP_ANYNL:
3441 switch(c)
3442 {
3443 default: RRETURN(MATCH_NOMATCH);
3444 case 0x000d:
3445 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3446 break;
3447 case 0x000a:
3448 break;
3449
3450 case 0x000b:
3451 case 0x000c:
3452 case 0x0085:
3453 case 0x2028:
3454 case 0x2029:
3455 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3456 break;
3457 }
3458 break;
3459
3460 case OP_NOT_HSPACE:
3461 switch(c)
3462 {
3463 default: break;
3464 case 0x09: /* HT */
3465 case 0x20: /* SPACE */
3466 case 0xa0: /* NBSP */
3467 case 0x1680: /* OGHAM SPACE MARK */
3468 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3469 case 0x2000: /* EN QUAD */
3470 case 0x2001: /* EM QUAD */
3471 case 0x2002: /* EN SPACE */
3472 case 0x2003: /* EM SPACE */
3473 case 0x2004: /* THREE-PER-EM SPACE */
3474 case 0x2005: /* FOUR-PER-EM SPACE */
3475 case 0x2006: /* SIX-PER-EM SPACE */
3476 case 0x2007: /* FIGURE SPACE */
3477 case 0x2008: /* PUNCTUATION SPACE */
3478 case 0x2009: /* THIN SPACE */
3479 case 0x200A: /* HAIR SPACE */
3480 case 0x202f: /* NARROW NO-BREAK SPACE */
3481 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3482 case 0x3000: /* IDEOGRAPHIC SPACE */
3483 RRETURN(MATCH_NOMATCH);
3484 }
3485 break;
3486
3487 case OP_HSPACE:
3488 switch(c)
3489 {
3490 default: RRETURN(MATCH_NOMATCH);
3491 case 0x09: /* HT */
3492 case 0x20: /* SPACE */
3493 case 0xa0: /* NBSP */
3494 case 0x1680: /* OGHAM SPACE MARK */
3495 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3496 case 0x2000: /* EN QUAD */
3497 case 0x2001: /* EM QUAD */
3498 case 0x2002: /* EN SPACE */
3499 case 0x2003: /* EM SPACE */
3500 case 0x2004: /* THREE-PER-EM SPACE */
3501 case 0x2005: /* FOUR-PER-EM SPACE */
3502 case 0x2006: /* SIX-PER-EM SPACE */
3503 case 0x2007: /* FIGURE SPACE */
3504 case 0x2008: /* PUNCTUATION SPACE */
3505 case 0x2009: /* THIN SPACE */
3506 case 0x200A: /* HAIR SPACE */
3507 case 0x202f: /* NARROW NO-BREAK SPACE */
3508 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3509 case 0x3000: /* IDEOGRAPHIC SPACE */
3510 break;
3511 }
3512 break;
3513
3514 case OP_NOT_VSPACE:
3515 switch(c)
3516 {
3517 default: break;
3518 case 0x0a: /* LF */
3519 case 0x0b: /* VT */
3520 case 0x0c: /* FF */
3521 case 0x0d: /* CR */
3522 case 0x85: /* NEL */
3523 case 0x2028: /* LINE SEPARATOR */
3524 case 0x2029: /* PARAGRAPH SEPARATOR */
3525 RRETURN(MATCH_NOMATCH);
3526 }
3527 break;
3528
3529 case OP_VSPACE:
3530 switch(c)
3531 {
3532 default: RRETURN(MATCH_NOMATCH);
3533 case 0x0a: /* LF */
3534 case 0x0b: /* VT */
3535 case 0x0c: /* FF */
3536 case 0x0d: /* CR */
3537 case 0x85: /* NEL */
3538 case 0x2028: /* LINE SEPARATOR */
3539 case 0x2029: /* PARAGRAPH SEPARATOR */
3540 break;
3541 }
3542 break;
3543
3544 case OP_NOT_DIGIT:
3545 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3546 RRETURN(MATCH_NOMATCH);
3547 break;
3548
3549 case OP_DIGIT:
3550 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3551 RRETURN(MATCH_NOMATCH);
3552 break;
3553
3554 case OP_NOT_WHITESPACE:
3555 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3556 RRETURN(MATCH_NOMATCH);
3557 break;
3558
3559 case OP_WHITESPACE:
3560 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3561 RRETURN(MATCH_NOMATCH);
3562 break;
3563
3564 case OP_NOT_WORDCHAR:
3565 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3566 RRETURN(MATCH_NOMATCH);
3567 break;
3568
3569 case OP_WORDCHAR:
3570 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3571 RRETURN(MATCH_NOMATCH);
3572 break;
3573
3574 default:
3575 RRETURN(PCRE_ERROR_INTERNAL);
3576 }
3577 }
3578 }
3579 else
3580 #endif
3581 /* Not UTF-8 mode */
3582 {
3583 for (fi = min;; fi++)
3584 {
3585 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3586 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3587 if (fi >= max || eptr >= md->end_subject ||
3588 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3589 RRETURN(MATCH_NOMATCH);
3590
3591 c = *eptr++;
3592 switch(ctype)
3593 {
3594 case OP_ANY: /* This is the DOTALL case */
3595 break;
3596
3597 case OP_ANYBYTE:
3598 break;
3599
3600 case OP_ANYNL:
3601 switch(c)
3602 {
3603 default: RRETURN(MATCH_NOMATCH);
3604 case 0x000d:
3605 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3606 break;
3607
3608 case 0x000a:
3609 break;
3610
3611 case 0x000b:
3612 case 0x000c:
3613 case 0x0085:
3614 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3615 break;
3616 }
3617 break;
3618
3619 case OP_NOT_HSPACE:
3620 switch(c)
3621 {
3622 default: break;
3623 case 0x09: /* HT */
3624 case 0x20: /* SPACE */
3625 case 0xa0: /* NBSP */
3626 RRETURN(MATCH_NOMATCH);
3627 }
3628 break;
3629
3630 case OP_HSPACE:
3631 switch(c)
3632 {
3633 default: RRETURN(MATCH_NOMATCH);
3634 case 0x09: /* HT */
3635 case 0x20: /* SPACE */
3636 case 0xa0: /* NBSP */
3637 break;
3638 }
3639 break;
3640
3641 case OP_NOT_VSPACE:
3642 switch(c)
3643 {
3644 default: break;
3645 case 0x0a: /* LF */
3646 case 0x0b: /* VT */
3647 case 0x0c: /* FF */
3648 case 0x0d: /* CR */
3649 case 0x85: /* NEL */
3650 RRETURN(MATCH_NOMATCH);
3651 }
3652 break;
3653
3654 case OP_VSPACE:
3655 switch(c)
3656 {
3657 default: RRETURN(MATCH_NOMATCH);
3658 case 0x0a: /* LF */
3659 case 0x0b: /* VT */
3660 case 0x0c: /* FF */
3661 case 0x0d: /* CR */
3662 case 0x85: /* NEL */
3663 break;
3664 }
3665 break;
3666
3667 case OP_NOT_DIGIT:
3668 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3669 break;
3670
3671 case OP_DIGIT:
3672 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3673 break;
3674
3675 case OP_NOT_WHITESPACE:
3676 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3677 break;
3678
3679 case OP_WHITESPACE:
3680 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3681 break;
3682
3683 case OP_NOT_WORDCHAR:
3684 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3685 break;
3686
3687 case OP_WORDCHAR:
3688 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3689 break;
3690
3691 default:
3692 RRETURN(PCRE_ERROR_INTERNAL);
3693 }
3694 }
3695 }
3696 /* Control never gets here */
3697 }
3698
3699 /* If maximizing, it is worth using inline code for speed, doing the type
3700 test once at the start (i.e. keep it out of the loop). Again, keep the
3701 UTF-8 and UCP stuff separate. */
3702
3703 else
3704 {
3705 pp = eptr; /* Remember where we started */
3706
3707 #ifdef SUPPORT_UCP
3708 if (prop_type >= 0)
3709 {
3710 switch(prop_type)
3711 {
3712 case PT_ANY:
3713 for (i = min; i < max; i++)
3714 {
3715 int len = 1;
3716 if (eptr >= md->end_subject) break;
3717 GETCHARLEN(c, eptr, len);
3718 if (prop_fail_result) break;
3719 eptr+= len;
3720 }
3721 break;
3722
3723 case PT_LAMP:
3724 for (i = min; i < max; i++)
3725 {
3726 int len = 1;
3727 if (eptr >= md->end_subject) break;
3728 GETCHARLEN(c, eptr, len);
3729 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3730 if ((prop_chartype == ucp_Lu ||
3731 prop_chartype == ucp_Ll ||
3732 prop_chartype == ucp_Lt) == prop_fail_result)
3733 break;
3734 eptr+= len;
3735 }
3736 break;
3737
3738 case PT_GC:
3739 for (i = min; i < max; i++)
3740 {
3741 int len = 1;
3742 if (eptr >= md->end_subject) break;
3743 GETCHARLEN(c, eptr, len);
3744 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3745 if ((prop_category == prop_value) == prop_fail_result)
3746 break;
3747 eptr+= len;
3748 }
3749 break;
3750
3751 case PT_PC:
3752 for (i = min; i < max; i++)
3753 {
3754 int len = 1;
3755 if (eptr >= md->end_subject) break;
3756 GETCHARLEN(c, eptr, len);
3757 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3758 if ((prop_chartype == prop_value) == prop_fail_result)
3759 break;
3760 eptr+= len;
3761 }
3762 break;
3763
3764 case PT_SC:
3765 for (i = min; i < max; i++)
3766 {
3767 int len = 1;
3768 if (eptr >= md->end_subject) break;
3769 GETCHARLEN(c, eptr, len);
3770 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3771 if ((prop_script == prop_value) == prop_fail_result)
3772 break;
3773 eptr+= len;
3774 }
3775 break;
3776 }
3777
3778 /* eptr is now past the end of the maximum run */
3779
3780 if (possessive) continue;
3781 for(;;)
3782 {
3783 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3784 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3785 if (eptr-- == pp) break; /* Stop if tried at original pos */
3786 if (utf8) BACKCHAR(eptr);
3787 }
3788 }
3789
3790 /* Match extended Unicode sequences. We will get here only if the
3791 support is in the binary; otherwise a compile-time error occurs. */
3792
3793 else if (ctype == OP_EXTUNI)
3794 {
3795 for (i = min; i < max; i++)
3796 {
3797 if (eptr >= md->end_subject) break;
3798 GETCHARINCTEST(c, eptr);
3799 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3800 if (prop_category == ucp_M) break;
3801 while (eptr < md->end_subject)
3802 {
3803 int len = 1;
3804 if (!utf8) c = *eptr; else
3805 {
3806 GETCHARLEN(c, eptr, len);
3807 }
3808 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3809 if (prop_category != ucp_M) break;
3810 eptr += len;
3811 }
3812 }
3813
3814 /* eptr is now past the end of the maximum run */
3815
3816 if (possessive) continue;
3817 for(;;)
3818 {
3819 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3820 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3821 if (eptr-- == pp) break; /* Stop if tried at original pos */
3822 for (;;) /* Move back over one extended */
3823 {
3824 int len = 1;
3825 if (!utf8) c = *eptr; else
3826 {
3827 BACKCHAR(eptr);
3828 GETCHARLEN(c, eptr, len);
3829 }
3830 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3831 if (prop_category != ucp_M) break;
3832 eptr--;
3833 }
3834 }
3835 }
3836
3837 else
3838 #endif /* SUPPORT_UCP */
3839
3840 #ifdef SUPPORT_UTF8
3841 /* UTF-8 mode */
3842
3843 if (utf8)
3844 {
3845 switch(ctype)
3846 {
3847 case OP_ANY:
3848 if (max < INT_MAX)
3849 {
3850 if ((ims & PCRE_DOTALL) == 0)
3851 {
3852 for (i = min; i < max; i++)
3853 {
3854 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3855 eptr++;
3856 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3857 }
3858 }
3859 else
3860 {
3861 for (i = min; i < max; i++)
3862 {
3863 if (eptr >= md->end_subject) break;
3864 eptr++;
3865 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3866 }
3867 }
3868 }
3869
3870 /* Handle unlimited UTF-8 repeat */
3871
3872 else
3873 {
3874 if ((ims & PCRE_DOTALL) == 0)
3875 {
3876 for (i = min; i < max; i++)
3877 {
3878 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3879 eptr++;
3880 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3881 }
3882 }
3883 else
3884 {
3885 eptr = md->end_subject;
3886 }
3887 }
3888 break;
3889
3890 /* The byte case is the same as non-UTF8 */
3891
3892 case OP_ANYBYTE:
3893 c = max - min;
3894 if (c > (unsigned int)(md->end_subject - eptr))
3895 c = md->end_subject - eptr;
3896 eptr += c;
3897 break;
3898
3899 case OP_ANYNL:
3900 for (i = min; i < max; i++)
3901 {
3902 int len = 1;
3903 if (eptr >= md->end_subject) break;
3904 GETCHARLEN(c, eptr, len);
3905 if (c == 0x000d)
3906 {
3907 if (++eptr >= md->end_subject) break;
3908 if (*eptr == 0x000a) eptr++;
3909 }
3910 else
3911 {
3912 if (c != 0x000a &&
3913 (md->bsr_anycrlf ||
3914 (c != 0x000b && c != 0x000c &&
3915 c != 0x0085 && c != 0x2028 && c != 0x2029)))
3916 break;
3917 eptr += len;
3918 }
3919 }
3920 break;
3921
3922 case OP_NOT_HSPACE:
3923 case OP_HSPACE:
3924 for (i = min; i < max; i++)
3925 {
3926 BOOL gotspace;
3927 int len = 1;
3928 if (eptr >= md->end_subject) break;
3929 GETCHARLEN(c, eptr, len);
3930 switch(c)
3931 {
3932 default: gotspace = FALSE; break;
3933 case 0x09: /* HT */
3934 case 0x20: /* SPACE */
3935 case 0xa0: /* NBSP */
3936 case 0x1680: /* OGHAM SPACE MARK */
3937 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3938 case 0x2000: /* EN QUAD */
3939 case 0x2001: /* EM QUAD */
3940 case 0x2002: /* EN SPACE */
3941 case 0x2003: /* EM SPACE */
3942 case 0x2004: /* THREE-PER-EM SPACE */
3943 case 0x2005: /* FOUR-PER-EM SPACE */
3944 case 0x2006: /* SIX-PER-EM SPACE */
3945 case 0x2007: /* FIGURE SPACE */
3946 case 0x2008: /* PUNCTUATION SPACE */
3947 case 0x2009: /* THIN SPACE */
3948 case 0x200A: /* HAIR SPACE */
3949 case 0x202f: /* NARROW NO-BREAK SPACE */
3950 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3951 case 0x3000: /* IDEOGRAPHIC SPACE */
3952 gotspace = TRUE;
3953 break;
3954 }
3955 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3956 eptr += len;
3957 }
3958 break;
3959
3960 case OP_NOT_VSPACE:
3961 case OP_VSPACE:
3962 for (i = min; i < max; i++)
3963 {
3964 BOOL gotspace;
3965 int len = 1;
3966 if (eptr >= md->end_subject) break;
3967 GETCHARLEN(c, eptr, len);
3968 switch(c)
3969 {
3970 default: gotspace = FALSE; break;
3971 case 0x0a: /* LF */
3972 case 0x0b: /* VT */
3973 case 0x0c: /* FF */
3974 case 0x0d: /* CR */
3975 case 0x85: /* NEL */
3976 case 0x2028: /* LINE SEPARATOR */
3977 case 0x2029: /* PARAGRAPH SEPARATOR */
3978 gotspace = TRUE;
3979 break;
3980 }
3981 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3982 eptr += len;
3983 }
3984 break;
3985
3986 case OP_NOT_DIGIT:
3987 for (i = min; i < max; i++)
3988 {
3989 int len = 1;
3990 if (eptr >= md->end_subject) break;
3991 GETCHARLEN(c, eptr, len);
3992 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3993 eptr+= len;
3994 }
3995 break;
3996
3997 case OP_DIGIT:
3998 for (i = min; i < max; i++)
3999 {
4000 int len = 1;
4001 if (eptr >= md->end_subject) break;
4002 GETCHARLEN(c, eptr, len);
4003 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4004 eptr+= len;
4005 }
4006 break;
4007
4008 case OP_NOT_WHITESPACE:
4009 for (i = min; i < max; i++)
4010 {
4011 int len = 1;
4012 if (eptr >= md->end_subject) break;
4013 GETCHARLEN(c, eptr, len);
4014 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4015 eptr+= len;
4016 }
4017 break;
4018
4019 case OP_WHITESPACE:
4020 for (i = min; i < max; i++)
4021 {
4022 int len = 1;
4023 if (eptr >= md->end_subject) break;
4024 GETCHARLEN(c, eptr, len);
4025 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4026 eptr+= len;
4027 }
4028 break;
4029
4030 case OP_NOT_WORDCHAR:
4031 for (i = min; i < max; i++)
4032 {
4033 int len = 1;
4034 if (eptr >= md->end_subject) break;
4035 GETCHARLEN(c, eptr, len);
4036 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4037 eptr+= len;
4038 }
4039 break;
4040
4041 case OP_WORDCHAR:
4042 for (i = min; i < max; i++)
4043 {
4044 int len = 1;
4045 if (eptr >= md->end_subject) break;
4046 GETCHARLEN(c, eptr, len);
4047 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4048 eptr+= len;
4049 }
4050 break;
4051
4052 default:
4053 RRETURN(PCRE_ERROR_INTERNAL);
4054 }
4055
4056 /* eptr is now past the end of the maximum run */
4057
4058 if (possessive) continue;
4059 for(;;)
4060 {
4061 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4062 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4063 if (eptr-- == pp) break; /* Stop if tried at original pos */
4064 BACKCHAR(eptr);
4065 }
4066 }
4067 else
4068 #endif /* SUPPORT_UTF8 */
4069
4070 /* Not UTF-8 mode */
4071 {
4072 switch(ctype)
4073 {
4074 case OP_ANY:
4075 if ((ims & PCRE_DOTALL) == 0)
4076 {
4077 for (i = min; i < max; i++)
4078 {
4079 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4080 eptr++;
4081 }
4082 break;
4083 }
4084 /* For DOTALL case, fall through and treat as \C */
4085
4086 case OP_ANYBYTE:
4087 c = max - min;
4088 if (c > (unsigned int)(md->end_subject - eptr))
4089 c = md->end_subject - eptr;
4090 eptr += c;
4091 break;
4092
4093 case OP_ANYNL:
4094 for (i = min; i < max; i++)
4095 {
4096 if (eptr >= md->end_subject) break;
4097 c = *eptr;
4098 if (c == 0x000d)
4099 {
4100 if (++eptr >= md->end_subject) break;
4101 if (*eptr == 0x000a) eptr++;
4102 }
4103 else
4104 {
4105 if (c != 0x000a &&
4106 (md->bsr_anycrlf ||
4107 (c != 0x000b && c != 0x000c && c != 0x0085)))
4108 break;
4109 eptr++;
4110 }
4111 }
4112 break;
4113
4114 case OP_NOT_HSPACE:
4115 for (i = min; i < max; i++)
4116 {
4117 if (eptr >= md->end_subject) break;
4118 c = *eptr;
4119 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4120 eptr++;
4121 }
4122 break;
4123
4124 case OP_HSPACE:
4125 for (i = min; i < max; i++)
4126 {
4127 if (eptr >= md->end_subject) break;
4128 c = *eptr;
4129 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4130 eptr++;
4131 }
4132 break;
4133
4134 case OP_NOT_VSPACE:
4135 for (i = min; i < max; i++)
4136 {
4137 if (eptr >= md->end_subject) break;
4138 c = *eptr;
4139 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4140 break;
4141 eptr++;
4142 }
4143 break;
4144
4145 case OP_VSPACE:
4146 for (i = min; i < max; i++)
4147 {
4148 if (eptr >= md->end_subject) break;
4149 c = *eptr;
4150 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4151 break;
4152 eptr++;
4153 }
4154 break;
4155
4156 case OP_NOT_DIGIT:
4157 for (i = min; i < max; i++)
4158 {
4159 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4160 break;
4161 eptr++;
4162 }
4163 break;
4164
4165 case OP_DIGIT:
4166 for (i = min; i < max; i++)
4167 {
4168 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4169 break;
4170 eptr++;
4171 }
4172 break;
4173
4174 case OP_NOT_WHITESPACE:
4175 for (i = min; i < max; i++)
4176 {
4177 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4178 break;
4179 eptr++;
4180 }
4181 break;
4182
4183 case OP_WHITESPACE:
4184 for (i = min; i < max; i++)
4185 {
4186 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4187 break;
4188 eptr++;
4189 }
4190 break;
4191
4192 case OP_NOT_WORDCHAR:
4193 for (i = min; i < max; i++)
4194 {
4195 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4196 break;
4197 eptr++;
4198 }
4199 break;
4200
4201 case OP_WORDCHAR:
4202 for (i = min; i < max; i++)
4203 {
4204 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4205 break;
4206 eptr++;
4207 }
4208 break;
4209
4210 default:
4211 RRETURN(PCRE_ERROR_INTERNAL);
4212 }
4213
4214 /* eptr is now past the end of the maximum run */
4215
4216 if (possessive) continue;
4217 while (eptr >= pp)
4218 {
4219 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4220 eptr--;
4221 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4222 }
4223 }
4224
4225 /* Get here if we can't make it match with any permitted repetitions */
4226
4227 RRETURN(MATCH_NOMATCH);
4228 }
4229 /* Control never gets here */
4230
4231 /* There's been some horrible disaster. Arrival here can only mean there is
4232 something seriously wrong in the code above or the OP_xxx definitions. */
4233
4234 default:
4235 DPRINTF(("Unknown opcode %d\n", *ecode));
4236 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4237 }
4238
4239 /* Do not stick any code in here without much thought; it is assumed
4240 that "continue" in the code above comes out to here to repeat the main
4241 loop. */
4242
4243 } /* End of main loop */
4244 /* Control never reaches here */
4245
4246
4247 /* When compiling to use the heap rather than the stack for recursive calls to
4248 match(), the RRETURN() macro jumps here. The number that is saved in
4249 frame->Xwhere indicates which label we actually want to return to. */
4250
4251 #ifdef NO_RECURSE
4252 #define LBL(val) case val: goto L_RM##val;
4253 HEAP_RETURN:
4254 switch (frame->Xwhere)
4255 {
4256 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4257 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4258 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4259 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4260 LBL(53) LBL(54)
4261 #ifdef SUPPORT_UTF8
4262 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4263 LBL(32) LBL(34) LBL(42) LBL(46)
4264 #ifdef SUPPORT_UCP
4265 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4266 #endif /* SUPPORT_UCP */
4267 #endif /* SUPPORT_UTF8 */
4268 default:
4269 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4270 return PCRE_ERROR_INTERNAL;
4271 }
4272 #undef LBL
4273 #endif /* NO_RECURSE */
4274 }
4275
4276
4277 /***************************************************************************
4278 ****************************************************************************
4279 RECURSION IN THE match() FUNCTION
4280
4281 Undefine all the macros that were defined above to handle this. */
4282
4283 #ifdef NO_RECURSE /* These names are macros only in the heap-recursion build */
4284 #undef eptr
4285 #undef ecode
4286 #undef mstart
4287 #undef offset_top
4288 #undef ims
4289 #undef eptrb
4290 #undef flags
4291
4292 #undef callpat
4293 #undef charptr
4294 #undef data
4295 #undef next
4296 #undef pp
4297 #undef prev
4298 #undef saved_eptr
4299
4300 #undef new_recursive
4301
4302 #undef cur_is_word
4303 #undef condition
4304 #undef prev_is_word
4305
4306 #undef original_ims
4307
4308 #undef ctype
4309 #undef length
4310 #undef max
4311 #undef min
4312 #undef number
4313 #undef offset
4314 #undef op
4315 #undef save_capture_last
4316 #undef save_offset1
4317 #undef save_offset2
4318 #undef save_offset3
4319 #undef stacksave
4320
4321 #undef newptrb
4322
4323 #endif /* NO_RECURSE */
4324
4325 /* fc and fi are defined as macros whether or not NO_RECURSE is set, so they are undefined here, outside the conditional block above. */
4326
4327 #undef fc
4328 #undef fi
4329
4330 /***************************************************************************
4331 ***************************************************************************/
4332
4333
4334
4335 /*************************************************
4336 * Execute a Regular Expression *
4337 *************************************************/
4338
4339 /* This function applies a compiled re to a subject string and picks out
4340 portions of the string if it matches. Two elements in the vector are set for
4341 each substring: the offsets to the start and end of the substring.
4342
4343 Arguments:
4344 argument_re points to the compiled expression
4345 extra_data points to extra data or is NULL
4346 subject points to the subject string
4347 length length of subject string (may contain binary zeros)
4348 start_offset where to start in the subject string
4349 options option bits
4350 offsets points to a vector of ints to be filled in with offsets
4351 offsetcount the number of elements in the vector
4352
4353 Returns: > 0 => success; value is the number of elements filled in
4354 = 0 => success, but offsets is not big enough
4355 -1 => failed to match
4356 < -1 => some kind of unexpected problem
4357 */
4358
4359 PCRE_EXP_DEFN int
4360 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4361 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4362 int offsetcount)
4363 {
4364 int rc, resetcount, ocount;
4365 int first_byte = -1;
4366 int req_byte = -1;
4367 int req_byte2 = -1;
4368 int newline;
4369 unsigned long int ims;
4370 BOOL using_temporary_offsets = FALSE;
4371 BOOL anchored;
4372 BOOL startline;
4373 BOOL firstline;
4374 BOOL first_byte_caseless = FALSE;
4375 BOOL req_byte_caseless = FALSE;
4376 BOOL utf8;
4377 match_data match_block;
4378 match_data *md = &match_block;
4379 const uschar *tables;
4380 const uschar *start_bits = NULL;
4381 USPTR start_match = (USPTR)subject + start_offset;
4382 USPTR end_subject;
4383 USPTR req_byte_ptr = start_match - 1;
4384
4385 pcre_study_data internal_study;
4386 const pcre_study_data *study;
4387
4388 real_pcre internal_re;
4389 const real_pcre *external_re = (const real_pcre *)argument_re;
4390 const real_pcre *re = external_re;
4391
4392 /* Plausibility checks */
4393
4394 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4395 if (re == NULL || subject == NULL ||
4396 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4397 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4398
4399 /* Fish out the optional data from the extra_data structure, first setting
4400 the default values. */
4401
4402 study = NULL;
4403 md->match_limit = MATCH_LIMIT;
4404 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4405 md->callout_data = NULL;
4406
4407 /* The table pointer is always in native byte order. */
4408
4409 tables = external_re->tables;
4410
4411 if (extra_data != NULL)
4412 {
4413 register unsigned int flags = extra_data->flags;
4414 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4415 study = (const pcre_study_data *)extra_data->study_data;
4416 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4417 md->match_limit = extra_data->match_limit;
4418 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4419 md->match_limit_recursion = extra_data->match_limit_recursion;
4420 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4421 md->callout_data = extra_data->callout_data;
4422 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4423 }
4424
4425 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4426 is a feature that makes it possible to save compiled regex and re-use them
4427 in other programs later. */
4428
4429 if (tables == NULL) tables = _pcre_default_tables;
4430
4431 /* Check that the first field in the block is the magic number. If it is not,
4432 test for a regex that was compiled on a host of opposite endianness. If this is
4433 the case, flipped values are put in internal_re and internal_study if there was
4434 study data too. */
4435
4436 if (re->magic_number != MAGIC_NUMBER)
4437 {
4438 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4439 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4440 if (study != NULL) study = &internal_study;
4441 }
4442
4443 /* Set up other data */
4444
4445 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4446 startline = (re->flags & PCRE_STARTLINE) != 0;
4447 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4448
4449 /* The code starts after the real_pcre block and the capture name table. */
4450
4451 md->start_code = (const uschar *)external_re + re->name_table_offset +
4452 re->name_count * re->name_entry_size;
4453
4454 md->start_subject = (USPTR)subject;
4455 md->start_offset = start_offset;
4456 md->end_subject = md->start_subject + length;
4457 end_subject = md->end_subject;
4458
4459 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4460 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4461
4462 md->notbol = (options & PCRE_NOTBOL) != 0;
4463 md->noteol = (options & PCRE_NOTEOL) != 0;
4464 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4465 md->partial = (options & PCRE_PARTIAL) != 0;
4466 md->hitend = FALSE;
4467
4468 md->recursive = NULL; /* No recursion at top level */
4469
4470 md->lcc = tables + lcc_offset;
4471 md->ctypes = tables + ctypes_offset;
4472
4473 /* Handle different \R options. */
4474
4475 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4476 {
4477 case 0:
4478 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4479 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4480 else
4481 #ifdef BSR_ANYCRLF
4482 md->bsr_anycrlf = TRUE;
4483 #else
4484 md->bsr_anycrlf = FALSE;
4485 #endif
4486 break;
4487
4488 case PCRE_BSR_ANYCRLF:
4489 md->bsr_anycrlf = TRUE;
4490 break;
4491
4492 case PCRE_BSR_UNICODE:
4493 md->bsr_anycrlf = FALSE;
4494 break;
4495
4496 default: return PCRE_ERROR_BADNEWLINE;
4497 }
4498
4499 /* Handle different types of newline. The three bits give eight cases. If
4500 nothing is set at run time, whatever was used at compile time applies. */
4501
4502 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4503 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4504 {
4505 case 0: newline = NEWLINE; break; /* Compile-time default */
4506 case PCRE_NEWLINE_CR: newline = '\r'; break;
4507 case PCRE_NEWLINE_LF: newline = '\n'; break;
4508 case PCRE_NEWLINE_CR+
4509 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4510 case PCRE_NEWLINE_ANY: newline = -1; break;
4511 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4512 default: return PCRE_ERROR_BADNEWLINE;
4513 }
4514
4515 if (newline == -2)
4516 {
4517 md->nltype = NLTYPE_ANYCRLF;
4518 }
4519 else if (newline < 0)
4520 {
4521 md->nltype = NLTYPE_ANY;
4522 }
4523 else
4524 {
4525 md->nltype = NLTYPE_FIXED;
4526 if (newline > 255)
4527 {
4528 md->nllen = 2;
4529 md->nl[0] = (newline >> 8) & 255;
4530 md->nl[1] = newline & 255;
4531 }
4532 else
4533 {
4534 md->nllen = 1;
4535 md->nl[0] = newline;
4536 }
4537 }
4538
4539 /* Partial matching is supported only for a restricted set of regexes at the
4540 moment. */
4541
4542 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4543 return PCRE_ERROR_BADPARTIAL;
4544
4545 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4546 back the character offset. */
4547
4548 #ifdef SUPPORT_UTF8
4549 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4550 {
4551 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4552 return PCRE_ERROR_BADUTF8;
4553 if (start_offset > 0 && start_offset < length)
4554 {
4555 int tb = ((uschar *)subject)[start_offset];
4556 if (tb > 127)
4557 {
4558 tb &= 0xc0;
4559 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4560 }
4561 }
4562 }
4563 #endif
4564
4565 /* The ims options can vary during the matching as a result of the presence
4566 of (?ims) items in the pattern. They are kept in a local variable so that
4567 restoring at the exit of a group is easy. */
4568
4569 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4570
4571 /* If the expression has got more back references than the offsets supplied can
4572 hold, we get a temporary chunk of working store to use during the matching.
4573 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4574 of 3. */
4575
4576 ocount = offsetcount - (offsetcount % 3);
4577
4578 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4579 {
4580 ocount = re->top_backref * 3 + 3;
4581 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4582 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4583 using_temporary_offsets = TRUE;
4584 DPRINTF(("Got memory to hold back references\n"));
4585 }
4586 else md->offset_vector = offsets;
4587
4588 md->offset_end = ocount;
4589 md->offset_max = (2*ocount)/3;
4590 md->offset_overflow = FALSE;
4591 md->capture_last = -1;
4592
4593 /* Compute the minimum number of offsets that we need to reset each time. Doing
4594 this makes a huge difference to execution time when there aren't many brackets
4595 in the pattern. */
4596
4597 resetcount = 2 + re->top_bracket * 2;
4598 if (resetcount > offsetcount) resetcount = ocount;
4599
4600 /* Reset the working variable associated with each extraction. These should
4601 never be used unless previously set, but they get saved and restored, and so we
4602 initialize them to avoid reading uninitialized locations. */
4603
4604 if (md->offset_vector != NULL)
4605 {
4606 register int *iptr = md->offset_vector + ocount;
4607 register int *iend = iptr - resetcount/2 + 1;
4608 while (--iptr >= iend) *iptr = -1;
4609 }
4610
4611 /* Set up the first character to match, if available. The first_byte value is
4612 never set for an anchored regular expression, but the anchoring may be forced
4613 at run time, so we have to test for anchoring. The first char may be unset for
4614 an unanchored pattern, of course. If there's no first char and the pattern was
4615 studied, there may be a bitmap of possible first characters. */
4616
4617 if (!anchored)
4618 {
4619 if ((re->flags & PCRE_FIRSTSET) != 0)
4620 {
4621 first_byte = re->first_byte & 255;
4622 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4623 first_byte = md->lcc[first_byte];
4624 }
4625 else
4626 if (!startline && study != NULL &&
4627 (study->options & PCRE_STUDY_MAPPED) != 0)
4628 start_bits = study->start_bits;
4629 }
4630
4631 /* For anchored or unanchored matches, there may be a "last known required
4632 character" set. */
4633
4634 if ((re->flags & PCRE_REQCHSET) != 0)
4635 {
4636 req_byte = re->req_byte & 255;
4637 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4638 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4639 }
4640
4641
4642 /* ==========================================================================*/
4643
4644 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4645 the loop runs just once. */
4646
4647 for(;;)
4648 {
4649 USPTR save_end_subject = end_subject;
4650 USPTR new_start_match;
4651
4652 /* Reset the maximum number of extractions we might see. */
4653
4654 if (md->offset_vector != NULL)
4655 {
4656 register int *iptr = md->offset_vector;
4657 register int *iend = iptr + resetcount;
4658 while (iptr < iend) *iptr++ = -1;
4659 }
4660
4661 /* Advance to a unique first char if possible. If firstline is TRUE, the
4662 start of the match is constrained to the first line of a multiline string.
4663 That is, the match must be before or at the first newline. Implement this by
4664 temporarily adjusting end_subject so that we stop scanning at a newline. If
4665 the match fails at the newline, later code breaks this loop. */
4666
4667 if (firstline)
4668 {
4669 USPTR t = start_match;
4670 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4671 end_subject = t;
4672 }
4673
4674 /* Now test for a unique first byte */
4675
4676 if (first_byte >= 0)
4677 {
4678 if (first_byte_caseless)
4679 while (start_match < end_subject &&
4680 md->lcc[*start_match] != first_byte)
4681 { NEXTCHAR(start_match); }
4682 else
4683 while (start_match < end_subject && *start_match != first_byte)
4684 { NEXTCHAR(start_match); }
4685 }
4686
4687 /* Or to just after a linebreak for a multiline match if possible */
4688
4689 else if (startline)
4690 {
4691 if (start_match > md->start_subject + start_offset)
4692 {
4693 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4694 { NEXTCHAR(start_match); }
4695
4696 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4697 and we are now at a LF, advance the match position by one more character.
4698 */
4699
4700 if (start_match[-1] == '\r' &&
4701 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4702 start_match < end_subject &&
4703 *start_match == '\n')
4704 start_match++;
4705 }
4706 }
4707
4708 /* Or to a non-unique first char after study */
4709
4710 else if (start_bits != NULL)
4711 {
4712 while (start_match < end_subject)
4713 {
4714 register unsigned int c = *start_match;
4715 if ((start_bits[c/8] & (1 << (c&7))) == 0)
4716 { NEXTCHAR(start_match); }
4717 else break;
4718 }
4719 }
4720
4721 /* Restore fudged end_subject */
4722
4723 end_subject = save_end_subject;
4724
4725 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4726 printf(">>>> Match against: ");
4727 pchars(start_match, end_subject - start_match, TRUE, md);
4728 printf("\n");
4729 #endif
4730
4731 /* If req_byte is set, we know that that character must appear in the subject
4732 for the match to succeed. If the first character is set, req_byte must be
4733 later in the subject; otherwise the test starts at the match point. This
4734 optimization can save a huge amount of backtracking in patterns with nested
4735 unlimited repeats that aren't going to match. Writing separate code for
4736 cased/caseless versions makes it go faster, as does using an autoincrement
4737 and backing off on a match.
4738
4739 HOWEVER: when the subject string is very, very long, searching to its end can
4740 take a long time, and give bad performance on quite ordinary patterns. This
4741 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4742 string... so we don't do this when the string is sufficiently long.
4743
4744 ALSO: this processing is disabled when partial matching is requested.
4745 */
4746
4747 if (req_byte >= 0 &&
4748 end_subject - start_match < REQ_BYTE_MAX &&
4749 !md->partial)
4750 {
4751 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4752
4753 /* We don't need to repeat the search if we haven't yet reached the
4754 place we found it at last time. */
4755
4756 if (p > req_byte_ptr)
4757 {
4758 if (req_byte_caseless)
4759 {
4760 while (p < end_subject)
4761 {
4762 register int pp = *p++;
4763 if (pp == req_byte || pp == req_byte2) { p--; break; }
4764 }
4765 }
4766 else
4767 {
4768 while (p < end_subject)
4769 {
4770 if (*p++ == req_byte) { p--; break; }
4771 }
4772 }
4773
4774 /* If we can't find the required character, break the matching loop,
4775 forcing a match failure. */
4776
4777 if (p >= end_subject)
4778 {
4779 rc = MATCH_NOMATCH;
4780 break;
4781 }
4782
4783 /* If we have found the required character, save the point where we
4784 found it, so that we don't search again next time round the loop if
4785 the start hasn't passed this character yet. */
4786
4787 req_byte_ptr = p;
4788 }
4789 }
4790
4791 /* OK, we can now run the match. */
4792
4793 md->start_match_ptr = start_match;
4794 md->match_call_count = 0;
4795 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4796
4797 switch(rc)
4798 {
4799 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4800 exactly like PRUNE. */
4801
4802 case MATCH_NOMATCH:
4803 case MATCH_PRUNE:
4804 case MATCH_THEN:
4805 new_start_match = start_match + 1;
4806 #ifdef SUPPORT_UTF8
4807 if (utf8)
4808 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4809 new_start_match++;
4810 #endif
4811 break;
4812
4813 /* SKIP passes back the next starting point explicitly. */
4814
4815 case MATCH_SKIP:
4816 new_start_match = md->start_match_ptr;
4817 break;
4818
4819 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4820
4821 case MATCH_COMMIT:
4822 rc = MATCH_NOMATCH;
4823 goto ENDLOOP;
4824
4825 /* Any other return is some kind of error. */
4826
4827 default:
4828 goto ENDLOOP;
4829 }
4830
4831 /* Control reaches here for the various types of "no match at this point"
4832 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4833
4834 rc = MATCH_NOMATCH;
4835
4836 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4837 newline in the subject (though it may continue over the newline). Therefore,
4838 if we have just failed to match, starting at a newline, do not continue. */
4839
4840 if (firstline && IS_NEWLINE(start_match)) break;
4841
4842 /* Advance to new matching position */
4843
4844 start_match = new_start_match;
4845
4846 /* Break the loop if the pattern is anchored or if we have passed the end of
4847 the subject. */
4848
4849 if (anchored || start_match > end_subject) break;
4850
4851 /* If we have just passed a CR and we are now at a LF, and the pattern does
4852 not contain any explicit matches for \r or \n, and the newline option is CRLF
4853 or ANY or ANYCRLF, advance the match position by one more character. */
4854
4855 if (start_match[-1] == '\r' &&
4856 start_match < end_subject &&
4857 *start_match == '\n' &&
4858 (re->flags & PCRE_HASCRORLF) == 0 &&
4859 (md->nltype == NLTYPE_ANY ||
4860 md->nltype == NLTYPE_ANYCRLF ||
4861 md->nllen == 2))
4862 start_match++;
4863
4864 } /* End of for(;;) "bumpalong" loop */
4865
4866 /* ==========================================================================*/
4867
4868 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4869 conditions is true:
4870
4871 (1) The pattern is anchored or the match was failed by (*COMMIT);
4872
4873 (2) We are past the end of the subject;
4874
4875 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4876 this option requests that a match occur at or before the first newline in
4877 the subject.
4878
4879 When we have a match and the offset vector is big enough to deal with any
4880 backreferences, captured substring offsets will already be set up. In the case
4881 where we had to get some local store to hold offsets for backreference
4882 processing, copy those that we can. In this case there need not be overflow if
4883 certain parts of the pattern were not used, even though there are more
4884 capturing parentheses than vector slots. */
4885
4886 ENDLOOP:
4887
4888 if (rc == MATCH_MATCH)
4889 {
4890 if (using_temporary_offsets)
4891 {
4892 if (offsetcount >= 4)
4893 {
4894 memcpy(offsets + 2, md->offset_vector + 2,
4895 (offsetcount - 2) * sizeof(int));
4896 DPRINTF(("Copied offsets from temporary memory\n"));
4897 }
4898 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4899 DPRINTF(("Freeing temporary memory\n"));
4900 (pcre_free)(md->offset_vector);
4901 }
4902
4903 /* Set the return code to the number of captured strings, or 0 if there are
4904 too many to fit into the vector. */
4905
4906 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4907
4908 /* If there is space, set up the whole thing as substring 0. The value of
4909 md->start_match_ptr might be modified if \K was encountered on the successful
4910 matching path. */
4911
4912 if (offsetcount < 2) rc = 0; else
4913 {
4914 offsets[0] = md->start_match_ptr - md->start_subject;
4915 offsets[1] = md->end_match_ptr - md->start_subject;
4916 }
4917
4918 DPRINTF((">>>> returning %d\n", rc));
4919 return rc;
4920 }
4921
4922 /* Control gets here if there has been an error, or if the overall match
4923 attempt has failed at all permitted starting positions. */
4924
4925 if (using_temporary_offsets)
4926 {
4927 DPRINTF(("Freeing temporary memory\n"));
4928 (pcre_free)(md->offset_vector);
4929 }
4930
4931 if (rc != MATCH_NOMATCH)
4932 {
4933 DPRINTF((">>>> error: returning %d\n", rc));
4934 return rc;
4935 }
4936 else if (md->partial && md->hitend)
4937 {
4938 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4939 return PCRE_ERROR_PARTIAL;
4940 }
4941 else
4942 {
4943 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4944 return PCRE_ERROR_NOMATCH;
4945 }
4946 }
4947
4948 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12