/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 336 - (show annotations) (download)
Sat Apr 12 15:59:03 2008 UTC (6 years ago) by ph10
File MIME type: text/plain
File size: 150920 byte(s)
Added PCRE_JAVASCRIPT_COMPAT option.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Bits that may be set in the "flags" argument of match(). */

#define match_condassert   0x01   /* Evaluating an assertion used as a condition */
#define match_cbegroup     0x02   /* Unlimited repeat group that could be empty */

/* Ordinary (non-error) results of match(). Errors are reported with the
externally defined PCRE_ERROR_xxx codes, every one of which is negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Results of match() that are used only internally. They are given values that
are negative enough to keep clear of the external error codes. */

#define MATCH_COMMIT       (-999)
#define MATCH_PRUNE        (-998)
#define MATCH_SKIP         (-997)
#define MATCH_THEN         (-996)

/* Upper limit on the number of offset-vector ints that are saved on the stack
for a recursive call; a larger offset vector is saved with malloc instead. The
offset vector's length is always a multiple of 3, so this value should be one
as well. */

#define REC_STACK_SAVE_MAX 30
84
/* Minimum and maximum repeat counts for the common repeats. In rep_max a value
of 0 stands for "no upper limit". The two tables are parallel, one entry per
repeat kind. NOTE(review): presumably indexed by a repeat opcode's offset from
the first of the six simple repeat opcodes - confirm at the use sites. */

static const char rep_min[] = {
  0, 0,     /* minimum 0, no upper limit */
  1, 1,     /* minimum 1, no upper limit */
  0, 0      /* minimum 0, maximum 1 */
};

static const char rep_max[] = {
  0, 0,     /* 0 => infinity */
  0, 0,     /* 0 => infinity */
  1, 1      /* at most once */
};
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if the requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caselesss case for speed */
162
163 if ((ims & PCRE_CASELESS) != 0)
164 {
165 while (length-- > 0)
166 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
167 }
168 else
169 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
170
171 return TRUE;
172 }
173
174
175
176 /***************************************************************************
177 ****************************************************************************
178 RECURSION IN THE match() FUNCTION
179
180 The match() function is highly recursive, though not every recursive call
181 increases the recursive depth. Nevertheless, some regular expressions can cause
182 it to recurse to a great depth. I was writing for Unix, so I just let it call
183 itself recursively. This uses the stack for saving everything that has to be
184 saved for a recursive call. On Unix, the stack can be large, and this works
185 fine.
186
187 It turns out that on some non-Unix-like systems there are problems with
188 programs that use a lot of stack. (This despite the fact that every last chip
189 has oodles of memory these days, and techniques for extending the stack have
190 been known for decades.) So....
191
192 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
193 calls by keeping local variables that need to be preserved in blocks of memory
194 obtained from malloc() instead of on the stack. Macros are used to
195 achieve this so that the actual code doesn't look very different to what it
196 always used to.
197
198 The original heap-recursive code used longjmp(). However, it seems that this
199 can be very slow on some operating systems. Following a suggestion from Stan
200 Switzer, the use of longjmp() has been abolished, at the cost of having to
201 provide a unique number for each call to RMATCH. There is no way of generating
202 a sequence of numbers at compile time in C. I have given them names, to make
203 them stand out more clearly.
204
205 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
206 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
207 tests. Furthermore, not using longjmp() means that local dynamic variables
208 don't have indeterminate values; this has meant that the frame size can be
209 reduced because the result can be "passed back" by straight setting of the
210 variable instead of being passed in the frame.
211 ****************************************************************************
212 ***************************************************************************/
213
/* Identifying numbers for the RMATCH call sites, one per site, counting up
from 1. Any change to this list must be mirrored in the code at HEAP_RETURN
below, which has to be kept in sync with it. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10,    RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18,
  RM19,    RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27,
  RM28,    RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36,
  RM37,    RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45,
  RM46,    RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54
};
223
224 /* These versions of the macros use the stack, as normal. There are debugging
225 versions and production versions. Note that the "rw" argument of RMATCH isn't
226 actually used in this definition. */
227
228 #ifndef NO_RECURSE
/* With real recursion the hot arguments of match() can live in registers. */

#define REGISTER register

#ifdef DEBUG

/* Debugging flavour: a genuine recursive call to match(), bracketed by trace
output that names the source line of the call. The result is left in rrc. The
"rw" argument is accepted but plays no part here. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

/* Debugging flavour: report the value being returned and the source line, then
return it. */

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#else

/* Production flavour: RMATCH is a plain recursive call whose result is left in
rrc (with "rw" unused), and RRETURN is a plain return. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)

#define RRETURN(ra) return ra

#endif
248
249 #else
250
251
252 /* These versions of the macros manage a private stack on the heap. Note that
253 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
254 argument of match(), which never changes. */
255
/* The argument variables live in heap frames in this configuration, so the
"register" hint is dropped. */

#define REGISTER

/* Simulate a recursive call of match() without using the C stack. A new frame
is obtained from pcre_stack_malloc; the resume point "rw" is recorded in the
current frame's Xwhere; the "arguments" are copied into the new frame, which is
chained to the current one through Xprevframe; and control jumps to
HEAP_RECURSE. The label L_<rw> marks the place where execution continues once
the simulated call has returned (the jump back is made by the code at
HEAP_RETURN, which is outside this part of the file).

If the frame cannot be allocated, the current incarnation of match() gives up
with PCRE_ERROR_NOMEMORY instead of writing through a NULL pointer. RRETURN is
used for this, so the current frame is released and the error is handed back
in the same way as any other negative return.

The "rd" argument is not used: it is the md argument of match(), which never
changes. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* Simulate a return from match(). The current frame is freed and the previous
one becomes current. If there is a previous frame, the result is placed in rrc
and control goes to HEAP_RETURN; if the frame being released was the top-level
one (Xprevframe == NULL), this is a real return from the function. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
290
291
292 /* Structure for remembering the local variables in a private frame */
293
294 typedef struct heapframe {
295 struct heapframe *Xprevframe;
296
297 /* Function arguments that may change */
298
299 const uschar *Xeptr;
300 const uschar *Xecode;
301 const uschar *Xmstart;
302 int Xoffset_top;
303 long int Xims;
304 eptrblock *Xeptrb;
305 int Xflags;
306 unsigned int Xrdepth;
307
308 /* Function local variables */
309
310 const uschar *Xcallpat;
311 const uschar *Xcharptr;
312 const uschar *Xdata;
313 const uschar *Xnext;
314 const uschar *Xpp;
315 const uschar *Xprev;
316 const uschar *Xsaved_eptr;
317
318 recursion_info Xnew_recursive;
319
320 BOOL Xcur_is_word;
321 BOOL Xcondition;
322 BOOL Xprev_is_word;
323
324 unsigned long int Xoriginal_ims;
325
326 #ifdef SUPPORT_UCP
327 int Xprop_type;
328 int Xprop_value;
329 int Xprop_fail_result;
330 int Xprop_category;
331 int Xprop_chartype;
332 int Xprop_script;
333 int Xoclength;
334 uschar Xocchars[8];
335 #endif
336
337 int Xctype;
338 unsigned int Xfc;
339 int Xfi;
340 int Xlength;
341 int Xmax;
342 int Xmin;
343 int Xnumber;
344 int Xoffset;
345 int Xop;
346 int Xsave_capture_last;
347 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
348 int Xstacksave[REC_STACK_SAVE_MAX];
349
350 eptrblock Xnewptrb;
351
352 /* Where to jump back to */
353
354 int Xwhere;
355
356 } heapframe;
357
358 #endif
359
360
361 /***************************************************************************
362 ***************************************************************************/
363
364
365
366 /*************************************************
367 * Match from current position *
368 *************************************************/
369
370 /* This function is called recursively in many circumstances. Whenever it
371 returns a negative (error) response, the outer incarnation must also return the
372 same response.
373
374 Performance note: It might be tempting to extract commonly used fields from the
375 md structure (e.g. utf8, end_subject) into individual variables to improve
376 performance. Tests using gcc on a SPARC disproved this; in the first case, it
377 made performance worse.
378
379 Arguments:
380 eptr pointer to current character in subject
381 ecode pointer to current position in compiled code
382 mstart pointer to the current match start position (can be modified
383 by encountering \K)
384 offset_top current top pointer
385 md pointer to "static" info for the match
386 ims current /i, /m, and /s options
387 eptrb pointer to chain of blocks containing eptr at start of
388 brackets - for testing for empty matches
389 flags can contain
390 match_condassert - this is an assertion condition
391 match_cbegroup - this is the start of an unlimited repeat
392 group that can match an empty string
393 rdepth the recursion depth
394
395 Returns: MATCH_MATCH if matched ) these values are >= 0
396 MATCH_NOMATCH if failed to match )
397 a negative PCRE_ERROR_xxx value if aborted by an error condition
398 (e.g. stopped by repeated call or recursion limit)
399 */
400
401 static int
402 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
403 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
404 int flags, unsigned int rdepth)
405 {
406 /* These variables do not need to be preserved over recursion in this function,
407 so they can be ordinary variables in all cases. Mark some of them with
408 "register" because they are used a lot in loops. */
409
410 register int rrc; /* Returns from recursive calls */
411 register int i; /* Used for loops not involving calls to RMATCH() */
412 register unsigned int c; /* Character values not kept over RMATCH() calls */
413 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
414
415 BOOL minimize, possessive; /* Quantifier options */
416
417 /* When recursion is not being used, all "local" variables that have to be
418 preserved over calls to RMATCH() are part of a "frame" which is obtained from
419 heap storage. Set up the top-level frame here; others are obtained from the
420 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
421
422 #ifdef NO_RECURSE
423 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
424 frame->Xprevframe = NULL; /* Marks the top level */
425
426 /* Copy in the original argument variables */
427
428 frame->Xeptr = eptr;
429 frame->Xecode = ecode;
430 frame->Xmstart = mstart;
431 frame->Xoffset_top = offset_top;
432 frame->Xims = ims;
433 frame->Xeptrb = eptrb;
434 frame->Xflags = flags;
435 frame->Xrdepth = rdepth;
436
437 /* This is where control jumps back to in order to effect "recursion" */
438
439 HEAP_RECURSE:
440
441 /* Macros make the argument variables come from the current frame */
442
443 #define eptr frame->Xeptr
444 #define ecode frame->Xecode
445 #define mstart frame->Xmstart
446 #define offset_top frame->Xoffset_top
447 #define ims frame->Xims
448 #define eptrb frame->Xeptrb
449 #define flags frame->Xflags
450 #define rdepth frame->Xrdepth
451
452 /* Ditto for the local variables */
453
454 #ifdef SUPPORT_UTF8
455 #define charptr frame->Xcharptr
456 #endif
457 #define callpat frame->Xcallpat
458 #define data frame->Xdata
459 #define next frame->Xnext
460 #define pp frame->Xpp
461 #define prev frame->Xprev
462 #define saved_eptr frame->Xsaved_eptr
463
464 #define new_recursive frame->Xnew_recursive
465
466 #define cur_is_word frame->Xcur_is_word
467 #define condition frame->Xcondition
468 #define prev_is_word frame->Xprev_is_word
469
470 #define original_ims frame->Xoriginal_ims
471
472 #ifdef SUPPORT_UCP
473 #define prop_type frame->Xprop_type
474 #define prop_value frame->Xprop_value
475 #define prop_fail_result frame->Xprop_fail_result
476 #define prop_category frame->Xprop_category
477 #define prop_chartype frame->Xprop_chartype
478 #define prop_script frame->Xprop_script
479 #define oclength frame->Xoclength
480 #define occhars frame->Xocchars
481 #endif
482
483 #define ctype frame->Xctype
484 #define fc frame->Xfc
485 #define fi frame->Xfi
486 #define length frame->Xlength
487 #define max frame->Xmax
488 #define min frame->Xmin
489 #define number frame->Xnumber
490 #define offset frame->Xoffset
491 #define op frame->Xop
492 #define save_capture_last frame->Xsave_capture_last
493 #define save_offset1 frame->Xsave_offset1
494 #define save_offset2 frame->Xsave_offset2
495 #define save_offset3 frame->Xsave_offset3
496 #define stacksave frame->Xstacksave
497
498 #define newptrb frame->Xnewptrb
499
500 /* When recursion is being used, local variables are allocated on the stack and
501 get preserved during recursion in the normal way. In this environment, fi and
502 i, and fc and c, can be the same variables. */
503
504 #else /* NO_RECURSE not defined */
505 #define fi i
506 #define fc c
507
508
509 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
510 const uschar *charptr; /* in small blocks of the code. My normal */
511 #endif /* style of coding would have declared */
512 const uschar *callpat; /* them within each of those blocks. */
513 const uschar *data; /* However, in order to accommodate the */
514 const uschar *next; /* version of this code that uses an */
515 USPTR pp; /* external "stack" implemented on the */
516 const uschar *prev; /* heap, it is easier to declare them all */
517 USPTR saved_eptr; /* here, so the declarations can be cut */
518 /* out in a block. The only declarations */
519 recursion_info new_recursive; /* within blocks below are for variables */
520 /* that do not have to be preserved over */
521 BOOL cur_is_word; /* a recursive call to RMATCH(). */
522 BOOL condition;
523 BOOL prev_is_word;
524
525 unsigned long int original_ims;
526
527 #ifdef SUPPORT_UCP
528 int prop_type;
529 int prop_value;
530 int prop_fail_result;
531 int prop_category;
532 int prop_chartype;
533 int prop_script;
534 int oclength;
535 uschar occhars[8];
536 #endif
537
538 int ctype;
539 int length;
540 int max;
541 int min;
542 int number;
543 int offset;
544 int op;
545 int save_capture_last;
546 int save_offset1, save_offset2, save_offset3;
547 int stacksave[REC_STACK_SAVE_MAX];
548
549 eptrblock newptrb;
550 #endif /* NO_RECURSE */
551
552 /* These statements are here to stop the compiler complaining about uninitialized
553 variables. */
554
555 #ifdef SUPPORT_UCP
556 prop_value = 0;
557 prop_fail_result = 0;
558 #endif
559
560
561 /* This label is used for tail recursion, which is used in a few cases even
562 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
563 used. Thanks to Ian Taylor for noticing this possibility and sending the
564 original patch. */
565
566 TAIL_RECURSE:
567
568 /* OK, now we can get on with the real code of the function. Recursive calls
569 are specified by the macro RMATCH and RRETURN is used to return. When
570 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
571 and a "return", respectively (possibly with some debugging if DEBUG is
572 defined). However, RMATCH isn't like a function call because it's quite a
573 complicated macro. It has to be used in one particular way. This shouldn't,
574 however, impact performance when true recursion is being used. */
575
576 #ifdef SUPPORT_UTF8
577 utf8 = md->utf8; /* Local copy of the flag */
578 #else
579 utf8 = FALSE;
580 #endif
581
582 /* First check that we haven't called match() too many times, or that we
583 haven't exceeded the recursive call limit. */
584
585 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
586 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
587
588 original_ims = ims; /* Save for resetting on ')' */
589
590 /* At the start of a group with an unlimited repeat that may match an empty
591 string, the match_cbegroup flag is set. When this is the case, add the current
592 subject pointer to the chain of such remembered pointers, to be checked when we
593 hit the closing ket, in order to break infinite loops that match no characters.
594 When match() is called in other circumstances, don't add to the chain. The
595 match_cbegroup flag must NOT be used with tail recursion, because the memory
596 block that is used is on the stack, so a new one may be required for each
597 match(). */
598
599 if ((flags & match_cbegroup) != 0)
600 {
601 newptrb.epb_saved_eptr = eptr;
602 newptrb.epb_prev = eptrb;
603 eptrb = &newptrb;
604 }
605
606 /* Now start processing the opcodes. */
607
608 for (;;)
609 {
610 minimize = possessive = FALSE;
611 op = *ecode;
612
613 /* For partial matching, remember if we ever hit the end of the subject after
614 matching at least one subject character. */
615
616 if (md->partial &&
617 eptr >= md->end_subject &&
618 eptr > mstart)
619 md->hitend = TRUE;
620
621 switch(op)
622 {
623 case OP_FAIL:
624 RRETURN(MATCH_NOMATCH);
625
626 case OP_PRUNE:
627 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
628 ims, eptrb, flags, RM51);
629 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
630 RRETURN(MATCH_PRUNE);
631
632 case OP_COMMIT:
633 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
634 ims, eptrb, flags, RM52);
635 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
636 RRETURN(MATCH_COMMIT);
637
638 case OP_SKIP:
639 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
640 ims, eptrb, flags, RM53);
641 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
642 md->start_match_ptr = eptr; /* Pass back current position */
643 RRETURN(MATCH_SKIP);
644
645 case OP_THEN:
646 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
647 ims, eptrb, flags, RM54);
648 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
649 RRETURN(MATCH_THEN);
650
651 /* Handle a capturing bracket. If there is space in the offset vector, save
652 the current subject position in the working slot at the top of the vector.
653 We mustn't change the current values of the data slot, because they may be
654 set from a previous iteration of this group, and be referred to by a
655 reference inside the group.
656
657 If the bracket fails to match, we need to restore this value and also the
658 values of the final offsets, in case they were set by a previous iteration
659 of the same bracket.
660
661 If there isn't enough space in the offset vector, treat this as if it were
662 a non-capturing bracket. Don't worry about setting the flag for the error
663 case here; that is handled in the code for KET. */
664
665 case OP_CBRA:
666 case OP_SCBRA:
667 number = GET2(ecode, 1+LINK_SIZE);
668 offset = number << 1;
669
670 #ifdef DEBUG
671 printf("start bracket %d\n", number);
672 printf("subject=");
673 pchars(eptr, 16, TRUE, md);
674 printf("\n");
675 #endif
676
677 if (offset < md->offset_max)
678 {
679 save_offset1 = md->offset_vector[offset];
680 save_offset2 = md->offset_vector[offset+1];
681 save_offset3 = md->offset_vector[md->offset_end - number];
682 save_capture_last = md->capture_last;
683
684 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
685 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
686
687 flags = (op == OP_SCBRA)? match_cbegroup : 0;
688 do
689 {
690 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
691 ims, eptrb, flags, RM1);
692 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
693 md->capture_last = save_capture_last;
694 ecode += GET(ecode, 1);
695 }
696 while (*ecode == OP_ALT);
697
698 DPRINTF(("bracket %d failed\n", number));
699
700 md->offset_vector[offset] = save_offset1;
701 md->offset_vector[offset+1] = save_offset2;
702 md->offset_vector[md->offset_end - number] = save_offset3;
703
704 RRETURN(MATCH_NOMATCH);
705 }
706
707 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
708 as a non-capturing bracket. */
709
710 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
711 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
712
713 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
714
715 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
716 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
717
718 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
719 final alternative within the brackets, we would return the result of a
720 recursive call to match() whatever happened. We can reduce stack usage by
721 turning this into a tail recursion, except in the case when match_cbegroup
722 is set.*/
723
724 case OP_BRA:
725 case OP_SBRA:
726 DPRINTF(("start non-capturing bracket\n"));
727 flags = (op >= OP_SBRA)? match_cbegroup : 0;
728 for (;;)
729 {
730 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
731 {
732 if (flags == 0) /* Not a possibly empty group */
733 {
734 ecode += _pcre_OP_lengths[*ecode];
735 DPRINTF(("bracket 0 tail recursion\n"));
736 goto TAIL_RECURSE;
737 }
738
739 /* Possibly empty group; can't use tail recursion. */
740
741 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
742 eptrb, flags, RM48);
743 RRETURN(rrc);
744 }
745
746 /* For non-final alternatives, continue the loop for a NOMATCH result;
747 otherwise return. */
748
749 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
750 eptrb, flags, RM2);
751 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
752 ecode += GET(ecode, 1);
753 }
754 /* Control never reaches here. */
755
756 /* Conditional group: compilation checked that there are no more than
757 two branches. If the condition is false, skipping the first branch takes us
758 past the end if there is only one branch, but that's OK because that is
759 exactly what going to the ket would do. As there is only one branch to be
760 obeyed, we can use tail recursion to avoid using another stack frame. */
761
762 case OP_COND:
763 case OP_SCOND:
764 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
765 {
766 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
767 condition = md->recursive != NULL &&
768 (offset == RREF_ANY || offset == md->recursive->group_num);
769 ecode += condition? 3 : GET(ecode, 1);
770 }
771
772 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
773 {
774 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
775 condition = offset < offset_top && md->offset_vector[offset] >= 0;
776 ecode += condition? 3 : GET(ecode, 1);
777 }
778
779 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
780 {
781 condition = FALSE;
782 ecode += GET(ecode, 1);
783 }
784
785 /* The condition is an assertion. Call match() to evaluate it - setting
786 the final argument match_condassert causes it to stop at the end of an
787 assertion. */
788
789 else
790 {
791 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
792 match_condassert, RM3);
793 if (rrc == MATCH_MATCH)
794 {
795 condition = TRUE;
796 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
797 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
798 }
799 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
800 {
801 RRETURN(rrc); /* Need braces because of following else */
802 }
803 else
804 {
805 condition = FALSE;
806 ecode += GET(ecode, 1);
807 }
808 }
809
810 /* We are now at the branch that is to be obeyed. As there is only one,
811 we can use tail recursion to avoid using another stack frame, except when
812 match_cbegroup is required for an unlimited repeat of a possibly empty
813 group. If the second alternative doesn't exist, we can just plough on. */
814
815 if (condition || *ecode == OP_ALT)
816 {
817 ecode += 1 + LINK_SIZE;
818 if (op == OP_SCOND) /* Possibly empty group */
819 {
820 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
821 RRETURN(rrc);
822 }
823 else /* Group must match something */
824 {
825 flags = 0;
826 goto TAIL_RECURSE;
827 }
828 }
829 else /* Condition false & no 2nd alternative */
830 {
831 ecode += 1 + LINK_SIZE;
832 }
833 break;
834
835
836 /* End of the pattern, either real or forced. If we are in a top-level
837 recursion, we should restore the offsets appropriately and continue from
838 after the call. */
839
840 case OP_ACCEPT:
841 case OP_END:
842 if (md->recursive != NULL && md->recursive->group_num == 0)
843 {
844 recursion_info *rec = md->recursive;
845 DPRINTF(("End of pattern in a (?0) recursion\n"));
846 md->recursive = rec->prevrec;
847 memmove(md->offset_vector, rec->offset_save,
848 rec->saved_max * sizeof(int));
849 mstart = rec->save_start;
850 ims = original_ims;
851 ecode = rec->after_call;
852 break;
853 }
854
855 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
856 string - backtracking will then try other alternatives, if any. */
857
858 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
859 md->end_match_ptr = eptr; /* Record where we ended */
860 md->end_offset_top = offset_top; /* and how many extracts were taken */
861 md->start_match_ptr = mstart; /* and the start (\K can modify) */
862 RRETURN(MATCH_MATCH);
863
864 /* Change option settings */
865
866 case OP_OPT:
867 ims = ecode[1];
868 ecode += 2;
869 DPRINTF(("ims set to %02lx\n", ims));
870 break;
871
872 /* Assertion brackets. Check the alternative branches in turn - the
873 matching won't pass the KET for an assertion. If any one branch matches,
874 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
875 start of each branch to move the current point backwards, so the code at
876 this level is identical to the lookahead case. */
877
878 case OP_ASSERT:
879 case OP_ASSERTBACK:
880 do
881 {
882 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
883 RM4);
884 if (rrc == MATCH_MATCH) break;
885 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
886 ecode += GET(ecode, 1);
887 }
888 while (*ecode == OP_ALT);
889 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
890
891 /* If checking an assertion for a condition, return MATCH_MATCH. */
892
893 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
894
895 /* Continue from after the assertion, updating the offsets high water
896 mark, since extracts may have been taken during the assertion. */
897
898 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
899 ecode += 1 + LINK_SIZE;
900 offset_top = md->end_offset_top;
901 continue;
902
903 /* Negative assertion: all branches must fail to match */
904
905 case OP_ASSERT_NOT:
906 case OP_ASSERTBACK_NOT:
907 do
908 {
909 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
910 RM5);
911 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
912 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
913 ecode += GET(ecode,1);
914 }
915 while (*ecode == OP_ALT);
916
917 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
918
919 ecode += 1 + LINK_SIZE;
920 continue;
921
922 /* Move the subject pointer back. This occurs only at the start of
923 each branch of a lookbehind assertion. If we are too close to the start to
924 move back, this match function fails. When working with UTF-8 we move
925 back a number of characters, not bytes. */
926
927 case OP_REVERSE:
928 #ifdef SUPPORT_UTF8
929 if (utf8)
930 {
931 i = GET(ecode, 1);
932 while (i-- > 0)
933 {
934 eptr--;
935 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
936 BACKCHAR(eptr);
937 }
938 }
939 else
940 #endif
941
942 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
943
944 {
945 eptr -= GET(ecode, 1);
946 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
947 }
948
949 /* Skip to next op code */
950
951 ecode += 1 + LINK_SIZE;
952 break;
953
954 /* The callout item calls an external function, if one is provided, passing
955 details of the match so far. This is mainly for debugging, though the
956 function is able to force a failure. */
957
958 case OP_CALLOUT:
959 if (pcre_callout != NULL)
960 {
961 pcre_callout_block cb;
962 cb.version = 1; /* Version 1 of the callout block */
963 cb.callout_number = ecode[1];
964 cb.offset_vector = md->offset_vector;
965 cb.subject = (PCRE_SPTR)md->start_subject;
966 cb.subject_length = md->end_subject - md->start_subject;
967 cb.start_match = mstart - md->start_subject;
968 cb.current_position = eptr - md->start_subject;
969 cb.pattern_position = GET(ecode, 2);
970 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
971 cb.capture_top = offset_top/2;
972 cb.capture_last = md->capture_last;
973 cb.callout_data = md->callout_data;
974 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
975 if (rrc < 0) RRETURN(rrc);
976 }
977 ecode += 2 + 2*LINK_SIZE;
978 break;
979
980 /* Recursion either matches the current regex, or some subexpression. The
981 offset data is the offset to the starting bracket from the start of the
982 whole pattern. (This is so that it works from duplicated subpatterns.)
983
984 If there are any capturing brackets started but not finished, we have to
985 save their starting points and reinstate them after the recursion. However,
986 we don't know how many such there are (offset_top records the completed
987 total) so we just have to save all the potential data. There may be up to
988 65535 such values, which is too large to put on the stack, but using malloc
989 for small numbers seems expensive. As a compromise, the stack is used when
990 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
991 is used. If the malloc fails, the recursion is not attempted: the match is
992 abandoned with PCRE_ERROR_NOMEMORY before any of the offset data has been
993 copied.
994
995 There are also other values that have to be saved. We use a chained
996 sequence of blocks that actually live on the stack. Thanks to Robin Houston
997 for the original version of this logic. */
998
999 case OP_RECURSE:
1000 {
1001 callpat = md->start_code + GET(ecode, 1);
1002 new_recursive.group_num = (callpat == md->start_code)? 0 :
1003 GET2(callpat, 1 + LINK_SIZE);
1004
1005 /* Add to "recursing stack" */
1006
1007 new_recursive.prevrec = md->recursive;
1008 md->recursive = &new_recursive;
1009
1010 /* Find where to continue from afterwards */
1011
1012 ecode += 1 + LINK_SIZE;
1013 new_recursive.after_call = ecode;
1014
1015 /* Now save the offset data. */
1016
1017 new_recursive.saved_max = md->offset_end;
1018 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1019 new_recursive.offset_save = stacksave;
1020 else
1021 {
1022 new_recursive.offset_save =
1023 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1024 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1025 }
1026
1027 memcpy(new_recursive.offset_save, md->offset_vector,
1028 new_recursive.saved_max * sizeof(int));
1029 new_recursive.save_start = mstart;
1030 mstart = eptr;
1031
1032 /* OK, now we can do the recursion. For each top-level alternative we
1033 restore the offset and recursion data. */
1034
1035 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1036 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1037 do
1038 {
1039 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1040 md, ims, eptrb, flags, RM6);
1041 if (rrc == MATCH_MATCH)
1042 {
1043 DPRINTF(("Recursion matched\n"));
1044 md->recursive = new_recursive.prevrec;
1045 if (new_recursive.offset_save != stacksave)
1046 (pcre_free)(new_recursive.offset_save);
1047 RRETURN(MATCH_MATCH);
1048 }
1049 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1050 {
1051 DPRINTF(("Recursion gave error %d\n", rrc));
1052 RRETURN(rrc);
1053 }
1054
1055 md->recursive = &new_recursive;
1056 memcpy(md->offset_vector, new_recursive.offset_save,
1057 new_recursive.saved_max * sizeof(int));
1058 callpat += GET(callpat, 1);
1059 }
1060 while (*callpat == OP_ALT);
1061
1062 DPRINTF(("Recursion didn't match\n"));
1063 md->recursive = new_recursive.prevrec;
1064 if (new_recursive.offset_save != stacksave)
1065 (pcre_free)(new_recursive.offset_save);
1066 RRETURN(MATCH_NOMATCH);
1067 }
1068 /* Control never reaches here */
1069
1070 /* "Once" brackets are like assertion brackets except that after a match,
1071 the point in the subject string is not moved back. Thus there can never be
1072 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1073 Check the alternative branches in turn - the matching won't pass the KET
1074 for this kind of subpattern. If any one branch matches, we carry on as at
1075 the end of a normal bracket, leaving the subject pointer. */
1076
1077 case OP_ONCE:
1078 prev = ecode;
1079 saved_eptr = eptr;
1080
1081 do
1082 {
1083 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1084 if (rrc == MATCH_MATCH) break;
1085 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1086 ecode += GET(ecode,1);
1087 }
1088 while (*ecode == OP_ALT);
1089
1090 /* If we hit the end of the group (which could be repeated), fail */
1091
1092 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1093
1094 /* Continue as from after the assertion, updating the offsets high water
1095 mark, since extracts may have been taken. */
1096
1097 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1098
1099 offset_top = md->end_offset_top;
1100 eptr = md->end_match_ptr;
1101
1102 /* For a non-repeating ket, just continue at this level. This also
1103 happens for a repeating ket if no characters were matched in the group.
1104 This is the forcible breaking of infinite loops as implemented in Perl
1105 5.005. If there is an options reset, it will get obeyed in the normal
1106 course of events. */
1107
1108 if (*ecode == OP_KET || eptr == saved_eptr)
1109 {
1110 ecode += 1+LINK_SIZE;
1111 break;
1112 }
1113
1114 /* The repeating kets try the rest of the pattern or restart from the
1115 preceding bracket, in the appropriate order. The second "call" of match()
1116 uses tail recursion, to avoid using another stack frame. We need to reset
1117 any options that changed within the bracket before re-running it, so
1118 check the next opcode. */
1119
1120 if (ecode[1+LINK_SIZE] == OP_OPT)
1121 {
1122 ims = (ims & ~PCRE_IMS) | ecode[4];
1123 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1124 }
1125
1126 if (*ecode == OP_KETRMIN)
1127 {
1128 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1129 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1130 ecode = prev;
1131 flags = 0;
1132 goto TAIL_RECURSE;
1133 }
1134 else /* OP_KETRMAX */
1135 {
1136 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1137 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1138 ecode += 1 + LINK_SIZE;
1139 flags = 0;
1140 goto TAIL_RECURSE;
1141 }
1142 /* Control never gets here */
1143
1144 /* An alternation is the end of a branch; scan along to find the end of the
1145 bracketed group and go to there. */
1146
1147 case OP_ALT:
1148 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1149 break;
1150
1151 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1152 indicating that it may occur zero times. It may repeat infinitely, or not
1153 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1154 with fixed upper repeat limits are compiled as a number of copies, with the
1155 optional ones preceded by BRAZERO or BRAMINZERO. */
1156
1157 case OP_BRAZERO:
1158 {
1159 next = ecode+1;
1160 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1161 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1162 do next += GET(next,1); while (*next == OP_ALT);
1163 ecode = next + 1 + LINK_SIZE;
1164 }
1165 break;
1166
1167 case OP_BRAMINZERO:
1168 {
1169 next = ecode+1;
1170 do next += GET(next, 1); while (*next == OP_ALT);
1171 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1172 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1173 ecode++;
1174 }
1175 break;
1176
1177 case OP_SKIPZERO:
1178 {
1179 next = ecode+1;
1180 do next += GET(next,1); while (*next == OP_ALT);
1181 ecode = next + 1 + LINK_SIZE;
1182 }
1183 break;
1184
1185 /* End of a group, repeated or non-repeating. */
1186
1187 case OP_KET:
1188 case OP_KETRMIN:
1189 case OP_KETRMAX:
1190 prev = ecode - GET(ecode, 1);
1191
1192 /* If this was a group that remembered the subject start, in order to break
1193 infinite repeats of empty string matches, retrieve the subject start from
1194 the chain. Otherwise, set it NULL. */
1195
1196 if (*prev >= OP_SBRA)
1197 {
1198 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1199 eptrb = eptrb->epb_prev; /* Backup to previous group */
1200 }
1201 else saved_eptr = NULL;
1202
1203 /* If we are at the end of an assertion group, stop matching and return
1204 MATCH_MATCH, but record the current high water mark for use by positive
1205 assertions. Do this also for the "once" (atomic) groups. */
1206
1207 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1208 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1209 *prev == OP_ONCE)
1210 {
1211 md->end_match_ptr = eptr; /* For ONCE */
1212 md->end_offset_top = offset_top;
1213 RRETURN(MATCH_MATCH);
1214 }
1215
1216 /* For capturing groups we have to check the group number back at the start
1217 and if necessary complete handling an extraction by setting the offsets and
1218 bumping the high water mark. Note that whole-pattern recursion is coded as
1219 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1220 when the OP_END is reached. Other recursion is handled here. */
1221
1222 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1223 {
1224 number = GET2(prev, 1+LINK_SIZE);
1225 offset = number << 1;
1226
1227 #ifdef DEBUG
1228 printf("end bracket %d", number);
1229 printf("\n");
1230 #endif
1231
1232 md->capture_last = number;
1233 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1234 {
1235 md->offset_vector[offset] =
1236 md->offset_vector[md->offset_end - number];
1237 md->offset_vector[offset+1] = eptr - md->start_subject;
1238 if (offset_top <= offset) offset_top = offset + 2;
1239 }
1240
1241 /* Handle a recursively called group. Restore the offsets
1242 appropriately and continue from after the call. */
1243
1244 if (md->recursive != NULL && md->recursive->group_num == number)
1245 {
1246 recursion_info *rec = md->recursive;
1247 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1248 md->recursive = rec->prevrec;
1249 mstart = rec->save_start;
1250 memcpy(md->offset_vector, rec->offset_save,
1251 rec->saved_max * sizeof(int));
1252 ecode = rec->after_call;
1253 ims = original_ims;
1254 break;
1255 }
1256 }
1257
1258 /* For both capturing and non-capturing groups, reset the value of the ims
1259 flags, in case they got changed during the group. */
1260
1261 ims = original_ims;
1262 DPRINTF(("ims reset to %02lx\n", ims));
1263
1264 /* For a non-repeating ket, just continue at this level. This also
1265 happens for a repeating ket if no characters were matched in the group.
1266 This is the forcible breaking of infinite loops as implemented in Perl
1267 5.005. If there is an options reset, it will get obeyed in the normal
1268 course of events. */
1269
1270 if (*ecode == OP_KET || eptr == saved_eptr)
1271 {
1272 ecode += 1 + LINK_SIZE;
1273 break;
1274 }
1275
1276 /* The repeating kets try the rest of the pattern or restart from the
1277 preceding bracket, in the appropriate order. In the second case, we can use
1278 tail recursion to avoid using another stack frame, unless we have an
1279 unlimited repeat of a group that can match an empty string. */
1280
1281 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1282
1283 if (*ecode == OP_KETRMIN)
1284 {
1285 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1286 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1287 if (flags != 0) /* Could match an empty string */
1288 {
1289 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1290 RRETURN(rrc);
1291 }
1292 ecode = prev;
1293 goto TAIL_RECURSE;
1294 }
1295 else /* OP_KETRMAX */
1296 {
1297 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1298 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1299 ecode += 1 + LINK_SIZE;
1300 flags = 0;
1301 goto TAIL_RECURSE;
1302 }
1303 /* Control never gets here */
1304
1305 /* Start of subject unless notbol, or after internal newline if multiline */
1306
1307 case OP_CIRC:
1308 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1309 if ((ims & PCRE_MULTILINE) != 0)
1310 {
1311 if (eptr != md->start_subject &&
1312 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1313 RRETURN(MATCH_NOMATCH);
1314 ecode++;
1315 break;
1316 }
1317 /* ... else fall through */
1318
1319 /* Start of subject assertion */
1320
1321 case OP_SOD:
1322 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1323 ecode++;
1324 break;
1325
1326 /* Start of match assertion */
1327
1328 case OP_SOM:
1329 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1330 ecode++;
1331 break;
1332
1333 /* Reset the start of match point */
1334
1335 case OP_SET_SOM:
1336 mstart = eptr;
1337 ecode++;
1338 break;
1339
1340 /* Assert before internal newline if multiline, or before a terminating
1341 newline unless endonly is set, else end of subject unless noteol is set. */
1342
1343 case OP_DOLL:
1344 if ((ims & PCRE_MULTILINE) != 0)
1345 {
1346 if (eptr < md->end_subject)
1347 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1348 else
1349 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1350 ecode++;
1351 break;
1352 }
1353 else
1354 {
1355 if (md->noteol) RRETURN(MATCH_NOMATCH);
1356 if (!md->endonly)
1357 {
1358 if (eptr != md->end_subject &&
1359 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1360 RRETURN(MATCH_NOMATCH);
1361 ecode++;
1362 break;
1363 }
1364 }
1365 /* ... else fall through for endonly */
1366
1367 /* End of subject assertion (\z) */
1368
1369 case OP_EOD:
1370 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1371 ecode++;
1372 break;
1373
1374 /* End of subject or ending \n assertion (\Z) */
1375
1376 case OP_EODN:
1377 if (eptr != md->end_subject &&
1378 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1379 RRETURN(MATCH_NOMATCH);
1380 ecode++;
1381 break;
1382
1383 /* Word boundary assertions */
1384
1385 case OP_NOT_WORD_BOUNDARY:
1386 case OP_WORD_BOUNDARY:
1387 {
1388
1389 /* Find out if the previous and current characters are "word" characters.
1390 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1391 be "non-word" characters. */
1392
1393 #ifdef SUPPORT_UTF8
1394 if (utf8)
1395 {
1396 if (eptr == md->start_subject) prev_is_word = FALSE; else
1397 {
1398 const uschar *lastptr = eptr - 1;
1399 while((*lastptr & 0xc0) == 0x80) lastptr--;
1400 GETCHAR(c, lastptr);
1401 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1402 }
1403 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1404 {
1405 GETCHAR(c, eptr);
1406 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1407 }
1408 }
1409 else
1410 #endif
1411
1412 /* More streamlined when not in UTF-8 mode */
1413
1414 {
1415 prev_is_word = (eptr != md->start_subject) &&
1416 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1417 cur_is_word = (eptr < md->end_subject) &&
1418 ((md->ctypes[*eptr] & ctype_word) != 0);
1419 }
1420
1421 /* Now see if the situation is what we want */
1422
1423 if ((*ecode++ == OP_WORD_BOUNDARY)?
1424 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1425 RRETURN(MATCH_NOMATCH);
1426 }
1427 break;
1428
1429 /* Match a single character type; inline for speed */
1430
1431 case OP_ANY:
1432 if ((ims & PCRE_DOTALL) == 0)
1433 {
1434 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1435 }
1436 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1437 if (utf8)
1438 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1439 ecode++;
1440 break;
1441
1442 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1443 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1444
1445 case OP_ANYBYTE:
1446 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1447 ecode++;
1448 break;
1449
1450 case OP_NOT_DIGIT:
1451 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1452 GETCHARINCTEST(c, eptr);
1453 if (
1454 #ifdef SUPPORT_UTF8
1455 c < 256 &&
1456 #endif
1457 (md->ctypes[c] & ctype_digit) != 0
1458 )
1459 RRETURN(MATCH_NOMATCH);
1460 ecode++;
1461 break;
1462
1463 case OP_DIGIT:
1464 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1465 GETCHARINCTEST(c, eptr);
1466 if (
1467 #ifdef SUPPORT_UTF8
1468 c >= 256 ||
1469 #endif
1470 (md->ctypes[c] & ctype_digit) == 0
1471 )
1472 RRETURN(MATCH_NOMATCH);
1473 ecode++;
1474 break;
1475
1476 case OP_NOT_WHITESPACE:
1477 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1478 GETCHARINCTEST(c, eptr);
1479 if (
1480 #ifdef SUPPORT_UTF8
1481 c < 256 &&
1482 #endif
1483 (md->ctypes[c] & ctype_space) != 0
1484 )
1485 RRETURN(MATCH_NOMATCH);
1486 ecode++;
1487 break;
1488
1489 case OP_WHITESPACE:
1490 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1491 GETCHARINCTEST(c, eptr);
1492 if (
1493 #ifdef SUPPORT_UTF8
1494 c >= 256 ||
1495 #endif
1496 (md->ctypes[c] & ctype_space) == 0
1497 )
1498 RRETURN(MATCH_NOMATCH);
1499 ecode++;
1500 break;
1501
1502 case OP_NOT_WORDCHAR:
1503 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1504 GETCHARINCTEST(c, eptr);
1505 if (
1506 #ifdef SUPPORT_UTF8
1507 c < 256 &&
1508 #endif
1509 (md->ctypes[c] & ctype_word) != 0
1510 )
1511 RRETURN(MATCH_NOMATCH);
1512 ecode++;
1513 break;
1514
1515 case OP_WORDCHAR:
1516 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1517 GETCHARINCTEST(c, eptr);
1518 if (
1519 #ifdef SUPPORT_UTF8
1520 c >= 256 ||
1521 #endif
1522 (md->ctypes[c] & ctype_word) == 0
1523 )
1524 RRETURN(MATCH_NOMATCH);
1525 ecode++;
1526 break;
1527
1528 case OP_ANYNL:
1529 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1530 GETCHARINCTEST(c, eptr);
1531 switch(c)
1532 {
1533 default: RRETURN(MATCH_NOMATCH);
1534 case 0x000d:
1535 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1536 break;
1537
1538 case 0x000a:
1539 break;
1540
1541 case 0x000b:
1542 case 0x000c:
1543 case 0x0085:
1544 case 0x2028:
1545 case 0x2029:
1546 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1547 break;
1548 }
1549 ecode++;
1550 break;
1551
1552 case OP_NOT_HSPACE:
1553 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1554 GETCHARINCTEST(c, eptr);
1555 switch(c)
1556 {
1557 default: break;
1558 case 0x09: /* HT */
1559 case 0x20: /* SPACE */
1560 case 0xa0: /* NBSP */
1561 case 0x1680: /* OGHAM SPACE MARK */
1562 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1563 case 0x2000: /* EN QUAD */
1564 case 0x2001: /* EM QUAD */
1565 case 0x2002: /* EN SPACE */
1566 case 0x2003: /* EM SPACE */
1567 case 0x2004: /* THREE-PER-EM SPACE */
1568 case 0x2005: /* FOUR-PER-EM SPACE */
1569 case 0x2006: /* SIX-PER-EM SPACE */
1570 case 0x2007: /* FIGURE SPACE */
1571 case 0x2008: /* PUNCTUATION SPACE */
1572 case 0x2009: /* THIN SPACE */
1573 case 0x200A: /* HAIR SPACE */
1574 case 0x202f: /* NARROW NO-BREAK SPACE */
1575 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1576 case 0x3000: /* IDEOGRAPHIC SPACE */
1577 RRETURN(MATCH_NOMATCH);
1578 }
1579 ecode++;
1580 break;
1581
1582 case OP_HSPACE:
1583 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1584 GETCHARINCTEST(c, eptr);
1585 switch(c)
1586 {
1587 default: RRETURN(MATCH_NOMATCH);
1588 case 0x09: /* HT */
1589 case 0x20: /* SPACE */
1590 case 0xa0: /* NBSP */
1591 case 0x1680: /* OGHAM SPACE MARK */
1592 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1593 case 0x2000: /* EN QUAD */
1594 case 0x2001: /* EM QUAD */
1595 case 0x2002: /* EN SPACE */
1596 case 0x2003: /* EM SPACE */
1597 case 0x2004: /* THREE-PER-EM SPACE */
1598 case 0x2005: /* FOUR-PER-EM SPACE */
1599 case 0x2006: /* SIX-PER-EM SPACE */
1600 case 0x2007: /* FIGURE SPACE */
1601 case 0x2008: /* PUNCTUATION SPACE */
1602 case 0x2009: /* THIN SPACE */
1603 case 0x200A: /* HAIR SPACE */
1604 case 0x202f: /* NARROW NO-BREAK SPACE */
1605 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1606 case 0x3000: /* IDEOGRAPHIC SPACE */
1607 break;
1608 }
1609 ecode++;
1610 break;
1611
1612 case OP_NOT_VSPACE:
1613 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1614 GETCHARINCTEST(c, eptr);
1615 switch(c)
1616 {
1617 default: break;
1618 case 0x0a: /* LF */
1619 case 0x0b: /* VT */
1620 case 0x0c: /* FF */
1621 case 0x0d: /* CR */
1622 case 0x85: /* NEL */
1623 case 0x2028: /* LINE SEPARATOR */
1624 case 0x2029: /* PARAGRAPH SEPARATOR */
1625 RRETURN(MATCH_NOMATCH);
1626 }
1627 ecode++;
1628 break;
1629
1630 case OP_VSPACE:
1631 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1632 GETCHARINCTEST(c, eptr);
1633 switch(c)
1634 {
1635 default: RRETURN(MATCH_NOMATCH);
1636 case 0x0a: /* LF */
1637 case 0x0b: /* VT */
1638 case 0x0c: /* FF */
1639 case 0x0d: /* CR */
1640 case 0x85: /* NEL */
1641 case 0x2028: /* LINE SEPARATOR */
1642 case 0x2029: /* PARAGRAPH SEPARATOR */
1643 break;
1644 }
1645 ecode++;
1646 break;
1647
1648 #ifdef SUPPORT_UCP
1649 /* Check the next character by Unicode property. We will get here only
1650 if the support is in the binary; otherwise a compile-time error occurs. */
1651
1652 case OP_PROP:
1653 case OP_NOTPROP:
1654 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1655 GETCHARINCTEST(c, eptr);
1656 {
1657 int chartype, script;
1658 int category = _pcre_ucp_findprop(c, &chartype, &script);
1659
1660 switch(ecode[1])
1661 {
1662 case PT_ANY:
1663 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1664 break;
1665
1666 case PT_LAMP:
1667 if ((chartype == ucp_Lu ||
1668 chartype == ucp_Ll ||
1669 chartype == ucp_Lt) == (op == OP_NOTPROP))
1670 RRETURN(MATCH_NOMATCH);
1671 break;
1672
1673 case PT_GC:
1674 if ((ecode[2] != category) == (op == OP_PROP))
1675 RRETURN(MATCH_NOMATCH);
1676 break;
1677
1678 case PT_PC:
1679 if ((ecode[2] != chartype) == (op == OP_PROP))
1680 RRETURN(MATCH_NOMATCH);
1681 break;
1682
1683 case PT_SC:
1684 if ((ecode[2] != script) == (op == OP_PROP))
1685 RRETURN(MATCH_NOMATCH);
1686 break;
1687
1688 default:
1689 RRETURN(PCRE_ERROR_INTERNAL);
1690 }
1691
1692 ecode += 3;
1693 }
1694 break;
1695
1696 /* Match an extended Unicode sequence. We will get here only if the support
1697 is in the binary; otherwise a compile-time error occurs. */
1698
1699 case OP_EXTUNI:
1700 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1701 GETCHARINCTEST(c, eptr);
1702 {
1703 int chartype, script;
1704 int category = _pcre_ucp_findprop(c, &chartype, &script);
1705 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1706 while (eptr < md->end_subject)
1707 {
1708 int len = 1;
1709 if (!utf8) c = *eptr; else
1710 {
1711 GETCHARLEN(c, eptr, len);
1712 }
1713 category = _pcre_ucp_findprop(c, &chartype, &script);
1714 if (category != ucp_M) break;
1715 eptr += len;
1716 }
1717 }
1718 ecode++;
1719 break;
1720 #endif
1721
1722
1723 /* Match a back reference, possibly repeatedly. Look past the end of the
1724 item to see if there is repeat information following. The code is similar
1725 to that for character classes, but repeated for efficiency. Then obey
1726 similar code to character type repeats - written out again for speed.
1727 However, if the referenced string is the empty string, always treat
1728 it as matched, any number of times (otherwise there could be infinite
1729 loops). */
1730
1731 case OP_REF:
1732 {
1733 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1734 ecode += 3;
1735
1736 /* If the reference is unset, there are two possibilities:
1737
1738 (a) In the default, Perl-compatible state, set the length to be longer
1739 than the amount of subject left; this ensures that every attempt at a
1740 match fails. We can't just fail here, because of the possibility of
1741 quantifiers with zero minima.
1742
1743 (b) If the JavaScript compatibility flag is set, set the length to zero
1744 so that the back reference matches an empty string.
1745
1746 Otherwise, set the length to the length of what was matched by the
1747 referenced subpattern. */
1748
1749 if (offset >= offset_top || md->offset_vector[offset] < 0)
1750 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1751 else
1752 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1753
1754 /* Set up for repetition, or handle the non-repeated case */
1755
1756 switch (*ecode)
1757 {
1758 case OP_CRSTAR:
1759 case OP_CRMINSTAR:
1760 case OP_CRPLUS:
1761 case OP_CRMINPLUS:
1762 case OP_CRQUERY:
1763 case OP_CRMINQUERY:
1764 c = *ecode++ - OP_CRSTAR;
1765 minimize = (c & 1) != 0;
1766 min = rep_min[c]; /* Pick up values from tables; */
1767 max = rep_max[c]; /* zero for max => infinity */
1768 if (max == 0) max = INT_MAX;
1769 break;
1770
1771 case OP_CRRANGE:
1772 case OP_CRMINRANGE:
1773 minimize = (*ecode == OP_CRMINRANGE);
1774 min = GET2(ecode, 1);
1775 max = GET2(ecode, 3);
1776 if (max == 0) max = INT_MAX;
1777 ecode += 5;
1778 break;
1779
1780 default: /* No repeat follows */
1781 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1782 eptr += length;
1783 continue; /* With the main loop */
1784 }
1785
1786 /* If the length of the reference is zero, just continue with the
1787 main loop. */
1788
1789 if (length == 0) continue;
1790
1791 /* First, ensure the minimum number of matches are present. We get back
1792 the length of the reference string explicitly rather than passing the
1793 address of eptr, so that eptr can be a register variable. */
1794
1795 for (i = 1; i <= min; i++)
1796 {
1797 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1798 eptr += length;
1799 }
1800
1801 /* If min = max, continue at the same level without recursion.
1802 They are not both allowed to be zero. */
1803
1804 if (min == max) continue;
1805
1806 /* If minimizing, keep trying and advancing the pointer */
1807
1808 if (minimize)
1809 {
1810 for (fi = min;; fi++)
1811 {
1812 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1813 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1814 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1815 RRETURN(MATCH_NOMATCH);
1816 eptr += length;
1817 }
1818 /* Control never gets here */
1819 }
1820
1821 /* If maximizing, find the longest string and work backwards */
1822
1823 else
1824 {
1825 pp = eptr;
1826 for (i = min; i < max; i++)
1827 {
1828 if (!match_ref(offset, eptr, length, md, ims)) break;
1829 eptr += length;
1830 }
1831 while (eptr >= pp)
1832 {
1833 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1834 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1835 eptr -= length;
1836 }
1837 RRETURN(MATCH_NOMATCH);
1838 }
1839 }
1840 /* Control never gets here */
1841
1842
1843
1844 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1845 used when all the characters in the class have values in the range 0-255,
1846 and either the matching is caseful, or the characters are in the range
1847 0-127 when UTF-8 processing is enabled. The only difference between
1848 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1849 encountered.
1850
1851 First, look past the end of the item to see if there is repeat information
1852 following. Then obey similar code to character type repeats - written out
1853 again for speed. */
1854
1855 case OP_NCLASS:
1856 case OP_CLASS:
1857 {
1858 data = ecode + 1; /* Save for matching */
1859 ecode += 33; /* Advance past the item */
1860
1861 switch (*ecode)
1862 {
1863 case OP_CRSTAR:
1864 case OP_CRMINSTAR:
1865 case OP_CRPLUS:
1866 case OP_CRMINPLUS:
1867 case OP_CRQUERY:
1868 case OP_CRMINQUERY:
1869 c = *ecode++ - OP_CRSTAR;
1870 minimize = (c & 1) != 0;
1871 min = rep_min[c]; /* Pick up values from tables; */
1872 max = rep_max[c]; /* zero for max => infinity */
1873 if (max == 0) max = INT_MAX;
1874 break;
1875
1876 case OP_CRRANGE:
1877 case OP_CRMINRANGE:
1878 minimize = (*ecode == OP_CRMINRANGE);
1879 min = GET2(ecode, 1);
1880 max = GET2(ecode, 3);
1881 if (max == 0) max = INT_MAX;
1882 ecode += 5;
1883 break;
1884
1885 default: /* No repeat follows */
1886 min = max = 1;
1887 break;
1888 }
1889
1890 /* First, ensure the minimum number of matches are present. */
1891
1892 #ifdef SUPPORT_UTF8
1893 /* UTF-8 mode */
1894 if (utf8)
1895 {
1896 for (i = 1; i <= min; i++)
1897 {
1898 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1899 GETCHARINC(c, eptr);
1900 if (c > 255)
1901 {
1902 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1903 }
1904 else
1905 {
1906 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1907 }
1908 }
1909 }
1910 else
1911 #endif
1912 /* Not UTF-8 mode */
1913 {
1914 for (i = 1; i <= min; i++)
1915 {
1916 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1917 c = *eptr++;
1918 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1919 }
1920 }
1921
1922 /* If max == min we can continue with the main loop without the
1923 need to recurse. */
1924
1925 if (min == max) continue;
1926
1927 /* If minimizing, keep testing the rest of the expression and advancing
1928 the pointer while it matches the class. */
1929
1930 if (minimize)
1931 {
1932 #ifdef SUPPORT_UTF8
1933 /* UTF-8 mode */
1934 if (utf8)
1935 {
1936 for (fi = min;; fi++)
1937 {
1938 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1939 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1940 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1941 GETCHARINC(c, eptr);
1942 if (c > 255)
1943 {
1944 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1945 }
1946 else
1947 {
1948 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1949 }
1950 }
1951 }
1952 else
1953 #endif
1954 /* Not UTF-8 mode */
1955 {
1956 for (fi = min;; fi++)
1957 {
1958 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1959 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1960 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1961 c = *eptr++;
1962 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1963 }
1964 }
1965 /* Control never gets here */
1966 }
1967
1968 /* If maximizing, find the longest possible run, then work backwards. */
1969
1970 else
1971 {
1972 pp = eptr;
1973
1974 #ifdef SUPPORT_UTF8
1975 /* UTF-8 mode */
1976 if (utf8)
1977 {
1978 for (i = min; i < max; i++)
1979 {
1980 int len = 1;
1981 if (eptr >= md->end_subject) break;
1982 GETCHARLEN(c, eptr, len);
1983 if (c > 255)
1984 {
1985 if (op == OP_CLASS) break;
1986 }
1987 else
1988 {
1989 if ((data[c/8] & (1 << (c&7))) == 0) break;
1990 }
1991 eptr += len;
1992 }
1993 for (;;)
1994 {
1995 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1996 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1997 if (eptr-- == pp) break; /* Stop if tried at original pos */
1998 BACKCHAR(eptr);
1999 }
2000 }
2001 else
2002 #endif
2003 /* Not UTF-8 mode */
2004 {
2005 for (i = min; i < max; i++)
2006 {
2007 if (eptr >= md->end_subject) break;
2008 c = *eptr;
2009 if ((data[c/8] & (1 << (c&7))) == 0) break;
2010 eptr++;
2011 }
2012 while (eptr >= pp)
2013 {
2014 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2015 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2016 eptr--;
2017 }
2018 }
2019
2020 RRETURN(MATCH_NOMATCH);
2021 }
2022 }
2023 /* Control never gets here */
2024
2025
/* OP_XCLASS: match an extended character class. The opcode is generated only
when compiling in UTF-8 mode, so it is never encountered otherwise. The class
data may be followed by a repeat specifier. */

#ifdef SUPPORT_UTF8
case OP_XCLASS: {
  data = ecode + 1 + LINK_SIZE;     /* Class data, needed for the tests below */
  ecode += GET(ecode, 1);           /* Step over the whole class item */

  /* Decode the repeat specifier, if any, that follows the class. */

  switch (*ecode) {
    case OP_CRSTAR:
    case OP_CRMINSTAR:
    case OP_CRPLUS:
    case OP_CRMINPLUS:
    case OP_CRQUERY:
    case OP_CRMINQUERY:
      c = *ecode++ - OP_CRSTAR;
      minimize = (c & 1) != 0;      /* Odd offsets select the minimizing forms */
      min = rep_min[c];             /* Limits come from the tables; */
      max = rep_max[c];             /* a zero maximum stands for "no limit" */
      if (max == 0) max = INT_MAX;
      break;

    case OP_CRRANGE:
    case OP_CRMINRANGE:
      minimize = (*ecode == OP_CRMINRANGE);
      min = GET2(ecode, 1);
      max = GET2(ecode, 3);
      if (max == 0) max = INT_MAX;
      ecode += 5;
      break;

    default:                        /* The class is not repeated */
      min = 1;
      max = 1;
      break;
  }

  /* The mandatory minimum number of class members must all be present. */

  for (i = 1; i <= min; i++) {
    if (eptr >= md->end_subject) { RRETURN(MATCH_NOMATCH); }
    GETCHARINC(c, eptr);
    if (!_pcre_xclass(c, data)) { RRETURN(MATCH_NOMATCH); }
  }

  /* With a fixed count there is nothing to retry, so carry on in the main
  loop without recursing. */

  if (min == max) continue;

  /* Minimizing: try the rest of the pattern first, and take one more class
  member each time that fails, up to the maximum. */

  if (minimize) {
    for (fi = min;; fi++) {
      RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
      if (rrc != MATCH_NOMATCH) { RRETURN(rrc); }
      if (fi >= max) { RRETURN(MATCH_NOMATCH); }
      if (eptr >= md->end_subject) { RRETURN(MATCH_NOMATCH); }
      GETCHARINC(c, eptr);
      if (!_pcre_xclass(c, data)) { RRETURN(MATCH_NOMATCH); }
    }
    /* Control never gets here */
  }

  /* Maximizing: take the longest run of class members available, then give
  characters back one at a time until the rest of the pattern matches. */

  else {
    pp = eptr;
    for (i = min; i < max; i++) {
      int nbytes = 1;
      if (eptr >= md->end_subject) break;
      GETCHARLEN(c, eptr, nbytes);
      if (!_pcre_xclass(c, data)) break;
      eptr += nbytes;
    }
    for (;;) {
      RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
      if (rrc != MATCH_NOMATCH) { RRETURN(rrc); }
      if (eptr-- == pp) break;      /* Already tried at the starting position */
      if (utf8) { BACKCHAR(eptr); }
    }
    RRETURN(MATCH_NOMATCH);
  }

  /* Control never gets here */
}
#endif  /* End of XCLASS */
2120
2121 /* Match a single character, casefully */
2122
2123 case OP_CHAR:
2124 #ifdef SUPPORT_UTF8
2125 if (utf8)
2126 {
2127 length = 1;
2128 ecode++;
2129 GETCHARLEN(fc, ecode, length);
2130 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2131 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2132 }
2133 else
2134 #endif
2135
2136 /* Non-UTF-8 mode */
2137 {
2138 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2139 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2140 ecode += 2;
2141 }
2142 break;
2143
2144 /* Match a single character, caselessly */
2145
2146 case OP_CHARNC:
2147 #ifdef SUPPORT_UTF8
2148 if (utf8)
2149 {
2150 length = 1;
2151 ecode++;
2152 GETCHARLEN(fc, ecode, length);
2153
2154 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2155
2156 /* If the pattern character's value is < 128, we have only one byte, and
2157 can use the fast lookup table. */
2158
2159 if (fc < 128)
2160 {
2161 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2162 }
2163
2164 /* Otherwise we must pick up the subject character */
2165
2166 else
2167 {
2168 unsigned int dc;
2169 GETCHARINC(dc, eptr);
2170 ecode += length;
2171
2172 /* If we have Unicode property support, we can use it to test the other
2173 case of the character, if there is one. */
2174
2175 if (fc != dc)
2176 {
2177 #ifdef SUPPORT_UCP
2178 if (dc != _pcre_ucp_othercase(fc))
2179 #endif
2180 RRETURN(MATCH_NOMATCH);
2181 }
2182 }
2183 }
2184 else
2185 #endif /* SUPPORT_UTF8 */
2186
2187 /* Non-UTF-8 mode */
2188 {
2189 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2190 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2191 ecode += 2;
2192 }
2193 break;
2194
2195 /* Match a single character repeatedly. */
2196
2197 case OP_EXACT:
2198 min = max = GET2(ecode, 1);
2199 ecode += 3;
2200 goto REPEATCHAR;
2201
2202 case OP_POSUPTO:
2203 possessive = TRUE;
2204 /* Fall through */
2205
2206 case OP_UPTO:
2207 case OP_MINUPTO:
2208 min = 0;
2209 max = GET2(ecode, 1);
2210 minimize = *ecode == OP_MINUPTO;
2211 ecode += 3;
2212 goto REPEATCHAR;
2213
2214 case OP_POSSTAR:
2215 possessive = TRUE;
2216 min = 0;
2217 max = INT_MAX;
2218 ecode++;
2219 goto REPEATCHAR;
2220
2221 case OP_POSPLUS:
2222 possessive = TRUE;
2223 min = 1;
2224 max = INT_MAX;
2225 ecode++;
2226 goto REPEATCHAR;
2227
2228 case OP_POSQUERY:
2229 possessive = TRUE;
2230 min = 0;
2231 max = 1;
2232 ecode++;
2233 goto REPEATCHAR;
2234
2235 case OP_STAR:
2236 case OP_MINSTAR:
2237 case OP_PLUS:
2238 case OP_MINPLUS:
2239 case OP_QUERY:
2240 case OP_MINQUERY:
2241 c = *ecode++ - OP_STAR;
2242 minimize = (c & 1) != 0;
2243 min = rep_min[c]; /* Pick up values from tables; */
2244 max = rep_max[c]; /* zero for max => infinity */
2245 if (max == 0) max = INT_MAX;
2246
2247 /* Common code for all repeated single-character matches. We can give
2248 up quickly if there are fewer than the minimum number of characters left in
2249 the subject. */
2250
2251 REPEATCHAR:
2252 #ifdef SUPPORT_UTF8
2253 if (utf8)
2254 {
2255 length = 1;
2256 charptr = ecode;
2257 GETCHARLEN(fc, ecode, length);
2258 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2259 ecode += length;
2260
2261 /* Handle multibyte character matching specially here. There is
2262 support for caseless matching if UCP support is present. */
2263
2264 if (length > 1)
2265 {
2266 #ifdef SUPPORT_UCP
2267 unsigned int othercase;
2268 if ((ims & PCRE_CASELESS) != 0 &&
2269 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2270 oclength = _pcre_ord2utf8(othercase, occhars);
2271 else oclength = 0;
2272 #endif /* SUPPORT_UCP */
2273
2274 for (i = 1; i <= min; i++)
2275 {
2276 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2277 #ifdef SUPPORT_UCP
2278 /* Need braces because of following else */
2279 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2280 else
2281 {
2282 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2283 eptr += oclength;
2284 }
2285 #else /* without SUPPORT_UCP */
2286 else { RRETURN(MATCH_NOMATCH); }
2287 #endif /* SUPPORT_UCP */
2288 }
2289
2290 if (min == max) continue;
2291
2292 if (minimize)
2293 {
2294 for (fi = min;; fi++)
2295 {
2296 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2297 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2298 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2299 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2300 #ifdef SUPPORT_UCP
2301 /* Need braces because of following else */
2302 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2303 else
2304 {
2305 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2306 eptr += oclength;
2307 }
2308 #else /* without SUPPORT_UCP */
2309 else { RRETURN (MATCH_NOMATCH); }
2310 #endif /* SUPPORT_UCP */
2311 }
2312 /* Control never gets here */
2313 }
2314
2315 else /* Maximize */
2316 {
2317 pp = eptr;
2318 for (i = min; i < max; i++)
2319 {
2320 if (eptr > md->end_subject - length) break;
2321 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2322 #ifdef SUPPORT_UCP
2323 else if (oclength == 0) break;
2324 else
2325 {
2326 if (memcmp(eptr, occhars, oclength) != 0) break;
2327 eptr += oclength;
2328 }
2329 #else /* without SUPPORT_UCP */
2330 else break;
2331 #endif /* SUPPORT_UCP */
2332 }
2333
2334 if (possessive) continue;
2335 for(;;)
2336 {
2337 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2338 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2339 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2340 #ifdef SUPPORT_UCP
2341 eptr--;
2342 BACKCHAR(eptr);
2343 #else /* without SUPPORT_UCP */
2344 eptr -= length;
2345 #endif /* SUPPORT_UCP */
2346 }
2347 }
2348 /* Control never gets here */
2349 }
2350
2351 /* If the length of a UTF-8 character is 1, we fall through here, and
2352 obey the code as for non-UTF-8 characters below, though in this case the
2353 value of fc will always be < 128. */
2354 }
2355 else
2356 #endif /* SUPPORT_UTF8 */
2357
2358 /* When not in UTF-8 mode, load a single-byte character. */
2359 {
2360 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2361 fc = *ecode++;
2362 }
2363
2364 /* The value of fc at this point is always less than 256, though we may or
2365 may not be in UTF-8 mode. The code is duplicated for the caseless and
2366 caseful cases, for speed, since matching characters is likely to be quite
2367 common. First, ensure the minimum number of matches are present. If min =
2368 max, continue at the same level without recursing. Otherwise, if
2369 minimizing, keep trying the rest of the expression and advancing one
2370 matching character if failing, up to the maximum. Alternatively, if
2371 maximizing, find the maximum number of characters and work backwards. */
2372
2373 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2374 max, eptr));
2375
2376 if ((ims & PCRE_CASELESS) != 0)
2377 {
2378 fc = md->lcc[fc];
2379 for (i = 1; i <= min; i++)
2380 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2381 if (min == max) continue;
2382 if (minimize)
2383 {
2384 for (fi = min;; fi++)
2385 {
2386 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2387 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2388 if (fi >= max || eptr >= md->end_subject ||
2389 fc != md->lcc[*eptr++])
2390 RRETURN(MATCH_NOMATCH);
2391 }
2392 /* Control never gets here */
2393 }
2394 else /* Maximize */
2395 {
2396 pp = eptr;
2397 for (i = min; i < max; i++)
2398 {
2399 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2400 eptr++;
2401 }
2402 if (possessive) continue;
2403 while (eptr >= pp)
2404 {
2405 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2406 eptr--;
2407 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2408 }
2409 RRETURN(MATCH_NOMATCH);
2410 }
2411 /* Control never gets here */
2412 }
2413
2414 /* Caseful comparisons (includes all multi-byte characters) */
2415
2416 else
2417 {
2418 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2419 if (min == max) continue;
2420 if (minimize)
2421 {
2422 for (fi = min;; fi++)
2423 {
2424 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2425 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2426 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2427 RRETURN(MATCH_NOMATCH);
2428 }
2429 /* Control never gets here */
2430 }
2431 else /* Maximize */
2432 {
2433 pp = eptr;
2434 for (i = min; i < max; i++)
2435 {
2436 if (eptr >= md->end_subject || fc != *eptr) break;
2437 eptr++;
2438 }
2439 if (possessive) continue;
2440 while (eptr >= pp)
2441 {
2442 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2443 eptr--;
2444 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2445 }
2446 RRETURN(MATCH_NOMATCH);
2447 }
2448 }
2449 /* Control never gets here */
2450
2451 /* Match a negated single one-byte character. The character we are
2452 checking can be multibyte. */
2453
2454 case OP_NOT:
2455 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2456 ecode++;
2457 GETCHARINCTEST(c, eptr);
2458 if ((ims & PCRE_CASELESS) != 0)
2459 {
2460 #ifdef SUPPORT_UTF8
2461 if (c < 256)
2462 #endif
2463 c = md->lcc[c];
2464 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2465 }
2466 else
2467 {
2468 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2469 }
2470 break;
2471
2472 /* Match a negated single one-byte character repeatedly. This is almost a
2473 repeat of the code for a repeated single character, but I haven't found a
2474 nice way of commoning these up that doesn't require a test of the
2475 positive/negative option for each character match. Maybe that wouldn't add
2476 very much to the time taken, but character matching *is* what this is all
2477 about... */
2478
2479 case OP_NOTEXACT:
2480 min = max = GET2(ecode, 1);
2481 ecode += 3;
2482 goto REPEATNOTCHAR;
2483
2484 case OP_NOTUPTO:
2485 case OP_NOTMINUPTO:
2486 min = 0;
2487 max = GET2(ecode, 1);
2488 minimize = *ecode == OP_NOTMINUPTO;
2489 ecode += 3;
2490 goto REPEATNOTCHAR;
2491
2492 case OP_NOTPOSSTAR:
2493 possessive = TRUE;
2494 min = 0;
2495 max = INT_MAX;
2496 ecode++;
2497 goto REPEATNOTCHAR;
2498
2499 case OP_NOTPOSPLUS:
2500 possessive = TRUE;
2501 min = 1;
2502 max = INT_MAX;
2503 ecode++;
2504 goto REPEATNOTCHAR;
2505
2506 case OP_NOTPOSQUERY:
2507 possessive = TRUE;
2508 min = 0;
2509 max = 1;
2510 ecode++;
2511 goto REPEATNOTCHAR;
2512
2513 case OP_NOTPOSUPTO:
2514 possessive = TRUE;
2515 min = 0;
2516 max = GET2(ecode, 1);
2517 ecode += 3;
2518 goto REPEATNOTCHAR;
2519
2520 case OP_NOTSTAR:
2521 case OP_NOTMINSTAR:
2522 case OP_NOTPLUS:
2523 case OP_NOTMINPLUS:
2524 case OP_NOTQUERY:
2525 case OP_NOTMINQUERY:
2526 c = *ecode++ - OP_NOTSTAR;
2527 minimize = (c & 1) != 0;
2528 min = rep_min[c]; /* Pick up values from tables; */
2529 max = rep_max[c]; /* zero for max => infinity */
2530 if (max == 0) max = INT_MAX;
2531
2532 /* Common code for all repeated single-byte matches. We can give up quickly
2533 if there are fewer than the minimum number of bytes left in the
2534 subject. */
2535
2536 REPEATNOTCHAR:
2537 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2538 fc = *ecode++;
2539
2540 /* The code is duplicated for the caseless and caseful cases, for speed,
2541 since matching characters is likely to be quite common. First, ensure the
2542 minimum number of matches are present. If min = max, continue at the same
2543 level without recursing. Otherwise, if minimizing, keep trying the rest of
2544 the expression and advancing one matching character if failing, up to the
2545 maximum. Alternatively, if maximizing, find the maximum number of
2546 characters and work backwards. */
2547
2548 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2549 max, eptr));
2550
2551 if ((ims & PCRE_CASELESS) != 0)
2552 {
2553 fc = md->lcc[fc];
2554
2555 #ifdef SUPPORT_UTF8
2556 /* UTF-8 mode */
2557 if (utf8)
2558 {
2559 register unsigned int d;
2560 for (i = 1; i <= min; i++)
2561 {
2562 GETCHARINC(d, eptr);
2563 if (d < 256) d = md->lcc[d];
2564 if (fc == d) RRETURN(MATCH_NOMATCH);
2565 }
2566 }
2567 else
2568 #endif
2569
2570 /* Not UTF-8 mode */
2571 {
2572 for (i = 1; i <= min; i++)
2573 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2574 }
2575
2576 if (min == max) continue;
2577
2578 if (minimize)
2579 {
2580 #ifdef SUPPORT_UTF8
2581 /* UTF-8 mode */
2582 if (utf8)
2583 {
2584 register unsigned int d;
2585 for (fi = min;; fi++)
2586 {
2587 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2588 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2589 GETCHARINC(d, eptr);
2590 if (d < 256) d = md->lcc[d];
2591 if (fi >= max || eptr >= md->end_subject || fc == d)
2592 RRETURN(MATCH_NOMATCH);
2593 }
2594 }
2595 else
2596 #endif
2597 /* Not UTF-8 mode */
2598 {
2599 for (fi = min;; fi++)
2600 {
2601 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2602 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2603 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2604 RRETURN(MATCH_NOMATCH);
2605 }
2606 }
2607 /* Control never gets here */
2608 }
2609
2610 /* Maximize case */
2611
2612 else
2613 {
2614 pp = eptr;
2615
2616 #ifdef SUPPORT_UTF8
2617 /* UTF-8 mode */
2618 if (utf8)
2619 {
2620 register unsigned int d;
2621 for (i = min; i < max; i++)
2622 {
2623 int len = 1;
2624 if (eptr >= md->end_subject) break;
2625 GETCHARLEN(d, eptr, len);
2626 if (d < 256) d = md->lcc[d];
2627 if (fc == d) break;
2628 eptr += len;
2629 }
2630 if (possessive) continue;
2631 for(;;)
2632 {
2633 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2634 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2635 if (eptr-- == pp) break; /* Stop if tried at original pos */
2636 BACKCHAR(eptr);
2637 }
2638 }
2639 else
2640 #endif
2641 /* Not UTF-8 mode */
2642 {
2643 for (i = min; i < max; i++)
2644 {
2645 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2646 eptr++;
2647 }
2648 if (possessive) continue;
2649 while (eptr >= pp)
2650 {
2651 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2652 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2653 eptr--;
2654 }
2655 }
2656
2657 RRETURN(MATCH_NOMATCH);
2658 }
2659 /* Control never gets here */
2660 }
2661
2662 /* Caseful comparisons */
2663
2664 else
2665 {
2666 #ifdef SUPPORT_UTF8
2667 /* UTF-8 mode */
2668 if (utf8)
2669 {
2670 register unsigned int d;
2671 for (i = 1; i <= min; i++)
2672 {
2673 GETCHARINC(d, eptr);
2674 if (fc == d) RRETURN(MATCH_NOMATCH);
2675 }
2676 }
2677 else
2678 #endif
2679 /* Not UTF-8 mode */
2680 {
2681 for (i = 1; i <= min; i++)
2682 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2683 }
2684
2685 if (min == max) continue;
2686
2687 if (minimize)
2688 {
2689 #ifdef SUPPORT_UTF8
2690 /* UTF-8 mode */
2691 if (utf8)
2692 {
2693 register unsigned int d;
2694 for (fi = min;; fi++)
2695 {
2696 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2697 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2698 GETCHARINC(d, eptr);
2699 if (fi >= max || eptr >= md->end_subject || fc == d)
2700 RRETURN(MATCH_NOMATCH);
2701 }
2702 }
2703 else
2704 #endif
2705 /* Not UTF-8 mode */
2706 {
2707 for (fi = min;; fi++)
2708 {
2709 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2710 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2711 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2712 RRETURN(MATCH_NOMATCH);
2713 }
2714 }
2715 /* Control never gets here */
2716 }
2717
2718 /* Maximize case */
2719
2720 else
2721 {
2722 pp = eptr;
2723
2724 #ifdef SUPPORT_UTF8
2725 /* UTF-8 mode */
2726 if (utf8)
2727 {
2728 register unsigned int d;
2729 for (i = min; i < max; i++)
2730 {
2731 int len = 1;
2732 if (eptr >= md->end_subject) break;
2733 GETCHARLEN(d, eptr, len);
2734 if (fc == d) break;
2735 eptr += len;
2736 }
2737 if (possessive) continue;
2738 for(;;)
2739 {
2740 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2741 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2742 if (eptr-- == pp) break; /* Stop if tried at original pos */
2743 BACKCHAR(eptr);
2744 }
2745 }
2746 else
2747 #endif
2748 /* Not UTF-8 mode */
2749 {
2750 for (i = min; i < max; i++)
2751 {
2752 if (eptr >= md->end_subject || fc == *eptr) break;
2753 eptr++;
2754 }
2755 if (possessive) continue;
2756 while (eptr >= pp)
2757 {
2758 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2759 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2760 eptr--;
2761 }
2762 }
2763
2764 RRETURN(MATCH_NOMATCH);
2765 }
2766 }
2767 /* Control never gets here */
2768
2769 /* Match a single character type repeatedly; several different opcodes
2770 share code. This is very similar to the code for single characters, but we
2771 repeat it in the interests of efficiency. */
2772
2773 case OP_TYPEEXACT:
2774 min = max = GET2(ecode, 1);
2775 minimize = TRUE;
2776 ecode += 3;
2777 goto REPEATTYPE;
2778
2779 case OP_TYPEUPTO:
2780 case OP_TYPEMINUPTO:
2781 min = 0;
2782 max = GET2(ecode, 1);
2783 minimize = *ecode == OP_TYPEMINUPTO;
2784 ecode += 3;
2785 goto REPEATTYPE;
2786
2787 case OP_TYPEPOSSTAR:
2788 possessive = TRUE;
2789 min = 0;
2790 max = INT_MAX;
2791 ecode++;
2792 goto REPEATTYPE;
2793
2794 case OP_TYPEPOSPLUS:
2795 possessive = TRUE;
2796 min = 1;
2797 max = INT_MAX;
2798 ecode++;
2799 goto REPEATTYPE;
2800
2801 case OP_TYPEPOSQUERY:
2802 possessive = TRUE;
2803 min = 0;
2804 max = 1;
2805 ecode++;
2806 goto REPEATTYPE;
2807
2808 case OP_TYPEPOSUPTO:
2809 possessive = TRUE;
2810 min = 0;
2811 max = GET2(ecode, 1);
2812 ecode += 3;
2813 goto REPEATTYPE;
2814
2815 case OP_TYPESTAR:
2816 case OP_TYPEMINSTAR:
2817 case OP_TYPEPLUS:
2818 case OP_TYPEMINPLUS:
2819 case OP_TYPEQUERY:
2820 case OP_TYPEMINQUERY:
2821 c = *ecode++ - OP_TYPESTAR;
2822 minimize = (c & 1) != 0;
2823 min = rep_min[c]; /* Pick up values from tables; */
2824 max = rep_max[c]; /* zero for max => infinity */
2825 if (max == 0) max = INT_MAX;
2826
2827 /* Common code for all repeated single character type matches. Note that
2828 in UTF-8 mode, '.' matches a character of any length, but for the other
2829 character types, the valid characters are all one-byte long. */
2830
2831 REPEATTYPE:
2832 ctype = *ecode++; /* Code for the character type */
2833
2834 #ifdef SUPPORT_UCP
2835 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2836 {
2837 prop_fail_result = ctype == OP_NOTPROP;
2838 prop_type = *ecode++;
2839 prop_value = *ecode++;
2840 }
2841 else prop_type = -1;
2842 #endif
2843
2844 /* First, ensure the minimum number of matches are present. Use inline
2845 code for maximizing the speed, and do the type test once at the start
2846 (i.e. keep it out of the loop). Also we can test that there are at least
2847 the minimum number of bytes before we start. This isn't as effective in
2848 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2849 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2850 and single-bytes. */
2851
2852 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2853 if (min > 0)
2854 {
2855 #ifdef SUPPORT_UCP
2856 if (prop_type >= 0)
2857 {
2858 switch(prop_type)
2859 {
2860 case PT_ANY:
2861 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2862 for (i = 1; i <= min; i++)
2863 {
2864 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2865 GETCHARINCTEST(c, eptr);
2866 }
2867 break;
2868
2869 case PT_LAMP:
2870 for (i = 1; i <= min; i++)
2871 {
2872 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2873 GETCHARINCTEST(c, eptr);
2874 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2875 if ((prop_chartype == ucp_Lu ||
2876 prop_chartype == ucp_Ll ||
2877 prop_chartype == ucp_Lt) == prop_fail_result)
2878 RRETURN(MATCH_NOMATCH);
2879 }
2880 break;
2881
2882 case PT_GC:
2883 for (i = 1; i <= min; i++)
2884 {
2885 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2886 GETCHARINCTEST(c, eptr);
2887 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2888 if ((prop_category == prop_value) == prop_fail_result)
2889 RRETURN(MATCH_NOMATCH);
2890 }
2891 break;
2892
2893 case PT_PC:
2894 for (i = 1; i <= min; i++)
2895 {
2896 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2897 GETCHARINCTEST(c, eptr);
2898 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2899 if ((prop_chartype == prop_value) == prop_fail_result)
2900 RRETURN(MATCH_NOMATCH);
2901 }
2902 break;
2903
2904 case PT_SC:
2905 for (i = 1; i <= min; i++)
2906 {
2907 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2908 GETCHARINCTEST(c, eptr);
2909 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2910 if ((prop_script == prop_value) == prop_fail_result)
2911 RRETURN(MATCH_NOMATCH);
2912 }
2913 break;
2914
2915 default:
2916 RRETURN(PCRE_ERROR_INTERNAL);
2917 }
2918 }
2919
2920 /* Match extended Unicode sequences. We will get here only if the
2921 support is in the binary; otherwise a compile-time error occurs. */
2922
2923 else if (ctype == OP_EXTUNI)
2924 {
2925 for (i = 1; i <= min; i++)
2926 {
2927 GETCHARINCTEST(c, eptr);
2928 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2929 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2930 while (eptr < md->end_subject)
2931 {
2932 int len = 1;
2933 if (!utf8) c = *eptr; else
2934 {
2935 GETCHARLEN(c, eptr, len);
2936 }
2937 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2938 if (prop_category != ucp_M) break;
2939 eptr += len;
2940 }
2941 }
2942 }
2943
2944 else
2945 #endif /* SUPPORT_UCP */
2946
2947 /* Handle all other cases when the coding is UTF-8 */
2948
2949 #ifdef SUPPORT_UTF8
2950 if (utf8) switch(ctype)
2951 {
2952 case OP_ANY:
2953 for (i = 1; i <= min; i++)
2954 {
2955 if (eptr >= md->end_subject ||
2956 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2957 RRETURN(MATCH_NOMATCH);
2958 eptr++;
2959 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2960 }
2961 break;
2962
2963 case OP_ANYBYTE:
2964 eptr += min;
2965 break;
2966
2967 case OP_ANYNL:
2968 for (i = 1; i <= min; i++)
2969 {
2970 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2971 GETCHARINC(c, eptr);
2972 switch(c)
2973 {
2974 default: RRETURN(MATCH_NOMATCH);
2975 case 0x000d:
2976 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2977 break;
2978
2979 case 0x000a:
2980 break;
2981
2982 case 0x000b:
2983 case 0x000c:
2984 case 0x0085:
2985 case 0x2028:
2986 case 0x2029:
2987 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2988 break;
2989 }
2990 }
2991 break;
2992
2993 case OP_NOT_HSPACE:
2994 for (i = 1; i <= min; i++)
2995 {
2996 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2997 GETCHARINC(c, eptr);
2998 switch(c)
2999 {
3000 default: break;
3001 case 0x09: /* HT */
3002 case 0x20: /* SPACE */
3003 case 0xa0: /* NBSP */
3004 case 0x1680: /* OGHAM SPACE MARK */
3005 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3006 case 0x2000: /* EN QUAD */
3007 case 0x2001: /* EM QUAD */
3008 case 0x2002: /* EN SPACE */
3009 case 0x2003: /* EM SPACE */
3010 case 0x2004: /* THREE-PER-EM SPACE */
3011 case 0x2005: /* FOUR-PER-EM SPACE */
3012 case 0x2006: /* SIX-PER-EM SPACE */
3013 case 0x2007: /* FIGURE SPACE */
3014 case 0x2008: /* PUNCTUATION SPACE */
3015 case 0x2009: /* THIN SPACE */
3016 case 0x200A: /* HAIR SPACE */
3017 case 0x202f: /* NARROW NO-BREAK SPACE */
3018 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3019 case 0x3000: /* IDEOGRAPHIC SPACE */
3020 RRETURN(MATCH_NOMATCH);
3021 }
3022 }
3023 break;
3024
3025 case OP_HSPACE:
3026 for (i = 1; i <= min; i++)
3027 {
3028 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3029 GETCHARINC(c, eptr);
3030 switch(c)
3031 {
3032 default: RRETURN(MATCH_NOMATCH);
3033 case 0x09: /* HT */
3034 case 0x20: /* SPACE */
3035 case 0xa0: /* NBSP */
3036 case 0x1680: /* OGHAM SPACE MARK */
3037 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3038 case 0x2000: /* EN QUAD */
3039 case 0x2001: /* EM QUAD */
3040 case 0x2002: /* EN SPACE */
3041 case 0x2003: /* EM SPACE */
3042 case 0x2004: /* THREE-PER-EM SPACE */
3043 case 0x2005: /* FOUR-PER-EM SPACE */
3044 case 0x2006: /* SIX-PER-EM SPACE */
3045 case 0x2007: /* FIGURE SPACE */
3046 case 0x2008: /* PUNCTUATION SPACE */
3047 case 0x2009: /* THIN SPACE */
3048 case 0x200A: /* HAIR SPACE */
3049 case 0x202f: /* NARROW NO-BREAK SPACE */
3050 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3051 case 0x3000: /* IDEOGRAPHIC SPACE */
3052 break;
3053 }
3054 }
3055 break;
3056
3057 case OP_NOT_VSPACE:
3058 for (i = 1; i <= min; i++)
3059 {
3060 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3061 GETCHARINC(c, eptr);
3062 switch(c)
3063 {
3064 default: break;
3065 case 0x0a: /* LF */
3066 case 0x0b: /* VT */
3067 case 0x0c: /* FF */
3068 case 0x0d: /* CR */
3069 case 0x85: /* NEL */
3070 case 0x2028: /* LINE SEPARATOR */
3071 case 0x2029: /* PARAGRAPH SEPARATOR */
3072 RRETURN(MATCH_NOMATCH);
3073 }
3074 }
3075 break;
3076
3077 case OP_VSPACE:
3078 for (i = 1; i <= min; i++)
3079 {
3080 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3081 GETCHARINC(c, eptr);
3082 switch(c)
3083 {
3084 default: RRETURN(MATCH_NOMATCH);
3085 case 0x0a: /* LF */
3086 case 0x0b: /* VT */
3087 case 0x0c: /* FF */
3088 case 0x0d: /* CR */
3089 case 0x85: /* NEL */
3090 case 0x2028: /* LINE SEPARATOR */
3091 case 0x2029: /* PARAGRAPH SEPARATOR */
3092 break;
3093 }
3094 }
3095 break;
3096
3097 case OP_NOT_DIGIT:
3098 for (i = 1; i <= min; i++)
3099 {
3100 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3101 GETCHARINC(c, eptr);
3102 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3103 RRETURN(MATCH_NOMATCH);
3104 }
3105 break;
3106
3107 case OP_DIGIT:
3108 for (i = 1; i <= min; i++)
3109 {
3110 if (eptr >= md->end_subject ||
3111 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3112 RRETURN(MATCH_NOMATCH);
3113 /* No need to skip more bytes - we know it's a 1-byte character */
3114 }
3115 break;
3116
3117 case OP_NOT_WHITESPACE:
3118 for (i = 1; i <= min; i++)
3119 {
3120 if (eptr >= md->end_subject ||
3121 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3122 RRETURN(MATCH_NOMATCH);
3123 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3124 }
3125 break;
3126
3127 case OP_WHITESPACE:
3128 for (i = 1; i <= min; i++)
3129 {
3130 if (eptr >= md->end_subject ||
3131 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3132 RRETURN(MATCH_NOMATCH);
3133 /* No need to skip more bytes - we know it's a 1-byte character */
3134 }
3135 break;
3136
3137 case OP_NOT_WORDCHAR:
3138 for (i = 1; i <= min; i++)
3139 {
3140 if (eptr >= md->end_subject ||
3141 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3142 RRETURN(MATCH_NOMATCH);
3143 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3144 }
3145 break;
3146
3147 case OP_WORDCHAR:
3148 for (i = 1; i <= min; i++)
3149 {
3150 if (eptr >= md->end_subject ||
3151 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3152 RRETURN(MATCH_NOMATCH);
3153 /* No need to skip more bytes - we know it's a 1-byte character */
3154 }
3155 break;
3156
3157 default:
3158 RRETURN(PCRE_ERROR_INTERNAL);
3159 } /* End switch(ctype) */
3160
3161 else
3162 #endif /* SUPPORT_UTF8 */
3163
3164 /* Code for the non-UTF-8 case for minimum matching of operators other
3165 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3166 number of bytes present, as this was tested above. */
3167
3168 switch(ctype)
3169 {
3170 case OP_ANY:
3171 if ((ims & PCRE_DOTALL) == 0)
3172 {
3173 for (i = 1; i <= min; i++)
3174 {
3175 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3176 eptr++;
3177 }
3178 }
3179 else eptr += min;
3180 break;
3181
3182 case OP_ANYBYTE:
3183 eptr += min;
3184 break;
3185
3186 /* Because of the CRLF case, we can't assume the minimum number of
3187 bytes are present in this case. */
3188
3189 case OP_ANYNL:
3190 for (i = 1; i <= min; i++)
3191 {
3192 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3193 switch(*eptr++)
3194 {
3195 default: RRETURN(MATCH_NOMATCH);
3196 case 0x000d:
3197 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3198 break;
3199 case 0x000a:
3200 break;
3201
3202 case 0x000b:
3203 case 0x000c:
3204 case 0x0085:
3205 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3206 break;
3207 }
3208 }
3209 break;
3210
3211 case OP_NOT_HSPACE:
3212 for (i = 1; i <= min; i++)
3213 {
3214 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3215 switch(*eptr++)
3216 {
3217 default: break;
3218 case 0x09: /* HT */
3219 case 0x20: /* SPACE */
3220 case 0xa0: /* NBSP */
3221 RRETURN(MATCH_NOMATCH);
3222 }
3223 }
3224 break;
3225
3226 case OP_HSPACE:
3227 for (i = 1; i <= min; i++)
3228 {
3229 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3230 switch(*eptr++)
3231 {
3232 default: RRETURN(MATCH_NOMATCH);
3233 case 0x09: /* HT */
3234 case 0x20: /* SPACE */
3235 case 0xa0: /* NBSP */
3236 break;
3237 }
3238 }
3239 break;
3240
3241 case OP_NOT_VSPACE:
3242 for (i = 1; i <= min; i++)
3243 {
3244 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3245 switch(*eptr++)
3246 {
3247 default: break;
3248 case 0x0a: /* LF */
3249 case 0x0b: /* VT */
3250 case 0x0c: /* FF */
3251 case 0x0d: /* CR */
3252 case 0x85: /* NEL */
3253 RRETURN(MATCH_NOMATCH);
3254 }
3255 }
3256 break;
3257
3258 case OP_VSPACE:
3259 for (i = 1; i <= min; i++)
3260 {
3261 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3262 switch(*eptr++)
3263 {
3264 default: RRETURN(MATCH_NOMATCH);
3265 case 0x0a: /* LF */
3266 case 0x0b: /* VT */
3267 case 0x0c: /* FF */
3268 case 0x0d: /* CR */
3269 case 0x85: /* NEL */
3270 break;
3271 }
3272 }
3273 break;
3274
3275 case OP_NOT_DIGIT:
3276 for (i = 1; i <= min; i++)
3277 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3278 break;
3279
3280 case OP_DIGIT:
3281 for (i = 1; i <= min; i++)
3282 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3283 break;
3284
3285 case OP_NOT_WHITESPACE:
3286 for (i = 1; i <= min; i++)
3287 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3288 break;
3289
3290 case OP_WHITESPACE:
3291 for (i = 1; i <= min; i++)
3292 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3293 break;
3294
3295 case OP_NOT_WORDCHAR:
3296 for (i = 1; i <= min; i++)
3297 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3298 RRETURN(MATCH_NOMATCH);
3299 break;
3300
3301 case OP_WORDCHAR:
3302 for (i = 1; i <= min; i++)
3303 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3304 RRETURN(MATCH_NOMATCH);
3305 break;
3306
3307 default:
3308 RRETURN(PCRE_ERROR_INTERNAL);
3309 }
3310 }
3311
3312 /* If min = max, continue at the same level without recursing */
3313
3314 if (min == max) continue;
3315
3316 /* If minimizing, we have to test the rest of the pattern before each
3317 subsequent match. Again, separate the UTF-8 case for speed, and also
3318 separate the UCP cases. */
3319
3320 if (minimize)
3321 {
3322 #ifdef SUPPORT_UCP
3323 if (prop_type >= 0)
3324 {
3325 switch(prop_type)
3326 {
3327 case PT_ANY:
3328 for (fi = min;; fi++)
3329 {
3330 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3331 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3332 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3333 GETCHARINC(c, eptr);
3334 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3335 }
3336 /* Control never gets here */
3337
3338 case PT_LAMP:
3339 for (fi = min;; fi++)
3340 {
3341 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3342 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3343 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3344 GETCHARINC(c, eptr);
3345 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3346 if ((prop_chartype == ucp_Lu ||
3347 prop_chartype == ucp_Ll ||
3348 prop_chartype == ucp_Lt) == prop_fail_result)
3349 RRETURN(MATCH_NOMATCH);
3350 }
3351 /* Control never gets here */
3352
3353 case PT_GC:
3354 for (fi = min;; fi++)
3355 {
3356 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3357 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3358 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3359 GETCHARINC(c, eptr);
3360 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3361 if ((prop_category == prop_value) == prop_fail_result)
3362 RRETURN(MATCH_NOMATCH);
3363 }
3364 /* Control never gets here */
3365
3366 case PT_PC:
3367 for (fi = min;; fi++)
3368 {
3369 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3370 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3371 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3372 GETCHARINC(c, eptr);
3373 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3374 if ((prop_chartype == prop_value) == prop_fail_result)
3375 RRETURN(MATCH_NOMATCH);
3376 }
3377 /* Control never gets here */
3378
3379 case PT_SC:
3380 for (fi = min;; fi++)
3381 {
3382 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3383 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3384 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3385 GETCHARINC(c, eptr);
3386 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3387 if ((prop_script == prop_value) == prop_fail_result)
3388 RRETURN(MATCH_NOMATCH);
3389 }
3390 /* Control never gets here */
3391
3392 default:
3393 RRETURN(PCRE_ERROR_INTERNAL);
3394 }
3395 }
3396
3397 /* Match extended Unicode sequences. We will get here only if the
3398 support is in the binary; otherwise a compile-time error occurs. */
3399
3400 else if (ctype == OP_EXTUNI)
3401 {
3402 for (fi = min;; fi++)
3403 {
3404 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3405 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3406 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3407 GETCHARINCTEST(c, eptr);
3408 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3409 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3410 while (eptr < md->end_subject)
3411 {
3412 int len = 1;
3413 if (!utf8) c = *eptr; else
3414 {
3415 GETCHARLEN(c, eptr, len);
3416 }
3417 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3418 if (prop_category != ucp_M) break;
3419 eptr += len;
3420 }
3421 }
3422 }
3423
3424 else
3425 #endif /* SUPPORT_UCP */
3426
3427 #ifdef SUPPORT_UTF8
3428 /* UTF-8 mode */
3429 if (utf8)
3430 {
3431 for (fi = min;; fi++)
3432 {
3433 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3434 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3435 if (fi >= max || eptr >= md->end_subject ||
3436 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3437 IS_NEWLINE(eptr)))
3438 RRETURN(MATCH_NOMATCH);
3439
3440 GETCHARINC(c, eptr);
3441 switch(ctype)
3442 {
3443 case OP_ANY: /* This is the DOTALL case */
3444 break;
3445
3446 case OP_ANYBYTE:
3447 break;
3448
3449 case OP_ANYNL:
3450 switch(c)
3451 {
3452 default: RRETURN(MATCH_NOMATCH);
3453 case 0x000d:
3454 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3455 break;
3456 case 0x000a:
3457 break;
3458
3459 case 0x000b:
3460 case 0x000c:
3461 case 0x0085:
3462 case 0x2028:
3463 case 0x2029:
3464 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3465 break;
3466 }
3467 break;
3468
3469 case OP_NOT_HSPACE:
3470 switch(c)
3471 {
3472 default: break;
3473 case 0x09: /* HT */
3474 case 0x20: /* SPACE */
3475 case 0xa0: /* NBSP */
3476 case 0x1680: /* OGHAM SPACE MARK */
3477 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3478 case 0x2000: /* EN QUAD */
3479 case 0x2001: /* EM QUAD */
3480 case 0x2002: /* EN SPACE */
3481 case 0x2003: /* EM SPACE */
3482 case 0x2004: /* THREE-PER-EM SPACE */
3483 case 0x2005: /* FOUR-PER-EM SPACE */
3484 case 0x2006: /* SIX-PER-EM SPACE */
3485 case 0x2007: /* FIGURE SPACE */
3486 case 0x2008: /* PUNCTUATION SPACE */
3487 case 0x2009: /* THIN SPACE */
3488 case 0x200A: /* HAIR SPACE */
3489 case 0x202f: /* NARROW NO-BREAK SPACE */
3490 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3491 case 0x3000: /* IDEOGRAPHIC SPACE */
3492 RRETURN(MATCH_NOMATCH);
3493 }
3494 break;
3495
3496 case OP_HSPACE:
3497 switch(c)
3498 {
3499 default: RRETURN(MATCH_NOMATCH);
3500 case 0x09: /* HT */
3501 case 0x20: /* SPACE */
3502 case 0xa0: /* NBSP */
3503 case 0x1680: /* OGHAM SPACE MARK */
3504 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3505 case 0x2000: /* EN QUAD */
3506 case 0x2001: /* EM QUAD */
3507 case 0x2002: /* EN SPACE */
3508 case 0x2003: /* EM SPACE */
3509 case 0x2004: /* THREE-PER-EM SPACE */
3510 case 0x2005: /* FOUR-PER-EM SPACE */
3511 case 0x2006: /* SIX-PER-EM SPACE */
3512 case 0x2007: /* FIGURE SPACE */
3513 case 0x2008: /* PUNCTUATION SPACE */
3514 case 0x2009: /* THIN SPACE */
3515 case 0x200A: /* HAIR SPACE */
3516 case 0x202f: /* NARROW NO-BREAK SPACE */
3517 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3518 case 0x3000: /* IDEOGRAPHIC SPACE */
3519 break;
3520 }
3521 break;
3522
3523 case OP_NOT_VSPACE:
3524 switch(c)
3525 {
3526 default: break;
3527 case 0x0a: /* LF */
3528 case 0x0b: /* VT */
3529 case 0x0c: /* FF */
3530 case 0x0d: /* CR */
3531 case 0x85: /* NEL */
3532 case 0x2028: /* LINE SEPARATOR */
3533 case 0x2029: /* PARAGRAPH SEPARATOR */
3534 RRETURN(MATCH_NOMATCH);
3535 }
3536 break;
3537
3538 case OP_VSPACE:
3539 switch(c)
3540 {
3541 default: RRETURN(MATCH_NOMATCH);
3542 case 0x0a: /* LF */
3543 case 0x0b: /* VT */
3544 case 0x0c: /* FF */
3545 case 0x0d: /* CR */
3546 case 0x85: /* NEL */
3547 case 0x2028: /* LINE SEPARATOR */
3548 case 0x2029: /* PARAGRAPH SEPARATOR */
3549 break;
3550 }
3551 break;
3552
3553 case OP_NOT_DIGIT:
3554 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3555 RRETURN(MATCH_NOMATCH);
3556 break;
3557
3558 case OP_DIGIT:
3559 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3560 RRETURN(MATCH_NOMATCH);
3561 break;
3562
3563 case OP_NOT_WHITESPACE:
3564 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3565 RRETURN(MATCH_NOMATCH);
3566 break;
3567
3568 case OP_WHITESPACE:
3569 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3570 RRETURN(MATCH_NOMATCH);
3571 break;
3572
3573 case OP_NOT_WORDCHAR:
3574 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3575 RRETURN(MATCH_NOMATCH);
3576 break;
3577
3578 case OP_WORDCHAR:
3579 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3580 RRETURN(MATCH_NOMATCH);
3581 break;
3582
3583 default:
3584 RRETURN(PCRE_ERROR_INTERNAL);
3585 }
3586 }
3587 }
3588 else
3589 #endif
3590 /* Not UTF-8 mode */
3591 {
3592 for (fi = min;; fi++)
3593 {
3594 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3596 if (fi >= max || eptr >= md->end_subject ||
3597 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3598 RRETURN(MATCH_NOMATCH);
3599
3600 c = *eptr++;
3601 switch(ctype)
3602 {
3603 case OP_ANY: /* This is the DOTALL case */
3604 break;
3605
3606 case OP_ANYBYTE:
3607 break;
3608
3609 case OP_ANYNL:
3610 switch(c)
3611 {
3612 default: RRETURN(MATCH_NOMATCH);
3613 case 0x000d:
3614 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3615 break;
3616
3617 case 0x000a:
3618 break;
3619
3620 case 0x000b:
3621 case 0x000c:
3622 case 0x0085:
3623 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3624 break;
3625 }
3626 break;
3627
3628 case OP_NOT_HSPACE:
3629 switch(c)
3630 {
3631 default: break;
3632 case 0x09: /* HT */
3633 case 0x20: /* SPACE */
3634 case 0xa0: /* NBSP */
3635 RRETURN(MATCH_NOMATCH);
3636 }
3637 break;
3638
3639 case OP_HSPACE:
3640 switch(c)
3641 {
3642 default: RRETURN(MATCH_NOMATCH);
3643 case 0x09: /* HT */
3644 case 0x20: /* SPACE */
3645 case 0xa0: /* NBSP */
3646 break;
3647 }
3648 break;
3649
3650 case OP_NOT_VSPACE:
3651 switch(c)
3652 {
3653 default: break;
3654 case 0x0a: /* LF */
3655 case 0x0b: /* VT */
3656 case 0x0c: /* FF */
3657 case 0x0d: /* CR */
3658 case 0x85: /* NEL */
3659 RRETURN(MATCH_NOMATCH);
3660 }
3661 break;
3662
3663 case OP_VSPACE:
3664 switch(c)
3665 {
3666 default: RRETURN(MATCH_NOMATCH);
3667 case 0x0a: /* LF */
3668 case 0x0b: /* VT */
3669 case 0x0c: /* FF */
3670 case 0x0d: /* CR */
3671 case 0x85: /* NEL */
3672 break;
3673 }
3674 break;
3675
3676 case OP_NOT_DIGIT:
3677 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3678 break;
3679
3680 case OP_DIGIT:
3681 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3682 break;
3683
3684 case OP_NOT_WHITESPACE:
3685 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3686 break;
3687
3688 case OP_WHITESPACE:
3689 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3690 break;
3691
3692 case OP_NOT_WORDCHAR:
3693 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3694 break;
3695
3696 case OP_WORDCHAR:
3697 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3698 break;
3699
3700 default:
3701 RRETURN(PCRE_ERROR_INTERNAL);
3702 }
3703 }
3704 }
3705 /* Control never gets here */
3706 }
3707
3708 /* If maximizing, it is worth using inline code for speed, doing the type
3709 test once at the start (i.e. keep it out of the loop). Again, keep the
3710 UTF-8 and UCP stuff separate. */
3711
3712 else
3713 {
3714 pp = eptr; /* Remember where we started */
3715
3716 #ifdef SUPPORT_UCP
3717 if (prop_type >= 0)
3718 {
3719 switch(prop_type)
3720 {
3721 case PT_ANY:
3722 for (i = min; i < max; i++)
3723 {
3724 int len = 1;
3725 if (eptr >= md->end_subject) break;
3726 GETCHARLEN(c, eptr, len);
3727 if (prop_fail_result) break;
3728 eptr+= len;
3729 }
3730 break;
3731
3732 case PT_LAMP:
3733 for (i = min; i < max; i++)
3734 {
3735 int len = 1;
3736 if (eptr >= md->end_subject) break;
3737 GETCHARLEN(c, eptr, len);
3738 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3739 if ((prop_chartype == ucp_Lu ||
3740 prop_chartype == ucp_Ll ||
3741 prop_chartype == ucp_Lt) == prop_fail_result)
3742 break;
3743 eptr+= len;
3744 }
3745 break;
3746
3747 case PT_GC:
3748 for (i = min; i < max; i++)
3749 {
3750 int len = 1;
3751 if (eptr >= md->end_subject) break;
3752 GETCHARLEN(c, eptr, len);
3753 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3754 if ((prop_category == prop_value) == prop_fail_result)
3755 break;
3756 eptr+= len;
3757 }
3758 break;
3759
3760 case PT_PC:
3761 for (i = min; i < max; i++)
3762 {
3763 int len = 1;
3764 if (eptr >= md->end_subject) break;
3765 GETCHARLEN(c, eptr, len);
3766 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3767 if ((prop_chartype == prop_value) == prop_fail_result)
3768 break;
3769 eptr+= len;
3770 }
3771 break;
3772
3773 case PT_SC:
3774 for (i = min; i < max; i++)
3775 {
3776 int len = 1;
3777 if (eptr >= md->end_subject) break;
3778 GETCHARLEN(c, eptr, len);
3779 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3780 if ((prop_script == prop_value) == prop_fail_result)
3781 break;
3782 eptr+= len;
3783 }
3784 break;
3785 }
3786
3787 /* eptr is now past the end of the maximum run */
3788
3789 if (possessive) continue;
3790 for(;;)
3791 {
3792 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3794 if (eptr-- == pp) break; /* Stop if tried at original pos */
3795 if (utf8) BACKCHAR(eptr);
3796 }
3797 }
3798
3799 /* Match extended Unicode sequences. We will get here only if the
3800 support is in the binary; otherwise a compile-time error occurs. */
3801
3802 else if (ctype == OP_EXTUNI)
3803 {
3804 for (i = min; i < max; i++)
3805 {
3806 if (eptr >= md->end_subject) break;
3807 GETCHARINCTEST(c, eptr);
3808 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3809 if (prop_category == ucp_M) break;
3810 while (eptr < md->end_subject)
3811 {
3812 int len = 1;
3813 if (!utf8) c = *eptr; else
3814 {
3815 GETCHARLEN(c, eptr, len);
3816 }
3817 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3818 if (prop_category != ucp_M) break;
3819 eptr += len;
3820 }
3821 }
3822
3823 /* eptr is now past the end of the maximum run */
3824
3825 if (possessive) continue;
3826 for(;;)
3827 {
3828 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3829 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3830 if (eptr-- == pp) break; /* Stop if tried at original pos */
3831 for (;;) /* Move back over one extended */
3832 {
3833 int len = 1;
3834 if (!utf8) c = *eptr; else
3835 {
3836 BACKCHAR(eptr);
3837 GETCHARLEN(c, eptr, len);
3838 }
3839 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3840 if (prop_category != ucp_M) break;
3841 eptr--;
3842 }
3843 }
3844 }
3845
3846 else
3847 #endif /* SUPPORT_UCP */
3848
3849 #ifdef SUPPORT_UTF8
3850 /* UTF-8 mode */
3851
3852 if (utf8)
3853 {
3854 switch(ctype)
3855 {
3856 case OP_ANY:
3857 if (max < INT_MAX)
3858 {
3859 if ((ims & PCRE_DOTALL) == 0)
3860 {
3861 for (i = min; i < max; i++)
3862 {
3863 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3864 eptr++;
3865 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3866 }
3867 }
3868 else
3869 {
3870 for (i = min; i < max; i++)
3871 {
3872 if (eptr >= md->end_subject) break;
3873 eptr++;
3874 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3875 }
3876 }
3877 }
3878
3879 /* Handle unlimited UTF-8 repeat */
3880
3881 else
3882 {
3883 if ((ims & PCRE_DOTALL) == 0)
3884 {
3885 for (i = min; i < max; i++)
3886 {
3887 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3888 eptr++;
3889 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3890 }
3891 }
3892 else
3893 {
3894 eptr = md->end_subject;
3895 }
3896 }
3897 break;
3898
3899 /* The byte case is the same as non-UTF8 */
3900
3901 case OP_ANYBYTE:
3902 c = max - min;
3903 if (c > (unsigned int)(md->end_subject - eptr))
3904 c = md->end_subject - eptr;
3905 eptr += c;
3906 break;
3907
3908 case OP_ANYNL:
3909 for (i = min; i < max; i++)
3910 {
3911 int len = 1;
3912 if (eptr >= md->end_subject) break;
3913 GETCHARLEN(c, eptr, len);
3914 if (c == 0x000d)
3915 {
3916 if (++eptr >= md->end_subject) break;
3917 if (*eptr == 0x000a) eptr++;
3918 }
3919 else
3920 {
3921 if (c != 0x000a &&
3922 (md->bsr_anycrlf ||
3923 (c != 0x000b && c != 0x000c &&
3924 c != 0x0085 && c != 0x2028 && c != 0x2029)))
3925 break;
3926 eptr += len;
3927 }
3928 }
3929 break;
3930
3931 case OP_NOT_HSPACE:
3932 case OP_HSPACE:
3933 for (i = min; i < max; i++)
3934 {
3935 BOOL gotspace;
3936 int len = 1;
3937 if (eptr >= md->end_subject) break;
3938 GETCHARLEN(c, eptr, len);
3939 switch(c)
3940 {
3941 default: gotspace = FALSE; break;
3942 case 0x09: /* HT */
3943 case 0x20: /* SPACE */
3944 case 0xa0: /* NBSP */
3945 case 0x1680: /* OGHAM SPACE MARK */
3946 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3947 case 0x2000: /* EN QUAD */
3948 case 0x2001: /* EM QUAD */
3949 case 0x2002: /* EN SPACE */
3950 case 0x2003: /* EM SPACE */
3951 case 0x2004: /* THREE-PER-EM SPACE */
3952 case 0x2005: /* FOUR-PER-EM SPACE */
3953 case 0x2006: /* SIX-PER-EM SPACE */
3954 case 0x2007: /* FIGURE SPACE */
3955 case 0x2008: /* PUNCTUATION SPACE */
3956 case 0x2009: /* THIN SPACE */
3957 case 0x200A: /* HAIR SPACE */
3958 case 0x202f: /* NARROW NO-BREAK SPACE */
3959 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3960 case 0x3000: /* IDEOGRAPHIC SPACE */
3961 gotspace = TRUE;
3962 break;
3963 }
3964 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3965 eptr += len;
3966 }
3967 break;
3968
3969 case OP_NOT_VSPACE:
3970 case OP_VSPACE:
3971 for (i = min; i < max; i++)
3972 {
3973 BOOL gotspace;
3974 int len = 1;
3975 if (eptr >= md->end_subject) break;
3976 GETCHARLEN(c, eptr, len);
3977 switch(c)
3978 {
3979 default: gotspace = FALSE; break;
3980 case 0x0a: /* LF */
3981 case 0x0b: /* VT */
3982 case 0x0c: /* FF */
3983 case 0x0d: /* CR */
3984 case 0x85: /* NEL */
3985 case 0x2028: /* LINE SEPARATOR */
3986 case 0x2029: /* PARAGRAPH SEPARATOR */
3987 gotspace = TRUE;
3988 break;
3989 }
3990 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3991 eptr += len;
3992 }
3993 break;
3994
3995 case OP_NOT_DIGIT:
3996 for (i = min; i < max; i++)
3997 {
3998 int len = 1;
3999 if (eptr >= md->end_subject) break;
4000 GETCHARLEN(c, eptr, len);
4001 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4002 eptr+= len;
4003 }
4004 break;
4005
4006 case OP_DIGIT:
4007 for (i = min; i < max; i++)
4008 {
4009 int len = 1;
4010 if (eptr >= md->end_subject) break;
4011 GETCHARLEN(c, eptr, len);
4012 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4013 eptr+= len;
4014 }
4015 break;
4016
4017 case OP_NOT_WHITESPACE:
4018 for (i = min; i < max; i++)
4019 {
4020 int len = 1;
4021 if (eptr >= md->end_subject) break;
4022 GETCHARLEN(c, eptr, len);
4023 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4024 eptr+= len;
4025 }
4026 break;
4027
4028 case OP_WHITESPACE:
4029 for (i = min; i < max; i++)
4030 {
4031 int len = 1;
4032 if (eptr >= md->end_subject) break;
4033 GETCHARLEN(c, eptr, len);
4034 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4035 eptr+= len;
4036 }
4037 break;
4038
4039 case OP_NOT_WORDCHAR:
4040 for (i = min; i < max; i++)
4041 {
4042 int len = 1;
4043 if (eptr >= md->end_subject) break;
4044 GETCHARLEN(c, eptr, len);
4045 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4046 eptr+= len;
4047 }
4048 break;
4049
4050 case OP_WORDCHAR:
4051 for (i = min; i < max; i++)
4052 {
4053 int len = 1;
4054 if (eptr >= md->end_subject) break;
4055 GETCHARLEN(c, eptr, len);
4056 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4057 eptr+= len;
4058 }
4059 break;
4060
4061 default:
4062 RRETURN(PCRE_ERROR_INTERNAL);
4063 }
4064
4065 /* eptr is now past the end of the maximum run */
4066
4067 if (possessive) continue;
4068 for(;;)
4069 {
4070 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4071 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4072 if (eptr-- == pp) break; /* Stop if tried at original pos */
4073 BACKCHAR(eptr);
4074 }
4075 }
4076 else
4077 #endif /* SUPPORT_UTF8 */
4078
4079 /* Not UTF-8 mode */
4080 {
4081 switch(ctype)
4082 {
4083 case OP_ANY:
4084 if ((ims & PCRE_DOTALL) == 0)
4085 {
4086 for (i = min; i < max; i++)
4087 {
4088 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4089 eptr++;
4090 }
4091 break;
4092 }
4093 /* For DOTALL case, fall through and treat as \C */
4094
4095 case OP_ANYBYTE:
4096 c = max - min;
4097 if (c > (unsigned int)(md->end_subject - eptr))
4098 c = md->end_subject - eptr;
4099 eptr += c;
4100 break;
4101
4102 case OP_ANYNL:
4103 for (i = min; i < max; i++)
4104 {
4105 if (eptr >= md->end_subject) break;
4106 c = *eptr;
4107 if (c == 0x000d)
4108 {
4109 if (++eptr >= md->end_subject) break;
4110 if (*eptr == 0x000a) eptr++;
4111 }
4112 else
4113 {
4114 if (c != 0x000a &&
4115 (md->bsr_anycrlf ||
4116 (c != 0x000b && c != 0x000c && c != 0x0085)))
4117 break;
4118 eptr++;
4119 }
4120 }
4121 break;
4122
4123 case OP_NOT_HSPACE:
4124 for (i = min; i < max; i++)
4125 {
4126 if (eptr >= md->end_subject) break;
4127 c = *eptr;
4128 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4129 eptr++;
4130 }
4131 break;
4132
4133 case OP_HSPACE:
4134 for (i = min; i < max; i++)
4135 {
4136 if (eptr >= md->end_subject) break;
4137 c = *eptr;
4138 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4139 eptr++;
4140 }
4141 break;
4142
4143 case OP_NOT_VSPACE:
4144 for (i = min; i < max; i++)
4145 {
4146 if (eptr >= md->end_subject) break;
4147 c = *eptr;
4148 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4149 break;
4150 eptr++;
4151 }
4152 break;
4153
4154 case OP_VSPACE:
4155 for (i = min; i < max; i++)
4156 {
4157 if (eptr >= md->end_subject) break;
4158 c = *eptr;
4159 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4160 break;
4161 eptr++;
4162 }
4163 break;
4164
4165 case OP_NOT_DIGIT:
4166 for (i = min; i < max; i++)
4167 {
4168 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4169 break;
4170 eptr++;
4171 }
4172 break;
4173
4174 case OP_DIGIT:
4175 for (i = min; i < max; i++)
4176 {
4177 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4178 break;
4179 eptr++;
4180 }
4181 break;
4182
4183 case OP_NOT_WHITESPACE:
4184 for (i = min; i < max; i++)
4185 {
4186 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4187 break;
4188 eptr++;
4189 }
4190 break;
4191
4192 case OP_WHITESPACE:
4193 for (i = min; i < max; i++)
4194 {
4195 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4196 break;
4197 eptr++;
4198 }
4199 break;
4200
4201 case OP_NOT_WORDCHAR:
4202 for (i = min; i < max; i++)
4203 {
4204 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4205 break;
4206 eptr++;
4207 }
4208 break;
4209
4210 case OP_WORDCHAR:
4211 for (i = min; i < max; i++)
4212 {
4213 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4214 break;
4215 eptr++;
4216 }
4217 break;
4218
4219 default:
4220 RRETURN(PCRE_ERROR_INTERNAL);
4221 }
4222
4223 /* eptr is now past the end of the maximum run */
4224
4225 if (possessive) continue;
4226 while (eptr >= pp)
4227 {
4228 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4229 eptr--;
4230 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4231 }
4232 }
4233
4234 /* Get here if we can't make it match with any permitted repetitions */
4235
4236 RRETURN(MATCH_NOMATCH);
4237 }
4238 /* Control never gets here */
4239
4240 /* There's been some horrible disaster. Arrival here can only mean there is
4241 something seriously wrong in the code above or the OP_xxx definitions. */
4242
4243 default:
4244 DPRINTF(("Unknown opcode %d\n", *ecode));
4245 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4246 }
4247
4248 /* Do not stick any code in here without much thought; it is assumed
4249 that "continue" in the code above comes out to here to repeat the main
4250 loop. */
4251
4252 } /* End of main loop */
4253 /* Control never reaches here */
4254
4255
4256 /* When compiling to use the heap rather than the stack for recursive calls to
4257 match(), the RRETURN() macro jumps here. The number that is saved in
4258 frame->Xwhere indicates which label we actually want to return to. */
4259
4260 #ifdef NO_RECURSE
4261 #define LBL(val) case val: goto L_RM##val;
4262 HEAP_RETURN:
4263 switch (frame->Xwhere)
4264 {
4265 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4266 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4267 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4268 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4269 LBL(53) LBL(54)
4270 #ifdef SUPPORT_UTF8
4271 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4272 LBL(32) LBL(34) LBL(42) LBL(46)
4273 #ifdef SUPPORT_UCP
4274 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4275 #endif /* SUPPORT_UCP */
4276 #endif /* SUPPORT_UTF8 */
4277 default:
4278 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4279 return PCRE_ERROR_INTERNAL;
4280 }
4281 #undef LBL
4282 #endif /* NO_RECURSE */
4283 }
4284
4285
4286 /***************************************************************************
4287 ****************************************************************************
4288 RECURSION IN THE match() FUNCTION
4289
4290 Undefine all the macros that were defined above to handle this. */
4291
4292 #ifdef NO_RECURSE /* heap-frame build only: the names below were macros while match() was compiled */
4293 #undef eptr /* first group: eptr, ecode, offset_top, ims and eptrb are what match() hands to RMATCH() */
4294 #undef ecode
4295 #undef mstart
4296 #undef offset_top
4297 #undef ims /* pcre_exec() below declares its own ims, so the macro must be gone by then */
4298 #undef eptrb
4299 #undef flags /* likewise reused as a plain local name in pcre_exec() */
4300 /* NOTE(review): the next group looks like match()'s working variables (pp records where a maximizing repeat started) -- confirm against the #define list */
4301 #undef callpat
4302 #undef charptr
4303 #undef data
4304 #undef next
4305 #undef pp
4306 #undef prev
4307 #undef saved_eptr
4308
4309 #undef new_recursive
4310
4311 #undef cur_is_word
4312 #undef condition
4313 #undef prev_is_word
4314
4315 #undef original_ims
4316 /* ctype, min and max drive the repeat loops in match(); the #defines for the remaining names are outside this view */
4317 #undef ctype
4318 #undef length /* also the name of a pcre_exec() parameter */
4319 #undef max
4320 #undef min
4321 #undef number
4322 #undef offset
4323 #undef op
4324 #undef save_capture_last
4325 #undef save_offset1
4326 #undef save_offset2
4327 #undef save_offset3
4328 #undef stacksave
4329
4330 #undef newptrb
4331
4332 #endif /* NO_RECURSE */
4333
4334 /* These two are defined as macros in both cases (apparently with and without NO_RECURSE), so they are undefined unconditionally. */
4335
4336 #undef fc
4337 #undef fi /* fi is the counter of the minimizing repeat loops in match() */
4338
4339 /***************************************************************************
4340 ***************************************************************************/
4341
4342
4343
4344 /*************************************************
4345 * Execute a Regular Expression *
4346 *************************************************/
4347
4348 /* This function applies a compiled re to a subject string and picks out
4349 portions of the string if it matches. Two elements in the vector are set for
4350 each substring: the offsets to the start and end of the substring.
4351
4352 Arguments:
4353 argument_re points to the compiled expression
4354 extra_data points to extra data or is NULL
4355 subject points to the subject string
4356 length length of subject string (may contain binary zeros)
4357 start_offset where to start in the subject string
4358 options option bits
4359 offsets points to a vector of ints to be filled in with offsets
4360 offsetcount the number of elements in the vector
4361
4362 Returns: > 0 => success; value is the number of elements filled in
4363 = 0 => success, but offsets is not big enough
4364 -1 => failed to match
4365 < -1 => some kind of unexpected problem
4366 */
4367
4368 PCRE_EXP_DEFN int
4369 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4370 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4371 int offsetcount)
4372 {
4373 int rc, resetcount, ocount;
4374 int first_byte = -1;
4375 int req_byte = -1;
4376 int req_byte2 = -1;
4377 int newline;
4378 unsigned long int ims;
4379 BOOL using_temporary_offsets = FALSE;
4380 BOOL anchored;
4381 BOOL startline;
4382 BOOL firstline;
4383 BOOL first_byte_caseless = FALSE;
4384 BOOL req_byte_caseless = FALSE;
4385 BOOL utf8;
4386 match_data match_block;
4387 match_data *md = &match_block;
4388 const uschar *tables;
4389 const uschar *start_bits = NULL;
4390 USPTR start_match = (USPTR)subject + start_offset;
4391 USPTR end_subject;
4392 USPTR req_byte_ptr = start_match - 1;
4393
4394 pcre_study_data internal_study;
4395 const pcre_study_data *study;
4396
4397 real_pcre internal_re;
4398 const real_pcre *external_re = (const real_pcre *)argument_re;
4399 const real_pcre *re = external_re;
4400
4401 /* Plausibility checks */
4402
4403 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4404 if (re == NULL || subject == NULL ||
4405 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4406 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4407
4408 /* Fish out the optional data from the extra_data structure, first setting
4409 the default values. */
4410
4411 study = NULL;
4412 md->match_limit = MATCH_LIMIT;
4413 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4414 md->callout_data = NULL;
4415
4416 /* The table pointer is always in native byte order. */
4417
4418 tables = external_re->tables;
4419
4420 if (extra_data != NULL)
4421 {
4422 register unsigned int flags = extra_data->flags;
4423 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4424 study = (const pcre_study_data *)extra_data->study_data;
4425 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4426 md->match_limit = extra_data->match_limit;
4427 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4428 md->match_limit_recursion = extra_data->match_limit_recursion;
4429 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4430 md->callout_data = extra_data->callout_data;
4431 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4432 }
4433
4434 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4435 is a feature that makes it possible to save compiled regex and re-use them
4436 in other programs later. */
4437
4438 if (tables == NULL) tables = _pcre_default_tables;
4439
4440 /* Check that the first field in the block is the magic number. If it is not,
4441 test for a regex that was compiled on a host of opposite endianness. If this is
4442 the case, flipped values are put in internal_re and internal_study if there was
4443 study data too. */
4444
4445 if (re->magic_number != MAGIC_NUMBER)
4446 {
4447 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4448 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4449 if (study != NULL) study = &internal_study;
4450 }
4451
4452 /* Set up other data */
4453
4454 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4455 startline = (re->flags & PCRE_STARTLINE) != 0;
4456 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4457
4458 /* The code starts after the real_pcre block and the capture name table. */
4459
4460 md->start_code = (const uschar *)external_re + re->name_table_offset +
4461 re->name_count * re->name_entry_size;
4462
4463 md->start_subject = (USPTR)subject;
4464 md->start_offset = start_offset;
4465 md->end_subject = md->start_subject + length;
4466 end_subject = md->end_subject;
4467
4468 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4469 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4470 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4471
4472 md->notbol = (options & PCRE_NOTBOL) != 0;
4473 md->noteol = (options & PCRE_NOTEOL) != 0;
4474 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4475 md->partial = (options & PCRE_PARTIAL) != 0;
4476 md->hitend = FALSE;
4477
4478 md->recursive = NULL; /* No recursion at top level */
4479
4480 md->lcc = tables + lcc_offset;
4481 md->ctypes = tables + ctypes_offset;
4482
4483 /* Handle different \R options. */
4484
4485 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4486 {
4487 case 0:
4488 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4489 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4490 else
4491 #ifdef BSR_ANYCRLF
4492 md->bsr_anycrlf = TRUE;
4493 #else
4494 md->bsr_anycrlf = FALSE;
4495 #endif
4496 break;
4497
4498 case PCRE_BSR_ANYCRLF:
4499 md->bsr_anycrlf = TRUE;
4500 break;
4501
4502 case PCRE_BSR_UNICODE:
4503 md->bsr_anycrlf = FALSE;
4504 break;
4505
4506 default: return PCRE_ERROR_BADNEWLINE;
4507 }
4508
4509 /* Handle different types of newline. The three bits give eight cases. If
4510 nothing is set at run time, whatever was used at compile time applies. */
4511
4512 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4513 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4514 {
4515 case 0: newline = NEWLINE; break; /* Compile-time default */
4516 case PCRE_NEWLINE_CR: newline = '\r'; break;
4517 case PCRE_NEWLINE_LF: newline = '\n'; break;
4518 case PCRE_NEWLINE_CR+
4519 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4520 case PCRE_NEWLINE_ANY: newline = -1; break;
4521 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4522 default: return PCRE_ERROR_BADNEWLINE;
4523 }
4524
4525 if (newline == -2)
4526 {
4527 md->nltype = NLTYPE_ANYCRLF;
4528 }
4529 else if (newline < 0)
4530 {
4531 md->nltype = NLTYPE_ANY;
4532 }
4533 else
4534 {
4535 md->nltype = NLTYPE_FIXED;
4536 if (newline > 255)
4537 {
4538 md->nllen = 2;
4539 md->nl[0] = (newline >> 8) & 255;
4540 md->nl[1] = newline & 255;
4541 }
4542 else
4543 {
4544 md->nllen = 1;
4545 md->nl[0] = newline;
4546 }
4547 }
4548
4549 /* Partial matching is supported only for a restricted set of regexes at the
4550 moment. */
4551
4552 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4553 return PCRE_ERROR_BADPARTIAL;
4554
4555 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4556 back the character offset. */
4557
4558 #ifdef SUPPORT_UTF8
4559 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4560 {
4561 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4562 return PCRE_ERROR_BADUTF8;
4563 if (start_offset > 0 && start_offset < length)
4564 {
4565 int tb = ((uschar *)subject)[start_offset];
4566 if (tb > 127)
4567 {
4568 tb &= 0xc0;
4569 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4570 }
4571 }
4572 }
4573 #endif
4574
4575 /* The ims options can vary during the matching as a result of the presence
4576 of (?ims) items in the pattern. They are kept in a local variable so that
4577 restoring at the exit of a group is easy. */
4578
4579 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4580
4581 /* If the expression has got more back references than the offsets supplied can
4582 hold, we get a temporary chunk of working store to use during the matching.
4583 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4584 of 3. */
4585
4586 ocount = offsetcount - (offsetcount % 3);
4587
4588 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4589 {
4590 ocount = re->top_backref * 3 + 3;
4591 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4592 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4593 using_temporary_offsets = TRUE;
4594 DPRINTF(("Got memory to hold back references\n"));
4595 }
4596 else md->offset_vector = offsets;
4597
4598 md->offset_end = ocount;
4599 md->offset_max = (2*ocount)/3;
4600 md->offset_overflow = FALSE;
4601 md->capture_last = -1;
4602
4603 /* Compute the minimum number of offsets that we need to reset each time. Doing
4604 this makes a huge difference to execution time when there aren't many brackets
4605 in the pattern. */
4606
4607 resetcount = 2 + re->top_bracket * 2;
4608 if (resetcount > offsetcount) resetcount = ocount;
4609
4610 /* Reset the working variable associated with each extraction. These should
4611 never be used unless previously set, but they get saved and restored, and so we
4612 initialize them to avoid reading uninitialized locations. */
4613
4614 if (md->offset_vector != NULL)
4615 {
4616 register int *iptr = md->offset_vector + ocount;
4617 register int *iend = iptr - resetcount/2 + 1;
4618 while (--iptr >= iend) *iptr = -1;
4619 }
4620
4621 /* Set up the first character to match, if available. The first_byte value is
4622 never set for an anchored regular expression, but the anchoring may be forced
4623 at run time, so we have to test for anchoring. The first char may be unset for
4624 an unanchored pattern, of course. If there's no first char and the pattern was
4625 studied, there may be a bitmap of possible first characters. */
4626
4627 if (!anchored)
4628 {
4629 if ((re->flags & PCRE_FIRSTSET) != 0)
4630 {
4631 first_byte = re->first_byte & 255;
4632 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4633 first_byte = md->lcc[first_byte];
4634 }
4635 else
4636 if (!startline && study != NULL &&
4637 (study->options & PCRE_STUDY_MAPPED) != 0)
4638 start_bits = study->start_bits;
4639 }
4640
4641 /* For anchored or unanchored matches, there may be a "last known required
4642 character" set. */
4643
4644 if ((re->flags & PCRE_REQCHSET) != 0)
4645 {
4646 req_byte = re->req_byte & 255;
4647 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4648 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4649 }
4650
4651
4652 /* ==========================================================================*/
4653
4654 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4655 the loop runs just once. */
4656
4657 for(;;)
4658 {
4659 USPTR save_end_subject = end_subject;
4660 USPTR new_start_match;
4661
4662 /* Reset the maximum number of extractions we might see. */
4663
4664 if (md->offset_vector != NULL)
4665 {
4666 register int *iptr = md->offset_vector;
4667 register int *iend = iptr + resetcount;
4668 while (iptr < iend) *iptr++ = -1;
4669 }
4670
4671 /* Advance to a unique first char if possible. If firstline is TRUE, the
4672 start of the match is constrained to the first line of a multiline string.
4673 That is, the match must be before or at the first newline. Implement this by
4674 temporarily adjusting end_subject so that we stop scanning at a newline. If
4675 the match fails at the newline, later code breaks this loop. */
4676
4677 if (firstline)
4678 {
4679 USPTR t = start_match;
4680 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4681 end_subject = t;
4682 }
4683
4684 /* Now test for a unique first byte */
4685
4686 if (first_byte >= 0)
4687 {
4688 if (first_byte_caseless)
4689 while (start_match < end_subject &&
4690 md->lcc[*start_match] != first_byte)
4691 { NEXTCHAR(start_match); }
4692 else
4693 while (start_match < end_subject && *start_match != first_byte)
4694 { NEXTCHAR(start_match); }
4695 }
4696
4697 /* Or to just after a linebreak for a multiline match if possible */
4698
4699 else if (startline)
4700 {
4701 if (start_match > md->start_subject + start_offset)
4702 {
4703 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4704 { NEXTCHAR(start_match); }
4705
4706 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4707 and we are now at a LF, advance the match position by one more character.
4708 */
4709
4710 if (start_match[-1] == '\r' &&
4711 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4712 start_match < end_subject &&
4713 *start_match == '\n')
4714 start_match++;
4715 }
4716 }
4717
4718 /* Or to a non-unique first char after study */
4719
4720 else if (start_bits != NULL)
4721 {
4722 while (start_match < end_subject)
4723 {
4724 register unsigned int c = *start_match;
4725 if ((start_bits[c/8] & (1 << (c&7))) == 0)
4726 { NEXTCHAR(start_match); }
4727 else break;
4728 }
4729 }
4730
4731 /* Restore fudged end_subject */
4732
4733 end_subject = save_end_subject;
4734
4735 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4736 printf(">>>> Match against: ");
4737 pchars(start_match, end_subject - start_match, TRUE, md);
4738 printf("\n");
4739 #endif
4740
4741 /* If req_byte is set, we know that that character must appear in the subject
4742 for the match to succeed. If the first character is set, req_byte must be
4743 later in the subject; otherwise the test starts at the match point. This
4744 optimization can save a huge amount of backtracking in patterns with nested
4745 unlimited repeats that aren't going to match. Writing separate code for
4746 cased/caseless versions makes it go faster, as does using an autoincrement
4747 and backing off on a match.
4748
4749 HOWEVER: when the subject string is very, very long, searching to its end can
4750 take a long time, and give bad performance on quite ordinary patterns. This
4751 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4752 string... so we don't do this when the string is sufficiently long.
4753
4754 ALSO: this processing is disabled when partial matching is requested.
4755 */
4756
4757 if (req_byte >= 0 &&
4758 end_subject - start_match < REQ_BYTE_MAX &&
4759 !md->partial)
4760 {
4761 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4762
4763 /* We don't need to repeat the search if we haven't yet reached the
4764 place we found it at last time. */
4765
4766 if (p > req_byte_ptr)
4767 {
4768 if (req_byte_caseless)
4769 {
4770 while (p < end_subject)
4771 {
4772 register int pp = *p++;
4773 if (pp == req_byte || pp == req_byte2) { p--; break; }
4774 }
4775 }
4776 else
4777 {
4778 while (p < end_subject)
4779 {
4780 if (*p++ == req_byte) { p--; break; }
4781 }
4782 }
4783
4784 /* If we can't find the required character, break the matching loop,
4785 forcing a match failure. */
4786
4787 if (p >= end_subject)
4788 {
4789 rc = MATCH_NOMATCH;
4790 break;
4791 }
4792
4793 /* If we have found the required character, save the point where we
4794 found it, so that we don't search again next time round the loop if
4795 the start hasn't passed this character yet. */
4796
4797 req_byte_ptr = p;
4798 }
4799 }
4800
4801 /* OK, we can now run the match. */
4802
4803 md->start_match_ptr = start_match;
4804 md->match_call_count = 0;
4805 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4806
4807 switch(rc)
4808 {
4809 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4810 exactly like PRUNE. */
4811
4812 case MATCH_NOMATCH:
4813 case MATCH_PRUNE:
4814 case MATCH_THEN:
4815 new_start_match = start_match + 1;
4816 #ifdef SUPPORT_UTF8
4817 if (utf8)
4818 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4819 new_start_match++;
4820 #endif
4821 break;
4822
4823 /* SKIP passes back the next starting point explicitly. */
4824
4825 case MATCH_SKIP:
4826 new_start_match = md->start_match_ptr;
4827 break;
4828
4829 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4830
4831 case MATCH_COMMIT:
4832 rc = MATCH_NOMATCH;
4833 goto ENDLOOP;
4834
4835 /* Any other return is some kind of error. */
4836
4837 default:
4838 goto ENDLOOP;
4839 }
4840
4841 /* Control reaches here for the various types of "no match at this point"
4842 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4843
4844 rc = MATCH_NOMATCH;
4845
4846 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4847 newline in the subject (though it may continue over the newline). Therefore,
4848 if we have just failed to match, starting at a newline, do not continue. */
4849
4850 if (firstline && IS_NEWLINE(start_match)) break;
4851
4852 /* Advance to new matching position */
4853
4854 start_match = new_start_match;
4855
4856 /* Break the loop if the pattern is anchored or if we have passed the end of
4857 the subject. */
4858
4859 if (anchored || start_match > end_subject) break;
4860
4861 /* If we have just passed a CR and we are now at a LF, and the pattern does
4862 not contain any explicit matches for \r or \n, and the newline option is CRLF
4863 or ANY or ANYCRLF, advance the match position by one more character. */
4864
4865 if (start_match[-1] == '\r' &&
4866 start_match < end_subject &&
4867 *start_match == '\n' &&
4868 (re->flags & PCRE_HASCRORLF) == 0 &&
4869 (md->nltype == NLTYPE_ANY ||
4870 md->nltype == NLTYPE_ANYCRLF ||
4871 md->nllen == 2))
4872 start_match++;
4873
4874 } /* End of for(;;) "bumpalong" loop */
4875
4876 /* ==========================================================================*/
4877
4878 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4879 conditions is true:
4880
4881 (1) The pattern is anchored or the match was failed by (*COMMIT);
4882
4883 (2) We are past the end of the subject;
4884
4885 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4886 this option requests that a match occur at or before the first newline in
4887 the subject.
4888
4889 When we have a match and the offset vector is big enough to deal with any
4890 backreferences, captured substring offsets will already be set up. In the case
4891 where we had to get some local store to hold offsets for backreference
4892 processing, copy those that we can. In this case there need not be overflow if
4893 certain parts of the pattern were not used, even though there are more
4894 capturing parentheses than vector slots. */
4895
4896 ENDLOOP:
4897
4898 if (rc == MATCH_MATCH)
4899 {
4900 if (using_temporary_offsets)
4901 {
4902 if (offsetcount >= 4)
4903 {
4904 memcpy(offsets + 2, md->offset_vector + 2,
4905 (offsetcount - 2) * sizeof(int));
4906 DPRINTF(("Copied offsets from temporary memory\n"));
4907 }
4908 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4909 DPRINTF(("Freeing temporary memory\n"));
4910 (pcre_free)(md->offset_vector);
4911 }
4912
4913 /* Set the return code to the number of captured strings, or 0 if there are
4914 too many to fit into the vector. */
4915
4916 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4917
4918 /* If there is space, set up the whole thing as substring 0. The value of
4919 md->start_match_ptr might be modified if \K was encountered on the successful
4920 matching path. */
4921
4922 if (offsetcount < 2) rc = 0; else
4923 {
4924 offsets[0] = md->start_match_ptr - md->start_subject;
4925 offsets[1] = md->end_match_ptr - md->start_subject;
4926 }
4927
4928 DPRINTF((">>>> returning %d\n", rc));
4929 return rc;
4930 }
4931
4932 /* Control gets here if there has been an error, or if the overall match
4933 attempt has failed at all permitted starting positions. */
4934
4935 if (using_temporary_offsets)
4936 {
4937 DPRINTF(("Freeing temporary memory\n"));
4938 (pcre_free)(md->offset_vector);
4939 }
4940
4941 if (rc != MATCH_NOMATCH)
4942 {
4943 DPRINTF((">>>> error: returning %d\n", rc));
4944 return rc;
4945 }
4946 else if (md->partial && md->hitend)
4947 {
4948 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4949 return PCRE_ERROR_PARTIAL;
4950 }
4951 else
4952 {
4953 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4954 return PCRE_ERROR_NOMATCH;
4955 }
4956 }
4957
4958 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12