/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 342 - (show annotations) (download)
Sun Apr 20 17:10:13 2008 UTC (6 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 150789 byte(s)
Slight performance improvement by using the new OP_ALLANY opcode for cases of 
the metacharacter "." when DOTALL is set. Also, some tidies consequent upon its 
invention.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73
74 #define MATCH_COMMIT (-999)
75 #define MATCH_PRUNE (-998)
76 #define MATCH_SKIP (-997)
77 #define MATCH_THEN (-996)
78
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. */
82
83 #define REC_STACK_SAVE_MAX 30
84
85 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86
87 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
88 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caselesss case for speed */
162
163 if ((ims & PCRE_CASELESS) != 0)
164 {
165 while (length-- > 0)
166 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
167 }
168 else
169 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
170
171 return TRUE;
172 }
173
174
175
176 /***************************************************************************
177 ****************************************************************************
178 RECURSION IN THE match() FUNCTION
179
180 The match() function is highly recursive, though not every recursive call
181 increases the recursive depth. Nevertheless, some regular expressions can cause
182 it to recurse to a great depth. I was writing for Unix, so I just let it call
183 itself recursively. This uses the stack for saving everything that has to be
184 saved for a recursive call. On Unix, the stack can be large, and this works
185 fine.
186
187 It turns out that on some non-Unix-like systems there are problems with
188 programs that use a lot of stack. (This despite the fact that every last chip
189 has oodles of memory these days, and techniques for extending the stack have
190 been known for decades.) So....
191
192 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
193 calls by keeping local variables that need to be preserved in blocks of memory
194 obtained from malloc() instead of on the stack. Macros are used to
195 achieve this so that the actual code doesn't look very different to what it
196 always used to.
197
198 The original heap-recursive code used longjmp(). However, it seems that this
199 can be very slow on some operating systems. Following a suggestion from Stan
200 Switzer, the use of longjmp() has been abolished, at the cost of having to
201 provide a unique number for each call to RMATCH. There is no way of generating
202 a sequence of numbers at compile time in C. I have given them names, to make
203 them stand out more clearly.
204
205 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
206 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
207 tests. Furthermore, not using longjmp() means that local dynamic variables
208 don't have indeterminate values; this has meant that the frame size can be
209 reduced because the result can be "passed back" by straight setting of the
210 variable instead of being passed in the frame.
211 ****************************************************************************
212 ***************************************************************************/
213
214 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
215 below must be updated in sync. */
216
217 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
218 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
219 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
220 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
221 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
222 RM51, RM52, RM53, RM54 };
223
224 /* These versions of the macros use the stack, as normal. There are debugging
225 versions and production versions. Note that the "rw" argument of RMATCH isn't
226 actually used in this definition. */
227
228 #ifndef NO_RECURSE
229 #define REGISTER register
230
231 #ifdef DEBUG /* Tracing versions of the stack-based macros */
232 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) /* rw is not used in this expansion */ \
233 { \
234 printf("match() called in line %d\n", __LINE__); \
235 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); /* mstart is passed on as is; depth is one greater */ \
236 printf("to line %d\n", __LINE__); \
237 }
238 #define RRETURN(ra) \
239 { \
240 printf("match() returned %d from line %d ", ra, __LINE__); /* ra appears twice in this expansion */ \
241 return ra; \
242 }
243 #else /* Production versions: a plain recursive call and a plain return */
244 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
245 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1) /* result is left in rrc; rw is not used */
246 #define RRETURN(ra) return ra
247 #endif /* DEBUG */
248
249 #else
250
251
252 /* These versions of the macros manage a private stack on the heap. Note that
253 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
254 argument of match(), which never changes. */
255
256 #define REGISTER
257
/* RMATCH for NO_RECURSE builds. A new heapframe is obtained by calling
pcre_stack_malloc, the number "rw" is remembered in the current frame, the
argument values are copied into the new frame (mstart is taken from the current
frame, and the depth is one greater), and control goes to HEAP_RECURSE. The
label L_<rw> follows the jump; the DPRINTF after it reports that control has
jumped back there. If no memory can be obtained, the current frame is left by
means of RRETURN with PCRE_ERROR_NOMEMORY instead of storing through a NULL
pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN for NO_RECURSE builds. The current frame is released with
pcre_stack_free and the previous frame becomes current. If there is a previous
frame, the result is put into rrc and control goes to HEAP_RETURN; otherwise
this was the top-level frame and the function really returns. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
290
291
292 /* Structure for remembering the local variables in a private frame */
293
294 typedef struct heapframe { /* One frame per simulated call of match() when NO_RECURSE is defined */
295 struct heapframe *Xprevframe; /* Frame to go back to; NULL marks the top level */
296
297 /* Function arguments that may change */
298
299 const uschar *Xeptr; /* Pointer to current character in subject */
300 const uschar *Xecode; /* Pointer to current position in compiled code */
301 const uschar *Xmstart; /* Current match start position */
302 int Xoffset_top; /* Current top pointer */
303 long int Xims; /* Current /i, /m, and /s options. NOTE(review): signed here, but match() declares ims as unsigned long int - confirm this is intended */
304 eptrblock *Xeptrb; /* Chain of blocks containing eptr at start of brackets */
305 int Xflags; /* Can contain match_condassert and match_cbegroup */
306 unsigned int Xrdepth; /* Recursion depth; RMATCH sets it to the previous frame's value plus one */
307
308 /* Function local variables */
309
310 const uschar *Xcallpat;
311 const uschar *Xcharptr; /* The charptr macro that names this field exists only when SUPPORT_UTF8 is defined */
312 const uschar *Xdata;
313 const uschar *Xnext;
314 const uschar *Xpp;
315 const uschar *Xprev;
316 const uschar *Xsaved_eptr;
317
318 recursion_info Xnew_recursive;
319
320 BOOL Xcur_is_word;
321 BOOL Xcondition;
322 BOOL Xprev_is_word;
323
324 unsigned long int Xoriginal_ims; /* Value of ims on entry, saved for resetting on ')' */
325
326 #ifdef SUPPORT_UCP
327 int Xprop_type;
328 int Xprop_value;
329 int Xprop_fail_result;
330 int Xprop_category;
331 int Xprop_chartype;
332 int Xprop_script;
333 int Xoclength;
334 uschar Xocchars[8];
335 #endif
336
337 int Xctype;
338 unsigned int Xfc; /* A separate variable from c here; with true recursion fc is defined as c */
339 int Xfi; /* A separate variable from i here; with true recursion fi is defined as i */
340 int Xlength;
341 int Xmax;
342 int Xmin;
343 int Xnumber;
344 int Xoffset;
345 int Xop;
346 int Xsave_capture_last; /* md->capture_last as saved on entry to a capturing bracket */
347 int Xsave_offset1, Xsave_offset2, Xsave_offset3; /* Offset vector values saved on entry to a capturing bracket */
348 int Xstacksave[REC_STACK_SAVE_MAX]; /* Size is the maximum number of offset ints saved without using malloc */
349
350 eptrblock Xnewptrb; /* Added to the eptrb chain when match_cbegroup is set */
351
352 /* Where to jump back to */
353
354 int Xwhere; /* The RMn number stored by RMATCH before it sets up a new frame */
355
356 } heapframe;
357
358 #endif
359
360
361 /***************************************************************************
362 ***************************************************************************/
363
364
365
366 /*************************************************
367 * Match from current position *
368 *************************************************/
369
370 /* This function is called recursively in many circumstances. Whenever it
371 returns a negative (error) response, the outer incarnation must also return the
372 same response.
373
374 Performance note: It might be tempting to extract commonly used fields from the
375 md structure (e.g. utf8, end_subject) into individual variables to improve
376 performance. Tests using gcc on a SPARC disproved this; in the first case, it
377 made performance worse.
378
379 Arguments:
380 eptr pointer to current character in subject
381 ecode pointer to current position in compiled code
382 mstart pointer to the current match start position (can be modified
383 by encountering \K)
384 offset_top current top pointer
385 md pointer to "static" info for the match
386 ims current /i, /m, and /s options
387 eptrb pointer to chain of blocks containing eptr at start of
388 brackets - for testing for empty matches
389 flags can contain
390 match_condassert - this is an assertion condition
391 match_cbegroup - this is the start of an unlimited repeat
392 group that can match an empty string
393 rdepth the recursion depth
394
395 Returns: MATCH_MATCH if matched ) these values are >= 0
396 MATCH_NOMATCH if failed to match )
397 a negative PCRE_ERROR_xxx value if aborted by an error condition
398 (e.g. stopped by repeated call or recursion limit)
399 */
400
401 static int
402 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
403 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
404 int flags, unsigned int rdepth)
405 {
406 /* These variables do not need to be preserved over recursion in this function,
407 so they can be ordinary variables in all cases. Mark some of them with
408 "register" because they are used a lot in loops. */
409
410 register int rrc; /* Returns from recursive calls */
411 register int i; /* Used for loops not involving calls to RMATCH() */
412 register unsigned int c; /* Character values not kept over RMATCH() calls */
413 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
414
415 BOOL minimize, possessive; /* Quantifier options */
416
417 /* When recursion is not being used, all "local" variables that have to be
418 preserved over calls to RMATCH() are part of a "frame" which is obtained from
419 heap storage. Set up the top-level frame here; others are obtained from the
420 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
421
422 #ifdef NO_RECURSE
423 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
424 frame->Xprevframe = NULL; /* Marks the top level */
425
426 /* Copy in the original argument variables */
427
428 frame->Xeptr = eptr;
429 frame->Xecode = ecode;
430 frame->Xmstart = mstart;
431 frame->Xoffset_top = offset_top;
432 frame->Xims = ims;
433 frame->Xeptrb = eptrb;
434 frame->Xflags = flags;
435 frame->Xrdepth = rdepth;
436
437 /* This is where control jumps back to in order to effect "recursion" */
438
439 HEAP_RECURSE:
440
441 /* Macros make the argument variables come from the current frame */
442
443 #define eptr frame->Xeptr
444 #define ecode frame->Xecode
445 #define mstart frame->Xmstart
446 #define offset_top frame->Xoffset_top
447 #define ims frame->Xims
448 #define eptrb frame->Xeptrb
449 #define flags frame->Xflags
450 #define rdepth frame->Xrdepth
451
452 /* Ditto for the local variables */
453
454 #ifdef SUPPORT_UTF8
455 #define charptr frame->Xcharptr
456 #endif
457 #define callpat frame->Xcallpat
458 #define data frame->Xdata
459 #define next frame->Xnext
460 #define pp frame->Xpp
461 #define prev frame->Xprev
462 #define saved_eptr frame->Xsaved_eptr
463
464 #define new_recursive frame->Xnew_recursive
465
466 #define cur_is_word frame->Xcur_is_word
467 #define condition frame->Xcondition
468 #define prev_is_word frame->Xprev_is_word
469
470 #define original_ims frame->Xoriginal_ims
471
472 #ifdef SUPPORT_UCP
473 #define prop_type frame->Xprop_type
474 #define prop_value frame->Xprop_value
475 #define prop_fail_result frame->Xprop_fail_result
476 #define prop_category frame->Xprop_category
477 #define prop_chartype frame->Xprop_chartype
478 #define prop_script frame->Xprop_script
479 #define oclength frame->Xoclength
480 #define occhars frame->Xocchars
481 #endif
482
483 #define ctype frame->Xctype
484 #define fc frame->Xfc
485 #define fi frame->Xfi
486 #define length frame->Xlength
487 #define max frame->Xmax
488 #define min frame->Xmin
489 #define number frame->Xnumber
490 #define offset frame->Xoffset
491 #define op frame->Xop
492 #define save_capture_last frame->Xsave_capture_last
493 #define save_offset1 frame->Xsave_offset1
494 #define save_offset2 frame->Xsave_offset2
495 #define save_offset3 frame->Xsave_offset3
496 #define stacksave frame->Xstacksave
497
498 #define newptrb frame->Xnewptrb
499
500 /* When recursion is being used, local variables are allocated on the stack and
501 get preserved during recursion in the normal way. In this environment, fi and
502 i, and fc and c, can be the same variables. */
503
504 #else /* NO_RECURSE not defined */
505 #define fi i
506 #define fc c
507
508
509 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
510 const uschar *charptr; /* in small blocks of the code. My normal */
511 #endif /* style of coding would have declared */
512 const uschar *callpat; /* them within each of those blocks. */
513 const uschar *data; /* However, in order to accommodate the */
514 const uschar *next; /* version of this code that uses an */
515 USPTR pp; /* external "stack" implemented on the */
516 const uschar *prev; /* heap, it is easier to declare them all */
517 USPTR saved_eptr; /* here, so the declarations can be cut */
518 /* out in a block. The only declarations */
519 recursion_info new_recursive; /* within blocks below are for variables */
520 /* that do not have to be preserved over */
521 BOOL cur_is_word; /* a recursive call to RMATCH(). */
522 BOOL condition;
523 BOOL prev_is_word;
524
525 unsigned long int original_ims;
526
527 #ifdef SUPPORT_UCP
528 int prop_type;
529 int prop_value;
530 int prop_fail_result;
531 int prop_category;
532 int prop_chartype;
533 int prop_script;
534 int oclength;
535 uschar occhars[8];
536 #endif
537
538 int ctype;
539 int length;
540 int max;
541 int min;
542 int number;
543 int offset;
544 int op;
545 int save_capture_last;
546 int save_offset1, save_offset2, save_offset3;
547 int stacksave[REC_STACK_SAVE_MAX];
548
549 eptrblock newptrb;
550 #endif /* NO_RECURSE */
551
552 /* These statements are here to stop the compiler complaining about uninitialized
553 variables. */
554
555 #ifdef SUPPORT_UCP
556 prop_value = 0;
557 prop_fail_result = 0;
558 #endif
559
560
561 /* This label is used for tail recursion, which is used in a few cases even
562 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
563 used. Thanks to Ian Taylor for noticing this possibility and sending the
564 original patch. */
565
566 TAIL_RECURSE:
567
568 /* OK, now we can get on with the real code of the function. Recursive calls
569 are specified by the macro RMATCH and RRETURN is used to return. When
570 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
571 and a "return", respectively (possibly with some debugging if DEBUG is
572 defined). However, RMATCH isn't like a function call because it's quite a
573 complicated macro. It has to be used in one particular way. This shouldn't,
574 however, impact performance when true recursion is being used. */
575
576 #ifdef SUPPORT_UTF8
577 utf8 = md->utf8; /* Local copy of the flag */
578 #else
579 utf8 = FALSE;
580 #endif
581
582 /* First check that we haven't called match() too many times, or that we
583 haven't exceeded the recursive call limit. */
584
585 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
586 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
587
588 original_ims = ims; /* Save for resetting on ')' */
589
590 /* At the start of a group with an unlimited repeat that may match an empty
591 string, the match_cbegroup flag is set. When this is the case, add the current
592 subject pointer to the chain of such remembered pointers, to be checked when we
593 hit the closing ket, in order to break infinite loops that match no characters.
594 When match() is called in other circumstances, don't add to the chain. The
595 match_cbegroup flag must NOT be used with tail recursion, because the memory
596 block that is used is on the stack, so a new one may be required for each
597 match(). */
598
599 if ((flags & match_cbegroup) != 0)
600 {
601 newptrb.epb_saved_eptr = eptr;
602 newptrb.epb_prev = eptrb;
603 eptrb = &newptrb;
604 }
605
606 /* Now start processing the opcodes. */
607
608 for (;;)
609 {
610 minimize = possessive = FALSE;
611 op = *ecode;
612
613 /* For partial matching, remember if we ever hit the end of the subject after
614 matching at least one subject character. */
615
616 if (md->partial &&
617 eptr >= md->end_subject &&
618 eptr > mstart)
619 md->hitend = TRUE;
620
621 switch(op)
622 {
623 case OP_FAIL:
624 RRETURN(MATCH_NOMATCH);
625
626 case OP_PRUNE:
627 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
628 ims, eptrb, flags, RM51);
629 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
630 RRETURN(MATCH_PRUNE);
631
632 case OP_COMMIT:
633 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
634 ims, eptrb, flags, RM52);
635 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
636 RRETURN(MATCH_COMMIT);
637
638 case OP_SKIP:
639 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
640 ims, eptrb, flags, RM53);
641 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
642 md->start_match_ptr = eptr; /* Pass back current position */
643 RRETURN(MATCH_SKIP);
644
645 case OP_THEN:
646 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
647 ims, eptrb, flags, RM54);
648 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
649 RRETURN(MATCH_THEN);
650
651 /* Handle a capturing bracket. If there is space in the offset vector, save
652 the current subject position in the working slot at the top of the vector.
653 We mustn't change the current values of the data slot, because they may be
654 set from a previous iteration of this group, and be referred to by a
655 reference inside the group.
656
657 If the bracket fails to match, we need to restore this value and also the
658 values of the final offsets, in case they were set by a previous iteration
659 of the same bracket.
660
661 If there isn't enough space in the offset vector, treat this as if it were
662 a non-capturing bracket. Don't worry about setting the flag for the error
663 case here; that is handled in the code for KET. */
664
665 case OP_CBRA:
666 case OP_SCBRA:
667 number = GET2(ecode, 1+LINK_SIZE);
668 offset = number << 1;
669
670 #ifdef DEBUG
671 printf("start bracket %d\n", number);
672 printf("subject=");
673 pchars(eptr, 16, TRUE, md);
674 printf("\n");
675 #endif
676
677 if (offset < md->offset_max)
678 {
679 save_offset1 = md->offset_vector[offset];
680 save_offset2 = md->offset_vector[offset+1];
681 save_offset3 = md->offset_vector[md->offset_end - number];
682 save_capture_last = md->capture_last;
683
684 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
685 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
686
687 flags = (op == OP_SCBRA)? match_cbegroup : 0;
688 do
689 {
690 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
691 ims, eptrb, flags, RM1);
692 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
693 md->capture_last = save_capture_last;
694 ecode += GET(ecode, 1);
695 }
696 while (*ecode == OP_ALT);
697
698 DPRINTF(("bracket %d failed\n", number));
699
700 md->offset_vector[offset] = save_offset1;
701 md->offset_vector[offset+1] = save_offset2;
702 md->offset_vector[md->offset_end - number] = save_offset3;
703
704 RRETURN(MATCH_NOMATCH);
705 }
706
707 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
708 as a non-capturing bracket. */
709
710 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
711 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
712
713 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
714
715 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
716 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
717
718 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
719 final alternative within the brackets, we would return the result of a
720 recursive call to match() whatever happened. We can reduce stack usage by
721 turning this into a tail recursion, except in the case when match_cbegroup
722 is set.*/
723
724 case OP_BRA:
725 case OP_SBRA:
726 DPRINTF(("start non-capturing bracket\n"));
727 flags = (op >= OP_SBRA)? match_cbegroup : 0;
728 for (;;)
729 {
730 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
731 {
732 if (flags == 0) /* Not a possibly empty group */
733 {
734 ecode += _pcre_OP_lengths[*ecode];
735 DPRINTF(("bracket 0 tail recursion\n"));
736 goto TAIL_RECURSE;
737 }
738
739 /* Possibly empty group; can't use tail recursion. */
740
741 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
742 eptrb, flags, RM48);
743 RRETURN(rrc);
744 }
745
746 /* For non-final alternatives, continue the loop for a NOMATCH result;
747 otherwise return. */
748
749 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
750 eptrb, flags, RM2);
751 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
752 ecode += GET(ecode, 1);
753 }
754 /* Control never reaches here. */
755
756 /* Conditional group: compilation checked that there are no more than
757 two branches. If the condition is false, skipping the first branch takes us
758 past the end if there is only one branch, but that's OK because that is
759 exactly what going to the ket would do. As there is only one branch to be
760 obeyed, we can use tail recursion to avoid using another stack frame. */
761
762 case OP_COND:
763 case OP_SCOND:
764 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
765 {
766 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
767 condition = md->recursive != NULL &&
768 (offset == RREF_ANY || offset == md->recursive->group_num);
769 ecode += condition? 3 : GET(ecode, 1);
770 }
771
772 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
773 {
774 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
775 condition = offset < offset_top && md->offset_vector[offset] >= 0;
776 ecode += condition? 3 : GET(ecode, 1);
777 }
778
779 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
780 {
781 condition = FALSE;
782 ecode += GET(ecode, 1);
783 }
784
785 /* The condition is an assertion. Call match() to evaluate it - setting
786 the final argument match_condassert causes it to stop at the end of an
787 assertion. */
788
789 else
790 {
791 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
792 match_condassert, RM3);
793 if (rrc == MATCH_MATCH)
794 {
795 condition = TRUE;
796 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
797 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
798 }
799 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
800 {
801 RRETURN(rrc); /* Need braces because of following else */
802 }
803 else
804 {
805 condition = FALSE;
806 ecode += GET(ecode, 1);
807 }
808 }
809
810 /* We are now at the branch that is to be obeyed. As there is only one,
811 we can use tail recursion to avoid using another stack frame, except when
812 match_cbegroup is required for an unlimited repeat of a possibly empty
813 group. If the second alternative doesn't exist, we can just plough on. */
814
815 if (condition || *ecode == OP_ALT)
816 {
817 ecode += 1 + LINK_SIZE;
818 if (op == OP_SCOND) /* Possibly empty group */
819 {
820 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
821 RRETURN(rrc);
822 }
823 else /* Group must match something */
824 {
825 flags = 0;
826 goto TAIL_RECURSE;
827 }
828 }
829 else /* Condition false & no 2nd alternative */
830 {
831 ecode += 1 + LINK_SIZE;
832 }
833 break;
834
835
836 /* End of the pattern, either real or forced. If we are in a top-level
837 recursion, we should restore the offsets appropriately and continue from
838 after the call. */
839
840 case OP_ACCEPT:
841 case OP_END:
842 if (md->recursive != NULL && md->recursive->group_num == 0)
843 {
844 recursion_info *rec = md->recursive;
845 DPRINTF(("End of pattern in a (?0) recursion\n"));
846 md->recursive = rec->prevrec;
847 memmove(md->offset_vector, rec->offset_save,
848 rec->saved_max * sizeof(int));
849 mstart = rec->save_start;
850 ims = original_ims;
851 ecode = rec->after_call;
852 break;
853 }
854
855 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
856 string - backtracking will then try other alternatives, if any. */
857
858 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
859 md->end_match_ptr = eptr; /* Record where we ended */
860 md->end_offset_top = offset_top; /* and how many extracts were taken */
861 md->start_match_ptr = mstart; /* and the start (\K can modify) */
862 RRETURN(MATCH_MATCH);
863
864 /* Change option settings */
865
866 case OP_OPT:
867 ims = ecode[1];
868 ecode += 2;
869 DPRINTF(("ims set to %02lx\n", ims));
870 break;
871
872 /* Assertion brackets. Check the alternative branches in turn - the
873 matching won't pass the KET for an assertion. If any one branch matches,
874 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
875 start of each branch to move the current point backwards, so the code at
876 this level is identical to the lookahead case. */
877
878 case OP_ASSERT:
879 case OP_ASSERTBACK:
880 do
881 {
882 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
883 RM4);
884 if (rrc == MATCH_MATCH) break;
885 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
886 ecode += GET(ecode, 1);
887 }
888 while (*ecode == OP_ALT);
889 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
890
891 /* If checking an assertion for a condition, return MATCH_MATCH. */
892
893 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
894
895 /* Continue from after the assertion, updating the offsets high water
896 mark, since extracts may have been taken during the assertion. */
897
898 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
899 ecode += 1 + LINK_SIZE;
900 offset_top = md->end_offset_top;
901 continue;
902
903 /* Negative assertion: all branches must fail to match */
904
905 case OP_ASSERT_NOT:
906 case OP_ASSERTBACK_NOT:
907 do
908 {
909 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
910 RM5);
911 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
912 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
913 ecode += GET(ecode,1);
914 }
915 while (*ecode == OP_ALT);
916
917 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
918
919 ecode += 1 + LINK_SIZE;
920 continue;
921
922 /* Move the subject pointer back. This occurs only at the start of
923 each branch of a lookbehind assertion. If we are too close to the start to
924 move back, this match function fails. When working with UTF-8 we move
925 back a number of characters, not bytes. */
926
927 case OP_REVERSE:
928 #ifdef SUPPORT_UTF8
929 if (utf8)
930 {
931 i = GET(ecode, 1);
932 while (i-- > 0)
933 {
934 eptr--;
935 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
936 BACKCHAR(eptr);
937 }
938 }
939 else
940 #endif
941
942 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
943
944 {
945 eptr -= GET(ecode, 1);
946 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
947 }
948
949 /* Skip to next op code */
950
951 ecode += 1 + LINK_SIZE;
952 break;
953
954 /* The callout item calls an external function, if one is provided, passing
955 details of the match so far. This is mainly for debugging, though the
956 function is able to force a failure. */
957
958 case OP_CALLOUT:
959 if (pcre_callout != NULL)
960 {
961 pcre_callout_block cb;
962 cb.version = 1; /* Version 1 of the callout block */
963 cb.callout_number = ecode[1];
964 cb.offset_vector = md->offset_vector;
965 cb.subject = (PCRE_SPTR)md->start_subject;
966 cb.subject_length = md->end_subject - md->start_subject;
967 cb.start_match = mstart - md->start_subject;
968 cb.current_position = eptr - md->start_subject;
969 cb.pattern_position = GET(ecode, 2);
970 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
971 cb.capture_top = offset_top/2;
972 cb.capture_last = md->capture_last;
973 cb.callout_data = md->callout_data;
974 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
975 if (rrc < 0) RRETURN(rrc);
976 }
977 ecode += 2 + 2*LINK_SIZE;
978 break;
979
980 /* Recursion either matches the current regex, or some subexpression. The
981 offset data is the offset to the starting bracket from the start of the
982 whole pattern. (This is so that it works from duplicated subpatterns.)
983
984 If there are any capturing brackets started but not finished, we have to
985 save their starting points and reinstate them after the recursion. However,
986 we don't know how many such there are (offset_top records the completed
987 total) so we just have to save all the potential data. There may be up to
988 65535 such values, which is too large to put on the stack, but using malloc
989 for small numbers seems expensive. As a compromise, the stack is used when
990 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
991 is used. A problem is what to do if the malloc fails ... there is no way of
992 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
993 values on the stack, and accept that the rest may be wrong.
994
995 There are also other values that have to be saved. We use a chained
996 sequence of blocks that actually live on the stack. Thanks to Robin Houston
997 for the original version of this logic. */
998
999 case OP_RECURSE:
1000 {
1001 callpat = md->start_code + GET(ecode, 1);
1002 new_recursive.group_num = (callpat == md->start_code)? 0 :
1003 GET2(callpat, 1 + LINK_SIZE);
1004
1005 /* Add to "recursing stack" */
1006
1007 new_recursive.prevrec = md->recursive;
1008 md->recursive = &new_recursive;
1009
1010 /* Find where to continue from afterwards */
1011
1012 ecode += 1 + LINK_SIZE;
1013 new_recursive.after_call = ecode;
1014
1015 /* Now save the offset data. */
1016
1017 new_recursive.saved_max = md->offset_end;
1018 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1019 new_recursive.offset_save = stacksave;
1020 else
1021 {
1022 new_recursive.offset_save =
1023 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1024 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1025 }
1026
1027 memcpy(new_recursive.offset_save, md->offset_vector,
1028 new_recursive.saved_max * sizeof(int));
1029 new_recursive.save_start = mstart;
1030 mstart = eptr;
1031
1032 /* OK, now we can do the recursion. For each top-level alternative we
1033 restore the offset and recursion data. */
1034
1035 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1036 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1037 do
1038 {
1039 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1040 md, ims, eptrb, flags, RM6);
1041 if (rrc == MATCH_MATCH)
1042 {
1043 DPRINTF(("Recursion matched\n"));
1044 md->recursive = new_recursive.prevrec;
1045 if (new_recursive.offset_save != stacksave)
1046 (pcre_free)(new_recursive.offset_save);
1047 RRETURN(MATCH_MATCH);
1048 }
1049 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1050 {
1051 DPRINTF(("Recursion gave error %d\n", rrc));
1052 RRETURN(rrc);
1053 }
1054
1055 md->recursive = &new_recursive;
1056 memcpy(md->offset_vector, new_recursive.offset_save,
1057 new_recursive.saved_max * sizeof(int));
1058 callpat += GET(callpat, 1);
1059 }
1060 while (*callpat == OP_ALT);
1061
1062 DPRINTF(("Recursion didn't match\n"));
1063 md->recursive = new_recursive.prevrec;
1064 if (new_recursive.offset_save != stacksave)
1065 (pcre_free)(new_recursive.offset_save);
1066 RRETURN(MATCH_NOMATCH);
1067 }
1068 /* Control never reaches here */
1069
1070 /* "Once" brackets are like assertion brackets except that after a match,
1071 the point in the subject string is not moved back. Thus there can never be
1072 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1073 Check the alternative branches in turn - the matching won't pass the KET
1074 for this kind of subpattern. If any one branch matches, we carry on as at
1075 the end of a normal bracket, leaving the subject pointer. */
1076
1077 case OP_ONCE:
1078 prev = ecode;
1079 saved_eptr = eptr;
1080
1081 do
1082 {
1083 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1084 if (rrc == MATCH_MATCH) break;
1085 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1086 ecode += GET(ecode,1);
1087 }
1088 while (*ecode == OP_ALT);
1089
1090 /* If hit the end of the group (which could be repeated), fail */
1091
1092 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1093
1094 /* Continue as from after the assertion, updating the offsets high water
1095 mark, since extracts may have been taken. */
1096
1097 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1098
1099 offset_top = md->end_offset_top;
1100 eptr = md->end_match_ptr;
1101
1102 /* For a non-repeating ket, just continue at this level. This also
1103 happens for a repeating ket if no characters were matched in the group.
1104 This is the forcible breaking of infinite loops as implemented in Perl
1105 5.005. If there is an options reset, it will get obeyed in the normal
1106 course of events. */
1107
1108 if (*ecode == OP_KET || eptr == saved_eptr)
1109 {
1110 ecode += 1+LINK_SIZE;
1111 break;
1112 }
1113
1114 /* The repeating kets try the rest of the pattern or restart from the
1115 preceding bracket, in the appropriate order. The second "call" of match()
1116 uses tail recursion, to avoid using another stack frame. We need to reset
1117 any options that changed within the bracket before re-running it, so
1118 check the next opcode. */
1119
1120 if (ecode[1+LINK_SIZE] == OP_OPT)
1121 {
1122 ims = (ims & ~PCRE_IMS) | ecode[4];
1123 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1124 }
1125
1126 if (*ecode == OP_KETRMIN)
1127 {
1128 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1129 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1130 ecode = prev;
1131 flags = 0;
1132 goto TAIL_RECURSE;
1133 }
1134 else /* OP_KETRMAX */
1135 {
1136 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1137 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1138 ecode += 1 + LINK_SIZE;
1139 flags = 0;
1140 goto TAIL_RECURSE;
1141 }
1142 /* Control never gets here */
1143
1144 /* An alternation is the end of a branch; scan along to find the end of the
1145 bracketed group and go to there. */
1146
1147 case OP_ALT:
1148 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1149 break;
1150
1151 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1152 indicating that it may occur zero times. It may repeat infinitely, or not
1153 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1154 with fixed upper repeat limits are compiled as a number of copies, with the
1155 optional ones preceded by BRAZERO or BRAMINZERO. */
1156
1157 case OP_BRAZERO:
1158 {
1159 next = ecode+1;
1160 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1161 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1162 do next += GET(next,1); while (*next == OP_ALT);
1163 ecode = next + 1 + LINK_SIZE;
1164 }
1165 break;
1166
1167 case OP_BRAMINZERO:
1168 {
1169 next = ecode+1;
1170 do next += GET(next, 1); while (*next == OP_ALT);
1171 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1172 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1173 ecode++;
1174 }
1175 break;
1176
1177 case OP_SKIPZERO:
1178 {
1179 next = ecode+1;
1180 do next += GET(next,1); while (*next == OP_ALT);
1181 ecode = next + 1 + LINK_SIZE;
1182 }
1183 break;
1184
1185 /* End of a group, repeated or non-repeating. */
1186
1187 case OP_KET:
1188 case OP_KETRMIN:
1189 case OP_KETRMAX:
1190 prev = ecode - GET(ecode, 1);
1191
1192 /* If this was a group that remembered the subject start, in order to break
1193 infinite repeats of empty string matches, retrieve the subject start from
1194 the chain. Otherwise, set it NULL. */
1195
1196 if (*prev >= OP_SBRA)
1197 {
1198 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1199 eptrb = eptrb->epb_prev; /* Backup to previous group */
1200 }
1201 else saved_eptr = NULL;
1202
1203 /* If we are at the end of an assertion group, stop matching and return
1204 MATCH_MATCH, but record the current high water mark for use by positive
1205 assertions. Do this also for the "once" (atomic) groups. */
1206
1207 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1208 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1209 *prev == OP_ONCE)
1210 {
1211 md->end_match_ptr = eptr; /* For ONCE */
1212 md->end_offset_top = offset_top;
1213 RRETURN(MATCH_MATCH);
1214 }
1215
1216 /* For capturing groups we have to check the group number back at the start
1217 and if necessary complete handling an extraction by setting the offsets and
1218 bumping the high water mark. Note that whole-pattern recursion is coded as
1219 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1220 when the OP_END is reached. Other recursion is handled here. */
1221
1222 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1223 {
1224 number = GET2(prev, 1+LINK_SIZE);
1225 offset = number << 1;
1226
1227 #ifdef DEBUG
1228 printf("end bracket %d", number);
1229 printf("\n");
1230 #endif
1231
1232 md->capture_last = number;
1233 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1234 {
1235 md->offset_vector[offset] =
1236 md->offset_vector[md->offset_end - number];
1237 md->offset_vector[offset+1] = eptr - md->start_subject;
1238 if (offset_top <= offset) offset_top = offset + 2;
1239 }
1240
1241 /* Handle a recursively called group. Restore the offsets
1242 appropriately and continue from after the call. */
1243
1244 if (md->recursive != NULL && md->recursive->group_num == number)
1245 {
1246 recursion_info *rec = md->recursive;
1247 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1248 md->recursive = rec->prevrec;
1249 mstart = rec->save_start;
1250 memcpy(md->offset_vector, rec->offset_save,
1251 rec->saved_max * sizeof(int));
1252 ecode = rec->after_call;
1253 ims = original_ims;
1254 break;
1255 }
1256 }
1257
1258 /* For both capturing and non-capturing groups, reset the value of the ims
1259 flags, in case they got changed during the group. */
1260
1261 ims = original_ims;
1262 DPRINTF(("ims reset to %02lx\n", ims));
1263
1264 /* For a non-repeating ket, just continue at this level. This also
1265 happens for a repeating ket if no characters were matched in the group.
1266 This is the forcible breaking of infinite loops as implemented in Perl
1267 5.005. If there is an options reset, it will get obeyed in the normal
1268 course of events. */
1269
1270 if (*ecode == OP_KET || eptr == saved_eptr)
1271 {
1272 ecode += 1 + LINK_SIZE;
1273 break;
1274 }
1275
1276 /* The repeating kets try the rest of the pattern or restart from the
1277 preceding bracket, in the appropriate order. In the second case, we can use
1278 tail recursion to avoid using another stack frame, unless we have an
1279 unlimited repeat of a group that can match an empty string. */
1280
1281 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1282
1283 if (*ecode == OP_KETRMIN)
1284 {
1285 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1286 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1287 if (flags != 0) /* Could match an empty string */
1288 {
1289 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1290 RRETURN(rrc);
1291 }
1292 ecode = prev;
1293 goto TAIL_RECURSE;
1294 }
1295 else /* OP_KETRMAX */
1296 {
1297 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1298 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1299 ecode += 1 + LINK_SIZE;
1300 flags = 0;
1301 goto TAIL_RECURSE;
1302 }
1303 /* Control never gets here */
1304
1305 /* Start of subject unless notbol, or after internal newline if multiline */
1306
1307 case OP_CIRC:
1308 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1309 if ((ims & PCRE_MULTILINE) != 0)
1310 {
1311 if (eptr != md->start_subject &&
1312 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1313 RRETURN(MATCH_NOMATCH);
1314 ecode++;
1315 break;
1316 }
1317 /* ... else fall through */
1318
1319 /* Start of subject assertion */
1320
1321 case OP_SOD:
1322 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1323 ecode++;
1324 break;
1325
1326 /* Start of match assertion */
1327
1328 case OP_SOM:
1329 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1330 ecode++;
1331 break;
1332
1333 /* Reset the start of match point */
1334
1335 case OP_SET_SOM:
1336 mstart = eptr;
1337 ecode++;
1338 break;
1339
1340 /* Assert before internal newline if multiline, or before a terminating
1341 newline unless endonly is set, else end of subject unless noteol is set. */
1342
1343 case OP_DOLL:
1344 if ((ims & PCRE_MULTILINE) != 0)
1345 {
1346 if (eptr < md->end_subject)
1347 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1348 else
1349 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1350 ecode++;
1351 break;
1352 }
1353 else
1354 {
1355 if (md->noteol) RRETURN(MATCH_NOMATCH);
1356 if (!md->endonly)
1357 {
1358 if (eptr != md->end_subject &&
1359 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1360 RRETURN(MATCH_NOMATCH);
1361 ecode++;
1362 break;
1363 }
1364 }
1365 /* ... else fall through for endonly */
1366
1367 /* End of subject assertion (\z) */
1368
1369 case OP_EOD:
1370 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1371 ecode++;
1372 break;
1373
1374 /* End of subject or ending \n assertion (\Z) */
1375
1376 case OP_EODN:
1377 if (eptr != md->end_subject &&
1378 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1379 RRETURN(MATCH_NOMATCH);
1380 ecode++;
1381 break;
1382
1383 /* Word boundary assertions */
1384
1385 case OP_NOT_WORD_BOUNDARY:
1386 case OP_WORD_BOUNDARY:
1387 {
1388
1389 /* Find out if the previous and current characters are "word" characters.
1390 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1391 be "non-word" characters. */
1392
1393 #ifdef SUPPORT_UTF8
1394 if (utf8)
1395 {
1396 if (eptr == md->start_subject) prev_is_word = FALSE; else
1397 {
1398 const uschar *lastptr = eptr - 1;
1399 while((*lastptr & 0xc0) == 0x80) lastptr--;
1400 GETCHAR(c, lastptr);
1401 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1402 }
1403 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1404 {
1405 GETCHAR(c, eptr);
1406 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1407 }
1408 }
1409 else
1410 #endif
1411
1412 /* More streamlined when not in UTF-8 mode */
1413
1414 {
1415 prev_is_word = (eptr != md->start_subject) &&
1416 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1417 cur_is_word = (eptr < md->end_subject) &&
1418 ((md->ctypes[*eptr] & ctype_word) != 0);
1419 }
1420
1421 /* Now see if the situation is what we want */
1422
1423 if ((*ecode++ == OP_WORD_BOUNDARY)?
1424 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1425 RRETURN(MATCH_NOMATCH);
1426 }
1427 break;
1428
1429 /* Match a single character type; inline for speed */
1430
1431 case OP_ANY:
1432 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1433 /* Fall through */
1434
1435 case OP_ALLANY:
1436 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1437 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1438 ecode++;
1439 break;
1440
1441 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1442 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1443
1444 case OP_ANYBYTE:
1445 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1446 ecode++;
1447 break;
1448
1449 case OP_NOT_DIGIT:
1450 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1451 GETCHARINCTEST(c, eptr);
1452 if (
1453 #ifdef SUPPORT_UTF8
1454 c < 256 &&
1455 #endif
1456 (md->ctypes[c] & ctype_digit) != 0
1457 )
1458 RRETURN(MATCH_NOMATCH);
1459 ecode++;
1460 break;
1461
1462 case OP_DIGIT:
1463 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1464 GETCHARINCTEST(c, eptr);
1465 if (
1466 #ifdef SUPPORT_UTF8
1467 c >= 256 ||
1468 #endif
1469 (md->ctypes[c] & ctype_digit) == 0
1470 )
1471 RRETURN(MATCH_NOMATCH);
1472 ecode++;
1473 break;
1474
1475 case OP_NOT_WHITESPACE:
1476 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1477 GETCHARINCTEST(c, eptr);
1478 if (
1479 #ifdef SUPPORT_UTF8
1480 c < 256 &&
1481 #endif
1482 (md->ctypes[c] & ctype_space) != 0
1483 )
1484 RRETURN(MATCH_NOMATCH);
1485 ecode++;
1486 break;
1487
1488 case OP_WHITESPACE:
1489 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1490 GETCHARINCTEST(c, eptr);
1491 if (
1492 #ifdef SUPPORT_UTF8
1493 c >= 256 ||
1494 #endif
1495 (md->ctypes[c] & ctype_space) == 0
1496 )
1497 RRETURN(MATCH_NOMATCH);
1498 ecode++;
1499 break;
1500
1501 case OP_NOT_WORDCHAR:
1502 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1503 GETCHARINCTEST(c, eptr);
1504 if (
1505 #ifdef SUPPORT_UTF8
1506 c < 256 &&
1507 #endif
1508 (md->ctypes[c] & ctype_word) != 0
1509 )
1510 RRETURN(MATCH_NOMATCH);
1511 ecode++;
1512 break;
1513
1514 case OP_WORDCHAR:
1515 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1516 GETCHARINCTEST(c, eptr);
1517 if (
1518 #ifdef SUPPORT_UTF8
1519 c >= 256 ||
1520 #endif
1521 (md->ctypes[c] & ctype_word) == 0
1522 )
1523 RRETURN(MATCH_NOMATCH);
1524 ecode++;
1525 break;
1526
1527 case OP_ANYNL:
1528 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1529 GETCHARINCTEST(c, eptr);
1530 switch(c)
1531 {
1532 default: RRETURN(MATCH_NOMATCH);
1533 case 0x000d:
1534 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1535 break;
1536
1537 case 0x000a:
1538 break;
1539
1540 case 0x000b:
1541 case 0x000c:
1542 case 0x0085:
1543 case 0x2028:
1544 case 0x2029:
1545 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1546 break;
1547 }
1548 ecode++;
1549 break;
1550
1551 case OP_NOT_HSPACE:
1552 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1553 GETCHARINCTEST(c, eptr);
1554 switch(c)
1555 {
1556 default: break;
1557 case 0x09: /* HT */
1558 case 0x20: /* SPACE */
1559 case 0xa0: /* NBSP */
1560 case 0x1680: /* OGHAM SPACE MARK */
1561 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1562 case 0x2000: /* EN QUAD */
1563 case 0x2001: /* EM QUAD */
1564 case 0x2002: /* EN SPACE */
1565 case 0x2003: /* EM SPACE */
1566 case 0x2004: /* THREE-PER-EM SPACE */
1567 case 0x2005: /* FOUR-PER-EM SPACE */
1568 case 0x2006: /* SIX-PER-EM SPACE */
1569 case 0x2007: /* FIGURE SPACE */
1570 case 0x2008: /* PUNCTUATION SPACE */
1571 case 0x2009: /* THIN SPACE */
1572 case 0x200A: /* HAIR SPACE */
1573 case 0x202f: /* NARROW NO-BREAK SPACE */
1574 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1575 case 0x3000: /* IDEOGRAPHIC SPACE */
1576 RRETURN(MATCH_NOMATCH);
1577 }
1578 ecode++;
1579 break;
1580
1581 case OP_HSPACE:
1582 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1583 GETCHARINCTEST(c, eptr);
1584 switch(c)
1585 {
1586 default: RRETURN(MATCH_NOMATCH);
1587 case 0x09: /* HT */
1588 case 0x20: /* SPACE */
1589 case 0xa0: /* NBSP */
1590 case 0x1680: /* OGHAM SPACE MARK */
1591 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1592 case 0x2000: /* EN QUAD */
1593 case 0x2001: /* EM QUAD */
1594 case 0x2002: /* EN SPACE */
1595 case 0x2003: /* EM SPACE */
1596 case 0x2004: /* THREE-PER-EM SPACE */
1597 case 0x2005: /* FOUR-PER-EM SPACE */
1598 case 0x2006: /* SIX-PER-EM SPACE */
1599 case 0x2007: /* FIGURE SPACE */
1600 case 0x2008: /* PUNCTUATION SPACE */
1601 case 0x2009: /* THIN SPACE */
1602 case 0x200A: /* HAIR SPACE */
1603 case 0x202f: /* NARROW NO-BREAK SPACE */
1604 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1605 case 0x3000: /* IDEOGRAPHIC SPACE */
1606 break;
1607 }
1608 ecode++;
1609 break;
1610
1611 case OP_NOT_VSPACE:
1612 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1613 GETCHARINCTEST(c, eptr);
1614 switch(c)
1615 {
1616 default: break;
1617 case 0x0a: /* LF */
1618 case 0x0b: /* VT */
1619 case 0x0c: /* FF */
1620 case 0x0d: /* CR */
1621 case 0x85: /* NEL */
1622 case 0x2028: /* LINE SEPARATOR */
1623 case 0x2029: /* PARAGRAPH SEPARATOR */
1624 RRETURN(MATCH_NOMATCH);
1625 }
1626 ecode++;
1627 break;
1628
1629 case OP_VSPACE:
1630 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1631 GETCHARINCTEST(c, eptr);
1632 switch(c)
1633 {
1634 default: RRETURN(MATCH_NOMATCH);
1635 case 0x0a: /* LF */
1636 case 0x0b: /* VT */
1637 case 0x0c: /* FF */
1638 case 0x0d: /* CR */
1639 case 0x85: /* NEL */
1640 case 0x2028: /* LINE SEPARATOR */
1641 case 0x2029: /* PARAGRAPH SEPARATOR */
1642 break;
1643 }
1644 ecode++;
1645 break;
1646
1647 #ifdef SUPPORT_UCP
1648 /* Check the next character by Unicode property. We will get here only
1649 if the support is in the binary; otherwise a compile-time error occurs. */
1650
1651 case OP_PROP:
1652 case OP_NOTPROP:
1653 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1654 GETCHARINCTEST(c, eptr);
1655 {
1656 int chartype, script;
1657 int category = _pcre_ucp_findprop(c, &chartype, &script);
1658
1659 switch(ecode[1])
1660 {
1661 case PT_ANY:
1662 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1663 break;
1664
1665 case PT_LAMP:
1666 if ((chartype == ucp_Lu ||
1667 chartype == ucp_Ll ||
1668 chartype == ucp_Lt) == (op == OP_NOTPROP))
1669 RRETURN(MATCH_NOMATCH);
1670 break;
1671
1672 case PT_GC:
1673 if ((ecode[2] != category) == (op == OP_PROP))
1674 RRETURN(MATCH_NOMATCH);
1675 break;
1676
1677 case PT_PC:
1678 if ((ecode[2] != chartype) == (op == OP_PROP))
1679 RRETURN(MATCH_NOMATCH);
1680 break;
1681
1682 case PT_SC:
1683 if ((ecode[2] != script) == (op == OP_PROP))
1684 RRETURN(MATCH_NOMATCH);
1685 break;
1686
1687 default:
1688 RRETURN(PCRE_ERROR_INTERNAL);
1689 }
1690
1691 ecode += 3;
1692 }
1693 break;
1694
1695 /* Match an extended Unicode sequence. We will get here only if the support
1696 is in the binary; otherwise a compile-time error occurs. */
1697
1698 case OP_EXTUNI:
1699 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1700 GETCHARINCTEST(c, eptr);
1701 {
1702 int chartype, script;
1703 int category = _pcre_ucp_findprop(c, &chartype, &script);
1704 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1705 while (eptr < md->end_subject)
1706 {
1707 int len = 1;
1708 if (!utf8) c = *eptr; else
1709 {
1710 GETCHARLEN(c, eptr, len);
1711 }
1712 category = _pcre_ucp_findprop(c, &chartype, &script);
1713 if (category != ucp_M) break;
1714 eptr += len;
1715 }
1716 }
1717 ecode++;
1718 break;
1719 #endif
1720
1721
1722 /* Match a back reference, possibly repeatedly. Look past the end of the
1723 item to see if there is repeat information following. The code is similar
1724 to that for character classes, but repeated for efficiency. Then obey
1725 similar code to character type repeats - written out again for speed.
1726 However, if the referenced string is the empty string, always treat
1727 it as matched, any number of times (otherwise there could be infinite
1728 loops). */
1729
1730 case OP_REF:
1731 {
1732 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1733 ecode += 3;
1734
1735 /* If the reference is unset, there are two possibilities:
1736
1737 (a) In the default, Perl-compatible state, set the length to be longer
1738 than the amount of subject left; this ensures that every attempt at a
1739 match fails. We can't just fail here, because of the possibility of
1740 quantifiers with zero minima.
1741
1742 (b) If the JavaScript compatibility flag is set, set the length to zero
1743 so that the back reference matches an empty string.
1744
1745 Otherwise, set the length to the length of what was matched by the
1746 referenced subpattern. */
1747
1748 if (offset >= offset_top || md->offset_vector[offset] < 0)
1749 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1750 else
1751 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1752
1753 /* Set up for repetition, or handle the non-repeated case */
1754
1755 switch (*ecode)
1756 {
1757 case OP_CRSTAR:
1758 case OP_CRMINSTAR:
1759 case OP_CRPLUS:
1760 case OP_CRMINPLUS:
1761 case OP_CRQUERY:
1762 case OP_CRMINQUERY:
1763 c = *ecode++ - OP_CRSTAR;
1764 minimize = (c & 1) != 0;
1765 min = rep_min[c]; /* Pick up values from tables; */
1766 max = rep_max[c]; /* zero for max => infinity */
1767 if (max == 0) max = INT_MAX;
1768 break;
1769
1770 case OP_CRRANGE:
1771 case OP_CRMINRANGE:
1772 minimize = (*ecode == OP_CRMINRANGE);
1773 min = GET2(ecode, 1);
1774 max = GET2(ecode, 3);
1775 if (max == 0) max = INT_MAX;
1776 ecode += 5;
1777 break;
1778
1779 default: /* No repeat follows */
1780 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1781 eptr += length;
1782 continue; /* With the main loop */
1783 }
1784
1785 /* If the length of the reference is zero, just continue with the
1786 main loop. */
1787
1788 if (length == 0) continue;
1789
1790 /* First, ensure the minimum number of matches are present. We get back
1791 the length of the reference string explicitly rather than passing the
1792 address of eptr, so that eptr can be a register variable. */
1793
1794 for (i = 1; i <= min; i++)
1795 {
1796 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1797 eptr += length;
1798 }
1799
1800 /* If min = max, continue at the same level without recursion.
1801 They are not both allowed to be zero. */
1802
1803 if (min == max) continue;
1804
1805 /* If minimizing, keep trying and advancing the pointer */
1806
1807 if (minimize)
1808 {
1809 for (fi = min;; fi++)
1810 {
1811 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1812 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1813 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1814 RRETURN(MATCH_NOMATCH);
1815 eptr += length;
1816 }
1817 /* Control never gets here */
1818 }
1819
1820 /* If maximizing, find the longest string and work backwards */
1821
1822 else
1823 {
1824 pp = eptr;
1825 for (i = min; i < max; i++)
1826 {
1827 if (!match_ref(offset, eptr, length, md, ims)) break;
1828 eptr += length;
1829 }
1830 while (eptr >= pp)
1831 {
1832 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1833 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1834 eptr -= length;
1835 }
1836 RRETURN(MATCH_NOMATCH);
1837 }
1838 }
1839 /* Control never gets here */
1840
1841
1842
1843 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1844 used when all the characters in the class have values in the range 0-255,
1845 and either the matching is caseful, or the characters are in the range
1846 0-127 when UTF-8 processing is enabled. The only difference between
1847 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1848 encountered.
1849
1850 First, look past the end of the item to see if there is repeat information
1851 following. Then obey similar code to character type repeats - written out
1852 again for speed. */
1853
1854 case OP_NCLASS:
1855 case OP_CLASS:
1856 {
1857 data = ecode + 1; /* Save for matching */
1858 ecode += 33; /* Advance past the item */
1859
1860 switch (*ecode)
1861 {
1862 case OP_CRSTAR:
1863 case OP_CRMINSTAR:
1864 case OP_CRPLUS:
1865 case OP_CRMINPLUS:
1866 case OP_CRQUERY:
1867 case OP_CRMINQUERY:
1868 c = *ecode++ - OP_CRSTAR;
1869 minimize = (c & 1) != 0;
1870 min = rep_min[c]; /* Pick up values from tables; */
1871 max = rep_max[c]; /* zero for max => infinity */
1872 if (max == 0) max = INT_MAX;
1873 break;
1874
1875 case OP_CRRANGE:
1876 case OP_CRMINRANGE:
1877 minimize = (*ecode == OP_CRMINRANGE);
1878 min = GET2(ecode, 1);
1879 max = GET2(ecode, 3);
1880 if (max == 0) max = INT_MAX;
1881 ecode += 5;
1882 break;
1883
1884 default: /* No repeat follows */
1885 min = max = 1;
1886 break;
1887 }
1888
1889 /* First, ensure the minimum number of matches are present. */
1890
1891 #ifdef SUPPORT_UTF8
1892 /* UTF-8 mode */
1893 if (utf8)
1894 {
1895 for (i = 1; i <= min; i++)
1896 {
1897 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1898 GETCHARINC(c, eptr);
1899 if (c > 255)
1900 {
1901 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1902 }
1903 else
1904 {
1905 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1906 }
1907 }
1908 }
1909 else
1910 #endif
1911 /* Not UTF-8 mode */
1912 {
1913 for (i = 1; i <= min; i++)
1914 {
1915 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1916 c = *eptr++;
1917 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1918 }
1919 }
1920
1921 /* If max == min we can continue with the main loop without the
1922 need to recurse. */
1923
1924 if (min == max) continue;
1925
1926 /* If minimizing, keep testing the rest of the expression and advancing
1927 the pointer while it matches the class. */
1928
1929 if (minimize)
1930 {
1931 #ifdef SUPPORT_UTF8
1932 /* UTF-8 mode */
1933 if (utf8)
1934 {
1935 for (fi = min;; fi++)
1936 {
1937 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1938 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1939 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1940 GETCHARINC(c, eptr);
1941 if (c > 255)
1942 {
1943 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1944 }
1945 else
1946 {
1947 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1948 }
1949 }
1950 }
1951 else
1952 #endif
1953 /* Not UTF-8 mode */
1954 {
1955 for (fi = min;; fi++)
1956 {
1957 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1958 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1959 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1960 c = *eptr++;
1961 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1962 }
1963 }
1964 /* Control never gets here */
1965 }
1966
1967 /* If maximizing, find the longest possible run, then work backwards. */
1968
1969 else
1970 {
1971 pp = eptr;
1972
1973 #ifdef SUPPORT_UTF8
1974 /* UTF-8 mode */
1975 if (utf8)
1976 {
1977 for (i = min; i < max; i++)
1978 {
1979 int len = 1;
1980 if (eptr >= md->end_subject) break;
1981 GETCHARLEN(c, eptr, len);
1982 if (c > 255)
1983 {
1984 if (op == OP_CLASS) break;
1985 }
1986 else
1987 {
1988 if ((data[c/8] & (1 << (c&7))) == 0) break;
1989 }
1990 eptr += len;
1991 }
1992 for (;;)
1993 {
1994 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1995 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1996 if (eptr-- == pp) break; /* Stop if tried at original pos */
1997 BACKCHAR(eptr);
1998 }
1999 }
2000 else
2001 #endif
2002 /* Not UTF-8 mode */
2003 {
2004 for (i = min; i < max; i++)
2005 {
2006 if (eptr >= md->end_subject) break;
2007 c = *eptr;
2008 if ((data[c/8] & (1 << (c&7))) == 0) break;
2009 eptr++;
2010 }
2011 while (eptr >= pp)
2012 {
2013 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2014 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2015 eptr--;
2016 }
2017 }
2018
2019 RRETURN(MATCH_NOMATCH);
2020 }
2021 }
2022 /* Control never gets here */
2023
2024
2025 /* Match an extended character class. This opcode is encountered only
2026 in UTF-8 mode, because that's the only time it is compiled. */
2027
2028 #ifdef SUPPORT_UTF8
2029 case OP_XCLASS:
2030 {
2031 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2032 ecode += GET(ecode, 1); /* Advance past the item */
2033
2034 switch (*ecode)
2035 {
2036 case OP_CRSTAR:
2037 case OP_CRMINSTAR:
2038 case OP_CRPLUS:
2039 case OP_CRMINPLUS:
2040 case OP_CRQUERY:
2041 case OP_CRMINQUERY:
2042 c = *ecode++ - OP_CRSTAR;
2043 minimize = (c & 1) != 0;
2044 min = rep_min[c]; /* Pick up values from tables; */
2045 max = rep_max[c]; /* zero for max => infinity */
2046 if (max == 0) max = INT_MAX;
2047 break;
2048
2049 case OP_CRRANGE:
2050 case OP_CRMINRANGE:
2051 minimize = (*ecode == OP_CRMINRANGE);
2052 min = GET2(ecode, 1);
2053 max = GET2(ecode, 3);
2054 if (max == 0) max = INT_MAX;
2055 ecode += 5;
2056 break;
2057
2058 default: /* No repeat follows */
2059 min = max = 1;
2060 break;
2061 }
2062
2063 /* First, ensure the minimum number of matches are present. */
2064
2065 for (i = 1; i <= min; i++)
2066 {
2067 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2068 GETCHARINC(c, eptr);
2069 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2070 }
2071
2072 /* If max == min we can continue with the main loop without the
2073 need to recurse. */
2074
2075 if (min == max) continue;
2076
2077 /* If minimizing, keep testing the rest of the expression and advancing
2078 the pointer while it matches the class. */
2079
2080 if (minimize)
2081 {
2082 for (fi = min;; fi++)
2083 {
2084 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2085 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2086 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2087 GETCHARINC(c, eptr);
2088 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2089 }
2090 /* Control never gets here */
2091 }
2092
2093 /* If maximizing, find the longest possible run, then work backwards. */
2094
2095 else
2096 {
2097 pp = eptr;
2098 for (i = min; i < max; i++)
2099 {
2100 int len = 1;
2101 if (eptr >= md->end_subject) break;
2102 GETCHARLEN(c, eptr, len);
2103 if (!_pcre_xclass(c, data)) break;
2104 eptr += len;
2105 }
2106 for(;;)
2107 {
2108 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2109 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2110 if (eptr-- == pp) break; /* Stop if tried at original pos */
2111 if (utf8) BACKCHAR(eptr);
2112 }
2113 RRETURN(MATCH_NOMATCH);
2114 }
2115
2116 /* Control never gets here */
2117 }
2118 #endif /* End of XCLASS */
2119
2120 /* Match a single character, casefully */
2121
2122 case OP_CHAR:
2123 #ifdef SUPPORT_UTF8
2124 if (utf8)
2125 {
2126 length = 1;
2127 ecode++;
2128 GETCHARLEN(fc, ecode, length);
2129 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2130 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2131 }
2132 else
2133 #endif
2134
2135 /* Non-UTF-8 mode */
2136 {
2137 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2138 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2139 ecode += 2;
2140 }
2141 break;
2142
2143 /* Match a single character, caselessly */
2144
2145 case OP_CHARNC:
2146 #ifdef SUPPORT_UTF8
2147 if (utf8)
2148 {
2149 length = 1;
2150 ecode++;
2151 GETCHARLEN(fc, ecode, length);
2152
2153 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2154
2155 /* If the pattern character's value is < 128, we have only one byte, and
2156 can use the fast lookup table. */
2157
2158 if (fc < 128)
2159 {
2160 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2161 }
2162
2163 /* Otherwise we must pick up the subject character */
2164
2165 else
2166 {
2167 unsigned int dc;
2168 GETCHARINC(dc, eptr);
2169 ecode += length;
2170
2171 /* If we have Unicode property support, we can use it to test the other
2172 case of the character, if there is one. */
2173
2174 if (fc != dc)
2175 {
2176 #ifdef SUPPORT_UCP
2177 if (dc != _pcre_ucp_othercase(fc))
2178 #endif
2179 RRETURN(MATCH_NOMATCH);
2180 }
2181 }
2182 }
2183 else
2184 #endif /* SUPPORT_UTF8 */
2185
2186 /* Non-UTF-8 mode */
2187 {
2188 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2189 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2190 ecode += 2;
2191 }
2192 break;
2193
2194 /* Match a single character repeatedly. */
2195
2196 case OP_EXACT:
2197 min = max = GET2(ecode, 1);
2198 ecode += 3;
2199 goto REPEATCHAR;
2200
2201 case OP_POSUPTO:
2202 possessive = TRUE;
2203 /* Fall through */
2204
2205 case OP_UPTO:
2206 case OP_MINUPTO:
2207 min = 0;
2208 max = GET2(ecode, 1);
2209 minimize = *ecode == OP_MINUPTO;
2210 ecode += 3;
2211 goto REPEATCHAR;
2212
2213 case OP_POSSTAR:
2214 possessive = TRUE;
2215 min = 0;
2216 max = INT_MAX;
2217 ecode++;
2218 goto REPEATCHAR;
2219
2220 case OP_POSPLUS:
2221 possessive = TRUE;
2222 min = 1;
2223 max = INT_MAX;
2224 ecode++;
2225 goto REPEATCHAR;
2226
2227 case OP_POSQUERY:
2228 possessive = TRUE;
2229 min = 0;
2230 max = 1;
2231 ecode++;
2232 goto REPEATCHAR;
2233
2234 case OP_STAR:
2235 case OP_MINSTAR:
2236 case OP_PLUS:
2237 case OP_MINPLUS:
2238 case OP_QUERY:
2239 case OP_MINQUERY:
2240 c = *ecode++ - OP_STAR;
2241 minimize = (c & 1) != 0;
2242 min = rep_min[c]; /* Pick up values from tables; */
2243 max = rep_max[c]; /* zero for max => infinity */
2244 if (max == 0) max = INT_MAX;
2245
2246 /* Common code for all repeated single-character matches. We can give
2247 up quickly if there are fewer than the minimum number of characters left in
2248 the subject. */
2249
2250 REPEATCHAR:
2251 #ifdef SUPPORT_UTF8
2252 if (utf8)
2253 {
2254 length = 1;
2255 charptr = ecode;
2256 GETCHARLEN(fc, ecode, length);
2257 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2258 ecode += length;
2259
2260 /* Handle multibyte character matching specially here. There is
2261 support for caseless matching if UCP support is present. */
2262
2263 if (length > 1)
2264 {
2265 #ifdef SUPPORT_UCP
2266 unsigned int othercase;
2267 if ((ims & PCRE_CASELESS) != 0 &&
2268 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2269 oclength = _pcre_ord2utf8(othercase, occhars);
2270 else oclength = 0;
2271 #endif /* SUPPORT_UCP */
2272
2273 for (i = 1; i <= min; i++)
2274 {
2275 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2276 #ifdef SUPPORT_UCP
2277 /* Need braces because of following else */
2278 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2279 else
2280 {
2281 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2282 eptr += oclength;
2283 }
2284 #else /* without SUPPORT_UCP */
2285 else { RRETURN(MATCH_NOMATCH); }
2286 #endif /* SUPPORT_UCP */
2287 }
2288
2289 if (min == max) continue;
2290
2291 if (minimize)
2292 {
2293 for (fi = min;; fi++)
2294 {
2295 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2296 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2297 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2298
2299 /* Compare only when a complete encoded character lies before the end of
2300 the subject, so that memcmp() never reads beyond end_subject. */
2301
2302 if (length <= md->end_subject - eptr &&
2303 memcmp(eptr, charptr, length) == 0) eptr += length;
2304 #ifdef SUPPORT_UCP
2305 else if (oclength > 0 && oclength <= md->end_subject - eptr &&
2306 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2307 #endif /* SUPPORT_UCP */
2308 /* Neither encoding is present here; braces kept around the macro */
2309 else { RRETURN(MATCH_NOMATCH); }
2310 }
2311 /* Control never gets here */
2312 }
2313
2314 else /* Maximize */
2315 {
2316 pp = eptr;
2317 for (i = min; i < max; i++)
2318 {
2319 if (eptr > md->end_subject - length) break;
2320 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2321 #ifdef SUPPORT_UCP
2322 else if (oclength == 0) break;
2323 else
2324 {
2325 if (memcmp(eptr, occhars, oclength) != 0) break;
2326 eptr += oclength;
2327 }
2328 #else /* without SUPPORT_UCP */
2329 else break;
2330 #endif /* SUPPORT_UCP */
2331 }
2332
2333 if (possessive) continue;
2334 for(;;)
2335 {
2336 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2337 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2338 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2339 #ifdef SUPPORT_UCP
2340 eptr--;
2341 BACKCHAR(eptr);
2342 #else /* without SUPPORT_UCP */
2343 eptr -= length;
2344 #endif /* SUPPORT_UCP */
2345 }
2346 }
2347 /* Control never gets here */
2348 }
2349
2350 /* If the length of a UTF-8 character is 1, we fall through here, and
2351 obey the code as for non-UTF-8 characters below, though in this case the
2352 value of fc will always be < 128. */
2353 }
2354 else
2355 #endif /* SUPPORT_UTF8 */
2356
2357 /* When not in UTF-8 mode, load a single-byte character. */
2358 {
2359 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2360 fc = *ecode++;
2361 }
2362
2363 /* The value of fc at this point is always less than 256, though we may or
2364 may not be in UTF-8 mode. The code is duplicated for the caseless and
2365 caseful cases, for speed, since matching characters is likely to be quite
2366 common. First, ensure the minimum number of matches are present. If min =
2367 max, continue at the same level without recursing. Otherwise, if
2368 minimizing, keep trying the rest of the expression and advancing one
2369 matching character if failing, up to the maximum. Alternatively, if
2370 maximizing, find the maximum number of characters and work backwards. */
2371
2372 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2373 max, eptr));
2374
2375 if ((ims & PCRE_CASELESS) != 0)
2376 {
2377 fc = md->lcc[fc];
2378 for (i = 1; i <= min; i++)
2379 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2380 if (min == max) continue;
2381 if (minimize)
2382 {
2383 for (fi = min;; fi++)
2384 {
2385 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2386 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2387 if (fi >= max || eptr >= md->end_subject ||
2388 fc != md->lcc[*eptr++])
2389 RRETURN(MATCH_NOMATCH);
2390 }
2391 /* Control never gets here */
2392 }
2393 else /* Maximize */
2394 {
2395 pp = eptr;
2396 for (i = min; i < max; i++)
2397 {
2398 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2399 eptr++;
2400 }
2401 if (possessive) continue;
2402 while (eptr >= pp)
2403 {
2404 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2405 eptr--;
2406 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2407 }
2408 RRETURN(MATCH_NOMATCH);
2409 }
2410 /* Control never gets here */
2411 }
2412
2413 /* Caseful comparisons (includes all multi-byte characters) */
2414
2415 else
2416 {
2417 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2418 if (min == max) continue;
2419 if (minimize)
2420 {
2421 for (fi = min;; fi++)
2422 {
2423 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2424 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2425 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2426 RRETURN(MATCH_NOMATCH);
2427 }
2428 /* Control never gets here */
2429 }
2430 else /* Maximize */
2431 {
2432 pp = eptr;
2433 for (i = min; i < max; i++)
2434 {
2435 if (eptr >= md->end_subject || fc != *eptr) break;
2436 eptr++;
2437 }
2438 if (possessive) continue;
2439 while (eptr >= pp)
2440 {
2441 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2442 eptr--;
2443 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2444 }
2445 RRETURN(MATCH_NOMATCH);
2446 }
2447 }
2448 /* Control never gets here */
2449
2450 /* Match a negated single one-byte character. The character we are
2451 checking can be multibyte. */
2452
2453 case OP_NOT:
2454 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2455 ecode++;
2456 GETCHARINCTEST(c, eptr);
2457 if ((ims & PCRE_CASELESS) != 0)
2458 {
2459 #ifdef SUPPORT_UTF8
2460 if (c < 256)
2461 #endif
2462 c = md->lcc[c];
2463 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2464 }
2465 else
2466 {
2467 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2468 }
2469 break;
2470
2471 /* Match a negated single one-byte character repeatedly. This is almost a
2472 repeat of the code for a repeated single character, but I haven't found a
2473 nice way of commoning these up that doesn't require a test of the
2474 positive/negative option for each character match. Maybe that wouldn't add
2475 very much to the time taken, but character matching *is* what this is all
2476 about... */
2477
2478 case OP_NOTEXACT:
2479 min = max = GET2(ecode, 1);
2480 ecode += 3;
2481 goto REPEATNOTCHAR;
2482
2483 case OP_NOTUPTO:
2484 case OP_NOTMINUPTO:
2485 min = 0;
2486 max = GET2(ecode, 1);
2487 minimize = *ecode == OP_NOTMINUPTO;
2488 ecode += 3;
2489 goto REPEATNOTCHAR;
2490
2491 case OP_NOTPOSSTAR:
2492 possessive = TRUE;
2493 min = 0;
2494 max = INT_MAX;
2495 ecode++;
2496 goto REPEATNOTCHAR;
2497
2498 case OP_NOTPOSPLUS:
2499 possessive = TRUE;
2500 min = 1;
2501 max = INT_MAX;
2502 ecode++;
2503 goto REPEATNOTCHAR;
2504
2505 case OP_NOTPOSQUERY:
2506 possessive = TRUE;
2507 min = 0;
2508 max = 1;
2509 ecode++;
2510 goto REPEATNOTCHAR;
2511
2512 case OP_NOTPOSUPTO:
2513 possessive = TRUE;
2514 min = 0;
2515 max = GET2(ecode, 1);
2516 ecode += 3;
2517 goto REPEATNOTCHAR;
2518
2519 case OP_NOTSTAR:
2520 case OP_NOTMINSTAR:
2521 case OP_NOTPLUS:
2522 case OP_NOTMINPLUS:
2523 case OP_NOTQUERY:
2524 case OP_NOTMINQUERY:
2525 c = *ecode++ - OP_NOTSTAR;
2526 minimize = (c & 1) != 0;
2527 min = rep_min[c]; /* Pick up values from tables; */
2528 max = rep_max[c]; /* zero for max => infinity */
2529 if (max == 0) max = INT_MAX;
2530
2531 /* Common code for all repeated single-byte matches. We can give up quickly
2532 if there are fewer than the minimum number of bytes left in the
2533 subject. */
2534
2535 REPEATNOTCHAR:
2536 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2537 fc = *ecode++;
2538
2539 /* The code is duplicated for the caseless and caseful cases, for speed,
2540 since matching characters is likely to be quite common. First, ensure the
2541 minimum number of matches are present. If min = max, continue at the same
2542 level without recursing. Otherwise, if minimizing, keep trying the rest of
2543 the expression and advancing one matching character if failing, up to the
2544 maximum. Alternatively, if maximizing, find the maximum number of
2545 characters and work backwards. */
2546
2547 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2548 max, eptr));
2549
2550 if ((ims & PCRE_CASELESS) != 0)
2551 {
2552 fc = md->lcc[fc];
2553
2554 #ifdef SUPPORT_UTF8
2555 if (utf8) /* UTF-8 mode: min bytes are present, but maybe not min chars */
2556 {
2557 register unsigned int d;
2558 for (i = 1; i <= min; i++)
2559 {
2560 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2561 GETCHARINC(d, eptr);
2562 if (d < 256) d = md->lcc[d];
2563 if (fc == d) RRETURN(MATCH_NOMATCH);
2564 }
2565 }
2566 else
2567 #endif
2568
2569 /* Not UTF-8 mode */
2570 {
2571 for (i = 1; i <= min; i++)
2572 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2573 }
2574
2575 if (min == max) continue;
2576
2577 if (minimize)
2578 {
2579 #ifdef SUPPORT_UTF8
2580 /* UTF-8 mode */
2581 if (utf8)
2582 {
2583 register unsigned int d;
2584 for (fi = min;; fi++) /* Try the rest first, then take one more char */
2585 {
2586 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2587 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2588 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2589 GETCHARINC(d, eptr); /* In bounds: the limits were tested just above */
2590 if (d < 256) d = md->lcc[d];
2591 if (fc == d) RRETURN(MATCH_NOMATCH);
2592 }
2593 }
2594 else
2595 #endif
2596 /* Not UTF-8 mode */
2597 {
2598 for (fi = min;; fi++)
2599 {
2600 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2601 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2602 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2603 RRETURN(MATCH_NOMATCH);
2604 }
2605 }
2606 /* Control never gets here */
2607 }
2608
2609 /* Maximize case */
2610
2611 else
2612 {
2613 pp = eptr;
2614
2615 #ifdef SUPPORT_UTF8
2616 /* UTF-8 mode */
2617 if (utf8)
2618 {
2619 register unsigned int d;
2620 for (i = min; i < max; i++)
2621 {
2622 int len = 1;
2623 if (eptr >= md->end_subject) break;
2624 GETCHARLEN(d, eptr, len);
2625 if (d < 256) d = md->lcc[d];
2626 if (fc == d) break;
2627 eptr += len;
2628 }
2629 if (possessive) continue;
2630 for(;;)
2631 {
2632 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2633 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2634 if (eptr-- == pp) break; /* Stop if tried at original pos */
2635 BACKCHAR(eptr);
2636 }
2637 }
2638 else
2639 #endif
2640 /* Not UTF-8 mode */
2641 {
2642 for (i = min; i < max; i++)
2643 {
2644 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2645 eptr++;
2646 }
2647 if (possessive) continue;
2648 while (eptr >= pp)
2649 {
2650 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2651 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2652 eptr--;
2653 }
2654 }
2655
2656 RRETURN(MATCH_NOMATCH);
2657 }
2658 /* Control never gets here */
2659 }
2660
2661 /* Caseful comparisons */
2662
2663 else
2664 {
2665 #ifdef SUPPORT_UTF8
2666 if (utf8) /* UTF-8 mode: min bytes are present, but maybe not min chars */
2667 {
2668 register unsigned int d;
2669 for (i = 1; i <= min; i++)
2670 {
2671 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2672 GETCHARINC(d, eptr);
2673 if (fc == d) RRETURN(MATCH_NOMATCH);
2674 }
2675 }
2676 else
2677 #endif
2678 /* Not UTF-8 mode */
2679 {
2680 for (i = 1; i <= min; i++)
2681 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2682 }
2683
2684 if (min == max) continue;
2685
2686 if (minimize)
2687 {
2688 #ifdef SUPPORT_UTF8
2689 /* UTF-8 mode */
2690 if (utf8)
2691 {
2692 register unsigned int d;
2693 for (fi = min;; fi++) /* Try the rest first, then take one more char */
2694 {
2695 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2696 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2697 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2698 GETCHARINC(d, eptr); /* In bounds: the limits were tested just above */
2699 if (fc == d) RRETURN(MATCH_NOMATCH);
2700 }
2701 }
2702 else
2703 #endif
2704 /* Not UTF-8 mode */
2705 {
2706 for (fi = min;; fi++)
2707 {
2708 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2709 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2710 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2711 RRETURN(MATCH_NOMATCH);
2712 }
2713 }
2714 /* Control never gets here */
2715 }
2716
2717 /* Maximize case */
2718
2719 else
2720 {
2721 pp = eptr;
2722
2723 #ifdef SUPPORT_UTF8
2724 /* UTF-8 mode */
2725 if (utf8)
2726 {
2727 register unsigned int d;
2728 for (i = min; i < max; i++)
2729 {
2730 int len = 1;
2731 if (eptr >= md->end_subject) break;
2732 GETCHARLEN(d, eptr, len);
2733 if (fc == d) break;
2734 eptr += len;
2735 }
2736 if (possessive) continue;
2737 for(;;)
2738 {
2739 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2740 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2741 if (eptr-- == pp) break; /* Stop if tried at original pos */
2742 BACKCHAR(eptr);
2743 }
2744 }
2745 else
2746 #endif
2747 /* Not UTF-8 mode */
2748 {
2749 for (i = min; i < max; i++)
2750 {
2751 if (eptr >= md->end_subject || fc == *eptr) break;
2752 eptr++;
2753 }
2754 if (possessive) continue;
2755 while (eptr >= pp)
2756 {
2757 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2758 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2759 eptr--;
2760 }
2761 }
2762
2763 RRETURN(MATCH_NOMATCH);
2764 }
2765 }
2766 /* Control never gets here */
2767
2768 /* Match a single character type repeatedly; several different opcodes
2769 share code. This is very similar to the code for single characters, but we
2770 repeat it in the interests of efficiency. */
2771
2772 case OP_TYPEEXACT:
2773 min = max = GET2(ecode, 1);
2774 minimize = TRUE;
2775 ecode += 3;
2776 goto REPEATTYPE;
2777
2778 case OP_TYPEUPTO:
2779 case OP_TYPEMINUPTO:
2780 min = 0;
2781 max = GET2(ecode, 1);
2782 minimize = *ecode == OP_TYPEMINUPTO;
2783 ecode += 3;
2784 goto REPEATTYPE;
2785
2786 case OP_TYPEPOSSTAR:
2787 possessive = TRUE;
2788 min = 0;
2789 max = INT_MAX;
2790 ecode++;
2791 goto REPEATTYPE;
2792
2793 case OP_TYPEPOSPLUS:
2794 possessive = TRUE;
2795 min = 1;
2796 max = INT_MAX;
2797 ecode++;
2798 goto REPEATTYPE;
2799
2800 case OP_TYPEPOSQUERY:
2801 possessive = TRUE;
2802 min = 0;
2803 max = 1;
2804 ecode++;
2805 goto REPEATTYPE;
2806
2807 case OP_TYPEPOSUPTO:
2808 possessive = TRUE;
2809 min = 0;
2810 max = GET2(ecode, 1);
2811 ecode += 3;
2812 goto REPEATTYPE;
2813
2814 case OP_TYPESTAR:
2815 case OP_TYPEMINSTAR:
2816 case OP_TYPEPLUS:
2817 case OP_TYPEMINPLUS:
2818 case OP_TYPEQUERY:
2819 case OP_TYPEMINQUERY:
2820 c = *ecode++ - OP_TYPESTAR;
2821 minimize = (c & 1) != 0;
2822 min = rep_min[c]; /* Pick up values from tables; */
2823 max = rep_max[c]; /* zero for max => infinity */
2824 if (max == 0) max = INT_MAX;
2825
2826 /* Common code for all repeated single character type matches. Note that
2827 in UTF-8 mode, '.' matches a character of any length, but for the other
2828 character types, the valid characters are all one-byte long. */
2829
2830 REPEATTYPE:
2831 ctype = *ecode++; /* Code for the character type */
2832
2833 #ifdef SUPPORT_UCP
2834 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2835 {
2836 prop_fail_result = ctype == OP_NOTPROP;
2837 prop_type = *ecode++;
2838 prop_value = *ecode++;
2839 }
2840 else prop_type = -1;
2841 #endif
2842
2843 /* First, ensure the minimum number of matches are present. Use inline
2844 code for maximizing the speed, and do the type test once at the start
2845 (i.e. keep it out of the loop). Also we can test that there are at least
2846 the minimum number of bytes before we start. This isn't as effective in
2847 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2848 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2849 and single-bytes. */
2850
2851 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2852 if (min > 0)
2853 {
2854 #ifdef SUPPORT_UCP
2855 if (prop_type >= 0)
2856 {
2857 switch(prop_type)
2858 {
2859 case PT_ANY:
2860 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2861 for (i = 1; i <= min; i++)
2862 {
2863 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2864 GETCHARINCTEST(c, eptr);
2865 }
2866 break;
2867
2868 case PT_LAMP:
2869 for (i = 1; i <= min; i++)
2870 {
2871 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2872 GETCHARINCTEST(c, eptr);
2873 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2874 if ((prop_chartype == ucp_Lu ||
2875 prop_chartype == ucp_Ll ||
2876 prop_chartype == ucp_Lt) == prop_fail_result)
2877 RRETURN(MATCH_NOMATCH);
2878 }
2879 break;
2880
2881 case PT_GC:
2882 for (i = 1; i <= min; i++)
2883 {
2884 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2885 GETCHARINCTEST(c, eptr);
2886 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2887 if ((prop_category == prop_value) == prop_fail_result)
2888 RRETURN(MATCH_NOMATCH);
2889 }
2890 break;
2891
2892 case PT_PC:
2893 for (i = 1; i <= min; i++)
2894 {
2895 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2896 GETCHARINCTEST(c, eptr);
2897 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2898 if ((prop_chartype == prop_value) == prop_fail_result)
2899 RRETURN(MATCH_NOMATCH);
2900 }
2901 break;
2902
2903 case PT_SC:
2904 for (i = 1; i <= min; i++)
2905 {
2906 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2907 GETCHARINCTEST(c, eptr);
2908 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2909 if ((prop_script == prop_value) == prop_fail_result)
2910 RRETURN(MATCH_NOMATCH);
2911 }
2912 break;
2913
2914 default:
2915 RRETURN(PCRE_ERROR_INTERNAL);
2916 }
2917 }
2918
2919 /* Match extended Unicode sequences. We will get here only if the
2920 support is in the binary; otherwise a compile-time error occurs. */
2921
2922 else if (ctype == OP_EXTUNI)
2923 {
2924 for (i = 1; i <= min; i++)
2925 {
2926 /* Each item is one character that is not of category ucp_M, plus any
2927 ucp_M characters after it, so earlier items may have used extra bytes. */
2928 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2929 GETCHARINCTEST(c, eptr);
2930 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2931 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2932 while (eptr < md->end_subject)
2933 {
2934 int len = 1;
2935 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2936 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2937 if (prop_category != ucp_M) break;
2938 eptr += len;
2939 }
2940 }
2941 }
2942
2943 else
2944 #endif /* SUPPORT_UCP */
2945
2946 /* Handle all other cases when the coding is UTF-8 */
2947
2948 #ifdef SUPPORT_UTF8
2949 if (utf8) switch(ctype)
2950 {
2951 case OP_ANY:
2952 for (i = 1; i <= min; i++)
2953 {
2954 if (eptr >= md->end_subject || IS_NEWLINE(eptr))
2955 RRETURN(MATCH_NOMATCH);
2956 eptr++;
2957 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2958 }
2959 break;
2960
2961 case OP_ALLANY:
2962 for (i = 1; i <= min; i++)
2963 {
2964 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2965 eptr++;
2966 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2967 }
2968 break;
2969
2970 case OP_ANYBYTE:
2971 eptr += min;
2972 break;
2973
2974 case OP_ANYNL:
2975 for (i = 1; i <= min; i++)
2976 {
2977 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2978 GETCHARINC(c, eptr);
2979 switch(c)
2980 {
2981 default: RRETURN(MATCH_NOMATCH);
2982 case 0x000d:
2983 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2984 break;
2985
2986 case 0x000a:
2987 break;
2988
2989 case 0x000b:
2990 case 0x000c:
2991 case 0x0085:
2992 case 0x2028:
2993 case 0x2029:
2994 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2995 break;
2996 }
2997 }
2998 break;
2999
3000 case OP_NOT_HSPACE:
3001 for (i = 1; i <= min; i++)
3002 {
3003 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3004 GETCHARINC(c, eptr);
3005 switch(c)
3006 {
3007 default: break;
3008 case 0x09: /* HT */
3009 case 0x20: /* SPACE */
3010 case 0xa0: /* NBSP */
3011 case 0x1680: /* OGHAM SPACE MARK */
3012 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3013 case 0x2000: /* EN QUAD */
3014 case 0x2001: /* EM QUAD */
3015 case 0x2002: /* EN SPACE */
3016 case 0x2003: /* EM SPACE */
3017 case 0x2004: /* THREE-PER-EM SPACE */
3018 case 0x2005: /* FOUR-PER-EM SPACE */
3019 case 0x2006: /* SIX-PER-EM SPACE */
3020 case 0x2007: /* FIGURE SPACE */
3021 case 0x2008: /* PUNCTUATION SPACE */
3022 case 0x2009: /* THIN SPACE */
3023 case 0x200A: /* HAIR SPACE */
3024 case 0x202f: /* NARROW NO-BREAK SPACE */
3025 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3026 case 0x3000: /* IDEOGRAPHIC SPACE */
3027 RRETURN(MATCH_NOMATCH);
3028 }
3029 }
3030 break;
3031
3032 case OP_HSPACE:
3033 for (i = 1; i <= min; i++)
3034 {
3035 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3036 GETCHARINC(c, eptr);
3037 switch(c)
3038 {
3039 default: RRETURN(MATCH_NOMATCH);
3040 case 0x09: /* HT */
3041 case 0x20: /* SPACE */
3042 case 0xa0: /* NBSP */
3043 case 0x1680: /* OGHAM SPACE MARK */
3044 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3045 case 0x2000: /* EN QUAD */
3046 case 0x2001: /* EM QUAD */
3047 case 0x2002: /* EN SPACE */
3048 case 0x2003: /* EM SPACE */
3049 case 0x2004: /* THREE-PER-EM SPACE */
3050 case 0x2005: /* FOUR-PER-EM SPACE */
3051 case 0x2006: /* SIX-PER-EM SPACE */
3052 case 0x2007: /* FIGURE SPACE */
3053 case 0x2008: /* PUNCTUATION SPACE */
3054 case 0x2009: /* THIN SPACE */
3055 case 0x200A: /* HAIR SPACE */
3056 case 0x202f: /* NARROW NO-BREAK SPACE */
3057 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3058 case 0x3000: /* IDEOGRAPHIC SPACE */
3059 break;
3060 }
3061 }
3062 break;
3063
3064 case OP_NOT_VSPACE:
3065 for (i = 1; i <= min; i++)
3066 {
3067 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3068 GETCHARINC(c, eptr);
3069 switch(c)
3070 {
3071 default: break;
3072 case 0x0a: /* LF */
3073 case 0x0b: /* VT */
3074 case 0x0c: /* FF */
3075 case 0x0d: /* CR */
3076 case 0x85: /* NEL */
3077 case 0x2028: /* LINE SEPARATOR */
3078 case 0x2029: /* PARAGRAPH SEPARATOR */
3079 RRETURN(MATCH_NOMATCH);
3080 }
3081 }
3082 break;
3083
3084 case OP_VSPACE:
3085 for (i = 1; i <= min; i++)
3086 {
3087 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3088 GETCHARINC(c, eptr);
3089 switch(c)
3090 {
3091 default: RRETURN(MATCH_NOMATCH);
3092 case 0x0a: /* LF */
3093 case 0x0b: /* VT */
3094 case 0x0c: /* FF */
3095 case 0x0d: /* CR */
3096 case 0x85: /* NEL */
3097 case 0x2028: /* LINE SEPARATOR */
3098 case 0x2029: /* PARAGRAPH SEPARATOR */
3099 break;
3100 }
3101 }
3102 break;
3103
3104 case OP_NOT_DIGIT:
3105 for (i = 1; i <= min; i++)
3106 {
3107 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3108 GETCHARINC(c, eptr);
3109 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3110 RRETURN(MATCH_NOMATCH);
3111 }
3112 break;
3113
3114 case OP_DIGIT:
3115 for (i = 1; i <= min; i++)
3116 {
3117 if (eptr >= md->end_subject ||
3118 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3119 RRETURN(MATCH_NOMATCH);
3120 /* No need to skip more bytes - we know it's a 1-byte character */
3121 }
3122 break;
3123
3124 case OP_NOT_WHITESPACE:
3125 for (i = 1; i <= min; i++)
3126 {
3127 if (eptr >= md->end_subject ||
3128 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3129 RRETURN(MATCH_NOMATCH);
3130 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3131 }
3132 break;
3133
3134 case OP_WHITESPACE:
3135 for (i = 1; i <= min; i++)
3136 {
3137 if (eptr >= md->end_subject ||
3138 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3139 RRETURN(MATCH_NOMATCH);
3140 /* No need to skip more bytes - we know it's a 1-byte character */
3141 }
3142 break;
3143
3144 case OP_NOT_WORDCHAR:
3145 for (i = 1; i <= min; i++)
3146 {
3147 if (eptr >= md->end_subject ||
3148 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3149 RRETURN(MATCH_NOMATCH);
3150 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3151 }
3152 break;
3153
3154 case OP_WORDCHAR:
3155 for (i = 1; i <= min; i++)
3156 {
3157 if (eptr >= md->end_subject ||
3158 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3159 RRETURN(MATCH_NOMATCH);
3160 /* No need to skip more bytes - we know it's a 1-byte character */
3161 }
3162 break;
3163
3164 default:
3165 RRETURN(PCRE_ERROR_INTERNAL);
3166 } /* End switch(ctype) */
3167
3168 else
3169 #endif /* SUPPORT_UTF8 */
3170
3171 /* Code for the non-UTF-8 case for minimum matching of operators other
3172 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3173 number of bytes present, as this was tested above. */
3174
3175 switch(ctype)
3176 {
3177 case OP_ANY:
3178 for (i = 1; i <= min; i++)
3179 {
3180 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3181 eptr++;
3182 }
3183 break;
3184
3185 case OP_ALLANY:
3186 eptr += min;
3187 break;
3188
3189 case OP_ANYBYTE:
3190 eptr += min;
3191 break;
3192
3193 /* Because of the CRLF case, we can't assume the minimum number of
3194 bytes are present in this case. */
3195
3196 case OP_ANYNL:
3197 for (i = 1; i <= min; i++)
3198 {
3199 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3200 switch(*eptr++)
3201 {
3202 default: RRETURN(MATCH_NOMATCH);
3203 case 0x000d:
3204 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3205 break;
3206 case 0x000a:
3207 break;
3208
3209 case 0x000b:
3210 case 0x000c:
3211 case 0x0085:
3212 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3213 break;
3214 }
3215 }
3216 break;
3217
3218 case OP_NOT_HSPACE:
3219 for (i = 1; i <= min; i++)
3220 {
3221 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3222 switch(*eptr++)
3223 {
3224 default: break;
3225 case 0x09: /* HT */
3226 case 0x20: /* SPACE */
3227 case 0xa0: /* NBSP */
3228 RRETURN(MATCH_NOMATCH);
3229 }
3230 }
3231 break;
3232
3233 case OP_HSPACE:
3234 for (i = 1; i <= min; i++)
3235 {
3236 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3237 switch(*eptr++)
3238 {
3239 default: RRETURN(MATCH_NOMATCH);
3240 case 0x09: /* HT */
3241 case 0x20: /* SPACE */
3242 case 0xa0: /* NBSP */
3243 break;
3244 }
3245 }
3246 break;
3247
3248 case OP_NOT_VSPACE:
3249 for (i = 1; i <= min; i++)
3250 {
3251 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3252 switch(*eptr++)
3253 {
3254 default: break;
3255 case 0x0a: /* LF */
3256 case 0x0b: /* VT */
3257 case 0x0c: /* FF */
3258 case 0x0d: /* CR */
3259 case 0x85: /* NEL */
3260 RRETURN(MATCH_NOMATCH);
3261 }
3262 }
3263 break;
3264
3265 case OP_VSPACE:
3266 for (i = 1; i <= min; i++)
3267 {
3268 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3269 switch(*eptr++)
3270 {
3271 default: RRETURN(MATCH_NOMATCH);
3272 case 0x0a: /* LF */
3273 case 0x0b: /* VT */
3274 case 0x0c: /* FF */
3275 case 0x0d: /* CR */
3276 case 0x85: /* NEL */
3277 break;
3278 }
3279 }
3280 break;
3281
3282 case OP_NOT_DIGIT:
3283 for (i = 1; i <= min; i++)
3284 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3285 break;
3286
3287 case OP_DIGIT:
3288 for (i = 1; i <= min; i++)
3289 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3290 break;
3291
3292 case OP_NOT_WHITESPACE:
3293 for (i = 1; i <= min; i++)
3294 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3295 break;
3296
3297 case OP_WHITESPACE:
3298 for (i = 1; i <= min; i++)
3299 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3300 break;
3301
3302 case OP_NOT_WORDCHAR:
3303 for (i = 1; i <= min; i++)
3304 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3305 RRETURN(MATCH_NOMATCH);
3306 break;
3307
3308 case OP_WORDCHAR:
3309 for (i = 1; i <= min; i++)
3310 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3311 RRETURN(MATCH_NOMATCH);
3312 break;
3313
3314 default:
3315 RRETURN(PCRE_ERROR_INTERNAL);
3316 }
3317 }
3318
3319 /* If min = max, continue at the same level without recursing */
3320
3321 if (min == max) continue;
3322
3323 /* If minimizing, we have to test the rest of the pattern before each
3324 subsequent match. Again, separate the UTF-8 case for speed, and also
3325 separate the UCP cases. */
3326
3327 if (minimize)
3328 {
3329 #ifdef SUPPORT_UCP
3330 if (prop_type >= 0)
3331 {
3332 switch(prop_type)
3333 {
3334 case PT_ANY:
3335 for (fi = min;; fi++)
3336 {
3337 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3338 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3339 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3340 GETCHARINC(c, eptr);
3341 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3342 }
3343 /* Control never gets here */
3344
3345 case PT_LAMP:
3346 for (fi = min;; fi++)
3347 {
3348 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3349 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3350 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3351 GETCHARINC(c, eptr);
3352 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3353 if ((prop_chartype == ucp_Lu ||
3354 prop_chartype == ucp_Ll ||
3355 prop_chartype == ucp_Lt) == prop_fail_result)
3356 RRETURN(MATCH_NOMATCH);
3357 }
3358 /* Control never gets here */
3359
3360 case PT_GC:
3361 for (fi = min;; fi++)
3362 {
3363 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3364 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3365 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3366 GETCHARINC(c, eptr);
3367 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3368 if ((prop_category == prop_value) == prop_fail_result)
3369 RRETURN(MATCH_NOMATCH);
3370 }
3371 /* Control never gets here */
3372
3373 case PT_PC:
3374 for (fi = min;; fi++)
3375 {
3376 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3377 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3378 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3379 GETCHARINC(c, eptr);
3380 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3381 if ((prop_chartype == prop_value) == prop_fail_result)
3382 RRETURN(MATCH_NOMATCH);
3383 }
3384 /* Control never gets here */
3385
3386 case PT_SC:
3387 for (fi = min;; fi++)
3388 {
3389 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3390 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3391 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3392 GETCHARINC(c, eptr);
3393 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3394 if ((prop_script == prop_value) == prop_fail_result)
3395 RRETURN(MATCH_NOMATCH);
3396 }
3397 /* Control never gets here */
3398
3399 default:
3400 RRETURN(PCRE_ERROR_INTERNAL);
3401 }
3402 }
3403
3404 /* Match extended Unicode sequences. We will get here only if the
3405 support is in the binary; otherwise a compile-time error occurs. */
3406
3407 else if (ctype == OP_EXTUNI)
3408 {
3409 for (fi = min;; fi++)
3410 {
3411 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3412 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3413 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3414 GETCHARINCTEST(c, eptr);
3415 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3416 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3417 while (eptr < md->end_subject)
3418 {
3419 int len = 1;
3420 if (!utf8) c = *eptr; else
3421 {
3422 GETCHARLEN(c, eptr, len);
3423 }
3424 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3425 if (prop_category != ucp_M) break;
3426 eptr += len;
3427 }
3428 }
3429 }
3430
3431 else
3432 #endif /* SUPPORT_UCP */
3433
3434 #ifdef SUPPORT_UTF8
3435 /* UTF-8 mode */
3436 if (utf8)
3437 {
3438 for (fi = min;; fi++)
3439 {
3440 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3441 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3442 if (fi >= max || eptr >= md->end_subject ||
3443 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3444 RRETURN(MATCH_NOMATCH);
3445
3446 GETCHARINC(c, eptr);
3447 switch(ctype)
3448 {
3449 case OP_ANY: /* This is the non-NL case */
3450 case OP_ALLANY:
3451 case OP_ANYBYTE:
3452 break;
3453
3454 case OP_ANYNL:
3455 switch(c)
3456 {
3457 default: RRETURN(MATCH_NOMATCH);
3458 case 0x000d:
3459 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3460 break;
3461 case 0x000a:
3462 break;
3463
3464 case 0x000b:
3465 case 0x000c:
3466 case 0x0085:
3467 case 0x2028:
3468 case 0x2029:
3469 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3470 break;
3471 }
3472 break;
3473
3474 case OP_NOT_HSPACE:
3475 switch(c)
3476 {
3477 default: break;
3478 case 0x09: /* HT */
3479 case 0x20: /* SPACE */
3480 case 0xa0: /* NBSP */
3481 case 0x1680: /* OGHAM SPACE MARK */
3482 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3483 case 0x2000: /* EN QUAD */
3484 case 0x2001: /* EM QUAD */
3485 case 0x2002: /* EN SPACE */
3486 case 0x2003: /* EM SPACE */
3487 case 0x2004: /* THREE-PER-EM SPACE */
3488 case 0x2005: /* FOUR-PER-EM SPACE */
3489 case 0x2006: /* SIX-PER-EM SPACE */
3490 case 0x2007: /* FIGURE SPACE */
3491 case 0x2008: /* PUNCTUATION SPACE */
3492 case 0x2009: /* THIN SPACE */
3493 case 0x200A: /* HAIR SPACE */
3494 case 0x202f: /* NARROW NO-BREAK SPACE */
3495 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3496 case 0x3000: /* IDEOGRAPHIC SPACE */
3497 RRETURN(MATCH_NOMATCH);
3498 }
3499 break;
3500
3501 case OP_HSPACE:
3502 switch(c)
3503 {
3504 default: RRETURN(MATCH_NOMATCH);
3505 case 0x09: /* HT */
3506 case 0x20: /* SPACE */
3507 case 0xa0: /* NBSP */
3508 case 0x1680: /* OGHAM SPACE MARK */
3509 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3510 case 0x2000: /* EN QUAD */
3511 case 0x2001: /* EM QUAD */
3512 case 0x2002: /* EN SPACE */
3513 case 0x2003: /* EM SPACE */
3514 case 0x2004: /* THREE-PER-EM SPACE */
3515 case 0x2005: /* FOUR-PER-EM SPACE */
3516 case 0x2006: /* SIX-PER-EM SPACE */
3517 case 0x2007: /* FIGURE SPACE */
3518 case 0x2008: /* PUNCTUATION SPACE */
3519 case 0x2009: /* THIN SPACE */
3520 case 0x200A: /* HAIR SPACE */
3521 case 0x202f: /* NARROW NO-BREAK SPACE */
3522 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3523 case 0x3000: /* IDEOGRAPHIC SPACE */
3524 break;
3525 }
3526 break;
3527
3528 case OP_NOT_VSPACE:
3529 switch(c)
3530 {
3531 default: break;
3532 case 0x0a: /* LF */
3533 case 0x0b: /* VT */
3534 case 0x0c: /* FF */
3535 case 0x0d: /* CR */
3536 case 0x85: /* NEL */
3537 case 0x2028: /* LINE SEPARATOR */
3538 case 0x2029: /* PARAGRAPH SEPARATOR */
3539 RRETURN(MATCH_NOMATCH);
3540 }
3541 break;
3542
3543 case OP_VSPACE:
3544 switch(c)
3545 {
3546 default: RRETURN(MATCH_NOMATCH);
3547 case 0x0a: /* LF */
3548 case 0x0b: /* VT */
3549 case 0x0c: /* FF */
3550 case 0x0d: /* CR */
3551 case 0x85: /* NEL */
3552 case 0x2028: /* LINE SEPARATOR */
3553 case 0x2029: /* PARAGRAPH SEPARATOR */
3554 break;
3555 }
3556 break;
3557
3558 case OP_NOT_DIGIT:
3559 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3560 RRETURN(MATCH_NOMATCH);
3561 break;
3562
3563 case OP_DIGIT:
3564 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3565 RRETURN(MATCH_NOMATCH);
3566 break;
3567
3568 case OP_NOT_WHITESPACE:
3569 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3570 RRETURN(MATCH_NOMATCH);
3571 break;
3572
3573 case OP_WHITESPACE:
3574 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3575 RRETURN(MATCH_NOMATCH);
3576 break;
3577
3578 case OP_NOT_WORDCHAR:
3579 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3580 RRETURN(MATCH_NOMATCH);
3581 break;
3582
3583 case OP_WORDCHAR:
3584 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3585 RRETURN(MATCH_NOMATCH);
3586 break;
3587
3588 default:
3589 RRETURN(PCRE_ERROR_INTERNAL);
3590 }
3591 }
3592 }
3593 else
3594 #endif
3595 /* Not UTF-8 mode */
3596 {
3597 for (fi = min;; fi++)
3598 {
3599 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3600 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3601 if (fi >= max || eptr >= md->end_subject ||
3602 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3603 RRETURN(MATCH_NOMATCH);
3604
3605 c = *eptr++;
3606 switch(ctype)
3607 {
3608 case OP_ANY: /* This is the non-NL case */
3609 case OP_ALLANY:
3610 case OP_ANYBYTE:
3611 break;
3612
3613 case OP_ANYNL:
3614 switch(c)
3615 {
3616 default: RRETURN(MATCH_NOMATCH);
3617 case 0x000d:
3618 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3619 break;
3620
3621 case 0x000a:
3622 break;
3623
3624 case 0x000b:
3625 case 0x000c:
3626 case 0x0085:
3627 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3628 break;
3629 }
3630 break;
3631
3632 case OP_NOT_HSPACE:
3633 switch(c)
3634 {
3635 default: break;
3636 case 0x09: /* HT */
3637 case 0x20: /* SPACE */
3638 case 0xa0: /* NBSP */
3639 RRETURN(MATCH_NOMATCH);
3640 }
3641 break;
3642
3643 case OP_HSPACE:
3644 switch(c)
3645 {
3646 default: RRETURN(MATCH_NOMATCH);
3647 case 0x09: /* HT */
3648 case 0x20: /* SPACE */
3649 case 0xa0: /* NBSP */
3650 break;
3651 }
3652 break;
3653
3654 case OP_NOT_VSPACE:
3655 switch(c)
3656 {
3657 default: break;
3658 case 0x0a: /* LF */
3659 case 0x0b: /* VT */
3660 case 0x0c: /* FF */
3661 case 0x0d: /* CR */
3662 case 0x85: /* NEL */
3663 RRETURN(MATCH_NOMATCH);
3664 }
3665 break;
3666
3667 case OP_VSPACE:
3668 switch(c)
3669 {
3670 default: RRETURN(MATCH_NOMATCH);
3671 case 0x0a: /* LF */
3672 case 0x0b: /* VT */
3673 case 0x0c: /* FF */
3674 case 0x0d: /* CR */
3675 case 0x85: /* NEL */
3676 break;
3677 }
3678 break;
3679
3680 case OP_NOT_DIGIT:
3681 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3682 break;
3683
3684 case OP_DIGIT:
3685 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3686 break;
3687
3688 case OP_NOT_WHITESPACE:
3689 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3690 break;
3691
3692 case OP_WHITESPACE:
3693 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3694 break;
3695
3696 case OP_NOT_WORDCHAR:
3697 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3698 break;
3699
3700 case OP_WORDCHAR:
3701 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3702 break;
3703
3704 default:
3705 RRETURN(PCRE_ERROR_INTERNAL);
3706 }
3707 }
3708 }
3709 /* Control never gets here */
3710 }
3711
3712 /* If maximizing, it is worth using inline code for speed, doing the type
3713 test once at the start (i.e. keep it out of the loop). Again, keep the
3714 UTF-8 and UCP stuff separate. */
3715
3716 else
3717 {
3718 pp = eptr; /* Remember where we started */
3719
3720 #ifdef SUPPORT_UCP
3721 if (prop_type >= 0)
3722 {
3723 switch(prop_type)
3724 {
3725 case PT_ANY:
3726 for (i = min; i < max; i++)
3727 {
3728 int len = 1;
3729 if (eptr >= md->end_subject) break;
3730 GETCHARLEN(c, eptr, len);
3731 if (prop_fail_result) break;
3732 eptr+= len;
3733 }
3734 break;
3735
3736 case PT_LAMP:
3737 for (i = min; i < max; i++)
3738 {
3739 int len = 1;
3740 if (eptr >= md->end_subject) break;
3741 GETCHARLEN(c, eptr, len);
3742 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3743 if ((prop_chartype == ucp_Lu ||
3744 prop_chartype == ucp_Ll ||
3745 prop_chartype == ucp_Lt) == prop_fail_result)
3746 break;
3747 eptr+= len;
3748 }
3749 break;
3750
3751 case PT_GC:
3752 for (i = min; i < max; i++)
3753 {
3754 int len = 1;
3755 if (eptr >= md->end_subject) break;
3756 GETCHARLEN(c, eptr, len);
3757 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3758 if ((prop_category == prop_value) == prop_fail_result)
3759 break;
3760 eptr+= len;
3761 }
3762 break;
3763
3764 case PT_PC:
3765 for (i = min; i < max; i++)
3766 {
3767 int len = 1;
3768 if (eptr >= md->end_subject) break;
3769 GETCHARLEN(c, eptr, len);
3770 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3771 if ((prop_chartype == prop_value) == prop_fail_result)
3772 break;
3773 eptr+= len;
3774 }
3775 break;
3776
3777 case PT_SC:
3778 for (i = min; i < max; i++)
3779 {
3780 int len = 1;
3781 if (eptr >= md->end_subject) break;
3782 GETCHARLEN(c, eptr, len);
3783 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3784 if ((prop_script == prop_value) == prop_fail_result)
3785 break;
3786 eptr+= len;
3787 }
3788 break;
3789 }
3790
3791 /* eptr is now past the end of the maximum run */
3792
3793 if (possessive) continue;
3794 for(;;)
3795 {
3796 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3797 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3798 if (eptr-- == pp) break; /* Stop if tried at original pos */
3799 if (utf8) BACKCHAR(eptr);
3800 }
3801 }
3802
3803 /* Match extended Unicode sequences. We will get here only if the
3804 support is in the binary; otherwise a compile-time error occurs. */
3805
3806 else if (ctype == OP_EXTUNI)
3807 {
3808 for (i = min; i < max; i++)
3809 {
3810 if (eptr >= md->end_subject) break;
3811 GETCHARINCTEST(c, eptr);
3812 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3813 if (prop_category == ucp_M) break;
3814 while (eptr < md->end_subject)
3815 {
3816 int len = 1;
3817 if (!utf8) c = *eptr; else
3818 {
3819 GETCHARLEN(c, eptr, len);
3820 }
3821 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3822 if (prop_category != ucp_M) break;
3823 eptr += len;
3824 }
3825 }
3826
3827 /* eptr is now past the end of the maximum run */
3828
3829 if (possessive) continue;
3830 for(;;)
3831 {
3832 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3833 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3834 if (eptr-- == pp) break; /* Stop if tried at original pos */
3835 for (;;) /* Move back over one extended */
3836 {
3837 int len = 1;
3838 if (!utf8) c = *eptr; else
3839 {
3840 BACKCHAR(eptr);
3841 GETCHARLEN(c, eptr, len);
3842 }
3843 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3844 if (prop_category != ucp_M) break;
3845 eptr--;
3846 }
3847 }
3848 }
3849
3850 else
3851 #endif /* SUPPORT_UCP */
3852
3853 #ifdef SUPPORT_UTF8
3854 /* UTF-8 mode */
3855
3856 if (utf8)
3857 {
3858 switch(ctype)
3859 {
3860 case OP_ANY:
3861 if (max < INT_MAX)
3862 {
3863 for (i = min; i < max; i++)
3864 {
3865 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3866 eptr++;
3867 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3868 }
3869 }
3870
3871 /* Handle unlimited UTF-8 repeat */
3872
3873 else
3874 {
3875 for (i = min; i < max; i++)
3876 {
3877 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3878 eptr++;
3879 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3880 }
3881 }
3882 break;
3883
3884 case OP_ALLANY:
3885 if (max < INT_MAX)
3886 {
3887 for (i = min; i < max; i++)
3888 {
3889 if (eptr >= md->end_subject) break;
3890 eptr++;
3891 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3892 }
3893 }
3894 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3895 break;
3896
3897 /* The byte case is the same as non-UTF8 */
3898
3899 case OP_ANYBYTE:
3900 c = max - min;
3901 if (c > (unsigned int)(md->end_subject - eptr))
3902 c = md->end_subject - eptr;
3903 eptr += c;
3904 break;
3905
3906 case OP_ANYNL:
3907 for (i = min; i < max; i++)
3908 {
3909 int len = 1;
3910 if (eptr >= md->end_subject) break;
3911 GETCHARLEN(c, eptr, len);
3912 if (c == 0x000d)
3913 {
3914 if (++eptr >= md->end_subject) break;
3915 if (*eptr == 0x000a) eptr++;
3916 }
3917 else
3918 {
3919 if (c != 0x000a &&
3920 (md->bsr_anycrlf ||
3921 (c != 0x000b && c != 0x000c &&
3922 c != 0x0085 && c != 0x2028 && c != 0x2029)))
3923 break;
3924 eptr += len;
3925 }
3926 }
3927 break;
3928
3929 case OP_NOT_HSPACE:
3930 case OP_HSPACE:
3931 for (i = min; i < max; i++)
3932 {
3933 BOOL gotspace;
3934 int len = 1;
3935 if (eptr >= md->end_subject) break;
3936 GETCHARLEN(c, eptr, len);
3937 switch(c)
3938 {
3939 default: gotspace = FALSE; break;
3940 case 0x09: /* HT */
3941 case 0x20: /* SPACE */
3942 case 0xa0: /* NBSP */
3943 case 0x1680: /* OGHAM SPACE MARK */
3944 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3945 case 0x2000: /* EN QUAD */
3946 case 0x2001: /* EM QUAD */
3947 case 0x2002: /* EN SPACE */
3948 case 0x2003: /* EM SPACE */
3949 case 0x2004: /* THREE-PER-EM SPACE */
3950 case 0x2005: /* FOUR-PER-EM SPACE */
3951 case 0x2006: /* SIX-PER-EM SPACE */
3952 case 0x2007: /* FIGURE SPACE */
3953 case 0x2008: /* PUNCTUATION SPACE */
3954 case 0x2009: /* THIN SPACE */
3955 case 0x200A: /* HAIR SPACE */
3956 case 0x202f: /* NARROW NO-BREAK SPACE */
3957 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3958 case 0x3000: /* IDEOGRAPHIC SPACE */
3959 gotspace = TRUE;
3960 break;
3961 }
3962 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3963 eptr += len;
3964 }
3965 break;
3966
3967 case OP_NOT_VSPACE:
3968 case OP_VSPACE:
3969 for (i = min; i < max; i++)
3970 {
3971 BOOL gotspace;
3972 int len = 1;
3973 if (eptr >= md->end_subject) break;
3974 GETCHARLEN(c, eptr, len);
3975 switch(c)
3976 {
3977 default: gotspace = FALSE; break;
3978 case 0x0a: /* LF */
3979 case 0x0b: /* VT */
3980 case 0x0c: /* FF */
3981 case 0x0d: /* CR */
3982 case 0x85: /* NEL */
3983 case 0x2028: /* LINE SEPARATOR */
3984 case 0x2029: /* PARAGRAPH SEPARATOR */
3985 gotspace = TRUE;
3986 break;
3987 }
3988 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3989 eptr += len;
3990 }
3991 break;
3992
3993 case OP_NOT_DIGIT:
3994 for (i = min; i < max; i++)
3995 {
3996 int len = 1;
3997 if (eptr >= md->end_subject) break;
3998 GETCHARLEN(c, eptr, len);
3999 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4000 eptr+= len;
4001 }
4002 break;
4003
4004 case OP_DIGIT:
4005 for (i = min; i < max; i++)
4006 {
4007 int len = 1;
4008 if (eptr >= md->end_subject) break;
4009 GETCHARLEN(c, eptr, len);
4010 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4011 eptr+= len;
4012 }
4013 break;
4014
4015 case OP_NOT_WHITESPACE:
4016 for (i = min; i < max; i++)
4017 {
4018 int len = 1;
4019 if (eptr >= md->end_subject) break;
4020 GETCHARLEN(c, eptr, len);
4021 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4022 eptr+= len;
4023 }
4024 break;
4025
4026 case OP_WHITESPACE:
4027 for (i = min; i < max; i++)
4028 {
4029 int len = 1;
4030 if (eptr >= md->end_subject) break;
4031 GETCHARLEN(c, eptr, len);
4032 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4033 eptr+= len;
4034 }
4035 break;
4036
4037 case OP_NOT_WORDCHAR:
4038 for (i = min; i < max; i++)
4039 {
4040 int len = 1;
4041 if (eptr >= md->end_subject) break;
4042 GETCHARLEN(c, eptr, len);
4043 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4044 eptr+= len;
4045 }
4046 break;
4047
4048 case OP_WORDCHAR:
4049 for (i = min; i < max; i++)
4050 {
4051 int len = 1;
4052 if (eptr >= md->end_subject) break;
4053 GETCHARLEN(c, eptr, len);
4054 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4055 eptr+= len;
4056 }
4057 break;
4058
4059 default:
4060 RRETURN(PCRE_ERROR_INTERNAL);
4061 }
4062
4063 /* eptr is now past the end of the maximum run */
4064
4065 if (possessive) continue;
4066 for(;;)
4067 {
4068 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4069 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4070 if (eptr-- == pp) break; /* Stop if tried at original pos */
4071 BACKCHAR(eptr);
4072 }
4073 }
4074 else
4075 #endif /* SUPPORT_UTF8 */
4076
4077 /* Not UTF-8 mode */
4078 {
4079 switch(ctype)
4080 {
4081 case OP_ANY:
4082 for (i = min; i < max; i++)
4083 {
4084 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4085 eptr++;
4086 }
4087 break;
4088
4089 case OP_ALLANY:
4090 case OP_ANYBYTE:
4091 c = max - min;
4092 if (c > (unsigned int)(md->end_subject - eptr))
4093 c = md->end_subject - eptr;
4094 eptr += c;
4095 break;
4096
4097 case OP_ANYNL:
4098 for (i = min; i < max; i++)
4099 {
4100 if (eptr >= md->end_subject) break;
4101 c = *eptr;
4102 if (c == 0x000d)
4103 {
4104 if (++eptr >= md->end_subject) break;
4105 if (*eptr == 0x000a) eptr++;
4106 }
4107 else
4108 {
4109 if (c != 0x000a &&
4110 (md->bsr_anycrlf ||
4111 (c != 0x000b && c != 0x000c && c != 0x0085)))
4112 break;
4113 eptr++;
4114 }
4115 }
4116 break;
4117
4118 case OP_NOT_HSPACE:
4119 for (i = min; i < max; i++)
4120 {
4121 if (eptr >= md->end_subject) break;
4122 c = *eptr;
4123 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4124 eptr++;
4125 }
4126 break;
4127
4128 case OP_HSPACE:
4129 for (i = min; i < max; i++)
4130 {
4131 if (eptr >= md->end_subject) break;
4132 c = *eptr;
4133 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4134 eptr++;
4135 }
4136 break;
4137
4138 case OP_NOT_VSPACE:
4139 for (i = min; i < max; i++)
4140 {
4141 if (eptr >= md->end_subject) break;
4142 c = *eptr;
4143 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4144 break;
4145 eptr++;
4146 }
4147 break;
4148
4149 case OP_VSPACE:
4150 for (i = min; i < max; i++)
4151 {
4152 if (eptr >= md->end_subject) break;
4153 c = *eptr;
4154 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4155 break;
4156 eptr++;
4157 }
4158 break;
4159
4160 case OP_NOT_DIGIT:
4161 for (i = min; i < max; i++)
4162 {
4163 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4164 break;
4165 eptr++;
4166 }
4167 break;
4168
4169 case OP_DIGIT:
4170 for (i = min; i < max; i++)
4171 {
4172 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4173 break;
4174 eptr++;
4175 }
4176 break;
4177
4178 case OP_NOT_WHITESPACE:
4179 for (i = min; i < max; i++)
4180 {
4181 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4182 break;
4183 eptr++;
4184 }
4185 break;
4186
4187 case OP_WHITESPACE:
4188 for (i = min; i < max; i++)
4189 {
4190 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4191 break;
4192 eptr++;
4193 }
4194 break;
4195
4196 case OP_NOT_WORDCHAR:
4197 for (i = min; i < max; i++)
4198 {
4199 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4200 break;
4201 eptr++;
4202 }
4203 break;
4204
4205 case OP_WORDCHAR:
4206 for (i = min; i < max; i++)
4207 {
4208 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4209 break;
4210 eptr++;
4211 }
4212 break;
4213
4214 default:
4215 RRETURN(PCRE_ERROR_INTERNAL);
4216 }
4217
4218 /* eptr is now past the end of the maximum run */
4219
4220 if (possessive) continue;
4221 while (eptr >= pp)
4222 {
4223 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4224 eptr--;
4225 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4226 }
4227 }
4228
4229 /* Get here if we can't make it match with any permitted repetitions */
4230
4231 RRETURN(MATCH_NOMATCH);
4232 }
4233 /* Control never gets here */
4234
4235 /* There's been some horrible disaster. Arrival here can only mean there is
4236 something seriously wrong in the code above or the OP_xxx definitions. */
4237
4238 default:
4239 DPRINTF(("Unknown opcode %d\n", *ecode));
4240 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4241 }
4242
4243 /* Do not stick any code in here without much thought; it is assumed
4244 that "continue" in the code above comes out to here to repeat the main
4245 loop. */
4246
4247 } /* End of main loop */
4248 /* Control never reaches here */
4249
4250
4251 /* When compiling to use the heap rather than the stack for recursive calls to
4252 match(), the RRETURN() macro jumps here. The number that is saved in
4253 frame->Xwhere indicates which label we actually want to return to. */
4254
4255 #ifdef NO_RECURSE
4256 #define LBL(val) case val: goto L_RM##val;
4257 HEAP_RETURN:
4258 switch (frame->Xwhere)
4259 {
4260 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4261 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4262 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4263 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4264 LBL(53) LBL(54)
4265 #ifdef SUPPORT_UTF8
4266 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4267 LBL(32) LBL(34) LBL(42) LBL(46)
4268 #ifdef SUPPORT_UCP
4269 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4270 #endif /* SUPPORT_UCP */
4271 #endif /* SUPPORT_UTF8 */
4272 default:
4273 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4274 return PCRE_ERROR_INTERNAL;
4275 }
4276 #undef LBL
4277 #endif /* NO_RECURSE */
4278 }
4279
4280
4281 /***************************************************************************
4282 ****************************************************************************
4283 RECURSION IN THE match() FUNCTION
4284
4285 Undefine all the macros that were defined above to handle this. */
4286
4287 #ifdef NO_RECURSE /* undo the macros that were defined above for the heap-based version of match() */
4288 #undef eptr
4289 #undef ecode
4290 #undef mstart
4291 #undef offset_top
4292 #undef ims
4293 #undef eptrb
4294 #undef flags
4295
4296 #undef callpat
4297 #undef charptr
4298 #undef data
4299 #undef next
4300 #undef pp
4301 #undef prev
4302 #undef saved_eptr
4303
4304 #undef new_recursive
4305
4306 #undef cur_is_word
4307 #undef condition
4308 #undef prev_is_word
4309
4310 #undef original_ims
4311
4312 #undef ctype
4313 #undef length
4314 #undef max
4315 #undef min
4316 #undef number
4317 #undef offset
4318 #undef op
4319 #undef save_capture_last
4320 #undef save_offset1
4321 #undef save_offset2
4322 #undef save_offset3
4323 #undef stacksave
4324
4325 #undef newptrb
4326
4327 #endif /* NO_RECURSE */
4328
4329 /* fc and fi are defined as macros in both cases (with and without NO_RECURSE), so they are always undefined here */
4330
4331 #undef fc
4332 #undef fi
4333
4334 /***************************************************************************
4335 ***************************************************************************/
4336
4337
4338
4339 /*************************************************
4340 * Execute a Regular Expression *
4341 *************************************************/
4342
4343 /* This function applies a compiled re to a subject string and picks out
4344 portions of the string if it matches. Two elements in the vector are set for
4345 each substring: the offsets to the start and end of the substring.
4346
4347 Arguments:
4348 argument_re points to the compiled expression
4349 extra_data points to extra data or is NULL
4350 subject points to the subject string
4351 length length of subject string (may contain binary zeros)
4352 start_offset where to start in the subject string
4353 options option bits
4354 offsets points to a vector of ints to be filled in with offsets
4355 offsetcount the number of elements in the vector
4356
4357 Returns: > 0 => success; value is one more than the highest-numbered offset pair that was filled in
4358 = 0 => success, but offsets is not big enough
4359 -1 => failed to match
4360 < -1 => some kind of unexpected problem
4361 */
4362
4363 PCRE_EXP_DEFN int
4364 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4365 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4366 int offsetcount)
4367 {
4368 int rc, resetcount, ocount;
4369 int first_byte = -1;
4370 int req_byte = -1;
4371 int req_byte2 = -1;
4372 int newline;
4373 unsigned long int ims;
4374 BOOL using_temporary_offsets = FALSE;
4375 BOOL anchored;
4376 BOOL startline;
4377 BOOL firstline;
4378 BOOL first_byte_caseless = FALSE;
4379 BOOL req_byte_caseless = FALSE;
4380 BOOL utf8;
4381 match_data match_block;
4382 match_data *md = &match_block;
4383 const uschar *tables;
4384 const uschar *start_bits = NULL;
4385 USPTR start_match = (USPTR)subject + start_offset;
4386 USPTR end_subject;
4387 USPTR req_byte_ptr = start_match - 1;
4388
4389 pcre_study_data internal_study;
4390 const pcre_study_data *study;
4391
4392 real_pcre internal_re;
4393 const real_pcre *external_re = (const real_pcre *)argument_re;
4394 const real_pcre *re = external_re;
4395
4396 /* Plausibility checks */
4397
4398 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4399 if (re == NULL || subject == NULL ||
4400 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4401 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4402
4403 /* Fish out the optional data from the extra_data structure, first setting
4404 the default values. */
4405
4406 study = NULL;
4407 md->match_limit = MATCH_LIMIT;
4408 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4409 md->callout_data = NULL;
4410
4411 /* The table pointer is always in native byte order. */
4412
4413 tables = external_re->tables;
4414
4415 if (extra_data != NULL)
4416 {
4417 register unsigned int flags = extra_data->flags;
4418 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4419 study = (const pcre_study_data *)extra_data->study_data;
4420 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4421 md->match_limit = extra_data->match_limit;
4422 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4423 md->match_limit_recursion = extra_data->match_limit_recursion;
4424 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4425 md->callout_data = extra_data->callout_data;
4426 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4427 }
4428
4429 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4430 is a feature that makes it possible to save compiled regex and re-use them
4431 in other programs later. */
4432
4433 if (tables == NULL) tables = _pcre_default_tables;
4434
4435 /* Check that the first field in the block is the magic number. If it is not,
4436 test for a regex that was compiled on a host of opposite endianness. If this is
4437 the case, flipped values are put in internal_re and internal_study if there was
4438 study data too. */
4439
4440 if (re->magic_number != MAGIC_NUMBER)
4441 {
4442 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4443 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4444 if (study != NULL) study = &internal_study;
4445 }
4446
4447 /* Set up other data */
4448
4449 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4450 startline = (re->flags & PCRE_STARTLINE) != 0;
4451 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4452
4453 /* The code starts after the real_pcre block and the capture name table. */
4454
4455 md->start_code = (const uschar *)external_re + re->name_table_offset +
4456 re->name_count * re->name_entry_size;
4457
4458 md->start_subject = (USPTR)subject;
4459 md->start_offset = start_offset;
4460 md->end_subject = md->start_subject + length;
4461 end_subject = md->end_subject;
4462
4463 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4464 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4465 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4466
4467 md->notbol = (options & PCRE_NOTBOL) != 0;
4468 md->noteol = (options & PCRE_NOTEOL) != 0;
4469 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4470 md->partial = (options & PCRE_PARTIAL) != 0;
4471 md->hitend = FALSE;
4472
4473 md->recursive = NULL; /* No recursion at top level */
4474
4475 md->lcc = tables + lcc_offset;
4476 md->ctypes = tables + ctypes_offset;
4477
4478 /* Handle different \R options. */
4479
4480 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4481 {
4482 case 0:
4483 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4484 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4485 else
4486 #ifdef BSR_ANYCRLF
4487 md->bsr_anycrlf = TRUE;
4488 #else
4489 md->bsr_anycrlf = FALSE;
4490 #endif
4491 break;
4492
4493 case PCRE_BSR_ANYCRLF:
4494 md->bsr_anycrlf = TRUE;
4495 break;
4496
4497 case PCRE_BSR_UNICODE:
4498 md->bsr_anycrlf = FALSE;
4499 break;
4500
4501 default: return PCRE_ERROR_BADNEWLINE;
4502 }
4503
4504 /* Handle different types of newline. The three bits give eight cases. If
4505 nothing is set at run time, whatever was used at compile time applies. */
4506
4507 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4508 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4509 {
4510 case 0: newline = NEWLINE; break; /* Compile-time default */
4511 case PCRE_NEWLINE_CR: newline = '\r'; break;
4512 case PCRE_NEWLINE_LF: newline = '\n'; break;
4513 case PCRE_NEWLINE_CR+
4514 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4515 case PCRE_NEWLINE_ANY: newline = -1; break;
4516 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4517 default: return PCRE_ERROR_BADNEWLINE;
4518 }
4519
4520 if (newline == -2)
4521 {
4522 md->nltype = NLTYPE_ANYCRLF;
4523 }
4524 else if (newline < 0)
4525 {
4526 md->nltype = NLTYPE_ANY;
4527 }
4528 else
4529 {
4530 md->nltype = NLTYPE_FIXED;
4531 if (newline > 255)
4532 {
4533 md->nllen = 2;
4534 md->nl[0] = (newline >> 8) & 255;
4535 md->nl[1] = newline & 255;
4536 }
4537 else
4538 {
4539 md->nllen = 1;
4540 md->nl[0] = newline;
4541 }
4542 }
4543
4544 /* Partial matching is supported only for a restricted set of regexes at the
4545 moment. */
4546
4547 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4548 return PCRE_ERROR_BADPARTIAL;
4549
4550 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4551 back the character offset. */
4552
4553 #ifdef SUPPORT_UTF8
4554 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4555 {
4556 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4557 return PCRE_ERROR_BADUTF8;
4558 if (start_offset > 0 && start_offset < length)
4559 {
4560 int tb = ((uschar *)subject)[start_offset];
4561 if (tb > 127)
4562 {
4563 tb &= 0xc0;
4564 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4565 }
4566 }
4567 }
4568 #endif
4569
4570 /* The ims options can vary during the matching as a result of the presence
4571 of (?ims) items in the pattern. They are kept in a local variable so that
4572 restoring at the exit of a group is easy. */
4573
4574 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4575
4576 /* If the expression has got more back references than the offsets supplied can
4577 hold, we get a temporary chunk of working store to use during the matching.
4578 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4579 of 3. */
4580
4581 ocount = offsetcount - (offsetcount % 3);
4582
4583 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4584 {
4585 ocount = re->top_backref * 3 + 3;
4586 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4587 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4588 using_temporary_offsets = TRUE;
4589 DPRINTF(("Got memory to hold back references\n"));
4590 }
4591 else md->offset_vector = offsets;
4592
4593 md->offset_end = ocount;
4594 md->offset_max = (2*ocount)/3;
4595 md->offset_overflow = FALSE;
4596 md->capture_last = -1;
4597
4598 /* Compute the minimum number of offsets that we need to reset each time. Doing
4599 this makes a huge difference to execution time when there aren't many brackets
4600 in the pattern. */
4601
4602 resetcount = 2 + re->top_bracket * 2;
4603 if (resetcount > offsetcount) resetcount = ocount;
4604
4605 /* Reset the working variable associated with each extraction. These should
4606 never be used unless previously set, but they get saved and restored, and so we
4607 initialize them to avoid reading uninitialized locations. */
4608
4609 if (md->offset_vector != NULL)
4610 {
4611 register int *iptr = md->offset_vector + ocount;
4612 register int *iend = iptr - resetcount/2 + 1;
4613 while (--iptr >= iend) *iptr = -1;
4614 }
4615
4616 /* Set up the first character to match, if available. The first_byte value is
4617 never set for an anchored regular expression, but the anchoring may be forced
4618 at run time, so we have to test for anchoring. The first char may be unset for
4619 an unanchored pattern, of course. If there's no first char and the pattern was
4620 studied, there may be a bitmap of possible first characters. */
4621
4622 if (!anchored)
4623 {
4624 if ((re->flags & PCRE_FIRSTSET) != 0)
4625 {
4626 first_byte = re->first_byte & 255;
4627 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4628 first_byte = md->lcc[first_byte];
4629 }
4630 else
4631 if (!startline && study != NULL &&
4632 (study->options & PCRE_STUDY_MAPPED) != 0)
4633 start_bits = study->start_bits;
4634 }
4635
4636 /* For anchored or unanchored matches, there may be a "last known required
4637 character" set. */
4638
4639 if ((re->flags & PCRE_REQCHSET) != 0)
4640 {
4641 req_byte = re->req_byte & 255;
4642 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4643 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4644 }
4645
4646
4647 /* ==========================================================================*/
4648
4649 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4650 the loop runs just once. */
4651
4652 for(;;)
4653 {
4654 USPTR save_end_subject = end_subject;
4655 USPTR new_start_match;
4656
4657 /* Reset the maximum number of extractions we might see. */
4658
4659 if (md->offset_vector != NULL)
4660 {
4661 register int *iptr = md->offset_vector;
4662 register int *iend = iptr + resetcount;
4663 while (iptr < iend) *iptr++ = -1;
4664 }
4665
4666 /* Advance to a unique first char if possible. If firstline is TRUE, the
4667 start of the match is constrained to the first line of a multiline string.
4668 That is, the match must be before or at the first newline. Implement this by
4669 temporarily adjusting end_subject so that we stop scanning at a newline. If
4670 the match fails at the newline, later code breaks this loop. */
4671
4672 if (firstline)
4673 {
4674 USPTR t = start_match;
4675 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4676 end_subject = t;
4677 }
4678
4679 /* Now test for a unique first byte */
4680
4681 if (first_byte >= 0)
4682 {
4683 if (first_byte_caseless)
4684 while (start_match < end_subject &&
4685 md->lcc[*start_match] != first_byte)
4686 { NEXTCHAR(start_match); }
4687 else
4688 while (start_match < end_subject && *start_match != first_byte)
4689 { NEXTCHAR(start_match); }
4690 }
4691
4692 /* Or to just after a linebreak for a multiline match if possible */
4693
4694 else if (startline)
4695 {
4696 if (start_match > md->start_subject + start_offset)
4697 {
4698 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4699 { NEXTCHAR(start_match); }
4700
4701 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4702 and we are now at a LF, advance the match position by one more character.
4703 */
4704
4705 if (start_match[-1] == '\r' &&
4706 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4707 start_match < end_subject &&
4708 *start_match == '\n')
4709 start_match++;
4710 }
4711 }
4712
4713 /* Or to a non-unique first char after study */
4714
4715 else if (start_bits != NULL)
4716 {
4717 while (start_match < end_subject)
4718 {
4719 register unsigned int c = *start_match;
4720 if ((start_bits[c/8] & (1 << (c&7))) == 0)
4721 { NEXTCHAR(start_match); }
4722 else break;
4723 }
4724 }
4725
4726 /* Restore fudged end_subject */
4727
4728 end_subject = save_end_subject;
4729
4730 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4731 printf(">>>> Match against: ");
4732 pchars(start_match, end_subject - start_match, TRUE, md);
4733 printf("\n");
4734 #endif
4735
4736 /* If req_byte is set, we know that that character must appear in the subject
4737 for the match to succeed. If the first character is set, req_byte must be
4738 later in the subject; otherwise the test starts at the match point. This
4739 optimization can save a huge amount of backtracking in patterns with nested
4740 unlimited repeats that aren't going to match. Writing separate code for
4741 cased/caseless versions makes it go faster, as does using an autoincrement
4742 and backing off on a match.
4743
4744 HOWEVER: when the subject string is very, very long, searching to its end can
4745 take a long time, and give bad performance on quite ordinary patterns. This
4746 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4747 string... so we don't do this when the string is sufficiently long.
4748
4749 ALSO: this processing is disabled when partial matching is requested.
4750 */
4751
4752 if (req_byte >= 0 &&
4753 end_subject - start_match < REQ_BYTE_MAX &&
4754 !md->partial)
4755 {
4756 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4757
4758 /* We don't need to repeat the search if we haven't yet reached the
4759 place we found it at last time. */
4760
4761 if (p > req_byte_ptr)
4762 {
4763 if (req_byte_caseless)
4764 {
4765 while (p < end_subject)
4766 {
4767 register int pp = *p++;
4768 if (pp == req_byte || pp == req_byte2) { p--; break; }
4769 }
4770 }
4771 else
4772 {
4773 while (p < end_subject)
4774 {
4775 if (*p++ == req_byte) { p--; break; }
4776 }
4777 }
4778
4779 /* If we can't find the required character, break the matching loop,
4780 forcing a match failure. */
4781
4782 if (p >= end_subject)
4783 {
4784 rc = MATCH_NOMATCH;
4785 break;
4786 }
4787
4788 /* If we have found the required character, save the point where we
4789 found it, so that we don't search again next time round the loop if
4790 the start hasn't passed this character yet. */
4791
4792 req_byte_ptr = p;
4793 }
4794 }
4795
4796 /* OK, we can now run the match. */
4797
4798 md->start_match_ptr = start_match;
4799 md->match_call_count = 0;
4800 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4801
4802 switch(rc)
4803 {
4804 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4805 exactly like PRUNE. */
4806
4807 case MATCH_NOMATCH:
4808 case MATCH_PRUNE:
4809 case MATCH_THEN:
4810 new_start_match = start_match + 1;
4811 #ifdef SUPPORT_UTF8
4812 if (utf8)
4813 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4814 new_start_match++;
4815 #endif
4816 break;
4817
4818 /* SKIP passes back the next starting point explicitly. */
4819
4820 case MATCH_SKIP:
4821 new_start_match = md->start_match_ptr;
4822 break;
4823
4824 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4825
4826 case MATCH_COMMIT:
4827 rc = MATCH_NOMATCH;
4828 goto ENDLOOP;
4829
4830 /* Any other return is either a match, or some kind of error. */
4831
4832 default:
4833 goto ENDLOOP;
4834 }
4835
4836 /* Control reaches here for the various types of "no match at this point"
4837 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4838
4839 rc = MATCH_NOMATCH;
4840
4841 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4842 newline in the subject (though it may continue over the newline). Therefore,
4843 if we have just failed to match, starting at a newline, do not continue. */
4844
4845 if (firstline && IS_NEWLINE(start_match)) break;
4846
4847 /* Advance to new matching position */
4848
4849 start_match = new_start_match;
4850
4851 /* Break the loop if the pattern is anchored or if we have passed the end of
4852 the subject. */
4853
4854 if (anchored || start_match > end_subject) break;
4855
4856 /* If we have just passed a CR and we are now at a LF, and the pattern does
4857 not contain any explicit matches for \r or \n, and the newline option is CRLF
4858 or ANY or ANYCRLF, advance the match position by one more character. */
4859
4860 if (start_match[-1] == '\r' &&
4861 start_match < end_subject &&
4862 *start_match == '\n' &&
4863 (re->flags & PCRE_HASCRORLF) == 0 &&
4864 (md->nltype == NLTYPE_ANY ||
4865 md->nltype == NLTYPE_ANYCRLF ||
4866 md->nllen == 2))
4867 start_match++;
4868
4869 } /* End of for(;;) "bumpalong" loop */
4870
4871 /* ==========================================================================*/
4872
4873 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4874 conditions is true:
4875
4876 (1) The pattern is anchored or the match was failed by (*COMMIT);
4877
4878 (2) We are past the end of the subject;
4879
4880 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4881 this option requests that a match occur at or before the first newline in
4882 the subject.
4883
4884 When we have a match and the offset vector is big enough to deal with any
4885 backreferences, captured substring offsets will already be set up. In the case
4886 where we had to get some local store to hold offsets for backreference
4887 processing, copy those that we can. In this case there need not be overflow if
4888 certain parts of the pattern were not used, even though there are more
4889 capturing parentheses than vector slots. */
4890
4891 ENDLOOP:
4892
4893 if (rc == MATCH_MATCH)
4894 {
4895 if (using_temporary_offsets)
4896 {
4897 if (offsetcount >= 4)
4898 {
4899 memcpy(offsets + 2, md->offset_vector + 2,
4900 (offsetcount - 2) * sizeof(int));
4901 DPRINTF(("Copied offsets from temporary memory\n"));
4902 }
4903 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4904 DPRINTF(("Freeing temporary memory\n"));
4905 (pcre_free)(md->offset_vector);
4906 }
4907
4908 /* Set the return code to the number of captured strings, or 0 if there are
4909 too many to fit into the vector. */
4910
4911 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4912
4913 /* If there is space, set up the whole thing as substring 0. The value of
4914 md->start_match_ptr might be modified if \K was encountered on the successful
4915 matching path. */
4916
4917 if (offsetcount < 2) rc = 0; else
4918 {
4919 offsets[0] = md->start_match_ptr - md->start_subject;
4920 offsets[1] = md->end_match_ptr - md->start_subject;
4921 }
4922
4923 DPRINTF((">>>> returning %d\n", rc));
4924 return rc;
4925 }
4926
4927 /* Control gets here if there has been an error, or if the overall match
4928 attempt has failed at all permitted starting positions. */
4929
4930 if (using_temporary_offsets)
4931 {
4932 DPRINTF(("Freeing temporary memory\n"));
4933 (pcre_free)(md->offset_vector);
4934 }
4935
4936 if (rc != MATCH_NOMATCH)
4937 {
4938 DPRINTF((">>>> error: returning %d\n", rc));
4939 return rc;
4940 }
4941 else if (md->partial && md->hitend)
4942 {
4943 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4944 return PCRE_ERROR_PARTIAL;
4945 }
4946 else
4947 {
4948 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4949 return PCRE_ERROR_NOMATCH;
4950 }
4951 }
4952
4953 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12