/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 371 - (show annotations) (download)
Mon Aug 25 18:28:05 2008 UTC (5 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 151090 byte(s)
Source tidies for 7.8-RC1 

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Bit flags that may be passed in the "flags" argument of match() */

#define match_condassert     0x01  /* Called to check a condition assertion */
#define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */

/* Ordinary (non-error) results of match(). Error results are the externally
defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_MATCH          1
#define MATCH_NOMATCH        0

/* Internal-only results of match(). They are made sufficiently negative to
stay clear of the external error codes. */

#define MATCH_COMMIT         (-999)
#define MATCH_PRUNE          (-998)
#define MATCH_SKIP           (-997)
#define MATCH_THEN           (-996)

/* Largest number of offset-vector ints that are saved on the stack for a
recursive call; for a bigger offset vector, malloc is used instead. Keep this a
multiple of 3, because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX   30
84
/* Minimum and maximum repeat counts for the common repeats. The two tables
are parallel: entry i of each describes the same repeat. In rep_max, a value
of 0 means "no upper limit".
NOTE(review): presumably indexed by the repeat opcode's offset from the first
repeat opcode - confirm at the use sites. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
89
90
91
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Write a run of characters to stdout in a readable form: characters for
which isprint() is true are output as themselves, anything else as a \xhh
escape. When printing from the subject, the run is cut short at the end of the
subject.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:      nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
unsigned int ch;
int remaining = length;

/* Never run past the end of the subject when printing from it */

if (is_subject && remaining > md->end_subject - p)
  remaining = md->end_subject - p;

for (; remaining > 0; remaining--)
  {
  ch = *p++;
  if (isprint(ch))
    printf("%c", ch);
  else
    printf("\\x%02x", ch);
  }
}
#endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 {
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 }
179 }
180 else
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. */
186
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
/* Identifiers for the individual RMATCH calls, numbered consecutively from 1.
Whenever this list is changed, the code at HEAP_RETURN below must be updated
in sync. */

enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18,
  RM19, RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27,
  RM28, RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36,
  RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45,
  RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54
};
249
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
253
254 #ifndef NO_RECURSE
255 #define REGISTER register
256
#ifndef DEBUG

/* Production versions: RMATCH is a plain recursive call of match() whose
result is left in rrc, and RRETURN is a plain return. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra

#else

/* Debugging versions: the same call and return, with a trace line printed
around each call of match() and before each return. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#endif
274
275 #else
276
277
/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes.

RMATCH obtains a new frame, records in the current frame which call site (rw)
to resume at, fills in the new frame's argument fields, makes it the current
frame, and jumps to HEAP_RECURSE. The label L_<rw> that it plants is the place
that control comes back to afterwards. If the new frame cannot be obtained,
the current level gives up with PCRE_ERROR_NOMEMORY instead of writing through
a NULL pointer.

RRETURN frees the current frame and goes back to the previous one. If there is
a previous frame, the result is left in rrc and control jumps to HEAP_RETURN;
if there is none, this was the top level, so match() itself returns. */

#define REGISTER

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
316
317
318 /* Structure for remembering the local variables in a private frame */
319
320 typedef struct heapframe {
321 struct heapframe *Xprevframe;
322
323 /* Function arguments that may change */
324
325 const uschar *Xeptr;
326 const uschar *Xecode;
327 const uschar *Xmstart;
328 int Xoffset_top;
329 long int Xims;
330 eptrblock *Xeptrb;
331 int Xflags;
332 unsigned int Xrdepth;
333
334 /* Function local variables */
335
336 const uschar *Xcallpat;
337 const uschar *Xcharptr;
338 const uschar *Xdata;
339 const uschar *Xnext;
340 const uschar *Xpp;
341 const uschar *Xprev;
342 const uschar *Xsaved_eptr;
343
344 recursion_info Xnew_recursive;
345
346 BOOL Xcur_is_word;
347 BOOL Xcondition;
348 BOOL Xprev_is_word;
349
350 unsigned long int Xoriginal_ims;
351
352 #ifdef SUPPORT_UCP
353 int Xprop_type;
354 int Xprop_value;
355 int Xprop_fail_result;
356 int Xprop_category;
357 int Xprop_chartype;
358 int Xprop_script;
359 int Xoclength;
360 uschar Xocchars[8];
361 #endif
362
363 int Xctype;
364 unsigned int Xfc;
365 int Xfi;
366 int Xlength;
367 int Xmax;
368 int Xmin;
369 int Xnumber;
370 int Xoffset;
371 int Xop;
372 int Xsave_capture_last;
373 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
374 int Xstacksave[REC_STACK_SAVE_MAX];
375
376 eptrblock Xnewptrb;
377
378 /* Where to jump back to */
379
380 int Xwhere;
381
382 } heapframe;
383
384 #endif
385
386
387 /***************************************************************************
388 ***************************************************************************/
389
390
391
392 /*************************************************
393 * Match from current position *
394 *************************************************/
395
396 /* This function is called recursively in many circumstances. Whenever it
397 returns a negative (error) response, the outer incarnation must also return the
398 same response.
399
400 Performance note: It might be tempting to extract commonly used fields from the
401 md structure (e.g. utf8, end_subject) into individual variables to improve
402 performance. Tests using gcc on a SPARC disproved this; in the first case, it
403 made performance worse.
404
405 Arguments:
406 eptr pointer to current character in subject
407 ecode pointer to current position in compiled code
408 mstart pointer to the current match start position (can be modified
409 by encountering \K)
410 offset_top current top pointer
411 md pointer to "static" info for the match
412 ims current /i, /m, and /s options
413 eptrb pointer to chain of blocks containing eptr at start of
414 brackets - for testing for empty matches
415 flags can contain
416 match_condassert - this is an assertion condition
417 match_cbegroup - this is the start of an unlimited repeat
418 group that can match an empty string
419 rdepth the recursion depth
420
421 Returns: MATCH_MATCH if matched ) these values are >= 0
422 MATCH_NOMATCH if failed to match )
423 a negative PCRE_ERROR_xxx value if aborted by an error condition
424 (e.g. stopped by repeated call or recursion limit)
425 */
426
427 static int
428 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
429 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
430 int flags, unsigned int rdepth)
431 {
432 /* These variables do not need to be preserved over recursion in this function,
433 so they can be ordinary variables in all cases. Mark some of them with
434 "register" because they are used a lot in loops. */
435
436 register int rrc; /* Returns from recursive calls */
437 register int i; /* Used for loops not involving calls to RMATCH() */
438 register unsigned int c; /* Character values not kept over RMATCH() calls */
439 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
440
441 BOOL minimize, possessive; /* Quantifier options */
442
443 /* When recursion is not being used, all "local" variables that have to be
444 preserved over calls to RMATCH() are part of a "frame" which is obtained from
445 heap storage. Set up the top-level frame here; others are obtained from the
446 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
447
448 #ifdef NO_RECURSE
449 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
450 frame->Xprevframe = NULL; /* Marks the top level */
451
452 /* Copy in the original argument variables */
453
454 frame->Xeptr = eptr;
455 frame->Xecode = ecode;
456 frame->Xmstart = mstart;
457 frame->Xoffset_top = offset_top;
458 frame->Xims = ims;
459 frame->Xeptrb = eptrb;
460 frame->Xflags = flags;
461 frame->Xrdepth = rdepth;
462
463 /* This is where control jumps back to to effect "recursion" */
464
465 HEAP_RECURSE:
466
467 /* Macros make the argument variables come from the current frame */
468
469 #define eptr frame->Xeptr
470 #define ecode frame->Xecode
471 #define mstart frame->Xmstart
472 #define offset_top frame->Xoffset_top
473 #define ims frame->Xims
474 #define eptrb frame->Xeptrb
475 #define flags frame->Xflags
476 #define rdepth frame->Xrdepth
477
478 /* Ditto for the local variables */
479
480 #ifdef SUPPORT_UTF8
481 #define charptr frame->Xcharptr
482 #endif
483 #define callpat frame->Xcallpat
484 #define data frame->Xdata
485 #define next frame->Xnext
486 #define pp frame->Xpp
487 #define prev frame->Xprev
488 #define saved_eptr frame->Xsaved_eptr
489
490 #define new_recursive frame->Xnew_recursive
491
492 #define cur_is_word frame->Xcur_is_word
493 #define condition frame->Xcondition
494 #define prev_is_word frame->Xprev_is_word
495
496 #define original_ims frame->Xoriginal_ims
497
498 #ifdef SUPPORT_UCP
499 #define prop_type frame->Xprop_type
500 #define prop_value frame->Xprop_value
501 #define prop_fail_result frame->Xprop_fail_result
502 #define prop_category frame->Xprop_category
503 #define prop_chartype frame->Xprop_chartype
504 #define prop_script frame->Xprop_script
505 #define oclength frame->Xoclength
506 #define occhars frame->Xocchars
507 #endif
508
509 #define ctype frame->Xctype
510 #define fc frame->Xfc
511 #define fi frame->Xfi
512 #define length frame->Xlength
513 #define max frame->Xmax
514 #define min frame->Xmin
515 #define number frame->Xnumber
516 #define offset frame->Xoffset
517 #define op frame->Xop
518 #define save_capture_last frame->Xsave_capture_last
519 #define save_offset1 frame->Xsave_offset1
520 #define save_offset2 frame->Xsave_offset2
521 #define save_offset3 frame->Xsave_offset3
522 #define stacksave frame->Xstacksave
523
524 #define newptrb frame->Xnewptrb
525
526 /* When recursion is being used, local variables are allocated on the stack and
527 get preserved during recursion in the normal way. In this environment, fi and
528 i, and fc and c, can be the same variables. */
529
530 #else /* NO_RECURSE not defined */
531 #define fi i
532 #define fc c
533
534
535 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
536 const uschar *charptr; /* in small blocks of the code. My normal */
537 #endif /* style of coding would have declared */
538 const uschar *callpat; /* them within each of those blocks. */
539 const uschar *data; /* However, in order to accommodate the */
540 const uschar *next; /* version of this code that uses an */
541 USPTR pp; /* external "stack" implemented on the */
542 const uschar *prev; /* heap, it is easier to declare them all */
543 USPTR saved_eptr; /* here, so the declarations can be cut */
544 /* out in a block. The only declarations */
545 recursion_info new_recursive; /* within blocks below are for variables */
546 /* that do not have to be preserved over */
547 BOOL cur_is_word; /* a recursive call to RMATCH(). */
548 BOOL condition;
549 BOOL prev_is_word;
550
551 unsigned long int original_ims;
552
553 #ifdef SUPPORT_UCP
554 int prop_type;
555 int prop_value;
556 int prop_fail_result;
557 int prop_category;
558 int prop_chartype;
559 int prop_script;
560 int oclength;
561 uschar occhars[8];
562 #endif
563
564 int ctype;
565 int length;
566 int max;
567 int min;
568 int number;
569 int offset;
570 int op;
571 int save_capture_last;
572 int save_offset1, save_offset2, save_offset3;
573 int stacksave[REC_STACK_SAVE_MAX];
574
575 eptrblock newptrb;
576 #endif /* NO_RECURSE */
577
578 /* These statements are here to stop the compiler complaining about unitialized
579 variables. */
580
581 #ifdef SUPPORT_UCP
582 prop_value = 0;
583 prop_fail_result = 0;
584 #endif
585
586
587 /* This label is used for tail recursion, which is used in a few cases even
588 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
589 used. Thanks to Ian Taylor for noticing this possibility and sending the
590 original patch. */
591
592 TAIL_RECURSE:
593
594 /* OK, now we can get on with the real code of the function. Recursive calls
595 are specified by the macro RMATCH and RRETURN is used to return. When
596 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
597 and a "return", respectively (possibly with some debugging if DEBUG is
598 defined). However, RMATCH isn't like a function call because it's quite a
599 complicated macro. It has to be used in one particular way. This shouldn't,
600 however, impact performance when true recursion is being used. */
601
602 #ifdef SUPPORT_UTF8
603 utf8 = md->utf8; /* Local copy of the flag */
604 #else
605 utf8 = FALSE;
606 #endif
607
608 /* First check that we haven't called match() too many times, or that we
609 haven't exceeded the recursive call limit. */
610
611 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
612 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
613
614 original_ims = ims; /* Save for resetting on ')' */
615
616 /* At the start of a group with an unlimited repeat that may match an empty
617 string, the match_cbegroup flag is set. When this is the case, add the current
618 subject pointer to the chain of such remembered pointers, to be checked when we
619 hit the closing ket, in order to break infinite loops that match no characters.
620 When match() is called in other circumstances, don't add to the chain. The
621 match_cbegroup flag must NOT be used with tail recursion, because the memory
622 block that is used is on the stack, so a new one may be required for each
623 match(). */
624
625 if ((flags & match_cbegroup) != 0)
626 {
627 newptrb.epb_saved_eptr = eptr;
628 newptrb.epb_prev = eptrb;
629 eptrb = &newptrb;
630 }
631
632 /* Now start processing the opcodes. */
633
634 for (;;)
635 {
636 minimize = possessive = FALSE;
637 op = *ecode;
638
639 /* For partial matching, remember if we ever hit the end of the subject after
640 matching at least one subject character. */
641
642 if (md->partial &&
643 eptr >= md->end_subject &&
644 eptr > mstart)
645 md->hitend = TRUE;
646
647 switch(op)
648 {
649 case OP_FAIL:
650 RRETURN(MATCH_NOMATCH);
651
652 case OP_PRUNE:
653 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
654 ims, eptrb, flags, RM51);
655 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
656 RRETURN(MATCH_PRUNE);
657
658 case OP_COMMIT:
659 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
660 ims, eptrb, flags, RM52);
661 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
662 RRETURN(MATCH_COMMIT);
663
664 case OP_SKIP:
665 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
666 ims, eptrb, flags, RM53);
667 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
668 md->start_match_ptr = eptr; /* Pass back current position */
669 RRETURN(MATCH_SKIP);
670
671 case OP_THEN:
672 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
673 ims, eptrb, flags, RM54);
674 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
675 RRETURN(MATCH_THEN);
676
677 /* Handle a capturing bracket. If there is space in the offset vector, save
678 the current subject position in the working slot at the top of the vector.
679 We mustn't change the current values of the data slot, because they may be
680 set from a previous iteration of this group, and be referred to by a
681 reference inside the group.
682
683 If the bracket fails to match, we need to restore this value and also the
684 values of the final offsets, in case they were set by a previous iteration
685 of the same bracket.
686
687 If there isn't enough space in the offset vector, treat this as if it were
688 a non-capturing bracket. Don't worry about setting the flag for the error
689 case here; that is handled in the code for KET. */
690
691 case OP_CBRA:
692 case OP_SCBRA:
693 number = GET2(ecode, 1+LINK_SIZE);
694 offset = number << 1;
695
696 #ifdef DEBUG
697 printf("start bracket %d\n", number);
698 printf("subject=");
699 pchars(eptr, 16, TRUE, md);
700 printf("\n");
701 #endif
702
703 if (offset < md->offset_max)
704 {
705 save_offset1 = md->offset_vector[offset];
706 save_offset2 = md->offset_vector[offset+1];
707 save_offset3 = md->offset_vector[md->offset_end - number];
708 save_capture_last = md->capture_last;
709
710 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
711 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
712
713 flags = (op == OP_SCBRA)? match_cbegroup : 0;
714 do
715 {
716 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
717 ims, eptrb, flags, RM1);
718 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
719 md->capture_last = save_capture_last;
720 ecode += GET(ecode, 1);
721 }
722 while (*ecode == OP_ALT);
723
724 DPRINTF(("bracket %d failed\n", number));
725
726 md->offset_vector[offset] = save_offset1;
727 md->offset_vector[offset+1] = save_offset2;
728 md->offset_vector[md->offset_end - number] = save_offset3;
729
730 RRETURN(MATCH_NOMATCH);
731 }
732
733 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
734 as a non-capturing bracket. */
735
736 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
737 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
738
739 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
740
741 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
742 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
743
744 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
745 final alternative within the brackets, we would return the result of a
746 recursive call to match() whatever happened. We can reduce stack usage by
747 turning this into a tail recursion, except in the case when match_cbegroup
748 is set.*/
749
750 case OP_BRA:
751 case OP_SBRA:
752 DPRINTF(("start non-capturing bracket\n"));
753 flags = (op >= OP_SBRA)? match_cbegroup : 0;
754 for (;;)
755 {
756 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
757 {
758 if (flags == 0) /* Not a possibly empty group */
759 {
760 ecode += _pcre_OP_lengths[*ecode];
761 DPRINTF(("bracket 0 tail recursion\n"));
762 goto TAIL_RECURSE;
763 }
764
765 /* Possibly empty group; can't use tail recursion. */
766
767 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
768 eptrb, flags, RM48);
769 RRETURN(rrc);
770 }
771
772 /* For non-final alternatives, continue the loop for a NOMATCH result;
773 otherwise return. */
774
775 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
776 eptrb, flags, RM2);
777 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
778 ecode += GET(ecode, 1);
779 }
780 /* Control never reaches here. */
781
782 /* Conditional group: compilation checked that there are no more than
783 two branches. If the condition is false, skipping the first branch takes us
784 past the end if there is only one branch, but that's OK because that is
785 exactly what going to the ket would do. As there is only one branch to be
786 obeyed, we can use tail recursion to avoid using another stack frame. */
787
788 case OP_COND:
789 case OP_SCOND:
790 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
791 {
792 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
793 condition = md->recursive != NULL &&
794 (offset == RREF_ANY || offset == md->recursive->group_num);
795 ecode += condition? 3 : GET(ecode, 1);
796 }
797
798 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
799 {
800 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
801 condition = offset < offset_top && md->offset_vector[offset] >= 0;
802 ecode += condition? 3 : GET(ecode, 1);
803 }
804
805 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
806 {
807 condition = FALSE;
808 ecode += GET(ecode, 1);
809 }
810
811 /* The condition is an assertion. Call match() to evaluate it - setting
812 the final argument match_condassert causes it to stop at the end of an
813 assertion. */
814
815 else
816 {
817 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
818 match_condassert, RM3);
819 if (rrc == MATCH_MATCH)
820 {
821 condition = TRUE;
822 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
823 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
824 }
825 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
826 {
827 RRETURN(rrc); /* Need braces because of following else */
828 }
829 else
830 {
831 condition = FALSE;
832 ecode += GET(ecode, 1);
833 }
834 }
835
836 /* We are now at the branch that is to be obeyed. As there is only one,
837 we can use tail recursion to avoid using another stack frame, except when
838 match_cbegroup is required for an unlimited repeat of a possibly empty
839 group. If the second alternative doesn't exist, we can just plough on. */
840
841 if (condition || *ecode == OP_ALT)
842 {
843 ecode += 1 + LINK_SIZE;
844 if (op == OP_SCOND) /* Possibly empty group */
845 {
846 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
847 RRETURN(rrc);
848 }
849 else /* Group must match something */
850 {
851 flags = 0;
852 goto TAIL_RECURSE;
853 }
854 }
855 else /* Condition false & no 2nd alternative */
856 {
857 ecode += 1 + LINK_SIZE;
858 }
859 break;
860
861
862 /* End of the pattern, either real or forced. If we are in a top-level
863 recursion, we should restore the offsets appropriately and continue from
864 after the call. */
865
866 case OP_ACCEPT:
867 case OP_END:
868 if (md->recursive != NULL && md->recursive->group_num == 0)
869 {
870 recursion_info *rec = md->recursive;
871 DPRINTF(("End of pattern in a (?0) recursion\n"));
872 md->recursive = rec->prevrec;
873 memmove(md->offset_vector, rec->offset_save,
874 rec->saved_max * sizeof(int));
875 mstart = rec->save_start;
876 ims = original_ims;
877 ecode = rec->after_call;
878 break;
879 }
880
881 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
882 string - backtracking will then try other alternatives, if any. */
883
884 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
885 md->end_match_ptr = eptr; /* Record where we ended */
886 md->end_offset_top = offset_top; /* and how many extracts were taken */
887 md->start_match_ptr = mstart; /* and the start (\K can modify) */
888 RRETURN(MATCH_MATCH);
889
890 /* Change option settings */
891
892 case OP_OPT:
893 ims = ecode[1];
894 ecode += 2;
895 DPRINTF(("ims set to %02lx\n", ims));
896 break;
897
898 /* Assertion brackets. Check the alternative branches in turn - the
899 matching won't pass the KET for an assertion. If any one branch matches,
900 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
901 start of each branch to move the current point backwards, so the code at
902 this level is identical to the lookahead case. */
903
904 case OP_ASSERT:
905 case OP_ASSERTBACK:
906 do
907 {
908 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
909 RM4);
910 if (rrc == MATCH_MATCH) break;
911 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
912 ecode += GET(ecode, 1);
913 }
914 while (*ecode == OP_ALT);
915 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
916
917 /* If checking an assertion for a condition, return MATCH_MATCH. */
918
919 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
920
921 /* Continue from after the assertion, updating the offsets high water
922 mark, since extracts may have been taken during the assertion. */
923
924 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
925 ecode += 1 + LINK_SIZE;
926 offset_top = md->end_offset_top;
927 continue;
928
929 /* Negative assertion: all branches must fail to match */
930
931 case OP_ASSERT_NOT:
932 case OP_ASSERTBACK_NOT:
933 do
934 {
935 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
936 RM5);
937 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
938 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
939 ecode += GET(ecode,1);
940 }
941 while (*ecode == OP_ALT);
942
943 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
944
945 ecode += 1 + LINK_SIZE;
946 continue;
947
948 /* Move the subject pointer back. This occurs only at the start of
949 each branch of a lookbehind assertion. If we are too close to the start to
950 move back, this match function fails. When working with UTF-8 we move
951 back a number of characters, not bytes. */
952
953 case OP_REVERSE:
954 #ifdef SUPPORT_UTF8
955 if (utf8)
956 {
957 i = GET(ecode, 1);
958 while (i-- > 0)
959 {
960 eptr--;
961 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
962 BACKCHAR(eptr);
963 }
964 }
965 else
966 #endif
967
968 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
969
970 {
971 eptr -= GET(ecode, 1);
972 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
973 }
974
975 /* Skip to next op code */
976
977 ecode += 1 + LINK_SIZE;
978 break;
979
980 /* The callout item calls an external function, if one is provided, passing
981 details of the match so far. This is mainly for debugging, though the
982 function is able to force a failure. */
983
984 case OP_CALLOUT:
985 if (pcre_callout != NULL)
986 {
987 pcre_callout_block cb;
988 cb.version = 1; /* Version 1 of the callout block */
989 cb.callout_number = ecode[1];
990 cb.offset_vector = md->offset_vector;
991 cb.subject = (PCRE_SPTR)md->start_subject;
992 cb.subject_length = md->end_subject - md->start_subject;
993 cb.start_match = mstart - md->start_subject;
994 cb.current_position = eptr - md->start_subject;
995 cb.pattern_position = GET(ecode, 2);
996 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
997 cb.capture_top = offset_top/2;
998 cb.capture_last = md->capture_last;
999 cb.callout_data = md->callout_data;
1000 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1001 if (rrc < 0) RRETURN(rrc);
1002 }
1003 ecode += 2 + 2*LINK_SIZE;
1004 break;
1005
1006 /* Recursion either matches the current regex, or some subexpression. The
1007 offset data is the offset to the starting bracket from the start of the
1008 whole pattern. (This is so that it works from duplicated subpatterns.)
1009
1010 If there are any capturing brackets started but not finished, we have to
1011 save their starting points and reinstate them after the recursion. However,
1012 we don't know how many such there are (offset_top records the completed
1013 total) so we just have to save all the potential data. There may be up to
1014 65535 such values, which is too large to put on the stack, but using malloc
1015 for small numbers seems expensive. As a compromise, the stack is used when
1016 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1017 is used. If the malloc fails, the saved offsets cannot be reinstated after
1018 the recursion, so no partial save is attempted: the code below gives up and
1019 returns PCRE_ERROR_NOMEMORY.
1020
1021 There are also other values that have to be saved. We use a chained
1022 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1023 for the original version of this logic. */
1024
1025 case OP_RECURSE:
1026 {
1027 callpat = md->start_code + GET(ecode, 1);
1028 new_recursive.group_num = (callpat == md->start_code)? 0 :
1029 GET2(callpat, 1 + LINK_SIZE);
1030
1031 /* Add to "recursing stack" */
1032
1033 new_recursive.prevrec = md->recursive;
1034 md->recursive = &new_recursive;
1035
1036 /* Find where to continue from afterwards */
1037
1038 ecode += 1 + LINK_SIZE;
1039 new_recursive.after_call = ecode;
1040
1041 /* Now save the offset data. */
1042
1043 new_recursive.saved_max = md->offset_end;
1044 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1045 new_recursive.offset_save = stacksave;
1046 else
1047 {
1048 new_recursive.offset_save =
1049 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1050 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1051 }
1052
1053 memcpy(new_recursive.offset_save, md->offset_vector,
1054 new_recursive.saved_max * sizeof(int));
1055 new_recursive.save_start = mstart;
1056 mstart = eptr;
1057
1058 /* OK, now we can do the recursion. For each top-level alternative we
1059 restore the offset and recursion data. */
1060
1061 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1062 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1063 do
1064 {
1065 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1066 md, ims, eptrb, flags, RM6);
1067 if (rrc == MATCH_MATCH)
1068 {
1069 DPRINTF(("Recursion matched\n"));
1070 md->recursive = new_recursive.prevrec;
1071 if (new_recursive.offset_save != stacksave)
1072 (pcre_free)(new_recursive.offset_save);
1073 RRETURN(MATCH_MATCH);
1074 }
1075 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1076 {
1077 DPRINTF(("Recursion gave error %d\n", rrc));
1078 RRETURN(rrc);
1079 }
1080
1081 md->recursive = &new_recursive;
1082 memcpy(md->offset_vector, new_recursive.offset_save,
1083 new_recursive.saved_max * sizeof(int));
1084 callpat += GET(callpat, 1);
1085 }
1086 while (*callpat == OP_ALT);
1087
1088 DPRINTF(("Recursion didn't match\n"));
1089 md->recursive = new_recursive.prevrec;
1090 if (new_recursive.offset_save != stacksave)
1091 (pcre_free)(new_recursive.offset_save);
1092 RRETURN(MATCH_NOMATCH);
1093 }
1094 /* Control never reaches here */
1095
1096 /* "Once" brackets are like assertion brackets except that after a match,
1097 the point in the subject string is not moved back. Thus there can never be
1098 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1099 Check the alternative branches in turn - the matching won't pass the KET
1100 for this kind of subpattern. If any one branch matches, we carry on as at
1101 the end of a normal bracket, leaving the subject pointer. */
1102
1103 case OP_ONCE:
1104 prev = ecode;
1105 saved_eptr = eptr;
1106
1107 do
1108 {
1109 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1110 if (rrc == MATCH_MATCH) break;
1111 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1112 ecode += GET(ecode,1);
1113 }
1114 while (*ecode == OP_ALT);
1115
1116 /* If we hit the end of the group (which could be repeated), fail */
1117
1118 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1119
1120 /* Continue as from after the assertion, updating the offsets high water
1121 mark, since extracts may have been taken. */
1122
1123 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1124
1125 offset_top = md->end_offset_top;
1126 eptr = md->end_match_ptr;
1127
1128 /* For a non-repeating ket, just continue at this level. This also
1129 happens for a repeating ket if no characters were matched in the group.
1130 This is the forcible breaking of infinite loops as implemented in Perl
1131 5.005. If there is an options reset, it will get obeyed in the normal
1132 course of events. */
1133
1134 if (*ecode == OP_KET || eptr == saved_eptr)
1135 {
1136 ecode += 1+LINK_SIZE;
1137 break;
1138 }
1139
1140 /* The repeating kets try the rest of the pattern or restart from the
1141 preceding bracket, in the appropriate order. The second "call" of match()
1142 uses tail recursion, to avoid using another stack frame. We need to reset
1143 any options that changed within the bracket before re-running it, so
1144 check the next opcode. */
1145
1146 if (ecode[1+LINK_SIZE] == OP_OPT)
1147 {
1148 ims = (ims & ~PCRE_IMS) | ecode[4];
1149 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1150 }
1151
1152 if (*ecode == OP_KETRMIN)
1153 {
1154 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1155 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1156 ecode = prev;
1157 flags = 0;
1158 goto TAIL_RECURSE;
1159 }
1160 else /* OP_KETRMAX */
1161 {
1162 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1163 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1164 ecode += 1 + LINK_SIZE;
1165 flags = 0;
1166 goto TAIL_RECURSE;
1167 }
1168 /* Control never gets here */
1169
1170 /* An alternation is the end of a branch; scan along to find the end of the
1171 bracketed group and go to there. */
1172
1173 case OP_ALT:
1174 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1175 break;
1176
1177 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1178 indicating that it may occur zero times. It may repeat infinitely, or not
1179 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1180 with fixed upper repeat limits are compiled as a number of copies, with the
1181 optional ones preceded by BRAZERO or BRAMINZERO. */
1182
1183 case OP_BRAZERO:
1184 {
1185 next = ecode+1;
1186 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1187 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1188 do next += GET(next,1); while (*next == OP_ALT);
1189 ecode = next + 1 + LINK_SIZE;
1190 }
1191 break;
1192
1193 case OP_BRAMINZERO:
1194 {
1195 next = ecode+1;
1196 do next += GET(next, 1); while (*next == OP_ALT);
1197 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1198 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1199 ecode++;
1200 }
1201 break;
1202
1203 case OP_SKIPZERO:
1204 {
1205 next = ecode+1;
1206 do next += GET(next,1); while (*next == OP_ALT);
1207 ecode = next + 1 + LINK_SIZE;
1208 }
1209 break;
1210
1211 /* End of a group, repeated or non-repeating. */
1212
1213 case OP_KET:
1214 case OP_KETRMIN:
1215 case OP_KETRMAX:
1216 prev = ecode - GET(ecode, 1);
1217
1218 /* If this was a group that remembered the subject start, in order to break
1219 infinite repeats of empty string matches, retrieve the subject start from
1220 the chain. Otherwise, set it NULL. */
1221
1222 if (*prev >= OP_SBRA)
1223 {
1224 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1225 eptrb = eptrb->epb_prev; /* Backup to previous group */
1226 }
1227 else saved_eptr = NULL;
1228
1229 /* If we are at the end of an assertion group, stop matching and return
1230 MATCH_MATCH, but record the current high water mark for use by positive
1231 assertions. Do this also for the "once" (atomic) groups. */
1232
1233 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1234 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1235 *prev == OP_ONCE)
1236 {
1237 md->end_match_ptr = eptr; /* For ONCE */
1238 md->end_offset_top = offset_top;
1239 RRETURN(MATCH_MATCH);
1240 }
1241
1242 /* For capturing groups we have to check the group number back at the start
1243 and if necessary complete handling an extraction by setting the offsets and
1244 bumping the high water mark. Note that whole-pattern recursion is coded as
1245 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1246 when the OP_END is reached. Other recursion is handled here. */
1247
1248 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1249 {
1250 number = GET2(prev, 1+LINK_SIZE);
1251 offset = number << 1;
1252
1253 #ifdef DEBUG
1254 printf("end bracket %d", number);
1255 printf("\n");
1256 #endif
1257
1258 md->capture_last = number;
1259 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1260 {
1261 md->offset_vector[offset] =
1262 md->offset_vector[md->offset_end - number];
1263 md->offset_vector[offset+1] = eptr - md->start_subject;
1264 if (offset_top <= offset) offset_top = offset + 2;
1265 }
1266
1267 /* Handle a recursively called group. Restore the offsets
1268 appropriately and continue from after the call. */
1269
1270 if (md->recursive != NULL && md->recursive->group_num == number)
1271 {
1272 recursion_info *rec = md->recursive;
1273 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1274 md->recursive = rec->prevrec;
1275 mstart = rec->save_start;
1276 memcpy(md->offset_vector, rec->offset_save,
1277 rec->saved_max * sizeof(int));
1278 ecode = rec->after_call;
1279 ims = original_ims;
1280 break;
1281 }
1282 }
1283
1284 /* For both capturing and non-capturing groups, reset the value of the ims
1285 flags, in case they got changed during the group. */
1286
1287 ims = original_ims;
1288 DPRINTF(("ims reset to %02lx\n", ims));
1289
1290 /* For a non-repeating ket, just continue at this level. This also
1291 happens for a repeating ket if no characters were matched in the group.
1292 This is the forcible breaking of infinite loops as implemented in Perl
1293 5.005. If there is an options reset, it will get obeyed in the normal
1294 course of events. */
1295
1296 if (*ecode == OP_KET || eptr == saved_eptr)
1297 {
1298 ecode += 1 + LINK_SIZE;
1299 break;
1300 }
1301
1302 /* The repeating kets try the rest of the pattern or restart from the
1303 preceding bracket, in the appropriate order. In the second case, we can use
1304 tail recursion to avoid using another stack frame, unless we have an
1305 unlimited repeat of a group that can match an empty string. */
1306
1307 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1308
1309 if (*ecode == OP_KETRMIN)
1310 {
1311 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1312 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1313 if (flags != 0) /* Could match an empty string */
1314 {
1315 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1316 RRETURN(rrc);
1317 }
1318 ecode = prev;
1319 goto TAIL_RECURSE;
1320 }
1321 else /* OP_KETRMAX */
1322 {
1323 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1324 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1325 ecode += 1 + LINK_SIZE;
1326 flags = 0;
1327 goto TAIL_RECURSE;
1328 }
1329 /* Control never gets here */
1330
1331 /* Start of subject unless notbol, or after internal newline if multiline */
1332
1333 case OP_CIRC:
1334 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1335 if ((ims & PCRE_MULTILINE) != 0)
1336 {
1337 if (eptr != md->start_subject &&
1338 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1339 RRETURN(MATCH_NOMATCH);
1340 ecode++;
1341 break;
1342 }
1343 /* ... else fall through */
1344
1345 /* Start of subject assertion */
1346
1347 case OP_SOD:
1348 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1349 ecode++;
1350 break;
1351
1352 /* Start of match assertion */
1353
1354 case OP_SOM:
1355 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1356 ecode++;
1357 break;
1358
1359 /* Reset the start of match point */
1360
1361 case OP_SET_SOM:
1362 mstart = eptr;
1363 ecode++;
1364 break;
1365
1366 /* Assert before internal newline if multiline, or before a terminating
1367 newline unless endonly is set, else end of subject unless noteol is set. */
1368
1369 case OP_DOLL:
1370 if ((ims & PCRE_MULTILINE) != 0)
1371 {
1372 if (eptr < md->end_subject)
1373 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1374 else
1375 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1376 ecode++;
1377 break;
1378 }
1379 else
1380 {
1381 if (md->noteol) RRETURN(MATCH_NOMATCH);
1382 if (!md->endonly)
1383 {
1384 if (eptr != md->end_subject &&
1385 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1386 RRETURN(MATCH_NOMATCH);
1387 ecode++;
1388 break;
1389 }
1390 }
1391 /* ... else fall through for endonly */
1392
1393 /* End of subject assertion (\z) */
1394
1395 case OP_EOD:
1396 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1397 ecode++;
1398 break;
1399
1400 /* End of subject or ending \n assertion (\Z) */
1401
1402 case OP_EODN:
1403 if (eptr != md->end_subject &&
1404 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1405 RRETURN(MATCH_NOMATCH);
1406 ecode++;
1407 break;
1408
1409 /* Word boundary assertions */
1410
1411 case OP_NOT_WORD_BOUNDARY:
1412 case OP_WORD_BOUNDARY:
1413 {
1414
1415 /* Find out if the previous and current characters are "word" characters.
1416 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1417 be "non-word" characters. */
1418
1419 #ifdef SUPPORT_UTF8
1420 if (utf8)
1421 {
1422 if (eptr == md->start_subject) prev_is_word = FALSE; else
1423 {
1424 const uschar *lastptr = eptr - 1;
1425 while((*lastptr & 0xc0) == 0x80) lastptr--;
1426 GETCHAR(c, lastptr);
1427 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1428 }
1429 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1430 {
1431 GETCHAR(c, eptr);
1432 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1433 }
1434 }
1435 else
1436 #endif
1437
1438 /* More streamlined when not in UTF-8 mode */
1439
1440 {
1441 prev_is_word = (eptr != md->start_subject) &&
1442 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1443 cur_is_word = (eptr < md->end_subject) &&
1444 ((md->ctypes[*eptr] & ctype_word) != 0);
1445 }
1446
1447 /* Now see if the situation is what we want */
1448
1449 if ((*ecode++ == OP_WORD_BOUNDARY)?
1450 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1451 RRETURN(MATCH_NOMATCH);
1452 }
1453 break;
1454
1455 /* Match a single character type; inline for speed */
1456
1457 case OP_ANY:
1458 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1459 /* Fall through */
1460
1461 case OP_ALLANY:
1462 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1463 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1464 ecode++;
1465 break;
1466
1467 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1468 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1469
1470 case OP_ANYBYTE:
1471 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1472 ecode++;
1473 break;
1474
1475 case OP_NOT_DIGIT:
1476 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1477 GETCHARINCTEST(c, eptr);
1478 if (
1479 #ifdef SUPPORT_UTF8
1480 c < 256 &&
1481 #endif
1482 (md->ctypes[c] & ctype_digit) != 0
1483 )
1484 RRETURN(MATCH_NOMATCH);
1485 ecode++;
1486 break;
1487
1488 case OP_DIGIT:
1489 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1490 GETCHARINCTEST(c, eptr);
1491 if (
1492 #ifdef SUPPORT_UTF8
1493 c >= 256 ||
1494 #endif
1495 (md->ctypes[c] & ctype_digit) == 0
1496 )
1497 RRETURN(MATCH_NOMATCH);
1498 ecode++;
1499 break;
1500
1501 case OP_NOT_WHITESPACE:
1502 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1503 GETCHARINCTEST(c, eptr);
1504 if (
1505 #ifdef SUPPORT_UTF8
1506 c < 256 &&
1507 #endif
1508 (md->ctypes[c] & ctype_space) != 0
1509 )
1510 RRETURN(MATCH_NOMATCH);
1511 ecode++;
1512 break;
1513
1514 case OP_WHITESPACE:
1515 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1516 GETCHARINCTEST(c, eptr);
1517 if (
1518 #ifdef SUPPORT_UTF8
1519 c >= 256 ||
1520 #endif
1521 (md->ctypes[c] & ctype_space) == 0
1522 )
1523 RRETURN(MATCH_NOMATCH);
1524 ecode++;
1525 break;
1526
1527 case OP_NOT_WORDCHAR:
1528 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1529 GETCHARINCTEST(c, eptr);
1530 if (
1531 #ifdef SUPPORT_UTF8
1532 c < 256 &&
1533 #endif
1534 (md->ctypes[c] & ctype_word) != 0
1535 )
1536 RRETURN(MATCH_NOMATCH);
1537 ecode++;
1538 break;
1539
1540 case OP_WORDCHAR:
1541 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1542 GETCHARINCTEST(c, eptr);
1543 if (
1544 #ifdef SUPPORT_UTF8
1545 c >= 256 ||
1546 #endif
1547 (md->ctypes[c] & ctype_word) == 0
1548 )
1549 RRETURN(MATCH_NOMATCH);
1550 ecode++;
1551 break;
1552
1553 case OP_ANYNL:
1554 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1555 GETCHARINCTEST(c, eptr);
1556 switch(c)
1557 {
1558 default: RRETURN(MATCH_NOMATCH);
1559 case 0x000d:
1560 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1561 break;
1562
1563 case 0x000a:
1564 break;
1565
1566 case 0x000b:
1567 case 0x000c:
1568 case 0x0085:
1569 case 0x2028:
1570 case 0x2029:
1571 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1572 break;
1573 }
1574 ecode++;
1575 break;
1576
1577 case OP_NOT_HSPACE:
1578 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1579 GETCHARINCTEST(c, eptr);
1580 switch(c)
1581 {
1582 default: break;
1583 case 0x09: /* HT */
1584 case 0x20: /* SPACE */
1585 case 0xa0: /* NBSP */
1586 case 0x1680: /* OGHAM SPACE MARK */
1587 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1588 case 0x2000: /* EN QUAD */
1589 case 0x2001: /* EM QUAD */
1590 case 0x2002: /* EN SPACE */
1591 case 0x2003: /* EM SPACE */
1592 case 0x2004: /* THREE-PER-EM SPACE */
1593 case 0x2005: /* FOUR-PER-EM SPACE */
1594 case 0x2006: /* SIX-PER-EM SPACE */
1595 case 0x2007: /* FIGURE SPACE */
1596 case 0x2008: /* PUNCTUATION SPACE */
1597 case 0x2009: /* THIN SPACE */
1598 case 0x200A: /* HAIR SPACE */
1599 case 0x202f: /* NARROW NO-BREAK SPACE */
1600 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1601 case 0x3000: /* IDEOGRAPHIC SPACE */
1602 RRETURN(MATCH_NOMATCH);
1603 }
1604 ecode++;
1605 break;
1606
1607 case OP_HSPACE:
1608 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1609 GETCHARINCTEST(c, eptr);
1610 switch(c)
1611 {
1612 default: RRETURN(MATCH_NOMATCH);
1613 case 0x09: /* HT */
1614 case 0x20: /* SPACE */
1615 case 0xa0: /* NBSP */
1616 case 0x1680: /* OGHAM SPACE MARK */
1617 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1618 case 0x2000: /* EN QUAD */
1619 case 0x2001: /* EM QUAD */
1620 case 0x2002: /* EN SPACE */
1621 case 0x2003: /* EM SPACE */
1622 case 0x2004: /* THREE-PER-EM SPACE */
1623 case 0x2005: /* FOUR-PER-EM SPACE */
1624 case 0x2006: /* SIX-PER-EM SPACE */
1625 case 0x2007: /* FIGURE SPACE */
1626 case 0x2008: /* PUNCTUATION SPACE */
1627 case 0x2009: /* THIN SPACE */
1628 case 0x200A: /* HAIR SPACE */
1629 case 0x202f: /* NARROW NO-BREAK SPACE */
1630 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1631 case 0x3000: /* IDEOGRAPHIC SPACE */
1632 break;
1633 }
1634 ecode++;
1635 break;
1636
1637 case OP_NOT_VSPACE:
1638 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1639 GETCHARINCTEST(c, eptr);
1640 switch(c)
1641 {
1642 default: break;
1643 case 0x0a: /* LF */
1644 case 0x0b: /* VT */
1645 case 0x0c: /* FF */
1646 case 0x0d: /* CR */
1647 case 0x85: /* NEL */
1648 case 0x2028: /* LINE SEPARATOR */
1649 case 0x2029: /* PARAGRAPH SEPARATOR */
1650 RRETURN(MATCH_NOMATCH);
1651 }
1652 ecode++;
1653 break;
1654
1655 case OP_VSPACE:
1656 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1657 GETCHARINCTEST(c, eptr);
1658 switch(c)
1659 {
1660 default: RRETURN(MATCH_NOMATCH);
1661 case 0x0a: /* LF */
1662 case 0x0b: /* VT */
1663 case 0x0c: /* FF */
1664 case 0x0d: /* CR */
1665 case 0x85: /* NEL */
1666 case 0x2028: /* LINE SEPARATOR */
1667 case 0x2029: /* PARAGRAPH SEPARATOR */
1668 break;
1669 }
1670 ecode++;
1671 break;
1672
1673 #ifdef SUPPORT_UCP
1674 /* Check the next character by Unicode property. We will get here only
1675 if the support is in the binary; otherwise a compile-time error occurs. */
1676
1677 case OP_PROP:
1678 case OP_NOTPROP:
1679 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1680 GETCHARINCTEST(c, eptr);
1681 {
1682 const ucd_record * prop = GET_UCD(c);
1683
1684 switch(ecode[1])
1685 {
1686 case PT_ANY:
1687 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1688 break;
1689
1690 case PT_LAMP:
1691 if ((prop->chartype == ucp_Lu ||
1692 prop->chartype == ucp_Ll ||
1693 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1694 RRETURN(MATCH_NOMATCH);
1695 break;
1696
1697 case PT_GC:
1698 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1699 RRETURN(MATCH_NOMATCH);
1700 break;
1701
1702 case PT_PC:
1703 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1704 RRETURN(MATCH_NOMATCH);
1705 break;
1706
1707 case PT_SC:
1708 if ((ecode[2] != prop->script) == (op == OP_PROP))
1709 RRETURN(MATCH_NOMATCH);
1710 break;
1711
1712 default:
1713 RRETURN(PCRE_ERROR_INTERNAL);
1714 }
1715
1716 ecode += 3;
1717 }
1718 break;
1719
1720 /* Match an extended Unicode sequence. We will get here only if the support
1721 is in the binary; otherwise a compile-time error occurs. */
1722
1723 case OP_EXTUNI:
1724 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1725 GETCHARINCTEST(c, eptr);
1726 {
1727 int category = UCD_CATEGORY(c);
1728 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1729 while (eptr < md->end_subject)
1730 {
1731 int len = 1;
1732 if (!utf8) c = *eptr; else
1733 {
1734 GETCHARLEN(c, eptr, len);
1735 }
1736 category = UCD_CATEGORY(c);
1737 if (category != ucp_M) break;
1738 eptr += len;
1739 }
1740 }
1741 ecode++;
1742 break;
1743 #endif
1744
1745
1746 /* Match a back reference, possibly repeatedly. Look past the end of the
1747 item to see if there is repeat information following. The code is similar
1748 to that for character classes, but repeated for efficiency. Then obey
1749 similar code to character type repeats - written out again for speed.
1750 However, if the referenced string is the empty string, always treat
1751 it as matched, any number of times (otherwise there could be infinite
1752 loops). */
1753
1754 case OP_REF:
1755 {
1756 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1757 ecode += 3;
1758
1759 /* If the reference is unset, there are two possibilities:
1760
1761 (a) In the default, Perl-compatible state, set the length to be longer
1762 than the amount of subject left; this ensures that every attempt at a
1763 match fails. We can't just fail here, because of the possibility of
1764 quantifiers with zero minima.
1765
1766 (b) If the JavaScript compatibility flag is set, set the length to zero
1767 so that the back reference matches an empty string.
1768
1769 Otherwise, set the length to the length of what was matched by the
1770 referenced subpattern. */
1771
1772 if (offset >= offset_top || md->offset_vector[offset] < 0)
1773 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1774 else
1775 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1776
1777 /* Set up for repetition, or handle the non-repeated case */
1778
1779 switch (*ecode)
1780 {
1781 case OP_CRSTAR:
1782 case OP_CRMINSTAR:
1783 case OP_CRPLUS:
1784 case OP_CRMINPLUS:
1785 case OP_CRQUERY:
1786 case OP_CRMINQUERY:
1787 c = *ecode++ - OP_CRSTAR;
1788 minimize = (c & 1) != 0;
1789 min = rep_min[c]; /* Pick up values from tables; */
1790 max = rep_max[c]; /* zero for max => infinity */
1791 if (max == 0) max = INT_MAX;
1792 break;
1793
1794 case OP_CRRANGE:
1795 case OP_CRMINRANGE:
1796 minimize = (*ecode == OP_CRMINRANGE);
1797 min = GET2(ecode, 1);
1798 max = GET2(ecode, 3);
1799 if (max == 0) max = INT_MAX;
1800 ecode += 5;
1801 break;
1802
1803 default: /* No repeat follows */
1804 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1805 eptr += length;
1806 continue; /* With the main loop */
1807 }
1808
1809 /* If the length of the reference is zero, just continue with the
1810 main loop. */
1811
1812 if (length == 0) continue;
1813
1814 /* First, ensure the minimum number of matches are present. We get back
1815 the length of the reference string explicitly rather than passing the
1816 address of eptr, so that eptr can be a register variable. */
1817
1818 for (i = 1; i <= min; i++)
1819 {
1820 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1821 eptr += length;
1822 }
1823
1824 /* If min = max, continue at the same level without recursion.
1825 They are not both allowed to be zero. */
1826
1827 if (min == max) continue;
1828
1829 /* If minimizing, keep trying and advancing the pointer */
1830
1831 if (minimize)
1832 {
1833 for (fi = min;; fi++)
1834 {
1835 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1836 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1837 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1838 RRETURN(MATCH_NOMATCH);
1839 eptr += length;
1840 }
1841 /* Control never gets here */
1842 }
1843
1844 /* If maximizing, find the longest string and work backwards */
1845
1846 else
1847 {
1848 pp = eptr;
1849 for (i = min; i < max; i++)
1850 {
1851 if (!match_ref(offset, eptr, length, md, ims)) break;
1852 eptr += length;
1853 }
1854 while (eptr >= pp)
1855 {
1856 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1857 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1858 eptr -= length;
1859 }
1860 RRETURN(MATCH_NOMATCH);
1861 }
1862 }
1863 /* Control never gets here */
1864
1865
1866
1867 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1868 used when all the characters in the class have values in the range 0-255,
1869 and either the matching is caseful, or the characters are in the range
1870 0-127 when UTF-8 processing is enabled. The only difference between
1871 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1872 encountered.
1873
1874 First, look past the end of the item to see if there is repeat information
1875 following. Then obey similar code to character type repeats - written out
1876 again for speed. */
1877
1878 case OP_NCLASS:
1879 case OP_CLASS:
1880 {
1881 data = ecode + 1; /* Save for matching */
1882 ecode += 33; /* Advance past the item */
1883
1884 switch (*ecode)
1885 {
1886 case OP_CRSTAR:
1887 case OP_CRMINSTAR:
1888 case OP_CRPLUS:
1889 case OP_CRMINPLUS:
1890 case OP_CRQUERY:
1891 case OP_CRMINQUERY:
1892 c = *ecode++ - OP_CRSTAR;
1893 minimize = (c & 1) != 0;
1894 min = rep_min[c]; /* Pick up values from tables; */
1895 max = rep_max[c]; /* zero for max => infinity */
1896 if (max == 0) max = INT_MAX;
1897 break;
1898
1899 case OP_CRRANGE:
1900 case OP_CRMINRANGE:
1901 minimize = (*ecode == OP_CRMINRANGE);
1902 min = GET2(ecode, 1);
1903 max = GET2(ecode, 3);
1904 if (max == 0) max = INT_MAX;
1905 ecode += 5;
1906 break;
1907
1908 default: /* No repeat follows */
1909 min = max = 1;
1910 break;
1911 }
1912
1913 /* First, ensure the minimum number of matches are present. */
1914
1915 #ifdef SUPPORT_UTF8
1916 /* UTF-8 mode */
1917 if (utf8)
1918 {
1919 for (i = 1; i <= min; i++)
1920 {
1921 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1922 GETCHARINC(c, eptr);
1923 if (c > 255)
1924 {
1925 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1926 }
1927 else
1928 {
1929 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1930 }
1931 }
1932 }
1933 else
1934 #endif
1935 /* Not UTF-8 mode */
1936 {
1937 for (i = 1; i <= min; i++)
1938 {
1939 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1940 c = *eptr++;
1941 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1942 }
1943 }
1944
1945 /* If max == min we can continue with the main loop without the
1946 need to recurse. */
1947
1948 if (min == max) continue;
1949
1950 /* If minimizing, keep testing the rest of the expression and advancing
1951 the pointer while it matches the class. */
1952
1953 if (minimize)
1954 {
1955 #ifdef SUPPORT_UTF8
1956 /* UTF-8 mode */
1957 if (utf8)
1958 {
1959 for (fi = min;; fi++)
1960 {
1961 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1962 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1963 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1964 GETCHARINC(c, eptr);
1965 if (c > 255)
1966 {
1967 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1968 }
1969 else
1970 {
1971 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1972 }
1973 }
1974 }
1975 else
1976 #endif
1977 /* Not UTF-8 mode */
1978 {
1979 for (fi = min;; fi++)
1980 {
1981 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1982 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1983 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1984 c = *eptr++;
1985 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1986 }
1987 }
1988 /* Control never gets here */
1989 }
1990
1991 /* If maximizing, find the longest possible run, then work backwards. */
1992
1993 else
1994 {
1995 pp = eptr;
1996
1997 #ifdef SUPPORT_UTF8
1998 /* UTF-8 mode */
1999 if (utf8)
2000 {
2001 for (i = min; i < max; i++)
2002 {
2003 int len = 1;
2004 if (eptr >= md->end_subject) break;
2005 GETCHARLEN(c, eptr, len);
2006 if (c > 255)
2007 {
2008 if (op == OP_CLASS) break;
2009 }
2010 else
2011 {
2012 if ((data[c/8] & (1 << (c&7))) == 0) break;
2013 }
2014 eptr += len;
2015 }
2016 for (;;)
2017 {
2018 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2019 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2020 if (eptr-- == pp) break; /* Stop if tried at original pos */
2021 BACKCHAR(eptr);
2022 }
2023 }
2024 else
2025 #endif
2026 /* Not UTF-8 mode */
2027 {
2028 for (i = min; i < max; i++)
2029 {
2030 if (eptr >= md->end_subject) break;
2031 c = *eptr;
2032 if ((data[c/8] & (1 << (c&7))) == 0) break;
2033 eptr++;
2034 }
2035 while (eptr >= pp)
2036 {
2037 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2038 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2039 eptr--;
2040 }
2041 }
2042
2043 RRETURN(MATCH_NOMATCH);
2044 }
2045 }
2046 /* Control never gets here */
2047
2048
2049 /* Match an extended character class. This opcode is encountered only
2050 in UTF-8 mode, because that's the only time it is compiled. */
/* Outline of this arm: remember where the class data starts, step ecode past
the class item, decode any repeat operator that follows into min, max and
minimize, match the mandatory min characters, and then either carry on in the
main loop (min == max), extend by one character between attempts on the rest
of the pattern (minimizing), or take the longest run and give characters back
one at a time (maximizing). */
2051
2052 #ifdef SUPPORT_UTF8
2053 case OP_XCLASS:
2054 {
2055 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2056 ecode += GET(ecode, 1); /* Advance past the item */
2057
/* ecode now addresses whatever follows the class. If that is one of the
repeat opcodes it is consumed here; otherwise the class matches exactly once. */
2058 switch (*ecode)
2059 {
2060 case OP_CRSTAR:
2061 case OP_CRMINSTAR:
2062 case OP_CRPLUS:
2063 case OP_CRMINPLUS:
2064 case OP_CRQUERY:
2065 case OP_CRMINQUERY:
2066 c = *ecode++ - OP_CRSTAR; /* Offset from OP_CRSTAR; used as table index */
2067 minimize = (c & 1) != 0; /* Odd offsets select minimizing */
2068 min = rep_min[c]; /* Pick up values from tables; */
2069 max = rep_max[c]; /* zero for max => infinity */
2070 if (max == 0) max = INT_MAX;
2071 break;
2072
2073 case OP_CRRANGE:
2074 case OP_CRMINRANGE:
2075 minimize = (*ecode == OP_CRMINRANGE);
2076 min = GET2(ecode, 1); /* Limits are read from offsets 1 and 3 of the item */
2077 max = GET2(ecode, 3);
2078 if (max == 0) max = INT_MAX; /* As above, zero stands for "no upper limit" */
2079 ecode += 5; /* Step over the whole repeat item */
2080 break;
2081
2082 default: /* No repeat follows */
2083 min = max = 1;
2084 break;
2085 }
2086
2087 /* First, ensure the minimum number of matches are present. */
2088
2089 for (i = 1; i <= min; i++)
2090 {
2091 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); /* Subject ran out before min was reached */
2092 GETCHARINC(c, eptr);
2093 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); /* Character is not accepted by the class */
2094 }
2095
2096 /* If max == min we can continue with the main loop without the
2097 need to recurse. */
2098
2099 if (min == max) continue;
2100
2101 /* If minimizing, keep testing the rest of the expression and advancing
2102 the pointer while it matches the class. */
2103
2104 if (minimize)
2105 {
2106 for (fi = min;; fi++)
2107 {
/* Try the remainder of the pattern first; only when that fails with
MATCH_NOMATCH is one more class character consumed. Any other result is
passed straight back. */
2108 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2109 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2110 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2111 GETCHARINC(c, eptr);
2112 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2113 }
2114 /* Control never gets here */
2115 }
2116
2117 /* If maximizing, find the longest possible run, then work backwards. */
2118
2119 else
2120 {
2121 pp = eptr; /* Position after the mandatory matches; backtracking stops here */
2122 for (i = min; i < max; i++)
2123 {
2124 int len = 1;
2125 if (eptr >= md->end_subject) break;
/* NOTE(review): GETCHARLEN presumably fetches the character at eptr without
advancing and leaves its byte length in len - confirm against the macro. eptr
is moved on by len only when the character is accepted below. */
2126 GETCHARLEN(c, eptr, len);
2127 if (!_pcre_xclass(c, data)) break;
2128 eptr += len;
2129 }
2130 for(;;)
2131 {
2132 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2133 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2134 if (eptr-- == pp) break; /* Stop if tried at original pos */
/* NOTE(review): BACKCHAR presumably moves eptr back to the first byte of the
character it now points into - confirm against the macro. */
2135 if (utf8) BACKCHAR(eptr);
2136 }
2137 RRETURN(MATCH_NOMATCH);
2138 }
2139
2140 /* Control never gets here */
2141 }
2142 #endif /* End of XCLASS */
2143
2144 /* Match a single character, casefully */
/* The literal follows the opcode in the compiled pattern. In UTF-8 mode it
is compared byte for byte over its whole encoded length; otherwise exactly one
byte is compared. On success both eptr and ecode have moved past the
character and matching carries on with the next opcode. */
2145
2146 case OP_CHAR:
2147 #ifdef SUPPORT_UTF8
2148 if (utf8)
2149 {
2150 length = 1;
2151 ecode++; /* Step over the opcode to the literal */
/* NOTE(review): GETCHARLEN presumably sets length to the number of bytes in
the pattern character without moving ecode - confirm against the macro. The
value placed in fc is not used in this arm. */
2152 GETCHARLEN(fc, ecode, length);
2153 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); /* Too few subject bytes left */
2154 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH); /* Bytewise compare, advancing both pointers */
2155 }
2156 else
2157 #endif
2158
2159 /* Non-UTF-8 mode */
2160 {
2161 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); /* No subject byte left */
2162 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH); /* ecode[1] is the literal byte after the opcode */
2163 ecode += 2; /* Skip the opcode and its one-byte operand */
2164 }
2165 break;
2166
2167 /* Match a single character, caselessly */
/* Three routes are taken below: a UTF-8 pattern character below 128 and the
non-UTF-8 case both compare through the md->lcc table; a UTF-8 pattern
character of 128 or more is compared as a whole character value, with the
other case consulted only when SUPPORT_UCP is defined. */
2168
2169 case OP_CHARNC:
2170 #ifdef SUPPORT_UTF8
2171 if (utf8)
2172 {
2173 length = 1;
2174 ecode++; /* Step over the opcode to the literal */
/* NOTE(review): GETCHARLEN presumably decodes the pattern character into fc
and its byte length into length without moving ecode - confirm against the
macro. ecode is advanced explicitly in both branches below. */
2175 GETCHARLEN(fc, ecode, length);
2176
2177 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); /* Too few subject bytes left */
2178
2179 /* If the pattern character's value is < 128, we have only one byte, and
2180 can use the fast lookup table. */
2181
2182 if (fc < 128)
2183 {
2184 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2185 }
2186
2187 /* Otherwise we must pick up the subject character */
2188
2189 else
2190 {
2191 unsigned int dc;
2192 GETCHARINC(dc, eptr);
2193 ecode += length; /* Move past the pattern character */
2194
2195 /* If we have Unicode property support, we can use it to test the other
2196 case of the character, if there is one. */
2197
2198 if (fc != dc)
2199 {
/* Without SUPPORT_UCP the test below is compiled out, so any difference
between fc and dc is a mismatch. */
2200 #ifdef SUPPORT_UCP
2201 if (dc != UCD_OTHERCASE(fc))
2202 #endif
2203 RRETURN(MATCH_NOMATCH);
2204 }
2205 }
2206 }
2207 else
2208 #endif /* SUPPORT_UTF8 */
2209
2210 /* Non-UTF-8 mode */
2211 {
2212 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); /* No subject byte left */
2213 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); /* Compare both bytes after mapping through lcc */
2214 ecode += 2; /* Skip the opcode and its one-byte operand */
2215 }
2216 break;
2217
2218 /* Match a single character repeatedly. */
2219
2220 case OP_EXACT:
2221 min = max = GET2(ecode, 1);
2222 ecode += 3;
2223 goto REPEATCHAR;
2224
2225 case OP_POSUPTO:
2226 possessive = TRUE;
2227 /* Fall through */
2228
2229 case OP_UPTO:
2230 case OP_MINUPTO:
2231 min = 0;
2232 max = GET2(ecode, 1);
2233 minimize = *ecode == OP_MINUPTO;
2234 ecode += 3;
2235 goto REPEATCHAR;
2236
2237 case OP_POSSTAR:
2238 possessive = TRUE;
2239 min = 0;
2240 max = INT_MAX;
2241 ecode++;
2242 goto REPEATCHAR;
2243
2244 case OP_POSPLUS:
2245 possessive = TRUE;
2246 min = 1;
2247 max = INT_MAX;
2248 ecode++;
2249 goto REPEATCHAR;
2250
2251 case OP_POSQUERY:
2252 possessive = TRUE;
2253 min = 0;
2254 max = 1;
2255 ecode++;
2256 goto REPEATCHAR;
2257
2258 case OP_STAR:
2259 case OP_MINSTAR:
2260 case OP_PLUS:
2261 case OP_MINPLUS:
2262 case OP_QUERY:
2263 case OP_MINQUERY:
2264 c = *ecode++ - OP_STAR;
2265 minimize = (c & 1) != 0;
2266 min = rep_min[c]; /* Pick up values from tables; */
2267 max = rep_max[c]; /* zero for max => infinity */
2268 if (max == 0) max = INT_MAX;
2269
2270 /* Common code for all repeated single-character matches. We can give
2271 up quickly if there are fewer than the minimum number of characters left in
2272 the subject. */
2273
2274 REPEATCHAR:
2275 #ifdef SUPPORT_UTF8
2276 if (utf8)
2277 {
2278 length = 1;
2279 charptr = ecode;
2280 GETCHARLEN(fc, ecode, length);
2281 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2282 ecode += length;
2283
2284 /* Handle multibyte character matching specially here. There is
2285 support for caseless matching if UCP support is present. */
2286
2287 if (length > 1)
2288 {
2289 #ifdef SUPPORT_UCP
2290 unsigned int othercase;
2291 if ((ims & PCRE_CASELESS) != 0 &&
2292 (othercase = UCD_OTHERCASE(fc)) != fc)
2293 oclength = _pcre_ord2utf8(othercase, occhars);
2294 else oclength = 0;
2295 #endif /* SUPPORT_UCP */
2296
2297 for (i = 1; i <= min; i++)
2298 {
2299 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2300 #ifdef SUPPORT_UCP
2301 /* Need braces because of following else */
2302 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2303 else
2304 {
2305 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2306 eptr += oclength;
2307 }
2308 #else /* without SUPPORT_UCP */
2309 else { RRETURN(MATCH_NOMATCH); }
2310 #endif /* SUPPORT_UCP */
2311 }
2312
2313 if (min == max) continue;
2314
2315 if (minimize)
2316 {
2317 for (fi = min;; fi++)
2318 {
2319 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2320 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2321 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2322 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2323 #ifdef SUPPORT_UCP
2324 /* Need braces because of following else */
2325 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2326 else
2327 {
2328 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2329 eptr += oclength;
2330 }
2331 #else /* without SUPPORT_UCP */
2332 else { RRETURN (MATCH_NOMATCH); }
2333 #endif /* SUPPORT_UCP */
2334 }
2335 /* Control never gets here */
2336 }
2337
2338 else /* Maximize */
2339 {
2340 pp = eptr;
2341 for (i = min; i < max; i++)
2342 {
2343 if (eptr > md->end_subject - length) break;
2344 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2345 #ifdef SUPPORT_UCP
2346 else if (oclength == 0) break;
2347 else
2348 {
2349 if (memcmp(eptr, occhars, oclength) != 0) break;
2350 eptr += oclength;
2351 }
2352 #else /* without SUPPORT_UCP */
2353 else break;
2354 #endif /* SUPPORT_UCP */
2355 }
2356
2357 if (possessive) continue;
2358 for(;;)
2359 {
2360 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2361 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2362 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2363 #ifdef SUPPORT_UCP
2364 eptr--;
2365 BACKCHAR(eptr);
2366 #else /* without SUPPORT_UCP */
2367 eptr -= length;
2368 #endif /* SUPPORT_UCP */
2369 }
2370 }
2371 /* Control never gets here */
2372 }
2373
2374 /* If the length of a UTF-8 character is 1, we fall through here, and
2375 obey the code as for non-UTF-8 characters below, though in this case the
2376 value of fc will always be < 128. */
2377 }
2378 else
2379 #endif /* SUPPORT_UTF8 */
2380
2381 /* When not in UTF-8 mode, load a single-byte character. */
2382 {
2383 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2384 fc = *ecode++;
2385 }
2386
2387 /* The value of fc at this point is always less than 256, though we may or
2388 may not be in UTF-8 mode. The code is duplicated for the caseless and
2389 caseful cases, for speed, since matching characters is likely to be quite
2390 common. First, ensure the minimum number of matches are present. If min =
2391 max, continue at the same level without recursing. Otherwise, if
2392 minimizing, keep trying the rest of the expression and advancing one
2393 matching character if failing, up to the maximum. Alternatively, if
2394 maximizing, find the maximum number of characters and work backwards. */
2395
2396 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2397 max, eptr));
2398
2399 if ((ims & PCRE_CASELESS) != 0)
2400 {
2401 fc = md->lcc[fc];
2402 for (i = 1; i <= min; i++)
2403 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2404 if (min == max) continue;
2405 if (minimize)
2406 {
2407 for (fi = min;; fi++)
2408 {
2409 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2410 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2411 if (fi >= max || eptr >= md->end_subject ||
2412 fc != md->lcc[*eptr++])
2413 RRETURN(MATCH_NOMATCH);
2414 }
2415 /* Control never gets here */
2416 }
2417 else /* Maximize */
2418 {
2419 pp = eptr;
2420 for (i = min; i < max; i++)
2421 {
2422 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2423 eptr++;
2424 }
2425 if (possessive) continue;
2426 while (eptr >= pp)
2427 {
2428 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2429 eptr--;
2430 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2431 }
2432 RRETURN(MATCH_NOMATCH);
2433 }
2434 /* Control never gets here */
2435 }
2436
2437 /* Caseful comparisons (includes all multi-byte characters) */
2438
2439 else
2440 {
2441 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2442 if (min == max) continue;
2443 if (minimize)
2444 {
2445 for (fi = min;; fi++)
2446 {
2447 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2448 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2449 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2450 RRETURN(MATCH_NOMATCH);
2451 }
2452 /* Control never gets here */
2453 }
2454 else /* Maximize */
2455 {
2456 pp = eptr;
2457 for (i = min; i < max; i++)
2458 {
2459 if (eptr >= md->end_subject || fc != *eptr) break;
2460 eptr++;
2461 }
2462 if (possessive) continue;
2463 while (eptr >= pp)
2464 {
2465 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2466 eptr--;
2467 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2468 }
2469 RRETURN(MATCH_NOMATCH);
2470 }
2471 }
2472 /* Control never gets here */
2473
2474 /* Match a negated single one-byte character. The character we are
2475 checking can be multibyte. */
/* That is: the pattern character is a single byte read through *ecode, while
the subject character is fetched with GETCHARINCTEST and, when SUPPORT_UTF8 is
defined, may have a value of 256 or more. The arm succeeds when the two
differ. */
2476
2477 case OP_NOT:
2478 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); /* A negated character still has to consume one */
2479 ecode++; /* Step over the opcode to the pattern byte */
2480 GETCHARINCTEST(c, eptr);
2481 if ((ims & PCRE_CASELESS) != 0)
2482 {
/* Map the subject character through lcc before comparing. When SUPPORT_UTF8
is defined the mapping is applied only to values below 256; larger values are
compared unchanged. */
2483 #ifdef SUPPORT_UTF8
2484 if (c < 256)
2485 #endif
2486 c = md->lcc[c];
2487 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH); /* Same character ignoring case: the negation fails */
2488 }
2489 else
2490 {
2491 if (*ecode++ == c) RRETURN(MATCH_NOMATCH); /* Exactly the excluded character */
2492 }
2493 break;
2494
2495 /* Match a negated single one-byte character repeatedly. This is almost a
2496 repeat of the code for a repeated single character, but I haven't found a
2497 nice way of commoning these up that doesn't require a test of the
2498 positive/negative option for each character match. Maybe that wouldn't add
2499 very much to the time taken, but character matching *is* what this is all
2500 about... */
2501
2502 case OP_NOTEXACT:
2503 min = max = GET2(ecode, 1);
2504 ecode += 3;
2505 goto REPEATNOTCHAR;
2506
2507 case OP_NOTUPTO:
2508 case OP_NOTMINUPTO:
2509 min = 0;
2510 max = GET2(ecode, 1);
2511 minimize = *ecode == OP_NOTMINUPTO;
2512 ecode += 3;
2513 goto REPEATNOTCHAR;
2514
2515 case OP_NOTPOSSTAR:
2516 possessive = TRUE;
2517 min = 0;
2518 max = INT_MAX;
2519 ecode++;
2520 goto REPEATNOTCHAR;
2521
2522 case OP_NOTPOSPLUS:
2523 possessive = TRUE;
2524 min = 1;
2525 max = INT_MAX;
2526 ecode++;
2527 goto REPEATNOTCHAR;
2528
2529 case OP_NOTPOSQUERY:
2530 possessive = TRUE;
2531 min = 0;
2532 max = 1;
2533 ecode++;
2534 goto REPEATNOTCHAR;
2535
2536 case OP_NOTPOSUPTO:
2537 possessive = TRUE;
2538 min = 0;
2539 max = GET2(ecode, 1);
2540 ecode += 3;
2541 goto REPEATNOTCHAR;
2542
2543 case OP_NOTSTAR:
2544 case OP_NOTMINSTAR:
2545 case OP_NOTPLUS:
2546 case OP_NOTMINPLUS:
2547 case OP_NOTQUERY:
2548 case OP_NOTMINQUERY:
2549 c = *ecode++ - OP_NOTSTAR;
2550 minimize = (c & 1) != 0;
2551 min = rep_min[c]; /* Pick up values from tables; */
2552 max = rep_max[c]; /* zero for max => infinity */
2553 if (max == 0) max = INT_MAX;
2554
2555 /* Common code for all repeated single-byte matches. We can give up quickly
2556 if there are fewer than the minimum number of bytes left in the
2557 subject. */
2558
2559 REPEATNOTCHAR:
2560 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2561 fc = *ecode++;
2562
2563 /* The code is duplicated for the caseless and caseful cases, for speed,
2564 since matching characters is likely to be quite common. First, ensure the
2565 minimum number of matches are present. If min = max, continue at the same
2566 level without recursing. Otherwise, if minimizing, keep trying the rest of
2567 the expression and advancing one matching character if failing, up to the
2568 maximum. Alternatively, if maximizing, find the maximum number of
2569 characters and work backwards. */
2570
2571 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2572 max, eptr));
2573
2574 if ((ims & PCRE_CASELESS) != 0)
2575 {
2576 fc = md->lcc[fc];
2577
2578 #ifdef SUPPORT_UTF8
2579 /* UTF-8 mode */
2580 if (utf8)
2581 {
2582 register unsigned int d;
2583 for (i = 1; i <= min; i++)
2584 {
2585 GETCHARINC(d, eptr);
2586 if (d < 256) d = md->lcc[d];
2587 if (fc == d) RRETURN(MATCH_NOMATCH);
2588 }
2589 }
2590 else
2591 #endif
2592
2593 /* Not UTF-8 mode */
2594 {
2595 for (i = 1; i <= min; i++)
2596 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2597 }
2598
2599 if (min == max) continue;
2600
2601 if (minimize)
2602 {
2603 #ifdef SUPPORT_UTF8
2604 /* UTF-8 mode */
2605 if (utf8)
2606 {
2607 register unsigned int d;
2608 for (fi = min;; fi++)
2609 {
2610 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2611 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2612 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2613 GETCHARINC(d, eptr);
2614 if (d < 256) d = md->lcc[d];
2615 if (fc == d) RRETURN(MATCH_NOMATCH);
2616
2617 }
2618 }
2619 else
2620 #endif
2621 /* Not UTF-8 mode */
2622 {
2623 for (fi = min;; fi++)
2624 {
2625 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2626 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2627 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2628 RRETURN(MATCH_NOMATCH);
2629 }
2630 }
2631 /* Control never gets here */
2632 }
2633
2634 /* Maximize case */
2635
2636 else
2637 {
2638 pp = eptr;
2639
2640 #ifdef SUPPORT_UTF8
2641 /* UTF-8 mode */
2642 if (utf8)
2643 {
2644 register unsigned int d;
2645 for (i = min; i < max; i++)
2646 {
2647 int len = 1;
2648 if (eptr >= md->end_subject) break;
2649 GETCHARLEN(d, eptr, len);
2650 if (d < 256) d = md->lcc[d];
2651 if (fc == d) break;
2652 eptr += len;
2653 }
2654 if (possessive) continue;
2655 for(;;)
2656 {
2657 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2658 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2659 if (eptr-- == pp) break; /* Stop if tried at original pos */
2660 BACKCHAR(eptr);
2661 }
2662 }
2663 else
2664 #endif
2665 /* Not UTF-8 mode */
2666 {
2667 for (i = min; i < max; i++)
2668 {
2669 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2670 eptr++;
2671 }
2672 if (possessive) continue;
2673 while (eptr >= pp)
2674 {
2675 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2676 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2677 eptr--;
2678 }
2679 }
2680
2681 RRETURN(MATCH_NOMATCH);
2682 }
2683 /* Control never gets here */
2684 }
2685
2686 /* Caseful comparisons */
2687
2688 else
2689 {
2690 #ifdef SUPPORT_UTF8
2691 /* UTF-8 mode */
2692 if (utf8)
2693 {
2694 register unsigned int d;
2695 for (i = 1; i <= min; i++)
2696 {
2697 GETCHARINC(d, eptr);
2698 if (fc == d) RRETURN(MATCH_NOMATCH);
2699 }
2700 }
2701 else
2702 #endif
2703 /* Not UTF-8 mode */
2704 {
2705 for (i = 1; i <= min; i++)
2706 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2707 }
2708
2709 if (min == max) continue;
2710
2711 if (minimize)
2712 {
2713 #ifdef SUPPORT_UTF8
2714 /* UTF-8 mode */
2715 if (utf8)
2716 {
2717 register unsigned int d;
2718 for (fi = min;; fi++)
2719 {
2720 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2721 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2722 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2723 GETCHARINC(d, eptr);
2724 if (fc == d) RRETURN(MATCH_NOMATCH);
2725 }
2726 }
2727 else
2728 #endif
2729 /* Not UTF-8 mode */
2730 {
2731 for (fi = min;; fi++)
2732 {
2733 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2734 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2735 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2736 RRETURN(MATCH_NOMATCH);
2737 }
2738 }
2739 /* Control never gets here */
2740 }
2741
2742 /* Maximize case */
2743
2744 else
2745 {
2746 pp = eptr;
2747
2748 #ifdef SUPPORT_UTF8
2749 /* UTF-8 mode */
2750 if (utf8)
2751 {
2752 register unsigned int d;
2753 for (i = min; i < max; i++)
2754 {
2755 int len = 1;
2756 if (eptr >= md->end_subject) break;
2757 GETCHARLEN(d, eptr, len);
2758 if (fc == d) break;
2759 eptr += len;
2760 }
2761 if (possessive) continue;
2762 for(;;)
2763 {
2764 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2765 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2766 if (eptr-- == pp) break; /* Stop if tried at original pos */
2767 BACKCHAR(eptr);
2768 }
2769 }
2770 else
2771 #endif
2772 /* Not UTF-8 mode */
2773 {
2774 for (i = min; i < max; i++)
2775 {
2776 if (eptr >= md->end_subject || fc == *eptr) break;
2777 eptr++;
2778 }
2779 if (possessive) continue;
2780 while (eptr >= pp)
2781 {
2782 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2783 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2784 eptr--;
2785 }
2786 }
2787
2788 RRETURN(MATCH_NOMATCH);
2789 }
2790 }
2791 /* Control never gets here */
2792
2793 /* Match a single character type repeatedly; several different opcodes
2794 share code. This is very similar to the code for single characters, but we
2795 repeat it in the interests of efficiency. */
2796
2797 case OP_TYPEEXACT:
2798 min = max = GET2(ecode, 1);
2799 minimize = TRUE;
2800 ecode += 3;
2801 goto REPEATTYPE;
2802
2803 case OP_TYPEUPTO:
2804 case OP_TYPEMINUPTO:
2805 min = 0;
2806 max = GET2(ecode, 1);
2807 minimize = *ecode == OP_TYPEMINUPTO;
2808 ecode += 3;
2809 goto REPEATTYPE;
2810
2811 case OP_TYPEPOSSTAR:
2812 possessive = TRUE;
2813 min = 0;
2814 max = INT_MAX;
2815 ecode++;
2816 goto REPEATTYPE;
2817
2818 case OP_TYPEPOSPLUS:
2819 possessive = TRUE;
2820 min = 1;
2821 max = INT_MAX;
2822 ecode++;
2823 goto REPEATTYPE;
2824
2825 case OP_TYPEPOSQUERY:
2826 possessive = TRUE;
2827 min = 0;
2828 max = 1;
2829 ecode++;
2830 goto REPEATTYPE;
2831
2832 case OP_TYPEPOSUPTO:
2833 possessive = TRUE;
2834 min = 0;
2835 max = GET2(ecode, 1);
2836 ecode += 3;
2837 goto REPEATTYPE;
2838
2839 case OP_TYPESTAR:
2840 case OP_TYPEMINSTAR:
2841 case OP_TYPEPLUS:
2842 case OP_TYPEMINPLUS:
2843 case OP_TYPEQUERY:
2844 case OP_TYPEMINQUERY:
2845 c = *ecode++ - OP_TYPESTAR;
2846 minimize = (c & 1) != 0;
2847 min = rep_min[c]; /* Pick up values from tables; */
2848 max = rep_max[c]; /* zero for max => infinity */
2849 if (max == 0) max = INT_MAX;
2850
2851 /* Common code for all repeated single character type matches. Note that
2852 in UTF-8 mode, '.' matches a character of any length, but for the other
2853 character types, the valid characters are all one-byte long. */
2854
2855 REPEATTYPE:
2856 ctype = *ecode++; /* Code for the character type */
2857
2858 #ifdef SUPPORT_UCP
2859 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2860 {
2861 prop_fail_result = ctype == OP_NOTPROP;
2862 prop_type = *ecode++;
2863 prop_value = *ecode++;
2864 }
2865 else prop_type = -1;
2866 #endif
2867
2868 /* First, ensure the minimum number of matches are present. Use inline
2869 code for maximizing the speed, and do the type test once at the start
2870 (i.e. keep it out of the loop). Also we can test that there are at least
2871 the minimum number of bytes before we start. This isn't as effective in
2872 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2873 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2874 and single-bytes. */
2875
2876 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2877 if (min > 0)
2878 {
2879 #ifdef SUPPORT_UCP
2880 if (prop_type >= 0)
2881 {
2882 switch(prop_type)
2883 {
2884 case PT_ANY:
2885 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2886 for (i = 1; i <= min; i++)
2887 {
2888 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2889 GETCHARINCTEST(c, eptr);
2890 }
2891 break;
2892
2893 case PT_LAMP:
2894 for (i = 1; i <= min; i++)
2895 {
2896 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2897 GETCHARINCTEST(c, eptr);
2898 prop_chartype = UCD_CHARTYPE(c);
2899 if ((prop_chartype == ucp_Lu ||
2900 prop_chartype == ucp_Ll ||
2901 prop_chartype == ucp_Lt) == prop_fail_result)
2902 RRETURN(MATCH_NOMATCH);
2903 }
2904 break;
2905
2906 case PT_GC:
2907 for (i = 1; i <= min; i++)
2908 {
2909 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2910 GETCHARINCTEST(c, eptr);
2911 prop_category = UCD_CATEGORY(c);
2912 if ((prop_category == prop_value) == prop_fail_result)
2913 RRETURN(MATCH_NOMATCH);
2914 }
2915 break;
2916
2917 case PT_PC:
2918 for (i = 1; i <= min; i++)
2919 {
2920 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2921 GETCHARINCTEST(c, eptr);
2922 prop_chartype = UCD_CHARTYPE(c);
2923 if ((prop_chartype == prop_value) == prop_fail_result)
2924 RRETURN(MATCH_NOMATCH);
2925 }
2926 break;
2927
2928 case PT_SC:
2929 for (i = 1; i <= min; i++)
2930 {
2931 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2932 GETCHARINCTEST(c, eptr);
2933 prop_script = UCD_SCRIPT(c);
2934 if ((prop_script == prop_value) == prop_fail_result)
2935 RRETURN(MATCH_NOMATCH);
2936 }
2937 break;
2938
2939 default:
2940 RRETURN(PCRE_ERROR_INTERNAL);
2941 }
2942 }
2943
2944 /* Match extended Unicode sequences. We will get here only if the
2945 support is in the binary; otherwise a compile-time error occurs. */
2946
2947 else if (ctype == OP_EXTUNI)
2948 {
2949 for (i = 1; i <= min; i++)
2950 {
2951 GETCHARINCTEST(c, eptr);
2952 prop_category = UCD_CATEGORY(c);
2953 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2954 while (eptr < md->end_subject)
2955 {
2956 int len = 1;
2957 if (!utf8) c = *eptr; else
2958 {
2959 GETCHARLEN(c, eptr, len);
2960 }
2961 prop_category = UCD_CATEGORY(c);
2962 if (prop_category != ucp_M) break;
2963 eptr += len;
2964 }
2965 }
2966 }
2967
2968 else
2969 #endif /* SUPPORT_UCP */
2970
2971 /* Handle all other cases when the coding is UTF-8 */
2972
2973 #ifdef SUPPORT_UTF8
2974 if (utf8) switch(ctype)
2975 {
2976 case OP_ANY:
2977 for (i = 1; i <= min; i++)
2978 {
2979 if (eptr >= md->end_subject || IS_NEWLINE(eptr))
2980 RRETURN(MATCH_NOMATCH);
2981 eptr++;
2982 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2983 }
2984 break;
2985
2986 case OP_ALLANY:
2987 for (i = 1; i <= min; i++)
2988 {
2989 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2990 eptr++;
2991 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2992 }
2993 break;
2994
2995 case OP_ANYBYTE:
2996 eptr += min;
2997 break;
2998
2999 case OP_ANYNL:
3000 for (i = 1; i <= min; i++)
3001 {
3002 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3003 GETCHARINC(c, eptr);
3004 switch(c)
3005 {
3006 default: RRETURN(MATCH_NOMATCH);
3007 case 0x000d:
3008 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3009 break;
3010
3011 case 0x000a:
3012 break;
3013
3014 case 0x000b:
3015 case 0x000c:
3016 case 0x0085:
3017 case 0x2028:
3018 case 0x2029:
3019 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3020 break;
3021 }
3022 }
3023 break;
3024
3025 case OP_NOT_HSPACE:
3026 for (i = 1; i <= min; i++)
3027 {
3028 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3029 GETCHARINC(c, eptr);
3030 switch(c)
3031 {
3032 default: break;
3033 case 0x09: /* HT */
3034 case 0x20: /* SPACE */
3035 case 0xa0: /* NBSP */
3036 case 0x1680: /* OGHAM SPACE MARK */
3037 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3038 case 0x2000: /* EN QUAD */
3039 case 0x2001: /* EM QUAD */
3040 case 0x2002: /* EN SPACE */
3041 case 0x2003: /* EM SPACE */
3042 case 0x2004: /* THREE-PER-EM SPACE */
3043 case 0x2005: /* FOUR-PER-EM SPACE */
3044 case 0x2006: /* SIX-PER-EM SPACE */
3045 case 0x2007: /* FIGURE SPACE */
3046 case 0x2008: /* PUNCTUATION SPACE */
3047 case 0x2009: /* THIN SPACE */
3048 case 0x200A: /* HAIR SPACE */
3049 case 0x202f: /* NARROW NO-BREAK SPACE */
3050 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3051 case 0x3000: /* IDEOGRAPHIC SPACE */
3052 RRETURN(MATCH_NOMATCH);
3053 }
3054 }
3055 break;
3056
3057 case OP_HSPACE:
3058 for (i = 1; i <= min; i++)
3059 {
3060 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3061 GETCHARINC(c, eptr);
3062 switch(c)
3063 {
3064 default: RRETURN(MATCH_NOMATCH);
3065 case 0x09: /* HT */
3066 case 0x20: /* SPACE */
3067 case 0xa0: /* NBSP */
3068 case 0x1680: /* OGHAM SPACE MARK */
3069 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3070 case 0x2000: /* EN QUAD */
3071 case 0x2001: /* EM QUAD */
3072 case 0x2002: /* EN SPACE */
3073 case 0x2003: /* EM SPACE */
3074 case 0x2004: /* THREE-PER-EM SPACE */
3075 case 0x2005: /* FOUR-PER-EM SPACE */
3076 case 0x2006: /* SIX-PER-EM SPACE */
3077 case 0x2007: /* FIGURE SPACE */
3078 case 0x2008: /* PUNCTUATION SPACE */
3079 case 0x2009: /* THIN SPACE */
3080 case 0x200A: /* HAIR SPACE */
3081 case 0x202f: /* NARROW NO-BREAK SPACE */
3082 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3083 case 0x3000: /* IDEOGRAPHIC SPACE */
3084 break;
3085 }
3086 }
3087 break;
3088
3089 case OP_NOT_VSPACE:
3090 for (i = 1; i <= min; i++)
3091 {
3092 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3093 GETCHARINC(c, eptr);
3094 switch(c)
3095 {
3096 default: break;
3097 case 0x0a: /* LF */
3098 case 0x0b: /* VT */
3099 case 0x0c: /* FF */
3100 case 0x0d: /* CR */
3101 case 0x85: /* NEL */
3102 case 0x2028: /* LINE SEPARATOR */
3103 case 0x2029: /* PARAGRAPH SEPARATOR */
3104 RRETURN(MATCH_NOMATCH);
3105 }
3106 }
3107 break;
3108
3109 case OP_VSPACE:
3110 for (i = 1; i <= min; i++)
3111 {
3112 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3113 GETCHARINC(c, eptr);
3114 switch(c)
3115 {
3116 default: RRETURN(MATCH_NOMATCH);
3117 case 0x0a: /* LF */
3118 case 0x0b: /* VT */
3119 case 0x0c: /* FF */
3120 case 0x0d: /* CR */
3121 case 0x85: /* NEL */
3122 case 0x2028: /* LINE SEPARATOR */
3123 case 0x2029: /* PARAGRAPH SEPARATOR */
3124 break;
3125 }
3126 }
3127 break;
3128
3129 case OP_NOT_DIGIT:
3130 for (i = 1; i <= min; i++)
3131 {
3132 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3133 GETCHARINC(c, eptr);
3134 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3135 RRETURN(MATCH_NOMATCH);
3136 }
3137 break;
3138
3139 case OP_DIGIT:
3140 for (i = 1; i <= min; i++)
3141 {
3142 if (eptr >= md->end_subject ||
3143 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3144 RRETURN(MATCH_NOMATCH);
3145 /* No need to skip more bytes - we know it's a 1-byte character */
3146 }
3147 break;
3148
3149 case OP_NOT_WHITESPACE:
3150 for (i = 1; i <= min; i++)
3151 {
3152 if (eptr >= md->end_subject ||
3153 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3154 RRETURN(MATCH_NOMATCH);
3155 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3156 }
3157 break;
3158
3159 case OP_WHITESPACE:
3160 for (i = 1; i <= min; i++)
3161 {
3162 if (eptr >= md->end_subject ||
3163 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3164 RRETURN(MATCH_NOMATCH);
3165 /* No need to skip more bytes - we know it's a 1-byte character */
3166 }
3167 break;
3168
3169 case OP_NOT_WORDCHAR:
3170 for (i = 1; i <= min; i++)
3171 {
3172 if (eptr >= md->end_subject ||
3173 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3174 RRETURN(MATCH_NOMATCH);
3175 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3176 }
3177 break;
3178
3179 case OP_WORDCHAR:
3180 for (i = 1; i <= min; i++)
3181 {
3182 if (eptr >= md->end_subject ||
3183 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3184 RRETURN(MATCH_NOMATCH);
3185 /* No need to skip more bytes - we know it's a 1-byte character */
3186 }
3187 break;
3188
3189 default:
3190 RRETURN(PCRE_ERROR_INTERNAL);
3191 } /* End switch(ctype) */
3192
3193 else
3194 #endif /* SUPPORT_UTF8 */
3195
3196 /* Code for the non-UTF-8 case for minimum matching of operators other
3197 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3198 number of bytes present, as this was tested above. */
3199
3200 switch(ctype)
3201 {
3202 case OP_ANY:
3203 for (i = 1; i <= min; i++)
3204 {
3205 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3206 eptr++;
3207 }
3208 break;
3209
3210 case OP_ALLANY:
3211 eptr += min;
3212 break;
3213
3214 case OP_ANYBYTE:
3215 eptr += min;
3216 break;
3217
3218 /* Because of the CRLF case, we can't assume the minimum number of
3219 bytes are present in this case. */
3220
3221 case OP_ANYNL:
3222 for (i = 1; i <= min; i++)
3223 {
3224 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3225 switch(*eptr++)
3226 {
3227 default: RRETURN(MATCH_NOMATCH);
3228 case 0x000d:
3229 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3230 break;
3231 case 0x000a:
3232 break;
3233
3234 case 0x000b:
3235 case 0x000c:
3236 case 0x0085:
3237 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3238 break;
3239 }
3240 }
3241 break;
3242
3243 case OP_NOT_HSPACE:
3244 for (i = 1; i <= min; i++)
3245 {
3246 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3247 switch(*eptr++)
3248 {
3249 default: break;
3250 case 0x09: /* HT */
3251 case 0x20: /* SPACE */
3252 case 0xa0: /* NBSP */
3253 RRETURN(MATCH_NOMATCH);
3254 }
3255 }
3256 break;
3257
3258 case OP_HSPACE:
3259 for (i = 1; i <= min; i++)
3260 {
3261 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3262 switch(*eptr++)
3263 {
3264 default: RRETURN(MATCH_NOMATCH);
3265 case 0x09: /* HT */
3266 case 0x20: /* SPACE */
3267 case 0xa0: /* NBSP */
3268 break;
3269 }
3270 }
3271 break;
3272
3273 case OP_NOT_VSPACE:
3274 for (i = 1; i <= min; i++)
3275 {
3276 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3277 switch(*eptr++)
3278 {
3279 default: break;
3280 case 0x0a: /* LF */
3281 case 0x0b: /* VT */
3282 case 0x0c: /* FF */
3283 case 0x0d: /* CR */
3284 case 0x85: /* NEL */
3285 RRETURN(MATCH_NOMATCH);
3286 }
3287 }
3288 break;
3289
3290 case OP_VSPACE:
3291 for (i = 1; i <= min; i++)
3292 {
3293 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3294 switch(*eptr++)
3295 {
3296 default: RRETURN(MATCH_NOMATCH);
3297 case 0x0a: /* LF */
3298 case 0x0b: /* VT */
3299 case 0x0c: /* FF */
3300 case 0x0d: /* CR */
3301 case 0x85: /* NEL */
3302 break;
3303 }
3304 }
3305 break;
3306
3307 case OP_NOT_DIGIT:
3308 for (i = 1; i <= min; i++)
3309 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3310 break;
3311
3312 case OP_DIGIT:
3313 for (i = 1; i <= min; i++)
3314 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3315 break;
3316
3317 case OP_NOT_WHITESPACE:
3318 for (i = 1; i <= min; i++)
3319 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3320 break;
3321
3322 case OP_WHITESPACE:
3323 for (i = 1; i <= min; i++)
3324 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3325 break;
3326
3327 case OP_NOT_WORDCHAR:
3328 for (i = 1; i <= min; i++)
3329 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3330 RRETURN(MATCH_NOMATCH);
3331 break;
3332
3333 case OP_WORDCHAR:
3334 for (i = 1; i <= min; i++)
3335 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3336 RRETURN(MATCH_NOMATCH);
3337 break;
3338
3339 default:
3340 RRETURN(PCRE_ERROR_INTERNAL);
3341 }
3342 }
3343
3344 /* If min = max, continue at the same level without recursing */
3345
3346 if (min == max) continue;
3347
3348 /* If minimizing, we have to test the rest of the pattern before each
3349 subsequent match. Again, separate the UTF-8 case for speed, and also
3350 separate the UCP cases. */
3351
3352 if (minimize)
3353 {
3354 #ifdef SUPPORT_UCP
3355 if (prop_type >= 0)
3356 {
3357 switch(prop_type)
3358 {
3359 case PT_ANY:
3360 for (fi = min;; fi++)
3361 {
3362 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3363 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3364 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3365 GETCHARINC(c, eptr);
3366 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3367 }
3368 /* Control never gets here */
3369
3370 case PT_LAMP:
3371 for (fi = min;; fi++)
3372 {
3373 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3374 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3375 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3376 GETCHARINC(c, eptr);
3377 prop_chartype = UCD_CHARTYPE(c);
3378 if ((prop_chartype == ucp_Lu ||
3379 prop_chartype == ucp_Ll ||
3380 prop_chartype == ucp_Lt) == prop_fail_result)
3381 RRETURN(MATCH_NOMATCH);
3382 }
3383 /* Control never gets here */
3384
3385 case PT_GC:
3386 for (fi = min;; fi++)
3387 {
3388 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3389 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3390 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3391 GETCHARINC(c, eptr);
3392 prop_category = UCD_CATEGORY(c);
3393 if ((prop_category == prop_value) == prop_fail_result)
3394 RRETURN(MATCH_NOMATCH);
3395 }
3396 /* Control never gets here */
3397
3398 case PT_PC:
3399 for (fi = min;; fi++)
3400 {
3401 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3402 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3403 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3404 GETCHARINC(c, eptr);
3405 prop_chartype = UCD_CHARTYPE(c);
3406 if ((prop_chartype == prop_value) == prop_fail_result)
3407 RRETURN(MATCH_NOMATCH);
3408 }
3409 /* Control never gets here */
3410
3411 case PT_SC:
3412 for (fi = min;; fi++)
3413 {
3414 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3415 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3416 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3417 GETCHARINC(c, eptr);
3418 prop_script = UCD_SCRIPT(c);
3419 if ((prop_script == prop_value) == prop_fail_result)
3420 RRETURN(MATCH_NOMATCH);
3421 }
3422 /* Control never gets here */
3423
3424 default:
3425 RRETURN(PCRE_ERROR_INTERNAL);
3426 }
3427 }
3428
3429 /* Match extended Unicode sequences. We will get here only if the
3430 support is in the binary; otherwise a compile-time error occurs. */
3431
3432 else if (ctype == OP_EXTUNI)
3433 {
3434 for (fi = min;; fi++)
3435 {
3436 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3437 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3438 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3439 GETCHARINCTEST(c, eptr);
3440 prop_category = UCD_CATEGORY(c);
3441 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3442 while (eptr < md->end_subject)
3443 {
3444 int len = 1;
3445 if (!utf8) c = *eptr; else
3446 {
3447 GETCHARLEN(c, eptr, len);
3448 }
3449 prop_category = UCD_CATEGORY(c);
3450 if (prop_category != ucp_M) break;
3451 eptr += len;
3452 }
3453 }
3454 }
3455
3456 else
3457 #endif /* SUPPORT_UCP */
3458
3459 #ifdef SUPPORT_UTF8
3460 /* UTF-8 mode */
3461 if (utf8)
3462 {
3463 for (fi = min;; fi++)
3464 {
3465 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3466 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3467 if (fi >= max || eptr >= md->end_subject ||
3468 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3469 RRETURN(MATCH_NOMATCH);
3470
3471 GETCHARINC(c, eptr);
3472 switch(ctype)
3473 {
3474 case OP_ANY: /* This is the non-NL case */
3475 case OP_ALLANY:
3476 case OP_ANYBYTE:
3477 break;
3478
3479 case OP_ANYNL:
3480 switch(c)
3481 {
3482 default: RRETURN(MATCH_NOMATCH);
3483 case 0x000d:
3484 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3485 break;
3486 case 0x000a:
3487 break;
3488
3489 case 0x000b:
3490 case 0x000c:
3491 case 0x0085:
3492 case 0x2028:
3493 case 0x2029:
3494 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3495 break;
3496 }
3497 break;
3498
3499 case OP_NOT_HSPACE:
3500 switch(c)
3501 {
3502 default: break;
3503 case 0x09: /* HT */
3504 case 0x20: /* SPACE */
3505 case 0xa0: /* NBSP */
3506 case 0x1680: /* OGHAM SPACE MARK */
3507 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3508 case 0x2000: /* EN QUAD */
3509 case 0x2001: /* EM QUAD */
3510 case 0x2002: /* EN SPACE */
3511 case 0x2003: /* EM SPACE */
3512 case 0x2004: /* THREE-PER-EM SPACE */
3513 case 0x2005: /* FOUR-PER-EM SPACE */
3514 case 0x2006: /* SIX-PER-EM SPACE */
3515 case 0x2007: /* FIGURE SPACE */
3516 case 0x2008: /* PUNCTUATION SPACE */
3517 case 0x2009: /* THIN SPACE */
3518 case 0x200A: /* HAIR SPACE */
3519 case 0x202f: /* NARROW NO-BREAK SPACE */
3520 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3521 case 0x3000: /* IDEOGRAPHIC SPACE */
3522 RRETURN(MATCH_NOMATCH);
3523 }
3524 break;
3525
3526 case OP_HSPACE:
3527 switch(c)
3528 {
3529 default: RRETURN(MATCH_NOMATCH);
3530 case 0x09: /* HT */
3531 case 0x20: /* SPACE */
3532 case 0xa0: /* NBSP */
3533 case 0x1680: /* OGHAM SPACE MARK */
3534 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3535 case 0x2000: /* EN QUAD */
3536 case 0x2001: /* EM QUAD */
3537 case 0x2002: /* EN SPACE */
3538 case 0x2003: /* EM SPACE */
3539 case 0x2004: /* THREE-PER-EM SPACE */
3540 case 0x2005: /* FOUR-PER-EM SPACE */
3541 case 0x2006: /* SIX-PER-EM SPACE */
3542 case 0x2007: /* FIGURE SPACE */
3543 case 0x2008: /* PUNCTUATION SPACE */
3544 case 0x2009: /* THIN SPACE */
3545 case 0x200A: /* HAIR SPACE */
3546 case 0x202f: /* NARROW NO-BREAK SPACE */
3547 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3548 case 0x3000: /* IDEOGRAPHIC SPACE */
3549 break;
3550 }
3551 break;
3552
3553 case OP_NOT_VSPACE:
3554 switch(c)
3555 {
3556 default: break;
3557 case 0x0a: /* LF */
3558 case 0x0b: /* VT */
3559 case 0x0c: /* FF */
3560 case 0x0d: /* CR */
3561 case 0x85: /* NEL */
3562 case 0x2028: /* LINE SEPARATOR */
3563 case 0x2029: /* PARAGRAPH SEPARATOR */
3564 RRETURN(MATCH_NOMATCH);
3565 }
3566 break;
3567
3568 case OP_VSPACE:
3569 switch(c)
3570 {
3571 default: RRETURN(MATCH_NOMATCH);
3572 case 0x0a: /* LF */
3573 case 0x0b: /* VT */
3574 case 0x0c: /* FF */
3575 case 0x0d: /* CR */
3576 case 0x85: /* NEL */
3577 case 0x2028: /* LINE SEPARATOR */
3578 case 0x2029: /* PARAGRAPH SEPARATOR */
3579 break;
3580 }
3581 break;
3582
3583 case OP_NOT_DIGIT:
3584 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3585 RRETURN(MATCH_NOMATCH);
3586 break;
3587
3588 case OP_DIGIT:
3589 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3590 RRETURN(MATCH_NOMATCH);
3591 break;
3592
3593 case OP_NOT_WHITESPACE:
3594 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3595 RRETURN(MATCH_NOMATCH);
3596 break;
3597
3598 case OP_WHITESPACE:
3599 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3600 RRETURN(MATCH_NOMATCH);
3601 break;
3602
3603 case OP_NOT_WORDCHAR:
3604 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3605 RRETURN(MATCH_NOMATCH);
3606 break;
3607
3608 case OP_WORDCHAR:
3609 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3610 RRETURN(MATCH_NOMATCH);
3611 break;
3612
3613 default:
3614 RRETURN(PCRE_ERROR_INTERNAL);
3615 }
3616 }
3617 }
3618 else
3619 #endif
3620 /* Not UTF-8 mode */
3621 {
3622 for (fi = min;; fi++)
3623 {
3624 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3625 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3626 if (fi >= max || eptr >= md->end_subject ||
3627 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3628 RRETURN(MATCH_NOMATCH);
3629
3630 c = *eptr++;
3631 switch(ctype)
3632 {
3633 case OP_ANY: /* This is the non-NL case */
3634 case OP_ALLANY:
3635 case OP_ANYBYTE:
3636 break;
3637
3638 case OP_ANYNL:
3639 switch(c)
3640 {
3641 default: RRETURN(MATCH_NOMATCH);
3642 case 0x000d:
3643 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3644 break;
3645
3646 case 0x000a:
3647 break;
3648
3649 case 0x000b:
3650 case 0x000c:
3651 case 0x0085:
3652 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3653 break;
3654 }
3655 break;
3656
3657 case OP_NOT_HSPACE:
3658 switch(c)
3659 {
3660 default: break;
3661 case 0x09: /* HT */
3662 case 0x20: /* SPACE */
3663 case 0xa0: /* NBSP */
3664 RRETURN(MATCH_NOMATCH);
3665 }
3666 break;
3667
3668 case OP_HSPACE:
3669 switch(c)
3670 {
3671 default: RRETURN(MATCH_NOMATCH);
3672 case 0x09: /* HT */
3673 case 0x20: /* SPACE */
3674 case 0xa0: /* NBSP */
3675 break;
3676 }
3677 break;
3678
3679 case OP_NOT_VSPACE:
3680 switch(c)
3681 {
3682 default: break;
3683 case 0x0a: /* LF */
3684 case 0x0b: /* VT */
3685 case 0x0c: /* FF */
3686 case 0x0d: /* CR */
3687 case 0x85: /* NEL */
3688 RRETURN(MATCH_NOMATCH);
3689 }
3690 break;
3691
3692 case OP_VSPACE:
3693 switch(c)
3694 {
3695 default: RRETURN(MATCH_NOMATCH);
3696 case 0x0a: /* LF */
3697 case 0x0b: /* VT */
3698 case 0x0c: /* FF */
3699 case 0x0d: /* CR */
3700 case 0x85: /* NEL */
3701 break;
3702 }
3703 break;
3704
3705 case OP_NOT_DIGIT:
3706 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3707 break;
3708
3709 case OP_DIGIT:
3710 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3711 break;
3712
3713 case OP_NOT_WHITESPACE:
3714 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3715 break;
3716
3717 case OP_WHITESPACE:
3718 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3719 break;
3720
3721 case OP_NOT_WORDCHAR:
3722 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3723 break;
3724
3725 case OP_WORDCHAR:
3726 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3727 break;
3728
3729 default:
3730 RRETURN(PCRE_ERROR_INTERNAL);
3731 }
3732 }
3733 }
3734 /* Control never gets here */
3735 }
3736
3737 /* If maximizing, it is worth using inline code for speed, doing the type
3738 test once at the start (i.e. keep it out of the loop). Again, keep the
3739 UTF-8 and UCP stuff separate. */
3740
3741 else
3742 {
3743 pp = eptr; /* Remember where we started */
3744
3745 #ifdef SUPPORT_UCP
3746 if (prop_type >= 0)
3747 {
3748 switch(prop_type)
3749 {
3750 case PT_ANY:
3751 for (i = min; i < max; i++)
3752 {
3753 int len = 1;
3754 if (eptr >= md->end_subject) break;
3755 GETCHARLEN(c, eptr, len);
3756 if (prop_fail_result) break;
3757 eptr+= len;
3758 }
3759 break;
3760
3761 case PT_LAMP:
3762 for (i = min; i < max; i++)
3763 {
3764 int len = 1;
3765 if (eptr >= md->end_subject) break;
3766 GETCHARLEN(c, eptr, len);
3767 prop_chartype = UCD_CHARTYPE(c);
3768 if ((prop_chartype == ucp_Lu ||
3769 prop_chartype == ucp_Ll ||
3770 prop_chartype == ucp_Lt) == prop_fail_result)
3771 break;
3772 eptr+= len;
3773 }
3774 break;
3775
3776 case PT_GC:
3777 for (i = min; i < max; i++)
3778 {
3779 int len = 1;
3780 if (eptr >= md->end_subject) break;
3781 GETCHARLEN(c, eptr, len);
3782 prop_category = UCD_CATEGORY(c);
3783 if ((prop_category == prop_value) == prop_fail_result)
3784 break;
3785 eptr+= len;
3786 }
3787 break;
3788
3789 case PT_PC:
3790 for (i = min; i < max; i++)
3791 {
3792 int len = 1;
3793 if (eptr >= md->end_subject) break;
3794 GETCHARLEN(c, eptr, len);
3795 prop_chartype = UCD_CHARTYPE(c);
3796 if ((prop_chartype == prop_value) == prop_fail_result)
3797 break;
3798 eptr+= len;
3799 }
3800 break;
3801
3802 case PT_SC:
3803 for (i = min; i < max; i++)
3804 {
3805 int len = 1;
3806 if (eptr >= md->end_subject) break;
3807 GETCHARLEN(c, eptr, len);
3808 prop_script = UCD_SCRIPT(c);
3809 if ((prop_script == prop_value) == prop_fail_result)
3810 break;
3811 eptr+= len;
3812 }
3813 break;
3814 }
3815
3816 /* eptr is now past the end of the maximum run */
3817
3818 if (possessive) continue;
3819 for(;;)
3820 {
3821 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3822 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3823 if (eptr-- == pp) break; /* Stop if tried at original pos */
3824 if (utf8) BACKCHAR(eptr);
3825 }
3826 }
3827
3828 /* Match extended Unicode sequences. We will get here only if the
3829 support is in the binary; otherwise a compile-time error occurs. */
3830
3831 else if (ctype == OP_EXTUNI)
3832 {
3833 for (i = min; i < max; i++)
3834 {
3835 if (eptr >= md->end_subject) break;
3836 GETCHARINCTEST(c, eptr);
3837 prop_category = UCD_CATEGORY(c);
3838 if (prop_category == ucp_M) break;
3839 while (eptr < md->end_subject)
3840 {
3841 int len = 1;
3842 if (!utf8) c = *eptr; else
3843 {
3844 GETCHARLEN(c, eptr, len);
3845 }
3846 prop_category = UCD_CATEGORY(c);
3847 if (prop_category != ucp_M) break;
3848 eptr += len;
3849 }
3850 }
3851
3852 /* eptr is now past the end of the maximum run */
3853
3854 if (possessive) continue;
3855 for(;;)
3856 {
3857 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3858 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3859 if (eptr-- == pp) break; /* Stop if tried at original pos */
3860 for (;;) /* Move back over one extended */
3861 {
3862 int len = 1;
3863 if (!utf8) c = *eptr; else
3864 {
3865 BACKCHAR(eptr);
3866 GETCHARLEN(c, eptr, len);
3867 }
3868 prop_category = UCD_CATEGORY(c);
3869 if (prop_category != ucp_M) break;
3870 eptr--;
3871 }
3872 }
3873 }
3874
3875 else
3876 #endif /* SUPPORT_UCP */
3877
3878 #ifdef SUPPORT_UTF8
3879 /* UTF-8 mode */
3880
3881 if (utf8)
3882 {
3883 switch(ctype)
3884 {
3885 case OP_ANY:
3886 if (max < INT_MAX)
3887 {
3888 for (i = min; i < max; i++)
3889 {
3890 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3891 eptr++;
3892 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3893 }
3894 }
3895
3896 /* Handle unlimited UTF-8 repeat */
3897
3898 else
3899 {
3900 for (i = min; i < max; i++)
3901 {
3902 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3903 eptr++;
3904 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3905 }
3906 }
3907 break;
3908
3909 case OP_ALLANY:
3910 if (max < INT_MAX)
3911 {
3912 for (i = min; i < max; i++)
3913 {
3914 if (eptr >= md->end_subject) break;
3915 eptr++;
3916 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3917 }
3918 }
3919 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3920 break;
3921
3922 /* The byte case is the same as non-UTF8 */
3923
3924 case OP_ANYBYTE:
3925 c = max - min;
3926 if (c > (unsigned int)(md->end_subject - eptr))
3927 c = md->end_subject - eptr;
3928 eptr += c;
3929 break;
3930
3931 case OP_ANYNL:
3932 for (i = min; i < max; i++)
3933 {
3934 int len = 1;
3935 if (eptr >= md->end_subject) break;
3936 GETCHARLEN(c, eptr, len);
3937 if (c == 0x000d)
3938 {
3939 if (++eptr >= md->end_subject) break;
3940 if (*eptr == 0x000a) eptr++;
3941 }
3942 else
3943 {
3944 if (c != 0x000a &&
3945 (md->bsr_anycrlf ||
3946 (c != 0x000b && c != 0x000c &&
3947 c != 0x0085 && c != 0x2028 && c != 0x2029)))
3948 break;
3949 eptr += len;
3950 }
3951 }
3952 break;
3953
3954 case OP_NOT_HSPACE:
3955 case OP_HSPACE:
3956 for (i = min; i < max; i++)
3957 {
3958 BOOL gotspace;
3959 int len = 1;
3960 if (eptr >= md->end_subject) break;
3961 GETCHARLEN(c, eptr, len);
3962 switch(c)
3963 {
3964 default: gotspace = FALSE; break;
3965 case 0x09: /* HT */
3966 case 0x20: /* SPACE */
3967 case 0xa0: /* NBSP */
3968 case 0x1680: /* OGHAM SPACE MARK */
3969 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3970 case 0x2000: /* EN QUAD */
3971 case 0x2001: /* EM QUAD */
3972 case 0x2002: /* EN SPACE */
3973 case 0x2003: /* EM SPACE */
3974 case 0x2004: /* THREE-PER-EM SPACE */
3975 case 0x2005: /* FOUR-PER-EM SPACE */
3976 case 0x2006: /* SIX-PER-EM SPACE */
3977 case 0x2007: /* FIGURE SPACE */
3978 case 0x2008: /* PUNCTUATION SPACE */
3979 case 0x2009: /* THIN SPACE */
3980 case 0x200A: /* HAIR SPACE */
3981 case 0x202f: /* NARROW NO-BREAK SPACE */
3982 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3983 case 0x3000: /* IDEOGRAPHIC SPACE */
3984 gotspace = TRUE;
3985 break;
3986 }
3987 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3988 eptr += len;
3989 }
3990 break;
3991
3992 case OP_NOT_VSPACE:
3993 case OP_VSPACE:
3994 for (i = min; i < max; i++)
3995 {
3996 BOOL gotspace;
3997 int len = 1;
3998 if (eptr >= md->end_subject) break;
3999 GETCHARLEN(c, eptr, len);
4000 switch(c)
4001 {
4002 default: gotspace = FALSE; break;
4003 case 0x0a: /* LF */
4004 case 0x0b: /* VT */
4005 case 0x0c: /* FF */
4006 case 0x0d: /* CR */
4007 case 0x85: /* NEL */
4008 case 0x2028: /* LINE SEPARATOR */
4009 case 0x2029: /* PARAGRAPH SEPARATOR */
4010 gotspace = TRUE;
4011 break;
4012 }
4013 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4014 eptr += len;
4015 }
4016 break;
4017
4018 case OP_NOT_DIGIT:
4019 for (i = min; i < max; i++)
4020 {
4021 int len = 1;
4022 if (eptr >= md->end_subject) break;
4023 GETCHARLEN(c, eptr, len);
4024 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4025 eptr+= len;
4026 }
4027 break;
4028
4029 case OP_DIGIT:
4030 for (i = min; i < max; i++)
4031 {
4032 int len = 1;
4033 if (eptr >= md->end_subject) break;
4034 GETCHARLEN(c, eptr, len);
4035 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4036 eptr+= len;
4037 }
4038 break;
4039
4040 case OP_NOT_WHITESPACE:
4041 for (i = min; i < max; i++)
4042 {
4043 int len = 1;
4044 if (eptr >= md->end_subject) break;
4045 GETCHARLEN(c, eptr, len);
4046 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4047 eptr+= len;
4048 }
4049 break;
4050
4051 case OP_WHITESPACE:
4052 for (i = min; i < max; i++)
4053 {
4054 int len = 1;
4055 if (eptr >= md->end_subject) break;
4056 GETCHARLEN(c, eptr, len);
4057 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4058 eptr+= len;
4059 }
4060 break;
4061
4062 case OP_NOT_WORDCHAR:
4063 for (i = min; i < max; i++)
4064 {
4065 int len = 1;
4066 if (eptr >= md->end_subject) break;
4067 GETCHARLEN(c, eptr, len);
4068 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4069 eptr+= len;
4070 }
4071 break;
4072
4073 case OP_WORDCHAR:
4074 for (i = min; i < max; i++)
4075 {
4076 int len = 1;
4077 if (eptr >= md->end_subject) break;
4078 GETCHARLEN(c, eptr, len);
4079 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4080 eptr+= len;
4081 }
4082 break;
4083
4084 default:
4085 RRETURN(PCRE_ERROR_INTERNAL);
4086 }
4087
4088 /* eptr is now past the end of the maximum run */
4089
4090 if (possessive) continue;
4091 for(;;)
4092 {
4093 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4094 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4095 if (eptr-- == pp) break; /* Stop if tried at original pos */
4096 BACKCHAR(eptr);
4097 }
4098 }
4099 else
4100 #endif /* SUPPORT_UTF8 */
4101
4102 /* Not UTF-8 mode */
4103 {
4104 switch(ctype)
4105 {
4106 case OP_ANY:
4107 for (i = min; i < max; i++)
4108 {
4109 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4110 eptr++;
4111 }
4112 break;
4113
4114 case OP_ALLANY:
4115 case OP_ANYBYTE:
4116 c = max - min;
4117 if (c > (unsigned int)(md->end_subject - eptr))
4118 c = md->end_subject - eptr;
4119 eptr += c;
4120 break;
4121
4122 case OP_ANYNL:
4123 for (i = min; i < max; i++)
4124 {
4125 if (eptr >= md->end_subject) break;
4126 c = *eptr;
4127 if (c == 0x000d)
4128 {
4129 if (++eptr >= md->end_subject) break;
4130 if (*eptr == 0x000a) eptr++;
4131 }
4132 else
4133 {
4134 if (c != 0x000a &&
4135 (md->bsr_anycrlf ||
4136 (c != 0x000b && c != 0x000c && c != 0x0085)))
4137 break;
4138 eptr++;
4139 }
4140 }
4141 break;
4142
4143 case OP_NOT_HSPACE:
4144 for (i = min; i < max; i++)
4145 {
4146 if (eptr >= md->end_subject) break;
4147 c = *eptr;
4148 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4149 eptr++;
4150 }
4151 break;
4152
4153 case OP_HSPACE:
4154 for (i = min; i < max; i++)
4155 {
4156 if (eptr >= md->end_subject) break;
4157 c = *eptr;
4158 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4159 eptr++;
4160 }
4161 break;
4162
4163 case OP_NOT_VSPACE:
4164 for (i = min; i < max; i++)
4165 {
4166 if (eptr >= md->end_subject) break;
4167 c = *eptr;
4168 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4169 break;
4170 eptr++;
4171 }
4172 break;
4173
4174 case OP_VSPACE:
4175 for (i = min; i < max; i++)
4176 {
4177 if (eptr >= md->end_subject) break;
4178 c = *eptr;
4179 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4180 break;
4181 eptr++;
4182 }
4183 break;
4184
4185 case OP_NOT_DIGIT:
4186 for (i = min; i < max; i++)
4187 {
4188 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4189 break;
4190 eptr++;
4191 }
4192 break;
4193
4194 case OP_DIGIT:
4195 for (i = min; i < max; i++)
4196 {
4197 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4198 break;
4199 eptr++;
4200 }
4201 break;
4202
4203 case OP_NOT_WHITESPACE:
4204 for (i = min; i < max; i++)
4205 {
4206 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4207 break;
4208 eptr++;
4209 }
4210 break;
4211
4212 case OP_WHITESPACE:
4213 for (i = min; i < max; i++)
4214 {
4215 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4216 break;
4217 eptr++;
4218 }
4219 break;
4220
4221 case OP_NOT_WORDCHAR:
4222 for (i = min; i < max; i++)
4223 {
4224 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4225 break;
4226 eptr++;
4227 }
4228 break;
4229
4230 case OP_WORDCHAR:
4231 for (i = min; i < max; i++)
4232 {
4233 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4234 break;
4235 eptr++;
4236 }
4237 break;
4238
4239 default:
4240 RRETURN(PCRE_ERROR_INTERNAL);
4241 }
4242
4243 /* eptr is now past the end of the maximum run */
4244
4245 if (possessive) continue;
4246 while (eptr >= pp)
4247 {
4248 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4249 eptr--;
4250 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4251 }
4252 }
4253
4254 /* Get here if we can't make it match with any permitted repetitions */
4255
4256 RRETURN(MATCH_NOMATCH);
4257 }
4258 /* Control never gets here */
4259
4260 /* There's been some horrible disaster. Arrival here can only mean there is
4261 something seriously wrong in the code above or the OP_xxx definitions. */
4262
4263 default:
4264 DPRINTF(("Unknown opcode %d\n", *ecode));
4265 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4266 }
4267
4268 /* Do not stick any code in here without much thought; it is assumed
4269 that "continue" in the code above comes out to here to repeat the main
4270 loop. */
4271
4272 } /* End of main loop */
4273 /* Control never reaches here */
4274
4275
4276 /* When compiling to use the heap rather than the stack for recursive calls to
4277 match(), the RRETURN() macro jumps here. The number that is saved in
4278 frame->Xwhere indicates which label we actually want to return to. */
4279
4280 #ifdef NO_RECURSE
4281 #define LBL(val) case val: goto L_RM##val;
4282 HEAP_RETURN:
4283 switch (frame->Xwhere)
4284 {
4285 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4286 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4287 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4288 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4289 LBL(53) LBL(54)
4290 #ifdef SUPPORT_UTF8
4291 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4292 LBL(32) LBL(34) LBL(42) LBL(46)
4293 #ifdef SUPPORT_UCP
4294 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4295 #endif /* SUPPORT_UCP */
4296 #endif /* SUPPORT_UTF8 */
4297 default:
4298 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4299 return PCRE_ERROR_INTERNAL;
4300 }
4301 #undef LBL
4302 #endif /* NO_RECURSE */
4303 }
4304
4305
4306 /***************************************************************************
4307 ****************************************************************************
4308 RECURSION IN THE match() FUNCTION
4309
4310 Undefine all the macros that were defined above to handle this. */
4311
4312 #ifdef NO_RECURSE /* The names in this block are #undef'd only in the NO_RECURSE (heap rather than stack) build */
4313 #undef eptr
4314 #undef ecode
4315 #undef mstart
4316 #undef offset_top
4317 #undef ims
4318 #undef eptrb
4319 #undef flags
4320 /* NOTE(review): the group above looks like match()'s argument names and the groups below its working variables - confirm against the corresponding #define list earlier in the file */
4321 #undef callpat
4322 #undef charptr
4323 #undef data
4324 #undef next
4325 #undef pp
4326 #undef prev
4327 #undef saved_eptr
4328
4329 #undef new_recursive
4330
4331 #undef cur_is_word
4332 #undef condition
4333 #undef prev_is_word
4334
4335 #undef original_ims
4336
4337 #undef ctype
4338 #undef length
4339 #undef max
4340 #undef min
4341 #undef number
4342 #undef offset
4343 #undef op
4344 #undef save_capture_last
4345 #undef save_offset1
4346 #undef save_offset2
4347 #undef save_offset3
4348 #undef stacksave
4349
4350 #undef newptrb
4351
4352 #endif /* NO_RECURSE */
4353
4354 /* These two are defined as macros in both cases (that is, whether or not NO_RECURSE is defined), so they are undefined unconditionally, outside the #ifdef above. */
4355
4356 #undef fc
4357 #undef fi
4358
4359 /***************************************************************************
4360 ***************************************************************************/
4361
4362
4363
4364 /*************************************************
4365 * Execute a Regular Expression *
4366 *************************************************/
4367
4368 /* This function applies a compiled re to a subject string and picks out
4369 portions of the string if it matches. Two elements in the vector are set for
4370 each substring: the offsets to the start and end of the substring.
4371
4372 Arguments:
4373 argument_re points to the compiled expression
4374 extra_data points to extra data or is NULL
4375 subject points to the subject string
4376 length length of subject string (may contain binary zeros)
4377 start_offset where to start in the subject string
4378 options option bits
4379 offsets points to a vector of ints to be filled in with offsets
4380 offsetcount the number of elements in the vector
4381
4382 Returns: > 0 => success; value is the number of elements filled in
4383 = 0 => success, but offsets is not big enough
4384 -1 => failed to match
4385 < -1 => some kind of unexpected problem
4386 */
4387
4388 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4389 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4390 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4391 int offsetcount)
4392 {
4393 int rc, resetcount, ocount;
4394 int first_byte = -1;
4395 int req_byte = -1;
4396 int req_byte2 = -1;
4397 int newline;
4398 unsigned long int ims;
4399 BOOL using_temporary_offsets = FALSE;
4400 BOOL anchored;
4401 BOOL startline;
4402 BOOL firstline;
4403 BOOL first_byte_caseless = FALSE;
4404 BOOL req_byte_caseless = FALSE;
4405 BOOL utf8;
4406 match_data match_block;
4407 match_data *md = &match_block;
4408 const uschar *tables;
4409 const uschar *start_bits = NULL;
4410 USPTR start_match = (USPTR)subject + start_offset;
4411 USPTR end_subject;
4412 USPTR req_byte_ptr = start_match - 1;
4413
4414 pcre_study_data internal_study;
4415 const pcre_study_data *study;
4416
4417 real_pcre internal_re;
4418 const real_pcre *external_re = (const real_pcre *)argument_re;
4419 const real_pcre *re = external_re;
4420
4421 /* Plausibility checks */
4422
4423 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4424 if (re == NULL || subject == NULL ||
4425 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4426 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4427
4428 /* Fish out the optional data from the extra_data structure, first setting
4429 the default values. */
4430
4431 study = NULL;
4432 md->match_limit = MATCH_LIMIT;
4433 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4434 md->callout_data = NULL;
4435
4436 /* The table pointer is always in native byte order. */
4437
4438 tables = external_re->tables;
4439
4440 if (extra_data != NULL)
4441 {
4442 register unsigned int flags = extra_data->flags;
4443 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4444 study = (const pcre_study_data *)extra_data->study_data;
4445 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4446 md->match_limit = extra_data->match_limit;
4447 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4448 md->match_limit_recursion = extra_data->match_limit_recursion;
4449 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4450 md->callout_data = extra_data->callout_data;
4451 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4452 }
4453
4454 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4455 is a feature that makes it possible to save compiled regex and re-use them
4456 in other programs later. */
4457
4458 if (tables == NULL) tables = _pcre_default_tables;
4459
4460 /* Check that the first field in the block is the magic number. If it is not,
4461 test for a regex that was compiled on a host of opposite endianness. If this is
4462 the case, flipped values are put in internal_re and internal_study if there was
4463 study data too. */
4464
4465 if (re->magic_number != MAGIC_NUMBER)
4466 {
4467 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4468 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4469 if (study != NULL) study = &internal_study;
4470 }
4471
4472 /* Set up other data */
4473
4474 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4475 startline = (re->flags & PCRE_STARTLINE) != 0;
4476 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4477
4478 /* The code starts after the real_pcre block and the capture name table. */
4479
4480 md->start_code = (const uschar *)external_re + re->name_table_offset +
4481 re->name_count * re->name_entry_size;
4482
4483 md->start_subject = (USPTR)subject;
4484 md->start_offset = start_offset;
4485 md->end_subject = md->start_subject + length;
4486 end_subject = md->end_subject;
4487
4488 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4489 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4490 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4491
4492 md->notbol = (options & PCRE_NOTBOL) != 0;
4493 md->noteol = (options & PCRE_NOTEOL) != 0;
4494 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4495 md->partial = (options & PCRE_PARTIAL) != 0;
4496 md->hitend = FALSE;
4497
4498 md->recursive = NULL; /* No recursion at top level */
4499
4500 md->lcc = tables + lcc_offset;
4501 md->ctypes = tables + ctypes_offset;
4502
4503 /* Handle different \R options. */
4504
4505 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4506 {
4507 case 0:
4508 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4509 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4510 else
4511 #ifdef BSR_ANYCRLF
4512 md->bsr_anycrlf = TRUE;
4513 #else
4514 md->bsr_anycrlf = FALSE;
4515 #endif
4516 break;
4517
4518 case PCRE_BSR_ANYCRLF:
4519 md->bsr_anycrlf = TRUE;
4520 break;
4521
4522 case PCRE_BSR_UNICODE:
4523 md->bsr_anycrlf = FALSE;
4524 break;
4525
4526 default: return PCRE_ERROR_BADNEWLINE;
4527 }
4528
4529 /* Handle different types of newline. The three bits give eight cases. If
4530 nothing is set at run time, whatever was used at compile time applies. */
4531
4532 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4533 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4534 {
4535 case 0: newline = NEWLINE; break; /* Compile-time default */
4536 case PCRE_NEWLINE_CR: newline = '\r'; break;
4537 case PCRE_NEWLINE_LF: newline = '\n'; break;
4538 case PCRE_NEWLINE_CR+
4539 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4540 case PCRE_NEWLINE_ANY: newline = -1; break;
4541 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4542 default: return PCRE_ERROR_BADNEWLINE;
4543 }
4544
4545 if (newline == -2)
4546 {
4547 md->nltype = NLTYPE_ANYCRLF;
4548 }
4549 else if (newline < 0)
4550 {
4551 md->nltype = NLTYPE_ANY;
4552 }
4553 else
4554 {
4555 md->nltype = NLTYPE_FIXED;
4556 if (newline > 255)
4557 {
4558 md->nllen = 2;
4559 md->nl[0] = (newline >> 8) & 255;
4560 md->nl[1] = newline & 255;
4561 }
4562 else
4563 {
4564 md->nllen = 1;
4565 md->nl[0] = newline;
4566 }
4567 }
4568
4569 /* Partial matching is supported only for a restricted set of regexes at the
4570 moment. */
4571
4572 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4573 return PCRE_ERROR_BADPARTIAL;
4574
4575 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4576 back the character offset. */
4577
4578 #ifdef SUPPORT_UTF8
4579 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4580 {
4581 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4582 return PCRE_ERROR_BADUTF8;
4583 if (start_offset > 0 && start_offset < length)
4584 {
4585 int tb = ((uschar *)subject)[start_offset];
4586 if (tb > 127)
4587 {
4588 tb &= 0xc0;
4589 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4590 }
4591 }
4592 }
4593 #endif
4594
4595 /* The ims options can vary during the matching as a result of the presence
4596 of (?ims) items in the pattern. They are kept in a local variable so that
4597 restoring at the exit of a group is easy. */
4598
4599 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4600
4601 /* If the expression has got more back references than the offsets supplied can
4602 hold, we get a temporary chunk of working store to use during the matching.
4603 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4604 of 3. */
4605
4606 ocount = offsetcount - (offsetcount % 3);
4607
4608 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4609 {
4610 ocount = re->top_backref * 3 + 3;
4611 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4612 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4613 using_temporary_offsets = TRUE;
4614 DPRINTF(("Got memory to hold back references\n"));
4615 }
4616 else md->offset_vector = offsets;
4617
4618 md->offset_end = ocount;
4619 md->offset_max = (2*ocount)/3;
4620 md->offset_overflow = FALSE;
4621 md->capture_last = -1;
4622
4623 /* Compute the minimum number of offsets that we need to reset each time. Doing
4624 this makes a huge difference to execution time when there aren't many brackets
4625 in the pattern. */
4626
4627 resetcount = 2 + re->top_bracket * 2;
4628 if (resetcount > offsetcount) resetcount = ocount;
4629
4630 /* Reset the working variable associated with each extraction. These should
4631 never be used unless previously set, but they get saved and restored, and so we
4632 initialize them to avoid reading uninitialized locations. */
4633
4634 if (md->offset_vector != NULL)
4635 {
4636 register int *iptr = md->offset_vector + ocount;
4637 register int *iend = iptr - resetcount/2 + 1;
4638 while (--iptr >= iend) *iptr = -1;
4639 }
4640
4641 /* Set up the first character to match, if available. The first_byte value is
4642 never set for an anchored regular expression, but the anchoring may be forced
4643 at run time, so we have to test for anchoring. The first char may be unset for
4644 an unanchored pattern, of course. If there's no first char and the pattern was
4645 studied, there may be a bitmap of possible first characters. */
4646
4647 if (!anchored)
4648 {
4649 if ((re->flags & PCRE_FIRSTSET) != 0)
4650 {
4651 first_byte = re->first_byte & 255;
4652 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4653 first_byte = md->lcc[first_byte];
4654 }
4655 else
4656 if (!startline && study != NULL &&
4657 (study->options & PCRE_STUDY_MAPPED) != 0)
4658 start_bits = study->start_bits;
4659 }
4660
4661 /* For anchored or unanchored matches, there may be a "last known required
4662 character" set. */
4663
4664 if ((re->flags & PCRE_REQCHSET) != 0)
4665 {
4666 req_byte = re->req_byte & 255;
4667 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4668 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4669 }
4670
4671
4672 /* ==========================================================================*/
4673
4674 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4675 the loop runs just once. */
4676
4677 for(;;)
4678 {
4679 USPTR save_end_subject = end_subject;
4680 USPTR new_start_match;
4681
4682 /* Reset the maximum number of extractions we might see. */
4683
4684 if (md->offset_vector != NULL)
4685 {
4686 register int *iptr = md->offset_vector;
4687 register int *iend = iptr + resetcount;
4688 while (iptr < iend) *iptr++ = -1;
4689 }
4690
4691 /* Advance to a unique first char if possible. If firstline is TRUE, the
4692 start of the match is constrained to the first line of a multiline string.
4693 That is, the match must be before or at the first newline. Implement this by
4694 temporarily adjusting end_subject so that we stop scanning at a newline. If
4695 the match fails at the newline, later code breaks this loop. */
4696
4697 if (firstline)
4698 {
4699 USPTR t = start_match;
4700 #ifdef SUPPORT_UTF8
4701 if (utf8)
4702 {
4703 while (t < md->end_subject && !IS_NEWLINE(t))
4704 {
4705 t++;
4706 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
4707 }
4708 }
4709 else
4710 #endif
4711 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4712 end_subject = t;
4713 }
4714
4715 /* Now advance to a unique first byte if there is one. */
4716
4717 if (first_byte >= 0)
4718 {
4719 if (first_byte_caseless)
4720 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
4721 start_match++;
4722 else
4723 while (start_match < end_subject && *start_match != first_byte)
4724 start_match++;
4725 }
4726
4727 /* Or to just after a linebreak for a multiline match */
4728
4729 else if (startline)
4730 {
4731 if (start_match > md->start_subject + start_offset)
4732 {
4733 #ifdef SUPPORT_UTF8
4734 if (utf8)
4735 {
4736 while (start_match < end_subject && !WAS_NEWLINE(start_match))
4737 {
4738 start_match++;
4739 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4740 start_match++;
4741 }
4742 }
4743 else
4744 #endif
4745 while (start_match < end_subject && !WAS_NEWLINE(start_match))
4746 start_match++;
4747
4748 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4749 and we are now at a LF, advance the match position by one more character.
4750 */
4751
4752 if (start_match[-1] == '\r' &&
4753 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4754 start_match < end_subject &&
4755 *start_match == '\n')
4756 start_match++;
4757 }
4758 }
4759
4760 /* Or to a non-unique first byte after study */
4761
4762 else if (start_bits != NULL)
4763 {
4764 while (start_match < end_subject)
4765 {
4766 register unsigned int c = *start_match;
4767 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
4768 else break;
4769 }
4770 }
4771
4772 /* Restore fudged end_subject */
4773
4774 end_subject = save_end_subject;
4775
4776 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4777 printf(">>>> Match against: ");
4778 pchars(start_match, end_subject - start_match, TRUE, md);
4779 printf("\n");
4780 #endif
4781
4782 /* If req_byte is set, we know that that character must appear in the subject
4783 for the match to succeed. If the first character is set, req_byte must be
4784 later in the subject; otherwise the test starts at the match point. This
4785 optimization can save a huge amount of backtracking in patterns with nested
4786 unlimited repeats that aren't going to match. Writing separate code for
4787 cased/caseless versions makes it go faster, as does using an autoincrement
4788 and backing off on a match.
4789
4790 HOWEVER: when the subject string is very, very long, searching to its end can
4791 take a long time, and give bad performance on quite ordinary patterns. This
4792 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4793 string... so we don't do this when the string is sufficiently long.
4794
4795 ALSO: this processing is disabled when partial matching is requested.
4796 */
4797
4798 if (req_byte >= 0 &&
4799 end_subject - start_match < REQ_BYTE_MAX &&
4800 !md->partial)
4801 {
4802 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4803
4804 /* We don't need to repeat the search if we haven't yet reached the
4805 place we found it at last time. */
4806
4807 if (p > req_byte_ptr)
4808 {
4809 if (req_byte_caseless)
4810 {
4811 while (p < end_subject)
4812 {
4813 register int pp = *p++;
4814 if (pp == req_byte || pp == req_byte2) { p--; break; }
4815 }
4816 }
4817 else
4818 {
4819 while (p < end_subject)
4820 {
4821 if (*p++ == req_byte) { p--; break; }
4822 }
4823 }
4824
4825 /* If we can't find the required character, break the matching loop,
4826 forcing a match failure. */
4827
4828 if (p >= end_subject)
4829 {
4830 rc = MATCH_NOMATCH;
4831 break;
4832 }
4833
4834 /* If we have found the required character, save the point where we
4835 found it, so that we don't search again next time round the loop if
4836 the start hasn't passed this character yet. */
4837
4838 req_byte_ptr = p;
4839 }
4840 }
4841
4842 /* OK, we can now run the match. */
4843
4844 md->start_match_ptr = start_match;
4845 md->match_call_count = 0;
4846 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4847
4848 switch(rc)
4849 {
4850 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4851 exactly like PRUNE. */
4852
4853 case MATCH_NOMATCH:
4854 case MATCH_PRUNE:
4855 case MATCH_THEN:
4856 new_start_match = start_match + 1;
4857 #ifdef SUPPORT_UTF8
4858 if (utf8)
4859 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4860 new_start_match++;
4861 #endif
4862 break;
4863
4864 /* SKIP passes back the next starting point explicitly. */
4865
4866 case MATCH_SKIP:
4867 new_start_match = md->start_match_ptr;
4868 break;
4869
4870 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4871
4872 case MATCH_COMMIT:
4873 rc = MATCH_NOMATCH;
4874 goto ENDLOOP;
4875
4876 /* Any other return is some kind of error. */
4877
4878 default:
4879 goto ENDLOOP;
4880 }
4881
4882 /* Control reaches here for the various types of "no match at this point"
4883 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4884
4885 rc = MATCH_NOMATCH;
4886
4887 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4888 newline in the subject (though it may continue over the newline). Therefore,
4889 if we have just failed to match, starting at a newline, do not continue. */
4890
4891 if (firstline && IS_NEWLINE(start_match)) break;
4892
4893 /* Advance to new matching position */
4894
4895 start_match = new_start_match;
4896
4897 /* Break the loop if the pattern is anchored or if we have passed the end of
4898 the subject. */
4899
4900 if (anchored || start_match > end_subject) break;
4901
4902 /* If we have just passed a CR and we are now at a LF, and the pattern does
4903 not contain any explicit matches for \r or \n, and the newline option is CRLF
4904 or ANY or ANYCRLF, advance the match position by one more character. */
4905
4906 if (start_match[-1] == '\r' &&
4907 start_match < end_subject &&
4908 *start_match == '\n' &&
4909 (re->flags & PCRE_HASCRORLF) == 0 &&
4910 (md->nltype == NLTYPE_ANY ||
4911 md->nltype == NLTYPE_ANYCRLF ||
4912 md->nllen == 2))
4913 start_match++;
4914
4915 } /* End of for(;;) "bumpalong" loop */
4916
4917 /* ==========================================================================*/
4918
4919 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4920 conditions is true:
4921
4922 (1) The pattern is anchored or the match was failed by (*COMMIT);
4923
4924 (2) We are past the end of the subject;
4925
4926 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4927 this option requests that a match occur at or before the first newline in
4928 the subject.
4929
4930 When we have a match and the offset vector is big enough to deal with any
4931 backreferences, captured substring offsets will already be set up. In the case
4932 where we had to get some local store to hold offsets for backreference
4933 processing, copy those that we can. In this case there need not be overflow if
4934 certain parts of the pattern were not used, even though there are more
4935 capturing parentheses than vector slots. */
4936
4937 ENDLOOP:
4938
4939 if (rc == MATCH_MATCH)
4940 {
4941 if (using_temporary_offsets)
4942 {
4943 if (offsetcount >= 4)
4944 {
4945 memcpy(offsets + 2, md->offset_vector + 2,
4946 (offsetcount - 2) * sizeof(int));
4947 DPRINTF(("Copied offsets from temporary memory\n"));
4948 }
4949 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4950 DPRINTF(("Freeing temporary memory\n"));
4951 (pcre_free)(md->offset_vector);
4952 }
4953
4954 /* Set the return code to the number of captured strings, or 0 if there are
4955 too many to fit into the vector. */
4956
4957 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4958
4959 /* If there is space, set up the whole thing as substring 0. The value of
4960 md->start_match_ptr might be modified if \K was encountered on the
4961 successful matching path. */
4962
4963 if (offsetcount < 2) rc = 0; else
4964 {
4965 offsets[0] = md->start_match_ptr - md->start_subject;
4966 offsets[1] = md->end_match_ptr - md->start_subject;
4967 }
4968
4969 DPRINTF((">>>> returning %d\n", rc));
4970 return rc;
4971 }
4972
4973 /* Control gets here if there has been an error, or if the overall match
4974 attempt has failed at all permitted starting positions. */
4975
4976 if (using_temporary_offsets)
4977 {
4978 DPRINTF(("Freeing temporary memory\n"));
4979 (pcre_free)(md->offset_vector);
4980 }
4981
4982 if (rc != MATCH_NOMATCH)
4983 {
4984 DPRINTF((">>>> error: returning %d\n", rc));
4985 return rc;
4986 }
4987 else if (md->partial && md->hitend)
4988 {
4989 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4990 return PCRE_ERROR_PARTIAL;
4991 }
4992 else
4993 {
4994 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4995 return PCRE_ERROR_NOMATCH;
4996 }
4997 }
4998
4999 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12