/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 305 - (show annotations) (download)
Sun Jan 20 20:07:32 2008 UTC (6 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 150319 byte(s)
Update copyright year to 2008.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Bits that may be set in the "flags" argument of match() */

#define match_condassert     0x01  /* Evaluating an assertion used as a condition */
#define match_cbegroup       0x02  /* Unlimited repeat group that could be empty */

/* Ordinary (non-error) results of match(). Error results are the externally
defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Internal-only results of match(). They are made strongly negative so that
they cannot collide with the external error codes. */

#define MATCH_COMMIT       (-999)
#define MATCH_PRUNE        (-998)
#define MATCH_SKIP         (-997)
#define MATCH_THEN         (-996)

/* Largest number of offset-vector ints that are saved on the stack for a
recursive call; a bigger offset vector is saved using malloc instead. Keep this
a multiple of 3, because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX 30
84
/* Minimum and maximum repeat counts for the common repeats, one entry per
repeat kind. In rep_max, a value of 0 stands for "no upper limit". */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if the requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caselesss case for speed */
162
163 if ((ims & PCRE_CASELESS) != 0)
164 {
165 while (length-- > 0)
166 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
167 }
168 else
169 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
170
171 return TRUE;
172 }
173
174
175
176 /***************************************************************************
177 ****************************************************************************
178 RECURSION IN THE match() FUNCTION
179
180 The match() function is highly recursive, though not every recursive call
181 increases the recursive depth. Nevertheless, some regular expressions can cause
182 it to recurse to a great depth. I was writing for Unix, so I just let it call
183 itself recursively. This uses the stack for saving everything that has to be
184 saved for a recursive call. On Unix, the stack can be large, and this works
185 fine.
186
187 It turns out that on some non-Unix-like systems there are problems with
188 programs that use a lot of stack. (This despite the fact that every last chip
189 has oodles of memory these days, and techniques for extending the stack have
190 been known for decades.) So....
191
192 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
193 calls by keeping local variables that need to be preserved in blocks of memory
194 obtained from malloc() instead of on the stack. Macros are used to
195 achieve this so that the actual code doesn't look very different to what it
196 always used to.
197
198 The original heap-recursive code used longjmp(). However, it seems that this
199 can be very slow on some operating systems. Following a suggestion from Stan
200 Switzer, the use of longjmp() has been abolished, at the cost of having to
201 provide a unique number for each call to RMATCH. There is no way of generating
202 a sequence of numbers at compile time in C. I have given them names, to make
203 them stand out more clearly.
204
205 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
206 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
207 tests. Furthermore, not using longjmp() means that local dynamic variables
208 don't have indeterminate values; this has meant that the frame size can be
209 reduced because the result can be "passed back" by straight setting of the
210 variable instead of being passed in the frame.
211 ****************************************************************************
212 ***************************************************************************/
213
/* Identifiers for the individual RMATCH calls, numbered consecutively from 1.
Whenever this list is changed, the code at HEAP_RETURN below must be updated to
stay in sync with it. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,
  RM7,     RM8,  RM9,  RM10, RM11, RM12,
  RM13,    RM14, RM15, RM16, RM17, RM18,
  RM19,    RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30,
  RM31,    RM32, RM33, RM34, RM35, RM36,
  RM37,    RM38, RM39, RM40, RM41, RM42,
  RM43,    RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54
};
223
224 /* These versions of the macros use the stack, as normal. There are debugging
225 versions and production versions. Note that the "rw" argument of RMATCH isn't
226 actuall used in this definition. */
227
228 #ifndef NO_RECURSE
229 #define REGISTER register
230
231 #ifdef DEBUG
232 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
233 { \
234 printf("match() called in line %d\n", __LINE__); \
235 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
236 printf("to line %d\n", __LINE__); \
237 }
238 #define RRETURN(ra) \
239 { \
240 printf("match() returned %d from line %d ", ra, __LINE__); \
241 return ra; \
242 }
243 #else
244 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
245 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
246 #define RRETURN(ra) return ra
247 #endif
248
249 #else
250
251
252 /* These versions of the macros manage a private stack on the heap. Note that
253 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
254 argument of match(), which never changes. */
255
256 #define REGISTER
257
258 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
259 {\
260 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
261 frame->Xwhere = rw; \
262 newframe->Xeptr = ra;\
263 newframe->Xecode = rb;\
264 newframe->Xmstart = mstart;\
265 newframe->Xoffset_top = rc;\
266 newframe->Xims = re;\
267 newframe->Xeptrb = rf;\
268 newframe->Xflags = rg;\
269 newframe->Xrdepth = frame->Xrdepth + 1;\
270 newframe->Xprevframe = frame;\
271 frame = newframe;\
272 DPRINTF(("restarting from line %d\n", __LINE__));\
273 goto HEAP_RECURSE;\
274 L_##rw:\
275 DPRINTF(("jumped back to line %d\n", __LINE__));\
276 }
277
278 #define RRETURN(ra)\
279 {\
280 heapframe *newframe = frame;\
281 frame = newframe->Xprevframe;\
282 (pcre_stack_free)(newframe);\
283 if (frame != NULL)\
284 {\
285 rrc = ra;\
286 goto HEAP_RETURN;\
287 }\
288 return ra;\
289 }
290
291
292 /* Structure for remembering the local variables in a private frame */
293
294 typedef struct heapframe {
295 struct heapframe *Xprevframe;
296
297 /* Function arguments that may change */
298
299 const uschar *Xeptr;
300 const uschar *Xecode;
301 const uschar *Xmstart;
302 int Xoffset_top;
303 long int Xims;
304 eptrblock *Xeptrb;
305 int Xflags;
306 unsigned int Xrdepth;
307
308 /* Function local variables */
309
310 const uschar *Xcallpat;
311 const uschar *Xcharptr;
312 const uschar *Xdata;
313 const uschar *Xnext;
314 const uschar *Xpp;
315 const uschar *Xprev;
316 const uschar *Xsaved_eptr;
317
318 recursion_info Xnew_recursive;
319
320 BOOL Xcur_is_word;
321 BOOL Xcondition;
322 BOOL Xprev_is_word;
323
324 unsigned long int Xoriginal_ims;
325
326 #ifdef SUPPORT_UCP
327 int Xprop_type;
328 int Xprop_value;
329 int Xprop_fail_result;
330 int Xprop_category;
331 int Xprop_chartype;
332 int Xprop_script;
333 int Xoclength;
334 uschar Xocchars[8];
335 #endif
336
337 int Xctype;
338 unsigned int Xfc;
339 int Xfi;
340 int Xlength;
341 int Xmax;
342 int Xmin;
343 int Xnumber;
344 int Xoffset;
345 int Xop;
346 int Xsave_capture_last;
347 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
348 int Xstacksave[REC_STACK_SAVE_MAX];
349
350 eptrblock Xnewptrb;
351
352 /* Where to jump back to */
353
354 int Xwhere;
355
356 } heapframe;
357
358 #endif
359
360
361 /***************************************************************************
362 ***************************************************************************/
363
364
365
366 /*************************************************
367 * Match from current position *
368 *************************************************/
369
370 /* This function is called recursively in many circumstances. Whenever it
371 returns a negative (error) response, the outer incarnation must also return the
372 same response.
373
374 Performance note: It might be tempting to extract commonly used fields from the
375 md structure (e.g. utf8, end_subject) into individual variables to improve
376 performance. Tests using gcc on a SPARC disproved this; in the first case, it
377 made performance worse.
378
379 Arguments:
380 eptr pointer to current character in subject
381 ecode pointer to current position in compiled code
382 mstart pointer to the current match start position (can be modified
383 by encountering \K)
384 offset_top current top pointer
385 md pointer to "static" info for the match
386 ims current /i, /m, and /s options
387 eptrb pointer to chain of blocks containing eptr at start of
388 brackets - for testing for empty matches
389 flags can contain
390 match_condassert - this is an assertion condition
391 match_cbegroup - this is the start of an unlimited repeat
392 group that can match an empty string
393 rdepth the recursion depth
394
395 Returns: MATCH_MATCH if matched ) these values are >= 0
396 MATCH_NOMATCH if failed to match )
397 a negative PCRE_ERROR_xxx value if aborted by an error condition
398 (e.g. stopped by repeated call or recursion limit)
399 */
400
401 static int
402 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
403 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
404 int flags, unsigned int rdepth)
405 {
406 /* These variables do not need to be preserved over recursion in this function,
407 so they can be ordinary variables in all cases. Mark some of them with
408 "register" because they are used a lot in loops. */
409
410 register int rrc; /* Returns from recursive calls */
411 register int i; /* Used for loops not involving calls to RMATCH() */
412 register unsigned int c; /* Character values not kept over RMATCH() calls */
413 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
414
415 BOOL minimize, possessive; /* Quantifier options */
416
417 /* When recursion is not being used, all "local" variables that have to be
418 preserved over calls to RMATCH() are part of a "frame" which is obtained from
419 heap storage. Set up the top-level frame here; others are obtained from the
420 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
421
422 #ifdef NO_RECURSE
423 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
424 frame->Xprevframe = NULL; /* Marks the top level */
425
426 /* Copy in the original argument variables */
427
428 frame->Xeptr = eptr;
429 frame->Xecode = ecode;
430 frame->Xmstart = mstart;
431 frame->Xoffset_top = offset_top;
432 frame->Xims = ims;
433 frame->Xeptrb = eptrb;
434 frame->Xflags = flags;
435 frame->Xrdepth = rdepth;
436
437 /* This is where control jumps back to in order to effect "recursion" */
438
439 HEAP_RECURSE:
440
441 /* Macros make the argument variables come from the current frame */
442
443 #define eptr frame->Xeptr
444 #define ecode frame->Xecode
445 #define mstart frame->Xmstart
446 #define offset_top frame->Xoffset_top
447 #define ims frame->Xims
448 #define eptrb frame->Xeptrb
449 #define flags frame->Xflags
450 #define rdepth frame->Xrdepth
451
452 /* Ditto for the local variables */
453
454 #ifdef SUPPORT_UTF8
455 #define charptr frame->Xcharptr
456 #endif
457 #define callpat frame->Xcallpat
458 #define data frame->Xdata
459 #define next frame->Xnext
460 #define pp frame->Xpp
461 #define prev frame->Xprev
462 #define saved_eptr frame->Xsaved_eptr
463
464 #define new_recursive frame->Xnew_recursive
465
466 #define cur_is_word frame->Xcur_is_word
467 #define condition frame->Xcondition
468 #define prev_is_word frame->Xprev_is_word
469
470 #define original_ims frame->Xoriginal_ims
471
472 #ifdef SUPPORT_UCP
473 #define prop_type frame->Xprop_type
474 #define prop_value frame->Xprop_value
475 #define prop_fail_result frame->Xprop_fail_result
476 #define prop_category frame->Xprop_category
477 #define prop_chartype frame->Xprop_chartype
478 #define prop_script frame->Xprop_script
479 #define oclength frame->Xoclength
480 #define occhars frame->Xocchars
481 #endif
482
483 #define ctype frame->Xctype
484 #define fc frame->Xfc
485 #define fi frame->Xfi
486 #define length frame->Xlength
487 #define max frame->Xmax
488 #define min frame->Xmin
489 #define number frame->Xnumber
490 #define offset frame->Xoffset
491 #define op frame->Xop
492 #define save_capture_last frame->Xsave_capture_last
493 #define save_offset1 frame->Xsave_offset1
494 #define save_offset2 frame->Xsave_offset2
495 #define save_offset3 frame->Xsave_offset3
496 #define stacksave frame->Xstacksave
497
498 #define newptrb frame->Xnewptrb
499
500 /* When recursion is being used, local variables are allocated on the stack and
501 get preserved during recursion in the normal way. In this environment, fi and
502 i, and fc and c, can be the same variables. */
503
504 #else /* NO_RECURSE not defined */
505 #define fi i
506 #define fc c
507
508
509 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
510 const uschar *charptr; /* in small blocks of the code. My normal */
511 #endif /* style of coding would have declared */
512 const uschar *callpat; /* them within each of those blocks. */
513 const uschar *data; /* However, in order to accommodate the */
514 const uschar *next; /* version of this code that uses an */
515 USPTR pp; /* external "stack" implemented on the */
516 const uschar *prev; /* heap, it is easier to declare them all */
517 USPTR saved_eptr; /* here, so the declarations can be cut */
518 /* out in a block. The only declarations */
519 recursion_info new_recursive; /* within blocks below are for variables */
520 /* that do not have to be preserved over */
521 BOOL cur_is_word; /* a recursive call to RMATCH(). */
522 BOOL condition;
523 BOOL prev_is_word;
524
525 unsigned long int original_ims;
526
527 #ifdef SUPPORT_UCP
528 int prop_type;
529 int prop_value;
530 int prop_fail_result;
531 int prop_category;
532 int prop_chartype;
533 int prop_script;
534 int oclength;
535 uschar occhars[8];
536 #endif
537
538 int ctype;
539 int length;
540 int max;
541 int min;
542 int number;
543 int offset;
544 int op;
545 int save_capture_last;
546 int save_offset1, save_offset2, save_offset3;
547 int stacksave[REC_STACK_SAVE_MAX];
548
549 eptrblock newptrb;
550 #endif /* NO_RECURSE */
551
552 /* These statements are here to stop the compiler complaining about uninitialized
553 variables. */
554
555 #ifdef SUPPORT_UCP
556 prop_value = 0;
557 prop_fail_result = 0;
558 #endif
559
560
561 /* This label is used for tail recursion, which is used in a few cases even
562 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
563 used. Thanks to Ian Taylor for noticing this possibility and sending the
564 original patch. */
565
566 TAIL_RECURSE:
567
568 /* OK, now we can get on with the real code of the function. Recursive calls
569 are specified by the macro RMATCH and RRETURN is used to return. When
570 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
571 and a "return", respectively (possibly with some debugging if DEBUG is
572 defined). However, RMATCH isn't like a function call because it's quite a
573 complicated macro. It has to be used in one particular way. This shouldn't,
574 however, impact performance when true recursion is being used. */
575
576 #ifdef SUPPORT_UTF8
577 utf8 = md->utf8; /* Local copy of the flag */
578 #else
579 utf8 = FALSE;
580 #endif
581
582 /* First check that we haven't called match() too many times, or that we
583 haven't exceeded the recursive call limit. */
584
585 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
586 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
587
588 original_ims = ims; /* Save for resetting on ')' */
589
590 /* At the start of a group with an unlimited repeat that may match an empty
591 string, the match_cbegroup flag is set. When this is the case, add the current
592 subject pointer to the chain of such remembered pointers, to be checked when we
593 hit the closing ket, in order to break infinite loops that match no characters.
594 When match() is called in other circumstances, don't add to the chain. The
595 match_cbegroup flag must NOT be used with tail recursion, because the memory
596 block that is used is on the stack, so a new one may be required for each
597 match(). */
598
599 if ((flags & match_cbegroup) != 0)
600 {
601 newptrb.epb_saved_eptr = eptr;
602 newptrb.epb_prev = eptrb;
603 eptrb = &newptrb;
604 }
605
606 /* Now start processing the opcodes. */
607
608 for (;;)
609 {
610 minimize = possessive = FALSE;
611 op = *ecode;
612
613 /* For partial matching, remember if we ever hit the end of the subject after
614 matching at least one subject character. */
615
616 if (md->partial &&
617 eptr >= md->end_subject &&
618 eptr > mstart)
619 md->hitend = TRUE;
620
621 switch(op)
622 {
623 case OP_FAIL:
624 RRETURN(MATCH_NOMATCH);
625
626 case OP_PRUNE:
627 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
628 ims, eptrb, flags, RM51);
629 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
630 RRETURN(MATCH_PRUNE);
631
632 case OP_COMMIT:
633 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
634 ims, eptrb, flags, RM52);
635 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
636 RRETURN(MATCH_COMMIT);
637
638 case OP_SKIP:
639 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
640 ims, eptrb, flags, RM53);
641 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
642 md->start_match_ptr = eptr; /* Pass back current position */
643 RRETURN(MATCH_SKIP);
644
645 case OP_THEN:
646 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
647 ims, eptrb, flags, RM54);
648 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
649 RRETURN(MATCH_THEN);
650
651 /* Handle a capturing bracket. If there is space in the offset vector, save
652 the current subject position in the working slot at the top of the vector.
653 We mustn't change the current values of the data slot, because they may be
654 set from a previous iteration of this group, and be referred to by a
655 reference inside the group.
656
657 If the bracket fails to match, we need to restore this value and also the
658 values of the final offsets, in case they were set by a previous iteration
659 of the same bracket.
660
661 If there isn't enough space in the offset vector, treat this as if it were
662 a non-capturing bracket. Don't worry about setting the flag for the error
663 case here; that is handled in the code for KET. */
664
665 case OP_CBRA:
666 case OP_SCBRA:
667 number = GET2(ecode, 1+LINK_SIZE);
668 offset = number << 1;
669
670 #ifdef DEBUG
671 printf("start bracket %d\n", number);
672 printf("subject=");
673 pchars(eptr, 16, TRUE, md);
674 printf("\n");
675 #endif
676
677 if (offset < md->offset_max)
678 {
679 save_offset1 = md->offset_vector[offset];
680 save_offset2 = md->offset_vector[offset+1];
681 save_offset3 = md->offset_vector[md->offset_end - number];
682 save_capture_last = md->capture_last;
683
684 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
685 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
686
687 flags = (op == OP_SCBRA)? match_cbegroup : 0;
688 do
689 {
690 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
691 ims, eptrb, flags, RM1);
692 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
693 md->capture_last = save_capture_last;
694 ecode += GET(ecode, 1);
695 }
696 while (*ecode == OP_ALT);
697
698 DPRINTF(("bracket %d failed\n", number));
699
700 md->offset_vector[offset] = save_offset1;
701 md->offset_vector[offset+1] = save_offset2;
702 md->offset_vector[md->offset_end - number] = save_offset3;
703
704 RRETURN(MATCH_NOMATCH);
705 }
706
707 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
708 as a non-capturing bracket. */
709
710 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
711 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
712
713 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
714
715 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
716 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
717
718 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
719 final alternative within the brackets, we would return the result of a
720 recursive call to match() whatever happened. We can reduce stack usage by
721 turning this into a tail recursion, except in the case when match_cbegroup
722 is set.*/
723
724 case OP_BRA:
725 case OP_SBRA:
726 DPRINTF(("start non-capturing bracket\n"));
727 flags = (op >= OP_SBRA)? match_cbegroup : 0;
728 for (;;)
729 {
730 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
731 {
732 if (flags == 0) /* Not a possibly empty group */
733 {
734 ecode += _pcre_OP_lengths[*ecode];
735 DPRINTF(("bracket 0 tail recursion\n"));
736 goto TAIL_RECURSE;
737 }
738
739 /* Possibly empty group; can't use tail recursion. */
740
741 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
742 eptrb, flags, RM48);
743 RRETURN(rrc);
744 }
745
746 /* For non-final alternatives, continue the loop for a NOMATCH result;
747 otherwise return. */
748
749 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
750 eptrb, flags, RM2);
751 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
752 ecode += GET(ecode, 1);
753 }
754 /* Control never reaches here. */
755
756 /* Conditional group: compilation checked that there are no more than
757 two branches. If the condition is false, skipping the first branch takes us
758 past the end if there is only one branch, but that's OK because that is
759 exactly what going to the ket would do. As there is only one branch to be
760 obeyed, we can use tail recursion to avoid using another stack frame. */
761
762 case OP_COND:
763 case OP_SCOND:
764 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
765 {
766 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
767 condition = md->recursive != NULL &&
768 (offset == RREF_ANY || offset == md->recursive->group_num);
769 ecode += condition? 3 : GET(ecode, 1);
770 }
771
772 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
773 {
774 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
775 condition = offset < offset_top && md->offset_vector[offset] >= 0;
776 ecode += condition? 3 : GET(ecode, 1);
777 }
778
779 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
780 {
781 condition = FALSE;
782 ecode += GET(ecode, 1);
783 }
784
785 /* The condition is an assertion. Call match() to evaluate it - setting
786 the final argument match_condassert causes it to stop at the end of an
787 assertion. */
788
789 else
790 {
791 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
792 match_condassert, RM3);
793 if (rrc == MATCH_MATCH)
794 {
795 condition = TRUE;
796 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
797 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
798 }
799 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
800 {
801 RRETURN(rrc); /* Need braces because of following else */
802 }
803 else
804 {
805 condition = FALSE;
806 ecode += GET(ecode, 1);
807 }
808 }
809
810 /* We are now at the branch that is to be obeyed. As there is only one,
811 we can use tail recursion to avoid using another stack frame, except when
812 match_cbegroup is required for an unlimited repeat of a possibly empty
813 group. If the second alternative doesn't exist, we can just plough on. */
814
815 if (condition || *ecode == OP_ALT)
816 {
817 ecode += 1 + LINK_SIZE;
818 if (op == OP_SCOND) /* Possibly empty group */
819 {
820 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
821 RRETURN(rrc);
822 }
823 else /* Group must match something */
824 {
825 flags = 0;
826 goto TAIL_RECURSE;
827 }
828 }
829 else /* Condition false & no 2nd alternative */
830 {
831 ecode += 1 + LINK_SIZE;
832 }
833 break;
834
835
836 /* End of the pattern, either real or forced. If we are in a top-level
837 recursion, we should restore the offsets appropriately and continue from
838 after the call. */
839
840 case OP_ACCEPT:
841 case OP_END:
842 if (md->recursive != NULL && md->recursive->group_num == 0)
843 {
844 recursion_info *rec = md->recursive;
845 DPRINTF(("End of pattern in a (?0) recursion\n"));
846 md->recursive = rec->prevrec;
847 memmove(md->offset_vector, rec->offset_save,
848 rec->saved_max * sizeof(int));
849 mstart = rec->save_start;
850 ims = original_ims;
851 ecode = rec->after_call;
852 break;
853 }
854
855 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
856 string - backtracking will then try other alternatives, if any. */
857
858 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
859 md->end_match_ptr = eptr; /* Record where we ended */
860 md->end_offset_top = offset_top; /* and how many extracts were taken */
861 md->start_match_ptr = mstart; /* and the start (\K can modify) */
862 RRETURN(MATCH_MATCH);
863
864 /* Change option settings */
865
866 case OP_OPT:
867 ims = ecode[1];
868 ecode += 2;
869 DPRINTF(("ims set to %02lx\n", ims));
870 break;
871
872 /* Assertion brackets. Check the alternative branches in turn - the
873 matching won't pass the KET for an assertion. If any one branch matches,
874 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
875 start of each branch to move the current point backwards, so the code at
876 this level is identical to the lookahead case. */
877
878 case OP_ASSERT:
879 case OP_ASSERTBACK:
880 do
881 {
882 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
883 RM4);
884 if (rrc == MATCH_MATCH) break;
885 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
886 ecode += GET(ecode, 1);
887 }
888 while (*ecode == OP_ALT);
889 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
890
891 /* If checking an assertion for a condition, return MATCH_MATCH. */
892
893 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
894
895 /* Continue from after the assertion, updating the offsets high water
896 mark, since extracts may have been taken during the assertion. */
897
898 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
899 ecode += 1 + LINK_SIZE;
900 offset_top = md->end_offset_top;
901 continue;
902
903 /* Negative assertion: all branches must fail to match */
904
905 case OP_ASSERT_NOT:
906 case OP_ASSERTBACK_NOT:
907 do
908 {
909 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
910 RM5);
911 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
912 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
913 ecode += GET(ecode,1);
914 }
915 while (*ecode == OP_ALT);
916
917 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
918
919 ecode += 1 + LINK_SIZE;
920 continue;
921
922 /* Move the subject pointer back. This occurs only at the start of
923 each branch of a lookbehind assertion. If we are too close to the start to
924 move back, this match function fails. When working with UTF-8 we move
925 back a number of characters, not bytes. */
926
927 case OP_REVERSE:
928 #ifdef SUPPORT_UTF8
929 if (utf8)
930 {
931 i = GET(ecode, 1);
932 while (i-- > 0)
933 {
934 eptr--;
935 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
936 BACKCHAR(eptr);
937 }
938 }
939 else
940 #endif
941
942 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
943
944 {
945 eptr -= GET(ecode, 1);
946 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
947 }
948
949 /* Skip to next op code */
950
951 ecode += 1 + LINK_SIZE;
952 break;
953
954 /* The callout item calls an external function, if one is provided, passing
955 details of the match so far. This is mainly for debugging, though the
956 function is able to force a failure. */
957
958 case OP_CALLOUT:
959 if (pcre_callout != NULL)
960 {
961 pcre_callout_block cb;
962 cb.version = 1; /* Version 1 of the callout block */
963 cb.callout_number = ecode[1];
964 cb.offset_vector = md->offset_vector;
965 cb.subject = (PCRE_SPTR)md->start_subject;
966 cb.subject_length = md->end_subject - md->start_subject;
967 cb.start_match = mstart - md->start_subject;
968 cb.current_position = eptr - md->start_subject;
969 cb.pattern_position = GET(ecode, 2);
970 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
971 cb.capture_top = offset_top/2;
972 cb.capture_last = md->capture_last;
973 cb.callout_data = md->callout_data;
974 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
975 if (rrc < 0) RRETURN(rrc);
976 }
977 ecode += 2 + 2*LINK_SIZE;
978 break;
979
980 /* Recursion either matches the current regex, or some subexpression. The
981 offset data is the offset to the starting bracket from the start of the
982 whole pattern. (This is so that it works from duplicated subpatterns.)
983
984 If there are any capturing brackets started but not finished, we have to
985 save their starting points and reinstate them after the recursion. However,
986 we don't know how many such there are (offset_top records the completed
987 total) so we just have to save all the potential data. There may be up to
988 65535 such values, which is too large to put on the stack, but using malloc
989 for small numbers seems expensive. As a compromise, the stack is used when
990 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
991 is used. If the malloc fails, the saved data cannot be set up, so the
992 match is abandoned at this point and PCRE_ERROR_NOMEMORY is returned; no
993 attempt is made to carry on with only a partial save on the stack.
994
995 There are also other values that have to be saved. We use a chained
996 sequence of blocks that actually live on the stack. Thanks to Robin Houston
997 for the original version of this logic. */
998
999 case OP_RECURSE:
1000 {
1001 callpat = md->start_code + GET(ecode, 1);
1002 new_recursive.group_num = (callpat == md->start_code)? 0 :
1003 GET2(callpat, 1 + LINK_SIZE);
1004
1005 /* Add to "recursing stack" */
1006
1007 new_recursive.prevrec = md->recursive;
1008 md->recursive = &new_recursive;
1009
1010 /* Find where to continue from afterwards */
1011
1012 ecode += 1 + LINK_SIZE;
1013 new_recursive.after_call = ecode;
1014
1015 /* Now save the offset data. */
1016
1017 new_recursive.saved_max = md->offset_end;
1018 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1019 new_recursive.offset_save = stacksave;
1020 else
1021 {
1022 new_recursive.offset_save =
1023 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1024 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1025 }
1026
1027 memcpy(new_recursive.offset_save, md->offset_vector,
1028 new_recursive.saved_max * sizeof(int));
1029 new_recursive.save_start = mstart;
1030 mstart = eptr;
1031
1032 /* OK, now we can do the recursion. For each top-level alternative we
1033 restore the offset and recursion data. */
1034
1035 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1036 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1037 do
1038 {
1039 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1040 md, ims, eptrb, flags, RM6);
1041 if (rrc == MATCH_MATCH)
1042 {
1043 DPRINTF(("Recursion matched\n"));
1044 md->recursive = new_recursive.prevrec;
1045 if (new_recursive.offset_save != stacksave)
1046 (pcre_free)(new_recursive.offset_save);
1047 RRETURN(MATCH_MATCH);
1048 }
1049 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1050 {
1051 DPRINTF(("Recursion gave error %d\n", rrc));
1052 RRETURN(rrc);
1053 }
1054
1055 md->recursive = &new_recursive;
1056 memcpy(md->offset_vector, new_recursive.offset_save,
1057 new_recursive.saved_max * sizeof(int));
1058 callpat += GET(callpat, 1);
1059 }
1060 while (*callpat == OP_ALT);
1061
1062 DPRINTF(("Recursion didn't match\n"));
1063 md->recursive = new_recursive.prevrec;
1064 if (new_recursive.offset_save != stacksave)
1065 (pcre_free)(new_recursive.offset_save);
1066 RRETURN(MATCH_NOMATCH);
1067 }
1068 /* Control never reaches here */
1069
1070 /* "Once" brackets are like assertion brackets except that after a match,
1071 the point in the subject string is not moved back. Thus there can never be
1072 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1073 Check the alternative branches in turn - the matching won't pass the KET
1074 for this kind of subpattern. If any one branch matches, we carry on as at
1075 the end of a normal bracket, leaving the subject pointer. */
1076
1077 case OP_ONCE:
1078 prev = ecode;
1079 saved_eptr = eptr;
1080
1081 do
1082 {
1083 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1084 if (rrc == MATCH_MATCH) break;
1085 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1086 ecode += GET(ecode,1);
1087 }
1088 while (*ecode == OP_ALT);
1089
1090 /* If we hit the end of the group (which could be repeated), fail */
1091
1092 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1093
1094 /* Continue as from after the assertion, updating the offsets high water
1095 mark, since extracts may have been taken. */
1096
1097 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1098
1099 offset_top = md->end_offset_top;
1100 eptr = md->end_match_ptr;
1101
1102 /* For a non-repeating ket, just continue at this level. This also
1103 happens for a repeating ket if no characters were matched in the group.
1104 This is the forcible breaking of infinite loops as implemented in Perl
1105 5.005. If there is an options reset, it will get obeyed in the normal
1106 course of events. */
1107
1108 if (*ecode == OP_KET || eptr == saved_eptr)
1109 {
1110 ecode += 1+LINK_SIZE;
1111 break;
1112 }
1113
1114 /* The repeating kets try the rest of the pattern or restart from the
1115 preceding bracket, in the appropriate order. The second "call" of match()
1116 uses tail recursion, to avoid using another stack frame. We need to reset
1117 any options that changed within the bracket before re-running it, so
1118 check the next opcode. */
1119
1120 if (ecode[1+LINK_SIZE] == OP_OPT)
1121 {
1122 ims = (ims & ~PCRE_IMS) | ecode[4];
1123 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1124 }
1125
1126 if (*ecode == OP_KETRMIN)
1127 {
1128 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1129 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1130 ecode = prev;
1131 flags = 0;
1132 goto TAIL_RECURSE;
1133 }
1134 else /* OP_KETRMAX */
1135 {
1136 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1137 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1138 ecode += 1 + LINK_SIZE;
1139 flags = 0;
1140 goto TAIL_RECURSE;
1141 }
1142 /* Control never gets here */
1143
1144 /* An alternation is the end of a branch; scan along to find the end of the
1145 bracketed group and go to there. */
1146
1147 case OP_ALT:
1148 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1149 break;
1150
1151 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1152 that it may occur zero times. It may repeat infinitely, or not at all -
1153 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1154 repeat limits are compiled as a number of copies, with the optional ones
1155 preceded by BRAZERO or BRAMINZERO. */
1156
1157 case OP_BRAZERO:
1158 {
1159 next = ecode+1;
1160 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1161 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1162 do next += GET(next,1); while (*next == OP_ALT);
1163 ecode = next + 1 + LINK_SIZE;
1164 }
1165 break;
1166
1167 case OP_BRAMINZERO:
1168 {
1169 next = ecode+1;
1170 do next += GET(next, 1); while (*next == OP_ALT);
1171 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1172 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1173 ecode++;
1174 }
1175 break;
1176
1177 /* End of a group, repeated or non-repeating. */
1178
1179 case OP_KET:
1180 case OP_KETRMIN:
1181 case OP_KETRMAX:
1182 prev = ecode - GET(ecode, 1);
1183
1184 /* If this was a group that remembered the subject start, in order to break
1185 infinite repeats of empty string matches, retrieve the subject start from
1186 the chain. Otherwise, set it NULL. */
1187
1188 if (*prev >= OP_SBRA)
1189 {
1190 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1191 eptrb = eptrb->epb_prev; /* Backup to previous group */
1192 }
1193 else saved_eptr = NULL;
1194
1195 /* If we are at the end of an assertion group, stop matching and return
1196 MATCH_MATCH, but record the current high water mark for use by positive
1197 assertions. Do this also for the "once" (atomic) groups. */
1198
1199 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1200 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1201 *prev == OP_ONCE)
1202 {
1203 md->end_match_ptr = eptr; /* For ONCE */
1204 md->end_offset_top = offset_top;
1205 RRETURN(MATCH_MATCH);
1206 }
1207
1208 /* For capturing groups we have to check the group number back at the start
1209 and if necessary complete handling an extraction by setting the offsets and
1210 bumping the high water mark. Note that whole-pattern recursion is coded as
1211 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1212 when the OP_END is reached. Other recursion is handled here. */
1213
1214 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1215 {
1216 number = GET2(prev, 1+LINK_SIZE);
1217 offset = number << 1;
1218
1219 #ifdef DEBUG
1220 printf("end bracket %d", number);
1221 printf("\n");
1222 #endif
1223
1224 md->capture_last = number;
1225 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1226 {
1227 md->offset_vector[offset] =
1228 md->offset_vector[md->offset_end - number];
1229 md->offset_vector[offset+1] = eptr - md->start_subject;
1230 if (offset_top <= offset) offset_top = offset + 2;
1231 }
1232
1233 /* Handle a recursively called group. Restore the offsets
1234 appropriately and continue from after the call. */
1235
1236 if (md->recursive != NULL && md->recursive->group_num == number)
1237 {
1238 recursion_info *rec = md->recursive;
1239 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1240 md->recursive = rec->prevrec;
1241 mstart = rec->save_start;
1242 memcpy(md->offset_vector, rec->offset_save,
1243 rec->saved_max * sizeof(int));
1244 ecode = rec->after_call;
1245 ims = original_ims;
1246 break;
1247 }
1248 }
1249
1250 /* For both capturing and non-capturing groups, reset the value of the ims
1251 flags, in case they got changed during the group. */
1252
1253 ims = original_ims;
1254 DPRINTF(("ims reset to %02lx\n", ims));
1255
1256 /* For a non-repeating ket, just continue at this level. This also
1257 happens for a repeating ket if no characters were matched in the group.
1258 This is the forcible breaking of infinite loops as implemented in Perl
1259 5.005. If there is an options reset, it will get obeyed in the normal
1260 course of events. */
1261
1262 if (*ecode == OP_KET || eptr == saved_eptr)
1263 {
1264 ecode += 1 + LINK_SIZE;
1265 break;
1266 }
1267
1268 /* The repeating kets try the rest of the pattern or restart from the
1269 preceding bracket, in the appropriate order. In the second case, we can use
1270 tail recursion to avoid using another stack frame, unless we have an
1271 unlimited repeat of a group that can match an empty string. */
1272
1273 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1274
1275 if (*ecode == OP_KETRMIN)
1276 {
1277 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1278 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1279 if (flags != 0) /* Could match an empty string */
1280 {
1281 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1282 RRETURN(rrc);
1283 }
1284 ecode = prev;
1285 goto TAIL_RECURSE;
1286 }
1287 else /* OP_KETRMAX */
1288 {
1289 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1290 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1291 ecode += 1 + LINK_SIZE;
1292 flags = 0;
1293 goto TAIL_RECURSE;
1294 }
1295 /* Control never gets here */
1296
1297 /* Start of subject unless notbol, or after internal newline if multiline */
1298
1299 case OP_CIRC:
1300 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1301 if ((ims & PCRE_MULTILINE) != 0)
1302 {
1303 if (eptr != md->start_subject &&
1304 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1305 RRETURN(MATCH_NOMATCH);
1306 ecode++;
1307 break;
1308 }
1309 /* ... else fall through */
1310
1311 /* Start of subject assertion */
1312
1313 case OP_SOD:
1314 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1315 ecode++;
1316 break;
1317
1318 /* Start of match assertion */
1319
1320 case OP_SOM:
1321 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1322 ecode++;
1323 break;
1324
1325 /* Reset the start of match point */
1326
1327 case OP_SET_SOM:
1328 mstart = eptr;
1329 ecode++;
1330 break;
1331
1332 /* Assert before internal newline if multiline, or before a terminating
1333 newline unless endonly is set, else end of subject unless noteol is set. */
1334
1335 case OP_DOLL:
1336 if ((ims & PCRE_MULTILINE) != 0)
1337 {
1338 if (eptr < md->end_subject)
1339 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1340 else
1341 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1342 ecode++;
1343 break;
1344 }
1345 else
1346 {
1347 if (md->noteol) RRETURN(MATCH_NOMATCH);
1348 if (!md->endonly)
1349 {
1350 if (eptr != md->end_subject &&
1351 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1352 RRETURN(MATCH_NOMATCH);
1353 ecode++;
1354 break;
1355 }
1356 }
1357 /* ... else fall through for endonly */
1358
1359 /* End of subject assertion (\z) */
1360
1361 case OP_EOD:
1362 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1363 ecode++;
1364 break;
1365
1366 /* End of subject or ending \n assertion (\Z) */
1367
1368 case OP_EODN:
1369 if (eptr != md->end_subject &&
1370 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1371 RRETURN(MATCH_NOMATCH);
1372 ecode++;
1373 break;
1374
1375 /* Word boundary assertions */
1376
1377 case OP_NOT_WORD_BOUNDARY:
1378 case OP_WORD_BOUNDARY:
1379 {
1380
1381 /* Find out if the previous and current characters are "word" characters.
1382 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1383 be "non-word" characters. */
1384
1385 #ifdef SUPPORT_UTF8
1386 if (utf8)
1387 {
1388 if (eptr == md->start_subject) prev_is_word = FALSE; else
1389 {
1390 const uschar *lastptr = eptr - 1;
1391 while((*lastptr & 0xc0) == 0x80) lastptr--;
1392 GETCHAR(c, lastptr);
1393 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1394 }
1395 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1396 {
1397 GETCHAR(c, eptr);
1398 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1399 }
1400 }
1401 else
1402 #endif
1403
1404 /* More streamlined when not in UTF-8 mode */
1405
1406 {
1407 prev_is_word = (eptr != md->start_subject) &&
1408 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1409 cur_is_word = (eptr < md->end_subject) &&
1410 ((md->ctypes[*eptr] & ctype_word) != 0);
1411 }
1412
1413 /* Now see if the situation is what we want */
1414
1415 if ((*ecode++ == OP_WORD_BOUNDARY)?
1416 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1417 RRETURN(MATCH_NOMATCH);
1418 }
1419 break;
1420
1421 /* Match a single character type; inline for speed */
1422
1423 case OP_ANY:
1424 if ((ims & PCRE_DOTALL) == 0)
1425 {
1426 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1427 }
1428 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1429 if (utf8)
1430 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1431 ecode++;
1432 break;
1433
1434 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1435 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1436
1437 case OP_ANYBYTE:
1438 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1439 ecode++;
1440 break;
1441
1442 case OP_NOT_DIGIT:
1443 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1444 GETCHARINCTEST(c, eptr);
1445 if (
1446 #ifdef SUPPORT_UTF8
1447 c < 256 &&
1448 #endif
1449 (md->ctypes[c] & ctype_digit) != 0
1450 )
1451 RRETURN(MATCH_NOMATCH);
1452 ecode++;
1453 break;
1454
1455 case OP_DIGIT:
1456 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1457 GETCHARINCTEST(c, eptr);
1458 if (
1459 #ifdef SUPPORT_UTF8
1460 c >= 256 ||
1461 #endif
1462 (md->ctypes[c] & ctype_digit) == 0
1463 )
1464 RRETURN(MATCH_NOMATCH);
1465 ecode++;
1466 break;
1467
1468 case OP_NOT_WHITESPACE:
1469 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1470 GETCHARINCTEST(c, eptr);
1471 if (
1472 #ifdef SUPPORT_UTF8
1473 c < 256 &&
1474 #endif
1475 (md->ctypes[c] & ctype_space) != 0
1476 )
1477 RRETURN(MATCH_NOMATCH);
1478 ecode++;
1479 break;
1480
1481 case OP_WHITESPACE:
1482 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1483 GETCHARINCTEST(c, eptr);
1484 if (
1485 #ifdef SUPPORT_UTF8
1486 c >= 256 ||
1487 #endif
1488 (md->ctypes[c] & ctype_space) == 0
1489 )
1490 RRETURN(MATCH_NOMATCH);
1491 ecode++;
1492 break;
1493
1494 case OP_NOT_WORDCHAR:
1495 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1496 GETCHARINCTEST(c, eptr);
1497 if (
1498 #ifdef SUPPORT_UTF8
1499 c < 256 &&
1500 #endif
1501 (md->ctypes[c] & ctype_word) != 0
1502 )
1503 RRETURN(MATCH_NOMATCH);
1504 ecode++;
1505 break;
1506
1507 case OP_WORDCHAR:
1508 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1509 GETCHARINCTEST(c, eptr);
1510 if (
1511 #ifdef SUPPORT_UTF8
1512 c >= 256 ||
1513 #endif
1514 (md->ctypes[c] & ctype_word) == 0
1515 )
1516 RRETURN(MATCH_NOMATCH);
1517 ecode++;
1518 break;
1519
1520 case OP_ANYNL:
1521 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1522 GETCHARINCTEST(c, eptr);
1523 switch(c)
1524 {
1525 default: RRETURN(MATCH_NOMATCH);
1526 case 0x000d:
1527 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1528 break;
1529
1530 case 0x000a:
1531 break;
1532
1533 case 0x000b:
1534 case 0x000c:
1535 case 0x0085:
1536 case 0x2028:
1537 case 0x2029:
1538 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1539 break;
1540 }
1541 ecode++;
1542 break;
1543
1544 case OP_NOT_HSPACE:
1545 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1546 GETCHARINCTEST(c, eptr);
1547 switch(c)
1548 {
1549 default: break;
1550 case 0x09: /* HT */
1551 case 0x20: /* SPACE */
1552 case 0xa0: /* NBSP */
1553 case 0x1680: /* OGHAM SPACE MARK */
1554 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1555 case 0x2000: /* EN QUAD */
1556 case 0x2001: /* EM QUAD */
1557 case 0x2002: /* EN SPACE */
1558 case 0x2003: /* EM SPACE */
1559 case 0x2004: /* THREE-PER-EM SPACE */
1560 case 0x2005: /* FOUR-PER-EM SPACE */
1561 case 0x2006: /* SIX-PER-EM SPACE */
1562 case 0x2007: /* FIGURE SPACE */
1563 case 0x2008: /* PUNCTUATION SPACE */
1564 case 0x2009: /* THIN SPACE */
1565 case 0x200A: /* HAIR SPACE */
1566 case 0x202f: /* NARROW NO-BREAK SPACE */
1567 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1568 case 0x3000: /* IDEOGRAPHIC SPACE */
1569 RRETURN(MATCH_NOMATCH);
1570 }
1571 ecode++;
1572 break;
1573
1574 case OP_HSPACE:
1575 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1576 GETCHARINCTEST(c, eptr);
1577 switch(c)
1578 {
1579 default: RRETURN(MATCH_NOMATCH);
1580 case 0x09: /* HT */
1581 case 0x20: /* SPACE */
1582 case 0xa0: /* NBSP */
1583 case 0x1680: /* OGHAM SPACE MARK */
1584 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1585 case 0x2000: /* EN QUAD */
1586 case 0x2001: /* EM QUAD */
1587 case 0x2002: /* EN SPACE */
1588 case 0x2003: /* EM SPACE */
1589 case 0x2004: /* THREE-PER-EM SPACE */
1590 case 0x2005: /* FOUR-PER-EM SPACE */
1591 case 0x2006: /* SIX-PER-EM SPACE */
1592 case 0x2007: /* FIGURE SPACE */
1593 case 0x2008: /* PUNCTUATION SPACE */
1594 case 0x2009: /* THIN SPACE */
1595 case 0x200A: /* HAIR SPACE */
1596 case 0x202f: /* NARROW NO-BREAK SPACE */
1597 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1598 case 0x3000: /* IDEOGRAPHIC SPACE */
1599 break;
1600 }
1601 ecode++;
1602 break;
1603
1604 case OP_NOT_VSPACE:
1605 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1606 GETCHARINCTEST(c, eptr);
1607 switch(c)
1608 {
1609 default: break;
1610 case 0x0a: /* LF */
1611 case 0x0b: /* VT */
1612 case 0x0c: /* FF */
1613 case 0x0d: /* CR */
1614 case 0x85: /* NEL */
1615 case 0x2028: /* LINE SEPARATOR */
1616 case 0x2029: /* PARAGRAPH SEPARATOR */
1617 RRETURN(MATCH_NOMATCH);
1618 }
1619 ecode++;
1620 break;
1621
1622 case OP_VSPACE:
1623 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1624 GETCHARINCTEST(c, eptr);
1625 switch(c)
1626 {
1627 default: RRETURN(MATCH_NOMATCH);
1628 case 0x0a: /* LF */
1629 case 0x0b: /* VT */
1630 case 0x0c: /* FF */
1631 case 0x0d: /* CR */
1632 case 0x85: /* NEL */
1633 case 0x2028: /* LINE SEPARATOR */
1634 case 0x2029: /* PARAGRAPH SEPARATOR */
1635 break;
1636 }
1637 ecode++;
1638 break;
1639
1640 #ifdef SUPPORT_UCP
1641 /* Check the next character by Unicode property. We will get here only
1642 if the support is in the binary; otherwise a compile-time error occurs. */
1643
1644 case OP_PROP:
1645 case OP_NOTPROP:
1646 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1647 GETCHARINCTEST(c, eptr);
1648 {
1649 int chartype, script;
1650 int category = _pcre_ucp_findprop(c, &chartype, &script);
1651
1652 switch(ecode[1])
1653 {
1654 case PT_ANY:
1655 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1656 break;
1657
1658 case PT_LAMP:
1659 if ((chartype == ucp_Lu ||
1660 chartype == ucp_Ll ||
1661 chartype == ucp_Lt) == (op == OP_NOTPROP))
1662 RRETURN(MATCH_NOMATCH);
1663 break;
1664
1665 case PT_GC:
1666 if ((ecode[2] != category) == (op == OP_PROP))
1667 RRETURN(MATCH_NOMATCH);
1668 break;
1669
1670 case PT_PC:
1671 if ((ecode[2] != chartype) == (op == OP_PROP))
1672 RRETURN(MATCH_NOMATCH);
1673 break;
1674
1675 case PT_SC:
1676 if ((ecode[2] != script) == (op == OP_PROP))
1677 RRETURN(MATCH_NOMATCH);
1678 break;
1679
1680 default:
1681 RRETURN(PCRE_ERROR_INTERNAL);
1682 }
1683
1684 ecode += 3;
1685 }
1686 break;
1687
1688 /* Match an extended Unicode sequence. We will get here only if the support
1689 is in the binary; otherwise a compile-time error occurs. */
1690
1691 case OP_EXTUNI:
1692 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1693 GETCHARINCTEST(c, eptr);
1694 {
1695 int chartype, script;
1696 int category = _pcre_ucp_findprop(c, &chartype, &script);
1697 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1698 while (eptr < md->end_subject)
1699 {
1700 int len = 1;
1701 if (!utf8) c = *eptr; else
1702 {
1703 GETCHARLEN(c, eptr, len);
1704 }
1705 category = _pcre_ucp_findprop(c, &chartype, &script);
1706 if (category != ucp_M) break;
1707 eptr += len;
1708 }
1709 }
1710 ecode++;
1711 break;
1712 #endif
1713
1714
1715 /* Match a back reference, possibly repeatedly. Look past the end of the
1716 item to see if there is repeat information following. The code is similar
1717 to that for character classes, but repeated for efficiency. Then obey
1718 similar code to character type repeats - written out again for speed.
1719 However, if the referenced string is the empty string, always treat
1720 it as matched, any number of times (otherwise there could be infinite
1721 loops). */
1722
1723 case OP_REF:
1724 {
1725 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1726 ecode += 3; /* Advance past item */
1727
1728 /* If the reference is unset, set the length to be longer than the amount
1729 of subject left; this ensures that every attempt at a match fails. We
1730 can't just fail here, because of the possibility of quantifiers with zero
1731 minima. */
1732
1733 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1734 md->end_subject - eptr + 1 :
1735 md->offset_vector[offset+1] - md->offset_vector[offset];
1736
1737 /* Set up for repetition, or handle the non-repeated case */
1738
1739 switch (*ecode)
1740 {
1741 case OP_CRSTAR:
1742 case OP_CRMINSTAR:
1743 case OP_CRPLUS:
1744 case OP_CRMINPLUS:
1745 case OP_CRQUERY:
1746 case OP_CRMINQUERY:
1747 c = *ecode++ - OP_CRSTAR;
1748 minimize = (c & 1) != 0;
1749 min = rep_min[c]; /* Pick up values from tables; */
1750 max = rep_max[c]; /* zero for max => infinity */
1751 if (max == 0) max = INT_MAX;
1752 break;
1753
1754 case OP_CRRANGE:
1755 case OP_CRMINRANGE:
1756 minimize = (*ecode == OP_CRMINRANGE);
1757 min = GET2(ecode, 1);
1758 max = GET2(ecode, 3);
1759 if (max == 0) max = INT_MAX;
1760 ecode += 5;
1761 break;
1762
1763 default: /* No repeat follows */
1764 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1765 eptr += length;
1766 continue; /* With the main loop */
1767 }
1768
1769 /* If the length of the reference is zero, just continue with the
1770 main loop. */
1771
1772 if (length == 0) continue;
1773
1774 /* First, ensure the minimum number of matches are present. We get back
1775 the length of the reference string explicitly rather than passing the
1776 address of eptr, so that eptr can be a register variable. */
1777
1778 for (i = 1; i <= min; i++)
1779 {
1780 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1781 eptr += length;
1782 }
1783
1784 /* If min = max, continue at the same level without recursion.
1785 They are not both allowed to be zero. */
1786
1787 if (min == max) continue;
1788
1789 /* If minimizing, keep trying and advancing the pointer */
1790
1791 if (minimize)
1792 {
1793 for (fi = min;; fi++)
1794 {
1795 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1796 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1797 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1798 RRETURN(MATCH_NOMATCH);
1799 eptr += length;
1800 }
1801 /* Control never gets here */
1802 }
1803
1804 /* If maximizing, find the longest string and work backwards */
1805
1806 else
1807 {
1808 pp = eptr;
1809 for (i = min; i < max; i++)
1810 {
1811 if (!match_ref(offset, eptr, length, md, ims)) break;
1812 eptr += length;
1813 }
1814 while (eptr >= pp)
1815 {
1816 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1817 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1818 eptr -= length;
1819 }
1820 RRETURN(MATCH_NOMATCH);
1821 }
1822 }
1823 /* Control never gets here */
1824
1825
1826
1827 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1828 used when all the characters in the class have values in the range 0-255,
1829 and either the matching is caseful, or the characters are in the range
1830 0-127 when UTF-8 processing is enabled. The only difference between
1831 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1832 encountered.
1833
1834 First, look past the end of the item to see if there is repeat information
1835 following. Then obey similar code to character type repeats - written out
1836 again for speed. */
1837
1838 case OP_NCLASS:
1839 case OP_CLASS:
1840 {
1841 data = ecode + 1; /* Save for matching */
1842 ecode += 33; /* Advance past the item */
1843
1844 switch (*ecode)
1845 {
1846 case OP_CRSTAR:
1847 case OP_CRMINSTAR:
1848 case OP_CRPLUS:
1849 case OP_CRMINPLUS:
1850 case OP_CRQUERY:
1851 case OP_CRMINQUERY:
1852 c = *ecode++ - OP_CRSTAR;
1853 minimize = (c & 1) != 0;
1854 min = rep_min[c]; /* Pick up values from tables; */
1855 max = rep_max[c]; /* zero for max => infinity */
1856 if (max == 0) max = INT_MAX;
1857 break;
1858
1859 case OP_CRRANGE:
1860 case OP_CRMINRANGE:
1861 minimize = (*ecode == OP_CRMINRANGE);
1862 min = GET2(ecode, 1);
1863 max = GET2(ecode, 3);
1864 if (max == 0) max = INT_MAX;
1865 ecode += 5;
1866 break;
1867
1868 default: /* No repeat follows */
1869 min = max = 1;
1870 break;
1871 }
1872
1873 /* First, ensure the minimum number of matches are present. */
1874
1875 #ifdef SUPPORT_UTF8
1876 /* UTF-8 mode */
1877 if (utf8)
1878 {
1879 for (i = 1; i <= min; i++)
1880 {
1881 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1882 GETCHARINC(c, eptr);
1883 if (c > 255)
1884 {
1885 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1886 }
1887 else
1888 {
1889 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1890 }
1891 }
1892 }
1893 else
1894 #endif
1895 /* Not UTF-8 mode */
1896 {
1897 for (i = 1; i <= min; i++)
1898 {
1899 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1900 c = *eptr++;
1901 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1902 }
1903 }
1904
1905 /* If max == min we can continue with the main loop without the
1906 need to recurse. */
1907
1908 if (min == max) continue;
1909
1910 /* If minimizing, keep testing the rest of the expression and advancing
1911 the pointer while it matches the class. */
1912
1913 if (minimize)
1914 {
1915 #ifdef SUPPORT_UTF8
1916 /* UTF-8 mode */
1917 if (utf8)
1918 {
1919 for (fi = min;; fi++)
1920 {
1921 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1922 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1923 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1924 GETCHARINC(c, eptr);
1925 if (c > 255)
1926 {
1927 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1928 }
1929 else
1930 {
1931 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1932 }
1933 }
1934 }
1935 else
1936 #endif
1937 /* Not UTF-8 mode */
1938 {
1939 for (fi = min;; fi++)
1940 {
1941 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1942 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1943 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1944 c = *eptr++;
1945 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1946 }
1947 }
1948 /* Control never gets here */
1949 }
1950
1951 /* If maximizing, find the longest possible run, then work backwards. */
1952
1953 else
1954 {
1955 pp = eptr;
1956
1957 #ifdef SUPPORT_UTF8
1958 /* UTF-8 mode */
1959 if (utf8)
1960 {
1961 for (i = min; i < max; i++)
1962 {
1963 int len = 1;
1964 if (eptr >= md->end_subject) break;
1965 GETCHARLEN(c, eptr, len);
1966 if (c > 255)
1967 {
1968 if (op == OP_CLASS) break;
1969 }
1970 else
1971 {
1972 if ((data[c/8] & (1 << (c&7))) == 0) break;
1973 }
1974 eptr += len;
1975 }
1976 for (;;)
1977 {
1978 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1979 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1980 if (eptr-- == pp) break; /* Stop if tried at original pos */
1981 BACKCHAR(eptr);
1982 }
1983 }
1984 else
1985 #endif
1986 /* Not UTF-8 mode */
1987 {
1988 for (i = min; i < max; i++)
1989 {
1990 if (eptr >= md->end_subject) break;
1991 c = *eptr;
1992 if ((data[c/8] & (1 << (c&7))) == 0) break;
1993 eptr++;
1994 }
1995 while (eptr >= pp)
1996 {
1997 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
1998 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1999 eptr--;
2000 }
2001 }
2002
2003 RRETURN(MATCH_NOMATCH);
2004 }
2005 }
2006 /* Control never gets here */
2007
2008
2009 /* Match an extended character class. This opcode is encountered only
2010 in UTF-8 mode, because that's the only time it is compiled. */
2011
2012 #ifdef SUPPORT_UTF8
2013 case OP_XCLASS:
2014 {
2015 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2016 ecode += GET(ecode, 1); /* Advance past the item */
2017
2018 switch (*ecode)
2019 {
2020 case OP_CRSTAR:
2021 case OP_CRMINSTAR:
2022 case OP_CRPLUS:
2023 case OP_CRMINPLUS:
2024 case OP_CRQUERY:
2025 case OP_CRMINQUERY:
2026 c = *ecode++ - OP_CRSTAR;
2027 minimize = (c & 1) != 0;
2028 min = rep_min[c]; /* Pick up values from tables; */
2029 max = rep_max[c]; /* zero for max => infinity */
2030 if (max == 0) max = INT_MAX;
2031 break;
2032
2033 case OP_CRRANGE:
2034 case OP_CRMINRANGE:
2035 minimize = (*ecode == OP_CRMINRANGE);
2036 min = GET2(ecode, 1);
2037 max = GET2(ecode, 3);
2038 if (max == 0) max = INT_MAX;
2039 ecode += 5;
2040 break;
2041
2042 default: /* No repeat follows */
2043 min = max = 1;
2044 break;
2045 }
2046
2047 /* First, ensure the minimum number of matches are present. */
2048
2049 for (i = 1; i <= min; i++)
2050 {
2051 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2052 GETCHARINC(c, eptr);
2053 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2054 }
2055
2056 /* If max == min we can continue with the main loop without the
2057 need to recurse. */
2058
2059 if (min == max) continue;
2060
2061 /* If minimizing, keep testing the rest of the expression and advancing
2062 the pointer while it matches the class. */
2063
2064 if (minimize)
2065 {
2066 for (fi = min;; fi++)
2067 {
2068 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2069 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2070 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2071 GETCHARINC(c, eptr);
2072 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2073 }
2074 /* Control never gets here */
2075 }
2076
2077 /* If maximizing, find the longest possible run, then work backwards. */
2078
2079 else
2080 {
2081 pp = eptr;
2082 for (i = min; i < max; i++)
2083 {
2084 int len = 1;
2085 if (eptr >= md->end_subject) break;
2086 GETCHARLEN(c, eptr, len);
2087 if (!_pcre_xclass(c, data)) break;
2088 eptr += len;
2089 }
2090 for(;;)
2091 {
2092 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2093 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2094 if (eptr-- == pp) break; /* Stop if tried at original pos */
2095 if (utf8) BACKCHAR(eptr);
2096 }
2097 RRETURN(MATCH_NOMATCH);
2098 }
2099
2100 /* Control never gets here */
2101 }
2102 #endif /* End of XCLASS */
2103
2104 /* Match a single character, casefully */
2105
2106 case OP_CHAR:
2107 #ifdef SUPPORT_UTF8
2108 if (utf8)
2109 {
2110 length = 1;
2111 ecode++;
2112 GETCHARLEN(fc, ecode, length);
2113 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2114 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2115 }
2116 else
2117 #endif
2118
2119 /* Non-UTF-8 mode */
2120 {
2121 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2122 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2123 ecode += 2;
2124 }
2125 break;
2126
2127 /* Match a single character, caselessly */
2128
2129 case OP_CHARNC:
2130 #ifdef SUPPORT_UTF8
2131 if (utf8)
2132 {
2133 length = 1;
2134 ecode++;
2135 GETCHARLEN(fc, ecode, length);
2136
2137 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2138
2139 /* If the pattern character's value is < 128, we have only one byte, and
2140 can use the fast lookup table. */
2141
2142 if (fc < 128)
2143 {
2144 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2145 }
2146
2147 /* Otherwise we must pick up the subject character */
2148
2149 else
2150 {
2151 unsigned int dc;
2152 GETCHARINC(dc, eptr);
2153 ecode += length;
2154
2155 /* If we have Unicode property support, we can use it to test the other
2156 case of the character, if there is one. */
2157
2158 if (fc != dc)
2159 {
2160 #ifdef SUPPORT_UCP
2161 if (dc != _pcre_ucp_othercase(fc))
2162 #endif
2163 RRETURN(MATCH_NOMATCH);
2164 }
2165 }
2166 }
2167 else
2168 #endif /* SUPPORT_UTF8 */
2169
2170 /* Non-UTF-8 mode */
2171 {
2172 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2173 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2174 ecode += 2;
2175 }
2176 break;
2177
2178 /* Match a single character repeatedly. */
2179
2180 case OP_EXACT:
2181 min = max = GET2(ecode, 1);
2182 ecode += 3;
2183 goto REPEATCHAR;
2184
2185 case OP_POSUPTO:
2186 possessive = TRUE;
2187 /* Fall through */
2188
2189 case OP_UPTO:
2190 case OP_MINUPTO:
2191 min = 0;
2192 max = GET2(ecode, 1);
2193 minimize = *ecode == OP_MINUPTO;
2194 ecode += 3;
2195 goto REPEATCHAR;
2196
2197 case OP_POSSTAR:
2198 possessive = TRUE;
2199 min = 0;
2200 max = INT_MAX;
2201 ecode++;
2202 goto REPEATCHAR;
2203
2204 case OP_POSPLUS:
2205 possessive = TRUE;
2206 min = 1;
2207 max = INT_MAX;
2208 ecode++;
2209 goto REPEATCHAR;
2210
2211 case OP_POSQUERY:
2212 possessive = TRUE;
2213 min = 0;
2214 max = 1;
2215 ecode++;
2216 goto REPEATCHAR;
2217
2218 case OP_STAR:
2219 case OP_MINSTAR:
2220 case OP_PLUS:
2221 case OP_MINPLUS:
2222 case OP_QUERY:
2223 case OP_MINQUERY:
2224 c = *ecode++ - OP_STAR;
2225 minimize = (c & 1) != 0;
2226 min = rep_min[c]; /* Pick up values from tables; */
2227 max = rep_max[c]; /* zero for max => infinity */
2228 if (max == 0) max = INT_MAX;
2229
2230 /* Common code for all repeated single-character matches. We can give
2231 up quickly if there are fewer than the minimum number of characters left in
2232 the subject. */
2233
2234 REPEATCHAR:
2235 #ifdef SUPPORT_UTF8
2236 if (utf8)
2237 {
2238 length = 1;
2239 charptr = ecode;
2240 GETCHARLEN(fc, ecode, length);
2241 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2242 ecode += length;
2243
2244 /* Handle multibyte character matching specially here. There is
2245 support for caseless matching if UCP support is present. */
2246
2247 if (length > 1)
2248 {
2249 #ifdef SUPPORT_UCP
2250 unsigned int othercase;
2251 if ((ims & PCRE_CASELESS) != 0 &&
2252 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2253 oclength = _pcre_ord2utf8(othercase, occhars);
2254 else oclength = 0;
2255 #endif /* SUPPORT_UCP */
2256
2257 for (i = 1; i <= min; i++)
2258 {
2259 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2260 #ifdef SUPPORT_UCP
2261 /* Need braces because of following else */
2262 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2263 else
2264 {
2265 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2266 eptr += oclength;
2267 }
2268 #else /* without SUPPORT_UCP */
2269 else { RRETURN(MATCH_NOMATCH); }
2270 #endif /* SUPPORT_UCP */
2271 }
2272
2273 if (min == max) continue;
2274
2275 if (minimize)
2276 {
2277 for (fi = min;; fi++)
2278 {
2279 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2280 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2281 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2282 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2283 #ifdef SUPPORT_UCP
2284 /* Need braces because of following else */
2285 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2286 else
2287 {
2288 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2289 eptr += oclength;
2290 }
2291 #else /* without SUPPORT_UCP */
2292 else { RRETURN (MATCH_NOMATCH); }
2293 #endif /* SUPPORT_UCP */
2294 }
2295 /* Control never gets here */
2296 }
2297
2298 else /* Maximize */
2299 {
2300 pp = eptr;
2301 for (i = min; i < max; i++)
2302 {
2303 if (eptr > md->end_subject - length) break;
2304 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2305 #ifdef SUPPORT_UCP
2306 else if (oclength == 0) break;
2307 else
2308 {
2309 if (memcmp(eptr, occhars, oclength) != 0) break;
2310 eptr += oclength;
2311 }
2312 #else /* without SUPPORT_UCP */
2313 else break;
2314 #endif /* SUPPORT_UCP */
2315 }
2316
2317 if (possessive) continue;
2318 for(;;)
2319 {
2320 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2321 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2322 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2323 #ifdef SUPPORT_UCP
2324 eptr--;
2325 BACKCHAR(eptr);
2326 #else /* without SUPPORT_UCP */
2327 eptr -= length;
2328 #endif /* SUPPORT_UCP */
2329 }
2330 }
2331 /* Control never gets here */
2332 }
2333
2334 /* If the length of a UTF-8 character is 1, we fall through here, and
2335 obey the code as for non-UTF-8 characters below, though in this case the
2336 value of fc will always be < 128. */
2337 }
2338 else
2339 #endif /* SUPPORT_UTF8 */
2340
2341 /* When not in UTF-8 mode, load a single-byte character. */
2342 {
2343 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2344 fc = *ecode++;
2345 }
2346
2347 /* The value of fc at this point is always less than 256, though we may or
2348 may not be in UTF-8 mode. The code is duplicated for the caseless and
2349 caseful cases, for speed, since matching characters is likely to be quite
2350 common. First, ensure the minimum number of matches are present. If min =
2351 max, continue at the same level without recursing. Otherwise, if
2352 minimizing, keep trying the rest of the expression and advancing one
2353 matching character if failing, up to the maximum. Alternatively, if
2354 maximizing, find the maximum number of characters and work backwards. */
2355
2356 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2357 max, eptr));
2358
2359 if ((ims & PCRE_CASELESS) != 0)
2360 {
2361 fc = md->lcc[fc];
2362 for (i = 1; i <= min; i++)
2363 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2364 if (min == max) continue;
2365 if (minimize)
2366 {
2367 for (fi = min;; fi++)
2368 {
2369 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2370 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2371 if (fi >= max || eptr >= md->end_subject ||
2372 fc != md->lcc[*eptr++])
2373 RRETURN(MATCH_NOMATCH);
2374 }
2375 /* Control never gets here */
2376 }
2377 else /* Maximize */
2378 {
2379 pp = eptr;
2380 for (i = min; i < max; i++)
2381 {
2382 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2383 eptr++;
2384 }
2385 if (possessive) continue;
2386 while (eptr >= pp)
2387 {
2388 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2389 eptr--;
2390 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2391 }
2392 RRETURN(MATCH_NOMATCH);
2393 }
2394 /* Control never gets here */
2395 }
2396
2397 /* Caseful comparisons (includes all multi-byte characters) */
2398
2399 else
2400 {
2401 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2402 if (min == max) continue;
2403 if (minimize)
2404 {
2405 for (fi = min;; fi++)
2406 {
2407 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2408 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2409 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2410 RRETURN(MATCH_NOMATCH);
2411 }
2412 /* Control never gets here */
2413 }
2414 else /* Maximize */
2415 {
2416 pp = eptr;
2417 for (i = min; i < max; i++)
2418 {
2419 if (eptr >= md->end_subject || fc != *eptr) break;
2420 eptr++;
2421 }
2422 if (possessive) continue;
2423 while (eptr >= pp)
2424 {
2425 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2426 eptr--;
2427 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2428 }
2429 RRETURN(MATCH_NOMATCH);
2430 }
2431 }
2432 /* Control never gets here */
2433
2434 /* Match a negated single one-byte character. The character we are
2435 checking can be multibyte. */
2436
2437 case OP_NOT:
2438 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2439 ecode++;
2440 GETCHARINCTEST(c, eptr);
2441 if ((ims & PCRE_CASELESS) != 0)
2442 {
2443 #ifdef SUPPORT_UTF8
2444 if (c < 256)
2445 #endif
2446 c = md->lcc[c];
2447 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2448 }
2449 else
2450 {
2451 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2452 }
2453 break;
2454
2455 /* Match a negated single one-byte character repeatedly. This is almost a
2456 repeat of the code for a repeated single character, but I haven't found a
2457 nice way of commoning these up that doesn't require a test of the
2458 positive/negative option for each character match. Maybe that wouldn't add
2459 very much to the time taken, but character matching *is* what this is all
2460 about... */
2461
2462 case OP_NOTEXACT:
2463 min = max = GET2(ecode, 1);
2464 ecode += 3;
2465 goto REPEATNOTCHAR;
2466
2467 case OP_NOTUPTO:
2468 case OP_NOTMINUPTO:
2469 min = 0;
2470 max = GET2(ecode, 1);
2471 minimize = *ecode == OP_NOTMINUPTO;
2472 ecode += 3;
2473 goto REPEATNOTCHAR;
2474
2475 case OP_NOTPOSSTAR:
2476 possessive = TRUE;
2477 min = 0;
2478 max = INT_MAX;
2479 ecode++;
2480 goto REPEATNOTCHAR;
2481
2482 case OP_NOTPOSPLUS:
2483 possessive = TRUE;
2484 min = 1;
2485 max = INT_MAX;
2486 ecode++;
2487 goto REPEATNOTCHAR;
2488
2489 case OP_NOTPOSQUERY:
2490 possessive = TRUE;
2491 min = 0;
2492 max = 1;
2493 ecode++;
2494 goto REPEATNOTCHAR;
2495
2496 case OP_NOTPOSUPTO:
2497 possessive = TRUE;
2498 min = 0;
2499 max = GET2(ecode, 1);
2500 ecode += 3;
2501 goto REPEATNOTCHAR;
2502
2503 case OP_NOTSTAR:
2504 case OP_NOTMINSTAR:
2505 case OP_NOTPLUS:
2506 case OP_NOTMINPLUS:
2507 case OP_NOTQUERY:
2508 case OP_NOTMINQUERY:
2509 c = *ecode++ - OP_NOTSTAR;
2510 minimize = (c & 1) != 0;
2511 min = rep_min[c]; /* Pick up values from tables; */
2512 max = rep_max[c]; /* zero for max => infinity */
2513 if (max == 0) max = INT_MAX;
2514
2515 /* Common code for all repeated single-byte matches. We can give up quickly
2516 if there are fewer than the minimum number of bytes left in the
2517 subject. */
2518
2519 REPEATNOTCHAR:
2520 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2521 fc = *ecode++;
2522
2523 /* The code is duplicated for the caseless and caseful cases, for speed,
2524 since matching characters is likely to be quite common. First, ensure the
2525 minimum number of matches are present. If min = max, continue at the same
2526 level without recursing. Otherwise, if minimizing, keep trying the rest of
2527 the expression and advancing one matching character if failing, up to the
2528 maximum. Alternatively, if maximizing, find the maximum number of
2529 characters and work backwards. */
2530
2531 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2532 max, eptr));
2533
2534 if ((ims & PCRE_CASELESS) != 0)
2535 {
2536 fc = md->lcc[fc];
2537
2538 #ifdef SUPPORT_UTF8
2539 /* UTF-8 mode */
2540 if (utf8)
2541 {
2542 register unsigned int d;
2543 for (i = 1; i <= min; i++)
2544 {
2545 GETCHARINC(d, eptr);
2546 if (d < 256) d = md->lcc[d];
2547 if (fc == d) RRETURN(MATCH_NOMATCH);
2548 }
2549 }
2550 else
2551 #endif
2552
2553 /* Not UTF-8 mode */
2554 {
2555 for (i = 1; i <= min; i++)
2556 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2557 }
2558
2559 if (min == max) continue;
2560
2561 if (minimize)
2562 {
2563 #ifdef SUPPORT_UTF8
2564 /* UTF-8 mode */
2565 if (utf8)
2566 {
2567 register unsigned int d;
2568 for (fi = min;; fi++)
2569 {
2570 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2571 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2572 GETCHARINC(d, eptr);
2573 if (d < 256) d = md->lcc[d];
2574 if (fi >= max || eptr >= md->end_subject || fc == d)
2575 RRETURN(MATCH_NOMATCH);
2576 }
2577 }
2578 else
2579 #endif
2580 /* Not UTF-8 mode */
2581 {
2582 for (fi = min;; fi++)
2583 {
2584 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2585 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2586 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2587 RRETURN(MATCH_NOMATCH);
2588 }
2589 }
2590 /* Control never gets here */
2591 }
2592
2593 /* Maximize case */
2594
2595 else
2596 {
2597 pp = eptr;
2598
2599 #ifdef SUPPORT_UTF8
2600 /* UTF-8 mode */
2601 if (utf8)
2602 {
2603 register unsigned int d;
2604 for (i = min; i < max; i++)
2605 {
2606 int len = 1;
2607 if (eptr >= md->end_subject) break;
2608 GETCHARLEN(d, eptr, len);
2609 if (d < 256) d = md->lcc[d];
2610 if (fc == d) break;
2611 eptr += len;
2612 }
2613 if (possessive) continue;
2614 for(;;)
2615 {
2616 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2617 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2618 if (eptr-- == pp) break; /* Stop if tried at original pos */
2619 BACKCHAR(eptr);
2620 }
2621 }
2622 else
2623 #endif
2624 /* Not UTF-8 mode */
2625 {
2626 for (i = min; i < max; i++)
2627 {
2628 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2629 eptr++;
2630 }
2631 if (possessive) continue;
2632 while (eptr >= pp)
2633 {
2634 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2635 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2636 eptr--;
2637 }
2638 }
2639
2640 RRETURN(MATCH_NOMATCH);
2641 }
2642 /* Control never gets here */
2643 }
2644
2645 /* Caseful comparisons */
2646
2647 else
2648 {
2649 #ifdef SUPPORT_UTF8
2650 /* UTF-8 mode */
2651 if (utf8)
2652 {
2653 register unsigned int d;
2654 for (i = 1; i <= min; i++)
2655 {
2656 GETCHARINC(d, eptr);
2657 if (fc == d) RRETURN(MATCH_NOMATCH);
2658 }
2659 }
2660 else
2661 #endif
2662 /* Not UTF-8 mode */
2663 {
2664 for (i = 1; i <= min; i++)
2665 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2666 }
2667
2668 if (min == max) continue;
2669
2670 if (minimize)
2671 {
2672 #ifdef SUPPORT_UTF8
2673 /* UTF-8 mode */
2674 if (utf8)
2675 {
2676 register unsigned int d;
2677 for (fi = min;; fi++)
2678 {
2679 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2680 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2681 GETCHARINC(d, eptr);
2682 if (fi >= max || eptr >= md->end_subject || fc == d)
2683 RRETURN(MATCH_NOMATCH);
2684 }
2685 }
2686 else
2687 #endif
2688 /* Not UTF-8 mode */
2689 {
2690 for (fi = min;; fi++)
2691 {
2692 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2693 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2694 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2695 RRETURN(MATCH_NOMATCH);
2696 }
2697 }
2698 /* Control never gets here */
2699 }
2700
2701 /* Maximize case */
2702
2703 else
2704 {
2705 pp = eptr;
2706
2707 #ifdef SUPPORT_UTF8
2708 /* UTF-8 mode */
2709 if (utf8)
2710 {
2711 register unsigned int d;
2712 for (i = min; i < max; i++)
2713 {
2714 int len = 1;
2715 if (eptr >= md->end_subject) break;
2716 GETCHARLEN(d, eptr, len);
2717 if (fc == d) break;
2718 eptr += len;
2719 }
2720 if (possessive) continue;
2721 for(;;)
2722 {
2723 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2724 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2725 if (eptr-- == pp) break; /* Stop if tried at original pos */
2726 BACKCHAR(eptr);
2727 }
2728 }
2729 else
2730 #endif
2731 /* Not UTF-8 mode */
2732 {
2733 for (i = min; i < max; i++)
2734 {
2735 if (eptr >= md->end_subject || fc == *eptr) break;
2736 eptr++;
2737 }
2738 if (possessive) continue;
2739 while (eptr >= pp)
2740 {
2741 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2742 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2743 eptr--;
2744 }
2745 }
2746
2747 RRETURN(MATCH_NOMATCH);
2748 }
2749 }
2750 /* Control never gets here */
2751
2752 /* Match a single character type repeatedly; several different opcodes
2753 share code. This is very similar to the code for single characters, but we
2754 repeat it in the interests of efficiency. */
2755
2756 case OP_TYPEEXACT:
2757 min = max = GET2(ecode, 1);
2758 minimize = TRUE;
2759 ecode += 3;
2760 goto REPEATTYPE;
2761
2762 case OP_TYPEUPTO:
2763 case OP_TYPEMINUPTO:
2764 min = 0;
2765 max = GET2(ecode, 1);
2766 minimize = *ecode == OP_TYPEMINUPTO;
2767 ecode += 3;
2768 goto REPEATTYPE;
2769
2770 case OP_TYPEPOSSTAR:
2771 possessive = TRUE;
2772 min = 0;
2773 max = INT_MAX;
2774 ecode++;
2775 goto REPEATTYPE;
2776
2777 case OP_TYPEPOSPLUS:
2778 possessive = TRUE;
2779 min = 1;
2780 max = INT_MAX;
2781 ecode++;
2782 goto REPEATTYPE;
2783
2784 case OP_TYPEPOSQUERY:
2785 possessive = TRUE;
2786 min = 0;
2787 max = 1;
2788 ecode++;
2789 goto REPEATTYPE;
2790
2791 case OP_TYPEPOSUPTO:
2792 possessive = TRUE;
2793 min = 0;
2794 max = GET2(ecode, 1);
2795 ecode += 3;
2796 goto REPEATTYPE;
2797
2798 case OP_TYPESTAR:
2799 case OP_TYPEMINSTAR:
2800 case OP_TYPEPLUS:
2801 case OP_TYPEMINPLUS:
2802 case OP_TYPEQUERY:
2803 case OP_TYPEMINQUERY:
2804 c = *ecode++ - OP_TYPESTAR;
2805 minimize = (c & 1) != 0;
2806 min = rep_min[c]; /* Pick up values from tables; */
2807 max = rep_max[c]; /* zero for max => infinity */
2808 if (max == 0) max = INT_MAX;
2809
2810 /* Common code for all repeated single character type matches. Note that
2811 in UTF-8 mode, '.' matches a character of any length, but for the other
2812 character types, the valid characters are all one-byte long. */
2813
2814 REPEATTYPE:
2815 ctype = *ecode++; /* Code for the character type */
2816
2817 #ifdef SUPPORT_UCP
2818 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2819 {
2820 prop_fail_result = ctype == OP_NOTPROP;
2821 prop_type = *ecode++;
2822 prop_value = *ecode++;
2823 }
2824 else prop_type = -1;
2825 #endif
2826
2827 /* First, ensure the minimum number of matches are present. Use inline
2828 code for maximizing the speed, and do the type test once at the start
2829 (i.e. keep it out of the loop). Also we can test that there are at least
2830 the minimum number of bytes before we start. This isn't as effective in
2831 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2832 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2833 and single-bytes. */
2834
2835 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2836 if (min > 0)
2837 {
2838 #ifdef SUPPORT_UCP
2839 if (prop_type >= 0)
2840 {
2841 switch(prop_type)
2842 {
2843 case PT_ANY:
2844 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2845 for (i = 1; i <= min; i++)
2846 {
2847 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2848 GETCHARINCTEST(c, eptr);
2849 }
2850 break;
2851
2852 case PT_LAMP:
2853 for (i = 1; i <= min; i++)
2854 {
2855 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2856 GETCHARINCTEST(c, eptr);
2857 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2858 if ((prop_chartype == ucp_Lu ||
2859 prop_chartype == ucp_Ll ||
2860 prop_chartype == ucp_Lt) == prop_fail_result)
2861 RRETURN(MATCH_NOMATCH);
2862 }
2863 break;
2864
2865 case PT_GC:
2866 for (i = 1; i <= min; i++)
2867 {
2868 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2869 GETCHARINCTEST(c, eptr);
2870 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2871 if ((prop_category == prop_value) == prop_fail_result)
2872 RRETURN(MATCH_NOMATCH);
2873 }
2874 break;
2875
2876 case PT_PC:
2877 for (i = 1; i <= min; i++)
2878 {
2879 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2880 GETCHARINCTEST(c, eptr);
2881 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2882 if ((prop_chartype == prop_value) == prop_fail_result)
2883 RRETURN(MATCH_NOMATCH);
2884 }
2885 break;
2886
2887 case PT_SC:
2888 for (i = 1; i <= min; i++)
2889 {
2890 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2891 GETCHARINCTEST(c, eptr);
2892 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2893 if ((prop_script == prop_value) == prop_fail_result)
2894 RRETURN(MATCH_NOMATCH);
2895 }
2896 break;
2897
2898 default:
2899 RRETURN(PCRE_ERROR_INTERNAL);
2900 }
2901 }
2902
2903 /* Match extended Unicode sequences. We will get here only if the
2904 support is in the binary; otherwise a compile-time error occurs. */
2905
2906 else if (ctype == OP_EXTUNI)
2907 {
2908 for (i = 1; i <= min; i++)
2909 {
2910 GETCHARINCTEST(c, eptr);
2911 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2912 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2913 while (eptr < md->end_subject)
2914 {
2915 int len = 1;
2916 if (!utf8) c = *eptr; else
2917 {
2918 GETCHARLEN(c, eptr, len);
2919 }
2920 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2921 if (prop_category != ucp_M) break;
2922 eptr += len;
2923 }
2924 }
2925 }
2926
2927 else
2928 #endif /* SUPPORT_UCP */
2929
2930 /* Handle all other cases when the coding is UTF-8 */
2931
2932 #ifdef SUPPORT_UTF8
2933 if (utf8) switch(ctype)
2934 {
2935 case OP_ANY:
2936 for (i = 1; i <= min; i++)
2937 {
2938 if (eptr >= md->end_subject ||
2939 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2940 RRETURN(MATCH_NOMATCH);
2941 eptr++;
2942 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2943 }
2944 break;
2945
2946 case OP_ANYBYTE:
2947 eptr += min;
2948 break;
2949
2950 case OP_ANYNL:
2951 for (i = 1; i <= min; i++)
2952 {
2953 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2954 GETCHARINC(c, eptr);
2955 switch(c)
2956 {
2957 default: RRETURN(MATCH_NOMATCH);
2958 case 0x000d:
2959 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2960 break;
2961
2962 case 0x000a:
2963 break;
2964
2965 case 0x000b:
2966 case 0x000c:
2967 case 0x0085:
2968 case 0x2028:
2969 case 0x2029:
2970 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2971 break;
2972 }
2973 }
2974 break;
2975
2976 case OP_NOT_HSPACE:
2977 for (i = 1; i <= min; i++)
2978 {
2979 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2980 GETCHARINC(c, eptr);
2981 switch(c)
2982 {
2983 default: break;
2984 case 0x09: /* HT */
2985 case 0x20: /* SPACE */
2986 case 0xa0: /* NBSP */
2987 case 0x1680: /* OGHAM SPACE MARK */
2988 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2989 case 0x2000: /* EN QUAD */
2990 case 0x2001: /* EM QUAD */
2991 case 0x2002: /* EN SPACE */
2992 case 0x2003: /* EM SPACE */
2993 case 0x2004: /* THREE-PER-EM SPACE */
2994 case 0x2005: /* FOUR-PER-EM SPACE */
2995 case 0x2006: /* SIX-PER-EM SPACE */
2996 case 0x2007: /* FIGURE SPACE */
2997 case 0x2008: /* PUNCTUATION SPACE */
2998 case 0x2009: /* THIN SPACE */
2999 case 0x200A: /* HAIR SPACE */
3000 case 0x202f: /* NARROW NO-BREAK SPACE */
3001 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3002 case 0x3000: /* IDEOGRAPHIC SPACE */
3003 RRETURN(MATCH_NOMATCH);
3004 }
3005 }
3006 break;
3007
3008 case OP_HSPACE:
3009 for (i = 1; i <= min; i++)
3010 {
3011 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3012 GETCHARINC(c, eptr);
3013 switch(c)
3014 {
3015 default: RRETURN(MATCH_NOMATCH);
3016 case 0x09: /* HT */
3017 case 0x20: /* SPACE */
3018 case 0xa0: /* NBSP */
3019 case 0x1680: /* OGHAM SPACE MARK */
3020 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3021 case 0x2000: /* EN QUAD */
3022 case 0x2001: /* EM QUAD */
3023 case 0x2002: /* EN SPACE */
3024 case 0x2003: /* EM SPACE */
3025 case 0x2004: /* THREE-PER-EM SPACE */
3026 case 0x2005: /* FOUR-PER-EM SPACE */
3027 case 0x2006: /* SIX-PER-EM SPACE */
3028 case 0x2007: /* FIGURE SPACE */
3029 case 0x2008: /* PUNCTUATION SPACE */
3030 case 0x2009: /* THIN SPACE */
3031 case 0x200A: /* HAIR SPACE */
3032 case 0x202f: /* NARROW NO-BREAK SPACE */
3033 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3034 case 0x3000: /* IDEOGRAPHIC SPACE */
3035 break;
3036 }
3037 }
3038 break;
3039
3040 case OP_NOT_VSPACE:
3041 for (i = 1; i <= min; i++)
3042 {
3043 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3044 GETCHARINC(c, eptr);
3045 switch(c)
3046 {
3047 default: break;
3048 case 0x0a: /* LF */
3049 case 0x0b: /* VT */
3050 case 0x0c: /* FF */
3051 case 0x0d: /* CR */
3052 case 0x85: /* NEL */
3053 case 0x2028: /* LINE SEPARATOR */
3054 case 0x2029: /* PARAGRAPH SEPARATOR */
3055 RRETURN(MATCH_NOMATCH);
3056 }
3057 }
3058 break;
3059
3060 case OP_VSPACE:
3061 for (i = 1; i <= min; i++)
3062 {
3063 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3064 GETCHARINC(c, eptr);
3065 switch(c)
3066 {
3067 default: RRETURN(MATCH_NOMATCH);
3068 case 0x0a: /* LF */
3069 case 0x0b: /* VT */
3070 case 0x0c: /* FF */
3071 case 0x0d: /* CR */
3072 case 0x85: /* NEL */
3073 case 0x2028: /* LINE SEPARATOR */
3074 case 0x2029: /* PARAGRAPH SEPARATOR */
3075 break;
3076 }
3077 }
3078 break;
3079
3080 case OP_NOT_DIGIT:
3081 for (i = 1; i <= min; i++)
3082 {
3083 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3084 GETCHARINC(c, eptr);
3085 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3086 RRETURN(MATCH_NOMATCH);
3087 }
3088 break;
3089
3090 case OP_DIGIT:
3091 for (i = 1; i <= min; i++)
3092 {
3093 if (eptr >= md->end_subject ||
3094 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3095 RRETURN(MATCH_NOMATCH);
3096 /* No need to skip more bytes - we know it's a 1-byte character */
3097 }
3098 break;
3099
3100 case OP_NOT_WHITESPACE:
3101 for (i = 1; i <= min; i++)
3102 {
3103 if (eptr >= md->end_subject ||
3104 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3105 RRETURN(MATCH_NOMATCH);
3106 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3107 }
3108 break;
3109
3110 case OP_WHITESPACE:
3111 for (i = 1; i <= min; i++)
3112 {
3113 if (eptr >= md->end_subject ||
3114 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3115 RRETURN(MATCH_NOMATCH);
3116 /* No need to skip more bytes - we know it's a 1-byte character */
3117 }
3118 break;
3119
3120 case OP_NOT_WORDCHAR:
3121 for (i = 1; i <= min; i++)
3122 {
3123 if (eptr >= md->end_subject ||
3124 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3125 RRETURN(MATCH_NOMATCH);
3126 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3127 }
3128 break;
3129
3130 case OP_WORDCHAR:
3131 for (i = 1; i <= min; i++)
3132 {
3133 if (eptr >= md->end_subject ||
3134 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3135 RRETURN(MATCH_NOMATCH);
3136 /* No need to skip more bytes - we know it's a 1-byte character */
3137 }
3138 break;
3139
3140 default:
3141 RRETURN(PCRE_ERROR_INTERNAL);
3142 } /* End switch(ctype) */
3143
3144 else
3145 #endif /* SUPPORT_UTF8 */
3146
3147 /* Code for the non-UTF-8 case for minimum matching of operators other
3148 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3149 number of bytes present, as this was tested above. */
3150
3151 switch(ctype)
3152 {
3153 case OP_ANY:
3154 if ((ims & PCRE_DOTALL) == 0)
3155 {
3156 for (i = 1; i <= min; i++)
3157 {
3158 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3159 eptr++;
3160 }
3161 }
3162 else eptr += min;
3163 break;
3164
3165 case OP_ANYBYTE:
3166 eptr += min;
3167 break;
3168
3169 /* Because of the CRLF case, we can't assume the minimum number of
3170 bytes are present in this case. */
3171
3172 case OP_ANYNL:
3173 for (i = 1; i <= min; i++)
3174 {
3175 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3176 switch(*eptr++)
3177 {
3178 default: RRETURN(MATCH_NOMATCH);
3179 case 0x000d:
3180 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3181 break;
3182 case 0x000a:
3183 break;
3184
3185 case 0x000b:
3186 case 0x000c:
3187 case 0x0085:
3188 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3189 break;
3190 }
3191 }
3192 break;
3193
3194 case OP_NOT_HSPACE:
3195 for (i = 1; i <= min; i++)
3196 {
3197 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3198 switch(*eptr++)
3199 {
3200 default: break;
3201 case 0x09: /* HT */
3202 case 0x20: /* SPACE */
3203 case 0xa0: /* NBSP */
3204 RRETURN(MATCH_NOMATCH);
3205 }
3206 }
3207 break;
3208
3209 case OP_HSPACE:
3210 for (i = 1; i <= min; i++)
3211 {
3212 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3213 switch(*eptr++)
3214 {
3215 default: RRETURN(MATCH_NOMATCH);
3216 case 0x09: /* HT */
3217 case 0x20: /* SPACE */
3218 case 0xa0: /* NBSP */
3219 break;
3220 }
3221 }
3222 break;
3223
3224 case OP_NOT_VSPACE:
3225 for (i = 1; i <= min; i++)
3226 {
3227 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3228 switch(*eptr++)
3229 {
3230 default: break;
3231 case 0x0a: /* LF */
3232 case 0x0b: /* VT */
3233 case 0x0c: /* FF */
3234 case 0x0d: /* CR */
3235 case 0x85: /* NEL */
3236 RRETURN(MATCH_NOMATCH);
3237 }
3238 }
3239 break;
3240
3241 case OP_VSPACE:
3242 for (i = 1; i <= min; i++)
3243 {
3244 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3245 switch(*eptr++)
3246 {
3247 default: RRETURN(MATCH_NOMATCH);
3248 case 0x0a: /* LF */
3249 case 0x0b: /* VT */
3250 case 0x0c: /* FF */
3251 case 0x0d: /* CR */
3252 case 0x85: /* NEL */
3253 break;
3254 }
3255 }
3256 break;
3257
3258 case OP_NOT_DIGIT:
3259 for (i = 1; i <= min; i++)
3260 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3261 break;
3262
3263 case OP_DIGIT:
3264 for (i = 1; i <= min; i++)
3265 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3266 break;
3267
3268 case OP_NOT_WHITESPACE:
3269 for (i = 1; i <= min; i++)
3270 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3271 break;
3272
3273 case OP_WHITESPACE:
3274 for (i = 1; i <= min; i++)
3275 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3276 break;
3277
3278 case OP_NOT_WORDCHAR:
3279 for (i = 1; i <= min; i++)
3280 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3281 RRETURN(MATCH_NOMATCH);
3282 break;
3283
3284 case OP_WORDCHAR:
3285 for (i = 1; i <= min; i++)
3286 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3287 RRETURN(MATCH_NOMATCH);
3288 break;
3289
3290 default:
3291 RRETURN(PCRE_ERROR_INTERNAL);
3292 }
3293 }
3294
3295 /* If min = max, continue at the same level without recursing */
3296
3297 if (min == max) continue;
3298
3299 /* If minimizing, we have to test the rest of the pattern before each
3300 subsequent match. Again, separate the UTF-8 case for speed, and also
3301 separate the UCP cases. */
3302
3303 if (minimize)
3304 {
3305 #ifdef SUPPORT_UCP
3306 if (prop_type >= 0)
3307 {
3308 switch(prop_type)
3309 {
3310 case PT_ANY:
3311 for (fi = min;; fi++)
3312 {
3313 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3314 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3315 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3316 GETCHARINC(c, eptr);
3317 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3318 }
3319 /* Control never gets here */
3320
3321 case PT_LAMP:
3322 for (fi = min;; fi++)
3323 {
3324 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3325 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3326 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3327 GETCHARINC(c, eptr);
3328 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3329 if ((prop_chartype == ucp_Lu ||
3330 prop_chartype == ucp_Ll ||
3331 prop_chartype == ucp_Lt) == prop_fail_result)
3332 RRETURN(MATCH_NOMATCH);
3333 }
3334 /* Control never gets here */
3335
3336 case PT_GC:
3337 for (fi = min;; fi++)
3338 {
3339 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3340 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3341 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3342 GETCHARINC(c, eptr);
3343 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3344 if ((prop_category == prop_value) == prop_fail_result)
3345 RRETURN(MATCH_NOMATCH);
3346 }
3347 /* Control never gets here */
3348
3349 case PT_PC:
3350 for (fi = min;; fi++)
3351 {
3352 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3353 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3354 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3355 GETCHARINC(c, eptr);
3356 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3357 if ((prop_chartype == prop_value) == prop_fail_result)
3358 RRETURN(MATCH_NOMATCH);
3359 }
3360 /* Control never gets here */
3361
3362 case PT_SC:
3363 for (fi = min;; fi++)
3364 {
3365 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3366 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3367 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3368 GETCHARINC(c, eptr);
3369 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3370 if ((prop_script == prop_value) == prop_fail_result)
3371 RRETURN(MATCH_NOMATCH);
3372 }
3373 /* Control never gets here */
3374
3375 default:
3376 RRETURN(PCRE_ERROR_INTERNAL);
3377 }
3378 }
3379
3380 /* Match extended Unicode sequences. We will get here only if the
3381 support is in the binary; otherwise a compile-time error occurs. */
3382
3383 else if (ctype == OP_EXTUNI)
3384 {
3385 for (fi = min;; fi++)
3386 {
3387 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3388 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3389 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3390 GETCHARINCTEST(c, eptr);
3391 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3392 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3393 while (eptr < md->end_subject)
3394 {
3395 int len = 1;
3396 if (!utf8) c = *eptr; else
3397 {
3398 GETCHARLEN(c, eptr, len);
3399 }
3400 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3401 if (prop_category != ucp_M) break;
3402 eptr += len;
3403 }
3404 }
3405 }
3406
3407 else
3408 #endif /* SUPPORT_UCP */
3409
3410 #ifdef SUPPORT_UTF8
3411 /* UTF-8 mode */
3412 if (utf8)
3413 {
3414 for (fi = min;; fi++)
3415 {
3416 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3417 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3418 if (fi >= max || eptr >= md->end_subject ||
3419 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3420 IS_NEWLINE(eptr)))
3421 RRETURN(MATCH_NOMATCH);
3422
3423 GETCHARINC(c, eptr);
3424 switch(ctype)
3425 {
3426 case OP_ANY: /* This is the DOTALL case */
3427 break;
3428
3429 case OP_ANYBYTE:
3430 break;
3431
3432 case OP_ANYNL:
3433 switch(c)
3434 {
3435 default: RRETURN(MATCH_NOMATCH);
3436 case 0x000d:
3437 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3438 break;
3439 case 0x000a:
3440 break;
3441
3442 case 0x000b:
3443 case 0x000c:
3444 case 0x0085:
3445 case 0x2028:
3446 case 0x2029:
3447 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3448 break;
3449 }
3450 break;
3451
3452 case OP_NOT_HSPACE:
3453 switch(c)
3454 {
3455 default: break;
3456 case 0x09: /* HT */
3457 case 0x20: /* SPACE */
3458 case 0xa0: /* NBSP */
3459 case 0x1680: /* OGHAM SPACE MARK */
3460 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3461 case 0x2000: /* EN QUAD */
3462 case 0x2001: /* EM QUAD */
3463 case 0x2002: /* EN SPACE */
3464 case 0x2003: /* EM SPACE */
3465 case 0x2004: /* THREE-PER-EM SPACE */
3466 case 0x2005: /* FOUR-PER-EM SPACE */
3467 case 0x2006: /* SIX-PER-EM SPACE */
3468 case 0x2007: /* FIGURE SPACE */
3469 case 0x2008: /* PUNCTUATION SPACE */
3470 case 0x2009: /* THIN SPACE */
3471 case 0x200A: /* HAIR SPACE */
3472 case 0x202f: /* NARROW NO-BREAK SPACE */
3473 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3474 case 0x3000: /* IDEOGRAPHIC SPACE */
3475 RRETURN(MATCH_NOMATCH);
3476 }
3477 break;
3478
3479 case OP_HSPACE:
3480 switch(c)
3481 {
3482 default: RRETURN(MATCH_NOMATCH);
3483 case 0x09: /* HT */
3484 case 0x20: /* SPACE */
3485 case 0xa0: /* NBSP */
3486 case 0x1680: /* OGHAM SPACE MARK */
3487 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3488 case 0x2000: /* EN QUAD */
3489 case 0x2001: /* EM QUAD */
3490 case 0x2002: /* EN SPACE */
3491 case 0x2003: /* EM SPACE */
3492 case 0x2004: /* THREE-PER-EM SPACE */
3493 case 0x2005: /* FOUR-PER-EM SPACE */
3494 case 0x2006: /* SIX-PER-EM SPACE */
3495 case 0x2007: /* FIGURE SPACE */
3496 case 0x2008: /* PUNCTUATION SPACE */
3497 case 0x2009: /* THIN SPACE */
3498 case 0x200A: /* HAIR SPACE */
3499 case 0x202f: /* NARROW NO-BREAK SPACE */
3500 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3501 case 0x3000: /* IDEOGRAPHIC SPACE */
3502 break;
3503 }
3504 break;
3505
3506 case OP_NOT_VSPACE:
3507 switch(c)
3508 {
3509 default: break;
3510 case 0x0a: /* LF */
3511 case 0x0b: /* VT */
3512 case 0x0c: /* FF */
3513 case 0x0d: /* CR */
3514 case 0x85: /* NEL */
3515 case 0x2028: /* LINE SEPARATOR */
3516 case 0x2029: /* PARAGRAPH SEPARATOR */
3517 RRETURN(MATCH_NOMATCH);
3518 }
3519 break;
3520
3521 case OP_VSPACE:
3522 switch(c)
3523 {
3524 default: RRETURN(MATCH_NOMATCH);
3525 case 0x0a: /* LF */
3526 case 0x0b: /* VT */
3527 case 0x0c: /* FF */
3528 case 0x0d: /* CR */
3529 case 0x85: /* NEL */
3530 case 0x2028: /* LINE SEPARATOR */
3531 case 0x2029: /* PARAGRAPH SEPARATOR */
3532 break;
3533 }
3534 break;
3535
3536 case OP_NOT_DIGIT:
3537 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3538 RRETURN(MATCH_NOMATCH);
3539 break;
3540
3541 case OP_DIGIT:
3542 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3543 RRETURN(MATCH_NOMATCH);
3544 break;
3545
3546 case OP_NOT_WHITESPACE:
3547 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3548 RRETURN(MATCH_NOMATCH);
3549 break;
3550
3551 case OP_WHITESPACE:
3552 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3553 RRETURN(MATCH_NOMATCH);
3554 break;
3555
3556 case OP_NOT_WORDCHAR:
3557 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3558 RRETURN(MATCH_NOMATCH);
3559 break;
3560
3561 case OP_WORDCHAR:
3562 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3563 RRETURN(MATCH_NOMATCH);
3564 break;
3565
3566 default:
3567 RRETURN(PCRE_ERROR_INTERNAL);
3568 }
3569 }
3570 }
3571 else
3572 #endif
3573 /* Not UTF-8 mode */
3574 {
3575 for (fi = min;; fi++)
3576 {
3577 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3578 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3579 if (fi >= max || eptr >= md->end_subject ||
3580 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3581 RRETURN(MATCH_NOMATCH);
3582
3583 c = *eptr++;
3584 switch(ctype)
3585 {
3586 case OP_ANY: /* This is the DOTALL case */
3587 break;
3588
3589 case OP_ANYBYTE:
3590 break;
3591
3592 case OP_ANYNL:
3593 switch(c)
3594 {
3595 default: RRETURN(MATCH_NOMATCH);
3596 case 0x000d:
3597 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3598 break;
3599
3600 case 0x000a:
3601 break;
3602
3603 case 0x000b:
3604 case 0x000c:
3605 case 0x0085:
3606 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3607 break;
3608 }
3609 break;
3610
3611 case OP_NOT_HSPACE:
3612 switch(c)
3613 {
3614 default: break;
3615 case 0x09: /* HT */
3616 case 0x20: /* SPACE */
3617 case 0xa0: /* NBSP */
3618 RRETURN(MATCH_NOMATCH);
3619 }
3620 break;
3621
3622 case OP_HSPACE:
3623 switch(c)
3624 {
3625 default: RRETURN(MATCH_NOMATCH);
3626 case 0x09: /* HT */
3627 case 0x20: /* SPACE */
3628 case 0xa0: /* NBSP */
3629 break;
3630 }
3631 break;
3632
3633 case OP_NOT_VSPACE:
3634 switch(c)
3635 {
3636 default: break;
3637 case 0x0a: /* LF */
3638 case 0x0b: /* VT */
3639 case 0x0c: /* FF */
3640 case 0x0d: /* CR */
3641 case 0x85: /* NEL */
3642 RRETURN(MATCH_NOMATCH);
3643 }
3644 break;
3645
3646 case OP_VSPACE:
3647 switch(c)
3648 {
3649 default: RRETURN(MATCH_NOMATCH);
3650 case 0x0a: /* LF */
3651 case 0x0b: /* VT */
3652 case 0x0c: /* FF */
3653 case 0x0d: /* CR */
3654 case 0x85: /* NEL */
3655 break;
3656 }
3657 break;
3658
3659 case OP_NOT_DIGIT:
3660 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3661 break;
3662
3663 case OP_DIGIT:
3664 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3665 break;
3666
3667 case OP_NOT_WHITESPACE:
3668 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3669 break;
3670
3671 case OP_WHITESPACE:
3672 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3673 break;
3674
3675 case OP_NOT_WORDCHAR:
3676 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3677 break;
3678
3679 case OP_WORDCHAR:
3680 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3681 break;
3682
3683 default:
3684 RRETURN(PCRE_ERROR_INTERNAL);
3685 }
3686 }
3687 }
3688 /* Control never gets here */
3689 }
3690
3691 /* If maximizing, it is worth using inline code for speed, doing the type
3692 test once at the start (i.e. keep it out of the loop). Again, keep the
3693 UTF-8 and UCP stuff separate. */
3694
3695 else
3696 {
3697 pp = eptr; /* Remember where we started */
3698
3699 #ifdef SUPPORT_UCP
3700 if (prop_type >= 0)
3701 {
3702 switch(prop_type)
3703 {
3704 case PT_ANY:
3705 for (i = min; i < max; i++)
3706 {
3707 int len = 1;
3708 if (eptr >= md->end_subject) break;
3709 GETCHARLEN(c, eptr, len);
3710 if (prop_fail_result) break;
3711 eptr+= len;
3712 }
3713 break;
3714
3715 case PT_LAMP:
3716 for (i = min; i < max; i++)
3717 {
3718 int len = 1;
3719 if (eptr >= md->end_subject) break;
3720 GETCHARLEN(c, eptr, len);
3721 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3722 if ((prop_chartype == ucp_Lu ||
3723 prop_chartype == ucp_Ll ||
3724 prop_chartype == ucp_Lt) == prop_fail_result)
3725 break;
3726 eptr+= len;
3727 }
3728 break;
3729
3730 case PT_GC:
3731 for (i = min; i < max; i++)
3732 {
3733 int len = 1;
3734 if (eptr >= md->end_subject) break;
3735 GETCHARLEN(c, eptr, len);
3736 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3737 if ((prop_category == prop_value) == prop_fail_result)
3738 break;
3739 eptr+= len;
3740 }
3741 break;
3742
3743 case PT_PC:
3744 for (i = min; i < max; i++)
3745 {
3746 int len = 1;
3747 if (eptr >= md->end_subject) break;
3748 GETCHARLEN(c, eptr, len);
3749 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3750 if ((prop_chartype == prop_value) == prop_fail_result)
3751 break;
3752 eptr+= len;
3753 }
3754 break;
3755
3756 case PT_SC:
3757 for (i = min; i < max; i++)
3758 {
3759 int len = 1;
3760 if (eptr >= md->end_subject) break;
3761 GETCHARLEN(c, eptr, len);
3762 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3763 if ((prop_script == prop_value) == prop_fail_result)
3764 break;
3765 eptr+= len;
3766 }
3767 break;
3768 }
3769
3770 /* eptr is now past the end of the maximum run */
3771
3772 if (possessive) continue;
3773 for(;;)
3774 {
3775 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3776 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3777 if (eptr-- == pp) break; /* Stop if tried at original pos */
3778 if (utf8) BACKCHAR(eptr);
3779 }
3780 }
3781
3782 /* Match extended Unicode sequences. We will get here only if the
3783 support is in the binary; otherwise a compile-time error occurs. */
3784
3785 else if (ctype == OP_EXTUNI)
3786 {
3787 for (i = min; i < max; i++)
3788 {
3789 if (eptr >= md->end_subject) break;
3790 GETCHARINCTEST(c, eptr);
3791 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3792 if (prop_category == ucp_M) break;
3793 while (eptr < md->end_subject)
3794 {
3795 int len = 1;
3796 if (!utf8) c = *eptr; else
3797 {
3798 GETCHARLEN(c, eptr, len);
3799 }
3800 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3801 if (prop_category != ucp_M) break;
3802 eptr += len;
3803 }
3804 }
3805
3806 /* eptr is now past the end of the maximum run */
3807
3808 if (possessive) continue;
3809 for(;;)
3810 {
3811 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3812 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3813 if (eptr-- == pp) break; /* Stop if tried at original pos */
3814 for (;;) /* Move back over one extended */
3815 {
3816 int len = 1;
3817 if (!utf8) c = *eptr; else
3818 {
3819 BACKCHAR(eptr);
3820 GETCHARLEN(c, eptr, len);
3821 }
3822 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3823 if (prop_category != ucp_M) break;
3824 eptr--;
3825 }
3826 }
3827 }
3828
3829 else
3830 #endif /* SUPPORT_UCP */
3831
3832 #ifdef SUPPORT_UTF8
3833 /* UTF-8 mode */
3834
3835 if (utf8)
3836 {
3837 switch(ctype)
3838 {
3839 case OP_ANY:
3840 if (max < INT_MAX)
3841 {
3842 if ((ims & PCRE_DOTALL) == 0)
3843 {
3844 for (i = min; i < max; i++)
3845 {
3846 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3847 eptr++;
3848 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3849 }
3850 }
3851 else
3852 {
3853 for (i = min; i < max; i++)
3854 {
3855 if (eptr >= md->end_subject) break;
3856 eptr++;
3857 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3858 }
3859 }
3860 }
3861
3862 /* Handle unlimited UTF-8 repeat */
3863
3864 else
3865 {
3866 if ((ims & PCRE_DOTALL) == 0)
3867 {
3868 for (i = min; i < max; i++)
3869 {
3870 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3871 eptr++;
3872 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3873 }
3874 }
3875 else
3876 {
3877 eptr = md->end_subject;
3878 }
3879 }
3880 break;
3881
3882 /* The byte case is the same as non-UTF8 */
3883
3884 case OP_ANYBYTE:
3885 c = max - min;
3886 if (c > (unsigned int)(md->end_subject - eptr))
3887 c = md->end_subject - eptr;
3888 eptr += c;
3889 break;
3890
3891 case OP_ANYNL:
3892 for (i = min; i < max; i++)
3893 {
3894 int len = 1;
3895 if (eptr >= md->end_subject) break;
3896 GETCHARLEN(c, eptr, len);
3897 if (c == 0x000d)
3898 {
3899 if (++eptr >= md->end_subject) break;
3900 if (*eptr == 0x000a) eptr++;
3901 }
3902 else
3903 {
3904 if (c != 0x000a &&
3905 (md->bsr_anycrlf ||
3906 (c != 0x000b && c != 0x000c &&
3907 c != 0x0085 && c != 0x2028 && c != 0x2029)))
3908 break;
3909 eptr += len;
3910 }
3911 }
3912 break;
3913
3914 case OP_NOT_HSPACE:
3915 case OP_HSPACE:
3916 for (i = min; i < max; i++)
3917 {
3918 BOOL gotspace;
3919 int len = 1;
3920 if (eptr >= md->end_subject) break;
3921 GETCHARLEN(c, eptr, len);
3922 switch(c)
3923 {
3924 default: gotspace = FALSE; break;
3925 case 0x09: /* HT */
3926 case 0x20: /* SPACE */
3927 case 0xa0: /* NBSP */
3928 case 0x1680: /* OGHAM SPACE MARK */
3929 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3930 case 0x2000: /* EN QUAD */
3931 case 0x2001: /* EM QUAD */
3932 case 0x2002: /* EN SPACE */
3933 case 0x2003: /* EM SPACE */
3934 case 0x2004: /* THREE-PER-EM SPACE */
3935 case 0x2005: /* FOUR-PER-EM SPACE */
3936 case 0x2006: /* SIX-PER-EM SPACE */
3937 case 0x2007: /* FIGURE SPACE */
3938 case 0x2008: /* PUNCTUATION SPACE */
3939 case 0x2009: /* THIN SPACE */
3940 case 0x200A: /* HAIR SPACE */
3941 case 0x202f: /* NARROW NO-BREAK SPACE */
3942 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3943 case 0x3000: /* IDEOGRAPHIC SPACE */
3944 gotspace = TRUE;
3945 break;
3946 }
3947 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3948 eptr += len;
3949 }
3950 break;
3951
3952 case OP_NOT_VSPACE:
3953 case OP_VSPACE:
3954 for (i = min; i < max; i++)
3955 {
3956 BOOL gotspace;
3957 int len = 1;
3958 if (eptr >= md->end_subject) break;
3959 GETCHARLEN(c, eptr, len);
3960 switch(c)
3961 {
3962 default: gotspace = FALSE; break;
3963 case 0x0a: /* LF */
3964 case 0x0b: /* VT */
3965 case 0x0c: /* FF */
3966 case 0x0d: /* CR */
3967 case 0x85: /* NEL */
3968 case 0x2028: /* LINE SEPARATOR */
3969 case 0x2029: /* PARAGRAPH SEPARATOR */
3970 gotspace = TRUE;
3971 break;
3972 }
3973 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3974 eptr += len;
3975 }
3976 break;
3977
3978 case OP_NOT_DIGIT:
3979 for (i = min; i < max; i++)
3980 {
3981 int len = 1;
3982 if (eptr >= md->end_subject) break;
3983 GETCHARLEN(c, eptr, len);
3984 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3985 eptr+= len;
3986 }
3987 break;
3988
3989 case OP_DIGIT:
3990 for (i = min; i < max; i++)
3991 {
3992 int len = 1;
3993 if (eptr >= md->end_subject) break;
3994 GETCHARLEN(c, eptr, len);
3995 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3996 eptr+= len;
3997 }
3998 break;
3999
4000 case OP_NOT_WHITESPACE:
4001 for (i = min; i < max; i++)
4002 {
4003 int len = 1;
4004 if (eptr >= md->end_subject) break;
4005 GETCHARLEN(c, eptr, len);
4006 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4007 eptr+= len;
4008 }
4009 break;
4010
4011 case OP_WHITESPACE:
4012 for (i = min; i < max; i++)
4013 {
4014 int len = 1;
4015 if (eptr >= md->end_subject) break;
4016 GETCHARLEN(c, eptr, len);
4017 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4018 eptr+= len;
4019 }
4020 break;
4021
4022 case OP_NOT_WORDCHAR:
4023 for (i = min; i < max; i++)
4024 {
4025 int len = 1;
4026 if (eptr >= md->end_subject) break;
4027 GETCHARLEN(c, eptr, len);
4028 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4029 eptr+= len;
4030 }
4031 break;
4032
4033 case OP_WORDCHAR:
4034 for (i = min; i < max; i++)
4035 {
4036 int len = 1;
4037 if (eptr >= md->end_subject) break;
4038 GETCHARLEN(c, eptr, len);
4039 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4040 eptr+= len;
4041 }
4042 break;
4043
4044 default:
4045 RRETURN(PCRE_ERROR_INTERNAL);
4046 }
4047
4048 /* eptr is now past the end of the maximum run */
4049
4050 if (possessive) continue;
4051 for(;;)
4052 {
4053 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4054 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4055 if (eptr-- == pp) break; /* Stop if tried at original pos */
4056 BACKCHAR(eptr);
4057 }
4058 }
4059 else
4060 #endif /* SUPPORT_UTF8 */
4061
4062 /* Not UTF-8 mode */
4063 {
4064 switch(ctype)
4065 {
4066 case OP_ANY:
4067 if ((ims & PCRE_DOTALL) == 0)
4068 {
4069 for (i = min; i < max; i++)
4070 {
4071 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4072 eptr++;
4073 }
4074 break;
4075 }
4076 /* For DOTALL case, fall through and treat as \C */
4077
4078 case OP_ANYBYTE:
4079 c = max - min;
4080 if (c > (unsigned int)(md->end_subject - eptr))
4081 c = md->end_subject - eptr;
4082 eptr += c;
4083 break;
4084
4085 case OP_ANYNL:
4086 for (i = min; i < max; i++)
4087 {
4088 if (eptr >= md->end_subject) break;
4089 c = *eptr;
4090 if (c == 0x000d)
4091 {
4092 if (++eptr >= md->end_subject) break;
4093 if (*eptr == 0x000a) eptr++;
4094 }
4095 else
4096 {
4097 if (c != 0x000a &&
4098 (md->bsr_anycrlf ||
4099 (c != 0x000b && c != 0x000c && c != 0x0085)))
4100 break;
4101 eptr++;
4102 }
4103 }
4104 break;
4105
4106 case OP_NOT_HSPACE:
4107 for (i = min; i < max; i++)
4108 {
4109 if (eptr >= md->end_subject) break;
4110 c = *eptr;
4111 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4112 eptr++;
4113 }
4114 break;
4115
4116 case OP_HSPACE:
4117 for (i = min; i < max; i++)
4118 {
4119 if (eptr >= md->end_subject) break;
4120 c = *eptr;
4121 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4122 eptr++;
4123 }
4124 break;
4125
4126 case OP_NOT_VSPACE:
4127 for (i = min; i < max; i++)
4128 {
4129 if (eptr >= md->end_subject) break;
4130 c = *eptr;
4131 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4132 break;
4133 eptr++;
4134 }
4135 break;
4136
4137 case OP_VSPACE:
4138 for (i = min; i < max; i++)
4139 {
4140 if (eptr >= md->end_subject) break;
4141 c = *eptr;
4142 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4143 break;
4144 eptr++;
4145 }
4146 break;
4147
4148 case OP_NOT_DIGIT:
4149 for (i = min; i < max; i++)
4150 {
4151 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4152 break;
4153 eptr++;
4154 }
4155 break;
4156
4157 case OP_DIGIT:
4158 for (i = min; i < max; i++)
4159 {
4160 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4161 break;
4162 eptr++;
4163 }
4164 break;
4165
4166 case OP_NOT_WHITESPACE:
4167 for (i = min; i < max; i++)
4168 {
4169 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4170 break;
4171 eptr++;
4172 }
4173 break;
4174
4175 case OP_WHITESPACE:
4176 for (i = min; i < max; i++)
4177 {
4178 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4179 break;
4180 eptr++;
4181 }
4182 break;
4183
4184 case OP_NOT_WORDCHAR:
4185 for (i = min; i < max; i++)
4186 {
4187 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4188 break;
4189 eptr++;
4190 }
4191 break;
4192
4193 case OP_WORDCHAR:
4194 for (i = min; i < max; i++)
4195 {
4196 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4197 break;
4198 eptr++;
4199 }
4200 break;
4201
4202 default:
4203 RRETURN(PCRE_ERROR_INTERNAL);
4204 }
4205
4206 /* eptr is now past the end of the maximum run */
4207
4208 if (possessive) continue;
4209 while (eptr >= pp)
4210 {
4211 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4212 eptr--;
4213 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4214 }
4215 }
4216
4217 /* Get here if we can't make it match with any permitted repetitions */
4218
4219 RRETURN(MATCH_NOMATCH);
4220 }
4221 /* Control never gets here */
4222
4223 /* There's been some horrible disaster. Arrival here can only mean there is
4224 something seriously wrong in the code above or the OP_xxx definitions. */
4225
4226 default:
4227 DPRINTF(("Unknown opcode %d\n", *ecode));
4228 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4229 }
4230
4231 /* Do not stick any code in here without much thought; it is assumed
4232 that "continue" in the code above comes out to here to repeat the main
4233 loop. */
4234
4235 } /* End of main loop */
4236 /* Control never reaches here */
4237
4238
4239 /* When compiling to use the heap rather than the stack for recursive calls to
4240 match(), the RRETURN() macro jumps here. The number that is saved in
4241 frame->Xwhere indicates which label we actually want to return to. */
4242
4243 #ifdef NO_RECURSE
4244 #define LBL(val) case val: goto L_RM##val;
4245 HEAP_RETURN:
4246 switch (frame->Xwhere)
4247 {
4248 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4249 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4250 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4251 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4252 LBL(53) LBL(54)
4253 #ifdef SUPPORT_UTF8
4254 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4255 LBL(32) LBL(34) LBL(42) LBL(46)
4256 #ifdef SUPPORT_UCP
4257 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4258 #endif /* SUPPORT_UCP */
4259 #endif /* SUPPORT_UTF8 */
4260 default:
4261 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4262 return PCRE_ERROR_INTERNAL;
4263 }
4264 #undef LBL
4265 #endif /* NO_RECURSE */
4266 }
4267
4268
4269 /***************************************************************************
4270 ****************************************************************************
4271 RECURSION IN THE match() FUNCTION
4272
4273 Undefine all the macros that were defined above to handle this. */
4274
/* Clean-up of the identifier macros used inside match(). When NO_RECURSE is
defined (the build that uses the heap rather than the stack for recursive
calls to match()), the names below exist as macros. Several of them are reused
as ordinary identifiers later in this file - pcre_exec() has a parameter called
"length" and locals called "ims" and "flags" - so the macros must not remain
defined past the end of match(). The macro definitions themselves are earlier
in the file. */
4275 #ifdef NO_RECURSE
4276 #undef eptr
4277 #undef ecode
4278 #undef mstart
4279 #undef offset_top
4280 #undef ims
4281 #undef eptrb
4282 #undef flags
4283 /* The group above looks like match()'s own arguments (compare the values passed on each RMATCH() call) - TODO confirm against the definitions. */
4284 #undef callpat
4285 #undef charptr
4286 #undef data
4287 #undef next
4288 #undef pp
4289 #undef prev
4290 #undef saved_eptr
4291 /* The remaining names are working variables of match(); pp, ctype, min and max, for example, are used by the repeat-handling code. */
4292 #undef new_recursive
4293
4294 #undef cur_is_word
4295 #undef condition
4296 #undef prev_is_word
4297
4298 #undef original_ims
4299
4300 #undef ctype
4301 #undef length
4302 #undef max
4303 #undef min
4304 #undef number
4305 #undef offset
4306 #undef op
4307 #undef save_capture_last
4308 #undef save_offset1
4309 #undef save_offset2
4310 #undef save_offset3
4311 #undef stacksave
4312
4313 #undef newptrb
4314
4315 #endif /* NO_RECURSE */
4316
4317 /* These two are defined as macros in both cases, so they are undefined
here unconditionally, outside the NO_RECURSE test. */
4318
4319 #undef fc
4320 #undef fi
4321
4322 /***************************************************************************
4323 ***************************************************************************/
4324
4325
4326
4327 /*************************************************
4328 * Execute a Regular Expression *
4329 *************************************************/
4330
4331 /* This function applies a compiled re to a subject string and picks out
4332 portions of the string if it matches. Two elements in the vector are set for
4333 each substring: the offsets to the start and end of the substring.
4334
4335 Arguments:
4336 argument_re points to the compiled expression
4337 extra_data points to extra data or is NULL
4338 subject points to the subject string
4339 length length of subject string (may contain binary zeros)
4340 start_offset where to start in the subject string
4341 options option bits
4342 offsets points to a vector of ints to be filled in with offsets
4343 offsetcount the number of elements in the vector
4344
4345 Returns: > 0 => success; value is the number of elements filled in
4346 = 0 => success, but offsets is not big enough
4347 -1 => failed to match
4348 < -1 => some kind of unexpected problem
4349 */
4350
4351 PCRE_EXP_DEFN int
4352 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4353 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4354 int offsetcount)
4355 {
4356 int rc, resetcount, ocount;
4357 int first_byte = -1;
4358 int req_byte = -1;
4359 int req_byte2 = -1;
4360 int newline;
4361 unsigned long int ims;
4362 BOOL using_temporary_offsets = FALSE;
4363 BOOL anchored;
4364 BOOL startline;
4365 BOOL firstline;
4366 BOOL first_byte_caseless = FALSE;
4367 BOOL req_byte_caseless = FALSE;
4368 BOOL utf8;
4369 match_data match_block;
4370 match_data *md = &match_block;
4371 const uschar *tables;
4372 const uschar *start_bits = NULL;
4373 USPTR start_match = (USPTR)subject + start_offset;
4374 USPTR end_subject;
4375 USPTR req_byte_ptr = start_match - 1;
4376
4377 pcre_study_data internal_study;
4378 const pcre_study_data *study;
4379
4380 real_pcre internal_re;
4381 const real_pcre *external_re = (const real_pcre *)argument_re;
4382 const real_pcre *re = external_re;
4383
4384 /* Plausibility checks */
4385
4386 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4387 if (re == NULL || subject == NULL ||
4388 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4389 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4390
4391 /* Fish out the optional data from the extra_data structure, first setting
4392 the default values. */
4393
4394 study = NULL;
4395 md->match_limit = MATCH_LIMIT;
4396 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4397 md->callout_data = NULL;
4398
4399 /* The table pointer is always in native byte order. */
4400
4401 tables = external_re->tables;
4402
4403 if (extra_data != NULL)
4404 {
4405 register unsigned int flags = extra_data->flags;
4406 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4407 study = (const pcre_study_data *)extra_data->study_data;
4408 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4409 md->match_limit = extra_data->match_limit;
4410 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4411 md->match_limit_recursion = extra_data->match_limit_recursion;
4412 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4413 md->callout_data = extra_data->callout_data;
4414 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4415 }
4416
4417 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4418 is a feature that makes it possible to save compiled regex and re-use them
4419 in other programs later. */
4420
4421 if (tables == NULL) tables = _pcre_default_tables;
4422
4423 /* Check that the first field in the block is the magic number. If it is not,
4424 test for a regex that was compiled on a host of opposite endianness. If this is
4425 the case, flipped values are put in internal_re and internal_study if there was
4426 study data too. */
4427
4428 if (re->magic_number != MAGIC_NUMBER)
4429 {
4430 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4431 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4432 if (study != NULL) study = &internal_study;
4433 }
4434
4435 /* Set up other data */
4436
4437 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4438 startline = (re->flags & PCRE_STARTLINE) != 0;
4439 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4440
4441 /* The code starts after the real_pcre block and the capture name table. */
4442
4443 md->start_code = (const uschar *)external_re + re->name_table_offset +
4444 re->name_count * re->name_entry_size;
4445
4446 md->start_subject = (USPTR)subject;
4447 md->start_offset = start_offset;
4448 md->end_subject = md->start_subject + length;
4449 end_subject = md->end_subject;
4450
4451 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4452 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4453
4454 md->notbol = (options & PCRE_NOTBOL) != 0;
4455 md->noteol = (options & PCRE_NOTEOL) != 0;
4456 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4457 md->partial = (options & PCRE_PARTIAL) != 0;
4458 md->hitend = FALSE;
4459
4460 md->recursive = NULL; /* No recursion at top level */
4461
4462 md->lcc = tables + lcc_offset;
4463 md->ctypes = tables + ctypes_offset;
4464
4465 /* Handle different \R options. */
4466
4467 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4468 {
4469 case 0:
4470 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4471 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4472 else
4473 #ifdef BSR_ANYCRLF
4474 md->bsr_anycrlf = TRUE;
4475 #else
4476 md->bsr_anycrlf = FALSE;
4477 #endif
4478 break;
4479
4480 case PCRE_BSR_ANYCRLF:
4481 md->bsr_anycrlf = TRUE;
4482 break;
4483
4484 case PCRE_BSR_UNICODE:
4485 md->bsr_anycrlf = FALSE;
4486 break;
4487
4488 default: return PCRE_ERROR_BADNEWLINE;
4489 }
4490
4491 /* Handle different types of newline. The three bits give eight cases. If
4492 nothing is set at run time, whatever was used at compile time applies. */
4493
4494 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4495 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4496 {
4497 case 0: newline = NEWLINE; break; /* Compile-time default */
4498 case PCRE_NEWLINE_CR: newline = '\r'; break;
4499 case PCRE_NEWLINE_LF: newline = '\n'; break;
4500 case PCRE_NEWLINE_CR+
4501 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4502 case PCRE_NEWLINE_ANY: newline = -1; break;
4503 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4504 default: return PCRE_ERROR_BADNEWLINE;
4505 }
4506
4507 if (newline == -2)
4508 {
4509 md->nltype = NLTYPE_ANYCRLF;
4510 }
4511 else if (newline < 0)
4512 {
4513 md->nltype = NLTYPE_ANY;
4514 }
4515 else
4516 {
4517 md->nltype = NLTYPE_FIXED;
4518 if (newline > 255)
4519 {
4520 md->nllen = 2;
4521 md->nl[0] = (newline >> 8) & 255;
4522 md->nl[1] = newline & 255;
4523 }
4524 else
4525 {
4526 md->nllen = 1;
4527 md->nl[0] = newline;
4528 }
4529 }
4530
4531 /* Partial matching is supported only for a restricted set of regexes at the
4532 moment. */
4533
4534 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4535 return PCRE_ERROR_BADPARTIAL;
4536
4537 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4538 back the character offset. */
4539
4540 #ifdef SUPPORT_UTF8
4541 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4542 {
4543 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4544 return PCRE_ERROR_BADUTF8;
4545 if (start_offset > 0 && start_offset < length)
4546 {
4547 int tb = ((uschar *)subject)[start_offset];
4548 if (tb > 127)
4549 {
4550 tb &= 0xc0;
4551 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4552 }
4553 }
4554 }
4555 #endif
4556
4557 /* The ims options can vary during the matching as a result of the presence
4558 of (?ims) items in the pattern. They are kept in a local variable so that
4559 restoring at the exit of a group is easy. */
4560
4561 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4562
4563 /* If the expression has got more back references than the offsets supplied can
4564 hold, we get a temporary chunk of working store to use during the matching.
4565 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4566 of 3. */
4567
4568 ocount = offsetcount - (offsetcount % 3);
4569
4570 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4571 {
4572 ocount = re->top_backref * 3 + 3;
4573 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4574 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4575 using_temporary_offsets = TRUE;
4576 DPRINTF(("Got memory to hold back references\n"));
4577 }
4578 else md->offset_vector = offsets;
4579
4580 md->offset_end = ocount;
4581 md->offset_max = (2*ocount)/3;
4582 md->offset_overflow = FALSE;
4583 md->capture_last = -1;
4584
4585 /* Compute the minimum number of offsets that we need to reset each time. Doing
4586 this makes a huge difference to execution time when there aren't many brackets
4587 in the pattern. */
4588
4589 resetcount = 2 + re->top_bracket * 2;
4590 if (resetcount > offsetcount) resetcount = ocount;
4591
4592 /* Reset the working variable associated with each extraction. These should
4593 never be used unless previously set, but they get saved and restored, and so we
4594 initialize them to avoid reading uninitialized locations. */
4595
4596 if (md->offset_vector != NULL)
4597 {
4598 register int *iptr = md->offset_vector + ocount;
4599 register int *iend = iptr - resetcount/2 + 1;
4600 while (--iptr >= iend) *iptr = -1;
4601 }
4602
4603 /* Set up the first character to match, if available. The first_byte value is
4604 never set for an anchored regular expression, but the anchoring may be forced
4605 at run time, so we have to test for anchoring. The first char may be unset for
4606 an unanchored pattern, of course. If there's no first char and the pattern was
4607 studied, there may be a bitmap of possible first characters. */
4608
4609 if (!anchored)
4610 {
4611 if ((re->flags & PCRE_FIRSTSET) != 0)
4612 {
4613 first_byte = re->first_byte & 255;
4614 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4615 first_byte = md->lcc[first_byte];
4616 }
4617 else
4618 if (!startline && study != NULL &&
4619 (study->options & PCRE_STUDY_MAPPED) != 0)
4620 start_bits = study->start_bits;
4621 }
4622
4623 /* For anchored or unanchored matches, there may be a "last known required
4624 character" set. */
4625
4626 if ((re->flags & PCRE_REQCHSET) != 0)
4627 {
4628 req_byte = re->req_byte & 255;
4629 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4630 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4631 }
4632
4633
4634 /* ==========================================================================*/
4635
4636 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4637 the loop runs just once. */
4638
4639 for(;;)
4640 {
4641 USPTR save_end_subject = end_subject;
4642 USPTR new_start_match;
4643
4644 /* Reset the maximum number of extractions we might see. */
4645
4646 if (md->offset_vector != NULL)
4647 {
4648 register int *iptr = md->offset_vector;
4649 register int *iend = iptr + resetcount;
4650 while (iptr < iend) *iptr++ = -1;
4651 }
4652
4653 /* Advance to a unique first char if possible. If firstline is TRUE, the
4654 start of the match is constrained to the first line of a multiline string.
4655 That is, the match must be before or at the first newline. Implement this by
4656 temporarily adjusting end_subject so that we stop scanning at a newline. If
4657 the match fails at the newline, later code breaks this loop. */
4658
4659 if (firstline)
4660 {
4661 USPTR t = start_match;
4662 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4663 end_subject = t;
4664 }
4665
4666 /* Now test for a unique first byte */
4667
4668 if (first_byte >= 0)
4669 {
4670 if (first_byte_caseless)
4671 while (start_match < end_subject &&
4672 md->lcc[*start_match] != first_byte)
4673 { NEXTCHAR(start_match); }
4674 else
4675 while (start_match < end_subject && *start_match != first_byte)
4676 { NEXTCHAR(start_match); }
4677 }
4678
4679 /* Or to just after a linebreak for a multiline match if possible */
4680
4681 else if (startline)
4682 {
4683 if (start_match > md->start_subject + start_offset)
4684 {
4685 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4686 { NEXTCHAR(start_match); }
4687
4688 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4689 and we are now at a LF, advance the match position by one more character.
4690 */
4691
4692 if (start_match[-1] == '\r' &&
4693 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4694 start_match < end_subject &&
4695 *start_match == '\n')
4696 start_match++;
4697 }
4698 }
4699
4700 /* Or to a non-unique first char after study */
4701
4702 else if (start_bits != NULL)
4703 {
4704 while (start_match < end_subject)
4705 {
4706 register unsigned int c = *start_match;
4707 if ((start_bits[c/8] & (1 << (c&7))) == 0)
4708 { NEXTCHAR(start_match); }
4709 else break;
4710 }
4711 }
4712
4713 /* Restore fudged end_subject */
4714
4715 end_subject = save_end_subject;
4716
4717 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4718 printf(">>>> Match against: ");
4719 pchars(start_match, end_subject - start_match, TRUE, md);
4720 printf("\n");
4721 #endif
4722
4723 /* If req_byte is set, we know that that character must appear in the subject
4724 for the match to succeed. If the first character is set, req_byte must be
4725 later in the subject; otherwise the test starts at the match point. This
4726 optimization can save a huge amount of backtracking in patterns with nested
4727 unlimited repeats that aren't going to match. Writing separate code for
4728 cased/caseless versions makes it go faster, as does using an autoincrement
4729 and backing off on a match.
4730
4731 HOWEVER: when the subject string is very, very long, searching to its end can
4732 take a long time, and give bad performance on quite ordinary patterns. This
4733 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4734 string... so we don't do this when the string is sufficiently long.
4735
4736 ALSO: this processing is disabled when partial matching is requested.
4737 */
4738
4739 if (req_byte >= 0 &&
4740 end_subject - start_match < REQ_BYTE_MAX &&
4741 !md->partial)
4742 {
4743 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4744
4745 /* We don't need to repeat the search if we haven't yet reached the
4746 place we found it at last time. */
4747
4748 if (p > req_byte_ptr)
4749 {
4750 if (req_byte_caseless)
4751 {
4752 while (p < end_subject)
4753 {
4754 register int pp = *p++;
4755 if (pp == req_byte || pp == req_byte2) { p--; break; }
4756 }
4757 }
4758 else
4759 {
4760 while (p < end_subject)
4761 {
4762 if (*p++ == req_byte) { p--; break; }
4763 }
4764 }
4765
4766 /* If we can't find the required character, break the matching loop,
4767 forcing a match failure. */
4768
4769 if (p >= end_subject)
4770 {
4771 rc = MATCH_NOMATCH;
4772 break;
4773 }
4774
4775 /* If we have found the required character, save the point where we
4776 found it, so that we don't search again next time round the loop if
4777 the start hasn't passed this character yet. */
4778
4779 req_byte_ptr = p;
4780 }
4781 }
4782
4783 /* OK, we can now run the match. */
4784
4785 md->start_match_ptr = start_match;
4786 md->match_call_count = 0;
4787 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4788
4789 switch(rc)
4790 {
4791 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4792 exactly like PRUNE. */
4793
4794 case MATCH_NOMATCH:
4795 case MATCH_PRUNE:
4796 case MATCH_THEN:
4797 new_start_match = start_match + 1;
4798 #ifdef SUPPORT_UTF8
4799 if (utf8)
4800 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4801 new_start_match++;
4802 #endif
4803 break;
4804
4805 /* SKIP passes back the next starting point explicitly. */
4806
4807 case MATCH_SKIP:
4808 new_start_match = md->start_match_ptr;
4809 break;
4810
4811 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4812
4813 case MATCH_COMMIT:
4814 rc = MATCH_NOMATCH;
4815 goto ENDLOOP;
4816
4817 /* Any other return is some kind of error. */
4818
4819 default:
4820 goto ENDLOOP;
4821 }
4822
4823 /* Control reaches here for the various types of "no match at this point"
4824 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4825
4826 rc = MATCH_NOMATCH;
4827
4828 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4829 newline in the subject (though it may continue over the newline). Therefore,
4830 if we have just failed to match, starting at a newline, do not continue. */
4831
4832 if (firstline && IS_NEWLINE(start_match)) break;
4833
4834 /* Advance to new matching position */
4835
4836 start_match = new_start_match;
4837
4838 /* Break the loop if the pattern is anchored or if we have passed the end of
4839 the subject. */
4840
4841 if (anchored || start_match > end_subject) break;
4842
4843 /* If we have just passed a CR and we are now at a LF, and the pattern does
4844 not contain any explicit matches for \r or \n, and the newline option is CRLF
4845 or ANY or ANYCRLF, advance the match position by one more character. */
4846
4847 if (start_match[-1] == '\r' &&
4848 start_match < end_subject &&
4849 *start_match == '\n' &&
4850 (re->flags & PCRE_HASCRORLF) == 0 &&
4851 (md->nltype == NLTYPE_ANY ||
4852 md->nltype == NLTYPE_ANYCRLF ||
4853 md->nllen == 2))
4854 start_match++;
4855
4856 } /* End of for(;;) "bumpalong" loop */
4857
4858 /* ==========================================================================*/
4859
4860 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4861 conditions is true:
4862
4863 (1) The pattern is anchored or the match was failed by (*COMMIT);
4864
4865 (2) We are past the end of the subject;
4866
4867 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4868 this option requests that a match occur at or before the first newline in
4869 the subject.
4870
4871 When we have a match and the offset vector is big enough to deal with any
4872 backreferences, captured substring offsets will already be set up. In the case
4873 where we had to get some local store to hold offsets for backreference
4874 processing, copy those that we can. In this case there need not be overflow if
4875 certain parts of the pattern were not used, even though there are more
4876 capturing parentheses than vector slots. */
4877
4878 ENDLOOP:
4879
4880 if (rc == MATCH_MATCH)
4881 {
4882 if (using_temporary_offsets)
4883 {
4884 if (offsetcount >= 4)
4885 {
4886 memcpy(offsets + 2, md->offset_vector + 2,
4887 (offsetcount - 2) * sizeof(int));
4888 DPRINTF(("Copied offsets from temporary memory\n"));
4889 }
4890 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4891 DPRINTF(("Freeing temporary memory\n"));
4892 (pcre_free)(md->offset_vector);
4893 }
4894
4895 /* Set the return code to the number of captured strings, or 0 if there are
4896 too many to fit into the vector. */
4897
4898 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4899
4900 /* If there is space, set up the whole thing as substring 0. The value of
4901 md->start_match_ptr might be modified if \K was encountered on the successful
4902 matching path. */
4903
4904 if (offsetcount < 2) rc = 0; else
4905 {
4906 offsets[0] = md->start_match_ptr - md->start_subject;
4907 offsets[1] = md->end_match_ptr - md->start_subject;
4908 }
4909
4910 DPRINTF((">>>> returning %d\n", rc));
4911 return rc;
4912 }
4913
4914 /* Control gets here if there has been an error, or if the overall match
4915 attempt has failed at all permitted starting positions. */
4916
4917 if (using_temporary_offsets)
4918 {
4919 DPRINTF(("Freeing temporary memory\n"));
4920 (pcre_free)(md->offset_vector);
4921 }
4922
4923 if (rc != MATCH_NOMATCH)
4924 {
4925 DPRINTF((">>>> error: returning %d\n", rc));
4926 return rc;
4927 }
4928 else if (md->partial && md->hitend)
4929 {
4930 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4931 return PCRE_ERROR_PARTIAL;
4932 }
4933 else
4934 {
4935 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4936 return PCRE_ERROR_NOMATCH;
4937 }
4938 }
4939
4940 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12