/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 351 - (show annotations) (download)
Fri Jul 4 18:27:16 2008 UTC (6 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 149954 byte(s)
Final tidies for new Unicode property code; upgrade to Unicode 5.1.0.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Flag bits for the "flags" argument of the match() function. They are
distinct single bits so that they can be tested with a bitwise AND. */

#define match_condassert     0x01  /* Called to check a condition assertion */
#define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */
64
/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes. */

#define MATCH_COMMIT       (-999)
#define MATCH_PRUNE        (-998)
#define MATCH_SKIP         (-997)
#define MATCH_THEN         (-996)
78
/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX 30
84
/* Min and max values for the common repeats; for the maxima, 0 => infinity.
The two tables are parallel: entry n of each describes the same repeat. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89
90
91
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested. Characters for which isprint() is true are written
unchanged; every other character is written as a backslash-x hex escape.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
  unsigned int c;

  /* When p points into the subject, clamp the count so that nothing beyond
  md->end_subject is read. */

  if (is_subject && length > md->end_subject - p)
    length = md->end_subject - p;

  while (length-- > 0)
    {
    c = *(p++);
    if (isprint(c)) printf("%c", c); else printf("\\x%02x", c);
    }
}
#endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caselesss case for speed */
162
163 if ((ims & PCRE_CASELESS) != 0)
164 {
165 while (length-- > 0)
166 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
167 }
168 else
169 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
170
171 return TRUE;
172 }
173
174
175
176 /***************************************************************************
177 ****************************************************************************
178 RECURSION IN THE match() FUNCTION
179
180 The match() function is highly recursive, though not every recursive call
181 increases the recursive depth. Nevertheless, some regular expressions can cause
182 it to recurse to a great depth. I was writing for Unix, so I just let it call
183 itself recursively. This uses the stack for saving everything that has to be
184 saved for a recursive call. On Unix, the stack can be large, and this works
185 fine.
186
187 It turns out that on some non-Unix-like systems there are problems with
188 programs that use a lot of stack. (This despite the fact that every last chip
189 has oodles of memory these days, and techniques for extending the stack have
190 been known for decades.) So....
191
192 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
193 calls by keeping local variables that need to be preserved in blocks of memory
194 obtained from malloc() instead of on the stack. Macros are used to
195 achieve this so that the actual code doesn't look very different to what it
196 always used to.
197
198 The original heap-recursive code used longjmp(). However, it seems that this
199 can be very slow on some operating systems. Following a suggestion from Stan
200 Switzer, the use of longjmp() has been abolished, at the cost of having to
201 provide a unique number for each call to RMATCH. There is no way of generating
202 a sequence of numbers at compile time in C. I have given them names, to make
203 them stand out more clearly.
204
205 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
206 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
207 tests. Furthermore, not using longjmp() means that local dynamic variables
208 don't have indeterminate values; this has meant that the frame size can be
209 reduced because the result can be "passed back" by straight setting of the
210 variable instead of being passed in the frame.
211 ****************************************************************************
212 ***************************************************************************/
213
/* Numbers for RMATCH calls. Each RMATCH call site passes one of these as its
final argument. When this list is changed, the code at HEAP_RETURN below must
be updated in sync. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54 };
223
224 /* These versions of the macros use the stack, as normal. There are debugging
225 versions and production versions. Note that the "rw" argument of RMATCH isn't
226 actually used in this definition. */
227
228 #ifndef NO_RECURSE
#define REGISTER register

/* RMATCH performs a genuine recursive call of match() and leaves the result
in the caller's "rrc" variable; it picks up "mstart" and "rdepth" from the
enclosing scope. RRETURN is a plain return. The DEBUG versions additionally
trace each call and return; the text printed by RRETURN deliberately has no
newline, because the "to line" message of the RMATCH that receives the result
completes the line. */

#ifdef DEBUG
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra
#endif
248
249 #else
250
251
/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes. */

#define REGISTER

/* RMATCH obtains a new frame, records in the current frame where to resume
(rw), fills the new frame with the argument values, links it to the current
frame, and jumps to HEAP_RECURSE. The label L_<rw> is the resumption point.

If the frame cannot be allocated, the current level is abandoned with
PCRE_ERROR_NOMEMORY by means of RRETURN, which releases the current frame and
hands the (negative) error code to the previous level, instead of
dereferencing a NULL pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
277
/* RRETURN releases the current frame and makes the previous frame current.
If there is a previous frame, the result is placed in "rrc" and control jumps
to HEAP_RETURN; otherwise the freed frame was the top-level one and the result
is returned from the function. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
290
291
292 /* Structure for remembering the local variables in a private frame */
293
294 typedef struct heapframe {
295 struct heapframe *Xprevframe;
296
297 /* Function arguments that may change */
298
299 const uschar *Xeptr;
300 const uschar *Xecode;
301 const uschar *Xmstart;
302 int Xoffset_top;
303 long int Xims;
304 eptrblock *Xeptrb;
305 int Xflags;
306 unsigned int Xrdepth;
307
308 /* Function local variables */
309
310 const uschar *Xcallpat;
311 const uschar *Xcharptr;
312 const uschar *Xdata;
313 const uschar *Xnext;
314 const uschar *Xpp;
315 const uschar *Xprev;
316 const uschar *Xsaved_eptr;
317
318 recursion_info Xnew_recursive;
319
320 BOOL Xcur_is_word;
321 BOOL Xcondition;
322 BOOL Xprev_is_word;
323
324 unsigned long int Xoriginal_ims;
325
326 #ifdef SUPPORT_UCP
327 int Xprop_type;
328 int Xprop_value;
329 int Xprop_fail_result;
330 int Xprop_category;
331 int Xprop_chartype;
332 int Xprop_script;
333 int Xoclength;
334 uschar Xocchars[8];
335 #endif
336
337 int Xctype;
338 unsigned int Xfc;
339 int Xfi;
340 int Xlength;
341 int Xmax;
342 int Xmin;
343 int Xnumber;
344 int Xoffset;
345 int Xop;
346 int Xsave_capture_last;
347 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
348 int Xstacksave[REC_STACK_SAVE_MAX];
349
350 eptrblock Xnewptrb;
351
352 /* Where to jump back to */
353
354 int Xwhere;
355
356 } heapframe;
357
358 #endif
359
360
361 /***************************************************************************
362 ***************************************************************************/
363
364
365
366 /*************************************************
367 * Match from current position *
368 *************************************************/
369
370 /* This function is called recursively in many circumstances. Whenever it
371 returns a negative (error) response, the outer incarnation must also return the
372 same response.
373
374 Performance note: It might be tempting to extract commonly used fields from the
375 md structure (e.g. utf8, end_subject) into individual variables to improve
376 performance. Tests using gcc on a SPARC disproved this; in the first case, it
377 made performance worse.
378
379 Arguments:
380 eptr pointer to current character in subject
381 ecode pointer to current position in compiled code
382 mstart pointer to the current match start position (can be modified
383 by encountering \K)
384 offset_top current top pointer
385 md pointer to "static" info for the match
386 ims current /i, /m, and /s options
387 eptrb pointer to chain of blocks containing eptr at start of
388 brackets - for testing for empty matches
389 flags can contain
390 match_condassert - this is an assertion condition
391 match_cbegroup - this is the start of an unlimited repeat
392 group that can match an empty string
393 rdepth the recursion depth
394
395 Returns: MATCH_MATCH if matched ) these values are >= 0
396 MATCH_NOMATCH if failed to match )
397 a negative PCRE_ERROR_xxx value if aborted by an error condition
398 (e.g. stopped by repeated call or recursion limit)
399 */
400
401 static int
402 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
403 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
404 int flags, unsigned int rdepth)
405 {
406 /* These variables do not need to be preserved over recursion in this function,
407 so they can be ordinary variables in all cases. Mark some of them with
408 "register" because they are used a lot in loops. */
409
410 register int rrc; /* Returns from recursive calls */
411 register int i; /* Used for loops not involving calls to RMATCH() */
412 register unsigned int c; /* Character values not kept over RMATCH() calls */
413 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
414
415 BOOL minimize, possessive; /* Quantifier options */
416
417 /* When recursion is not being used, all "local" variables that have to be
418 preserved over calls to RMATCH() are part of a "frame" which is obtained from
419 heap storage. Set up the top-level frame here; others are obtained from the
420 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
421
422 #ifdef NO_RECURSE
423 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
424 frame->Xprevframe = NULL; /* Marks the top level */
425
426 /* Copy in the original argument variables */
427
428 frame->Xeptr = eptr;
429 frame->Xecode = ecode;
430 frame->Xmstart = mstart;
431 frame->Xoffset_top = offset_top;
432 frame->Xims = ims;
433 frame->Xeptrb = eptrb;
434 frame->Xflags = flags;
435 frame->Xrdepth = rdepth;
436
437 /* This is where control jumps back to to effect "recursion" */
438
439 HEAP_RECURSE:
440
441 /* Macros make the argument variables come from the current frame */
442
443 #define eptr frame->Xeptr
444 #define ecode frame->Xecode
445 #define mstart frame->Xmstart
446 #define offset_top frame->Xoffset_top
447 #define ims frame->Xims
448 #define eptrb frame->Xeptrb
449 #define flags frame->Xflags
450 #define rdepth frame->Xrdepth
451
452 /* Ditto for the local variables */
453
454 #ifdef SUPPORT_UTF8
455 #define charptr frame->Xcharptr
456 #endif
457 #define callpat frame->Xcallpat
458 #define data frame->Xdata
459 #define next frame->Xnext
460 #define pp frame->Xpp
461 #define prev frame->Xprev
462 #define saved_eptr frame->Xsaved_eptr
463
464 #define new_recursive frame->Xnew_recursive
465
466 #define cur_is_word frame->Xcur_is_word
467 #define condition frame->Xcondition
468 #define prev_is_word frame->Xprev_is_word
469
470 #define original_ims frame->Xoriginal_ims
471
472 #ifdef SUPPORT_UCP
473 #define prop_type frame->Xprop_type
474 #define prop_value frame->Xprop_value
475 #define prop_fail_result frame->Xprop_fail_result
476 #define prop_category frame->Xprop_category
477 #define prop_chartype frame->Xprop_chartype
478 #define prop_script frame->Xprop_script
479 #define oclength frame->Xoclength
480 #define occhars frame->Xocchars
481 #endif
482
483 #define ctype frame->Xctype
484 #define fc frame->Xfc
485 #define fi frame->Xfi
486 #define length frame->Xlength
487 #define max frame->Xmax
488 #define min frame->Xmin
489 #define number frame->Xnumber
490 #define offset frame->Xoffset
491 #define op frame->Xop
492 #define save_capture_last frame->Xsave_capture_last
493 #define save_offset1 frame->Xsave_offset1
494 #define save_offset2 frame->Xsave_offset2
495 #define save_offset3 frame->Xsave_offset3
496 #define stacksave frame->Xstacksave
497
498 #define newptrb frame->Xnewptrb
499
500 /* When recursion is being used, local variables are allocated on the stack and
501 get preserved during recursion in the normal way. In this environment, fi and
502 i, and fc and c, can be the same variables. */
503
504 #else /* NO_RECURSE not defined */
505 #define fi i
506 #define fc c
507
508
509 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
510 const uschar *charptr; /* in small blocks of the code. My normal */
511 #endif /* style of coding would have declared */
512 const uschar *callpat; /* them within each of those blocks. */
513 const uschar *data; /* However, in order to accommodate the */
514 const uschar *next; /* version of this code that uses an */
515 USPTR pp; /* external "stack" implemented on the */
516 const uschar *prev; /* heap, it is easier to declare them all */
517 USPTR saved_eptr; /* here, so the declarations can be cut */
518 /* out in a block. The only declarations */
519 recursion_info new_recursive; /* within blocks below are for variables */
520 /* that do not have to be preserved over */
521 BOOL cur_is_word; /* a recursive call to RMATCH(). */
522 BOOL condition;
523 BOOL prev_is_word;
524
525 unsigned long int original_ims;
526
527 #ifdef SUPPORT_UCP
528 int prop_type;
529 int prop_value;
530 int prop_fail_result;
531 int prop_category;
532 int prop_chartype;
533 int prop_script;
534 int oclength;
535 uschar occhars[8];
536 #endif
537
538 int ctype;
539 int length;
540 int max;
541 int min;
542 int number;
543 int offset;
544 int op;
545 int save_capture_last;
546 int save_offset1, save_offset2, save_offset3;
547 int stacksave[REC_STACK_SAVE_MAX];
548
549 eptrblock newptrb;
550 #endif /* NO_RECURSE */
551
552 /* These statements are here to stop the compiler complaining about
553 uninitialized variables. */
554
555 #ifdef SUPPORT_UCP
556 prop_value = 0;
557 prop_fail_result = 0;
558 #endif
559
560
561 /* This label is used for tail recursion, which is used in a few cases even
562 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
563 used. Thanks to Ian Taylor for noticing this possibility and sending the
564 original patch. */
565
566 TAIL_RECURSE:
567
568 /* OK, now we can get on with the real code of the function. Recursive calls
569 are specified by the macro RMATCH and RRETURN is used to return. When
570 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
571 and a "return", respectively (possibly with some debugging if DEBUG is
572 defined). However, RMATCH isn't like a function call because it's quite a
573 complicated macro. It has to be used in one particular way. This shouldn't,
574 however, impact performance when true recursion is being used. */
575
576 #ifdef SUPPORT_UTF8
577 utf8 = md->utf8; /* Local copy of the flag */
578 #else
579 utf8 = FALSE;
580 #endif
581
582 /* First check that we haven't called match() too many times, or that we
583 haven't exceeded the recursive call limit. */
584
585 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
586 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
587
588 original_ims = ims; /* Save for resetting on ')' */
589
590 /* At the start of a group with an unlimited repeat that may match an empty
591 string, the match_cbegroup flag is set. When this is the case, add the current
592 subject pointer to the chain of such remembered pointers, to be checked when we
593 hit the closing ket, in order to break infinite loops that match no characters.
594 When match() is called in other circumstances, don't add to the chain. The
595 match_cbegroup flag must NOT be used with tail recursion, because the memory
596 block that is used is on the stack, so a new one may be required for each
597 match(). */
598
599 if ((flags & match_cbegroup) != 0)
600 {
601 newptrb.epb_saved_eptr = eptr;
602 newptrb.epb_prev = eptrb;
603 eptrb = &newptrb;
604 }
605
606 /* Now start processing the opcodes. */
607
608 for (;;)
609 {
610 minimize = possessive = FALSE;
611 op = *ecode;
612
613 /* For partial matching, remember if we ever hit the end of the subject after
614 matching at least one subject character. */
615
616 if (md->partial &&
617 eptr >= md->end_subject &&
618 eptr > mstart)
619 md->hitend = TRUE;
620
621 switch(op)
622 {
623 case OP_FAIL:
624 RRETURN(MATCH_NOMATCH);
625
626 case OP_PRUNE:
627 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
628 ims, eptrb, flags, RM51);
629 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
630 RRETURN(MATCH_PRUNE);
631
632 case OP_COMMIT:
633 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
634 ims, eptrb, flags, RM52);
635 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
636 RRETURN(MATCH_COMMIT);
637
638 case OP_SKIP:
639 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
640 ims, eptrb, flags, RM53);
641 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
642 md->start_match_ptr = eptr; /* Pass back current position */
643 RRETURN(MATCH_SKIP);
644
645 case OP_THEN:
646 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
647 ims, eptrb, flags, RM54);
648 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
649 RRETURN(MATCH_THEN);
650
651 /* Handle a capturing bracket. If there is space in the offset vector, save
652 the current subject position in the working slot at the top of the vector.
653 We mustn't change the current values of the data slot, because they may be
654 set from a previous iteration of this group, and be referred to by a
655 reference inside the group.
656
657 If the bracket fails to match, we need to restore this value and also the
658 values of the final offsets, in case they were set by a previous iteration
659 of the same bracket.
660
661 If there isn't enough space in the offset vector, treat this as if it were
662 a non-capturing bracket. Don't worry about setting the flag for the error
663 case here; that is handled in the code for KET. */
664
665 case OP_CBRA:
666 case OP_SCBRA:
667 number = GET2(ecode, 1+LINK_SIZE);
668 offset = number << 1;
669
670 #ifdef DEBUG
671 printf("start bracket %d\n", number);
672 printf("subject=");
673 pchars(eptr, 16, TRUE, md);
674 printf("\n");
675 #endif
676
677 if (offset < md->offset_max)
678 {
679 save_offset1 = md->offset_vector[offset];
680 save_offset2 = md->offset_vector[offset+1];
681 save_offset3 = md->offset_vector[md->offset_end - number];
682 save_capture_last = md->capture_last;
683
684 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
685 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
686
687 flags = (op == OP_SCBRA)? match_cbegroup : 0;
688 do
689 {
690 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
691 ims, eptrb, flags, RM1);
692 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
693 md->capture_last = save_capture_last;
694 ecode += GET(ecode, 1);
695 }
696 while (*ecode == OP_ALT);
697
698 DPRINTF(("bracket %d failed\n", number));
699
700 md->offset_vector[offset] = save_offset1;
701 md->offset_vector[offset+1] = save_offset2;
702 md->offset_vector[md->offset_end - number] = save_offset3;
703
704 RRETURN(MATCH_NOMATCH);
705 }
706
707 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
708 as a non-capturing bracket. */
709
710 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
711 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
712
713 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
714
715 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
716 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
717
718 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
719 final alternative within the brackets, we would return the result of a
720 recursive call to match() whatever happened. We can reduce stack usage by
721 turning this into a tail recursion, except in the case when match_cbegroup
722 is set.*/
723
724 case OP_BRA:
725 case OP_SBRA:
726 DPRINTF(("start non-capturing bracket\n"));
727 flags = (op >= OP_SBRA)? match_cbegroup : 0;
728 for (;;)
729 {
730 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
731 {
732 if (flags == 0) /* Not a possibly empty group */
733 {
734 ecode += _pcre_OP_lengths[*ecode];
735 DPRINTF(("bracket 0 tail recursion\n"));
736 goto TAIL_RECURSE;
737 }
738
739 /* Possibly empty group; can't use tail recursion. */
740
741 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
742 eptrb, flags, RM48);
743 RRETURN(rrc);
744 }
745
746 /* For non-final alternatives, continue the loop for a NOMATCH result;
747 otherwise return. */
748
749 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
750 eptrb, flags, RM2);
751 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
752 ecode += GET(ecode, 1);
753 }
754 /* Control never reaches here. */
755
756 /* Conditional group: compilation checked that there are no more than
757 two branches. If the condition is false, skipping the first branch takes us
758 past the end if there is only one branch, but that's OK because that is
759 exactly what going to the ket would do. As there is only one branch to be
760 obeyed, we can use tail recursion to avoid using another stack frame. */
761
762 case OP_COND:
763 case OP_SCOND:
764 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
765 {
766 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
767 condition = md->recursive != NULL &&
768 (offset == RREF_ANY || offset == md->recursive->group_num);
769 ecode += condition? 3 : GET(ecode, 1);
770 }
771
772 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
773 {
774 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
775 condition = offset < offset_top && md->offset_vector[offset] >= 0;
776 ecode += condition? 3 : GET(ecode, 1);
777 }
778
779 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
780 {
781 condition = FALSE;
782 ecode += GET(ecode, 1);
783 }
784
785 /* The condition is an assertion. Call match() to evaluate it - setting
786 the final argument match_condassert causes it to stop at the end of an
787 assertion. */
788
789 else
790 {
791 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
792 match_condassert, RM3);
793 if (rrc == MATCH_MATCH)
794 {
795 condition = TRUE;
796 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
797 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
798 }
799 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
800 {
801 RRETURN(rrc); /* Need braces because of following else */
802 }
803 else
804 {
805 condition = FALSE;
806 ecode += GET(ecode, 1);
807 }
808 }
809
810 /* We are now at the branch that is to be obeyed. As there is only one,
811 we can use tail recursion to avoid using another stack frame, except when
812 match_cbegroup is required for an unlimited repeat of a possibly empty
813 group. If the second alternative doesn't exist, we can just plough on. */
814
815 if (condition || *ecode == OP_ALT)
816 {
817 ecode += 1 + LINK_SIZE;
818 if (op == OP_SCOND) /* Possibly empty group */
819 {
820 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
821 RRETURN(rrc);
822 }
823 else /* Group must match something */
824 {
825 flags = 0;
826 goto TAIL_RECURSE;
827 }
828 }
829 else /* Condition false & no 2nd alternative */
830 {
831 ecode += 1 + LINK_SIZE;
832 }
833 break;
834
835
836 /* End of the pattern, either real or forced. If we are in a top-level
837 recursion, we should restore the offsets appropriately and continue from
838 after the call. */
839
840 case OP_ACCEPT:
841 case OP_END:
842 if (md->recursive != NULL && md->recursive->group_num == 0)
843 {
844 recursion_info *rec = md->recursive;
845 DPRINTF(("End of pattern in a (?0) recursion\n"));
846 md->recursive = rec->prevrec;
847 memmove(md->offset_vector, rec->offset_save,
848 rec->saved_max * sizeof(int));
849 mstart = rec->save_start;
850 ims = original_ims;
851 ecode = rec->after_call;
852 break;
853 }
854
855 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
856 string - backtracking will then try other alternatives, if any. */
857
858 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
859 md->end_match_ptr = eptr; /* Record where we ended */
860 md->end_offset_top = offset_top; /* and how many extracts were taken */
861 md->start_match_ptr = mstart; /* and the start (\K can modify) */
862 RRETURN(MATCH_MATCH);
863
864 /* Change option settings */
865
866 case OP_OPT:
867 ims = ecode[1];
868 ecode += 2;
869 DPRINTF(("ims set to %02lx\n", ims));
870 break;
871
872 /* Assertion brackets. Check the alternative branches in turn - the
873 matching won't pass the KET for an assertion. If any one branch matches,
874 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
875 start of each branch to move the current point backwards, so the code at
876 this level is identical to the lookahead case. */
877
878 case OP_ASSERT:
879 case OP_ASSERTBACK:
880 do
881 {
882 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
883 RM4);
884 if (rrc == MATCH_MATCH) break;
885 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
886 ecode += GET(ecode, 1);
887 }
888 while (*ecode == OP_ALT);
889 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
890
891 /* If checking an assertion for a condition, return MATCH_MATCH. */
892
893 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
894
895 /* Continue from after the assertion, updating the offsets high water
896 mark, since extracts may have been taken during the assertion. */
897
898 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
899 ecode += 1 + LINK_SIZE;
900 offset_top = md->end_offset_top;
901 continue;
902
903 /* Negative assertion: all branches must fail to match */
904
905 case OP_ASSERT_NOT:
906 case OP_ASSERTBACK_NOT:
907 do
908 {
909 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
910 RM5);
911 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
912 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
913 ecode += GET(ecode,1);
914 }
915 while (*ecode == OP_ALT);
916
917 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
918
919 ecode += 1 + LINK_SIZE;
920 continue;
921
922 /* Move the subject pointer back. This occurs only at the start of
923 each branch of a lookbehind assertion. If we are too close to the start to
924 move back, this match function fails. When working with UTF-8 we move
925 back a number of characters, not bytes. */
926
927 case OP_REVERSE:
928 #ifdef SUPPORT_UTF8
929 if (utf8)
930 {
931 i = GET(ecode, 1);
932 while (i-- > 0)
933 {
934 eptr--;
935 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
936 BACKCHAR(eptr);
937 }
938 }
939 else
940 #endif
941
942 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
943
944 {
945 eptr -= GET(ecode, 1);
946 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
947 }
948
949 /* Skip to next op code */
950
951 ecode += 1 + LINK_SIZE;
952 break;
953
954 /* The callout item calls an external function, if one is provided, passing
955 details of the match so far. This is mainly for debugging, though the
956 function is able to force a failure. */
957
958 case OP_CALLOUT:
959 if (pcre_callout != NULL)
960 {
961 pcre_callout_block cb;
962 cb.version = 1; /* Version 1 of the callout block */
963 cb.callout_number = ecode[1];
964 cb.offset_vector = md->offset_vector;
965 cb.subject = (PCRE_SPTR)md->start_subject;
966 cb.subject_length = md->end_subject - md->start_subject;
967 cb.start_match = mstart - md->start_subject;
968 cb.current_position = eptr - md->start_subject;
969 cb.pattern_position = GET(ecode, 2);
970 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
971 cb.capture_top = offset_top/2;
972 cb.capture_last = md->capture_last;
973 cb.callout_data = md->callout_data;
974 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
975 if (rrc < 0) RRETURN(rrc);
976 }
977 ecode += 2 + 2*LINK_SIZE;
978 break;
979
980 /* Recursion either matches the current regex, or some subexpression. The
981 offset data is the offset to the starting bracket from the start of the
982 whole pattern. (This is so that it works from duplicated subpatterns.)
983
984 If there are any capturing brackets started but not finished, we have to
985 save their starting points and reinstate them after the recursion. However,
986 we don't know how many such there are (offset_top records the completed
987 total) so we just have to save all the potential data. There may be up to
988 65535 such values, which is too large to put on the stack, but using malloc
989 for small numbers seems expensive. As a compromise, the stack is used when
990 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
991 is used. If the malloc fails, nothing is saved on the stack as a fallback:
992 the recursion is abandoned and PCRE_ERROR_NOMEMORY is returned from this
993 level of the match.
994
995 There are also other values that have to be saved. We use a chained
996 sequence of blocks that actually live on the stack. Thanks to Robin Houston
997 for the original version of this logic. */
998
999 case OP_RECURSE:
1000 {
1001 callpat = md->start_code + GET(ecode, 1);
1002 new_recursive.group_num = (callpat == md->start_code)? 0 :
1003 GET2(callpat, 1 + LINK_SIZE);
1004
1005 /* Add to "recursing stack" */
1006
1007 new_recursive.prevrec = md->recursive;
1008 md->recursive = &new_recursive;
1009
1010 /* Find where to continue from afterwards */
1011
1012 ecode += 1 + LINK_SIZE;
1013 new_recursive.after_call = ecode;
1014
1015 /* Now save the offset data. */
1016
1017 new_recursive.saved_max = md->offset_end;
1018 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1019 new_recursive.offset_save = stacksave;
1020 else
1021 {
1022 new_recursive.offset_save =
1023 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1024 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1025 }
1026
1027 memcpy(new_recursive.offset_save, md->offset_vector,
1028 new_recursive.saved_max * sizeof(int));
1029 new_recursive.save_start = mstart;
1030 mstart = eptr;
1031
1032 /* OK, now we can do the recursion. For each top-level alternative we
1033 restore the offset and recursion data. */
1034
1035 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1036 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1037 do
1038 {
1039 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1040 md, ims, eptrb, flags, RM6);
1041 if (rrc == MATCH_MATCH)
1042 {
1043 DPRINTF(("Recursion matched\n"));
1044 md->recursive = new_recursive.prevrec;
1045 if (new_recursive.offset_save != stacksave)
1046 (pcre_free)(new_recursive.offset_save);
1047 RRETURN(MATCH_MATCH);
1048 }
1049 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1050 {
1051 DPRINTF(("Recursion gave error %d\n", rrc));
1052 RRETURN(rrc);
1053 }
1054
1055 md->recursive = &new_recursive;
1056 memcpy(md->offset_vector, new_recursive.offset_save,
1057 new_recursive.saved_max * sizeof(int));
1058 callpat += GET(callpat, 1);
1059 }
1060 while (*callpat == OP_ALT);
1061
1062 DPRINTF(("Recursion didn't match\n"));
1063 md->recursive = new_recursive.prevrec;
1064 if (new_recursive.offset_save != stacksave)
1065 (pcre_free)(new_recursive.offset_save);
1066 RRETURN(MATCH_NOMATCH);
1067 }
1068 /* Control never reaches here */
1069
1070 /* "Once" brackets are like assertion brackets except that after a match,
1071 the point in the subject string is not moved back. Thus there can never be
1072 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1073 Check the alternative branches in turn - the matching won't pass the KET
1074 for this kind of subpattern. If any one branch matches, we carry on as at
1075 the end of a normal bracket, leaving the subject pointer. */
1076
1077 case OP_ONCE:
1078 prev = ecode;
1079 saved_eptr = eptr;
1080
1081 do
1082 {
1083 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1084 if (rrc == MATCH_MATCH) break;
1085 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1086 ecode += GET(ecode,1);
1087 }
1088 while (*ecode == OP_ALT);
1089
1090 /* If hit the end of the group (which could be repeated), fail */
1091
1092 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1093
1094 /* Continue as from after the atomic group, updating the offsets high water
1095 mark, since extracts may have been taken. */
1096
1097 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1098
1099 offset_top = md->end_offset_top;
1100 eptr = md->end_match_ptr;
1101
1102 /* For a non-repeating ket, just continue at this level. This also
1103 happens for a repeating ket if no characters were matched in the group.
1104 This is the forcible breaking of infinite loops as implemented in Perl
1105 5.005. If there is an options reset, it will get obeyed in the normal
1106 course of events. */
1107
1108 if (*ecode == OP_KET || eptr == saved_eptr)
1109 {
1110 ecode += 1+LINK_SIZE;
1111 break;
1112 }
1113
1114 /* The repeating kets try the rest of the pattern or restart from the
1115 preceding bracket, in the appropriate order. The second "call" of match()
1116 uses tail recursion, to avoid using another stack frame. We need to reset
1117 any options that changed within the bracket before re-running it, so
1118 check the next opcode. */
1119
1120 if (ecode[1+LINK_SIZE] == OP_OPT)
1121 {
1122 ims = (ims & ~PCRE_IMS) | ecode[4];
1123 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1124 }
1125
1126 if (*ecode == OP_KETRMIN)
1127 {
1128 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1129 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1130 ecode = prev;
1131 flags = 0;
1132 goto TAIL_RECURSE;
1133 }
1134 else /* OP_KETRMAX */
1135 {
1136 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1137 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1138 ecode += 1 + LINK_SIZE;
1139 flags = 0;
1140 goto TAIL_RECURSE;
1141 }
1142 /* Control never gets here */
1143
1144 /* An alternation is the end of a branch; scan along to find the end of the
1145 bracketed group and go to there. */
1146
1147 case OP_ALT:
1148 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1149 break;
1150
1151 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1152 indicating that it may occur zero times. It may repeat infinitely, or not
1153 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1154 with fixed upper repeat limits are compiled as a number of copies, with the
1155 optional ones preceded by BRAZERO or BRAMINZERO. */
1156
1157 case OP_BRAZERO:
1158 {
1159 next = ecode+1;
1160 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1161 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1162 do next += GET(next,1); while (*next == OP_ALT);
1163 ecode = next + 1 + LINK_SIZE;
1164 }
1165 break;
1166
1167 case OP_BRAMINZERO:
1168 {
1169 next = ecode+1;
1170 do next += GET(next, 1); while (*next == OP_ALT);
1171 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1172 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1173 ecode++;
1174 }
1175 break;
1176
1177 case OP_SKIPZERO:
1178 {
1179 next = ecode+1;
1180 do next += GET(next,1); while (*next == OP_ALT);
1181 ecode = next + 1 + LINK_SIZE;
1182 }
1183 break;
1184
1185 /* End of a group, repeated or non-repeating. */
1186
1187 case OP_KET:
1188 case OP_KETRMIN:
1189 case OP_KETRMAX:
1190 prev = ecode - GET(ecode, 1);
1191
1192 /* If this was a group that remembered the subject start, in order to break
1193 infinite repeats of empty string matches, retrieve the subject start from
1194 the chain. Otherwise, set it NULL. */
1195
1196 if (*prev >= OP_SBRA)
1197 {
1198 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1199 eptrb = eptrb->epb_prev; /* Backup to previous group */
1200 }
1201 else saved_eptr = NULL;
1202
1203 /* If we are at the end of an assertion group, stop matching and return
1204 MATCH_MATCH, but record the current high water mark for use by positive
1205 assertions. Do this also for the "once" (atomic) groups. */
1206
1207 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1208 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1209 *prev == OP_ONCE)
1210 {
1211 md->end_match_ptr = eptr; /* For ONCE */
1212 md->end_offset_top = offset_top;
1213 RRETURN(MATCH_MATCH);
1214 }
1215
1216 /* For capturing groups we have to check the group number back at the start
1217 and if necessary complete handling an extraction by setting the offsets and
1218 bumping the high water mark. Note that whole-pattern recursion is coded as
1219 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1220 when the OP_END is reached. Other recursion is handled here. */
1221
1222 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1223 {
1224 number = GET2(prev, 1+LINK_SIZE);
1225 offset = number << 1;
1226
1227 #ifdef DEBUG
1228 printf("end bracket %d", number);
1229 printf("\n");
1230 #endif
1231
1232 md->capture_last = number;
1233 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1234 {
1235 md->offset_vector[offset] =
1236 md->offset_vector[md->offset_end - number];
1237 md->offset_vector[offset+1] = eptr - md->start_subject;
1238 if (offset_top <= offset) offset_top = offset + 2;
1239 }
1240
1241 /* Handle a recursively called group. Restore the offsets
1242 appropriately and continue from after the call. */
1243
1244 if (md->recursive != NULL && md->recursive->group_num == number)
1245 {
1246 recursion_info *rec = md->recursive;
1247 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1248 md->recursive = rec->prevrec;
1249 mstart = rec->save_start;
1250 memcpy(md->offset_vector, rec->offset_save,
1251 rec->saved_max * sizeof(int));
1252 ecode = rec->after_call;
1253 ims = original_ims;
1254 break;
1255 }
1256 }
1257
1258 /* For both capturing and non-capturing groups, reset the value of the ims
1259 flags, in case they got changed during the group. */
1260
1261 ims = original_ims;
1262 DPRINTF(("ims reset to %02lx\n", ims));
1263
1264 /* For a non-repeating ket, just continue at this level. This also
1265 happens for a repeating ket if no characters were matched in the group.
1266 This is the forcible breaking of infinite loops as implemented in Perl
1267 5.005. If there is an options reset, it will get obeyed in the normal
1268 course of events. */
1269
1270 if (*ecode == OP_KET || eptr == saved_eptr)
1271 {
1272 ecode += 1 + LINK_SIZE;
1273 break;
1274 }
1275
1276 /* The repeating kets try the rest of the pattern or restart from the
1277 preceding bracket, in the appropriate order. In the second case, we can use
1278 tail recursion to avoid using another stack frame, unless we have an
1279 unlimited repeat of a group that can match an empty string. */
1280
1281 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1282
1283 if (*ecode == OP_KETRMIN)
1284 {
1285 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1286 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1287 if (flags != 0) /* Could match an empty string */
1288 {
1289 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1290 RRETURN(rrc);
1291 }
1292 ecode = prev;
1293 goto TAIL_RECURSE;
1294 }
1295 else /* OP_KETRMAX */
1296 {
1297 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1298 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1299 ecode += 1 + LINK_SIZE;
1300 flags = 0;
1301 goto TAIL_RECURSE;
1302 }
1303 /* Control never gets here */
1304
1305 /* Start of subject unless notbol, or after internal newline if multiline */
1306
1307 case OP_CIRC:
1308 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1309 if ((ims & PCRE_MULTILINE) != 0)
1310 {
1311 if (eptr != md->start_subject &&
1312 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1313 RRETURN(MATCH_NOMATCH);
1314 ecode++;
1315 break;
1316 }
1317 /* ... else fall through */
1318
1319 /* Start of subject assertion */
1320
1321 case OP_SOD:
1322 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1323 ecode++;
1324 break;
1325
1326 /* Start of match assertion */
1327
1328 case OP_SOM:
1329 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1330 ecode++;
1331 break;
1332
1333 /* Reset the start of match point */
1334
1335 case OP_SET_SOM:
1336 mstart = eptr;
1337 ecode++;
1338 break;
1339
1340 /* Assert before internal newline if multiline, or before a terminating
1341 newline unless endonly is set, else end of subject unless noteol is set. */
1342
1343 case OP_DOLL:
1344 if ((ims & PCRE_MULTILINE) != 0)
1345 {
1346 if (eptr < md->end_subject)
1347 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1348 else
1349 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1350 ecode++;
1351 break;
1352 }
1353 else
1354 {
1355 if (md->noteol) RRETURN(MATCH_NOMATCH);
1356 if (!md->endonly)
1357 {
1358 if (eptr != md->end_subject &&
1359 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1360 RRETURN(MATCH_NOMATCH);
1361 ecode++;
1362 break;
1363 }
1364 }
1365 /* ... else fall through for endonly */
1366
1367 /* End of subject assertion (\z) */
1368
1369 case OP_EOD:
1370 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1371 ecode++;
1372 break;
1373
1374 /* End of subject or ending \n assertion (\Z) */
1375
1376 case OP_EODN:
1377 if (eptr != md->end_subject &&
1378 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1379 RRETURN(MATCH_NOMATCH);
1380 ecode++;
1381 break;
1382
1383 /* Word boundary assertions */
1384
1385 case OP_NOT_WORD_BOUNDARY:
1386 case OP_WORD_BOUNDARY:
1387 {
1388
1389 /* Find out if the previous and current characters are "word" characters.
1390 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1391 be "non-word" characters. */
1392
1393 #ifdef SUPPORT_UTF8
1394 if (utf8)
1395 {
1396 if (eptr == md->start_subject) prev_is_word = FALSE; else
1397 {
1398 const uschar *lastptr = eptr - 1;
1399 while((*lastptr & 0xc0) == 0x80) lastptr--;
1400 GETCHAR(c, lastptr);
1401 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1402 }
1403 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1404 {
1405 GETCHAR(c, eptr);
1406 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1407 }
1408 }
1409 else
1410 #endif
1411
1412 /* More streamlined when not in UTF-8 mode */
1413
1414 {
1415 prev_is_word = (eptr != md->start_subject) &&
1416 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1417 cur_is_word = (eptr < md->end_subject) &&
1418 ((md->ctypes[*eptr] & ctype_word) != 0);
1419 }
1420
1421 /* Now see if the situation is what we want */
1422
1423 if ((*ecode++ == OP_WORD_BOUNDARY)?
1424 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1425 RRETURN(MATCH_NOMATCH);
1426 }
1427 break;
1428
1429 /* Match a single character type; inline for speed */
1430
1431 case OP_ANY:
1432 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1433 /* Fall through */
1434
1435 case OP_ALLANY:
1436 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1437 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1438 ecode++;
1439 break;
1440
1441 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1442 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1443
1444 case OP_ANYBYTE:
1445 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1446 ecode++;
1447 break;
1448
1449 case OP_NOT_DIGIT:
1450 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1451 GETCHARINCTEST(c, eptr);
1452 if (
1453 #ifdef SUPPORT_UTF8
1454 c < 256 &&
1455 #endif
1456 (md->ctypes[c] & ctype_digit) != 0
1457 )
1458 RRETURN(MATCH_NOMATCH);
1459 ecode++;
1460 break;
1461
1462 case OP_DIGIT:
1463 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1464 GETCHARINCTEST(c, eptr);
1465 if (
1466 #ifdef SUPPORT_UTF8
1467 c >= 256 ||
1468 #endif
1469 (md->ctypes[c] & ctype_digit) == 0
1470 )
1471 RRETURN(MATCH_NOMATCH);
1472 ecode++;
1473 break;
1474
1475 case OP_NOT_WHITESPACE:
1476 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1477 GETCHARINCTEST(c, eptr);
1478 if (
1479 #ifdef SUPPORT_UTF8
1480 c < 256 &&
1481 #endif
1482 (md->ctypes[c] & ctype_space) != 0
1483 )
1484 RRETURN(MATCH_NOMATCH);
1485 ecode++;
1486 break;
1487
1488 case OP_WHITESPACE:
1489 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1490 GETCHARINCTEST(c, eptr);
1491 if (
1492 #ifdef SUPPORT_UTF8
1493 c >= 256 ||
1494 #endif
1495 (md->ctypes[c] & ctype_space) == 0
1496 )
1497 RRETURN(MATCH_NOMATCH);
1498 ecode++;
1499 break;
1500
1501 case OP_NOT_WORDCHAR:
1502 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1503 GETCHARINCTEST(c, eptr);
1504 if (
1505 #ifdef SUPPORT_UTF8
1506 c < 256 &&
1507 #endif
1508 (md->ctypes[c] & ctype_word) != 0
1509 )
1510 RRETURN(MATCH_NOMATCH);
1511 ecode++;
1512 break;
1513
1514 case OP_WORDCHAR:
1515 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1516 GETCHARINCTEST(c, eptr);
1517 if (
1518 #ifdef SUPPORT_UTF8
1519 c >= 256 ||
1520 #endif
1521 (md->ctypes[c] & ctype_word) == 0
1522 )
1523 RRETURN(MATCH_NOMATCH);
1524 ecode++;
1525 break;
1526
1527 case OP_ANYNL:
1528 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1529 GETCHARINCTEST(c, eptr);
1530 switch(c)
1531 {
1532 default: RRETURN(MATCH_NOMATCH);
1533 case 0x000d:
1534 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1535 break;
1536
1537 case 0x000a:
1538 break;
1539
1540 case 0x000b:
1541 case 0x000c:
1542 case 0x0085:
1543 case 0x2028:
1544 case 0x2029:
1545 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1546 break;
1547 }
1548 ecode++;
1549 break;
1550
1551 case OP_NOT_HSPACE:
1552 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1553 GETCHARINCTEST(c, eptr);
1554 switch(c)
1555 {
1556 default: break;
1557 case 0x09: /* HT */
1558 case 0x20: /* SPACE */
1559 case 0xa0: /* NBSP */
1560 case 0x1680: /* OGHAM SPACE MARK */
1561 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1562 case 0x2000: /* EN QUAD */
1563 case 0x2001: /* EM QUAD */
1564 case 0x2002: /* EN SPACE */
1565 case 0x2003: /* EM SPACE */
1566 case 0x2004: /* THREE-PER-EM SPACE */
1567 case 0x2005: /* FOUR-PER-EM SPACE */
1568 case 0x2006: /* SIX-PER-EM SPACE */
1569 case 0x2007: /* FIGURE SPACE */
1570 case 0x2008: /* PUNCTUATION SPACE */
1571 case 0x2009: /* THIN SPACE */
1572 case 0x200A: /* HAIR SPACE */
1573 case 0x202f: /* NARROW NO-BREAK SPACE */
1574 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1575 case 0x3000: /* IDEOGRAPHIC SPACE */
1576 RRETURN(MATCH_NOMATCH);
1577 }
1578 ecode++;
1579 break;
1580
1581 case OP_HSPACE:
1582 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1583 GETCHARINCTEST(c, eptr);
1584 switch(c)
1585 {
1586 default: RRETURN(MATCH_NOMATCH);
1587 case 0x09: /* HT */
1588 case 0x20: /* SPACE */
1589 case 0xa0: /* NBSP */
1590 case 0x1680: /* OGHAM SPACE MARK */
1591 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1592 case 0x2000: /* EN QUAD */
1593 case 0x2001: /* EM QUAD */
1594 case 0x2002: /* EN SPACE */
1595 case 0x2003: /* EM SPACE */
1596 case 0x2004: /* THREE-PER-EM SPACE */
1597 case 0x2005: /* FOUR-PER-EM SPACE */
1598 case 0x2006: /* SIX-PER-EM SPACE */
1599 case 0x2007: /* FIGURE SPACE */
1600 case 0x2008: /* PUNCTUATION SPACE */
1601 case 0x2009: /* THIN SPACE */
1602 case 0x200A: /* HAIR SPACE */
1603 case 0x202f: /* NARROW NO-BREAK SPACE */
1604 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1605 case 0x3000: /* IDEOGRAPHIC SPACE */
1606 break;
1607 }
1608 ecode++;
1609 break;
1610
1611 case OP_NOT_VSPACE:
1612 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1613 GETCHARINCTEST(c, eptr);
1614 switch(c)
1615 {
1616 default: break;
1617 case 0x0a: /* LF */
1618 case 0x0b: /* VT */
1619 case 0x0c: /* FF */
1620 case 0x0d: /* CR */
1621 case 0x85: /* NEL */
1622 case 0x2028: /* LINE SEPARATOR */
1623 case 0x2029: /* PARAGRAPH SEPARATOR */
1624 RRETURN(MATCH_NOMATCH);
1625 }
1626 ecode++;
1627 break;
1628
1629 case OP_VSPACE:
1630 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1631 GETCHARINCTEST(c, eptr);
1632 switch(c)
1633 {
1634 default: RRETURN(MATCH_NOMATCH);
1635 case 0x0a: /* LF */
1636 case 0x0b: /* VT */
1637 case 0x0c: /* FF */
1638 case 0x0d: /* CR */
1639 case 0x85: /* NEL */
1640 case 0x2028: /* LINE SEPARATOR */
1641 case 0x2029: /* PARAGRAPH SEPARATOR */
1642 break;
1643 }
1644 ecode++;
1645 break;
1646
1647 #ifdef SUPPORT_UCP
1648 /* Check the next character by Unicode property. We will get here only
1649 if the support is in the binary; otherwise a compile-time error occurs. */
1650
1651 case OP_PROP:
1652 case OP_NOTPROP:
1653 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1654 GETCHARINCTEST(c, eptr);
1655 {
1656 const ucd_record * prop = GET_UCD(c);
1657
1658 switch(ecode[1])
1659 {
1660 case PT_ANY:
1661 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1662 break;
1663
1664 case PT_LAMP:
1665 if ((prop->chartype == ucp_Lu ||
1666 prop->chartype == ucp_Ll ||
1667 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1668 RRETURN(MATCH_NOMATCH);
1669 break;
1670
1671 case PT_GC:
1672 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1673 RRETURN(MATCH_NOMATCH);
1674 break;
1675
1676 case PT_PC:
1677 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1678 RRETURN(MATCH_NOMATCH);
1679 break;
1680
1681 case PT_SC:
1682 if ((ecode[2] != prop->script) == (op == OP_PROP))
1683 RRETURN(MATCH_NOMATCH);
1684 break;
1685
1686 default:
1687 RRETURN(PCRE_ERROR_INTERNAL);
1688 }
1689
1690 ecode += 3;
1691 }
1692 break;
1693
1694 /* Match an extended Unicode sequence. We will get here only if the support
1695 is in the binary; otherwise a compile-time error occurs. */
1696
1697 case OP_EXTUNI:
1698 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1699 GETCHARINCTEST(c, eptr);
1700 {
1701 int category = UCD_CATEGORY(c);
1702 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1703 while (eptr < md->end_subject)
1704 {
1705 int len = 1;
1706 if (!utf8) c = *eptr; else
1707 {
1708 GETCHARLEN(c, eptr, len);
1709 }
1710 category = UCD_CATEGORY(c);
1711 if (category != ucp_M) break;
1712 eptr += len;
1713 }
1714 }
1715 ecode++;
1716 break;
1717 #endif
1718
1719
1720 /* Match a back reference, possibly repeatedly. Look past the end of the
1721 item to see if there is repeat information following. The code is similar
1722 to that for character classes, but repeated for efficiency. Then obey
1723 similar code to character type repeats - written out again for speed.
1724 However, if the referenced string is the empty string, always treat
1725 it as matched, any number of times (otherwise there could be infinite
1726 loops). */
1727
1728 case OP_REF:
1729 {
1730 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1731 ecode += 3;
1732
1733 /* If the reference is unset, there are two possibilities:
1734
1735 (a) In the default, Perl-compatible state, set the length to be longer
1736 than the amount of subject left; this ensures that every attempt at a
1737 match fails. We can't just fail here, because of the possibility of
1738 quantifiers with zero minima.
1739
1740 (b) If the JavaScript compatibility flag is set, set the length to zero
1741 so that the back reference matches an empty string.
1742
1743 Otherwise, set the length to the length of what was matched by the
1744 referenced subpattern. */
1745
1746 if (offset >= offset_top || md->offset_vector[offset] < 0)
1747 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1748 else
1749 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1750
1751 /* Set up for repetition, or handle the non-repeated case */
1752
1753 switch (*ecode)
1754 {
1755 case OP_CRSTAR:
1756 case OP_CRMINSTAR:
1757 case OP_CRPLUS:
1758 case OP_CRMINPLUS:
1759 case OP_CRQUERY:
1760 case OP_CRMINQUERY:
1761 c = *ecode++ - OP_CRSTAR;
1762 minimize = (c & 1) != 0;
1763 min = rep_min[c]; /* Pick up values from tables; */
1764 max = rep_max[c]; /* zero for max => infinity */
1765 if (max == 0) max = INT_MAX;
1766 break;
1767
1768 case OP_CRRANGE:
1769 case OP_CRMINRANGE:
1770 minimize = (*ecode == OP_CRMINRANGE);
1771 min = GET2(ecode, 1);
1772 max = GET2(ecode, 3);
1773 if (max == 0) max = INT_MAX;
1774 ecode += 5;
1775 break;
1776
1777 default: /* No repeat follows */
1778 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1779 eptr += length;
1780 continue; /* With the main loop */
1781 }
1782
1783 /* If the length of the reference is zero, just continue with the
1784 main loop. */
1785
1786 if (length == 0) continue;
1787
1788 /* First, ensure the minimum number of matches are present. We get back
1789 the length of the reference string explicitly rather than passing the
1790 address of eptr, so that eptr can be a register variable. */
1791
1792 for (i = 1; i <= min; i++)
1793 {
1794 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1795 eptr += length;
1796 }
1797
1798 /* If min = max, continue at the same level without recursion.
1799 They are not both allowed to be zero. */
1800
1801 if (min == max) continue;
1802
1803 /* If minimizing, keep trying and advancing the pointer */
1804
1805 if (minimize)
1806 {
1807 for (fi = min;; fi++)
1808 {
1809 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1810 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1811 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1812 RRETURN(MATCH_NOMATCH);
1813 eptr += length;
1814 }
1815 /* Control never gets here */
1816 }
1817
1818 /* If maximizing, find the longest string and work backwards */
1819
1820 else
1821 {
1822 pp = eptr;
1823 for (i = min; i < max; i++)
1824 {
1825 if (!match_ref(offset, eptr, length, md, ims)) break;
1826 eptr += length;
1827 }
1828 while (eptr >= pp)
1829 {
1830 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1831 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1832 eptr -= length;
1833 }
1834 RRETURN(MATCH_NOMATCH);
1835 }
1836 }
1837 /* Control never gets here */
1838
1839
1840
1841 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1842 used when all the characters in the class have values in the range 0-255,
1843 and either the matching is caseful, or the characters are in the range
1844 0-127 when UTF-8 processing is enabled. The only difference between
1845 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1846 encountered.
1847
1848 First, look past the end of the item to see if there is repeat information
1849 following. Then obey similar code to character type repeats - written out
1850 again for speed. */
1851
1852 case OP_NCLASS:
1853 case OP_CLASS:
1854 {
1855 data = ecode + 1; /* Save for matching */
1856 ecode += 33; /* Advance past the item */
1857
1858 switch (*ecode)
1859 {
1860 case OP_CRSTAR:
1861 case OP_CRMINSTAR:
1862 case OP_CRPLUS:
1863 case OP_CRMINPLUS:
1864 case OP_CRQUERY:
1865 case OP_CRMINQUERY:
1866 c = *ecode++ - OP_CRSTAR;
1867 minimize = (c & 1) != 0;
1868 min = rep_min[c]; /* Pick up values from tables; */
1869 max = rep_max[c]; /* zero for max => infinity */
1870 if (max == 0) max = INT_MAX;
1871 break;
1872
1873 case OP_CRRANGE:
1874 case OP_CRMINRANGE:
1875 minimize = (*ecode == OP_CRMINRANGE);
1876 min = GET2(ecode, 1);
1877 max = GET2(ecode, 3);
1878 if (max == 0) max = INT_MAX;
1879 ecode += 5;
1880 break;
1881
1882 default: /* No repeat follows */
1883 min = max = 1;
1884 break;
1885 }
1886
1887 /* First, ensure the minimum number of matches are present. */
1888
1889 #ifdef SUPPORT_UTF8
1890 /* UTF-8 mode */
1891 if (utf8)
1892 {
1893 for (i = 1; i <= min; i++)
1894 {
1895 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1896 GETCHARINC(c, eptr);
1897 if (c > 255)
1898 {
1899 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1900 }
1901 else
1902 {
1903 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1904 }
1905 }
1906 }
1907 else
1908 #endif
1909 /* Not UTF-8 mode */
1910 {
1911 for (i = 1; i <= min; i++)
1912 {
1913 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1914 c = *eptr++;
1915 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1916 }
1917 }
1918
1919 /* If max == min we can continue with the main loop without the
1920 need to recurse. */
1921
1922 if (min == max) continue;
1923
1924 /* If minimizing, keep testing the rest of the expression and advancing
1925 the pointer while it matches the class. */
1926
1927 if (minimize)
1928 {
1929 #ifdef SUPPORT_UTF8
1930 /* UTF-8 mode */
1931 if (utf8)
1932 {
1933 for (fi = min;; fi++)
1934 {
1935 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1936 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1937 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1938 GETCHARINC(c, eptr);
1939 if (c > 255)
1940 {
1941 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1942 }
1943 else
1944 {
1945 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1946 }
1947 }
1948 }
1949 else
1950 #endif
1951 /* Not UTF-8 mode */
1952 {
1953 for (fi = min;; fi++)
1954 {
1955 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1956 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1957 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1958 c = *eptr++;
1959 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1960 }
1961 }
1962 /* Control never gets here */
1963 }
1964
1965 /* If maximizing, find the longest possible run, then work backwards. */
1966
1967 else
1968 {
1969 pp = eptr;
1970
1971 #ifdef SUPPORT_UTF8
1972 /* UTF-8 mode */
1973 if (utf8)
1974 {
1975 for (i = min; i < max; i++)
1976 {
1977 int len = 1;
1978 if (eptr >= md->end_subject) break;
1979 GETCHARLEN(c, eptr, len);
1980 if (c > 255)
1981 {
1982 if (op == OP_CLASS) break;
1983 }
1984 else
1985 {
1986 if ((data[c/8] & (1 << (c&7))) == 0) break;
1987 }
1988 eptr += len;
1989 }
1990 for (;;)
1991 {
1992 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1993 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1994 if (eptr-- == pp) break; /* Stop if tried at original pos */
1995 BACKCHAR(eptr);
1996 }
1997 }
1998 else
1999 #endif
2000 /* Not UTF-8 mode */
2001 {
2002 for (i = min; i < max; i++)
2003 {
2004 if (eptr >= md->end_subject) break;
2005 c = *eptr;
2006 if ((data[c/8] & (1 << (c&7))) == 0) break;
2007 eptr++;
2008 }
2009 while (eptr >= pp)
2010 {
2011 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2012 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2013 eptr--;
2014 }
2015 }
2016
2017 RRETURN(MATCH_NOMATCH);
2018 }
2019 }
2020 /* Control never gets here */
2021
2022
2023 /* Match an extended character class. This opcode is encountered only
2024 in UTF-8 mode, because that's the only time it is compiled. */
2025
2026 #ifdef SUPPORT_UTF8
2027 case OP_XCLASS:
2028 {
2029 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2030 ecode += GET(ecode, 1); /* Advance past the item */
2031
2032 switch (*ecode)
2033 {
2034 case OP_CRSTAR:
2035 case OP_CRMINSTAR:
2036 case OP_CRPLUS:
2037 case OP_CRMINPLUS:
2038 case OP_CRQUERY:
2039 case OP_CRMINQUERY:
2040 c = *ecode++ - OP_CRSTAR;
2041 minimize = (c & 1) != 0;
2042 min = rep_min[c]; /* Pick up values from tables; */
2043 max = rep_max[c]; /* zero for max => infinity */
2044 if (max == 0) max = INT_MAX;
2045 break;
2046
2047 case OP_CRRANGE:
2048 case OP_CRMINRANGE:
2049 minimize = (*ecode == OP_CRMINRANGE);
2050 min = GET2(ecode, 1);
2051 max = GET2(ecode, 3);
2052 if (max == 0) max = INT_MAX;
2053 ecode += 5;
2054 break;
2055
2056 default: /* No repeat follows */
2057 min = max = 1;
2058 break;
2059 }
2060
2061 /* First, ensure the minimum number of matches are present. */
2062
2063 for (i = 1; i <= min; i++)
2064 {
2065 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2066 GETCHARINC(c, eptr);
2067 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2068 }
2069
2070 /* If max == min we can continue with the main loop without the
2071 need to recurse. */
2072
2073 if (min == max) continue;
2074
2075 /* If minimizing, keep testing the rest of the expression and advancing
2076 the pointer while it matches the class. */
2077
2078 if (minimize)
2079 {
2080 for (fi = min;; fi++)
2081 {
2082 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2083 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2084 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2085 GETCHARINC(c, eptr);
2086 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2087 }
2088 /* Control never gets here */
2089 }
2090
2091 /* If maximizing, find the longest possible run, then work backwards. */
2092
2093 else
2094 {
2095 pp = eptr;
2096 for (i = min; i < max; i++)
2097 {
2098 int len = 1;
2099 if (eptr >= md->end_subject) break;
2100 GETCHARLEN(c, eptr, len);
2101 if (!_pcre_xclass(c, data)) break;
2102 eptr += len;
2103 }
2104 for(;;)
2105 {
2106 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2107 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2108 if (eptr-- == pp) break; /* Stop if tried at original pos */
2109 if (utf8) BACKCHAR(eptr);
2110 }
2111 RRETURN(MATCH_NOMATCH);
2112 }
2113
2114 /* Control never gets here */
2115 }
2116 #endif /* End of XCLASS */
2117
2118 /* Match a single character, casefully */
2119
2120 case OP_CHAR:
2121 #ifdef SUPPORT_UTF8
2122 if (utf8)
2123 {
2124 length = 1;
2125 ecode++;
2126 GETCHARLEN(fc, ecode, length);
2127 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2128 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2129 }
2130 else
2131 #endif
2132
2133 /* Non-UTF-8 mode */
2134 {
2135 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2136 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2137 ecode += 2;
2138 }
2139 break;
2140
2141 /* Match a single character, caselessly */
2142
2143 case OP_CHARNC:
2144 #ifdef SUPPORT_UTF8
2145 if (utf8)
2146 {
2147 length = 1;
2148 ecode++;
2149 GETCHARLEN(fc, ecode, length);
2150
2151 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2152
2153 /* If the pattern character's value is < 128, we have only one byte, and
2154 can use the fast lookup table. */
2155
2156 if (fc < 128)
2157 {
2158 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2159 }
2160
2161 /* Otherwise we must pick up the subject character */
2162
2163 else
2164 {
2165 unsigned int dc;
2166 GETCHARINC(dc, eptr);
2167 ecode += length;
2168
2169 /* If we have Unicode property support, we can use it to test the other
2170 case of the character, if there is one. */
2171
2172 if (fc != dc)
2173 {
2174 #ifdef SUPPORT_UCP
2175 if (dc != UCD_OTHERCASE(fc))
2176 #endif
2177 RRETURN(MATCH_NOMATCH);
2178 }
2179 }
2180 }
2181 else
2182 #endif /* SUPPORT_UTF8 */
2183
2184 /* Non-UTF-8 mode */
2185 {
2186 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2187 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2188 ecode += 2;
2189 }
2190 break;
2191
2192 /* Match a single character repeatedly. */
2193
2194 case OP_EXACT:
2195 min = max = GET2(ecode, 1);
2196 ecode += 3;
2197 goto REPEATCHAR;
2198
2199 case OP_POSUPTO:
2200 possessive = TRUE;
2201 /* Fall through */
2202
2203 case OP_UPTO:
2204 case OP_MINUPTO:
2205 min = 0;
2206 max = GET2(ecode, 1);
2207 minimize = *ecode == OP_MINUPTO;
2208 ecode += 3;
2209 goto REPEATCHAR;
2210
2211 case OP_POSSTAR:
2212 possessive = TRUE;
2213 min = 0;
2214 max = INT_MAX;
2215 ecode++;
2216 goto REPEATCHAR;
2217
2218 case OP_POSPLUS:
2219 possessive = TRUE;
2220 min = 1;
2221 max = INT_MAX;
2222 ecode++;
2223 goto REPEATCHAR;
2224
2225 case OP_POSQUERY:
2226 possessive = TRUE;
2227 min = 0;
2228 max = 1;
2229 ecode++;
2230 goto REPEATCHAR;
2231
2232 case OP_STAR:
2233 case OP_MINSTAR:
2234 case OP_PLUS:
2235 case OP_MINPLUS:
2236 case OP_QUERY:
2237 case OP_MINQUERY:
2238 c = *ecode++ - OP_STAR;
2239 minimize = (c & 1) != 0;
2240 min = rep_min[c]; /* Pick up values from tables; */
2241 max = rep_max[c]; /* zero for max => infinity */
2242 if (max == 0) max = INT_MAX;
2243
2244 /* Common code for all repeated single-character matches. We can give
2245 up quickly if there are fewer than the minimum number of characters left in
2246 the subject. */
2247
2248 REPEATCHAR:
2249 #ifdef SUPPORT_UTF8
2250 if (utf8)
2251 {
2252 length = 1;
2253 charptr = ecode;
2254 GETCHARLEN(fc, ecode, length);
2255 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2256 ecode += length;
2257
2258 /* Handle multibyte character matching specially here. There is
2259 support for caseless matching if UCP support is present. */
2260
2261 if (length > 1)
2262 {
2263 #ifdef SUPPORT_UCP
2264 unsigned int othercase;
2265 if ((ims & PCRE_CASELESS) != 0 &&
2266 (othercase = UCD_OTHERCASE(fc)) != fc)
2267 oclength = _pcre_ord2utf8(othercase, occhars);
2268 else oclength = 0;
2269 #endif /* SUPPORT_UCP */
2270
2271 for (i = 1; i <= min; i++)
2272 {
2273 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2274 #ifdef SUPPORT_UCP
2275 /* Need braces because of following else */
2276 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2277 else
2278 {
2279 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2280 eptr += oclength;
2281 }
2282 #else /* without SUPPORT_UCP */
2283 else { RRETURN(MATCH_NOMATCH); }
2284 #endif /* SUPPORT_UCP */
2285 }
2286
2287 if (min == max) continue;
2288
2289 if (minimize)
2290 {
2291 for (fi = min;; fi++)
2292 {
2293 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2294 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2295 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2296 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2297 #ifdef SUPPORT_UCP
2298 /* Need braces because of following else */
2299 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2300 else
2301 {
2302 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2303 eptr += oclength;
2304 }
2305 #else /* without SUPPORT_UCP */
2306 else { RRETURN (MATCH_NOMATCH); }
2307 #endif /* SUPPORT_UCP */
2308 }
2309 /* Control never gets here */
2310 }
2311
2312 else /* Maximize */
2313 {
2314 pp = eptr;
2315 for (i = min; i < max; i++)
2316 {
2317 if (eptr > md->end_subject - length) break;
2318 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2319 #ifdef SUPPORT_UCP
2320 else if (oclength == 0) break;
2321 else
2322 {
2323 if (memcmp(eptr, occhars, oclength) != 0) break;
2324 eptr += oclength;
2325 }
2326 #else /* without SUPPORT_UCP */
2327 else break;
2328 #endif /* SUPPORT_UCP */
2329 }
2330
2331 if (possessive) continue;
2332 for(;;)
2333 {
2334 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2335 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2336 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2337 #ifdef SUPPORT_UCP
2338 eptr--;
2339 BACKCHAR(eptr);
2340 #else /* without SUPPORT_UCP */
2341 eptr -= length;
2342 #endif /* SUPPORT_UCP */
2343 }
2344 }
2345 /* Control never gets here */
2346 }
2347
2348 /* If the length of a UTF-8 character is 1, we fall through here, and
2349 obey the code as for non-UTF-8 characters below, though in this case the
2350 value of fc will always be < 128. */
2351 }
2352 else
2353 #endif /* SUPPORT_UTF8 */
2354
2355 /* When not in UTF-8 mode, load a single-byte character. */
2356 {
2357 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2358 fc = *ecode++;
2359 }
2360
2361 /* The value of fc at this point is always less than 256, though we may or
2362 may not be in UTF-8 mode. The code is duplicated for the caseless and
2363 caseful cases, for speed, since matching characters is likely to be quite
2364 common. First, ensure the minimum number of matches are present. If min =
2365 max, continue at the same level without recursing. Otherwise, if
2366 minimizing, keep trying the rest of the expression and advancing one
2367 matching character if failing, up to the maximum. Alternatively, if
2368 maximizing, find the maximum number of characters and work backwards. */
2369
2370 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2371 max, eptr));
2372
2373 if ((ims & PCRE_CASELESS) != 0)
2374 {
2375 fc = md->lcc[fc];
2376 for (i = 1; i <= min; i++)
2377 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2378 if (min == max) continue;
2379 if (minimize)
2380 {
2381 for (fi = min;; fi++)
2382 {
2383 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2384 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2385 if (fi >= max || eptr >= md->end_subject ||
2386 fc != md->lcc[*eptr++])
2387 RRETURN(MATCH_NOMATCH);
2388 }
2389 /* Control never gets here */
2390 }
2391 else /* Maximize */
2392 {
2393 pp = eptr;
2394 for (i = min; i < max; i++)
2395 {
2396 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2397 eptr++;
2398 }
2399 if (possessive) continue;
2400 while (eptr >= pp)
2401 {
2402 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2403 eptr--;
2404 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2405 }
2406 RRETURN(MATCH_NOMATCH);
2407 }
2408 /* Control never gets here */
2409 }
2410
2411 /* Caseful comparisons (includes all multi-byte characters) */
2412
2413 else
2414 {
2415 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2416 if (min == max) continue;
2417 if (minimize)
2418 {
2419 for (fi = min;; fi++)
2420 {
2421 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2422 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2423 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2424 RRETURN(MATCH_NOMATCH);
2425 }
2426 /* Control never gets here */
2427 }
2428 else /* Maximize */
2429 {
2430 pp = eptr;
2431 for (i = min; i < max; i++)
2432 {
2433 if (eptr >= md->end_subject || fc != *eptr) break;
2434 eptr++;
2435 }
2436 if (possessive) continue;
2437 while (eptr >= pp)
2438 {
2439 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2440 eptr--;
2441 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2442 }
2443 RRETURN(MATCH_NOMATCH);
2444 }
2445 }
2446 /* Control never gets here */
2447
2448 /* Match a negated single one-byte character. The character we are
2449 checking can be multibyte. */
2450
2451 case OP_NOT:
2452 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2453 ecode++;
2454 GETCHARINCTEST(c, eptr);
2455 if ((ims & PCRE_CASELESS) != 0)
2456 {
2457 #ifdef SUPPORT_UTF8
2458 if (c < 256)
2459 #endif
2460 c = md->lcc[c];
2461 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2462 }
2463 else
2464 {
2465 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2466 }
2467 break;
2468
2469 /* Match a negated single one-byte character repeatedly. This is almost a
2470 repeat of the code for a repeated single character, but I haven't found a
2471 nice way of commoning these up that doesn't require a test of the
2472 positive/negative option for each character match. Maybe that wouldn't add
2473 very much to the time taken, but character matching *is* what this is all
2474 about... */
2475
2476 case OP_NOTEXACT:
2477 min = max = GET2(ecode, 1);
2478 ecode += 3;
2479 goto REPEATNOTCHAR;
2480
2481 case OP_NOTUPTO:
2482 case OP_NOTMINUPTO:
2483 min = 0;
2484 max = GET2(ecode, 1);
2485 minimize = *ecode == OP_NOTMINUPTO;
2486 ecode += 3;
2487 goto REPEATNOTCHAR;
2488
2489 case OP_NOTPOSSTAR:
2490 possessive = TRUE;
2491 min = 0;
2492 max = INT_MAX;
2493 ecode++;
2494 goto REPEATNOTCHAR;
2495
2496 case OP_NOTPOSPLUS:
2497 possessive = TRUE;
2498 min = 1;
2499 max = INT_MAX;
2500 ecode++;
2501 goto REPEATNOTCHAR;
2502
2503 case OP_NOTPOSQUERY:
2504 possessive = TRUE;
2505 min = 0;
2506 max = 1;
2507 ecode++;
2508 goto REPEATNOTCHAR;
2509
2510 case OP_NOTPOSUPTO:
2511 possessive = TRUE;
2512 min = 0;
2513 max = GET2(ecode, 1);
2514 ecode += 3;
2515 goto REPEATNOTCHAR;
2516
2517 case OP_NOTSTAR:
2518 case OP_NOTMINSTAR:
2519 case OP_NOTPLUS:
2520 case OP_NOTMINPLUS:
2521 case OP_NOTQUERY:
2522 case OP_NOTMINQUERY:
2523 c = *ecode++ - OP_NOTSTAR;
2524 minimize = (c & 1) != 0;
2525 min = rep_min[c]; /* Pick up values from tables; */
2526 max = rep_max[c]; /* zero for max => infinity */
2527 if (max == 0) max = INT_MAX;
2528
2529 /* Common code for all repeated single-byte matches. We can give up quickly
2530 if there are fewer than the minimum number of bytes left in the
2531 subject. */
2532
2533 REPEATNOTCHAR:
2534 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2535 fc = *ecode++;
2536
2537 /* The code is duplicated for the caseless and caseful cases, for speed,
2538 since matching characters is likely to be quite common. First, ensure the
2539 minimum number of matches are present. If min = max, continue at the same
2540 level without recursing. Otherwise, if minimizing, keep trying the rest of
2541 the expression and advancing one matching character if failing, up to the
2542 maximum. Alternatively, if maximizing, find the maximum number of
2543 characters and work backwards. */
2544
2545 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2546 max, eptr));
2547
2548 if ((ims & PCRE_CASELESS) != 0)
2549 {
2550 fc = md->lcc[fc];
2551
2552 #ifdef SUPPORT_UTF8
2553 /* UTF-8 mode */
2554 if (utf8)
2555 {
2556 register unsigned int d;
2557 for (i = 1; i <= min; i++)
2558 {
2559 GETCHARINC(d, eptr);
2560 if (d < 256) d = md->lcc[d];
2561 if (fc == d) RRETURN(MATCH_NOMATCH);
2562 }
2563 }
2564 else
2565 #endif
2566
2567 /* Not UTF-8 mode */
2568 {
2569 for (i = 1; i <= min; i++)
2570 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2571 }
2572
2573 if (min == max) continue;
2574
2575 if (minimize)
2576 {
2577 #ifdef SUPPORT_UTF8
2578 /* UTF-8 mode */
2579 if (utf8)
2580 {
2581 register unsigned int d;
2582 for (fi = min;; fi++)
2583 {
2584 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2585 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2586 GETCHARINC(d, eptr);
2587 if (d < 256) d = md->lcc[d];
2588 if (fi >= max || eptr >= md->end_subject || fc == d)
2589 RRETURN(MATCH_NOMATCH);
2590 }
2591 }
2592 else
2593 #endif
2594 /* Not UTF-8 mode */
2595 {
2596 for (fi = min;; fi++)
2597 {
2598 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2599 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2600 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2601 RRETURN(MATCH_NOMATCH);
2602 }
2603 }
2604 /* Control never gets here */
2605 }
2606
2607 /* Maximize case */
2608
2609 else
2610 {
2611 pp = eptr;
2612
2613 #ifdef SUPPORT_UTF8
2614 /* UTF-8 mode */
2615 if (utf8)
2616 {
2617 register unsigned int d;
2618 for (i = min; i < max; i++)
2619 {
2620 int len = 1;
2621 if (eptr >= md->end_subject) break;
2622 GETCHARLEN(d, eptr, len);
2623 if (d < 256) d = md->lcc[d];
2624 if (fc == d) break;
2625 eptr += len;
2626 }
2627 if (possessive) continue;
2628 for(;;)
2629 {
2630 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2631 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2632 if (eptr-- == pp) break; /* Stop if tried at original pos */
2633 BACKCHAR(eptr);
2634 }
2635 }
2636 else
2637 #endif
2638 /* Not UTF-8 mode */
2639 {
2640 for (i = min; i < max; i++)
2641 {
2642 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2643 eptr++;
2644 }
2645 if (possessive) continue;
2646 while (eptr >= pp)
2647 {
2648 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2649 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2650 eptr--;
2651 }
2652 }
2653
2654 RRETURN(MATCH_NOMATCH);
2655 }
2656 /* Control never gets here */
2657 }
2658
2659 /* Caseful comparisons */
2660
2661 else
2662 {
2663 #ifdef SUPPORT_UTF8
2664 /* UTF-8 mode */
2665 if (utf8)
2666 {
2667 register unsigned int d;
2668 for (i = 1; i <= min; i++)
2669 {
2670 GETCHARINC(d, eptr);
2671 if (fc == d) RRETURN(MATCH_NOMATCH);
2672 }
2673 }
2674 else
2675 #endif
2676 /* Not UTF-8 mode */
2677 {
2678 for (i = 1; i <= min; i++)
2679 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2680 }
2681
2682 if (min == max) continue;
2683
2684 if (minimize)
2685 {
2686 #ifdef SUPPORT_UTF8
2687 /* UTF-8 mode */
2688 if (utf8)
2689 {
2690 register unsigned int d;
2691 for (fi = min;; fi++)
2692 {
2693 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2694 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2695 GETCHARINC(d, eptr);
2696 if (fi >= max || eptr >= md->end_subject || fc == d)
2697 RRETURN(MATCH_NOMATCH);
2698 }
2699 }
2700 else
2701 #endif
2702 /* Not UTF-8 mode */
2703 {
2704 for (fi = min;; fi++)
2705 {
2706 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2707 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2708 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2709 RRETURN(MATCH_NOMATCH);
2710 }
2711 }
2712 /* Control never gets here */
2713 }
2714
2715 /* Maximize case */
2716
2717 else
2718 {
2719 pp = eptr;
2720
2721 #ifdef SUPPORT_UTF8
2722 /* UTF-8 mode */
2723 if (utf8)
2724 {
2725 register unsigned int d;
2726 for (i = min; i < max; i++)
2727 {
2728 int len = 1;
2729 if (eptr >= md->end_subject) break;
2730 GETCHARLEN(d, eptr, len);
2731 if (fc == d) break;
2732 eptr += len;
2733 }
2734 if (possessive) continue;
2735 for(;;)
2736 {
2737 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2738 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2739 if (eptr-- == pp) break; /* Stop if tried at original pos */
2740 BACKCHAR(eptr);
2741 }
2742 }
2743 else
2744 #endif
2745 /* Not UTF-8 mode */
2746 {
2747 for (i = min; i < max; i++)
2748 {
2749 if (eptr >= md->end_subject || fc == *eptr) break;
2750 eptr++;
2751 }
2752 if (possessive) continue;
2753 while (eptr >= pp)
2754 {
2755 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2756 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2757 eptr--;
2758 }
2759 }
2760
2761 RRETURN(MATCH_NOMATCH);
2762 }
2763 }
2764 /* Control never gets here */
2765
2766 /* Match a single character type repeatedly; several different opcodes
2767 share code. This is very similar to the code for single characters, but we
2768 repeat it in the interests of efficiency. */
2769
2770 case OP_TYPEEXACT:
2771 min = max = GET2(ecode, 1);
2772 minimize = TRUE;
2773 ecode += 3;
2774 goto REPEATTYPE;
2775
2776 case OP_TYPEUPTO:
2777 case OP_TYPEMINUPTO:
2778 min = 0;
2779 max = GET2(ecode, 1);
2780 minimize = *ecode == OP_TYPEMINUPTO;
2781 ecode += 3;
2782 goto REPEATTYPE;
2783
2784 case OP_TYPEPOSSTAR:
2785 possessive = TRUE;
2786 min = 0;
2787 max = INT_MAX;
2788 ecode++;
2789 goto REPEATTYPE;
2790
2791 case OP_TYPEPOSPLUS:
2792 possessive = TRUE;
2793 min = 1;
2794 max = INT_MAX;
2795 ecode++;
2796 goto REPEATTYPE;
2797
2798 case OP_TYPEPOSQUERY:
2799 possessive = TRUE;
2800 min = 0;
2801 max = 1;
2802 ecode++;
2803 goto REPEATTYPE;
2804
2805 case OP_TYPEPOSUPTO:
2806 possessive = TRUE;
2807 min = 0;
2808 max = GET2(ecode, 1);
2809 ecode += 3;
2810 goto REPEATTYPE;
2811
2812 case OP_TYPESTAR:
2813 case OP_TYPEMINSTAR:
2814 case OP_TYPEPLUS:
2815 case OP_TYPEMINPLUS:
2816 case OP_TYPEQUERY:
2817 case OP_TYPEMINQUERY:
2818 c = *ecode++ - OP_TYPESTAR;
2819 minimize = (c & 1) != 0;
2820 min = rep_min[c]; /* Pick up values from tables; */
2821 max = rep_max[c]; /* zero for max => infinity */
2822 if (max == 0) max = INT_MAX;
2823
2824 /* Common code for all repeated single character type matches. Note that
2825 in UTF-8 mode, '.' matches a character of any length, but for the other
2826 character types, the valid characters are all one-byte long. */
2827
2828 REPEATTYPE:
2829 ctype = *ecode++; /* Code for the character type */
2830
2831 #ifdef SUPPORT_UCP
2832 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2833 {
2834 prop_fail_result = ctype == OP_NOTPROP;
2835 prop_type = *ecode++;
2836 prop_value = *ecode++;
2837 }
2838 else prop_type = -1;
2839 #endif
2840
2841 /* First, ensure the minimum number of matches are present. Use inline
2842 code for maximizing the speed, and do the type test once at the start
2843 (i.e. keep it out of the loop). Also we can test that there are at least
2844 the minimum number of bytes before we start. This isn't as effective in
2845 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2846 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2847 and single-bytes. */
2848
2849 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2850 if (min > 0)
2851 {
2852 #ifdef SUPPORT_UCP
2853 if (prop_type >= 0)
2854 {
2855 switch(prop_type)
2856 {
2857 case PT_ANY:
2858 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2859 for (i = 1; i <= min; i++)
2860 {
2861 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2862 GETCHARINCTEST(c, eptr);
2863 }
2864 break;
2865
2866 case PT_LAMP:
2867 for (i = 1; i <= min; i++)
2868 {
2869 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2870 GETCHARINCTEST(c, eptr);
2871 prop_chartype = UCD_CHARTYPE(c);
2872 if ((prop_chartype == ucp_Lu ||
2873 prop_chartype == ucp_Ll ||
2874 prop_chartype == ucp_Lt) == prop_fail_result)
2875 RRETURN(MATCH_NOMATCH);
2876 }
2877 break;
2878
2879 case PT_GC:
2880 for (i = 1; i <= min; i++)
2881 {
2882 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2883 GETCHARINCTEST(c, eptr);
2884 prop_category = UCD_CATEGORY(c);
2885 if ((prop_category == prop_value) == prop_fail_result)
2886 RRETURN(MATCH_NOMATCH);
2887 }
2888 break;
2889
2890 case PT_PC:
2891 for (i = 1; i <= min; i++)
2892 {
2893 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2894 GETCHARINCTEST(c, eptr);
2895 prop_chartype = UCD_CHARTYPE(c);
2896 if ((prop_chartype == prop_value) == prop_fail_result)
2897 RRETURN(MATCH_NOMATCH);
2898 }
2899 break;
2900
2901 case PT_SC:
2902 for (i = 1; i <= min; i++)
2903 {
2904 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2905 GETCHARINCTEST(c, eptr);
2906 prop_script = UCD_SCRIPT(c);
2907 if ((prop_script == prop_value) == prop_fail_result)
2908 RRETURN(MATCH_NOMATCH);
2909 }
2910 break;
2911
2912 default:
2913 RRETURN(PCRE_ERROR_INTERNAL);
2914 }
2915 }
2916
2917 /* Match extended Unicode sequences. We will get here only if the
2918 support is in the binary; otherwise a compile-time error occurs. */
2919
2920 else if (ctype == OP_EXTUNI)
2921 {
2922 for (i = 1; i <= min; i++)
2923 {
2924 GETCHARINCTEST(c, eptr);
2925 prop_category = UCD_CATEGORY(c);
2926 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2927 while (eptr < md->end_subject)
2928 {
2929 int len = 1;
2930 if (!utf8) c = *eptr; else
2931 {
2932 GETCHARLEN(c, eptr, len);
2933 }
2934 prop_category = UCD_CATEGORY(c);
2935 if (prop_category != ucp_M) break;
2936 eptr += len;
2937 }
2938 }
2939 }
2940
2941 else
2942 #endif /* SUPPORT_UCP */
2943
2944 /* Handle all other cases when the coding is UTF-8 */
2945
2946 #ifdef SUPPORT_UTF8
2947 if (utf8) switch(ctype)
2948 {
2949 case OP_ANY:
2950 for (i = 1; i <= min; i++)
2951 {
2952 if (eptr >= md->end_subject || IS_NEWLINE(eptr))
2953 RRETURN(MATCH_NOMATCH);
2954 eptr++;
2955 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2956 }
2957 break;
2958
2959 case OP_ALLANY:
2960 for (i = 1; i <= min; i++)
2961 {
2962 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2963 eptr++;
2964 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2965 }
2966 break;
2967
2968 case OP_ANYBYTE:
2969 eptr += min;
2970 break;
2971
2972 case OP_ANYNL:
2973 for (i = 1; i <= min; i++)
2974 {
2975 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2976 GETCHARINC(c, eptr);
2977 switch(c)
2978 {
2979 default: RRETURN(MATCH_NOMATCH);
2980 case 0x000d:
2981 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2982 break;
2983
2984 case 0x000a:
2985 break;
2986
2987 case 0x000b:
2988 case 0x000c:
2989 case 0x0085:
2990 case 0x2028:
2991 case 0x2029:
2992 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2993 break;
2994 }
2995 }
2996 break;
2997
2998 case OP_NOT_HSPACE:
2999 for (i = 1; i <= min; i++)
3000 {
3001 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3002 GETCHARINC(c, eptr);
3003 switch(c)
3004 {
3005 default: break;
3006 case 0x09: /* HT */
3007 case 0x20: /* SPACE */
3008 case 0xa0: /* NBSP */
3009 case 0x1680: /* OGHAM SPACE MARK */
3010 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3011 case 0x2000: /* EN QUAD */
3012 case 0x2001: /* EM QUAD */
3013 case 0x2002: /* EN SPACE */
3014 case 0x2003: /* EM SPACE */
3015 case 0x2004: /* THREE-PER-EM SPACE */
3016 case 0x2005: /* FOUR-PER-EM SPACE */
3017 case 0x2006: /* SIX-PER-EM SPACE */
3018 case 0x2007: /* FIGURE SPACE */
3019 case 0x2008: /* PUNCTUATION SPACE */
3020 case 0x2009: /* THIN SPACE */
3021 case 0x200A: /* HAIR SPACE */
3022 case 0x202f: /* NARROW NO-BREAK SPACE */
3023 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3024 case 0x3000: /* IDEOGRAPHIC SPACE */
3025 RRETURN(MATCH_NOMATCH);
3026 }
3027 }
3028 break;
3029
3030 case OP_HSPACE:
3031 for (i = 1; i <= min; i++)
3032 {
3033 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3034 GETCHARINC(c, eptr);
3035 switch(c)
3036 {
3037 default: RRETURN(MATCH_NOMATCH);
3038 case 0x09: /* HT */
3039 case 0x20: /* SPACE */
3040 case 0xa0: /* NBSP */
3041 case 0x1680: /* OGHAM SPACE MARK */
3042 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3043 case 0x2000: /* EN QUAD */
3044 case 0x2001: /* EM QUAD */
3045 case 0x2002: /* EN SPACE */
3046 case 0x2003: /* EM SPACE */
3047 case 0x2004: /* THREE-PER-EM SPACE */
3048 case 0x2005: /* FOUR-PER-EM SPACE */
3049 case 0x2006: /* SIX-PER-EM SPACE */
3050 case 0x2007: /* FIGURE SPACE */
3051 case 0x2008: /* PUNCTUATION SPACE */
3052 case 0x2009: /* THIN SPACE */
3053 case 0x200A: /* HAIR SPACE */
3054 case 0x202f: /* NARROW NO-BREAK SPACE */
3055 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3056 case 0x3000: /* IDEOGRAPHIC SPACE */
3057 break;
3058 }
3059 }
3060 break;
3061
3062 case OP_NOT_VSPACE:
3063 for (i = 1; i <= min; i++)
3064 {
3065 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3066 GETCHARINC(c, eptr);
3067 switch(c)
3068 {
3069 default: break;
3070 case 0x0a: /* LF */
3071 case 0x0b: /* VT */
3072 case 0x0c: /* FF */
3073 case 0x0d: /* CR */
3074 case 0x85: /* NEL */
3075 case 0x2028: /* LINE SEPARATOR */
3076 case 0x2029: /* PARAGRAPH SEPARATOR */
3077 RRETURN(MATCH_NOMATCH);
3078 }
3079 }
3080 break;
3081
3082 case OP_VSPACE:
3083 for (i = 1; i <= min; i++)
3084 {
3085 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3086 GETCHARINC(c, eptr);
3087 switch(c)
3088 {
3089 default: RRETURN(MATCH_NOMATCH);
3090 case 0x0a: /* LF */
3091 case 0x0b: /* VT */
3092 case 0x0c: /* FF */
3093 case 0x0d: /* CR */
3094 case 0x85: /* NEL */
3095 case 0x2028: /* LINE SEPARATOR */
3096 case 0x2029: /* PARAGRAPH SEPARATOR */
3097 break;
3098 }
3099 }
3100 break;
3101
3102 case OP_NOT_DIGIT:
3103 for (i = 1; i <= min; i++)
3104 {
3105 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3106 GETCHARINC(c, eptr);
3107 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3108 RRETURN(MATCH_NOMATCH);
3109 }
3110 break;
3111
3112 case OP_DIGIT:
3113 for (i = 1; i <= min; i++)
3114 {
3115 if (eptr >= md->end_subject ||
3116 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3117 RRETURN(MATCH_NOMATCH);
3118 /* No need to skip more bytes - we know it's a 1-byte character */
3119 }
3120 break;
3121
3122 case OP_NOT_WHITESPACE:
3123 for (i = 1; i <= min; i++)
3124 {
3125 if (eptr >= md->end_subject ||
3126 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3127 RRETURN(MATCH_NOMATCH);
3128 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3129 }
3130 break;
3131
3132 case OP_WHITESPACE:
3133 for (i = 1; i <= min; i++)
3134 {
3135 if (eptr >= md->end_subject ||
3136 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3137 RRETURN(MATCH_NOMATCH);
3138 /* No need to skip more bytes - we know it's a 1-byte character */
3139 }
3140 break;
3141
3142 case OP_NOT_WORDCHAR:
3143 for (i = 1; i <= min; i++)
3144 {
3145 if (eptr >= md->end_subject ||
3146 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3147 RRETURN(MATCH_NOMATCH);
3148 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3149 }
3150 break;
3151
3152 case OP_WORDCHAR:
3153 for (i = 1; i <= min; i++)
3154 {
3155 if (eptr >= md->end_subject ||
3156 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3157 RRETURN(MATCH_NOMATCH);
3158 /* No need to skip more bytes - we know it's a 1-byte character */
3159 }
3160 break;
3161
3162 default:
3163 RRETURN(PCRE_ERROR_INTERNAL);
3164 } /* End switch(ctype) */
3165
3166 else
3167 #endif /* SUPPORT_UTF8 */
3168
3169 /* Code for the non-UTF-8 case for minimum matching of operators other
3170 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3171 number of bytes present, as this was tested above. */
3172
3173 switch(ctype)
3174 {
3175 case OP_ANY:
3176 for (i = 1; i <= min; i++)
3177 {
3178 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3179 eptr++;
3180 }
3181 break;
3182
3183 case OP_ALLANY:
3184 eptr += min;
3185 break;
3186
3187 case OP_ANYBYTE:
3188 eptr += min;
3189 break;
3190
3191 /* Because of the CRLF case, we can't assume the minimum number of
3192 bytes are present in this case. */
3193
3194 case OP_ANYNL:
3195 for (i = 1; i <= min; i++)
3196 {
3197 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3198 switch(*eptr++)
3199 {
3200 default: RRETURN(MATCH_NOMATCH);
3201 case 0x000d:
3202 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3203 break;
3204 case 0x000a:
3205 break;
3206
3207 case 0x000b:
3208 case 0x000c:
3209 case 0x0085:
3210 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3211 break;
3212 }
3213 }
3214 break;
3215
3216 case OP_NOT_HSPACE:
3217 for (i = 1; i <= min; i++)
3218 {
3219 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3220 switch(*eptr++)
3221 {
3222 default: break;
3223 case 0x09: /* HT */
3224 case 0x20: /* SPACE */
3225 case 0xa0: /* NBSP */
3226 RRETURN(MATCH_NOMATCH);
3227 }
3228 }
3229 break;
3230
3231 case OP_HSPACE:
3232 for (i = 1; i <= min; i++)
3233 {
3234 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3235 switch(*eptr++)
3236 {
3237 default: RRETURN(MATCH_NOMATCH);
3238 case 0x09: /* HT */
3239 case 0x20: /* SPACE */
3240 case 0xa0: /* NBSP */
3241 break;
3242 }
3243 }
3244 break;
3245
3246 case OP_NOT_VSPACE:
3247 for (i = 1; i <= min; i++)
3248 {
3249 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3250 switch(*eptr++)
3251 {
3252 default: break;
3253 case 0x0a: /* LF */
3254 case 0x0b: /* VT */
3255 case 0x0c: /* FF */
3256 case 0x0d: /* CR */
3257 case 0x85: /* NEL */
3258 RRETURN(MATCH_NOMATCH);
3259 }
3260 }
3261 break;
3262
3263 case OP_VSPACE:
3264 for (i = 1; i <= min; i++)
3265 {
3266 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3267 switch(*eptr++)
3268 {
3269 default: RRETURN(MATCH_NOMATCH);
3270 case 0x0a: /* LF */
3271 case 0x0b: /* VT */
3272 case 0x0c: /* FF */
3273 case 0x0d: /* CR */
3274 case 0x85: /* NEL */
3275 break;
3276 }
3277 }
3278 break;
3279
3280 case OP_NOT_DIGIT:
3281 for (i = 1; i <= min; i++)
3282 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3283 break;
3284
3285 case OP_DIGIT:
3286 for (i = 1; i <= min; i++)
3287 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3288 break;
3289
3290 case OP_NOT_WHITESPACE:
3291 for (i = 1; i <= min; i++)
3292 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3293 break;
3294
3295 case OP_WHITESPACE:
3296 for (i = 1; i <= min; i++)
3297 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3298 break;
3299
3300 case OP_NOT_WORDCHAR:
3301 for (i = 1; i <= min; i++)
3302 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3303 RRETURN(MATCH_NOMATCH);
3304 break;
3305
3306 case OP_WORDCHAR:
3307 for (i = 1; i <= min; i++)
3308 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3309 RRETURN(MATCH_NOMATCH);
3310 break;
3311
3312 default:
3313 RRETURN(PCRE_ERROR_INTERNAL);
3314 }
3315 }
3316
3317 /* If min = max, continue at the same level without recursing */
3318
3319 if (min == max) continue;
3320
3321 /* If minimizing, we have to test the rest of the pattern before each
3322 subsequent match. Again, separate the UTF-8 case for speed, and also
3323 separate the UCP cases. */
3324
3325 if (minimize)
3326 {
3327 #ifdef SUPPORT_UCP
3328 if (prop_type >= 0)
3329 {
3330 switch(prop_type)
3331 {
3332 case PT_ANY:
3333 for (fi = min;; fi++)
3334 {
3335 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3336 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3337 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3338 GETCHARINC(c, eptr);
3339 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3340 }
3341 /* Control never gets here */
3342
3343 case PT_LAMP:
3344 for (fi = min;; fi++)
3345 {
3346 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3347 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3348 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3349 GETCHARINC(c, eptr);
3350 prop_chartype = UCD_CHARTYPE(c);
3351 if ((prop_chartype == ucp_Lu ||
3352 prop_chartype == ucp_Ll ||
3353 prop_chartype == ucp_Lt) == prop_fail_result)
3354 RRETURN(MATCH_NOMATCH);
3355 }
3356 /* Control never gets here */
3357
3358 case PT_GC:
3359 for (fi = min;; fi++)
3360 {
3361 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3362 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3363 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3364 GETCHARINC(c, eptr);
3365 prop_category = UCD_CATEGORY(c);
3366 if ((prop_category == prop_value) == prop_fail_result)
3367 RRETURN(MATCH_NOMATCH);
3368 }
3369 /* Control never gets here */
3370
3371 case PT_PC:
3372 for (fi = min;; fi++)
3373 {
3374 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3375 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3376 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3377 GETCHARINC(c, eptr);
3378 prop_chartype = UCD_CHARTYPE(c);
3379 if ((prop_chartype == prop_value) == prop_fail_result)
3380 RRETURN(MATCH_NOMATCH);
3381 }
3382 /* Control never gets here */
3383
3384 case PT_SC:
3385 for (fi = min;; fi++)
3386 {
3387 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3388 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3389 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3390 GETCHARINC(c, eptr);
3391 prop_script = UCD_SCRIPT(c);
3392 if ((prop_script == prop_value) == prop_fail_result)
3393 RRETURN(MATCH_NOMATCH);
3394 }
3395 /* Control never gets here */
3396
3397 default:
3398 RRETURN(PCRE_ERROR_INTERNAL);
3399 }
3400 }
3401
3402 /* Match extended Unicode sequences. We will get here only if the
3403 support is in the binary; otherwise a compile-time error occurs. */
3404
3405 else if (ctype == OP_EXTUNI)
3406 {
3407 for (fi = min;; fi++)
3408 {
3409 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3410 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3411 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3412 GETCHARINCTEST(c, eptr);
3413 prop_category = UCD_CATEGORY(c);
3414 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3415 while (eptr < md->end_subject)
3416 {
3417 int len = 1;
3418 if (!utf8) c = *eptr; else
3419 {
3420 GETCHARLEN(c, eptr, len);
3421 }
3422 prop_category = UCD_CATEGORY(c);
3423 if (prop_category != ucp_M) break;
3424 eptr += len;
3425 }
3426 }
3427 }
3428
3429 else
3430 #endif /* SUPPORT_UCP */
3431
3432 #ifdef SUPPORT_UTF8
3433 /* UTF-8 mode */
3434 if (utf8)
3435 {
3436 for (fi = min;; fi++)
3437 {
3438 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3439 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3440 if (fi >= max || eptr >= md->end_subject ||
3441 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3442 RRETURN(MATCH_NOMATCH);
3443
3444 GETCHARINC(c, eptr);
3445 switch(ctype)
3446 {
3447 case OP_ANY: /* This is the non-NL case */
3448 case OP_ALLANY:
3449 case OP_ANYBYTE:
3450 break;
3451
3452 case OP_ANYNL:
3453 switch(c)
3454 {
3455 default: RRETURN(MATCH_NOMATCH);
3456 case 0x000d:
3457 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3458 break;
3459 case 0x000a:
3460 break;
3461
3462 case 0x000b:
3463 case 0x000c:
3464 case 0x0085:
3465 case 0x2028:
3466 case 0x2029:
3467 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3468 break;
3469 }
3470 break;
3471
3472 case OP_NOT_HSPACE:
3473 switch(c)
3474 {
3475 default: break;
3476 case 0x09: /* HT */
3477 case 0x20: /* SPACE */
3478 case 0xa0: /* NBSP */
3479 case 0x1680: /* OGHAM SPACE MARK */
3480 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3481 case 0x2000: /* EN QUAD */
3482 case 0x2001: /* EM QUAD */
3483 case 0x2002: /* EN SPACE */
3484 case 0x2003: /* EM SPACE */
3485 case 0x2004: /* THREE-PER-EM SPACE */
3486 case 0x2005: /* FOUR-PER-EM SPACE */
3487 case 0x2006: /* SIX-PER-EM SPACE */
3488 case 0x2007: /* FIGURE SPACE */
3489 case 0x2008: /* PUNCTUATION SPACE */
3490 case 0x2009: /* THIN SPACE */
3491 case 0x200A: /* HAIR SPACE */
3492 case 0x202f: /* NARROW NO-BREAK SPACE */
3493 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3494 case 0x3000: /* IDEOGRAPHIC SPACE */
3495 RRETURN(MATCH_NOMATCH);
3496 }
3497 break;
3498
3499 case OP_HSPACE:
3500 switch(c)
3501 {
3502 default: RRETURN(MATCH_NOMATCH);
3503 case 0x09: /* HT */
3504 case 0x20: /* SPACE */
3505 case 0xa0: /* NBSP */
3506 case 0x1680: /* OGHAM SPACE MARK */
3507 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3508 case 0x2000: /* EN QUAD */
3509 case 0x2001: /* EM QUAD */
3510 case 0x2002: /* EN SPACE */
3511 case 0x2003: /* EM SPACE */
3512 case 0x2004: /* THREE-PER-EM SPACE */
3513 case 0x2005: /* FOUR-PER-EM SPACE */
3514 case 0x2006: /* SIX-PER-EM SPACE */
3515 case 0x2007: /* FIGURE SPACE */
3516 case 0x2008: /* PUNCTUATION SPACE */
3517 case 0x2009: /* THIN SPACE */
3518 case 0x200A: /* HAIR SPACE */
3519 case 0x202f: /* NARROW NO-BREAK SPACE */
3520 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3521 case 0x3000: /* IDEOGRAPHIC SPACE */
3522 break;
3523 }
3524 break;
3525
3526 case OP_NOT_VSPACE:
3527 switch(c)
3528 {
3529 default: break;
3530 case 0x0a: /* LF */
3531 case 0x0b: /* VT */
3532 case 0x0c: /* FF */
3533 case 0x0d: /* CR */
3534 case 0x85: /* NEL */
3535 case 0x2028: /* LINE SEPARATOR */
3536 case 0x2029: /* PARAGRAPH SEPARATOR */
3537 RRETURN(MATCH_NOMATCH);
3538 }
3539 break;
3540
3541 case OP_VSPACE:
3542 switch(c)
3543 {
3544 default: RRETURN(MATCH_NOMATCH);
3545 case 0x0a: /* LF */
3546 case 0x0b: /* VT */
3547 case 0x0c: /* FF */
3548 case 0x0d: /* CR */
3549 case 0x85: /* NEL */
3550 case 0x2028: /* LINE SEPARATOR */
3551 case 0x2029: /* PARAGRAPH SEPARATOR */
3552 break;
3553 }
3554 break;
3555
3556 case OP_NOT_DIGIT:
3557 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3558 RRETURN(MATCH_NOMATCH);
3559 break;
3560
3561 case OP_DIGIT:
3562 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3563 RRETURN(MATCH_NOMATCH);
3564 break;
3565
3566 case OP_NOT_WHITESPACE:
3567 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3568 RRETURN(MATCH_NOMATCH);
3569 break;
3570
3571 case OP_WHITESPACE:
3572 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3573 RRETURN(MATCH_NOMATCH);
3574 break;
3575
3576 case OP_NOT_WORDCHAR:
3577 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3578 RRETURN(MATCH_NOMATCH);
3579 break;
3580
3581 case OP_WORDCHAR:
3582 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3583 RRETURN(MATCH_NOMATCH);
3584 break;
3585
3586 default:
3587 RRETURN(PCRE_ERROR_INTERNAL);
3588 }
3589 }
3590 }
3591 else
3592 #endif
3593 /* Not UTF-8 mode */
3594 {
3595 for (fi = min;; fi++)
3596 {
3597 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3598 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3599 if (fi >= max || eptr >= md->end_subject ||
3600 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3601 RRETURN(MATCH_NOMATCH);
3602
3603 c = *eptr++;
3604 switch(ctype)
3605 {
3606 case OP_ANY: /* This is the non-NL case */
3607 case OP_ALLANY:
3608 case OP_ANYBYTE:
3609 break;
3610
3611 case OP_ANYNL:
3612 switch(c)
3613 {
3614 default: RRETURN(MATCH_NOMATCH);
3615 case 0x000d:
3616 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3617 break;
3618
3619 case 0x000a:
3620 break;
3621
3622 case 0x000b:
3623 case 0x000c:
3624 case 0x0085:
3625 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3626 break;
3627 }
3628 break;
3629
3630 case OP_NOT_HSPACE:
3631 switch(c)
3632 {
3633 default: break;
3634 case 0x09: /* HT */
3635 case 0x20: /* SPACE */
3636 case 0xa0: /* NBSP */
3637 RRETURN(MATCH_NOMATCH);
3638 }
3639 break;
3640
3641 case OP_HSPACE:
3642 switch(c)
3643 {
3644 default: RRETURN(MATCH_NOMATCH);
3645 case 0x09: /* HT */
3646 case 0x20: /* SPACE */
3647 case 0xa0: /* NBSP */
3648 break;
3649 }
3650 break;
3651
3652 case OP_NOT_VSPACE:
3653 switch(c)
3654 {
3655 default: break;
3656 case 0x0a: /* LF */
3657 case 0x0b: /* VT */
3658 case 0x0c: /* FF */
3659 case 0x0d: /* CR */
3660 case 0x85: /* NEL */
3661 RRETURN(MATCH_NOMATCH);
3662 }
3663 break;
3664
3665 case OP_VSPACE:
3666 switch(c)
3667 {
3668 default: RRETURN(MATCH_NOMATCH);
3669 case 0x0a: /* LF */
3670 case 0x0b: /* VT */
3671 case 0x0c: /* FF */
3672 case 0x0d: /* CR */
3673 case 0x85: /* NEL */
3674 break;
3675 }
3676 break;
3677
3678 case OP_NOT_DIGIT:
3679 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3680 break;
3681
3682 case OP_DIGIT:
3683 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3684 break;
3685
3686 case OP_NOT_WHITESPACE:
3687 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3688 break;
3689
3690 case OP_WHITESPACE:
3691 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3692 break;
3693
3694 case OP_NOT_WORDCHAR:
3695 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3696 break;
3697
3698 case OP_WORDCHAR:
3699 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3700 break;
3701
3702 default:
3703 RRETURN(PCRE_ERROR_INTERNAL);
3704 }
3705 }
3706 }
3707 /* Control never gets here */
3708 }
3709
3710 /* If maximizing, it is worth using inline code for speed, doing the type
3711 test once at the start (i.e. keep it out of the loop). Again, keep the
3712 UTF-8 and UCP stuff separate. */
3713
3714 else
3715 {
3716 pp = eptr; /* Remember where we started */
3717
3718 #ifdef SUPPORT_UCP
3719 if (prop_type >= 0)
3720 {
3721 switch(prop_type)
3722 {
3723 case PT_ANY:
3724 for (i = min; i < max; i++)
3725 {
3726 int len = 1;
3727 if (eptr >= md->end_subject) break;
3728 GETCHARLEN(c, eptr, len);
3729 if (prop_fail_result) break;
3730 eptr+= len;
3731 }
3732 break;
3733
3734 case PT_LAMP:
3735 for (i = min; i < max; i++)
3736 {
3737 int len = 1;
3738 if (eptr >= md->end_subject) break;
3739 GETCHARLEN(c, eptr, len);
3740 prop_chartype = UCD_CHARTYPE(c);
3741 if ((prop_chartype == ucp_Lu ||
3742 prop_chartype == ucp_Ll ||
3743 prop_chartype == ucp_Lt) == prop_fail_result)
3744 break;
3745 eptr+= len;
3746 }
3747 break;
3748
3749 case PT_GC:
3750 for (i = min; i < max; i++)
3751 {
3752 int len = 1;
3753 if (eptr >= md->end_subject) break;
3754 GETCHARLEN(c, eptr, len);
3755 prop_category = UCD_CATEGORY(c);
3756 if ((prop_category == prop_value) == prop_fail_result)
3757 break;
3758 eptr+= len;
3759 }
3760 break;
3761
3762 case PT_PC:
3763 for (i = min; i < max; i++)
3764 {
3765 int len = 1;
3766 if (eptr >= md->end_subject) break;
3767 GETCHARLEN(c, eptr, len);
3768 prop_chartype = UCD_CHARTYPE(c);
3769 if ((prop_chartype == prop_value) == prop_fail_result)
3770 break;
3771 eptr+= len;
3772 }
3773 break;
3774
3775 case PT_SC:
3776 for (i = min; i < max; i++)
3777 {
3778 int len = 1;
3779 if (eptr >= md->end_subject) break;
3780 GETCHARLEN(c, eptr, len);
3781 prop_script = UCD_SCRIPT(c);
3782 if ((prop_script == prop_value) == prop_fail_result)
3783 break;
3784 eptr+= len;
3785 }
3786 break;
3787 }
3788
3789 /* eptr is now past the end of the maximum run */
3790
3791 if (possessive) continue;
3792 for(;;)
3793 {
3794 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3796 if (eptr-- == pp) break; /* Stop if tried at original pos */
3797 if (utf8) BACKCHAR(eptr);
3798 }
3799 }
3800
3801 /* Match extended Unicode sequences. We will get here only if the
3802 support is in the binary; otherwise a compile-time error occurs. */
3803
3804 else if (ctype == OP_EXTUNI)
3805 {
3806 for (i = min; i < max; i++)
3807 {
3808 if (eptr >= md->end_subject) break;
3809 GETCHARINCTEST(c, eptr);
3810 prop_category = UCD_CATEGORY(c);
3811 if (prop_category == ucp_M) break;
3812 while (eptr < md->end_subject)
3813 {
3814 int len = 1;
3815 if (!utf8) c = *eptr; else
3816 {
3817 GETCHARLEN(c, eptr, len);
3818 }
3819 prop_category = UCD_CATEGORY(c);
3820 if (prop_category != ucp_M) break;
3821 eptr += len;
3822 }
3823 }
3824
3825 /* eptr is now past the end of the maximum run */
3826
3827 if (possessive) continue;
3828 for(;;)
3829 {
3830 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3831 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3832 if (eptr-- == pp) break; /* Stop if tried at original pos */
3833 for (;;) /* Move back over one extended */
3834 {
3835 int len = 1;
3836 if (!utf8) c = *eptr; else
3837 {
3838 BACKCHAR(eptr);
3839 GETCHARLEN(c, eptr, len);
3840 }
3841 prop_category = UCD_CATEGORY(c);
3842 if (prop_category != ucp_M) break;
3843 eptr--;
3844 }
3845 }
3846 }
3847
3848 else
3849 #endif /* SUPPORT_UCP */
3850
3851 #ifdef SUPPORT_UTF8
3852 /* UTF-8 mode */
3853
3854 if (utf8)
3855 {
3856 switch(ctype)
3857 {
3858 case OP_ANY:
3859 if (max < INT_MAX)
3860 {
3861 for (i = min; i < max; i++)
3862 {
3863 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3864 eptr++;
3865 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3866 }
3867 }
3868
3869 /* Handle unlimited UTF-8 repeat */
3870
3871 else
3872 {
3873 for (i = min; i < max; i++)
3874 {
3875 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3876 eptr++;
3877 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3878 }
3879 }
3880 break;
3881
3882 case OP_ALLANY:
3883 if (max < INT_MAX)
3884 {
3885 for (i = min; i < max; i++)
3886 {
3887 if (eptr >= md->end_subject) break;
3888 eptr++;
3889 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3890 }
3891 }
3892 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3893 break;
3894
3895 /* The byte case is the same as non-UTF8 */
3896
3897 case OP_ANYBYTE:
3898 c = max - min;
3899 if (c > (unsigned int)(md->end_subject - eptr))
3900 c = md->end_subject - eptr;
3901 eptr += c;
3902 break;
3903
3904 case OP_ANYNL:
3905 for (i = min; i < max; i++)
3906 {
3907 int len = 1;
3908 if (eptr >= md->end_subject) break;
3909 GETCHARLEN(c, eptr, len);
3910 if (c == 0x000d)
3911 {
3912 if (++eptr >= md->end_subject) break;
3913 if (*eptr == 0x000a) eptr++;
3914 }
3915 else
3916 {
3917 if (c != 0x000a &&
3918 (md->bsr_anycrlf ||
3919 (c != 0x000b && c != 0x000c &&
3920 c != 0x0085 && c != 0x2028 && c != 0x2029)))
3921 break;
3922 eptr += len;
3923 }
3924 }
3925 break;
3926
3927 case OP_NOT_HSPACE:
3928 case OP_HSPACE:
3929 for (i = min; i < max; i++)
3930 {
3931 BOOL gotspace;
3932 int len = 1;
3933 if (eptr >= md->end_subject) break;
3934 GETCHARLEN(c, eptr, len);
3935 switch(c)
3936 {
3937 default: gotspace = FALSE; break;
3938 case 0x09: /* HT */
3939 case 0x20: /* SPACE */
3940 case 0xa0: /* NBSP */
3941 case 0x1680: /* OGHAM SPACE MARK */
3942 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3943 case 0x2000: /* EN QUAD */
3944 case 0x2001: /* EM QUAD */
3945 case 0x2002: /* EN SPACE */
3946 case 0x2003: /* EM SPACE */
3947 case 0x2004: /* THREE-PER-EM SPACE */
3948 case 0x2005: /* FOUR-PER-EM SPACE */
3949 case 0x2006: /* SIX-PER-EM SPACE */
3950 case 0x2007: /* FIGURE SPACE */
3951 case 0x2008: /* PUNCTUATION SPACE */
3952 case 0x2009: /* THIN SPACE */
3953 case 0x200A: /* HAIR SPACE */
3954 case 0x202f: /* NARROW NO-BREAK SPACE */
3955 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3956 case 0x3000: /* IDEOGRAPHIC SPACE */
3957 gotspace = TRUE;
3958 break;
3959 }
3960 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3961 eptr += len;
3962 }
3963 break;
3964
3965 case OP_NOT_VSPACE:
3966 case OP_VSPACE:
3967 for (i = min; i < max; i++)
3968 {
3969 BOOL gotspace;
3970 int len = 1;
3971 if (eptr >= md->end_subject) break;
3972 GETCHARLEN(c, eptr, len);
3973 switch(c)
3974 {
3975 default: gotspace = FALSE; break;
3976 case 0x0a: /* LF */
3977 case 0x0b: /* VT */
3978 case 0x0c: /* FF */
3979 case 0x0d: /* CR */
3980 case 0x85: /* NEL */
3981 case 0x2028: /* LINE SEPARATOR */
3982 case 0x2029: /* PARAGRAPH SEPARATOR */
3983 gotspace = TRUE;
3984 break;
3985 }
3986 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3987 eptr += len;
3988 }
3989 break;
3990
3991 case OP_NOT_DIGIT:
3992 for (i = min; i < max; i++)
3993 {
3994 int len = 1;
3995 if (eptr >= md->end_subject) break;
3996 GETCHARLEN(c, eptr, len);
3997 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3998 eptr+= len;
3999 }
4000 break;
4001
4002 case OP_DIGIT:
4003 for (i = min; i < max; i++)
4004 {
4005 int len = 1;
4006 if (eptr >= md->end_subject) break;
4007 GETCHARLEN(c, eptr, len);
4008 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4009 eptr+= len;
4010 }
4011 break;
4012
4013 case OP_NOT_WHITESPACE:
4014 for (i = min; i < max; i++)
4015 {
4016 int len = 1;
4017 if (eptr >= md->end_subject) break;
4018 GETCHARLEN(c, eptr, len);
4019 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4020 eptr+= len;
4021 }
4022 break;
4023
4024 case OP_WHITESPACE:
4025 for (i = min; i < max; i++)
4026 {
4027 int len = 1;
4028 if (eptr >= md->end_subject) break;
4029 GETCHARLEN(c, eptr, len);
4030 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4031 eptr+= len;
4032 }
4033 break;
4034
4035 case OP_NOT_WORDCHAR:
4036 for (i = min; i < max; i++)
4037 {
4038 int len = 1;
4039 if (eptr >= md->end_subject) break;
4040 GETCHARLEN(c, eptr, len);
4041 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4042 eptr+= len;
4043 }
4044 break;
4045
4046 case OP_WORDCHAR:
4047 for (i = min; i < max; i++)
4048 {
4049 int len = 1;
4050 if (eptr >= md->end_subject) break;
4051 GETCHARLEN(c, eptr, len);
4052 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4053 eptr+= len;
4054 }
4055 break;
4056
4057 default:
4058 RRETURN(PCRE_ERROR_INTERNAL);
4059 }
4060
4061 /* eptr is now past the end of the maximum run */
4062
4063 if (possessive) continue;
4064 for(;;)
4065 {
4066 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4067 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4068 if (eptr-- == pp) break; /* Stop if tried at original pos */
4069 BACKCHAR(eptr);
4070 }
4071 }
4072 else
4073 #endif /* SUPPORT_UTF8 */
4074
4075 /* Not UTF-8 mode */
4076 {
4077 switch(ctype)
4078 {
4079 case OP_ANY:
4080 for (i = min; i < max; i++)
4081 {
4082 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4083 eptr++;
4084 }
4085 break;
4086
4087 case OP_ALLANY:
4088 case OP_ANYBYTE:
4089 c = max - min;
4090 if (c > (unsigned int)(md->end_subject - eptr))
4091 c = md->end_subject - eptr;
4092 eptr += c;
4093 break;
4094
4095 case OP_ANYNL:
4096 for (i = min; i < max; i++)
4097 {
4098 if (eptr >= md->end_subject) break;
4099 c = *eptr;
4100 if (c == 0x000d)
4101 {
4102 if (++eptr >= md->end_subject) break;
4103 if (*eptr == 0x000a) eptr++;
4104 }
4105 else
4106 {
4107 if (c != 0x000a &&
4108 (md->bsr_anycrlf ||
4109 (c != 0x000b && c != 0x000c && c != 0x0085)))
4110 break;
4111 eptr++;
4112 }
4113 }
4114 break;
4115
4116 case OP_NOT_HSPACE:
4117 for (i = min; i < max; i++)
4118 {
4119 if (eptr >= md->end_subject) break;
4120 c = *eptr;
4121 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4122 eptr++;
4123 }
4124 break;
4125
4126 case OP_HSPACE:
4127 for (i = min; i < max; i++)
4128 {
4129 if (eptr >= md->end_subject) break;
4130 c = *eptr;
4131 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4132 eptr++;
4133 }
4134 break;
4135
4136 case OP_NOT_VSPACE:
4137 for (i = min; i < max; i++)
4138 {
4139 if (eptr >= md->end_subject) break;
4140 c = *eptr;
4141 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4142 break;
4143 eptr++;
4144 }
4145 break;
4146
4147 case OP_VSPACE:
4148 for (i = min; i < max; i++)
4149 {
4150 if (eptr >= md->end_subject) break;
4151 c = *eptr;
4152 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4153 break;
4154 eptr++;
4155 }
4156 break;
4157
4158 case OP_NOT_DIGIT:
4159 for (i = min; i < max; i++)
4160 {
4161 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4162 break;
4163 eptr++;
4164 }
4165 break;
4166
4167 case OP_DIGIT:
4168 for (i = min; i < max; i++)
4169 {
4170 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4171 break;
4172 eptr++;
4173 }
4174 break;
4175
4176 case OP_NOT_WHITESPACE:
4177 for (i = min; i < max; i++)
4178 {
4179 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4180 break;
4181 eptr++;
4182 }
4183 break;
4184
4185 case OP_WHITESPACE:
4186 for (i = min; i < max; i++)
4187 {
4188 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4189 break;
4190 eptr++;
4191 }
4192 break;
4193
4194 case OP_NOT_WORDCHAR:
4195 for (i = min; i < max; i++)
4196 {
4197 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4198 break;
4199 eptr++;
4200 }
4201 break;
4202
4203 case OP_WORDCHAR:
4204 for (i = min; i < max; i++)
4205 {
4206 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4207 break;
4208 eptr++;
4209 }
4210 break;
4211
4212 default:
4213 RRETURN(PCRE_ERROR_INTERNAL);
4214 }
4215
4216 /* eptr is now past the end of the maximum run */
4217
4218 if (possessive) continue;
4219 while (eptr >= pp)
4220 {
4221 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4222 eptr--;
4223 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4224 }
4225 }
4226
4227 /* Get here if we can't make it match with any permitted repetitions */
4228
4229 RRETURN(MATCH_NOMATCH);
4230 }
4231 /* Control never gets here */
4232
4233 /* There's been some horrible disaster. Arrival here can only mean there is
4234 something seriously wrong in the code above or the OP_xxx definitions. */
4235
4236 default:
4237 DPRINTF(("Unknown opcode %d\n", *ecode));
4238 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4239 }
4240
4241 /* Do not stick any code in here without much thought; it is assumed
4242 that "continue" in the code above comes out to here to repeat the main
4243 loop. */
4244
4245 } /* End of main loop */
4246 /* Control never reaches here */
4247
4248
4249 /* When compiling to use the heap rather than the stack for recursive calls to
4250 match(), the RRETURN() macro jumps here. The number that is saved in
4251 frame->Xwhere indicates which label we actually want to return to. */
4252
4253 #ifdef NO_RECURSE
4254 #define LBL(val) case val: goto L_RM##val;
4255 HEAP_RETURN:
4256 switch (frame->Xwhere)
4257 {
4258 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4259 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4260 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4261 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4262 LBL(53) LBL(54)
4263 #ifdef SUPPORT_UTF8
4264 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4265 LBL(32) LBL(34) LBL(42) LBL(46)
4266 #ifdef SUPPORT_UCP
4267 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4268 #endif /* SUPPORT_UCP */
4269 #endif /* SUPPORT_UTF8 */
4270 default:
4271 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4272 return PCRE_ERROR_INTERNAL;
4273 }
4274 #undef LBL
4275 #endif /* NO_RECURSE */
4276 }
4277
4278
4279 /***************************************************************************
4280 ****************************************************************************
4281 RECURSION IN THE match() FUNCTION
4282
4283 Undefine all the macros that were defined above to handle this.

The names in the NO_RECURSE group below are macros only in the build that uses
the heap rather than the stack for recursive calls to match(); presumably they
alias fields of the heap frame (cf. frame->Xwhere) - confirm against the
definitions earlier in the file. They have to be removed at this point because
the code that follows reuses some of them as ordinary identifiers: pcre_exec()
has its own "length" parameter and its own "ims" and "flags" variables. */
4284
4285 #ifdef NO_RECURSE
4286 #undef eptr
4287 #undef ecode
4288 #undef mstart
4289 #undef offset_top
4290 #undef ims
4291 #undef eptrb
4292 #undef flags
4293
4294 #undef callpat
4295 #undef charptr
4296 #undef data
4297 #undef next
4298 #undef pp
4299 #undef prev
4300 #undef saved_eptr
4301
4302 #undef new_recursive
4303
4304 #undef cur_is_word
4305 #undef condition
4306 #undef prev_is_word
4307
4308 #undef original_ims
4309
4310 #undef ctype
4311 #undef length
4312 #undef max
4313 #undef min
4314 #undef number
4315 #undef offset
4316 #undef op
4317 #undef save_capture_last
4318 #undef save_offset1
4319 #undef save_offset2
4320 #undef save_offset3
4321 #undef stacksave
4322
4323 #undef newptrb
4324
4325 #endif /* NO_RECURSE */
4326
4327 /* These two are defined as macros in both cases (that is, whether or not
NO_RECURSE is defined), so they are undefined unconditionally, outside the
#ifdef above. */
4328
4329 #undef fc
4330 #undef fi
4331
4332 /***************************************************************************
4333 ***************************************************************************/
4334
4335
4336
4337 /*************************************************
4338 * Execute a Regular Expression *
4339 *************************************************/
4340
4341 /* This function applies a compiled re to a subject string and picks out
4342 portions of the string if it matches. Two elements in the vector are set for
4343 each substring: the offsets to the start and end of the substring.
4344
4345 Arguments:
4346 argument_re points to the compiled expression
4347 extra_data points to extra data or is NULL
4348 subject points to the subject string
4349 length length of subject string (may contain binary zeros)
4350 start_offset where to start in the subject string
4351 options option bits
4352 offsets points to a vector of ints to be filled in with offsets
4353 offsetcount the number of elements in the vector
4354
4355 Returns: > 0 => success; value is the number of elements filled in
4356 = 0 => success, but offsets is not big enough
4357 -1 => failed to match
4358 < -1 => some kind of unexpected problem
4359 */
4360
4361 PCRE_EXP_DEFN int
4362 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4363 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4364 int offsetcount)
4365 {
4366 int rc, resetcount, ocount;
4367 int first_byte = -1;
4368 int req_byte = -1;
4369 int req_byte2 = -1;
4370 int newline;
4371 unsigned long int ims;
4372 BOOL using_temporary_offsets = FALSE;
4373 BOOL anchored;
4374 BOOL startline;
4375 BOOL firstline;
4376 BOOL first_byte_caseless = FALSE;
4377 BOOL req_byte_caseless = FALSE;
4378 BOOL utf8;
4379 match_data match_block;
4380 match_data *md = &match_block;
4381 const uschar *tables;
4382 const uschar *start_bits = NULL;
4383 USPTR start_match = (USPTR)subject + start_offset;
4384 USPTR end_subject;
4385 USPTR req_byte_ptr = start_match - 1;
4386
4387 pcre_study_data internal_study;
4388 const pcre_study_data *study;
4389
4390 real_pcre internal_re;
4391 const real_pcre *external_re = (const real_pcre *)argument_re;
4392 const real_pcre *re = external_re;
4393
4394 /* Plausibility checks */
4395
4396 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4397 if (re == NULL || subject == NULL ||
4398 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4399 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4400
4401 /* Fish out the optional data from the extra_data structure, first setting
4402 the default values. */
4403
4404 study = NULL;
4405 md->match_limit = MATCH_LIMIT;
4406 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4407 md->callout_data = NULL;
4408
4409 /* The table pointer is always in native byte order. */
4410
4411 tables = external_re->tables;
4412
4413 if (extra_data != NULL)
4414 {
4415 register unsigned int flags = extra_data->flags;
4416 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4417 study = (const pcre_study_data *)extra_data->study_data;
4418 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4419 md->match_limit = extra_data->match_limit;
4420 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4421 md->match_limit_recursion = extra_data->match_limit_recursion;
4422 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4423 md->callout_data = extra_data->callout_data;
4424 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4425 }
4426
4427 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4428 is a feature that makes it possible to save compiled regex and re-use them
4429 in other programs later. */
4430
4431 if (tables == NULL) tables = _pcre_default_tables;
4432
4433 /* Check that the first field in the block is the magic number. If it is not,
4434 test for a regex that was compiled on a host of opposite endianness. If this is
4435 the case, flipped values are put in internal_re and internal_study if there was
4436 study data too. */
4437
4438 if (re->magic_number != MAGIC_NUMBER)
4439 {
4440 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4441 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4442 if (study != NULL) study = &internal_study;
4443 }
4444
4445 /* Set up other data */
4446
4447 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4448 startline = (re->flags & PCRE_STARTLINE) != 0;
4449 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4450
4451 /* The code starts after the real_pcre block and the capture name table. */
4452
4453 md->start_code = (const uschar *)external_re + re->name_table_offset +
4454 re->name_count * re->name_entry_size;
4455
4456 md->start_subject = (USPTR)subject;
4457 md->start_offset = start_offset;
4458 md->end_subject = md->start_subject + length;
4459 end_subject = md->end_subject;
4460
4461 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4462 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4463 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4464
4465 md->notbol = (options & PCRE_NOTBOL) != 0;
4466 md->noteol = (options & PCRE_NOTEOL) != 0;
4467 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4468 md->partial = (options & PCRE_PARTIAL) != 0;
4469 md->hitend = FALSE;
4470
4471 md->recursive = NULL; /* No recursion at top level */
4472
4473 md->lcc = tables + lcc_offset;
4474 md->ctypes = tables + ctypes_offset;
4475
4476 /* Handle different \R options. */
4477
4478 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4479 {
4480 case 0:
4481 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4482 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4483 else
4484 #ifdef BSR_ANYCRLF
4485 md->bsr_anycrlf = TRUE;
4486 #else
4487 md->bsr_anycrlf = FALSE;
4488 #endif
4489 break;
4490
4491 case PCRE_BSR_ANYCRLF:
4492 md->bsr_anycrlf = TRUE;
4493 break;
4494
4495 case PCRE_BSR_UNICODE:
4496 md->bsr_anycrlf = FALSE;
4497 break;
4498
4499 default: return PCRE_ERROR_BADNEWLINE;
4500 }
4501
4502 /* Handle different types of newline. The three bits give eight cases. If
4503 nothing is set at run time, whatever was used at compile time applies. */
4504
4505 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4506 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4507 {
4508 case 0: newline = NEWLINE; break; /* Compile-time default */
4509 case PCRE_NEWLINE_CR: newline = '\r'; break;
4510 case PCRE_NEWLINE_LF: newline = '\n'; break;
4511 case PCRE_NEWLINE_CR+
4512 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4513 case PCRE_NEWLINE_ANY: newline = -1; break;
4514 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4515 default: return PCRE_ERROR_BADNEWLINE;
4516 }
4517
4518 if (newline == -2)
4519 {
4520 md->nltype = NLTYPE_ANYCRLF;
4521 }
4522 else if (newline < 0)
4523 {
4524 md->nltype = NLTYPE_ANY;
4525 }
4526 else
4527 {
4528 md->nltype = NLTYPE_FIXED;
4529 if (newline > 255)
4530 {
4531 md->nllen = 2;
4532 md->nl[0] = (newline >> 8) & 255;
4533 md->nl[1] = newline & 255;
4534 }
4535 else
4536 {
4537 md->nllen = 1;
4538 md->nl[0] = newline;
4539 }
4540 }
4541
4542 /* Partial matching is supported only for a restricted set of regexes at the
4543 moment. */
4544
4545 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4546 return PCRE_ERROR_BADPARTIAL;
4547
4548 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4549 back the character offset. */
4550
4551 #ifdef SUPPORT_UTF8
4552 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4553 {
4554 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4555 return PCRE_ERROR_BADUTF8;
4556 if (start_offset > 0 && start_offset < length)
4557 {
4558 int tb = ((uschar *)subject)[start_offset];
4559 if (tb > 127)
4560 {
4561 tb &= 0xc0;
4562 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4563 }
4564 }
4565 }
4566 #endif
4567
4568 /* The ims options can vary during the matching as a result of the presence
4569 of (?ims) items in the pattern. They are kept in a local variable so that
4570 restoring at the exit of a group is easy. */
4571
4572 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4573
4574 /* If the expression has got more back references than the offsets supplied can
4575 hold, we get a temporary chunk of working store to use during the matching.
4576 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4577 of 3. */
4578
4579 ocount = offsetcount - (offsetcount % 3);
4580
4581 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4582 {
4583 ocount = re->top_backref * 3 + 3;
4584 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4585 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4586 using_temporary_offsets = TRUE;
4587 DPRINTF(("Got memory to hold back references\n"));
4588 }
4589 else md->offset_vector = offsets;
4590
4591 md->offset_end = ocount;
4592 md->offset_max = (2*ocount)/3;
4593 md->offset_overflow = FALSE;
4594 md->capture_last = -1;
4595
4596 /* Compute the minimum number of offsets that we need to reset each time. Doing
4597 this makes a huge difference to execution time when there aren't many brackets
4598 in the pattern. */
4599
4600 resetcount = 2 + re->top_bracket * 2;
4601 if (resetcount > offsetcount) resetcount = ocount;
4602
4603 /* Reset the working variable associated with each extraction. These should
4604 never be used unless previously set, but they get saved and restored, and so we
4605 initialize them to avoid reading uninitialized locations. */
4606
4607 if (md->offset_vector != NULL)
4608 {
4609 register int *iptr = md->offset_vector + ocount;
4610 register int *iend = iptr - resetcount/2 + 1;
4611 while (--iptr >= iend) *iptr = -1;
4612 }
4613
4614 /* Set up the first character to match, if available. The first_byte value is
4615 never set for an anchored regular expression, but the anchoring may be forced
4616 at run time, so we have to test for anchoring. The first char may be unset for
4617 an unanchored pattern, of course. If there's no first char and the pattern was
4618 studied, there may be a bitmap of possible first characters. */
4619
4620 if (!anchored)
4621 {
4622 if ((re->flags & PCRE_FIRSTSET) != 0)
4623 {
4624 first_byte = re->first_byte & 255;
4625 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4626 first_byte = md->lcc[first_byte];
4627 }
4628 else
4629 if (!startline && study != NULL &&
4630 (study->options & PCRE_STUDY_MAPPED) != 0)
4631 start_bits = study->start_bits;
4632 }
4633
4634 /* For anchored or unanchored matches, there may be a "last known required
4635 character" set. */
4636
4637 if ((re->flags & PCRE_REQCHSET) != 0)
4638 {
4639 req_byte = re->req_byte & 255;
4640 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4641 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4642 }
4643
4644
4645 /* ==========================================================================*/
4646
4647 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4648 the loop runs just once. */
4649
4650 for(;;)
4651 {
4652 USPTR save_end_subject = end_subject;
4653 USPTR new_start_match;
4654
4655 /* Reset the maximum number of extractions we might see. */
4656
4657 if (md->offset_vector != NULL)
4658 {
4659 register int *iptr = md->offset_vector;
4660 register int *iend = iptr + resetcount;
4661 while (iptr < iend) *iptr++ = -1;
4662 }
4663
4664 /* Advance to a unique first char if possible. If firstline is TRUE, the
4665 start of the match is constrained to the first line of a multiline string.
4666 That is, the match must be before or at the first newline. Implement this by
4667 temporarily adjusting end_subject so that we stop scanning at a newline. If
4668 the match fails at the newline, later code breaks this loop. */
4669
4670 if (firstline)
4671 {
4672 USPTR t = start_match;
4673 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4674 end_subject = t;
4675 }
4676
4677 /* Now test for a unique first byte */
4678
4679 if (first_byte >= 0)
4680 {
4681 if (first_byte_caseless)
4682 while (start_match < end_subject &&
4683 md->lcc[*start_match] != first_byte)
4684 { NEXTCHAR(start_match); }
4685 else
4686 while (start_match < end_subject && *start_match != first_byte)
4687 { NEXTCHAR(start_match); }
4688 }
4689
4690 /* Or to just after a linebreak for a multiline match if possible */
4691
4692 else if (startline)
4693 {
4694 if (start_match > md->start_subject + start_offset)
4695 {
4696 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4697 { NEXTCHAR(start_match); }
4698
4699 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4700 and we are now at a LF, advance the match position by one more character.
4701 */
4702
4703 if (start_match[-1] == '\r' &&
4704 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4705 start_match < end_subject &&
4706 *start_match == '\n')
4707 start_match++;
4708 }
4709 }
4710
4711 /* Or to a non-unique first char after study */
4712
4713 else if (start_bits != NULL)
4714 {
4715 while (start_match < end_subject)
4716 {
4717 register unsigned int c = *start_match;
4718 if ((start_bits[c/8] & (1 << (c&7))) == 0)
4719 { NEXTCHAR(start_match); }
4720 else break;
4721 }
4722 }
4723
4724 /* Restore fudged end_subject */
4725
4726 end_subject = save_end_subject;
4727
4728 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4729 printf(">>>> Match against: ");
4730 pchars(start_match, end_subject - start_match, TRUE, md);
4731 printf("\n");
4732 #endif
4733
4734 /* If req_byte is set, we know that that character must appear in the subject
4735 for the match to succeed. If the first character is set, req_byte must be
4736 later in the subject; otherwise the test starts at the match point. This
4737 optimization can save a huge amount of backtracking in patterns with nested
4738 unlimited repeats that aren't going to match. Writing separate code for
4739 cased/caseless versions makes it go faster, as does using an autoincrement
4740 and backing off on a match.
4741
4742 HOWEVER: when the subject string is very, very long, searching to its end can
4743 take a long time, and give bad performance on quite ordinary patterns. This
4744 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4745 string... so we don't do this when the string is sufficiently long.
4746
4747 ALSO: this processing is disabled when partial matching is requested.
4748 */
4749
4750 if (req_byte >= 0 &&
4751 end_subject - start_match < REQ_BYTE_MAX &&
4752 !md->partial)
4753 {
4754 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4755
4756 /* We don't need to repeat the search if we haven't yet reached the
4757 place we found it at last time. */
4758
4759 if (p > req_byte_ptr)
4760 {
4761 if (req_byte_caseless)
4762 {
4763 while (p < end_subject)
4764 {
4765 register int pp = *p++;
4766 if (pp == req_byte || pp == req_byte2) { p--; break; }
4767 }
4768 }
4769 else
4770 {
4771 while (p < end_subject)
4772 {
4773 if (*p++ == req_byte) { p--; break; }
4774 }
4775 }
4776
4777 /* If we can't find the required character, break the matching loop,
4778 forcing a match failure. */
4779
4780 if (p >= end_subject)
4781 {
4782 rc = MATCH_NOMATCH;
4783 break;
4784 }
4785
4786 /* If we have found the required character, save the point where we
4787 found it, so that we don't search again next time round the loop if
4788 the start hasn't passed this character yet. */
4789
4790 req_byte_ptr = p;
4791 }
4792 }
4793
4794 /* OK, we can now run the match. */
4795
4796 md->start_match_ptr = start_match;
4797 md->match_call_count = 0;
4798 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4799
4800 switch(rc)
4801 {
4802 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4803 exactly like PRUNE. */
4804
4805 case MATCH_NOMATCH:
4806 case MATCH_PRUNE:
4807 case MATCH_THEN:
4808 new_start_match = start_match + 1;
4809 #ifdef SUPPORT_UTF8
4810 if (utf8)
4811 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4812 new_start_match++;
4813 #endif
4814 break;
4815
4816 /* SKIP passes back the next starting point explicitly. */
4817
4818 case MATCH_SKIP:
4819 new_start_match = md->start_match_ptr;
4820 break;
4821
4822 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4823
4824 case MATCH_COMMIT:
4825 rc = MATCH_NOMATCH;
4826 goto ENDLOOP;
4827
4828 /* Any other return is some kind of error. */
4829
4830 default:
4831 goto ENDLOOP;
4832 }
4833
4834 /* Control reaches here for the various types of "no match at this point"
4835 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4836
4837 rc = MATCH_NOMATCH;
4838
4839 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4840 newline in the subject (though it may continue over the newline). Therefore,
4841 if we have just failed to match, starting at a newline, do not continue. */
4842
4843 if (firstline && IS_NEWLINE(start_match)) break;
4844
4845 /* Advance to new matching position */
4846
4847 start_match = new_start_match;
4848
4849 /* Break the loop if the pattern is anchored or if we have passed the end of
4850 the subject. */
4851
4852 if (anchored || start_match > end_subject) break;
4853
4854 /* If we have just passed a CR and we are now at a LF, and the pattern does
4855 not contain any explicit matches for \r or \n, and the newline option is CRLF
4856 or ANY or ANYCRLF, advance the match position by one more character. */
4857
4858 if (start_match[-1] == '\r' &&
4859 start_match < end_subject &&
4860 *start_match == '\n' &&
4861 (re->flags & PCRE_HASCRORLF) == 0 &&
4862 (md->nltype == NLTYPE_ANY ||
4863 md->nltype == NLTYPE_ANYCRLF ||
4864 md->nllen == 2))
4865 start_match++;
4866
4867 } /* End of for(;;) "bumpalong" loop */
4868
4869 /* ==========================================================================*/
4870
4871 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4872 conditions is true:
4873
4874 (1) The pattern is anchored or the match was failed by (*COMMIT);
4875
4876 (2) We are past the end of the subject;
4877
4878 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4879 this option requests that a match occur at or before the first newline in
4880 the subject.
4881
4882 When we have a match and the offset vector is big enough to deal with any
4883 backreferences, captured substring offsets will already be set up. In the case
4884 where we had to get some local store to hold offsets for backreference
4885 processing, copy those that we can. In this case there need not be overflow if
4886 certain parts of the pattern were not used, even though there are more
4887 capturing parentheses than vector slots. */
4888
4889 ENDLOOP:
4890
4891 if (rc == MATCH_MATCH)
4892 {
4893 if (using_temporary_offsets)
4894 {
4895 if (offsetcount >= 4)
4896 {
4897 memcpy(offsets + 2, md->offset_vector + 2,
4898 (offsetcount - 2) * sizeof(int));
4899 DPRINTF(("Copied offsets from temporary memory\n"));
4900 }
4901 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4902 DPRINTF(("Freeing temporary memory\n"));
4903 (pcre_free)(md->offset_vector);
4904 }
4905
4906 /* Set the return code to the number of captured strings, or 0 if there are
4907 too many to fit into the vector. */
4908
4909 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4910
4911 /* If there is space, set up the whole thing as substring 0. The value of
4912 md->start_match_ptr might be modified if \K was encountered on the successful
4913 matching path. */
4914
4915 if (offsetcount < 2) rc = 0; else
4916 {
4917 offsets[0] = md->start_match_ptr - md->start_subject;
4918 offsets[1] = md->end_match_ptr - md->start_subject;
4919 }
4920
4921 DPRINTF((">>>> returning %d\n", rc));
4922 return rc;
4923 }
4924
4925 /* Control gets here if there has been an error, or if the overall match
4926 attempt has failed at all permitted starting positions. */
4927
4928 if (using_temporary_offsets)
4929 {
4930 DPRINTF(("Freeing temporary memory\n"));
4931 (pcre_free)(md->offset_vector);
4932 }
4933
4934 if (rc != MATCH_NOMATCH)
4935 {
4936 DPRINTF((">>>> error: returning %d\n", rc));
4937 return rc;
4938 }
4939 else if (md->partial && md->hitend)
4940 {
4941 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4942 return PCRE_ERROR_PARTIAL;
4943 }
4944 else
4945 {
4946 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4947 return PCRE_ERROR_NOMATCH;
4948 }
4949 }
4950
4951 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12