/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 226 - (show annotations) (download)
Tue Aug 21 11:46:08 2007 UTC (7 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 149165 byte(s)
Don't advance by 2 if explicit \r or \n in the pattern. Add 
PCRE_INFO_HASCRORLF.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include <config.h>
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
/* Bits that may be set in the "flags" argument of match(). */

#define match_condassert 0x01 /* Called to check a condition assertion */
#define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH 1
#define MATCH_NOMATCH 0

/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes. Each one is returned by the
handler for the opcode of the same name (OP_COMMIT, OP_PRUNE, OP_SKIP, OP_THEN)
when the rest of the pattern fails to match. The loops over bracket
alternatives treat MATCH_THEN like MATCH_NOMATCH and move on to the next
alternative. */

#define MATCH_COMMIT (-999)
#define MATCH_PRUNE (-998)
#define MATCH_SKIP (-997)
#define MATCH_THEN (-996)

/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long. It is also the size
of the "stacksave" array that is declared for every invocation of match(). */

#define REC_STACK_SAVE_MAX 30
84
85 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86
/* Six entries each, laid out in pairs. NOTE(review): presumably indexed by
(repeat opcode - first repeat opcode), pairing the greedy and minimizing forms
of "*", "+" and "?" - confirm against the code that reads these tables. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caselesss case for speed */
162
163 if ((ims & PCRE_CASELESS) != 0)
164 {
165 while (length-- > 0)
166 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
167 }
168 else
169 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
170
171 return TRUE;
172 }
173
174
175
176 /***************************************************************************
177 ****************************************************************************
178 RECURSION IN THE match() FUNCTION
179
180 The match() function is highly recursive, though not every recursive call
181 increases the recursive depth. Nevertheless, some regular expressions can cause
182 it to recurse to a great depth. I was writing for Unix, so I just let it call
183 itself recursively. This uses the stack for saving everything that has to be
184 saved for a recursive call. On Unix, the stack can be large, and this works
185 fine.
186
187 It turns out that on some non-Unix-like systems there are problems with
188 programs that use a lot of stack. (This despite the fact that every last chip
189 has oodles of memory these days, and techniques for extending the stack have
190 been known for decades.) So....
191
192 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
193 calls by keeping local variables that need to be preserved in blocks of memory
194 obtained from malloc() instead of on the stack. Macros are used to
195 achieve this so that the actual code doesn't look very different to what it
196 always used to.
197
198 The original heap-recursive code used longjmp(). However, it seems that this
199 can be very slow on some operating systems. Following a suggestion from Stan
200 Switzer, the use of longjmp() has been abolished, at the cost of having to
201 provide a unique number for each call to RMATCH. There is no way of generating
202 a sequence of numbers at compile time in C. I have given them names, to make
203 them stand out more clearly.
204
205 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
206 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
207 tests. Furthermore, not using longjmp() means that local dynamic variables
208 don't have indeterminate values; this has meant that the frame size can be
209 reduced because the result can be "passed back" by straight setting of the
210 variable instead of being passed in the frame.
211 ****************************************************************************
212 ***************************************************************************/
213
214 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
215 below must be updated in sync. */
216
/* One identifier per RMATCH call site, numbered consecutively from 1. */

enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18,
  RM19, RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27,
  RM28, RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36,
  RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45,
  RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54
};
223
224 /* These versions of the macros use the stack, as normal. There are debugging
225 versions and production versions. Note that the "rw" argument of RMATCH isn't
226 actually used in this definition. */
227
228 #ifndef NO_RECURSE
/* With true recursion, the parameters of match() that are declared REGISTER
become "register" variables. */

#define REGISTER register

#ifdef DEBUG

/* Debugging RMATCH: call match() one level deeper (rdepth+1), passing on the
enclosing mstart, and leave the result in the caller's "rrc". The source line
is printed before and after the call. The "rw" argument is ignored. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

/* Debugging RRETURN: print the value and the line, then return. Note that the
argument is evaluated twice. */

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else

/* Production RMATCH: just the recursive call, with the result left in "rrc".
The "rw" argument is ignored. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)

/* Production RRETURN: an ordinary return. */

#define RRETURN(ra) return ra
#endif
248
249 #else
250
251
252 /* These versions of the macros manage a private stack on the heap. Note that
253 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
254 argument of match(), which never changes. */
255
/* Nothing is gained from "register" here, because the arguments of match()
are accessed through the heap frame. */

#define REGISTER

/* RMATCH simulates a recursive call of match(). It obtains a new frame from
pcre_stack_malloc(), records in the current frame the point to resume at (rw),
copies the "arguments" into the new frame, links the new frame to the current
one, makes it the current frame, and jumps to HEAP_RECURSE. The label L_<rw>
that it plants is where execution continues after the simulated return, with
the result in "rrc". The "rd" argument is not used.

If no memory can be obtained for the new frame, the current invocation gives
up with PCRE_ERROR_NOMEMORY instead of writing through a NULL pointer. The
error is returned by means of RRETURN, so the current frame is released and
the error is handed to the previous frame like any other result. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN simulates a return from match(). The current frame is unlinked and
freed. If there is a previous frame, the result is put in "rrc" and control
goes to HEAP_RETURN; otherwise this was the top-level frame and the function
really returns. The expansion is a brace block, so a following "else" needs the
call to be wrapped in braces. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
290
291
292 /* Structure for remembering the local variables in a private frame */
293
294 typedef struct heapframe {
295 struct heapframe *Xprevframe;
296
297 /* Function arguments that may change */
298
299 const uschar *Xeptr;
300 const uschar *Xecode;
301 const uschar *Xmstart;
302 int Xoffset_top;
303 long int Xims;
304 eptrblock *Xeptrb;
305 int Xflags;
306 unsigned int Xrdepth;
307
308 /* Function local variables */
309
310 const uschar *Xcallpat;
311 const uschar *Xcharptr;
312 const uschar *Xdata;
313 const uschar *Xnext;
314 const uschar *Xpp;
315 const uschar *Xprev;
316 const uschar *Xsaved_eptr;
317
318 recursion_info Xnew_recursive;
319
320 BOOL Xcur_is_word;
321 BOOL Xcondition;
322 BOOL Xprev_is_word;
323
324 unsigned long int Xoriginal_ims;
325
326 #ifdef SUPPORT_UCP
327 int Xprop_type;
328 int Xprop_value;
329 int Xprop_fail_result;
330 int Xprop_category;
331 int Xprop_chartype;
332 int Xprop_script;
333 int Xoclength;
334 uschar Xocchars[8];
335 #endif
336
337 int Xctype;
338 unsigned int Xfc;
339 int Xfi;
340 int Xlength;
341 int Xmax;
342 int Xmin;
343 int Xnumber;
344 int Xoffset;
345 int Xop;
346 int Xsave_capture_last;
347 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
348 int Xstacksave[REC_STACK_SAVE_MAX];
349
350 eptrblock Xnewptrb;
351
352 /* Where to jump back to */
353
354 int Xwhere;
355
356 } heapframe;
357
358 #endif
359
360
361 /***************************************************************************
362 ***************************************************************************/
363
364
365
366 /*************************************************
367 * Match from current position *
368 *************************************************/
369
370 /* This function is called recursively in many circumstances. Whenever it
371 returns a negative (error) response, the outer incarnation must also return the
372 same response.
373
374 Performance note: It might be tempting to extract commonly used fields from the
375 md structure (e.g. utf8, end_subject) into individual variables to improve
376 performance. Tests using gcc on a SPARC disproved this; in the first case, it
377 made performance worse.
378
379 Arguments:
380 eptr pointer to current character in subject
381 ecode pointer to current position in compiled code
382 mstart pointer to the current match start position (can be modified
383 by encountering \K)
384 offset_top current top pointer
385 md pointer to "static" info for the match
386 ims current /i, /m, and /s options
387 eptrb pointer to chain of blocks containing eptr at start of
388 brackets - for testing for empty matches
389 flags can contain
390 match_condassert - this is an assertion condition
391 match_cbegroup - this is the start of an unlimited repeat
392 group that can match an empty string
393 rdepth the recursion depth
394
395 Returns: MATCH_MATCH if matched ) these values are >= 0
396 MATCH_NOMATCH if failed to match )
397 a negative PCRE_ERROR_xxx value if aborted by an error condition
398 (e.g. stopped by repeated call or recursion limit)
399 */
400
401 static int
402 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
403 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
404 int flags, unsigned int rdepth)
405 {
406 /* These variables do not need to be preserved over recursion in this function,
407 so they can be ordinary variables in all cases. Mark some of them with
408 "register" because they are used a lot in loops. */
409
410 register int rrc; /* Returns from recursive calls */
411 register int i; /* Used for loops not involving calls to RMATCH() */
412 register unsigned int c; /* Character values not kept over RMATCH() calls */
413 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
414
415 BOOL minimize, possessive; /* Quantifier options */
416
417 /* When recursion is not being used, all "local" variables that have to be
418 preserved over calls to RMATCH() are part of a "frame" which is obtained from
419 heap storage. Set up the top-level frame here; others are obtained from the
420 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
421
422 #ifdef NO_RECURSE
423 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
424 frame->Xprevframe = NULL; /* Marks the top level */
425
426 /* Copy in the original argument variables */
427
428 frame->Xeptr = eptr;
429 frame->Xecode = ecode;
430 frame->Xmstart = mstart;
431 frame->Xoffset_top = offset_top;
432 frame->Xims = ims;
433 frame->Xeptrb = eptrb;
434 frame->Xflags = flags;
435 frame->Xrdepth = rdepth;
436
437 /* This is where control jumps back to to effect "recursion" */
438
439 HEAP_RECURSE:
440
441 /* Macros make the argument variables come from the current frame */
442
443 #define eptr frame->Xeptr
444 #define ecode frame->Xecode
445 #define mstart frame->Xmstart
446 #define offset_top frame->Xoffset_top
447 #define ims frame->Xims
448 #define eptrb frame->Xeptrb
449 #define flags frame->Xflags
450 #define rdepth frame->Xrdepth
451
452 /* Ditto for the local variables */
453
454 #ifdef SUPPORT_UTF8
455 #define charptr frame->Xcharptr
456 #endif
457 #define callpat frame->Xcallpat
458 #define data frame->Xdata
459 #define next frame->Xnext
460 #define pp frame->Xpp
461 #define prev frame->Xprev
462 #define saved_eptr frame->Xsaved_eptr
463
464 #define new_recursive frame->Xnew_recursive
465
466 #define cur_is_word frame->Xcur_is_word
467 #define condition frame->Xcondition
468 #define prev_is_word frame->Xprev_is_word
469
470 #define original_ims frame->Xoriginal_ims
471
472 #ifdef SUPPORT_UCP
473 #define prop_type frame->Xprop_type
474 #define prop_value frame->Xprop_value
475 #define prop_fail_result frame->Xprop_fail_result
476 #define prop_category frame->Xprop_category
477 #define prop_chartype frame->Xprop_chartype
478 #define prop_script frame->Xprop_script
479 #define oclength frame->Xoclength
480 #define occhars frame->Xocchars
481 #endif
482
483 #define ctype frame->Xctype
484 #define fc frame->Xfc
485 #define fi frame->Xfi
486 #define length frame->Xlength
487 #define max frame->Xmax
488 #define min frame->Xmin
489 #define number frame->Xnumber
490 #define offset frame->Xoffset
491 #define op frame->Xop
492 #define save_capture_last frame->Xsave_capture_last
493 #define save_offset1 frame->Xsave_offset1
494 #define save_offset2 frame->Xsave_offset2
495 #define save_offset3 frame->Xsave_offset3
496 #define stacksave frame->Xstacksave
497
498 #define newptrb frame->Xnewptrb
499
500 /* When recursion is being used, local variables are allocated on the stack and
501 get preserved during recursion in the normal way. In this environment, fi and
502 i, and fc and c, can be the same variables. */
503
504 #else /* NO_RECURSE not defined */
505 #define fi i
506 #define fc c
507
508
509 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
510 const uschar *charptr; /* in small blocks of the code. My normal */
511 #endif /* style of coding would have declared */
512 const uschar *callpat; /* them within each of those blocks. */
513 const uschar *data; /* However, in order to accommodate the */
514 const uschar *next; /* version of this code that uses an */
515 USPTR pp; /* external "stack" implemented on the */
516 const uschar *prev; /* heap, it is easier to declare them all */
517 USPTR saved_eptr; /* here, so the declarations can be cut */
518 /* out in a block. The only declarations */
519 recursion_info new_recursive; /* within blocks below are for variables */
520 /* that do not have to be preserved over */
521 BOOL cur_is_word; /* a recursive call to RMATCH(). */
522 BOOL condition;
523 BOOL prev_is_word;
524
525 unsigned long int original_ims;
526
527 #ifdef SUPPORT_UCP
528 int prop_type;
529 int prop_value;
530 int prop_fail_result;
531 int prop_category;
532 int prop_chartype;
533 int prop_script;
534 int oclength;
535 uschar occhars[8];
536 #endif
537
538 int ctype;
539 int length;
540 int max;
541 int min;
542 int number;
543 int offset;
544 int op;
545 int save_capture_last;
546 int save_offset1, save_offset2, save_offset3;
547 int stacksave[REC_STACK_SAVE_MAX];
548
549 eptrblock newptrb;
550 #endif /* NO_RECURSE */
551
552 /* These statements are here to stop the compiler complaining about uninitialized
553 variables. */
554
555 #ifdef SUPPORT_UCP
556 prop_value = 0;
557 prop_fail_result = 0;
558 #endif
559
560
561 /* This label is used for tail recursion, which is used in a few cases even
562 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
563 used. Thanks to Ian Taylor for noticing this possibility and sending the
564 original patch. */
565
566 TAIL_RECURSE:
567
568 /* OK, now we can get on with the real code of the function. Recursive calls
569 are specified by the macro RMATCH and RRETURN is used to return. When
570 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
571 and a "return", respectively (possibly with some debugging if DEBUG is
572 defined). However, RMATCH isn't like a function call because it's quite a
573 complicated macro. It has to be used in one particular way. This shouldn't,
574 however, impact performance when true recursion is being used. */
575
576 #ifdef SUPPORT_UTF8
577 utf8 = md->utf8; /* Local copy of the flag */
578 #else
579 utf8 = FALSE;
580 #endif
581
582 /* First check that we haven't called match() too many times, or that we
583 haven't exceeded the recursive call limit. */
584
585 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
586 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
587
588 original_ims = ims; /* Save for resetting on ')' */
589
590 /* At the start of a group with an unlimited repeat that may match an empty
591 string, the match_cbegroup flag is set. When this is the case, add the current
592 subject pointer to the chain of such remembered pointers, to be checked when we
593 hit the closing ket, in order to break infinite loops that match no characters.
594 When match() is called in other circumstances, don't add to the chain. The
595 match_cbegroup flag must NOT be used with tail recursion, because the memory
596 block that is used is on the stack, so a new one may be required for each
597 match(). */
598
599 if ((flags & match_cbegroup) != 0)
600 {
601 newptrb.epb_saved_eptr = eptr;
602 newptrb.epb_prev = eptrb;
603 eptrb = &newptrb;
604 }
605
606 /* Now start processing the opcodes. */
607
608 for (;;)
609 {
610 minimize = possessive = FALSE;
611 op = *ecode;
612
613 /* For partial matching, remember if we ever hit the end of the subject after
614 matching at least one subject character. */
615
616 if (md->partial &&
617 eptr >= md->end_subject &&
618 eptr > mstart)
619 md->hitend = TRUE;
620
621 switch(op)
622 {
623 case OP_FAIL:
624 RRETURN(MATCH_NOMATCH);
625
626 case OP_PRUNE:
627 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
628 ims, eptrb, flags, RM51);
629 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
630 RRETURN(MATCH_PRUNE);
631
632 case OP_COMMIT:
633 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
634 ims, eptrb, flags, RM52);
635 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
636 RRETURN(MATCH_COMMIT);
637
638 case OP_SKIP:
639 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
640 ims, eptrb, flags, RM53);
641 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
642 md->start_match_ptr = eptr; /* Pass back current position */
643 RRETURN(MATCH_SKIP);
644
645 case OP_THEN:
646 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
647 ims, eptrb, flags, RM54);
648 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
649 RRETURN(MATCH_THEN);
650
651 /* Handle a capturing bracket. If there is space in the offset vector, save
652 the current subject position in the working slot at the top of the vector.
653 We mustn't change the current values of the data slot, because they may be
654 set from a previous iteration of this group, and be referred to by a
655 reference inside the group.
656
657 If the bracket fails to match, we need to restore this value and also the
658 values of the final offsets, in case they were set by a previous iteration
659 of the same bracket.
660
661 If there isn't enough space in the offset vector, treat this as if it were
662 a non-capturing bracket. Don't worry about setting the flag for the error
663 case here; that is handled in the code for KET. */
664
665 case OP_CBRA:
666 case OP_SCBRA:
667 number = GET2(ecode, 1+LINK_SIZE);
668 offset = number << 1;
669
670 #ifdef DEBUG
671 printf("start bracket %d\n", number);
672 printf("subject=");
673 pchars(eptr, 16, TRUE, md);
674 printf("\n");
675 #endif
676
677 if (offset < md->offset_max)
678 {
679 save_offset1 = md->offset_vector[offset];
680 save_offset2 = md->offset_vector[offset+1];
681 save_offset3 = md->offset_vector[md->offset_end - number];
682 save_capture_last = md->capture_last;
683
684 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
685 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
686
687 flags = (op == OP_SCBRA)? match_cbegroup : 0;
688 do
689 {
690 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
691 ims, eptrb, flags, RM1);
692 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
693 md->capture_last = save_capture_last;
694 ecode += GET(ecode, 1);
695 }
696 while (*ecode == OP_ALT);
697
698 DPRINTF(("bracket %d failed\n", number));
699
700 md->offset_vector[offset] = save_offset1;
701 md->offset_vector[offset+1] = save_offset2;
702 md->offset_vector[md->offset_end - number] = save_offset3;
703
704 RRETURN(MATCH_NOMATCH);
705 }
706
707 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
708 as a non-capturing bracket. */
709
710 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
711 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
712
713 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
714
715 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
716 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
717
718 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
719 final alternative within the brackets, we would return the result of a
720 recursive call to match() whatever happened. We can reduce stack usage by
721 turning this into a tail recursion, except in the case when match_cbegroup
722 is set.*/
723
724 case OP_BRA:
725 case OP_SBRA:
726 DPRINTF(("start non-capturing bracket\n"));
727 flags = (op >= OP_SBRA)? match_cbegroup : 0;
728 for (;;)
729 {
730 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
731 {
732 if (flags == 0) /* Not a possibly empty group */
733 {
734 ecode += _pcre_OP_lengths[*ecode];
735 DPRINTF(("bracket 0 tail recursion\n"));
736 goto TAIL_RECURSE;
737 }
738
739 /* Possibly empty group; can't use tail recursion. */
740
741 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
742 eptrb, flags, RM48);
743 RRETURN(rrc);
744 }
745
746 /* For non-final alternatives, continue the loop for a NOMATCH result;
747 otherwise return. */
748
749 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
750 eptrb, flags, RM2);
751 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
752 ecode += GET(ecode, 1);
753 }
754 /* Control never reaches here. */
755
756 /* Conditional group: compilation checked that there are no more than
757 two branches. If the condition is false, skipping the first branch takes us
758 past the end if there is only one branch, but that's OK because that is
759 exactly what going to the ket would do. As there is only one branch to be
760 obeyed, we can use tail recursion to avoid using another stack frame. */
761
762 case OP_COND:
763 case OP_SCOND:
764 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
765 {
766 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
767 condition = md->recursive != NULL &&
768 (offset == RREF_ANY || offset == md->recursive->group_num);
769 ecode += condition? 3 : GET(ecode, 1);
770 }
771
772 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
773 {
774 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
775 condition = offset < offset_top && md->offset_vector[offset] >= 0;
776 ecode += condition? 3 : GET(ecode, 1);
777 }
778
779 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
780 {
781 condition = FALSE;
782 ecode += GET(ecode, 1);
783 }
784
785 /* The condition is an assertion. Call match() to evaluate it - setting
786 the final argument match_condassert causes it to stop at the end of an
787 assertion. */
788
789 else
790 {
791 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
792 match_condassert, RM3);
793 if (rrc == MATCH_MATCH)
794 {
795 condition = TRUE;
796 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
797 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
798 }
799 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
800 {
801 RRETURN(rrc); /* Need braces because of following else */
802 }
803 else
804 {
805 condition = FALSE;
806 ecode += GET(ecode, 1);
807 }
808 }
809
810 /* We are now at the branch that is to be obeyed. As there is only one,
811 we can use tail recursion to avoid using another stack frame, except when
812 match_cbegroup is required for an unlimited repeat of a possibly empty
813 group. If the second alternative doesn't exist, we can just plough on. */
814
815 if (condition || *ecode == OP_ALT)
816 {
817 ecode += 1 + LINK_SIZE;
818 if (op == OP_SCOND) /* Possibly empty group */
819 {
820 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
821 RRETURN(rrc);
822 }
823 else /* Group must match something */
824 {
825 flags = 0;
826 goto TAIL_RECURSE;
827 }
828 }
829 else /* Condition false & no 2nd alternative */
830 {
831 ecode += 1 + LINK_SIZE;
832 }
833 break;
834
835
836 /* End of the pattern, either real or forced. If we are in a top-level
837 recursion, we should restore the offsets appropriately and continue from
838 after the call. */
839
840 case OP_ACCEPT:
841 case OP_END:
842 if (md->recursive != NULL && md->recursive->group_num == 0)
843 {
844 recursion_info *rec = md->recursive;
845 DPRINTF(("End of pattern in a (?0) recursion\n"));
846 md->recursive = rec->prevrec;
847 memmove(md->offset_vector, rec->offset_save,
848 rec->saved_max * sizeof(int));
849 mstart = rec->save_start;
850 ims = original_ims;
851 ecode = rec->after_call;
852 break;
853 }
854
855 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
856 string - backtracking will then try other alternatives, if any. */
857
858 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
859 md->end_match_ptr = eptr; /* Record where we ended */
860 md->end_offset_top = offset_top; /* and how many extracts were taken */
861 md->start_match_ptr = mstart; /* and the start (\K can modify) */
862 RRETURN(MATCH_MATCH);
863
864 /* Change option settings */
865
866 case OP_OPT:
867 ims = ecode[1];
868 ecode += 2;
869 DPRINTF(("ims set to %02lx\n", ims));
870 break;
871
872 /* Assertion brackets. Check the alternative branches in turn - the
873 matching won't pass the KET for an assertion. If any one branch matches,
874 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
875 start of each branch to move the current point backwards, so the code at
876 this level is identical to the lookahead case. */
877
878 case OP_ASSERT:
879 case OP_ASSERTBACK:
880 do
881 {
882 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
883 RM4);
884 if (rrc == MATCH_MATCH) break;
885 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
886 ecode += GET(ecode, 1);
887 }
888 while (*ecode == OP_ALT);
889 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
890
891 /* If checking an assertion for a condition, return MATCH_MATCH. */
892
893 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
894
895 /* Continue from after the assertion, updating the offsets high water
896 mark, since extracts may have been taken during the assertion. */
897
898 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
899 ecode += 1 + LINK_SIZE;
900 offset_top = md->end_offset_top;
901 continue;
902
903 /* Negative assertion: all branches must fail to match */
904
905 case OP_ASSERT_NOT:
906 case OP_ASSERTBACK_NOT:
907 do
908 {
909 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
910 RM5);
911 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
912 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
913 ecode += GET(ecode,1);
914 }
915 while (*ecode == OP_ALT);
916
917 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
918
919 ecode += 1 + LINK_SIZE;
920 continue;
921
922 /* Move the subject pointer back. This occurs only at the start of
923 each branch of a lookbehind assertion. If we are too close to the start to
924 move back, this match function fails. When working with UTF-8 we move
925 back a number of characters, not bytes. */
926
927 case OP_REVERSE:
928 #ifdef SUPPORT_UTF8
929 if (utf8)
930 {
931 i = GET(ecode, 1);
932 while (i-- > 0)
933 {
934 eptr--;
935 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
936 BACKCHAR(eptr);
937 }
938 }
939 else
940 #endif
941
942 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
943
944 {
945 eptr -= GET(ecode, 1);
946 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
947 }
948
949 /* Skip to next op code */
950
951 ecode += 1 + LINK_SIZE;
952 break;
953
954 /* The callout item calls an external function, if one is provided, passing
955 details of the match so far. This is mainly for debugging, though the
956 function is able to force a failure. */
957
958 case OP_CALLOUT:
959 if (pcre_callout != NULL)
960 {
961 pcre_callout_block cb;
962 cb.version = 1; /* Version 1 of the callout block */
963 cb.callout_number = ecode[1];
964 cb.offset_vector = md->offset_vector;
965 cb.subject = (PCRE_SPTR)md->start_subject;
966 cb.subject_length = md->end_subject - md->start_subject;
967 cb.start_match = mstart - md->start_subject;
968 cb.current_position = eptr - md->start_subject;
969 cb.pattern_position = GET(ecode, 2);
970 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
971 cb.capture_top = offset_top/2;
972 cb.capture_last = md->capture_last;
973 cb.callout_data = md->callout_data;
974 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
975 if (rrc < 0) RRETURN(rrc);
976 }
977 ecode += 2 + 2*LINK_SIZE;
978 break;
979
980 /* Recursion either matches the current regex, or some subexpression. The
981 offset data is the offset to the starting bracket from the start of the
982 whole pattern. (This is so that it works from duplicated subpatterns.)
983
984 If there are any capturing brackets started but not finished, we have to
985 save their starting points and reinstate them after the recursion. However,
986 we don't know how many such there are (offset_top records the completed
987 total) so we just have to save all the potential data. There may be up to
988 65535 such values, which is too large to put on the stack, but using malloc
989 for small numbers seems expensive. As a compromise, the stack is used when
990 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
991 is used. A problem is what to do if the malloc fails ... there is no way of
992 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
993 values on the stack, and accept that the rest may be wrong.
994
995 There are also other values that have to be saved. We use a chained
996 sequence of blocks that actually live on the stack. Thanks to Robin Houston
997 for the original version of this logic. */
998
999 case OP_RECURSE:
1000 {
1001 callpat = md->start_code + GET(ecode, 1);
1002 new_recursive.group_num = (callpat == md->start_code)? 0 :
1003 GET2(callpat, 1 + LINK_SIZE);
1004
1005 /* Add to "recursing stack" */
1006
1007 new_recursive.prevrec = md->recursive;
1008 md->recursive = &new_recursive;
1009
1010 /* Find where to continue from afterwards */
1011
1012 ecode += 1 + LINK_SIZE;
1013 new_recursive.after_call = ecode;
1014
1015 /* Now save the offset data. */
1016
1017 new_recursive.saved_max = md->offset_end;
1018 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1019 new_recursive.offset_save = stacksave;
1020 else
1021 {
1022 new_recursive.offset_save =
1023 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1024 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1025 }
1026
1027 memcpy(new_recursive.offset_save, md->offset_vector,
1028 new_recursive.saved_max * sizeof(int));
1029 new_recursive.save_start = mstart;
1030 mstart = eptr;
1031
1032 /* OK, now we can do the recursion. For each top-level alternative we
1033 restore the offset and recursion data. */
1034
1035 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1036 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1037 do
1038 {
1039 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1040 md, ims, eptrb, flags, RM6);
1041 if (rrc == MATCH_MATCH)
1042 {
1043 DPRINTF(("Recursion matched\n"));
1044 md->recursive = new_recursive.prevrec;
1045 if (new_recursive.offset_save != stacksave)
1046 (pcre_free)(new_recursive.offset_save);
1047 RRETURN(MATCH_MATCH);
1048 }
1049 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1050 {
1051 DPRINTF(("Recursion gave error %d\n", rrc));
1052 RRETURN(rrc);
1053 }
1054
1055 md->recursive = &new_recursive;
1056 memcpy(md->offset_vector, new_recursive.offset_save,
1057 new_recursive.saved_max * sizeof(int));
1058 callpat += GET(callpat, 1);
1059 }
1060 while (*callpat == OP_ALT);
1061
1062 DPRINTF(("Recursion didn't match\n"));
1063 md->recursive = new_recursive.prevrec;
1064 if (new_recursive.offset_save != stacksave)
1065 (pcre_free)(new_recursive.offset_save);
1066 RRETURN(MATCH_NOMATCH);
1067 }
1068 /* Control never reaches here */
1069
1070 /* "Once" brackets are like assertion brackets except that after a match,
1071 the point in the subject string is not moved back. Thus there can never be
1072 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1073 Check the alternative branches in turn - the matching won't pass the KET
1074 for this kind of subpattern. If any one branch matches, we carry on as at
1075 the end of a normal bracket, leaving the subject pointer. */
1076
1077 case OP_ONCE:
1078 prev = ecode;
1079 saved_eptr = eptr;
1080
1081 do
1082 {
1083 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1084 if (rrc == MATCH_MATCH) break;
1085 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1086 ecode += GET(ecode,1);
1087 }
1088 while (*ecode == OP_ALT);
1089
1090 /* If hit the end of the group (which could be repeated), fail */
1091
1092 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1093
1094 /* Continue as from after the assertion, updating the offsets high water
1095 mark, since extracts may have been taken. */
1096
1097 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1098
1099 offset_top = md->end_offset_top;
1100 eptr = md->end_match_ptr;
1101
1102 /* For a non-repeating ket, just continue at this level. This also
1103 happens for a repeating ket if no characters were matched in the group.
1104 This is the forcible breaking of infinite loops as implemented in Perl
1105 5.005. If there is an options reset, it will get obeyed in the normal
1106 course of events. */
1107
1108 if (*ecode == OP_KET || eptr == saved_eptr)
1109 {
1110 ecode += 1+LINK_SIZE;
1111 break;
1112 }
1113
1114 /* The repeating kets try the rest of the pattern or restart from the
1115 preceding bracket, in the appropriate order. The second "call" of match()
1116 uses tail recursion, to avoid using another stack frame. We need to reset
1117 any options that changed within the bracket before re-running it, so
1118 check the next opcode. */
1119
1120 if (ecode[1+LINK_SIZE] == OP_OPT)
1121 {
1122 ims = (ims & ~PCRE_IMS) | ecode[4];
1123 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1124 }
1125
1126 if (*ecode == OP_KETRMIN)
1127 {
1128 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1129 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1130 ecode = prev;
1131 flags = 0;
1132 goto TAIL_RECURSE;
1133 }
1134 else /* OP_KETRMAX */
1135 {
1136 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1137 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1138 ecode += 1 + LINK_SIZE;
1139 flags = 0;
1140 goto TAIL_RECURSE;
1141 }
1142 /* Control never gets here */
1143
1144 /* An alternation is the end of a branch; scan along to find the end of the
1145 bracketed group and go to there. */
1146
1147 case OP_ALT:
1148 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1149 break;
1150
1151 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1152 that it may occur zero times. It may repeat infinitely, or not at all -
1153 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1154 repeat limits are compiled as a number of copies, with the optional ones
1155 preceded by BRAZERO or BRAMINZERO. */
1156
1157 case OP_BRAZERO:
1158 {
1159 next = ecode+1;
1160 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1161 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1162 do next += GET(next,1); while (*next == OP_ALT);
1163 ecode = next + 1 + LINK_SIZE;
1164 }
1165 break;
1166
1167 case OP_BRAMINZERO:
1168 {
1169 next = ecode+1;
1170 do next += GET(next, 1); while (*next == OP_ALT);
1171 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1172 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1173 ecode++;
1174 }
1175 break;
1176
1177 /* End of a group, repeated or non-repeating. */
1178
1179 case OP_KET:
1180 case OP_KETRMIN:
1181 case OP_KETRMAX:
1182 prev = ecode - GET(ecode, 1);
1183
1184 /* If this was a group that remembered the subject start, in order to break
1185 infinite repeats of empty string matches, retrieve the subject start from
1186 the chain. Otherwise, set it NULL. */
1187
1188 if (*prev >= OP_SBRA)
1189 {
1190 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1191 eptrb = eptrb->epb_prev; /* Backup to previous group */
1192 }
1193 else saved_eptr = NULL;
1194
1195 /* If we are at the end of an assertion group, stop matching and return
1196 MATCH_MATCH, but record the current high water mark for use by positive
1197 assertions. Do this also for the "once" (atomic) groups. */
1198
1199 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1200 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1201 *prev == OP_ONCE)
1202 {
1203 md->end_match_ptr = eptr; /* For ONCE */
1204 md->end_offset_top = offset_top;
1205 RRETURN(MATCH_MATCH);
1206 }
1207
1208 /* For capturing groups we have to check the group number back at the start
1209 and if necessary complete handling an extraction by setting the offsets and
1210 bumping the high water mark. Note that whole-pattern recursion is coded as
1211 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1212 when the OP_END is reached. Other recursion is handled here. */
1213
1214 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1215 {
1216 number = GET2(prev, 1+LINK_SIZE);
1217 offset = number << 1;
1218
1219 #ifdef DEBUG
1220 printf("end bracket %d", number);
1221 printf("\n");
1222 #endif
1223
1224 md->capture_last = number;
1225 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1226 {
1227 md->offset_vector[offset] =
1228 md->offset_vector[md->offset_end - number];
1229 md->offset_vector[offset+1] = eptr - md->start_subject;
1230 if (offset_top <= offset) offset_top = offset + 2;
1231 }
1232
1233 /* Handle a recursively called group. Restore the offsets
1234 appropriately and continue from after the call. */
1235
1236 if (md->recursive != NULL && md->recursive->group_num == number)
1237 {
1238 recursion_info *rec = md->recursive;
1239 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1240 md->recursive = rec->prevrec;
1241 mstart = rec->save_start;
1242 memcpy(md->offset_vector, rec->offset_save,
1243 rec->saved_max * sizeof(int));
1244 ecode = rec->after_call;
1245 ims = original_ims;
1246 break;
1247 }
1248 }
1249
1250 /* For both capturing and non-capturing groups, reset the value of the ims
1251 flags, in case they got changed during the group. */
1252
1253 ims = original_ims;
1254 DPRINTF(("ims reset to %02lx\n", ims));
1255
1256 /* For a non-repeating ket, just continue at this level. This also
1257 happens for a repeating ket if no characters were matched in the group.
1258 This is the forcible breaking of infinite loops as implemented in Perl
1259 5.005. If there is an options reset, it will get obeyed in the normal
1260 course of events. */
1261
1262 if (*ecode == OP_KET || eptr == saved_eptr)
1263 {
1264 ecode += 1 + LINK_SIZE;
1265 break;
1266 }
1267
1268 /* The repeating kets try the rest of the pattern or restart from the
1269 preceding bracket, in the appropriate order. In the second case, we can use
1270 tail recursion to avoid using another stack frame, unless we have an
1271 unlimited repeat of a group that can match an empty string. */
1272
1273 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1274
1275 if (*ecode == OP_KETRMIN)
1276 {
1277 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1278 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1279 if (flags != 0) /* Could match an empty string */
1280 {
1281 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1282 RRETURN(rrc);
1283 }
1284 ecode = prev;
1285 goto TAIL_RECURSE;
1286 }
1287 else /* OP_KETRMAX */
1288 {
1289 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1290 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1291 ecode += 1 + LINK_SIZE;
1292 flags = 0;
1293 goto TAIL_RECURSE;
1294 }
1295 /* Control never gets here */
1296
1297 /* Start of subject unless notbol, or after internal newline if multiline */
1298
1299 case OP_CIRC:
1300 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1301 if ((ims & PCRE_MULTILINE) != 0)
1302 {
1303 if (eptr != md->start_subject &&
1304 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1305 RRETURN(MATCH_NOMATCH);
1306 ecode++;
1307 break;
1308 }
1309 /* ... else fall through */
1310
1311 /* Start of subject assertion */
1312
1313 case OP_SOD:
1314 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1315 ecode++;
1316 break;
1317
1318 /* Start of match assertion */
1319
1320 case OP_SOM:
1321 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1322 ecode++;
1323 break;
1324
1325 /* Reset the start of match point */
1326
1327 case OP_SET_SOM:
1328 mstart = eptr;
1329 ecode++;
1330 break;
1331
1332 /* Assert before internal newline if multiline, or before a terminating
1333 newline unless endonly is set, else end of subject unless noteol is set. */
1334
1335 case OP_DOLL:
1336 if ((ims & PCRE_MULTILINE) != 0)
1337 {
1338 if (eptr < md->end_subject)
1339 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1340 else
1341 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1342 ecode++;
1343 break;
1344 }
1345 else
1346 {
1347 if (md->noteol) RRETURN(MATCH_NOMATCH);
1348 if (!md->endonly)
1349 {
1350 if (eptr != md->end_subject &&
1351 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1352 RRETURN(MATCH_NOMATCH);
1353 ecode++;
1354 break;
1355 }
1356 }
1357 /* ... else fall through for endonly */
1358
1359 /* End of subject assertion (\z) */
1360
1361 case OP_EOD:
1362 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1363 ecode++;
1364 break;
1365
1366 /* End of subject or ending \n assertion (\Z) */
1367
1368 case OP_EODN:
1369 if (eptr != md->end_subject &&
1370 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1371 RRETURN(MATCH_NOMATCH);
1372 ecode++;
1373 break;
1374
1375 /* Word boundary assertions */
1376
1377 case OP_NOT_WORD_BOUNDARY:
1378 case OP_WORD_BOUNDARY:
1379 {
1380
1381 /* Find out if the previous and current characters are "word" characters.
1382 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1383 be "non-word" characters. */
1384
1385 #ifdef SUPPORT_UTF8
1386 if (utf8)
1387 {
1388 if (eptr == md->start_subject) prev_is_word = FALSE; else
1389 {
1390 const uschar *lastptr = eptr - 1;
1391 while((*lastptr & 0xc0) == 0x80) lastptr--;
1392 GETCHAR(c, lastptr);
1393 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1394 }
1395 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1396 {
1397 GETCHAR(c, eptr);
1398 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1399 }
1400 }
1401 else
1402 #endif
1403
1404 /* More streamlined when not in UTF-8 mode */
1405
1406 {
1407 prev_is_word = (eptr != md->start_subject) &&
1408 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1409 cur_is_word = (eptr < md->end_subject) &&
1410 ((md->ctypes[*eptr] & ctype_word) != 0);
1411 }
1412
1413 /* Now see if the situation is what we want */
1414
1415 if ((*ecode++ == OP_WORD_BOUNDARY)?
1416 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1417 RRETURN(MATCH_NOMATCH);
1418 }
1419 break;
1420
1421 /* Match a single character type; inline for speed */
1422
1423 case OP_ANY:
1424 if ((ims & PCRE_DOTALL) == 0)
1425 {
1426 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1427 }
1428 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1429 if (utf8)
1430 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1431 ecode++;
1432 break;
1433
1434 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1435 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1436
1437 case OP_ANYBYTE:
1438 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1439 ecode++;
1440 break;
1441
1442 case OP_NOT_DIGIT:
1443 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1444 GETCHARINCTEST(c, eptr);
1445 if (
1446 #ifdef SUPPORT_UTF8
1447 c < 256 &&
1448 #endif
1449 (md->ctypes[c] & ctype_digit) != 0
1450 )
1451 RRETURN(MATCH_NOMATCH);
1452 ecode++;
1453 break;
1454
1455 case OP_DIGIT:
1456 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1457 GETCHARINCTEST(c, eptr);
1458 if (
1459 #ifdef SUPPORT_UTF8
1460 c >= 256 ||
1461 #endif
1462 (md->ctypes[c] & ctype_digit) == 0
1463 )
1464 RRETURN(MATCH_NOMATCH);
1465 ecode++;
1466 break;
1467
1468 case OP_NOT_WHITESPACE:
1469 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1470 GETCHARINCTEST(c, eptr);
1471 if (
1472 #ifdef SUPPORT_UTF8
1473 c < 256 &&
1474 #endif
1475 (md->ctypes[c] & ctype_space) != 0
1476 )
1477 RRETURN(MATCH_NOMATCH);
1478 ecode++;
1479 break;
1480
1481 case OP_WHITESPACE:
1482 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1483 GETCHARINCTEST(c, eptr);
1484 if (
1485 #ifdef SUPPORT_UTF8
1486 c >= 256 ||
1487 #endif
1488 (md->ctypes[c] & ctype_space) == 0
1489 )
1490 RRETURN(MATCH_NOMATCH);
1491 ecode++;
1492 break;
1493
1494 case OP_NOT_WORDCHAR:
1495 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1496 GETCHARINCTEST(c, eptr);
1497 if (
1498 #ifdef SUPPORT_UTF8
1499 c < 256 &&
1500 #endif
1501 (md->ctypes[c] & ctype_word) != 0
1502 )
1503 RRETURN(MATCH_NOMATCH);
1504 ecode++;
1505 break;
1506
1507 case OP_WORDCHAR:
1508 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1509 GETCHARINCTEST(c, eptr);
1510 if (
1511 #ifdef SUPPORT_UTF8
1512 c >= 256 ||
1513 #endif
1514 (md->ctypes[c] & ctype_word) == 0
1515 )
1516 RRETURN(MATCH_NOMATCH);
1517 ecode++;
1518 break;
1519
1520 case OP_ANYNL:
1521 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1522 GETCHARINCTEST(c, eptr);
1523 switch(c)
1524 {
1525 default: RRETURN(MATCH_NOMATCH);
1526 case 0x000d:
1527 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1528 break;
1529 case 0x000a:
1530 case 0x000b:
1531 case 0x000c:
1532 case 0x0085:
1533 case 0x2028:
1534 case 0x2029:
1535 break;
1536 }
1537 ecode++;
1538 break;
1539
1540 case OP_NOT_HSPACE:
1541 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1542 GETCHARINCTEST(c, eptr);
1543 switch(c)
1544 {
1545 default: break;
1546 case 0x09: /* HT */
1547 case 0x20: /* SPACE */
1548 case 0xa0: /* NBSP */
1549 case 0x1680: /* OGHAM SPACE MARK */
1550 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1551 case 0x2000: /* EN QUAD */
1552 case 0x2001: /* EM QUAD */
1553 case 0x2002: /* EN SPACE */
1554 case 0x2003: /* EM SPACE */
1555 case 0x2004: /* THREE-PER-EM SPACE */
1556 case 0x2005: /* FOUR-PER-EM SPACE */
1557 case 0x2006: /* SIX-PER-EM SPACE */
1558 case 0x2007: /* FIGURE SPACE */
1559 case 0x2008: /* PUNCTUATION SPACE */
1560 case 0x2009: /* THIN SPACE */
1561 case 0x200A: /* HAIR SPACE */
1562 case 0x202f: /* NARROW NO-BREAK SPACE */
1563 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1564 case 0x3000: /* IDEOGRAPHIC SPACE */
1565 RRETURN(MATCH_NOMATCH);
1566 }
1567 ecode++;
1568 break;
1569
1570 case OP_HSPACE:
1571 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1572 GETCHARINCTEST(c, eptr);
1573 switch(c)
1574 {
1575 default: RRETURN(MATCH_NOMATCH);
1576 case 0x09: /* HT */
1577 case 0x20: /* SPACE */
1578 case 0xa0: /* NBSP */
1579 case 0x1680: /* OGHAM SPACE MARK */
1580 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1581 case 0x2000: /* EN QUAD */
1582 case 0x2001: /* EM QUAD */
1583 case 0x2002: /* EN SPACE */
1584 case 0x2003: /* EM SPACE */
1585 case 0x2004: /* THREE-PER-EM SPACE */
1586 case 0x2005: /* FOUR-PER-EM SPACE */
1587 case 0x2006: /* SIX-PER-EM SPACE */
1588 case 0x2007: /* FIGURE SPACE */
1589 case 0x2008: /* PUNCTUATION SPACE */
1590 case 0x2009: /* THIN SPACE */
1591 case 0x200A: /* HAIR SPACE */
1592 case 0x202f: /* NARROW NO-BREAK SPACE */
1593 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1594 case 0x3000: /* IDEOGRAPHIC SPACE */
1595 break;
1596 }
1597 ecode++;
1598 break;
1599
1600 case OP_NOT_VSPACE:
1601 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1602 GETCHARINCTEST(c, eptr);
1603 switch(c)
1604 {
1605 default: break;
1606 case 0x0a: /* LF */
1607 case 0x0b: /* VT */
1608 case 0x0c: /* FF */
1609 case 0x0d: /* CR */
1610 case 0x85: /* NEL */
1611 case 0x2028: /* LINE SEPARATOR */
1612 case 0x2029: /* PARAGRAPH SEPARATOR */
1613 RRETURN(MATCH_NOMATCH);
1614 }
1615 ecode++;
1616 break;
1617
1618 case OP_VSPACE:
1619 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1620 GETCHARINCTEST(c, eptr);
1621 switch(c)
1622 {
1623 default: RRETURN(MATCH_NOMATCH);
1624 case 0x0a: /* LF */
1625 case 0x0b: /* VT */
1626 case 0x0c: /* FF */
1627 case 0x0d: /* CR */
1628 case 0x85: /* NEL */
1629 case 0x2028: /* LINE SEPARATOR */
1630 case 0x2029: /* PARAGRAPH SEPARATOR */
1631 break;
1632 }
1633 ecode++;
1634 break;
1635
1636 #ifdef SUPPORT_UCP
1637 /* Check the next character by Unicode property. We will get here only
1638 if the support is in the binary; otherwise a compile-time error occurs. */
1639
1640 case OP_PROP:
1641 case OP_NOTPROP:
1642 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1643 GETCHARINCTEST(c, eptr);
1644 {
1645 int chartype, script;
1646 int category = _pcre_ucp_findprop(c, &chartype, &script);
1647
1648 switch(ecode[1])
1649 {
1650 case PT_ANY:
1651 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1652 break;
1653
1654 case PT_LAMP:
1655 if ((chartype == ucp_Lu ||
1656 chartype == ucp_Ll ||
1657 chartype == ucp_Lt) == (op == OP_NOTPROP))
1658 RRETURN(MATCH_NOMATCH);
1659 break;
1660
1661 case PT_GC:
1662 if ((ecode[2] != category) == (op == OP_PROP))
1663 RRETURN(MATCH_NOMATCH);
1664 break;
1665
1666 case PT_PC:
1667 if ((ecode[2] != chartype) == (op == OP_PROP))
1668 RRETURN(MATCH_NOMATCH);
1669 break;
1670
1671 case PT_SC:
1672 if ((ecode[2] != script) == (op == OP_PROP))
1673 RRETURN(MATCH_NOMATCH);
1674 break;
1675
1676 default:
1677 RRETURN(PCRE_ERROR_INTERNAL);
1678 }
1679
1680 ecode += 3;
1681 }
1682 break;
1683
1684 /* Match an extended Unicode sequence. We will get here only if the support
1685 is in the binary; otherwise a compile-time error occurs. */
1686
1687 case OP_EXTUNI:
1688 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1689 GETCHARINCTEST(c, eptr);
1690 {
1691 int chartype, script;
1692 int category = _pcre_ucp_findprop(c, &chartype, &script);
1693 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1694 while (eptr < md->end_subject)
1695 {
1696 int len = 1;
1697 if (!utf8) c = *eptr; else
1698 {
1699 GETCHARLEN(c, eptr, len);
1700 }
1701 category = _pcre_ucp_findprop(c, &chartype, &script);
1702 if (category != ucp_M) break;
1703 eptr += len;
1704 }
1705 }
1706 ecode++;
1707 break;
1708 #endif
1709
1710
1711 /* Match a back reference, possibly repeatedly. Look past the end of the
1712 item to see if there is repeat information following. The code is similar
1713 to that for character classes, but repeated for efficiency. Then obey
1714 similar code to character type repeats - written out again for speed.
1715 However, if the referenced string is the empty string, always treat
1716 it as matched, any number of times (otherwise there could be infinite
1717 loops). */
1718
1719 case OP_REF:
1720 {
1721 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1722 ecode += 3; /* Advance past item */
1723
1724 /* If the reference is unset, set the length to be longer than the amount
1725 of subject left; this ensures that every attempt at a match fails. We
1726 can't just fail here, because of the possibility of quantifiers with zero
1727 minima. */
1728
1729 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1730 md->end_subject - eptr + 1 :
1731 md->offset_vector[offset+1] - md->offset_vector[offset];
1732
1733 /* Set up for repetition, or handle the non-repeated case */
1734
1735 switch (*ecode)
1736 {
1737 case OP_CRSTAR:
1738 case OP_CRMINSTAR:
1739 case OP_CRPLUS:
1740 case OP_CRMINPLUS:
1741 case OP_CRQUERY:
1742 case OP_CRMINQUERY:
1743 c = *ecode++ - OP_CRSTAR;
1744 minimize = (c & 1) != 0;
1745 min = rep_min[c]; /* Pick up values from tables; */
1746 max = rep_max[c]; /* zero for max => infinity */
1747 if (max == 0) max = INT_MAX;
1748 break;
1749
1750 case OP_CRRANGE:
1751 case OP_CRMINRANGE:
1752 minimize = (*ecode == OP_CRMINRANGE);
1753 min = GET2(ecode, 1);
1754 max = GET2(ecode, 3);
1755 if (max == 0) max = INT_MAX;
1756 ecode += 5;
1757 break;
1758
1759 default: /* No repeat follows */
1760 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1761 eptr += length;
1762 continue; /* With the main loop */
1763 }
1764
1765 /* If the length of the reference is zero, just continue with the
1766 main loop. */
1767
1768 if (length == 0) continue;
1769
1770 /* First, ensure the minimum number of matches are present. We get back
1771 the length of the reference string explicitly rather than passing the
1772 address of eptr, so that eptr can be a register variable. */
1773
1774 for (i = 1; i <= min; i++)
1775 {
1776 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1777 eptr += length;
1778 }
1779
1780 /* If min = max, continue at the same level without recursion.
1781 They are not both allowed to be zero. */
1782
1783 if (min == max) continue;
1784
1785 /* If minimizing, keep trying and advancing the pointer */
1786
1787 if (minimize)
1788 {
1789 for (fi = min;; fi++)
1790 {
1791 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1793 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1794 RRETURN(MATCH_NOMATCH);
1795 eptr += length;
1796 }
1797 /* Control never gets here */
1798 }
1799
1800 /* If maximizing, find the longest string and work backwards */
1801
1802 else
1803 {
1804 pp = eptr;
1805 for (i = min; i < max; i++)
1806 {
1807 if (!match_ref(offset, eptr, length, md, ims)) break;
1808 eptr += length;
1809 }
1810 while (eptr >= pp)
1811 {
1812 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1813 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1814 eptr -= length;
1815 }
1816 RRETURN(MATCH_NOMATCH);
1817 }
1818 }
1819 /* Control never gets here */
1820
1821
1822
1823 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1824 used when all the characters in the class have values in the range 0-255,
1825 and either the matching is caseful, or the characters are in the range
1826 0-127 when UTF-8 processing is enabled. The only difference between
1827 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1828 encountered.
1829
1830 First, look past the end of the item to see if there is repeat information
1831 following. Then obey similar code to character type repeats - written out
1832 again for speed. */
1833
1834 case OP_NCLASS:
1835 case OP_CLASS:
1836 {
1837 data = ecode + 1; /* Save for matching */
1838 ecode += 33; /* Advance past the item */
1839
1840 switch (*ecode)
1841 {
1842 case OP_CRSTAR:
1843 case OP_CRMINSTAR:
1844 case OP_CRPLUS:
1845 case OP_CRMINPLUS:
1846 case OP_CRQUERY:
1847 case OP_CRMINQUERY:
1848 c = *ecode++ - OP_CRSTAR;
1849 minimize = (c & 1) != 0;
1850 min = rep_min[c]; /* Pick up values from tables; */
1851 max = rep_max[c]; /* zero for max => infinity */
1852 if (max == 0) max = INT_MAX;
1853 break;
1854
1855 case OP_CRRANGE:
1856 case OP_CRMINRANGE:
1857 minimize = (*ecode == OP_CRMINRANGE);
1858 min = GET2(ecode, 1);
1859 max = GET2(ecode, 3);
1860 if (max == 0) max = INT_MAX;
1861 ecode += 5;
1862 break;
1863
1864 default: /* No repeat follows */
1865 min = max = 1;
1866 break;
1867 }
1868
1869 /* First, ensure the minimum number of matches are present. */
1870
1871 #ifdef SUPPORT_UTF8
1872 /* UTF-8 mode */
1873 if (utf8)
1874 {
1875 for (i = 1; i <= min; i++)
1876 {
1877 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1878 GETCHARINC(c, eptr);
1879 if (c > 255)
1880 {
1881 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1882 }
1883 else
1884 {
1885 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1886 }
1887 }
1888 }
1889 else
1890 #endif
1891 /* Not UTF-8 mode */
1892 {
1893 for (i = 1; i <= min; i++)
1894 {
1895 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1896 c = *eptr++;
1897 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1898 }
1899 }
1900
1901 /* If max == min we can continue with the main loop without the
1902 need to recurse. */
1903
1904 if (min == max) continue;
1905
1906 /* If minimizing, keep testing the rest of the expression and advancing
1907 the pointer while it matches the class. */
1908
1909 if (minimize)
1910 {
1911 #ifdef SUPPORT_UTF8
1912 /* UTF-8 mode */
1913 if (utf8)
1914 {
1915 for (fi = min;; fi++)
1916 {
1917 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1919 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1920 GETCHARINC(c, eptr);
1921 if (c > 255)
1922 {
1923 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1924 }
1925 else
1926 {
1927 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1928 }
1929 }
1930 }
1931 else
1932 #endif
1933 /* Not UTF-8 mode */
1934 {
1935 for (fi = min;; fi++)
1936 {
1937 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1938 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1939 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1940 c = *eptr++;
1941 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1942 }
1943 }
1944 /* Control never gets here */
1945 }
1946
1947 /* If maximizing, find the longest possible run, then work backwards. */
1948
1949 else
1950 {
1951 pp = eptr;
1952
1953 #ifdef SUPPORT_UTF8
1954 /* UTF-8 mode */
1955 if (utf8)
1956 {
1957 for (i = min; i < max; i++)
1958 {
1959 int len = 1;
1960 if (eptr >= md->end_subject) break;
1961 GETCHARLEN(c, eptr, len);
1962 if (c > 255)
1963 {
1964 if (op == OP_CLASS) break;
1965 }
1966 else
1967 {
1968 if ((data[c/8] & (1 << (c&7))) == 0) break;
1969 }
1970 eptr += len;
1971 }
1972 for (;;)
1973 {
1974 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1975 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1976 if (eptr-- == pp) break; /* Stop if tried at original pos */
1977 BACKCHAR(eptr);
1978 }
1979 }
1980 else
1981 #endif
1982 /* Not UTF-8 mode */
1983 {
1984 for (i = min; i < max; i++)
1985 {
1986 if (eptr >= md->end_subject) break;
1987 c = *eptr;
1988 if ((data[c/8] & (1 << (c&7))) == 0) break;
1989 eptr++;
1990 }
1991 while (eptr >= pp)
1992 {
1993 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
1994 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1995 eptr--;
1996 }
1997 }
1998
1999 RRETURN(MATCH_NOMATCH);
2000 }
2001 }
2002 /* Control never gets here */
2003
2004
2005 /* Match an extended character class. This opcode is encountered only
2006 in UTF-8 mode, because that's the only time it is compiled. */
2007
2008 #ifdef SUPPORT_UTF8
2009 case OP_XCLASS:
2010 {
2011 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2012 ecode += GET(ecode, 1); /* Advance past the item */
2013
2014 switch (*ecode)
2015 {
2016 case OP_CRSTAR:
2017 case OP_CRMINSTAR:
2018 case OP_CRPLUS:
2019 case OP_CRMINPLUS:
2020 case OP_CRQUERY:
2021 case OP_CRMINQUERY:
2022 c = *ecode++ - OP_CRSTAR;
2023 minimize = (c & 1) != 0;
2024 min = rep_min[c]; /* Pick up values from tables; */
2025 max = rep_max[c]; /* zero for max => infinity */
2026 if (max == 0) max = INT_MAX;
2027 break;
2028
2029 case OP_CRRANGE:
2030 case OP_CRMINRANGE:
2031 minimize = (*ecode == OP_CRMINRANGE);
2032 min = GET2(ecode, 1);
2033 max = GET2(ecode, 3);
2034 if (max == 0) max = INT_MAX;
2035 ecode += 5;
2036 break;
2037
2038 default: /* No repeat follows */
2039 min = max = 1;
2040 break;
2041 }
2042
2043 /* First, ensure the minimum number of matches are present. */
2044
2045 for (i = 1; i <= min; i++)
2046 {
2047 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2048 GETCHARINC(c, eptr);
2049 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2050 }
2051
2052 /* If max == min we can continue with the main loop without the
2053 need to recurse. */
2054
2055 if (min == max) continue;
2056
2057 /* If minimizing, keep testing the rest of the expression and advancing
2058 the pointer while it matches the class. */
2059
2060 if (minimize)
2061 {
2062 for (fi = min;; fi++)
2063 {
2064 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2065 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2066 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2067 GETCHARINC(c, eptr);
2068 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2069 }
2070 /* Control never gets here */
2071 }
2072
2073 /* If maximizing, find the longest possible run, then work backwards. */
2074
2075 else
2076 {
2077 pp = eptr;
2078 for (i = min; i < max; i++)
2079 {
2080 int len = 1;
2081 if (eptr >= md->end_subject) break;
2082 GETCHARLEN(c, eptr, len);
2083 if (!_pcre_xclass(c, data)) break;
2084 eptr += len;
2085 }
2086 for(;;)
2087 {
2088 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2089 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2090 if (eptr-- == pp) break; /* Stop if tried at original pos */
2091 if (utf8) BACKCHAR(eptr);
2092 }
2093 RRETURN(MATCH_NOMATCH);
2094 }
2095
2096 /* Control never gets here */
2097 }
2098 #endif /* End of XCLASS */
2099
2100 /* Match a single character, casefully */
2101
2102 case OP_CHAR:
2103 #ifdef SUPPORT_UTF8
2104 if (utf8)
2105 {
2106 length = 1;
2107 ecode++;
2108 GETCHARLEN(fc, ecode, length);
2109 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2110 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2111 }
2112 else
2113 #endif
2114
2115 /* Non-UTF-8 mode */
2116 {
2117 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2118 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2119 ecode += 2;
2120 }
2121 break;
2122
2123 /* Match a single character, caselessly */
2124
2125 case OP_CHARNC:
2126 #ifdef SUPPORT_UTF8
2127 if (utf8)
2128 {
2129 length = 1;
2130 ecode++;
2131 GETCHARLEN(fc, ecode, length);
2132
2133 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2134
2135 /* If the pattern character's value is < 128, we have only one byte, and
2136 can use the fast lookup table. */
2137
2138 if (fc < 128)
2139 {
2140 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2141 }
2142
2143 /* Otherwise we must pick up the subject character */
2144
2145 else
2146 {
2147 unsigned int dc;
2148 GETCHARINC(dc, eptr);
2149 ecode += length;
2150
2151 /* If we have Unicode property support, we can use it to test the other
2152 case of the character, if there is one. */
2153
2154 if (fc != dc)
2155 {
2156 #ifdef SUPPORT_UCP
2157 if (dc != _pcre_ucp_othercase(fc))
2158 #endif
2159 RRETURN(MATCH_NOMATCH);
2160 }
2161 }
2162 }
2163 else
2164 #endif /* SUPPORT_UTF8 */
2165
2166 /* Non-UTF-8 mode */
2167 {
2168 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2169 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2170 ecode += 2;
2171 }
2172 break;
2173
2174 /* Match a single character repeatedly. */
2175
2176 case OP_EXACT:
2177 min = max = GET2(ecode, 1);
2178 ecode += 3;
2179 goto REPEATCHAR;
2180
2181 case OP_POSUPTO:
2182 possessive = TRUE;
2183 /* Fall through */
2184
2185 case OP_UPTO:
2186 case OP_MINUPTO:
2187 min = 0;
2188 max = GET2(ecode, 1);
2189 minimize = *ecode == OP_MINUPTO;
2190 ecode += 3;
2191 goto REPEATCHAR;
2192
2193 case OP_POSSTAR:
2194 possessive = TRUE;
2195 min = 0;
2196 max = INT_MAX;
2197 ecode++;
2198 goto REPEATCHAR;
2199
2200 case OP_POSPLUS:
2201 possessive = TRUE;
2202 min = 1;
2203 max = INT_MAX;
2204 ecode++;
2205 goto REPEATCHAR;
2206
2207 case OP_POSQUERY:
2208 possessive = TRUE;
2209 min = 0;
2210 max = 1;
2211 ecode++;
2212 goto REPEATCHAR;
2213
2214 case OP_STAR:
2215 case OP_MINSTAR:
2216 case OP_PLUS:
2217 case OP_MINPLUS:
2218 case OP_QUERY:
2219 case OP_MINQUERY:
2220 c = *ecode++ - OP_STAR;
2221 minimize = (c & 1) != 0;
2222 min = rep_min[c]; /* Pick up values from tables; */
2223 max = rep_max[c]; /* zero for max => infinity */
2224 if (max == 0) max = INT_MAX;
2225
2226 /* Common code for all repeated single-character matches. We can give
2227 up quickly if there are fewer than the minimum number of characters left in
2228 the subject. */
2229
2230 REPEATCHAR:
2231 #ifdef SUPPORT_UTF8
2232 if (utf8)
2233 {
2234 length = 1;
2235 charptr = ecode;
2236 GETCHARLEN(fc, ecode, length);
2237 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2238 ecode += length;
2239
2240 /* Handle multibyte character matching specially here. There is
2241 support for caseless matching if UCP support is present. */
2242
2243 if (length > 1)
2244 {
2245 #ifdef SUPPORT_UCP
2246 unsigned int othercase;
2247 if ((ims & PCRE_CASELESS) != 0 &&
2248 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2249 oclength = _pcre_ord2utf8(othercase, occhars);
2250 else oclength = 0;
2251 #endif /* SUPPORT_UCP */
2252
2253 for (i = 1; i <= min; i++)
2254 {
2255 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2256 #ifdef SUPPORT_UCP
2257 /* Need braces because of following else */
2258 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2259 else
2260 {
2261 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2262 eptr += oclength;
2263 }
2264 #else /* without SUPPORT_UCP */
2265 else { RRETURN(MATCH_NOMATCH); }
2266 #endif /* SUPPORT_UCP */
2267 }
2268
2269 if (min == max) continue;
2270
2271 if (minimize)
2272 {
2273 for (fi = min;; fi++)
2274 {
2275 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2276 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2277 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2278 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2279 #ifdef SUPPORT_UCP
2280 /* Need braces because of following else */
2281 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2282 else
2283 {
2284 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2285 eptr += oclength;
2286 }
2287 #else /* without SUPPORT_UCP */
2288 else { RRETURN (MATCH_NOMATCH); }
2289 #endif /* SUPPORT_UCP */
2290 }
2291 /* Control never gets here */
2292 }
2293
2294 else /* Maximize */
2295 {
2296 pp = eptr;
2297 for (i = min; i < max; i++)
2298 {
2299 if (eptr > md->end_subject - length) break;
2300 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2301 #ifdef SUPPORT_UCP
2302 else if (oclength == 0) break;
2303 else
2304 {
2305 if (memcmp(eptr, occhars, oclength) != 0) break;
2306 eptr += oclength;
2307 }
2308 #else /* without SUPPORT_UCP */
2309 else break;
2310 #endif /* SUPPORT_UCP */
2311 }
2312
2313 if (possessive) continue;
2314 for(;;)
2315 {
2316 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2317 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2318 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2319 #ifdef SUPPORT_UCP
2320 eptr--;
2321 BACKCHAR(eptr);
2322 #else /* without SUPPORT_UCP */
2323 eptr -= length;
2324 #endif /* SUPPORT_UCP */
2325 }
2326 }
2327 /* Control never gets here */
2328 }
2329
2330 /* If the length of a UTF-8 character is 1, we fall through here, and
2331 obey the code as for non-UTF-8 characters below, though in this case the
2332 value of fc will always be < 128. */
2333 }
2334 else
2335 #endif /* SUPPORT_UTF8 */
2336
2337 /* When not in UTF-8 mode, load a single-byte character. */
2338 {
2339 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2340 fc = *ecode++;
2341 }
2342
2343 /* The value of fc at this point is always less than 256, though we may or
2344 may not be in UTF-8 mode. The code is duplicated for the caseless and
2345 caseful cases, for speed, since matching characters is likely to be quite
2346 common. First, ensure the minimum number of matches are present. If min =
2347 max, continue at the same level without recursing. Otherwise, if
2348 minimizing, keep trying the rest of the expression and advancing one
2349 matching character if failing, up to the maximum. Alternatively, if
2350 maximizing, find the maximum number of characters and work backwards. */
2351
2352 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2353 max, eptr));
2354
2355 if ((ims & PCRE_CASELESS) != 0)
2356 {
2357 fc = md->lcc[fc];
2358 for (i = 1; i <= min; i++)
2359 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2360 if (min == max) continue;
2361 if (minimize)
2362 {
2363 for (fi = min;; fi++)
2364 {
2365 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2366 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2367 if (fi >= max || eptr >= md->end_subject ||
2368 fc != md->lcc[*eptr++])
2369 RRETURN(MATCH_NOMATCH);
2370 }
2371 /* Control never gets here */
2372 }
2373 else /* Maximize */
2374 {
2375 pp = eptr;
2376 for (i = min; i < max; i++)
2377 {
2378 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2379 eptr++;
2380 }
2381 if (possessive) continue;
2382 while (eptr >= pp)
2383 {
2384 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2385 eptr--;
2386 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2387 }
2388 RRETURN(MATCH_NOMATCH);
2389 }
2390 /* Control never gets here */
2391 }
2392
2393 /* Caseful comparisons (one-byte characters only; multibyte ones were handled above) */
2394
2395 else
2396 {
2397 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2398 if (min == max) continue;
2399 if (minimize)
2400 {
2401 for (fi = min;; fi++)
2402 {
2403 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2404 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2405 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2406 RRETURN(MATCH_NOMATCH);
2407 }
2408 /* Control never gets here */
2409 }
2410 else /* Maximize */
2411 {
2412 pp = eptr;
2413 for (i = min; i < max; i++)
2414 {
2415 if (eptr >= md->end_subject || fc != *eptr) break;
2416 eptr++;
2417 }
2418 if (possessive) continue;
2419 while (eptr >= pp)
2420 {
2421 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2422 eptr--;
2423 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2424 }
2425 RRETURN(MATCH_NOMATCH);
2426 }
2427 }
2428 /* Control never gets here */
2429
2430 /* Match a negated single one-byte character. The character we are
2431 checking can be multibyte. */
2432
2433 case OP_NOT:
2434 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2435 ecode++;
2436 GETCHARINCTEST(c, eptr);
2437 if ((ims & PCRE_CASELESS) != 0)
2438 {
2439 #ifdef SUPPORT_UTF8
2440 if (c < 256)
2441 #endif
2442 c = md->lcc[c];
2443 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2444 }
2445 else
2446 {
2447 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2448 }
2449 break;
2450
2451 /* Match a negated single one-byte character repeatedly. This is almost a
2452 repeat of the code for a repeated single character, but I haven't found a
2453 nice way of commoning these up that doesn't require a test of the
2454 positive/negative option for each character match. Maybe that wouldn't add
2455 very much to the time taken, but character matching *is* what this is all
2456 about... */
2457
2458 case OP_NOTEXACT:
2459 min = max = GET2(ecode, 1);
2460 ecode += 3;
2461 goto REPEATNOTCHAR;
2462
2463 case OP_NOTUPTO:
2464 case OP_NOTMINUPTO:
2465 min = 0;
2466 max = GET2(ecode, 1);
2467 minimize = *ecode == OP_NOTMINUPTO;
2468 ecode += 3;
2469 goto REPEATNOTCHAR;
2470
2471 case OP_NOTPOSSTAR:
2472 possessive = TRUE;
2473 min = 0;
2474 max = INT_MAX;
2475 ecode++;
2476 goto REPEATNOTCHAR;
2477
2478 case OP_NOTPOSPLUS:
2479 possessive = TRUE;
2480 min = 1;
2481 max = INT_MAX;
2482 ecode++;
2483 goto REPEATNOTCHAR;
2484
2485 case OP_NOTPOSQUERY:
2486 possessive = TRUE;
2487 min = 0;
2488 max = 1;
2489 ecode++;
2490 goto REPEATNOTCHAR;
2491
2492 case OP_NOTPOSUPTO:
2493 possessive = TRUE;
2494 min = 0;
2495 max = GET2(ecode, 1);
2496 ecode += 3;
2497 goto REPEATNOTCHAR;
2498
2499 case OP_NOTSTAR:
2500 case OP_NOTMINSTAR:
2501 case OP_NOTPLUS:
2502 case OP_NOTMINPLUS:
2503 case OP_NOTQUERY:
2504 case OP_NOTMINQUERY:
2505 c = *ecode++ - OP_NOTSTAR;
2506 minimize = (c & 1) != 0;
2507 min = rep_min[c]; /* Pick up values from tables; */
2508 max = rep_max[c]; /* zero for max => infinity */
2509 if (max == 0) max = INT_MAX;
2510
2511 /* Common code for all repeated single-byte matches. We can give up quickly
2512 if there are fewer than the minimum number of bytes left in the
2513 subject. */
2514
2515 REPEATNOTCHAR:
2516 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2517 fc = *ecode++;
2518
2519 /* The code is duplicated for the caseless and caseful cases, for speed,
2520 since matching characters is likely to be quite common. First, ensure the
2521 minimum number of matches are present. If min = max, continue at the same
2522 level without recursing. Otherwise, if minimizing, keep trying the rest of
2523 the expression and advancing one matching character if failing, up to the
2524 maximum. Alternatively, if maximizing, find the maximum number of
2525 characters and work backwards. */
2526
2527 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2528 max, eptr));
2529
2530 if ((ims & PCRE_CASELESS) != 0)
2531 {
2532 fc = md->lcc[fc];
2533
2534 #ifdef SUPPORT_UTF8
2535 /* UTF-8 mode */
2536 if (utf8)
2537 {
2538 register unsigned int d;
2539 for (i = 1; i <= min; i++)
2540 {
2541 GETCHARINC(d, eptr);
2542 if (d < 256) d = md->lcc[d];
2543 if (fc == d) RRETURN(MATCH_NOMATCH);
2544 }
2545 }
2546 else
2547 #endif
2548
2549 /* Not UTF-8 mode */
2550 {
2551 for (i = 1; i <= min; i++)
2552 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2553 }
2554
2555 if (min == max) continue;
2556
2557 if (minimize)
2558 {
2559 #ifdef SUPPORT_UTF8
2560 /* UTF-8 mode */
2561 if (utf8)
2562 {
2563 register unsigned int d;
2564 for (fi = min;; fi++)
2565 {
2566 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2567 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2568 GETCHARINC(d, eptr);
2569 if (d < 256) d = md->lcc[d];
2570 if (fi >= max || eptr >= md->end_subject || fc == d)
2571 RRETURN(MATCH_NOMATCH);
2572 }
2573 }
2574 else
2575 #endif
2576 /* Not UTF-8 mode */
2577 {
2578 for (fi = min;; fi++)
2579 {
2580 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2581 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2582 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2583 RRETURN(MATCH_NOMATCH);
2584 }
2585 }
2586 /* Control never gets here */
2587 }
2588
2589 /* Maximize case */
2590
2591 else
2592 {
2593 pp = eptr;
2594
2595 #ifdef SUPPORT_UTF8
2596 /* UTF-8 mode */
2597 if (utf8)
2598 {
2599 register unsigned int d;
2600 for (i = min; i < max; i++)
2601 {
2602 int len = 1;
2603 if (eptr >= md->end_subject) break;
2604 GETCHARLEN(d, eptr, len);
2605 if (d < 256) d = md->lcc[d];
2606 if (fc == d) break;
2607 eptr += len;
2608 }
2609 if (possessive) continue;
2610 for(;;)
2611 {
2612 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2613 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2614 if (eptr-- == pp) break; /* Stop if tried at original pos */
2615 BACKCHAR(eptr);
2616 }
2617 }
2618 else
2619 #endif
2620 /* Not UTF-8 mode */
2621 {
2622 for (i = min; i < max; i++)
2623 {
2624 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2625 eptr++;
2626 }
2627 if (possessive) continue;
2628 while (eptr >= pp)
2629 {
2630 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2631 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2632 eptr--;
2633 }
2634 }
2635
2636 RRETURN(MATCH_NOMATCH);
2637 }
2638 /* Control never gets here */
2639 }
2640
2641 /* Caseful comparisons */
2642
2643 else
2644 {
2645 #ifdef SUPPORT_UTF8
2646 /* UTF-8 mode */
2647 if (utf8)
2648 {
2649 register unsigned int d;
2650 for (i = 1; i <= min; i++)
2651 {
2652 GETCHARINC(d, eptr);
2653 if (fc == d) RRETURN(MATCH_NOMATCH);
2654 }
2655 }
2656 else
2657 #endif
2658 /* Not UTF-8 mode */
2659 {
2660 for (i = 1; i <= min; i++)
2661 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2662 }
2663
2664 if (min == max) continue;
2665
2666 if (minimize)
2667 {
2668 #ifdef SUPPORT_UTF8
2669 /* UTF-8 mode */
2670 if (utf8)
2671 {
2672 register unsigned int d;
2673 for (fi = min;; fi++)
2674 {
2675 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2676 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2677 GETCHARINC(d, eptr);
2678 if (fi >= max || eptr >= md->end_subject || fc == d)
2679 RRETURN(MATCH_NOMATCH);
2680 }
2681 }
2682 else
2683 #endif
2684 /* Not UTF-8 mode */
2685 {
2686 for (fi = min;; fi++)
2687 {
2688 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2689 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2690 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2691 RRETURN(MATCH_NOMATCH);
2692 }
2693 }
2694 /* Control never gets here */
2695 }
2696
2697 /* Maximize case */
2698
2699 else
2700 {
2701 pp = eptr;
2702
2703 #ifdef SUPPORT_UTF8
2704 /* UTF-8 mode */
2705 if (utf8)
2706 {
2707 register unsigned int d;
2708 for (i = min; i < max; i++)
2709 {
2710 int len = 1;
2711 if (eptr >= md->end_subject) break;
2712 GETCHARLEN(d, eptr, len);
2713 if (fc == d) break;
2714 eptr += len;
2715 }
2716 if (possessive) continue;
2717 for(;;)
2718 {
2719 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2720 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2721 if (eptr-- == pp) break; /* Stop if tried at original pos */
2722 BACKCHAR(eptr);
2723 }
2724 }
2725 else
2726 #endif
2727 /* Not UTF-8 mode */
2728 {
2729 for (i = min; i < max; i++)
2730 {
2731 if (eptr >= md->end_subject || fc == *eptr) break;
2732 eptr++;
2733 }
2734 if (possessive) continue;
2735 while (eptr >= pp)
2736 {
2737 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2738 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2739 eptr--;
2740 }
2741 }
2742
2743 RRETURN(MATCH_NOMATCH);
2744 }
2745 }
2746 /* Control never gets here */
2747
2748 /* Match a single character type repeatedly; several different opcodes
2749 share code. This is very similar to the code for single characters, but we
2750 repeat it in the interests of efficiency. */
2751
2752 case OP_TYPEEXACT:
2753 min = max = GET2(ecode, 1);
2754 minimize = TRUE;
2755 ecode += 3;
2756 goto REPEATTYPE;
2757
2758 case OP_TYPEUPTO:
2759 case OP_TYPEMINUPTO:
2760 min = 0;
2761 max = GET2(ecode, 1);
2762 minimize = *ecode == OP_TYPEMINUPTO;
2763 ecode += 3;
2764 goto REPEATTYPE;
2765
2766 case OP_TYPEPOSSTAR:
2767 possessive = TRUE;
2768 min = 0;
2769 max = INT_MAX;
2770 ecode++;
2771 goto REPEATTYPE;
2772
2773 case OP_TYPEPOSPLUS:
2774 possessive = TRUE;
2775 min = 1;
2776 max = INT_MAX;
2777 ecode++;
2778 goto REPEATTYPE;
2779
2780 case OP_TYPEPOSQUERY:
2781 possessive = TRUE;
2782 min = 0;
2783 max = 1;
2784 ecode++;
2785 goto REPEATTYPE;
2786
2787 case OP_TYPEPOSUPTO:
2788 possessive = TRUE;
2789 min = 0;
2790 max = GET2(ecode, 1);
2791 ecode += 3;
2792 goto REPEATTYPE;
2793
2794 case OP_TYPESTAR:
2795 case OP_TYPEMINSTAR:
2796 case OP_TYPEPLUS:
2797 case OP_TYPEMINPLUS:
2798 case OP_TYPEQUERY:
2799 case OP_TYPEMINQUERY:
2800 c = *ecode++ - OP_TYPESTAR;
2801 minimize = (c & 1) != 0;
2802 min = rep_min[c]; /* Pick up values from tables; */
2803 max = rep_max[c]; /* zero for max => infinity */
2804 if (max == 0) max = INT_MAX;
2805
2806 /* Common code for all repeated single character type matches. Note that
2807 in UTF-8 mode only OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR are limited to
2808 one-byte characters; '.' and most other types can match multibyte ones. */
2809
2810 REPEATTYPE:
2811 ctype = *ecode++; /* Code for the character type */
2812
2813 #ifdef SUPPORT_UCP
2814 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2815 {
2816 prop_fail_result = ctype == OP_NOTPROP;
2817 prop_type = *ecode++;
2818 prop_value = *ecode++;
2819 }
2820 else prop_type = -1;
2821 #endif
2822
2823 /* First, ensure the minimum number of matches are present. Use inline
2824 code for maximizing the speed, and do the type test once at the start
2825 (i.e. keep it out of the loop). Also we can test that there are at least
2826 the minimum number of bytes before we start. This isn't as effective in
2827 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2828 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2829 and single-bytes. */
2830
2831 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2832 if (min > 0)
2833 {
2834 #ifdef SUPPORT_UCP
2835 if (prop_type >= 0)
2836 {
2837 switch(prop_type)
2838 {
2839 case PT_ANY:
2840 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2841 for (i = 1; i <= min; i++)
2842 {
2843 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2844 GETCHARINCTEST(c, eptr);
2845 }
2846 break;
2847
2848 case PT_LAMP:
2849 for (i = 1; i <= min; i++)
2850 {
2851 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2852 GETCHARINCTEST(c, eptr);
2853 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2854 if ((prop_chartype == ucp_Lu ||
2855 prop_chartype == ucp_Ll ||
2856 prop_chartype == ucp_Lt) == prop_fail_result)
2857 RRETURN(MATCH_NOMATCH);
2858 }
2859 break;
2860
2861 case PT_GC:
2862 for (i = 1; i <= min; i++)
2863 {
2864 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2865 GETCHARINCTEST(c, eptr);
2866 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2867 if ((prop_category == prop_value) == prop_fail_result)
2868 RRETURN(MATCH_NOMATCH);
2869 }
2870 break;
2871
2872 case PT_PC:
2873 for (i = 1; i <= min; i++)
2874 {
2875 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2876 GETCHARINCTEST(c, eptr);
2877 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2878 if ((prop_chartype == prop_value) == prop_fail_result)
2879 RRETURN(MATCH_NOMATCH);
2880 }
2881 break;
2882
2883 case PT_SC:
2884 for (i = 1; i <= min; i++)
2885 {
2886 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2887 GETCHARINCTEST(c, eptr);
2888 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2889 if ((prop_script == prop_value) == prop_fail_result)
2890 RRETURN(MATCH_NOMATCH);
2891 }
2892 break;
2893
2894 default:
2895 RRETURN(PCRE_ERROR_INTERNAL);
2896 }
2897 }
2898
2899 /* Match extended Unicode sequences. We will get here only if the
2900 support is in the binary; otherwise a compile-time error occurs. */
2901
2902 else if (ctype == OP_EXTUNI)
2903 {
2904 for (i = 1; i <= min; i++)
2905 {
2906 GETCHARINCTEST(c, eptr);
2907 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2908 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2909 while (eptr < md->end_subject)
2910 {
2911 int len = 1;
2912 if (!utf8) c = *eptr; else
2913 {
2914 GETCHARLEN(c, eptr, len);
2915 }
2916 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2917 if (prop_category != ucp_M) break;
2918 eptr += len;
2919 }
2920 }
2921 }
2922
2923 else
2924 #endif /* SUPPORT_UCP */
2925
2926 /* Handle all other cases when the coding is UTF-8 */
2927
2928 #ifdef SUPPORT_UTF8
2929 if (utf8) switch(ctype)
2930 {
2931 case OP_ANY:
2932 for (i = 1; i <= min; i++)
2933 {
2934 if (eptr >= md->end_subject ||
2935 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2936 RRETURN(MATCH_NOMATCH);
2937 eptr++;
2938 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2939 }
2940 break;
2941
2942 case OP_ANYBYTE:
2943 eptr += min;
2944 break;
2945
2946 case OP_ANYNL:
2947 for (i = 1; i <= min; i++)
2948 {
2949 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2950 GETCHARINC(c, eptr);
2951 switch(c)
2952 {
2953 default: RRETURN(MATCH_NOMATCH);
2954 case 0x000d:
2955 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2956 break;
2957 case 0x000a:
2958 case 0x000b:
2959 case 0x000c:
2960 case 0x0085:
2961 case 0x2028:
2962 case 0x2029:
2963 break;
2964 }
2965 }
2966 break;
2967
2968 case OP_NOT_HSPACE:
2969 for (i = 1; i <= min; i++)
2970 {
2971 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2972 GETCHARINC(c, eptr);
2973 switch(c)
2974 {
2975 default: break;
2976 case 0x09: /* HT */
2977 case 0x20: /* SPACE */
2978 case 0xa0: /* NBSP */
2979 case 0x1680: /* OGHAM SPACE MARK */
2980 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2981 case 0x2000: /* EN QUAD */
2982 case 0x2001: /* EM QUAD */
2983 case 0x2002: /* EN SPACE */
2984 case 0x2003: /* EM SPACE */
2985 case 0x2004: /* THREE-PER-EM SPACE */
2986 case 0x2005: /* FOUR-PER-EM SPACE */
2987 case 0x2006: /* SIX-PER-EM SPACE */
2988 case 0x2007: /* FIGURE SPACE */
2989 case 0x2008: /* PUNCTUATION SPACE */
2990 case 0x2009: /* THIN SPACE */
2991 case 0x200A: /* HAIR SPACE */
2992 case 0x202f: /* NARROW NO-BREAK SPACE */
2993 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2994 case 0x3000: /* IDEOGRAPHIC SPACE */
2995 RRETURN(MATCH_NOMATCH);
2996 }
2997 }
2998 break;
2999
3000 case OP_HSPACE:
3001 for (i = 1; i <= min; i++)
3002 {
3003 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3004 GETCHARINC(c, eptr);
3005 switch(c)
3006 {
3007 default: RRETURN(MATCH_NOMATCH);
3008 case 0x09: /* HT */
3009 case 0x20: /* SPACE */
3010 case 0xa0: /* NBSP */
3011 case 0x1680: /* OGHAM SPACE MARK */
3012 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3013 case 0x2000: /* EN QUAD */
3014 case 0x2001: /* EM QUAD */
3015 case 0x2002: /* EN SPACE */
3016 case 0x2003: /* EM SPACE */
3017 case 0x2004: /* THREE-PER-EM SPACE */
3018 case 0x2005: /* FOUR-PER-EM SPACE */
3019 case 0x2006: /* SIX-PER-EM SPACE */
3020 case 0x2007: /* FIGURE SPACE */
3021 case 0x2008: /* PUNCTUATION SPACE */
3022 case 0x2009: /* THIN SPACE */
3023 case 0x200A: /* HAIR SPACE */
3024 case 0x202f: /* NARROW NO-BREAK SPACE */
3025 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3026 case 0x3000: /* IDEOGRAPHIC SPACE */
3027 break;
3028 }
3029 }
3030 break;
3031
3032 case OP_NOT_VSPACE:
3033 for (i = 1; i <= min; i++)
3034 {
3035 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3036 GETCHARINC(c, eptr);
3037 switch(c)
3038 {
3039 default: break;
3040 case 0x0a: /* LF */
3041 case 0x0b: /* VT */
3042 case 0x0c: /* FF */
3043 case 0x0d: /* CR */
3044 case 0x85: /* NEL */
3045 case 0x2028: /* LINE SEPARATOR */
3046 case 0x2029: /* PARAGRAPH SEPARATOR */
3047 RRETURN(MATCH_NOMATCH);
3048 }
3049 }
3050 break;
3051
3052 case OP_VSPACE:
3053 for (i = 1; i <= min; i++)
3054 {
3055 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3056 GETCHARINC(c, eptr);
3057 switch(c)
3058 {
3059 default: RRETURN(MATCH_NOMATCH);
3060 case 0x0a: /* LF */
3061 case 0x0b: /* VT */
3062 case 0x0c: /* FF */
3063 case 0x0d: /* CR */
3064 case 0x85: /* NEL */
3065 case 0x2028: /* LINE SEPARATOR */
3066 case 0x2029: /* PARAGRAPH SEPARATOR */
3067 break;
3068 }
3069 }
3070 break;
3071
3072 case OP_NOT_DIGIT:
3073 for (i = 1; i <= min; i++)
3074 {
3075 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3076 GETCHARINC(c, eptr);
3077 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3078 RRETURN(MATCH_NOMATCH);
3079 }
3080 break;
3081
3082 case OP_DIGIT:
3083 for (i = 1; i <= min; i++)
3084 {
3085 if (eptr >= md->end_subject ||
3086 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3087 RRETURN(MATCH_NOMATCH);
3088 /* No need to skip more bytes - we know it's a 1-byte character */
3089 }
3090 break;
3091
3092 case OP_NOT_WHITESPACE:
3093 for (i = 1; i <= min; i++)
3094 {
3095 if (eptr >= md->end_subject ||
3096 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3097 RRETURN(MATCH_NOMATCH);
3098 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3099 }
3100 break;
3101
3102 case OP_WHITESPACE:
3103 for (i = 1; i <= min; i++)
3104 {
3105 if (eptr >= md->end_subject ||
3106 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3107 RRETURN(MATCH_NOMATCH);
3108 /* No need to skip more bytes - we know it's a 1-byte character */
3109 }
3110 break;
3111
3112 case OP_NOT_WORDCHAR:
3113 for (i = 1; i <= min; i++)
3114 {
3115 if (eptr >= md->end_subject ||
3116 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3117 RRETURN(MATCH_NOMATCH);
3118 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3119 }
3120 break;
3121
3122 case OP_WORDCHAR:
3123 for (i = 1; i <= min; i++)
3124 {
3125 if (eptr >= md->end_subject ||
3126 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3127 RRETURN(MATCH_NOMATCH);
3128 /* No need to skip more bytes - we know it's a 1-byte character */
3129 }
3130 break;
3131
3132 default:
3133 RRETURN(PCRE_ERROR_INTERNAL);
3134 } /* End switch(ctype) */
3135
3136 else
3137 #endif /* SUPPORT_UTF8 */
3138
3139 /* Code for the non-UTF-8 case for minimum matching of operators other
3140 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3141 number of bytes present, as this was tested above. */
3142
3143 switch(ctype)
3144 {
3145 case OP_ANY:
3146 if ((ims & PCRE_DOTALL) == 0)
3147 {
3148 for (i = 1; i <= min; i++)
3149 {
3150 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3151 eptr++;
3152 }
3153 }
3154 else eptr += min;
3155 break;
3156
3157 case OP_ANYBYTE:
3158 eptr += min;
3159 break;
3160
3161 /* Because of the CRLF case, we can't assume the minimum number of
3162 bytes are present in this case. */
3163
3164 case OP_ANYNL:
3165 for (i = 1; i <= min; i++)
3166 {
3167 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3168 switch(*eptr++)
3169 {
3170 default: RRETURN(MATCH_NOMATCH);
3171 case 0x000d:
3172 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3173 break;
3174 case 0x000a:
3175 case 0x000b:
3176 case 0x000c:
3177 case 0x0085:
3178 break;
3179 }
3180 }
3181 break;
3182
3183 case OP_NOT_HSPACE:
3184 for (i = 1; i <= min; i++)
3185 {
3186 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3187 switch(*eptr++)
3188 {
3189 default: break;
3190 case 0x09: /* HT */
3191 case 0x20: /* SPACE */
3192 case 0xa0: /* NBSP */
3193 RRETURN(MATCH_NOMATCH);
3194 }
3195 }
3196 break;
3197
3198 case OP_HSPACE:
3199 for (i = 1; i <= min; i++)
3200 {
3201 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3202 switch(*eptr++)
3203 {
3204 default: RRETURN(MATCH_NOMATCH);
3205 case 0x09: /* HT */
3206 case 0x20: /* SPACE */
3207 case 0xa0: /* NBSP */
3208 break;
3209 }
3210 }
3211 break;
3212
3213 case OP_NOT_VSPACE:
3214 for (i = 1; i <= min; i++)
3215 {
3216 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3217 switch(*eptr++)
3218 {
3219 default: break;
3220 case 0x0a: /* LF */
3221 case 0x0b: /* VT */
3222 case 0x0c: /* FF */
3223 case 0x0d: /* CR */
3224 case 0x85: /* NEL */
3225 RRETURN(MATCH_NOMATCH);
3226 }
3227 }
3228 break;
3229
3230 case OP_VSPACE:
3231 for (i = 1; i <= min; i++)
3232 {
3233 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3234 switch(*eptr++)
3235 {
3236 default: RRETURN(MATCH_NOMATCH);
3237 case 0x0a: /* LF */
3238 case 0x0b: /* VT */
3239 case 0x0c: /* FF */
3240 case 0x0d: /* CR */
3241 case 0x85: /* NEL */
3242 break;
3243 }
3244 }
3245 break;
3246
3247 case OP_NOT_DIGIT:
3248 for (i = 1; i <= min; i++)
3249 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3250 break;
3251
3252 case OP_DIGIT:
3253 for (i = 1; i <= min; i++)
3254 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3255 break;
3256
3257 case OP_NOT_WHITESPACE:
3258 for (i = 1; i <= min; i++)
3259 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3260 break;
3261
3262 case OP_WHITESPACE:
3263 for (i = 1; i <= min; i++)
3264 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3265 break;
3266
3267 case OP_NOT_WORDCHAR:
3268 for (i = 1; i <= min; i++)
3269 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3270 RRETURN(MATCH_NOMATCH);
3271 break;
3272
3273 case OP_WORDCHAR:
3274 for (i = 1; i <= min; i++)
3275 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3276 RRETURN(MATCH_NOMATCH);
3277 break;
3278
3279 default:
3280 RRETURN(PCRE_ERROR_INTERNAL);
3281 }
3282 }
3283
3284 /* If min = max, continue at the same level without recursing */
3285
3286 if (min == max) continue;
3287
3288 /* If minimizing, we have to test the rest of the pattern before each
3289 subsequent match. Again, separate the UTF-8 case for speed, and also
3290 separate the UCP cases. */
3291
3292 if (minimize)
3293 {
3294 #ifdef SUPPORT_UCP
3295 if (prop_type >= 0)
3296 {
3297 switch(prop_type)
3298 {
3299 case PT_ANY:
3300 for (fi = min;; fi++)
3301 {
3302 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3303 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3304 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3305 GETCHARINC(c, eptr);
3306 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3307 }
3308 /* Control never gets here */
3309
3310 case PT_LAMP:
3311 for (fi = min;; fi++)
3312 {
3313 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3314 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3315 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3316 GETCHARINC(c, eptr);
3317 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3318 if ((prop_chartype == ucp_Lu ||
3319 prop_chartype == ucp_Ll ||
3320 prop_chartype == ucp_Lt) == prop_fail_result)
3321 RRETURN(MATCH_NOMATCH);
3322 }
3323 /* Control never gets here */
3324
3325 case PT_GC:
3326 for (fi = min;; fi++)
3327 {
3328 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3329 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3330 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3331 GETCHARINC(c, eptr);
3332 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3333 if ((prop_category == prop_value) == prop_fail_result)
3334 RRETURN(MATCH_NOMATCH);
3335 }
3336 /* Control never gets here */
3337
3338 case PT_PC:
3339 for (fi = min;; fi++)
3340 {
3341 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3342 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3343 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3344 GETCHARINC(c, eptr);
3345 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3346 if ((prop_chartype == prop_value) == prop_fail_result)
3347 RRETURN(MATCH_NOMATCH);
3348 }
3349 /* Control never gets here */
3350
3351 case PT_SC:
3352 for (fi = min;; fi++)
3353 {
3354 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3355 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3356 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3357 GETCHARINC(c, eptr);
3358 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3359 if ((prop_script == prop_value) == prop_fail_result)
3360 RRETURN(MATCH_NOMATCH);
3361 }
3362 /* Control never gets here */
3363
3364 default:
3365 RRETURN(PCRE_ERROR_INTERNAL);
3366 }
3367 }
3368
3369 /* Match extended Unicode sequences. We will get here only if the
3370 support is in the binary; otherwise a compile-time error occurs. */
3371
3372 else if (ctype == OP_EXTUNI)
3373 {
3374 for (fi = min;; fi++)
3375 {
3376 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3377 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3378 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3379 GETCHARINCTEST(c, eptr);
3380 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3381 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3382 while (eptr < md->end_subject)
3383 {
3384 int len = 1;
3385 if (!utf8) c = *eptr; else
3386 {
3387 GETCHARLEN(c, eptr, len);
3388 }
3389 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3390 if (prop_category != ucp_M) break;
3391 eptr += len;
3392 }
3393 }
3394 }
3395
3396 else
3397 #endif /* SUPPORT_UCP */
3398
3399 #ifdef SUPPORT_UTF8
3400 /* UTF-8 mode */
3401 if (utf8)
3402 {
3403 for (fi = min;; fi++)
3404 {
3405 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3406 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3407 if (fi >= max || eptr >= md->end_subject ||
3408 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3409 IS_NEWLINE(eptr)))
3410 RRETURN(MATCH_NOMATCH);
3411
3412 GETCHARINC(c, eptr);
3413 switch(ctype)
3414 {
3415 case OP_ANY: /* This is the DOTALL case */
3416 break;
3417
3418 case OP_ANYBYTE:
3419 break;
3420
3421 case OP_ANYNL:
3422 switch(c)
3423 {
3424 default: RRETURN(MATCH_NOMATCH);
3425 case 0x000d:
3426 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3427 break;
3428 case 0x000a:
3429 case 0x000b:
3430 case 0x000c:
3431 case 0x0085:
3432 case 0x2028:
3433 case 0x2029:
3434 break;
3435 }
3436 break;
3437
3438 case OP_NOT_HSPACE:
3439 switch(c)
3440 {
3441 default: break;
3442 case 0x09: /* HT */
3443 case 0x20: /* SPACE */
3444 case 0xa0: /* NBSP */
3445 case 0x1680: /* OGHAM SPACE MARK */
3446 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3447 case 0x2000: /* EN QUAD */
3448 case 0x2001: /* EM QUAD */
3449 case 0x2002: /* EN SPACE */
3450 case 0x2003: /* EM SPACE */
3451 case 0x2004: /* THREE-PER-EM SPACE */
3452 case 0x2005: /* FOUR-PER-EM SPACE */
3453 case 0x2006: /* SIX-PER-EM SPACE */
3454 case 0x2007: /* FIGURE SPACE */
3455 case 0x2008: /* PUNCTUATION SPACE */
3456 case 0x2009: /* THIN SPACE */
3457 case 0x200A: /* HAIR SPACE */
3458 case 0x202f: /* NARROW NO-BREAK SPACE */
3459 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3460 case 0x3000: /* IDEOGRAPHIC SPACE */
3461 RRETURN(MATCH_NOMATCH);
3462 }
3463 break;
3464
3465 case OP_HSPACE:
3466 switch(c)
3467 {
3468 default: RRETURN(MATCH_NOMATCH);
3469 case 0x09: /* HT */
3470 case 0x20: /* SPACE */
3471 case 0xa0: /* NBSP */
3472 case 0x1680: /* OGHAM SPACE MARK */
3473 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3474 case 0x2000: /* EN QUAD */
3475 case 0x2001: /* EM QUAD */
3476 case 0x2002: /* EN SPACE */
3477 case 0x2003: /* EM SPACE */
3478 case 0x2004: /* THREE-PER-EM SPACE */
3479 case 0x2005: /* FOUR-PER-EM SPACE */
3480 case 0x2006: /* SIX-PER-EM SPACE */
3481 case 0x2007: /* FIGURE SPACE */
3482 case 0x2008: /* PUNCTUATION SPACE */
3483 case 0x2009: /* THIN SPACE */
3484 case 0x200A: /* HAIR SPACE */
3485 case 0x202f: /* NARROW NO-BREAK SPACE */
3486 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3487 case 0x3000: /* IDEOGRAPHIC SPACE */
3488 break;
3489 }
3490 break;
3491
3492 case OP_NOT_VSPACE:
3493 switch(c)
3494 {
3495 default: break;
3496 case 0x0a: /* LF */
3497 case 0x0b: /* VT */
3498 case 0x0c: /* FF */
3499 case 0x0d: /* CR */
3500 case 0x85: /* NEL */
3501 case 0x2028: /* LINE SEPARATOR */
3502 case 0x2029: /* PARAGRAPH SEPARATOR */
3503 RRETURN(MATCH_NOMATCH);
3504 }
3505 break;
3506
3507 case OP_VSPACE:
3508 switch(c)
3509 {
3510 default: RRETURN(MATCH_NOMATCH);
3511 case 0x0a: /* LF */
3512 case 0x0b: /* VT */
3513 case 0x0c: /* FF */
3514 case 0x0d: /* CR */
3515 case 0x85: /* NEL */
3516 case 0x2028: /* LINE SEPARATOR */
3517 case 0x2029: /* PARAGRAPH SEPARATOR */
3518 break;
3519 }
3520 break;
3521
3522 case OP_NOT_DIGIT:
3523 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3524 RRETURN(MATCH_NOMATCH);
3525 break;
3526
3527 case OP_DIGIT:
3528 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3529 RRETURN(MATCH_NOMATCH);
3530 break;
3531
3532 case OP_NOT_WHITESPACE:
3533 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3534 RRETURN(MATCH_NOMATCH);
3535 break;
3536
3537 case OP_WHITESPACE:
3538 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3539 RRETURN(MATCH_NOMATCH);
3540 break;
3541
3542 case OP_NOT_WORDCHAR:
3543 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3544 RRETURN(MATCH_NOMATCH);
3545 break;
3546
3547 case OP_WORDCHAR:
3548 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3549 RRETURN(MATCH_NOMATCH);
3550 break;
3551
3552 default:
3553 RRETURN(PCRE_ERROR_INTERNAL);
3554 }
3555 }
3556 }
3557 else
3558 #endif
3559 /* Not UTF-8 mode */
3560 {
3561 for (fi = min;; fi++)
3562 {
3563 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3564 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3565 if (fi >= max || eptr >= md->end_subject ||
3566 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3567 RRETURN(MATCH_NOMATCH);
3568
3569 c = *eptr++;
3570 switch(ctype)
3571 {
3572 case OP_ANY: /* This is the DOTALL case */
3573 break;
3574
3575 case OP_ANYBYTE:
3576 break;
3577
3578 case OP_ANYNL:
3579 switch(c)
3580 {
3581 default: RRETURN(MATCH_NOMATCH);
3582 case 0x000d:
3583 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3584 break;
3585 case 0x000a:
3586 case 0x000b:
3587 case 0x000c:
3588 case 0x0085:
3589 break;
3590 }
3591 break;
3592
3593 case OP_NOT_HSPACE:
3594 switch(c)
3595 {
3596 default: break;
3597 case 0x09: /* HT */
3598 case 0x20: /* SPACE */
3599 case 0xa0: /* NBSP */
3600 RRETURN(MATCH_NOMATCH);
3601 }
3602 break;
3603
3604 case OP_HSPACE:
3605 switch(c)
3606 {
3607 default: RRETURN(MATCH_NOMATCH);
3608 case 0x09: /* HT */
3609 case 0x20: /* SPACE */
3610 case 0xa0: /* NBSP */
3611 break;
3612 }
3613 break;
3614
3615 case OP_NOT_VSPACE:
3616 switch(c)
3617 {
3618 default: break;
3619 case 0x0a: /* LF */
3620 case 0x0b: /* VT */
3621 case 0x0c: /* FF */
3622 case 0x0d: /* CR */
3623 case 0x85: /* NEL */
3624 RRETURN(MATCH_NOMATCH);
3625 }
3626 break;
3627
3628 case OP_VSPACE:
3629 switch(c)
3630 {
3631 default: RRETURN(MATCH_NOMATCH);
3632 case 0x0a: /* LF */
3633 case 0x0b: /* VT */
3634 case 0x0c: /* FF */
3635 case 0x0d: /* CR */
3636 case 0x85: /* NEL */
3637 break;
3638 }
3639 break;
3640
3641 case OP_NOT_DIGIT:
3642 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3643 break;
3644
3645 case OP_DIGIT:
3646 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3647 break;
3648
3649 case OP_NOT_WHITESPACE:
3650 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3651 break;
3652
3653 case OP_WHITESPACE:
3654 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3655 break;
3656
3657 case OP_NOT_WORDCHAR:
3658 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3659 break;
3660
3661 case OP_WORDCHAR:
3662 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3663 break;
3664
3665 default:
3666 RRETURN(PCRE_ERROR_INTERNAL);
3667 }
3668 }
3669 }
3670 /* Control never gets here */
3671 }
3672
3673 /* If maximizing, it is worth using inline code for speed, doing the type
3674 test once at the start (i.e. keep it out of the loop). Again, keep the
3675 UTF-8 and UCP stuff separate. */
3676
3677 else
3678 {
3679 pp = eptr; /* Remember where we started */
3680
3681 #ifdef SUPPORT_UCP
3682 if (prop_type >= 0)
3683 {
3684 switch(prop_type)
3685 {
3686 case PT_ANY:
3687 for (i = min; i < max; i++)
3688 {
3689 int len = 1;
3690 if (eptr >= md->end_subject) break;
3691 GETCHARLEN(c, eptr, len);
3692 if (prop_fail_result) break;
3693 eptr+= len;
3694 }
3695 break;
3696
3697 case PT_LAMP:
3698 for (i = min; i < max; i++)
3699 {
3700 int len = 1;
3701 if (eptr >= md->end_subject) break;
3702 GETCHARLEN(c, eptr, len);
3703 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3704 if ((prop_chartype == ucp_Lu ||
3705 prop_chartype == ucp_Ll ||
3706 prop_chartype == ucp_Lt) == prop_fail_result)
3707 break;
3708 eptr+= len;
3709 }
3710 break;
3711
3712 case PT_GC:
3713 for (i = min; i < max; i++)
3714 {
3715 int len = 1;
3716 if (eptr >= md->end_subject) break;
3717 GETCHARLEN(c, eptr, len);
3718 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3719 if ((prop_category == prop_value) == prop_fail_result)
3720 break;
3721 eptr+= len;
3722 }
3723 break;
3724
3725 case PT_PC:
3726 for (i = min; i < max; i++)
3727 {
3728 int len = 1;
3729 if (eptr >= md->end_subject) break;
3730 GETCHARLEN(c, eptr, len);
3731 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3732 if ((prop_chartype == prop_value) == prop_fail_result)
3733 break;
3734 eptr+= len;
3735 }
3736 break;
3737
3738 case PT_SC:
3739 for (i = min; i < max; i++)
3740 {
3741 int len = 1;
3742 if (eptr >= md->end_subject) break;
3743 GETCHARLEN(c, eptr, len);
3744 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3745 if ((prop_script == prop_value) == prop_fail_result)
3746 break;
3747 eptr+= len;
3748 }
3749 break;
3750 }
3751
3752 /* eptr is now past the end of the maximum run */
3753
3754 if (possessive) continue;
3755 for(;;)
3756 {
3757 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3758 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3759 if (eptr-- == pp) break; /* Stop if tried at original pos */
3760 if (utf8) BACKCHAR(eptr);
3761 }
3762 }
3763
3764 /* Match extended Unicode sequences. We will get here only if the
3765 support is in the binary; otherwise a compile-time error occurs. */
3766
3767 else if (ctype == OP_EXTUNI)
3768 {
3769 for (i = min; i < max; i++)
3770 {
3771 if (eptr >= md->end_subject) break;
3772 GETCHARINCTEST(c, eptr);
3773 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3774 if (prop_category == ucp_M) break;
3775 while (eptr < md->end_subject)
3776 {
3777 int len = 1;
3778 if (!utf8) c = *eptr; else
3779 {
3780 GETCHARLEN(c, eptr, len);
3781 }
3782 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3783 if (prop_category != ucp_M) break;
3784 eptr += len;
3785 }
3786 }
3787
3788 /* eptr is now past the end of the maximum run */
3789
3790 if (possessive) continue;
3791 for(;;)
3792 {
3793 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3794 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3795 if (eptr-- == pp) break; /* Stop if tried at original pos */
3796 for (;;) /* Move back over one extended */
3797 {
3798 int len = 1;
3799 if (!utf8) c = *eptr; else
3800 {
3801 BACKCHAR(eptr);
3802 GETCHARLEN(c, eptr, len);
3803 }
3804 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3805 if (prop_category != ucp_M) break;
3806 eptr--;
3807 }
3808 }
3809 }
3810
3811 else
3812 #endif /* SUPPORT_UCP */
3813
3814 #ifdef SUPPORT_UTF8
3815 /* UTF-8 mode */
3816
3817 if (utf8)
3818 {
3819 switch(ctype)
3820 {
3821 case OP_ANY:
3822 if (max < INT_MAX)
3823 {
3824 if ((ims & PCRE_DOTALL) == 0)
3825 {
3826 for (i = min; i < max; i++)
3827 {
3828 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3829 eptr++;
3830 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3831 }
3832 }
3833 else
3834 {
3835 for (i = min; i < max; i++)
3836 {
3837 if (eptr >= md->end_subject) break;
3838 eptr++;
3839 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3840 }
3841 }
3842 }
3843
3844 /* Handle unlimited UTF-8 repeat */
3845
3846 else
3847 {
3848 if ((ims & PCRE_DOTALL) == 0)
3849 {
3850 for (i = min; i < max; i++)
3851 {
3852 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3853 eptr++;
3854 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3855 }
3856 }
3857 else
3858 {
3859 eptr = md->end_subject;
3860 }
3861 }
3862 break;
3863
3864 /* The byte case is the same as non-UTF8 */
3865
3866 case OP_ANYBYTE:
3867 c = max - min;
3868 if (c > (unsigned int)(md->end_subject - eptr))
3869 c = md->end_subject - eptr;
3870 eptr += c;
3871 break;
3872
3873 case OP_ANYNL:
3874 for (i = min; i < max; i++)
3875 {
3876 int len = 1;
3877 if (eptr >= md->end_subject) break;
3878 GETCHARLEN(c, eptr, len);
3879 if (c == 0x000d)
3880 {
3881 if (++eptr >= md->end_subject) break;
3882 if (*eptr == 0x000a) eptr++;
3883 }
3884 else
3885 {
3886 if (c != 0x000a && c != 0x000b && c != 0x000c &&
3887 c != 0x0085 && c != 0x2028 && c != 0x2029)
3888 break;
3889 eptr += len;
3890 }
3891 }
3892 break;
3893
3894 case OP_NOT_HSPACE:
3895 case OP_HSPACE:
3896 for (i = min; i < max; i++)
3897 {
3898 BOOL gotspace;
3899 int len = 1;
3900 if (eptr >= md->end_subject) break;
3901 GETCHARLEN(c, eptr, len);
3902 switch(c)
3903 {
3904 default: gotspace = FALSE; break;
3905 case 0x09: /* HT */
3906 case 0x20: /* SPACE */
3907 case 0xa0: /* NBSP */
3908 case 0x1680: /* OGHAM SPACE MARK */
3909 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3910 case 0x2000: /* EN QUAD */
3911 case 0x2001: /* EM QUAD */
3912 case 0x2002: /* EN SPACE */
3913 case 0x2003: /* EM SPACE */
3914 case 0x2004: /* THREE-PER-EM SPACE */
3915 case 0x2005: /* FOUR-PER-EM SPACE */
3916 case 0x2006: /* SIX-PER-EM SPACE */
3917 case 0x2007: /* FIGURE SPACE */
3918 case 0x2008: /* PUNCTUATION SPACE */
3919 case 0x2009: /* THIN SPACE */
3920 case 0x200A: /* HAIR SPACE */
3921 case 0x202f: /* NARROW NO-BREAK SPACE */
3922 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3923 case 0x3000: /* IDEOGRAPHIC SPACE */
3924 gotspace = TRUE;
3925 break;
3926 }
3927 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3928 eptr += len;
3929 }
3930 break;
3931
3932 case OP_NOT_VSPACE:
3933 case OP_VSPACE:
3934 for (i = min; i < max; i++)
3935 {
3936 BOOL gotspace;
3937 int len = 1;
3938 if (eptr >= md->end_subject) break;
3939 GETCHARLEN(c, eptr, len);
3940 switch(c)
3941 {
3942 default: gotspace = FALSE; break;
3943 case 0x0a: /* LF */
3944 case 0x0b: /* VT */
3945 case 0x0c: /* FF */
3946 case 0x0d: /* CR */
3947 case 0x85: /* NEL */
3948 case 0x2028: /* LINE SEPARATOR */
3949 case 0x2029: /* PARAGRAPH SEPARATOR */
3950 gotspace = TRUE;
3951 break;
3952 }
3953 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3954 eptr += len;
3955 }
3956 break;
3957
3958 case OP_NOT_DIGIT:
3959 for (i = min; i < max; i++)
3960 {
3961 int len = 1;
3962 if (eptr >= md->end_subject) break;
3963 GETCHARLEN(c, eptr, len);
3964 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3965 eptr+= len;
3966 }
3967 break;
3968
3969 case OP_DIGIT:
3970 for (i = min; i < max; i++)
3971 {
3972 int len = 1;
3973 if (eptr >= md->end_subject) break;
3974 GETCHARLEN(c, eptr, len);
3975 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3976 eptr+= len;
3977 }
3978 break;
3979
3980 case OP_NOT_WHITESPACE:
3981 for (i = min; i < max; i++)
3982 {
3983 int len = 1;
3984 if (eptr >= md->end_subject) break;
3985 GETCHARLEN(c, eptr, len);
3986 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3987 eptr+= len;
3988 }
3989 break;
3990
3991 case OP_WHITESPACE:
3992 for (i = min; i < max; i++)
3993 {
3994 int len = 1;
3995 if (eptr >= md->end_subject) break;
3996 GETCHARLEN(c, eptr, len);
3997 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3998 eptr+= len;
3999 }
4000 break;
4001
4002 case OP_NOT_WORDCHAR:
4003 for (i = min; i < max; i++)
4004 {
4005 int len = 1;
4006 if (eptr >= md->end_subject) break;
4007 GETCHARLEN(c, eptr, len);
4008 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4009 eptr+= len;
4010 }
4011 break;
4012
4013 case OP_WORDCHAR:
4014 for (i = min; i < max; i++)
4015 {
4016 int len = 1;
4017 if (eptr >= md->end_subject) break;
4018 GETCHARLEN(c, eptr, len);
4019 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4020 eptr+= len;
4021 }
4022 break;
4023
4024 default:
4025 RRETURN(PCRE_ERROR_INTERNAL);
4026 }
4027
4028 /* eptr is now past the end of the maximum run */
4029
4030 if (possessive) continue;
4031 for(;;)
4032 {
4033 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4034 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4035 if (eptr-- == pp) break; /* Stop if tried at original pos */
4036 BACKCHAR(eptr);
4037 }
4038 }
4039 else
4040 #endif /* SUPPORT_UTF8 */
4041
4042 /* Not UTF-8 mode */
4043 {
4044 switch(ctype)
4045 {
4046 case OP_ANY:
4047 if ((ims & PCRE_DOTALL) == 0)
4048 {
4049 for (i = min; i < max; i++)
4050 {
4051 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4052 eptr++;
4053 }
4054 break;
4055 }
4056 /* For DOTALL case, fall through and treat as \C */
4057
4058 case OP_ANYBYTE:
4059 c = max - min;
4060 if (c > (unsigned int)(md->end_subject - eptr))
4061 c = md->end_subject - eptr;
4062 eptr += c;
4063 break;
4064
4065 case OP_ANYNL:
4066 for (i = min; i < max; i++)
4067 {
4068 if (eptr >= md->end_subject) break;
4069 c = *eptr;
4070 if (c == 0x000d)
4071 {
4072 if (++eptr >= md->end_subject) break;
4073 if (*eptr == 0x000a) eptr++;
4074 }
4075 else
4076 {
4077 if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
4078 break;
4079 eptr++;
4080 }
4081 }
4082 break;
4083
4084 case OP_NOT_HSPACE:
4085 for (i = min; i < max; i++)
4086 {
4087 if (eptr >= md->end_subject) break;
4088 c = *eptr;
4089 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4090 eptr++;
4091 }
4092 break;
4093
4094 case OP_HSPACE:
4095 for (i = min; i < max; i++)
4096 {
4097 if (eptr >= md->end_subject) break;
4098 c = *eptr;
4099 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4100 eptr++;
4101 }
4102 break;
4103
4104 case OP_NOT_VSPACE:
4105 for (i = min; i < max; i++)
4106 {
4107 if (eptr >= md->end_subject) break;
4108 c = *eptr;
4109 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4110 break;
4111 eptr++;
4112 }
4113 break;
4114
4115 case OP_VSPACE:
4116 for (i = min; i < max; i++)
4117 {
4118 if (eptr >= md->end_subject) break;
4119 c = *eptr;
4120 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4121 break;
4122 eptr++;
4123 }
4124 break;
4125
4126 case OP_NOT_DIGIT:
4127 for (i = min; i < max; i++)
4128 {
4129 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4130 break;
4131 eptr++;
4132 }
4133 break;
4134
4135 case OP_DIGIT:
4136 for (i = min; i < max; i++)
4137 {
4138 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4139 break;
4140 eptr++;
4141 }
4142 break;
4143
4144 case OP_NOT_WHITESPACE:
4145 for (i = min; i < max; i++)
4146 {
4147 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4148 break;
4149 eptr++;
4150 }
4151 break;
4152
4153 case OP_WHITESPACE:
4154 for (i = min; i < max; i++)
4155 {
4156 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4157 break;
4158 eptr++;
4159 }
4160 break;
4161
4162 case OP_NOT_WORDCHAR:
4163 for (i = min; i < max; i++)
4164 {
4165 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4166 break;
4167 eptr++;
4168 }
4169 break;
4170
4171 case OP_WORDCHAR:
4172 for (i = min; i < max; i++)
4173 {
4174 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4175 break;
4176 eptr++;
4177 }
4178 break;
4179
4180 default:
4181 RRETURN(PCRE_ERROR_INTERNAL);
4182 }
4183
4184 /* eptr is now past the end of the maximum run */
4185
4186 if (possessive) continue;
4187 while (eptr >= pp)
4188 {
4189 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4190 eptr--;
4191 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4192 }
4193 }
4194
4195 /* Get here if we can't make it match with any permitted repetitions */
4196
4197 RRETURN(MATCH_NOMATCH);
4198 }
4199 /* Control never gets here */
4200
4201 /* There's been some horrible disaster. Arrival here can only mean there is
4202 something seriously wrong in the code above or the OP_xxx definitions. */
4203
4204 default:
4205 DPRINTF(("Unknown opcode %d\n", *ecode));
4206 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4207 }
4208
4209 /* Do not stick any code in here without much thought; it is assumed
4210 that "continue" in the code above comes out to here to repeat the main
4211 loop. */
4212
4213 } /* End of main loop */
4214 /* Control never reaches here */
4215
4216
4217 /* When compiling to use the heap rather than the stack for recursive calls to
4218 match(), the RRETURN() macro jumps here. The number that is saved in
4219 frame->Xwhere indicates which label we actually want to return to. */
4220
4221 #ifdef NO_RECURSE
4222 #define LBL(val) case val: goto L_RM##val;
4223 HEAP_RETURN:
4224 switch (frame->Xwhere)
4225 {
4226 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4227 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
4228 LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
4229 LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
4230 LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39) LBL(40)
4231 LBL(41) LBL(42) LBL(43) LBL(44) LBL(45) LBL(46) LBL(47) LBL(48)
4232 LBL(49) LBL(50) LBL(51) LBL(52) LBL(53) LBL(54)
4233 default:
4234 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4235 return PCRE_ERROR_INTERNAL;
4236 }
4237 #undef LBL
4238 #endif /* NO_RECURSE */
4239 }
4240
4241
4242 /***************************************************************************
4243 ****************************************************************************
4244 RECURSION IN THE match() FUNCTION
4245
4246 Undefine all the macros that were defined above to handle this. */
4247
4248 #ifdef NO_RECURSE /* Only the NO_RECURSE build has the names below as macros to clean up. */
4249 #undef eptr
4250 #undef ecode
4251 #undef mstart
4252 #undef offset_top
4253 #undef ims
4254 #undef eptrb
4255 #undef flags
4256 /* NOTE(review): the matching #defines are outside this view; presumably each name maps onto a field of the heap frame (cf. frame->Xwhere) - confirm. */
4257 #undef callpat
4258 #undef charptr
4259 #undef data
4260 #undef next
4261 #undef pp
4262 #undef prev
4263 #undef saved_eptr
4264
4265 #undef new_recursive
4266
4267 #undef cur_is_word
4268 #undef condition
4269 #undef prev_is_word
4270
4271 #undef original_ims
4272
4273 #undef ctype
4274 #undef length
4275 #undef max
4276 #undef min
4277 #undef number
4278 #undef offset
4279 #undef op
4280 #undef save_capture_last
4281 #undef save_offset1
4282 #undef save_offset2
4283 #undef save_offset3
4284 #undef stacksave
4285
4286 #undef newptrb
4287 /* Why this matters: pcre_exec() below declares its own `length`, `ims` and `flags`; a macro left defined under any of those names would rewrite those declarations. */
4288 #endif /* NO_RECURSE */
4289
4290 /* fc and fi are defined as macros whether or not NO_RECURSE is set, so they are undefined unconditionally. */
4291
4292 #undef fc
4293 #undef fi
4294
4295 /***************************************************************************
4296 ***************************************************************************/
4297
4298
4299
4300 /*************************************************
4301 * Execute a Regular Expression *
4302 *************************************************/
4303
4304 /* This function applies a compiled re to a subject string and picks out
4305 portions of the string if it matches. Two elements in the vector are set for
4306 each substring: the offsets to the start and end of the substring.
4307
4308 Arguments:
4309 argument_re points to the compiled expression
4310 extra_data points to extra data or is NULL
4311 subject points to the subject string
4312 length length of subject string (may contain binary zeros)
4313 start_offset where to start in the subject string
4314 options option bits
4315 offsets points to a vector of ints to be filled in with offsets
4316 offsetcount the number of elements in the vector
4317
4318 Returns: > 0 => success; value is the number of elements filled in
4319 = 0 => success, but offsets is not big enough
4320 -1 => failed to match
4321 < -1 => some kind of unexpected problem
4322 */
4323
4324 PCRE_EXP_DEFN int
4325 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4326 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4327 int offsetcount)
4328 {
4329 int rc, resetcount, ocount;
4330 int first_byte = -1;
4331 int req_byte = -1;
4332 int req_byte2 = -1;
4333 int newline;
4334 unsigned long int ims;
4335 BOOL using_temporary_offsets = FALSE;
4336 BOOL anchored;
4337 BOOL startline;
4338 BOOL firstline;
4339 BOOL first_byte_caseless = FALSE;
4340 BOOL req_byte_caseless = FALSE;
4341 BOOL utf8;
4342 match_data match_block;
4343 match_data *md = &match_block;
4344 const uschar *tables;
4345 const uschar *start_bits = NULL;
4346 USPTR start_match = (USPTR)subject + start_offset;
4347 USPTR end_subject;
4348 USPTR req_byte_ptr = start_match - 1;
4349
4350 pcre_study_data internal_study;
4351 const pcre_study_data *study;
4352
4353 real_pcre internal_re;
4354 const real_pcre *external_re = (const real_pcre *)argument_re;
4355 const real_pcre *re = external_re;
4356
4357 /* Plausibility checks */
4358
4359 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4360 if (re == NULL || subject == NULL ||
4361 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4362 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4363
4364 /* Fish out the optional data from the extra_data structure, first setting
4365 the default values. */
4366
4367 study = NULL;
4368 md->match_limit = MATCH_LIMIT;
4369 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4370 md->callout_data = NULL;
4371
4372 /* The table pointer is always in native byte order. */
4373
4374 tables = external_re->tables;
4375
4376 if (extra_data != NULL)
4377 {
4378 register unsigned int flags = extra_data->flags;
4379 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4380 study = (const pcre_study_data *)extra_data->study_data;
4381 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4382 md->match_limit = extra_data->match_limit;
4383 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4384 md->match_limit_recursion = extra_data->match_limit_recursion;
4385 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4386 md->callout_data = extra_data->callout_data;
4387 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4388 }
4389
4390 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4391 is a feature that makes it possible to save compiled regex and re-use them
4392 in other programs later. */
4393
4394 if (tables == NULL) tables = _pcre_default_tables;
4395
4396 /* Check that the first field in the block is the magic number. If it is not,
4397 test for a regex that was compiled on a host of opposite endianness. If this is
4398 the case, flipped values are put in internal_re and internal_study if there was
4399 study data too. */
4400
4401 if (re->magic_number != MAGIC_NUMBER)
4402 {
4403 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4404 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4405 if (study != NULL) study = &internal_study;
4406 }
4407
4408 /* Set up other data */
4409
4410 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4411 startline = (re->options & PCRE_STARTLINE) != 0;
4412 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4413
4414 /* The code starts after the real_pcre block and the capture name table. */
4415
4416 md->start_code = (const uschar *)external_re + re->name_table_offset +
4417 re->name_count * re->name_entry_size;
4418
4419 md->start_subject = (USPTR)subject;
4420 md->start_offset = start_offset;
4421 md->end_subject = md->start_subject + length;
4422 end_subject = md->end_subject;
4423
4424 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4425 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4426
4427 md->notbol = (options & PCRE_NOTBOL) != 0;
4428 md->noteol = (options & PCRE_NOTEOL) != 0;
4429 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4430 md->partial = (options & PCRE_PARTIAL) != 0;
4431 md->hitend = FALSE;
4432
4433 md->recursive = NULL; /* No recursion at top level */
4434
4435 md->lcc = tables + lcc_offset;
4436 md->ctypes = tables + ctypes_offset;
4437
4438 /* Handle different types of newline. The three bits give eight cases. If
4439 nothing is set at run time, whatever was used at compile time applies. */
4440
4441 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
4442 PCRE_NEWLINE_BITS)
4443 {
4444 case 0: newline = NEWLINE; break; /* Compile-time default */
4445 case PCRE_NEWLINE_CR: newline = '\r'; break;
4446 case PCRE_NEWLINE_LF: newline = '\n'; break;
4447 case PCRE_NEWLINE_CR+
4448 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4449 case PCRE_NEWLINE_ANY: newline = -1; break;
4450 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4451 default: return PCRE_ERROR_BADNEWLINE;
4452 }
4453
4454 if (newline == -2)
4455 {
4456 md->nltype = NLTYPE_ANYCRLF;
4457 }
4458 else if (newline < 0)
4459 {
4460 md->nltype = NLTYPE_ANY;
4461 }
4462 else
4463 {
4464 md->nltype = NLTYPE_FIXED;
4465 if (newline > 255)
4466 {
4467 md->nllen = 2;
4468 md->nl[0] = (newline >> 8) & 255;
4469 md->nl[1] = newline & 255;
4470 }
4471 else
4472 {
4473 md->nllen = 1;
4474 md->nl[0] = newline;
4475 }
4476 }
4477
4478 /* Partial matching is supported only for a restricted set of regexes at the
4479 moment. */
4480
4481 if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
4482 return PCRE_ERROR_BADPARTIAL;
4483
4484 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4485 back the character offset. */
4486
4487 #ifdef SUPPORT_UTF8
4488 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4489 {
4490 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4491 return PCRE_ERROR_BADUTF8;
4492 if (start_offset > 0 && start_offset < length)
4493 {
4494 int tb = ((uschar *)subject)[start_offset];
4495 if (tb > 127)
4496 {
4497 tb &= 0xc0;
4498 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4499 }
4500 }
4501 }
4502 #endif
4503
4504 /* The ims options can vary during the matching as a result of the presence
4505 of (?ims) items in the pattern. They are kept in a local variable so that
4506 restoring at the exit of a group is easy. */
4507
4508 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4509
4510 /* If the expression has got more back references than the offsets supplied can
4511 hold, we get a temporary chunk of working store to use during the matching.
4512 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4513 of 3. */
4514
4515 ocount = offsetcount - (offsetcount % 3);
4516
4517 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4518 {
4519 ocount = re->top_backref * 3 + 3;
4520 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4521 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4522 using_temporary_offsets = TRUE;
4523 DPRINTF(("Got memory to hold back references\n"));
4524 }
4525 else md->offset_vector = offsets;
4526
4527 md->offset_end = ocount;
4528 md->offset_max = (2*ocount)/3;
4529 md->offset_overflow = FALSE;
4530 md->capture_last = -1;
4531
4532 /* Compute the minimum number of offsets that we need to reset each time. Doing
4533 this makes a huge difference to execution time when there aren't many brackets
4534 in the pattern. */
4535
4536 resetcount = 2 + re->top_bracket * 2;
4537 if (resetcount > offsetcount) resetcount = ocount;
4538
4539 /* Reset the working variable associated with each extraction. These should
4540 never be used unless previously set, but they get saved and restored, and so we
4541 initialize them to avoid reading uninitialized locations. */
4542
4543 if (md->offset_vector != NULL)
4544 {
4545 register int *iptr = md->offset_vector + ocount;
4546 register int *iend = iptr - resetcount/2 + 1;
4547 while (--iptr >= iend) *iptr = -1;
4548 }
4549
4550 /* Set up the first character to match, if available. The first_byte value is
4551 never set for an anchored regular expression, but the anchoring may be forced
4552 at run time, so we have to test for anchoring. The first char may be unset for
4553 an unanchored pattern, of course. If there's no first char and the pattern was
4554 studied, there may be a bitmap of possible first characters. */
4555
4556 if (!anchored)
4557 {
4558 if ((re->options & PCRE_FIRSTSET) != 0)
4559 {
4560 first_byte = re->first_byte & 255;
4561 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4562 first_byte = md->lcc[first_byte];
4563 }
4564 else
4565 if (!startline && study != NULL &&
4566 (study->options & PCRE_STUDY_MAPPED) != 0)
4567 start_bits = study->start_bits;
4568 }
4569
4570 /* For anchored or unanchored matches, there may be a "last known required
4571 character" set. */
4572
4573 if ((re->options & PCRE_REQCHSET) != 0)
4574 {
4575 req_byte = re->req_byte & 255;
4576 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4577 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4578 }
4579
4580
4581 /* ==========================================================================*/
4582
4583 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4584 the loop runs just once. */
4585
4586 for(;;)
4587 {
4588 USPTR save_end_subject = end_subject;
4589 USPTR new_start_match;
4590
4591 /* Reset the maximum number of extractions we might see. */
4592
4593 if (md->offset_vector != NULL)
4594 {
4595 register int *iptr = md->offset_vector;
4596 register int *iend = iptr + resetcount;
4597 while (iptr < iend) *iptr++ = -1;
4598 }
4599
4600 /* Advance to a unique first char if possible. If firstline is TRUE, the
4601 start of the match is constrained to the first line of a multiline string.
4602 That is, the match must be before or at the first newline. Implement this by
4603 temporarily adjusting end_subject so that we stop scanning at a newline. If
4604 the match fails at the newline, later code breaks this loop. */
4605
4606 if (firstline)
4607 {
4608 USPTR t = start_match;
4609 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4610 end_subject = t;
4611 }
4612
4613 /* Now test for a unique first byte */
4614
4615 if (first_byte >= 0)
4616 {
4617 if (first_byte_caseless)
4618 while (start_match < end_subject &&
4619 md->lcc[*start_match] != first_byte)
4620 start_match++;
4621 else
4622 while (start_match < end_subject && *start_match != first_byte)
4623 start_match++;
4624 }
4625
4626 /* Or to just after a linebreak for a multiline match if possible */
4627
4628 else if (startline)
4629 {
4630 if (start_match > md->start_subject + start_offset)
4631 {
4632 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4633 start_match++;
4634
4635 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4636 and we are now at a LF, advance the match position by one more character.
4637 */
4638
4639 if (start_match[-1] == '\r' &&
4640 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4641 start_match < end_subject &&
4642 *start_match == '\n')
4643 start_match++;
4644 }
4645 }
4646
4647 /* Or to a non-unique first char after study */
4648
4649 else if (start_bits != NULL)
4650 {
4651 while (start_match < end_subject)
4652 {
4653 register unsigned int c = *start_match;
4654 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4655 }
4656 }
4657
4658 /* Restore fudged end_subject */
4659
4660 end_subject = save_end_subject;
4661
4662 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4663 printf(">>>> Match against: ");
4664 pchars(start_match, end_subject - start_match, TRUE, md);
4665 printf("\n");
4666 #endif
4667
4668 /* If req_byte is set, we know that that character must appear in the subject
4669 for the match to succeed. If the first character is set, req_byte must be
4670 later in the subject; otherwise the test starts at the match point. This
4671 optimization can save a huge amount of backtracking in patterns with nested
4672 unlimited repeats that aren't going to match. Writing separate code for
4673 cased/caseless versions makes it go faster, as does using an autoincrement
4674 and backing off on a match.
4675
4676 HOWEVER: when the subject string is very, very long, searching to its end can
4677 take a long time, and give bad performance on quite ordinary patterns. This
4678 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4679 string... so we don't do this when the string is sufficiently long.
4680
4681 ALSO: this processing is disabled when partial matching is requested.
4682 */
4683
4684 if (req_byte >= 0 &&
4685 end_subject - start_match < REQ_BYTE_MAX &&
4686 !md->partial)
4687 {
4688 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4689
4690 /* We don't need to repeat the search if we haven't yet reached the
4691 place we found it at last time. */
4692
4693 if (p > req_byte_ptr)
4694 {
4695 if (req_byte_caseless)
4696 {
4697 while (p < end_subject)
4698 {
4699 register int pp = *p++;
4700 if (pp == req_byte || pp == req_byte2) { p--; break; }
4701 }
4702 }
4703 else
4704 {
4705 while (p < end_subject)
4706 {
4707 if (*p++ == req_byte) { p--; break; }
4708 }
4709 }
4710
4711 /* If we can't find the required character, break the matching loop,
4712 forcing a match failure. */
4713
4714 if (p >= end_subject)
4715 {
4716 rc = MATCH_NOMATCH;
4717 break;
4718 }
4719
4720 /* If we have found the required character, save the point where we
4721 found it, so that we don't search again next time round the loop if
4722 the start hasn't passed this character yet. */
4723
4724 req_byte_ptr = p;
4725 }
4726 }
4727
4728 /* OK, we can now run the match. */
4729
4730 md->start_match_ptr = start_match;
4731 md->match_call_count = 0;
4732 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4733
4734 switch(rc)
4735 {
4736 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4737 exactly like PRUNE. */
4738
4739 case MATCH_NOMATCH:
4740 case MATCH_PRUNE:
4741 case MATCH_THEN:
4742 new_start_match = start_match + 1;
4743 #ifdef SUPPORT_UTF8
4744 if (utf8)
4745 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4746 new_start_match++;
4747 #endif
4748 break;
4749
4750 /* SKIP passes back the next starting point explicitly. */
4751
4752 case MATCH_SKIP:
4753 new_start_match = md->start_match_ptr;
4754 break;
4755
4756 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4757
4758 case MATCH_COMMIT:
4759 rc = MATCH_NOMATCH;
4760 goto ENDLOOP;
4761
4762 /* Any other return is some kind of error. */
4763
4764 default:
4765 goto ENDLOOP;
4766 }
4767
4768 /* Control reaches here for the various types of "no match at this point"
4769 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4770
4771 rc = MATCH_NOMATCH;
4772
4773 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4774 newline in the subject (though it may continue over the newline). Therefore,
4775 if we have just failed to match, starting at a newline, do not continue. */
4776
4777 if (firstline && IS_NEWLINE(start_match)) break;
4778
4779 /* Advance to new matching position */
4780
4781 start_match = new_start_match;
4782
4783 /* Break the loop if the pattern is anchored or if we have passed the end of
4784 the subject. */
4785
4786 if (anchored || start_match > end_subject) break;
4787
4788 /* If we have just passed a CR and we are now at a LF, and the pattern does
4789 not contain any explicit matches for \r or \n, and the newline option is CRLF
4790 or ANY or ANYCRLF, advance the match position by one more character. */
4791
4792 if (start_match[-1] == '\r' &&
4793 start_match < end_subject &&
4794 *start_match == '\n' &&
4795 (re->options & PCRE_HASCRORLF) == 0 &&
4796 (md->nltype == NLTYPE_ANY ||
4797 md->nltype == NLTYPE_ANYCRLF ||
4798 md->nllen == 2))
4799 start_match++;
4800
4801 } /* End of for(;;) "bumpalong" loop */
4802
4803 /* ==========================================================================*/
4804
4805 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4806 conditions is true:
4807
4808 (1) The pattern is anchored or the match was failed by (*COMMIT);
4809
4810 (2) We are past the end of the subject;
4811
4812 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4813 this option requests that a match occur at or before the first newline in
4814 the subject.
4815
4816 When we have a match and the offset vector is big enough to deal with any
4817 backreferences, captured substring offsets will already be set up. In the case
4818 where we had to get some local store to hold offsets for backreference
4819 processing, copy those that we can. In this case there need not be overflow if
4820 certain parts of the pattern were not used, even though there are more
4821 capturing parentheses than vector slots. */
4822
4823 ENDLOOP:
4824
4825 if (rc == MATCH_MATCH)
4826 {
4827 if (using_temporary_offsets)
4828 {
4829 if (offsetcount >= 4)
4830 {
4831 memcpy(offsets + 2, md->offset_vector + 2,
4832 (offsetcount - 2) * sizeof(int));
4833 DPRINTF(("Copied offsets from temporary memory\n"));
4834 }
4835 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4836 DPRINTF(("Freeing temporary memory\n"));
4837 (pcre_free)(md->offset_vector);
4838 }
4839
4840 /* Set the return code to the number of captured strings, or 0 if there are
4841 too many to fit into the vector. */
4842
4843 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4844
4845 /* If there is space, set up the whole thing as substring 0. The value of
4846 md->start_match_ptr might be modified if \K was encountered on the successful
4847 matching path. */
4848
4849 if (offsetcount < 2) rc = 0; else
4850 {
4851 offsets[0] = md->start_match_ptr - md->start_subject;
4852 offsets[1] = md->end_match_ptr - md->start_subject;
4853 }
4854
4855 DPRINTF((">>>> returning %d\n", rc));
4856 return rc;
4857 }
4858
4859 /* Control gets here if there has been an error, or if the overall match
4860 attempt has failed at all permitted starting positions. */
4861
4862 if (using_temporary_offsets)
4863 {
4864 DPRINTF(("Freeing temporary memory\n"));
4865 (pcre_free)(md->offset_vector);
4866 }
4867
4868 if (rc != MATCH_NOMATCH)
4869 {
4870 DPRINTF((">>>> error: returning %d\n", rc));
4871 return rc;
4872 }
4873 else if (md->partial && md->hitend)
4874 {
4875 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4876 return PCRE_ERROR_PARTIAL;
4877 }
4878 else
4879 {
4880 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4881 return PCRE_ERROR_NOMATCH;
4882 }
4883 }
4884
4885 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12