/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 455 - (show annotations) (download)
Sat Sep 26 19:12:32 2009 UTC (4 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 163053 byte(s)
Added lower bound length-finding to pcre_study() and use it when matching; make 
the value available via pcre_fullinfo(); also fixed bugs connected with
pcre_study() in pcre_dfa_exec(). 

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73
74 #define MATCH_COMMIT (-999)
75 #define MATCH_PRUNE (-998)
76 #define MATCH_SKIP (-997)
77 #define MATCH_THEN (-996)
78
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. */
82
83 #define REC_STACK_SAVE_MAX 30
84
/* Minimum and maximum repeat counts for the common repeats. The two tables
are indexed in parallel; in rep_max a value of 0 stands for "no upper limit". */

static const char rep_min[] = {
  0, 0,   /* entries 0-1: may match zero times */
  1, 1,   /* entries 2-3: must match at least once */
  0, 0    /* entries 4-5: may match zero times */
};

static const char rep_max[] = {
  0, 0,   /* entries 0-1: unlimited */
  0, 0,   /* entries 2-3: unlimited */
  1, 1    /* entries 4-5: at most once */
};
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 {
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 }
179 }
180 else
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. */
186
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
/* Identifying numbers for the individual RMATCH calls, counting up from 1.
Whenever this list is altered, the code at HEAP_RETURN further down has to be
brought into line with it. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,
  RM7,     RM8,  RM9,  RM10, RM11, RM12,
  RM13,    RM14, RM15, RM16, RM17, RM18,
  RM19,    RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30,
  RM31,    RM32, RM33, RM34, RM35, RM36,
  RM37,    RM38, RM39, RM40, RM41, RM42,
  RM43,    RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54
};
249
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
253
254 #ifndef NO_RECURSE
255 #define REGISTER register
256
#ifdef DEBUG

/* Debugging form of RMATCH: call match() one level deeper, leaving its result
in rrc, and print the source line number before the call and again once it has
come back. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra, rb, mstart, rc, rd, re, rf, rg, rdepth + 1); \
  printf("to line %d\n", __LINE__); \
  }

/* Debugging form of RRETURN: print the value being returned and the line it
is returned from, then return it. The message has no trailing newline; when
the return lands inside RMATCH, the next thing printed is its "to line" text.
Note that ra is expanded twice here. */

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#else

/* Production forms: a straightforward recursive call whose result is left in
rrc, and a straightforward return. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra, rb, mstart, rc, rd, re, rf, rg, rdepth + 1)

#define RRETURN(ra) return ra

#endif
274
275 #else
276
277
278 /* These versions of the macros manage a private stack on the heap. Note that
279 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280 argument of match(), which never changes. */
281
282 #define REGISTER
283
/* Heap version of RMATCH. It simulates a recursive call to match() by:

  . getting a new frame from pcre_stack_malloc();
  . recording in the current frame (Xwhere) which call this is, so that
    execution can later be resumed at the label L_<rw> defined here;
  . loading the new frame with the arguments for the inner "call", one level
    deeper, and chaining it to the current frame;
  . making the new frame current and jumping to HEAP_RECURSE.

If no memory can be obtained, the current level gives up at once with
PCRE_ERROR_NOMEMORY rather than storing through a null pointer. Nothing has
been changed in the current frame at that point. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* Heap version of RRETURN. The current frame is unchained and freed. If there
is a frame underneath, the result is put in rrc and control goes to
HEAP_RETURN (the code there is not part of this block; it is expected to use
Xwhere to get back to the right L_<rw> label). If there is none, this was the
top level and the function really returns.

The argument is evaluated only after the frame has been released, so it must
not refer to anything that lives in the frame. The temporary is called
oldframe so that it does not hide RMATCH's newframe when this macro is
expanded inside RMATCH. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
316
317
318 /* Structure for remembering the local variables in a private frame */
319
320 typedef struct heapframe {
321 struct heapframe *Xprevframe;
322
323 /* Function arguments that may change */
324
325 USPTR Xeptr;
326 const uschar *Xecode;
327 USPTR Xmstart;
328 int Xoffset_top;
329 long int Xims;
330 eptrblock *Xeptrb;
331 int Xflags;
332 unsigned int Xrdepth;
333
334 /* Function local variables */
335
336 USPTR Xcallpat;
337 #ifdef SUPPORT_UTF8
338 USPTR Xcharptr;
339 #endif
340 USPTR Xdata;
341 USPTR Xnext;
342 USPTR Xpp;
343 USPTR Xprev;
344 USPTR Xsaved_eptr;
345
346 recursion_info Xnew_recursive;
347
348 BOOL Xcur_is_word;
349 BOOL Xcondition;
350 BOOL Xprev_is_word;
351
352 unsigned long int Xoriginal_ims;
353
354 #ifdef SUPPORT_UCP
355 int Xprop_type;
356 int Xprop_value;
357 int Xprop_fail_result;
358 int Xprop_category;
359 int Xprop_chartype;
360 int Xprop_script;
361 int Xoclength;
362 uschar Xocchars[8];
363 #endif
364
365 int Xcodelink;
366 int Xctype;
367 unsigned int Xfc;
368 int Xfi;
369 int Xlength;
370 int Xmax;
371 int Xmin;
372 int Xnumber;
373 int Xoffset;
374 int Xop;
375 int Xsave_capture_last;
376 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
377 int Xstacksave[REC_STACK_SAVE_MAX];
378
379 eptrblock Xnewptrb;
380
381 /* Where to jump back to */
382
383 int Xwhere;
384
385 } heapframe;
386
387 #endif
388
389
390 /***************************************************************************
391 ***************************************************************************/
392
393
394
395 /*************************************************
396 * Match from current position *
397 *************************************************/
398
399 /* This function is called recursively in many circumstances. Whenever it
400 returns a negative (error) response, the outer incarnation must also return the
401 same response. */
402
403 /* These macros pack up tests that are used for partial matching, and which
404 appear several times in the code. We set the "hit end" flag if the pointer is
405 at the end of the subject and also past the start of the subject (i.e.
406 something has been matched). For hard partial matching, we then return
407 immediately. The second one is used when we already know we are past the end of
408 the subject. */
409
/* Full test: partial matching is enabled, the pointer has reached the end of
the subject, and the pointer is beyond the start of the match. Note that the
macro expands to a bare "if" statement, so it must not be followed directly by
an "else". */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
    }

/* Short test, for places where it is already known that the end of the
subject has been reached: only the partial-matching setting and the position
relative to the start of the match are examined. The same caution about a
following "else" applies. */

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > mstart)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
    }
423
424
425 /* Performance note: It might be tempting to extract commonly used fields from
426 the md structure (e.g. utf8, end_subject) into individual variables to improve
427 performance. Tests using gcc on a SPARC disproved this; in the first case, it
428 made performance worse.
429
430 Arguments:
431 eptr pointer to current character in subject
432 ecode pointer to current position in compiled code
433 mstart pointer to the current match start position (can be modified
434 by encountering \K)
435 offset_top current top pointer
436 md pointer to "static" info for the match
437 ims current /i, /m, and /s options
438 eptrb pointer to chain of blocks containing eptr at start of
439 brackets - for testing for empty matches
440 flags can contain
441 match_condassert - this is an assertion condition
442 match_cbegroup - this is the start of an unlimited repeat
443 group that can match an empty string
444 rdepth the recursion depth
445
446 Returns: MATCH_MATCH if matched ) these values are >= 0
447 MATCH_NOMATCH if failed to match )
448 a negative PCRE_ERROR_xxx value if aborted by an error condition
449 (e.g. stopped by repeated call or recursion limit)
450 */
451
452 static int
453 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
454 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
455 int flags, unsigned int rdepth)
456 {
457 /* These variables do not need to be preserved over recursion in this function,
458 so they can be ordinary variables in all cases. Mark some of them with
459 "register" because they are used a lot in loops. */
460
461 register int rrc; /* Returns from recursive calls */
462 register int i; /* Used for loops not involving calls to RMATCH() */
463 register unsigned int c; /* Character values not kept over RMATCH() calls */
464 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
465
466 BOOL minimize, possessive; /* Quantifier options */
467 int condcode;
468
469 /* When recursion is not being used, all "local" variables that have to be
470 preserved over calls to RMATCH() are part of a "frame" which is obtained from
471 heap storage. Set up the top-level frame here; others are obtained from the
472 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
473
474 #ifdef NO_RECURSE
475 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
476 frame->Xprevframe = NULL; /* Marks the top level */
477
478 /* Copy in the original argument variables */
479
480 frame->Xeptr = eptr;
481 frame->Xecode = ecode;
482 frame->Xmstart = mstart;
483 frame->Xoffset_top = offset_top;
484 frame->Xims = ims;
485 frame->Xeptrb = eptrb;
486 frame->Xflags = flags;
487 frame->Xrdepth = rdepth;
488
489 /* This is where control jumps back to in order to effect "recursion" */
490
491 HEAP_RECURSE:
492
493 /* Macros make the argument variables come from the current frame */
494
495 #define eptr frame->Xeptr
496 #define ecode frame->Xecode
497 #define mstart frame->Xmstart
498 #define offset_top frame->Xoffset_top
499 #define ims frame->Xims
500 #define eptrb frame->Xeptrb
501 #define flags frame->Xflags
502 #define rdepth frame->Xrdepth
503
504 /* Ditto for the local variables */
505
506 #ifdef SUPPORT_UTF8
507 #define charptr frame->Xcharptr
508 #endif
509 #define callpat frame->Xcallpat
510 #define codelink frame->Xcodelink
511 #define data frame->Xdata
512 #define next frame->Xnext
513 #define pp frame->Xpp
514 #define prev frame->Xprev
515 #define saved_eptr frame->Xsaved_eptr
516
517 #define new_recursive frame->Xnew_recursive
518
519 #define cur_is_word frame->Xcur_is_word
520 #define condition frame->Xcondition
521 #define prev_is_word frame->Xprev_is_word
522
523 #define original_ims frame->Xoriginal_ims
524
525 #ifdef SUPPORT_UCP
526 #define prop_type frame->Xprop_type
527 #define prop_value frame->Xprop_value
528 #define prop_fail_result frame->Xprop_fail_result
529 #define prop_category frame->Xprop_category
530 #define prop_chartype frame->Xprop_chartype
531 #define prop_script frame->Xprop_script
532 #define oclength frame->Xoclength
533 #define occhars frame->Xocchars
534 #endif
535
536 #define ctype frame->Xctype
537 #define fc frame->Xfc
538 #define fi frame->Xfi
539 #define length frame->Xlength
540 #define max frame->Xmax
541 #define min frame->Xmin
542 #define number frame->Xnumber
543 #define offset frame->Xoffset
544 #define op frame->Xop
545 #define save_capture_last frame->Xsave_capture_last
546 #define save_offset1 frame->Xsave_offset1
547 #define save_offset2 frame->Xsave_offset2
548 #define save_offset3 frame->Xsave_offset3
549 #define stacksave frame->Xstacksave
550
551 #define newptrb frame->Xnewptrb
552
553 /* When recursion is being used, local variables are allocated on the stack and
554 get preserved during recursion in the normal way. In this environment, fi and
555 i, and fc and c, can be the same variables. */
556
557 #else /* NO_RECURSE not defined */
558 #define fi i
559 #define fc c
560
561
562 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
563 const uschar *charptr; /* in small blocks of the code. My normal */
564 #endif /* style of coding would have declared */
565 const uschar *callpat; /* them within each of those blocks. */
566 const uschar *data; /* However, in order to accommodate the */
567 const uschar *next; /* version of this code that uses an */
568 USPTR pp; /* external "stack" implemented on the */
569 const uschar *prev; /* heap, it is easier to declare them all */
570 USPTR saved_eptr; /* here, so the declarations can be cut */
571 /* out in a block. The only declarations */
572 recursion_info new_recursive; /* within blocks below are for variables */
573 /* that do not have to be preserved over */
574 BOOL cur_is_word; /* a recursive call to RMATCH(). */
575 BOOL condition;
576 BOOL prev_is_word;
577
578 unsigned long int original_ims;
579
580 #ifdef SUPPORT_UCP
581 int prop_type;
582 int prop_value;
583 int prop_fail_result;
584 int prop_category;
585 int prop_chartype;
586 int prop_script;
587 int oclength;
588 uschar occhars[8];
589 #endif
590
591 int codelink;
592 int ctype;
593 int length;
594 int max;
595 int min;
596 int number;
597 int offset;
598 int op;
599 int save_capture_last;
600 int save_offset1, save_offset2, save_offset3;
601 int stacksave[REC_STACK_SAVE_MAX];
602
603 eptrblock newptrb;
604 #endif /* NO_RECURSE */
605
606 /* These statements are here to stop the compiler complaining about uninitialized
607 variables. */
608
609 #ifdef SUPPORT_UCP
610 prop_value = 0;
611 prop_fail_result = 0;
612 #endif
613
614
615 /* This label is used for tail recursion, which is used in a few cases even
616 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
617 used. Thanks to Ian Taylor for noticing this possibility and sending the
618 original patch. */
619
620 TAIL_RECURSE:
621
622 /* OK, now we can get on with the real code of the function. Recursive calls
623 are specified by the macro RMATCH and RRETURN is used to return. When
624 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
625 and a "return", respectively (possibly with some debugging if DEBUG is
626 defined). However, RMATCH isn't like a function call because it's quite a
627 complicated macro. It has to be used in one particular way. This shouldn't,
628 however, impact performance when true recursion is being used. */
629
630 #ifdef SUPPORT_UTF8
631 utf8 = md->utf8; /* Local copy of the flag */
632 #else
633 utf8 = FALSE;
634 #endif
635
636 /* First check that we haven't called match() too many times, or that we
637 haven't exceeded the recursive call limit. */
638
639 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
640 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
641
642 original_ims = ims; /* Save for resetting on ')' */
643
644 /* At the start of a group with an unlimited repeat that may match an empty
645 string, the match_cbegroup flag is set. When this is the case, add the current
646 subject pointer to the chain of such remembered pointers, to be checked when we
647 hit the closing ket, in order to break infinite loops that match no characters.
648 When match() is called in other circumstances, don't add to the chain. The
649 match_cbegroup flag must NOT be used with tail recursion, because the memory
650 block that is used is on the stack, so a new one may be required for each
651 match(). */
652
653 if ((flags & match_cbegroup) != 0)
654 {
655 newptrb.epb_saved_eptr = eptr;
656 newptrb.epb_prev = eptrb;
657 eptrb = &newptrb;
658 }
659
660 /* Now start processing the opcodes. */
661
662 for (;;)
663 {
664 minimize = possessive = FALSE;
665 op = *ecode;
666
667 switch(op)
668 {
669 case OP_FAIL:
670 RRETURN(MATCH_NOMATCH);
671
672 case OP_PRUNE:
673 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
674 ims, eptrb, flags, RM51);
675 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
676 RRETURN(MATCH_PRUNE);
677
678 case OP_COMMIT:
679 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
680 ims, eptrb, flags, RM52);
681 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
682 RRETURN(MATCH_COMMIT);
683
684 case OP_SKIP:
685 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
686 ims, eptrb, flags, RM53);
687 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
688 md->start_match_ptr = eptr; /* Pass back current position */
689 RRETURN(MATCH_SKIP);
690
691 case OP_THEN:
692 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
693 ims, eptrb, flags, RM54);
694 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
695 RRETURN(MATCH_THEN);
696
697 /* Handle a capturing bracket. If there is space in the offset vector, save
698 the current subject position in the working slot at the top of the vector.
699 We mustn't change the current values of the data slot, because they may be
700 set from a previous iteration of this group, and be referred to by a
701 reference inside the group.
702
703 If the bracket fails to match, we need to restore this value and also the
704 values of the final offsets, in case they were set by a previous iteration
705 of the same bracket.
706
707 If there isn't enough space in the offset vector, treat this as if it were
708 a non-capturing bracket. Don't worry about setting the flag for the error
709 case here; that is handled in the code for KET. */
710
711 case OP_CBRA:
712 case OP_SCBRA:
713 number = GET2(ecode, 1+LINK_SIZE);
714 offset = number << 1;
715
716 #ifdef DEBUG
717 printf("start bracket %d\n", number);
718 printf("subject=");
719 pchars(eptr, 16, TRUE, md);
720 printf("\n");
721 #endif
722
723 if (offset < md->offset_max)
724 {
725 save_offset1 = md->offset_vector[offset];
726 save_offset2 = md->offset_vector[offset+1];
727 save_offset3 = md->offset_vector[md->offset_end - number];
728 save_capture_last = md->capture_last;
729
730 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
731 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
732
733 flags = (op == OP_SCBRA)? match_cbegroup : 0;
734 do
735 {
736 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
737 ims, eptrb, flags, RM1);
738 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
739 md->capture_last = save_capture_last;
740 ecode += GET(ecode, 1);
741 }
742 while (*ecode == OP_ALT);
743
744 DPRINTF(("bracket %d failed\n", number));
745
746 md->offset_vector[offset] = save_offset1;
747 md->offset_vector[offset+1] = save_offset2;
748 md->offset_vector[md->offset_end - number] = save_offset3;
749
750 RRETURN(MATCH_NOMATCH);
751 }
752
753 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
754 as a non-capturing bracket. */
755
756 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
757 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
758
759 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
760
761 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
762 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
763
764 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
765 final alternative within the brackets, we would return the result of a
766 recursive call to match() whatever happened. We can reduce stack usage by
767 turning this into a tail recursion, except in the case when match_cbegroup
768 is set.*/
769
770 case OP_BRA:
771 case OP_SBRA:
772 DPRINTF(("start non-capturing bracket\n"));
773 flags = (op >= OP_SBRA)? match_cbegroup : 0;
774 for (;;)
775 {
776 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
777 {
778 if (flags == 0) /* Not a possibly empty group */
779 {
780 ecode += _pcre_OP_lengths[*ecode];
781 DPRINTF(("bracket 0 tail recursion\n"));
782 goto TAIL_RECURSE;
783 }
784
785 /* Possibly empty group; can't use tail recursion. */
786
787 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
788 eptrb, flags, RM48);
789 RRETURN(rrc);
790 }
791
792 /* For non-final alternatives, continue the loop for a NOMATCH result;
793 otherwise return. */
794
795 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
796 eptrb, flags, RM2);
797 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
798 ecode += GET(ecode, 1);
799 }
800 /* Control never reaches here. */
801
802 /* Conditional group: compilation checked that there are no more than
803 two branches. If the condition is false, skipping the first branch takes us
804 past the end if there is only one branch, but that's OK because that is
805 exactly what going to the ket would do. As there is only one branch to be
806 obeyed, we can use tail recursion to avoid using another stack frame. */
807
808 case OP_COND:
809 case OP_SCOND:
810 codelink= GET(ecode, 1);
811
812 /* Because of the way auto-callout works during compile, a callout item is
813 inserted between OP_COND and an assertion condition. */
814
815 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
816 {
817 if (pcre_callout != NULL)
818 {
819 pcre_callout_block cb;
820 cb.version = 1; /* Version 1 of the callout block */
821 cb.callout_number = ecode[LINK_SIZE+2];
822 cb.offset_vector = md->offset_vector;
823 cb.subject = (PCRE_SPTR)md->start_subject;
824 cb.subject_length = md->end_subject - md->start_subject;
825 cb.start_match = mstart - md->start_subject;
826 cb.current_position = eptr - md->start_subject;
827 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
828 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
829 cb.capture_top = offset_top/2;
830 cb.capture_last = md->capture_last;
831 cb.callout_data = md->callout_data;
832 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
833 if (rrc < 0) RRETURN(rrc);
834 }
835 ecode += _pcre_OP_lengths[OP_CALLOUT];
836 }
837
838 condcode = ecode[LINK_SIZE+1];
839
840 /* Now see what the actual condition is */
841
842 if (condcode == OP_RREF) /* Recursion test */
843 {
844 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
845 condition = md->recursive != NULL &&
846 (offset == RREF_ANY || offset == md->recursive->group_num);
847 ecode += condition? 3 : GET(ecode, 1);
848 }
849
850 else if (condcode == OP_CREF) /* Group used test */
851 {
852 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
853 condition = offset < offset_top && md->offset_vector[offset] >= 0;
854 ecode += condition? 3 : GET(ecode, 1);
855 }
856
857 else if (condcode == OP_DEF) /* DEFINE - always false */
858 {
859 condition = FALSE;
860 ecode += GET(ecode, 1);
861 }
862
863 /* The condition is an assertion. Call match() to evaluate it - setting
864 the final argument match_condassert causes it to stop at the end of an
865 assertion. */
866
867 else
868 {
869 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
870 match_condassert, RM3);
871 if (rrc == MATCH_MATCH)
872 {
873 condition = TRUE;
874 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
875 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
876 }
877 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
878 {
879 RRETURN(rrc); /* Need braces because of following else */
880 }
881 else
882 {
883 condition = FALSE;
884 ecode += codelink;
885 }
886 }
887
888 /* We are now at the branch that is to be obeyed. As there is only one,
889 we can use tail recursion to avoid using another stack frame, except when
890 match_cbegroup is required for an unlimited repeat of a possibly empty
891 group. If the second alternative doesn't exist, we can just plough on. */
892
893 if (condition || *ecode == OP_ALT)
894 {
895 ecode += 1 + LINK_SIZE;
896 if (op == OP_SCOND) /* Possibly empty group */
897 {
898 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
899 RRETURN(rrc);
900 }
901 else /* Group must match something */
902 {
903 flags = 0;
904 goto TAIL_RECURSE;
905 }
906 }
907 else /* Condition false & no alternative */
908 {
909 ecode += 1 + LINK_SIZE;
910 }
911 break;
912
913
914 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
915 to close any currently open capturing brackets. */
916
917 case OP_CLOSE:
918 number = GET2(ecode, 1);
919 offset = number << 1;
920
921 #ifdef DEBUG
922 printf("end bracket %d at *ACCEPT", number);
923 printf("\n");
924 #endif
925
926 md->capture_last = number;
927 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
928 {
929 md->offset_vector[offset] =
930 md->offset_vector[md->offset_end - number];
931 md->offset_vector[offset+1] = eptr - md->start_subject;
932 if (offset_top <= offset) offset_top = offset + 2;
933 }
934 ecode += 3;
935 break;
936
937
938 /* End of the pattern, either real or forced. If we are in a recursion of
939 the whole pattern (group 0), we should restore the offsets appropriately
940 and continue from after the call. */
941
942 case OP_ACCEPT:
943 case OP_END:
944 if (md->recursive != NULL && md->recursive->group_num == 0)
945 {
946 recursion_info *rec = md->recursive;
947 DPRINTF(("End of pattern in a (?0) recursion\n"));
948 md->recursive = rec->prevrec;
949 memmove(md->offset_vector, rec->offset_save,
950 rec->saved_max * sizeof(int));
951 offset_top = rec->offset_top;
952 mstart = rec->save_start;
953 ims = original_ims;
954 ecode = rec->after_call;
955 break;
956 }
957
958 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
959 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
960 the subject. In both cases, backtracking will then try other alternatives,
961 if any. */
962
963 if (eptr == mstart &&
964 (md->notempty ||
965 (md->notempty_atstart &&
966 mstart == md->start_subject + md->start_offset)))
967 RRETURN(MATCH_NOMATCH);
968
969 /* Otherwise, we have a match. */
970
971 md->end_match_ptr = eptr; /* Record where we ended */
972 md->end_offset_top = offset_top; /* and how many extracts were taken */
973 md->start_match_ptr = mstart; /* and the start (\K can modify) */
974 RRETURN(MATCH_MATCH);
975
976 /* Change option settings */
977
978 case OP_OPT:
979 ims = ecode[1];
980 ecode += 2;
981 DPRINTF(("ims set to %02lx\n", ims));
982 break;
983
984 /* Assertion brackets. Check the alternative branches in turn - the
985 matching won't pass the KET for an assertion. If any one branch matches,
986 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
987 start of each branch to move the current point backwards, so the code at
988 this level is identical to the lookahead case. */
989
990 case OP_ASSERT:
991 case OP_ASSERTBACK:
992 do
993 {
994 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
995 RM4);
996 if (rrc == MATCH_MATCH) break;
997 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
998 ecode += GET(ecode, 1);
999 }
1000 while (*ecode == OP_ALT);
1001 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1002
1003 /* If checking an assertion for a condition, return MATCH_MATCH. */
1004
1005 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1006
1007 /* Continue from after the assertion, updating the offsets high water
1008 mark, since extracts may have been taken during the assertion. */
1009
1010 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1011 ecode += 1 + LINK_SIZE;
1012 offset_top = md->end_offset_top;
1013 continue;
1014
1015 /* Negative assertion: all branches must fail to match */
1016
1017 case OP_ASSERT_NOT:
1018 case OP_ASSERTBACK_NOT:
1019 do
1020 {
1021 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1022 RM5);
1023 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
1024 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1025 ecode += GET(ecode,1);
1026 }
1027 while (*ecode == OP_ALT);
1028
1029 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1030
1031 ecode += 1 + LINK_SIZE;
1032 continue;
1033
1034 /* Move the subject pointer back. This occurs only at the start of
1035 each branch of a lookbehind assertion. If we are too close to the start to
1036 move back, this match function fails. When working with UTF-8 we move
1037 back a number of characters, not bytes. */
1038
1039 case OP_REVERSE:
1040 #ifdef SUPPORT_UTF8
1041 if (utf8)
1042 {
1043 i = GET(ecode, 1);
1044 while (i-- > 0)
1045 {
1046 eptr--;
1047 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1048 BACKCHAR(eptr);
1049 }
1050 }
1051 else
1052 #endif
1053
1054 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1055
1056 {
1057 eptr -= GET(ecode, 1);
1058 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1059 }
1060
1061 /* Save the earliest consulted character, then skip to next op code */
1062
1063 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1064 ecode += 1 + LINK_SIZE;
1065 break;
1066
1067 /* The callout item calls an external function, if one is provided, passing
1068 details of the match so far. This is mainly for debugging, though the
1069 function is able to force a failure. */
1070
1071 case OP_CALLOUT:
1072 if (pcre_callout != NULL)
1073 {
1074 pcre_callout_block cb;
1075 cb.version = 1; /* Version 1 of the callout block */
1076 cb.callout_number = ecode[1];
1077 cb.offset_vector = md->offset_vector;
1078 cb.subject = (PCRE_SPTR)md->start_subject;
1079 cb.subject_length = md->end_subject - md->start_subject;
1080 cb.start_match = mstart - md->start_subject;
1081 cb.current_position = eptr - md->start_subject;
1082 cb.pattern_position = GET(ecode, 2);
1083 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1084 cb.capture_top = offset_top/2;
1085 cb.capture_last = md->capture_last;
1086 cb.callout_data = md->callout_data;
1087 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1088 if (rrc < 0) RRETURN(rrc);
1089 }
1090 ecode += 2 + 2*LINK_SIZE;
1091 break;
1092
1093 /* Recursion either matches the current regex, or some subexpression. The
1094 offset data is the offset to the starting bracket from the start of the
1095 whole pattern. (This is so that it works from duplicated subpatterns.)
1096
1097 If there are any capturing brackets started but not finished, we have to
1098 save their starting points and reinstate them after the recursion. However,
1099 we don't know how many such there are (offset_top records the completed
1100 total) so we just have to save all the potential data. There may be up to
1101 65535 such values, which is too large to put on the stack, but using malloc
1102 for small numbers seems expensive. As a compromise, the stack is used when
1103 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1104 is used. If the malloc fails, the recursion is not attempted: this call
1105 returns PCRE_ERROR_NOMEMORY straight away, without saving any of the offset
1106 values.
1107
1108 There are also other values that have to be saved. We use a chained
1109 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1110 for the original version of this logic. */
1111
1112 case OP_RECURSE:
1113 {
1114 callpat = md->start_code + GET(ecode, 1);
1115 new_recursive.group_num = (callpat == md->start_code)? 0 :
1116 GET2(callpat, 1 + LINK_SIZE);
1117
1118 /* Add to "recursing stack" */
1119
1120 new_recursive.prevrec = md->recursive;
1121 md->recursive = &new_recursive;
1122
1123 /* Find where to continue from afterwards */
1124
1125 ecode += 1 + LINK_SIZE;
1126 new_recursive.after_call = ecode;
1127
1128 /* Now save the offset data. */
1129
1130 new_recursive.saved_max = md->offset_end;
1131 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1132 new_recursive.offset_save = stacksave;
1133 else
1134 {
1135 new_recursive.offset_save =
1136 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1137 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1138 }
1139
1140 memcpy(new_recursive.offset_save, md->offset_vector,
1141 new_recursive.saved_max * sizeof(int));
1142 new_recursive.save_start = mstart;
1143 new_recursive.offset_top = offset_top;
1144 mstart = eptr;
1145
1146 /* OK, now we can do the recursion. For each top-level alternative we
1147 restore the offset and recursion data. */
1148
1149 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1150 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1151 do
1152 {
1153 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1154 md, ims, eptrb, flags, RM6);
1155 if (rrc == MATCH_MATCH)
1156 {
1157 DPRINTF(("Recursion matched\n"));
1158 md->recursive = new_recursive.prevrec;
1159 if (new_recursive.offset_save != stacksave)
1160 (pcre_free)(new_recursive.offset_save);
1161 RRETURN(MATCH_MATCH);
1162 }
1163 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1164 {
1165 DPRINTF(("Recursion gave error %d\n", rrc));
1166 if (new_recursive.offset_save != stacksave)
1167 (pcre_free)(new_recursive.offset_save);
1168 RRETURN(rrc);
1169 }
1170
1171 md->recursive = &new_recursive;
1172 memcpy(md->offset_vector, new_recursive.offset_save,
1173 new_recursive.saved_max * sizeof(int));
1174 callpat += GET(callpat, 1);
1175 }
1176 while (*callpat == OP_ALT);
1177
1178 DPRINTF(("Recursion didn't match\n"));
1179 md->recursive = new_recursive.prevrec;
1180 if (new_recursive.offset_save != stacksave)
1181 (pcre_free)(new_recursive.offset_save);
1182 RRETURN(MATCH_NOMATCH);
1183 }
1184 /* Control never reaches here */
1185
1186 /* "Once" brackets are like assertion brackets except that after a match,
1187 the point in the subject string is not moved back. Thus there can never be
1188 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1189 Check the alternative branches in turn - the matching won't pass the KET
1190 for this kind of subpattern. If any one branch matches, we carry on as at
1191 the end of a normal bracket, leaving the subject pointer. */
1192
1193 case OP_ONCE:
1194 prev = ecode;
1195 saved_eptr = eptr;
1196
1197 do
1198 {
1199 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1200 if (rrc == MATCH_MATCH) break;
1201 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1202 ecode += GET(ecode,1);
1203 }
1204 while (*ecode == OP_ALT);
1205
1206 /* If we hit the end of the group (which could be repeated), fail */
1207
1208 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1209
1210 /* Continue as from after the assertion, updating the offsets high water
1211 mark, since extracts may have been taken. */
1212
1213 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1214
1215 offset_top = md->end_offset_top;
1216 eptr = md->end_match_ptr;
1217
1218 /* For a non-repeating ket, just continue at this level. This also
1219 happens for a repeating ket if no characters were matched in the group.
1220 This is the forcible breaking of infinite loops as implemented in Perl
1221 5.005. If there is an options reset, it will get obeyed in the normal
1222 course of events. */
1223
1224 if (*ecode == OP_KET || eptr == saved_eptr)
1225 {
1226 ecode += 1+LINK_SIZE;
1227 break;
1228 }
1229
1230 /* The repeating kets try the rest of the pattern or restart from the
1231 preceding bracket, in the appropriate order. The second "call" of match()
1232 uses tail recursion, to avoid using another stack frame. We need to reset
1233 any options that changed within the bracket before re-running it, so check the
1234 next opcode. NOTE(review): ecode[4] below is the OP_OPT data byte only when LINK_SIZE is 2; ecode[2+LINK_SIZE] looks intended - confirm. */
1235
1236 if (ecode[1+LINK_SIZE] == OP_OPT)
1237 {
1238 ims = (ims & ~PCRE_IMS) | ecode[4];
1239 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1240 }
1241
1242 if (*ecode == OP_KETRMIN)
1243 {
1244 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1245 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1246 ecode = prev;
1247 flags = 0;
1248 goto TAIL_RECURSE;
1249 }
1250 else /* OP_KETRMAX */
1251 {
1252 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1253 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1254 ecode += 1 + LINK_SIZE;
1255 flags = 0;
1256 goto TAIL_RECURSE;
1257 }
1258 /* Control never gets here */
1259
1260 /* An alternation is the end of a branch; scan along to find the end of the
1261 bracketed group and go to there. */
1262
1263 case OP_ALT:
1264 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1265 break;
1266
1267 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1268 indicating that it may occur zero times. It may repeat infinitely, or not
1269 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1270 with fixed upper repeat limits are compiled as a number of copies, with the
1271 optional ones preceded by BRAZERO or BRAMINZERO. */
1272
1273 case OP_BRAZERO:
1274 {
1275 next = ecode+1;
1276 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1277 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1278 do next += GET(next,1); while (*next == OP_ALT);
1279 ecode = next + 1 + LINK_SIZE;
1280 }
1281 break;
1282
1283 case OP_BRAMINZERO:
1284 {
1285 next = ecode+1;
1286 do next += GET(next, 1); while (*next == OP_ALT);
1287 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1288 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1289 ecode++;
1290 }
1291 break;
1292
1293 case OP_SKIPZERO:
1294 {
1295 next = ecode+1;
1296 do next += GET(next,1); while (*next == OP_ALT);
1297 ecode = next + 1 + LINK_SIZE;
1298 }
1299 break;
1300
1301 /* End of a group, repeated or non-repeating. */
1302
1303 case OP_KET:
1304 case OP_KETRMIN:
1305 case OP_KETRMAX:
1306 prev = ecode - GET(ecode, 1);
1307
1308 /* If this was a group that remembered the subject start, in order to break
1309 infinite repeats of empty string matches, retrieve the subject start from
1310 the chain. Otherwise, set it NULL. */
1311
1312 if (*prev >= OP_SBRA)
1313 {
1314 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1315 eptrb = eptrb->epb_prev; /* Backup to previous group */
1316 }
1317 else saved_eptr = NULL;
1318
1319 /* If we are at the end of an assertion group, stop matching and return
1320 MATCH_MATCH, but record the current high water mark for use by positive
1321 assertions. Do this also for the "once" (atomic) groups. */
1322
1323 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1324 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1325 *prev == OP_ONCE)
1326 {
1327 md->end_match_ptr = eptr; /* For ONCE */
1328 md->end_offset_top = offset_top;
1329 RRETURN(MATCH_MATCH);
1330 }
1331
1332 /* For capturing groups we have to check the group number back at the start
1333 and if necessary complete handling an extraction by setting the offsets and
1334 bumping the high water mark. Note that whole-pattern recursion is coded as
1335 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1336 when the OP_END is reached. Other recursion is handled here. */
1337
1338 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1339 {
1340 number = GET2(prev, 1+LINK_SIZE);
1341 offset = number << 1;
1342
1343 #ifdef DEBUG
1344 printf("end bracket %d", number);
1345 printf("\n");
1346 #endif
1347
1348 md->capture_last = number;
1349 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1350 {
1351 md->offset_vector[offset] =
1352 md->offset_vector[md->offset_end - number];
1353 md->offset_vector[offset+1] = eptr - md->start_subject;
1354 if (offset_top <= offset) offset_top = offset + 2;
1355 }
1356
1357 /* Handle a recursively called group. Restore the offsets
1358 appropriately and continue from after the call. */
1359
1360 if (md->recursive != NULL && md->recursive->group_num == number)
1361 {
1362 recursion_info *rec = md->recursive;
1363 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1364 md->recursive = rec->prevrec;
1365 mstart = rec->save_start;
1366 memcpy(md->offset_vector, rec->offset_save,
1367 rec->saved_max * sizeof(int));
1368 offset_top = rec->offset_top;
1369 ecode = rec->after_call;
1370 ims = original_ims;
1371 break;
1372 }
1373 }
1374
1375 /* For both capturing and non-capturing groups, reset the value of the ims
1376 flags, in case they got changed during the group. */
1377
1378 ims = original_ims;
1379 DPRINTF(("ims reset to %02lx\n", ims));
1380
1381 /* For a non-repeating ket, just continue at this level. This also
1382 happens for a repeating ket if no characters were matched in the group.
1383 This is the forcible breaking of infinite loops as implemented in Perl
1384 5.005. If there is an options reset, it will get obeyed in the normal
1385 course of events. */
1386
1387 if (*ecode == OP_KET || eptr == saved_eptr)
1388 {
1389 ecode += 1 + LINK_SIZE;
1390 break;
1391 }
1392
1393 /* The repeating kets try the rest of the pattern or restart from the
1394 preceding bracket, in the appropriate order. In the second case, we can use
1395 tail recursion to avoid using another stack frame, unless we have an
1396 unlimited repeat of a group that can match an empty string. */
1397
1398 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1399
1400 if (*ecode == OP_KETRMIN)
1401 {
1402 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1403 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1404 if (flags != 0) /* Could match an empty string */
1405 {
1406 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1407 RRETURN(rrc);
1408 }
1409 ecode = prev;
1410 goto TAIL_RECURSE;
1411 }
1412 else /* OP_KETRMAX */
1413 {
1414 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1415 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1416 ecode += 1 + LINK_SIZE;
1417 flags = 0;
1418 goto TAIL_RECURSE;
1419 }
1420 /* Control never gets here */
1421
1422 /* Start of subject unless notbol, or after internal newline if multiline */
1423
1424 case OP_CIRC:
1425 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1426 if ((ims & PCRE_MULTILINE) != 0)
1427 {
1428 if (eptr != md->start_subject &&
1429 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1430 RRETURN(MATCH_NOMATCH);
1431 ecode++;
1432 break;
1433 }
1434 /* ... else fall through */
1435
1436 /* Start of subject assertion */
1437
1438 case OP_SOD:
1439 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1440 ecode++;
1441 break;
1442
1443 /* Start of match assertion */
1444
1445 case OP_SOM:
1446 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1447 ecode++;
1448 break;
1449
1450 /* Reset the start of match point */
1451
1452 case OP_SET_SOM:
1453 mstart = eptr;
1454 ecode++;
1455 break;
1456
1457 /* Assert before internal newline if multiline, or before a terminating
1458 newline unless endonly is set, else end of subject unless noteol is set. */
1459
1460 case OP_DOLL:
1461 if ((ims & PCRE_MULTILINE) != 0)
1462 {
1463 if (eptr < md->end_subject)
1464 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1465 else
1466 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1467 ecode++;
1468 break;
1469 }
1470 else
1471 {
1472 if (md->noteol) RRETURN(MATCH_NOMATCH);
1473 if (!md->endonly)
1474 {
1475 if (eptr != md->end_subject &&
1476 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1477 RRETURN(MATCH_NOMATCH);
1478 ecode++;
1479 break;
1480 }
1481 }
1482 /* ... else fall through for endonly */
1483
1484 /* End of subject assertion (\z) */
1485
1486 case OP_EOD:
1487 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1488 ecode++;
1489 break;
1490
1491 /* End of subject or ending \n assertion (\Z) */
1492
1493 case OP_EODN:
1494 if (eptr != md->end_subject &&
1495 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1496 RRETURN(MATCH_NOMATCH);
1497 ecode++;
1498 break;
1499
1500 /* Word boundary assertions */
1501
1502 case OP_NOT_WORD_BOUNDARY:
1503 case OP_WORD_BOUNDARY:
1504 {
1505
1506 /* Find out if the previous and current characters are "word" characters.
1507 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1508 be "non-word" characters. Remember the earliest consulted character for
1509 partial matching. */
1510
1511 #ifdef SUPPORT_UTF8
1512 if (utf8)
1513 {
1514 if (eptr == md->start_subject) prev_is_word = FALSE; else
1515 {
1516 USPTR lastptr = eptr - 1;
1517 while((*lastptr & 0xc0) == 0x80) lastptr--;
1518 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1519 GETCHAR(c, lastptr);
1520 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1521 }
1522 if (eptr >= md->end_subject)
1523 {
1524 SCHECK_PARTIAL();
1525 cur_is_word = FALSE;
1526 }
1527 else
1528 {
1529 GETCHAR(c, eptr);
1530 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1531 }
1532 }
1533 else
1534 #endif
1535
1536 /* Not in UTF-8 mode */
1537
1538 {
1539 if (eptr == md->start_subject) prev_is_word = FALSE; else
1540 {
1541 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1542 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1543 }
1544 if (eptr >= md->end_subject)
1545 {
1546 SCHECK_PARTIAL();
1547 cur_is_word = FALSE;
1548 }
1549 else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1550 }
1551
1552 /* Now see if the situation is what we want */
1553
1554 if ((*ecode++ == OP_WORD_BOUNDARY)?
1555 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1556 RRETURN(MATCH_NOMATCH);
1557 }
1558 break;
1559
1560 /* Match a single character type; inline for speed */
1561
1562 case OP_ANY:
1563 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1564 /* Fall through */
1565
1566 case OP_ALLANY:
1567 if (eptr++ >= md->end_subject)
1568 {
1569 SCHECK_PARTIAL();
1570 RRETURN(MATCH_NOMATCH);
1571 }
1572 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1573 ecode++;
1574 break;
1575
1576 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1577 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1578
1579 case OP_ANYBYTE:
1580 if (eptr++ >= md->end_subject)
1581 {
1582 SCHECK_PARTIAL();
1583 RRETURN(MATCH_NOMATCH);
1584 }
1585 ecode++;
1586 break;
1587
1588 case OP_NOT_DIGIT:
1589 if (eptr >= md->end_subject)
1590 {
1591 SCHECK_PARTIAL();
1592 RRETURN(MATCH_NOMATCH);
1593 }
1594 GETCHARINCTEST(c, eptr);
1595 if (
1596 #ifdef SUPPORT_UTF8
1597 c < 256 &&
1598 #endif
1599 (md->ctypes[c] & ctype_digit) != 0
1600 )
1601 RRETURN(MATCH_NOMATCH);
1602 ecode++;
1603 break;
1604
1605 case OP_DIGIT:
1606 if (eptr >= md->end_subject)
1607 {
1608 SCHECK_PARTIAL();
1609 RRETURN(MATCH_NOMATCH);
1610 }
1611 GETCHARINCTEST(c, eptr);
1612 if (
1613 #ifdef SUPPORT_UTF8
1614 c >= 256 ||
1615 #endif
1616 (md->ctypes[c] & ctype_digit) == 0
1617 )
1618 RRETURN(MATCH_NOMATCH);
1619 ecode++;
1620 break;
1621
1622 case OP_NOT_WHITESPACE:
1623 if (eptr >= md->end_subject)
1624 {
1625 SCHECK_PARTIAL();
1626 RRETURN(MATCH_NOMATCH);
1627 }
1628 GETCHARINCTEST(c, eptr);
1629 if (
1630 #ifdef SUPPORT_UTF8
1631 c < 256 &&
1632 #endif
1633 (md->ctypes[c] & ctype_space) != 0
1634 )
1635 RRETURN(MATCH_NOMATCH);
1636 ecode++;
1637 break;
1638
1639 case OP_WHITESPACE:
1640 if (eptr >= md->end_subject)
1641 {
1642 SCHECK_PARTIAL();
1643 RRETURN(MATCH_NOMATCH);
1644 }
1645 GETCHARINCTEST(c, eptr);
1646 if (
1647 #ifdef SUPPORT_UTF8
1648 c >= 256 ||
1649 #endif
1650 (md->ctypes[c] & ctype_space) == 0
1651 )
1652 RRETURN(MATCH_NOMATCH);
1653 ecode++;
1654 break;
1655
1656 case OP_NOT_WORDCHAR:
1657 if (eptr >= md->end_subject)
1658 {
1659 SCHECK_PARTIAL();
1660 RRETURN(MATCH_NOMATCH);
1661 }
1662 GETCHARINCTEST(c, eptr);
1663 if (
1664 #ifdef SUPPORT_UTF8
1665 c < 256 &&
1666 #endif
1667 (md->ctypes[c] & ctype_word) != 0
1668 )
1669 RRETURN(MATCH_NOMATCH);
1670 ecode++;
1671 break;
1672
1673 case OP_WORDCHAR:
1674 if (eptr >= md->end_subject)
1675 {
1676 SCHECK_PARTIAL();
1677 RRETURN(MATCH_NOMATCH);
1678 }
1679 GETCHARINCTEST(c, eptr);
1680 if (
1681 #ifdef SUPPORT_UTF8
1682 c >= 256 ||
1683 #endif
1684 (md->ctypes[c] & ctype_word) == 0
1685 )
1686 RRETURN(MATCH_NOMATCH);
1687 ecode++;
1688 break;
1689
1690 case OP_ANYNL:
1691 if (eptr >= md->end_subject)
1692 {
1693 SCHECK_PARTIAL();
1694 RRETURN(MATCH_NOMATCH);
1695 }
1696 GETCHARINCTEST(c, eptr);
1697 switch(c)
1698 {
1699 default: RRETURN(MATCH_NOMATCH);
1700 case 0x000d:
1701 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1702 break;
1703
1704 case 0x000a:
1705 break;
1706
1707 case 0x000b:
1708 case 0x000c:
1709 case 0x0085:
1710 case 0x2028:
1711 case 0x2029:
1712 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1713 break;
1714 }
1715 ecode++;
1716 break;
1717
1718 case OP_NOT_HSPACE:
1719 if (eptr >= md->end_subject)
1720 {
1721 SCHECK_PARTIAL();
1722 RRETURN(MATCH_NOMATCH);
1723 }
1724 GETCHARINCTEST(c, eptr);
1725 switch(c)
1726 {
1727 default: break;
1728 case 0x09: /* HT */
1729 case 0x20: /* SPACE */
1730 case 0xa0: /* NBSP */
1731 case 0x1680: /* OGHAM SPACE MARK */
1732 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1733 case 0x2000: /* EN QUAD */
1734 case 0x2001: /* EM QUAD */
1735 case 0x2002: /* EN SPACE */
1736 case 0x2003: /* EM SPACE */
1737 case 0x2004: /* THREE-PER-EM SPACE */
1738 case 0x2005: /* FOUR-PER-EM SPACE */
1739 case 0x2006: /* SIX-PER-EM SPACE */
1740 case 0x2007: /* FIGURE SPACE */
1741 case 0x2008: /* PUNCTUATION SPACE */
1742 case 0x2009: /* THIN SPACE */
1743 case 0x200A: /* HAIR SPACE */
1744 case 0x202f: /* NARROW NO-BREAK SPACE */
1745 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1746 case 0x3000: /* IDEOGRAPHIC SPACE */
1747 RRETURN(MATCH_NOMATCH);
1748 }
1749 ecode++;
1750 break;
1751
1752 case OP_HSPACE:
1753 if (eptr >= md->end_subject)
1754 {
1755 SCHECK_PARTIAL();
1756 RRETURN(MATCH_NOMATCH);
1757 }
1758 GETCHARINCTEST(c, eptr);
1759 switch(c)
1760 {
1761 default: RRETURN(MATCH_NOMATCH);
1762 case 0x09: /* HT */
1763 case 0x20: /* SPACE */
1764 case 0xa0: /* NBSP */
1765 case 0x1680: /* OGHAM SPACE MARK */
1766 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1767 case 0x2000: /* EN QUAD */
1768 case 0x2001: /* EM QUAD */
1769 case 0x2002: /* EN SPACE */
1770 case 0x2003: /* EM SPACE */
1771 case 0x2004: /* THREE-PER-EM SPACE */
1772 case 0x2005: /* FOUR-PER-EM SPACE */
1773 case 0x2006: /* SIX-PER-EM SPACE */
1774 case 0x2007: /* FIGURE SPACE */
1775 case 0x2008: /* PUNCTUATION SPACE */
1776 case 0x2009: /* THIN SPACE */
1777 case 0x200A: /* HAIR SPACE */
1778 case 0x202f: /* NARROW NO-BREAK SPACE */
1779 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1780 case 0x3000: /* IDEOGRAPHIC SPACE */
1781 break;
1782 }
1783 ecode++;
1784 break;
1785
1786 case OP_NOT_VSPACE:
1787 if (eptr >= md->end_subject)
1788 {
1789 SCHECK_PARTIAL();
1790 RRETURN(MATCH_NOMATCH);
1791 }
1792 GETCHARINCTEST(c, eptr);
1793 switch(c)
1794 {
1795 default: break;
1796 case 0x0a: /* LF */
1797 case 0x0b: /* VT */
1798 case 0x0c: /* FF */
1799 case 0x0d: /* CR */
1800 case 0x85: /* NEL */
1801 case 0x2028: /* LINE SEPARATOR */
1802 case 0x2029: /* PARAGRAPH SEPARATOR */
1803 RRETURN(MATCH_NOMATCH);
1804 }
1805 ecode++;
1806 break;
1807
1808 case OP_VSPACE:
1809 if (eptr >= md->end_subject)
1810 {
1811 SCHECK_PARTIAL();
1812 RRETURN(MATCH_NOMATCH);
1813 }
1814 GETCHARINCTEST(c, eptr);
1815 switch(c)
1816 {
1817 default: RRETURN(MATCH_NOMATCH);
1818 case 0x0a: /* LF */
1819 case 0x0b: /* VT */
1820 case 0x0c: /* FF */
1821 case 0x0d: /* CR */
1822 case 0x85: /* NEL */
1823 case 0x2028: /* LINE SEPARATOR */
1824 case 0x2029: /* PARAGRAPH SEPARATOR */
1825 break;
1826 }
1827 ecode++;
1828 break;
1829
1830 #ifdef SUPPORT_UCP
1831 /* Check the next character by Unicode property. We will get here only
1832 if the support is in the binary; otherwise a compile-time error occurs. */
1833
1834 case OP_PROP:
1835 case OP_NOTPROP:
1836 if (eptr >= md->end_subject)
1837 {
1838 SCHECK_PARTIAL();
1839 RRETURN(MATCH_NOMATCH);
1840 }
1841 GETCHARINCTEST(c, eptr);
1842 {
1843 const ucd_record *prop = GET_UCD(c);
1844
1845 switch(ecode[1])
1846 {
1847 case PT_ANY:
1848 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1849 break;
1850
1851 case PT_LAMP:
1852 if ((prop->chartype == ucp_Lu ||
1853 prop->chartype == ucp_Ll ||
1854 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1855 RRETURN(MATCH_NOMATCH);
1856 break;
1857
1858 case PT_GC:
1859 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1860 RRETURN(MATCH_NOMATCH);
1861 break;
1862
1863 case PT_PC:
1864 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1865 RRETURN(MATCH_NOMATCH);
1866 break;
1867
1868 case PT_SC:
1869 if ((ecode[2] != prop->script) == (op == OP_PROP))
1870 RRETURN(MATCH_NOMATCH);
1871 break;
1872
1873 default:
1874 RRETURN(PCRE_ERROR_INTERNAL);
1875 }
1876
1877 ecode += 3;
1878 }
1879 break;
1880
1881 /* Match an extended Unicode sequence. We will get here only if the support
1882 is in the binary; otherwise a compile-time error occurs. */
1883
1884 case OP_EXTUNI:
1885 if (eptr >= md->end_subject)
1886 {
1887 SCHECK_PARTIAL();
1888 RRETURN(MATCH_NOMATCH);
1889 }
1890 GETCHARINCTEST(c, eptr);
1891 {
1892 int category = UCD_CATEGORY(c);
1893 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1894 while (eptr < md->end_subject)
1895 {
1896 int len = 1;
1897 if (!utf8) c = *eptr; else
1898 {
1899 GETCHARLEN(c, eptr, len);
1900 }
1901 category = UCD_CATEGORY(c);
1902 if (category != ucp_M) break;
1903 eptr += len;
1904 }
1905 }
1906 ecode++;
1907 break;
1908 #endif
1909
1910
1911 /* Match a back reference, possibly repeatedly. Look past the end of the
1912 item to see if there is repeat information following. The code is similar
1913 to that for character classes, but repeated for efficiency. Then obey
1914 similar code to character type repeats - written out again for speed.
1915 However, if the referenced string is the empty string, always treat
1916 it as matched, any number of times (otherwise there could be infinite
1917 loops). */
1918
1919 case OP_REF:
1920 {
1921 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1922 ecode += 3;
1923
1924 /* If the reference is unset, there are two possibilities:
1925
1926 (a) In the default, Perl-compatible state, set the length to be longer
1927 than the amount of subject left; this ensures that every attempt at a
1928 match fails. We can't just fail here, because of the possibility of
1929 quantifiers with zero minima.
1930
1931 (b) If the JavaScript compatibility flag is set, set the length to zero
1932 so that the back reference matches an empty string.
1933
1934 Otherwise, set the length to the length of what was matched by the
1935 referenced subpattern. */
1936
1937 if (offset >= offset_top || md->offset_vector[offset] < 0)
1938 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1939 else
1940 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1941
1942 /* Set up for repetition, or handle the non-repeated case */
1943
1944 switch (*ecode)
1945 {
1946 case OP_CRSTAR:
1947 case OP_CRMINSTAR:
1948 case OP_CRPLUS:
1949 case OP_CRMINPLUS:
1950 case OP_CRQUERY:
1951 case OP_CRMINQUERY:
1952 c = *ecode++ - OP_CRSTAR;
1953 minimize = (c & 1) != 0;
1954 min = rep_min[c]; /* Pick up values from tables; */
1955 max = rep_max[c]; /* zero for max => infinity */
1956 if (max == 0) max = INT_MAX;
1957 break;
1958
1959 case OP_CRRANGE:
1960 case OP_CRMINRANGE:
1961 minimize = (*ecode == OP_CRMINRANGE);
1962 min = GET2(ecode, 1);
1963 max = GET2(ecode, 3);
1964 if (max == 0) max = INT_MAX;
1965 ecode += 5;
1966 break;
1967
1968 default: /* No repeat follows */
1969 if (!match_ref(offset, eptr, length, md, ims))
1970 {
1971 CHECK_PARTIAL();
1972 RRETURN(MATCH_NOMATCH);
1973 }
1974 eptr += length;
1975 continue; /* With the main loop */
1976 }
1977
1978 /* If the length of the reference is zero, just continue with the
1979 main loop. */
1980
1981 if (length == 0) continue;
1982
1983 /* First, ensure the minimum number of matches are present. We get back
1984 the length of the reference string explicitly rather than passing the
1985 address of eptr, so that eptr can be a register variable. */
1986
1987 for (i = 1; i <= min; i++)
1988 {
1989 if (!match_ref(offset, eptr, length, md, ims))
1990 {
1991 CHECK_PARTIAL();
1992 RRETURN(MATCH_NOMATCH);
1993 }
1994 eptr += length;
1995 }
1996
1997 /* If min = max, continue at the same level without recursion.
1998 They are not both allowed to be zero. */
1999
2000 if (min == max) continue;
2001
2002 /* If minimizing, keep trying and advancing the pointer */
2003
2004 if (minimize)
2005 {
2006 for (fi = min;; fi++)
2007 {
2008 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2009 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2010 if (fi >= max) RRETURN(MATCH_NOMATCH);
2011 if (!match_ref(offset, eptr, length, md, ims))
2012 {
2013 CHECK_PARTIAL();
2014 RRETURN(MATCH_NOMATCH);
2015 }
2016 eptr += length;
2017 }
2018 /* Control never gets here */
2019 }
2020
2021 /* If maximizing, find the longest string and work backwards */
2022
2023 else
2024 {
2025 pp = eptr;
2026 for (i = min; i < max; i++)
2027 {
2028 if (!match_ref(offset, eptr, length, md, ims)) break;
2029 eptr += length;
2030 }
2031 while (eptr >= pp)
2032 {
2033 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2034 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2035 eptr -= length;
2036 }
2037 RRETURN(MATCH_NOMATCH);
2038 }
2039 }
2040 /* Control never gets here */
2041
2042 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2043 used when all the characters in the class have values in the range 0-255,
2044 and either the matching is caseful, or the characters are in the range
2045 0-127 when UTF-8 processing is enabled. The only difference between
2046 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2047 encountered.
2048
2049 First, look past the end of the item to see if there is repeat information
2050 following. Then obey similar code to character type repeats - written out
2051 again for speed. */
2052
2053 case OP_NCLASS:
2054 case OP_CLASS:
2055 {
2056 data = ecode + 1; /* Save for matching */
2057 ecode += 33; /* Advance past the item */
2058
2059 switch (*ecode)
2060 {
2061 case OP_CRSTAR:
2062 case OP_CRMINSTAR:
2063 case OP_CRPLUS:
2064 case OP_CRMINPLUS:
2065 case OP_CRQUERY:
2066 case OP_CRMINQUERY:
2067 c = *ecode++ - OP_CRSTAR;
2068 minimize = (c & 1) != 0;
2069 min = rep_min[c]; /* Pick up values from tables; */
2070 max = rep_max[c]; /* zero for max => infinity */
2071 if (max == 0) max = INT_MAX;
2072 break;
2073
2074 case OP_CRRANGE:
2075 case OP_CRMINRANGE:
2076 minimize = (*ecode == OP_CRMINRANGE);
2077 min = GET2(ecode, 1);
2078 max = GET2(ecode, 3);
2079 if (max == 0) max = INT_MAX;
2080 ecode += 5;
2081 break;
2082
2083 default: /* No repeat follows */
2084 min = max = 1;
2085 break;
2086 }
2087
2088 /* First, ensure the minimum number of matches are present. */
2089
2090 #ifdef SUPPORT_UTF8
2091 /* UTF-8 mode */
2092 if (utf8)
2093 {
2094 for (i = 1; i <= min; i++)
2095 {
2096 if (eptr >= md->end_subject)
2097 {
2098 SCHECK_PARTIAL();
2099 RRETURN(MATCH_NOMATCH);
2100 }
2101 GETCHARINC(c, eptr);
2102 if (c > 255)
2103 {
2104 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2105 }
2106 else
2107 {
2108 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2109 }
2110 }
2111 }
2112 else
2113 #endif
2114 /* Not UTF-8 mode */
2115 {
2116 for (i = 1; i <= min; i++)
2117 {
2118 if (eptr >= md->end_subject)
2119 {
2120 SCHECK_PARTIAL();
2121 RRETURN(MATCH_NOMATCH);
2122 }
2123 c = *eptr++;
2124 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2125 }
2126 }
2127
2128 /* If max == min we can continue with the main loop without the
2129 need to recurse. */
2130
2131 if (min == max) continue;
2132
2133 /* If minimizing, keep testing the rest of the expression and advancing
2134 the pointer while it matches the class. */
2135
2136 if (minimize)
2137 {
2138 #ifdef SUPPORT_UTF8
2139 /* UTF-8 mode */
2140 if (utf8)
2141 {
2142 for (fi = min;; fi++)
2143 {
2144 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2145 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2146 if (fi >= max) RRETURN(MATCH_NOMATCH);
2147 if (eptr >= md->end_subject)
2148 {
2149 SCHECK_PARTIAL();
2150 RRETURN(MATCH_NOMATCH);
2151 }
2152 GETCHARINC(c, eptr);
2153 if (c > 255)
2154 {
2155 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2156 }
2157 else
2158 {
2159 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2160 }
2161 }
2162 }
2163 else
2164 #endif
2165 /* Not UTF-8 mode */
2166 {
2167 for (fi = min;; fi++)
2168 {
2169 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2170 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2171 if (fi >= max) RRETURN(MATCH_NOMATCH);
2172 if (eptr >= md->end_subject)
2173 {
2174 SCHECK_PARTIAL();
2175 RRETURN(MATCH_NOMATCH);
2176 }
2177 c = *eptr++;
2178 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2179 }
2180 }
2181 /* Control never gets here */
2182 }
2183
2184 /* If maximizing, find the longest possible run, then work backwards. */
2185
2186 else
2187 {
2188 pp = eptr;
2189
2190 #ifdef SUPPORT_UTF8
2191 /* UTF-8 mode */
2192 if (utf8)
2193 {
2194 for (i = min; i < max; i++)
2195 {
2196 int len = 1;
2197 if (eptr >= md->end_subject) break;
2198 GETCHARLEN(c, eptr, len);
2199 if (c > 255)
2200 {
2201 if (op == OP_CLASS) break;
2202 }
2203 else
2204 {
2205 if ((data[c/8] & (1 << (c&7))) == 0) break;
2206 }
2207 eptr += len;
2208 }
2209 for (;;)
2210 {
2211 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2212 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2213 if (eptr-- == pp) break; /* Stop if tried at original pos */
2214 BACKCHAR(eptr);
2215 }
2216 }
2217 else
2218 #endif
2219 /* Not UTF-8 mode */
2220 {
2221 for (i = min; i < max; i++)
2222 {
2223 if (eptr >= md->end_subject) break;
2224 c = *eptr;
2225 if ((data[c/8] & (1 << (c&7))) == 0) break;
2226 eptr++;
2227 }
2228 while (eptr >= pp)
2229 {
2230 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2231 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2232 eptr--;
2233 }
2234 }
2235
2236 RRETURN(MATCH_NOMATCH);
2237 }
2238 }
2239 /* Control never gets here */
2240
2241
2242 /* Match an extended character class. This opcode is encountered only
2243 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2244 mode, because Unicode properties are supported in non-UTF-8 mode. */
2245
2246 #ifdef SUPPORT_UTF8
2247 case OP_XCLASS:
2248 {
2249 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2250 ecode += GET(ecode, 1); /* Advance past the item */
2251
2252 switch (*ecode)
2253 {
2254 case OP_CRSTAR:
2255 case OP_CRMINSTAR:
2256 case OP_CRPLUS:
2257 case OP_CRMINPLUS:
2258 case OP_CRQUERY:
2259 case OP_CRMINQUERY:
2260 c = *ecode++ - OP_CRSTAR;
2261 minimize = (c & 1) != 0;
2262 min = rep_min[c]; /* Pick up values from tables; */
2263 max = rep_max[c]; /* zero for max => infinity */
2264 if (max == 0) max = INT_MAX;
2265 break;
2266
2267 case OP_CRRANGE:
2268 case OP_CRMINRANGE:
2269 minimize = (*ecode == OP_CRMINRANGE);
2270 min = GET2(ecode, 1);
2271 max = GET2(ecode, 3);
2272 if (max == 0) max = INT_MAX;
2273 ecode += 5;
2274 break;
2275
2276 default: /* No repeat follows */
2277 min = max = 1;
2278 break;
2279 }
2280
2281 /* First, ensure the minimum number of matches are present. */
2282
2283 for (i = 1; i <= min; i++)
2284 {
2285 if (eptr >= md->end_subject)
2286 {
2287 SCHECK_PARTIAL();
2288 RRETURN(MATCH_NOMATCH);
2289 }
2290 GETCHARINCTEST(c, eptr);
2291 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2292 }
2293
2294 /* If max == min we can continue with the main loop without the
2295 need to recurse. */
2296
2297 if (min == max) continue;
2298
2299 /* If minimizing, keep testing the rest of the expression and advancing
2300 the pointer while it matches the class. */
2301
2302 if (minimize)
2303 {
2304 for (fi = min;; fi++)
2305 {
2306 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2307 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2308 if (fi >= max) RRETURN(MATCH_NOMATCH);
2309 if (eptr >= md->end_subject)
2310 {
2311 SCHECK_PARTIAL();
2312 RRETURN(MATCH_NOMATCH);
2313 }
2314 GETCHARINCTEST(c, eptr);
2315 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2316 }
2317 /* Control never gets here */
2318 }
2319
2320 /* If maximizing, find the longest possible run, then work backwards. */
2321
2322 else
2323 {
2324 pp = eptr;
2325 for (i = min; i < max; i++)
2326 {
2327 int len = 1;
2328 if (eptr >= md->end_subject) break;
2329 GETCHARLENTEST(c, eptr, len);
2330 if (!_pcre_xclass(c, data)) break;
2331 eptr += len;
2332 }
2333 for(;;)
2334 {
2335 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2336 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2337 if (eptr-- == pp) break; /* Stop if tried at original pos */
2338 if (utf8) BACKCHAR(eptr);
2339 }
2340 RRETURN(MATCH_NOMATCH);
2341 }
2342
2343 /* Control never gets here */
2344 }
2345 #endif /* End of XCLASS */
2346
2347 /* Match a single character, casefully */
2348
2349 case OP_CHAR:
2350 #ifdef SUPPORT_UTF8
2351 if (utf8)
2352 {
2353 length = 1;
2354 ecode++;
2355 GETCHARLEN(fc, ecode, length);
2356 if (length > md->end_subject - eptr)
2357 {
2358 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2359 RRETURN(MATCH_NOMATCH);
2360 }
2361 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2362 }
2363 else
2364 #endif
2365
2366 /* Non-UTF-8 mode */
2367 {
2368 if (md->end_subject - eptr < 1)
2369 {
2370 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2371 RRETURN(MATCH_NOMATCH);
2372 }
2373 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2374 ecode += 2;
2375 }
2376 break;
2377
2378 /* Match a single character, caselessly */
2379
2380 case OP_CHARNC:
2381 #ifdef SUPPORT_UTF8
2382 if (utf8)
2383 {
2384 length = 1;
2385 ecode++;
2386 GETCHARLEN(fc, ecode, length);
2387
2388 if (length > md->end_subject - eptr)
2389 {
2390 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2391 RRETURN(MATCH_NOMATCH);
2392 }
2393
2394 /* If the pattern character's value is < 128, we have only one byte, and
2395 can use the fast lookup table. */
2396
2397 if (fc < 128)
2398 {
2399 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2400 }
2401
2402 /* Otherwise we must pick up the subject character */
2403
2404 else
2405 {
2406 unsigned int dc;
2407 GETCHARINC(dc, eptr);
2408 ecode += length;
2409
2410 /* If we have Unicode property support, we can use it to test the other
2411 case of the character, if there is one. */
2412
2413 if (fc != dc)
2414 {
2415 #ifdef SUPPORT_UCP
2416 if (dc != UCD_OTHERCASE(fc))
2417 #endif
2418 RRETURN(MATCH_NOMATCH);
2419 }
2420 }
2421 }
2422 else
2423 #endif /* SUPPORT_UTF8 */
2424
2425 /* Non-UTF-8 mode */
2426 {
2427 if (md->end_subject - eptr < 1)
2428 {
2429 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2430 RRETURN(MATCH_NOMATCH);
2431 }
2432 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2433 ecode += 2;
2434 }
2435 break;
2436
2437 /* Match a single character repeatedly. */
2438
2439 case OP_EXACT:
2440 min = max = GET2(ecode, 1);
2441 ecode += 3;
2442 goto REPEATCHAR;
2443
2444 case OP_POSUPTO:
2445 possessive = TRUE;
2446 /* Fall through */
2447
2448 case OP_UPTO:
2449 case OP_MINUPTO:
2450 min = 0;
2451 max = GET2(ecode, 1);
2452 minimize = *ecode == OP_MINUPTO;
2453 ecode += 3;
2454 goto REPEATCHAR;
2455
2456 case OP_POSSTAR:
2457 possessive = TRUE;
2458 min = 0;
2459 max = INT_MAX;
2460 ecode++;
2461 goto REPEATCHAR;
2462
2463 case OP_POSPLUS:
2464 possessive = TRUE;
2465 min = 1;
2466 max = INT_MAX;
2467 ecode++;
2468 goto REPEATCHAR;
2469
2470 case OP_POSQUERY:
2471 possessive = TRUE;
2472 min = 0;
2473 max = 1;
2474 ecode++;
2475 goto REPEATCHAR;
2476
2477 case OP_STAR:
2478 case OP_MINSTAR:
2479 case OP_PLUS:
2480 case OP_MINPLUS:
2481 case OP_QUERY:
2482 case OP_MINQUERY:
2483 c = *ecode++ - OP_STAR;
2484 minimize = (c & 1) != 0;
2485
2486 min = rep_min[c]; /* Pick up values from tables; */
2487 max = rep_max[c]; /* zero for max => infinity */
2488 if (max == 0) max = INT_MAX;
2489
2490 /* Common code for all repeated single-character matches. */
2491
2492 REPEATCHAR:
2493 #ifdef SUPPORT_UTF8
2494 if (utf8)
2495 {
2496 length = 1;
2497 charptr = ecode;
2498 GETCHARLEN(fc, ecode, length);
2499 ecode += length;
2500
2501 /* Handle multibyte character matching specially here. There is
2502 support for caseless matching if UCP support is present. */
2503
2504 if (length > 1)
2505 {
2506 #ifdef SUPPORT_UCP
2507 unsigned int othercase;
2508 if ((ims & PCRE_CASELESS) != 0 &&
2509 (othercase = UCD_OTHERCASE(fc)) != fc)
2510 oclength = _pcre_ord2utf8(othercase, occhars);
2511 else oclength = 0;
2512 #endif /* SUPPORT_UCP */
2513
2514 for (i = 1; i <= min; i++)
2515 {
2516 if (eptr <= md->end_subject - length &&
2517 memcmp(eptr, charptr, length) == 0) eptr += length;
2518 #ifdef SUPPORT_UCP
2519 else if (oclength > 0 &&
2520 eptr <= md->end_subject - oclength &&
2521 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2522 #endif /* SUPPORT_UCP */
2523 else
2524 {
2525 CHECK_PARTIAL();
2526 RRETURN(MATCH_NOMATCH);
2527 }
2528 }
2529
2530 if (min == max) continue;
2531
2532 if (minimize)
2533 {
2534 for (fi = min;; fi++)
2535 {
2536 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2537 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2538 if (fi >= max) RRETURN(MATCH_NOMATCH);
2539 if (eptr <= md->end_subject - length &&
2540 memcmp(eptr, charptr, length) == 0) eptr += length;
2541 #ifdef SUPPORT_UCP
2542 else if (oclength > 0 &&
2543 eptr <= md->end_subject - oclength &&
2544 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2545 #endif /* SUPPORT_UCP */
2546 else
2547 {
2548 CHECK_PARTIAL();
2549 RRETURN(MATCH_NOMATCH);
2550 }
2551 }
2552 /* Control never gets here */
2553 }
2554
2555 else /* Maximize */
2556 {
2557 pp = eptr;
2558 for (i = min; i < max; i++)
2559 {
2560 if (eptr <= md->end_subject - length &&
2561 memcmp(eptr, charptr, length) == 0) eptr += length;
2562 #ifdef SUPPORT_UCP
2563 else if (oclength > 0 &&
2564 eptr <= md->end_subject - oclength &&
2565 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2566 #endif /* SUPPORT_UCP */
2567 else break;
2568 }
2569
2570 if (possessive) continue;
2571
2572 for(;;)
2573 {
2574 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2575 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2576 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
2577 #ifdef SUPPORT_UCP
2578 eptr--;
2579 BACKCHAR(eptr);
2580 #else /* without SUPPORT_UCP */
2581 eptr -= length;
2582 #endif /* SUPPORT_UCP */
2583 }
2584 }
2585 /* Control never gets here */
2586 }
2587
2588 /* If the length of a UTF-8 character is 1, we fall through here, and
2589 obey the code as for non-UTF-8 characters below, though in this case the
2590 value of fc will always be < 128. */
2591 }
2592 else
2593 #endif /* SUPPORT_UTF8 */
2594
2595 /* When not in UTF-8 mode, load a single-byte character. */
2596
2597 fc = *ecode++;
2598
2599 /* The value of fc at this point is always less than 256, though we may or
2600 may not be in UTF-8 mode. The code is duplicated for the caseless and
2601 caseful cases, for speed, since matching characters is likely to be quite
2602 common. First, ensure the minimum number of matches are present. If min =
2603 max, continue at the same level without recursing. Otherwise, if
2604 minimizing, keep trying the rest of the expression and advancing one
2605 matching character if failing, up to the maximum. Alternatively, if
2606 maximizing, find the maximum number of characters and work backwards. */
2607
2608 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2609 max, eptr));
2610
2611 if ((ims & PCRE_CASELESS) != 0)
2612 {
2613 fc = md->lcc[fc];
2614 for (i = 1; i <= min; i++)
2615 {
2616 if (eptr >= md->end_subject)
2617 {
2618 SCHECK_PARTIAL();
2619 RRETURN(MATCH_NOMATCH);
2620 }
2621 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2622 }
2623 if (min == max) continue;
2624 if (minimize)
2625 {
2626 for (fi = min;; fi++)
2627 {
2628 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2629 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2630 if (fi >= max) RRETURN(MATCH_NOMATCH);
2631 if (eptr >= md->end_subject)
2632 {
2633 SCHECK_PARTIAL();
2634 RRETURN(MATCH_NOMATCH);
2635 }
2636 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2637 }
2638 /* Control never gets here */
2639 }
2640 else /* Maximize */
2641 {
2642 pp = eptr;
2643 for (i = min; i < max; i++)
2644 {
2645 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2646 eptr++;
2647 }
2648
2649 if (possessive) continue;
2650
2651 while (eptr >= pp)
2652 {
2653 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2654 eptr--;
2655 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2656 }
2657 RRETURN(MATCH_NOMATCH);
2658 }
2659 /* Control never gets here */
2660 }
2661
2662 /* Caseful comparisons (includes all multi-byte characters) */
2663
2664 else
2665 {
2666 for (i = 1; i <= min; i++)
2667 {
2668 if (eptr >= md->end_subject)
2669 {
2670 SCHECK_PARTIAL();
2671 RRETURN(MATCH_NOMATCH);
2672 }
2673 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2674 }
2675
2676 if (min == max) continue;
2677
2678 if (minimize)
2679 {
2680 for (fi = min;; fi++)
2681 {
2682 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2683 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2684 if (fi >= max) RRETURN(MATCH_NOMATCH);
2685 if (eptr >= md->end_subject)
2686 {
2687 SCHECK_PARTIAL();
2688 RRETURN(MATCH_NOMATCH);
2689 }
2690 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2691 }
2692 /* Control never gets here */
2693 }
2694 else /* Maximize */
2695 {
2696 pp = eptr;
2697 for (i = min; i < max; i++)
2698 {
2699 if (eptr >= md->end_subject || fc != *eptr) break;
2700 eptr++;
2701 }
2702 if (possessive) continue;
2703
2704 while (eptr >= pp)
2705 {
2706 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2707 eptr--;
2708 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2709 }
2710 RRETURN(MATCH_NOMATCH);
2711 }
2712 }
2713 /* Control never gets here */
2714
2715 /* Match a negated single one-byte character. The character we are
2716 checking can be multibyte. */
2717
2718 case OP_NOT:
2719 if (eptr >= md->end_subject)
2720 {
2721 SCHECK_PARTIAL();
2722 RRETURN(MATCH_NOMATCH);
2723 }
2724 ecode++;
2725 GETCHARINCTEST(c, eptr);
2726 if ((ims & PCRE_CASELESS) != 0)
2727 {
2728 #ifdef SUPPORT_UTF8
2729 if (c < 256)
2730 #endif
2731 c = md->lcc[c];
2732 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2733 }
2734 else
2735 {
2736 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2737 }
2738 break;
2739
2740 /* Match a negated single one-byte character repeatedly. This is almost a
2741 repeat of the code for a repeated single character, but I haven't found a
2742 nice way of commoning these up that doesn't require a test of the
2743 positive/negative option for each character match. Maybe that wouldn't add
2744 very much to the time taken, but character matching *is* what this is all
2745 about... */
2746
2747 case OP_NOTEXACT:
2748 min = max = GET2(ecode, 1);
2749 ecode += 3;
2750 goto REPEATNOTCHAR;
2751
2752 case OP_NOTUPTO:
2753 case OP_NOTMINUPTO:
2754 min = 0;
2755 max = GET2(ecode, 1);
2756 minimize = *ecode == OP_NOTMINUPTO;
2757 ecode += 3;
2758 goto REPEATNOTCHAR;
2759
2760 case OP_NOTPOSSTAR:
2761 possessive = TRUE;
2762 min = 0;
2763 max = INT_MAX;
2764 ecode++;
2765 goto REPEATNOTCHAR;
2766
2767 case OP_NOTPOSPLUS:
2768 possessive = TRUE;
2769 min = 1;
2770 max = INT_MAX;
2771 ecode++;
2772 goto REPEATNOTCHAR;
2773
2774 case OP_NOTPOSQUERY:
2775 possessive = TRUE;
2776 min = 0;
2777 max = 1;
2778 ecode++;
2779 goto REPEATNOTCHAR;
2780
2781 case OP_NOTPOSUPTO:
2782 possessive = TRUE;
2783 min = 0;
2784 max = GET2(ecode, 1);
2785 ecode += 3;
2786 goto REPEATNOTCHAR;
2787
2788 case OP_NOTSTAR:
2789 case OP_NOTMINSTAR:
2790 case OP_NOTPLUS:
2791 case OP_NOTMINPLUS:
2792 case OP_NOTQUERY:
2793 case OP_NOTMINQUERY:
2794 c = *ecode++ - OP_NOTSTAR;
2795 minimize = (c & 1) != 0;
2796 min = rep_min[c]; /* Pick up values from tables; */
2797 max = rep_max[c]; /* zero for max => infinity */
2798 if (max == 0) max = INT_MAX;
2799
2800 /* Common code for all repeated single-byte matches. */
2801
2802 REPEATNOTCHAR:
2803 fc = *ecode++;
2804
2805 /* The code is duplicated for the caseless and caseful cases, for speed,
2806 since matching characters is likely to be quite common. First, ensure the
2807 minimum number of matches are present. If min = max, continue at the same
2808 level without recursing. Otherwise, if minimizing, keep trying the rest of
2809 the expression and advancing one matching character if failing, up to the
2810 maximum. Alternatively, if maximizing, find the maximum number of
2811 characters and work backwards. */
2812
2813 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2814 max, eptr));
2815
2816 if ((ims & PCRE_CASELESS) != 0)
2817 {
2818 fc = md->lcc[fc];
2819
2820 #ifdef SUPPORT_UTF8
2821 /* UTF-8 mode */
2822 if (utf8)
2823 {
2824 register unsigned int d;
2825 for (i = 1; i <= min; i++)
2826 {
2827 if (eptr >= md->end_subject)
2828 {
2829 SCHECK_PARTIAL();
2830 RRETURN(MATCH_NOMATCH);
2831 }
2832 GETCHARINC(d, eptr);
2833 if (d < 256) d = md->lcc[d];
2834 if (fc == d) RRETURN(MATCH_NOMATCH);
2835 }
2836 }
2837 else
2838 #endif
2839
2840 /* Not UTF-8 mode */
2841 {
2842 for (i = 1; i <= min; i++)
2843 {
2844 if (eptr >= md->end_subject)
2845 {
2846 SCHECK_PARTIAL();
2847 RRETURN(MATCH_NOMATCH);
2848 }
2849 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2850 }
2851 }
2852
2853 if (min == max) continue;
2854
2855 if (minimize)
2856 {
2857 #ifdef SUPPORT_UTF8
2858 /* UTF-8 mode */
2859 if (utf8)
2860 {
2861 register unsigned int d;
2862 for (fi = min;; fi++)
2863 {
2864 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2865 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2866 if (fi >= max) RRETURN(MATCH_NOMATCH);
2867 if (eptr >= md->end_subject)
2868 {
2869 SCHECK_PARTIAL();
2870 RRETURN(MATCH_NOMATCH);
2871 }
2872 GETCHARINC(d, eptr);
2873 if (d < 256) d = md->lcc[d];
2874 if (fc == d) RRETURN(MATCH_NOMATCH);
2875 }
2876 }
2877 else
2878 #endif
2879 /* Not UTF-8 mode */
2880 {
2881 for (fi = min;; fi++)
2882 {
2883 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2884 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2885 if (fi >= max) RRETURN(MATCH_NOMATCH);
2886 if (eptr >= md->end_subject)
2887 {
2888 SCHECK_PARTIAL();
2889 RRETURN(MATCH_NOMATCH);
2890 }
2891 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2892 }
2893 }
2894 /* Control never gets here */
2895 }
2896
2897 /* Maximize case */
2898
2899 else
2900 {
2901 pp = eptr;
2902
2903 #ifdef SUPPORT_UTF8
2904 /* UTF-8 mode */
2905 if (utf8)
2906 {
2907 register unsigned int d;
2908 for (i = min; i < max; i++)
2909 {
2910 int len = 1;
2911 if (eptr >= md->end_subject) break;
2912 GETCHARLEN(d, eptr, len);
2913 if (d < 256) d = md->lcc[d];
2914 if (fc == d) break;
2915 eptr += len;
2916 }
2917 if (possessive) continue;
2918 for(;;)
2919 {
2920 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2921 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2922 if (eptr-- == pp) break; /* Stop if tried at original pos */
2923 BACKCHAR(eptr);
2924 }
2925 }
2926 else
2927 #endif
2928 /* Not UTF-8 mode */
2929 {
2930 for (i = min; i < max; i++)
2931 {
2932 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2933 eptr++;
2934 }
2935 if (possessive) continue;
2936 while (eptr >= pp)
2937 {
2938 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2939 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2940 eptr--;
2941 }
2942 }
2943
2944 RRETURN(MATCH_NOMATCH);
2945 }
2946 /* Control never gets here */
2947 }
2948
2949 /* Caseful comparisons */
2950
2951 else
2952 {
2953 #ifdef SUPPORT_UTF8
2954 /* UTF-8 mode */
2955 if (utf8)
2956 {
2957 register unsigned int d;
2958 for (i = 1; i <= min; i++)
2959 {
2960 if (eptr >= md->end_subject)
2961 {
2962 SCHECK_PARTIAL();
2963 RRETURN(MATCH_NOMATCH);
2964 }
2965 GETCHARINC(d, eptr);
2966 if (fc == d) RRETURN(MATCH_NOMATCH);
2967 }
2968 }
2969 else
2970 #endif
2971 /* Not UTF-8 mode */
2972 {
2973 for (i = 1; i <= min; i++)
2974 {
2975 if (eptr >= md->end_subject)
2976 {
2977 SCHECK_PARTIAL();
2978 RRETURN(MATCH_NOMATCH);
2979 }
2980 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2981 }
2982 }
2983
2984 if (min == max) continue;
2985
2986 if (minimize)
2987 {
2988 #ifdef SUPPORT_UTF8
2989 /* UTF-8 mode */
2990 if (utf8)
2991 {
2992 register unsigned int d;
2993 for (fi = min;; fi++)
2994 {
2995 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2996 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2997 if (fi >= max) RRETURN(MATCH_NOMATCH);
2998 if (eptr >= md->end_subject)
2999 {
3000 SCHECK_PARTIAL();
3001 RRETURN(MATCH_NOMATCH);
3002 }
3003 GETCHARINC(d, eptr);
3004 if (fc == d) RRETURN(MATCH_NOMATCH);
3005 }
3006 }
3007 else
3008 #endif
3009 /* Not UTF-8 mode */
3010 {
3011 for (fi = min;; fi++)
3012 {
3013 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3014 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3015 if (fi >= max) RRETURN(MATCH_NOMATCH);
3016 if (eptr >= md->end_subject)
3017 {
3018 SCHECK_PARTIAL();
3019 RRETURN(MATCH_NOMATCH);
3020 }
3021 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3022 }
3023 }
3024 /* Control never gets here */
3025 }
3026
3027 /* Maximize case */
3028
3029 else
3030 {
3031 pp = eptr;
3032
3033 #ifdef SUPPORT_UTF8
3034 /* UTF-8 mode */
3035 if (utf8)
3036 {
3037 register unsigned int d;
3038 for (i = min; i < max; i++)
3039 {
3040 int len = 1;
3041 if (eptr >= md->end_subject) break;
3042 GETCHARLEN(d, eptr, len);
3043 if (fc == d) break;
3044 eptr += len;
3045 }
3046 if (possessive) continue;
3047 for(;;)
3048 {
3049 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3050 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3051 if (eptr-- == pp) break; /* Stop if tried at original pos */
3052 BACKCHAR(eptr);
3053 }
3054 }
3055 else
3056 #endif
3057 /* Not UTF-8 mode */
3058 {
3059 for (i = min; i < max; i++)
3060 {
3061 if (eptr >= md->end_subject || fc == *eptr) break;
3062 eptr++;
3063 }
3064 if (possessive) continue;
3065 while (eptr >= pp)
3066 {
3067 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3068 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3069 eptr--;
3070 }
3071 }
3072
3073 RRETURN(MATCH_NOMATCH);
3074 }
3075 }
3076 /* Control never gets here */
3077
3078 /* Match a single character type repeatedly; several different opcodes
3079 share code. This is very similar to the code for single characters, but we
3080 repeat it in the interests of efficiency. */
3081
3082 case OP_TYPEEXACT:
3083 min = max = GET2(ecode, 1);
3084 minimize = TRUE;
3085 ecode += 3;
3086 goto REPEATTYPE;
3087
3088 case OP_TYPEUPTO:
3089 case OP_TYPEMINUPTO:
3090 min = 0;
3091 max = GET2(ecode, 1);
3092 minimize = *ecode == OP_TYPEMINUPTO;
3093 ecode += 3;
3094 goto REPEATTYPE;
3095
3096 case OP_TYPEPOSSTAR:
3097 possessive = TRUE;
3098 min = 0;
3099 max = INT_MAX;
3100 ecode++;
3101 goto REPEATTYPE;
3102
3103 case OP_TYPEPOSPLUS:
3104 possessive = TRUE;
3105 min = 1;
3106 max = INT_MAX;
3107 ecode++;
3108 goto REPEATTYPE;
3109
3110 case OP_TYPEPOSQUERY:
3111 possessive = TRUE;
3112 min = 0;
3113 max = 1;
3114 ecode++;
3115 goto REPEATTYPE;
3116
3117 case OP_TYPEPOSUPTO:
3118 possessive = TRUE;
3119 min = 0;
3120 max = GET2(ecode, 1);
3121 ecode += 3;
3122 goto REPEATTYPE;
3123
3124 case OP_TYPESTAR:
3125 case OP_TYPEMINSTAR:
3126 case OP_TYPEPLUS:
3127 case OP_TYPEMINPLUS:
3128 case OP_TYPEQUERY:
3129 case OP_TYPEMINQUERY:
3130 c = *ecode++ - OP_TYPESTAR;
3131 minimize = (c & 1) != 0;
3132 min = rep_min[c]; /* Pick up values from tables; */
3133 max = rep_max[c]; /* zero for max => infinity */
3134 if (max == 0) max = INT_MAX;
3135
3136 /* Common code for all repeated single character type matches. Note that
3137 in UTF-8 mode, '.' matches a character of any length, but for the other
3138 character types, the valid characters are all one-byte long. */
3139
3140 REPEATTYPE:
3141 ctype = *ecode++; /* Code for the character type */
3142
3143 #ifdef SUPPORT_UCP
3144 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3145 {
3146 prop_fail_result = ctype == OP_NOTPROP;
3147 prop_type = *ecode++;
3148 prop_value = *ecode++;
3149 }
3150 else prop_type = -1;
3151 #endif
3152
3153 /* First, ensure the minimum number of matches are present. Use inline
3154 code for maximizing the speed, and do the type test once at the start
3155 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3156 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3157 and single-bytes. */
3158
3159 if (min > 0)
3160 {
3161 #ifdef SUPPORT_UCP
3162 if (prop_type >= 0)
3163 {
3164 switch(prop_type)
3165 {
3166 case PT_ANY:
3167 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3168 for (i = 1; i <= min; i++)
3169 {
3170 if (eptr >= md->end_subject)
3171 {
3172 SCHECK_PARTIAL();
3173 RRETURN(MATCH_NOMATCH);
3174 }
3175 GETCHARINCTEST(c, eptr);
3176 }
3177 break;
3178
3179 case PT_LAMP:
3180 for (i = 1; i <= min; i++)
3181 {
3182 if (eptr >= md->end_subject)
3183 {
3184 SCHECK_PARTIAL();
3185 RRETURN(MATCH_NOMATCH);
3186 }
3187 GETCHARINCTEST(c, eptr);
3188 prop_chartype = UCD_CHARTYPE(c);
3189 if ((prop_chartype == ucp_Lu ||
3190 prop_chartype == ucp_Ll ||
3191 prop_chartype == ucp_Lt) == prop_fail_result)
3192 RRETURN(MATCH_NOMATCH);
3193 }
3194 break;
3195
3196 case PT_GC:
3197 for (i = 1; i <= min; i++)
3198 {
3199 if (eptr >= md->end_subject)
3200 {
3201 SCHECK_PARTIAL();
3202 RRETURN(MATCH_NOMATCH);
3203 }
3204 GETCHARINCTEST(c, eptr);
3205 prop_category = UCD_CATEGORY(c);
3206 if ((prop_category == prop_value) == prop_fail_result)
3207 RRETURN(MATCH_NOMATCH);
3208 }
3209 break;
3210
3211 case PT_PC:
3212 for (i = 1; i <= min; i++)
3213 {
3214 if (eptr >= md->end_subject)
3215 {
3216 SCHECK_PARTIAL();
3217 RRETURN(MATCH_NOMATCH);
3218 }
3219 GETCHARINCTEST(c, eptr);
3220 prop_chartype = UCD_CHARTYPE(c);
3221 if ((prop_chartype == prop_value) == prop_fail_result)
3222 RRETURN(MATCH_NOMATCH);
3223 }
3224 break;
3225
3226 case PT_SC:
3227 for (i = 1; i <= min; i++)
3228 {
3229 if (eptr >= md->end_subject)
3230 {
3231 SCHECK_PARTIAL();
3232 RRETURN(MATCH_NOMATCH);
3233 }
3234 GETCHARINCTEST(c, eptr);
3235 prop_script = UCD_SCRIPT(c);
3236 if ((prop_script == prop_value) == prop_fail_result)
3237 RRETURN(MATCH_NOMATCH);
3238 }
3239 break;
3240
3241 default:
3242 RRETURN(PCRE_ERROR_INTERNAL);
3243 }
3244 }
3245
3246 /* Match extended Unicode sequences. We will get here only if the
3247 support is in the binary; otherwise a compile-time error occurs. */
3248
3249 else if (ctype == OP_EXTUNI)
3250 {
3251 for (i = 1; i <= min; i++)
3252 {
3253 if (eptr >= md->end_subject)
3254 {
3255 SCHECK_PARTIAL();
3256 RRETURN(MATCH_NOMATCH);
3257 }
3258 GETCHARINCTEST(c, eptr);
3259 prop_category = UCD_CATEGORY(c);
3260 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3261 while (eptr < md->end_subject)
3262 {
3263 int len = 1;
3264 if (!utf8) c = *eptr;
3265 else { GETCHARLEN(c, eptr, len); }
3266 prop_category = UCD_CATEGORY(c);
3267 if (prop_category != ucp_M) break;
3268 eptr += len;
3269 }
3270 }
3271 }
3272
3273 else
3274 #endif /* SUPPORT_UCP */
3275
3276 /* Handle all other cases when the coding is UTF-8 */
3277
3278 #ifdef SUPPORT_UTF8
3279 if (utf8) switch(ctype)
3280 {
3281 case OP_ANY:
3282 for (i = 1; i <= min; i++)
3283 {
3284 if (eptr >= md->end_subject)
3285 {
3286 SCHECK_PARTIAL();
3287 RRETURN(MATCH_NOMATCH);
3288 }
3289 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3290 eptr++;
3291 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3292 }
3293 break;
3294
3295 case OP_ALLANY:
3296 for (i = 1; i <= min; i++)
3297 {
3298 if (eptr >= md->end_subject)
3299 {
3300 SCHECK_PARTIAL();
3301 RRETURN(MATCH_NOMATCH);
3302 }
3303 eptr++;
3304 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3305 }
3306 break;
3307
3308 case OP_ANYBYTE:
3309 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3310 eptr += min;
3311 break;
3312
3313 case OP_ANYNL:
3314 for (i = 1; i <= min; i++)
3315 {
3316 if (eptr >= md->end_subject)
3317 {
3318 SCHECK_PARTIAL();
3319 RRETURN(MATCH_NOMATCH);
3320 }
3321 GETCHARINC(c, eptr);
3322 switch(c)
3323 {
3324 default: RRETURN(MATCH_NOMATCH);
3325 case 0x000d:
3326 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3327 break;
3328
3329 case 0x000a:
3330 break;
3331
3332 case 0x000b:
3333 case 0x000c:
3334 case 0x0085:
3335 case 0x2028:
3336 case 0x2029:
3337 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3338 break;
3339 }
3340 }
3341 break;
3342
3343 case OP_NOT_HSPACE:
3344 for (i = 1; i <= min; i++)
3345 {
3346 if (eptr >= md->end_subject)
3347 {
3348 SCHECK_PARTIAL();
3349 RRETURN(MATCH_NOMATCH);
3350 }
3351 GETCHARINC(c, eptr);
3352 switch(c)
3353 {
3354 default: break;
3355 case 0x09: /* HT */
3356 case 0x20: /* SPACE */
3357 case 0xa0: /* NBSP */
3358 case 0x1680: /* OGHAM SPACE MARK */
3359 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3360 case 0x2000: /* EN QUAD */
3361 case 0x2001: /* EM QUAD */
3362 case 0x2002: /* EN SPACE */
3363 case 0x2003: /* EM SPACE */
3364 case 0x2004: /* THREE-PER-EM SPACE */
3365 case 0x2005: /* FOUR-PER-EM SPACE */
3366 case 0x2006: /* SIX-PER-EM SPACE */
3367 case 0x2007: /* FIGURE SPACE */
3368 case 0x2008: /* PUNCTUATION SPACE */
3369 case 0x2009: /* THIN SPACE */
3370 case 0x200A: /* HAIR SPACE */
3371 case 0x202f: /* NARROW NO-BREAK SPACE */
3372 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3373 case 0x3000: /* IDEOGRAPHIC SPACE */
3374 RRETURN(MATCH_NOMATCH);
3375 }
3376 }
3377 break;
3378
3379 case OP_HSPACE:
3380 for (i = 1; i <= min; i++)
3381 {
3382 if (eptr >= md->end_subject)
3383 {
3384 SCHECK_PARTIAL();
3385 RRETURN(MATCH_NOMATCH);
3386 }
3387 GETCHARINC(c, eptr);
3388 switch(c)
3389 {
3390 default: RRETURN(MATCH_NOMATCH);
3391 case 0x09: /* HT */
3392 case 0x20: /* SPACE */
3393 case 0xa0: /* NBSP */
3394 case 0x1680: /* OGHAM SPACE MARK */
3395 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3396 case 0x2000: /* EN QUAD */
3397 case 0x2001: /* EM QUAD */
3398 case 0x2002: /* EN SPACE */
3399 case 0x2003: /* EM SPACE */
3400 case 0x2004: /* THREE-PER-EM SPACE */
3401 case 0x2005: /* FOUR-PER-EM SPACE */
3402 case 0x2006: /* SIX-PER-EM SPACE */
3403 case 0x2007: /* FIGURE SPACE */
3404 case 0x2008: /* PUNCTUATION SPACE */
3405 case 0x2009: /* THIN SPACE */
3406 case 0x200A: /* HAIR SPACE */
3407 case 0x202f: /* NARROW NO-BREAK SPACE */
3408 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3409 case 0x3000: /* IDEOGRAPHIC SPACE */
3410 break;
3411 }
3412 }
3413 break;
3414
3415 case OP_NOT_VSPACE:
3416 for (i = 1; i <= min; i++)
3417 {
3418 if (eptr >= md->end_subject)
3419 {
3420 SCHECK_PARTIAL();
3421 RRETURN(MATCH_NOMATCH);
3422 }
3423 GETCHARINC(c, eptr);
3424 switch(c)
3425 {
3426 default: break;
3427 case 0x0a: /* LF */
3428 case 0x0b: /* VT */
3429 case 0x0c: /* FF */
3430 case 0x0d: /* CR */
3431 case 0x85: /* NEL */
3432 case 0x2028: /* LINE SEPARATOR */
3433 case 0x2029: /* PARAGRAPH SEPARATOR */
3434 RRETURN(MATCH_NOMATCH);
3435 }
3436 }
3437 break;
3438
3439 case OP_VSPACE:
3440 for (i = 1; i <= min; i++)
3441 {
3442 if (eptr >= md->end_subject)
3443 {
3444 SCHECK_PARTIAL();
3445 RRETURN(MATCH_NOMATCH);
3446 }
3447 GETCHARINC(c, eptr);
3448 switch(c)
3449 {
3450 default: RRETURN(MATCH_NOMATCH);
3451 case 0x0a: /* LF */
3452 case 0x0b: /* VT */
3453 case 0x0c: /* FF */
3454 case 0x0d: /* CR */
3455 case 0x85: /* NEL */
3456 case 0x2028: /* LINE SEPARATOR */
3457 case 0x2029: /* PARAGRAPH SEPARATOR */
3458 break;
3459 }
3460 }
3461 break;
3462
3463 case OP_NOT_DIGIT:
3464 for (i = 1; i <= min; i++)
3465 {
3466 if (eptr >= md->end_subject)
3467 {
3468 SCHECK_PARTIAL();
3469 RRETURN(MATCH_NOMATCH);
3470 }
3471 GETCHARINC(c, eptr);
3472 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3473 RRETURN(MATCH_NOMATCH);
3474 }
3475 break;
3476
3477 case OP_DIGIT:
3478 for (i = 1; i <= min; i++)
3479 {
3480 if (eptr >= md->end_subject)
3481 {
3482 SCHECK_PARTIAL();
3483 RRETURN(MATCH_NOMATCH);
3484 }
3485 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3486 RRETURN(MATCH_NOMATCH);
3487 /* No need to skip more bytes - we know it's a 1-byte character */
3488 }
3489 break;
3490
3491 case OP_NOT_WHITESPACE:
3492 for (i = 1; i <= min; i++)
3493 {
3494 if (eptr >= md->end_subject)
3495 {
3496 SCHECK_PARTIAL();
3497 RRETURN(MATCH_NOMATCH);
3498 }
3499 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3500 RRETURN(MATCH_NOMATCH);
3501 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3502 }
3503 break;
3504
3505 case OP_WHITESPACE:
3506 for (i = 1; i <= min; i++)
3507 {
3508 if (eptr >= md->end_subject)
3509 {
3510 SCHECK_PARTIAL();
3511 RRETURN(MATCH_NOMATCH);
3512 }
3513 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3514 RRETURN(MATCH_NOMATCH);
3515 /* No need to skip more bytes - we know it's a 1-byte character */
3516 }
3517 break;
3518
3519 case OP_NOT_WORDCHAR:
3520 for (i = 1; i <= min; i++)
3521 {
3522 if (eptr >= md->end_subject ||
3523 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3524 RRETURN(MATCH_NOMATCH);
3525 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3526 }
3527 break;
3528
3529 case OP_WORDCHAR:
3530 for (i = 1; i <= min; i++)
3531 {
3532 if (eptr >= md->end_subject)
3533 {
3534 SCHECK_PARTIAL();
3535 RRETURN(MATCH_NOMATCH);
3536 }
3537 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3538 RRETURN(MATCH_NOMATCH);
3539 /* No need to skip more bytes - we know it's a 1-byte character */
3540 }
3541 break;
3542
3543 default:
3544 RRETURN(PCRE_ERROR_INTERNAL);
3545 } /* End switch(ctype) */
3546
3547 else
3548 #endif /* SUPPORT_UTF8 */
3549
3550 /* Code for the non-UTF-8 case for minimum matching of operators other
3551 than OP_PROP and OP_NOTPROP. */
3552
3553 switch(ctype)
3554 {
3555 case OP_ANY:
3556 for (i = 1; i <= min; i++)
3557 {
3558 if (eptr >= md->end_subject)
3559 {
3560 SCHECK_PARTIAL();
3561 RRETURN(MATCH_NOMATCH);
3562 }
3563 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3564 eptr++;
3565 }
3566 break;
3567
3568 case OP_ALLANY:
3569 if (eptr > md->end_subject - min)
3570 {
3571 SCHECK_PARTIAL();
3572 RRETURN(MATCH_NOMATCH);
3573 }
3574 eptr += min;
3575 break;
3576
3577 case OP_ANYBYTE:
3578 if (eptr > md->end_subject - min)
3579 {
3580 SCHECK_PARTIAL();
3581 RRETURN(MATCH_NOMATCH);
3582 }
3583 eptr += min;
3584 break;
3585
3586 case OP_ANYNL:
3587 for (i = 1; i <= min; i++)
3588 {
3589 if (eptr >= md->end_subject)
3590 {
3591 SCHECK_PARTIAL();
3592 RRETURN(MATCH_NOMATCH);
3593 }
3594 switch(*eptr++)
3595 {
3596 default: RRETURN(MATCH_NOMATCH);
3597 case 0x000d:
3598 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3599 break;
3600 case 0x000a:
3601 break;
3602
3603 case 0x000b:
3604 case 0x000c:
3605 case 0x0085:
3606 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3607 break;
3608 }
3609 }
3610 break;
3611
3612 case OP_NOT_HSPACE:
3613 for (i = 1; i <= min; i++)
3614 {
3615 if (eptr >= md->end_subject)
3616 {
3617 SCHECK_PARTIAL();
3618 RRETURN(MATCH_NOMATCH);
3619 }
3620 switch(*eptr++)
3621 {
3622 default: break;
3623 case 0x09: /* HT */
3624 case 0x20: /* SPACE */
3625 case 0xa0: /* NBSP */
3626 RRETURN(MATCH_NOMATCH);
3627 }
3628 }
3629 break;
3630
3631 case OP_HSPACE:
3632 for (i = 1; i <= min; i++)
3633 {
3634 if (eptr >= md->end_subject)
3635 {
3636 SCHECK_PARTIAL();
3637 RRETURN(MATCH_NOMATCH);
3638 }
3639 switch(*eptr++)
3640 {
3641 default: RRETURN(MATCH_NOMATCH);
3642 case 0x09: /* HT */
3643 case 0x20: /* SPACE */
3644 case 0xa0: /* NBSP */
3645 break;
3646 }
3647 }
3648 break;
3649
3650 case OP_NOT_VSPACE:
3651 for (i = 1; i <= min; i++)
3652 {
3653 if (eptr >= md->end_subject)
3654 {
3655 SCHECK_PARTIAL();
3656 RRETURN(MATCH_NOMATCH);
3657 }
3658 switch(*eptr++)
3659 {
3660 default: break;
3661 case 0x0a: /* LF */
3662 case 0x0b: /* VT */
3663 case 0x0c: /* FF */
3664 case 0x0d: /* CR */
3665 case 0x85: /* NEL */
3666 RRETURN(MATCH_NOMATCH);
3667 }
3668 }
3669 break;
3670
3671 case OP_VSPACE:
3672 for (i = 1; i <= min; i++)
3673 {
3674 if (eptr >= md->end_subject)
3675 {
3676 SCHECK_PARTIAL();
3677 RRETURN(MATCH_NOMATCH);
3678 }
3679 switch(*eptr++)
3680 {
3681 default: RRETURN(MATCH_NOMATCH);
3682 case 0x0a: /* LF */
3683 case 0x0b: /* VT */
3684 case 0x0c: /* FF */
3685 case 0x0d: /* CR */
3686 case 0x85: /* NEL */
3687 break;
3688 }
3689 }
3690 break;
3691
3692 case OP_NOT_DIGIT:
3693 for (i = 1; i <= min; i++)
3694 {
3695 if (eptr >= md->end_subject)
3696 {
3697 SCHECK_PARTIAL();
3698 RRETURN(MATCH_NOMATCH);
3699 }
3700 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3701 }
3702 break;
3703
3704 case OP_DIGIT:
3705 for (i = 1; i <= min; i++)
3706 {
3707 if (eptr >= md->end_subject)
3708 {
3709 SCHECK_PARTIAL();
3710 RRETURN(MATCH_NOMATCH);
3711 }
3712 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3713 }
3714 break;
3715
3716 case OP_NOT_WHITESPACE:
3717 for (i = 1; i <= min; i++)
3718 {
3719 if (eptr >= md->end_subject)
3720 {
3721 SCHECK_PARTIAL();
3722 RRETURN(MATCH_NOMATCH);
3723 }
3724 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3725 }
3726 break;
3727
3728 case OP_WHITESPACE:
3729 for (i = 1; i <= min; i++)
3730 {
3731 if (eptr >= md->end_subject)
3732 {
3733 SCHECK_PARTIAL();
3734 RRETURN(MATCH_NOMATCH);
3735 }
3736 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3737 }
3738 break;
3739
3740 case OP_NOT_WORDCHAR:
3741 for (i = 1; i <= min; i++)
3742 {
3743 if (eptr >= md->end_subject)
3744 {
3745 SCHECK_PARTIAL();
3746 RRETURN(MATCH_NOMATCH);
3747 }
3748 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3749 RRETURN(MATCH_NOMATCH);
3750 }
3751 break;
3752
3753 case OP_WORDCHAR:
3754 for (i = 1; i <= min; i++)
3755 {
3756 if (eptr >= md->end_subject)
3757 {
3758 SCHECK_PARTIAL();
3759 RRETURN(MATCH_NOMATCH);
3760 }
3761 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3762 RRETURN(MATCH_NOMATCH);
3763 }
3764 break;
3765
3766 default:
3767 RRETURN(PCRE_ERROR_INTERNAL);
3768 }
3769 }
3770
3771 /* If min = max, continue at the same level without recursing */
3772
3773 if (min == max) continue;
3774
3775 /* If minimizing, we have to test the rest of the pattern before each
3776 subsequent match. Again, separate the UTF-8 case for speed, and also
3777 separate the UCP cases. */
3778
3779 if (minimize)
3780 {
3781 #ifdef SUPPORT_UCP
3782 if (prop_type >= 0)
3783 {
3784 switch(prop_type)
3785 {
3786 case PT_ANY:
3787 for (fi = min;; fi++)
3788 {
3789 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3790 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3791 if (fi >= max) RRETURN(MATCH_NOMATCH);
3792 if (eptr >= md->end_subject)
3793 {
3794 SCHECK_PARTIAL();
3795 RRETURN(MATCH_NOMATCH);
3796 }
3797 GETCHARINC(c, eptr);
3798 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3799 }
3800 /* Control never gets here */
3801
3802 case PT_LAMP:
3803 for (fi = min;; fi++)
3804 {
3805 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3806 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3807 if (fi >= max) RRETURN(MATCH_NOMATCH);
3808 if (eptr >= md->end_subject)
3809 {
3810 SCHECK_PARTIAL();
3811 RRETURN(MATCH_NOMATCH);
3812 }
3813 GETCHARINC(c, eptr);
3814 prop_chartype = UCD_CHARTYPE(c);
3815 if ((prop_chartype == ucp_Lu ||
3816 prop_chartype == ucp_Ll ||
3817 prop_chartype == ucp_Lt) == prop_fail_result)
3818 RRETURN(MATCH_NOMATCH);
3819 }
3820 /* Control never gets here */
3821
3822 case PT_GC:
3823 for (fi = min;; fi++)
3824 {
3825 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3826 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3827 if (fi >= max) RRETURN(MATCH_NOMATCH);
3828 if (eptr >= md->end_subject)
3829 {
3830 SCHECK_PARTIAL();
3831 RRETURN(MATCH_NOMATCH);
3832 }
3833 GETCHARINC(c, eptr);
3834 prop_category = UCD_CATEGORY(c);
3835 if ((prop_category == prop_value) == prop_fail_result)
3836 RRETURN(MATCH_NOMATCH);
3837 }
3838 /* Control never gets here */
3839
3840 case PT_PC:
3841 for (fi = min;; fi++)
3842 {
3843 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3844 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3845 if (fi >= max) RRETURN(MATCH_NOMATCH);
3846 if (eptr >= md->end_subject)
3847 {
3848 SCHECK_PARTIAL();
3849 RRETURN(MATCH_NOMATCH);
3850 }
3851 GETCHARINC(c, eptr);
3852 prop_chartype = UCD_CHARTYPE(c);
3853 if ((prop_chartype == prop_value) == prop_fail_result)
3854 RRETURN(MATCH_NOMATCH);
3855 }
3856 /* Control never gets here */
3857
3858 case PT_SC:
3859 for (fi = min;; fi++)
3860 {
3861 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3862 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3863 if (fi >= max) RRETURN(MATCH_NOMATCH);
3864 if (eptr >= md->end_subject)
3865 {
3866 SCHECK_PARTIAL();
3867 RRETURN(MATCH_NOMATCH);
3868 }
3869 GETCHARINC(c, eptr);
3870 prop_script = UCD_SCRIPT(c);
3871 if ((prop_script == prop_value) == prop_fail_result)
3872 RRETURN(MATCH_NOMATCH);
3873 }
3874 /* Control never gets here */
3875
3876 default:
3877 RRETURN(PCRE_ERROR_INTERNAL);
3878 }
3879 }
3880
3881 /* Match extended Unicode sequences. We will get here only if the
3882 support is in the binary; otherwise a compile-time error occurs. */
3883
3884 else if (ctype == OP_EXTUNI)
3885 {
3886 for (fi = min;; fi++)
3887 {
3888 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3889 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3890 if (fi >= max) RRETURN(MATCH_NOMATCH);
3891 if (eptr >= md->end_subject)
3892 {
3893 SCHECK_PARTIAL();
3894 RRETURN(MATCH_NOMATCH);
3895 }
3896 GETCHARINCTEST(c, eptr);
3897 prop_category = UCD_CATEGORY(c);
3898 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3899 while (eptr < md->end_subject)
3900 {
3901 int len = 1;
3902 if (!utf8) c = *eptr;
3903 else { GETCHARLEN(c, eptr, len); }
3904 prop_category = UCD_CATEGORY(c);
3905 if (prop_category != ucp_M) break;
3906 eptr += len;
3907 }
3908 }
3909 }
3910
3911 else
3912 #endif /* SUPPORT_UCP */
3913
3914 #ifdef SUPPORT_UTF8
3915 /* UTF-8 mode */
3916 if (utf8)
3917 {
3918 for (fi = min;; fi++)
3919 {
3920 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3921 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3922 if (fi >= max) RRETURN(MATCH_NOMATCH);
3923 if (eptr >= md->end_subject)
3924 {
3925 SCHECK_PARTIAL();
3926 RRETURN(MATCH_NOMATCH);
3927 }
3928 if (ctype == OP_ANY && IS_NEWLINE(eptr))
3929 RRETURN(MATCH_NOMATCH);
3930 GETCHARINC(c, eptr);
3931 switch(ctype)
3932 {
3933 case OP_ANY: /* This is the non-NL case */
3934 case OP_ALLANY:
3935 case OP_ANYBYTE:
3936 break;
3937
3938 case OP_ANYNL:
3939 switch(c)
3940 {
3941 default: RRETURN(MATCH_NOMATCH);
3942 case 0x000d:
3943 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3944 break;
3945 case 0x000a:
3946 break;
3947
3948 case 0x000b:
3949 case 0x000c:
3950 case 0x0085:
3951 case 0x2028:
3952 case 0x2029:
3953 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3954 break;
3955 }
3956 break;
3957
3958 case OP_NOT_HSPACE:
3959 switch(c)
3960 {
3961 default: break;
3962 case 0x09: /* HT */
3963 case 0x20: /* SPACE */
3964 case 0xa0: /* NBSP */
3965 case 0x1680: /* OGHAM SPACE MARK */
3966 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3967 case 0x2000: /* EN QUAD */
3968 case 0x2001: /* EM QUAD */
3969 case 0x2002: /* EN SPACE */
3970 case 0x2003: /* EM SPACE */
3971 case 0x2004: /* THREE-PER-EM SPACE */
3972 case 0x2005: /* FOUR-PER-EM SPACE */
3973 case 0x2006: /* SIX-PER-EM SPACE */
3974 case 0x2007: /* FIGURE SPACE */
3975 case 0x2008: /* PUNCTUATION SPACE */
3976 case 0x2009: /* THIN SPACE */
3977 case 0x200A: /* HAIR SPACE */
3978 case 0x202f: /* NARROW NO-BREAK SPACE */
3979 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3980 case 0x3000: /* IDEOGRAPHIC SPACE */
3981 RRETURN(MATCH_NOMATCH);
3982 }
3983 break;
3984
3985 case OP_HSPACE:
3986 switch(c)
3987 {
3988 default: RRETURN(MATCH_NOMATCH);
3989 case 0x09: /* HT */
3990 case 0x20: /* SPACE */
3991 case 0xa0: /* NBSP */
3992 case 0x1680: /* OGHAM SPACE MARK */
3993 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3994 case 0x2000: /* EN QUAD */
3995 case 0x2001: /* EM QUAD */
3996 case 0x2002: /* EN SPACE */
3997 case 0x2003: /* EM SPACE */
3998 case 0x2004: /* THREE-PER-EM SPACE */
3999 case 0x2005: /* FOUR-PER-EM SPACE */
4000 case 0x2006: /* SIX-PER-EM SPACE */
4001 case 0x2007: /* FIGURE SPACE */
4002 case 0x2008: /* PUNCTUATION SPACE */
4003 case 0x2009: /* THIN SPACE */
4004 case 0x200A: /* HAIR SPACE */
4005 case 0x202f: /* NARROW NO-BREAK SPACE */
4006 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4007 case 0x3000: /* IDEOGRAPHIC SPACE */
4008 break;
4009 }
4010 break;
4011
4012 case OP_NOT_VSPACE:
4013 switch(c)
4014 {
4015 default: break;
4016 case 0x0a: /* LF */
4017 case 0x0b: /* VT */
4018 case 0x0c: /* FF */
4019 case 0x0d: /* CR */
4020 case 0x85: /* NEL */
4021 case 0x2028: /* LINE SEPARATOR */
4022 case 0x2029: /* PARAGRAPH SEPARATOR */
4023 RRETURN(MATCH_NOMATCH);
4024 }
4025 break;
4026
4027 case OP_VSPACE:
4028 switch(c)
4029 {
4030 default: RRETURN(MATCH_NOMATCH);
4031 case 0x0a: /* LF */
4032 case 0x0b: /* VT */
4033 case 0x0c: /* FF */
4034 case 0x0d: /* CR */
4035 case 0x85: /* NEL */
4036 case 0x2028: /* LINE SEPARATOR */
4037 case 0x2029: /* PARAGRAPH SEPARATOR */
4038 break;
4039 }
4040 break;
4041
4042 case OP_NOT_DIGIT:
4043 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4044 RRETURN(MATCH_NOMATCH);
4045 break;
4046
4047 case OP_DIGIT:
4048 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4049 RRETURN(MATCH_NOMATCH);
4050 break;
4051
4052 case OP_NOT_WHITESPACE:
4053 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4054 RRETURN(MATCH_NOMATCH);
4055 break;
4056
4057 case OP_WHITESPACE:
4058 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4059 RRETURN(MATCH_NOMATCH);
4060 break;
4061
4062 case OP_NOT_WORDCHAR:
4063 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4064 RRETURN(MATCH_NOMATCH);
4065 break;
4066
4067 case OP_WORDCHAR:
4068 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4069 RRETURN(MATCH_NOMATCH);
4070 break;
4071
4072 default:
4073 RRETURN(PCRE_ERROR_INTERNAL);
4074 }
4075 }
4076 }
4077 else
4078 #endif
4079 /* Not UTF-8 mode */
4080 {
4081 for (fi = min;; fi++)
4082 {
4083 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4084 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4085 if (fi >= max) RRETURN(MATCH_NOMATCH);
4086 if (eptr >= md->end_subject)
4087 {
4088 SCHECK_PARTIAL();
4089 RRETURN(MATCH_NOMATCH);
4090 }
4091 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4092 RRETURN(MATCH_NOMATCH);
4093 c = *eptr++;
4094 switch(ctype)
4095 {
4096 case OP_ANY: /* This is the non-NL case */
4097 case OP_ALLANY:
4098 case OP_ANYBYTE:
4099 break;
4100
4101 case OP_ANYNL:
4102 switch(c)
4103 {
4104 default: RRETURN(MATCH_NOMATCH);
4105 case 0x000d:
4106 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4107 break;
4108
4109 case 0x000a:
4110 break;
4111
4112 case 0x000b:
4113 case 0x000c:
4114 case 0x0085:
4115 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4116 break;
4117 }
4118 break;
4119
4120 case OP_NOT_HSPACE:
4121 switch(c)
4122 {
4123 default: break;
4124 case 0x09: /* HT */
4125 case 0x20: /* SPACE */
4126 case 0xa0: /* NBSP */
4127 RRETURN(MATCH_NOMATCH);
4128 }
4129 break;
4130
4131 case OP_HSPACE:
4132 switch(c)
4133 {
4134 default: RRETURN(MATCH_NOMATCH);
4135 case 0x09: /* HT */
4136 case 0x20: /* SPACE */
4137 case 0xa0: /* NBSP */
4138 break;
4139 }
4140 break;
4141
4142 case OP_NOT_VSPACE:
4143 switch(c)
4144 {
4145 default: break;
4146 case 0x0a: /* LF */
4147 case 0x0b: /* VT */
4148 case 0x0c: /* FF */
4149 case 0x0d: /* CR */
4150 case 0x85: /* NEL */
4151 RRETURN(MATCH_NOMATCH);
4152 }
4153 break;
4154
4155 case OP_VSPACE:
4156 switch(c)
4157 {
4158 default: RRETURN(MATCH_NOMATCH);
4159 case 0x0a: /* LF */
4160 case 0x0b: /* VT */
4161 case 0x0c: /* FF */
4162 case 0x0d: /* CR */
4163 case 0x85: /* NEL */
4164 break;
4165 }
4166 break;
4167
4168 case OP_NOT_DIGIT:
4169 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
4170 break;
4171
4172 case OP_DIGIT:
4173 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
4174 break;
4175
4176 case OP_NOT_WHITESPACE:
4177 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
4178 break;
4179
4180 case OP_WHITESPACE:
4181 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
4182 break;
4183
4184 case OP_NOT_WORDCHAR:
4185 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
4186 break;
4187
4188 case OP_WORDCHAR:
4189 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
4190 break;
4191
4192 default:
4193 RRETURN(PCRE_ERROR_INTERNAL);
4194 }
4195 }
4196 }
4197 /* Control never gets here */
4198 }
4199
4200 /* If maximizing, it is worth using inline code for speed, doing the type
4201 test once at the start (i.e. keep it out of the loop). Again, keep the
4202 UTF-8 and UCP stuff separate. */
4203
4204 else
4205 {
4206 pp = eptr; /* Remember where we started */
4207
4208 #ifdef SUPPORT_UCP
4209 if (prop_type >= 0)
4210 {
4211 switch(prop_type)
4212 {
4213 case PT_ANY:
4214 for (i = min; i < max; i++)
4215 {
4216 int len = 1;
4217 if (eptr >= md->end_subject) break;
4218 GETCHARLEN(c, eptr, len);
4219 if (prop_fail_result) break;
4220 eptr+= len;
4221 }
4222 break;
4223
4224 case PT_LAMP:
4225 for (i = min; i < max; i++)
4226 {
4227 int len = 1;
4228 if (eptr >= md->end_subject) break;
4229 GETCHARLEN(c, eptr, len);
4230 prop_chartype = UCD_CHARTYPE(c);
4231 if ((prop_chartype == ucp_Lu ||
4232 prop_chartype == ucp_Ll ||
4233 prop_chartype == ucp_Lt) == prop_fail_result)
4234 break;
4235 eptr+= len;
4236 }
4237 break;
4238
4239 case PT_GC:
4240 for (i = min; i < max; i++)
4241 {
4242 int len = 1;
4243 if (eptr >= md->end_subject) break;
4244 GETCHARLEN(c, eptr, len);
4245 prop_category = UCD_CATEGORY(c);
4246 if ((prop_category == prop_value) == prop_fail_result)
4247 break;
4248 eptr+= len;
4249 }
4250 break;
4251
4252 case PT_PC:
4253 for (i = min; i < max; i++)
4254 {
4255 int len = 1;
4256 if (eptr >= md->end_subject) break;
4257 GETCHARLEN(c, eptr, len);
4258 prop_chartype = UCD_CHARTYPE(c);
4259 if ((prop_chartype == prop_value) == prop_fail_result)
4260 break;
4261 eptr+= len;
4262 }
4263 break;
4264
4265 case PT_SC:
4266 for (i = min; i < max; i++)
4267 {
4268 int len = 1;
4269 if (eptr >= md->end_subject) break;
4270 GETCHARLEN(c, eptr, len);
4271 prop_script = UCD_SCRIPT(c);
4272 if ((prop_script == prop_value) == prop_fail_result)
4273 break;
4274 eptr+= len;
4275 }
4276 break;
4277 }
4278
4279 /* eptr is now past the end of the maximum run */
4280
4281 if (possessive) continue;
4282 for(;;)
4283 {
4284 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4285 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4286 if (eptr-- == pp) break; /* Stop if tried at original pos */
4287 if (utf8) BACKCHAR(eptr);
4288 }
4289 }
4290
4291 /* Match extended Unicode sequences. We will get here only if the
4292 support is in the binary; otherwise a compile-time error occurs. */
4293
4294 else if (ctype == OP_EXTUNI)
4295 {
4296 for (i = min; i < max; i++)
4297 {
4298 if (eptr >= md->end_subject) break;
4299 GETCHARINCTEST(c, eptr);
4300 prop_category = UCD_CATEGORY(c);
4301 if (prop_category == ucp_M) break;
4302 while (eptr < md->end_subject)
4303 {
4304 int len = 1;
4305 if (!utf8) c = *eptr; else
4306 {
4307 GETCHARLEN(c, eptr, len);
4308 }
4309 prop_category = UCD_CATEGORY(c);
4310 if (prop_category != ucp_M) break;
4311 eptr += len;
4312 }
4313 }
4314
4315 /* eptr is now past the end of the maximum run */
4316
4317 if (possessive) continue;
4318 for(;;)
4319 {
4320 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4321 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4322 if (eptr-- == pp) break; /* Stop if tried at original pos */
4323 for (;;) /* Move back over one extended */
4324 {
4325 int len = 1;
4326 if (!utf8) c = *eptr; else
4327 {
4328 BACKCHAR(eptr);
4329 GETCHARLEN(c, eptr, len);
4330 }
4331 prop_category = UCD_CATEGORY(c);
4332 if (prop_category != ucp_M) break;
4333 eptr--;
4334 }
4335 }
4336 }
4337
4338 else
4339 #endif /* SUPPORT_UCP */
4340
4341 #ifdef SUPPORT_UTF8
4342 /* UTF-8 mode */
4343
4344 if (utf8)
4345 {
4346 switch(ctype)
4347 {
4348 case OP_ANY:
4349 if (max < INT_MAX)
4350 {
4351 for (i = min; i < max; i++)
4352 {
4353 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4354 eptr++;
4355 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4356 }
4357 }
4358
4359 /* Handle unlimited UTF-8 repeat */
4360
4361 else
4362 {
4363 for (i = min; i < max; i++)
4364 {
4365 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4366 eptr++;
4367 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4368 }
4369 }
4370 break;
4371
4372 case OP_ALLANY:
4373 if (max < INT_MAX)
4374 {
4375 for (i = min; i < max; i++)
4376 {
4377 if (eptr >= md->end_subject) break;
4378 eptr++;
4379 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4380 }
4381 }
4382 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
4383 break;
4384
4385 /* The byte case is the same as non-UTF8 */
4386
4387 case OP_ANYBYTE:
4388 c = max - min;
4389 if (c > (unsigned int)(md->end_subject - eptr))
4390 c = md->end_subject - eptr;
4391 eptr += c;
4392 break;
4393
4394 case OP_ANYNL:
4395 for (i = min; i < max; i++)
4396 {
4397 int len = 1;
4398 if (eptr >= md->end_subject) break;
4399 GETCHARLEN(c, eptr, len);
4400 if (c == 0x000d)
4401 {
4402 if (++eptr >= md->end_subject) break;
4403 if (*eptr == 0x000a) eptr++;
4404 }
4405 else
4406 {
4407 if (c != 0x000a &&
4408 (md->bsr_anycrlf ||
4409 (c != 0x000b && c != 0x000c &&
4410 c != 0x0085 && c != 0x2028 && c != 0x2029)))
4411 break;
4412 eptr += len;
4413 }
4414 }
4415 break;
4416
4417 case OP_NOT_HSPACE:
4418 case OP_HSPACE:
4419 for (i = min; i < max; i++)
4420 {
4421 BOOL gotspace;
4422 int len = 1;
4423 if (eptr >= md->end_subject) break;
4424 GETCHARLEN(c, eptr, len);
4425 switch(c)
4426 {
4427 default: gotspace = FALSE; break;
4428 case 0x09: /* HT */
4429 case 0x20: /* SPACE */
4430 case 0xa0: /* NBSP */
4431 case 0x1680: /* OGHAM SPACE MARK */
4432 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4433 case 0x2000: /* EN QUAD */
4434 case 0x2001: /* EM QUAD */
4435 case 0x2002: /* EN SPACE */
4436 case 0x2003: /* EM SPACE */
4437 case 0x2004: /* THREE-PER-EM SPACE */
4438 case 0x2005: /* FOUR-PER-EM SPACE */
4439 case 0x2006: /* SIX-PER-EM SPACE */
4440 case 0x2007: /* FIGURE SPACE */
4441 case 0x2008: /* PUNCTUATION SPACE */
4442 case 0x2009: /* THIN SPACE */
4443 case 0x200A: /* HAIR SPACE */
4444 case 0x202f: /* NARROW NO-BREAK SPACE */
4445 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4446 case 0x3000: /* IDEOGRAPHIC SPACE */
4447 gotspace = TRUE;
4448 break;
4449 }
4450 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4451 eptr += len;
4452 }
4453 break;
4454
4455 case OP_NOT_VSPACE:
4456 case OP_VSPACE:
4457 for (i = min; i < max; i++)
4458 {
4459 BOOL gotspace;
4460 int len = 1;
4461 if (eptr >= md->end_subject) break;
4462 GETCHARLEN(c, eptr, len);
4463 switch(c)
4464 {
4465 default: gotspace = FALSE; break;
4466 case 0x0a: /* LF */
4467 case 0x0b: /* VT */
4468 case 0x0c: /* FF */
4469 case 0x0d: /* CR */
4470 case 0x85: /* NEL */
4471 case 0x2028: /* LINE SEPARATOR */
4472 case 0x2029: /* PARAGRAPH SEPARATOR */
4473 gotspace = TRUE;
4474 break;
4475 }
4476 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4477 eptr += len;
4478 }
4479 break;
4480
4481 case OP_NOT_DIGIT:
4482 for (i = min; i < max; i++)
4483 {
4484 int len = 1;
4485 if (eptr >= md->end_subject) break;
4486 GETCHARLEN(c, eptr, len);
4487 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4488 eptr+= len;
4489 }
4490 break;
4491
4492 case OP_DIGIT:
4493 for (i = min; i < max; i++)
4494 {
4495 int len = 1;
4496 if (eptr >= md->end_subject) break;
4497 GETCHARLEN(c, eptr, len);
4498 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4499 eptr+= len;
4500 }
4501 break;
4502
4503 case OP_NOT_WHITESPACE:
4504 for (i = min; i < max; i++)
4505 {
4506 int len = 1;
4507 if (eptr >= md->end_subject) break;
4508 GETCHARLEN(c, eptr, len);
4509 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4510 eptr+= len;
4511 }
4512 break;
4513
4514 case OP_WHITESPACE:
4515 for (i = min; i < max; i++)
4516 {
4517 int len = 1;
4518 if (eptr >= md->end_subject) break;
4519 GETCHARLEN(c, eptr, len);
4520 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4521 eptr+= len;
4522 }
4523 break;
4524
4525 case OP_NOT_WORDCHAR:
4526 for (i = min; i < max; i++)
4527 {
4528 int len = 1;
4529 if (eptr >= md->end_subject) break;
4530 GETCHARLEN(c, eptr, len);
4531 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4532 eptr+= len;
4533 }
4534 break;
4535
4536 case OP_WORDCHAR:
4537 for (i = min; i < max; i++)
4538 {
4539 int len = 1;
4540 if (eptr >= md->end_subject) break;
4541 GETCHARLEN(c, eptr, len);
4542 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4543 eptr+= len;
4544 }
4545 break;
4546
4547 default:
4548 RRETURN(PCRE_ERROR_INTERNAL);
4549 }
4550
4551 /* eptr is now past the end of the maximum run */
4552
4553 if (possessive) continue;
4554 for(;;)
4555 {
4556 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4557 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4558 if (eptr-- == pp) break; /* Stop if tried at original pos */
4559 BACKCHAR(eptr);
4560 }
4561 }
4562 else
4563 #endif /* SUPPORT_UTF8 */
4564
4565 /* Not UTF-8 mode */
4566 {
4567 switch(ctype)
4568 {
4569 case OP_ANY:
4570 for (i = min; i < max; i++)
4571 {
4572 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4573 eptr++;
4574 }
4575 break;
4576
4577 case OP_ALLANY:
4578 case OP_ANYBYTE:
4579 c = max - min;
4580 if (c > (unsigned int)(md->end_subject - eptr))
4581 c = md->end_subject - eptr;
4582 eptr += c;
4583 break;
4584
4585 case OP_ANYNL:
4586 for (i = min; i < max; i++)
4587 {
4588 if (eptr >= md->end_subject) break;
4589 c = *eptr;
4590 if (c == 0x000d)
4591 {
4592 if (++eptr >= md->end_subject) break;
4593 if (*eptr == 0x000a) eptr++;
4594 }
4595 else
4596 {
4597 if (c != 0x000a &&
4598 (md->bsr_anycrlf ||
4599 (c != 0x000b && c != 0x000c && c != 0x0085)))
4600 break;
4601 eptr++;
4602 }
4603 }
4604 break;
4605
4606 case OP_NOT_HSPACE:
4607 for (i = min; i < max; i++)
4608 {
4609 if (eptr >= md->end_subject) break;
4610 c = *eptr;
4611 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4612 eptr++;
4613 }
4614 break;
4615
4616 case OP_HSPACE:
4617 for (i = min; i < max; i++)
4618 {
4619 if (eptr >= md->end_subject) break;
4620 c = *eptr;
4621 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4622 eptr++;
4623 }
4624 break;
4625
4626 case OP_NOT_VSPACE:
4627 for (i = min; i < max; i++)
4628 {
4629 if (eptr >= md->end_subject) break;
4630 c = *eptr;
4631 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4632 break;
4633 eptr++;
4634 }
4635 break;
4636
4637 case OP_VSPACE:
4638 for (i = min; i < max; i++)
4639 {
4640 if (eptr >= md->end_subject) break;
4641 c = *eptr;
4642 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4643 break;
4644 eptr++;
4645 }
4646 break;
4647
4648 case OP_NOT_DIGIT:
4649 for (i = min; i < max; i++)
4650 {
4651 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4652 break;
4653 eptr++;
4654 }
4655 break;
4656
4657 case OP_DIGIT:
4658 for (i = min; i < max; i++)
4659 {
4660 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4661 break;
4662 eptr++;
4663 }
4664 break;
4665
4666 case OP_NOT_WHITESPACE:
4667 for (i = min; i < max; i++)
4668 {
4669 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4670 break;
4671 eptr++;
4672 }
4673 break;
4674
4675 case OP_WHITESPACE:
4676 for (i = min; i < max; i++)
4677 {
4678 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4679 break;
4680 eptr++;
4681 }
4682 break;
4683
4684 case OP_NOT_WORDCHAR:
4685 for (i = min; i < max; i++)
4686 {
4687 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4688 break;
4689 eptr++;
4690 }
4691 break;
4692
4693 case OP_WORDCHAR:
4694 for (i = min; i < max; i++)
4695 {
4696 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4697 break;
4698 eptr++;
4699 }
4700 break;
4701
4702 default:
4703 RRETURN(PCRE_ERROR_INTERNAL);
4704 }
4705
4706 /* eptr is now past the end of the maximum run */
4707
4708 if (possessive) continue;
4709 while (eptr >= pp)
4710 {
4711 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4712 eptr--;
4713 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4714 }
4715 }
4716
4717 /* Get here if we can't make it match with any permitted repetitions */
4718
4719 RRETURN(MATCH_NOMATCH);
4720 }
4721 /* Control never gets here */
4722
4723 /* There's been some horrible disaster. Arrival here can only mean there is
4724 something seriously wrong in the code above or the OP_xxx definitions. */
4725
4726 default:
4727 DPRINTF(("Unknown opcode %d\n", *ecode));
4728 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4729 }
4730
4731 /* Do not stick any code in here without much thought; it is assumed
4732 that "continue" in the code above comes out to here to repeat the main
4733 loop. */
4734
4735 } /* End of main loop */
4736 /* Control never reaches here */
4737
4738
4739 /* When compiling to use the heap rather than the stack for recursive calls to
4740 match(), the RRETURN() macro jumps here. The number that is saved in
4741 frame->Xwhere indicates which label we actually want to return to. */
4742
4743 #ifdef NO_RECURSE
4744 #define LBL(val) case val: goto L_RM##val;
4745 HEAP_RETURN:
4746 switch (frame->Xwhere)
4747 {
4748 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4749 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4750 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4751 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4752 LBL(53) LBL(54)
4753 #ifdef SUPPORT_UTF8
4754 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4755 LBL(32) LBL(34) LBL(42) LBL(46)
4756 #ifdef SUPPORT_UCP
4757 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4758 #endif /* SUPPORT_UCP */
4759 #endif /* SUPPORT_UTF8 */
4760 default:
4761 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4762 return PCRE_ERROR_INTERNAL;
4763 }
4764 #undef LBL
4765 #endif /* NO_RECURSE */
4766 }
4767
4768
4769 /***************************************************************************
4770 ****************************************************************************
4771 RECURSION IN THE match() FUNCTION
4772
4773 Undefine all the macros that were defined above to handle this. */
4774
4775 #ifdef NO_RECURSE
4776 #undef eptr
4777 #undef ecode
4778 #undef mstart
4779 #undef offset_top
4780 #undef ims
4781 #undef eptrb
4782 #undef flags
4783
4784 #undef callpat
4785 #undef charptr
4786 #undef data
4787 #undef next
4788 #undef pp
4789 #undef prev
4790 #undef saved_eptr
4791
4792 #undef new_recursive
4793
4794 #undef cur_is_word
4795 #undef condition
4796 #undef prev_is_word
4797
4798 #undef original_ims
4799
4800 #undef ctype
4801 #undef length
4802 #undef max
4803 #undef min
4804 #undef number
4805 #undef offset
4806 #undef op
4807 #undef save_capture_last
4808 #undef save_offset1
4809 #undef save_offset2
4810 #undef save_offset3
4811 #undef stacksave
4812
4813 #undef newptrb
4814
4815 #endif
4816
4817 /* These two are defined as macros in both cases */
4818
4819 #undef fc
4820 #undef fi
4821
4822 /***************************************************************************
4823 ***************************************************************************/
4824
4825
4826
4827 /*************************************************
4828 * Execute a Regular Expression *
4829 *************************************************/
4830
4831 /* This function applies a compiled re to a subject string and picks out
4832 portions of the string if it matches. Two elements in the vector are set for
4833 each substring: the offsets to the start and end of the substring.
4834
4835 Arguments:
4836 argument_re points to the compiled expression
4837 extra_data points to extra data or is NULL
4838 subject points to the subject string
4839 length length of subject string (may contain binary zeros)
4840 start_offset where to start in the subject string
4841 options option bits
4842 offsets points to a vector of ints to be filled in with offsets
4843 offsetcount the number of elements in the vector
4844
4845 Returns: > 0 => success; value is the number of elements filled in
4846 = 0 => success, but offsets is not big enough
4847 -1 => failed to match
4848 < -1 => some kind of unexpected problem
4849 */
4850
4851 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4852 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4853 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4854 int offsetcount)
4855 {
4856 int rc, resetcount, ocount;
4857 int first_byte = -1;
4858 int req_byte = -1;
4859 int req_byte2 = -1;
4860 int newline;
4861 unsigned long int ims;
4862 BOOL using_temporary_offsets = FALSE;
4863 BOOL anchored;
4864 BOOL startline;
4865 BOOL firstline;
4866 BOOL first_byte_caseless = FALSE;
4867 BOOL req_byte_caseless = FALSE;
4868 BOOL utf8;
4869 match_data match_block;
4870 match_data *md = &match_block;
4871 const uschar *tables;
4872 const uschar *start_bits = NULL;
4873 USPTR start_match = (USPTR)subject + start_offset;
4874 USPTR end_subject;
4875 USPTR start_partial = NULL;
4876 USPTR req_byte_ptr = start_match - 1;
4877
4878 pcre_study_data internal_study;
4879 const pcre_study_data *study;
4880
4881 real_pcre internal_re;
4882 const real_pcre *external_re = (const real_pcre *)argument_re;
4883 const real_pcre *re = external_re;
4884
4885 /* Plausibility checks */
4886
4887 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4888 if (re == NULL || subject == NULL ||
4889 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4890 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4891
4892 /* Fish out the optional data from the extra_data structure, first setting
4893 the default values. */
4894
4895 study = NULL;
4896 md->match_limit = MATCH_LIMIT;
4897 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4898 md->callout_data = NULL;
4899
4900 /* The table pointer is always in native byte order. */
4901
4902 tables = external_re->tables;
4903
4904 if (extra_data != NULL)
4905 {
4906 register unsigned int flags = extra_data->flags;
4907 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4908 study = (const pcre_study_data *)extra_data->study_data;
4909 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4910 md->match_limit = extra_data->match_limit;
4911 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4912 md->match_limit_recursion = extra_data->match_limit_recursion;
4913 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4914 md->callout_data = extra_data->callout_data;
4915 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4916 }
4917
4918 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4919 is a feature that makes it possible to save compiled regex and re-use them
4920 in other programs later. */
4921
4922 if (tables == NULL) tables = _pcre_default_tables;
4923
4924 /* Check that the first field in the block is the magic number. If it is not,
4925 test for a regex that was compiled on a host of opposite endianness. If this is
4926 the case, flipped values are put in internal_re and internal_study if there was
4927 study data too. */
4928
4929 if (re->magic_number != MAGIC_NUMBER)
4930 {
4931 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4932 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4933 if (study != NULL) study = &internal_study;
4934 }
4935
4936 /* Set up other data */
4937
4938 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4939 startline = (re->flags & PCRE_STARTLINE) != 0;
4940 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4941
4942 /* The code starts after the real_pcre block and the capture name table. */
4943
4944 md->start_code = (const uschar *)external_re + re->name_table_offset +
4945 re->name_count * re->name_entry_size;
4946
4947 md->start_subject = (USPTR)subject;
4948 md->start_offset = start_offset;
4949 md->end_subject = md->start_subject + length;
4950 end_subject = md->end_subject;
4951
4952 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4953 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4954 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4955
4956 md->notbol = (options & PCRE_NOTBOL) != 0;
4957 md->noteol = (options & PCRE_NOTEOL) != 0;
4958 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4959 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
4960 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
4961 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
4962 md->hitend = FALSE;
4963
4964 md->recursive = NULL; /* No recursion at top level */
4965
4966 md->lcc = tables + lcc_offset;
4967 md->ctypes = tables + ctypes_offset;
4968
4969 /* Handle different \R options. */
4970
4971 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4972 {
4973 case 0:
4974 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4975 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4976 else
4977 #ifdef BSR_ANYCRLF
4978 md->bsr_anycrlf = TRUE;
4979 #else
4980 md->bsr_anycrlf = FALSE;
4981 #endif
4982 break;
4983
4984 case PCRE_BSR_ANYCRLF:
4985 md->bsr_anycrlf = TRUE;
4986 break;
4987
4988 case PCRE_BSR_UNICODE:
4989 md->bsr_anycrlf = FALSE;
4990 break;
4991
4992 default: return PCRE_ERROR_BADNEWLINE;
4993 }
4994
4995 /* Handle different types of newline. The three bits give eight cases. If
4996 nothing is set at run time, whatever was used at compile time applies. */
4997
4998 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4999 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5000 {
5001 case 0: newline = NEWLINE; break; /* Compile-time default */
5002 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5003 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5004 case PCRE_NEWLINE_CR+
5005 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5006 case PCRE_NEWLINE_ANY: newline = -1; break;
5007 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5008 default: return PCRE_ERROR_BADNEWLINE;
5009 }
5010
5011 if (newline == -2)
5012 {
5013 md->nltype = NLTYPE_ANYCRLF;
5014 }
5015 else if (newline < 0)
5016 {
5017 md->nltype = NLTYPE_ANY;
5018 }
5019 else
5020 {
5021 md->nltype = NLTYPE_FIXED;
5022 if (newline > 255)
5023 {
5024 md->nllen = 2;
5025 md->nl[0] = (newline >> 8) & 255;
5026 md->nl[1] = newline & 255;
5027 }
5028 else
5029 {
5030 md->nllen = 1;
5031 md->nl[0] = newline;
5032 }
5033 }
5034
5035 /* Partial matching was originally supported only for a restricted set of
5036 regexes; from release 8.00 there are no restrictions, but the bits are still
5037 defined (though never set). So there's no harm in leaving this code. */
5038
5039 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5040 return PCRE_ERROR_BADPARTIAL;
5041
5042 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5043 back the character offset. */
5044
5045 #ifdef SUPPORT_UTF8
5046 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5047 {
5048 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5049 return PCRE_ERROR_BADUTF8;
5050 if (start_offset > 0 && start_offset < length)
5051 {
5052 int tb = ((USPTR)subject)[start_offset];
5053 if (tb > 127)
5054 {
5055 tb &= 0xc0;
5056 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5057 }
5058 }
5059 }
5060 #endif
5061
5062 /* The ims options can vary during the matching as a result of the presence
5063 of (?ims) items in the pattern. They are kept in a local variable so that
5064 restoring at the exit of a group is easy. */
5065
5066 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5067
5068 /* If the expression has got more back references than the offsets supplied can
5069 hold, we get a temporary chunk of working store to use during the matching.
5070 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5071 of 3. */
5072
5073 ocount = offsetcount - (offsetcount % 3);
5074
5075 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5076 {
5077 ocount = re->top_backref * 3 + 3;
5078 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5079 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5080 using_temporary_offsets = TRUE;
5081 DPRINTF(("Got memory to hold back references\n"));
5082 }
5083 else md->offset_vector = offsets;
5084
5085 md->offset_end = ocount;
5086 md->offset_max = (2*ocount)/3;
5087 md->offset_overflow = FALSE;
5088 md->capture_last = -1;
5089
5090 /* Compute the minimum number of offsets that we need to reset each time. Doing
5091 this makes a huge difference to execution time when there aren't many brackets
5092 in the pattern. */
5093
5094 resetcount = 2 + re->top_bracket * 2;
5095 if (resetcount > offsetcount) resetcount = ocount;
5096
5097 /* Reset the working variable associated with each extraction. These should
5098 never be used unless previously set, but they get saved and restored, and so we
5099 initialize them to avoid reading uninitialized locations. */
5100
5101 if (md->offset_vector != NULL)
5102 {
5103 register int *iptr = md->offset_vector + ocount;
5104 register int *iend = iptr - resetcount/2 + 1;
5105 while (--iptr >= iend) *iptr = -1;
5106 }
5107
5108 /* Set up the first character to match, if available. The first_byte value is
5109 never set for an anchored regular expression, but the anchoring may be forced
5110 at run time, so we have to test for anchoring. The first char may be unset for
5111 an unanchored pattern, of course. If there's no first char and the pattern was
5112 studied, there may be a bitmap of possible first characters. */
5113
5114 if (!anchored)
5115 {
5116 if ((re->flags & PCRE_FIRSTSET) != 0)
5117 {
5118 first_byte = re->first_byte & 255;
5119 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5120 first_byte = md->lcc[first_byte];
5121 }
5122 else
5123 if (!startline && study != NULL &&
5124 (study->flags & PCRE_STUDY_MAPPED) != 0)
5125 start_bits = study->start_bits;
5126 }
5127
5128 /* For anchored or unanchored matches, there may be a "last known required
5129 character" set. */
5130
5131 if ((re->flags & PCRE_REQCHSET) != 0)
5132 {
5133 req_byte = re->req_byte & 255;
5134 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5135 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5136 }
5137
5138
5139 /* ==========================================================================*/
5140
5141 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5142 the loop runs just once. */
5143
5144 for(;;)
5145 {
5146 USPTR save_end_subject = end_subject;
5147 USPTR new_start_match;
5148
5149 /* Reset the maximum number of extractions we might see. */
5150
5151 if (md->offset_vector != NULL)
5152 {
5153 register int *iptr = md->offset_vector;
5154 register int *iend = iptr + resetcount;
5155 while (iptr < iend) *iptr++ = -1;
5156 }
5157
5158 /* If firstline is TRUE, the start of the match is constrained to the first
5159 line of a multiline string. That is, the match must be before or at the first
5160 newline. Implement this by temporarily adjusting end_subject so that we stop
5161 scanning at a newline. If the match fails at the newline, later code breaks
5162 this loop. */
5163
5164 if (firstline)
5165 {
5166 USPTR t = start_match;
5167 #ifdef SUPPORT_UTF8
5168 if (utf8)
5169 {
5170 while (t < md->end_subject && !IS_NEWLINE(t))
5171 {
5172 t++;
5173 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5174 }
5175 }
5176 else
5177 #endif
5178 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5179 end_subject = t;
5180 }
5181
5182 /* There are some optimizations that avoid running the match if a known
5183 starting point is not found, or if a known later character is not present.
5184 However, there is an option that disables these, for testing and for ensuring
5185 that all callouts do actually occur. */
5186
5187 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5188 {
5189 /* Advance to a unique first byte if there is one. */
5190
5191 if (first_byte >= 0)
5192 {
5193 if (first_byte_caseless)
5194 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5195 start_match++;
5196 else
5197 while (start_match < end_subject && *start_match != first_byte)
5198 start_match++;
5199 }
5200
5201 /* Or to just after a linebreak for a multiline match */
5202
5203 else if (startline)
5204 {
5205 if (start_match > md->start_subject + start_offset)
5206 {
5207 #ifdef SUPPORT_UTF8
5208 if (utf8)
5209 {
5210 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5211 {
5212 start_match++;
5213 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5214 start_match++;
5215 }
5216 }
5217 else
5218 #endif
5219 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5220 start_match++;
5221
5222 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5223 and we are now at a LF, advance the match position by one more character.
5224 */
5225
5226 if (start_match[-1] == CHAR_CR &&
5227 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5228 start_match < end_subject &&
5229 *start_match == CHAR_NL)
5230 start_match++;
5231 }
5232 }
5233
5234 /* Or to a non-unique first byte after study */
5235
5236 else if (start_bits != NULL)
5237 {
5238 while (start_match < end_subject)
5239 {
5240 register unsigned int c = *start_match;
5241 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5242 else break;
5243 }
5244 }
5245 } /* Starting optimizations */
5246
5247 /* Restore fudged end_subject */
5248
5249 end_subject = save_end_subject;
5250
5251 /* The following two optimizations are disabled for partial matching or if
5252 disabling is explicitly requested. */
5253
5254 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
5255 {
5256 /* If the pattern was studied, a minimum subject length may be set. This is
5257 a lower bound; no actual string of that length may actually match the
5258 pattern. Although the value is, strictly, in characters, we treat it as
5259 bytes to avoid spending too much time in this optimization. */
5260
5261 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
5262 end_subject - start_match < study->minlength)
5263 {
5264 rc = MATCH_NOMATCH;
5265 break;
5266 }
5267
5268 /* If req_byte is set, we know that that character must appear in the
5269 subject for the match to succeed. If the first character is set, req_byte
5270 must be later in the subject; otherwise the test starts at the match point.
5271 This optimization can save a huge amount of backtracking in patterns with
5272 nested unlimited repeats that aren't going to match. Writing separate code
5273 for cased/caseless versions makes it go faster, as does using an
5274 autoincrement and backing off on a match.
5275
5276 HOWEVER: when the subject string is very, very long, searching to its end
5277 can take a long time, and give bad performance on quite ordinary patterns.
5278 This showed up when somebody was matching something like /^\d+C/ on a
5279 32-megabyte string... so we don't do this when the string is sufficiently
5280 long. */
5281
5282 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
5283 {
5284 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
5285
5286 /* We don't need to repeat the search if we haven't yet reached the
5287 place we found it at last time. */
5288
5289 if (p > req_byte_ptr)
5290 {
5291 if (req_byte_caseless)
5292 {
5293 while (p < end_subject)
5294 {
5295 register int pp = *p++;
5296 if (pp == req_byte || pp == req_byte2) { p--; break; }
5297 }
5298 }
5299 else
5300 {
5301 while (p < end_subject)
5302 {
5303 if (*p++ == req_byte) { p--; break; }
5304 }
5305 }
5306
5307 /* If we can't find the required character, break the matching loop,
5308 forcing a match failure. */
5309
5310 if (p >= end_subject)
5311 {
5312 rc = MATCH_NOMATCH;
5313 break;
5314 }
5315
5316 /* If we have found the required character, save the point where we
5317 found it, so that we don't search again next time round the loop if
5318 the start hasn't passed this character yet. */
5319
5320 req_byte_ptr = p;
5321 }
5322 }
5323 }
5324
5325 #ifdef DEBUG /* Sigh. Some compilers never learn. */
5326 printf(">>>> Match against: ");
5327 pchars(start_match, end_subject - start_match, TRUE, md);
5328 printf("\n");
5329 #endif
5330
5331 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
5332 first starting point for which a partial match was found. */
5333
5334 md->start_match_ptr = start_match;
5335 md->start_used_ptr = start_match;
5336 md->match_call_count = 0;
5337 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
5338 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
5339
5340 switch(rc)
5341 {
5342 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
5343 exactly like PRUNE. */
5344
5345 case MATCH_NOMATCH:
5346 case MATCH_PRUNE:
5347 case MATCH_THEN:
5348 new_start_match = start_match + 1;
5349 #ifdef SUPPORT_UTF8
5350 if (utf8)
5351 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
5352 new_start_match++;
5353 #endif
5354 break;
5355
5356 /* SKIP passes back the next starting point explicitly. */
5357
5358 case MATCH_SKIP:
5359 new_start_match = md->start_match_ptr;
5360 break;
5361
5362 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
5363
5364 case MATCH_COMMIT:
5365 rc = MATCH_NOMATCH;
5366 goto ENDLOOP;
5367
5368 /* Any other return is either a match, or some kind of error. */
5369
5370 default:
5371 goto ENDLOOP;
5372 }
5373
5374 /* Control reaches here for the various types of "no match at this point"
5375 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
5376
5377 rc = MATCH_NOMATCH;
5378
5379 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
5380 newline in the subject (though it may continue over the newline). Therefore,
5381 if we have just failed to match, starting at a newline, do not continue. */
5382
5383 if (firstline && IS_NEWLINE(start_match)) break;
5384
5385 /* Advance to new matching position */
5386
5387 start_match = new_start_match;
5388
5389 /* Break the loop if the pattern is anchored or if we have passed the end of
5390 the subject. */
5391
5392 if (anchored || start_match > end_subject) break;
5393
5394 /* If we have just passed a CR and we are now at a LF, and the pattern does
5395 not contain any explicit matches for \r or \n, and the newline option is CRLF
5396 or ANY or ANYCRLF, advance the match position by one more character. */
5397
5398 if (start_match[-1] == CHAR_CR &&
5399 start_match < end_subject &&
5400 *start_match == CHAR_NL &&
5401 (re->flags & PCRE_HASCRORLF) == 0 &&
5402 (md->nltype == NLTYPE_ANY ||
5403 md->nltype == NLTYPE_ANYCRLF ||
5404 md->nllen == 2))
5405 start_match++;
5406
5407 } /* End of for(;;) "bumpalong" loop */
5408
5409 /* ==========================================================================*/
5410
5411 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
5412 conditions is true:
5413
5414 (1) The pattern is anchored or the match was failed by (*COMMIT);
5415
5416 (2) We are past the end of the subject;
5417
5418 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
5419 this option requests that a match occur at or before the first newline in
5420 the subject.
5421
5422 When we have a match and the offset vector is big enough to deal with any
5423 backreferences, captured substring offsets will already be set up. In the case
5424 where we had to get some local store to hold offsets for backreference
5425 processing, copy those that we can. In this case there need not be overflow if
5426 certain parts of the pattern were not used, even though there are more
5427 capturing parentheses than vector slots. */
5428
5429 ENDLOOP:
5430
5431 if (rc == MATCH_MATCH)
5432 {
5433 if (using_temporary_offsets)
5434 {
5435 if (offsetcount >= 4)
5436 {
5437 memcpy(offsets + 2, md->offset_vector + 2,
5438 (offsetcount - 2) * sizeof(int));
5439 DPRINTF(("Copied offsets from temporary memory\n"));
5440 }
5441 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5442 DPRINTF(("Freeing temporary memory\n"));
5443 (pcre_free)(md->offset_vector);
5444 }
5445
5446 /* Set the return code to the number of captured strings, or 0 if there are
5447 too many to fit into the vector. */
5448
5449 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5450
5451 /* If there is space, set up the whole thing as substring 0. The value of
5452 md->start_match_ptr might be modified if \K was encountered on the success
5453 matching path. */
5454
5455 if (offsetcount < 2) rc = 0; else
5456 {
5457 offsets[0] = md->start_match_ptr - md->start_subject;
5458 offsets[1] = md->end_match_ptr - md->start_subject;
5459 }
5460
5461 DPRINTF((">>>> returning %d\n", rc));
5462 return rc;
5463 }
5464
5465 /* Control gets here if there has been an error, or if the overall match
5466 attempt has failed at all permitted starting positions. */
5467
5468 if (using_temporary_offsets)
5469 {
5470 DPRINTF(("Freeing temporary memory\n"));
5471 (pcre_free)(md->offset_vector);
5472 }
5473
5474 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
5475 {
5476 DPRINTF((">>>> error: returning %d\n", rc));
5477 return rc;
5478 }
5479 else if (start_partial != NULL)
5480 {
5481 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5482 if (offsetcount > 1)
5483 {
5484 offsets[0] = start_partial - (USPTR)subject;
5485 offsets[1] = end_subject - (USPTR)subject;
5486 }
5487 return PCRE_ERROR_PARTIAL;
5488 }
5489 else
5490 {
5491 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5492 return PCRE_ERROR_NOMATCH;
5493 }
5494 }
5495
5496 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12