/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 172 - (hide annotations) (download)
Tue Jun 5 10:40:13 2007 UTC (7 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 130351 byte(s)
Drastically reduce workspace used for alternatives in groups; also some 
trailing space removals for a test release.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
/* NOTE(review): these three names are defined ahead of the #include of
pcre_internal.h; presumably macros in that header expand in terms of them, so
their position before the #include matters - confirm against the header. */

45 nigel 93 #define NLBLOCK md /* Block containing newline information */
46     #define PSSTART start_subject /* Field containing processed string start */
47     #define PSEND end_subject /* Field containing processed string end */
48    
49 nigel 77 #include "pcre_internal.h"
50    
51 ph10 137 /* Undefine some potentially clashing cpp symbols. match() uses min and max as
names of its own: ordinary local variables in the recursive build, or macros
that refer to fields of the current heap frame when NO_RECURSE is defined. */
52    
53     #undef min
54     #undef max
55    
56 nigel 93 /* The chain of eptrblocks for tail recursions uses memory in stack workspace,
57     obtained at top level, the size of which is defined by EPTR_WORK_SIZE. When
match() needs a block for a tail-recursed group and md->eptrn has already
reached this value, it gives up with PCRE_ERROR_NULLWSLIMIT. */
58 nigel 77
59 nigel 93 #define EPTR_WORK_SIZE (1000)
60 nigel 77
61     /* Flag bits for the match() function. Each value is a distinct bit; they are
combined with | in the flags argument of match(). */
62    
63 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
64     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
65     #define match_tail_recursed 0x04 /* Tail recursive call */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73     /* Maximum number of ints of offset to save on the stack for recursive calls.
74     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
75     because the offset vector is always a multiple of 3 long. It is also the
dimension of the stacksave array that match() declares. */
76    
77     #define REC_STACK_SAVE_MAX 30
78    
79     /* Min and max values for the common repeats; for the maxima, 0 => infinity.
The two tables have six entries each and are laid out in parallel, so the same
index selects the minimum and the maximum for one kind of repeat.
NOTE(review): judging by the values, the six entries presumably correspond to
the greedy and minimizing forms of *, + and ?, indexed from the repeat opcode -
confirm at the use sites, which are outside this excerpt. */
80    
81     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
82     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
83    
84    
85    
86     #ifdef DEBUG
87     /*************************************************
88     * Debugging function to print chars *
89     *************************************************/
90    
91     /* Print a sequence of chars in printable format, stopping at the end of the
92     subject if requested. Output is written with printf(). A character for which
isprint() is false is shown as a backslash, an 'x', and two hex digits. The
function is compiled only when DEBUG is defined.
93    
94     Arguments:
95     p points to characters
96     length number to print
97     is_subject TRUE if printing from within md->start_subject
98     md pointer to matching data block, if is_subject is TRUE
              (it is not dereferenced when is_subject is FALSE)
99    
100     Returns: nothing
101     */
102    
103     static void
104     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
105     {
106 nigel 93 unsigned int c;
/* When printing from the subject, never read beyond md->end_subject: cut the
count down to the number of characters that remain. */
107 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
/* Emit one character per iteration, escaping anything that is not printable. */
108     while (length-- > 0)
109     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
110     }
111     #endif
112    
113    
114    
115     /*************************************************
116     * Match a back-reference *
117     *************************************************/
118    
119     /* If a back reference hasn't been set, the length that is passed is greater
120     than the number of characters left in the string, so the match fails.
The text previously captured is located at md->start_subject plus the value
held in md->offset_vector[offset]; it is compared, one code unit at a time,
with the subject text at eptr. Of the ims flags only PCRE_CASELESS is examined
here; when it is set, both sides are mapped through the md->lcc table before
being compared.
121    
122     Arguments:
123     offset index into the offset vector
124     eptr points into the subject
125     length length to be matched
126     md points to match data block
127     ims the ims flags
128    
129     Returns: TRUE if matched
130     */
131    
132     static BOOL
133 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
134 nigel 77 unsigned long int ims)
135     {
/* Start of the previously captured text. It is only dereferenced after the
length check below has passed. */
136 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
137 nigel 77
138     #ifdef DEBUG
139     if (eptr >= md->end_subject)
140     printf("matching subject <null>");
141     else
142     {
143     printf("matching subject ");
144     pchars(eptr, length, TRUE, md);
145     }
146     printf(" against backref ");
147     pchars(p, length, FALSE, md);
148     printf("\n");
149     #endif
150    
151     /* Always fail if not enough characters left */
152    
153     if (length > md->end_subject - eptr) return FALSE;
154    
155     /* Separate the caseless case for speed */
156    
157     if ((ims & PCRE_CASELESS) != 0)
158     {
159     while (length-- > 0)
160     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
161     }
162     else
163     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
164    
/* Every one of the length characters compared equal (trivially so when length
is not positive). */
165     return TRUE;
166     }
167    
168    
169    
170     /***************************************************************************
171     ****************************************************************************
172     RECURSION IN THE match() FUNCTION
173    
174 nigel 87 The match() function is highly recursive, though not every recursive call
175     increases the recursive depth. Nevertheless, some regular expressions can cause
176     it to recurse to a great depth. I was writing for Unix, so I just let it call
177     itself recursively. This uses the stack for saving everything that has to be
178     saved for a recursive call. On Unix, the stack can be large, and this works
179     fine.
180 nigel 77
181 nigel 87 It turns out that on some non-Unix-like systems there are problems with
182     programs that use a lot of stack. (This despite the fact that every last chip
183     has oodles of memory these days, and techniques for extending the stack have
184     been known for decades.) So....
185 nigel 77
186     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
187     calls by keeping local variables that need to be preserved in blocks of memory
188 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
189 nigel 77 achieve this so that the actual code doesn't look very different to what it
190     always used to.
191 ph10 164
192 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
193 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
194     Switzer, the use of longjmp() has been abolished, at the cost of having to
195     provide a unique number for each call to RMATCH. There is no way of generating
196     a sequence of numbers at compile time in C. I have given them names, to make
197     them stand out more clearly.
198    
199     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
200     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
201 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
202     don't have indeterminate values; this has meant that the frame size can be
203 ph10 164 reduced because the result can be "passed back" by straight setting of the
204     variable instead of being passed in the frame.
205 nigel 77 ****************************************************************************
206     ***************************************************************************/
207    
208    
209 ph10 164 /* Numbers for RMATCH calls. Each use of RMATCH passes one of these names as
its final (rw) argument. The heap-frame version of RMATCH stores the value in
the Xwhere field of the calling frame and also pastes the name into a label
(L_RMn) placed just after the jump; the stack version ignores it. The numbering
starts at 1. */
210    
211     enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
212     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
213     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
214     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
215     RM41, RM42, RM43, RM44, RM45, RM46, RM47 };
216    
217 ph10 165
218 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
219 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
220 ph10 164 actually used in this definition.

RMATCH expands to a genuine recursive call of match(): the caller's mstart is
passed through unchanged, the depth argument is rdepth+1, and the result is
left in the caller's rrc variable. RRETURN expands to a plain return. The
debugging versions additionally print the source line number before and after
the call, and the value being returned; note that the debugging RRETURN
mentions its argument twice, so it is evaluated twice. */
221 nigel 77
222     #ifndef NO_RECURSE
223     #define REGISTER register
224 ph10 164
225 nigel 87 #ifdef DEBUG
226 ph10 164 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
227 nigel 87 { \
228     printf("match() called in line %d\n", __LINE__); \
229 ph10 168 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
230 nigel 87 printf("to line %d\n", __LINE__); \
231     }
232     #define RRETURN(ra) \
233     { \
234     printf("match() returned %d from line %d ", ra, __LINE__); \
235     return ra; \
236     }
237     #else
238 ph10 164 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
239 ph10 168 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
240 nigel 77 #define RRETURN(ra) return ra
241 nigel 87 #endif
242    
243 nigel 77 #else
244    
245    
246 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
247     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
248     argument of match(), which never changes.

REGISTER, which qualifies the eptr and ecode parameters in the definition of
match(), expands to nothing in this configuration. */
249 nigel 77
250     #define REGISTER
251    
/* Heap-frame version of RMATCH: "call" match() without using the C stack.

A new heapframe is obtained from pcre_stack_malloc(), loaded with the argument
values for the nested invocation (ra = eptr, rb = ecode, rc = offset_top,
re = ims, rf = eptrb, rg = flags; mstart is copied from the current frame and
the depth is one more than the current frame's), chained to the current frame
through Xprevframe, and made current. Control then jumps to HEAP_RECURSE.

Before jumping, the number given as rw (one of the RMn values) is stored in the
calling frame's Xwhere field, and the label L_<rw> is planted immediately after
the jump so that execution can resume there once the nested invocation has
finished; RRETURN leaves the result in rrc. The rd argument is not used.

If pcre_stack_malloc() cannot supply a frame, nothing is written through the
NULL pointer: the current invocation is abandoned with PCRE_ERROR_NOMEMORY by
means of RRETURN, which releases the current frame and hands the error code to
the invocation that called it. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
271    
/* Heap-frame version of RRETURN: finish the current invocation of match().

The current frame is unlinked and released with pcre_stack_free(), and the
frame it was chained to becomes current again. If there is such a frame, the
result is placed in rrc and control goes to HEAP_RETURN so that the suspended
invocation can resume; if there is none (Xprevframe was NULL, which marks the
top level) the value is returned from the real C function.

Note that ra is evaluated only after the finished frame has been freed and
"frame" has been re-pointed at its predecessor, so the argument must not
depend on fields of the frame that is being discarded. */

272     #define RRETURN(ra)\
273     {\
274     heapframe *newframe = frame;\
275     frame = newframe->Xprevframe;\
276     (pcre_stack_free)(newframe);\
277     if (frame != NULL)\
278     {\
279 ph10 164 rrc = ra;\
280     goto HEAP_RETURN;\
281 nigel 77 }\
282     return ra;\
283     }
284    
285    
286     /* Structure for remembering the local variables in a private frame. One of
these is allocated for each invocation of match() when NO_RECURSE is defined.
Inside match() the argument and local-variable names are macros that refer to
the X-prefixed fields of the current frame. */
287    
288     typedef struct heapframe {
/* Link to the frame of the invocation that "called" this one; NULL marks the
top-level frame. */
289     struct heapframe *Xprevframe;
290    
291     /* Function arguments that may change */
292    
293     const uschar *Xeptr;
294     const uschar *Xecode;
295 ph10 172 const uschar *Xmstart;
296 nigel 77 int Xoffset_top;
/* NOTE(review): this field is a signed long int, whereas the ims parameter of
match() and the Xoriginal_ims field below are unsigned long int - confirm that
the difference in signedness is intended. */
297     long int Xims;
298     eptrblock *Xeptrb;
299     int Xflags;
300 nigel 91 unsigned int Xrdepth;
301 nigel 77
302     /* Function local variables */
303    
304     const uschar *Xcallpat;
/* Xcharptr is always present, although the charptr macro that refers to it is
only defined when SUPPORT_UTF8 is set. */
305     const uschar *Xcharptr;
306     const uschar *Xdata;
307     const uschar *Xnext;
308     const uschar *Xpp;
309     const uschar *Xprev;
310     const uschar *Xsaved_eptr;
311    
312     recursion_info Xnew_recursive;
313    
314     BOOL Xcur_is_word;
315     BOOL Xcondition;
316     BOOL Xprev_is_word;
317    
318     unsigned long int Xoriginal_ims;
319    
320     #ifdef SUPPORT_UCP
321     int Xprop_type;
322 nigel 87 int Xprop_value;
323 nigel 77 int Xprop_fail_result;
324     int Xprop_category;
325     int Xprop_chartype;
326 nigel 87 int Xprop_script;
327 ph10 123 int Xoclength;
328     uschar Xocchars[8];
329 nigel 77 #endif
330    
331     int Xctype;
/* Xfc and Xfi back the names fc and fi; in the recursive build those names are
simply aliases for the ordinary variables c and i. */
332 nigel 93 unsigned int Xfc;
333 nigel 77 int Xfi;
334     int Xlength;
335     int Xmax;
336     int Xmin;
337     int Xnumber;
338     int Xoffset;
339     int Xop;
340     int Xsave_capture_last;
341     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
342     int Xstacksave[REC_STACK_SAVE_MAX];
343    
344     eptrblock Xnewptrb;
345    
346 ph10 164 /* Where to jump back to: the RMn number that RMATCH stored in this frame
before starting a nested invocation. */
347 nigel 77
348 ph10 164 int Xwhere;
349 ph10 165
350 nigel 77 } heapframe;
351    
352     #endif
353    
354    
355     /***************************************************************************
356     ***************************************************************************/
357    
358    
359    
360     /*************************************************
361     * Match from current position *
362     *************************************************/
363    
364 nigel 93 /* This function is called recursively in many circumstances. Whenever it
365 nigel 77 returns a negative (error) response, the outer incarnation must also return the
366     same response.
367    
368     Performance note: It might be tempting to extract commonly used fields from the
369     md structure (e.g. utf8, end_subject) into individual variables to improve
370     performance. Tests using gcc on a SPARC disproved this; in the first case, it
371     made performance worse.
372    
373     Arguments:
374 nigel 93 eptr pointer to current character in subject
375     ecode pointer to current position in compiled code
376 ph10 168 mstart pointer to the current match start position (can be modified
377 ph10 172 by encountering \K)
378 nigel 77 offset_top current top pointer
379     md pointer to "static" info for the match
380     ims current /i, /m, and /s options
381     eptrb pointer to chain of blocks containing eptr at start of
382     brackets - for testing for empty matches
383     flags can contain
384     match_condassert - this is an assertion condition
385 nigel 93 match_cbegroup - this is the start of an unlimited repeat
386     group that can match an empty string
387     match_tail_recursed - this is a tail_recursed group
388 nigel 87 rdepth the recursion depth
389 nigel 77
390     Returns: MATCH_MATCH if matched ) these values are >= 0
391     MATCH_NOMATCH if failed to match )
392     a negative PCRE_ERROR_xxx value if aborted by an error condition
393 nigel 87 (e.g. stopped by repeated call or recursion limit)
394 nigel 77 */
395    
396     static int
397 ph10 172 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
398 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
399 nigel 91 int flags, unsigned int rdepth)
400 nigel 77 {
401     /* These variables do not need to be preserved over recursion in this function,
402 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
403     "register" because they are used a lot in loops. */
404 nigel 77
405 nigel 91 register int rrc; /* Returns from recursive calls */
406     register int i; /* Used for loops not involving calls to RMATCH() */
407 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
408 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
409 nigel 77
410 nigel 93 BOOL minimize, possessive; /* Quantifier options */
411    
412 nigel 77 /* When recursion is not being used, all "local" variables that have to be
413     preserved over calls to RMATCH() are part of a "frame" which is obtained from
414     heap storage. Set up the top-level frame here; others are obtained from the
415     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
416    
417     #ifdef NO_RECURSE
418     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
419     frame->Xprevframe = NULL; /* Marks the top level */
420    
421     /* Copy in the original argument variables */
422    
423     frame->Xeptr = eptr;
424     frame->Xecode = ecode;
425 ph10 168 frame->Xmstart = mstart;
426 nigel 77 frame->Xoffset_top = offset_top;
427     frame->Xims = ims;
428     frame->Xeptrb = eptrb;
429     frame->Xflags = flags;
430 nigel 87 frame->Xrdepth = rdepth;
431 nigel 77
432     /* This is where control jumps back to to effect "recursion" */
433    
434     HEAP_RECURSE:
435    
436     /* Macros make the argument variables come from the current frame */
437    
438     #define eptr frame->Xeptr
439     #define ecode frame->Xecode
440 ph10 168 #define mstart frame->Xmstart
441 nigel 77 #define offset_top frame->Xoffset_top
442     #define ims frame->Xims
443     #define eptrb frame->Xeptrb
444     #define flags frame->Xflags
445 nigel 87 #define rdepth frame->Xrdepth
446 nigel 77
447     /* Ditto for the local variables */
448    
449     #ifdef SUPPORT_UTF8
450     #define charptr frame->Xcharptr
451     #endif
452     #define callpat frame->Xcallpat
453     #define data frame->Xdata
454     #define next frame->Xnext
455     #define pp frame->Xpp
456     #define prev frame->Xprev
457     #define saved_eptr frame->Xsaved_eptr
458    
459     #define new_recursive frame->Xnew_recursive
460    
461     #define cur_is_word frame->Xcur_is_word
462     #define condition frame->Xcondition
463     #define prev_is_word frame->Xprev_is_word
464    
465     #define original_ims frame->Xoriginal_ims
466    
467     #ifdef SUPPORT_UCP
468     #define prop_type frame->Xprop_type
469 nigel 87 #define prop_value frame->Xprop_value
470 nigel 77 #define prop_fail_result frame->Xprop_fail_result
471     #define prop_category frame->Xprop_category
472     #define prop_chartype frame->Xprop_chartype
473 nigel 87 #define prop_script frame->Xprop_script
474 ph10 115 #define oclength frame->Xoclength
475     #define occhars frame->Xocchars
476 nigel 77 #endif
477    
478     #define ctype frame->Xctype
479     #define fc frame->Xfc
480     #define fi frame->Xfi
481     #define length frame->Xlength
482     #define max frame->Xmax
483     #define min frame->Xmin
484     #define number frame->Xnumber
485     #define offset frame->Xoffset
486     #define op frame->Xop
487     #define save_capture_last frame->Xsave_capture_last
488     #define save_offset1 frame->Xsave_offset1
489     #define save_offset2 frame->Xsave_offset2
490     #define save_offset3 frame->Xsave_offset3
491     #define stacksave frame->Xstacksave
492    
493     #define newptrb frame->Xnewptrb
494    
495     /* When recursion is being used, local variables are allocated on the stack and
496     get preserved during recursion in the normal way. In this environment, fi and
497     i, and fc and c, can be the same variables. */
498    
499 nigel 93 #else /* NO_RECURSE not defined */
500 nigel 77 #define fi i
501     #define fc c
502    
503    
504 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
505     const uschar *charptr; /* in small blocks of the code. My normal */
506     #endif /* style of coding would have declared */
507     const uschar *callpat; /* them within each of those blocks. */
508     const uschar *data; /* However, in order to accommodate the */
509     const uschar *next; /* version of this code that uses an */
510     USPTR pp; /* external "stack" implemented on the */
511     const uschar *prev; /* heap, it is easier to declare them all */
512     USPTR saved_eptr; /* here, so the declarations can be cut */
513     /* out in a block. The only declarations */
514     recursion_info new_recursive; /* within blocks below are for variables */
515     /* that do not have to be preserved over */
516     BOOL cur_is_word; /* a recursive call to RMATCH(). */
517     BOOL condition;
518 nigel 77 BOOL prev_is_word;
519    
520     unsigned long int original_ims;
521    
522     #ifdef SUPPORT_UCP
523     int prop_type;
524 nigel 87 int prop_value;
525 nigel 77 int prop_fail_result;
526     int prop_category;
527     int prop_chartype;
528 nigel 87 int prop_script;
529 ph10 115 int oclength;
530     uschar occhars[8];
531 nigel 77 #endif
532    
533     int ctype;
534     int length;
535     int max;
536     int min;
537     int number;
538     int offset;
539     int op;
540     int save_capture_last;
541     int save_offset1, save_offset2, save_offset3;
542     int stacksave[REC_STACK_SAVE_MAX];
543    
544     eptrblock newptrb;
545 nigel 93 #endif /* NO_RECURSE */
546 nigel 77
547     /* These statements are here to stop the compiler complaining about
548     uninitialized variables. */
549    
550     #ifdef SUPPORT_UCP
551 nigel 87 prop_value = 0;
552 nigel 77 prop_fail_result = 0;
553     #endif
554    
555 nigel 93
556 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
557     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
558     used. Thanks to Ian Taylor for noticing this possibility and sending the
559     original patch. */
560    
561     TAIL_RECURSE:
562    
563 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
564     are specified by the macro RMATCH and RRETURN is used to return. When
565     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
566     and a "return", respectively (possibly with some debugging if DEBUG is
567     defined). However, RMATCH isn't like a function call because it's quite a
568     complicated macro. It has to be used in one particular way. This shouldn't,
569     however, impact performance when true recursion is being used. */
570 nigel 77
571 ph10 164 #ifdef SUPPORT_UTF8
572     utf8 = md->utf8; /* Local copy of the flag */
573     #else
574     utf8 = FALSE;
575     #endif
576    
577 nigel 87 /* First check that we haven't called match() too many times, or that we
578     haven't exceeded the recursive call limit. */
579    
580 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
581 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
582 nigel 77
583     original_ims = ims; /* Save for resetting on ')' */
584 nigel 91
585 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
586     string, the match_cbegroup flag is set. When this is the case, add the current
587     subject pointer to the chain of such remembered pointers, to be checked when we
588     hit the closing ket, in order to break infinite loops that match no characters.
589     When match() is called in other circumstances, don't add to the chain. If this
590     is a tail recursion, use a block from the workspace, as the one on the stack is
591     already used. */
592 nigel 77
593 nigel 93 if ((flags & match_cbegroup) != 0)
594 nigel 77 {
595 nigel 93 eptrblock *p;
596     if ((flags & match_tail_recursed) != 0)
597     {
598     if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
599     p = md->eptrchain + md->eptrn++;
600     }
601     else p = &newptrb;
602     p->epb_saved_eptr = eptr;
603     p->epb_prev = eptrb;
604     eptrb = p;
605 nigel 77 }
606    
607 nigel 93 /* Now start processing the opcodes. */
608 nigel 77
609     for (;;)
610     {
611 nigel 93 minimize = possessive = FALSE;
612 nigel 77 op = *ecode;
613    
614     /* For partial matching, remember if we ever hit the end of the subject after
615     matching at least one subject character. */
616    
617     if (md->partial &&
618     eptr >= md->end_subject &&
619 ph10 168 eptr > mstart)
620 nigel 77 md->hitend = TRUE;
621    
622 nigel 93 switch(op)
623     {
624     /* Handle a capturing bracket. If there is space in the offset vector, save
625     the current subject position in the working slot at the top of the vector.
626     We mustn't change the current values of the data slot, because they may be
627     set from a previous iteration of this group, and be referred to by a
628     reference inside the group.
629 nigel 77
630 nigel 93 If the bracket fails to match, we need to restore this value and also the
631     values of the final offsets, in case they were set by a previous iteration
632     of the same bracket.
633 nigel 77
634 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
635     a non-capturing bracket. Don't worry about setting the flag for the error
636     case here; that is handled in the code for KET. */
637 nigel 77
638 nigel 93 case OP_CBRA:
639     case OP_SCBRA:
640     number = GET2(ecode, 1+LINK_SIZE);
641 nigel 77 offset = number << 1;
642    
643     #ifdef DEBUG
644 nigel 93 printf("start bracket %d\n", number);
645     printf("subject=");
646 nigel 77 pchars(eptr, 16, TRUE, md);
647     printf("\n");
648     #endif
649    
650     if (offset < md->offset_max)
651     {
652     save_offset1 = md->offset_vector[offset];
653     save_offset2 = md->offset_vector[offset+1];
654     save_offset3 = md->offset_vector[md->offset_end - number];
655     save_capture_last = md->capture_last;
656    
657     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
658     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
659    
660 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
661 nigel 77 do
662     {
663 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
664     ims, eptrb, flags, RM1);
665 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
666     md->capture_last = save_capture_last;
667     ecode += GET(ecode, 1);
668     }
669     while (*ecode == OP_ALT);
670    
671     DPRINTF(("bracket %d failed\n", number));
672    
673     md->offset_vector[offset] = save_offset1;
674     md->offset_vector[offset+1] = save_offset2;
675     md->offset_vector[md->offset_end - number] = save_offset3;
676    
677     RRETURN(MATCH_NOMATCH);
678     }
679    
680 nigel 93 /* Insufficient room for saving captured contents. Treat as a non-capturing
681     bracket. */
682 nigel 77
683 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
684 nigel 77
685 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
686     final alternative within the brackets, we would return the result of a
687     recursive call to match() whatever happened. We can reduce stack usage by
688     turning this into a tail recursion. */
689 nigel 77
690 nigel 93 case OP_BRA:
691     case OP_SBRA:
692     DPRINTF(("start non-capturing bracket\n"));
693     flags = (op >= OP_SBRA)? match_cbegroup : 0;
694 nigel 91 for (;;)
695 nigel 77 {
696 nigel 91 if (ecode[GET(ecode, 1)] != OP_ALT)
697 nigel 93 {
698     ecode += _pcre_OP_lengths[*ecode];
699     flags |= match_tail_recursed;
700     DPRINTF(("bracket 0 tail recursion\n"));
701     goto TAIL_RECURSE;
702     }
703 nigel 91
704     /* For non-final alternatives, continue the loop for a NOMATCH result;
705     otherwise return. */
706    
707 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
708     eptrb, flags, RM2);
709 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
710     ecode += GET(ecode, 1);
711     }
712 nigel 91 /* Control never reaches here. */
713 nigel 77
714     /* Conditional group: compilation checked that there are no more than
715     two branches. If the condition is false, skipping the first branch takes us
716     past the end if there is only one branch, but that's OK because that is
717 nigel 91 exactly what going to the ket would do. As there is only one branch to be
718     obeyed, we can use tail recursion to avoid using another stack frame. */
719 nigel 77
720     case OP_COND:
721 nigel 93 case OP_SCOND:
722     if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
723 nigel 77 {
724 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
725     condition = md->recursive != NULL &&
726     (offset == RREF_ANY || offset == md->recursive->group_num);
727     ecode += condition? 3 : GET(ecode, 1);
728     }
729    
730     else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
731     {
732 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
733 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
734     ecode += condition? 3 : GET(ecode, 1);
735 nigel 77 }
736    
737 nigel 93 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
738     {
739     condition = FALSE;
740     ecode += GET(ecode, 1);
741     }
742    
743 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
744 nigel 93 the final argument match_condassert causes it to stop at the end of an
745     assertion. */
746 nigel 77
747     else
748     {
749 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
750     match_condassert, RM3);
751 nigel 77 if (rrc == MATCH_MATCH)
752     {
753 nigel 93 condition = TRUE;
754     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
755 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
756     }
757     else if (rrc != MATCH_NOMATCH)
758     {
759     RRETURN(rrc); /* Need braces because of following else */
760     }
761 nigel 93 else
762     {
763     condition = FALSE;
764     ecode += GET(ecode, 1);
765     }
766     }
767 nigel 91
768 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
769     we can use tail recursion to avoid using another stack frame. If the second
770     alternative doesn't exist, we can just plough on. */
771 nigel 91
772 nigel 93 if (condition || *ecode == OP_ALT)
773     {
774 nigel 91 ecode += 1 + LINK_SIZE;
775 nigel 93 flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
776 nigel 91 goto TAIL_RECURSE;
777 nigel 77 }
778 nigel 93 else
779     {
780     ecode += 1 + LINK_SIZE;
781     }
782     break;
783 nigel 77
784    
785 nigel 93 /* End of the pattern. If we are in a top-level recursion, we should
786     restore the offsets appropriately and continue from after the call. */
787 nigel 77
788     case OP_END:
789     if (md->recursive != NULL && md->recursive->group_num == 0)
790     {
791     recursion_info *rec = md->recursive;
792 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
793 nigel 77 md->recursive = rec->prevrec;
794     memmove(md->offset_vector, rec->offset_save,
795     rec->saved_max * sizeof(int));
796 ph10 168 mstart = rec->save_start;
797 nigel 77 ims = original_ims;
798     ecode = rec->after_call;
799     break;
800     }
801    
802     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
803     string - backtracking will then try other alternatives, if any. */
804    
805 ph10 168 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
806     md->end_match_ptr = eptr; /* Record where we ended */
807     md->end_offset_top = offset_top; /* and how many extracts were taken */
808     md->start_match_ptr = mstart; /* and the start (\K can modify) */
809 nigel 77 RRETURN(MATCH_MATCH);
810    
811     /* Change option settings */
812    
813     case OP_OPT:
814     ims = ecode[1];
815     ecode += 2;
816     DPRINTF(("ims set to %02lx\n", ims));
817     break;
818    
819     /* Assertion brackets. Check the alternative branches in turn - the
820     matching won't pass the KET for an assertion. If any one branch matches,
821     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
822     start of each branch to move the current point backwards, so the code at
823     this level is identical to the lookahead case. */
824    
825     case OP_ASSERT:
826     case OP_ASSERTBACK:
827     do
828     {
829 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
830     RM4);
831 nigel 77 if (rrc == MATCH_MATCH) break;
832     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
833     ecode += GET(ecode, 1);
834     }
835     while (*ecode == OP_ALT);
836     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
837    
838     /* If checking an assertion for a condition, return MATCH_MATCH. */
839    
840     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
841    
842     /* Continue from after the assertion, updating the offsets high water
843     mark, since extracts may have been taken during the assertion. */
844    
845     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
846     ecode += 1 + LINK_SIZE;
847     offset_top = md->end_offset_top;
848     continue;
849    
850     /* Negative assertion: all branches must fail to match */
851    
852     case OP_ASSERT_NOT:
853     case OP_ASSERTBACK_NOT:
854     do
855     {
856 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
857     RM5);
858 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
859     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
860     ecode += GET(ecode,1);
861     }
862     while (*ecode == OP_ALT);
863    
864     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
865    
866     ecode += 1 + LINK_SIZE;
867     continue;
868    
869     /* Move the subject pointer back. This occurs only at the start of
870     each branch of a lookbehind assertion. If we are too close to the start to
871     move back, this match function fails. When working with UTF-8 we move
872     back a number of characters, not bytes. */
873    
874     case OP_REVERSE:
875     #ifdef SUPPORT_UTF8
876     if (utf8)
877     {
878 nigel 93 i = GET(ecode, 1);
879     while (i-- > 0)
880 nigel 77 {
881     eptr--;
882     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
883     BACKCHAR(eptr)
884     }
885     }
886     else
887     #endif
888    
889     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
890    
891     {
892 nigel 93 eptr -= GET(ecode, 1);
893 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
894     }
895    
896     /* Skip to next op code */
897    
898     ecode += 1 + LINK_SIZE;
899     break;
900    
901     /* The callout item calls an external function, if one is provided, passing
902     details of the match so far. This is mainly for debugging, though the
903     function is able to force a failure. */
904    
905     case OP_CALLOUT:
906     if (pcre_callout != NULL)
907     {
908     pcre_callout_block cb;
909     cb.version = 1; /* Version 1 of the callout block */
910     cb.callout_number = ecode[1];
911     cb.offset_vector = md->offset_vector;
912 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
913 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
914 ph10 168 cb.start_match = mstart - md->start_subject;
915 nigel 77 cb.current_position = eptr - md->start_subject;
916     cb.pattern_position = GET(ecode, 2);
917     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
918     cb.capture_top = offset_top/2;
919     cb.capture_last = md->capture_last;
920     cb.callout_data = md->callout_data;
921     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
922     if (rrc < 0) RRETURN(rrc);
923     }
924     ecode += 2 + 2*LINK_SIZE;
925     break;
926    
927     /* Recursion either matches the current regex, or some subexpression. The
928     offset data is the offset to the starting bracket from the start of the
929     whole pattern. (This is so that it works from duplicated subpatterns.)
930    
931     If there are any capturing brackets started but not finished, we have to
932     save their starting points and reinstate them after the recursion. However,
933     we don't know how many such there are (offset_top records the completed
934     total) so we just have to save all the potential data. There may be up to
935     65535 such values, which is too large to put on the stack, but using malloc
936     for small numbers seems expensive. As a compromise, the stack is used when
937     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
938     is used. A problem is what to do if the malloc fails ... there is no way of
939     returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
940     values on the stack, and accept that the rest may be wrong.
941    
942     There are also other values that have to be saved. We use a chained
943     sequence of blocks that actually live on the stack. Thanks to Robin Houston
944     for the original version of this logic. */
945    
946     case OP_RECURSE:
947     {
948     callpat = md->start_code + GET(ecode, 1);
949 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
950     GET2(callpat, 1 + LINK_SIZE);
951 nigel 77
952     /* Add to "recursing stack" */
953    
954     new_recursive.prevrec = md->recursive;
955     md->recursive = &new_recursive;
956    
957     /* Find where to continue from afterwards */
958    
959     ecode += 1 + LINK_SIZE;
960     new_recursive.after_call = ecode;
961    
962     /* Now save the offset data. */
963    
964     new_recursive.saved_max = md->offset_end;
965     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
966     new_recursive.offset_save = stacksave;
967     else
968     {
969     new_recursive.offset_save =
970     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
971     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
972     }
973    
974     memcpy(new_recursive.offset_save, md->offset_vector,
975     new_recursive.saved_max * sizeof(int));
976 ph10 168 new_recursive.save_start = mstart;
977     mstart = eptr;
978 nigel 77
979     /* OK, now we can do the recursion. For each top-level alternative we
980     restore the offset and recursion data. */
981    
982     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
983 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
984 nigel 77 do
985     {
986 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
987     md, ims, eptrb, flags, RM6);
988 nigel 77 if (rrc == MATCH_MATCH)
989     {
990 nigel 87 DPRINTF(("Recursion matched\n"));
991 nigel 77 md->recursive = new_recursive.prevrec;
992     if (new_recursive.offset_save != stacksave)
993     (pcre_free)(new_recursive.offset_save);
994     RRETURN(MATCH_MATCH);
995     }
996 nigel 87 else if (rrc != MATCH_NOMATCH)
997     {
998     DPRINTF(("Recursion gave error %d\n", rrc));
999     RRETURN(rrc);
1000     }
1001 nigel 77
1002     md->recursive = &new_recursive;
1003     memcpy(md->offset_vector, new_recursive.offset_save,
1004     new_recursive.saved_max * sizeof(int));
1005     callpat += GET(callpat, 1);
1006     }
1007     while (*callpat == OP_ALT);
1008    
1009     DPRINTF(("Recursion didn't match\n"));
1010     md->recursive = new_recursive.prevrec;
1011     if (new_recursive.offset_save != stacksave)
1012     (pcre_free)(new_recursive.offset_save);
1013     RRETURN(MATCH_NOMATCH);
1014     }
1015     /* Control never reaches here */
1016    
1017     /* "Once" brackets are like assertion brackets except that after a match,
1018     the point in the subject string is not moved back. Thus there can never be
1019     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1020     Check the alternative branches in turn - the matching won't pass the KET
1021     for this kind of subpattern. If any one branch matches, we carry on as at
1022     the end of a normal bracket, leaving the subject pointer. */
1023    
1024     case OP_ONCE:
1025 nigel 91 prev = ecode;
1026     saved_eptr = eptr;
1027    
1028     do
1029 nigel 77 {
1030 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
1031     eptrb, 0, RM7);
1032 nigel 91 if (rrc == MATCH_MATCH) break;
1033     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1034     ecode += GET(ecode,1);
1035     }
1036     while (*ecode == OP_ALT);
1037 nigel 77
1038 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1039 nigel 77
1040 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1041 nigel 77
1042 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1043     mark, since extracts may have been taken. */
1044 nigel 77
1045 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1046 nigel 77
1047 nigel 91 offset_top = md->end_offset_top;
1048     eptr = md->end_match_ptr;
1049 nigel 77
1050 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1051     happens for a repeating ket if no characters were matched in the group.
1052     This is the forcible breaking of infinite loops as implemented in Perl
1053     5.005. If there is an options reset, it will get obeyed in the normal
1054     course of events. */
1055 nigel 77
1056 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1057     {
1058     ecode += 1+LINK_SIZE;
1059     break;
1060     }
1061 nigel 77
1062 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1063     preceding bracket, in the appropriate order. The second "call" of match()
1064     uses tail recursion, to avoid using another stack frame. We need to reset
1065     any options that changed within the bracket before re-running it, so
1066     check the next opcode. */
1067 nigel 77
1068 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1069     {
1070     ims = (ims & ~PCRE_IMS) | ecode[4];
1071     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1072     }
1073 nigel 77
1074 nigel 91 if (*ecode == OP_KETRMIN)
1075     {
1076 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0,
1077     RM8);
1078 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1079     ecode = prev;
1080 nigel 93 flags = match_tail_recursed;
1081 nigel 91 goto TAIL_RECURSE;
1082 nigel 77 }
1083 nigel 91 else /* OP_KETRMAX */
1084     {
1085 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1086 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1087     ecode += 1 + LINK_SIZE;
1088 nigel 93 flags = match_tail_recursed;
1089 nigel 91 goto TAIL_RECURSE;
1090     }
1091     /* Control never gets here */
1092 nigel 77
1093     /* An alternation is the end of a branch; scan along to find the end of the
1094     bracketed group and go to there. */
1095    
1096     case OP_ALT:
1097     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1098     break;
1099    
1100     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1101     that it may occur zero times. It may repeat infinitely, or not at all -
1102     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1103     repeat limits are compiled as a number of copies, with the optional ones
1104     preceded by BRAZERO or BRAMINZERO. */
1105    
1106     case OP_BRAZERO:
1107     {
1108     next = ecode+1;
1109 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1110 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1111     do next += GET(next,1); while (*next == OP_ALT);
1112 nigel 93 ecode = next + 1 + LINK_SIZE;
1113 nigel 77 }
1114     break;
1115    
1116     case OP_BRAMINZERO:
1117     {
1118     next = ecode+1;
1119 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1120 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1121 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1122     ecode++;
1123     }
1124     break;
1125    
1126 nigel 93 /* End of a group, repeated or non-repeating. */
1127 nigel 77
1128     case OP_KET:
1129     case OP_KETRMIN:
1130     case OP_KETRMAX:
1131 nigel 91 prev = ecode - GET(ecode, 1);
1132 nigel 77
1133 nigel 93 /* If this was a group that remembered the subject start, in order to break
1134     infinite repeats of empty string matches, retrieve the subject start from
1135     the chain. Otherwise, set it to NULL. */
1136 nigel 77
1137 nigel 93 if (*prev >= OP_SBRA)
1138     {
1139     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1140     eptrb = eptrb->epb_prev; /* Backup to previous group */
1141     }
1142     else saved_eptr = NULL;
1143 nigel 77
1144 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1145     MATCH_MATCH, but record the current high water mark for use by positive
1146     assertions. Do this also for the "once" (atomic) groups. */
1147    
1148 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1149     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1150     *prev == OP_ONCE)
1151     {
1152     md->end_match_ptr = eptr; /* For ONCE */
1153     md->end_offset_top = offset_top;
1154     RRETURN(MATCH_MATCH);
1155     }
1156 nigel 77
1157 nigel 93 /* For capturing groups we have to check the group number back at the start
1158     and if necessary complete handling an extraction by setting the offsets and
1159     bumping the high water mark. Note that whole-pattern recursion is coded as
1160     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1161     when the OP_END is reached. Other recursion is handled here. */
1162 nigel 77
1163 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1164 nigel 91 {
1165 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1166 nigel 91 offset = number << 1;
1167 nigel 77
1168     #ifdef DEBUG
1169 nigel 91 printf("end bracket %d", number);
1170     printf("\n");
1171 nigel 77 #endif
1172    
1173 nigel 93 md->capture_last = number;
1174     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1175 nigel 91 {
1176 nigel 93 md->offset_vector[offset] =
1177     md->offset_vector[md->offset_end - number];
1178     md->offset_vector[offset+1] = eptr - md->start_subject;
1179     if (offset_top <= offset) offset_top = offset + 2;
1180     }
1181 nigel 77
1182 nigel 93 /* Handle a recursively called group. Restore the offsets
1183     appropriately and continue from after the call. */
1184 nigel 77
1185 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1186     {
1187     recursion_info *rec = md->recursive;
1188     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1189     md->recursive = rec->prevrec;
1190 ph10 168 mstart = rec->save_start;
1191 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1192     rec->saved_max * sizeof(int));
1193     ecode = rec->after_call;
1194     ims = original_ims;
1195     break;
1196 nigel 77 }
1197 nigel 91 }
1198 nigel 77
1199 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1200     flags, in case they got changed during the group. */
1201 nigel 77
1202 nigel 91 ims = original_ims;
1203     DPRINTF(("ims reset to %02lx\n", ims));
1204 nigel 77
1205 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1206     happens for a repeating ket if no characters were matched in the group.
1207     This is the forcible breaking of infinite loops as implemented in Perl
1208     5.005. If there is an options reset, it will get obeyed in the normal
1209     course of events. */
1210 nigel 77
1211 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1212     {
1213     ecode += 1 + LINK_SIZE;
1214     break;
1215     }
1216 nigel 77
1217 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1218     preceding bracket, in the appropriate order. In the second case, we can use
1219     tail recursion to avoid using another stack frame. */
1220 nigel 77
1221 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1222    
1223 nigel 91 if (*ecode == OP_KETRMIN)
1224     {
1225 ph10 164 RMATCH(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0,
1226     RM12);
1227 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1228     ecode = prev;
1229 nigel 93 flags |= match_tail_recursed;
1230 nigel 91 goto TAIL_RECURSE;
1231 nigel 77 }
1232 nigel 91 else /* OP_KETRMAX */
1233     {
1234 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1235 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1236     ecode += 1 + LINK_SIZE;
1237 nigel 93 flags = match_tail_recursed;
1238 nigel 91 goto TAIL_RECURSE;
1239     }
1240     /* Control never gets here */
1241 nigel 77
1242     /* Start of subject unless notbol, or after internal newline if multiline */
1243    
1244     case OP_CIRC:
1245     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1246     if ((ims & PCRE_MULTILINE) != 0)
1247     {
1248 nigel 91 if (eptr != md->start_subject &&
1249 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1250 nigel 77 RRETURN(MATCH_NOMATCH);
1251     ecode++;
1252     break;
1253     }
1254     /* ... else fall through */
1255    
1256     /* Start of subject assertion */
1257    
1258     case OP_SOD:
1259     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1260     ecode++;
1261     break;
1262    
1263     /* Start of match assertion */
1264    
1265     case OP_SOM:
1266     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1267     ecode++;
1268     break;
1269 ph10 172
1270 ph10 168 /* Reset the start of match point */
1271 ph10 172
1272 ph10 168 case OP_SET_SOM:
1273     mstart = eptr;
1274 ph10 172 ecode++;
1275     break;
1276 nigel 77
1277     /* Assert before internal newline if multiline, or before a terminating
1278     newline unless endonly is set, else end of subject unless noteol is set. */
1279    
1280     case OP_DOLL:
1281     if ((ims & PCRE_MULTILINE) != 0)
1282     {
1283     if (eptr < md->end_subject)
1284 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1285 nigel 77 else
1286     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1287     ecode++;
1288     break;
1289     }
1290     else
1291     {
1292     if (md->noteol) RRETURN(MATCH_NOMATCH);
1293     if (!md->endonly)
1294     {
1295 nigel 91 if (eptr != md->end_subject &&
1296 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1297 nigel 77 RRETURN(MATCH_NOMATCH);
1298     ecode++;
1299     break;
1300     }
1301     }
1302 nigel 91 /* ... else fall through for endonly */
1303 nigel 77
1304     /* End of subject assertion (\z) */
1305    
1306     case OP_EOD:
1307     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1308     ecode++;
1309     break;
1310    
1311     /* End of subject or ending \n assertion (\Z) */
1312    
1313     case OP_EODN:
1314 nigel 91 if (eptr != md->end_subject &&
1315 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1316 nigel 91 RRETURN(MATCH_NOMATCH);
1317 nigel 77 ecode++;
1318     break;
1319    
1320     /* Word boundary assertions */
1321    
1322     case OP_NOT_WORD_BOUNDARY:
1323     case OP_WORD_BOUNDARY:
1324     {
1325    
1326     /* Find out if the previous and current characters are "word" characters.
1327     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1328     be "non-word" characters. */
1329    
1330     #ifdef SUPPORT_UTF8
1331     if (utf8)
1332     {
1333     if (eptr == md->start_subject) prev_is_word = FALSE; else
1334     {
1335     const uschar *lastptr = eptr - 1;
1336     while((*lastptr & 0xc0) == 0x80) lastptr--;
1337     GETCHAR(c, lastptr);
1338     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1339     }
1340     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1341     {
1342     GETCHAR(c, eptr);
1343     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1344     }
1345     }
1346     else
1347     #endif
1348    
1349     /* More streamlined when not in UTF-8 mode */
1350    
1351     {
1352     prev_is_word = (eptr != md->start_subject) &&
1353     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1354     cur_is_word = (eptr < md->end_subject) &&
1355     ((md->ctypes[*eptr] & ctype_word) != 0);
1356     }
1357    
1358     /* Now see if the situation is what we want */
1359    
1360     if ((*ecode++ == OP_WORD_BOUNDARY)?
1361     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1362     RRETURN(MATCH_NOMATCH);
1363     }
1364     break;
1365    
1366     /* Match a single character type; inline for speed */
1367    
1368     case OP_ANY:
1369 nigel 91 if ((ims & PCRE_DOTALL) == 0)
1370     {
1371 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1372 nigel 91 }
1373 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1374     if (utf8)
1375     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1376     ecode++;
1377     break;
1378    
1379     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1380     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1381    
1382     case OP_ANYBYTE:
1383     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1384     ecode++;
1385     break;
1386    
1387     case OP_NOT_DIGIT:
1388     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1389     GETCHARINCTEST(c, eptr);
1390     if (
1391     #ifdef SUPPORT_UTF8
1392     c < 256 &&
1393     #endif
1394     (md->ctypes[c] & ctype_digit) != 0
1395     )
1396     RRETURN(MATCH_NOMATCH);
1397     ecode++;
1398     break;
1399    
1400     case OP_DIGIT:
1401     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1402     GETCHARINCTEST(c, eptr);
1403     if (
1404     #ifdef SUPPORT_UTF8
1405     c >= 256 ||
1406     #endif
1407     (md->ctypes[c] & ctype_digit) == 0
1408     )
1409     RRETURN(MATCH_NOMATCH);
1410     ecode++;
1411     break;
1412    
1413     case OP_NOT_WHITESPACE:
1414     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1415     GETCHARINCTEST(c, eptr);
1416     if (
1417     #ifdef SUPPORT_UTF8
1418     c < 256 &&
1419     #endif
1420     (md->ctypes[c] & ctype_space) != 0
1421     )
1422     RRETURN(MATCH_NOMATCH);
1423     ecode++;
1424     break;
1425    
1426     case OP_WHITESPACE:
1427     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1428     GETCHARINCTEST(c, eptr);
1429     if (
1430     #ifdef SUPPORT_UTF8
1431     c >= 256 ||
1432     #endif
1433     (md->ctypes[c] & ctype_space) == 0
1434     )
1435     RRETURN(MATCH_NOMATCH);
1436     ecode++;
1437     break;
1438    
1439     case OP_NOT_WORDCHAR:
1440     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1441     GETCHARINCTEST(c, eptr);
1442     if (
1443     #ifdef SUPPORT_UTF8
1444     c < 256 &&
1445     #endif
1446     (md->ctypes[c] & ctype_word) != 0
1447     )
1448     RRETURN(MATCH_NOMATCH);
1449     ecode++;
1450     break;
1451    
1452     case OP_WORDCHAR:
1453     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1454     GETCHARINCTEST(c, eptr);
1455     if (
1456     #ifdef SUPPORT_UTF8
1457     c >= 256 ||
1458     #endif
1459     (md->ctypes[c] & ctype_word) == 0
1460     )
1461     RRETURN(MATCH_NOMATCH);
1462     ecode++;
1463     break;
1464    
1465 nigel 93 case OP_ANYNL:
1466     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1467     GETCHARINCTEST(c, eptr);
1468     switch(c)
1469     {
1470     default: RRETURN(MATCH_NOMATCH);
1471     case 0x000d:
1472     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1473     break;
1474     case 0x000a:
1475     case 0x000b:
1476     case 0x000c:
1477     case 0x0085:
1478     case 0x2028:
1479     case 0x2029:
1480     break;
1481     }
1482     ecode++;
1483     break;
1484    
1485 nigel 77 #ifdef SUPPORT_UCP
1486     /* Check the next character by Unicode property. We will get here only
1487     if the support is in the binary; otherwise a compile-time error occurs. */
1488    
1489     case OP_PROP:
1490     case OP_NOTPROP:
1491     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1492     GETCHARINCTEST(c, eptr);
1493     {
1494 nigel 87 int chartype, script;
1495     int category = _pcre_ucp_findprop(c, &chartype, &script);
1496 nigel 77
1497 nigel 87 switch(ecode[1])
1498     {
1499     case PT_ANY:
1500     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1501     break;
1502 nigel 77
1503 nigel 87 case PT_LAMP:
1504     if ((chartype == ucp_Lu ||
1505     chartype == ucp_Ll ||
1506     chartype == ucp_Lt) == (op == OP_NOTPROP))
1507 nigel 77 RRETURN(MATCH_NOMATCH);
1508 nigel 87 break;
1509    
1510     case PT_GC:
1511     if ((ecode[2] != category) == (op == OP_PROP))
1512 nigel 77 RRETURN(MATCH_NOMATCH);
1513 nigel 87 break;
1514    
1515     case PT_PC:
1516     if ((ecode[2] != chartype) == (op == OP_PROP))
1517     RRETURN(MATCH_NOMATCH);
1518     break;
1519    
1520     case PT_SC:
1521     if ((ecode[2] != script) == (op == OP_PROP))
1522     RRETURN(MATCH_NOMATCH);
1523     break;
1524    
1525     default:
1526     RRETURN(PCRE_ERROR_INTERNAL);
1527 nigel 77 }
1528 nigel 87
1529     ecode += 3;
1530 nigel 77 }
1531     break;
1532    
1533     /* Match an extended Unicode sequence. We will get here only if the support
1534     is in the binary; otherwise a compile-time error occurs. */
1535    
1536     case OP_EXTUNI:
1537     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1538     GETCHARINCTEST(c, eptr);
1539     {
1540 nigel 87 int chartype, script;
1541     int category = _pcre_ucp_findprop(c, &chartype, &script);
1542 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1543     while (eptr < md->end_subject)
1544     {
1545     int len = 1;
1546     if (!utf8) c = *eptr; else
1547     {
1548     GETCHARLEN(c, eptr, len);
1549     }
1550 nigel 87 category = _pcre_ucp_findprop(c, &chartype, &script);
1551 nigel 77 if (category != ucp_M) break;
1552     eptr += len;
1553     }
1554     }
1555     ecode++;
1556     break;
1557     #endif
1558    
1559    
1560     /* Match a back reference, possibly repeatedly. Look past the end of the
1561     item to see if there is repeat information following. The code is similar
1562     to that for character classes, but repeated for efficiency. Then obey
1563     similar code to character type repeats - written out again for speed.
1564     However, if the referenced string is the empty string, always treat
1565     it as matched, any number of times (otherwise there could be infinite
1566     loops). */
1567    
1568     case OP_REF:
1569     {
1570     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1571     ecode += 3; /* Advance past item */
1572    
1573     /* If the reference is unset, set the length to be longer than the amount
1574     of subject left; this ensures that every attempt at a match fails. We
1575     can't just fail here, because of the possibility of quantifiers with zero
1576     minima. */
1577    
1578     length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1579     md->end_subject - eptr + 1 :
1580     md->offset_vector[offset+1] - md->offset_vector[offset];
1581    
1582     /* Set up for repetition, or handle the non-repeated case */
1583    
1584     switch (*ecode)
1585     {
1586     case OP_CRSTAR:
1587     case OP_CRMINSTAR:
1588     case OP_CRPLUS:
1589     case OP_CRMINPLUS:
1590     case OP_CRQUERY:
1591     case OP_CRMINQUERY:
1592     c = *ecode++ - OP_CRSTAR;
1593     minimize = (c & 1) != 0;
1594     min = rep_min[c]; /* Pick up values from tables; */
1595     max = rep_max[c]; /* zero for max => infinity */
1596     if (max == 0) max = INT_MAX;
1597     break;
1598    
1599     case OP_CRRANGE:
1600     case OP_CRMINRANGE:
1601     minimize = (*ecode == OP_CRMINRANGE);
1602     min = GET2(ecode, 1);
1603     max = GET2(ecode, 3);
1604     if (max == 0) max = INT_MAX;
1605     ecode += 5;
1606     break;
1607    
1608     default: /* No repeat follows */
1609     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1610     eptr += length;
1611     continue; /* With the main loop */
1612     }
1613    
1614     /* If the length of the reference is zero, just continue with the
1615     main loop. */
1616    
1617     if (length == 0) continue;
1618    
1619     /* First, ensure the minimum number of matches are present. We get back
1620     the length of the reference string explicitly rather than passing the
1621     address of eptr, so that eptr can be a register variable. */
1622    
1623     for (i = 1; i <= min; i++)
1624     {
1625     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1626     eptr += length;
1627     }
1628    
1629     /* If min = max, continue at the same level without recursion.
1630     They are not both allowed to be zero. */
1631    
1632     if (min == max) continue;
1633    
1634     /* If minimizing, keep trying and advancing the pointer */
1635    
1636     if (minimize)
1637     {
1638     for (fi = min;; fi++)
1639     {
1640 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1641 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1642     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1643     RRETURN(MATCH_NOMATCH);
1644     eptr += length;
1645     }
1646     /* Control never gets here */
1647     }
1648    
1649     /* If maximizing, find the longest string and work backwards */
1650    
1651     else
1652     {
1653     pp = eptr;
1654     for (i = min; i < max; i++)
1655     {
1656     if (!match_ref(offset, eptr, length, md, ims)) break;
1657     eptr += length;
1658     }
1659     while (eptr >= pp)
1660     {
1661 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1662 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1663     eptr -= length;
1664     }
1665     RRETURN(MATCH_NOMATCH);
1666     }
1667     }
1668     /* Control never gets here */
1669    
1670    
1671    
1672     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1673     used when all the characters in the class have values in the range 0-255,
1674     and either the matching is caseful, or the characters are in the range
1675     0-127 when UTF-8 processing is enabled. The only difference between
1676     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1677     encountered.
1678    
1679     First, look past the end of the item to see if there is repeat information
1680     following. Then obey similar code to character type repeats - written out
1681     again for speed. */
1682    
1683     case OP_NCLASS:
1684     case OP_CLASS:
1685     {
1686     data = ecode + 1; /* Save for matching */
1687     ecode += 33; /* Advance past the item */
1688    
1689     switch (*ecode)
1690     {
1691     case OP_CRSTAR:
1692     case OP_CRMINSTAR:
1693     case OP_CRPLUS:
1694     case OP_CRMINPLUS:
1695     case OP_CRQUERY:
1696     case OP_CRMINQUERY:
1697     c = *ecode++ - OP_CRSTAR;
1698     minimize = (c & 1) != 0;
1699     min = rep_min[c]; /* Pick up values from tables; */
1700     max = rep_max[c]; /* zero for max => infinity */
1701     if (max == 0) max = INT_MAX;
1702     break;
1703    
1704     case OP_CRRANGE:
1705     case OP_CRMINRANGE:
1706     minimize = (*ecode == OP_CRMINRANGE);
1707     min = GET2(ecode, 1);
1708     max = GET2(ecode, 3);
1709     if (max == 0) max = INT_MAX;
1710     ecode += 5;
1711     break;
1712    
1713     default: /* No repeat follows */
1714     min = max = 1;
1715     break;
1716     }
1717    
1718     /* First, ensure the minimum number of matches are present. */
1719    
1720     #ifdef SUPPORT_UTF8
1721     /* UTF-8 mode */
1722     if (utf8)
1723     {
1724     for (i = 1; i <= min; i++)
1725     {
1726     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1727     GETCHARINC(c, eptr);
1728     if (c > 255)
1729     {
1730     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1731     }
1732     else
1733     {
1734     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1735     }
1736     }
1737     }
1738     else
1739     #endif
1740     /* Not UTF-8 mode */
1741     {
1742     for (i = 1; i <= min; i++)
1743     {
1744     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1745     c = *eptr++;
1746     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1747     }
1748     }
1749    
1750     /* If max == min we can continue with the main loop without the
1751     need to recurse. */
1752    
1753     if (min == max) continue;
1754    
1755     /* If minimizing, keep testing the rest of the expression and advancing
1756     the pointer while it matches the class. */
1757    
1758     if (minimize)
1759     {
1760     #ifdef SUPPORT_UTF8
1761     /* UTF-8 mode */
1762     if (utf8)
1763     {
1764     for (fi = min;; fi++)
1765     {
1766 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1767 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1768     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1769     GETCHARINC(c, eptr);
1770     if (c > 255)
1771     {
1772     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1773     }
1774     else
1775     {
1776     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1777     }
1778     }
1779     }
1780     else
1781     #endif
1782     /* Not UTF-8 mode */
1783     {
1784     for (fi = min;; fi++)
1785     {
1786 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1787 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1788     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1789     c = *eptr++;
1790     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1791     }
1792     }
1793     /* Control never gets here */
1794     }
1795    
1796     /* If maximizing, find the longest possible run, then work backwards. */
1797    
1798     else
1799     {
1800     pp = eptr;
1801    
1802     #ifdef SUPPORT_UTF8
1803     /* UTF-8 mode */
1804     if (utf8)
1805     {
1806     for (i = min; i < max; i++)
1807     {
1808     int len = 1;
1809     if (eptr >= md->end_subject) break;
1810     GETCHARLEN(c, eptr, len);
1811     if (c > 255)
1812     {
1813     if (op == OP_CLASS) break;
1814     }
1815     else
1816     {
1817     if ((data[c/8] & (1 << (c&7))) == 0) break;
1818     }
1819     eptr += len;
1820     }
1821     for (;;)
1822     {
1823 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1824 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1825     if (eptr-- == pp) break; /* Stop if tried at original pos */
1826     BACKCHAR(eptr);
1827     }
1828     }
1829     else
1830     #endif
1831     /* Not UTF-8 mode */
1832     {
1833     for (i = min; i < max; i++)
1834     {
1835     if (eptr >= md->end_subject) break;
1836     c = *eptr;
1837     if ((data[c/8] & (1 << (c&7))) == 0) break;
1838     eptr++;
1839     }
1840     while (eptr >= pp)
1841     {
1842 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
1843 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1844 nigel 77 eptr--;
1845     }
1846     }
1847    
1848     RRETURN(MATCH_NOMATCH);
1849     }
1850     }
1851     /* Control never gets here */
1852    
1853    
1854     /* Match an extended character class. This opcode is encountered only
1855     in UTF-8 mode, because that's the only time it is compiled. */
1856    
1857     #ifdef SUPPORT_UTF8
1858     case OP_XCLASS:
1859     {
1860     data = ecode + 1 + LINK_SIZE; /* Save for matching */
1861     ecode += GET(ecode, 1); /* Advance past the item */
1862    
1863     switch (*ecode)
1864     {
1865     case OP_CRSTAR:
1866     case OP_CRMINSTAR:
1867     case OP_CRPLUS:
1868     case OP_CRMINPLUS:
1869     case OP_CRQUERY:
1870     case OP_CRMINQUERY:
1871     c = *ecode++ - OP_CRSTAR;
1872     minimize = (c & 1) != 0;
1873     min = rep_min[c]; /* Pick up values from tables; */
1874     max = rep_max[c]; /* zero for max => infinity */
1875     if (max == 0) max = INT_MAX;
1876     break;
1877    
1878     case OP_CRRANGE:
1879     case OP_CRMINRANGE:
1880     minimize = (*ecode == OP_CRMINRANGE);
1881     min = GET2(ecode, 1);
1882     max = GET2(ecode, 3);
1883     if (max == 0) max = INT_MAX;
1884     ecode += 5;
1885     break;
1886    
1887     default: /* No repeat follows */
1888     min = max = 1;
1889     break;
1890     }
1891    
1892     /* First, ensure the minimum number of matches are present. */
1893    
1894     for (i = 1; i <= min; i++)
1895     {
1896     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1897     GETCHARINC(c, eptr);
1898     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1899     }
1900    
1901     /* If max == min we can continue with the main loop without the
1902     need to recurse. */
1903    
1904     if (min == max) continue;
1905    
1906     /* If minimizing, keep testing the rest of the expression and advancing
1907     the pointer while it matches the class. */
1908    
1909     if (minimize)
1910     {
1911     for (fi = min;; fi++)
1912     {
1913 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
1914 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1915     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1916     GETCHARINC(c, eptr);
1917     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1918     }
1919     /* Control never gets here */
1920     }
1921    
1922     /* If maximizing, find the longest possible run, then work backwards. */
1923    
1924     else
1925     {
1926     pp = eptr;
1927     for (i = min; i < max; i++)
1928     {
1929     int len = 1;
1930     if (eptr >= md->end_subject) break;
1931     GETCHARLEN(c, eptr, len);
1932     if (!_pcre_xclass(c, data)) break;
1933     eptr += len;
1934     }
1935     for(;;)
1936     {
1937 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
1938 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1939     if (eptr-- == pp) break; /* Stop if tried at original pos */
1940     BACKCHAR(eptr)
1941     }
1942     RRETURN(MATCH_NOMATCH);
1943     }
1944    
1945     /* Control never gets here */
1946     }
1947     #endif /* End of XCLASS */
1948    
1949     /* Match a single character, casefully */
1950    
1951     case OP_CHAR:
1952     #ifdef SUPPORT_UTF8
1953     if (utf8)
1954     {
1955     length = 1;
1956     ecode++;
1957     GETCHARLEN(fc, ecode, length);
1958     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1959     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1960     }
1961     else
1962     #endif
1963    
1964     /* Non-UTF-8 mode */
1965     {
1966     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1967     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1968     ecode += 2;
1969     }
1970     break;
1971    
1972     /* Match a single character, caselessly */
1973    
1974     case OP_CHARNC:
1975     #ifdef SUPPORT_UTF8
1976     if (utf8)
1977     {
1978     length = 1;
1979     ecode++;
1980     GETCHARLEN(fc, ecode, length);
1981    
1982     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1983    
1984     /* If the pattern character's value is < 128, we have only one byte, and
1985     can use the fast lookup table. */
1986    
1987     if (fc < 128)
1988     {
1989     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1990     }
1991    
1992     /* Otherwise we must pick up the subject character */
1993    
1994     else
1995     {
1996 nigel 93 unsigned int dc;
1997 nigel 77 GETCHARINC(dc, eptr);
1998     ecode += length;
1999    
2000     /* If we have Unicode property support, we can use it to test the other
2001 nigel 87 case of the character, if there is one. */
2002 nigel 77
2003     if (fc != dc)
2004     {
2005     #ifdef SUPPORT_UCP
2006 nigel 87 if (dc != _pcre_ucp_othercase(fc))
2007 nigel 77 #endif
2008     RRETURN(MATCH_NOMATCH);
2009     }
2010     }
2011     }
2012     else
2013     #endif /* SUPPORT_UTF8 */
2014    
2015     /* Non-UTF-8 mode */
2016     {
2017     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2018     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2019     ecode += 2;
2020     }
2021     break;
2022    
2023 nigel 93 /* Match a single character repeatedly. */
2024 nigel 77
2025     case OP_EXACT:
2026     min = max = GET2(ecode, 1);
2027     ecode += 3;
2028     goto REPEATCHAR;
2029    
2030 nigel 93 case OP_POSUPTO:
2031     possessive = TRUE;
2032     /* Fall through */
2033    
2034 nigel 77 case OP_UPTO:
2035     case OP_MINUPTO:
2036     min = 0;
2037     max = GET2(ecode, 1);
2038     minimize = *ecode == OP_MINUPTO;
2039     ecode += 3;
2040     goto REPEATCHAR;
2041    
2042 nigel 93 case OP_POSSTAR:
2043     possessive = TRUE;
2044     min = 0;
2045     max = INT_MAX;
2046     ecode++;
2047     goto REPEATCHAR;
2048    
2049     case OP_POSPLUS:
2050     possessive = TRUE;
2051     min = 1;
2052     max = INT_MAX;
2053     ecode++;
2054     goto REPEATCHAR;
2055    
2056     case OP_POSQUERY:
2057     possessive = TRUE;
2058     min = 0;
2059     max = 1;
2060     ecode++;
2061     goto REPEATCHAR;
2062    
2063 nigel 77 case OP_STAR:
2064     case OP_MINSTAR:
2065     case OP_PLUS:
2066     case OP_MINPLUS:
2067     case OP_QUERY:
2068     case OP_MINQUERY:
2069     c = *ecode++ - OP_STAR;
2070     minimize = (c & 1) != 0;
2071     min = rep_min[c]; /* Pick up values from tables; */
2072     max = rep_max[c]; /* zero for max => infinity */
2073     if (max == 0) max = INT_MAX;
2074    
2075     /* Common code for all repeated single-character matches. We can give
2076     up quickly if there are fewer than the minimum number of characters left in
2077     the subject. */
2078    
2079     REPEATCHAR:
2080     #ifdef SUPPORT_UTF8
2081     if (utf8)
2082     {
2083     length = 1;
2084     charptr = ecode;
2085     GETCHARLEN(fc, ecode, length);
2086     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2087     ecode += length;
2088    
2089     /* Handle multibyte character matching specially here. There is
2090     support for caseless matching if UCP support is present. */
2091    
2092     if (length > 1)
2093     {
2094     #ifdef SUPPORT_UCP
2095 nigel 93 unsigned int othercase;
2096 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2097 nigel 93 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2098 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2099 ph10 115 else oclength = 0;
2100 nigel 77 #endif /* SUPPORT_UCP */
2101    
2102     for (i = 1; i <= min; i++)
2103     {
2104     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2105 ph10 123 #ifdef SUPPORT_UCP
2106 nigel 77 /* Need braces because of following else */
2107     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2108     else
2109     {
2110     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2111     eptr += oclength;
2112     }
2113 ph10 115 #else /* without SUPPORT_UCP */
2114     else { RRETURN(MATCH_NOMATCH); }
2115 ph10 123 #endif /* SUPPORT_UCP */
2116 nigel 77 }
2117    
2118     if (min == max) continue;
2119    
2120     if (minimize)
2121     {
2122     for (fi = min;; fi++)
2123     {
2124 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2125 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2126     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2127     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2128 ph10 123 #ifdef SUPPORT_UCP
2129 nigel 77 /* Need braces because of following else */
2130     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2131     else
2132     {
2133     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2134     eptr += oclength;
2135     }
2136 ph10 115 #else /* without SUPPORT_UCP */
2137     else { RRETURN (MATCH_NOMATCH); }
2138     #endif /* SUPPORT_UCP */
2139 nigel 77 }
2140     /* Control never gets here */
2141     }
2142 nigel 93
2143     else /* Maximize */
2144 nigel 77 {
2145     pp = eptr;
2146     for (i = min; i < max; i++)
2147     {
2148     if (eptr > md->end_subject - length) break;
2149     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2150 ph10 123 #ifdef SUPPORT_UCP
2151 nigel 77 else if (oclength == 0) break;
2152     else
2153     {
2154     if (memcmp(eptr, occhars, oclength) != 0) break;
2155     eptr += oclength;
2156     }
2157 ph10 115 #else /* without SUPPORT_UCP */
2158     else break;
2159 ph10 123 #endif /* SUPPORT_UCP */
2160 nigel 77 }
2161 nigel 93
2162     if (possessive) continue;
2163 ph10 120 for(;;)
2164 nigel 77 {
2165 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2166 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2167 ph10 120 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2168 ph10 115 #ifdef SUPPORT_UCP
2169     eptr--;
2170     BACKCHAR(eptr);
2171 ph10 123 #else /* without SUPPORT_UCP */
2172 nigel 77 eptr -= length;
2173 ph10 123 #endif /* SUPPORT_UCP */
2174 nigel 77 }
2175     }
2176     /* Control never gets here */
2177     }
2178    
2179     /* If the length of a UTF-8 character is 1, we fall through here, and
2180     obey the code as for non-UTF-8 characters below, though in this case the
2181     value of fc will always be < 128. */
2182     }
2183     else
2184     #endif /* SUPPORT_UTF8 */
2185    
2186     /* When not in UTF-8 mode, load a single-byte character. */
2187     {
2188     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2189     fc = *ecode++;
2190     }
2191    
2192     /* The value of fc at this point is always less than 256, though we may or
2193     may not be in UTF-8 mode. The code is duplicated for the caseless and
2194     caseful cases, for speed, since matching characters is likely to be quite
2195     common. First, ensure the minimum number of matches are present. If min =
2196     max, continue at the same level without recursing. Otherwise, if
2197     minimizing, keep trying the rest of the expression and advancing one
2198     matching character if failing, up to the maximum. Alternatively, if
2199     maximizing, find the maximum number of characters and work backwards. */
2200    
2201     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2202     max, eptr));
2203    
2204     if ((ims & PCRE_CASELESS) != 0)
2205     {
2206     fc = md->lcc[fc];
2207     for (i = 1; i <= min; i++)
2208     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2209     if (min == max) continue;
2210     if (minimize)
2211     {
2212     for (fi = min;; fi++)
2213     {
2214 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2215 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2216     if (fi >= max || eptr >= md->end_subject ||
2217     fc != md->lcc[*eptr++])
2218     RRETURN(MATCH_NOMATCH);
2219     }
2220     /* Control never gets here */
2221     }
2222 nigel 93 else /* Maximize */
2223 nigel 77 {
2224     pp = eptr;
2225     for (i = min; i < max; i++)
2226     {
2227     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2228     eptr++;
2229     }
2230 nigel 93 if (possessive) continue;
2231 nigel 77 while (eptr >= pp)
2232     {
2233 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2234 nigel 77 eptr--;
2235     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2236     }
2237     RRETURN(MATCH_NOMATCH);
2238     }
2239     /* Control never gets here */
2240     }
2241    
2242     /* Caseful comparisons (single-byte characters only; multi-byte UTF-8 characters were fully handled above) */
2243    
2244     else
2245     {
2246     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2247     if (min == max) continue;
2248     if (minimize)
2249     {
2250     for (fi = min;; fi++)
2251     {
2252 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2253 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2254     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2255     RRETURN(MATCH_NOMATCH);
2256     }
2257     /* Control never gets here */
2258     }
2259 nigel 93 else /* Maximize */
2260 nigel 77 {
2261     pp = eptr;
2262     for (i = min; i < max; i++)
2263     {
2264     if (eptr >= md->end_subject || fc != *eptr) break;
2265     eptr++;
2266     }
2267 nigel 93 if (possessive) continue;
2268 nigel 77 while (eptr >= pp)
2269     {
2270 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2271 nigel 77 eptr--;
2272     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2273     }
2274     RRETURN(MATCH_NOMATCH);
2275     }
2276     }
2277     /* Control never gets here */
2278    
2279     /* Match a negated single one-byte character. The pattern character is
2280     always one byte, but the subject character we are checking can be multibyte. */
2281    
2282     case OP_NOT:
2283     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2284     ecode++;
2285     GETCHARINCTEST(c, eptr);
2286     if ((ims & PCRE_CASELESS) != 0)
2287     {
2288     #ifdef SUPPORT_UTF8
2289     if (c < 256)
2290     #endif
2291     c = md->lcc[c];
2292     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2293     }
2294     else
2295     {
2296     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2297     }
2298     break;
2299    
2300     /* Match a negated single one-byte character repeatedly. This is almost a
2301     repeat of the code for a repeated single character, but I haven't found a
2302     nice way of commoning these up that doesn't require a test of the
2303     positive/negative option for each character match. Maybe that wouldn't add
2304     very much to the time taken, but character matching *is* what this is all
2305     about... */
2306    
2307     case OP_NOTEXACT:
2308     min = max = GET2(ecode, 1);
2309     ecode += 3;
2310     goto REPEATNOTCHAR;
2311    
2312     case OP_NOTUPTO:
2313     case OP_NOTMINUPTO:
2314     min = 0;
2315     max = GET2(ecode, 1);
2316     minimize = *ecode == OP_NOTMINUPTO;
2317     ecode += 3;
2318     goto REPEATNOTCHAR;
2319    
2320 nigel 93 case OP_NOTPOSSTAR:
2321     possessive = TRUE;
2322     min = 0;
2323     max = INT_MAX;
2324     ecode++;
2325     goto REPEATNOTCHAR;
2326    
2327     case OP_NOTPOSPLUS:
2328     possessive = TRUE;
2329     min = 1;
2330     max = INT_MAX;
2331     ecode++;
2332     goto REPEATNOTCHAR;
2333    
2334     case OP_NOTPOSQUERY:
2335     possessive = TRUE;
2336     min = 0;
2337     max = 1;
2338     ecode++;
2339     goto REPEATNOTCHAR;
2340    
2341     case OP_NOTPOSUPTO:
2342     possessive = TRUE;
2343     min = 0;
2344     max = GET2(ecode, 1);
2345     ecode += 3;
2346     goto REPEATNOTCHAR;
2347    
2348 nigel 77 case OP_NOTSTAR:
2349     case OP_NOTMINSTAR:
2350     case OP_NOTPLUS:
2351     case OP_NOTMINPLUS:
2352     case OP_NOTQUERY:
2353     case OP_NOTMINQUERY:
2354     c = *ecode++ - OP_NOTSTAR;
2355     minimize = (c & 1) != 0;
2356     min = rep_min[c]; /* Pick up values from tables; */
2357     max = rep_max[c]; /* zero for max => infinity */
2358     if (max == 0) max = INT_MAX;
2359    
2360     /* Common code for all repeated single-byte matches. We can give up quickly
2361     if there are fewer than the minimum number of bytes left in the
2362     subject. */
2363    
2364     REPEATNOTCHAR:
2365     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2366     fc = *ecode++;
2367    
2368     /* The code is duplicated for the caseless and caseful cases, for speed,
2369     since matching characters is likely to be quite common. First, ensure the
2370     minimum number of matches are present. If min = max, continue at the same
2371     level without recursing. Otherwise, if minimizing, keep trying the rest of
2372     the expression and advancing one matching character if failing, up to the
2373     maximum. Alternatively, if maximizing, find the maximum number of
2374     characters and work backwards. */
2375    
2376     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2377     max, eptr));
2378    
2379     if ((ims & PCRE_CASELESS) != 0)
2380     {
2381     fc = md->lcc[fc];
2382    
2383     #ifdef SUPPORT_UTF8
2384     /* UTF-8 mode */
2385     if (utf8)
2386     {
2387 nigel 93 register unsigned int d;
2388 nigel 77 for (i = 1; i <= min; i++)
2389     {
2390     GETCHARINC(d, eptr);
2391     if (d < 256) d = md->lcc[d];
2392     if (fc == d) RRETURN(MATCH_NOMATCH);
2393     }
2394     }
2395     else
2396     #endif
2397    
2398     /* Not UTF-8 mode */
2399     {
2400     for (i = 1; i <= min; i++)
2401     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2402     }
2403    
2404     if (min == max) continue;
2405    
2406     if (minimize)
2407     {
2408     #ifdef SUPPORT_UTF8
2409     /* UTF-8 mode */
2410     if (utf8)
2411     {
2412 nigel 93 register unsigned int d;
2413 nigel 77 for (fi = min;; fi++)
2414     {
2415 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2416 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2417     GETCHARINC(d, eptr);
2418     if (d < 256) d = md->lcc[d];
2419     if (fi >= max || eptr >= md->end_subject || fc == d)
2420     RRETURN(MATCH_NOMATCH);
2421     }
2422     }
2423     else
2424     #endif
2425     /* Not UTF-8 mode */
2426     {
2427     for (fi = min;; fi++)
2428     {
2429 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2430 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2431     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2432     RRETURN(MATCH_NOMATCH);
2433     }
2434     }
2435     /* Control never gets here */
2436     }
2437    
2438     /* Maximize case */
2439    
2440     else
2441     {
2442     pp = eptr;
2443    
2444     #ifdef SUPPORT_UTF8
2445     /* UTF-8 mode */
2446     if (utf8)
2447     {
2448 nigel 93 register unsigned int d;
2449 nigel 77 for (i = min; i < max; i++)
2450     {
2451     int len = 1;
2452     if (eptr >= md->end_subject) break;
2453     GETCHARLEN(d, eptr, len);
2454     if (d < 256) d = md->lcc[d];
2455     if (fc == d) break;
2456     eptr += len;
2457     }
2458 nigel 93 if (possessive) continue;
2459     for(;;)
2460 nigel 77 {
2461 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2462 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2463     if (eptr-- == pp) break; /* Stop if tried at original pos */
2464     BACKCHAR(eptr);
2465     }
2466     }
2467     else
2468     #endif
2469     /* Not UTF-8 mode */
2470     {
2471     for (i = min; i < max; i++)
2472     {
2473     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2474     eptr++;
2475     }
2476 nigel 93 if (possessive) continue;
2477 nigel 77 while (eptr >= pp)
2478     {
2479 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2480 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2481     eptr--;
2482     }
2483     }
2484    
2485     RRETURN(MATCH_NOMATCH);
2486     }
2487     /* Control never gets here */
2488     }
2489    
2490     /* Caseful comparisons */
2491    
2492     else
2493     {
2494     #ifdef SUPPORT_UTF8
2495     /* UTF-8 mode */
2496     if (utf8)
2497     {
2498 nigel 93 register unsigned int d;
2499 nigel 77 for (i = 1; i <= min; i++)
2500     {
2501     GETCHARINC(d, eptr);
2502     if (fc == d) RRETURN(MATCH_NOMATCH);
2503     }
2504     }
2505     else
2506     #endif
2507     /* Not UTF-8 mode */
2508     {
2509     for (i = 1; i <= min; i++)
2510     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2511     }
2512    
2513     if (min == max) continue;
2514    
2515     if (minimize)
2516     {
2517     #ifdef SUPPORT_UTF8
2518     /* UTF-8 mode */
2519     if (utf8)
2520     {
2521 nigel 93 register unsigned int d;
2522 nigel 77 for (fi = min;; fi++)
2523     {
2524 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2525 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2526     GETCHARINC(d, eptr);
2527     if (fi >= max || eptr >= md->end_subject || fc == d)
2528     RRETURN(MATCH_NOMATCH);
2529     }
2530     }
2531     else
2532     #endif
2533     /* Not UTF-8 mode */
2534     {
2535     for (fi = min;; fi++)
2536     {
2537 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2538 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2539     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2540     RRETURN(MATCH_NOMATCH);
2541     }
2542     }
2543     /* Control never gets here */
2544     }
2545    
2546     /* Maximize case */
2547    
2548     else
2549     {
2550     pp = eptr;
2551    
2552     #ifdef SUPPORT_UTF8
2553     /* UTF-8 mode */
2554     if (utf8)
2555     {
2556 nigel 93 register unsigned int d;
2557 nigel 77 for (i = min; i < max; i++)
2558     {
2559     int len = 1;
2560     if (eptr >= md->end_subject) break;
2561     GETCHARLEN(d, eptr, len);
2562     if (fc == d) break;
2563     eptr += len;
2564     }
2565 nigel 93 if (possessive) continue;
2566 nigel 77 for(;;)
2567     {
2568 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2569 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2570     if (eptr-- == pp) break; /* Stop if tried at original pos */
2571     BACKCHAR(eptr);
2572     }
2573     }
2574     else
2575     #endif
2576     /* Not UTF-8 mode */
2577     {
2578     for (i = min; i < max; i++)
2579     {
2580     if (eptr >= md->end_subject || fc == *eptr) break;
2581     eptr++;
2582     }
2583 nigel 93 if (possessive) continue;
2584 nigel 77 while (eptr >= pp)
2585     {
2586 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2587 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2588     eptr--;
2589     }
2590     }
2591    
2592     RRETURN(MATCH_NOMATCH);
2593     }
2594     }
2595     /* Control never gets here */
2596    
2597     /* Match a single character type repeatedly; several different opcodes
2598     share code. This is very similar to the code for single characters, but we
2599     repeat it in the interests of efficiency. */
2600    
2601     case OP_TYPEEXACT:
2602     min = max = GET2(ecode, 1);
2603     minimize = TRUE;
2604     ecode += 3;
2605     goto REPEATTYPE;
2606    
2607     case OP_TYPEUPTO:
2608     case OP_TYPEMINUPTO:
2609     min = 0;
2610     max = GET2(ecode, 1);
2611     minimize = *ecode == OP_TYPEMINUPTO;
2612     ecode += 3;
2613     goto REPEATTYPE;
2614    
2615 nigel 93 case OP_TYPEPOSSTAR:
2616     possessive = TRUE;
2617     min = 0;
2618     max = INT_MAX;
2619     ecode++;
2620     goto REPEATTYPE;
2621    
2622     case OP_TYPEPOSPLUS:
2623     possessive = TRUE;
2624     min = 1;
2625     max = INT_MAX;
2626     ecode++;
2627     goto REPEATTYPE;
2628    
2629     case OP_TYPEPOSQUERY:
2630     possessive = TRUE;
2631     min = 0;
2632     max = 1;
2633     ecode++;
2634     goto REPEATTYPE;
2635    
2636     case OP_TYPEPOSUPTO:
2637     possessive = TRUE;
2638     min = 0;
2639     max = GET2(ecode, 1);
2640     ecode += 3;
2641     goto REPEATTYPE;
2642    
2643 nigel 77 case OP_TYPESTAR:
2644     case OP_TYPEMINSTAR:
2645     case OP_TYPEPLUS:
2646     case OP_TYPEMINPLUS:
2647     case OP_TYPEQUERY:
2648     case OP_TYPEMINQUERY:
2649     c = *ecode++ - OP_TYPESTAR;
2650     minimize = (c & 1) != 0;
2651     min = rep_min[c]; /* Pick up values from tables; */
2652     max = rep_max[c]; /* zero for max => infinity */
2653     if (max == 0) max = INT_MAX;
2654    
2655     /* Common code for all repeated single character type matches. Note that
2656     in UTF-8 mode, '.' matches a character of any length, but for the other
2657     character types, the valid characters are all one-byte long. */
2658    
2659     REPEATTYPE:
2660     ctype = *ecode++; /* Code for the character type */
2661    
2662     #ifdef SUPPORT_UCP
2663     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2664     {
2665     prop_fail_result = ctype == OP_NOTPROP;
2666     prop_type = *ecode++;
2667 nigel 87 prop_value = *ecode++;
2668 nigel 77 }
2669     else prop_type = -1;
2670     #endif
2671    
2672     /* First, ensure the minimum number of matches are present. Use inline
2673     code for maximizing the speed, and do the type test once at the start
2674     (i.e. keep it out of the loop). Also we can test that there are at least
2675     the minimum number of bytes before we start. This isn't as effective in
2676     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2677     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2678     and single-bytes. */
2679    
2680     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2681     if (min > 0)
2682     {
2683     #ifdef SUPPORT_UCP
2684 nigel 87 if (prop_type >= 0)
2685 nigel 77 {
2686 nigel 87 switch(prop_type)
2687 nigel 77 {
2688 nigel 87 case PT_ANY:
2689     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2690     for (i = 1; i <= min; i++)
2691     {
2692     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2693     GETCHARINC(c, eptr);
2694     }
2695     break;
2696    
2697     case PT_LAMP:
2698     for (i = 1; i <= min; i++)
2699     {
2700     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2701     GETCHARINC(c, eptr);
2702     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2703     if ((prop_chartype == ucp_Lu ||
2704     prop_chartype == ucp_Ll ||
2705     prop_chartype == ucp_Lt) == prop_fail_result)
2706     RRETURN(MATCH_NOMATCH);
2707     }
2708     break;
2709    
2710     case PT_GC:
2711     for (i = 1; i <= min; i++)
2712     {
2713     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2714     GETCHARINC(c, eptr);
2715     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2716     if ((prop_category == prop_value) == prop_fail_result)
2717     RRETURN(MATCH_NOMATCH);
2718     }
2719     break;
2720    
2721     case PT_PC:
2722     for (i = 1; i <= min; i++)
2723     {
2724     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2725     GETCHARINC(c, eptr);
2726     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2727     if ((prop_chartype == prop_value) == prop_fail_result)
2728     RRETURN(MATCH_NOMATCH);
2729     }
2730     break;
2731    
2732     case PT_SC:
2733     for (i = 1; i <= min; i++)
2734     {
2735     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2736     GETCHARINC(c, eptr);
2737     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2738     if ((prop_script == prop_value) == prop_fail_result)
2739     RRETURN(MATCH_NOMATCH);
2740     }
2741     break;
2742    
2743     default:
2744     RRETURN(PCRE_ERROR_INTERNAL);
2745 nigel 77 }
2746     }
2747    
2748     /* Match extended Unicode sequences. We will get here only if the
2749     support is in the binary; otherwise a compile-time error occurs. */
2750    
2751     else if (ctype == OP_EXTUNI)
2752     {
2753     for (i = 1; i <= min; i++)
2754     {
2755     GETCHARINCTEST(c, eptr);
2756 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2757 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2758     while (eptr < md->end_subject)
2759     {
2760     int len = 1;
2761     if (!utf8) c = *eptr; else
2762     {
2763     GETCHARLEN(c, eptr, len);
2764     }
2765 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2766 nigel 77 if (prop_category != ucp_M) break;
2767     eptr += len;
2768     }
2769     }
2770     }
2771    
2772     else
2773     #endif /* SUPPORT_UCP */
2774    
2775     /* Handle all other cases when the coding is UTF-8 */
2776    
2777     #ifdef SUPPORT_UTF8
2778     if (utf8) switch(ctype)
2779     {
2780     case OP_ANY:
2781     for (i = 1; i <= min; i++)
2782     {
2783     if (eptr >= md->end_subject ||
2784 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2785 nigel 77 RRETURN(MATCH_NOMATCH);
2786 nigel 91 eptr++;
2787 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2788     }
2789     break;
2790    
2791     case OP_ANYBYTE:
2792     eptr += min;
2793     break;
2794    
2795 nigel 93 case OP_ANYNL:
2796     for (i = 1; i <= min; i++)
2797     {
2798     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2799     GETCHARINC(c, eptr);
2800     switch(c)
2801     {
2802     default: RRETURN(MATCH_NOMATCH);
2803     case 0x000d:
2804     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2805     break;
2806     case 0x000a:
2807     case 0x000b:
2808     case 0x000c:
2809     case 0x0085:
2810     case 0x2028:
2811     case 0x2029:
2812     break;
2813     }
2814     }
2815     break;
2816    
2817 nigel 77 case OP_NOT_DIGIT:
2818     for (i = 1; i <= min; i++)
2819     {
2820     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2821     GETCHARINC(c, eptr);
2822     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2823     RRETURN(MATCH_NOMATCH);
2824     }
2825     break;
2826    
2827     case OP_DIGIT:
2828     for (i = 1; i <= min; i++)
2829     {
2830     if (eptr >= md->end_subject ||
2831     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2832     RRETURN(MATCH_NOMATCH);
2833     /* No need to skip more bytes - we know it's a 1-byte character */
2834     }
2835     break;
2836    
2837     case OP_NOT_WHITESPACE:
2838     for (i = 1; i <= min; i++)
2839     {
2840     if (eptr >= md->end_subject ||
2841     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2842     RRETURN(MATCH_NOMATCH);
2843     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2844     }
2845     break;
2846    
2847     case OP_WHITESPACE:
2848     for (i = 1; i <= min; i++)
2849     {
2850     if (eptr >= md->end_subject ||
2851     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2852     RRETURN(MATCH_NOMATCH);
2853     /* No need to skip more bytes - we know it's a 1-byte character */
2854     }
2855     break;
2856    
2857     case OP_NOT_WORDCHAR:
2858     for (i = 1; i <= min; i++)
2859     {
2860     if (eptr >= md->end_subject ||
2861     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2862     RRETURN(MATCH_NOMATCH);
2863     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2864     }
2865     break;
2866    
2867     case OP_WORDCHAR:
2868     for (i = 1; i <= min; i++)
2869     {
2870     if (eptr >= md->end_subject ||
2871     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2872     RRETURN(MATCH_NOMATCH);
2873     /* No need to skip more bytes - we know it's a 1-byte character */
2874     }
2875     break;
2876    
2877     default:
2878     RRETURN(PCRE_ERROR_INTERNAL);
2879     } /* End switch(ctype) */
2880    
2881     else
2882     #endif /* SUPPORT_UTF8 */
2883    
2884     /* Code for the non-UTF-8 case for minimum matching of operators other
2885 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
2886     number of bytes present, as this was tested above. */
2887 nigel 77
2888     switch(ctype)
2889     {
2890     case OP_ANY:
2891     if ((ims & PCRE_DOTALL) == 0)
2892     {
2893     for (i = 1; i <= min; i++)
2894 nigel 91 {
2895 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2896 nigel 91 eptr++;
2897     }
2898 nigel 77 }
2899     else eptr += min;
2900     break;
2901    
2902     case OP_ANYBYTE:
2903     eptr += min;
2904     break;
2905    
2906 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
2907     bytes are present in this case. */
2908    
2909     case OP_ANYNL:
2910     for (i = 1; i <= min; i++)
2911     {
2912     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2913     switch(*eptr++)
2914     {
2915     default: RRETURN(MATCH_NOMATCH);
2916     case 0x000d:
2917     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2918     break;
2919     case 0x000a:
2920     case 0x000b:
2921     case 0x000c:
2922     case 0x0085:
2923     break;
2924     }
2925     }
2926     break;
2927    
2928 nigel 77 case OP_NOT_DIGIT:
2929     for (i = 1; i <= min; i++)
2930     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2931     break;
2932    
2933     case OP_DIGIT:
2934     for (i = 1; i <= min; i++)
2935     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2936     break;
2937    
2938     case OP_NOT_WHITESPACE:
2939     for (i = 1; i <= min; i++)
2940     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2941     break;
2942    
2943     case OP_WHITESPACE:
2944     for (i = 1; i <= min; i++)
2945     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2946     break;
2947    
2948     case OP_NOT_WORDCHAR:
2949     for (i = 1; i <= min; i++)
2950     if ((md->ctypes[*eptr++] & ctype_word) != 0)
2951     RRETURN(MATCH_NOMATCH);
2952     break;
2953    
2954     case OP_WORDCHAR:
2955     for (i = 1; i <= min; i++)
2956     if ((md->ctypes[*eptr++] & ctype_word) == 0)
2957     RRETURN(MATCH_NOMATCH);
2958     break;
2959    
2960     default:
2961     RRETURN(PCRE_ERROR_INTERNAL);
2962     }
2963     }
2964    
2965     /* If min = max, continue at the same level without recursing */
2966    
2967     if (min == max) continue;
2968    
2969     /* If minimizing, we have to test the rest of the pattern before each
2970     subsequent match. Again, separate the UTF-8 case for speed, and also
2971     separate the UCP cases. */
2972    
2973     if (minimize)
2974     {
2975     #ifdef SUPPORT_UCP
2976 nigel 87 if (prop_type >= 0)
2977 nigel 77 {
2978 nigel 87 switch(prop_type)
2979 nigel 77 {
2980 nigel 87 case PT_ANY:
2981     for (fi = min;; fi++)
2982     {
2983 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
2984 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2985     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2986     GETCHARINC(c, eptr);
2987     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2988     }
2989 nigel 93 /* Control never gets here */
2990 nigel 87
2991     case PT_LAMP:
2992     for (fi = min;; fi++)
2993     {
2994 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
2995 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2996     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2997     GETCHARINC(c, eptr);
2998     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2999     if ((prop_chartype == ucp_Lu ||
3000     prop_chartype == ucp_Ll ||
3001     prop_chartype == ucp_Lt) == prop_fail_result)
3002     RRETURN(MATCH_NOMATCH);
3003     }
3004 nigel 93 /* Control never gets here */
3005 nigel 87
3006     case PT_GC:
3007     for (fi = min;; fi++)
3008     {
3009 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3010 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3011     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3012     GETCHARINC(c, eptr);
3013     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3014     if ((prop_category == prop_value) == prop_fail_result)
3015     RRETURN(MATCH_NOMATCH);
3016     }
3017 nigel 93 /* Control never gets here */
3018 nigel 87
3019     case PT_PC:
3020     for (fi = min;; fi++)
3021     {
3022 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3023 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3024     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3025     GETCHARINC(c, eptr);
3026     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3027     if ((prop_chartype == prop_value) == prop_fail_result)
3028     RRETURN(MATCH_NOMATCH);
3029     }
3030 nigel 93 /* Control never gets here */
3031 nigel 87
3032     case PT_SC:
3033     for (fi = min;; fi++)
3034     {
3035 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3036 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3037     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3038     GETCHARINC(c, eptr);
3039     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3040     if ((prop_script == prop_value) == prop_fail_result)
3041     RRETURN(MATCH_NOMATCH);
3042     }
3043 nigel 93 /* Control never gets here */
3044 nigel 87
3045     default:
3046     RRETURN(PCRE_ERROR_INTERNAL);
3047 nigel 77 }
3048     }
3049    
3050     /* Match extended Unicode sequences. We will get here only if the
3051     support is in the binary; otherwise a compile-time error occurs. */
3052    
3053     else if (ctype == OP_EXTUNI)
3054     {
3055     for (fi = min;; fi++)
3056     {
3057 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3058 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3059     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3060     GETCHARINCTEST(c, eptr);
3061 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3062 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3063     while (eptr < md->end_subject)
3064     {
3065     int len = 1;
3066     if (!utf8) c = *eptr; else
3067     {
3068     GETCHARLEN(c, eptr, len);
3069     }
3070 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3071 nigel 77 if (prop_category != ucp_M) break;
3072     eptr += len;
3073     }
3074     }
3075     }
3076    
3077     else
3078     #endif /* SUPPORT_UCP */
3079    
3080     #ifdef SUPPORT_UTF8
3081     /* UTF-8 mode */
3082     if (utf8)
3083     {
3084     for (fi = min;; fi++)
3085     {
3086 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3087 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3088 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3089     (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3090 nigel 93 IS_NEWLINE(eptr)))
3091 nigel 91 RRETURN(MATCH_NOMATCH);
3092 nigel 77
3093     GETCHARINC(c, eptr);
3094     switch(ctype)
3095     {
3096 nigel 91 case OP_ANY: /* This is the DOTALL case */
3097 nigel 77 break;
3098    
3099     case OP_ANYBYTE:
3100     break;
3101    
3102 nigel 93 case OP_ANYNL:
3103     switch(c)
3104     {
3105     default: RRETURN(MATCH_NOMATCH);
3106     case 0x000d:
3107     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3108     break;
3109     case 0x000a:
3110     case 0x000b:
3111     case 0x000c:
3112     case 0x0085:
3113     case 0x2028:
3114     case 0x2029:
3115     break;
3116     }
3117     break;
3118    
3119 nigel 77 case OP_NOT_DIGIT:
3120     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3121     RRETURN(MATCH_NOMATCH);
3122     break;
3123    
3124     case OP_DIGIT:
3125     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3126     RRETURN(MATCH_NOMATCH);
3127     break;
3128    
3129     case OP_NOT_WHITESPACE:
3130     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3131     RRETURN(MATCH_NOMATCH);
3132     break;
3133    
3134     case OP_WHITESPACE:
3135     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3136     RRETURN(MATCH_NOMATCH);
3137     break;
3138    
3139     case OP_NOT_WORDCHAR:
3140     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3141     RRETURN(MATCH_NOMATCH);
3142     break;
3143    
3144     case OP_WORDCHAR:
3145     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3146     RRETURN(MATCH_NOMATCH);
3147     break;
3148    
3149     default:
3150     RRETURN(PCRE_ERROR_INTERNAL);
3151     }
3152     }
3153     }
3154     else
3155     #endif
3156     /* Not UTF-8 mode */
3157     {
3158     for (fi = min;; fi++)
3159     {
3160 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3161 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3162 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3163 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3164 nigel 91 RRETURN(MATCH_NOMATCH);
3165    
3166 nigel 77 c = *eptr++;
3167     switch(ctype)
3168     {
3169 nigel 91 case OP_ANY: /* This is the DOTALL case */
3170 nigel 77 break;
3171    
3172     case OP_ANYBYTE:
3173     break;
3174    
3175 nigel 93 case OP_ANYNL:
3176     switch(c)
3177     {
3178     default: RRETURN(MATCH_NOMATCH);
3179     case 0x000d:
3180     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3181     break;
3182     case 0x000a:
3183     case 0x000b:
3184     case 0x000c:
3185     case 0x0085:
3186     break;
3187     }
3188     break;
3189    
3190 nigel 77 case OP_NOT_DIGIT:
3191     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3192     break;
3193    
3194     case OP_DIGIT:
3195     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3196     break;
3197    
3198     case OP_NOT_WHITESPACE:
3199     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3200     break;
3201    
3202     case OP_WHITESPACE:
3203     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3204     break;
3205    
3206     case OP_NOT_WORDCHAR:
3207     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3208     break;
3209    
3210     case OP_WORDCHAR:
3211     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3212     break;
3213    
3214     default:
3215     RRETURN(PCRE_ERROR_INTERNAL);
3216     }
3217     }
3218     }
3219     /* Control never gets here */
3220     }
3221    
3222 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3223 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3224     UTF-8 and UCP stuff separate. */
3225    
3226     else
3227     {
3228     pp = eptr; /* Remember where we started */
3229    
3230     #ifdef SUPPORT_UCP
3231 nigel 87 if (prop_type >= 0)
3232 nigel 77 {
3233 nigel 87 switch(prop_type)
3234 nigel 77 {
3235 nigel 87 case PT_ANY:
3236     for (i = min; i < max; i++)
3237     {
3238     int len = 1;
3239     if (eptr >= md->end_subject) break;
3240     GETCHARLEN(c, eptr, len);
3241     if (prop_fail_result) break;
3242     eptr+= len;
3243     }
3244     break;
3245    
3246     case PT_LAMP:
3247     for (i = min; i < max; i++)
3248     {
3249     int len = 1;
3250     if (eptr >= md->end_subject) break;
3251     GETCHARLEN(c, eptr, len);
3252     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3253     if ((prop_chartype == ucp_Lu ||
3254     prop_chartype == ucp_Ll ||
3255     prop_chartype == ucp_Lt) == prop_fail_result)
3256     break;
3257     eptr+= len;
3258     }
3259     break;
3260    
3261     case PT_GC:
3262     for (i = min; i < max; i++)
3263     {
3264     int len = 1;
3265     if (eptr >= md->end_subject) break;
3266     GETCHARLEN(c, eptr, len);
3267     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3268     if ((prop_category == prop_value) == prop_fail_result)
3269     break;
3270     eptr+= len;
3271     }
3272     break;
3273    
3274     case PT_PC:
3275     for (i = min; i < max; i++)
3276     {
3277     int len = 1;
3278     if (eptr >= md->end_subject) break;
3279     GETCHARLEN(c, eptr, len);
3280     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3281     if ((prop_chartype == prop_value) == prop_fail_result)
3282     break;
3283     eptr+= len;
3284     }
3285     break;
3286    
3287     case PT_SC:
3288     for (i = min; i < max; i++)
3289     {
3290     int len = 1;
3291     if (eptr >= md->end_subject) break;
3292     GETCHARLEN(c, eptr, len);
3293     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3294     if ((prop_script == prop_value) == prop_fail_result)
3295     break;
3296     eptr+= len;
3297     }
3298     break;
3299 nigel 77 }
3300    
3301     /* eptr is now past the end of the maximum run */
3302    
3303 nigel 93 if (possessive) continue;
3304 nigel 77 for(;;)
3305     {
3306 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3307 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3308     if (eptr-- == pp) break; /* Stop if tried at original pos */
3309     BACKCHAR(eptr);
3310     }
3311     }
3312    
3313     /* Match extended Unicode sequences. We will get here only if the
3314     support is in the binary; otherwise a compile-time error occurs. */
3315    
3316     else if (ctype == OP_EXTUNI)
3317     {
3318     for (i = min; i < max; i++)
3319     {
3320     if (eptr >= md->end_subject) break;
3321     GETCHARINCTEST(c, eptr);
3322 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3323 nigel 77 if (prop_category == ucp_M) break;
3324     while (eptr < md->end_subject)
3325     {
3326     int len = 1;
3327     if (!utf8) c = *eptr; else
3328     {
3329     GETCHARLEN(c, eptr, len);
3330     }
3331 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3332 nigel 77 if (prop_category != ucp_M) break;
3333     eptr += len;
3334     }
3335     }
3336    
3337     /* eptr is now past the end of the maximum run */
3338    
3339 nigel 93 if (possessive) continue;
3340 nigel 77 for(;;)
3341     {
3342 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3343 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3344     if (eptr-- == pp) break; /* Stop if tried at original pos */
3345     for (;;) /* Move back over one extended */
3346     {
3347     int len = 1;
3348     BACKCHAR(eptr);
3349     if (!utf8) c = *eptr; else
3350     {
3351     GETCHARLEN(c, eptr, len);
3352     }
3353 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3354 nigel 77 if (prop_category != ucp_M) break;
3355     eptr--;
3356     }
3357     }
3358     }
3359    
3360     else
3361     #endif /* SUPPORT_UCP */
3362    
3363     #ifdef SUPPORT_UTF8
3364     /* UTF-8 mode */
3365    
3366     if (utf8)
3367     {
3368     switch(ctype)
3369     {
3370     case OP_ANY:
3371    
3372 nigel 91 /* Special code is required for UTF8, but when the maximum is
3373     unlimited we don't need it, so we repeat the non-UTF8 code. This is
3374     probably worth it, because .* is quite a common idiom. */
3375 nigel 77
3376     if (max < INT_MAX)
3377     {
3378     if ((ims & PCRE_DOTALL) == 0)
3379     {
3380     for (i = min; i < max; i++)
3381     {
3382 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3383 nigel 77 eptr++;
3384     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3385     }
3386     }
3387     else
3388     {
3389     for (i = min; i < max; i++)
3390     {
3391 nigel 91 if (eptr >= md->end_subject) break;
3392 nigel 77 eptr++;
3393     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3394     }
3395     }
3396     }
3397    
3398     /* Handle unlimited UTF-8 repeat */
3399    
3400     else
3401     {
3402     if ((ims & PCRE_DOTALL) == 0)
3403     {
3404     for (i = min; i < max; i++)
3405     {
3406 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3407 nigel 77 eptr++;
3408     }
3409     break;
3410     }
3411     else
3412     {
3413     c = max - min;
3414 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3415     c = md->end_subject - eptr;
3416 nigel 77 eptr += c;
3417     }
3418     }
3419     break;
3420    
3421     /* The byte case is the same as non-UTF8 */
3422    
3423     case OP_ANYBYTE:
3424     c = max - min;
3425 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3426     c = md->end_subject - eptr;
3427 nigel 77 eptr += c;
3428     break;
3429    
3430 nigel 93 case OP_ANYNL:
3431     for (i = min; i < max; i++)
3432     {
3433     int len = 1;
3434     if (eptr >= md->end_subject) break;
3435     GETCHARLEN(c, eptr, len);
3436     if (c == 0x000d)
3437     {
3438     if (++eptr >= md->end_subject) break;
3439     if (*eptr == 0x000a) eptr++;
3440     }
3441     else
3442     {
3443     if (c != 0x000a && c != 0x000b && c != 0x000c &&
3444     c != 0x0085 && c != 0x2028 && c != 0x2029)
3445     break;
3446     eptr += len;
3447     }
3448     }
3449     break;
3450    
3451 nigel 77 case OP_NOT_DIGIT:
3452     for (i = min; i < max; i++)
3453     {
3454     int len = 1;
3455     if (eptr >= md->end_subject) break;
3456     GETCHARLEN(c, eptr, len);
3457     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3458     eptr+= len;
3459     }
3460     break;
3461    
3462     case OP_DIGIT:
3463     for (i = min; i < max; i++)
3464     {
3465     int len = 1;
3466     if (eptr >= md->end_subject) break;
3467     GETCHARLEN(c, eptr, len);
3468     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3469     eptr+= len;
3470     }
3471     break;
3472    
3473     case OP_NOT_WHITESPACE:
3474     for (i = min; i < max; i++)
3475     {
3476     int len = 1;
3477     if (eptr >= md->end_subject) break;
3478     GETCHARLEN(c, eptr, len);
3479     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3480     eptr+= len;
3481     }
3482     break;
3483    
3484     case OP_WHITESPACE:
3485     for (i = min; i < max; i++)
3486     {
3487     int len = 1;
3488     if (eptr >= md->end_subject) break;
3489     GETCHARLEN(c, eptr, len);
3490     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3491     eptr+= len;
3492     }
3493     break;
3494    
3495     case OP_NOT_WORDCHAR:
3496     for (i = min; i < max; i++)
3497     {
3498     int len = 1;
3499     if (eptr >= md->end_subject) break;
3500     GETCHARLEN(c, eptr, len);
3501     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3502     eptr+= len;
3503     }
3504     break;
3505    
3506     case OP_WORDCHAR:
3507     for (i = min; i < max; i++)
3508     {
3509     int len = 1;
3510     if (eptr >= md->end_subject) break;
3511     GETCHARLEN(c, eptr, len);
3512     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3513     eptr+= len;
3514     }
3515     break;
3516    
3517     default:
3518     RRETURN(PCRE_ERROR_INTERNAL);
3519     }
3520    
3521     /* eptr is now past the end of the maximum run */
3522    
3523 nigel 93 if (possessive) continue;
3524 nigel 77 for(;;)
3525     {
3526 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
3527 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3528     if (eptr-- == pp) break; /* Stop if tried at original pos */
3529     BACKCHAR(eptr);
3530     }
3531     }
3532     else
3533     #endif
3534    
3535     /* Not UTF-8 mode */
3536     {
3537     switch(ctype)
3538     {
3539     case OP_ANY:
3540     if ((ims & PCRE_DOTALL) == 0)
3541     {
3542     for (i = min; i < max; i++)
3543     {
3544 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3545 nigel 77 eptr++;
3546     }
3547     break;
3548     }
3549     /* For DOTALL case, fall through and treat as \C */
3550    
3551     case OP_ANYBYTE:
3552     c = max - min;
3553 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3554     c = md->end_subject - eptr;
3555 nigel 77 eptr += c;
3556     break;
3557    
3558 nigel 93 case OP_ANYNL:
3559     for (i = min; i < max; i++)
3560     {
3561     if (eptr >= md->end_subject) break;
3562     c = *eptr;
3563     if (c == 0x000d)
3564     {
3565     if (++eptr >= md->end_subject) break;
3566     if (*eptr == 0x000a) eptr++;
3567     }
3568     else
3569     {
3570     if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
3571     break;
3572     eptr++;
3573     }
3574     }
3575     break;
3576    
3577 nigel 77 case OP_NOT_DIGIT:
3578     for (i = min; i < max; i++)
3579     {
3580     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3581     break;
3582     eptr++;
3583     }
3584     break;
3585    
3586     case OP_DIGIT:
3587     for (i = min; i < max; i++)
3588     {
3589     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3590     break;
3591     eptr++;
3592     }
3593     break;
3594    
3595     case OP_NOT_WHITESPACE:
3596     for (i = min; i < max; i++)
3597     {
3598     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3599     break;
3600     eptr++;
3601     }
3602     break;
3603    
3604     case OP_WHITESPACE:
3605     for (i = min; i < max; i++)
3606     {
3607     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3608     break;
3609     eptr++;
3610     }
3611     break;
3612    
3613     case OP_NOT_WORDCHAR:
3614     for (i = min; i < max; i++)
3615     {
3616     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3617     break;
3618     eptr++;
3619     }
3620     break;
3621    
3622     case OP_WORDCHAR:
3623     for (i = min; i < max; i++)
3624     {
3625     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3626     break;
3627     eptr++;
3628     }
3629     break;
3630    
3631     default:
3632     RRETURN(PCRE_ERROR_INTERNAL);
3633     }
3634    
3635     /* eptr is now past the end of the maximum run */
3636    
3637 nigel 93 if (possessive) continue;
3638 nigel 77 while (eptr >= pp)
3639     {
3640 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
3641 nigel 77 eptr--;
3642     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3643     }
3644     }
3645    
3646     /* Get here if we can't make it match with any permitted repetitions */
3647    
3648     RRETURN(MATCH_NOMATCH);
3649     }
3650     /* Control never gets here */
3651    
3652 nigel 93 /* There's been some horrible disaster. Arrival here can only mean there is
3653     something seriously wrong in the code above or the OP_xxx definitions. */
3654 nigel 77
3655     default:
3656     DPRINTF(("Unknown opcode %d\n", *ecode));
3657 nigel 93 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
3658 nigel 77 }
3659    
3660     /* Do not stick any code in here without much thought; it is assumed
3661     that "continue" in the code above comes out to here to repeat the main
3662     loop. */
3663    
3664     } /* End of main loop */
3665     /* Control never reaches here */
3666 ph10 164
3667    
3668 ph10 165 /* When compiling to use the heap rather than the stack for recursive calls to
3669     match(), the RRETURN() macro jumps here. The number that is saved in
3670 ph10 164 frame->Xwhere indicates which label we actually want to return to. */
3671    
3672     #ifdef NO_RECURSE
3673     #define LBL(val) case val: goto L_RM##val;
3674     HEAP_RETURN:
3675     switch (frame->Xwhere)
3676     {
3677     LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
3678     LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
3679     LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
3680     LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
3681     LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39) LBL(40)
3682     LBL(41) LBL(42) LBL(43) LBL(44) LBL(45) LBL(46) LBL(47)
3683     default:
3684     DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
3685     return PCRE_ERROR_INTERNAL;
3686     }
3687 ph10 165 #undef LBL
3688 ph10 164 #endif /* NO_RECURSE */
3689 nigel 77 }
3690    
3691    
3692     /***************************************************************************
3693     ****************************************************************************
3694     RECURSION IN THE match() FUNCTION
3695    
3696     Undefine all the macros that were defined above to handle this. */
3697    
/* Macro cleanup after match(). The #undef directives inside the conditional
take effect only when NO_RECURSE is defined; fc and fi, at the end, are
removed unconditionally. The first group includes the names that match()
passes to RMATCH(): eptr, ecode, offset_top, ims and eptrb.
NOTE(review): the matching #define lines are earlier in the file and are not
visible from here - confirm that the two lists agree. */
3698     #ifdef NO_RECURSE
3699     #undef eptr
3700     #undef ecode
3701 ph10 168 #undef mstart
3702 nigel 77 #undef offset_top
3703     #undef ims
3704     #undef eptrb
3705     #undef flags
3706     /* pp records where a maximizing repeat started in match(); the other names in this group are presumably further working pointers - TODO confirm */
3707     #undef callpat
3708     #undef charptr
3709     #undef data
3710     #undef next
3711     #undef pp
3712     #undef prev
3713     #undef saved_eptr
3714    
3715     #undef new_recursive
3716    
3717     #undef cur_is_word
3718     #undef condition
3719     #undef prev_is_word
3720    
3721     #undef original_ims
3722     /* ctype, max and min are the repeat type and bounds used by the character-type repeat code in match() */
3723     #undef ctype
3724     #undef length
3725     #undef max
3726     #undef min
3727     #undef number
3728     #undef offset
3729     #undef op
3730     #undef save_capture_last
3731     #undef save_offset1
3732     #undef save_offset2
3733     #undef save_offset3
3734     #undef stacksave
3735    
3736     #undef newptrb
3737    
3738     #endif /* NO_RECURSE */
3739    
3740     /* These two are defined as macros in both cases */
3741    
3742     #undef fc
3743     #undef fi /* fi is the counter of the minimizing repeat loops in match() */
3744    
3745     /***************************************************************************
3746     ***************************************************************************/
3747    
3748    
3749    
3750     /*************************************************
3751     * Execute a Regular Expression *
3752     *************************************************/
3753    
3754     /* This function applies a compiled re to a subject string and picks out
3755     portions of the string if it matches. Two elements in the vector are set for
3756     each substring: the offsets to the start and end of the substring.
3757    
3758     Arguments:
3759     argument_re points to the compiled expression
3760     extra_data points to extra data or is NULL
3761     subject points to the subject string
3762     length length of subject string (may contain binary zeros)
3763     start_offset where to start in the subject string
3764     options option bits
3765     offsets points to a vector of ints to be filled in with offsets
3766     offsetcount the number of elements in the vector
3767    
3768     Returns: > 0 => success; value is one more than the highest-numbered offset pair that was set
3769     = 0 => success, but offsets is not big enough
3770     -1 => failed to match
3771     < -1 => some kind of unexpected problem
3772     */
3773    
3774 ph10 145 PCRE_EXP_DEFN int
3775 nigel 77 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3776 nigel 87 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
3777 nigel 77 int offsetcount)
3778     {
3779     int rc, resetcount, ocount;
3780     int first_byte = -1;
3781     int req_byte = -1;
3782     int req_byte2 = -1;
3783 nigel 91 int newline;
3784     unsigned long int ims;
3785 nigel 77 BOOL using_temporary_offsets = FALSE;
3786     BOOL anchored;
3787     BOOL startline;
3788     BOOL firstline;
3789     BOOL first_byte_caseless = FALSE;
3790     BOOL req_byte_caseless = FALSE;
3791 nigel 93 BOOL utf8;
3792 nigel 77 match_data match_block;
3793 nigel 91 match_data *md = &match_block;
3794 nigel 77 const uschar *tables;
3795     const uschar *start_bits = NULL;
3796 nigel 87 USPTR start_match = (USPTR)subject + start_offset;
3797     USPTR end_subject;
3798     USPTR req_byte_ptr = start_match - 1;
3799 nigel 93 eptrblock eptrchain[EPTR_WORK_SIZE];
3800 nigel 77
3801     pcre_study_data internal_study;
3802     const pcre_study_data *study;
3803    
3804     real_pcre internal_re;
3805     const real_pcre *external_re = (const real_pcre *)argument_re;
3806     const real_pcre *re = external_re;
3807    
3808     /* Plausibility checks */
3809    
3810     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3811     if (re == NULL || subject == NULL ||
3812     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3813     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3814    
3815     /* Fish out the optional data from the extra_data structure, first setting
3816     the default values. */
3817    
3818     study = NULL;
3819 nigel 91 md->match_limit = MATCH_LIMIT;
3820     md->match_limit_recursion = MATCH_LIMIT_RECURSION;
3821     md->callout_data = NULL;
3822 nigel 77
3823     /* The table pointer is always in native byte order. */
3824    
3825     tables = external_re->tables;
3826    
3827     if (extra_data != NULL)
3828     {
3829     register unsigned int flags = extra_data->flags;
3830     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3831     study = (const pcre_study_data *)extra_data->study_data;
3832     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3833 nigel 91 md->match_limit = extra_data->match_limit;
3834 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3835 nigel 91 md->match_limit_recursion = extra_data->match_limit_recursion;
3836 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3837 nigel 91 md->callout_data = extra_data->callout_data;
3838 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3839     }
3840    
3841     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3842     is a feature that makes it possible to save compiled regexes and re-use them
3843     in other programs later. */
3844    
3845     if (tables == NULL) tables = _pcre_default_tables;
3846    
3847     /* Check that the first field in the block is the magic number. If it is not,
3848     test for a regex that was compiled on a host of opposite endianness. If this is
3849     the case, flipped values are put in internal_re and internal_study if there was
3850     study data too. */
3851    
3852     if (re->magic_number != MAGIC_NUMBER)
3853     {
3854     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3855     if (re == NULL) return PCRE_ERROR_BADMAGIC;
3856     if (study != NULL) study = &internal_study;
3857     }
3858    
3859     /* Set up other data */
3860    
3861     anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3862     startline = (re->options & PCRE_STARTLINE) != 0;
3863     firstline = (re->options & PCRE_FIRSTLINE) != 0;
3864    
3865     /* The code starts after the real_pcre block and the capture name table. */
3866    
3867 nigel 91 md->start_code = (const uschar *)external_re + re->name_table_offset +
3868 nigel 77 re->name_count * re->name_entry_size;
3869    
3870 nigel 91 md->start_subject = (USPTR)subject;
3871     md->start_offset = start_offset;
3872     md->end_subject = md->start_subject + length;
3873     end_subject = md->end_subject;
3874 nigel 77
3875 nigel 91 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3876 nigel 93 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
3877 nigel 77
3878 nigel 91 md->notbol = (options & PCRE_NOTBOL) != 0;
3879     md->noteol = (options & PCRE_NOTEOL) != 0;
3880     md->notempty = (options & PCRE_NOTEMPTY) != 0;
3881     md->partial = (options & PCRE_PARTIAL) != 0;
3882     md->hitend = FALSE;
3883 nigel 77
3884 nigel 91 md->recursive = NULL; /* No recursion at top level */
3885 nigel 93 md->eptrchain = eptrchain; /* Make workspace generally available */
3886 nigel 77
3887 nigel 91 md->lcc = tables + lcc_offset;
3888     md->ctypes = tables + ctypes_offset;
3889 nigel 77
3890 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
3891     nothing is set at run time, whatever was used at compile time applies. */
3892 nigel 91
3893 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3894 nigel 93 PCRE_NEWLINE_BITS)
3895 nigel 91 {
3896 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
3897 nigel 91 case PCRE_NEWLINE_CR: newline = '\r'; break;
3898     case PCRE_NEWLINE_LF: newline = '\n'; break;
3899     case PCRE_NEWLINE_CR+
3900     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
3901 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
3902 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3903 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
3904 nigel 91 }
3905    
3906 ph10 149 if (newline == -2)
3907 nigel 91 {
3908 ph10 149 md->nltype = NLTYPE_ANYCRLF;
3909     }
3910     else if (newline < 0)
3911     {
3912 nigel 93 md->nltype = NLTYPE_ANY;
3913 nigel 91 }
3914     else
3915     {
3916 nigel 93 md->nltype = NLTYPE_FIXED;
3917     if (newline > 255)
3918     {
3919     md->nllen = 2;
3920     md->nl[0] = (newline >> 8) & 255;
3921     md->nl[1] = newline & 255;
3922     }
3923     else
3924     {
3925     md->nllen = 1;
3926     md->nl[0] = newline;
3927     }
3928 nigel 91 }
3929    
3930 nigel 77 /* Partial matching is supported only for a restricted set of regexes at the
3931     moment. */
3932    
3933 nigel 91 if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
3934 nigel 77 return PCRE_ERROR_BADPARTIAL;
3935    
3936     /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3937     back the character offset. */
3938    
3939     #ifdef SUPPORT_UTF8
3940 nigel 93 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3941 nigel 77 {
3942     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3943     return PCRE_ERROR_BADUTF8;
3944     if (start_offset > 0 && start_offset < length)
3945     {
3946     int tb = ((uschar *)subject)[start_offset];
3947     if (tb > 127)
3948     {
3949     tb &= 0xc0;
3950     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3951     }
3952     }
3953     }
3954     #endif
3955    
3956     /* The ims options can vary during the matching as a result of the presence
3957     of (?ims) items in the pattern. They are kept in a local variable so that
3958     restoring at the exit of a group is easy. */
3959    
3960     ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3961    
3962     /* If the expression has got more back references than the offsets supplied can
3963     hold, we get a temporary chunk of working store to use during the matching.
3964     Otherwise, we can use the vector supplied, rounding down its size to a multiple
3965     of 3. */
3966    
3967     ocount = offsetcount - (offsetcount % 3);
3968    
3969     if (re->top_backref > 0 && re->top_backref >= ocount/3)
3970     {
3971     ocount = re->top_backref * 3 + 3;
3972 nigel 91 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3973     if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3974 nigel 77 using_temporary_offsets = TRUE;
3975     DPRINTF(("Got memory to hold back references\n"));
3976     }
3977 nigel 91 else md->offset_vector = offsets;
3978 nigel 77
3979 nigel 91 md->offset_end = ocount;
3980     md->offset_max = (2*ocount)/3;
3981     md->offset_overflow = FALSE;
3982     md->capture_last = -1;
3983 nigel 77
3984     /* Compute the minimum number of offsets that we need to reset each time. Doing
3985     this makes a huge difference to execution time when there aren't many brackets
3986     in the pattern. */
3987    
3988     resetcount = 2 + re->top_bracket * 2;
3989     if (resetcount > offsetcount) resetcount = ocount;
3990    
3991     /* Reset the working variable associated with each extraction. These should
3992     never be used unless previously set, but they get saved and restored, and so we
3993     initialize them to avoid reading uninitialized locations. */
3994    
3995 nigel 91 if (md->offset_vector != NULL)
3996 nigel 77 {
3997 nigel 91 register int *iptr = md->offset_vector + ocount;
3998 nigel 77 register int *iend = iptr - resetcount/2 + 1;
3999     while (--iptr >= iend) *iptr = -1;
4000     }
4001    
4002     /* Set up the first character to match, if available. The first_byte value is
4003     never set for an anchored regular expression, but the anchoring may be forced
4004     at run time, so we have to test for anchoring. The first char may be unset for
4005     an unanchored pattern, of course. If there's no first char and the pattern was
4006     studied, there may be a bitmap of possible first characters. */
4007    
4008     if (!anchored)
4009     {
4010     if ((re->options & PCRE_FIRSTSET) != 0)
4011     {
4012     first_byte = re->first_byte & 255;
4013     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4014 nigel 91 first_byte = md->lcc[first_byte];
4015 nigel 77 }
4016     else
4017     if (!startline && study != NULL &&
4018     (study->options & PCRE_STUDY_MAPPED) != 0)
4019     start_bits = study->start_bits;
4020     }
4021    
4022     /* For anchored or unanchored matches, there may be a "last known required
4023     character" set. */
4024    
4025     if ((re->options & PCRE_REQCHSET) != 0)
4026     {
4027     req_byte = re->req_byte & 255;
4028     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4029     req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4030     }
4031    
4032 nigel 93
4033     /* ==========================================================================*/
4034    
4035 nigel 77 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4036     the loop runs just once. */
4037    
4038 nigel 93 for(;;)
4039 nigel 77 {
4040 nigel 87 USPTR save_end_subject = end_subject;
4041 nigel 77
4042     /* Reset the maximum number of extractions we might see. */
4043    
4044 nigel 91 if (md->offset_vector != NULL)
4045 nigel 77 {
4046 nigel 91 register int *iptr = md->offset_vector;
4047 nigel 77 register int *iend = iptr + resetcount;
4048     while (iptr < iend) *iptr++ = -1;
4049     }
4050    
4051     /* Advance to a unique first char if possible. If firstline is TRUE, the
4052     start of the match is constrained to the first line of a multiline string.
4053 nigel 93 That is, the match must be before or at the first newline. Implement this by
4054     temporarily adjusting end_subject so that we stop scanning at a newline. If
4055     the match fails at the newline, later code breaks this loop. */
4056 nigel 77
4057     if (firstline)
4058     {
4059 nigel 87 USPTR t = start_match;
4060 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4061 nigel 77 end_subject = t;
4062     }
4063    
4064     /* Now test for a unique first byte */
4065    
4066     if (first_byte >= 0)
4067     {
4068     if (first_byte_caseless)
4069     while (start_match < end_subject &&
4070 nigel 91 md->lcc[*start_match] != first_byte)
4071 nigel 77 start_match++;
4072     else
4073     while (start_match < end_subject && *start_match != first_byte)
4074     start_match++;
4075     }
4076    
4077 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
4078 nigel 77
4079     else if (startline)
4080     {
4081 nigel 93 if (start_match > md->start_subject + start_offset)
4082 nigel 77 {
4083 nigel 93 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4084 nigel 77 start_match++;
4085 ph10 134
4086 ph10 149 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4087     and we are now at a LF, advance the match position by one more character.
4088     */
4089 ph10 134
4090 ph10 130 if (start_match[-1] == '\r' &&
4091 ph10 149 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4092 ph10 130 start_match < end_subject &&
4093     *start_match == '\n')
4094     start_match++;
4095 nigel 77 }
4096     }
4097    
4098     /* Or to a non-unique first char after study */
4099    
4100     else if (start_bits != NULL)
4101     {
4102     while (start_match < end_subject)
4103     {
4104     register unsigned int c = *start_match;
4105     if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4106     }
4107     }
4108    
4109     /* Restore fudged end_subject */
4110    
4111     end_subject = save_end_subject;
4112    
4113     #ifdef DEBUG /* Sigh. Some compilers never learn. */
4114     printf(">>>> Match against: ");
4115 nigel 91 pchars(start_match, end_subject - start_match, TRUE, md);
4116 nigel 77 printf("\n");
4117     #endif
4118    
4119     /* If req_byte is set, we know that that character must appear in the subject
4120     for the match to succeed. If the first character is set, req_byte must be
4121     later in the subject; otherwise the test starts at the match point. This
4122     optimization can save a huge amount of backtracking in patterns with nested
4123     unlimited repeats that aren't going to match. Writing separate code for
4124     cased/caseless versions makes it go faster, as does using an autoincrement
4125     and backing off on a match.
4126    
4127     HOWEVER: when the subject string is very, very long, searching to its end can
4128     take a long time, and give bad performance on quite ordinary patterns. This
4129 nigel 93 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4130     string... so we don't do this when the string is sufficiently long.
4131 nigel 77
4132     ALSO: this processing is disabled when partial matching is requested.
4133     */
4134    
4135     if (req_byte >= 0 &&
4136     end_subject - start_match < REQ_BYTE_MAX &&
4137 nigel 91 !md->partial)
4138 nigel 77 {
4139 nigel 87 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4140 nigel 77
4141     /* We don't need to repeat the search if we haven't yet reached the
4142     place we found it at last time. */
4143    
4144     if (p > req_byte_ptr)
4145     {
4146     if (req_byte_caseless)
4147     {
4148     while (p < end_subject)
4149     {
4150     register int pp = *p++;
4151     if (pp == req_byte || pp == req_byte2) { p--; break; }
4152     }
4153     }
4154     else
4155     {
4156     while (p < end_subject)
4157     {
4158     if (*p++ == req_byte) { p--; break; }
4159     }
4160     }
4161    
4162 nigel 93 /* If we can't find the required character, break the matching loop,
4163     forcing a match failure. */
4164 nigel 77
4165 nigel 93 if (p >= end_subject)
4166     {
4167     rc = MATCH_NOMATCH;
4168     break;
4169     }
4170 nigel 77
4171     /* If we have found the required character, save the point where we
4172     found it, so that we don't search again next time round the loop if
4173     the start hasn't passed this character yet. */
4174    
4175     req_byte_ptr = p;
4176     }
4177     }
4178    
4179 nigel 93 /* OK, we can now run the match. */
4180 nigel 77
4181 ph10 168 md->start_match_ptr = start_match; /* Insurance */
4182 nigel 91 md->match_call_count = 0;
4183 nigel 93 md->eptrn = 0; /* Next free eptrchain slot */
4184 ph10 172 rc = match(start_match, md->start_code, start_match, 2, md,
4185 ph10 168 ims, NULL, 0, 0);
4186 nigel 77
4187 nigel 93 /* Any return other than MATCH_NOMATCH breaks the loop. */
4188 nigel 77
4189 nigel 93 if (rc != MATCH_NOMATCH) break;