/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 623 - (hide annotations) (download)
Tue Jul 19 09:58:42 2011 UTC (3 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 193457 byte(s)
Fix \X* bug when first character has the mark property. Also improve code for 
property and script handling.  

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 ph10 604 /* Values for setting in md->match_function_type to indicate two special types
61     of call to match(). We do it this way to save on using another stack variable,
62     as stack usage is to be discouraged. */
63 nigel 77
64 ph10 604 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65     #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
74 ph10 210 negative to avoid the external error codes. */
75    
76 ph10 511 #define MATCH_ACCEPT (-999)
77     #define MATCH_COMMIT (-998)
78 ph10 604 #define MATCH_KETRPOS (-997)
79 ph10 618 #define MATCH_ONCE (-996)
80     #define MATCH_PRUNE (-995)
81     #define MATCH_SKIP (-994)
82     #define MATCH_SKIP_ARG (-993)
83     #define MATCH_THEN (-992)
84 ph10 210
85 ph10 510 /* This is a convenience macro for code that occurs many times. */
86    
87     #define MRRETURN(ra) \
88     { \
89     md->mark = markptr; \
90     RRETURN(ra); \
91     }
92    
93 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
94     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95     because the offset vector is always a multiple of 3 long. */
96    
97     #define REC_STACK_SAVE_MAX 30
98    
99     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100    
101     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103    
104    
105    
106 ph10 475 #ifdef PCRE_DEBUG
107 nigel 77 /*************************************************
108     * Debugging function to print chars *
109     *************************************************/
110    
111     /* Print a sequence of chars in printable format, stopping at the end of the
112     subject if requested.
113    
114     Arguments:
115     p points to characters
116     length number to print
117     is_subject TRUE if printing from within md->start_subject
118     md pointer to matching data block, if is_subject is TRUE
119    
120     Returns: nothing
121     */
122    
123     static void
124     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125     {
126 nigel 93 unsigned int c;
127 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128     while (length-- > 0)
129     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130     }
131     #endif
132    
133    
134    
135     /*************************************************
136     * Match a back-reference *
137     *************************************************/
138    
139 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
140     negative, so the match always fails. However, in JavaScript compatibility mode,
141     the length passed is zero. Note that in caseless UTF-8 mode, the number of
142     subject bytes matched may be different to the number of reference bytes.
143 nigel 77
144     Arguments:
145     offset index into the offset vector
146 ph10 595 eptr pointer into the subject
147     length length of reference to be matched (number of bytes)
148 nigel 77 md points to match data block
149 ph10 602 caseless TRUE if caseless
150 nigel 77
151 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 nigel 77 */
153    
154 ph10 595 static int
155 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 ph10 602 BOOL caseless)
157 nigel 77 {
158 ph10 595 USPTR eptr_start = eptr;
159     register USPTR p = md->start_subject + md->offset_vector[offset];
160 nigel 77
161 ph10 475 #ifdef PCRE_DEBUG
162 nigel 77 if (eptr >= md->end_subject)
163     printf("matching subject <null>");
164     else
165     {
166     printf("matching subject ");
167     pchars(eptr, length, TRUE, md);
168     }
169     printf(" against backref ");
170     pchars(p, length, FALSE, md);
171     printf("\n");
172     #endif
173    
174 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
175 nigel 77
176 ph10 595 if (length < 0) return -1;
177 nigel 77
178 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179     properly if Unicode properties are supported. Otherwise, we can check only
180     ASCII characters. */
181 nigel 77
182 ph10 602 if (caseless)
183 nigel 77 {
184 ph10 354 #ifdef SUPPORT_UTF8
185     #ifdef SUPPORT_UCP
186     if (md->utf8)
187     {
188 ph10 595 /* Match characters up to the end of the reference. NOTE: the number of
189     bytes matched may differ, because there are some characters whose upper and
190     lower case versions code as different numbers of bytes. For example, U+023A
191     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193     the latter. It is important, therefore, to check the length along the
194     reference, not along the subject (earlier code did this wrong). */
195    
196     USPTR endptr = p + length;
197     while (p < endptr)
198 ph10 354 {
199 ph10 358 int c, d;
200 ph10 597 if (eptr >= md->end_subject) return -1;
201 ph10 354 GETCHARINC(c, eptr);
202     GETCHARINC(d, p);
203 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 ph10 358 }
205     }
206 ph10 354 else
207     #endif
208     #endif
209    
210     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211     is no UCP support. */
212 ph10 597 {
213     if (eptr + length > md->end_subject) return -1;
214     while (length-- > 0)
215     { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216     }
217 nigel 77 }
218 ph10 358
219 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
220     are in UTF-8 mode. */
221 ph10 358
222 nigel 77 else
223 ph10 597 {
224     if (eptr + length > md->end_subject) return -1;
225     while (length-- > 0) if (*p++ != *eptr++) return -1;
226     }
227 nigel 77
228 ph10 595 return eptr - eptr_start;
229 nigel 77 }
230    
231    
232    
233     /***************************************************************************
234     ****************************************************************************
235     RECURSION IN THE match() FUNCTION
236    
237 nigel 87 The match() function is highly recursive, though not every recursive call
238     increases the recursive depth. Nevertheless, some regular expressions can cause
239     it to recurse to a great depth. I was writing for Unix, so I just let it call
240     itself recursively. This uses the stack for saving everything that has to be
241     saved for a recursive call. On Unix, the stack can be large, and this works
242     fine.
243 nigel 77
244 nigel 87 It turns out that on some non-Unix-like systems there are problems with
245     programs that use a lot of stack. (This despite the fact that every last chip
246     has oodles of memory these days, and techniques for extending the stack have
247     been known for decades.) So....
248 nigel 77
249     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250     calls by keeping local variables that need to be preserved in blocks of memory
251 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
252 nigel 77 achieve this so that the actual code doesn't look very different to what it
253     always used to.
254 ph10 164
255 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
256 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
257     Switzer, the use of longjmp() has been abolished, at the cost of having to
258     provide a unique number for each call to RMATCH. There is no way of generating
259     a sequence of numbers at compile time in C. I have given them names, to make
260     them stand out more clearly.
261    
262     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
265     don't have indeterminate values; this has meant that the frame size can be
266 ph10 164 reduced because the result can be "passed back" by straight setting of the
267     variable instead of being passed in the frame.
268 nigel 77 ****************************************************************************
269     ***************************************************************************/
270    
271 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272     below must be updated in sync. */
273 nigel 77
274 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
275     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
276     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
277     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
278 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
279 ph10 527 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
280 ph10 623 RM61, RM62, RM63 };
281 ph10 164
282 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
283 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 ph10 501 actually used in this definition. */
285 nigel 77
286     #ifndef NO_RECURSE
287     #define REGISTER register
288 ph10 164
289 ph10 475 #ifdef PCRE_DEBUG
290 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 nigel 87 { \
292     printf("match() called in line %d\n", __LINE__); \
293 ph10 604 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 nigel 87 printf("to line %d\n", __LINE__); \
295     }
296     #define RRETURN(ra) \
297     { \
298     printf("match() returned %d from line %d ", ra, __LINE__); \
299     return ra; \
300     }
301     #else
302 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
303     rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 nigel 77 #define RRETURN(ra) return ra
305 nigel 87 #endif
306    
307 nigel 77 #else
308    
309    
310 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
311     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312     argument of match(), which never changes. */
313 nigel 77
314     #define REGISTER
315    
316 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 nigel 77 {\
318 ph10 563 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 ph10 534 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 ph10 164 frame->Xwhere = rw; \
321     newframe->Xeptr = ra;\
322     newframe->Xecode = rb;\
323 ph10 168 newframe->Xmstart = mstart;\
324 ph10 501 newframe->Xmarkptr = markptr;\
325 ph10 164 newframe->Xoffset_top = rc;\
326 ph10 602 newframe->Xeptrb = re;\
327 ph10 164 newframe->Xrdepth = frame->Xrdepth + 1;\
328     newframe->Xprevframe = frame;\
329     frame = newframe;\
330     DPRINTF(("restarting from line %d\n", __LINE__));\
331     goto HEAP_RECURSE;\
332     L_##rw:\
333     DPRINTF(("jumped back to line %d\n", __LINE__));\
334 nigel 77 }
335    
336     #define RRETURN(ra)\
337     {\
338 ph10 527 heapframe *oldframe = frame;\
339     frame = oldframe->Xprevframe;\
340     (pcre_stack_free)(oldframe);\
341 nigel 77 if (frame != NULL)\
342     {\
343 ph10 164 rrc = ra;\
344     goto HEAP_RETURN;\
345 nigel 77 }\
346     return ra;\
347     }
348    
349    
350     /* Structure for remembering the local variables in a private frame */
351    
352     typedef struct heapframe {
353     struct heapframe *Xprevframe;
354    
355     /* Function arguments that may change */
356    
357 ph10 409 USPTR Xeptr;
358 nigel 77 const uschar *Xecode;
359 ph10 409 USPTR Xmstart;
360 ph10 501 USPTR Xmarkptr;
361 nigel 77 int Xoffset_top;
362     eptrblock *Xeptrb;
363 nigel 91 unsigned int Xrdepth;
364 nigel 77
365     /* Function local variables */
366    
367 ph10 409 USPTR Xcallpat;
368 ph10 406 #ifdef SUPPORT_UTF8
369 ph10 409 USPTR Xcharptr;
370 ph10 406 #endif
371 ph10 409 USPTR Xdata;
372     USPTR Xnext;
373     USPTR Xpp;
374     USPTR Xprev;
375     USPTR Xsaved_eptr;
376 nigel 77
377     recursion_info Xnew_recursive;
378    
379     BOOL Xcur_is_word;
380     BOOL Xcondition;
381     BOOL Xprev_is_word;
382    
383     #ifdef SUPPORT_UCP
384     int Xprop_type;
385 nigel 87 int Xprop_value;
386 nigel 77 int Xprop_fail_result;
387 ph10 123 int Xoclength;
388     uschar Xocchars[8];
389 nigel 77 #endif
390    
391 ph10 403 int Xcodelink;
392 nigel 77 int Xctype;
393 nigel 93 unsigned int Xfc;
394 nigel 77 int Xfi;
395     int Xlength;
396     int Xmax;
397     int Xmin;
398     int Xnumber;
399     int Xoffset;
400     int Xop;
401     int Xsave_capture_last;
402     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403     int Xstacksave[REC_STACK_SAVE_MAX];
404    
405     eptrblock Xnewptrb;
406    
407 ph10 164 /* Where to jump back to */
408 nigel 77
409 ph10 164 int Xwhere;
410 ph10 165
411 nigel 77 } heapframe;
412    
413     #endif
414    
415    
416     /***************************************************************************
417     ***************************************************************************/
418    
419    
420    
421     /*************************************************
422     * Match from current position *
423     *************************************************/
424    
425 nigel 93 /* This function is called recursively in many circumstances. Whenever it
426 nigel 77 returns a negative (error) response, the outer incarnation must also return the
427 ph10 426 same response. */
428 nigel 77
429 ph10 426 /* These macros pack up tests that are used for partial matching, and which
430     appear several times in the code. We set the "hit end" flag if the pointer is
431     at the end of the subject and also past the start of the subject (i.e.
432 ph10 427 something has been matched). For hard partial matching, we then return
433     immediately. The second one is used when we already know we are past the end of
434     the subject. */
435 ph10 426
436     #define CHECK_PARTIAL()\
437 ph10 553 if (md->partial != 0 && eptr >= md->end_subject && \
438     eptr > md->start_used_ptr) \
439     { \
440     md->hitend = TRUE; \
441     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 ph10 427 }
443 ph10 426
444     #define SCHECK_PARTIAL()\
445 ph10 553 if (md->partial != 0 && eptr > md->start_used_ptr) \
446     { \
447     md->hitend = TRUE; \
448     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 ph10 427 }
450 ph10 426
451 ph10 427
452 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
453     the md structure (e.g. utf8, end_subject) into individual variables to improve
454 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455     made performance worse.
456    
457     Arguments:
458 nigel 93 eptr pointer to current character in subject
459     ecode pointer to current position in compiled code
460 ph10 168 mstart pointer to the current match start position (can be modified
461 ph10 172 by encountering \K)
462 ph10 501 markptr pointer to the most recent MARK name, or NULL
463 nigel 77 offset_top current top pointer
464     md pointer to "static" info for the match
465     eptrb pointer to chain of blocks containing eptr at start of
466     brackets - for testing for empty matches
467 nigel 87 rdepth the recursion depth
468 nigel 77
469     Returns: MATCH_MATCH if matched ) these values are >= 0
470     MATCH_NOMATCH if failed to match )
471 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 nigel 87 (e.g. stopped by repeated call or recursion limit)
474 nigel 77 */
475    
476     static int
477 ph10 510 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 ph10 602 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 ph10 604 unsigned int rdepth)
480 nigel 77 {
481     /* These variables do not need to be preserved over recursion in this function,
482 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
483     "register" because they are used a lot in loops. */
484 nigel 77
485 nigel 91 register int rrc; /* Returns from recursive calls */
486     register int i; /* Used for loops not involving calls to RMATCH() */
487 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489 nigel 77
490 nigel 93 BOOL minimize, possessive; /* Quantifier options */
491 ph10 602 BOOL caseless;
492 ph10 403 int condcode;
493 nigel 93
494 nigel 77 /* When recursion is not being used, all "local" variables that have to be
495     preserved over calls to RMATCH() are part of a "frame" which is obtained from
496     heap storage. Set up the top-level frame here; others are obtained from the
497     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498    
499     #ifdef NO_RECURSE
500 ph10 563 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
503    
504     /* Copy in the original argument variables */
505    
506     frame->Xeptr = eptr;
507     frame->Xecode = ecode;
508 ph10 168 frame->Xmstart = mstart;
509 ph10 501 frame->Xmarkptr = markptr;
510 nigel 77 frame->Xoffset_top = offset_top;
511     frame->Xeptrb = eptrb;
512 nigel 87 frame->Xrdepth = rdepth;
513 nigel 77
514     /* This is where control jumps back to to effect "recursion" */
515    
516     HEAP_RECURSE:
517    
518     /* Macros make the argument variables come from the current frame */
519    
520     #define eptr frame->Xeptr
521     #define ecode frame->Xecode
522 ph10 168 #define mstart frame->Xmstart
523 ph10 501 #define markptr frame->Xmarkptr
524 nigel 77 #define offset_top frame->Xoffset_top
525     #define eptrb frame->Xeptrb
526 nigel 87 #define rdepth frame->Xrdepth
527 nigel 77
528     /* Ditto for the local variables */
529    
530     #ifdef SUPPORT_UTF8
531     #define charptr frame->Xcharptr
532     #endif
533     #define callpat frame->Xcallpat
534 ph10 403 #define codelink frame->Xcodelink
535 nigel 77 #define data frame->Xdata
536     #define next frame->Xnext
537     #define pp frame->Xpp
538     #define prev frame->Xprev
539     #define saved_eptr frame->Xsaved_eptr
540    
541     #define new_recursive frame->Xnew_recursive
542    
543     #define cur_is_word frame->Xcur_is_word
544     #define condition frame->Xcondition
545     #define prev_is_word frame->Xprev_is_word
546    
547     #ifdef SUPPORT_UCP
548     #define prop_type frame->Xprop_type
549 nigel 87 #define prop_value frame->Xprop_value
550 nigel 77 #define prop_fail_result frame->Xprop_fail_result
551 ph10 115 #define oclength frame->Xoclength
552     #define occhars frame->Xocchars
553 nigel 77 #endif
554    
555     #define ctype frame->Xctype
556     #define fc frame->Xfc
557     #define fi frame->Xfi
558     #define length frame->Xlength
559     #define max frame->Xmax
560     #define min frame->Xmin
561     #define number frame->Xnumber
562     #define offset frame->Xoffset
563     #define op frame->Xop
564     #define save_capture_last frame->Xsave_capture_last
565     #define save_offset1 frame->Xsave_offset1
566     #define save_offset2 frame->Xsave_offset2
567     #define save_offset3 frame->Xsave_offset3
568     #define stacksave frame->Xstacksave
569    
570     #define newptrb frame->Xnewptrb
571    
572     /* When recursion is being used, local variables are allocated on the stack and
573     get preserved during recursion in the normal way. In this environment, fi and
574     i, and fc and c, can be the same variables. */
575    
576 nigel 93 #else /* NO_RECURSE not defined */
577 nigel 77 #define fi i
578     #define fc c
579    
580 ph10 604 /* Many of the following variables are used only in small blocks of the code.
581     My normal style of coding would have declared them within each of those blocks.
582     However, in order to accommodate the version of this code that uses an external
583     "stack" implemented on the heap, it is easier to declare them all here, so the
584     declarations can be cut out in a block. The only declarations within blocks
585     below are for variables that do not have to be preserved over a recursive call
586     to RMATCH(). */
587 nigel 77
588 ph10 604 #ifdef SUPPORT_UTF8
589     const uschar *charptr;
590     #endif
591     const uschar *callpat;
592     const uschar *data;
593     const uschar *next;
594     USPTR pp;
595     const uschar *prev;
596     USPTR saved_eptr;
597    
598     recursion_info new_recursive;
599    
600     BOOL cur_is_word;
601 nigel 87 BOOL condition;
602 nigel 77 BOOL prev_is_word;
603    
604     #ifdef SUPPORT_UCP
605     int prop_type;
606 nigel 87 int prop_value;
607 nigel 77 int prop_fail_result;
608 ph10 115 int oclength;
609     uschar occhars[8];
610 nigel 77 #endif
611    
612 ph10 399 int codelink;
613 nigel 77 int ctype;
614     int length;
615     int max;
616     int min;
617     int number;
618     int offset;
619     int op;
620     int save_capture_last;
621     int save_offset1, save_offset2, save_offset3;
622     int stacksave[REC_STACK_SAVE_MAX];
623    
624     eptrblock newptrb;
625 nigel 93 #endif /* NO_RECURSE */
626 nigel 77
627 ph10 604 /* To save space on the stack and in the heap frame, I have doubled up on some
628     of the local variables that are used only in localised parts of the code, but
629     still need to be preserved over recursive calls of match(). These macros define
630     the alternative names that are used. */
631    
632     #define allow_zero cur_is_word
633     #define cbegroup condition
634     #define code_offset codelink
635     #define condassert condition
636     #define matched_once prev_is_word
637    
638 nigel 77 /* These statements are here to stop the compiler complaining about uninitialized
639     variables. */
640    
641     #ifdef SUPPORT_UCP
642 nigel 87 prop_value = 0;
643 nigel 77 prop_fail_result = 0;
644     #endif
645    
646 nigel 93
647 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
648     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649     used. Thanks to Ian Taylor for noticing this possibility and sending the
650     original patch. */
651    
652     TAIL_RECURSE:
653    
654 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
655     are specified by the macro RMATCH and RRETURN is used to return. When
656     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
659     complicated macro. It has to be used in one particular way. This shouldn't,
660     however, impact performance when true recursion is being used. */
661 nigel 77
662 ph10 164 #ifdef SUPPORT_UTF8
663     utf8 = md->utf8; /* Local copy of the flag */
664     #else
665     utf8 = FALSE;
666     #endif
667    
668 nigel 87 /* First check that we haven't called match() too many times, or that we
669     haven't exceeded the recursive call limit. */
670    
671 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673 nigel 77
674 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
675 ph10 604 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676     done this way to save having to use another function argument, which would take
677     up space on the stack. See also MATCH_CONDASSERT below.
678 nigel 77
679 ph10 604 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680     such remembered pointers, to be checked when we hit the closing ket, in order
681     to break infinite loops that match no characters. When match() is called in
682     other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683     NOT be used with tail recursion, because the memory block that is used is on
684     the stack, so a new one may be required for each match(). */
685    
686     if (md->match_function_type == MATCH_CBEGROUP)
687 nigel 77 {
688 ph10 197 newptrb.epb_saved_eptr = eptr;
689     newptrb.epb_prev = eptrb;
690     eptrb = &newptrb;
691 ph10 604 md->match_function_type = 0;
692 nigel 77 }
693    
694 nigel 93 /* Now start processing the opcodes. */
695 nigel 77
696     for (;;)
697     {
698 nigel 93 minimize = possessive = FALSE;
699 nigel 77 op = *ecode;
700 ph10 604
701 nigel 93 switch(op)
702     {
703 ph10 510 case OP_MARK:
704     markptr = ecode + 2;
705     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 ph10 604 eptrb, RM55);
707 ph10 512
708     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709     argument, and we must check whether that argument matches this MARK's
710     argument. It is passed back in md->start_match_ptr (an overloading of that
711     variable). If it does match, we reset that variable to the current subject
712     position and return MATCH_SKIP. Otherwise, pass back the return code
713 ph10 510 unaltered. */
714 ph10 512
715     if (rrc == MATCH_SKIP_ARG &&
716 ph10 510 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717     {
718     md->start_match_ptr = eptr;
719     RRETURN(MATCH_SKIP);
720     }
721    
722 ph10 512 if (md->mark == NULL) md->mark = markptr;
723 ph10 510 RRETURN(rrc);
724    
725 ph10 210 case OP_FAIL:
726 ph10 510 MRRETURN(MATCH_NOMATCH);
727 ph10 211
728 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
729 ph10 553
730 ph10 510 case OP_COMMIT:
731     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 ph10 604 eptrb, RM52);
733 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735     rrc != MATCH_THEN)
736 ph10 551 RRETURN(rrc);
737 ph10 510 MRRETURN(MATCH_COMMIT);
738    
739 ph10 551 /* PRUNE overrides THEN */
740 ph10 553
741 ph10 210 case OP_PRUNE:
742     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 ph10 604 eptrb, RM51);
744 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 ph10 510 MRRETURN(MATCH_PRUNE);
746 ph10 211
747 ph10 510 case OP_PRUNE_ARG:
748     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 ph10 604 eptrb, RM56);
750 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 ph10 510 md->mark = ecode + 2;
752     RRETURN(MATCH_PRUNE);
753 ph10 211
754 ph10 551 /* SKIP overrides PRUNE and THEN */
755 ph10 553
756 ph10 210 case OP_SKIP:
757     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 ph10 604 eptrb, RM53);
759 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 ph10 551 RRETURN(rrc);
761 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
762 ph10 510 MRRETURN(MATCH_SKIP);
763 ph10 211
764 ph10 510 case OP_SKIP_ARG:
765     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 ph10 604 eptrb, RM57);
767 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 ph10 551 RRETURN(rrc);
769 ph10 512
770     /* Pass back the current skip name by overloading md->start_match_ptr and
771     returning the special MATCH_SKIP_ARG return code. This will either be
772     caught by a matching MARK, or get to the top, where it is treated the same
773 ph10 510 as PRUNE. */
774 ph10 512
775 ph10 510 md->start_match_ptr = ecode + 2;
776 ph10 512 RRETURN(MATCH_SKIP_ARG);
777 ph10 553
778 ph10 550 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
779 ph10 553 the alt that is at the start of the current branch. This makes it possible
780     to skip back past alternatives that precede the THEN within the current
781     branch. */
782 ph10 512
783 ph10 210 case OP_THEN:
784     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
785 ph10 604 eptrb, RM54);
786 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
787 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
788 ph10 510 MRRETURN(MATCH_THEN);
789    
790     case OP_THEN_ARG:
791 ph10 553 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
792 ph10 604 offset_top, md, eptrb, RM58);
793 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
795     md->mark = ecode + LINK_SIZE + 2;
796 ph10 212 RRETURN(MATCH_THEN);
797 ph10 211
798 ph10 604 /* Handle a capturing bracket, other than those that are possessive with an
799     unlimited repeat. If there is space in the offset vector, save the current
800     subject position in the working slot at the top of the vector. We mustn't
801     change the current values of the data slot, because they may be set from a
802     previous iteration of this group, and be referred to by a reference inside
803 ph10 617 the group. A failure to match might occur after the group has succeeded,
804     if something later on doesn't match. For this reason, we need to restore
805     the working value and also the values of the final offsets, in case they
806     were set by a previous iteration of the same bracket.
807 nigel 77
808 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
809     a non-capturing bracket. Don't worry about setting the flag for the error
810     case here; that is handled in the code for KET. */
811 nigel 77
812 nigel 93 case OP_CBRA:
813     case OP_SCBRA:
814     number = GET2(ecode, 1+LINK_SIZE);
815 nigel 77 offset = number << 1;
816 ph10 604
817 ph10 475 #ifdef PCRE_DEBUG
818 nigel 93 printf("start bracket %d\n", number);
819     printf("subject=");
820 nigel 77 pchars(eptr, 16, TRUE, md);
821     printf("\n");
822     #endif
823    
824     if (offset < md->offset_max)
825     {
826     save_offset1 = md->offset_vector[offset];
827     save_offset2 = md->offset_vector[offset+1];
828     save_offset3 = md->offset_vector[md->offset_end - number];
829     save_capture_last = md->capture_last;
830    
831     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
832 ph10 531 md->offset_vector[md->offset_end - number] =
833 ph10 530 (int)(eptr - md->start_subject);
834 nigel 77
835 ph10 604 for (;;)
836 nigel 77 {
837 ph10 604 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
838     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
839     eptrb, RM1);
840 ph10 618 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
841 ph10 550 if (rrc != MATCH_NOMATCH &&
842     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
843     RRETURN(rrc);
844 nigel 77 md->capture_last = save_capture_last;
845     ecode += GET(ecode, 1);
846 ph10 604 if (*ecode != OP_ALT) break;
847 nigel 77 }
848    
849     DPRINTF(("bracket %d failed\n", number));
850     md->offset_vector[offset] = save_offset1;
851     md->offset_vector[offset+1] = save_offset2;
852     md->offset_vector[md->offset_end - number] = save_offset3;
853 ph10 618
854     /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or
855     MATCH_THEN. */
856 nigel 77
857 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
858 ph10 618 RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));
859 nigel 77 }
860    
861 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
862     as a non-capturing bracket. */
863 nigel 77
864 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
865     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
866    
867 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
868 nigel 77
869 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
871    
872 ph10 618 /* Non-capturing or atomic group, except for possessive with unlimited
873     repeat. Loop for all the alternatives. When we get to the final alternative
874     within the brackets, we used to return the result of a recursive call to
875     match() whatever happened so it was possible to reduce stack usage by
876     turning this into a tail recursion, except in the case of a possibly empty
877     group. However, now that there is the possibility of (*THEN) occurring in
878     the final alternative, this optimization is no longer possible.
879    
880     MATCH_ONCE is returned when the end of an atomic group is successfully
881     reached, but subsequent matching fails. It passes back up the tree (causing
882     captured values to be reset) until the original atomic group level is
883     reached. This is tested by comparing md->once_target with the start of the
884     group. At this point, the return is converted into MATCH_NOMATCH so that
885     previous backup points can be taken. */
886 nigel 77
887 ph10 618 case OP_ONCE:
888 nigel 93 case OP_BRA:
889     case OP_SBRA:
890     DPRINTF(("start non-capturing bracket\n"));
891 ph10 618
892 nigel 91 for (;;)
893 nigel 77 {
894 ph10 618 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
895 ph10 602 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
896 ph10 604 RM2);
897 ph10 550 if (rrc != MATCH_NOMATCH &&
898     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
899 ph10 618 {
900     if (rrc == MATCH_ONCE)
901     {
902     const uschar *scode = ecode;
903     if (*scode != OP_ONCE) /* If not at start, find it */
904     {
905     while (*scode == OP_ALT) scode += GET(scode, 1);
906     scode -= GET(scode, 1);
907     }
908     if (md->once_target == scode) rrc = MATCH_NOMATCH;
909     }
910 ph10 550 RRETURN(rrc);
911 ph10 618 }
912 nigel 77 ecode += GET(ecode, 1);
913 ph10 609 if (*ecode != OP_ALT) break;
914 nigel 77 }
915 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
916     RRETURN(MATCH_NOMATCH);
917    
918 ph10 604 /* Handle possessive capturing brackets with an unlimited repeat. We come
919     here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
920     handled similarly to the normal case above. However, the matching is
921     different. The end of these brackets will always be OP_KETRPOS, which
922     returns MATCH_KETRPOS without going further in the pattern. By this means
923     we can handle the group by iteration rather than recursion, thereby
924     reducing the amount of stack needed. */
925    
926     case OP_CBRAPOS:
927     case OP_SCBRAPOS:
928     allow_zero = FALSE;
929    
930     POSSESSIVE_CAPTURE:
931     number = GET2(ecode, 1+LINK_SIZE);
932     offset = number << 1;
933    
934     #ifdef PCRE_DEBUG
935     printf("start possessive bracket %d\n", number);
936     printf("subject=");
937     pchars(eptr, 16, TRUE, md);
938     printf("\n");
939     #endif
940    
941     if (offset < md->offset_max)
942     {
943     matched_once = FALSE;
944     code_offset = ecode - md->start_code;
945    
946     save_offset1 = md->offset_vector[offset];
947     save_offset2 = md->offset_vector[offset+1];
948     save_offset3 = md->offset_vector[md->offset_end - number];
949     save_capture_last = md->capture_last;
950    
951     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
952    
953     /* Each time round the loop, save the current subject position for use
954     when the group matches. For MATCH_MATCH, the group has matched, so we
955     restart it with a new subject starting position, remembering that we had
956     at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
957     usual. If we haven't matched any alternatives in any iteration, check to
958     see if a previous iteration matched. If so, the group has matched;
959     continue from afterwards. Otherwise it has failed; restore the previous
960     capture values before returning NOMATCH. */
961    
962     for (;;)
963     {
964     md->offset_vector[md->offset_end - number] =
965     (int)(eptr - md->start_subject);
966     if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
967     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
968     eptrb, RM63);
969     if (rrc == MATCH_KETRPOS)
970     {
971     offset_top = md->end_offset_top;
972     eptr = md->end_match_ptr;
973     ecode = md->start_code + code_offset;
974     save_capture_last = md->capture_last;
975     matched_once = TRUE;
976     continue;
977     }
978     if (rrc != MATCH_NOMATCH &&
979     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
980     RRETURN(rrc);
981     md->capture_last = save_capture_last;
982     ecode += GET(ecode, 1);
983     if (*ecode != OP_ALT) break;
984     }
985 ph10 610
986 ph10 604 if (!matched_once)
987     {
988     md->offset_vector[offset] = save_offset1;
989     md->offset_vector[offset+1] = save_offset2;
990     md->offset_vector[md->offset_end - number] = save_offset3;
991     }
992    
993 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
994 ph10 604 if (allow_zero || matched_once)
995     {
996     ecode += 1 + LINK_SIZE;
997     break;
998     }
999    
1000     RRETURN(MATCH_NOMATCH);
1001     }
1002    
1003     /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1004     as a non-capturing bracket. */
1005    
1006     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1007     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1008    
1009     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1010    
1011     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1012     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1013    
1014     /* Non-capturing possessive bracket with unlimited repeat. We come here
1015     from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1016     without the capturing complication. It is written out separately for speed
1017     and cleanliness. */
1018    
1019     case OP_BRAPOS:
1020     case OP_SBRAPOS:
1021     allow_zero = FALSE;
1022    
1023     POSSESSIVE_NON_CAPTURE:
1024     matched_once = FALSE;
1025     code_offset = ecode - md->start_code;
1026    
1027     for (;;)
1028     {
1029     if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1030     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1031 ph10 609 eptrb, RM48);
1032 ph10 604 if (rrc == MATCH_KETRPOS)
1033     {
1034 ph10 610 offset_top = md->end_offset_top;
1035 ph10 604 eptr = md->end_match_ptr;
1036     ecode = md->start_code + code_offset;
1037     matched_once = TRUE;
1038     continue;
1039     }
1040     if (rrc != MATCH_NOMATCH &&
1041     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1042     RRETURN(rrc);
1043     ecode += GET(ecode, 1);
1044     if (*ecode != OP_ALT) break;
1045     }
1046 ph10 610
1047 ph10 604 if (matched_once || allow_zero)
1048     {
1049     ecode += 1 + LINK_SIZE;
1050     break;
1051     }
1052     RRETURN(MATCH_NOMATCH);
1053    
1054     /* Control never reaches here. */
1055    
1056 nigel 77 /* Conditional group: compilation checked that there are no more than
1057     two branches. If the condition is false, skipping the first branch takes us
1058     past the end if there is only one branch, but that's OK because that is
1059 ph10 609 exactly what going to the ket would do. */
1060 nigel 77
1061     case OP_COND:
1062 nigel 93 case OP_SCOND:
1063 ph10 604 codelink = GET(ecode, 1);
1064 ph10 406
1065 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
1066     inserted between OP_COND and an assertion condition. */
1067 ph10 392
1068 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1069     {
1070     if (pcre_callout != NULL)
1071     {
1072     pcre_callout_block cb;
1073     cb.version = 1; /* Version 1 of the callout block */
1074     cb.callout_number = ecode[LINK_SIZE+2];
1075     cb.offset_vector = md->offset_vector;
1076     cb.subject = (PCRE_SPTR)md->start_subject;
1077 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1078     cb.start_match = (int)(mstart - md->start_subject);
1079     cb.current_position = (int)(eptr - md->start_subject);
1080 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1081     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1082     cb.capture_top = offset_top/2;
1083     cb.capture_last = md->capture_last;
1084     cb.callout_data = md->callout_data;
1085 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1086 ph10 381 if (rrc < 0) RRETURN(rrc);
1087     }
1088     ecode += _pcre_OP_lengths[OP_CALLOUT];
1089     }
1090 ph10 392
1091 ph10 399 condcode = ecode[LINK_SIZE+1];
1092 ph10 406
1093 ph10 381 /* Now see what the actual condition is */
1094 ph10 392
1095 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1096 nigel 77 {
1097 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
1098     {
1099 ph10 461 condition = FALSE;
1100     ecode += GET(ecode, 1);
1101     }
1102 ph10 459 else
1103 ph10 461 {
1104 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1105     condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1106 ph10 461
1107 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
1108     false, but the test was set up by name, scan the table to see if the
1109     name refers to any other numbers, and test them. The condition is true
1110     if any one is set. */
1111 ph10 461
1112 ph10 459 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1113     {
1114     uschar *slotA = md->name_table;
1115     for (i = 0; i < md->name_count; i++)
1116 ph10 461 {
1117     if (GET2(slotA, 0) == recno) break;
1118 ph10 459 slotA += md->name_entry_size;
1119     }
1120 ph10 461
1121 ph10 459 /* Found a name for the number - there can be only one; duplicate
1122     names for different numbers are allowed, but not vice versa. First
1123     scan down for duplicates. */
1124 ph10 461
1125 ph10 459 if (i < md->name_count)
1126 ph10 461 {
1127 ph10 459 uschar *slotB = slotA;
1128     while (slotB > md->name_table)
1129     {
1130     slotB -= md->name_entry_size;
1131     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1132     {
1133     condition = GET2(slotB, 0) == md->recursive->group_num;
1134 ph10 461 if (condition) break;
1135     }
1136 ph10 459 else break;
1137 ph10 461 }
1138    
1139 ph10 459 /* Scan up for duplicates */
1140 ph10 461
1141 ph10 459 if (!condition)
1142 ph10 461 {
1143 ph10 459 slotB = slotA;
1144     for (i++; i < md->name_count; i++)
1145     {
1146     slotB += md->name_entry_size;
1147     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1148     {
1149     condition = GET2(slotB, 0) == md->recursive->group_num;
1150     if (condition) break;
1151 ph10 461 }
1152 ph10 459 else break;
1153 ph10 461 }
1154     }
1155 ph10 459 }
1156 ph10 461 }
1157    
1158 ph10 459 /* Choose branch according to the condition */
1159 ph10 461
1160 ph10 459 ecode += condition? 3 : GET(ecode, 1);
1161     }
1162 ph10 461 }
1163 nigel 93
1164 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1165 nigel 93 {
1166 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1167 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1168 ph10 461
1169 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1170 ph10 461 scan the table to see if the name refers to any other numbers, and test
1171     them. The condition is true if any one is set. This is tediously similar
1172     to the code above, but not close enough to try to amalgamate. */
1173    
1174 ph10 459 if (!condition && condcode == OP_NCREF)
1175     {
1176 ph10 461 int refno = offset >> 1;
1177 ph10 459 uschar *slotA = md->name_table;
1178 ph10 461
1179 ph10 459 for (i = 0; i < md->name_count; i++)
1180 ph10 461 {
1181     if (GET2(slotA, 0) == refno) break;
1182 ph10 459 slotA += md->name_entry_size;
1183     }
1184 ph10 461
1185     /* Found a name for the number - there can be only one; duplicate names
1186     for different numbers are allowed, but not vice versa. First scan down
1187 ph10 459 for duplicates. */
1188 ph10 461
1189 ph10 459 if (i < md->name_count)
1190 ph10 461 {
1191 ph10 459 uschar *slotB = slotA;
1192     while (slotB > md->name_table)
1193     {
1194     slotB -= md->name_entry_size;
1195     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1196     {
1197     offset = GET2(slotB, 0) << 1;
1198 ph10 461 condition = offset < offset_top &&
1199 ph10 459 md->offset_vector[offset] >= 0;
1200 ph10 461 if (condition) break;
1201     }
1202 ph10 459 else break;
1203 ph10 461 }
1204    
1205 ph10 459 /* Scan up for duplicates */
1206 ph10 461
1207 ph10 459 if (!condition)
1208 ph10 461 {
1209 ph10 459 slotB = slotA;
1210     for (i++; i < md->name_count; i++)
1211     {
1212     slotB += md->name_entry_size;
1213     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1214     {
1215     offset = GET2(slotB, 0) << 1;
1216 ph10 461 condition = offset < offset_top &&
1217 ph10 459 md->offset_vector[offset] >= 0;
1218 ph10 461 if (condition) break;
1219     }
1220 ph10 459 else break;
1221 ph10 461 }
1222     }
1223 ph10 459 }
1224 ph10 461 }
1225    
1226 ph10 459 /* Choose branch according to the condition */
1227    
1228 nigel 93 ecode += condition? 3 : GET(ecode, 1);
1229 nigel 77 }
1230    
1231 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1232 nigel 93 {
1233     condition = FALSE;
1234     ecode += GET(ecode, 1);
1235     }
1236    
1237 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1238 ph10 604 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1239     an assertion. */
1240 nigel 77
1241     else
1242     {
1243 ph10 604 md->match_function_type = MATCH_CONDASSERT;
1244     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1245 nigel 77 if (rrc == MATCH_MATCH)
1246     {
1247 ph10 619 if (md->end_offset_top > offset_top)
1248     offset_top = md->end_offset_top; /* Captures may have happened */
1249 nigel 93 condition = TRUE;
1250     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1251 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1252     }
1253 ph10 550 else if (rrc != MATCH_NOMATCH &&
1254     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1255 nigel 77 {
1256     RRETURN(rrc); /* Need braces because of following else */
1257     }
1258 nigel 93 else
1259     {
1260     condition = FALSE;
1261 ph10 399 ecode += codelink;
1262 nigel 93 }
1263     }
1264 nigel 91
1265 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
1266 ph10 609 we used to use tail recursion to avoid using another stack frame, except
1267     when there was unlimited repeat of a possibly empty group. However, that
1268     strategy no longer works because of the possibility of (*THEN) being
1269     encountered in the branch. A recursive call to match() is always required,
1270     unless the second alternative doesn't exist, in which case we can just
1271     plough on. */
1272 nigel 91
1273 nigel 93 if (condition || *ecode == OP_ALT)
1274     {
1275 ph10 609 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1276     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1277     if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1278     rrc = MATCH_NOMATCH;
1279     RRETURN(rrc);
1280 nigel 77 }
1281 ph10 395 else /* Condition false & no alternative */
1282 nigel 93 {
1283     ecode += 1 + LINK_SIZE;
1284     }
1285     break;
1286 nigel 77
1287 ph10 461
1288 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1289     to close any currently open capturing brackets. */
1290 ph10 461
1291 ph10 447 case OP_CLOSE:
1292 ph10 461 number = GET2(ecode, 1);
1293 ph10 447 offset = number << 1;
1294 ph10 461
1295 ph10 475 #ifdef PCRE_DEBUG
1296 ph10 447 printf("end bracket %d at *ACCEPT", number);
1297     printf("\n");
1298     #endif
1299 nigel 77
1300 ph10 447 md->capture_last = number;
1301     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1302     {
1303     md->offset_vector[offset] =
1304     md->offset_vector[md->offset_end - number];
1305 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1306 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1307     }
1308     ecode += 3;
1309 ph10 461 break;
1310 ph10 447
1311    
1312 ph10 619 /* End of the pattern, either real or forced. */
1313 nigel 77
1314 ph10 619 case OP_END:
1315 ph10 210 case OP_ACCEPT:
1316 ph10 613 case OP_ASSERT_ACCEPT:
1317 ph10 619
1318     /* If we have matched an empty string, fail if not in an assertion and not
1319     in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1320 ph10 613 is set and we have matched at the start of the subject. In both cases,
1321     backtracking will then try other alternatives, if any. */
1322 ph10 443
1323 ph10 619 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1324 ph10 618 md->recursive == NULL &&
1325 ph10 619 (md->notempty ||
1326     (md->notempty_atstart &&
1327     mstart == md->start_subject + md->start_offset)))
1328 ph10 510 MRRETURN(MATCH_NOMATCH);
1329 ph10 443
1330 ph10 442 /* Otherwise, we have a match. */
1331 ph10 608
1332 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1333     md->end_offset_top = offset_top; /* and how many extracts were taken */
1334 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1335 nigel 77
1336 ph10 512 /* For some reason, the macros don't work properly if an expression is
1337     given as the argument to MRRETURN when the heap is in use. */
1338    
1339     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1340     MRRETURN(rrc);
1341    
1342 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1343     matching won't pass the KET for an assertion. If any one branch matches,
1344     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1345     start of each branch to move the current point backwards, so the code at
1346 ph10 604 this level is identical to the lookahead case. When the assertion is part
1347     of a condition, we want to return immediately afterwards. The caller of
1348     this incarnation of the match() function will have set MATCH_CONDASSERT in
1349     md->match_function_type, and one of these opcodes will be the first opcode
1350     that is processed. We use a local variable that is preserved over calls to
1351     match() to remember this case. */
1352 nigel 77
1353     case OP_ASSERT:
1354     case OP_ASSERTBACK:
1355 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1356     {
1357     condassert = TRUE;
1358     md->match_function_type = 0;
1359     }
1360     else condassert = FALSE;
1361    
1362 nigel 77 do
1363     {
1364 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1365 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1366 ph10 500 {
1367     mstart = md->start_match_ptr; /* In case \K reset it */
1368     break;
1369 ph10 501 }
1370 ph10 550 if (rrc != MATCH_NOMATCH &&
1371     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1372     RRETURN(rrc);
1373 nigel 77 ecode += GET(ecode, 1);
1374     }
1375     while (*ecode == OP_ALT);
1376 ph10 604
1377 ph10 510 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1378 nigel 77
1379     /* If checking an assertion for a condition, return MATCH_MATCH. */
1380    
1381 ph10 604 if (condassert) RRETURN(MATCH_MATCH);
1382 nigel 77
1383     /* Continue from after the assertion, updating the offsets high water
1384     mark, since extracts may have been taken during the assertion. */
1385    
1386     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1387     ecode += 1 + LINK_SIZE;
1388     offset_top = md->end_offset_top;
1389     continue;
1390    
1391 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1392 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1393 ph10 473 branches. */
1394 nigel 77
1395     case OP_ASSERT_NOT:
1396     case OP_ASSERTBACK_NOT:
1397 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1398     {
1399     condassert = TRUE;
1400     md->match_function_type = 0;
1401     }
1402     else condassert = FALSE;
1403    
1404 nigel 77 do
1405     {
1406 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1407 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1408 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1409     {
1410     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1411 ph10 482 break;
1412     }
1413 ph10 550 if (rrc != MATCH_NOMATCH &&
1414     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1415     RRETURN(rrc);
1416 nigel 77 ecode += GET(ecode,1);
1417     }
1418     while (*ecode == OP_ALT);
1419    
1420 ph10 604 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1421    
1422 nigel 77 ecode += 1 + LINK_SIZE;
1423     continue;
1424    
1425     /* Move the subject pointer back. This occurs only at the start of
1426     each branch of a lookbehind assertion. If we are too close to the start to
1427     move back, this match function fails. When working with UTF-8 we move
1428     back a number of characters, not bytes. */
1429    
1430     case OP_REVERSE:
1431     #ifdef SUPPORT_UTF8
1432     if (utf8)
1433     {
1434 nigel 93 i = GET(ecode, 1);
1435     while (i-- > 0)
1436 nigel 77 {
1437     eptr--;
1438 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1439 ph10 207 BACKCHAR(eptr);
1440 nigel 77 }
1441     }
1442     else
1443     #endif
1444    
1445     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1446    
1447     {
1448 nigel 93 eptr -= GET(ecode, 1);
1449 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1450 nigel 77 }
1451    
1452 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1453 nigel 77
1454 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1455 nigel 77 ecode += 1 + LINK_SIZE;
1456     break;
1457    
1458     /* The callout item calls an external function, if one is provided, passing
1459     details of the match so far. This is mainly for debugging, though the
1460     function is able to force a failure. */
1461    
1462     case OP_CALLOUT:
1463     if (pcre_callout != NULL)
1464     {
1465     pcre_callout_block cb;
1466     cb.version = 1; /* Version 1 of the callout block */
1467     cb.callout_number = ecode[1];
1468     cb.offset_vector = md->offset_vector;
1469 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1470 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1471     cb.start_match = (int)(mstart - md->start_subject);
1472     cb.current_position = (int)(eptr - md->start_subject);
1473 nigel 77 cb.pattern_position = GET(ecode, 2);
1474     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1475     cb.capture_top = offset_top/2;
1476     cb.capture_last = md->capture_last;
1477     cb.callout_data = md->callout_data;
1478 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1479 nigel 77 if (rrc < 0) RRETURN(rrc);
1480     }
1481     ecode += 2 + 2*LINK_SIZE;
1482     break;
1483    
1484     /* Recursion either matches the current regex, or some subexpression. The
1485     offset data is the offset to the starting bracket from the start of the
1486     whole pattern. (This is so that it works from duplicated subpatterns.)
1487 ph10 618
1488     The state of the capturing groups is preserved over recursion, and
1489     re-instated afterwards. We don't know how many are started and not yet
1490     finished (offset_top records the completed total) so we just have to save
1491     all the potential data. There may be up to 65535 such values, which is too
1492     large to put on the stack, but using malloc for small numbers seems
1493     expensive. As a compromise, the stack is used when there are no more than
1494     REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1495 nigel 77
1496     There are also other values that have to be saved. We use a chained
1497     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1498 ph10 618 for the original version of this logic. It has, however, been hacked around
1499     a lot, so he is not to blame for the current way it works. */
1500 nigel 77
1501     case OP_RECURSE:
1502     {
1503     callpat = md->start_code + GET(ecode, 1);
1504 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1505     GET2(callpat, 1 + LINK_SIZE);
1506 nigel 77
1507     /* Add to "recursing stack" */
1508    
1509     new_recursive.prevrec = md->recursive;
1510     md->recursive = &new_recursive;
1511    
1512 ph10 618 /* Where to continue from afterwards */
1513 nigel 77
1514     ecode += 1 + LINK_SIZE;
1515    
1516 ph10 618 /* Now save the offset data */
1517 nigel 77
1518     new_recursive.saved_max = md->offset_end;
1519     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1520     new_recursive.offset_save = stacksave;
1521     else
1522     {
1523     new_recursive.offset_save =
1524     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1525     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1526     }
1527     memcpy(new_recursive.offset_save, md->offset_vector,
1528     new_recursive.saved_max * sizeof(int));
1529 ph10 608
1530 ph10 618 /* OK, now we can do the recursion. After processing each alternative,
1531     restore the offset data. If there were nested recursions, md->recursive
1532     might be changed, so reset it before looping. */
1533 nigel 77
1534     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1535 ph10 604 cbegroup = (*callpat >= OP_SBRA);
1536 nigel 77 do
1537     {
1538 ph10 604 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1539 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1540 ph10 604 md, eptrb, RM6);
1541 ph10 618 memcpy(md->offset_vector, new_recursive.offset_save,
1542     new_recursive.saved_max * sizeof(int));
1543 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1544 nigel 77 {
1545 nigel 87 DPRINTF(("Recursion matched\n"));
1546 nigel 77 md->recursive = new_recursive.prevrec;
1547     if (new_recursive.offset_save != stacksave)
1548     (pcre_free)(new_recursive.offset_save);
1549 ph10 618
1550     /* Set where we got to in the subject, and reset the start in case
1551     it was changed by \K. This *is* propagated back out of a recursion,
1552     for Perl compatibility. */
1553    
1554     eptr = md->end_match_ptr;
1555     mstart = md->start_match_ptr;
1556     goto RECURSION_MATCHED; /* Exit loop; end processing */
1557 nigel 77 }
1558 ph10 550 else if (rrc != MATCH_NOMATCH &&
1559     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1560 nigel 87 {
1561     DPRINTF(("Recursion gave error %d\n", rrc));
1562 ph10 400 if (new_recursive.offset_save != stacksave)
1563     (pcre_free)(new_recursive.offset_save);
1564 nigel 87 RRETURN(rrc);
1565     }
1566 nigel 77
1567     md->recursive = &new_recursive;
1568     callpat += GET(callpat, 1);
1569     }
1570     while (*callpat == OP_ALT);
1571    
1572     DPRINTF(("Recursion didn't match\n"));
1573     md->recursive = new_recursive.prevrec;
1574     if (new_recursive.offset_save != stacksave)
1575     (pcre_free)(new_recursive.offset_save);
1576 ph10 510 MRRETURN(MATCH_NOMATCH);
1577 nigel 77 }
1578 ph10 618
1579     RECURSION_MATCHED:
1580     break;
1581 nigel 77
1582     /* An alternation is the end of a branch; scan along to find the end of the
1583     bracketed group and go to there. */
1584    
1585     case OP_ALT:
1586     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1587     break;
1588    
1589 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1590     indicating that it may occur zero times. It may repeat infinitely, or not
1591     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1592     with fixed upper repeat limits are compiled as a number of copies, with the
1593     optional ones preceded by BRAZERO or BRAMINZERO. */
1594 ph10 604
1595 nigel 77 case OP_BRAZERO:
1596 ph10 604 next = ecode + 1;
1597     RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1598     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1599     do next += GET(next, 1); while (*next == OP_ALT);
1600     ecode = next + 1 + LINK_SIZE;
1601 nigel 77 break;
1602 ph10 604
1603 nigel 77 case OP_BRAMINZERO:
1604 ph10 604 next = ecode + 1;
1605     do next += GET(next, 1); while (*next == OP_ALT);
1606     RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1607     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1608     ecode++;
1609 nigel 77 break;
1610    
1611 ph10 335 case OP_SKIPZERO:
1612 ph10 604 next = ecode+1;
1613     do next += GET(next,1); while (*next == OP_ALT);
1614     ecode = next + 1 + LINK_SIZE;
1615 ph10 335 break;
1616 ph10 604
1617     /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1618     here; just jump to the group, with allow_zero set TRUE. */
1619    
1620     case OP_BRAPOSZERO:
1621     op = *(++ecode);
1622     allow_zero = TRUE;
1623     if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1624     goto POSSESSIVE_NON_CAPTURE;
1625 ph10 335
1626 nigel 93 /* End of a group, repeated or non-repeating. */
1627 nigel 77
1628     case OP_KET:
1629     case OP_KETRMIN:
1630     case OP_KETRMAX:
1631 ph10 604 case OP_KETRPOS:
1632 nigel 91 prev = ecode - GET(ecode, 1);
1633 ph10 618
1634 nigel 93 /* If this was a group that remembered the subject start, in order to break
1635     infinite repeats of empty string matches, retrieve the subject start from
1636     the chain. Otherwise, set it NULL. */
1637 nigel 77
1638 ph10 618 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1639 nigel 93 {
1640     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1641     eptrb = eptrb->epb_prev; /* Backup to previous group */
1642     }
1643     else saved_eptr = NULL;
1644 nigel 77
1645 ph10 618 /* If we are at the end of an assertion group, stop matching and return
1646     MATCH_MATCH, but record the current high water mark for use by positive
1647     assertions. We also need to record the match start in case it was changed
1648     by \K. */
1649 nigel 93
1650 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1651 ph10 618 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
1652 nigel 91 {
1653     md->end_match_ptr = eptr; /* For ONCE */
1654     md->end_offset_top = offset_top;
1655 ph10 500 md->start_match_ptr = mstart;
1656 ph10 510 MRRETURN(MATCH_MATCH);
1657 nigel 91 }
1658 nigel 77
1659 nigel 93 /* For capturing groups we have to check the group number back at the start
1660     and if necessary complete handling an extraction by setting the offsets and
1661 ph10 618 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1662     into group 0, so it won't be picked up here. Instead, we catch it when the
1663     OP_END is reached. Other recursion is handled here. We just have to record
1664     the current subject position and start match pointer and give a MATCH
1665     return. */
1666 nigel 77
1667 ph10 604 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1668     *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1669 nigel 91 {
1670 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1671 nigel 91 offset = number << 1;
1672 ph10 461
1673 ph10 475 #ifdef PCRE_DEBUG
1674 nigel 91 printf("end bracket %d", number);
1675     printf("\n");
1676 nigel 77 #endif
1677    
1678 ph10 618 /* Handle a recursively called group. */
1679    
1680     if (md->recursive != NULL && md->recursive->group_num == number)
1681     {
1682     md->end_match_ptr = eptr;
1683     md->start_match_ptr = mstart;
1684     RRETURN(MATCH_MATCH);
1685     }
1686    
1687     /* Deal with capturing */
1688    
1689 nigel 93 md->capture_last = number;
1690     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1691 nigel 91 {
1692 ph10 615 /* If offset is greater than offset_top, it means that we are
1693     "skipping" a capturing group, and that group's offsets must be marked
1694     unset. In earlier versions of PCRE, all the offsets were unset at the
1695     start of matching, but this doesn't work because atomic groups and
1696     assertions can cause a value to be set that should later be unset.
1697     Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1698     part of the atomic group, but this is not on the final matching path,
1699     so must be unset when 2 is set. (If there is no group 2, there is no
1700     problem, because offset_top will then be 2, indicating no capture.) */
1701    
1702     if (offset > offset_top)
1703     {
1704     register int *iptr = md->offset_vector + offset_top;
1705     register int *iend = md->offset_vector + offset;
1706     while (iptr < iend) *iptr++ = -1;
1707     }
1708    
1709     /* Now make the extraction */
1710    
1711 nigel 93 md->offset_vector[offset] =
1712     md->offset_vector[md->offset_end - number];
1713 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1714 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1715     }
1716 nigel 91 }
1717 nigel 77
1718 ph10 618 /* For an ordinary non-repeating ket, just continue at this level. This
1719     also happens for a repeating ket if no characters were matched in the
1720     group. This is the forcible breaking of infinite loops as implemented in
1721     Perl 5.005. For a non-repeating atomic group, establish a backup point by
1722     processing the rest of the pattern at a lower level. If this results in a
1723     NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1724     bypassing intermediate backup points, but resetting any captures that
1725     happened along the way. */
1726 nigel 77
1727 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1728     {
1729 ph10 618 if (*prev == OP_ONCE)
1730     {
1731     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1732     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1733     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1734     RRETURN(MATCH_ONCE);
1735     }
1736     ecode += 1 + LINK_SIZE; /* Carry on at this level */
1737 nigel 91 break;
1738     }
1739 ph10 604
1740     /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1741     and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1742     at a time from the outer level, thus saving stack. */
1743    
1744     if (*ecode == OP_KETRPOS)
1745     {
1746     md->end_match_ptr = eptr;
1747     md->end_offset_top = offset_top;
1748     RRETURN(MATCH_KETRPOS);
1749     }
1750 nigel 77
1751 ph10 604 /* The normal repeating kets try the rest of the pattern or restart from
1752     the preceding bracket, in the appropriate order. In the second case, we can
1753     use tail recursion to avoid using another stack frame, unless we have an
1754 ph10 618 atomic group or an unlimited repeat of a group that can match an empty
1755     string. */
1756 nigel 77
1757 nigel 91 if (*ecode == OP_KETRMIN)
1758     {
1759 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1760 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1761 ph10 618 if (*prev == OP_ONCE)
1762     {
1763 ph10 623 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1764 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1765     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1766     RRETURN(MATCH_ONCE);
1767     }
1768 ph10 604 if (*prev >= OP_SBRA) /* Could match an empty string */
1769 ph10 197 {
1770 ph10 604 md->match_function_type = MATCH_CBEGROUP;
1771     RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1772 ph10 197 RRETURN(rrc);
1773     }
1774 nigel 91 ecode = prev;
1775     goto TAIL_RECURSE;
1776 nigel 77 }
1777 nigel 91 else /* OP_KETRMAX */
1778     {
1779 ph10 604 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1780     RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1781 ph10 618 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1782 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1783 ph10 618 if (*prev == OP_ONCE)
1784     {
1785 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1786 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1787     md->once_target = prev;
1788     RRETURN(MATCH_ONCE);
1789     }
1790 nigel 91 ecode += 1 + LINK_SIZE;
1791     goto TAIL_RECURSE;
1792     }
1793     /* Control never gets here */
1794 nigel 77
1795 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
1796 nigel 77
1797     case OP_CIRC:
1798 ph10 510 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1799 ph10 602
1800 nigel 77 /* Start of subject assertion */
1801    
1802     case OP_SOD:
1803 ph10 510 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1804 nigel 77 ecode++;
1805     break;
1806 ph10 602
1807     /* Multiline mode: start of subject unless notbol, or after any newline. */
1808 nigel 77
1809 ph10 602 case OP_CIRCM:
1810     if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1811     if (eptr != md->start_subject &&
1812     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1813     MRRETURN(MATCH_NOMATCH);
1814     ecode++;
1815     break;
1816    
1817 nigel 77 /* Start of match assertion */
1818    
1819     case OP_SOM:
1820 ph10 510 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1821 nigel 77 ecode++;
1822     break;
1823 ph10 172
1824 ph10 168 /* Reset the start of match point */
1825 ph10 172
1826 ph10 168 case OP_SET_SOM:
1827     mstart = eptr;
1828 ph10 172 ecode++;
1829     break;
1830 nigel 77
1831 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
1832     unless noteol is set. */
1833 nigel 77
1834 ph10 602 case OP_DOLLM:
1835     if (eptr < md->end_subject)
1836     { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1837     else
1838 nigel 77 {
1839 ph10 510 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1840 ph10 602 SCHECK_PARTIAL();
1841 nigel 77 }
1842 ph10 602 ecode++;
1843     break;
1844 ph10 579
1845 ph10 602 /* Not multiline mode: assert before a terminating newline or before end of
1846     subject unless noteol is set. */
1847    
1848     case OP_DOLL:
1849     if (md->noteol) MRRETURN(MATCH_NOMATCH);
1850     if (!md->endonly) goto ASSERT_NL_OR_EOS;
1851    
1852 nigel 91 /* ... else fall through for endonly */
1853 nigel 77
1854     /* End of subject assertion (\z) */
1855    
1856     case OP_EOD:
1857 ph10 510 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1858 ph10 553 SCHECK_PARTIAL();
1859 nigel 77 ecode++;
1860     break;
1861    
1862     /* End of subject or ending \n assertion (\Z) */
1863    
1864     case OP_EODN:
1865 ph10 553 ASSERT_NL_OR_EOS:
1866     if (eptr < md->end_subject &&
1867 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1868 ph10 510 MRRETURN(MATCH_NOMATCH);
1869 ph10 579
1870 ph10 553 /* Either at end of string or \n before end. */
1871 ph10 579
1872 ph10 553 SCHECK_PARTIAL();
1873 nigel 77 ecode++;
1874     break;
1875    
1876     /* Word boundary assertions */
1877    
1878     case OP_NOT_WORD_BOUNDARY:
1879     case OP_WORD_BOUNDARY:
1880     {
1881    
1882     /* Find out if the previous and current characters are "word" characters.
1883     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1884 ph10 443 be "non-word" characters. Remember the earliest consulted character for
1885 ph10 435 partial matching. */
1886 nigel 77
1887     #ifdef SUPPORT_UTF8
1888     if (utf8)
1889     {
1890 ph10 518 /* Get status of previous character */
1891 ph10 527
1892 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
1893     {
1894 ph10 409 USPTR lastptr = eptr - 1;
1895 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1896 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1897 nigel 77 GETCHAR(c, lastptr);
1898 ph10 527 #ifdef SUPPORT_UCP
1899 ph10 518 if (md->use_ucp)
1900     {
1901     if (c == '_') prev_is_word = TRUE; else
1902 ph10 527 {
1903 ph10 518 int cat = UCD_CATEGORY(c);
1904     prev_is_word = (cat == ucp_L || cat == ucp_N);
1905 ph10 527 }
1906     }
1907     else
1908     #endif
1909 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1910     }
1911 ph10 527
1912 ph10 518 /* Get status of next character */
1913 ph10 527
1914 ph10 443 if (eptr >= md->end_subject)
1915 nigel 77 {
1916 ph10 443 SCHECK_PARTIAL();
1917     cur_is_word = FALSE;
1918 ph10 428 }
1919     else
1920     {
1921 nigel 77 GETCHAR(c, eptr);
1922 ph10 527 #ifdef SUPPORT_UCP
1923 ph10 518 if (md->use_ucp)
1924     {
1925     if (c == '_') cur_is_word = TRUE; else
1926 ph10 527 {
1927 ph10 518 int cat = UCD_CATEGORY(c);
1928     cur_is_word = (cat == ucp_L || cat == ucp_N);
1929 ph10 527 }
1930     }
1931     else
1932     #endif
1933 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1934     }
1935     }
1936     else
1937     #endif
1938    
1939 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1940 ph10 518 consistency with the behaviour of \w we do use it in this case. */
1941 nigel 77
1942     {
1943 ph10 518 /* Get status of previous character */
1944 ph10 527
1945 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
1946     {
1947 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1948 ph10 527 #ifdef SUPPORT_UCP
1949 ph10 518 if (md->use_ucp)
1950     {
1951 ph10 527 c = eptr[-1];
1952 ph10 518 if (c == '_') prev_is_word = TRUE; else
1953 ph10 527 {
1954 ph10 518 int cat = UCD_CATEGORY(c);
1955     prev_is_word = (cat == ucp_L || cat == ucp_N);
1956 ph10 527 }
1957     }
1958     else
1959     #endif
1960 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1961     }
1962 ph10 527
1963 ph10 518 /* Get status of next character */
1964 ph10 527
1965 ph10 443 if (eptr >= md->end_subject)
1966 ph10 428 {
1967 ph10 443 SCHECK_PARTIAL();
1968     cur_is_word = FALSE;
1969 ph10 428 }
1970 ph10 527 else
1971     #ifdef SUPPORT_UCP
1972 ph10 518 if (md->use_ucp)
1973     {
1974 ph10 527 c = *eptr;
1975 ph10 518 if (c == '_') cur_is_word = TRUE; else
1976 ph10 527 {
1977 ph10 518 int cat = UCD_CATEGORY(c);
1978     cur_is_word = (cat == ucp_L || cat == ucp_N);
1979 ph10 527 }
1980     }
1981     else
1982     #endif
1983 ph10 518 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1984 nigel 77 }
1985    
1986     /* Now see if the situation is what we want */
1987    
1988     if ((*ecode++ == OP_WORD_BOUNDARY)?
1989     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1990 ph10 510 MRRETURN(MATCH_NOMATCH);
1991 nigel 77 }
1992     break;
1993    
1994     /* Match a single character type; inline for speed */
1995    
1996     case OP_ANY:
1997 ph10 510 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1998 ph10 345 /* Fall through */
1999    
2000 ph10 341 case OP_ALLANY:
2001 ph10 443 if (eptr++ >= md->end_subject)
2002 ph10 428 {
2003 ph10 443 SCHECK_PARTIAL();
2004 ph10 510 MRRETURN(MATCH_NOMATCH);
2005 ph10 443 }
2006 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2007 nigel 77 ecode++;
2008     break;
2009    
2010     /* Match a single byte, even in UTF-8 mode. This opcode really does match
2011     any byte, even newline, independent of the setting of PCRE_DOTALL. */
2012    
2013     case OP_ANYBYTE:
2014 ph10 443 if (eptr++ >= md->end_subject)
2015 ph10 428 {
2016 ph10 443 SCHECK_PARTIAL();
2017 ph10 510 MRRETURN(MATCH_NOMATCH);
2018 ph10 443 }
2019 nigel 77 ecode++;
2020     break;
2021    
2022     case OP_NOT_DIGIT:
2023 ph10 443 if (eptr >= md->end_subject)
2024 ph10 428 {
2025 ph10 443 SCHECK_PARTIAL();
2026 ph10 510 MRRETURN(MATCH_NOMATCH);
2027 ph10 443 }
2028 nigel 77 GETCHARINCTEST(c, eptr);
2029     if (
2030     #ifdef SUPPORT_UTF8
2031     c < 256 &&
2032     #endif
2033     (md->ctypes[c] & ctype_digit) != 0
2034     )
2035 ph10 510 MRRETURN(MATCH_NOMATCH);
2036 nigel 77 ecode++;
2037     break;
2038    
2039     case OP_DIGIT:
2040 ph10 443 if (eptr >= md->end_subject)
2041 ph10 428 {
2042 ph10 443 SCHECK_PARTIAL();
2043 ph10 510 MRRETURN(MATCH_NOMATCH);
2044 ph10 443 }
2045 nigel 77 GETCHARINCTEST(c, eptr);
2046     if (
2047     #ifdef SUPPORT_UTF8
2048     c >= 256 ||
2049     #endif
2050     (md->ctypes[c] & ctype_digit) == 0
2051     )
2052 ph10 510 MRRETURN(MATCH_NOMATCH);
2053 nigel 77 ecode++;
2054     break;
2055    
2056     case OP_NOT_WHITESPACE:
2057 ph10 443 if (eptr >= md->end_subject)
2058 ph10 428 {
2059 ph10 443 SCHECK_PARTIAL();
2060 ph10 510 MRRETURN(MATCH_NOMATCH);
2061 ph10 443 }
2062 nigel 77 GETCHARINCTEST(c, eptr);
2063     if (
2064     #ifdef SUPPORT_UTF8
2065     c < 256 &&
2066     #endif
2067     (md->ctypes[c] & ctype_space) != 0
2068     )
2069 ph10 510 MRRETURN(MATCH_NOMATCH);
2070 nigel 77 ecode++;
2071     break;
2072    
2073     case OP_WHITESPACE:
2074 ph10 443 if (eptr >= md->end_subject)
2075 ph10 428 {
2076 ph10 443 SCHECK_PARTIAL();
2077 ph10 510 MRRETURN(MATCH_NOMATCH);
2078 ph10 443 }
2079 nigel 77 GETCHARINCTEST(c, eptr);
2080     if (
2081     #ifdef SUPPORT_UTF8
2082     c >= 256 ||
2083     #endif
2084     (md->ctypes[c] & ctype_space) == 0
2085     )
2086 ph10 510 MRRETURN(MATCH_NOMATCH);
2087 nigel 77 ecode++;
2088     break;
2089    
2090     case OP_NOT_WORDCHAR:
2091 ph10 443 if (eptr >= md->end_subject)
2092 ph10 428 {
2093 ph10 443 SCHECK_PARTIAL();
2094 ph10 510 MRRETURN(MATCH_NOMATCH);
2095 ph10 443 }
2096 nigel 77 GETCHARINCTEST(c, eptr);
2097     if (
2098     #ifdef SUPPORT_UTF8
2099     c < 256 &&
2100     #endif
2101     (md->ctypes[c] & ctype_word) != 0
2102     )
2103 ph10 510 MRRETURN(MATCH_NOMATCH);
2104 nigel 77 ecode++;
2105     break;
2106    
2107     case OP_WORDCHAR:
2108 ph10 443 if (eptr >= md->end_subject)
2109 ph10 428 {
2110 ph10 443 SCHECK_PARTIAL();
2111 ph10 510 MRRETURN(MATCH_NOMATCH);
2112 ph10 443 }
2113 nigel 77 GETCHARINCTEST(c, eptr);
2114     if (
2115     #ifdef SUPPORT_UTF8
2116     c >= 256 ||
2117     #endif
2118     (md->ctypes[c] & ctype_word) == 0
2119     )
2120 ph10 510 MRRETURN(MATCH_NOMATCH);
2121 nigel 77 ecode++;
2122     break;
2123    
2124 nigel 93 case OP_ANYNL:
2125 ph10 443 if (eptr >= md->end_subject)
2126 ph10 428 {
2127 ph10 443 SCHECK_PARTIAL();
2128 ph10 510 MRRETURN(MATCH_NOMATCH);
2129 ph10 443 }
2130 nigel 93 GETCHARINCTEST(c, eptr);
2131     switch(c)
2132     {
2133 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2134 ph10 600
2135 nigel 93 case 0x000d:
2136     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2137     break;
2138 ph10 231
2139 nigel 93 case 0x000a:
2140 ph10 231 break;
2141    
2142 nigel 93 case 0x000b:
2143     case 0x000c:
2144     case 0x0085:
2145     case 0x2028:
2146     case 0x2029:
2147 ph10 510 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2148 nigel 93 break;
2149     }
2150     ecode++;
2151     break;
2152    
2153 ph10 178 case OP_NOT_HSPACE:
2154 ph10 443 if (eptr >= md->end_subject)
2155 ph10 428 {
2156 ph10 443 SCHECK_PARTIAL();
2157 ph10 510 MRRETURN(MATCH_NOMATCH);
2158 ph10 443 }
2159 ph10 178 GETCHARINCTEST(c, eptr);
2160     switch(c)
2161     {
2162     default: break;
2163     case 0x09: /* HT */
2164     case 0x20: /* SPACE */
2165     case 0xa0: /* NBSP */
2166     case 0x1680: /* OGHAM SPACE MARK */
2167     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2168     case 0x2000: /* EN QUAD */
2169     case 0x2001: /* EM QUAD */
2170     case 0x2002: /* EN SPACE */
2171     case 0x2003: /* EM SPACE */
2172     case 0x2004: /* THREE-PER-EM SPACE */
2173     case 0x2005: /* FOUR-PER-EM SPACE */
2174     case 0x2006: /* SIX-PER-EM SPACE */
2175     case 0x2007: /* FIGURE SPACE */
2176     case 0x2008: /* PUNCTUATION SPACE */
2177     case 0x2009: /* THIN SPACE */
2178     case 0x200A: /* HAIR SPACE */
2179     case 0x202f: /* NARROW NO-BREAK SPACE */
2180     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2181     case 0x3000: /* IDEOGRAPHIC SPACE */
2182 ph10 510 MRRETURN(MATCH_NOMATCH);
2183 ph10 178 }
2184     ecode++;
2185     break;
2186    
2187     case OP_HSPACE:
2188 ph10 443 if (eptr >= md->end_subject)
2189 ph10 428 {
2190 ph10 443 SCHECK_PARTIAL();
2191 ph10 510 MRRETURN(MATCH_NOMATCH);
2192 ph10 443 }
2193 ph10 178 GETCHARINCTEST(c, eptr);
2194     switch(c)
2195     {
2196 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2197 ph10 178 case 0x09: /* HT */
2198     case 0x20: /* SPACE */
2199     case 0xa0: /* NBSP */
2200     case 0x1680: /* OGHAM SPACE MARK */
2201     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2202     case 0x2000: /* EN QUAD */
2203     case 0x2001: /* EM QUAD */
2204     case 0x2002: /* EN SPACE */
2205     case 0x2003: /* EM SPACE */
2206     case 0x2004: /* THREE-PER-EM SPACE */
2207     case 0x2005: /* FOUR-PER-EM SPACE */
2208     case 0x2006: /* SIX-PER-EM SPACE */
2209     case 0x2007: /* FIGURE SPACE */
2210     case 0x2008: /* PUNCTUATION SPACE */
2211     case 0x2009: /* THIN SPACE */
2212     case 0x200A: /* HAIR SPACE */
2213     case 0x202f: /* NARROW NO-BREAK SPACE */
2214     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2215     case 0x3000: /* IDEOGRAPHIC SPACE */
2216     break;
2217     }
2218     ecode++;
2219     break;
2220    
2221     case OP_NOT_VSPACE:
2222 ph10 443 if (eptr >= md->end_subject)
2223 ph10 428 {
2224 ph10 443 SCHECK_PARTIAL();
2225 ph10 510 MRRETURN(MATCH_NOMATCH);
2226 ph10 443 }
2227 ph10 178 GETCHARINCTEST(c, eptr);
2228     switch(c)
2229     {
2230     default: break;
2231     case 0x0a: /* LF */
2232     case 0x0b: /* VT */
2233     case 0x0c: /* FF */
2234     case 0x0d: /* CR */
2235     case 0x85: /* NEL */
2236     case 0x2028: /* LINE SEPARATOR */
2237     case 0x2029: /* PARAGRAPH SEPARATOR */
2238 ph10 510 MRRETURN(MATCH_NOMATCH);
2239 ph10 178 }
2240     ecode++;
2241     break;
2242    
2243     case OP_VSPACE:
2244 ph10 443 if (eptr >= md->end_subject)
2245 ph10 428 {
2246 ph10 443 SCHECK_PARTIAL();
2247 ph10 510 MRRETURN(MATCH_NOMATCH);
2248 ph10 443 }
2249 ph10 178 GETCHARINCTEST(c, eptr);
2250     switch(c)
2251     {
2252 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2253 ph10 178 case 0x0a: /* LF */
2254     case 0x0b: /* VT */
2255     case 0x0c: /* FF */
2256     case 0x0d: /* CR */
2257     case 0x85: /* NEL */
2258     case 0x2028: /* LINE SEPARATOR */
2259     case 0x2029: /* PARAGRAPH SEPARATOR */
2260     break;
2261     }
2262     ecode++;
2263     break;
2264    
2265 nigel 77 #ifdef SUPPORT_UCP
2266     /* Check the next character by Unicode property. We will get here only
2267     if the support is in the binary; otherwise a compile-time error occurs. */
2268    
2269     case OP_PROP:
2270     case OP_NOTPROP:
2271 ph10 443 if (eptr >= md->end_subject)
2272 ph10 428 {
2273 ph10 443 SCHECK_PARTIAL();
2274 ph10 510 MRRETURN(MATCH_NOMATCH);
2275 ph10 443 }
2276 nigel 77 GETCHARINCTEST(c, eptr);
2277     {
2278 ph10 384 const ucd_record *prop = GET_UCD(c);
2279 nigel 77
2280 nigel 87 switch(ecode[1])
2281     {
2282     case PT_ANY:
2283 ph10 510 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2284 nigel 87 break;
2285 nigel 77
2286 nigel 87 case PT_LAMP:
2287 ph10 349 if ((prop->chartype == ucp_Lu ||
2288     prop->chartype == ucp_Ll ||
2289     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2290 ph10 510 MRRETURN(MATCH_NOMATCH);
2291 ph10 517 break;
2292 nigel 87
2293     case PT_GC:
2294 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2295 ph10 510 MRRETURN(MATCH_NOMATCH);
2296 nigel 87 break;
2297    
2298     case PT_PC:
2299 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2300 ph10 510 MRRETURN(MATCH_NOMATCH);
2301 nigel 87 break;
2302    
2303     case PT_SC:
2304 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2305 ph10 510 MRRETURN(MATCH_NOMATCH);
2306 nigel 87 break;
2307 ph10 527
2308 ph10 517 /* These are specials */
2309 ph10 527
2310 ph10 517 case PT_ALNUM:
2311     if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2312     _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2313     MRRETURN(MATCH_NOMATCH);
2314 ph10 527 break;
2315    
2316 ph10 517 case PT_SPACE: /* Perl space */
2317     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2318     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2319     == (op == OP_NOTPROP))
2320     MRRETURN(MATCH_NOMATCH);
2321 ph10 527 break;
2322    
2323 ph10 517 case PT_PXSPACE: /* POSIX space */
2324     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2325 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2326 ph10 517 c == CHAR_FF || c == CHAR_CR)
2327     == (op == OP_NOTPROP))
2328     MRRETURN(MATCH_NOMATCH);
2329 ph10 527 break;
2330 nigel 87
2331 ph10 527 case PT_WORD:
2332 ph10 517 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2333 ph10 527 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2334 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2335     MRRETURN(MATCH_NOMATCH);
2336 ph10 527 break;
2337    
2338 ph10 517 /* This should never occur */
2339    
2340 nigel 87 default:
2341     RRETURN(PCRE_ERROR_INTERNAL);
2342 nigel 77 }
2343 nigel 87
2344     ecode += 3;
2345 nigel 77 }
2346     break;
2347    
2348     /* Match an extended Unicode sequence. We will get here only if the support
2349     is in the binary; otherwise a compile-time error occurs. */
2350    
2351     case OP_EXTUNI:
2352 ph10 443 if (eptr >= md->end_subject)
2353 ph10 428 {
2354 ph10 443 SCHECK_PARTIAL();
2355 ph10 510 MRRETURN(MATCH_NOMATCH);
2356 ph10 443 }
2357 nigel 77 GETCHARINCTEST(c, eptr);
2358 ph10 623 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2359     while (eptr < md->end_subject)
2360 nigel 77 {
2361 ph10 623 int len = 1;
2362     if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2363     if (UCD_CATEGORY(c) != ucp_M) break;
2364     eptr += len;
2365 nigel 77 }
2366     ecode++;
2367     break;
2368     #endif
2369    
2370    
2371     /* Match a back reference, possibly repeatedly. Look past the end of the
2372     item to see if there is repeat information following. The code is similar
2373     to that for character classes, but repeated for efficiency. Then obey
2374     similar code to character type repeats - written out again for speed.
2375     However, if the referenced string is the empty string, always treat
2376     it as matched, any number of times (otherwise there could be infinite
2377     loops). */
2378    
2379     case OP_REF:
2380 ph10 602 case OP_REFI:
2381     caseless = op == OP_REFI;
2382 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2383     ecode += 3;
2384 ph10 345
2385 ph10 595 /* If the reference is unset, there are two possibilities:
2386 ph10 345
2387 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2388     this ensures that every attempt at a match fails. We can't just fail
2389     here, because of the possibility of quantifiers with zero minima.
2390 ph10 345
2391 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2392     so that the back reference matches an empty string.
2393 ph10 345
2394 ph10 595 Otherwise, set the length to the length of what was matched by the
2395     referenced subpattern. */
2396 ph10 345
2397 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2398     length = (md->jscript_compat)? 0 : -1;
2399     else
2400     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2401 nigel 77
2402 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2403 nigel 77
2404 ph10 595 switch (*ecode)
2405     {
2406     case OP_CRSTAR:
2407     case OP_CRMINSTAR:
2408     case OP_CRPLUS:
2409     case OP_CRMINPLUS:
2410     case OP_CRQUERY:
2411     case OP_CRMINQUERY:
2412     c = *ecode++ - OP_CRSTAR;
2413     minimize = (c & 1) != 0;
2414     min = rep_min[c]; /* Pick up values from tables; */
2415     max = rep_max[c]; /* zero for max => infinity */
2416     if (max == 0) max = INT_MAX;
2417     break;
2418 nigel 77
2419 ph10 595 case OP_CRRANGE:
2420     case OP_CRMINRANGE:
2421     minimize = (*ecode == OP_CRMINRANGE);
2422     min = GET2(ecode, 1);
2423     max = GET2(ecode, 3);
2424     if (max == 0) max = INT_MAX;
2425     ecode += 5;
2426     break;
2427 nigel 77
2428 ph10 595 default: /* No repeat follows */
2429 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2430 ph10 595 {
2431     CHECK_PARTIAL();
2432     MRRETURN(MATCH_NOMATCH);
2433 nigel 77 }
2434 ph10 595 eptr += length;
2435     continue; /* With the main loop */
2436     }
2437 nigel 77
2438 ph10 595 /* Handle repeated back references. If the length of the reference is
2439     zero, just continue with the main loop. */
2440 ph10 443
2441 ph10 595 if (length == 0) continue;
2442 nigel 77
2443 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2444     the length of the reference string explicitly rather than passing the
2445     address of eptr, so that eptr can be a register variable. */
2446 nigel 77
2447 ph10 595 for (i = 1; i <= min; i++)
2448     {
2449     int slength;
2450 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2451 nigel 77 {
2452 ph10 595 CHECK_PARTIAL();
2453     MRRETURN(MATCH_NOMATCH);
2454 nigel 77 }
2455 ph10 595 eptr += slength;
2456     }
2457 nigel 77
2458 ph10 595 /* If min = max, continue at the same level without recursion.
2459     They are not both allowed to be zero. */
2460 nigel 77
2461 ph10 595 if (min == max) continue;
2462 nigel 77
2463 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2464 nigel 77
2465 ph10 595 if (minimize)
2466     {
2467     for (fi = min;; fi++)
2468 nigel 77 {
2469 ph10 595 int slength;
2470 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2471 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2472     if (fi >= max) MRRETURN(MATCH_NOMATCH);
2473 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2474 nigel 77 {
2475 ph10 595 CHECK_PARTIAL();
2476     MRRETURN(MATCH_NOMATCH);
2477 nigel 77 }
2478 ph10 595 eptr += slength;
2479 nigel 77 }
2480 ph10 595 /* Control never gets here */
2481     }
2482 nigel 77
2483 ph10 595 /* If maximizing, find the longest string and work backwards */
2484 nigel 77
2485 ph10 595 else
2486     {
2487     pp = eptr;
2488     for (i = min; i < max; i++)
2489 nigel 77 {
2490 ph10 595 int slength;
2491 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2492 nigel 77 {
2493 ph10 595 CHECK_PARTIAL();
2494     break;
2495 nigel 77 }
2496 ph10 595 eptr += slength;
2497 nigel 77 }
2498 ph10 595 while (eptr >= pp)
2499     {
2500 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2501 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2502     eptr -= length;
2503     }
2504     MRRETURN(MATCH_NOMATCH);
2505 nigel 77 }
2506     /* Control never gets here */
2507    
2508     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2509     used when all the characters in the class have values in the range 0-255,
2510     and either the matching is caseful, or the characters are in the range
2511     0-127 when UTF-8 processing is enabled. The only difference between
2512     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2513     encountered.
2514    
2515     First, look past the end of the item to see if there is repeat information
2516     following. Then obey similar code to character type repeats - written out
2517     again for speed. */
2518    
2519     case OP_NCLASS:
2520     case OP_CLASS:
2521     {
2522     data = ecode + 1; /* Save for matching */
2523     ecode += 33; /* Advance past the item */
2524    
2525     switch (*ecode)
2526     {
2527     case OP_CRSTAR:
2528     case OP_CRMINSTAR:
2529     case OP_CRPLUS:
2530     case OP_CRMINPLUS:
2531     case OP_CRQUERY:
2532     case OP_CRMINQUERY:
2533     c = *ecode++ - OP_CRSTAR;
2534     minimize = (c & 1) != 0;
2535     min = rep_min[c]; /* Pick up values from tables; */
2536     max = rep_max[c]; /* zero for max => infinity */
2537     if (max == 0) max = INT_MAX;
2538     break;
2539    
2540     case OP_CRRANGE:
2541     case OP_CRMINRANGE:
2542     minimize = (*ecode == OP_CRMINRANGE);
2543     min = GET2(ecode, 1);
2544     max = GET2(ecode, 3);
2545     if (max == 0) max = INT_MAX;
2546     ecode += 5;
2547     break;
2548    
2549     default: /* No repeat follows */
2550     min = max = 1;
2551     break;
2552     }
2553    
2554     /* First, ensure the minimum number of matches are present. */
2555    
2556     #ifdef SUPPORT_UTF8
2557     /* UTF-8 mode */
2558     if (utf8)
2559     {
2560     for (i = 1; i <= min; i++)
2561     {
2562 ph10 427 if (eptr >= md->end_subject)
2563 ph10 426 {
2564 ph10 428 SCHECK_PARTIAL();
2565 ph10 510 MRRETURN(MATCH_NOMATCH);
2566 ph10 427 }
2567 nigel 77 GETCHARINC(c, eptr);
2568     if (c > 255)
2569     {
2570 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2571 nigel 77 }
2572     else
2573     {
2574 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2575 nigel 77 }
2576     }
2577     }
2578     else
2579     #endif
2580     /* Not UTF-8 mode */
2581     {
2582     for (i = 1; i <= min; i++)
2583     {
2584 ph10 427 if (eptr >= md->end_subject)
2585 ph10 426 {
2586 ph10 428 SCHECK_PARTIAL();
2587 ph10 510 MRRETURN(MATCH_NOMATCH);
2588 ph10 427 }
2589 nigel 77 c = *eptr++;
2590 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2591 nigel 77 }
2592     }
2593    
2594     /* If max == min we can continue with the main loop without the
2595     need to recurse. */
2596    
2597     if (min == max) continue;
2598    
2599     /* If minimizing, keep testing the rest of the expression and advancing
2600     the pointer while it matches the class. */
2601    
2602     if (minimize)
2603     {
2604     #ifdef SUPPORT_UTF8
2605     /* UTF-8 mode */
2606     if (utf8)
2607     {
2608     for (fi = min;; fi++)
2609     {
2610 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2611 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2612 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2613 ph10 427 if (eptr >= md->end_subject)
2614 ph10 426 {
2615 ph10 427 SCHECK_PARTIAL();
2616 ph10 510 MRRETURN(MATCH_NOMATCH);
2617 ph10 427 }
2618 nigel 77 GETCHARINC(c, eptr);
2619     if (c > 255)
2620     {
2621 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2622 nigel 77 }
2623     else
2624     {
2625 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2626 nigel 77 }
2627     }
2628     }
2629     else
2630     #endif
2631     /* Not UTF-8 mode */
2632     {
2633     for (fi = min;; fi++)
2634     {
2635 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2636 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2637 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2638 ph10 427 if (eptr >= md->end_subject)
2639 ph10 426 {
2640 ph10 427 SCHECK_PARTIAL();
2641 ph10 510 MRRETURN(MATCH_NOMATCH);
2642 ph10 427 }
2643 nigel 77 c = *eptr++;
2644 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2645 nigel 77 }
2646     }
2647     /* Control never gets here */
2648     }
2649    
2650     /* If maximizing, find the longest possible run, then work backwards. */
2651    
2652     else
2653     {
2654     pp = eptr;
2655    
2656     #ifdef SUPPORT_UTF8
2657     /* UTF-8 mode */
2658     if (utf8)
2659     {
2660     for (i = min; i < max; i++)
2661     {
2662     int len = 1;
2663 ph10 463 if (eptr >= md->end_subject)
2664 ph10 462 {
2665 ph10 463 SCHECK_PARTIAL();
2666 ph10 462 break;
2667 ph10 463 }
2668 nigel 77 GETCHARLEN(c, eptr, len);
2669     if (c > 255)
2670     {
2671     if (op == OP_CLASS) break;
2672     }
2673     else
2674     {
2675     if ((data[c/8] & (1 << (c&7))) == 0) break;
2676     }
2677     eptr += len;
2678     }
2679     for (;;)
2680     {
2681 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2682 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2683     if (eptr-- == pp) break; /* Stop if tried at original pos */
2684     BACKCHAR(eptr);
2685     }
2686     }
2687     else
2688     #endif
2689     /* Not UTF-8 mode */
2690     {
2691     for (i = min; i < max; i++)
2692     {
2693 ph10 463 if (eptr >= md->end_subject)
2694 ph10 462 {
2695 ph10 463 SCHECK_PARTIAL();
2696 ph10 462 break;
2697 ph10 463 }
2698 nigel 77 c = *eptr;
2699     if ((data[c/8] & (1 << (c&7))) == 0) break;
2700     eptr++;
2701     }
2702     while (eptr >= pp)
2703     {
2704 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2705 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2706 nigel 77 eptr--;
2707     }
2708     }
2709    
2710 ph10 510 MRRETURN(MATCH_NOMATCH);
2711 nigel 77 }
2712     }
2713     /* Control never gets here */
2714    
2715    
2716     /* Match an extended character class. This opcode is encountered only
2717 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2718     mode, because Unicode properties are supported in non-UTF-8 mode. */
2719 nigel 77
2720     #ifdef SUPPORT_UTF8
2721     case OP_XCLASS:
2722     {
2723     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2724     ecode += GET(ecode, 1); /* Advance past the item */
2725    
2726     switch (*ecode)
2727     {
2728     case OP_CRSTAR:
2729     case OP_CRMINSTAR:
2730     case OP_CRPLUS:
2731     case OP_CRMINPLUS:
2732     case OP_CRQUERY:
2733     case OP_CRMINQUERY:
2734     c = *ecode++ - OP_CRSTAR;
2735     minimize = (c & 1) != 0;
2736     min = rep_min[c]; /* Pick up values from tables; */
2737     max = rep_max[c]; /* zero for max => infinity */
2738     if (max == 0) max = INT_MAX;
2739     break;
2740    
2741     case OP_CRRANGE:
2742     case OP_CRMINRANGE:
2743     minimize = (*ecode == OP_CRMINRANGE);
2744     min = GET2(ecode, 1);
2745     max = GET2(ecode, 3);
2746     if (max == 0) max = INT_MAX;
2747     ecode += 5;
2748     break;
2749    
2750     default: /* No repeat follows */
2751     min = max = 1;
2752     break;
2753     }
2754    
2755     /* First, ensure the minimum number of matches are present. */
2756    
2757     for (i = 1; i <= min; i++)
2758     {
2759 ph10 427 if (eptr >= md->end_subject)
2760 ph10 426 {
2761     SCHECK_PARTIAL();
2762 ph10 510 MRRETURN(MATCH_NOMATCH);
2763 ph10 427 }
2764 ph10 384 GETCHARINCTEST(c, eptr);
2765 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2766 nigel 77 }
2767    
2768     /* If max == min we can continue with the main loop without the
2769     need to recurse. */
2770    
2771     if (min == max) continue;
2772    
2773     /* If minimizing, keep testing the rest of the expression and advancing
2774     the pointer while it matches the class. */
2775    
2776     if (minimize)
2777     {
2778     for (fi = min;; fi++)
2779     {
2780 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2781 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2782 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2783 ph10 427 if (eptr >= md->end_subject)
2784 ph10 426 {
2785 ph10 427 SCHECK_PARTIAL();
2786 ph10 510 MRRETURN(MATCH_NOMATCH);
2787 ph10 427 }
2788 ph10 384 GETCHARINCTEST(c, eptr);
2789 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2790 nigel 77 }
2791     /* Control never gets here */
2792     }
2793    
2794     /* If maximizing, find the longest possible run, then work backwards. */
2795    
2796     else
2797     {
2798     pp = eptr;
2799     for (i = min; i < max; i++)
2800     {
2801     int len = 1;
2802 ph10 463 if (eptr >= md->end_subject)
2803 ph10 462 {
2804 ph10 463 SCHECK_PARTIAL();
2805 ph10 462 break;
2806 ph10 463 }
2807 ph10 384 GETCHARLENTEST(c, eptr, len);
2808 nigel 77 if (!_pcre_xclass(c, data)) break;
2809     eptr += len;
2810     }
2811     for(;;)
2812     {
2813 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2814 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2815     if (eptr-- == pp) break; /* Stop if tried at original pos */
2816 ph10 214 if (utf8) BACKCHAR(eptr);
2817 nigel 77 }
2818 ph10 510 MRRETURN(MATCH_NOMATCH);
2819 nigel 77 }
2820    
2821     /* Control never gets here */
2822     }
2823     #endif /* End of XCLASS */
2824    
2825     /* Match a single character, casefully */
2826    
2827     case OP_CHAR:
2828     #ifdef SUPPORT_UTF8
2829     if (utf8)
2830     {
2831     length = 1;
2832     ecode++;
2833     GETCHARLEN(fc, ecode, length);
2834 ph10 443 if (length > md->end_subject - eptr)
2835 ph10 428 {
2836     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2837 ph10 510 MRRETURN(MATCH_NOMATCH);
2838 ph10 443 }
2839 ph10 510 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2840 nigel 77 }
2841     else
2842     #endif
2843    
2844     /* Non-UTF-8 mode */
2845     {
2846 ph10 443 if (md->end_subject - eptr < 1)
2847 ph10 428 {
2848     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2849 ph10 510 MRRETURN(MATCH_NOMATCH);
2850 ph10 443 }
2851 ph10 510 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2852 nigel 77 ecode += 2;
2853     }
2854     break;
2855    
2856     /* Match a single character, caselessly */
2857    
2858 ph10 602 case OP_CHARI:
2859 nigel 77 #ifdef SUPPORT_UTF8
2860     if (utf8)
2861     {
2862     length = 1;
2863     ecode++;
2864     GETCHARLEN(fc, ecode, length);
2865    
2866 ph10 443 if (length > md->end_subject - eptr)
2867 ph10 428 {
2868     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2869 ph10 510 MRRETURN(MATCH_NOMATCH);
2870 ph10 443 }
2871 nigel 77
2872     /* If the pattern character's value is < 128, we have only one byte, and
2873     can use the fast lookup table. */
2874    
2875     if (fc < 128)
2876     {
2877 ph10 510 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2878 nigel 77 }
2879    
2880     /* Otherwise we must pick up the subject character */
2881    
2882     else
2883     {
2884 nigel 93 unsigned int dc;
2885 nigel 77 GETCHARINC(dc, eptr);
2886     ecode += length;
2887    
2888     /* If we have Unicode property support, we can use it to test the other
2889 nigel 87 case of the character, if there is one. */
2890 nigel 77
2891     if (fc != dc)
2892     {
2893     #ifdef SUPPORT_UCP
2894 ph10 349 if (dc != UCD_OTHERCASE(fc))
2895 nigel 77 #endif
2896 ph10 510 MRRETURN(MATCH_NOMATCH);
2897 nigel 77 }
2898     }
2899     }
2900     else
2901     #endif /* SUPPORT_UTF8 */
2902    
2903     /* Non-UTF-8 mode */
2904     {
2905 ph10 443 if (md->end_subject - eptr < 1)
2906 ph10 428 {
2907 ph10 443 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2908 ph10 510 MRRETURN(MATCH_NOMATCH);
2909 ph10 443 }
2910 ph10 510 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2911 nigel 77 ecode += 2;
2912     }
2913     break;
2914    
2915 nigel 93 /* Match a single character repeatedly. */
2916 nigel 77
2917     case OP_EXACT:
2918 ph10 602 case OP_EXACTI:
2919 nigel 77 min = max = GET2(ecode, 1);
2920     ecode += 3;
2921     goto REPEATCHAR;
2922    
2923 nigel 93 case OP_POSUPTO:
2924 ph10 602 case OP_POSUPTOI:
2925 nigel 93 possessive = TRUE;
2926     /* Fall through */
2927    
2928 nigel 77 case OP_UPTO:
2929 ph10 602 case OP_UPTOI:
2930 nigel 77 case OP_MINUPTO:
2931 ph10 602 case OP_MINUPTOI:
2932 nigel 77 min = 0;
2933     max = GET2(ecode, 1);
2934 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2935 nigel 77 ecode += 3;
2936     goto REPEATCHAR;
2937    
2938 nigel 93 case OP_POSSTAR:
2939 ph10 602 case OP_POSSTARI:
2940 nigel 93 possessive = TRUE;
2941     min = 0;
2942     max = INT_MAX;
2943     ecode++;
2944     goto REPEATCHAR;
2945    
2946     case OP_POSPLUS:
2947 ph10 602 case OP_POSPLUSI:
2948 nigel 93 possessive = TRUE;
2949     min = 1;
2950     max = INT_MAX;
2951     ecode++;
2952     goto REPEATCHAR;
2953    
2954     case OP_POSQUERY:
2955 ph10 602 case OP_POSQUERYI:
2956 nigel 93 possessive = TRUE;
2957     min = 0;
2958     max = 1;
2959     ecode++;
2960     goto REPEATCHAR;
2961    
2962 nigel 77 case OP_STAR:
2963 ph10 602 case OP_STARI:
2964 nigel 77 case OP_MINSTAR:
2965 ph10 602 case OP_MINSTARI:
2966 nigel 77 case OP_PLUS:
2967 ph10 602 case OP_PLUSI:
2968 nigel 77 case OP_MINPLUS:
2969 ph10 602 case OP_MINPLUSI:
2970 nigel 77 case OP_QUERY:
2971 ph10 602 case OP_QUERYI:
2972 nigel 77 case OP_MINQUERY:
2973 ph10 602 case OP_MINQUERYI:
2974     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
2975 nigel 77 minimize = (c & 1) != 0;
2976     min = rep_min[c]; /* Pick up values from tables; */
2977     max = rep_max[c]; /* zero for max => infinity */
2978     if (max == 0) max = INT_MAX;
2979    
2980 ph10 426 /* Common code for all repeated single-character matches. */
2981 nigel 77
2982     REPEATCHAR:
2983     #ifdef SUPPORT_UTF8
2984     if (utf8)
2985     {
2986     length = 1;
2987     charptr = ecode;
2988     GETCHARLEN(fc, ecode, length);
2989     ecode += length;
2990    
2991     /* Handle multibyte character matching specially here. There is
2992     support for caseless matching if UCP support is present. */
2993    
2994     if (length > 1)
2995     {
2996     #ifdef SUPPORT_UCP
2997 nigel 93 unsigned int othercase;
2998 ph10 602 if (op >= OP_STARI && /* Caseless */
2999 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
3000 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
3001 ph10 115 else oclength = 0;
3002 nigel 77 #endif /* SUPPORT_UCP */
3003    
3004     for (i = 1; i <= min; i++)
3005     {
3006 ph10 426 if (eptr <= md->end_subject - length &&
3007     memcmp(eptr, charptr, length) == 0) eptr += length;
3008 ph10 123 #ifdef SUPPORT_UCP
3009 ph10 426 else if (oclength > 0 &&
3010     eptr <= md->end_subject - oclength &&
3011     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3012     #endif /* SUPPORT_UCP */
3013 nigel 77 else
3014     {
3015 ph10 426 CHECK_PARTIAL();
3016 ph10 510 MRRETURN(MATCH_NOMATCH);
3017 nigel 77 }
3018     }
3019    
3020     if (min == max) continue;
3021    
3022     if (minimize)
3023     {
3024     for (fi = min;; fi++)
3025     {
3026 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3027 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3028 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3029 ph10 426 if (eptr <= md->end_subject - length &&
3030     memcmp(eptr, charptr, length) == 0) eptr += length;
3031 ph10 123 #ifdef SUPPORT_UCP
3032 ph10 426 else if (oclength > 0 &&
3033     eptr <= md->end_subject - oclength &&
3034     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3035     #endif /* SUPPORT_UCP */
3036 nigel 77 else
3037     {
3038 ph10 426 CHECK_PARTIAL();
3039 ph10 510 MRRETURN(MATCH_NOMATCH);
3040 nigel 77 }
3041     }
3042     /* Control never gets here */
3043     }
3044 nigel 93
3045     else /* Maximize */
3046 nigel 77 {
3047     pp = eptr;
3048     for (i = min; i < max; i++)
3049     {
3050 ph10 426 if (eptr <= md->end_subject - length &&
3051     memcmp(eptr, charptr, length) == 0) eptr += length;
3052 ph10 123 #ifdef SUPPORT_UCP
3053 ph10 426 else if (oclength > 0 &&
3054     eptr <= md->end_subject - oclength &&
3055     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3056     #endif /* SUPPORT_UCP */
3057 ph10 463 else
3058 ph10 462 {
3059 ph10 463 CHECK_PARTIAL();
3060 ph10 462 break;
3061 ph10 463 }
3062 nigel 77 }
3063 nigel 93
3064     if (possessive) continue;
3065 ph10 427
3066 ph10 120 for(;;)
3067 ph10 426 {
3068 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3069 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3070 ph10 510 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3071 ph10 115 #ifdef SUPPORT_UCP
3072 ph10 426 eptr--;
3073     BACKCHAR(eptr);
3074 ph10 123 #else /* without SUPPORT_UCP */
3075 ph10 426 eptr -= length;
3076 ph10 123 #endif /* SUPPORT_UCP */
3077 ph10 426 }
3078 nigel 77 }
3079     /* Control never gets here */
3080     }
3081    
3082     /* If the length of a UTF-8 character is 1, we fall through here, and
3083     obey the code as for non-UTF-8 characters below, though in this case the
3084     value of fc will always be < 128. */
3085     }
3086     else
3087     #endif /* SUPPORT_UTF8 */
3088    
3089     /* When not in UTF-8 mode, load a single-byte character. */
3090    
3091 ph10 426 fc = *ecode++;
3092 ph10 443
3093 nigel 77 /* The value of fc at this point is always less than 256, though we may or
3094     may not be in UTF-8 mode. The code is duplicated for the caseless and
3095     caseful cases, for speed, since matching characters is likely to be quite
3096     common. First, ensure the minimum number of matches are present. If min =
3097     max, continue at the same level without recursing. Otherwise, if
3098     minimizing, keep trying the rest of the expression and advancing one
3099     matching character if failing, up to the maximum. Alternatively, if
3100     maximizing, find the maximum number of characters and work backwards. */
3101    
3102     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3103     max, eptr));
3104    
3105 ph10 602 if (op >= OP_STARI) /* Caseless */
3106 nigel 77 {
3107     fc = md->lcc[fc];
3108     for (i = 1; i <= min; i++)
3109 ph10 426 {
3110     if (eptr >= md->end_subject)
3111     {
3112     SCHECK_PARTIAL();
3113 ph10 510 MRRETURN(MATCH_NOMATCH);
3114 ph10 426 }
3115 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3116 ph10 426 }
3117 nigel 77 if (min == max) continue;
3118     if (minimize)
3119     {
3120     for (fi = min;; fi++)
3121     {
3122 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3123 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3124 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3125 ph10 426 if (eptr >= md->end_subject)
3126     {
3127 ph10 427 SCHECK_PARTIAL();
3128 ph10 510 MRRETURN(MATCH_NOMATCH);
3129 ph10 426 }
3130 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3131 nigel 77 }
3132     /* Control never gets here */
3133     }
3134 nigel 93 else /* Maximize */
3135 nigel 77 {
3136     pp = eptr;
3137     for (i = min; i < max; i++)
3138     {
3139 ph10 463 if (eptr >= md->end_subject)
3140 ph10 462 {
3141     SCHECK_PARTIAL();
3142     break;
3143 ph10 463 }
3144 ph10 462 if (fc != md->lcc[*eptr]) break;
3145 nigel 77 eptr++;
3146     }
3147 ph10 427
3148 nigel 93 if (possessive) continue;
3149 ph10 427
3150 nigel 77 while (eptr >= pp)
3151     {
3152 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3153 nigel 77 eptr--;
3154     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3155     }
3156 ph10 510 MRRETURN(MATCH_NOMATCH);
3157 nigel 77 }
3158     /* Control never gets here */
3159     }
3160    
3161     /* Caseful comparisons (includes all multi-byte characters) */
3162    
3163     else
3164     {
3165 ph10 427 for (i = 1; i <= min; i++)
3166 ph10 426 {
3167     if (eptr >= md->end_subject)
3168     {
3169     SCHECK_PARTIAL();
3170 ph10 510 MRRETURN(MATCH_NOMATCH);
3171 ph10 426 }
3172 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3173 ph10 427 }
3174 ph10 443
3175 nigel 77 if (min == max) continue;
3176 ph10 443
3177 nigel 77 if (minimize)
3178     {
3179     for (fi = min;; fi++)
3180     {
3181 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3182 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3183 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3184 ph10 426 if (eptr >= md->end_subject)
3185 ph10 427 {
3186 ph10 426 SCHECK_PARTIAL();
3187 ph10 510 MRRETURN(MATCH_NOMATCH);
3188 ph10 427 }
3189 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3190 nigel 77 }
3191     /* Control never gets here */
3192     }
3193 nigel 93 else /* Maximize */
3194 nigel 77 {
3195     pp = eptr;
3196     for (i = min; i < max; i++)
3197     {
3198 ph10 463 if (eptr >= md->end_subject)
3199 ph10 462 {
3200 ph10 463 SCHECK_PARTIAL();
3201 ph10 462 break;
3202 ph10 463 }
3203 ph10 462 if (fc != *eptr) break;
3204 nigel 77 eptr++;
3205     }
3206 nigel 93 if (possessive) continue;
3207 ph10 443
3208 nigel 77 while (eptr >= pp)
3209     {
3210 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3211 nigel 77 eptr--;
3212     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3213     }
3214 ph10 510 MRRETURN(MATCH_NOMATCH);
3215 nigel 77 }
3216     }
3217     /* Control never gets here */
3218    
3219     /* Match a negated single one-byte character. The character we are
3220     checking can be multibyte. */
3221    
3222     case OP_NOT:
3223 ph10 602 case OP_NOTI:
3224 ph10 443 if (eptr >= md->end_subject)
3225 ph10 428 {
3226 ph10 443 SCHECK_PARTIAL();
3227 ph10 510 MRRETURN(MATCH_NOMATCH);
3228 ph10 443 }
3229 nigel 77 ecode++;
3230     GETCHARINCTEST(c, eptr);
3231 ph10 602 if (op == OP_NOTI) /* The caseless case */
3232 nigel 77 {
3233     #ifdef SUPPORT_UTF8
3234     if (c < 256)
3235     #endif
3236     c = md->lcc[c];
3237 ph10 510 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3238 nigel 77 }
3239 ph10 602 else /* Caseful */
3240 nigel 77 {
3241 ph10 510 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3242 nigel 77 }
3243     break;
3244    
3245     /* Match a negated single one-byte character repeatedly. This is almost a
3246     repeat of the code for a repeated single character, but I haven't found a
3247     nice way of commoning these up that doesn't require a test of the
3248     positive/negative option for each character match. Maybe that wouldn't add
3249     very much to the time taken, but character matching *is* what this is all
3250     about... */
3251    
3252     case OP_NOTEXACT:
3253 ph10 602 case OP_NOTEXACTI:
3254 nigel 77 min = max = GET2(ecode, 1);
3255     ecode += 3;
3256     goto REPEATNOTCHAR;
3257    
3258     case OP_NOTUPTO:
3259 ph10 602 case OP_NOTUPTOI:
3260 nigel 77 case OP_NOTMINUPTO:
3261 ph10 602 case OP_NOTMINUPTOI:
3262 nigel 77 min = 0;
3263     max = GET2(ecode, 1);
3264 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3265 nigel 77 ecode += 3;
3266     goto REPEATNOTCHAR;
3267    
3268 nigel 93 case OP_NOTPOSSTAR:
3269 ph10 602 case OP_NOTPOSSTARI:
3270 nigel 93 possessive = TRUE;
3271     min = 0;
3272     max = INT_MAX;
3273     ecode++;
3274     goto REPEATNOTCHAR;
3275    
3276     case OP_NOTPOSPLUS:
3277 ph10 602 case OP_NOTPOSPLUSI:
3278 nigel 93 possessive = TRUE;
3279     min = 1;
3280     max = INT_MAX;
3281     ecode++;
3282     goto REPEATNOTCHAR;
3283    
3284     case OP_NOTPOSQUERY:
3285 ph10 602 case OP_NOTPOSQUERYI:
3286 nigel 93 possessive = TRUE;
3287     min = 0;
3288     max = 1;
3289     ecode++;
3290     goto REPEATNOTCHAR;
3291    
3292     case OP_NOTPOSUPTO:
3293 ph10 602 case OP_NOTPOSUPTOI:
3294 nigel 93 possessive = TRUE;
3295     min = 0;
3296     max = GET2(ecode, 1);
3297     ecode += 3;
3298     goto REPEATNOTCHAR;
3299    
3300 nigel 77 case OP_NOTSTAR:
3301 ph10 602 case OP_NOTSTARI:
3302 nigel 77 case OP_NOTMINSTAR:
3303 ph10 602 case OP_NOTMINSTARI:
3304 nigel 77 case OP_NOTPLUS:
3305 ph10 602 case OP_NOTPLUSI:
3306 nigel 77 case OP_NOTMINPLUS:
3307 ph10 602 case OP_NOTMINPLUSI:
3308 nigel 77 case OP_NOTQUERY:
3309 ph10 602 case OP_NOTQUERYI:
3310 nigel 77 case OP_NOTMINQUERY:
3311 ph10 602 case OP_NOTMINQUERYI:
3312     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3313 nigel 77 minimize = (c & 1) != 0;
3314     min = rep_min[c]; /* Pick up values from tables; */
3315     max = rep_max[c]; /* zero for max => infinity */
3316     if (max == 0) max = INT_MAX;
3317    
3318 ph10 426 /* Common code for all repeated single-byte matches. */
3319 nigel 77
3320     REPEATNOTCHAR:
3321     fc = *ecode++;
3322    
3323     /* The code is duplicated for the caseless and caseful cases, for speed,
3324     since matching characters is likely to be quite common. First, ensure the
3325     minimum number of matches are present. If min = max, continue at the same
3326     level without recursing. Otherwise, if minimizing, keep trying the rest of
3327     the expression and advancing one matching character if failing, up to the
3328     maximum. Alternatively, if maximizing, find the maximum number of
3329     characters and work backwards. */
3330    
3331     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3332     max, eptr));
3333    
3334 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3335 nigel 77 {
3336     fc = md->lcc[fc];
3337    
3338     #ifdef SUPPORT_UTF8
3339     /* UTF-8 mode */
3340     if (utf8)
3341     {
3342 nigel 93 register unsigned int d;
3343 nigel 77 for (i = 1; i <= min; i++)
3344     {
3345 ph10 426 if (eptr >= md->end_subject)
3346     {
3347     SCHECK_PARTIAL();
3348 ph10 510 MRRETURN(MATCH_NOMATCH);
3349 ph10 427 }
3350 nigel 77 GETCHARINC(d, eptr);
3351     if (d < 256) d = md->lcc[d];
3352 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3353 nigel 77 }
3354     }
3355     else
3356     #endif
3357    
3358     /* Not UTF-8 mode */
3359     {
3360     for (i = 1; i <= min; i++)
3361 ph10 426 {
3362     if (eptr >= md->end_subject)
3363     {
3364     SCHECK_PARTIAL();
3365 ph10 510 MRRETURN(MATCH_NOMATCH);
3366 ph10 427 }
3367 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3368 ph10 427 }
3369 nigel 77 }
3370    
3371     if (min == max) continue;
3372    
3373     if (minimize)
3374     {
3375     #ifdef SUPPORT_UTF8
3376     /* UTF-8 mode */
3377     if (utf8)
3378     {
3379 nigel 93 register unsigned int d;
3380 nigel 77 for (fi = min;; fi++)
3381     {
3382 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3383 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3384 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3385 ph10 427 if (eptr >= md->end_subject)
3386 ph10 426 {
3387 ph10 427 SCHECK_PARTIAL();
3388 ph10 510 MRRETURN(MATCH_NOMATCH);
3389 ph10 427 }
3390 nigel 77 GETCHARINC(d, eptr);
3391     if (d < 256) d = md->lcc[d];
3392 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3393 nigel 77 }
3394     }
3395     else
3396     #endif
3397     /* Not UTF-8 mode */
3398     {
3399     for (fi = min;; fi++)
3400     {
3401 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3402 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3403 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3404 ph10 426 if (eptr >= md->end_subject)
3405     {
3406     SCHECK_PARTIAL();
3407 ph10 510 MRRETURN(MATCH_NOMATCH);
3408 ph10 426 }
3409 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3410 nigel 77 }
3411     }
3412     /* Control never gets here */
3413     }
3414    
3415     /* Maximize case */
3416    
3417     else
3418     {
3419     pp = eptr;
3420    
3421     #ifdef SUPPORT_UTF8
3422     /* UTF-8 mode */
3423     if (utf8)
3424     {
3425 nigel 93 register unsigned int d;
3426 nigel 77 for (i = min; i < max; i++)
3427     {
3428     int len = 1;
3429 ph10 463 if (eptr >= md->end_subject)
3430 ph10 462 {
3431 ph10 463 SCHECK_PARTIAL();
3432 ph10 462 break;
3433 ph10 463 }
3434 nigel 77 GETCHARLEN(d, eptr, len);
3435     if (d < 256) d = md->lcc[d];
3436     if (fc == d) break;
3437     eptr += len;
3438     }
3439 nigel 93 if (possessive) continue;
3440     for(;;)
3441 nigel 77 {
3442 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3443 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3444     if (eptr-- == pp) break; /* Stop if tried at original pos */
3445     BACKCHAR(eptr);
3446     }
3447     }
3448     else
3449     #endif
3450     /* Not UTF-8 mode */
3451     {
3452     for (i = min; i < max; i++)
3453     {
3454 ph10 463 if (eptr >= md->end_subject)
3455 ph10 462 {
3456     SCHECK_PARTIAL();
3457     break;
3458 ph10 463 }
3459 ph10 462 if (fc == md->lcc[*eptr]) break;
3460 nigel 77 eptr++;
3461     }
3462 nigel 93 if (possessive) continue;
3463 nigel 77 while (eptr >= pp)
3464     {
3465 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3466 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3467     eptr--;
3468     }
3469     }
3470    
3471 ph10 510 MRRETURN(MATCH_NOMATCH);
3472 nigel 77 }
3473     /* Control never gets here */
3474     }
3475    
3476     /* Caseful comparisons */
3477    
3478     else
3479     {
3480     #ifdef SUPPORT_UTF8
3481     /* UTF-8 mode */
3482     if (utf8)
3483     {
3484 nigel 93 register unsigned int d;
3485 nigel 77 for (i = 1; i <= min; i++)
3486     {
3487 ph10 426 if (eptr >= md->end_subject)
3488     {
3489     SCHECK_PARTIAL();
3490 ph10 510 MRRETURN(MATCH_NOMATCH);
3491 ph10 427 }
3492 nigel 77 GETCHARINC(d, eptr);
3493 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3494 nigel 77 }
3495     }
3496     else
3497     #endif
3498     /* Not UTF-8 mode */
3499     {
3500     for (i = 1; i <= min; i++)
3501 ph10 426 {
3502     if (eptr >= md->end_subject)
3503     {
3504     SCHECK_PARTIAL();
3505 ph10 510 MRRETURN(MATCH_NOMATCH);
3506 ph10 427 }
3507 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3508 ph10 427 }
3509 nigel 77 }
3510    
3511     if (min == max) continue;
3512    
3513     if (minimize)
3514     {
3515     #ifdef SUPPORT_UTF8
3516     /* UTF-8 mode */
3517     if (utf8)
3518     {
3519 nigel 93 register unsigned int d;
3520 nigel 77 for (fi = min;; fi++)
3521     {
3522 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3523 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3524 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3525 ph10 427 if (eptr >= md->end_subject)
3526 ph10 426 {
3527 ph10 427 SCHECK_PARTIAL();
3528 ph10 510 MRRETURN(MATCH_NOMATCH);
3529 ph10 427 }
3530 nigel 77 GETCHARINC(d, eptr);
3531 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3532 nigel 77 }
3533     }
3534     else
3535     #endif
3536     /* Not UTF-8 mode */
3537     {
3538     for (fi = min;; fi++)
3539     {
3540 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3541 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3542 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3543 ph10 426 if (eptr >= md->end_subject)
3544     {
3545     SCHECK_PARTIAL();
3546 ph10 510 MRRETURN(MATCH_NOMATCH);
3547 ph10 427 }
3548 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3549 nigel 77 }
3550     }
3551     /* Control never gets here */
3552     }
3553    
3554     /* Maximize case */
3555    
3556     else
3557     {
3558     pp = eptr;
3559    
3560     #ifdef SUPPORT_UTF8
3561     /* UTF-8 mode */
3562     if (utf8)
3563     {
3564 nigel 93 register unsigned int d;
3565 nigel 77 for (i = min; i < max; i++)
3566     {
3567     int len = 1;
3568 ph10 463 if (eptr >= md->end_subject)
3569 ph10 462 {
3570 ph10 463 SCHECK_PARTIAL();
3571 ph10 462 break;
3572 ph10 463 }
3573 nigel 77 GETCHARLEN(d, eptr, len);
3574     if (fc == d) break;
3575     eptr += len;
3576     }
3577 nigel 93 if (possessive) continue;
3578 nigel 77 for(;;)
3579     {
3580 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3581 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3582     if (eptr-- == pp) break; /* Stop if tried at original pos */
3583     BACKCHAR(eptr);
3584     }
3585     }
3586     else
3587     #endif
3588     /* Not UTF-8 mode */
3589     {
3590     for (i = min; i < max; i++)
3591     {
3592 ph10 463 if (eptr >= md->end_subject)
3593 ph10 462 {
3594 ph10 463 SCHECK_PARTIAL();
3595 ph10 462 break;
3596 ph10 463 }
3597 ph10 462 if (fc == *eptr) break;
3598 nigel 77 eptr++;
3599     }
3600 nigel 93 if (possessive) continue;
3601 nigel 77 while (eptr >= pp)
3602     {
3603 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3604 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3605     eptr--;
3606     }
3607     }
3608    
3609 ph10 510 MRRETURN(MATCH_NOMATCH);
3610 nigel 77 }
3611     }
3612     /* Control never gets here */
3613    
3614     /* Match a single character type repeatedly; several different opcodes
3615     share code. This is very similar to the code for single characters, but we
3616     repeat it in the interests of efficiency. */
3617    
3618     case OP_TYPEEXACT:
3619     min = max = GET2(ecode, 1);
3620     minimize = TRUE;
3621     ecode += 3;
3622     goto REPEATTYPE;
3623    
3624     case OP_TYPEUPTO:
3625     case OP_TYPEMINUPTO:
3626     min = 0;
3627     max = GET2(ecode, 1);
3628     minimize = *ecode == OP_TYPEMINUPTO;
3629     ecode += 3;
3630     goto REPEATTYPE;
3631    
3632 nigel 93 case OP_TYPEPOSSTAR:
3633     possessive = TRUE;
3634     min = 0;
3635     max = INT_MAX;
3636     ecode++;
3637     goto REPEATTYPE;
3638    
3639     case OP_TYPEPOSPLUS:
3640     possessive = TRUE;
3641     min = 1;
3642     max = INT_MAX;
3643     ecode++;
3644     goto REPEATTYPE;
3645    
3646     case OP_TYPEPOSQUERY:
3647     possessive = TRUE;
3648     min = 0;
3649     max = 1;
3650     ecode++;
3651     goto REPEATTYPE;
3652    
3653     case OP_TYPEPOSUPTO:
3654     possessive = TRUE;
3655     min = 0;
3656     max = GET2(ecode, 1);
3657     ecode += 3;
3658     goto REPEATTYPE;
3659    
3660 nigel 77 case OP_TYPESTAR:
3661     case OP_TYPEMINSTAR:
3662     case OP_TYPEPLUS:
3663     case OP_TYPEMINPLUS:
3664     case OP_TYPEQUERY:
3665     case OP_TYPEMINQUERY:
3666     c = *ecode++ - OP_TYPESTAR;
3667     minimize = (c & 1) != 0;
3668     min = rep_min[c]; /* Pick up values from tables; */
3669     max = rep_max[c]; /* zero for max => infinity */
3670     if (max == 0) max = INT_MAX;
3671    
3672     /* Common code for all repeated single character type matches. Note that
3673     in UTF-8 mode, '.' matches a character of any length, but for the other
3674     character types, the valid characters are all one byte long. */
3675    
3676     REPEATTYPE:
3677     ctype = *ecode++; /* Code for the character type */
3678    
3679     #ifdef SUPPORT_UCP
3680     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3681     {
3682     prop_fail_result = ctype == OP_NOTPROP;
3683     prop_type = *ecode++;
3684 nigel 87 prop_value = *ecode++;
3685 nigel 77 }
3686     else prop_type = -1;
3687     #endif
3688    
3689     /* First, ensure the minimum number of matches are present. Use inline
3690     code for maximizing the speed, and do the type test once at the start
3691 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3692 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3693     and single-bytes. */
3694    
3695     if (min > 0)
3696     {
3697     #ifdef SUPPORT_UCP
3698 nigel 87 if (prop_type >= 0)
3699 nigel 77 {
3700 nigel 87 switch(prop_type)
3701 nigel 77 {
3702 nigel 87 case PT_ANY:
3703 ph10 510 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3704 nigel 87 for (i = 1; i <= min; i++)
3705     {
3706 ph10 427 if (eptr >= md->end_subject)
3707 ph10 426 {
3708 ph10 427 SCHECK_PARTIAL();
3709 ph10 510 MRRETURN(MATCH_NOMATCH);
3710 ph10 427 }
3711 ph10 184 GETCHARINCTEST(c, eptr);
3712 nigel 87 }
3713     break;
3714    
3715     case PT_LAMP:
3716     for (i = 1; i <= min; i++)
3717     {
3718 ph10 623 int chartype;
3719 ph10 427 if (eptr >= md->end_subject)
3720 ph10 426 {
3721 ph10 427 SCHECK_PARTIAL();
3722 ph10 510 MRRETURN(MATCH_NOMATCH);
3723 ph10 427 }
3724 ph10 184 GETCHARINCTEST(c, eptr);
3725 ph10 623 chartype = UCD_CHARTYPE(c);
3726     if ((chartype == ucp_Lu ||
3727     chartype == ucp_Ll ||
3728     chartype == ucp_Lt) == prop_fail_result)
3729 ph10 510 MRRETURN(MATCH_NOMATCH);
3730 nigel 87 }
3731     break;
3732    
3733     case PT_GC:
3734     for (i = 1; i <= min; i++)
3735     {
3736 ph10 427 if (eptr >= md->end_subject)
3737 ph10 426 {
3738 ph10 427 SCHECK_PARTIAL();
3739 ph10 510 MRRETURN(MATCH_NOMATCH);
3740 ph10 427 }
3741 ph10 184 GETCHARINCTEST(c, eptr);
3742 ph10 623 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3743 ph10 510 MRRETURN(MATCH_NOMATCH);
3744 nigel 87 }
3745     break;
3746    
3747     case PT_PC:
3748     for (i = 1; i <= min; i++)
3749     {
3750 ph10 427 if (eptr >= md->end_subject)
3751 ph10 426 {
3752 ph10 427 SCHECK_PARTIAL();
3753 ph10 510 MRRETURN(MATCH_NOMATCH);
3754 ph10 427 }
3755 ph10 184 GETCHARINCTEST(c, eptr);
3756 ph10 623 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3757 ph10 510 MRRETURN(MATCH_NOMATCH);
3758 nigel 87 }
3759     break;
3760    
3761     case PT_SC:
3762     for (i = 1; i <= min; i++)
3763     {
3764 ph10 427 if (eptr >= md->end_subject)
3765 ph10 426 {
3766 ph10 427 SCHECK_PARTIAL();
3767 ph10 510 MRRETURN(MATCH_NOMATCH);
3768 ph10 427 }
3769 ph10 184 GETCHARINCTEST(c, eptr);
3770 ph10 623 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3771 ph10 510 MRRETURN(MATCH_NOMATCH);
3772 nigel 87 }
3773     break;
3774 ph10 527
3775 ph10 517 case PT_ALNUM:
3776     for (i = 1; i <= min; i++)
3777     {
3778 ph10 623 int category;
3779 ph10 517 if (eptr >= md->end_subject)
3780     {
3781     SCHECK_PARTIAL();
3782     MRRETURN(MATCH_NOMATCH);
3783     }
3784     GETCHARINCTEST(c, eptr);
3785 ph10 623 category = UCD_CATEGORY(c);
3786     if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3787 ph10 517 MRRETURN(MATCH_NOMATCH);
3788     }
3789     break;
3790 ph10 527
3791 ph10 517 case PT_SPACE: /* Perl space */
3792     for (i = 1; i <= min; i++)
3793     {
3794     if (eptr >= md->end_subject)
3795     {
3796     SCHECK_PARTIAL();
3797     MRRETURN(MATCH_NOMATCH);
3798     }
3799     GETCHARINCTEST(c, eptr);
3800 ph10 623 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3801 ph10 527 c == CHAR_FF || c == CHAR_CR)
3802 ph10 517 == prop_fail_result)
3803     MRRETURN(MATCH_NOMATCH);
3804     }
3805     break;
3806 ph10 527
3807 ph10 517 case PT_PXSPACE: /* POSIX space */
3808     for (i = 1; i <= min; i++)
3809     {
3810     if (eptr >= md->end_subject)
3811     {
3812     SCHECK_PARTIAL();
3813     MRRETURN(MATCH_NOMATCH);
3814     }
3815     GETCHARINCTEST(c, eptr);
3816 ph10 623 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3817 ph10