/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 615 - (hide annotations) (download)
Mon Jul 11 14:23:06 2011 UTC (22 months, 1 week ago) by ph10
File MIME type: text/plain
File size: 195321 byte(s)
A better patch for the atomic capturing not resetting bug.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 ph10 604 /* Values for setting in md->match_function_type to indicate two special types
61     of call to match(). We do it this way to save on using another stack variable,
62     as stack usage is to be discouraged. */
63 nigel 77
64 ph10 604 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65     #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
74 ph10 210 negative to avoid the external error codes. */
75    
76 ph10 511 #define MATCH_ACCEPT (-999)
77     #define MATCH_COMMIT (-998)
78 ph10 604 #define MATCH_KETRPOS (-997)
79     #define MATCH_PRUNE (-996)
80     #define MATCH_SKIP (-995)
81     #define MATCH_SKIP_ARG (-994)
82     #define MATCH_THEN (-993)
83 ph10 210
84 ph10 510 /* This is a convenience macro for code that occurs many times. */
85    
86     #define MRRETURN(ra) \
87     { \
88     md->mark = markptr; \
89     RRETURN(ra); \
90     }
91    
92 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
93     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
94     because the offset vector is always a multiple of 3 long. */
95    
96     #define REC_STACK_SAVE_MAX 30
97    
98     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
99    
100     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
101     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
102    
103    
104    
105 ph10 475 #ifdef PCRE_DEBUG
106 nigel 77 /*************************************************
107     * Debugging function to print chars *
108     *************************************************/
109    
110     /* Print a sequence of chars in printable format, stopping at the end of the
111     subject if the requested.
112    
113     Arguments:
114     p points to characters
115     length number to print
116     is_subject TRUE if printing from within md->start_subject
117     md pointer to matching data block, if is_subject is TRUE
118    
119     Returns: nothing
120     */
121    
122     static void
123     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
124     {
125 nigel 93 unsigned int c;
126 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
127     while (length-- > 0)
128     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
129     }
130     #endif
131    
132    
133    
134     /*************************************************
135     * Match a back-reference *
136     *************************************************/
137    
138 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
139     negative, so the match always fails. However, in JavaScript compatibility mode,
140     the length passed is zero. Note that in caseless UTF-8 mode, the number of
141     subject bytes matched may be different to the number of reference bytes.
142 nigel 77
143     Arguments:
144     offset index into the offset vector
145 ph10 595 eptr pointer into the subject
146     length length of reference to be matched (number of bytes)
147 nigel 77 md points to match data block
148 ph10 602 caseless TRUE if caseless
149 nigel 77
150 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
151 nigel 77 */
152    
153 ph10 595 static int
154 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
155 ph10 602 BOOL caseless)
156 nigel 77 {
157 ph10 595 USPTR eptr_start = eptr;
158     register USPTR p = md->start_subject + md->offset_vector[offset];
159 nigel 77
160 ph10 475 #ifdef PCRE_DEBUG
161 nigel 77 if (eptr >= md->end_subject)
162     printf("matching subject <null>");
163     else
164     {
165     printf("matching subject ");
166     pchars(eptr, length, TRUE, md);
167     }
168     printf(" against backref ");
169     pchars(p, length, FALSE, md);
170     printf("\n");
171     #endif
172    
173 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
174 nigel 77
175 ph10 595 if (length < 0) return -1;
176 nigel 77
177 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
178     properly if Unicode properties are supported. Otherwise, we can check only
179     ASCII characters. */
180 nigel 77
181 ph10 602 if (caseless)
182 nigel 77 {
183 ph10 354 #ifdef SUPPORT_UTF8
184     #ifdef SUPPORT_UCP
185     if (md->utf8)
186     {
187 ph10 595 /* Match characters up to the end of the reference. NOTE: the number of
188     bytes matched may differ, because there are some characters whose upper and
189     lower case versions code as different numbers of bytes. For example, U+023A
190     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
191     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
192     the latter. It is important, therefore, to check the length along the
193     reference, not along the subject (earlier code did this wrong). */
194    
195     USPTR endptr = p + length;
196     while (p < endptr)
197 ph10 354 {
198 ph10 358 int c, d;
199 ph10 597 if (eptr >= md->end_subject) return -1;
200 ph10 354 GETCHARINC(c, eptr);
201     GETCHARINC(d, p);
202 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
203 ph10 358 }
204     }
205 ph10 354 else
206     #endif
207     #endif
208    
209     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
210     is no UCP support. */
211 ph10 597 {
212     if (eptr + length > md->end_subject) return -1;
213     while (length-- > 0)
214     { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
215     }
216 nigel 77 }
217 ph10 358
218 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
219     are in UTF-8 mode. */
220 ph10 358
221 nigel 77 else
222 ph10 597 {
223     if (eptr + length > md->end_subject) return -1;
224     while (length-- > 0) if (*p++ != *eptr++) return -1;
225     }
226 nigel 77
227 ph10 595 return eptr - eptr_start;
228 nigel 77 }
229    
230    
231    
232     /***************************************************************************
233     ****************************************************************************
234     RECURSION IN THE match() FUNCTION
235    
236 nigel 87 The match() function is highly recursive, though not every recursive call
237     increases the recursive depth. Nevertheless, some regular expressions can cause
238     it to recurse to a great depth. I was writing for Unix, so I just let it call
239     itself recursively. This uses the stack for saving everything that has to be
240     saved for a recursive call. On Unix, the stack can be large, and this works
241     fine.
242 nigel 77
243 nigel 87 It turns out that on some non-Unix-like systems there are problems with
244     programs that use a lot of stack. (This despite the fact that every last chip
245     has oodles of memory these days, and techniques for extending the stack have
246     been known for decades.) So....
247 nigel 77
248     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
249     calls by keeping local variables that need to be preserved in blocks of memory
250 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
251 nigel 77 achieve this so that the actual code doesn't look very different to what it
252     always used to.
253 ph10 164
254 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
255 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
256     Switzer, the use of longjmp() has been abolished, at the cost of having to
257     provide a unique number for each call to RMATCH. There is no way of generating
258     a sequence of numbers at compile time in C. I have given them names, to make
259     them stand out more clearly.
260    
261     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
262     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
263 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
264     don't have indeterminate values; this has meant that the frame size can be
265 ph10 164 reduced because the result can be "passed back" by straight setting of the
266     variable instead of being passed in the frame.
267 nigel 77 ****************************************************************************
268     ***************************************************************************/
269    
270 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
271     below must be updated in sync. */
272 nigel 77
273 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
274     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
275     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
276     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
277 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
278 ph10 527 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
279 ph10 609 RM61, RM62, RM63};
280 ph10 164
281 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
282 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
283 ph10 501 actually used in this definition. */
284 nigel 77
285     #ifndef NO_RECURSE
286     #define REGISTER register
287 ph10 164
288 ph10 475 #ifdef PCRE_DEBUG
289 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
290 nigel 87 { \
291     printf("match() called in line %d\n", __LINE__); \
292 ph10 604 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
293 nigel 87 printf("to line %d\n", __LINE__); \
294     }
295     #define RRETURN(ra) \
296     { \
297     printf("match() returned %d from line %d ", ra, __LINE__); \
298     return ra; \
299     }
300     #else
301 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
302     rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
303 nigel 77 #define RRETURN(ra) return ra
304 nigel 87 #endif
305    
306 nigel 77 #else
307    
308    
309 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
310     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
311     argument of match(), which never changes. */
312 nigel 77
313     #define REGISTER
314    
315 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw)\
316 nigel 77 {\
317 ph10 563 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
318 ph10 534 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
319 ph10 164 frame->Xwhere = rw; \
320     newframe->Xeptr = ra;\
321     newframe->Xecode = rb;\
322 ph10 168 newframe->Xmstart = mstart;\
323 ph10 501 newframe->Xmarkptr = markptr;\
324 ph10 164 newframe->Xoffset_top = rc;\
325 ph10 602 newframe->Xeptrb = re;\
326 ph10 164 newframe->Xrdepth = frame->Xrdepth + 1;\
327     newframe->Xprevframe = frame;\
328     frame = newframe;\
329     DPRINTF(("restarting from line %d\n", __LINE__));\
330     goto HEAP_RECURSE;\
331     L_##rw:\
332     DPRINTF(("jumped back to line %d\n", __LINE__));\
333 nigel 77 }
334    
335     #define RRETURN(ra)\
336     {\
337 ph10 527 heapframe *oldframe = frame;\
338     frame = oldframe->Xprevframe;\
339     (pcre_stack_free)(oldframe);\
340 nigel 77 if (frame != NULL)\
341     {\
342 ph10 164 rrc = ra;\
343     goto HEAP_RETURN;\
344 nigel 77 }\
345     return ra;\
346     }
347    
348    
349     /* Structure for remembering the local variables in a private frame. One
frame stands for one simulated call of match() when NO_RECURSE is defined.
Frames are obtained with pcre_stack_malloc (by match() for the top level and
by RMATCH for each nested call) and released with pcre_stack_free in RRETURN.
Inside match(), macros map the ordinary variable names onto these X-prefixed
fields of the current frame. */
350    
351     typedef struct heapframe {
352     struct heapframe *Xprevframe; /* Frame to go back to; NULL marks the top level */
353    
354     /* Function arguments that may change. RMATCH fills these in for a new
frame; the md argument is not stored because it never changes. */
355    
356 ph10 409 USPTR Xeptr;
357 nigel 77 const uschar *Xecode;
358 ph10 409 USPTR Xmstart;
359 ph10 501 USPTR Xmarkptr;
360 nigel 77 int Xoffset_top;
361     eptrblock *Xeptrb;
362 nigel 91 unsigned int Xrdepth; /* RMATCH sets this to the previous frame's value + 1 */
363 nigel 77
364     /* Function local variables */
365    
366 ph10 409 USPTR Xcallpat;
367 ph10 406 #ifdef SUPPORT_UTF8
368 ph10 409 USPTR Xcharptr;
369 ph10 406 #endif
370 ph10 409 USPTR Xdata;
371     USPTR Xnext;
372     USPTR Xpp;
373     USPTR Xprev;
374     USPTR Xsaved_eptr;
375 nigel 77
376     recursion_info Xnew_recursive;
377    
378     BOOL Xcur_is_word; /* Also used in match() under the alias allow_zero */
379     BOOL Xcondition; /* Also used under the aliases cbegroup and condassert */
380     BOOL Xprev_is_word; /* Also used under the alias matched_once */
381    
382     #ifdef SUPPORT_UCP
383     int Xprop_type;
384 nigel 87 int Xprop_value;
385 nigel 77 int Xprop_fail_result;
386     int Xprop_category;
387     int Xprop_chartype;
388 nigel 87 int Xprop_script;
389 ph10 123 int Xoclength;
390     uschar Xocchars[8];
391 nigel 77 #endif
392    
393 ph10 403 int Xcodelink; /* Also used under the alias code_offset */
394 nigel 77 int Xctype;
395 nigel 93 unsigned int Xfc;
396 nigel 77 int Xfi;
397     int Xlength;
398     int Xmax;
399     int Xmin;
400     int Xnumber;
401     int Xoffset;
402     int Xop;
403     int Xsave_capture_last;
404     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
405     int Xstacksave[REC_STACK_SAVE_MAX];
406    
407     eptrblock Xnewptrb;
408    
409 ph10 164 /* Where to jump back to. RMATCH stores its RMn argument here before
starting the nested call; presumably the code at HEAP_RETURN (not visible in
this part of the file) selects the matching L_RMn label from it. */
410 nigel 77
411 ph10 164 int Xwhere;
412 ph10 165
413 nigel 77 } heapframe;
414    
415     #endif
416    
417    
418     /***************************************************************************
419     ***************************************************************************/
420    
421    
422    
423     /*************************************************
424     * Match from current position *
425     *************************************************/
426    
427 nigel 93 /* This function is called recursively in many circumstances. Whenever it
428 nigel 77 returns a negative (error) response, the outer incarnation must also return the
429 ph10 426 same response. */
430 nigel 77
431 ph10 426 /* These macros pack up tests that are used for partial matching, and which
432     appear several times in the code. We set the "hit end" flag if the pointer is
433     at the end of the subject and also past the start of the subject (i.e.
434 ph10 427 something has been matched). For hard partial matching, we then return
435     immediately. The second one is used when we already know we are past the end of
436     the subject. */
437 ph10 426
438     #define CHECK_PARTIAL()\
439 ph10 553 if (md->partial != 0 && eptr >= md->end_subject && \
440     eptr > md->start_used_ptr) \
441     { \
442     md->hitend = TRUE; \
443     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
444 ph10 427 }
445 ph10 426
446     #define SCHECK_PARTIAL()\
447 ph10 553 if (md->partial != 0 && eptr > md->start_used_ptr) \
448     { \
449     md->hitend = TRUE; \
450     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
451 ph10 427 }
452 ph10 426
453 ph10 427
454 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
455     the md structure (e.g. utf8, end_subject) into individual variables to improve
456 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
457     made performance worse.
458    
459     Arguments:
460 nigel 93 eptr pointer to current character in subject
461     ecode pointer to current position in compiled code
462 ph10 168 mstart pointer to the current match start position (can be modified
463 ph10 172 by encountering \K)
464 ph10 501 markptr pointer to the most recent MARK name, or NULL
465 nigel 77 offset_top current top pointer
466     md pointer to "static" info for the match
467     eptrb pointer to chain of blocks containing eptr at start of
468     brackets - for testing for empty matches
469 nigel 87 rdepth the recursion depth
470 nigel 77
471     Returns: MATCH_MATCH if matched ) these values are >= 0
472     MATCH_NOMATCH if failed to match )
473 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
474 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
475 nigel 87 (e.g. stopped by repeated call or recursion limit)
476 nigel 77 */
477    
478     static int
479 ph10 510 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
480 ph10 602 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
481 ph10 604 unsigned int rdepth)
482 nigel 77 {
483     /* These variables do not need to be preserved over recursion in this function,
484 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
485     "register" because they are used a lot in loops. */
486 nigel 77
487 nigel 91 register int rrc; /* Returns from recursive calls */
488     register int i; /* Used for loops not involving calls to RMATCH() */
489 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
490 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
491 nigel 77
492 nigel 93 BOOL minimize, possessive; /* Quantifier options */
493 ph10 602 BOOL caseless;
494 ph10 403 int condcode;
495 nigel 93
496 nigel 77 /* When recursion is not being used, all "local" variables that have to be
497     preserved over calls to RMATCH() are part of a "frame" which is obtained from
498     heap storage. Set up the top-level frame here; others are obtained from the
499     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
500    
501     #ifdef NO_RECURSE
502 ph10 563 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
503 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
504 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
505    
506     /* Copy in the original argument variables */
507    
508     frame->Xeptr = eptr;
509     frame->Xecode = ecode;
510 ph10 168 frame->Xmstart = mstart;
511 ph10 501 frame->Xmarkptr = markptr;
512 nigel 77 frame->Xoffset_top = offset_top;
513     frame->Xeptrb = eptrb;
514 nigel 87 frame->Xrdepth = rdepth;
515 nigel 77
516     /* This is where control jumps back in order to effect "recursion" */
517    
518     HEAP_RECURSE:
519    
520     /* Macros make the argument variables come from the current frame */
521    
522     #define eptr frame->Xeptr
523     #define ecode frame->Xecode
524 ph10 168 #define mstart frame->Xmstart
525 ph10 501 #define markptr frame->Xmarkptr
526 nigel 77 #define offset_top frame->Xoffset_top
527     #define eptrb frame->Xeptrb
528 nigel 87 #define rdepth frame->Xrdepth
529 nigel 77
530     /* Ditto for the local variables */
531    
532     #ifdef SUPPORT_UTF8
533     #define charptr frame->Xcharptr
534     #endif
535     #define callpat frame->Xcallpat
536 ph10 403 #define codelink frame->Xcodelink
537 nigel 77 #define data frame->Xdata
538     #define next frame->Xnext
539     #define pp frame->Xpp
540     #define prev frame->Xprev
541     #define saved_eptr frame->Xsaved_eptr
542    
543     #define new_recursive frame->Xnew_recursive
544    
545     #define cur_is_word frame->Xcur_is_word
546     #define condition frame->Xcondition
547     #define prev_is_word frame->Xprev_is_word
548    
549     #ifdef SUPPORT_UCP
550     #define prop_type frame->Xprop_type
551 nigel 87 #define prop_value frame->Xprop_value
552 nigel 77 #define prop_fail_result frame->Xprop_fail_result
553     #define prop_category frame->Xprop_category
554     #define prop_chartype frame->Xprop_chartype
555 nigel 87 #define prop_script frame->Xprop_script
556 ph10 115 #define oclength frame->Xoclength
557     #define occhars frame->Xocchars
558 nigel 77 #endif
559    
560     #define ctype frame->Xctype
561     #define fc frame->Xfc
562     #define fi frame->Xfi
563     #define length frame->Xlength
564     #define max frame->Xmax
565     #define min frame->Xmin
566     #define number frame->Xnumber
567     #define offset frame->Xoffset
568     #define op frame->Xop
569     #define save_capture_last frame->Xsave_capture_last
570     #define save_offset1 frame->Xsave_offset1
571     #define save_offset2 frame->Xsave_offset2
572     #define save_offset3 frame->Xsave_offset3
573     #define stacksave frame->Xstacksave
574    
575     #define newptrb frame->Xnewptrb
576    
577     /* When recursion is being used, local variables are allocated on the stack and
578     get preserved during recursion in the normal way. In this environment, fi and
579     i, and fc and c, can be the same variables. */
580    
581 nigel 93 #else /* NO_RECURSE not defined */
582 nigel 77 #define fi i
583     #define fc c
584    
585 ph10 604 /* Many of the following variables are used only in small blocks of the code.
586     My normal style of coding would have declared them within each of those blocks.
587     However, in order to accommodate the version of this code that uses an external
588     "stack" implemented on the heap, it is easier to declare them all here, so the
589     declarations can be cut out in a block. The only declarations within blocks
590     below are for variables that do not have to be preserved over a recursive call
591     to RMATCH(). */
592 nigel 77
593 ph10 604 #ifdef SUPPORT_UTF8
594     const uschar *charptr;
595     #endif
596     const uschar *callpat;
597     const uschar *data;
598     const uschar *next;
599     USPTR pp;
600     const uschar *prev;
601     USPTR saved_eptr;
602    
603     recursion_info new_recursive;
604    
605     BOOL cur_is_word;
606 nigel 87 BOOL condition;
607 nigel 77 BOOL prev_is_word;
608    
609     #ifdef SUPPORT_UCP
610     int prop_type;
611 nigel 87 int prop_value;
612 nigel 77 int prop_fail_result;
613     int prop_category;
614     int prop_chartype;
615 nigel 87 int prop_script;
616 ph10 115 int oclength;
617     uschar occhars[8];
618 nigel 77 #endif
619    
620 ph10 399 int codelink;
621 nigel 77 int ctype;
622     int length;
623     int max;
624     int min;
625     int number;
626     int offset;
627     int op;
628     int save_capture_last;
629     int save_offset1, save_offset2, save_offset3;
630     int stacksave[REC_STACK_SAVE_MAX];
631    
632     eptrblock newptrb;
633 nigel 93 #endif /* NO_RECURSE */
634 nigel 77
635 ph10 604 /* To save space on the stack and in the heap frame, I have doubled up on some
636     of the local variables that are used only in localised parts of the code, but
637     still need to be preserved over recursive calls of match(). These macros define
638     the alternative names that are used. */
639    
640     #define allow_zero cur_is_word
641     #define cbegroup condition
642     #define code_offset codelink
643     #define condassert condition
644     #define matched_once prev_is_word
645    
646 nigel 77 /* These statements are here to stop the compiler complaining about uninitialized
647     variables. */
648    
649     #ifdef SUPPORT_UCP
650 nigel 87 prop_value = 0;
651 nigel 77 prop_fail_result = 0;
652     #endif
653    
654 nigel 93
655 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
656     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
657     used. Thanks to Ian Taylor for noticing this possibility and sending the
658     original patch. */
659    
660     TAIL_RECURSE:
661    
662 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
663     are specified by the macro RMATCH and RRETURN is used to return. When
664     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
665 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
666 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
667     complicated macro. It has to be used in one particular way. This shouldn't,
668     however, impact performance when true recursion is being used. */
669 nigel 77
670 ph10 164 #ifdef SUPPORT_UTF8
671     utf8 = md->utf8; /* Local copy of the flag */
672     #else
673     utf8 = FALSE;
674     #endif
675    
676 nigel 87 /* First check that we haven't called match() too many times, or that we
677     haven't exceeded the recursive call limit. */
678    
679 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
680 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
681 nigel 77
682 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
683 ph10 604 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
684     done this way to save having to use another function argument, which would take
685     up space on the stack. See also MATCH_CONDASSERT below.
686 nigel 77
687 ph10 604 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
688     such remembered pointers, to be checked when we hit the closing ket, in order
689     to break infinite loops that match no characters. When match() is called in
690     other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
691     NOT be used with tail recursion, because the memory block that is used is on
692     the stack, so a new one may be required for each match(). */
693    
694     if (md->match_function_type == MATCH_CBEGROUP)
695 nigel 77 {
696 ph10 197 newptrb.epb_saved_eptr = eptr;
697     newptrb.epb_prev = eptrb;
698     eptrb = &newptrb;
699 ph10 604 md->match_function_type = 0;
700 nigel 77 }
701    
702 nigel 93 /* Now start processing the opcodes. */
703 nigel 77
704     for (;;)
705     {
706 nigel 93 minimize = possessive = FALSE;
707 nigel 77 op = *ecode;
708 ph10 604
709 nigel 93 switch(op)
710     {
711 ph10 510 case OP_MARK:
712     markptr = ecode + 2;
713     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
714 ph10 604 eptrb, RM55);
715 ph10 512
716     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
717     argument, and we must check whether that argument matches this MARK's
718     argument. It is passed back in md->start_match_ptr (an overloading of that
719     variable). If it does match, we reset that variable to the current subject
720     position and return MATCH_SKIP. Otherwise, pass back the return code
721 ph10 510 unaltered. */
722 ph10 512
723     if (rrc == MATCH_SKIP_ARG &&
724 ph10 510 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
725     {
726     md->start_match_ptr = eptr;
727     RRETURN(MATCH_SKIP);
728     }
729    
730 ph10 512 if (md->mark == NULL) md->mark = markptr;
731 ph10 510 RRETURN(rrc);
732    
733 ph10 210 case OP_FAIL:
734 ph10 510 MRRETURN(MATCH_NOMATCH);
735 ph10 211
736 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
737 ph10 553
738 ph10 510 case OP_COMMIT:
739     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
740 ph10 604 eptrb, RM52);
741 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
742 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
743     rrc != MATCH_THEN)
744 ph10 551 RRETURN(rrc);
745 ph10 510 MRRETURN(MATCH_COMMIT);
746    
747 ph10 551 /* PRUNE overrides THEN */
748 ph10 553
749 ph10 210 case OP_PRUNE:
750     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
751 ph10 604 eptrb, RM51);
752 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
753 ph10 510 MRRETURN(MATCH_PRUNE);
754 ph10 211
755 ph10 510 case OP_PRUNE_ARG:
756     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 ph10 604 eptrb, RM56);
758 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
759 ph10 510 md->mark = ecode + 2;
760     RRETURN(MATCH_PRUNE);
761 ph10 211
762 ph10 551 /* SKIP overrides PRUNE and THEN */
763 ph10 553
764 ph10 210 case OP_SKIP:
765     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
766 ph10 604 eptrb, RM53);
767 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 ph10 551 RRETURN(rrc);
769 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
770 ph10 510 MRRETURN(MATCH_SKIP);
771 ph10 211
772 ph10 510 case OP_SKIP_ARG:
773     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
774 ph10 604 eptrb, RM57);
775 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
776 ph10 551 RRETURN(rrc);
777 ph10 512
778     /* Pass back the current skip name by overloading md->start_match_ptr and
779     returning the special MATCH_SKIP_ARG return code. This will either be
780     caught by a matching MARK, or get to the top, where it is treated the same
781 ph10 510 as PRUNE. */
782 ph10 512
783 ph10 510 md->start_match_ptr = ecode + 2;
784 ph10 512 RRETURN(MATCH_SKIP_ARG);
785 ph10 553
786 ph10 550 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
787 ph10 553 the alt that is at the start of the current branch. This makes it possible
788     to skip back past alternatives that precede the THEN within the current
789     branch. */
790 ph10 512
791 ph10 210 case OP_THEN:
792     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
793 ph10 604 eptrb, RM54);
794 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
795 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
796 ph10 510 MRRETURN(MATCH_THEN);
797    
798     case OP_THEN_ARG:
799 ph10 553 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
800 ph10 604 offset_top, md, eptrb, RM58);
801 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
803     md->mark = ecode + LINK_SIZE + 2;
804 ph10 212 RRETURN(MATCH_THEN);
805 ph10 211
806 ph10 604 /* Handle a capturing bracket, other than those that are possessive with an
807     unlimited repeat. If there is space in the offset vector, save the current
808     subject position in the working slot at the top of the vector. We mustn't
809     change the current values of the data slot, because they may be set from a
810     previous iteration of this group, and be referred to by a reference inside
811     the group. If we fail to match, we need to restore this value and also the
812 nigel 93 values of the final offsets, in case they were set by a previous iteration
813     of the same bracket.
814 nigel 77
815 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
816     a non-capturing bracket. Don't worry about setting the flag for the error
817     case here; that is handled in the code for KET. */
818 nigel 77
819 nigel 93 case OP_CBRA:
820     case OP_SCBRA:
821     number = GET2(ecode, 1+LINK_SIZE);
822 nigel 77 offset = number << 1;
823 ph10 604
824 ph10 475 #ifdef PCRE_DEBUG
825 nigel 93 printf("start bracket %d\n", number);
826     printf("subject=");
827 nigel 77 pchars(eptr, 16, TRUE, md);
828     printf("\n");
829     #endif
830    
831     if (offset < md->offset_max)
832     {
833     save_offset1 = md->offset_vector[offset];
834     save_offset2 = md->offset_vector[offset+1];
835     save_offset3 = md->offset_vector[md->offset_end - number];
836     save_capture_last = md->capture_last;
837    
838     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
839 ph10 531 md->offset_vector[md->offset_end - number] =
840 ph10 530 (int)(eptr - md->start_subject);
841 nigel 77
842 ph10 604 for (;;)
843 nigel 77 {
844 ph10 604 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
845     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
846     eptrb, RM1);
847 ph10 550 if (rrc != MATCH_NOMATCH &&
848     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
849     RRETURN(rrc);
850 nigel 77 md->capture_last = save_capture_last;
851     ecode += GET(ecode, 1);
852 ph10 604 if (*ecode != OP_ALT) break;
853 nigel 77 }
854    
855     DPRINTF(("bracket %d failed\n", number));
856    
857     md->offset_vector[offset] = save_offset1;
858     md->offset_vector[offset+1] = save_offset2;
859     md->offset_vector[md->offset_end - number] = save_offset3;
860    
861 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
862 nigel 77 RRETURN(MATCH_NOMATCH);
863     }
864    
865 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
866     as a non-capturing bracket. */
867 nigel 77
868 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
869     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870    
871 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
872 nigel 77
873 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
874     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
875    
876 ph10 604 /* Non-capturing bracket, except for possessive with unlimited repeat. Loop
877     for all the alternatives. When we get to the final alternative within the
878 ph10 609 brackets, we used to return the result of a recursive call to match()
879     whatever happened so it was possible to reduce stack usage by turning this
880     into a tail recursion, except in the case of a possibly empty group.
881     However, now that there is the possibility of (*THEN) occurring in the final
882     alternative, this optimization is no longer possible. */
883 nigel 77
884 nigel 93 case OP_BRA:
885     case OP_SBRA:
886     DPRINTF(("start non-capturing bracket\n"));
887 nigel 91 for (;;)
888 nigel 77 {
889 ph10 604 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
890 ph10 602 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
891 ph10 604 RM2);
892 ph10 550 if (rrc != MATCH_NOMATCH &&
893     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
894     RRETURN(rrc);
895 nigel 77 ecode += GET(ecode, 1);
896 ph10 609 if (*ecode != OP_ALT) break;
897 nigel 77 }
898    
899 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
900     RRETURN(MATCH_NOMATCH);
901    
902 ph10 604 /* Handle possessive capturing brackets with an unlimited repeat. We come
903     here from BRAZERO with allow_zero set TRUE. The offset_vector values are
904     handled similarly to the normal case above. However, the matching is
905     different. The end of these brackets will always be OP_KETRPOS, which
906     returns MATCH_KETRPOS without going further in the pattern. By this means
907     we can handle the group by iteration rather than recursion, thereby
908     reducing the amount of stack needed. */
909    
910     case OP_CBRAPOS:
911     case OP_SCBRAPOS:
912     allow_zero = FALSE;
913    
914     POSSESSIVE_CAPTURE:
915     number = GET2(ecode, 1+LINK_SIZE);
916     offset = number << 1;
917    
918     #ifdef PCRE_DEBUG
919     printf("start possessive bracket %d\n", number);
920     printf("subject=");
921     pchars(eptr, 16, TRUE, md);
922     printf("\n");
923     #endif
924    
925     if (offset < md->offset_max)
926     {
927     matched_once = FALSE;
928     code_offset = ecode - md->start_code;
929    
930     save_offset1 = md->offset_vector[offset];
931     save_offset2 = md->offset_vector[offset+1];
932     save_offset3 = md->offset_vector[md->offset_end - number];
933     save_capture_last = md->capture_last;
934    
935     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
936    
937     /* Each time round the loop, save the current subject position for use
938     when the group matches. For MATCH_MATCH, the group has matched, so we
939     restart it with a new subject starting position, remembering that we had
940     at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
941     usual. If we haven't matched any alternatives in any iteration, check to
942     see if a previous iteration matched. If so, the group has matched;
943     continue from afterwards. Otherwise it has failed; restore the previous
944     capture values before returning NOMATCH. */
945    
946     for (;;)
947     {
948     md->offset_vector[md->offset_end - number] =
949     (int)(eptr - md->start_subject);
950     if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
951     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
952     eptrb, RM63);
953     if (rrc == MATCH_KETRPOS)
954     {
955     offset_top = md->end_offset_top;
956     eptr = md->end_match_ptr;
957     ecode = md->start_code + code_offset;
958     save_capture_last = md->capture_last;
959     matched_once = TRUE;
960     continue;
961     }
962     if (rrc != MATCH_NOMATCH &&
963     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
964     RRETURN(rrc);
965     md->capture_last = save_capture_last;
966     ecode += GET(ecode, 1);
967     if (*ecode != OP_ALT) break;
968     }
969 ph10 610
970 ph10 604 if (!matched_once)
971     {
972     md->offset_vector[offset] = save_offset1;
973     md->offset_vector[offset+1] = save_offset2;
974     md->offset_vector[md->offset_end - number] = save_offset3;
975     }
976    
977 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
978 ph10 604 if (allow_zero || matched_once)
979     {
980     ecode += 1 + LINK_SIZE;
981     break;
982     }
983    
984     RRETURN(MATCH_NOMATCH);
985     }
986    
987     /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
988     as a non-capturing bracket. */
989    
990     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
991     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
992    
993     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
994    
995     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
996     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
997    
998     /* Non-capturing possessive bracket with unlimited repeat. We come here
999     from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1000     without the capturing complication. It is written out separately for speed
1001     and cleanliness. */
1002    
1003     case OP_BRAPOS:
1004     case OP_SBRAPOS:
1005     allow_zero = FALSE;
1006    
1007     POSSESSIVE_NON_CAPTURE:
1008     matched_once = FALSE;
1009     code_offset = ecode - md->start_code;
1010    
1011     for (;;)
1012     {
1013     if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1014     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1015 ph10 609 eptrb, RM48);
1016 ph10 604 if (rrc == MATCH_KETRPOS)
1017     {
1018 ph10 610 offset_top = md->end_offset_top;
1019 ph10 604 eptr = md->end_match_ptr;
1020     ecode = md->start_code + code_offset;
1021     matched_once = TRUE;
1022     continue;
1023     }
1024     if (rrc != MATCH_NOMATCH &&
1025     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1026     RRETURN(rrc);
1027     ecode += GET(ecode, 1);
1028     if (*ecode != OP_ALT) break;
1029     }
1030 ph10 610
1031 ph10 604 if (matched_once || allow_zero)
1032     {
1033     ecode += 1 + LINK_SIZE;
1034     break;
1035     }
1036     RRETURN(MATCH_NOMATCH);
1037    
1038     /* Control never reaches here. */
1039    
1040 nigel 77 /* Conditional group: compilation checked that there are no more than
1041     two branches. If the condition is false, skipping the first branch takes us
1042     past the end if there is only one branch, but that's OK because that is
1043 ph10 609 exactly what going to the ket would do. */
1044 nigel 77
1045     case OP_COND:
1046 nigel 93 case OP_SCOND:
1047 ph10 604 codelink = GET(ecode, 1);
1048 ph10 406
1049 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
1050     inserted between OP_COND and an assertion condition. */
1051 ph10 392
1052 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1053     {
1054     if (pcre_callout != NULL)
1055     {
1056     pcre_callout_block cb;
1057     cb.version = 1; /* Version 1 of the callout block */
1058     cb.callout_number = ecode[LINK_SIZE+2];
1059     cb.offset_vector = md->offset_vector;
1060     cb.subject = (PCRE_SPTR)md->start_subject;
1061 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1062     cb.start_match = (int)(mstart - md->start_subject);
1063     cb.current_position = (int)(eptr - md->start_subject);
1064 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1065     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1066     cb.capture_top = offset_top/2;
1067     cb.capture_last = md->capture_last;
1068     cb.callout_data = md->callout_data;
1069 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1070 ph10 381 if (rrc < 0) RRETURN(rrc);
1071     }
1072     ecode += _pcre_OP_lengths[OP_CALLOUT];
1073     }
1074 ph10 392
1075 ph10 399 condcode = ecode[LINK_SIZE+1];
1076 ph10 406
1077 ph10 381 /* Now see what the actual condition is */
1078 ph10 392
1079 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1080 nigel 77 {
1081 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
1082     {
1083 ph10 461 condition = FALSE;
1084     ecode += GET(ecode, 1);
1085     }
1086 ph10 459 else
1087 ph10 461 {
1088 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1089     condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1090 ph10 461
1091 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
1092     false, but the test was set up by name, scan the table to see if the
1093     name refers to any other numbers, and test them. The condition is true
1094     if any one is set. */
1095 ph10 461
1096 ph10 459 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1097     {
1098     uschar *slotA = md->name_table;
1099     for (i = 0; i < md->name_count; i++)
1100 ph10 461 {
1101     if (GET2(slotA, 0) == recno) break;
1102 ph10 459 slotA += md->name_entry_size;
1103     }
1104 ph10 461
1105 ph10 459 /* Found a name for the number - there can be only one; duplicate
1106     names for different numbers are allowed, but not vice versa. First
1107     scan down for duplicates. */
1108 ph10 461
1109 ph10 459 if (i < md->name_count)
1110 ph10 461 {
1111 ph10 459 uschar *slotB = slotA;
1112     while (slotB > md->name_table)
1113     {
1114     slotB -= md->name_entry_size;
1115     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1116     {
1117     condition = GET2(slotB, 0) == md->recursive->group_num;
1118 ph10 461 if (condition) break;
1119     }
1120 ph10 459 else break;
1121 ph10 461 }
1122    
1123 ph10 459 /* Scan up for duplicates */
1124 ph10 461
1125 ph10 459 if (!condition)
1126 ph10 461 {
1127 ph10 459 slotB = slotA;
1128     for (i++; i < md->name_count; i++)
1129     {
1130     slotB += md->name_entry_size;
1131     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1132     {
1133     condition = GET2(slotB, 0) == md->recursive->group_num;
1134     if (condition) break;
1135 ph10 461 }
1136 ph10 459 else break;
1137 ph10 461 }
1138     }
1139 ph10 459 }
1140 ph10 461 }
1141    
1142 ph10 459 /* Choose branch according to the condition */
1143 ph10 461
1144 ph10 459 ecode += condition? 3 : GET(ecode, 1);
1145     }
1146 ph10 461 }
1147 nigel 93
1148 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1149 nigel 93 {
1150 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1151 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1152 ph10 461
1153 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1154 ph10 461 scan the table to see if the name refers to any other numbers, and test
1155     them. The condition is true if any one is set. This is tediously similar
1156     to the code above, but not close enough to try to amalgamate. */
1157    
1158 ph10 459 if (!condition && condcode == OP_NCREF)
1159     {
1160 ph10 461 int refno = offset >> 1;
1161 ph10 459 uschar *slotA = md->name_table;
1162 ph10 461
1163 ph10 459 for (i = 0; i < md->name_count; i++)
1164 ph10 461 {
1165     if (GET2(slotA, 0) == refno) break;
1166 ph10 459 slotA += md->name_entry_size;
1167     }
1168 ph10 461
1169     /* Found a name for the number - there can be only one; duplicate names
1170     for different numbers are allowed, but not vice versa. First scan down
1171 ph10 459 for duplicates. */
1172 ph10 461
1173 ph10 459 if (i < md->name_count)
1174 ph10 461 {
1175 ph10 459 uschar *slotB = slotA;
1176     while (slotB > md->name_table)
1177     {
1178     slotB -= md->name_entry_size;
1179     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1180     {
1181     offset = GET2(slotB, 0) << 1;
1182 ph10 461 condition = offset < offset_top &&
1183 ph10 459 md->offset_vector[offset] >= 0;
1184 ph10 461 if (condition) break;
1185     }
1186 ph10 459 else break;
1187 ph10 461 }
1188    
1189 ph10 459 /* Scan up for duplicates */
1190 ph10 461
1191 ph10 459 if (!condition)
1192 ph10 461 {
1193 ph10 459 slotB = slotA;
1194     for (i++; i < md->name_count; i++)
1195     {
1196     slotB += md->name_entry_size;
1197     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1198     {
1199     offset = GET2(slotB, 0) << 1;
1200 ph10 461 condition = offset < offset_top &&
1201 ph10 459 md->offset_vector[offset] >= 0;
1202 ph10 461 if (condition) break;
1203     }
1204 ph10 459 else break;
1205 ph10 461 }
1206     }
1207 ph10 459 }
1208 ph10 461 }
1209    
1210 ph10 459 /* Choose branch according to the condition */
1211    
1212 nigel 93 ecode += condition? 3 : GET(ecode, 1);
1213 nigel 77 }
1214    
1215 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1216 nigel 93 {
1217     condition = FALSE;
1218     ecode += GET(ecode, 1);
1219     }
1220    
1221 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1222 ph10 604 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1223     an assertion. */
1224 nigel 77
1225     else
1226     {
1227 ph10 604 md->match_function_type = MATCH_CONDASSERT;
1228     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1229 nigel 77 if (rrc == MATCH_MATCH)
1230     {
1231 nigel 93 condition = TRUE;
1232     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1233 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1234     }
1235 ph10 550 else if (rrc != MATCH_NOMATCH &&
1236     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1237 nigel 77 {
1238     RRETURN(rrc); /* Need braces because of following else */
1239     }
1240 nigel 93 else
1241     {
1242     condition = FALSE;
1243 ph10 399 ecode += codelink;
1244 nigel 93 }
1245     }
1246 nigel 91
1247 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
1248 ph10 609 we used to use tail recursion to avoid using another stack frame, except
1249     when there was unlimited repeat of a possibly empty group. However, that
1250     strategy no longer works because of the possibility of (*THEN) being
1251     encountered in the branch. A recursive call to match() is always required,
1252     unless the second alternative doesn't exist, in which case we can just
1253     plough on. */
1254 nigel 91
1255 nigel 93 if (condition || *ecode == OP_ALT)
1256     {
1257 ph10 609 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1258     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1259     if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1260     rrc = MATCH_NOMATCH;
1261     RRETURN(rrc);
1262 nigel 77 }
1263 ph10 395 else /* Condition false & no alternative */
1264 nigel 93 {
1265     ecode += 1 + LINK_SIZE;
1266     }
1267     break;
1268 nigel 77
1269 ph10 461
1270 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1271     to close any currently open capturing brackets. */
1272 ph10 461
1273 ph10 447 case OP_CLOSE:
1274 ph10 461 number = GET2(ecode, 1);
1275 ph10 447 offset = number << 1;
1276 ph10 461
1277 ph10 475 #ifdef PCRE_DEBUG
1278 ph10 447 printf("end bracket %d at *ACCEPT", number);
1279     printf("\n");
1280     #endif
1281 nigel 77
1282 ph10 447 md->capture_last = number;
1283     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1284     {
1285     md->offset_vector[offset] =
1286     md->offset_vector[md->offset_end - number];
1287 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1288 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1289     }
1290     ecode += 3;
1291 ph10 461 break;
1292 ph10 447
1293    
1294 ph10 608 /* End of the pattern, either real or forced. If we are in a recursion, we
1295     should restore the offsets appropriately, and if it's a top-level
1296     recursion, continue from after the call. */
1297 nigel 77
1298 ph10 210 case OP_ACCEPT:
1299 ph10 613 case OP_ASSERT_ACCEPT:
1300 nigel 77 case OP_END:
1301 ph10 608 if (md->recursive != NULL)
1302 nigel 77 {
1303     recursion_info *rec = md->recursive;
1304     md->recursive = rec->prevrec;
1305 ph10 608 memmove(md->offset_vector, rec->offset_save,
1306 nigel 77 rec->saved_max * sizeof(int));
1307 ph10 461 offset_top = rec->save_offset_top;
1308 ph10 608 if (rec->group_num == 0)
1309     {
1310     ecode = rec->after_call;
1311     break;
1312     }
1313 nigel 77 }
1314    
1315 ph10 613 /* Otherwise, if we have matched an empty string, fail if not in an
1316     assertion and if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1317     is set and we have matched at the start of the subject. In both cases,
1318     backtracking will then try other alternatives, if any. */
1319 ph10 443
1320 ph10 613 else if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1321 ph10 442 (md->notempty ||
1322 ph10 443 (md->notempty_atstart &&
1323 ph10 442 mstart == md->start_subject + md->start_offset)))
1324 ph10 510 MRRETURN(MATCH_NOMATCH);
1325 ph10 443
1326 ph10 442 /* Otherwise, we have a match. */
1327 ph10 608
1328 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1329     md->end_offset_top = offset_top; /* and how many extracts were taken */
1330 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1331 nigel 77
1332 ph10 512 /* For some reason, the macros don't work properly if an expression is
1333     given as the argument to MRRETURN when the heap is in use. */
1334    
1335     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1336     MRRETURN(rrc);
1337    
1338 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1339     matching won't pass the KET for an assertion. If any one branch matches,
1340     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1341     start of each branch to move the current point backwards, so the code at
1342 ph10 604 this level is identical to the lookahead case. When the assertion is part
1343     of a condition, we want to return immediately afterwards. The caller of
1344     this incarnation of the match() function will have set MATCH_CONDASSERT in
1345     md->match_function_type, and one of these opcodes will be the first opcode
1346     that is processed. We use a local variable that is preserved over calls to
1347     match() to remember this case. */
1348 nigel 77
1349     case OP_ASSERT:
1350     case OP_ASSERTBACK:
1351 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1352     {
1353     condassert = TRUE;
1354     md->match_function_type = 0;
1355     }
1356     else condassert = FALSE;
1357    
1358 nigel 77 do
1359     {
1360 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1361 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1362 ph10 500 {
1363     mstart = md->start_match_ptr; /* In case \K reset it */
1364     break;
1365 ph10 501 }
1366 ph10 550 if (rrc != MATCH_NOMATCH &&
1367     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1368     RRETURN(rrc);
1369 nigel 77 ecode += GET(ecode, 1);
1370     }
1371     while (*ecode == OP_ALT);
1372 ph10 604
1373 ph10 510 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1374 nigel 77
1375     /* If checking an assertion for a condition, return MATCH_MATCH. */
1376    
1377 ph10 604 if (condassert) RRETURN(MATCH_MATCH);
1378 nigel 77
1379     /* Continue from after the assertion, updating the offsets high water
1380     mark, since extracts may have been taken during the assertion. */
1381    
1382     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1383     ecode += 1 + LINK_SIZE;
1384     offset_top = md->end_offset_top;
1385     continue;
1386    
1387 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1388 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1389 ph10 473 branches. */
1390 nigel 77
1391     case OP_ASSERT_NOT:
1392     case OP_ASSERTBACK_NOT:
1393 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1394     {
1395     condassert = TRUE;
1396     md->match_function_type = 0;
1397     }
1398     else condassert = FALSE;
1399    
1400 nigel 77 do
1401     {
1402 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1403 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1404 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1405     {
1406     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1407 ph10 482 break;
1408     }
1409 ph10 550 if (rrc != MATCH_NOMATCH &&
1410     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1411     RRETURN(rrc);
1412 nigel 77 ecode += GET(ecode,1);
1413     }
1414     while (*ecode == OP_ALT);
1415    
1416 ph10 604 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1417    
1418 nigel 77 ecode += 1 + LINK_SIZE;
1419     continue;
1420    
1421     /* Move the subject pointer back. This occurs only at the start of
1422     each branch of a lookbehind assertion. If we are too close to the start to
1423     move back, this match function fails. When working with UTF-8 we move
1424     back a number of characters, not bytes. */
1425    
1426     case OP_REVERSE:
1427     #ifdef SUPPORT_UTF8
1428     if (utf8)
1429     {
1430 nigel 93 i = GET(ecode, 1);
1431     while (i-- > 0)
1432 nigel 77 {
1433     eptr--;
1434 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1435 ph10 207 BACKCHAR(eptr);
1436 nigel 77 }
1437     }
1438     else
1439     #endif
1440    
1441     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1442    
1443     {
1444 nigel 93 eptr -= GET(ecode, 1);
1445 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1446 nigel 77 }
1447    
1448 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1449 nigel 77
1450 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1451 nigel 77 ecode += 1 + LINK_SIZE;
1452     break;
1453    
1454     /* The callout item calls an external function, if one is provided, passing
1455     details of the match so far. This is mainly for debugging, though the
1456     function is able to force a failure. */
1457    
1458     case OP_CALLOUT:
1459     if (pcre_callout != NULL)
1460     {
1461     pcre_callout_block cb;
1462     cb.version = 1; /* Version 1 of the callout block */
1463     cb.callout_number = ecode[1];
1464     cb.offset_vector = md->offset_vector;
1465 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1466 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1467     cb.start_match = (int)(mstart - md->start_subject);
1468     cb.current_position = (int)(eptr - md->start_subject);
1469 nigel 77 cb.pattern_position = GET(ecode, 2);
1470     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1471     cb.capture_top = offset_top/2;
1472     cb.capture_last = md->capture_last;
1473     cb.callout_data = md->callout_data;
1474 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1475 nigel 77 if (rrc < 0) RRETURN(rrc);
1476     }
1477     ecode += 2 + 2*LINK_SIZE;
1478     break;
1479    
1480     /* Recursion either matches the current regex, or some subexpression. The
1481     offset data is the offset to the starting bracket from the start of the
1482     whole pattern. (This is so that it works from duplicated subpatterns.)
1483    
1484     If there are any capturing brackets started but not finished, we have to
1485     save their starting points and reinstate them after the recursion. However,
1486     we don't know how many such there are (offset_top records the completed
1487     total) so we just have to save all the potential data. There may be up to
1488     65535 such values, which is too large to put on the stack, but using malloc
1489     for small numbers seems expensive. As a compromise, the stack is used when
1490     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1491     is used. A problem is what to do if the malloc fails ... there is no way of
1492     returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
1493     values on the stack, and accept that the rest may be wrong.
1494    
1495     There are also other values that have to be saved. We use a chained
1496     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1497     for the original version of this logic. */
1498    
1499     case OP_RECURSE:
1500     {
1501     callpat = md->start_code + GET(ecode, 1);
1502 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1503     GET2(callpat, 1 + LINK_SIZE);
1504 nigel 77
1505     /* Add to "recursing stack" */
1506    
1507     new_recursive.prevrec = md->recursive;
1508     md->recursive = &new_recursive;
1509    
1510     /* Find where to continue from afterwards */
1511    
1512     ecode += 1 + LINK_SIZE;
1513     new_recursive.after_call = ecode;
1514    
1515     /* Now save the offset data. */
1516    
1517     new_recursive.saved_max = md->offset_end;
1518     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1519     new_recursive.offset_save = stacksave;
1520     else
1521     {
1522     new_recursive.offset_save =
1523     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1524     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1525     }
1526    
1527     memcpy(new_recursive.offset_save, md->offset_vector,
1528     new_recursive.saved_max * sizeof(int));
1529 ph10 461 new_recursive.save_offset_top = offset_top;
1530 ph10 608
1531 nigel 77 /* OK, now we can do the recursion. For each top-level alternative we
1532     restore the offset and recursion data. */
1533    
1534     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1535 ph10 604 cbegroup = (*callpat >= OP_SBRA);
1536 nigel 77 do
1537     {
1538 ph10 604 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1539 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1540 ph10 604 md, eptrb, RM6);
1541 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1542 nigel 77 {
1543 nigel 87 DPRINTF(("Recursion matched\n"));
1544 nigel 77 md->recursive = new_recursive.prevrec;
1545     if (new_recursive.offset_save != stacksave)
1546     (pcre_free)(new_recursive.offset_save);
1547 ph10 510 MRRETURN(MATCH_MATCH);
1548 nigel 77 }
1549 ph10 550 else if (rrc != MATCH_NOMATCH &&
1550     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1551 nigel 87 {
1552     DPRINTF(("Recursion gave error %d\n", rrc));
1553 ph10 400 if (new_recursive.offset_save != stacksave)
1554     (pcre_free)(new_recursive.offset_save);
1555 nigel 87 RRETURN(rrc);
1556     }
1557 nigel 77
1558     md->recursive = &new_recursive;
1559     memcpy(md->offset_vector, new_recursive.offset_save,
1560     new_recursive.saved_max * sizeof(int));
1561     callpat += GET(callpat, 1);
1562     }
1563     while (*callpat == OP_ALT);
1564    
1565     DPRINTF(("Recursion didn't match\n"));
1566     md->recursive = new_recursive.prevrec;
1567     if (new_recursive.offset_save != stacksave)
1568     (pcre_free)(new_recursive.offset_save);
1569 ph10 510 MRRETURN(MATCH_NOMATCH);
1570 nigel 77 }
1571     /* Control never reaches here */
1572    
1573     /* "Once" brackets are like assertion brackets except that after a match,
1574     the point in the subject string is not moved back. Thus there can never be
1575     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1576     Check the alternative branches in turn - the matching won't pass the KET
1577     for this kind of subpattern. If any one branch matches, we carry on as at
1578 ph10 500 the end of a normal bracket, leaving the subject pointer, but resetting
1579     the start-of-match value in case it was changed by \K. */
1580 nigel 77
1581     case OP_ONCE:
1582 nigel 91 prev = ecode;
1583     saved_eptr = eptr;
1584    
1585     do
1586 nigel 77 {
1587 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1588 ph10 511 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1589 ph10 500 {
1590     mstart = md->start_match_ptr;
1591     break;
1592 ph10 501 }
1593 ph10 550 if (rrc != MATCH_NOMATCH &&
1594     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1595     RRETURN(rrc);
1596 nigel 91 ecode += GET(ecode,1);
1597     }
1598     while (*ecode == OP_ALT);
1599 nigel 77
1600 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1601 nigel 77
1602 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1603 nigel 77
1604 ph10 614 /* Continue after the group, updating the offsets high water mark, since
1605     extracts may have been taken. */
1606 nigel 77
1607 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1608 nigel 77
1609 nigel 91 offset_top = md->end_offset_top;
1610     eptr = md->end_match_ptr;
1611 nigel 77
1612 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1613     happens for a repeating ket if no characters were matched in the group.
1614     This is the forcible breaking of infinite loops as implemented in Perl
1615     5.005. If there is an options reset, it will get obeyed in the normal
1616     course of events. */
1617 nigel 77
1618 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1619     {
1620     ecode += 1+LINK_SIZE;
1621     break;
1622     }
1623 nigel 77
1624 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1625     preceding bracket, in the appropriate order. The second "call" of match()
1626 ph10 602 uses tail recursion, to avoid using another stack frame. */
1627 nigel 77
1628 nigel 91 if (*ecode == OP_KETRMIN)
1629     {
1630 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM8);
1631 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1632     ecode = prev;
1633     goto TAIL_RECURSE;
1634 nigel 77 }
1635 nigel 91 else /* OP_KETRMAX */
1636     {
1637 ph10 604 md->match_function_type = MATCH_CBEGROUP;
1638     RMATCH(eptr, prev, offset_top, md, eptrb, RM9);
1639 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1640     ecode += 1 + LINK_SIZE;
1641     goto TAIL_RECURSE;
1642     }
1643     /* Control never gets here */
1644 nigel 77
1645     /* An alternation is the end of a branch; scan along to find the end of the
1646     bracketed group and go to there. */
1647    
1648     case OP_ALT:
1649     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1650     break;
1651    
1652 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1653     indicating that it may occur zero times. It may repeat infinitely, or not
1654     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1655     with fixed upper repeat limits are compiled as a number of copies, with the
1656     optional ones preceded by BRAZERO or BRAMINZERO. */
1657 ph10 604
1658 nigel 77 case OP_BRAZERO:
1659 ph10 604 next = ecode + 1;
1660     RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1661     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1662     do next += GET(next, 1); while (*next == OP_ALT);
1663     ecode = next + 1 + LINK_SIZE;
1664 nigel 77 break;
1665 ph10 604
1666 nigel 77 case OP_BRAMINZERO:
1667 ph10 604 next = ecode + 1;
1668     do next += GET(next, 1); while (*next == OP_ALT);
1669     RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1670     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1671     ecode++;
1672 nigel 77 break;
1673    
1674 ph10 335 case OP_SKIPZERO:
1675 ph10 604 next = ecode+1;
1676     do next += GET(next,1); while (*next == OP_ALT);
1677     ecode = next + 1 + LINK_SIZE;
1678 ph10 335 break;
1679 ph10 604
1680     /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1681     here; just jump to the group, with allow_zero set TRUE. */
1682    
1683     case OP_BRAPOSZERO:
1684     op = *(++ecode);
1685     allow_zero = TRUE;
1686     if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1687     goto POSSESSIVE_NON_CAPTURE;
1688 ph10 335
1689 nigel 93 /* End of a group, repeated or non-repeating. */
1690 nigel 77
1691     case OP_KET:
1692     case OP_KETRMIN:
1693     case OP_KETRMAX:
1694 ph10 604 case OP_KETRPOS:
1695 nigel 91 prev = ecode - GET(ecode, 1);
1696 nigel 77
1697 nigel 93 /* If this was a group that remembered the subject start, in order to break
1698     infinite repeats of empty string matches, retrieve the subject start from
1699     the chain. Otherwise, set it NULL. */
1700 nigel 77
1701 nigel 93 if (*prev >= OP_SBRA)
1702     {
1703     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1704     eptrb = eptrb->epb_prev; /* Backup to previous group */
1705     }
1706     else saved_eptr = NULL;
1707 nigel 77
1708 ph10 500 /* If we are at the end of an assertion group or an atomic group, stop
1709     matching and return MATCH_MATCH, but record the current high water mark for
1710     use by positive assertions. We also need to record the match start in case
1711     it was changed by \K. */
1712 nigel 93
1713 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1714     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1715     *prev == OP_ONCE)
1716     {
1717     md->end_match_ptr = eptr; /* For ONCE */
1718     md->end_offset_top = offset_top;
1719 ph10 500 md->start_match_ptr = mstart;
1720 ph10 510 MRRETURN(MATCH_MATCH);
1721 nigel 91 }
1722 nigel 77
1723 nigel 93 /* For capturing groups we have to check the group number back at the start
1724     and if necessary complete handling an extraction by setting the offsets and
1725     bumping the high water mark. Note that whole-pattern recursion is coded as
1726     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1727     when the OP_END is reached. Other recursion is handled here. */
1728 nigel 77
1729 ph10 604 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1730     *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1731 nigel 91 {
1732 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1733 nigel 91 offset = number << 1;
1734 ph10 461
1735 ph10 475 #ifdef PCRE_DEBUG
1736 nigel 91 printf("end bracket %d", number);
1737     printf("\n");
1738 nigel 77 #endif
1739    
1740 nigel 93 md->capture_last = number;
1741     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1742 nigel 91 {
1743 ph10 615 /* If offset is greater than offset_top, it means that we are
1744     "skipping" a capturing group, and that group's offsets must be marked
1745     unset. In earlier versions of PCRE, all the offsets were unset at the
1746     start of matching, but this doesn't work because atomic groups and
1747     assertions can cause a value to be set that should later be unset.
1748     Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1749     part of the atomic group, but this is not on the final matching path,
1750     so must be unset when 2 is set. (If there is no group 2, there is no
1751     problem, because offset_top will then be 2, indicating no capture.) */
1752    
1753     if (offset > offset_top)
1754     {
1755     register int *iptr = md->offset_vector + offset_top;
1756     register int *iend = md->offset_vector + offset;
1757     while (iptr < iend) *iptr++ = -1;
1758     }
1759    
1760     /* Now make the extraction */
1761    
1762 nigel 93 md->offset_vector[offset] =
1763     md->offset_vector[md->offset_end - number];
1764 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1765 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1766     }
1767 nigel 77
1768 nigel 93 /* Handle a recursively called group. Restore the offsets
1769     appropriately and continue from after the call. */
1770 nigel 77
1771 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1772     {
1773     recursion_info *rec = md->recursive;
1774     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1775     md->recursive = rec->prevrec;
1776     memcpy(md->offset_vector, rec->offset_save,
1777     rec->saved_max * sizeof(int));
1778 ph10 461 offset_top = rec->save_offset_top;
1779 nigel 93 ecode = rec->after_call;
1780     break;
1781 nigel 77 }
1782 nigel 91 }
1783 nigel 77
1784 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1785     happens for a repeating ket if no characters were matched in the group.
1786     This is the forcible breaking of infinite loops as implemented in Perl
1787     5.005. If there is an options reset, it will get obeyed in the normal
1788     course of events. */
1789 nigel 77
1790 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1791     {
1792     ecode += 1 + LINK_SIZE;
1793     break;
1794     }
1795 ph10 604
1796     /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1797     and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1798     at a time from the outer level, thus saving stack. */
1799    
1800     if (*ecode == OP_KETRPOS)
1801     {
1802     md->end_match_ptr = eptr;
1803     md->end_offset_top = offset_top;
1804     RRETURN(MATCH_KETRPOS);
1805     }
1806 nigel 77
1807 ph10 604 /* The normal repeating kets try the rest of the pattern or restart from
1808     the preceding bracket, in the appropriate order. In the second case, we can
1809     use tail recursion to avoid using another stack frame, unless we have an
1810 ph10 197 unlimited repeat of a group that can match an empty string. */
1811 nigel 77
1812 nigel 91 if (*ecode == OP_KETRMIN)
1813     {
1814 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1815 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1816 ph10 604 if (*prev >= OP_SBRA) /* Could match an empty string */
1817 ph10 197 {
1818 ph10 604 md->match_function_type = MATCH_CBEGROUP;
1819     RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1820 ph10 197 RRETURN(rrc);
1821     }
1822 nigel 91 ecode = prev;
1823     goto TAIL_RECURSE;
1824 nigel 77 }
1825 nigel 91 else /* OP_KETRMAX */
1826     {
1827 ph10 604 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1828     RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1829 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1830     ecode += 1 + LINK_SIZE;
1831     goto TAIL_RECURSE;
1832     }
1833     /* Control never gets here */
1834 nigel 77
1835 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
1836 nigel 77
1837     case OP_CIRC:
1838 ph10 510 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1839 ph10 602
1840 nigel 77 /* Start of subject assertion */
1841    
1842     case OP_SOD:
1843 ph10 510 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1844 nigel 77 ecode++;
1845     break;
1846 ph10 602
1847     /* Multiline mode: start of subject unless notbol, or after any newline. */
1848 nigel 77
1849 ph10 602 case OP_CIRCM:
1850     if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1851     if (eptr != md->start_subject &&
1852     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1853     MRRETURN(MATCH_NOMATCH);
1854     ecode++;
1855     break;
1856    
1857 nigel 77 /* Start of match assertion */
1858    
1859     case OP_SOM:
1860 ph10 510 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1861 nigel 77 ecode++;
1862     break;
1863 ph10 172
1864 ph10 168 /* Reset the start of match point */
1865 ph10 172
1866 ph10 168 case OP_SET_SOM:
1867     mstart = eptr;
1868 ph10 172 ecode++;
1869     break;
1870 nigel 77
1871 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
1872     unless noteol is set. */
1873 nigel 77
1874 ph10 602 case OP_DOLLM:
1875     if (eptr < md->end_subject)
1876     { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1877     else
1878 nigel 77 {
1879 ph10 510 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1880 ph10 602 SCHECK_PARTIAL();
1881 nigel 77 }
1882 ph10 602 ecode++;
1883     break;
1884 ph10 579
1885 ph10 602 /* Not multiline mode: assert before a terminating newline or before end of
1886     subject unless noteol is set. */
1887    
1888     case OP_DOLL:
1889     if (md->noteol) MRRETURN(MATCH_NOMATCH);
1890     if (!md->endonly) goto ASSERT_NL_OR_EOS;
1891    
1892 nigel 91 /* ... else fall through for endonly */
1893 nigel 77
1894     /* End of subject assertion (\z) */
1895    
1896     case OP_EOD:
1897 ph10 510 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1898 ph10 553 SCHECK_PARTIAL();
1899 nigel 77 ecode++;
1900     break;
1901    
1902     /* End of subject or ending \n assertion (\Z) */
1903    
1904     case OP_EODN:
1905 ph10 553 ASSERT_NL_OR_EOS:
1906     if (eptr < md->end_subject &&
1907 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1908 ph10 510 MRRETURN(MATCH_NOMATCH);
1909 ph10 579
1910 ph10 553 /* Either at end of string or \n before end. */
1911 ph10 579
1912 ph10 553 SCHECK_PARTIAL();
1913 nigel 77 ecode++;
1914     break;
1915    
1916     /* Word boundary assertions */
1917    
1918     case OP_NOT_WORD_BOUNDARY:
1919     case OP_WORD_BOUNDARY:
1920     {
1921    
1922     /* Find out if the previous and current characters are "word" characters.
1923     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1924 ph10 443 be "non-word" characters. Remember the earliest consulted character for
1925 ph10 435 partial matching. */
1926 nigel 77
1927     #ifdef SUPPORT_UTF8
1928     if (utf8)
1929     {
1930 ph10 518 /* Get status of previous character */
1931 ph10 527
1932 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
1933     {
1934 ph10 409 USPTR lastptr = eptr - 1;
1935 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1936 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1937 nigel 77 GETCHAR(c, lastptr);
1938 ph10 527 #ifdef SUPPORT_UCP
1939 ph10 518 if (md->use_ucp)
1940     {
1941     if (c == '_') prev_is_word = TRUE; else
1942 ph10 527 {
1943 ph10 518 int cat = UCD_CATEGORY(c);
1944     prev_is_word = (cat == ucp_L || cat == ucp_N);
1945 ph10 527 }
1946     }
1947     else
1948     #endif
1949 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1950     }
1951 ph10 527
1952 ph10 518 /* Get status of next character */
1953 ph10 527
1954 ph10 443 if (eptr >= md->end_subject)
1955 nigel 77 {
1956 ph10 443 SCHECK_PARTIAL();
1957     cur_is_word = FALSE;
1958 ph10 428 }
1959     else
1960     {
1961 nigel 77 GETCHAR(c, eptr);
1962 ph10 527 #ifdef SUPPORT_UCP
1963 ph10 518 if (md->use_ucp)
1964     {
1965     if (c == '_') cur_is_word = TRUE; else
1966 ph10 527 {
1967 ph10 518 int cat = UCD_CATEGORY(c);
1968     cur_is_word = (cat == ucp_L || cat == ucp_N);
1969 ph10 527 }
1970     }
1971     else
1972     #endif
1973 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1974     }
1975     }
1976     else
1977     #endif
1978    
1979 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1980 ph10 518 consistency with the behaviour of \w we do use it in this case. */
1981 nigel 77
1982     {
1983 ph10 518 /* Get status of previous character */
1984 ph10 527
1985 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
1986     {
1987 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1988 ph10 527 #ifdef SUPPORT_UCP
1989 ph10 518 if (md->use_ucp)
1990     {
1991 ph10 527 c = eptr[-1];
1992 ph10 518 if (c == '_') prev_is_word = TRUE; else
1993 ph10 527 {
1994 ph10 518 int cat = UCD_CATEGORY(c);
1995     prev_is_word = (cat == ucp_L || cat == ucp_N);
1996 ph10 527 }
1997     }
1998     else
1999     #endif
2000 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2001     }
2002 ph10 527
2003 ph10 518 /* Get status of next character */
2004 ph10 527
2005 ph10 443 if (eptr >= md->end_subject)
2006 ph10 428 {
2007 ph10 443 SCHECK_PARTIAL();
2008     cur_is_word = FALSE;
2009 ph10 428 }
2010 ph10 527 else
2011     #ifdef SUPPORT_UCP
2012 ph10 518 if (md->use_ucp)
2013     {
2014 ph10 527 c = *eptr;
2015 ph10 518 if (c == '_') cur_is_word = TRUE; else
2016 ph10 527 {
2017 ph10 518 int cat = UCD_CATEGORY(c);
2018     cur_is_word = (cat == ucp_L || cat == ucp_N);
2019 ph10 527 }
2020     }
2021     else
2022     #endif
2023 ph10 518 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2024 nigel 77 }
2025    
2026     /* Now see if the situation is what we want */
2027    
2028     if ((*ecode++ == OP_WORD_BOUNDARY)?
2029     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2030 ph10 510 MRRETURN(MATCH_NOMATCH);
2031 nigel 77 }
2032     break;
2033    
2034     /* Match a single character type; inline for speed */
2035    
2036     case OP_ANY:
2037 ph10 510 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2038 ph10 345 /* Fall through */
2039    
2040 ph10 341 case OP_ALLANY:
2041 ph10 443 if (eptr++ >= md->end_subject)
2042 ph10 428 {
2043 ph10 443 SCHECK_PARTIAL();
2044 ph10 510 MRRETURN(MATCH_NOMATCH);
2045 ph10 443 }
2046 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2047 nigel 77 ecode++;
2048     break;
2049    
2050     /* Match a single byte, even in UTF-8 mode. This opcode really does match
2051     any byte, even newline, independent of the setting of PCRE_DOTALL. */
2052    
2053     case OP_ANYBYTE:
2054 ph10 443 if (eptr++ >= md->end_subject)
2055 ph10 428 {
2056 ph10 443 SCHECK_PARTIAL();
2057 ph10 510 MRRETURN(MATCH_NOMATCH);
2058 ph10 443 }
2059 nigel 77 ecode++;
2060     break;
2061    
2062     case OP_NOT_DIGIT:
2063 ph10 443 if (eptr >= md->end_subject)
2064 ph10 428 {
2065 ph10 443 SCHECK_PARTIAL();
2066 ph10 510 MRRETURN(MATCH_NOMATCH);
2067 ph10 443 }
2068 nigel 77 GETCHARINCTEST(c, eptr);
2069     if (
2070     #ifdef SUPPORT_UTF8
2071     c < 256 &&
2072     #endif
2073     (md->ctypes[c] & ctype_digit) != 0
2074     )
2075 ph10 510 MRRETURN(MATCH_NOMATCH);
2076 nigel 77 ecode++;
2077     break;
2078    
2079     case OP_DIGIT:
2080 ph10 443 if (eptr >= md->end_subject)
2081 ph10 428 {
2082 ph10 443 SCHECK_PARTIAL();
2083 ph10 510 MRRETURN(MATCH_NOMATCH);
2084 ph10 443 }
2085 nigel 77 GETCHARINCTEST(c, eptr);
2086     if (
2087     #ifdef SUPPORT_UTF8
2088     c >= 256 ||
2089     #endif
2090     (md->ctypes[c] & ctype_digit) == 0
2091     )
2092 ph10 510 MRRETURN(MATCH_NOMATCH);
2093 nigel 77 ecode++;
2094     break;
2095    
2096     case OP_NOT_WHITESPACE:
2097 ph10 443 if (eptr >= md->end_subject)
2098 ph10 428 {
2099 ph10 443 SCHECK_PARTIAL();
2100 ph10 510 MRRETURN(MATCH_NOMATCH);
2101 ph10 443 }
2102 nigel 77 GETCHARINCTEST(c, eptr);
2103     if (
2104     #ifdef SUPPORT_UTF8
2105     c < 256 &&
2106     #endif
2107     (md->ctypes[c] & ctype_space) != 0
2108     )
2109 ph10 510 MRRETURN(MATCH_NOMATCH);
2110 nigel 77 ecode++;
2111     break;
2112    
2113     case OP_WHITESPACE:
2114 ph10 443 if (eptr >= md->end_subject)
2115 ph10 428 {
2116 ph10 443 SCHECK_PARTIAL();
2117 ph10 510 MRRETURN(MATCH_NOMATCH);
2118 ph10 443 }
2119 nigel 77 GETCHARINCTEST(c, eptr);
2120     if (
2121     #ifdef SUPPORT_UTF8
2122     c >= 256 ||
2123     #endif
2124     (md->ctypes[c] & ctype_space) == 0
2125     )
2126 ph10 510 MRRETURN(MATCH_NOMATCH);
2127 nigel 77 ecode++;
2128     break;
2129    
2130     case OP_NOT_WORDCHAR:
2131 ph10 443 if (eptr >= md->end_subject)
2132 ph10 428 {
2133 ph10 443 SCHECK_PARTIAL();
2134 ph10 510 MRRETURN(MATCH_NOMATCH);
2135 ph10 443 }
2136 nigel 77 GETCHARINCTEST(c, eptr);
2137     if (
2138     #ifdef SUPPORT_UTF8
2139     c < 256 &&
2140     #endif
2141     (md->ctypes[c] & ctype_word) != 0
2142     )
2143 ph10 510 MRRETURN(MATCH_NOMATCH);
2144 nigel 77 ecode++;
2145     break;
2146    
2147     case OP_WORDCHAR:
2148 ph10 443 if (eptr >= md->end_subject)
2149 ph10 428 {
2150 ph10 443 SCHECK_PARTIAL();
2151 ph10 510 MRRETURN(MATCH_NOMATCH);
2152 ph10 443 }
2153 nigel 77 GETCHARINCTEST(c, eptr);
2154     if (
2155     #ifdef SUPPORT_UTF8
2156     c >= 256 ||
2157     #endif
2158     (md->ctypes[c] & ctype_word) == 0
2159     )
2160 ph10 510 MRRETURN(MATCH_NOMATCH);
2161 nigel 77 ecode++;
2162     break;
2163    
2164 nigel 93 case OP_ANYNL:
2165 ph10 443 if (eptr >= md->end_subject)
2166 ph10 428 {
2167 ph10 443 SCHECK_PARTIAL();
2168 ph10 510 MRRETURN(MATCH_NOMATCH);
2169 ph10 443 }
2170 nigel 93 GETCHARINCTEST(c, eptr);
2171     switch(c)
2172     {
2173 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2174 ph10 600
2175 nigel 93 case 0x000d:
2176     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2177     break;
2178 ph10 231
2179 nigel 93 case 0x000a:
2180 ph10 231 break;
2181    
2182 nigel 93 case 0x000b:
2183     case 0x000c:
2184     case 0x0085:
2185     case 0x2028:
2186     case 0x2029:
2187 ph10 510 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2188 nigel 93 break;
2189     }
2190     ecode++;
2191     break;
2192    
2193 ph10 178 case OP_NOT_HSPACE:
2194 ph10 443 if (eptr >= md->end_subject)
2195 ph10 428 {
2196 ph10 443 SCHECK_PARTIAL();
2197 ph10 510 MRRETURN(MATCH_NOMATCH);
2198 ph10 443 }
2199 ph10 178 GETCHARINCTEST(c, eptr);
2200     switch(c)
2201     {
2202     default: break;
2203     case 0x09: /* HT */
2204     case 0x20: /* SPACE */
2205     case 0xa0: /* NBSP */
2206     case 0x1680: /* OGHAM SPACE MARK */
2207     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2208     case 0x2000: /* EN QUAD */
2209     case 0x2001: /* EM QUAD */
2210     case 0x2002: /* EN SPACE */
2211     case 0x2003: /* EM SPACE */
2212     case 0x2004: /* THREE-PER-EM SPACE */
2213     case 0x2005: /* FOUR-PER-EM SPACE */
2214     case 0x2006: /* SIX-PER-EM SPACE */
2215     case 0x2007: /* FIGURE SPACE */
2216     case 0x2008: /* PUNCTUATION SPACE */
2217     case 0x2009: /* THIN SPACE */
2218     case 0x200A: /* HAIR SPACE */
2219     case 0x202f: /* NARROW NO-BREAK SPACE */
2220     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2221     case 0x3000: /* IDEOGRAPHIC SPACE */
2222 ph10 510 MRRETURN(MATCH_NOMATCH);
2223 ph10 178 }
2224     ecode++;
2225     break;
2226    
2227     case OP_HSPACE:
2228 ph10 443 if (eptr >= md->end_subject)
2229 ph10 428 {
2230 ph10 443 SCHECK_PARTIAL();
2231 ph10 510 MRRETURN(MATCH_NOMATCH);
2232 ph10 443 }
2233 ph10 178 GETCHARINCTEST(c, eptr);
2234     switch(c)
2235     {
2236 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2237 ph10 178 case 0x09: /* HT */
2238     case 0x20: /* SPACE */
2239     case 0xa0: /* NBSP */
2240     case 0x1680: /* OGHAM SPACE MARK */
2241     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2242     case 0x2000: /* EN QUAD */
2243     case 0x2001: /* EM QUAD */
2244     case 0x2002: /* EN SPACE */
2245     case 0x2003: /* EM SPACE */
2246     case 0x2004: /* THREE-PER-EM SPACE */
2247     case 0x2005: /* FOUR-PER-EM SPACE */
2248     case 0x2006: /* SIX-PER-EM SPACE */
2249     case 0x2007: /* FIGURE SPACE */
2250     case 0x2008: /* PUNCTUATION SPACE */
2251     case 0x2009: /* THIN SPACE */
2252     case 0x200A: /* HAIR SPACE */
2253     case 0x202f: /* NARROW NO-BREAK SPACE */
2254     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2255     case 0x3000: /* IDEOGRAPHIC SPACE */
2256     break;
2257     }
2258     ecode++;
2259     break;
2260    
2261     case OP_NOT_VSPACE:
2262 ph10 443 if (eptr >= md->end_subject)
2263 ph10 428 {
2264 ph10 443 SCHECK_PARTIAL();
2265 ph10 510 MRRETURN(MATCH_NOMATCH);
2266 ph10 443 }
2267 ph10 178 GETCHARINCTEST(c, eptr);
2268     switch(c)
2269     {
2270     default: break;
2271     case 0x0a: /* LF */
2272     case 0x0b: /* VT */
2273     case 0x0c: /* FF */
2274     case 0x0d: /* CR */
2275     case 0x85: /* NEL */
2276     case 0x2028: /* LINE SEPARATOR */
2277     case 0x2029: /* PARAGRAPH SEPARATOR */
2278 ph10 510 MRRETURN(MATCH_NOMATCH);
2279 ph10 178 }
2280     ecode++;
2281     break;
2282    
2283     case OP_VSPACE:
2284 ph10 443 if (eptr >= md->end_subject)
2285 ph10 428 {
2286 ph10 443 SCHECK_PARTIAL();
2287 ph10 510 MRRETURN(MATCH_NOMATCH);
2288 ph10 443 }
2289 ph10 178 GETCHARINCTEST(c, eptr);
2290     switch(c)
2291     {
2292 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2293 ph10 178 case 0x0a: /* LF */
2294     case 0x0b: /* VT */
2295     case 0x0c: /* FF */
2296     case 0x0d: /* CR */
2297     case 0x85: /* NEL */
2298     case 0x2028: /* LINE SEPARATOR */
2299     case 0x2029: /* PARAGRAPH SEPARATOR */
2300     break;
2301     }
2302     ecode++;
2303     break;
2304    
2305 nigel 77 #ifdef SUPPORT_UCP
2306     /* Check the next character by Unicode property. We will get here only
2307     if the support is in the binary; otherwise a compile-time error occurs. */
2308    
2309     case OP_PROP:
2310     case OP_NOTPROP:
2311 ph10 443 if (eptr >= md->end_subject)
2312 ph10 428 {
2313 ph10 443 SCHECK_PARTIAL();
2314 ph10 510 MRRETURN(MATCH_NOMATCH);
2315 ph10 443 }
2316 nigel 77 GETCHARINCTEST(c, eptr);
2317     {
2318 ph10 384 const ucd_record *prop = GET_UCD(c);
2319 nigel 77
2320 nigel 87 switch(ecode[1])
2321     {
2322     case PT_ANY:
2323 ph10 510 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2324 nigel 87 break;
2325 nigel 77
2326 nigel 87 case PT_LAMP:
2327 ph10 349 if ((prop->chartype == ucp_Lu ||
2328     prop->chartype == ucp_Ll ||
2329     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2330 ph10 510 MRRETURN(MATCH_NOMATCH);
2331 ph10 517 break;
2332 nigel 87
2333     case PT_GC:
2334 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2335 ph10 510 MRRETURN(MATCH_NOMATCH);
2336 nigel 87 break;
2337    
2338     case PT_PC:
2339 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2340 ph10 510 MRRETURN(MATCH_NOMATCH);
2341 nigel 87 break;
2342    
2343     case PT_SC:
2344 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2345 ph10 510 MRRETURN(MATCH_NOMATCH);
2346 nigel 87 break;
2347 ph10 527
2348 ph10 517 /* These are specials */
2349 ph10 527
2350 ph10 517 case PT_ALNUM:
2351     if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2352     _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2353     MRRETURN(MATCH_NOMATCH);
2354 ph10 527 break;
2355    
2356 ph10 517 case PT_SPACE: /* Perl space */
2357     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2358     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2359     == (op == OP_NOTPROP))
2360     MRRETURN(MATCH_NOMATCH);
2361 ph10 527 break;
2362    
2363 ph10 517 case PT_PXSPACE: /* POSIX space */
2364     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2365 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2366 ph10 517 c == CHAR_FF || c == CHAR_CR)
2367     == (op == OP_NOTPROP))
2368     MRRETURN(MATCH_NOMATCH);
2369 ph10 527 break;
2370 nigel 87
2371 ph10 527 case PT_WORD:
2372 ph10 517 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2373 ph10 527 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2374 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2375     MRRETURN(MATCH_NOMATCH);
2376 ph10 527 break;
2377    
2378 ph10 517 /* This should never occur */
2379    
2380 nigel 87 default:
2381     RRETURN(PCRE_ERROR_INTERNAL);
2382 nigel 77 }
2383 nigel 87
2384     ecode += 3;
2385 nigel 77 }
2386     break;
2387    
2388     /* Match an extended Unicode sequence. We will get here only if the support
2389     is in the binary; otherwise a compile-time error occurs. */
2390    
2391     case OP_EXTUNI:
2392 ph10 443 if (eptr >= md->end_subject)
2393 ph10 428 {
2394 ph10 443 SCHECK_PARTIAL();
2395 ph10 510 MRRETURN(MATCH_NOMATCH);
2396 ph10 443 }
2397 nigel 77 GETCHARINCTEST(c, eptr);
2398     {
2399 ph10 349 int category = UCD_CATEGORY(c);
2400 ph10 510 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2401 nigel 77 while (eptr < md->end_subject)
2402     {
2403     int len = 1;
2404     if (!utf8) c = *eptr; else
2405     {
2406     GETCHARLEN(c, eptr, len);
2407     }
2408 ph10 349 category = UCD_CATEGORY(c);
2409 nigel 77 if (category != ucp_M) break;
2410     eptr += len;
2411     }
2412     }
2413     ecode++;
2414     break;
2415     #endif
2416    
2417    
2418     /* Match a back reference, possibly repeatedly. Look past the end of the
2419     item to see if there is repeat information following. The code is similar
2420     to that for character classes, but repeated for efficiency. Then obey
2421     similar code to character type repeats - written out again for speed.
2422     However, if the referenced string is the empty string, always treat
2423     it as matched, any number of times (otherwise there could be infinite
2424     loops). */
2425    
2426     case OP_REF:
2427 ph10 602 case OP_REFI:
2428     caseless = op == OP_REFI;
2429 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2430     ecode += 3;
2431 ph10 345
2432 ph10 595 /* If the reference is unset, there are two possibilities:
2433 ph10 345
2434 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2435     this ensures that every attempt at a match fails. We can't just fail
2436     here, because of the possibility of quantifiers with zero minima.
2437 ph10 345
2438 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2439     so that the back reference matches an empty string.
2440 ph10 345
2441 ph10 595 Otherwise, set the length to the length of what was matched by the
2442     referenced subpattern. */
2443 ph10 345
2444 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2445     length = (md->jscript_compat)? 0 : -1;
2446     else
2447     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2448 nigel 77
2449 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2450 nigel 77
2451 ph10 595 switch (*ecode)
2452     {
2453     case OP_CRSTAR:
2454     case OP_CRMINSTAR:
2455     case OP_CRPLUS:
2456     case OP_CRMINPLUS:
2457     case OP_CRQUERY:
2458     case OP_CRMINQUERY:
2459     c = *ecode++ - OP_CRSTAR;
2460     minimize = (c & 1) != 0;
2461     min = rep_min[c]; /* Pick up values from tables; */
2462     max = rep_max[c]; /* zero for max => infinity */
2463     if (max == 0) max = INT_MAX;
2464     break;
2465 nigel 77
2466 ph10 595 case OP_CRRANGE:
2467     case OP_CRMINRANGE:
2468     minimize = (*ecode == OP_CRMINRANGE);
2469     min = GET2(ecode, 1);
2470     max = GET2(ecode, 3);
2471     if (max == 0) max = INT_MAX;
2472     ecode += 5;
2473     break;
2474 nigel 77
2475 ph10 595 default: /* No repeat follows */
2476 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2477 ph10 595 {
2478     CHECK_PARTIAL();
2479     MRRETURN(MATCH_NOMATCH);
2480 nigel 77 }
2481 ph10 595 eptr += length;
2482     continue; /* With the main loop */
2483     }
2484 nigel 77
2485 ph10 595 /* Handle repeated back references. If the length of the reference is
2486     zero, just continue with the main loop. */
2487 ph10 443
2488 ph10 595 if (length == 0) continue;
2489 nigel 77
2490 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2491     the length of the reference string explicitly rather than passing the
2492     address of eptr, so that eptr can be a register variable. */
2493 nigel 77
2494 ph10 595 for (i = 1; i <= min; i++)
2495     {
2496     int slength;
2497 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2498 nigel 77 {
2499 ph10 595 CHECK_PARTIAL();
2500     MRRETURN(MATCH_NOMATCH);
2501 nigel 77 }
2502 ph10 595 eptr += slength;
2503     }
2504 nigel 77
2505 ph10 595 /* If min = max, continue at the same level without recursion.
2506     They are not both allowed to be zero. */
2507 nigel 77
2508 ph10 595 if (min == max) continue;
2509 nigel 77
2510 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2511 nigel 77
2512 ph10 595 if (minimize)
2513     {
2514     for (fi = min;; fi++)
2515 nigel 77 {
2516 ph10 595 int slength;
2517 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2518 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2519     if (fi >= max) MRRETURN(MATCH_NOMATCH);
2520 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2521 nigel 77 {
2522 ph10 595 CHECK_PARTIAL();
2523     MRRETURN(MATCH_NOMATCH);
2524 nigel 77 }
2525 ph10 595 eptr += slength;
2526 nigel 77 }
2527 ph10 595 /* Control never gets here */
2528     }
2529 nigel 77
2530 ph10 595 /* If maximizing, find the longest string and work backwards */
2531 nigel 77
2532 ph10 595 else
2533     {
2534     pp = eptr;
2535     for (i = min; i < max; i++)
2536 nigel 77 {
2537 ph10 595 int slength;
2538 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2539 nigel 77 {
2540 ph10 595 CHECK_PARTIAL();
2541     break;
2542 nigel 77 }
2543 ph10 595 eptr += slength;
2544 nigel 77 }
2545 ph10 595 while (eptr >= pp)
2546     {
2547 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2548 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2549     eptr -= length;
2550     }
2551     MRRETURN(MATCH_NOMATCH);
2552 nigel 77 }
2553     /* Control never gets here */
2554    
2555     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2556     used when all the characters in the class have values in the range 0-255,
2557     and either the matching is caseful, or the characters are in the range
2558     0-127 when UTF-8 processing is enabled. The only difference between
2559     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2560     encountered.
2561    
2562     First, look past the end of the item to see if there is repeat information
2563     following. Then obey similar code to character type repeats - written out
2564     again for speed. */
2565    
2566     case OP_NCLASS:
2567     case OP_CLASS:
2568     {
2569     data = ecode + 1; /* Save for matching */
2570     ecode += 33; /* Advance past the item */
2571    
2572     switch (*ecode)
2573     {
2574     case OP_CRSTAR:
2575     case OP_CRMINSTAR:
2576     case OP_CRPLUS:
2577     case OP_CRMINPLUS:
2578     case OP_CRQUERY:
2579     case OP_CRMINQUERY:
2580     c = *ecode++ - OP_CRSTAR;
2581     minimize = (c & 1) != 0;
2582     min = rep_min[c]; /* Pick up values from tables; */
2583     max = rep_max[c]; /* zero for max => infinity */
2584     if (max == 0) max = INT_MAX;
2585     break;
2586    
2587     case OP_CRRANGE:
2588     case OP_CRMINRANGE:
2589     minimize = (*ecode == OP_CRMINRANGE);
2590     min = GET2(ecode, 1);
2591     max = GET2(ecode, 3);
2592     if (max == 0) max = INT_MAX;
2593     ecode += 5;
2594     break;
2595    
2596     default: /* No repeat follows */
2597     min = max = 1;
2598     break;
2599     }
2600    
2601     /* First, ensure the minimum number of matches are present. */
2602    
2603     #ifdef SUPPORT_UTF8
2604     /* UTF-8 mode */
2605     if (utf8)
2606     {
2607     for (i = 1; i <= min; i++)
2608     {
2609 ph10 427 if (eptr >= md->end_subject)
2610 ph10 426 {
2611 ph10 428 SCHECK_PARTIAL();
2612 ph10 510 MRRETURN(MATCH_NOMATCH);
2613 ph10 427 }
2614 nigel 77 GETCHARINC(c, eptr);
2615     if (c > 255)
2616     {
2617 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2618 nigel 77 }
2619     else
2620     {
2621 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2622 nigel 77 }
2623     }
2624     }
2625     else
2626     #endif
2627     /* Not UTF-8 mode */
2628     {
2629     for (i = 1; i <= min; i++)
2630     {
2631 ph10 427 if (eptr >= md->end_subject)
2632 ph10 426 {
2633 ph10 428 SCHECK_PARTIAL();
2634 ph10 510 MRRETURN(MATCH_NOMATCH);
2635 ph10 427 }
2636 nigel 77 c = *eptr++;
2637 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2638 nigel 77 }
2639     }
2640    
2641     /* If max == min we can continue with the main loop without the
2642     need to recurse. */
2643    
2644     if (min == max) continue;
2645    
2646     /* If minimizing, keep testing the rest of the expression and advancing
2647     the pointer while it matches the class. */
2648    
2649     if (minimize)
2650     {
2651     #ifdef SUPPORT_UTF8
2652     /* UTF-8 mode */
2653     if (utf8)
2654     {
2655     for (fi = min;; fi++)
2656     {
2657 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2658 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2659 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2660 ph10 427 if (eptr >= md->end_subject)
2661 ph10 426 {
2662 ph10 427 SCHECK_PARTIAL();
2663 ph10 510 MRRETURN(MATCH_NOMATCH);
2664 ph10 427 }
2665 nigel 77 GETCHARINC(c, eptr);
2666     if (c > 255)
2667     {
2668 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2669 nigel 77 }
2670     else
2671     {
2672 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2673 nigel 77 }
2674     }
2675     }
2676     else
2677     #endif
2678     /* Not UTF-8 mode */
2679     {
2680     for (fi = min;; fi++)
2681     {
2682 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2683 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2684 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2685 ph10 427 if (eptr >= md->end_subject)
2686 ph10 426 {
2687 ph10 427 SCHECK_PARTIAL();
2688 ph10 510 MRRETURN(MATCH_NOMATCH);
2689 ph10 427 }
2690 nigel 77 c = *eptr++;
2691 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2692 nigel 77 }
2693     }
2694     /* Control never gets here */
2695     }
2696    
2697     /* If maximizing, find the longest possible run, then work backwards. */
2698    
2699     else
2700     {
2701     pp = eptr;
2702    
2703     #ifdef SUPPORT_UTF8
2704     /* UTF-8 mode */
2705     if (utf8)
2706     {
2707     for (i = min; i < max; i++)
2708     {
2709     int len = 1;
2710 ph10 463 if (eptr >= md->end_subject)
2711 ph10 462 {
2712 ph10 463 SCHECK_PARTIAL();
2713 ph10 462 break;
2714 ph10 463 }
2715 nigel 77 GETCHARLEN(c, eptr, len);
2716     if (c > 255)
2717     {
2718     if (op == OP_CLASS) break;
2719     }
2720     else
2721     {
2722     if ((data[c/8] & (1 << (c&7))) == 0) break;
2723     }
2724     eptr += len;
2725     }
2726     for (;;)
2727     {
2728 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2729 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2730     if (eptr-- == pp) break; /* Stop if tried at original pos */
2731     BACKCHAR(eptr);
2732     }
2733     }
2734     else
2735     #endif
2736     /* Not UTF-8 mode */
2737     {
2738     for (i = min; i < max; i++)
2739     {
2740 ph10 463 if (eptr >= md->end_subject)
2741 ph10 462 {
2742 ph10 463 SCHECK_PARTIAL();
2743 ph10 462 break;
2744 ph10 463 }
2745 nigel 77 c = *eptr;
2746     if ((data[c/8] & (1 << (c&7))) == 0) break;
2747     eptr++;
2748     }
2749     while (eptr >= pp)
2750     {
2751 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2752 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2753 nigel 77 eptr--;
2754     }
2755     }
2756    
2757 ph10 510 MRRETURN(MATCH_NOMATCH);
2758 nigel 77 }
2759     }
2760     /* Control never gets here */
2761    
2762    
2763     /* Match an extended character class. This opcode is encountered only
2764 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2765     mode, because Unicode properties are supported in non-UTF-8 mode. */
2766 nigel 77
2767     #ifdef SUPPORT_UTF8
2768     case OP_XCLASS:
2769     {
2770     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2771     ecode += GET(ecode, 1); /* Advance past the item */
2772    
2773     switch (*ecode)
2774     {
2775     case OP_CRSTAR:
2776     case OP_CRMINSTAR:
2777     case OP_CRPLUS:
2778     case OP_CRMINPLUS:
2779     case OP_CRQUERY:
2780     case OP_CRMINQUERY:
2781     c = *ecode++ - OP_CRSTAR;
2782     minimize = (c & 1) != 0;
2783     min = rep_min[c]; /* Pick up values from tables; */
2784     max = rep_max[c]; /* zero for max => infinity */
2785     if (max == 0) max = INT_MAX;
2786     break;
2787    
2788     case OP_CRRANGE:
2789     case OP_CRMINRANGE:
2790     minimize = (*ecode == OP_CRMINRANGE);
2791     min = GET2(ecode, 1);
2792     max = GET2(ecode, 3);
2793     if (max == 0) max = INT_MAX;
2794     ecode += 5;
2795     break;
2796    
2797     default: /* No repeat follows */
2798     min = max = 1;
2799     break;
2800     }
2801    
2802     /* First, ensure the minimum number of matches are present. */
2803    
2804     for (i = 1; i <= min; i++)
2805     {
2806 ph10 427 if (eptr >= md->end_subject)
2807 ph10 426 {
2808     SCHECK_PARTIAL();
2809 ph10 510 MRRETURN(MATCH_NOMATCH);
2810 ph10 427 }
2811 ph10 384 GETCHARINCTEST(c, eptr);
2812 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2813 nigel 77 }
2814    
2815     /* If max == min we can continue with the main loop without the
2816     need to recurse. */
2817    
2818     if (min == max) continue;
2819    
2820     /* If minimizing, keep testing the rest of the expression and advancing
2821     the pointer while it matches the class. */
2822    
2823     if (minimize)
2824     {
2825     for (fi = min;; fi++)
2826     {
2827 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2828 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2829 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2830 ph10 427 if (eptr >= md->end_subject)
2831 ph10 426 {
2832 ph10 427 SCHECK_PARTIAL();
2833 ph10 510 MRRETURN(MATCH_NOMATCH);
2834 ph10 427 }
2835 ph10 384 GETCHARINCTEST(c, eptr);
2836 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2837 nigel 77 }
2838     /* Control never gets here */
2839     }
2840    
2841     /* If maximizing, find the longest possible run, then work backwards. */
2842    
2843     else
2844     {
2845     pp = eptr;
2846     for (i = min; i < max; i++)
2847     {
2848     int len = 1;
2849 ph10 463 if (eptr >= md->end_subject)
2850 ph10 462 {
2851 ph10 463 SCHECK_PARTIAL();
2852 ph10 462 break;
2853 ph10 463 }
2854 ph10 384 GETCHARLENTEST(c, eptr, len);
2855 nigel 77 if (!_pcre_xclass(c, data)) break;
2856     eptr += len;
2857     }
2858     for(;;)
2859     {
2860 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2861 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2862     if (eptr-- == pp) break; /* Stop if tried at original pos */
2863 ph10 214 if (utf8) BACKCHAR(eptr);
2864 nigel 77 }
2865 ph10 510 MRRETURN(MATCH_NOMATCH);
2866 nigel 77 }
2867    
2868     /* Control never gets here */
2869     }
2870     #endif /* End of XCLASS */
2871    
2872     /* Match a single character, casefully */
2873    
2874     case OP_CHAR:
2875     #ifdef SUPPORT_UTF8
2876     if (utf8)
2877     {
2878     length = 1;
2879     ecode++;
2880     GETCHARLEN(fc, ecode, length);
2881 ph10 443 if (length > md->end_subject - eptr)
2882 ph10 428 {
2883     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2884 ph10 510 MRRETURN(MATCH_NOMATCH);
2885 ph10 443 }
2886 ph10 510 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2887 nigel 77 }
2888     else
2889     #endif
2890    
2891     /* Non-UTF-8 mode */
2892     {
2893 ph10 443 if (md->end_subject - eptr < 1)
2894 ph10 428 {
2895     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2896 ph10 510 MRRETURN(MATCH_NOMATCH);
2897 ph10 443 }
2898 ph10 510 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2899 nigel 77 ecode += 2;
2900     }
2901     break;
2902    
2903     /* Match a single character, caselessly */
2904    
2905 ph10 602 case OP_CHARI:
2906 nigel 77 #ifdef SUPPORT_UTF8
2907     if (utf8)
2908     {
2909     length = 1;
2910     ecode++;
2911     GETCHARLEN(fc, ecode, length);
2912    
2913 ph10 443 if (length > md->end_subject - eptr)
2914 ph10 428 {
2915     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2916 ph10 510 MRRETURN(MATCH_NOMATCH);
2917 ph10 443 }
2918 nigel 77
2919     /* If the pattern character's value is < 128, we have only one byte, and
2920     can use the fast lookup table. */
2921    
2922     if (fc < 128)
2923     {
2924 ph10 510 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2925 nigel 77 }
2926    
2927     /* Otherwise we must pick up the subject character */
2928    
2929     else
2930     {
2931 nigel 93 unsigned int dc;
2932 nigel 77 GETCHARINC(dc, eptr);
2933     ecode += length;
2934    
2935     /* If we have Unicode property support, we can use it to test the other
2936 nigel 87 case of the character, if there is one. */
2937 nigel 77
2938     if (fc != dc)
2939     {
2940     #ifdef SUPPORT_UCP
2941 ph10 349 if (dc != UCD_OTHERCASE(fc))
2942 nigel 77 #endif
2943 ph10 510 MRRETURN(MATCH_NOMATCH);
2944 nigel 77 }
2945     }
2946     }
2947     else
2948     #endif /* SUPPORT_UTF8 */
2949    
2950     /* Non-UTF-8 mode */
2951     {
2952 ph10 443 if (md->end_subject - eptr < 1)
2953 ph10 428 {
2954 ph10 443 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2955 ph10 510 MRRETURN(MATCH_NOMATCH);
2956 ph10 443 }
2957 ph10 510 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2958 nigel 77 ecode += 2;
2959     }
2960     break;
2961    
2962 nigel 93 /* Match a single character repeatedly. */
2963 nigel 77
2964     case OP_EXACT:
2965 ph10 602 case OP_EXACTI:
2966 nigel 77 min = max = GET2(ecode, 1);
2967     ecode += 3;
2968     goto REPEATCHAR;
2969    
2970 nigel 93 case OP_POSUPTO:
2971 ph10 602 case OP_POSUPTOI:
2972 nigel 93 possessive = TRUE;
2973     /* Fall through */
2974    
2975 nigel 77 case OP_UPTO:
2976 ph10 602 case OP_UPTOI:
2977 nigel 77 case OP_MINUPTO:
2978 ph10 602 case OP_MINUPTOI:
2979 nigel 77 min = 0;
2980     max = GET2(ecode, 1);
2981 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2982 nigel 77 ecode += 3;
2983     goto REPEATCHAR;
2984    
2985 nigel 93 case OP_POSSTAR:
2986 ph10 602 case OP_POSSTARI:
2987 nigel 93 possessive = TRUE;
2988     min = 0;
2989     max = INT_MAX;
2990     ecode++;
2991     goto REPEATCHAR;
2992    
2993     case OP_POSPLUS:
2994 ph10 602 case OP_POSPLUSI:
2995 nigel 93 possessive = TRUE;
2996     min = 1;
2997     max = INT_MAX;
2998     ecode++;
2999     goto REPEATCHAR;
3000    
3001     case OP_POSQUERY:
3002 ph10 602 case OP_POSQUERYI:
3003 nigel 93 possessive = TRUE;
3004     min = 0;
3005     max = 1;
3006     ecode++;
3007     goto REPEATCHAR;
3008    
3009 nigel 77 case OP_STAR:
3010 ph10 602 case OP_STARI:
3011 nigel 77 case OP_MINSTAR:
3012 ph10 602 case OP_MINSTARI:
3013 nigel 77 case OP_PLUS:
3014 ph10 602 case OP_PLUSI:
3015 nigel 77 case OP_MINPLUS:
3016 ph10 602 case OP_MINPLUSI:
3017 nigel 77 case OP_QUERY:
3018 ph10 602 case OP_QUERYI:
3019 nigel 77 case OP_MINQUERY:
3020 ph10 602 case OP_MINQUERYI:
3021     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3022 nigel 77 minimize = (c & 1) != 0;
3023     min = rep_min[c]; /* Pick up values from tables; */
3024     max = rep_max[c]; /* zero for max => infinity */
3025     if (max == 0) max = INT_MAX;
3026    
3027 ph10 426 /* Common code for all repeated single-character matches. */
3028 nigel 77
3029     REPEATCHAR:
3030     #ifdef SUPPORT_UTF8
3031     if (utf8)
3032     {
3033     length = 1;
3034     charptr = ecode;
3035     GETCHARLEN(fc, ecode, length);
3036     ecode += length;
3037    
3038     /* Handle multibyte character matching specially here. There is
3039     support for caseless matching if UCP support is present. */
3040    
3041     if (length > 1)
3042     {
3043     #ifdef SUPPORT_UCP
3044 nigel 93 unsigned int othercase;
3045 ph10 602 if (op >= OP_STARI && /* Caseless */
3046 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
3047 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
3048 ph10 115 else oclength = 0;
3049 nigel 77 #endif /* SUPPORT_UCP */
3050    
3051     for (i = 1; i <= min; i++)
3052     {
3053 ph10 426 if (eptr <= md->end_subject - length &&
3054     memcmp(eptr, charptr, length) == 0) eptr += length;
3055 ph10 123 #ifdef SUPPORT_UCP
3056 ph10 426 else if (oclength > 0 &&
3057     eptr <= md->end_subject - oclength &&
3058     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3059     #endif /* SUPPORT_UCP */
3060 nigel 77 else
3061     {
3062 ph10 426 CHECK_PARTIAL();
3063 ph10 510 MRRETURN(MATCH_NOMATCH);
3064 nigel 77 }
3065     }
3066    
3067     if (min == max) continue;
3068    
3069     if (minimize)
3070     {
3071     for (fi = min;; fi++)
3072     {
3073 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3074 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3075 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3076 ph10 426 if (eptr <= md->end_subject - length &&
3077     memcmp(eptr, charptr, length) == 0) eptr += length;
3078 ph10 123 #ifdef SUPPORT_UCP
3079 ph10 426 else if (oclength > 0 &&
3080     eptr <= md->end_subject - oclength &&
3081     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3082     #endif /* SUPPORT_UCP */
3083 nigel 77 else
3084     {
3085 ph10 426 CHECK_PARTIAL();
3086 ph10 510 MRRETURN(MATCH_NOMATCH);
3087 nigel 77 }
3088     }
3089     /* Control never gets here */
3090     }
3091 nigel 93
3092     else /* Maximize */
3093 nigel 77 {
3094     pp = eptr;
3095     for (i = min; i < max; i++)
3096     {
3097 ph10 426 if (eptr <= md->end_subject - length &&
3098     memcmp(eptr, charptr, length) == 0) eptr += length;
3099 ph10 123 #ifdef SUPPORT_UCP
3100 ph10 426 else if (oclength > 0 &&
3101     eptr <= md->end_subject - oclength &&
3102     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3103     #endif /* SUPPORT_UCP */
3104 ph10 463 else
3105 ph10 462 {
3106 ph10 463 CHECK_PARTIAL();
3107 ph10 462 break;
3108 ph10 463 }
3109 nigel 77 }
3110 nigel 93
3111     if (possessive) continue;
3112 ph10 427
3113 ph10 120 for(;;)
3114 ph10 426 {
3115 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3116 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3117 ph10 510 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3118 ph10 115 #ifdef SUPPORT_UCP
3119 ph10 426 eptr--;
3120     BACKCHAR(eptr);
3121 ph10 123 #else /* without SUPPORT_UCP */
3122 ph10 426 eptr -= length;
3123 ph10 123 #endif /* SUPPORT_UCP */
3124 ph10 426 }
3125 nigel 77 }
3126     /* Control never gets here */
3127     }
3128    
3129     /* If the length of a UTF-8 character is 1, we fall through here, and
3130     obey the code as for non-UTF-8 characters below, though in this case the
3131     value of fc will always be < 128. */
3132     }
3133     else
3134     #endif /* SUPPORT_UTF8 */
3135    
3136     /* When not in UTF-8 mode, load a single-byte character. */
3137    
3138 ph10 426 fc = *ecode++;
3139 ph10 443
3140 nigel 77 /* The value of fc at this point is always less than 256, though we may or
3141     may not be in UTF-8 mode. The code is duplicated for the caseless and
3142     caseful cases, for speed, since matching characters is likely to be quite
3143     common. First, ensure the minimum number of matches are present. If min =
3144     max, continue at the same level without recursing. Otherwise, if
3145     minimizing, keep trying the rest of the expression and advancing one
3146     matching character if failing, up to the maximum. Alternatively, if
3147     maximizing, find the maximum number of characters and work backwards. */
3148    
3149     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3150     max, eptr));
3151    
3152 ph10 602 if (op >= OP_STARI) /* Caseless */
3153 nigel 77 {
3154     fc = md->lcc[fc];
3155     for (i = 1; i <= min; i++)
3156 ph10 426 {
3157     if (eptr >= md->end_subject)
3158     {
3159     SCHECK_PARTIAL();
3160 ph10 510 MRRETURN(MATCH_NOMATCH);
3161 ph10 426 }
3162 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3163 ph10 426 }
3164 nigel 77 if (min == max) continue;
3165     if (minimize)
3166     {
3167     for (fi = min;; fi++)
3168     {
3169 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3170 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3171 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3172 ph10 426 if (eptr >= md->end_subject)
3173     {
3174 ph10 427 SCHECK_PARTIAL();
3175 ph10 510 MRRETURN(MATCH_NOMATCH);
3176 ph10 426 }
3177 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3178 nigel 77 }
3179     /* Control never gets here */
3180     }
3181 nigel 93 else /* Maximize */
3182 nigel 77 {
3183     pp = eptr;
3184     for (i = min; i < max; i++)
3185     {
3186 ph10 463 if (eptr >= md->end_subject)
3187 ph10 462 {
3188     SCHECK_PARTIAL();
3189     break;
3190 ph10 463 }
3191 ph10 462 if (fc != md->lcc[*eptr]) break;
3192 nigel 77 eptr++;
3193     }
3194 ph10 427
3195 nigel 93 if (possessive) continue;
3196 ph10 427
3197 nigel 77 while (eptr >= pp)
3198     {
3199 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3200 nigel 77 eptr--;
3201     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3202     }
3203 ph10 510 MRRETURN(MATCH_NOMATCH);
3204 nigel 77 }
3205     /* Control never gets here */
3206     }
3207    
3208     /* Caseful comparisons (includes all multi-byte characters) */
3209    
3210     else
3211     {
3212 ph10 427 for (i = 1; i <= min; i++)
3213 ph10 426 {
3214     if (eptr >= md->end_subject)
3215     {
3216     SCHECK_PARTIAL();
3217 ph10 510 MRRETURN(MATCH_NOMATCH);
3218 ph10 426 }
3219 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3220 ph10 427 }
3221 ph10 443
3222 nigel 77 if (min == max) continue;
3223 ph10 443
3224 nigel 77 if (minimize)
3225     {
3226     for (fi = min;; fi++)
3227     {
3228 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3229 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3230 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3231 ph10 426 if (eptr >= md->end_subject)
3232 ph10 427 {
3233 ph10 426 SCHECK_PARTIAL();
3234 ph10 510 MRRETURN(MATCH_NOMATCH);
3235 ph10 427 }
3236 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3237 nigel 77 }
3238     /* Control never gets here */
3239     }
3240 nigel 93 else /* Maximize */
3241 nigel 77 {
3242     pp = eptr;
3243     for (i = min; i < max; i++)
3244     {
3245 ph10 463 if (eptr >= md->end_subject)
3246 ph10 462 {
3247 ph10 463 SCHECK_PARTIAL();
3248 ph10 462 break;
3249 ph10 463 }
3250 ph10 462 if (fc != *eptr) break;
3251 nigel 77 eptr++;
3252     }
3253 nigel 93 if (possessive) continue;
3254 ph10 443
3255 nigel 77 while (eptr >= pp)
3256     {
3257 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3258 nigel 77 eptr--;
3259     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3260     }
3261 ph10 510 MRRETURN(MATCH_NOMATCH);
3262 nigel 77 }
3263     }
3264     /* Control never gets here */
3265    
3266     /* Match a negated single one-byte character. The character we are
3267     checking can be multibyte. */
3268    
3269     case OP_NOT:
3270 ph10 602 case OP_NOTI:
3271 ph10 443 if (eptr >= md->end_subject)
3272 ph10 428 {
3273 ph10 443 SCHECK_PARTIAL();
3274 ph10 510 MRRETURN(MATCH_NOMATCH);
3275 ph10 443 }
3276 nigel 77 ecode++;
3277     GETCHARINCTEST(c, eptr);
3278 ph10 602 if (op == OP_NOTI) /* The caseless case */
3279 nigel 77 {
3280     #ifdef SUPPORT_UTF8
3281     if (c < 256)
3282     #endif
3283     c = md->lcc[c];
3284 ph10 510 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3285 nigel 77 }
3286 ph10 602 else /* Caseful */
3287 nigel 77 {
3288 ph10 510 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3289 nigel 77 }
3290     break;
3291    
3292     /* Match a negated single one-byte character repeatedly. This is almost a
3293     repeat of the code for a repeated single character, but I haven't found a
3294     nice way of commoning these up that doesn't require a test of the
3295     positive/negative option for each character match. Maybe that wouldn't add
3296     very much to the time taken, but character matching *is* what this is all
3297     about... */
3298    
3299     case OP_NOTEXACT:
3300 ph10 602 case OP_NOTEXACTI:
3301 nigel 77 min = max = GET2(ecode, 1);
3302     ecode += 3;
3303     goto REPEATNOTCHAR;
3304    
3305     case OP_NOTUPTO:
3306 ph10 602 case OP_NOTUPTOI:
3307 nigel 77 case OP_NOTMINUPTO:
3308 ph10 602 case OP_NOTMINUPTOI:
3309 nigel 77 min = 0;
3310     max = GET2(ecode, 1);
3311 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3312 nigel 77 ecode += 3;
3313     goto REPEATNOTCHAR;
3314    
3315 nigel 93 case OP_NOTPOSSTAR:
3316 ph10 602 case OP_NOTPOSSTARI:
3317 nigel 93 possessive = TRUE;
3318     min = 0;
3319     max = INT_MAX;
3320     ecode++;
3321     goto REPEATNOTCHAR;
3322    
3323     case OP_NOTPOSPLUS:
3324 ph10 602 case OP_NOTPOSPLUSI:
3325 nigel 93 possessive = TRUE;
3326     min = 1;
3327     max = INT_MAX;
3328     ecode++;
3329     goto REPEATNOTCHAR;
3330    
3331     case OP_NOTPOSQUERY:
3332 ph10 602 case OP_NOTPOSQUERYI:
3333 nigel 93 possessive = TRUE;
3334     min = 0;
3335     max = 1;
3336     ecode++;
3337     goto REPEATNOTCHAR;
3338    
3339     case OP_NOTPOSUPTO:
3340 ph10 602 case OP_NOTPOSUPTOI:
3341 nigel 93 possessive = TRUE;
3342     min = 0;
3343     max = GET2(ecode, 1);
3344     ecode += 3;
3345     goto REPEATNOTCHAR;
3346    
3347 nigel 77 case OP_NOTSTAR:
3348 ph10 602 case OP_NOTSTARI:
3349 nigel 77 case OP_NOTMINSTAR:
3350 ph10 602 case OP_NOTMINSTARI:
3351 nigel 77 case OP_NOTPLUS:
3352 ph10 602 case OP_NOTPLUSI:
3353 nigel 77 case OP_NOTMINPLUS:
3354 ph10 602 case OP_NOTMINPLUSI:
3355 nigel 77 case OP_NOTQUERY:
3356 ph10 602 case OP_NOTQUERYI:
3357 nigel 77 case OP_NOTMINQUERY:
3358 ph10 602 case OP_NOTMINQUERYI:
3359     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3360 nigel 77 minimize = (c & 1) != 0;
3361     min = rep_min[c]; /* Pick up values from tables; */
3362     max = rep_max[c]; /* zero for max => infinity */
3363     if (max == 0) max = INT_MAX;
3364    
3365 ph10 426 /* Common code for all repeated single-byte matches. */
3366 nigel 77
3367     REPEATNOTCHAR:
3368     fc = *ecode++;
3369    
3370     /* The code is duplicated for the caseless and caseful cases, for speed,
3371     since matching characters is likely to be quite common. First, ensure the
3372     minimum number of matches are present. If min = max, continue at the same
3373     level without recursing. Otherwise, if minimizing, keep trying the rest of
3374     the expression and advancing one matching character if failing, up to the
3375     maximum. Alternatively, if maximizing, find the maximum number of
3376     characters and work backwards. */
3377    
3378     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3379     max, eptr));
3380    
3381 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3382 nigel 77 {
3383     fc = md->lcc[fc];
3384    
3385     #ifdef SUPPORT_UTF8
3386     /* UTF-8 mode */
3387     if (utf8)
3388     {
3389 nigel 93 register unsigned int d;
3390 nigel 77 for (i = 1; i <= min; i++)
3391     {
3392 ph10 426 if (eptr >= md->end_subject)
3393     {
3394     SCHECK_PARTIAL();
3395 ph10 510 MRRETURN(MATCH_NOMATCH);
3396 ph10 427 }
3397 nigel 77 GETCHARINC(d, eptr);
3398     if (d < 256) d = md->lcc[d];
3399 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3400 nigel 77 }
3401     }
3402     else
3403     #endif
3404    
3405     /* Not UTF-8 mode */
3406     {
3407     for (i = 1; i <= min; i++)
3408 ph10 426 {
3409     if (eptr >= md->end_subject)
3410     {
3411     SCHECK_PARTIAL();
3412 ph10 510 MRRETURN(MATCH_NOMATCH);
3413 ph10 427 }
3414 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3415 ph10 427 }
3416 nigel 77 }
3417    
3418     if (min == max) continue;
3419    
3420     if (minimize)
3421     {
3422     #ifdef SUPPORT_UTF8
3423     /* UTF-8 mode */
3424     if (utf8)
3425     {
3426 nigel 93 register unsigned int d;
3427 nigel 77 for (fi = min;; fi++)
3428     {
3429 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3430 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3431 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3432 ph10 427 if (eptr >= md->end_subject)
3433 ph10 426 {
3434 ph10 427 SCHECK_PARTIAL();
3435 ph10 510 MRRETURN(MATCH_NOMATCH);
3436 ph10 427 }
3437 nigel 77 GETCHARINC(d, eptr);
3438     if (d < 256) d = md->lcc[d];
3439 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3440 nigel 77 }
3441     }
3442     else
3443     #endif
3444     /* Not UTF-8 mode */
3445     {
3446     for (fi = min;; fi++)
3447     {
3448 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3449 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3450 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3451 ph10 426 if (eptr >= md->end_subject)
3452     {
3453     SCHECK_PARTIAL();
3454 ph10 510 MRRETURN(MATCH_NOMATCH);
3455 ph10 426 }
3456 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3457 nigel 77 }
3458     }
3459     /* Control never gets here */
3460     }
3461    
3462     /* Maximize case */
3463    
3464     else
3465     {
3466     pp = eptr;
3467    
3468     #ifdef SUPPORT_UTF8
3469     /* UTF-8 mode */
3470     if (utf8)
3471     {
3472 nigel 93 register unsigned int d;
3473 nigel 77 for (i = min; i < max; i++)
3474     {
3475     int len = 1;
3476 ph10 463 if (eptr >= md->end_subject)
3477 ph10 462 {
3478 ph10 463 SCHECK_PARTIAL();
3479 ph10 462 break;
3480 ph10 463 }
3481 nigel 77 GETCHARLEN(d, eptr, len);
3482     if (d < 256) d = md->lcc[d];
3483     if (fc == d) break;
3484     eptr += len;
3485     }
3486 nigel 93 if (possessive) continue;
3487     for(;;)
3488 nigel 77 {
3489 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3490 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3491     if (eptr-- == pp) break; /* Stop if tried at original pos */
3492     BACKCHAR(eptr);
3493     }
3494     }
3495     else
3496     #endif
3497     /* Not UTF-8 mode */
3498     {
3499     for (i = min; i < max; i++)
3500     {
3501 ph10 463 if (eptr >= md->end_subject)
3502 ph10 462 {
3503     SCHECK_PARTIAL();
3504     break;
3505 ph10 463 }
3506 ph10 462 if (fc == md->lcc[*eptr]) break;
3507 nigel 77 eptr++;
3508     }
3509 nigel 93 if (possessive) continue;
3510 nigel 77 while (eptr >= pp)
3511     {
3512 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3513 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3514     eptr--;
3515     }
3516     }
3517    
3518 ph10 510 MRRETURN(MATCH_NOMATCH);
3519 nigel 77 }
3520     /* Control never gets here */
3521     }
3522    
3523     /* Caseful comparisons */
3524    
3525     else
3526     {
3527     #ifdef SUPPORT_UTF8
3528     /* UTF-8 mode */
3529     if (utf8)
3530     {
3531 nigel 93 register unsigned int d;
3532 nigel 77 for (i = 1; i <= min; i++)
3533     {
3534 ph10 426 if (eptr >= md->end_subject)
3535     {
3536     SCHECK_PARTIAL();
3537 ph10 510 MRRETURN(MATCH_NOMATCH);
3538 ph10 427 }
3539 nigel 77 GETCHARINC(d, eptr);
3540 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3541 nigel 77 }
3542     }
3543     else
3544     #endif
3545     /* Not UTF-8 mode */
3546     {
3547     for (i = 1; i <= min; i++)
3548 ph10 426 {
3549     if (eptr >= md->end_subject)
3550     {
3551     SCHECK_PARTIAL();
3552 ph10 510 MRRETURN(MATCH_NOMATCH);
3553 ph10 427 }
3554 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3555 ph10 427 }
3556 nigel 77 }
3557    
3558     if (min == max) continue;
3559    
3560     if (minimize)
3561     {
3562     #ifdef SUPPORT_UTF8
3563     /* UTF-8 mode */
3564     if (utf8)
3565     {
3566 nigel 93 register unsigned int d;
3567 nigel 77 for (fi = min;; fi++)
3568     {
3569 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3570 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3571 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3572 ph10 427 if (eptr >= md->end_subject)
3573 ph10 426 {
3574 ph10 427 SCHECK_PARTIAL();
3575 ph10 510 MRRETURN(MATCH_NOMATCH);
3576 ph10 427 }
3577 nigel 77 GETCHARINC(d, eptr);
3578 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3579 nigel 77 }
3580     }
3581     else
3582     #endif
3583     /* Not UTF-8 mode */
3584     {
3585     for (fi = min;; fi++)
3586     {
3587 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3588 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3589 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3590 ph10 426 if (eptr >= md->end_subject)
3591     {
3592     SCHECK_PARTIAL();
3593 ph10 510 MRRETURN(MATCH_NOMATCH);
3594 ph10 427 }
3595 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3596 nigel 77 }
3597     }
3598     /* Control never gets here */
3599     }
3600    
3601     /* Maximize case */
3602    
3603     else
3604     {
3605     pp = eptr;
3606    
3607     #ifdef SUPPORT_UTF8
3608     /* UTF-8 mode */
3609     if (utf8)
3610     {
3611 nigel 93 register unsigned int d;
3612 nigel 77 for (i = min; i < max; i++)
3613     {
3614     int len = 1;
3615 ph10 463 if (eptr >= md->end_subject)
3616 ph10 462 {
3617 ph10 463 SCHECK_PARTIAL();
3618 ph10 462 break;
3619 ph10 463 }
3620 nigel 77 GETCHARLEN(d, eptr, len);
3621     if (fc == d) break;
3622     eptr += len;
3623     }
3624 nigel 93 if (possessive) continue;
3625 nigel 77 for(;;)
3626     {
3627 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3628 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3629     if (eptr-- == pp) break; /* Stop if tried at original pos */
3630     BACKCHAR(eptr);
3631     }
3632     }
3633     else
3634     #endif
3635     /* Not UTF-8 mode */
3636     {
3637     for (i = min; i < max; i++)
3638     {
3639 ph10 463 if (eptr >= md->end_subject)
3640 ph10 462 {
3641 ph10 463 SCHECK_PARTIAL();
3642 ph10 462 break;
3643 ph10 463 }
3644 ph10 462 if (fc == *eptr) break;
3645 nigel 77 eptr++;
3646     }
3647 nigel 93 if (possessive) continue;
3648 nigel 77 while (eptr >= pp)
3649     {
3650 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3651 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3652     eptr--;
3653     }
3654     }
3655    
3656 ph10 510 MRRETURN(MATCH_NOMATCH);
3657 nigel 77 }
3658     }
3659     /* Control never gets here */
3660    
3661     /* Match a single character type repeatedly; several different opcodes
3662     share code. This is very similar to the code for single characters, but we
3663     repeat it in the interests of efficiency. */
3664    
3665     case OP_TYPEEXACT:
3666     min = max = GET2(ecode, 1);
3667     minimize = TRUE;
3668     ecode += 3;
3669     goto REPEATTYPE;
3670    
3671     case OP_TYPEUPTO:
3672     case OP_TYPEMINUPTO:
3673     min = 0;
3674     max = GET2(ecode, 1);
3675     minimize = *ecode == OP_TYPEMINUPTO;
3676     ecode += 3;
3677     goto REPEATTYPE;
3678    
3679 nigel 93 case OP_TYPEPOSSTAR:
3680     possessive = TRUE;
3681     min = 0;
3682     max = INT_MAX;
3683     ecode++;
3684     goto REPEATTYPE;
3685    
3686     case OP_TYPEPOSPLUS:
3687     possessive = TRUE;
3688     min = 1;
3689     max = INT_MAX;
3690     ecode++;
3691     goto REPEATTYPE;
3692    
3693     case OP_TYPEPOSQUERY:
3694     possessive = TRUE;
3695     min = 0;
3696     max = 1;
3697     ecode++;
3698     goto REPEATTYPE;
3699    
3700     case OP_TYPEPOSUPTO:
3701     possessive = TRUE;
3702     min = 0;
3703     max = GET2(ecode, 1);
3704     ecode += 3;
3705     goto REPEATTYPE;
3706    
3707 nigel 77 case OP_TYPESTAR:
3708     case OP_TYPEMINSTAR:
3709     case OP_TYPEPLUS:
3710     case OP_TYPEMINPLUS:
3711     case OP_TYPEQUERY:
3712     case OP_TYPEMINQUERY:
3713     c = *ecode++ - OP_TYPESTAR;
3714     minimize = (c & 1) != 0;
3715     min = rep_min[c]; /* Pick up values from tables; */
3716     max = rep_max[c]; /* zero for max => infinity */
3717     if (max == 0) max = INT_MAX;
3718    
3719     /* Common code for all repeated single character type matches. Note that
3720     in UTF-8 mode, '.' matches a character of any length, but for the other
3721     character types, the valid characters are all one-byte long. */
3722    
3723     REPEATTYPE:
3724     ctype = *ecode++; /* Code for the character type */
3725    
3726     #ifdef SUPPORT_UCP
3727     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3728     {
3729     prop_fail_result = ctype == OP_NOTPROP;
3730     prop_type = *ecode++;
3731 nigel 87 prop_value = *ecode++;
3732 nigel 77 }
3733     else prop_type = -1;
3734     #endif
3735    
3736     /* First, ensure the minimum number of matches are present. Use inline
3737     code for maximizing the speed, and do the type test once at the start
3738 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3739 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3740     and single-bytes. */
3741    
3742     if (min > 0)
3743     {
3744     #ifdef SUPPORT_UCP
3745 nigel 87 if (prop_type >= 0)
3746 nigel 77 {
3747 nigel 87 switch(prop_type)
3748 nigel 77 {
3749 nigel 87 case PT_ANY:
3750 ph10 510 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3751 nigel 87 for (i = 1; i <= min; i++)
3752     {
3753 ph10 427 if (eptr >= md->end_subject)
3754 ph10 426 {
3755 ph10 427 SCHECK_PARTIAL();
3756 ph10 510 MRRETURN(MATCH_NOMATCH);
3757 ph10 427 }
3758 ph10 184 GETCHARINCTEST(c, eptr);
3759 nigel 87 }
3760     break;
3761    
3762     case PT_LAMP:
3763     for (i = 1; i <= min; i++)
3764     {
3765 ph10 427 if (eptr >= md->end_subject)
3766 ph10 426 {
3767 ph10 427 SCHECK_PARTIAL();
3768 ph10 510 MRRETURN(MATCH_NOMATCH);
3769 ph10 427 }
3770 ph10 184 GETCHARINCTEST(c, eptr);
3771 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3772 nigel 87 if ((prop_chartype == ucp_Lu ||
3773     prop_chartype == ucp_Ll ||
3774     prop_chartype == ucp_Lt) == prop_fail_result)
3775 ph10 510 MRRETURN(MATCH_NOMATCH);
3776 nigel 87 }
3777     break;
3778    
3779     case PT_GC:
3780     for (i = 1; i <= min; i++)
3781     {
3782 ph10 427 if (eptr >= md->end_subject)
3783 ph10 426 {
3784 ph10 427 SCHECK_PARTIAL();
3785 ph10 510 MRRETURN(MATCH_NOMATCH);
3786 ph10 427 }
3787 ph10 184 GETCHARINCTEST(c, eptr);
3788 ph10 349 prop_category = UCD_CATEGORY(c);
3789 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3790 ph10 510 MRRETURN(MATCH_NOMATCH);
3791 nigel 87 }
3792     break;
3793    
3794     case PT_PC:
3795     for (i = 1; i <= min; i++)
3796     {
3797 ph10 427 if (eptr >= md->end_subject)
3798 ph10 426 {
3799 ph10 427 SCHECK_PARTIAL();
3800 ph10 510 MRRETURN(MATCH_NOMATCH);
3801 ph10 427 }
3802 ph10 184 GETCHARINCTEST(c, eptr);
3803 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3804 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3805 ph10 510 MRRETURN(MATCH_NOMATCH);
3806 nigel 87 }
3807     break;
3808    
3809     case PT_SC:
3810     for (i = 1; i <= min; i++)
3811     {
3812 ph10 427 if (eptr >= md->end_subject)
3813 ph10 426 {
3814 ph10 427 SCHECK_PARTIAL();
3815 ph10 510 MRRETURN(MATCH_NOMATCH);
3816 ph10 427 }
3817 ph10 184 GETCHARINCTEST(c, eptr);
3818 ph10 349 prop_script = UCD_SCRIPT(c);
3819 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3820 ph10 510 MRRETURN(MATCH_NOMATCH);
3821 nigel 87 }
3822     break;
3823 ph10 527
3824 ph10 517 case PT_ALNUM:
3825     for (i = 1; i <= min; i++)
3826     {
3827     if (eptr >= md->end_subject)
3828     {
3829     SCHECK_PARTIAL();
3830     MRRETURN(MATCH_NOMATCH);
3831     }
3832     GETCHARINCTEST(c, eptr);
3833 ph10 527 prop_category = UCD_CATEGORY(c);
3834     if ((prop_category == ucp_L || prop_category == ucp_N)
3835 ph10 517 == prop_fail_result)
3836     MRRETURN(MATCH_NOMATCH);
3837     }
3838     break;
3839 ph10 527
3840 ph10 517 case PT_SPACE: /* Perl space */
3841     for (i = 1; i <= min; i++)
3842     {
3843     if (eptr >= md->end_subject)
3844     {
3845     SCHECK_PARTIAL();
3846     MRRETURN(MATCH_NOMATCH);
3847     }
3848     GETCHARINCTEST(c, eptr);
3849 ph10 527 prop_category = UCD_CATEGORY(c);
3850     if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3851     c == CHAR_FF || c == CHAR_CR)
3852 ph10 517 == prop_fail_result)
3853     MRRETURN(MATCH_NOMATCH);
3854     }
3855     break;
3856 ph10 527
3857 ph10 517 case PT_PXSPACE: /* POSIX space */
3858     for (i = 1; i <= min; i++)
3859     {
3860     if (eptr >= md->end_subject)
3861     {
3862     SCHECK_PARTIAL();
3863     MRRETURN(MATCH_NOMATCH);
3864     }
3865     GETCHARINCTEST(c, eptr);
3866 ph10 527 prop_category = UCD_CATEGORY(c);
3867     if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3868     c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3869 ph10 517 == prop_fail_result)
3870     MRRETURN(MATCH_NOMATCH);
3871     }
3872     break;
3873 ph10 527
3874     case PT_WORD:
3875 ph10 517 for (i = 1; i <= min; i++)
3876     {
3877     if (eptr >= md->end_subject)
3878