/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 606 - (hide annotations) (download)
Mon Jun 6 17:46:22 2011 UTC (2 years ago) by ph10
File MIME type: text/plain
File size: 194674 byte(s)
Tidy the API for _pcre_valid_utf8() to a more suitable form for a future public 
release. Also make -s in pcretest force a study for every regex.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 ph10 604 /* Values for setting in md->match_function_type to indicate two special types
61     of call to match(). We do it this way to save on using another stack variable,
62     as stack usage is to be discouraged. */
63 nigel 77
64 ph10 604 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65     #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
74 ph10 210 negative to avoid the external error codes. */
75    
76 ph10 511 #define MATCH_ACCEPT (-999)
77     #define MATCH_COMMIT (-998)
78 ph10 604 #define MATCH_KETRPOS (-997)
79     #define MATCH_PRUNE (-996)
80     #define MATCH_SKIP (-995)
81     #define MATCH_SKIP_ARG (-994)
82     #define MATCH_THEN (-993)
83 ph10 210
84 ph10 510 /* This is a convenience macro for code that occurs many times. It copies the
current mark pointer into md->mark and then leaves via RRETURN(ra). The
expansion is a brace-enclosed block that refers to `md` and `markptr` by name,
so both must be in scope wherever the macro is used. */
85    
86     #define MRRETURN(ra) \
87     { \
88     md->mark = markptr; \
89     RRETURN(ra); \
90     }
91    
92 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
93     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
94     because the offset vector is always a multiple of 3 long. */
95    
96     #define REC_STACK_SAVE_MAX 30
97    
98     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
99    
100     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
101     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
102    
103    
104    
105 ph10 475 #ifdef PCRE_DEBUG
106 nigel 77 /*************************************************
107     * Debugging function to print chars *
108     *************************************************/
109    
110     /* Print a sequence of chars in printable format, stopping at the end of the
111     subject if the requested.
112    
113     Arguments:
114     p points to characters
115     length number to print
116     is_subject TRUE if printing from within md->start_subject
117     md pointer to matching data block, if is_subject is TRUE
118    
119     Returns: nothing
120     */
121    
122     static void
123     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
124     {
125 nigel 93 unsigned int c;
126 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
127     while (length-- > 0)
128     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
129     }
130     #endif
131    
132    
133    
134     /*************************************************
135     * Match a back-reference *
136     *************************************************/
137    
138 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
139     negative, so the match always fails. However, in JavaScript compatibility mode,
140     the length passed is zero. Note that in caseless UTF-8 mode, the number of
141     subject bytes matched may be different to the number of reference bytes.
142 nigel 77
143     Arguments:
144     offset index into the offset vector
145 ph10 595 eptr pointer into the subject
146     length length of reference to be matched (number of bytes)
147 nigel 77 md points to match data block
148 ph10 602 caseless TRUE if caseless
149 nigel 77
150 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
151 nigel 77 */
152    
153 ph10 595 static int
154 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
155 ph10 602 BOOL caseless)
156 nigel 77 {
157 ph10 595 USPTR eptr_start = eptr;
158     register USPTR p = md->start_subject + md->offset_vector[offset];
159 nigel 77
160 ph10 475 #ifdef PCRE_DEBUG
161 nigel 77 if (eptr >= md->end_subject)
162     printf("matching subject <null>");
163     else
164     {
165     printf("matching subject ");
166     pchars(eptr, length, TRUE, md);
167     }
168     printf(" against backref ");
169     pchars(p, length, FALSE, md);
170     printf("\n");
171     #endif
172    
173 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
174 nigel 77
175 ph10 595 if (length < 0) return -1;
176 nigel 77
177 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
178     properly if Unicode properties are supported. Otherwise, we can check only
179     ASCII characters. */
180 nigel 77
181 ph10 602 if (caseless)
182 nigel 77 {
183 ph10 354 #ifdef SUPPORT_UTF8
184     #ifdef SUPPORT_UCP
185     if (md->utf8)
186     {
187 ph10 595 /* Match characters up to the end of the reference. NOTE: the number of
188     bytes matched may differ, because there are some characters whose upper and
189     lower case versions code as different numbers of bytes. For example, U+023A
190     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
191     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
192     the latter. It is important, therefore, to check the length along the
193     reference, not along the subject (earlier code did this wrong). */
194    
195     USPTR endptr = p + length;
196     while (p < endptr)
197 ph10 354 {
198 ph10 358 int c, d;
199 ph10 597 if (eptr >= md->end_subject) return -1;
200 ph10 354 GETCHARINC(c, eptr);
201     GETCHARINC(d, p);
202 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
203 ph10 358 }
204     }
205 ph10 354 else
206     #endif
207     #endif
208    
209     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
210     is no UCP support. */
211 ph10 597 {
212     if (eptr + length > md->end_subject) return -1;
213     while (length-- > 0)
214     { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
215     }
216 nigel 77 }
217 ph10 358
218 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
219     are in UTF-8 mode. */
220 ph10 358
221 nigel 77 else
222 ph10 597 {
223     if (eptr + length > md->end_subject) return -1;
224     while (length-- > 0) if (*p++ != *eptr++) return -1;
225     }
226 nigel 77
227 ph10 595 return eptr - eptr_start;
228 nigel 77 }
229    
230    
231    
232     /***************************************************************************
233     ****************************************************************************
234     RECURSION IN THE match() FUNCTION
235    
236 nigel 87 The match() function is highly recursive, though not every recursive call
237     increases the recursive depth. Nevertheless, some regular expressions can cause
238     it to recurse to a great depth. I was writing for Unix, so I just let it call
239     itself recursively. This uses the stack for saving everything that has to be
240     saved for a recursive call. On Unix, the stack can be large, and this works
241     fine.
242 nigel 77
243 nigel 87 It turns out that on some non-Unix-like systems there are problems with
244     programs that use a lot of stack. (This despite the fact that every last chip
245     has oodles of memory these days, and techniques for extending the stack have
246     been known for decades.) So....
247 nigel 77
248     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
249     calls by keeping local variables that need to be preserved in blocks of memory
250 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
251 nigel 77 achieve this so that the actual code doesn't look very different to what it
252     always used to.
253 ph10 164
254 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
255 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
256     Switzer, the use of longjmp() has been abolished, at the cost of having to
257     provide a unique number for each call to RMATCH. There is no way of generating
258     a sequence of numbers at compile time in C. I have given them names, to make
259     them stand out more clearly.
260    
261     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
262     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
263 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
264     don't have indeterminate values; this has meant that the frame size can be
265 ph10 164 reduced because the result can be "passed back" by straight setting of the
266     variable instead of being passed in the frame.
267 nigel 77 ****************************************************************************
268     ***************************************************************************/
269    
270 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
271     below must be updated in sync. */
272 nigel 77
273 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
274     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
275     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
276     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
277 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
278 ph10 527 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
279 ph10 604 RM61, RM62, RM63, RM64 };
280 ph10 164
281 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
282 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
283 ph10 501 actually used in this definition. */
284 nigel 77
285     #ifndef NO_RECURSE
286     #define REGISTER register
287 ph10 164
288 ph10 475 #ifdef PCRE_DEBUG
/* Debugging forms. RMATCH traces the call site, calls match() one level
deeper, and leaves the result in rrc. The arguments ra, rb, rc, rd and re are
passed through explicitly; mstart, markptr and rdepth are picked up by name
from the enclosing scope. RRETURN traces the value and then returns it. */
289 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
290 nigel 87 { \
291     printf("match() called in line %d\n", __LINE__); \
292 ph10 604 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
293 nigel 87 printf("to line %d\n", __LINE__); \
294     }
295     #define RRETURN(ra) \
296     { \
297     printf("match() returned %d from line %d ", ra, __LINE__); \
298     return ra; \
299     }
300     #else
/* Production forms: a plain recursive call whose result is stored in rrc, and
a plain return. Unlike the debugging forms these are not wrapped in braces. */
301 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
302     rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
303 nigel 77 #define RRETURN(ra) return ra
304 nigel 87 #endif
305    
306 nigel 77 #else
307    
308    
309 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
310     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
311     argument of match(), which never changes. */
312 nigel 77
/* REGISTER expands to nothing in this build. */
313     #define REGISTER
314    
/* RMATCH in the heap build: obtain a new frame from pcre_stack_malloc (leaving
with PCRE_ERROR_NOMEMORY via RRETURN if that fails), record rw in the current
frame's Xwhere, copy the call arguments into the new frame, link it to the
current frame through Xprevframe, make it current, and jump to HEAP_RECURSE.
The label L_<rw> marks where execution resumes afterwards; the code that jumps
back to it (at HEAP_RETURN) is not visible in this part of the file. */
315 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw)\
316 nigel 77 {\
317 ph10 563 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
318 ph10 534 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
319 ph10 164 frame->Xwhere = rw; \
320     newframe->Xeptr = ra;\
321     newframe->Xecode = rb;\
322 ph10 168 newframe->Xmstart = mstart;\
323 ph10 501 newframe->Xmarkptr = markptr;\
324 ph10 164 newframe->Xoffset_top = rc;\
325 ph10 602 newframe->Xeptrb = re;\
326 ph10 164 newframe->Xrdepth = frame->Xrdepth + 1;\
327     newframe->Xprevframe = frame;\
328     frame = newframe;\
329     DPRINTF(("restarting from line %d\n", __LINE__));\
330     goto HEAP_RECURSE;\
331     L_##rw:\
332     DPRINTF(("jumped back to line %d\n", __LINE__));\
333 nigel 77 }
334    
/* RRETURN in the heap build: unlink and free the current frame. If a previous
frame exists, the value is placed in rrc and control goes to HEAP_RETURN;
otherwise the value is returned from match() itself.
NOTE(review): the expansion reads frame->Xprevframe before any NULL test, so it
must not be used while frame is NULL. The top-level allocation check in match()
("if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);") appears to do exactly
that - confirm and use a plain return there. */
335     #define RRETURN(ra)\
336     {\
337 ph10 527 heapframe *oldframe = frame;\
338     frame = oldframe->Xprevframe;\
339     (pcre_stack_free)(oldframe);\
340 nigel 77 if (frame != NULL)\
341     {\
342 ph10 164 rrc = ra;\
343     goto HEAP_RETURN;\
344 nigel 77 }\
345     return ra;\
346     }
347    
348    
349     /* Structure for remembering the local variables in a private frame. One of
these is obtained from pcre_stack_malloc for the top-level call of match() and
for each RMATCH when NO_RECURSE is defined. Inside match() the members are
reached through #define aliases that drop the leading X (eptr, ecode, ...). */
350    
351     typedef struct heapframe {
/* Frame that was current when this one was pushed; NULL marks the top level */
352     struct heapframe *Xprevframe;
353    
354     /* Function arguments that may change */
/* These mirror the parameters of match(). The md argument is not stored. */
355    
356 ph10 409 USPTR Xeptr;
357 nigel 77 const uschar *Xecode;
358 ph10 409 USPTR Xmstart;
359 ph10 501 USPTR Xmarkptr;
360 nigel 77 int Xoffset_top;
361     eptrblock *Xeptrb;
362 nigel 91 unsigned int Xrdepth;
363 nigel 77
364     /* Function local variables */
365    
366 ph10 409 USPTR Xcallpat;
367 ph10 406 #ifdef SUPPORT_UTF8
368 ph10 409 USPTR Xcharptr;
369 ph10 406 #endif
370 ph10 409 USPTR Xdata;
371     USPTR Xnext;
372     USPTR Xpp;
373     USPTR Xprev;
374     USPTR Xsaved_eptr;
375 nigel 77
376     recursion_info Xnew_recursive;
377    
378     BOOL Xcur_is_word;
379     BOOL Xcondition;
380     BOOL Xprev_is_word;
381    
/* The following members exist only when Unicode property support is built */
382     #ifdef SUPPORT_UCP
383     int Xprop_type;
384 nigel 87 int Xprop_value;
385 nigel 77 int Xprop_fail_result;
386     int Xprop_category;
387     int Xprop_chartype;
388 nigel 87 int Xprop_script;
389 ph10 123 int Xoclength;
390     uschar Xocchars[8];
391 nigel 77 #endif
392    
393 ph10 403 int Xcodelink;
394 nigel 77 int Xctype;
/* Xfc and Xfi are separate members here; in the recursive build match() makes
fc and fi aliases of its plain locals c and i instead. */
395 nigel 93 unsigned int Xfc;
396 nigel 77 int Xfi;
397     int Xlength;
398     int Xmax;
399     int Xmin;
400     int Xnumber;
401     int Xoffset;
402     int Xop;
403     int Xsave_capture_last;
404     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
405     int Xstacksave[REC_STACK_SAVE_MAX];
406    
/* Block that match() links into the eptrb chain when it is entered with
md->match_function_type set to MATCH_CBEGROUP */
407     eptrblock Xnewptrb;
408    
409 ph10 164 /* Where to jump back to */
/* Set by RMATCH to its rw argument (one of the RMnn values) before the new
frame is pushed */
410 nigel 77
411 ph10 164 int Xwhere;
412 ph10 165
413 nigel 77 } heapframe;
414    
415     #endif
416    
417    
418     /***************************************************************************
419     ***************************************************************************/
420    
421    
422    
423     /*************************************************
424     * Match from current position *
425     *************************************************/
426    
427 nigel 93 /* This function is called recursively in many circumstances. Whenever it
428 nigel 77 returns a negative (error) response, the outer incarnation must also return the
429 ph10 426 same response. */
430 nigel 77
431 ph10 426 /* These macros pack up tests that are used for partial matching, and which
432     appear several times in the code. We set the "hit end" flag if the pointer is
433     at the end of the subject and also past the start of the subject (i.e.
434 ph10 427 something has been matched). For hard partial matching, we then return
435     immediately. The second one is used when we already know we are past the end of
436     the subject. */
437 ph10 426
/* CHECK_PARTIAL: when partial matching is enabled (md->partial != 0), eptr has
reached md->end_subject, and eptr is beyond md->start_used_ptr, set md->hitend;
if md->partial > 1, also leave at once with PCRE_ERROR_PARTIAL via MRRETURN.
The expansion is a bare "if (...) { ... }" that refers to md and eptr by name.
NOTE(review): because it is a bare "if", an "else" written directly after a use
of this macro would attach to the macro's own "if". */
438     #define CHECK_PARTIAL()\
439 ph10 553 if (md->partial != 0 && eptr >= md->end_subject && \
440     eptr > md->start_used_ptr) \
441     { \
442     md->hitend = TRUE; \
443     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
444 ph10 427 }
445 ph10 426
/* SCHECK_PARTIAL: the same as CHECK_PARTIAL but without the test of eptr
against md->end_subject. */
446     #define SCHECK_PARTIAL()\
447 ph10 553 if (md->partial != 0 && eptr > md->start_used_ptr) \
448     { \
449     md->hitend = TRUE; \
450     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
451 ph10 427 }
452 ph10 426
453 ph10 427
454 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
455     the md structure (e.g. utf8, end_subject) into individual variables to improve
456 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
457     made performance worse.
458    
459     Arguments:
460 nigel 93 eptr pointer to current character in subject
461     ecode pointer to current position in compiled code
462 ph10 168 mstart pointer to the current match start position (can be modified
463 ph10 172 by encountering \K)
464 ph10 501 markptr pointer to the most recent MARK name, or NULL
465 nigel 77 offset_top current top pointer
466     md pointer to "static" info for the match
467     eptrb pointer to chain of blocks containing eptr at start of
468     brackets - for testing for empty matches
469 nigel 87 rdepth the recursion depth
470 nigel 77
471     Returns: MATCH_MATCH if matched ) these values are >= 0
472     MATCH_NOMATCH if failed to match )
473 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
474 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
475 nigel 87 (e.g. stopped by repeated call or recursion limit)
476 nigel 77 */
477    
478     static int
479 ph10 510 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
480 ph10 602 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
481 ph10 604 unsigned int rdepth)
482 nigel 77 {
483     /* These variables do not need to be preserved over recursion in this function,
484 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
485     "register" because they are used a lot in loops. */
486 nigel 77
487 nigel 91 register int rrc; /* Returns from recursive calls */
488     register int i; /* Used for loops not involving calls to RMATCH() */
489 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
490 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
491 nigel 77
492 nigel 93 BOOL minimize, possessive; /* Quantifier options */
493 ph10 602 BOOL caseless;
494 ph10 403 int condcode;
495 nigel 93
496 nigel 77 /* When recursion is not being used, all "local" variables that have to be
497     preserved over calls to RMATCH() are part of a "frame" which is obtained from
498     heap storage. Set up the top-level frame here; others are obtained from the
499     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
500    
501     #ifdef NO_RECURSE
502 ph10 563 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
503 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
504 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
505    
506     /* Copy in the original argument variables */
507    
508     frame->Xeptr = eptr;
509     frame->Xecode = ecode;
510 ph10 168 frame->Xmstart = mstart;
511 ph10 501 frame->Xmarkptr = markptr;
512 nigel 77 frame->Xoffset_top = offset_top;
513     frame->Xeptrb = eptrb;
514 nigel 87 frame->Xrdepth = rdepth;
515 nigel 77
516     /* This is where control jumps back to to effect "recursion" */
517    
518     HEAP_RECURSE:
519    
520     /* Macros make the argument variables come from the current frame */
521    
522     #define eptr frame->Xeptr
523     #define ecode frame->Xecode
524 ph10 168 #define mstart frame->Xmstart
525 ph10 501 #define markptr frame->Xmarkptr
526 nigel 77 #define offset_top frame->Xoffset_top
527     #define eptrb frame->Xeptrb
528 nigel 87 #define rdepth frame->Xrdepth
529 nigel 77
530     /* Ditto for the local variables */
531    
532     #ifdef SUPPORT_UTF8
533     #define charptr frame->Xcharptr
534     #endif
535     #define callpat frame->Xcallpat
536 ph10 403 #define codelink frame->Xcodelink
537 nigel 77 #define data frame->Xdata
538     #define next frame->Xnext
539     #define pp frame->Xpp
540     #define prev frame->Xprev
541     #define saved_eptr frame->Xsaved_eptr
542    
543     #define new_recursive frame->Xnew_recursive
544    
545     #define cur_is_word frame->Xcur_is_word
546     #define condition frame->Xcondition
547     #define prev_is_word frame->Xprev_is_word
548    
549     #ifdef SUPPORT_UCP
550     #define prop_type frame->Xprop_type
551 nigel 87 #define prop_value frame->Xprop_value
552 nigel 77 #define prop_fail_result frame->Xprop_fail_result
553     #define prop_category frame->Xprop_category
554     #define prop_chartype frame->Xprop_chartype
555 nigel 87 #define prop_script frame->Xprop_script
556 ph10 115 #define oclength frame->Xoclength
557     #define occhars frame->Xocchars
558 nigel 77 #endif
559    
560     #define ctype frame->Xctype
561     #define fc frame->Xfc
562     #define fi frame->Xfi
563     #define length frame->Xlength
564     #define max frame->Xmax
565     #define min frame->Xmin
566     #define number frame->Xnumber
567     #define offset frame->Xoffset
568     #define op frame->Xop
569     #define save_capture_last frame->Xsave_capture_last
570     #define save_offset1 frame->Xsave_offset1
571     #define save_offset2 frame->Xsave_offset2
572     #define save_offset3 frame->Xsave_offset3
573     #define stacksave frame->Xstacksave
574    
575     #define newptrb frame->Xnewptrb
576    
577     /* When recursion is being used, local variables are allocated on the stack and
578     get preserved during recursion in the normal way. In this environment, fi and
579     i, and fc and c, can be the same variables. */
580    
581 nigel 93 #else /* NO_RECURSE not defined */
582 nigel 77 #define fi i
583     #define fc c
584    
585 ph10 604 /* Many of the following variables are used only in small blocks of the code.
586     My normal style of coding would have declared them within each of those blocks.
587     However, in order to accommodate the version of this code that uses an external
588     "stack" implemented on the heap, it is easier to declare them all here, so the
589     declarations can be cut out in a block. The only declarations within blocks
590     below are for variables that do not have to be preserved over a recursive call
591     to RMATCH(). */
592 nigel 77
593 ph10 604 #ifdef SUPPORT_UTF8
594     const uschar *charptr;
595     #endif
596     const uschar *callpat;
597     const uschar *data;
598     const uschar *next;
599     USPTR pp;
600     const uschar *prev;
601     USPTR saved_eptr;
602    
603     recursion_info new_recursive;
604    
605     BOOL cur_is_word;
606 nigel 87 BOOL condition;
607 nigel 77 BOOL prev_is_word;
608    
609     #ifdef SUPPORT_UCP
610     int prop_type;
611 nigel 87 int prop_value;
612 nigel 77 int prop_fail_result;
613     int prop_category;
614     int prop_chartype;
615 nigel 87 int prop_script;
616 ph10 115 int oclength;
617     uschar occhars[8];
618 nigel 77 #endif
619    
620 ph10 399 int codelink;
621 nigel 77 int ctype;
622     int length;
623     int max;
624     int min;
625     int number;
626     int offset;
627     int op;
628     int save_capture_last;
629     int save_offset1, save_offset2, save_offset3;
630     int stacksave[REC_STACK_SAVE_MAX];
631    
632     eptrblock newptrb;
633 nigel 93 #endif /* NO_RECURSE */
634 nigel 77
635 ph10 604 /* To save space on the stack and in the heap frame, I have doubled up on some
636     of the local variables that are used only in localised parts of the code, but
637     still need to be preserved over recursive calls of match(). These macros define
638     the alternative names that are used. */
639    
640     #define allow_zero cur_is_word
641     #define cbegroup condition
642     #define code_offset codelink
643     #define condassert condition
644     #define matched_once prev_is_word
645    
646 nigel 77 /* These statements are here to stop the compiler complaining about uninitialized
647     variables. */
648    
649     #ifdef SUPPORT_UCP
650 nigel 87 prop_value = 0;
651 nigel 77 prop_fail_result = 0;
652     #endif
653    
654 nigel 93
655 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
656     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
657     used. Thanks to Ian Taylor for noticing this possibility and sending the
658     original patch. */
659    
660     TAIL_RECURSE:
661    
662 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
663     are specified by the macro RMATCH and RRETURN is used to return. When
664     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
665 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
666 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
667     complicated macro. It has to be used in one particular way. This shouldn't,
668     however, impact performance when true recursion is being used. */
669 nigel 77
670 ph10 164 #ifdef SUPPORT_UTF8
671     utf8 = md->utf8; /* Local copy of the flag */
672     #else
673     utf8 = FALSE;
674     #endif
675    
676 nigel 87 /* First check that we haven't called match() too many times, or that we
677     haven't exceeded the recursive call limit. */
678    
679 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
680 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
681 nigel 77
682 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
683 ph10 604 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
684     done this way to save having to use another function argument, which would take
685     up space on the stack. See also MATCH_CONDASSERT below.
686 nigel 77
687 ph10 604 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
688     such remembered pointers, to be checked when we hit the closing ket, in order
689     to break infinite loops that match no characters. When match() is called in
690     other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
691     NOT be used with tail recursion, because the memory block that is used is on
692     the stack, so a new one may be required for each match(). */
693    
694     if (md->match_function_type == MATCH_CBEGROUP)
695 nigel 77 {
696 ph10 197 newptrb.epb_saved_eptr = eptr;
697     newptrb.epb_prev = eptrb;
698     eptrb = &newptrb;
699 ph10 604 md->match_function_type = 0;
700 nigel 77 }
701    
702 nigel 93 /* Now start processing the opcodes. */
703 nigel 77
704     for (;;)
705     {
706 nigel 93 minimize = possessive = FALSE;
707 nigel 77 op = *ecode;
708 ph10 604
709 nigel 93 switch(op)
710     {
711 ph10 510 case OP_MARK:
712     markptr = ecode + 2;
713     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
714 ph10 604 eptrb, RM55);
715 ph10 512
716     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
717     argument, and we must check whether that argument matches this MARK's
718     argument. It is passed back in md->start_match_ptr (an overloading of that
719     variable). If it does match, we reset that variable to the current subject
720     position and return MATCH_SKIP. Otherwise, pass back the return code
721 ph10 510 unaltered. */
722 ph10 512
723     if (rrc == MATCH_SKIP_ARG &&
724 ph10 510 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
725     {
726     md->start_match_ptr = eptr;
727     RRETURN(MATCH_SKIP);
728     }
729    
730 ph10 512 if (md->mark == NULL) md->mark = markptr;
731 ph10 510 RRETURN(rrc);
732    
733 ph10 210 case OP_FAIL:
734 ph10 510 MRRETURN(MATCH_NOMATCH);
735 ph10 211
736 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
737 ph10 553
738 ph10 510 case OP_COMMIT:
739     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
740 ph10 604 eptrb, RM52);
741 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
742 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
743     rrc != MATCH_THEN)
744 ph10 551 RRETURN(rrc);
745 ph10 510 MRRETURN(MATCH_COMMIT);
746    
747 ph10 551 /* PRUNE overrides THEN */
748 ph10 553
749 ph10 210 case OP_PRUNE:
750     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
751 ph10 604 eptrb, RM51);
752 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
753 ph10 510 MRRETURN(MATCH_PRUNE);
754 ph10 211
755 ph10 510 case OP_PRUNE_ARG:
756     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 ph10 604 eptrb, RM56);
758 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
759 ph10 510 md->mark = ecode + 2;
760     RRETURN(MATCH_PRUNE);
761 ph10 211
762 ph10 551 /* SKIP overrides PRUNE and THEN */
763 ph10 553
764 ph10 210 case OP_SKIP:
765     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
766 ph10 604 eptrb, RM53);
767 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 ph10 551 RRETURN(rrc);
769 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
770 ph10 510 MRRETURN(MATCH_SKIP);
771 ph10 211
772 ph10 510 case OP_SKIP_ARG:
773     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
774 ph10 604 eptrb, RM57);
775 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
776 ph10 551 RRETURN(rrc);
777 ph10 512
778     /* Pass back the current skip name by overloading md->start_match_ptr and
779     returning the special MATCH_SKIP_ARG return code. This will either be
780     caught by a matching MARK, or get to the top, where it is treated the same
781 ph10 510 as PRUNE. */
782 ph10 512
783 ph10 510 md->start_match_ptr = ecode + 2;
784 ph10 512 RRETURN(MATCH_SKIP_ARG);
785 ph10 553
786 ph10 550 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
787 ph10 553 the alt that is at the start of the current branch. This makes it possible
788     to skip back past alternatives that precede the THEN within the current
789     branch. */
790 ph10 512
791 ph10 210 case OP_THEN:
792     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
793 ph10 604 eptrb, RM54);
794 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
795 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
796 ph10 510 MRRETURN(MATCH_THEN);
797    
798     case OP_THEN_ARG:
799 ph10 553 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
800 ph10 604 offset_top, md, eptrb, RM58);
801 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
803     md->mark = ecode + LINK_SIZE + 2;
804 ph10 212 RRETURN(MATCH_THEN);
805 ph10 211
806 ph10 604 /* Handle a capturing bracket, other than those that are possessive with an
807     unlimited repeat. If there is space in the offset vector, save the current
808     subject position in the working slot at the top of the vector. We mustn't
809     change the current values of the data slot, because they may be set from a
810     previous iteration of this group, and be referred to by a reference inside
811     the group. If we fail to match, we need to restore this value and also the
812 nigel 93 values of the final offsets, in case they were set by a previous iteration
813     of the same bracket.
814 nigel 77
815 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
816     a non-capturing bracket. Don't worry about setting the flag for the error
817     case here; that is handled in the code for KET. */
818 nigel 77
819 nigel 93 case OP_CBRA:
820     case OP_SCBRA:
821     number = GET2(ecode, 1+LINK_SIZE);
822 nigel 77 offset = number << 1;
823 ph10 604
824 ph10 475 #ifdef PCRE_DEBUG
825 nigel 93 printf("start bracket %d\n", number);
826     printf("subject=");
827 nigel 77 pchars(eptr, 16, TRUE, md);
828     printf("\n");
829     #endif
830    
831     if (offset < md->offset_max)
832     {
833     save_offset1 = md->offset_vector[offset];
834     save_offset2 = md->offset_vector[offset+1];
835     save_offset3 = md->offset_vector[md->offset_end - number];
836     save_capture_last = md->capture_last;
837    
838     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
839 ph10 531 md->offset_vector[md->offset_end - number] =
840 ph10 530 (int)(eptr - md->start_subject);
841 nigel 77
842 ph10 604 for (;;)
843 nigel 77 {
844 ph10 604 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
845     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
846     eptrb, RM1);
847 ph10 550 if (rrc != MATCH_NOMATCH &&
848     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
849     RRETURN(rrc);
850 nigel 77 md->capture_last = save_capture_last;
851     ecode += GET(ecode, 1);
852 ph10 604 if (*ecode != OP_ALT) break;
853 nigel 77 }
854    
855     DPRINTF(("bracket %d failed\n", number));
856    
857     md->offset_vector[offset] = save_offset1;
858     md->offset_vector[offset+1] = save_offset2;
859     md->offset_vector[md->offset_end - number] = save_offset3;
860    
861 ph10 510 if (rrc != MATCH_THEN) md->mark = markptr;
862 nigel 77 RRETURN(MATCH_NOMATCH);
863     }
864    
865 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
866     as a non-capturing bracket. */
867 nigel 77
868 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
869     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870    
871 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
872 nigel 77
873 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
874     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
875    
876 ph10 604 /* Non-capturing bracket, except for possessive with unlimited repeat. Loop
877     for all the alternatives. When we get to the final alternative within the
878     brackets, we would return the result of a recursive call to match()
879     whatever happened. We can reduce stack usage by turning this into a tail
880     recursion, except in the case of a possibly empty group.*/
881 nigel 77
882 nigel 93 case OP_BRA:
883     case OP_SBRA:
884     DPRINTF(("start non-capturing bracket\n"));
885 nigel 91 for (;;)
886 nigel 77 {
887 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
888 nigel 93 {
889 ph10 604 if (op >= OP_SBRA) /* Possibly empty group */
890 ph10 197 {
891 ph10 604 md->match_function_type = MATCH_CBEGROUP;
892     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
893     RM48);
894     if (rrc == MATCH_NOMATCH) md->mark = markptr;
895     RRETURN(rrc);
896     }
897     /* Not a possibly empty group; use tail recursion */
898     ecode += _pcre_OP_lengths[*ecode];
899     DPRINTF(("bracket 0 tail recursion\n"));
900     goto TAIL_RECURSE;
901 nigel 93 }
902 nigel 91
903     /* For non-final alternatives, continue the loop for a NOMATCH result;
904     otherwise return. */
905    
906 ph10 604 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
907 ph10 602 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
908 ph10 604 RM2);
909 ph10 550 if (rrc != MATCH_NOMATCH &&
910     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
911     RRETURN(rrc);
912 nigel 77 ecode += GET(ecode, 1);
913     }
914 nigel 91 /* Control never reaches here. */
915 nigel 77
916 ph10 604 /* Handle possessive capturing brackets with an unlimited repeat. We come
917     here from BRAZERO with allow_zero set TRUE. The offset_vector values are
918     handled similarly to the normal case above. However, the matching is
919     different. The end of these brackets will always be OP_KETRPOS, which
920     returns MATCH_KETRPOS without going further in the pattern. By this means
921     we can handle the group by iteration rather than recursion, thereby
922     reducing the amount of stack needed. */
923    
924     case OP_CBRAPOS:
925     case OP_SCBRAPOS:
926     allow_zero = FALSE;
927    
928     POSSESSIVE_CAPTURE:
929     number = GET2(ecode, 1+LINK_SIZE);
930     offset = number << 1;
931    
932     #ifdef PCRE_DEBUG
933     printf("start possessive bracket %d\n", number);
934     printf("subject=");
935     pchars(eptr, 16, TRUE, md);
936     printf("\n");
937     #endif
938    
939     if (offset < md->offset_max)
940     {
941     matched_once = FALSE;
942     code_offset = ecode - md->start_code;
943    
944     save_offset1 = md->offset_vector[offset];
945     save_offset2 = md->offset_vector[offset+1];
946     save_offset3 = md->offset_vector[md->offset_end - number];
947     save_capture_last = md->capture_last;
948    
949     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
950    
951     /* Each time round the loop, save the current subject position for use
952     when the group matches. For MATCH_MATCH, the group has matched, so we
953     restart it with a new subject starting position, remembering that we had
954     at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
955     usual. If we haven't matched any alternatives in any iteration, check to
956     see if a previous iteration matched. If so, the group has matched;
957     continue from afterwards. Otherwise it has failed; restore the previous
958     capture values before returning NOMATCH. */
959    
960     for (;;)
961     {
962     md->offset_vector[md->offset_end - number] =
963     (int)(eptr - md->start_subject);
964     if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
965     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
966     eptrb, RM63);
967     if (rrc == MATCH_KETRPOS)
968     {
969     offset_top = md->end_offset_top;
970     eptr = md->end_match_ptr;
971     ecode = md->start_code + code_offset;
972     save_capture_last = md->capture_last;
973     matched_once = TRUE;
974     continue;
975     }
976     if (rrc != MATCH_NOMATCH &&
977     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
978     RRETURN(rrc);
979     md->capture_last = save_capture_last;
980     ecode += GET(ecode, 1);
981     if (*ecode != OP_ALT) break;
982     }
983    
984     if (!matched_once)
985     {
986     md->offset_vector[offset] = save_offset1;
987     md->offset_vector[offset+1] = save_offset2;
988     md->offset_vector[md->offset_end - number] = save_offset3;
989     }
990    
991     if (rrc != MATCH_THEN) md->mark = markptr;
992     if (allow_zero || matched_once)
993     {
994     ecode += 1 + LINK_SIZE;
995     break;
996     }
997    
998     RRETURN(MATCH_NOMATCH);
999     }
1000    
1001     /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1002     as a non-capturing bracket. */
1003    
1004     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1005     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1006    
1007     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1008    
1009     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1010     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1011    
1012     /* Non-capturing possessive bracket with unlimited repeat. We come here
1013     from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1014     without the capturing complication. It is written out separately for speed
1015     and cleanliness. */
1016    
1017     case OP_BRAPOS:
1018     case OP_SBRAPOS:
1019     allow_zero = FALSE;
1020    
1021     POSSESSIVE_NON_CAPTURE:
1022     matched_once = FALSE;
1023     code_offset = ecode - md->start_code;
1024    
1025     for (;;)
1026     {
1027     if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1028     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1029     eptrb, RM64);
1030     if (rrc == MATCH_KETRPOS)
1031     {
1032     eptr = md->end_match_ptr;
1033     ecode = md->start_code + code_offset;
1034     matched_once = TRUE;
1035     continue;
1036     }
1037     if (rrc != MATCH_NOMATCH &&
1038     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1039     RRETURN(rrc);
1040     ecode += GET(ecode, 1);
1041     if (*ecode != OP_ALT) break;
1042     }
1043    
1044     if (matched_once || allow_zero)
1045     {
1046     ecode += 1 + LINK_SIZE;
1047     break;
1048     }
1049     RRETURN(MATCH_NOMATCH);
1050    
1051     /* Control never reaches here. */
1052    
1053 nigel 77 /* Conditional group: compilation checked that there are no more than
1054     two branches. If the condition is false, skipping the first branch takes us
1055     past the end if there is only one branch, but that's OK because that is
1056 nigel 91 exactly what going to the ket would do. As there is only one branch to be
1057     obeyed, we can use tail recursion to avoid using another stack frame. */
1058 nigel 77
1059     case OP_COND:
1060 nigel 93 case OP_SCOND:
1061 ph10 604 codelink = GET(ecode, 1);
1062 ph10 406
1063 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
1064     inserted between OP_COND and an assertion condition. */
1065 ph10 392
1066 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1067     {
1068     if (pcre_callout != NULL)
1069     {
1070     pcre_callout_block cb;
1071     cb.version = 1; /* Version 1 of the callout block */
1072     cb.callout_number = ecode[LINK_SIZE+2];
1073     cb.offset_vector = md->offset_vector;
1074     cb.subject = (PCRE_SPTR)md->start_subject;
1075 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1076     cb.start_match = (int)(mstart - md->start_subject);
1077     cb.current_position = (int)(eptr - md->start_subject);
1078 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1079     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1080     cb.capture_top = offset_top/2;
1081     cb.capture_last = md->capture_last;
1082     cb.callout_data = md->callout_data;
1083 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1084 ph10 381 if (rrc < 0) RRETURN(rrc);
1085     }
1086     ecode += _pcre_OP_lengths[OP_CALLOUT];
1087     }
1088 ph10 392
1089 ph10 399 condcode = ecode[LINK_SIZE+1];
1090 ph10 406
1091 ph10 381 /* Now see what the actual condition is */
1092 ph10 392
1093 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1094 nigel 77 {
1095 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
1096     {
1097 ph10 461 condition = FALSE;
1098     ecode += GET(ecode, 1);
1099     }
1100 ph10 459 else
1101 ph10 461 {
1102 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1103     condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1104 ph10 461
1105 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
1106     false, but the test was set up by name, scan the table to see if the
1107     name refers to any other numbers, and test them. The condition is true
1108     if any one is set. */
1109 ph10 461
1110 ph10 459 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1111     {
1112     uschar *slotA = md->name_table;
1113     for (i = 0; i < md->name_count; i++)
1114 ph10 461 {
1115     if (GET2(slotA, 0) == recno) break;
1116 ph10 459 slotA += md->name_entry_size;
1117     }
1118 ph10 461
1119 ph10 459 /* Found a name for the number - there can be only one; duplicate
1120     names for different numbers are allowed, but not vice versa. First
1121     scan down for duplicates. */
1122 ph10 461
1123 ph10 459 if (i < md->name_count)
1124 ph10 461 {
1125 ph10 459 uschar *slotB = slotA;
1126     while (slotB > md->name_table)
1127     {
1128     slotB -= md->name_entry_size;
1129     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1130     {
1131     condition = GET2(slotB, 0) == md->recursive->group_num;
1132 ph10 461 if (condition) break;
1133     }
1134 ph10 459 else break;
1135 ph10 461 }
1136    
1137 ph10 459 /* Scan up for duplicates */
1138 ph10 461
1139 ph10 459 if (!condition)
1140 ph10 461 {
1141 ph10 459 slotB = slotA;
1142     for (i++; i < md->name_count; i++)
1143     {
1144     slotB += md->name_entry_size;
1145     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1146     {
1147     condition = GET2(slotB, 0) == md->recursive->group_num;
1148     if (condition) break;
1149 ph10 461 }
1150 ph10 459 else break;
1151 ph10 461 }
1152     }
1153 ph10 459 }
1154 ph10 461 }
1155    
1156 ph10 459 /* Choose branch according to the condition */
1157 ph10 461
1158 ph10 459 ecode += condition? 3 : GET(ecode, 1);
1159     }
1160 ph10 461 }
1161 nigel 93
1162 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1163 nigel 93 {
1164 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1165 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1166 ph10 461
1167 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1168 ph10 461 scan the table to see if the name refers to any other numbers, and test
1169     them. The condition is true if any one is set. This is tediously similar
1170     to the code above, but not close enough to try to amalgamate. */
1171    
1172 ph10 459 if (!condition && condcode == OP_NCREF)
1173     {
1174 ph10 461 int refno = offset >> 1;
1175 ph10 459 uschar *slotA = md->name_table;
1176 ph10 461
1177 ph10 459 for (i = 0; i < md->name_count; i++)
1178 ph10 461 {
1179     if (GET2(slotA, 0) == refno) break;
1180 ph10 459 slotA += md->name_entry_size;
1181     }
1182 ph10 461
1183     /* Found a name for the number - there can be only one; duplicate names
1184     for different numbers are allowed, but not vice versa. First scan down
1185 ph10 459 for duplicates. */
1186 ph10 461
1187 ph10 459 if (i < md->name_count)
1188 ph10 461 {
1189 ph10 459 uschar *slotB = slotA;
1190     while (slotB > md->name_table)
1191     {
1192     slotB -= md->name_entry_size;
1193     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1194     {
1195     offset = GET2(slotB, 0) << 1;
1196 ph10 461 condition = offset < offset_top &&
1197 ph10 459 md->offset_vector[offset] >= 0;
1198 ph10 461 if (condition) break;
1199     }
1200 ph10 459 else break;
1201 ph10 461 }
1202    
1203 ph10 459 /* Scan up for duplicates */
1204 ph10 461
1205 ph10 459 if (!condition)
1206 ph10 461 {
1207 ph10 459 slotB = slotA;
1208     for (i++; i < md->name_count; i++)
1209     {
1210     slotB += md->name_entry_size;
1211     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1212     {
1213     offset = GET2(slotB, 0) << 1;
1214 ph10 461 condition = offset < offset_top &&
1215 ph10 459 md->offset_vector[offset] >= 0;
1216 ph10 461 if (condition) break;
1217     }
1218 ph10 459 else break;
1219 ph10 461 }
1220     }
1221 ph10 459 }
1222 ph10 461 }
1223    
1224 ph10 459 /* Choose branch according to the condition */
1225    
1226 nigel 93 ecode += condition? 3 : GET(ecode, 1);
1227 nigel 77 }
1228    
1229 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1230 nigel 93 {
1231     condition = FALSE;
1232     ecode += GET(ecode, 1);
1233     }
1234    
1235 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1236 ph10 604 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1237     an assertion. */
1238 nigel 77
1239     else
1240     {
1241 ph10 604 md->match_function_type = MATCH_CONDASSERT;
1242     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1243 nigel 77 if (rrc == MATCH_MATCH)
1244     {
1245 nigel 93 condition = TRUE;
1246     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1247 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1248     }
1249 ph10 550 else if (rrc != MATCH_NOMATCH &&
1250     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1251 nigel 77 {
1252     RRETURN(rrc); /* Need braces because of following else */
1253     }
1254 nigel 93 else
1255     {
1256     condition = FALSE;
1257 ph10 399 ecode += codelink;
1258 nigel 93 }
1259     }
1260 nigel 91
1261 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
1262 ph10 197 we can use tail recursion to avoid using another stack frame, except when
1263 ph10 604 we have an unlimited repeat of a possibly empty group. If the second
1264     alternative doesn't exist, we can just plough on. */
1265 nigel 91
1266 nigel 93 if (condition || *ecode == OP_ALT)
1267     {
1268 nigel 91 ecode += 1 + LINK_SIZE;
1269 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
1270     {
1271 ph10 604 md->match_function_type = MATCH_CBEGROUP;
1272     RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1273 ph10 197 RRETURN(rrc);
1274     }
1275 ph10 604 else goto TAIL_RECURSE;
1276 nigel 77 }
1277 ph10 395 else /* Condition false & no alternative */
1278 nigel 93 {
1279     ecode += 1 + LINK_SIZE;
1280     }
1281     break;
1282 nigel 77
1283 ph10 461
1284 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1285     to close any currently open capturing brackets. */
1286 ph10 461
1287 ph10 447 case OP_CLOSE:
1288 ph10 461 number = GET2(ecode, 1);
1289 ph10 447 offset = number << 1;
1290 ph10 461
1291 ph10 475 #ifdef PCRE_DEBUG
1292 ph10 447 printf("end bracket %d at *ACCEPT", number);
1293     printf("\n");
1294     #endif
1295 nigel 77
1296 ph10 447 md->capture_last = number;
1297     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1298     {
1299     md->offset_vector[offset] =
1300     md->offset_vector[md->offset_end - number];
1301 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1302 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1303     }
1304     ecode += 3;
1305 ph10 461 break;
1306 ph10 447
1307    
1308 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
1309     recursion, we should restore the offsets appropriately and continue from
1310     after the call. */
1311 nigel 77
1312 ph10 210 case OP_ACCEPT:
1313 nigel 77 case OP_END:
1314     if (md->recursive != NULL && md->recursive->group_num == 0)
1315     {
1316     recursion_info *rec = md->recursive;
1317 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
1318 nigel 77 md->recursive = rec->prevrec;
1319     memmove(md->offset_vector, rec->offset_save,
1320     rec->saved_max * sizeof(int));
1321 ph10 461 offset_top = rec->save_offset_top;
1322 nigel 77 ecode = rec->after_call;
1323     break;
1324     }
1325    
1326 ph10 442 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1327     set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1328     the subject. In both cases, backtracking will then try other alternatives,
1329     if any. */
1330 ph10 443
1331 ph10 442 if (eptr == mstart &&
1332     (md->notempty ||
1333 ph10 443 (md->notempty_atstart &&
1334 ph10 442 mstart == md->start_subject + md->start_offset)))
1335 ph10 510 MRRETURN(MATCH_NOMATCH);
1336 ph10 443
1337 ph10 442 /* Otherwise, we have a match. */
1338 nigel 77
1339 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1340     md->end_offset_top = offset_top; /* and how many extracts were taken */
1341 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1342 nigel 77
1343 ph10 512 /* For some reason, the macros don't work properly if an expression is
1344     given as the argument to MRRETURN when the heap is in use. */
1345    
1346     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1347     MRRETURN(rrc);
1348    
1349 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1350     matching won't pass the KET for an assertion. If any one branch matches,
1351     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1352     start of each branch to move the current point backwards, so the code at
1353 ph10 604 this level is identical to the lookahead case. When the assertion is part
1354     of a condition, we want to return immediately afterwards. The caller of
1355     this incarnation of the match() function will have set MATCH_CONDASSERT in
1356     md->match_function_type, and one of these opcodes will be the first opcode
1357     that is processed. We use a local variable that is preserved over calls to
1358     match() to remember this case. */
1359 nigel 77
1360     case OP_ASSERT:
1361     case OP_ASSERTBACK:
1362 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1363     {
1364     condassert = TRUE;
1365     md->match_function_type = 0;
1366     }
1367     else condassert = FALSE;
1368    
1369 nigel 77 do
1370     {
1371 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1372 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1373 ph10 500 {
1374     mstart = md->start_match_ptr; /* In case \K reset it */
1375     break;
1376 ph10 501 }
1377 ph10 550 if (rrc != MATCH_NOMATCH &&
1378     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1379     RRETURN(rrc);
1380 nigel 77 ecode += GET(ecode, 1);
1381     }
1382     while (*ecode == OP_ALT);
1383 ph10 604
1384 ph10 510 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1385 nigel 77
1386     /* If checking an assertion for a condition, return MATCH_MATCH. */
1387    
1388 ph10 604 if (condassert) RRETURN(MATCH_MATCH);
1389 nigel 77
1390     /* Continue from after the assertion, updating the offsets high water
1391     mark, since extracts may have been taken during the assertion. */
1392    
1393     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1394     ecode += 1 + LINK_SIZE;
1395     offset_top = md->end_offset_top;
1396     continue;
1397    
1398 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1399 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1400 ph10 473 branches. */
1401 nigel 77
1402     case OP_ASSERT_NOT:
1403     case OP_ASSERTBACK_NOT:
1404 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1405     {
1406     condassert = TRUE;
1407     md->match_function_type = 0;
1408     }
1409     else condassert = FALSE;
1410    
1411 nigel 77 do
1412     {
1413 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1414 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1415 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1416     {
1417     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1418 ph10 482 break;
1419     }
1420 ph10 550 if (rrc != MATCH_NOMATCH &&
1421     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1422     RRETURN(rrc);
1423 nigel 77 ecode += GET(ecode,1);
1424     }
1425     while (*ecode == OP_ALT);
1426    
1427 ph10 604 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1428    
1429 nigel 77 ecode += 1 + LINK_SIZE;
1430     continue;
1431    
1432     /* Move the subject pointer back. This occurs only at the start of
1433     each branch of a lookbehind assertion. If we are too close to the start to
1434     move back, this match function fails. When working with UTF-8 we move
1435     back a number of characters, not bytes. */
1436    
1437     case OP_REVERSE:
1438     #ifdef SUPPORT_UTF8
1439     if (utf8)
1440     {
1441 nigel 93 i = GET(ecode, 1);
1442     while (i-- > 0)
1443 nigel 77 {
1444     eptr--;
1445 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1446 ph10 207 BACKCHAR(eptr);
1447 nigel 77 }
1448     }
1449     else
1450     #endif
1451    
1452     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1453    
1454     {
1455 nigel 93 eptr -= GET(ecode, 1);
1456 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1457 nigel 77 }
1458    
1459 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1460 nigel 77
1461 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1462 nigel 77 ecode += 1 + LINK_SIZE;
1463     break;
1464    
1465     /* The callout item calls an external function, if one is provided, passing
1466     details of the match so far. This is mainly for debugging, though the
1467     function is able to force a failure. */
1468    
1469     case OP_CALLOUT:
1470     if (pcre_callout != NULL)
1471     {
1472     pcre_callout_block cb;
1473     cb.version = 1; /* Version 1 of the callout block */
1474     cb.callout_number = ecode[1];
1475     cb.offset_vector = md->offset_vector;
1476 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1477 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1478     cb.start_match = (int)(mstart - md->start_subject);
1479     cb.current_position = (int)(eptr - md->start_subject);
1480 nigel 77 cb.pattern_position = GET(ecode, 2);
1481     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1482     cb.capture_top = offset_top/2;
1483     cb.capture_last = md->capture_last;
1484     cb.callout_data = md->callout_data;
1485 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1486 nigel 77 if (rrc < 0) RRETURN(rrc);
1487     }
1488     ecode += 2 + 2*LINK_SIZE;
1489     break;
1490    
1491     /* Recursion either matches the current regex, or some subexpression. The
1492     offset data is the offset to the starting bracket from the start of the
1493     whole pattern. (This is so that it works from duplicated subpatterns.)
1494    
1495     If there are any capturing brackets started but not finished, we have to
1496     save their starting points and reinstate them after the recursion. However,
1497     we don't know how many such there are (offset_top records the completed
1498     total) so we just have to save all the potential data. There may be up to
1499     65535 such values, which is too large to put on the stack, but using malloc
1500     for small numbers seems expensive. As a compromise, the stack is used when
1501     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1502     is used. If the malloc fails, the recursion cannot proceed because the
1503     offset data cannot be saved, so the match is abandoned by returning
1504     PCRE_ERROR_NOMEMORY.
1505    
1506     There are also other values that have to be saved. We use a chained
1507     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1508     for the original version of this logic. */
1509    
1510     case OP_RECURSE:
1511     {
1512     callpat = md->start_code + GET(ecode, 1);
1513 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1514     GET2(callpat, 1 + LINK_SIZE);
1515 nigel 77
1516     /* Add to "recursing stack" */
1517    
1518     new_recursive.prevrec = md->recursive;
1519     md->recursive = &new_recursive;
1520    
1521     /* Find where to continue from afterwards */
1522    
1523     ecode += 1 + LINK_SIZE;
1524     new_recursive.after_call = ecode;
1525    
1526     /* Now save the offset data. */
1527    
1528     new_recursive.saved_max = md->offset_end;
1529     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1530     new_recursive.offset_save = stacksave;
1531     else
1532     {
1533     new_recursive.offset_save =
1534     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1535     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1536     }
1537    
1538     memcpy(new_recursive.offset_save, md->offset_vector,
1539     new_recursive.saved_max * sizeof(int));
1540 ph10 461 new_recursive.save_offset_top = offset_top;
1541 nigel 77
1542     /* OK, now we can do the recursion. For each top-level alternative we
1543     restore the offset and recursion data. */
1544    
1545     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1546 ph10 604 cbegroup = (*callpat >= OP_SBRA);
1547 nigel 77 do
1548     {
1549 ph10 604 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1550 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1551 ph10 604 md, eptrb, RM6);
1552 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1553 nigel 77 {
1554 nigel 87 DPRINTF(("Recursion matched\n"));
1555 nigel 77 md->recursive = new_recursive.prevrec;
1556     if (new_recursive.offset_save != stacksave)
1557     (pcre_free)(new_recursive.offset_save);
1558 ph10 510 MRRETURN(MATCH_MATCH);
1559 nigel 77 }
1560 ph10 550 else if (rrc != MATCH_NOMATCH &&
1561     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1562 nigel 87 {
1563     DPRINTF(("Recursion gave error %d\n", rrc));
1564 ph10 400 if (new_recursive.offset_save != stacksave)
1565     (pcre_free)(new_recursive.offset_save);
1566 nigel 87 RRETURN(rrc);
1567     }
1568 nigel 77
1569     md->recursive = &new_recursive;
1570     memcpy(md->offset_vector, new_recursive.offset_save,
1571     new_recursive.saved_max * sizeof(int));
1572     callpat += GET(callpat, 1);
1573     }
1574     while (*callpat == OP_ALT);
1575    
1576     DPRINTF(("Recursion didn't match\n"));
1577     md->recursive = new_recursive.prevrec;
1578     if (new_recursive.offset_save != stacksave)
1579     (pcre_free)(new_recursive.offset_save);
1580 ph10 510 MRRETURN(MATCH_NOMATCH);
1581 nigel 77 }
1582     /* Control never reaches here */
1583    
1584     /* "Once" brackets are like assertion brackets except that after a match,
1585     the point in the subject string is not moved back. Thus there can never be
1586     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1587     Check the alternative branches in turn - the matching won't pass the KET
1588     for this kind of subpattern. If any one branch matches, we carry on as at
1589 ph10 500 the end of a normal bracket, leaving the subject pointer, but resetting
1590     the start-of-match value in case it was changed by \K. */
1591 nigel 77
1592     case OP_ONCE:
1593 nigel 91 prev = ecode;
1594     saved_eptr = eptr;
1595    
1596     do
1597 nigel 77 {
1598 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1599 ph10 511 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1600 ph10 500 {
1601     mstart = md->start_match_ptr;
1602     break;
1603 ph10 501 }
1604 ph10 550 if (rrc != MATCH_NOMATCH &&
1605     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1606     RRETURN(rrc);
1607 nigel 91 ecode += GET(ecode,1);
1608     }
1609     while (*ecode == OP_ALT);
1610 nigel 77
1611 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1612 nigel 77
1613 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1614 nigel 77
1615 nigel 91 /* Continue from after the atomic group, updating the offsets high water
1616     mark, since extracts may have been taken. */
1617 nigel 77
1618 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1619 nigel 77
1620 nigel 91 offset_top = md->end_offset_top;
1621     eptr = md->end_match_ptr;
1622 nigel 77
1623 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1624     happens for a repeating ket if no characters were matched in the group.
1625     This is the forcible breaking of infinite loops as implemented in Perl
1626     5.005. If there is an options reset, it will get obeyed in the normal
1627     course of events. */
1628 nigel 77
1629 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1630     {
1631     ecode += 1+LINK_SIZE;
1632     break;
1633     }
1634 nigel 77
1635 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1636     preceding bracket, in the appropriate order. The second "call" of match()
1637 ph10 602 uses tail recursion, to avoid using another stack frame. */
1638 nigel 77
1639 nigel 91 if (*ecode == OP_KETRMIN)
1640     {
1641 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM8);
1642 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1643     ecode = prev;
1644     goto TAIL_RECURSE;
1645 nigel 77 }
1646 nigel 91 else /* OP_KETRMAX */
1647     {
1648 ph10 604 md->match_function_type = MATCH_CBEGROUP;
1649     RMATCH(eptr, prev, offset_top, md, eptrb, RM9);
1650 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1651     ecode += 1 + LINK_SIZE;
1652     goto TAIL_RECURSE;
1653     }
1654     /* Control never gets here */
1655 nigel 77
1656     /* An alternation is the end of a branch; scan along to find the end of the
1657     bracketed group and go to there. */
1658    
1659     case OP_ALT:
1660     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1661     break;
1662    
1663 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1664     indicating that it may occur zero times. It may repeat infinitely, or not
1665     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1666     with fixed upper repeat limits are compiled as a number of copies, with the
1667     optional ones preceded by BRAZERO or BRAMINZERO. */
1668 ph10 604
1669 nigel 77 case OP_BRAZERO:
1670 ph10 604 next = ecode + 1;
1671     RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1672     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1673     do next += GET(next, 1); while (*next == OP_ALT);
1674     ecode = next + 1 + LINK_SIZE;
1675 nigel 77 break;
1676 ph10 604
1677 nigel 77 case OP_BRAMINZERO:
1678 ph10 604 next = ecode + 1;
1679     do next += GET(next, 1); while (*next == OP_ALT);
1680     RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1681     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1682     ecode++;
1683 nigel 77 break;
1684    
1685 ph10 335 case OP_SKIPZERO:
1686 ph10 604 next = ecode+1;
1687     do next += GET(next,1); while (*next == OP_ALT);
1688     ecode = next + 1 + LINK_SIZE;
1689 ph10 335 break;
1690 ph10 604
1691     /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1692     here; just jump to the group, with allow_zero set TRUE. */
1693    
1694     case OP_BRAPOSZERO:
1695     op = *(++ecode);
1696     allow_zero = TRUE;
1697     if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1698     goto POSSESSIVE_NON_CAPTURE;
1699 ph10 335
1700 nigel 93 /* End of a group, repeated or non-repeating. */
1701 nigel 77
1702     case OP_KET:
1703     case OP_KETRMIN:
1704     case OP_KETRMAX:
1705 ph10 604 case OP_KETRPOS:
1706 nigel 91 prev = ecode - GET(ecode, 1);
1707 nigel 77
1708 nigel 93 /* If this was a group that remembered the subject start, in order to break
1709     infinite repeats of empty string matches, retrieve the subject start from
1710     the chain. Otherwise, set it NULL. */
1711 nigel 77
1712 nigel 93 if (*prev >= OP_SBRA)
1713     {
1714     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1715     eptrb = eptrb->epb_prev; /* Backup to previous group */
1716     }
1717     else saved_eptr = NULL;
1718 nigel 77
1719 ph10 500 /* If we are at the end of an assertion group or an atomic group, stop
1720     matching and return MATCH_MATCH, but record the current high water mark for
1721     use by positive assertions. We also need to record the match start in case
1722     it was changed by \K. */
1723 nigel 93
1724 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1725     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1726     *prev == OP_ONCE)
1727     {
1728     md->end_match_ptr = eptr; /* For ONCE */
1729     md->end_offset_top = offset_top;
1730 ph10 500 md->start_match_ptr = mstart;
1731 ph10 510 MRRETURN(MATCH_MATCH);
1732 nigel 91 }
1733 nigel 77
1734 nigel 93 /* For capturing groups we have to check the group number back at the start
1735     and if necessary complete handling an extraction by setting the offsets and
1736     bumping the high water mark. Note that whole-pattern recursion is coded as
1737     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1738     when the OP_END is reached. Other recursion is handled here. */
1739 nigel 77
1740 ph10 604 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1741     *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1742 nigel 91 {
1743 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1744 nigel 91 offset = number << 1;
1745 ph10 461
1746 ph10 475 #ifdef PCRE_DEBUG
1747 nigel 91 printf("end bracket %d", number);
1748     printf("\n");
1749 nigel 77 #endif
1750    
1751 nigel 93 md->capture_last = number;
1752     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1753 nigel 91 {
1754 nigel 93 md->offset_vector[offset] =
1755     md->offset_vector[md->offset_end - number];
1756 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1757 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1758     }
1759 nigel 77
1760 nigel 93 /* Handle a recursively called group. Restore the offsets
1761     appropriately and continue from after the call. */
1762 nigel 77
1763 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1764     {
1765     recursion_info *rec = md->recursive;
1766     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1767     md->recursive = rec->prevrec;
1768     memcpy(md->offset_vector, rec->offset_save,
1769     rec->saved_max * sizeof(int));
1770 ph10 461 offset_top = rec->save_offset_top;
1771 nigel 93 ecode = rec->after_call;
1772     break;
1773 nigel 77 }
1774 nigel 91 }
1775 nigel 77
1776 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1777     happens for a repeating ket if no characters were matched in the group.
1778     This is the forcible breaking of infinite loops as implemented in Perl
1779     5.005. If there is an options reset, it will get obeyed in the normal
1780     course of events. */
1781 nigel 77
1782 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1783     {
1784     ecode += 1 + LINK_SIZE;
1785     break;
1786     }
1787 ph10 604
1788     /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1789     and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1790     at a time from the outer level, thus saving stack. */
1791    
1792     if (*ecode == OP_KETRPOS)
1793     {
1794     md->end_match_ptr = eptr;
1795     md->end_offset_top = offset_top;
1796     RRETURN(MATCH_KETRPOS);
1797     }
1798 nigel 77
1799 ph10 604 /* The normal repeating kets try the rest of the pattern or restart from
1800     the preceding bracket, in the appropriate order. In the second case, we can
1801     use tail recursion to avoid using another stack frame, unless we have an
1802 ph10 197 unlimited repeat of a group that can match an empty string. */
1803 nigel 77
1804 nigel 91 if (*ecode == OP_KETRMIN)
1805     {
1806 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1807 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1808 ph10 604 if (*prev >= OP_SBRA) /* Could match an empty string */
1809 ph10 197 {
1810 ph10 604 md->match_function_type = MATCH_CBEGROUP;
1811     RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1812 ph10 197 RRETURN(rrc);
1813     }
1814 nigel 91 ecode = prev;
1815     goto TAIL_RECURSE;
1816 nigel 77 }
1817 nigel 91 else /* OP_KETRMAX */
1818     {
1819 ph10 604 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1820     RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1821 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1822     ecode += 1 + LINK_SIZE;
1823     goto TAIL_RECURSE;
1824     }
1825     /* Control never gets here */
1826 nigel 77
1827 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
1828 nigel 77
1829     case OP_CIRC:
1830 ph10 510 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1831 ph10 602
1832 nigel 77 /* Start of subject assertion */
1833    
1834     case OP_SOD:
1835 ph10 510 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1836 nigel 77 ecode++;
1837     break;
1838 ph10 602
1839     /* Multiline mode: start of subject unless notbol, or after any newline. */
1840 nigel 77
1841 ph10 602 case OP_CIRCM:
1842     if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1843     if (eptr != md->start_subject &&
1844     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1845     MRRETURN(MATCH_NOMATCH);
1846     ecode++;
1847     break;
1848    
1849 nigel 77 /* Start of match assertion */
1850    
1851     case OP_SOM:
1852 ph10 510 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1853 nigel 77 ecode++;
1854     break;
1855 ph10 172
1856 ph10 168 /* Reset the start of match point */
1857 ph10 172
1858 ph10 168 case OP_SET_SOM:
1859     mstart = eptr;
1860 ph10 172 ecode++;
1861     break;
1862 nigel 77
1863 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
1864     unless noteol is set. */
1865 nigel 77
1866 ph10 602 case OP_DOLLM:
1867     if (eptr < md->end_subject)
1868     { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1869     else
1870 nigel 77 {
1871 ph10 510 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1872 ph10 602 SCHECK_PARTIAL();
1873 nigel 77 }
1874 ph10 602 ecode++;
1875     break;
1876 ph10 579
1877 ph10 602 /* Not multiline mode: assert before a terminating newline or before end of
1878     subject unless noteol is set. */
1879    
1880     case OP_DOLL:
1881     if (md->noteol) MRRETURN(MATCH_NOMATCH);
1882     if (!md->endonly) goto ASSERT_NL_OR_EOS;
1883    
1884 nigel 91 /* ... else fall through for endonly */
1885 nigel 77
1886     /* End of subject assertion (\z) */
1887    
1888     case OP_EOD:
1889 ph10 510 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1890 ph10 553 SCHECK_PARTIAL();
1891 nigel 77 ecode++;
1892     break;
1893    
1894     /* End of subject or ending \n assertion (\Z) */
1895    
1896     case OP_EODN:
1897 ph10 553 ASSERT_NL_OR_EOS:
1898     if (eptr < md->end_subject &&
1899 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1900 ph10 510 MRRETURN(MATCH_NOMATCH);
1901 ph10 579
1902 ph10 553 /* Either at end of string or \n before end. */
1903 ph10 579
1904 ph10 553 SCHECK_PARTIAL();
1905 nigel 77 ecode++;
1906     break;
1907    
1908     /* Word boundary assertions */
1909    
1910     case OP_NOT_WORD_BOUNDARY:
1911     case OP_WORD_BOUNDARY:
1912     {
1913    
1914     /* Find out if the previous and current characters are "word" characters.
1915     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1916 ph10 443 be "non-word" characters. Remember the earliest consulted character for
1917 ph10 435 partial matching. */
1918 nigel 77
1919     #ifdef SUPPORT_UTF8
1920     if (utf8)
1921     {
1922 ph10 518 /* Get status of previous character */
1923 ph10 527
1924 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
1925     {
1926 ph10 409 USPTR lastptr = eptr - 1;
1927 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1928 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1929 nigel 77 GETCHAR(c, lastptr);
1930 ph10 527 #ifdef SUPPORT_UCP
1931 ph10 518 if (md->use_ucp)
1932     {
1933     if (c == '_') prev_is_word = TRUE; else
1934 ph10 527 {
1935 ph10 518 int cat = UCD_CATEGORY(c);
1936     prev_is_word = (cat == ucp_L || cat == ucp_N);
1937 ph10 527 }
1938     }
1939     else
1940     #endif
1941 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1942     }
1943 ph10 527
1944 ph10 518 /* Get status of next character */
1945 ph10 527
1946 ph10 443 if (eptr >= md->end_subject)
1947 nigel 77 {
1948 ph10 443 SCHECK_PARTIAL();
1949     cur_is_word = FALSE;
1950 ph10 428 }
1951     else
1952     {
1953 nigel 77 GETCHAR(c, eptr);
1954 ph10 527 #ifdef SUPPORT_UCP
1955 ph10 518 if (md->use_ucp)
1956     {
1957     if (c == '_') cur_is_word = TRUE; else
1958 ph10 527 {
1959 ph10 518 int cat = UCD_CATEGORY(c);
1960     cur_is_word = (cat == ucp_L || cat == ucp_N);
1961 ph10 527 }
1962     }
1963     else
1964     #endif
1965 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1966     }
1967     }
1968     else
1969     #endif
1970    
1971 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1972 ph10 518 consistency with the behaviour of \w we do use it in this case. */
1973 nigel 77
1974     {
1975 ph10 518 /* Get status of previous character */
1976 ph10 527
1977 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
1978     {
1979 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1980 ph10 527 #ifdef SUPPORT_UCP
1981 ph10 518 if (md->use_ucp)
1982     {
1983 ph10 527 c = eptr[-1];
1984 ph10 518 if (c == '_') prev_is_word = TRUE; else
1985 ph10 527 {
1986 ph10 518 int cat = UCD_CATEGORY(c);
1987     prev_is_word = (cat == ucp_L || cat == ucp_N);
1988 ph10 527 }
1989     }
1990     else
1991     #endif
1992 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1993     }
1994 ph10 527
1995 ph10 518 /* Get status of next character */
1996 ph10 527
1997 ph10 443 if (eptr >= md->end_subject)
1998 ph10 428 {
1999 ph10 443 SCHECK_PARTIAL();
2000     cur_is_word = FALSE;
2001 ph10 428 }
2002 ph10 527 else
2003     #ifdef SUPPORT_UCP
2004 ph10 518 if (md->use_ucp)
2005     {
2006 ph10 527 c = *eptr;
2007 ph10 518 if (c == '_') cur_is_word = TRUE; else
2008 ph10 527 {
2009 ph10 518 int cat = UCD_CATEGORY(c);
2010     cur_is_word = (cat == ucp_L || cat == ucp_N);
2011 ph10 527 }
2012     }
2013     else
2014     #endif
2015 ph10 518 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2016 nigel 77 }
2017    
2018     /* Now see if the situation is what we want */
2019    
2020     if ((*ecode++ == OP_WORD_BOUNDARY)?
2021     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2022 ph10 510 MRRETURN(MATCH_NOMATCH);
2023 nigel 77 }
2024     break;
2025    
2026     /* Match a single character type; inline for speed */
2027    
2028     case OP_ANY:
2029 ph10 510 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2030 ph10 345 /* Fall through */
2031    
2032 ph10 341 case OP_ALLANY:
2033 ph10 443 if (eptr++ >= md->end_subject)
2034 ph10 428 {
2035 ph10 443 SCHECK_PARTIAL();
2036 ph10 510 MRRETURN(MATCH_NOMATCH);
2037 ph10 443 }
2038 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2039 nigel 77 ecode++;
2040     break;
2041    
2042     /* Match a single byte, even in UTF-8 mode. This opcode really does match
2043     any byte, even newline, independent of the setting of PCRE_DOTALL. */
2044    
2045     case OP_ANYBYTE:
2046 ph10 443 if (eptr++ >= md->end_subject)
2047 ph10 428 {
2048 ph10 443 SCHECK_PARTIAL();
2049 ph10 510 MRRETURN(MATCH_NOMATCH);
2050 ph10 443 }
2051 nigel 77 ecode++;
2052     break;
2053    
2054     case OP_NOT_DIGIT:
2055 ph10 443 if (eptr >= md->end_subject)
2056 ph10 428 {
2057 ph10 443 SCHECK_PARTIAL();
2058 ph10 510 MRRETURN(MATCH_NOMATCH);
2059 ph10 443 }
2060 nigel 77 GETCHARINCTEST(c, eptr);
2061     if (
2062     #ifdef SUPPORT_UTF8
2063     c < 256 &&
2064     #endif
2065     (md->ctypes[c] & ctype_digit) != 0
2066     )
2067 ph10 510 MRRETURN(MATCH_NOMATCH);
2068 nigel 77 ecode++;
2069     break;
2070    
2071     case OP_DIGIT:
2072 ph10 443 if (eptr >= md->end_subject)
2073 ph10 428 {
2074 ph10 443 SCHECK_PARTIAL();
2075 ph10 510 MRRETURN(MATCH_NOMATCH);
2076 ph10 443 }
2077 nigel 77 GETCHARINCTEST(c, eptr);
2078     if (
2079     #ifdef SUPPORT_UTF8
2080     c >= 256 ||
2081     #endif
2082     (md->ctypes[c] & ctype_digit) == 0
2083     )
2084 ph10 510 MRRETURN(MATCH_NOMATCH);
2085 nigel 77 ecode++;
2086     break;
2087    
2088     case OP_NOT_WHITESPACE:
2089 ph10 443 if (eptr >= md->end_subject)
2090 ph10 428 {
2091 ph10 443 SCHECK_PARTIAL();
2092 ph10 510 MRRETURN(MATCH_NOMATCH);
2093 ph10 443 }
2094 nigel 77 GETCHARINCTEST(c, eptr);
2095     if (
2096     #ifdef SUPPORT_UTF8
2097     c < 256 &&
2098     #endif
2099     (md->ctypes[c] & ctype_space) != 0
2100     )
2101 ph10 510 MRRETURN(MATCH_NOMATCH);
2102 nigel 77 ecode++;
2103     break;
2104    
2105     case OP_WHITESPACE:
2106 ph10 443 if (eptr >= md->end_subject)
2107 ph10 428 {
2108 ph10 443 SCHECK_PARTIAL();
2109 ph10 510 MRRETURN(MATCH_NOMATCH);
2110 ph10 443 }
2111 nigel 77 GETCHARINCTEST(c, eptr);
2112     if (
2113     #ifdef SUPPORT_UTF8
2114     c >= 256 ||
2115     #endif
2116     (md->ctypes[c] & ctype_space) == 0
2117     )
2118 ph10 510 MRRETURN(MATCH_NOMATCH);
2119 nigel 77 ecode++;
2120     break;
2121    
2122     case OP_NOT_WORDCHAR:
2123 ph10 443 if (eptr >= md->end_subject)
2124 ph10 428 {
2125 ph10 443 SCHECK_PARTIAL();
2126 ph10 510 MRRETURN(MATCH_NOMATCH);
2127 ph10 443 }
2128 nigel 77 GETCHARINCTEST(c, eptr);
2129     if (
2130     #ifdef SUPPORT_UTF8
2131     c < 256 &&
2132     #endif
2133     (md->ctypes[c] & ctype_word) != 0
2134     )
2135 ph10 510 MRRETURN(MATCH_NOMATCH);
2136 nigel 77 ecode++;
2137     break;
2138    
2139     case OP_WORDCHAR:
2140 ph10 443 if (eptr >= md->end_subject)
2141 ph10 428 {
2142 ph10 443 SCHECK_PARTIAL();
2143 ph10 510 MRRETURN(MATCH_NOMATCH);
2144 ph10 443 }
2145 nigel 77 GETCHARINCTEST(c, eptr);
2146     if (
2147     #ifdef SUPPORT_UTF8
2148     c >= 256 ||
2149     #endif
2150     (md->ctypes[c] & ctype_word) == 0
2151     )
2152 ph10 510 MRRETURN(MATCH_NOMATCH);
2153 nigel 77 ecode++;
2154     break;
2155    
2156 nigel 93 case OP_ANYNL:
2157 ph10 443 if (eptr >= md->end_subject)
2158 ph10 428 {
2159 ph10 443 SCHECK_PARTIAL();
2160 ph10 510 MRRETURN(MATCH_NOMATCH);
2161 ph10 443 }
2162 nigel 93 GETCHARINCTEST(c, eptr);
2163     switch(c)
2164     {
2165 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2166 ph10 600
2167 nigel 93 case 0x000d:
2168     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2169     break;
2170 ph10 231
2171 nigel 93 case 0x000a:
2172 ph10 231 break;
2173    
2174 nigel 93 case 0x000b:
2175     case 0x000c:
2176     case 0x0085:
2177     case 0x2028:
2178     case 0x2029:
2179 ph10 510 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2180 nigel 93 break;
2181     }
2182     ecode++;
2183     break;
2184    
2185 ph10 178 case OP_NOT_HSPACE:
2186 ph10 443 if (eptr >= md->end_subject)
2187 ph10 428 {
2188 ph10 443 SCHECK_PARTIAL();
2189 ph10 510 MRRETURN(MATCH_NOMATCH);
2190 ph10 443 }
2191 ph10 178 GETCHARINCTEST(c, eptr);
2192     switch(c)
2193     {
2194     default: break;
2195     case 0x09: /* HT */
2196     case 0x20: /* SPACE */
2197     case 0xa0: /* NBSP */
2198     case 0x1680: /* OGHAM SPACE MARK */
2199     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2200     case 0x2000: /* EN QUAD */
2201     case 0x2001: /* EM QUAD */
2202     case 0x2002: /* EN SPACE */
2203     case 0x2003: /* EM SPACE */
2204     case 0x2004: /* THREE-PER-EM SPACE */
2205     case 0x2005: /* FOUR-PER-EM SPACE */
2206     case 0x2006: /* SIX-PER-EM SPACE */
2207     case 0x2007: /* FIGURE SPACE */
2208     case 0x2008: /* PUNCTUATION SPACE */
2209     case 0x2009: /* THIN SPACE */
2210     case 0x200A: /* HAIR SPACE */
2211     case 0x202f: /* NARROW NO-BREAK SPACE */
2212     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2213     case 0x3000: /* IDEOGRAPHIC SPACE */
2214 ph10 510 MRRETURN(MATCH_NOMATCH);
2215 ph10 178 }
2216     ecode++;
2217     break;
2218    
2219     case OP_HSPACE:
2220 ph10 443 if (eptr >= md->end_subject)
2221 ph10 428 {
2222 ph10 443 SCHECK_PARTIAL();
2223 ph10 510 MRRETURN(MATCH_NOMATCH);
2224 ph10 443 }
2225 ph10 178 GETCHARINCTEST(c, eptr);
2226     switch(c)
2227     {
2228 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2229 ph10 178 case 0x09: /* HT */
2230     case 0x20: /* SPACE */
2231     case 0xa0: /* NBSP */
2232     case 0x1680: /* OGHAM SPACE MARK */
2233     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2234     case 0x2000: /* EN QUAD */
2235     case 0x2001: /* EM QUAD */
2236     case 0x2002: /* EN SPACE */
2237     case 0x2003: /* EM SPACE */
2238     case 0x2004: /* THREE-PER-EM SPACE */
2239     case 0x2005: /* FOUR-PER-EM SPACE */
2240     case 0x2006: /* SIX-PER-EM SPACE */
2241     case 0x2007: /* FIGURE SPACE */
2242     case 0x2008: /* PUNCTUATION SPACE */
2243     case 0x2009: /* THIN SPACE */
2244     case 0x200A: /* HAIR SPACE */
2245     case 0x202f: /* NARROW NO-BREAK SPACE */
2246     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2247     case 0x3000: /* IDEOGRAPHIC SPACE */
2248     break;
2249     }
2250     ecode++;
2251     break;
2252    
2253     case OP_NOT_VSPACE:
2254 ph10 443 if (eptr >= md->end_subject)
2255 ph10 428 {
2256 ph10 443 SCHECK_PARTIAL();
2257 ph10 510 MRRETURN(MATCH_NOMATCH);
2258 ph10 443 }
2259 ph10 178 GETCHARINCTEST(c, eptr);
2260     switch(c)
2261     {
2262     default: break;
2263     case 0x0a: /* LF */
2264     case 0x0b: /* VT */
2265     case 0x0c: /* FF */
2266     case 0x0d: /* CR */
2267     case 0x85: /* NEL */
2268     case 0x2028: /* LINE SEPARATOR */
2269     case 0x2029: /* PARAGRAPH SEPARATOR */
2270 ph10 510 MRRETURN(MATCH_NOMATCH);
2271 ph10 178 }
2272     ecode++;
2273     break;
2274    
2275     case OP_VSPACE:
2276 ph10 443 if (eptr >= md->end_subject)
2277 ph10 428 {
2278 ph10 443 SCHECK_PARTIAL();
2279 ph10 510 MRRETURN(MATCH_NOMATCH);
2280 ph10 443 }
2281 ph10 178 GETCHARINCTEST(c, eptr);
2282     switch(c)
2283     {
2284 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2285 ph10 178 case 0x0a: /* LF */
2286     case 0x0b: /* VT */
2287     case 0x0c: /* FF */
2288     case 0x0d: /* CR */
2289     case 0x85: /* NEL */
2290     case 0x2028: /* LINE SEPARATOR */
2291     case 0x2029: /* PARAGRAPH SEPARATOR */
2292     break;
2293     }
2294     ecode++;
2295     break;
2296    
2297 nigel 77 #ifdef SUPPORT_UCP
2298     /* Check the next character by Unicode property. We will get here only
2299     if the support is in the binary; otherwise a compile-time error occurs. */
2300    
2301     case OP_PROP:
2302     case OP_NOTPROP:
2303 ph10 443 if (eptr >= md->end_subject)
2304 ph10 428 {
2305 ph10 443 SCHECK_PARTIAL();
2306 ph10 510 MRRETURN(MATCH_NOMATCH);
2307 ph10 443 }
2308 nigel 77 GETCHARINCTEST(c, eptr);
2309     {
2310 ph10 384 const ucd_record *prop = GET_UCD(c);
2311 nigel 77
2312 nigel 87 switch(ecode[1])
2313     {
2314     case PT_ANY:
2315 ph10 510 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2316 nigel 87 break;
2317 nigel 77
2318 nigel 87 case PT_LAMP:
2319 ph10 349 if ((prop->chartype == ucp_Lu ||
2320     prop->chartype == ucp_Ll ||
2321     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2322 ph10 510 MRRETURN(MATCH_NOMATCH);
2323 ph10 517 break;
2324 nigel 87
2325     case PT_GC:
2326 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2327 ph10 510 MRRETURN(MATCH_NOMATCH);
2328 nigel 87 break;
2329    
2330     case PT_PC:
2331 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2332 ph10 510 MRRETURN(MATCH_NOMATCH);
2333 nigel 87 break;
2334    
2335     case PT_SC:
2336 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2337 ph10 510 MRRETURN(MATCH_NOMATCH);
2338 nigel 87 break;
2339 ph10 527
2340 ph10 517 /* These are specials */
2341 ph10 527
2342 ph10 517 case PT_ALNUM:
2343     if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2344     _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2345     MRRETURN(MATCH_NOMATCH);
2346 ph10 527 break;
2347    
2348 ph10 517 case PT_SPACE: /* Perl space */
2349     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2350     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2351     == (op == OP_NOTPROP))
2352     MRRETURN(MATCH_NOMATCH);
2353 ph10 527 break;
2354    
2355 ph10 517 case PT_PXSPACE: /* POSIX space */
2356     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2357 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2358 ph10 517 c == CHAR_FF || c == CHAR_CR)
2359     == (op == OP_NOTPROP))
2360     MRRETURN(MATCH_NOMATCH);
2361 ph10 527 break;
2362 nigel 87
2363 ph10 527 case PT_WORD:
2364 ph10 517 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2365 ph10 527 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2366 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2367     MRRETURN(MATCH_NOMATCH);
2368 ph10 527 break;
2369    
2370 ph10 517 /* This should never occur */
2371    
2372 nigel 87 default:
2373     RRETURN(PCRE_ERROR_INTERNAL);
2374 nigel 77 }
2375 nigel 87
2376     ecode += 3;
2377 nigel 77 }
2378     break;
2379    
2380     /* Match an extended Unicode sequence. We will get here only if the support
2381     is in the binary; otherwise a compile-time error occurs. */
2382    
2383     case OP_EXTUNI:
2384 ph10 443 if (eptr >= md->end_subject)
2385 ph10 428 {
2386 ph10 443 SCHECK_PARTIAL();
2387 ph10 510 MRRETURN(MATCH_NOMATCH);
2388 ph10 443 }
2389 nigel 77 GETCHARINCTEST(c, eptr);
2390     {
2391 ph10 349 int category = UCD_CATEGORY(c);
2392 ph10 510 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2393 nigel 77 while (eptr < md->end_subject)
2394     {
2395     int len = 1;
2396     if (!utf8) c = *eptr; else
2397     {
2398     GETCHARLEN(c, eptr, len);
2399     }
2400 ph10 349 category = UCD_CATEGORY(c);
2401 nigel 77 if (category != ucp_M) break;
2402     eptr += len;
2403     }
2404     }
2405     ecode++;
2406     break;
2407     #endif
2408    
2409    
2410     /* Match a back reference, possibly repeatedly. Look past the end of the
2411     item to see if there is repeat information following. The code is similar
2412     to that for character classes, but repeated for efficiency. Then obey
2413     similar code to character type repeats - written out again for speed.
2414     However, if the referenced string is the empty string, always treat
2415     it as matched, any number of times (otherwise there could be infinite
2416     loops). */
2417    
2418     case OP_REF:
2419 ph10 602 case OP_REFI:
2420     caseless = op == OP_REFI;
2421 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2422     ecode += 3;
2423 ph10 345
2424 ph10 595 /* If the reference is unset, there are two possibilities:
2425 ph10 345
2426 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2427     this ensures that every attempt at a match fails. We can't just fail
2428     here, because of the possibility of quantifiers with zero minima.
2429 ph10 345
2430 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2431     so that the back reference matches an empty string.
2432 ph10 345
2433 ph10 595 Otherwise, set the length to the length of what was matched by the
2434     referenced subpattern. */
2435 ph10 345
2436 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2437     length = (md->jscript_compat)? 0 : -1;
2438     else
2439     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2440 nigel 77
2441 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2442 nigel 77
2443 ph10 595 switch (*ecode)
2444     {
2445     case OP_CRSTAR:
2446     case OP_CRMINSTAR:
2447     case OP_CRPLUS:
2448     case OP_CRMINPLUS:
2449     case OP_CRQUERY:
2450     case OP_CRMINQUERY:
2451     c = *ecode++ - OP_CRSTAR;
2452     minimize = (c & 1) != 0;
2453     min = rep_min[c]; /* Pick up values from tables; */
2454     max = rep_max[c]; /* zero for max => infinity */
2455     if (max == 0) max = INT_MAX;
2456     break;
2457 nigel 77
2458 ph10 595 case OP_CRRANGE:
2459     case OP_CRMINRANGE:
2460     minimize = (*ecode == OP_CRMINRANGE);
2461     min = GET2(ecode, 1);
2462     max = GET2(ecode, 3);
2463     if (max == 0) max = INT_MAX;
2464     ecode += 5;
2465     break;
2466 nigel 77
2467 ph10 595 default: /* No repeat follows */
2468 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2469 ph10 595 {
2470     CHECK_PARTIAL();
2471     MRRETURN(MATCH_NOMATCH);
2472 nigel 77 }
2473 ph10 595 eptr += length;
2474     continue; /* With the main loop */
2475     }
2476 nigel 77
2477 ph10 595 /* Handle repeated back references. If the length of the reference is
2478     zero, just continue with the main loop. */
2479 ph10 443
2480 ph10 595 if (length == 0) continue;
2481 nigel 77
2482 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2483     the length of the reference string explicitly rather than passing the
2484     address of eptr, so that eptr can be a register variable. */
2485 nigel 77
2486 ph10 595 for (i = 1; i <= min; i++)
2487     {
2488     int slength;
2489 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2490 nigel 77 {
2491 ph10 595 CHECK_PARTIAL();
2492     MRRETURN(MATCH_NOMATCH);
2493 nigel 77 }
2494 ph10 595 eptr += slength;
2495     }
2496 nigel 77
2497 ph10 595 /* If min = max, continue at the same level without recursion.
2498     They are not both allowed to be zero. */
2499 nigel 77
2500 ph10 595 if (min == max) continue;
2501 nigel 77
2502 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2503 nigel 77
2504 ph10 595 if (minimize)
2505     {
2506     for (fi = min;; fi++)
2507 nigel 77 {
2508 ph10 595 int slength;
2509 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2510 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2511     if (fi >= max) MRRETURN(MATCH_NOMATCH);
2512 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2513 nigel 77 {
2514 ph10 595 CHECK_PARTIAL();
2515     MRRETURN(MATCH_NOMATCH);
2516 nigel 77 }
2517 ph10 595 eptr += slength;
2518 nigel 77 }
2519 ph10 595 /* Control never gets here */
2520     }
2521 nigel 77
2522 ph10 595 /* If maximizing, find the longest string and work backwards */
2523 nigel 77
2524 ph10 595 else
2525     {
2526     pp = eptr;
2527     for (i = min; i < max; i++)
2528 nigel 77 {
2529 ph10 595 int slength;
2530 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2531 nigel 77 {
2532 ph10 595 CHECK_PARTIAL();
2533     break;
2534 nigel 77 }
2535 ph10 595 eptr += slength;
2536 nigel 77 }
2537 ph10 595 while (eptr >= pp)
2538     {
2539 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2540 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2541     eptr -= length;
2542     }
2543     MRRETURN(MATCH_NOMATCH);
2544 nigel 77 }
2545     /* Control never gets here */
2546    
2547     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2548     used when all the characters in the class have values in the range 0-255,
2549     and either the matching is caseful, or the characters are in the range
2550     0-127 when UTF-8 processing is enabled. The only difference between
2551     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2552     encountered.
2553    
2554     First, look past the end of the item to see if there is repeat information
2555     following. Then obey similar code to character type repeats - written out
2556     again for speed. */
2557    
2558     case OP_NCLASS:
2559     case OP_CLASS:
2560     {
2561     data = ecode + 1; /* Save for matching */
2562     ecode += 33; /* Advance past the item */
2563    
2564     switch (*ecode)
2565     {
2566     case OP_CRSTAR:
2567     case OP_CRMINSTAR:
2568     case OP_CRPLUS:
2569     case OP_CRMINPLUS:
2570     case OP_CRQUERY:
2571     case OP_CRMINQUERY:
2572     c = *ecode++ - OP_CRSTAR;
2573     minimize = (c & 1) != 0;
2574     min = rep_min[c]; /* Pick up values from tables; */
2575     max = rep_max[c]; /* zero for max => infinity */
2576     if (max == 0) max = INT_MAX;
2577     break;
2578    
2579     case OP_CRRANGE:
2580     case OP_CRMINRANGE:
2581     minimize = (*ecode == OP_CRMINRANGE);
2582     min = GET2(ecode, 1);
2583     max = GET2(ecode, 3);
2584     if (max == 0) max = INT_MAX;
2585     ecode += 5;
2586     break;
2587    
2588     default: /* No repeat follows */
2589     min = max = 1;
2590     break;
2591     }
2592    
2593     /* First, ensure the minimum number of matches are present. */
2594    
2595     #ifdef SUPPORT_UTF8
2596     /* UTF-8 mode */
2597     if (utf8)
2598     {
2599     for (i = 1; i <= min; i++)
2600     {
2601 ph10 427 if (eptr >= md->end_subject)
2602 ph10 426 {
2603 ph10 428 SCHECK_PARTIAL();
2604 ph10 510 MRRETURN(MATCH_NOMATCH);
2605 ph10 427 }
2606 nigel 77 GETCHARINC(c, eptr);
2607     if (c > 255)
2608     {
2609 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2610 nigel 77 }
2611     else
2612     {
2613 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2614 nigel 77 }
2615     }
2616     }
2617     else
2618     #endif
2619     /* Not UTF-8 mode */
2620     {
2621     for (i = 1; i <= min; i++)
2622     {
2623 ph10 427 if (eptr >= md->end_subject)
2624 ph10 426 {
2625 ph10 428 SCHECK_PARTIAL();
2626 ph10 510 MRRETURN(MATCH_NOMATCH);
2627 ph10 427 }
2628 nigel 77 c = *eptr++;
2629 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2630 nigel 77 }
2631     }
2632    
2633     /* If max == min we can continue with the main loop without the
2634     need to recurse. */
2635    
2636     if (min == max) continue;
2637    
2638     /* If minimizing, keep testing the rest of the expression and advancing
2639     the pointer while it matches the class. */
2640    
2641     if (minimize)
2642     {
2643     #ifdef SUPPORT_UTF8
2644     /* UTF-8 mode */
2645     if (utf8)
2646     {
2647     for (fi = min;; fi++)
2648     {
2649 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2650 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2651 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2652 ph10 427 if (eptr >= md->end_subject)
2653 ph10 426 {
2654 ph10 427 SCHECK_PARTIAL();
2655 ph10 510 MRRETURN(MATCH_NOMATCH);
2656 ph10 427 }
2657 nigel 77 GETCHARINC(c, eptr);
2658     if (c > 255)
2659     {
2660 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2661 nigel 77 }
2662     else
2663     {
2664 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2665 nigel 77 }
2666     }
2667     }
2668     else
2669     #endif
2670     /* Not UTF-8 mode */
2671     {
2672     for (fi = min;; fi++)
2673     {
2674 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2675 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2676 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2677 ph10 427 if (eptr >= md->end_subject)
2678 ph10 426 {
2679 ph10 427 SCHECK_PARTIAL();
2680 ph10 510 MRRETURN(MATCH_NOMATCH);
2681 ph10 427 }
2682 nigel 77 c = *eptr++;
2683 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2684 nigel 77 }
2685     }
2686     /* Control never gets here */
2687     }
2688    
2689     /* If maximizing, find the longest possible run, then work backwards. */
2690    
2691     else
2692     {
2693     pp = eptr;
2694    
2695     #ifdef SUPPORT_UTF8
2696     /* UTF-8 mode */
2697     if (utf8)
2698     {
2699     for (i = min; i < max; i++)
2700     {
2701     int len = 1;
2702 ph10 463 if (eptr >= md->end_subject)
2703 ph10 462 {
2704 ph10 463 SCHECK_PARTIAL();
2705 ph10 462 break;
2706 ph10 463 }
2707 nigel 77 GETCHARLEN(c, eptr, len);
2708     if (c > 255)
2709     {
2710     if (op == OP_CLASS) break;
2711     }
2712     else
2713     {
2714     if ((data[c/8] & (1 << (c&7))) == 0) break;
2715     }
2716     eptr += len;
2717     }
2718     for (;;)
2719     {
2720 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2721 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2722     if (eptr-- == pp) break; /* Stop if tried at original pos */
2723     BACKCHAR(eptr);
2724     }
2725     }
2726     else
2727     #endif
2728     /* Not UTF-8 mode */
2729     {
2730     for (i = min; i < max; i++)
2731     {
2732 ph10 463 if (eptr >= md->end_subject)
2733 ph10 462 {
2734 ph10 463 SCHECK_PARTIAL();
2735 ph10 462 break;
2736 ph10 463 }
2737 nigel 77 c = *eptr;
2738     if ((data[c/8] & (1 << (c&7))) == 0) break;
2739     eptr++;
2740     }
2741     while (eptr >= pp)
2742     {
2743 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2744 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2745 nigel 77 eptr--;
2746     }
2747     }
2748    
2749 ph10 510 MRRETURN(MATCH_NOMATCH);
2750 nigel 77 }
2751     }
2752     /* Control never gets here */
2753    
2754    
2755     /* Match an extended character class. This opcode is encountered only
2756 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2757     mode, because Unicode properties are supported in non-UTF-8 mode. */
2758 nigel 77
2759     #ifdef SUPPORT_UTF8
2760     case OP_XCLASS:
2761     {
2762     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2763     ecode += GET(ecode, 1); /* Advance past the item */
2764    
2765     switch (*ecode)
2766     {
2767     case OP_CRSTAR:
2768     case OP_CRMINSTAR:
2769     case OP_CRPLUS:
2770     case OP_CRMINPLUS:
2771     case OP_CRQUERY:
2772     case OP_CRMINQUERY:
2773     c = *ecode++ - OP_CRSTAR;
2774     minimize = (c & 1) != 0;
2775     min = rep_min[c]; /* Pick up values from tables; */
2776     max = rep_max[c]; /* zero for max => infinity */
2777     if (max == 0) max = INT_MAX;
2778     break;
2779    
2780     case OP_CRRANGE:
2781     case OP_CRMINRANGE:
2782     minimize = (*ecode == OP_CRMINRANGE);
2783     min = GET2(ecode, 1);
2784     max = GET2(ecode, 3);
2785     if (max == 0) max = INT_MAX;
2786     ecode += 5;
2787     break;
2788    
2789     default: /* No repeat follows */
2790     min = max = 1;
2791     break;
2792     }
2793    
2794     /* First, ensure the minimum number of matches are present. */
2795    
2796     for (i = 1; i <= min; i++)
2797     {
2798 ph10 427 if (eptr >= md->end_subject)
2799 ph10 426 {
2800     SCHECK_PARTIAL();
2801 ph10 510 MRRETURN(MATCH_NOMATCH);
2802 ph10 427 }
2803 ph10 384 GETCHARINCTEST(c, eptr);
2804 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2805 nigel 77 }
2806    
2807     /* If max == min we can continue with the main loop without the
2808     need to recurse. */
2809    
2810     if (min == max) continue;
2811    
2812     /* If minimizing, keep testing the rest of the expression and advancing
2813     the pointer while it matches the class. */
2814    
2815     if (minimize)
2816     {
2817     for (fi = min;; fi++)
2818     {
2819 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2820 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2821 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2822 ph10 427 if (eptr >= md->end_subject)
2823 ph10 426 {
2824 ph10 427 SCHECK_PARTIAL();
2825 ph10 510 MRRETURN(MATCH_NOMATCH);
2826 ph10 427 }
2827 ph10 384 GETCHARINCTEST(c, eptr);
2828 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2829 nigel 77 }
2830     /* Control never gets here */
2831     }
2832    
2833     /* If maximizing, find the longest possible run, then work backwards. */
2834    
2835     else
2836     {
2837     pp = eptr;
2838     for (i = min; i < max; i++)
2839     {
2840     int len = 1;
2841 ph10 463 if (eptr >= md->end_subject)
2842 ph10 462 {
2843 ph10 463 SCHECK_PARTIAL();
2844 ph10 462 break;
2845 ph10 463 }
2846 ph10 384 GETCHARLENTEST(c, eptr, len);
2847 nigel 77 if (!_pcre_xclass(c, data)) break;
2848     eptr += len;
2849     }
2850     for(;;)
2851     {
2852 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2853 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2854     if (eptr-- == pp) break; /* Stop if tried at original pos */
2855 ph10 214 if (utf8) BACKCHAR(eptr);
2856 nigel 77 }
2857 ph10 510 MRRETURN(MATCH_NOMATCH);
2858 nigel 77 }
2859    
2860     /* Control never gets here */
2861     }
2862     #endif /* End of XCLASS */
2863    
2864     /* Match a single character, casefully */
2865    
2866     case OP_CHAR:
2867     #ifdef SUPPORT_UTF8
2868     if (utf8)
2869     {
2870     length = 1;
2871     ecode++;
2872     GETCHARLEN(fc, ecode, length);
2873 ph10 443 if (length > md->end_subject - eptr)
2874 ph10 428 {
2875     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2876 ph10 510 MRRETURN(MATCH_NOMATCH);
2877 ph10 443 }
2878 ph10 510 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2879 nigel 77 }
2880     else
2881     #endif
2882    
2883     /* Non-UTF-8 mode */
2884     {
2885 ph10 443 if (md->end_subject - eptr < 1)
2886 ph10 428 {
2887     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2888 ph10 510 MRRETURN(MATCH_NOMATCH);
2889 ph10 443 }
2890 ph10 510 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2891 nigel 77 ecode += 2;
2892     }
2893     break;
2894    
2895     /* Match a single character, caselessly */
2896    
2897 ph10 602 case OP_CHARI:
2898 nigel 77 #ifdef SUPPORT_UTF8
2899     if (utf8)
2900     {
2901     length = 1;
2902     ecode++;
2903     GETCHARLEN(fc, ecode, length);
2904    
2905 ph10 443 if (length > md->end_subject - eptr)
2906 ph10 428 {
2907     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2908 ph10 510 MRRETURN(MATCH_NOMATCH);
2909 ph10 443 }
2910 nigel 77
2911     /* If the pattern character's value is < 128, we have only one byte, and
2912     can use the fast lookup table. */
2913    
2914     if (fc < 128)
2915     {
2916 ph10 510 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2917 nigel 77 }
2918    
2919     /* Otherwise we must pick up the subject character */
2920    
2921     else
2922     {
2923 nigel 93 unsigned int dc;
2924 nigel 77 GETCHARINC(dc, eptr);
2925     ecode += length;
2926    
2927     /* If we have Unicode property support, we can use it to test the other
2928 nigel 87 case of the character, if there is one. */
2929 nigel 77
2930     if (fc != dc)
2931     {
2932     #ifdef SUPPORT_UCP
2933 ph10 349 if (dc != UCD_OTHERCASE(fc))
2934 nigel 77 #endif
2935 ph10 510 MRRETURN(MATCH_NOMATCH);
2936 nigel 77 }
2937     }
2938     }
2939     else
2940     #endif /* SUPPORT_UTF8 */
2941    
2942     /* Non-UTF-8 mode */
2943     {
2944 ph10 443 if (md->end_subject - eptr < 1)
2945 ph10 428 {
2946 ph10 443 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2947 ph10 510 MRRETURN(MATCH_NOMATCH);
2948 ph10 443 }
2949 ph10 510 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2950 nigel 77 ecode += 2;
2951     }
2952     break;
2953    
2954 nigel 93 /* Match a single character repeatedly. */
2955 nigel 77
2956     case OP_EXACT:
2957 ph10 602 case OP_EXACTI:
2958 nigel 77 min = max = GET2(ecode, 1);
2959     ecode += 3;
2960     goto REPEATCHAR;
2961    
2962 nigel 93 case OP_POSUPTO:
2963 ph10 602 case OP_POSUPTOI:
2964 nigel 93 possessive = TRUE;
2965     /* Fall through */
2966    
2967 nigel 77 case OP_UPTO:
2968 ph10 602 case OP_UPTOI:
2969 nigel 77 case OP_MINUPTO:
2970 ph10 602 case OP_MINUPTOI:
2971 nigel 77 min = 0;
2972     max = GET2(ecode, 1);
2973 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2974 nigel 77 ecode += 3;
2975     goto REPEATCHAR;
2976    
2977 nigel 93 case OP_POSSTAR:
2978 ph10 602 case OP_POSSTARI:
2979 nigel 93 possessive = TRUE;
2980     min = 0;
2981     max = INT_MAX;
2982     ecode++;
2983     goto REPEATCHAR;
2984    
2985     case OP_POSPLUS:
2986 ph10 602 case OP_POSPLUSI:
2987 nigel 93 possessive = TRUE;
2988     min = 1;
2989     max = INT_MAX;
2990     ecode++;
2991     goto REPEATCHAR;
2992    
2993     case OP_POSQUERY:
2994 ph10 602 case OP_POSQUERYI:
2995 nigel 93 possessive = TRUE;
2996     min = 0;
2997     max = 1;
2998     ecode++;
2999     goto REPEATCHAR;
3000    
3001 nigel 77 case OP_STAR:
3002 ph10 602 case OP_STARI:
3003 nigel 77 case OP_MINSTAR:
3004 ph10 602 case OP_MINSTARI:
3005 nigel 77 case OP_PLUS:
3006 ph10 602 case OP_PLUSI:
3007 nigel 77 case OP_MINPLUS:
3008 ph10 602 case OP_MINPLUSI:
3009 nigel 77 case OP_QUERY:
3010 ph10 602 case OP_QUERYI:
3011 nigel 77 case OP_MINQUERY:
3012 ph10 602 case OP_MINQUERYI:
3013     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3014 nigel 77 minimize = (c & 1) != 0;
3015     min = rep_min[c]; /* Pick up values from tables; */
3016     max = rep_max[c]; /* zero for max => infinity */
3017     if (max == 0) max = INT_MAX;
3018    
3019 ph10 426 /* Common code for all repeated single-character matches. */
3020 nigel 77
3021     REPEATCHAR:
3022     #ifdef SUPPORT_UTF8
3023     if (utf8)
3024     {
3025     length = 1;
3026     charptr = ecode;
3027     GETCHARLEN(fc, ecode, length);
3028     ecode += length;
3029    
3030     /* Handle multibyte character matching specially here. There is
3031     support for caseless matching if UCP support is present. */
3032    
3033     if (length > 1)
3034     {
3035     #ifdef SUPPORT_UCP
3036 nigel 93 unsigned int othercase;
3037 ph10 602 if (op >= OP_STARI && /* Caseless */
3038 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
3039 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
3040 ph10 115 else oclength = 0;
3041 nigel 77 #endif /* SUPPORT_UCP */
3042    
3043     for (i = 1; i <= min; i++)
3044     {
3045 ph10 426 if (eptr <= md->end_subject - length &&
3046     memcmp(eptr, charptr, length) == 0) eptr += length;
3047 ph10 123 #ifdef SUPPORT_UCP
3048 ph10 426 else if (oclength > 0 &&
3049     eptr <= md->end_subject - oclength &&
3050     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3051     #endif /* SUPPORT_UCP */
3052 nigel 77 else
3053     {
3054 ph10 426 CHECK_PARTIAL();
3055 ph10 510 MRRETURN(MATCH_NOMATCH);
3056 nigel 77 }
3057     }
3058    
3059     if (min == max) continue;
3060    
3061     if (minimize)
3062     {
3063     for (fi = min;; fi++)
3064     {
3065 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3066 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3067 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3068 ph10 426 if (eptr <= md->end_subject - length &&
3069     memcmp(eptr, charptr, length) == 0) eptr += length;
3070 ph10 123 #ifdef SUPPORT_UCP
3071 ph10 426 else if (oclength > 0 &&
3072     eptr <= md->end_subject - oclength &&
3073     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3074     #endif /* SUPPORT_UCP */
3075 nigel 77 else
3076     {
3077 ph10 426 CHECK_PARTIAL();
3078 ph10 510 MRRETURN(MATCH_NOMATCH);
3079 nigel 77 }
3080     }
3081     /* Control never gets here */
3082     }
3083 nigel 93
3084     else /* Maximize */
3085 nigel 77 {
3086     pp = eptr;
3087     for (i = min; i < max; i++)
3088     {
3089 ph10 426 if (eptr <= md->end_subject - length &&
3090     memcmp(eptr, charptr, length) == 0) eptr += length;
3091 ph10 123 #ifdef SUPPORT_UCP
3092 ph10 426 else if (oclength > 0 &&
3093     eptr <= md->end_subject - oclength &&
3094     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3095     #endif /* SUPPORT_UCP */
3096 ph10 463 else
3097 ph10 462 {
3098 ph10 463 CHECK_PARTIAL();
3099 ph10 462 break;
3100 ph10 463 }
3101 nigel 77 }
3102 nigel 93
3103     if (possessive) continue;
3104 ph10 427
3105 ph10 120 for(;;)
3106 ph10 426 {
3107 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3108 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3109 ph10 510 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3110 ph10 115 #ifdef SUPPORT_UCP
3111 ph10 426 eptr--;
3112     BACKCHAR(eptr);
3113 ph10 123 #else /* without SUPPORT_UCP */
3114 ph10 426 eptr -= length;
3115 ph10 123 #endif /* SUPPORT_UCP */
3116 ph10 426 }
3117 nigel 77 }
3118     /* Control never gets here */
3119     }
3120    
3121     /* If the length of a UTF-8 character is 1, we fall through here, and
3122     obey the code as for non-UTF-8 characters below, though in this case the
3123     value of fc will always be < 128. */
3124     }
3125     else
3126     #endif /* SUPPORT_UTF8 */
3127    
3128     /* When not in UTF-8 mode, load a single-byte character. */
3129    
3130 ph10 426 fc = *ecode++;
3131 ph10 443
3132 nigel 77 /* The value of fc at this point is always less than 256, though we may or
3133     may not be in UTF-8 mode. The code is duplicated for the caseless and
3134     caseful cases, for speed, since matching characters is likely to be quite
3135     common. First, ensure the minimum number of matches are present. If min =
3136     max, continue at the same level without recursing. Otherwise, if
3137     minimizing, keep trying the rest of the expression and advancing one
3138     matching character if failing, up to the maximum. Alternatively, if
3139     maximizing, find the maximum number of characters and work backwards. */
3140    
3141     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3142     max, eptr));
3143    
3144 ph10 602 if (op >= OP_STARI) /* Caseless */
3145 nigel 77 {
3146     fc = md->lcc[fc];
3147     for (i = 1; i <= min; i++)
3148 ph10 426 {
3149     if (eptr >= md->end_subject)
3150     {
3151     SCHECK_PARTIAL();
3152 ph10 510 MRRETURN(MATCH_NOMATCH);
3153 ph10 426 }
3154 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3155 ph10 426 }
3156 nigel 77 if (min == max) continue;
3157     if (minimize)
3158     {
3159     for (fi = min;; fi++)
3160     {
3161 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3162 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3163 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3164 ph10 426 if (eptr >= md->end_subject)
3165     {
3166 ph10 427 SCHECK_PARTIAL();
3167 ph10 510 MRRETURN(MATCH_NOMATCH);
3168 ph10 426 }
3169 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3170 nigel 77 }
3171     /* Control never gets here */
3172     }
3173 nigel 93 else /* Maximize */
3174 nigel 77 {
3175     pp = eptr;
3176     for (i = min; i < max; i++)
3177     {
3178 ph10 463 if (eptr >= md->end_subject)
3179 ph10 462 {
3180     SCHECK_PARTIAL();
3181     break;
3182 ph10 463 }
3183 ph10 462 if (fc != md->lcc[*eptr]) break;
3184 nigel 77 eptr++;
3185     }
3186 ph10 427
3187 nigel 93 if (possessive) continue;
3188 ph10 427
3189 nigel 77 while (eptr >= pp)
3190     {
3191 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3192 nigel 77 eptr--;
3193     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3194     }
3195 ph10 510 MRRETURN(MATCH_NOMATCH);
3196 nigel 77 }
3197     /* Control never gets here */
3198     }
3199    
3200     /* Caseful comparisons (includes all multi-byte characters) */
3201    
3202     else
3203     {
3204 ph10 427 for (i = 1; i <= min; i++)
3205 ph10 426 {
3206     if (eptr >= md->end_subject)
3207     {
3208     SCHECK_PARTIAL();
3209 ph10 510 MRRETURN(MATCH_NOMATCH);
3210 ph10 426 }
3211 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3212 ph10 427 }
3213 ph10 443
3214 nigel 77 if (min == max) continue;
3215 ph10 443
3216 nigel 77 if (minimize)
3217     {
3218     for (fi = min;; fi++)
3219     {
3220 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3221 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3222 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3223 ph10 426 if (eptr >= md->end_subject)
3224 ph10 427 {
3225 ph10 426 SCHECK_PARTIAL();
3226 ph10 510 MRRETURN(MATCH_NOMATCH);
3227 ph10 427 }
3228 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3229 nigel 77 }
3230     /* Control never gets here */
3231     }
3232 nigel 93 else /* Maximize */
3233 nigel 77 {
3234     pp = eptr;
3235     for (i = min; i < max; i++)
3236     {
3237 ph10 463 if (eptr >= md->end_subject)
3238 ph10 462 {
3239 ph10 463 SCHECK_PARTIAL();
3240 ph10 462 break;
3241 ph10 463 }
3242 ph10 462 if (fc != *eptr) break;
3243 nigel 77 eptr++;
3244     }
3245 nigel 93 if (possessive) continue;
3246 ph10 443
3247 nigel 77 while (eptr >= pp)
3248     {
3249 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3250 nigel 77 eptr--;
3251     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3252     }
3253 ph10 510 MRRETURN(MATCH_NOMATCH);
3254 nigel 77 }
3255     }
3256     /* Control never gets here */
3257    
3258     /* Match a negated single one-byte character. The character we are
3259     checking can be multibyte. */
3260    
3261     case OP_NOT:
3262 ph10 602 case OP_NOTI:
3263 ph10 443 if (eptr >= md->end_subject)
3264 ph10 428 {
3265 ph10 443 SCHECK_PARTIAL();
3266 ph10 510 MRRETURN(MATCH_NOMATCH);
3267 ph10 443 }
3268 nigel 77 ecode++;
3269     GETCHARINCTEST(c, eptr);
3270 ph10 602 if (op == OP_NOTI) /* The caseless case */
3271 nigel 77 {
3272     #ifdef SUPPORT_UTF8
3273     if (c < 256)
3274     #endif
3275     c = md->lcc[c];
3276 ph10 510 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3277 nigel 77 }
3278 ph10 602 else /* Caseful */
3279 nigel 77 {
3280 ph10 510 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3281 nigel 77 }
3282     break;
3283    
3284     /* Match a negated single one-byte character repeatedly. This is almost a
3285     repeat of the code for a repeated single character, but I haven't found a
3286     nice way of commoning these up that doesn't require a test of the
3287     positive/negative option for each character match. Maybe that wouldn't add
3288     very much to the time taken, but character matching *is* what this is all
3289     about... */
3290    
3291     case OP_NOTEXACT:
3292 ph10 602 case OP_NOTEXACTI:
3293 nigel 77 min = max = GET2(ecode, 1);
3294     ecode += 3;
3295     goto REPEATNOTCHAR;
3296    
3297     case OP_NOTUPTO:
3298 ph10 602 case OP_NOTUPTOI:
3299 nigel 77 case OP_NOTMINUPTO:
3300 ph10 602 case OP_NOTMINUPTOI:
3301 nigel 77 min = 0;
3302     max = GET2(ecode, 1);
3303 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3304 nigel 77 ecode += 3;
3305     goto REPEATNOTCHAR;
3306    
3307 nigel 93 case OP_NOTPOSSTAR:
3308 ph10 602 case OP_NOTPOSSTARI:
3309 nigel 93 possessive = TRUE;
3310     min = 0;
3311     max = INT_MAX;
3312     ecode++;
3313     goto REPEATNOTCHAR;
3314    
3315     case OP_NOTPOSPLUS:
3316 ph10 602 case OP_NOTPOSPLUSI:
3317 nigel 93 possessive = TRUE;
3318     min = 1;
3319     max = INT_MAX;
3320     ecode++;
3321     goto REPEATNOTCHAR;
3322    
3323     case OP_NOTPOSQUERY:
3324 ph10 602 case OP_NOTPOSQUERYI:
3325 nigel 93 possessive = TRUE;
3326     min = 0;
3327     max = 1;
3328     ecode++;
3329     goto REPEATNOTCHAR;
3330    
3331     case OP_NOTPOSUPTO:
3332 ph10 602 case OP_NOTPOSUPTOI:
3333 nigel 93 possessive = TRUE;
3334     min = 0;
3335     max = GET2(ecode, 1);
3336     ecode += 3;
3337     goto REPEATNOTCHAR;
3338    
3339 nigel 77 case OP_NOTSTAR:
3340 ph10 602 case OP_NOTSTARI:
3341 nigel 77 case OP_NOTMINSTAR:
3342 ph10 602 case OP_NOTMINSTARI:
3343 nigel 77 case OP_NOTPLUS:
3344 ph10 602 case OP_NOTPLUSI:
3345 nigel 77 case OP_NOTMINPLUS:
3346 ph10 602 case OP_NOTMINPLUSI:
3347 nigel 77 case OP_NOTQUERY:
3348 ph10 602 case OP_NOTQUERYI:
3349 nigel 77 case OP_NOTMINQUERY:
3350 ph10 602 case OP_NOTMINQUERYI:
3351     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3352 nigel 77 minimize = (c & 1) != 0;
3353     min = rep_min[c]; /* Pick up values from tables; */
3354     max = rep_max[c]; /* zero for max => infinity */
3355     if (max == 0) max = INT_MAX;
3356    
3357 ph10 426 /* Common code for all repeated single-byte matches. */
3358 nigel 77
3359     REPEATNOTCHAR:
3360     fc = *ecode++;
3361    
3362     /* The code is duplicated for the caseless and caseful cases, for speed,
3363     since matching characters is likely to be quite common. First, ensure the
3364     minimum number of matches are present. If min = max, continue at the same
3365     level without recursing. Otherwise, if minimizing, keep trying the rest of
3366     the expression and advancing one matching character if failing, up to the
3367     maximum. Alternatively, if maximizing, find the maximum number of
3368     characters and work backwards. */
3369    
3370     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3371     max, eptr));
3372    
3373 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3374 nigel 77 {
3375     fc = md->lcc[fc];
3376    
3377     #ifdef SUPPORT_UTF8
3378     /* UTF-8 mode */
3379     if (utf8)
3380     {
3381 nigel 93 register unsigned int d;
3382 nigel 77 for (i = 1; i <= min; i++)
3383     {
3384 ph10 426 if (eptr >= md->end_subject)
3385     {
3386     SCHECK_PARTIAL();
3387 ph10 510 MRRETURN(MATCH_NOMATCH);
3388 ph10 427 }
3389 nigel 77 GETCHARINC(d, eptr);
3390     if (d < 256) d = md->lcc[d];
3391 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3392 nigel 77 }
3393     }
3394     else
3395     #endif
3396    
3397     /* Not UTF-8 mode */
3398     {
3399     for (i = 1; i <= min; i++)
3400 ph10 426 {
3401     if (eptr >= md->end_subject)
3402     {
3403     SCHECK_PARTIAL();
3404 ph10 510 MRRETURN(MATCH_NOMATCH);
3405 ph10 427 }
3406 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3407 ph10 427 }
3408 nigel 77 }
3409    
3410     if (min == max) continue;
3411    
3412     if (minimize)
3413     {
3414     #ifdef SUPPORT_UTF8
3415     /* UTF-8 mode */
3416     if (utf8)
3417     {
3418 nigel 93 register unsigned int d;
3419 nigel 77 for (fi = min;; fi++)
3420     {
3421 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3422 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3423 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3424 ph10 427 if (eptr >= md->end_subject)
3425 ph10 426 {
3426 ph10 427 SCHECK_PARTIAL();
3427 ph10 510 MRRETURN(MATCH_NOMATCH);
3428 ph10 427 }
3429 nigel 77 GETCHARINC(d, eptr);
3430     if (d < 256) d = md->lcc[d];
3431 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3432 nigel 77 }
3433     }
3434     else
3435     #endif
3436     /* Not UTF-8 mode */
3437     {
3438     for (fi = min;; fi++)
3439     {
3440 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3441 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3442 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3443 ph10 426 if (eptr >= md->end_subject)
3444     {
3445     SCHECK_PARTIAL();
3446 ph10 510 MRRETURN(MATCH_NOMATCH);
3447 ph10 426 }
3448 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3449 nigel 77 }
3450     }
3451     /* Control never gets here */
3452     }
3453    
3454     /* Maximize case */
3455    
3456     else
3457     {
3458     pp = eptr;
3459    
3460     #ifdef SUPPORT_UTF8
3461     /* UTF-8 mode */
3462     if (utf8)
3463     {
3464 nigel 93 register unsigned int d;
3465 nigel 77 for (i = min; i < max; i++)
3466     {
3467     int len = 1;
3468 ph10 463 if (eptr >= md->end_subject)
3469 ph10 462 {
3470 ph10 463 SCHECK_PARTIAL();
3471 ph10 462 break;
3472 ph10 463 }
3473 nigel 77 GETCHARLEN(d, eptr, len);
3474     if (d < 256) d = md->lcc[d];
3475     if (fc == d) break;
3476     eptr += len;
3477     }
3478 nigel 93 if (possessive) continue;
3479     for(;;)
3480 nigel 77 {
3481 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3482 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3483     if (eptr-- == pp) break; /* Stop if tried at original pos */
3484     BACKCHAR(eptr);
3485     }
3486     }
3487     else
3488     #endif
3489     /* Not UTF-8 mode */
3490     {
3491     for (i = min; i < max; i++)
3492     {
3493 ph10 463 if (eptr >= md->end_subject)
3494 ph10 462 {
3495     SCHECK_PARTIAL();
3496     break;
3497 ph10 463 }
3498 ph10 462 if (fc == md->lcc[*eptr]) break;
3499 nigel 77 eptr++;
3500     }
3501 nigel 93 if (possessive) continue;
3502 nigel 77 while (eptr >= pp)
3503     {
3504 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3505 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3506     eptr--;
3507     }
3508     }
3509    
3510 ph10 510 MRRETURN(MATCH_NOMATCH);
3511 nigel 77 }
3512     /* Control never gets here */
3513     }
3514    
3515     /* Caseful comparisons */
3516    
3517     else
3518     {
3519     #ifdef SUPPORT_UTF8
3520     /* UTF-8 mode */
3521     if (utf8)
3522     {
3523 nigel 93 register unsigned int d;
3524 nigel 77 for (i = 1; i <= min; i++)
3525     {
3526 ph10 426 if (eptr >= md->end_subject)
3527     {
3528     SCHECK_PARTIAL();
3529 ph10 510 MRRETURN(MATCH_NOMATCH);
3530 ph10 427 }
3531 nigel 77 GETCHARINC(d, eptr);
3532 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3533 nigel 77 }
3534     }
3535     else
3536     #endif
3537     /* Not UTF-8 mode */
3538     {
3539     for (i = 1; i <= min; i++)
3540 ph10 426 {
3541     if (eptr >= md->end_subject)
3542     {
3543     SCHECK_PARTIAL();
3544 ph10 510 MRRETURN(MATCH_NOMATCH);
3545 ph10 427 }
3546 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3547 ph10 427 }
3548 nigel 77 }
3549    
3550     if (min == max) continue;
3551    
3552     if (minimize)
3553     {
3554     #ifdef SUPPORT_UTF8
3555     /* UTF-8 mode */
3556     if (utf8)
3557     {
3558 nigel 93 register unsigned int d;
3559 nigel 77 for (fi = min;; fi++)
3560     {
3561 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3562 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3563 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3564 ph10 427 if (eptr >= md->end_subject)
3565 ph10 426 {
3566 ph10 427 SCHECK_PARTIAL();
3567 ph10 510 MRRETURN(MATCH_NOMATCH);
3568 ph10 427 }
3569 nigel 77 GETCHARINC(d, eptr);
3570 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3571 nigel 77 }
3572     }
3573     else
3574     #endif
3575     /* Not UTF-8 mode */
3576     {
3577     for (fi = min;; fi++)
3578     {
3579 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3580 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3581 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3582 ph10 426 if (eptr >= md->end_subject)
3583     {
3584     SCHECK_PARTIAL();
3585 ph10 510 MRRETURN(MATCH_NOMATCH);
3586 ph10 427 }
3587 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3588 nigel 77 }
3589     }
3590     /* Control never gets here */
3591     }
3592    
3593     /* Maximize case */
3594    
3595     else
3596     {
3597     pp = eptr;
3598    
3599     #ifdef SUPPORT_UTF8
3600     /* UTF-8 mode */
3601     if (utf8)
3602     {
3603 nigel 93 register unsigned int d;
3604 nigel 77 for (i = min; i < max; i++)
3605     {
3606     int len = 1;
3607 ph10 463 if (eptr >= md->end_subject)
3608 ph10 462 {
3609 ph10 463 SCHECK_PARTIAL();
3610 ph10 462 break;
3611 ph10 463 }
3612 nigel 77 GETCHARLEN(d, eptr, len);
3613     if (fc == d) break;
3614     eptr += len;
3615     }
3616 nigel 93 if (possessive) continue;
3617 nigel 77 for(;;)
3618     {
3619 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3620 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3621     if (eptr-- == pp) break; /* Stop if tried at original pos */
3622     BACKCHAR(eptr);
3623     }
3624     }
3625     else
3626     #endif
3627     /* Not UTF-8 mode */
3628     {
3629     for (i = min; i < max; i++)
3630     {
3631 ph10 463 if (eptr >= md->end_subject)
3632 ph10 462 {
3633 ph10 463 SCHECK_PARTIAL();
3634 ph10 462 break;
3635 ph10 463 }
3636 ph10 462 if (fc == *eptr) break;
3637 nigel 77 eptr++;
3638     }
3639 nigel 93 if (possessive) continue;
3640 nigel 77 while (eptr >= pp)
3641     {
3642 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3643 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3644     eptr--;
3645     }
3646     }
3647    
3648 ph10 510 MRRETURN(MATCH_NOMATCH);
3649 nigel 77 }
3650     }
3651     /* Control never gets here */
3652    
3653     /* Match a single character type repeatedly; several different opcodes
3654     share code. This is very similar to the code for single characters, but we
3655     repeat it in the interests of efficiency. */
3656    
3657     case OP_TYPEEXACT:
3658     min = max = GET2(ecode, 1);
3659     minimize = TRUE;
3660     ecode += 3;
3661     goto REPEATTYPE;
3662    
3663     case OP_TYPEUPTO:
3664     case OP_TYPEMINUPTO:
3665     min = 0;
3666     max = GET2(ecode, 1);
3667     minimize = *ecode == OP_TYPEMINUPTO;
3668     ecode += 3;
3669     goto REPEATTYPE;
3670    
3671 nigel 93 case OP_TYPEPOSSTAR:
3672     possessive = TRUE;
3673     min = 0;
3674     max = INT_MAX;
3675     ecode++;
3676     goto REPEATTYPE;
3677    
3678     case OP_TYPEPOSPLUS:
3679     possessive = TRUE;
3680     min = 1;
3681     max = INT_MAX;
3682     ecode++;
3683     goto REPEATTYPE;
3684    
3685     case OP_TYPEPOSQUERY:
3686     possessive = TRUE;
3687     min = 0;
3688     max = 1;
3689     ecode++;
3690     goto REPEATTYPE;
3691    
3692     case OP_TYPEPOSUPTO:
3693     possessive = TRUE;
3694     min = 0;
3695     max = GET2(ecode, 1);
3696     ecode += 3;
3697     goto REPEATTYPE;
3698    
3699 nigel 77 case OP_TYPESTAR:
3700     case OP_TYPEMINSTAR:
3701     case OP_TYPEPLUS:
3702     case OP_TYPEMINPLUS:
3703     case OP_TYPEQUERY:
3704     case OP_TYPEMINQUERY:
3705     c = *ecode++ - OP_TYPESTAR;
3706     minimize = (c & 1) != 0;
3707     min = rep_min[c]; /* Pick up values from tables; */
3708     max = rep_max[c]; /* zero for max => infinity */
3709     if (max == 0) max = INT_MAX;
3710    
3711     /* Common code for all repeated single character type matches. Note that
3712     in UTF-8 mode, '.' matches a character of any length, but for the other
3713     character types, the valid characters are all one byte long. */
3714    
3715     REPEATTYPE:
3716     ctype = *ecode++; /* Code for the character type */
3717    
3718     #ifdef SUPPORT_UCP
3719     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3720     {
3721     prop_fail_result = ctype == OP_NOTPROP;
3722     prop_type = *ecode++;
3723 nigel 87 prop_value = *ecode++;
3724 nigel 77 }
3725     else prop_type = -1;
3726     #endif
3727    
3728     /* First, ensure the minimum number of matches is present. Use inline
3729     code to maximize speed, and do the type test once at the start
3730 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3731 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3732     and single-byte modes. */
3733    
3734     if (min > 0)
3735     {
3736     #ifdef SUPPORT_UCP
3737 nigel 87 if (prop_type >= 0)
3738 nigel 77 {
3739 nigel 87 switch(prop_type)
3740 nigel 77 {
3741 nigel 87 case PT_ANY:
3742 ph10 510 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3743 nigel 87 for (i = 1; i <= min; i++)
3744     {
3745 ph10 427 if (eptr >= md->end_subject)
3746 ph10 426 {
3747 ph10 427 SCHECK_PARTIAL();
3748 ph10 510 MRRETURN(MATCH_NOMATCH);
3749 ph10 427 }
3750 ph10 184 GETCHARINCTEST(c, eptr);
3751 nigel 87 }
3752     break;
3753    
3754     case PT_LAMP:
3755     for (i = 1; i <= min; i++)
3756     {
3757 ph10 427 if (eptr >= md->end_subject)
3758 ph10 426 {
3759 ph10 427 SCHECK_PARTIAL();
3760 ph10 510 MRRETURN(MATCH_NOMATCH);
3761 ph10 427 }
3762 ph10 184 GETCHARINCTEST(c, eptr);
3763 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3764 nigel 87 if ((prop_chartype == ucp_Lu ||
3765     prop_chartype == ucp_Ll ||
3766     prop_chartype == ucp_Lt) == prop_fail_result)
3767 ph10 510 MRRETURN(MATCH_NOMATCH);
3768 nigel 87 }
3769     break;
3770    
3771     case PT_GC:
3772     for (i = 1; i <= min; i++)
3773     {
3774 ph10 427 if (eptr >= md->end_subject)
3775 ph10 426 {
3776 ph10 427 SCHECK_PARTIAL();
3777 ph10 510 MRRETURN(MATCH_NOMATCH);
3778 ph10 427 }
3779 ph10 184 GETCHARINCTEST(c, eptr);
3780 ph10 349 prop_category = UCD_CATEGORY(c);
3781 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3782 ph10 510 MRRETURN(MATCH_NOMATCH);
3783 nigel 87 }
3784     break;
3785    
3786     case PT_PC:
3787     for (i = 1; i <= min; i++)
3788     {
3789 ph10 427 if (eptr >= md->end_subject)
3790 ph10 426 {
3791 ph10 427 SCHECK_PARTIAL();
3792 ph10 510 MRRETURN(MATCH_NOMATCH);
3793 ph10 427 }
3794 ph10 184 GETCHARINCTEST(c, eptr);
3795 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3796 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3797 ph10 510 MRRETURN(MATCH_NOMATCH);
3798 nigel 87 }
3799     break;
3800    
3801     case PT_SC:
3802     for (i = 1; i <= min; i++)
3803     {
3804 ph10 427 if (eptr >= md->end_subject)
3805 ph10 426 {
3806 ph10 427 SCHECK_PARTIAL();
3807 ph10 510 MRRETURN(MATCH_NOMATCH);
3808 ph10 427 }
3809 ph10 184 GETCHARINCTEST(c, eptr);
3810 ph10 349 prop_script = UCD_SCRIPT(c);
3811 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3812 ph10 510 MRRETURN(MATCH_NOMATCH);
3813 nigel 87 }
3814     break;
3815 ph10