/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 609 - (hide annotations) (download)
Wed Jun 15 18:09:23 2011 UTC (23 months ago) by ph10
File MIME type: text/plain
File size: 194331 byte(s)
Fix bug with /\A.*?(?:a|b(*THEN)c)/ by removing the tail recursion optimization 
for the final branch. Also fix a similar bug for conditional subpatterns.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 ph10 604 /* Values for setting in md->match_function_type to indicate two special types
61     of call to match(). We do it this way to save on using another stack variable,
62     as stack usage is to be discouraged. */
63 nigel 77
64 ph10 604 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65     #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
74 ph10 210 negative to avoid the external error codes. */
75    
76 ph10 511 #define MATCH_ACCEPT (-999)
77     #define MATCH_COMMIT (-998)
78 ph10 604 #define MATCH_KETRPOS (-997)
79     #define MATCH_PRUNE (-996)
80     #define MATCH_SKIP (-995)
81     #define MATCH_SKIP_ARG (-994)
82     #define MATCH_THEN (-993)
83 ph10 210
84 ph10 510 /* This is a convenience macro for code that occurs many times. */
85    
/* MRRETURN(ra): copy the current markptr into md->mark and then leave through
RRETURN(ra). The expansion is a brace block, not a single expression, and it
uses md, markptr and RRETURN from the scope in which it is expanded. */
86     #define MRRETURN(ra) \
87     { \
88     md->mark = markptr; \
89     RRETURN(ra); \
90     }
91    
92 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
93     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
94     because the offset vector is always a multiple of 3 long. */
95    
96     #define REC_STACK_SAVE_MAX 30
97    
98     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
99    
/* Two parallel six-entry tables: entry i of rep_min is a minimum count and
entry i of rep_max the matching maximum, where a maximum of 0 stands for
"infinity". NOTE(review): the entry order presumably follows the *, *?, +, +?,
?, ?? repeat opcodes - confirm at the places where the tables are indexed. */
100     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
101     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
102    
103    
104    
105 ph10 475 #ifdef PCRE_DEBUG
106 nigel 77 /*************************************************
107     * Debugging function to print chars *
108     *************************************************/
109    
110     /* Print a sequence of chars in printable format, stopping at the end of the
111     subject if requested.
112    
113     Arguments:
114     p points to characters
115     length number to print
116     is_subject TRUE if printing from within md->start_subject
117     md pointer to matching data block, if is_subject is TRUE
118    
119     Returns: nothing
120     */
121    
122     static void
123     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
124     {
125 nigel 93 unsigned int c;
126 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
127     while (length-- > 0)
128     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
129     }
130     #endif
131    
132    
133    
134     /*************************************************
135     * Match a back-reference *
136     *************************************************/
137    
138 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
139     negative, so the match always fails. However, in JavaScript compatibility mode,
140     the length passed is zero. Note that in caseless UTF-8 mode, the number of
141     subject bytes matched may be different to the number of reference bytes.
142 nigel 77
143     Arguments:
144     offset index into the offset vector
145 ph10 595 eptr pointer into the subject
146     length length of reference to be matched (number of bytes)
147 nigel 77 md points to match data block
148 ph10 602 caseless TRUE if caseless
149 nigel 77
150 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
151 nigel 77 */
152    
153 ph10 595 static int
154 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
155 ph10 602 BOOL caseless)
156 nigel 77 {
157 ph10 595 USPTR eptr_start = eptr;
158     register USPTR p = md->start_subject + md->offset_vector[offset];
159 nigel 77
160 ph10 475 #ifdef PCRE_DEBUG
161 nigel 77 if (eptr >= md->end_subject)
162     printf("matching subject <null>");
163     else
164     {
165     printf("matching subject ");
166     pchars(eptr, length, TRUE, md);
167     }
168     printf(" against backref ");
169     pchars(p, length, FALSE, md);
170     printf("\n");
171     #endif
172    
173 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
174 nigel 77
175 ph10 595 if (length < 0) return -1;
176 nigel 77
177 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
178     properly if Unicode properties are supported. Otherwise, we can check only
179     ASCII characters. */
180 nigel 77
181 ph10 602 if (caseless)
182 nigel 77 {
183 ph10 354 #ifdef SUPPORT_UTF8
184     #ifdef SUPPORT_UCP
185     if (md->utf8)
186     {
187 ph10 595 /* Match characters up to the end of the reference. NOTE: the number of
188     bytes matched may differ, because there are some characters whose upper and
189     lower case versions code as different numbers of bytes. For example, U+023A
190     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
191     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
192     the latter. It is important, therefore, to check the length along the
193     reference, not along the subject (earlier code did this wrong). */
194    
195     USPTR endptr = p + length;
196     while (p < endptr)
197 ph10 354 {
198 ph10 358 int c, d;
199 ph10 597 if (eptr >= md->end_subject) return -1;
200 ph10 354 GETCHARINC(c, eptr);
201     GETCHARINC(d, p);
202 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
203 ph10 358 }
204     }
205 ph10 354 else
206     #endif
207     #endif
208    
209     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
210     is no UCP support. */
211 ph10 597 {
212     if (eptr + length > md->end_subject) return -1;
213     while (length-- > 0)
214     { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
215     }
216 nigel 77 }
217 ph10 358
218 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
219     are in UTF-8 mode. */
220 ph10 358
221 nigel 77 else
222 ph10 597 {
223     if (eptr + length > md->end_subject) return -1;
224     while (length-- > 0) if (*p++ != *eptr++) return -1;
225     }
226 nigel 77
227 ph10 595 return eptr - eptr_start;
228 nigel 77 }
229    
230    
231    
232     /***************************************************************************
233     ****************************************************************************
234     RECURSION IN THE match() FUNCTION
235    
236 nigel 87 The match() function is highly recursive, though not every recursive call
237     increases the recursive depth. Nevertheless, some regular expressions can cause
238     it to recurse to a great depth. I was writing for Unix, so I just let it call
239     itself recursively. This uses the stack for saving everything that has to be
240     saved for a recursive call. On Unix, the stack can be large, and this works
241     fine.
242 nigel 77
243 nigel 87 It turns out that on some non-Unix-like systems there are problems with
244     programs that use a lot of stack. (This despite the fact that every last chip
245     has oodles of memory these days, and techniques for extending the stack have
246     been known for decades.) So....
247 nigel 77
248     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
249     calls by keeping local variables that need to be preserved in blocks of memory
250 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
251 nigel 77 achieve this so that the actual code doesn't look very different to what it
252     always used to.
253 ph10 164
254 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
255 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
256     Switzer, the use of longjmp() has been abolished, at the cost of having to
257     provide a unique number for each call to RMATCH. There is no way of generating
258     a sequence of numbers at compile time in C. I have given them names, to make
259     them stand out more clearly.
260    
261     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
262     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
263 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
264     don't have indeterminate values; this has meant that the frame size can be
265 ph10 164 reduced because the result can be "passed back" by straight setting of the
266     variable instead of being passed in the frame.
267 nigel 77 ****************************************************************************
268     ***************************************************************************/
269    
270 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
271     below must be updated in sync. */
272 nigel 77
273 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
274     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
275     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
276     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
277 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
278 ph10 527 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
279 ph10 609 RM61, RM62, RM63};
280 ph10 164
281 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
282 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
283 ph10 501 actually used in this definition. */
284 nigel 77
285     #ifndef NO_RECURSE
286     #define REGISTER register
287 ph10 164
/* Stack (true recursion) versions. RMATCH calls match() directly and stores
the result in rrc; mstart and markptr are taken from the caller's scope and the
depth passed down is rdepth+1. The rw argument is accepted but not used. */
288 ph10 475 #ifdef PCRE_DEBUG
/* Debugging variants: the same call/return, bracketed by printf() tracing of
the source line. Both expand to brace blocks, and the debugging RRETURN expands
its argument twice (once for printf, once for return). */
289 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
290 nigel 87 { \
291     printf("match() called in line %d\n", __LINE__); \
292 ph10 604 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
293 nigel 87 printf("to line %d\n", __LINE__); \
294     }
295     #define RRETURN(ra) \
296     { \
297     printf("match() returned %d from line %d ", ra, __LINE__); \
298     return ra; \
299     }
300     #else
/* Production variants: a plain assignment from a recursive call, and a plain
return statement. */
301 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
302     rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
303 nigel 77 #define RRETURN(ra) return ra
304 nigel 87 #endif
305    
306 nigel 77 #else
307    
308    
309 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
310     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
311     argument of match(), which never changes. */
312 nigel 77
313     #define REGISTER
314    
/* Heap version of RMATCH. In order, it:
  - obtains a new heapframe from pcre_stack_malloc, leaving via
    RRETURN(PCRE_ERROR_NOMEMORY) if that fails;
  - records rw in the CURRENT frame's Xwhere as the resume point;
  - fills the new frame's argument fields from ra, rb, rc, re plus the current
    mstart and markptr, and sets its depth to the current depth plus one;
  - links the new frame to the current one and makes it current;
  - jumps to HEAP_RECURSE.
The label L_<rw> generated at the end is where execution continues after the
"call". NOTE(review): the dispatch from HEAP_RETURN back to these labels is not
visible in this part of the file - see the RMn enum comment about keeping the
two in sync. */
315 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw)\
316 nigel 77 {\
317 ph10 563 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
318 ph10 534 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
319 ph10 164 frame->Xwhere = rw; \
320     newframe->Xeptr = ra;\
321     newframe->Xecode = rb;\
322 ph10 168 newframe->Xmstart = mstart;\
323 ph10 501 newframe->Xmarkptr = markptr;\
324 ph10 164 newframe->Xoffset_top = rc;\
325 ph10 602 newframe->Xeptrb = re;\
326 ph10 164 newframe->Xrdepth = frame->Xrdepth + 1;\
327     newframe->Xprevframe = frame;\
328     frame = newframe;\
329     DPRINTF(("restarting from line %d\n", __LINE__));\
330     goto HEAP_RECURSE;\
331     L_##rw:\
332     DPRINTF(("jumped back to line %d\n", __LINE__));\
333 nigel 77 }
334    
/* Heap version of RRETURN. It frees the current frame with pcre_stack_free and
makes the previous frame current. If there is a previous frame, the result is
placed in rrc and control jumps to HEAP_RETURN; otherwise (the freed frame was
the top-level one, whose Xprevframe is NULL) it returns ra from the function.

NOTE(review): ra is expanded AFTER frame has been switched to the previous
frame, so an argument that expands to a frame-> field would be read from the
previous frame (or through a NULL frame at top level). Arguments should be
constants or true local variables such as rrc - confirm at the call sites. */
335     #define RRETURN(ra)\
336     {\
337 ph10 527 heapframe *oldframe = frame;\
338     frame = oldframe->Xprevframe;\
339     (pcre_stack_free)(oldframe);\
340 nigel 77 if (frame != NULL)\
341     {\
342 ph10 164 rrc = ra;\
343     goto HEAP_RETURN;\
344 nigel 77 }\
345     return ra;\
346     }
347    
348    
349     /* Structure for remembering the local variables in a private frame */
350    
/* One heapframe holds everything that one "recursive" invocation of match()
needs to keep when NO_RECURSE is defined. Each field is named after the match()
argument or local variable it replaces, with an X prefix; match() reaches the
fields through #define macros (e.g. eptr is frame->Xeptr). Frames form a singly
linked list through Xprevframe. */
351     typedef struct heapframe {
/* Link to the calling frame; NULL marks the top-level frame. */
352     struct heapframe *Xprevframe;
353    
354     /* Function arguments that may change */
355    
356 ph10 409 USPTR Xeptr;
357 nigel 77 const uschar *Xecode;
358 ph10 409 USPTR Xmstart;
359 ph10 501 USPTR Xmarkptr;
360 nigel 77 int Xoffset_top;
361     eptrblock *Xeptrb;
362 nigel 91 unsigned int Xrdepth;
363 nigel 77
364     /* Function local variables */
365    
366 ph10 409 USPTR Xcallpat;
/* Xcharptr exists only in builds with UTF-8 support. */
367 ph10 406 #ifdef SUPPORT_UTF8
368 ph10 409 USPTR Xcharptr;
369 ph10 406 #endif
370 ph10 409 USPTR Xdata;
371     USPTR Xnext;
372     USPTR Xpp;
373     USPTR Xprev;
374     USPTR Xsaved_eptr;
375 nigel 77
376     recursion_info Xnew_recursive;
377    
/* These three BOOLs are also reused under other names inside match()
(allow_zero, cbegroup/condassert and matched_once respectively). */
378     BOOL Xcur_is_word;
379     BOOL Xcondition;
380     BOOL Xprev_is_word;
381    
/* Unicode-property working variables; present only with UCP support. */
382     #ifdef SUPPORT_UCP
383     int Xprop_type;
384 nigel 87 int Xprop_value;
385 nigel 77 int Xprop_fail_result;
386     int Xprop_category;
387     int Xprop_chartype;
388 nigel 87 int Xprop_script;
389 ph10 123 int Xoclength;
390     uschar Xocchars[8];
391 nigel 77 #endif
392    
/* Xcodelink is also reused inside match() under the name code_offset. */
393 ph10 403 int Xcodelink;
394 nigel 77 int Xctype;
395 nigel 93 unsigned int Xfc;
396 nigel 77 int Xfi;
397     int Xlength;
398     int Xmax;
399     int Xmin;
400     int Xnumber;
401     int Xoffset;
402     int Xop;
403     int Xsave_capture_last;
404     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
/* Sized by REC_STACK_SAVE_MAX, the maximum number of offset ints saved without
using malloc. */
405     int Xstacksave[REC_STACK_SAVE_MAX];
406    
407     eptrblock Xnewptrb;
408    
409 ph10 164 /* Where to jump back to: set by RMATCH to its rw argument (one of the RMn
values) in the calling frame before the new frame is entered. */
410 nigel 77
411 ph10 164 int Xwhere;
412 ph10 165
413 nigel 77 } heapframe;
414    
415     #endif
416    
417    
418     /***************************************************************************
419     ***************************************************************************/
420    
421    
422    
423     /*************************************************
424     * Match from current position *
425     *************************************************/
426    
427 nigel 93 /* This function is called recursively in many circumstances. Whenever it
428 nigel 77 returns a negative (error) response, the outer incarnation must also return the
429 ph10 426 same response. */
430 nigel 77
431 ph10 426 /* These macros pack up tests that are used for partial matching, and which
432     appear several times in the code. We set the "hit end" flag if the pointer is
433     at the end of the subject and also past the start of the subject (i.e.
434 ph10 427 something has been matched). For hard partial matching, we then return
435     immediately. The second one is used when we already know we are past the end of
436     the subject. */
437 ph10 426
/* CHECK_PARTIAL(): when partial matching is enabled (md->partial != 0), eptr
has reached md->end_subject and eptr is beyond md->start_used_ptr, set
md->hitend; additionally, when md->partial > 1, leave at once via
MRRETURN(PCRE_ERROR_PARTIAL). The expansion is a bare "if" statement using md
and eptr from the enclosing scope, so take care when using it directly before
an "else". */
438     #define CHECK_PARTIAL()\
439 ph10 553 if (md->partial != 0 && eptr >= md->end_subject && \
440     eptr > md->start_used_ptr) \
441     { \
442     md->hitend = TRUE; \
443     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
444 ph10 427 }
445 ph10 426
/* SCHECK_PARTIAL(): the same as CHECK_PARTIAL() but without the end-of-subject
test, for use where the caller already knows the end has been passed. */
446     #define SCHECK_PARTIAL()\
447 ph10 553 if (md->partial != 0 && eptr > md->start_used_ptr) \
448     { \
449     md->hitend = TRUE; \
450     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
451 ph10 427 }
452 ph10 426
453 ph10 427
454 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
455     the md structure (e.g. utf8, end_subject) into individual variables to improve
456 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
457     made performance worse.
458    
459     Arguments:
460 nigel 93 eptr pointer to current character in subject
461     ecode pointer to current position in compiled code
462 ph10 168 mstart pointer to the current match start position (can be modified
463 ph10 172 by encountering \K)
464 ph10 501 markptr pointer to the most recent MARK name, or NULL
465 nigel 77 offset_top current top pointer
466     md pointer to "static" info for the match
467     eptrb pointer to chain of blocks containing eptr at start of
468     brackets - for testing for empty matches
469 nigel 87 rdepth the recursion depth
470 nigel 77
471     Returns: MATCH_MATCH if matched ) these values are >= 0
472     MATCH_NOMATCH if failed to match )
473 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
474 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
475 nigel 87 (e.g. stopped by repeated call or recursion limit)
476 nigel 77 */
477    
478     static int
479 ph10 510 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
480 ph10 602 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
481 ph10 604 unsigned int rdepth)
482 nigel 77 {
483     /* These variables do not need to be preserved over recursion in this function,
484 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
485     "register" because they are used a lot in loops. */
486 nigel 77
487 nigel 91 register int rrc; /* Returns from recursive calls */
488     register int i; /* Used for loops not involving calls to RMATCH() */
489 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
490 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
491 nigel 77
492 nigel 93 BOOL minimize, possessive; /* Quantifier options */
493 ph10 602 BOOL caseless;
494 ph10 403 int condcode;
495 nigel 93
496 nigel 77 /* When recursion is not being used, all "local" variables that have to be
497     preserved over calls to RMATCH() are part of a "frame" which is obtained from
498     heap storage. Set up the top-level frame here; others are obtained from the
499     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
500    
501     #ifdef NO_RECURSE
502 ph10 563 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
503 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
504 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
505    
506     /* Copy in the original argument variables */
507    
508     frame->Xeptr = eptr;
509     frame->Xecode = ecode;
510 ph10 168 frame->Xmstart = mstart;
511 ph10 501 frame->Xmarkptr = markptr;
512 nigel 77 frame->Xoffset_top = offset_top;
513     frame->Xeptrb = eptrb;
514 nigel 87 frame->Xrdepth = rdepth;
515 nigel 77
516     /* This is where control jumps back to in order to effect "recursion" */
517    
518     HEAP_RECURSE:
519    
520     /* Macros make the argument variables come from the current frame */
521    
522     #define eptr frame->Xeptr
523     #define ecode frame->Xecode
524 ph10 168 #define mstart frame->Xmstart
525 ph10 501 #define markptr frame->Xmarkptr
526 nigel 77 #define offset_top frame->Xoffset_top
527     #define eptrb frame->Xeptrb
528 nigel 87 #define rdepth frame->Xrdepth
529 nigel 77
530     /* Ditto for the local variables */
531    
532     #ifdef SUPPORT_UTF8
533     #define charptr frame->Xcharptr
534     #endif
535     #define callpat frame->Xcallpat
536 ph10 403 #define codelink frame->Xcodelink
537 nigel 77 #define data frame->Xdata
538     #define next frame->Xnext
539     #define pp frame->Xpp
540     #define prev frame->Xprev
541     #define saved_eptr frame->Xsaved_eptr
542    
543     #define new_recursive frame->Xnew_recursive
544    
545     #define cur_is_word frame->Xcur_is_word
546     #define condition frame->Xcondition
547     #define prev_is_word frame->Xprev_is_word
548    
549     #ifdef SUPPORT_UCP
550     #define prop_type frame->Xprop_type
551 nigel 87 #define prop_value frame->Xprop_value
552 nigel 77 #define prop_fail_result frame->Xprop_fail_result
553     #define prop_category frame->Xprop_category
554     #define prop_chartype frame->Xprop_chartype
555 nigel 87 #define prop_script frame->Xprop_script
556 ph10 115 #define oclength frame->Xoclength
557     #define occhars frame->Xocchars
558 nigel 77 #endif
559    
560     #define ctype frame->Xctype
561     #define fc frame->Xfc
562     #define fi frame->Xfi
563     #define length frame->Xlength
564     #define max frame->Xmax
565     #define min frame->Xmin
566     #define number frame->Xnumber
567     #define offset frame->Xoffset
568     #define op frame->Xop
569     #define save_capture_last frame->Xsave_capture_last
570     #define save_offset1 frame->Xsave_offset1
571     #define save_offset2 frame->Xsave_offset2
572     #define save_offset3 frame->Xsave_offset3
573     #define stacksave frame->Xstacksave
574    
575     #define newptrb frame->Xnewptrb
576    
577     /* When recursion is being used, local variables are allocated on the stack and
578     get preserved during recursion in the normal way. In this environment, fi and
579     i, and fc and c, can be the same variables. */
580    
581 nigel 93 #else /* NO_RECURSE not defined */
582 nigel 77 #define fi i
583     #define fc c
584    
585 ph10 604 /* Many of the following variables are used only in small blocks of the code.
586     My normal style of coding would have declared them within each of those blocks.
587     However, in order to accommodate the version of this code that uses an external
588     "stack" implemented on the heap, it is easier to declare them all here, so the
589     declarations can be cut out in a block. The only declarations within blocks
590     below are for variables that do not have to be preserved over a recursive call
591     to RMATCH(). */
592 nigel 77
593 ph10 604 #ifdef SUPPORT_UTF8
594     const uschar *charptr;
595     #endif
596     const uschar *callpat;
597     const uschar *data;
598     const uschar *next;
599     USPTR pp;
600     const uschar *prev;
601     USPTR saved_eptr;
602    
603     recursion_info new_recursive;
604    
605     BOOL cur_is_word;
606 nigel 87 BOOL condition;
607 nigel 77 BOOL prev_is_word;
608    
609     #ifdef SUPPORT_UCP
610     int prop_type;
611 nigel 87 int prop_value;
612 nigel 77 int prop_fail_result;
613     int prop_category;
614     int prop_chartype;
615 nigel 87 int prop_script;
616 ph10 115 int oclength;
617     uschar occhars[8];
618 nigel 77 #endif
619    
620 ph10 399 int codelink;
621 nigel 77 int ctype;
622     int length;
623     int max;
624     int min;
625     int number;
626     int offset;
627     int op;
628     int save_capture_last;
629     int save_offset1, save_offset2, save_offset3;
630     int stacksave[REC_STACK_SAVE_MAX];
631    
632     eptrblock newptrb;
633 nigel 93 #endif /* NO_RECURSE */
634 nigel 77
635 ph10 604 /* To save space on the stack and in the heap frame, I have doubled up on some
636     of the local variables that are used only in localised parts of the code, but
637     still need to be preserved over recursive calls of match(). These macros define
638     the alternative names that are used. */
639    
640     #define allow_zero cur_is_word
641     #define cbegroup condition
642     #define code_offset codelink
643     #define condassert condition
644     #define matched_once prev_is_word
645    
646 nigel 77 /* These statements are here to stop the compiler complaining about uninitialized
647     variables. */
648    
649     #ifdef SUPPORT_UCP
650 nigel 87 prop_value = 0;
651 nigel 77 prop_fail_result = 0;
652     #endif
653    
654 nigel 93
655 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
656     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
657     used. Thanks to Ian Taylor for noticing this possibility and sending the
658     original patch. */
659    
660     TAIL_RECURSE:
661    
662 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
663     are specified by the macro RMATCH and RRETURN is used to return. When
664     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
665 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
666 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
667     complicated macro. It has to be used in one particular way. This shouldn't,
668     however, impact performance when true recursion is being used. */
669 nigel 77
670 ph10 164 #ifdef SUPPORT_UTF8
671     utf8 = md->utf8; /* Local copy of the flag */
672     #else
673     utf8 = FALSE;
674     #endif
675    
676 nigel 87 /* First check that we haven't called match() too many times, or that we
677     haven't exceeded the recursive call limit. */
678    
679 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
680 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
681 nigel 77
682 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
683 ph10 604 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
684     done this way to save having to use another function argument, which would take
685     up space on the stack. See also MATCH_CONDASSERT below.
686 nigel 77
687 ph10 604 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
688     such remembered pointers, to be checked when we hit the closing ket, in order
689     to break infinite loops that match no characters. When match() is called in
690     other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
691     NOT be used with tail recursion, because the memory block that is used is on
692     the stack, so a new one may be required for each match(). */
693    
694     if (md->match_function_type == MATCH_CBEGROUP)
695 nigel 77 {
696 ph10 197 newptrb.epb_saved_eptr = eptr;
697     newptrb.epb_prev = eptrb;
698     eptrb = &newptrb;
699 ph10 604 md->match_function_type = 0;
700 nigel 77 }
701    
702 nigel 93 /* Now start processing the opcodes. */
703 nigel 77
704     for (;;)
705     {
706 nigel 93 minimize = possessive = FALSE;
707 nigel 77 op = *ecode;
708 ph10 604
709 nigel 93 switch(op)
710     {
711 ph10 510 case OP_MARK:
712     markptr = ecode + 2;
713     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
714 ph10 604 eptrb, RM55);
715 ph10 512
716     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
717     argument, and we must check whether that argument matches this MARK's
718     argument. It is passed back in md->start_match_ptr (an overloading of that
719     variable). If it does match, we reset that variable to the current subject
720     position and return MATCH_SKIP. Otherwise, pass back the return code
721 ph10 510 unaltered. */
722 ph10 512
723     if (rrc == MATCH_SKIP_ARG &&
724 ph10 510 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
725     {
726     md->start_match_ptr = eptr;
727     RRETURN(MATCH_SKIP);
728     }
729    
730 ph10 512 if (md->mark == NULL) md->mark = markptr;
731 ph10 510 RRETURN(rrc);
732    
733 ph10 210 case OP_FAIL:
734 ph10 510 MRRETURN(MATCH_NOMATCH);
735 ph10 211
736 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
737 ph10 553
738 ph10 510 case OP_COMMIT:
739     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
740 ph10 604 eptrb, RM52);
741 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
742 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
743     rrc != MATCH_THEN)
744 ph10 551 RRETURN(rrc);
745 ph10 510 MRRETURN(MATCH_COMMIT);
746    
747 ph10 551 /* PRUNE overrides THEN */
748 ph10 553
749 ph10 210 case OP_PRUNE:
750     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
751 ph10 604 eptrb, RM51);
752 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
753 ph10 510 MRRETURN(MATCH_PRUNE);
754 ph10 211
755 ph10 510 case OP_PRUNE_ARG:
756     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 ph10 604 eptrb, RM56);
758 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
759 ph10 510 md->mark = ecode + 2;
760     RRETURN(MATCH_PRUNE);
761 ph10 211
762 ph10 551 /* SKIP overrides PRUNE and THEN */
763 ph10 553
764 ph10 210 case OP_SKIP:
765     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
766 ph10 604 eptrb, RM53);
767 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 ph10 551 RRETURN(rrc);
769 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
770 ph10 510 MRRETURN(MATCH_SKIP);
771 ph10 211
772 ph10 510 case OP_SKIP_ARG:
773     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
774 ph10 604 eptrb, RM57);
775 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
776 ph10 551 RRETURN(rrc);
777 ph10 512
778     /* Pass back the current skip name by overloading md->start_match_ptr and
779     returning the special MATCH_SKIP_ARG return code. This will either be
780     caught by a matching MARK, or get to the top, where it is treated the same
781 ph10 510 as PRUNE. */
782 ph10 512
783 ph10 510 md->start_match_ptr = ecode + 2;
784 ph10 512 RRETURN(MATCH_SKIP_ARG);
785 ph10 553
786 ph10 550 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
787 ph10 553 the alt that is at the start of the current branch. This makes it possible
788     to skip back past alternatives that precede the THEN within the current
789     branch. */
790 ph10 512
791 ph10 210 case OP_THEN:
792     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
793 ph10 604 eptrb, RM54);
794 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
795 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
796 ph10 510 MRRETURN(MATCH_THEN);
797    
798     case OP_THEN_ARG:
799 ph10 553 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
800 ph10 604 offset_top, md, eptrb, RM58);
801 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
803     md->mark = ecode + LINK_SIZE + 2;
804 ph10 212 RRETURN(MATCH_THEN);
805 ph10 211
806 ph10 604 /* Handle a capturing bracket, other than those that are possessive with an
807     unlimited repeat. If there is space in the offset vector, save the current
808     subject position in the working slot at the top of the vector. We mustn't
809     change the current values of the data slot, because they may be set from a
810     previous iteration of this group, and be referred to by a reference inside
811     the group. If we fail to match, we need to restore this value and also the
812 nigel 93 values of the final offsets, in case they were set by a previous iteration
813     of the same bracket.
814 nigel 77
815 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
816     a non-capturing bracket. Don't worry about setting the flag for the error
817     case here; that is handled in the code for KET. */
818 nigel 77
819 nigel 93 case OP_CBRA:
820     case OP_SCBRA:
821     number = GET2(ecode, 1+LINK_SIZE);
822 nigel 77 offset = number << 1;
823 ph10 604
824 ph10 475 #ifdef PCRE_DEBUG
825 nigel 93 printf("start bracket %d\n", number);
826     printf("subject=");
827 nigel 77 pchars(eptr, 16, TRUE, md);
828     printf("\n");
829     #endif
830    
831     if (offset < md->offset_max)
832     {
833     save_offset1 = md->offset_vector[offset];
834     save_offset2 = md->offset_vector[offset+1];
835     save_offset3 = md->offset_vector[md->offset_end - number];
836     save_capture_last = md->capture_last;
837    
838     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
839 ph10 531 md->offset_vector[md->offset_end - number] =
840 ph10 530 (int)(eptr - md->start_subject);
841 nigel 77
842 ph10 604 for (;;)
843 nigel 77 {
844 ph10 604 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
845     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
846     eptrb, RM1);
847 ph10 550 if (rrc != MATCH_NOMATCH &&
848     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
849     RRETURN(rrc);
850 nigel 77 md->capture_last = save_capture_last;
851     ecode += GET(ecode, 1);
852 ph10 604 if (*ecode != OP_ALT) break;
853 nigel 77 }
854    
855     DPRINTF(("bracket %d failed\n", number));
856    
857     md->offset_vector[offset] = save_offset1;
858     md->offset_vector[offset+1] = save_offset2;
859     md->offset_vector[md->offset_end - number] = save_offset3;
860    
861 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
862 nigel 77 RRETURN(MATCH_NOMATCH);
863     }
864    
865 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
866     as a non-capturing bracket. */
867 nigel 77
868 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
869     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870    
871 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
872 nigel 77
873 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
874     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
875    
876 ph10 604 /* Non-capturing bracket, except for possessive with unlimited repeat. Loop
877     for all the alternatives. When we get to the final alternative within the
878 ph10 609 brackets, we used to return the result of a recursive call to match()
879     whatever happened so it was possible to reduce stack usage by turning this
880     into a tail recursion, except in the case of a possibly empty group.
881     However, now that there is the possibility of (*THEN) occurring in the final
882     alternative, this optimization is no longer possible. */
883 nigel 77
884 nigel 93 case OP_BRA:
885     case OP_SBRA:
886     DPRINTF(("start non-capturing bracket\n"));
887 nigel 91 for (;;)
888 nigel 77 {
889 ph10 604 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
890 ph10 602 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
891 ph10 604 RM2);
892 ph10 550 if (rrc != MATCH_NOMATCH &&
893     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
894     RRETURN(rrc);
895 nigel 77 ecode += GET(ecode, 1);
896 ph10 609 if (*ecode != OP_ALT) break;
897 nigel 77 }
898    
899 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
900     RRETURN(MATCH_NOMATCH);
901    
902 ph10 604 /* Handle possessive capturing brackets with an unlimited repeat. We come
903     here from BRAZERO with allow_zero set TRUE. The offset_vector values are
904     handled similarly to the normal case above. However, the matching is
905     different. The end of these brackets will always be OP_KETRPOS, which
906     returns MATCH_KETRPOS without going further in the pattern. By this means
907     we can handle the group by iteration rather than recursion, thereby
908     reducing the amount of stack needed. */
909    
910     case OP_CBRAPOS:
911     case OP_SCBRAPOS:
912     allow_zero = FALSE;
913    
914     POSSESSIVE_CAPTURE:
915     number = GET2(ecode, 1+LINK_SIZE);
916     offset = number << 1;
917    
918     #ifdef PCRE_DEBUG
919     printf("start possessive bracket %d\n", number);
920     printf("subject=");
921     pchars(eptr, 16, TRUE, md);
922     printf("\n");
923     #endif
924    
925     if (offset < md->offset_max)
926     {
927     matched_once = FALSE;
928     code_offset = ecode - md->start_code;
929    
930     save_offset1 = md->offset_vector[offset];
931     save_offset2 = md->offset_vector[offset+1];
932     save_offset3 = md->offset_vector[md->offset_end - number];
933     save_capture_last = md->capture_last;
934    
935     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
936    
937     /* Each time round the loop, save the current subject position for use
938     when the group matches. For MATCH_MATCH, the group has matched, so we
939     restart it with a new subject starting position, remembering that we had
940     at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
941     usual. If we haven't matched any alternatives in any iteration, check to
942     see if a previous iteration matched. If so, the group has matched;
943     continue from afterwards. Otherwise it has failed; restore the previous
944     capture values before returning NOMATCH. */
945    
946     for (;;)
947     {
948     md->offset_vector[md->offset_end - number] =
949     (int)(eptr - md->start_subject);
950     if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
951     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
952     eptrb, RM63);
953     if (rrc == MATCH_KETRPOS)
954     {
955     offset_top = md->end_offset_top;
956     eptr = md->end_match_ptr;
957     ecode = md->start_code + code_offset;
958     save_capture_last = md->capture_last;
959     matched_once = TRUE;
960     continue;
961     }
962     if (rrc != MATCH_NOMATCH &&
963     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
964     RRETURN(rrc);
965     md->capture_last = save_capture_last;
966     ecode += GET(ecode, 1);
967     if (*ecode != OP_ALT) break;
968     }
969    
970     if (!matched_once)
971     {
972     md->offset_vector[offset] = save_offset1;
973     md->offset_vector[offset+1] = save_offset2;
974     md->offset_vector[md->offset_end - number] = save_offset3;
975     }
976    
977 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
978 ph10 604 if (allow_zero || matched_once)
979     {
980     ecode += 1 + LINK_SIZE;
981     break;
982     }
983    
984     RRETURN(MATCH_NOMATCH);
985     }
986    
987     /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
988     as a non-capturing bracket. */
989    
990     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
991     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
992    
993     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
994    
995     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
996     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
997    
998     /* Non-capturing possessive bracket with unlimited repeat. We come here
999     from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1000     without the capturing complication. It is written out separately for speed
1001     and cleanliness. */
1002    
1003     case OP_BRAPOS:
1004     case OP_SBRAPOS:
1005     allow_zero = FALSE;
1006    
1007     POSSESSIVE_NON_CAPTURE:
1008     matched_once = FALSE;
1009     code_offset = ecode - md->start_code;
1010    
1011     for (;;)
1012     {
1013     if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1014     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1015 ph10 609 eptrb, RM48);
1016 ph10 604 if (rrc == MATCH_KETRPOS)
1017     {
1018     eptr = md->end_match_ptr;
1019     ecode = md->start_code + code_offset;
1020     matched_once = TRUE;
1021     continue;
1022     }
1023     if (rrc != MATCH_NOMATCH &&
1024     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1025     RRETURN(rrc);
1026     ecode += GET(ecode, 1);
1027     if (*ecode != OP_ALT) break;
1028     }
1029    
1030     if (matched_once || allow_zero)
1031     {
1032     ecode += 1 + LINK_SIZE;
1033     break;
1034     }
1035     RRETURN(MATCH_NOMATCH);
1036    
1037     /* Control never reaches here. */
1038    
1039 nigel 77 /* Conditional group: compilation checked that there are no more than
1040     two branches. If the condition is false, skipping the first branch takes us
1041     past the end if there is only one branch, but that's OK because that is
1042 ph10 609 exactly what going to the ket would do. */
1043 nigel 77
1044     case OP_COND:
1045 nigel 93 case OP_SCOND:
1046 ph10 604 codelink = GET(ecode, 1);
1047 ph10 406
1048 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
1049     inserted between OP_COND and an assertion condition. */
1050 ph10 392
1051 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1052     {
1053     if (pcre_callout != NULL)
1054     {
1055     pcre_callout_block cb;
1056     cb.version = 1; /* Version 1 of the callout block */
1057     cb.callout_number = ecode[LINK_SIZE+2];
1058     cb.offset_vector = md->offset_vector;
1059     cb.subject = (PCRE_SPTR)md->start_subject;
1060 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1061     cb.start_match = (int)(mstart - md->start_subject);
1062     cb.current_position = (int)(eptr - md->start_subject);
1063 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1064     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1065     cb.capture_top = offset_top/2;
1066     cb.capture_last = md->capture_last;
1067     cb.callout_data = md->callout_data;
1068 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1069 ph10 381 if (rrc < 0) RRETURN(rrc);
1070     }
1071     ecode += _pcre_OP_lengths[OP_CALLOUT];
1072     }
1073 ph10 392
1074 ph10 399 condcode = ecode[LINK_SIZE+1];
1075 ph10 406
1076 ph10 381 /* Now see what the actual condition is */
1077 ph10 392
1078 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1079 nigel 77 {
1080 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
1081     {
1082 ph10 461 condition = FALSE;
1083     ecode += GET(ecode, 1);
1084     }
1085 ph10 459 else
1086 ph10 461 {
1087 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1088     condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1089 ph10 461
1090 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
1091     false, but the test was set up by name, scan the table to see if the
1092     name refers to any other numbers, and test them. The condition is true
1093     if any one is set. */
1094 ph10 461
1095 ph10 459 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1096     {
1097     uschar *slotA = md->name_table;
1098     for (i = 0; i < md->name_count; i++)
1099 ph10 461 {
1100     if (GET2(slotA, 0) == recno) break;
1101 ph10 459 slotA += md->name_entry_size;
1102     }
1103 ph10 461
1104 ph10 459 /* Found a name for the number - there can be only one; duplicate
1105     names for different numbers are allowed, but not vice versa. First
1106     scan down for duplicates. */
1107 ph10 461
1108 ph10 459 if (i < md->name_count)
1109 ph10 461 {
1110 ph10 459 uschar *slotB = slotA;
1111     while (slotB > md->name_table)
1112     {
1113     slotB -= md->name_entry_size;
1114     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1115     {
1116     condition = GET2(slotB, 0) == md->recursive->group_num;
1117 ph10 461 if (condition) break;
1118     }
1119 ph10 459 else break;
1120 ph10 461 }
1121    
1122 ph10 459 /* Scan up for duplicates */
1123 ph10 461
1124 ph10 459 if (!condition)
1125 ph10 461 {
1126 ph10 459 slotB = slotA;
1127     for (i++; i < md->name_count; i++)
1128     {
1129     slotB += md->name_entry_size;
1130     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1131     {
1132     condition = GET2(slotB, 0) == md->recursive->group_num;
1133     if (condition) break;
1134 ph10 461 }
1135 ph10 459 else break;
1136 ph10 461 }
1137     }
1138 ph10 459 }
1139 ph10 461 }
1140    
1141 ph10 459 /* Choose branch according to the condition */
1142 ph10 461
1143 ph10 459 ecode += condition? 3 : GET(ecode, 1);
1144     }
1145 ph10 461 }
1146 nigel 93
1147 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1148 nigel 93 {
1149 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1150 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1151 ph10 461
1152 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1153 ph10 461 scan the table to see if the name refers to any other numbers, and test
1154     them. The condition is true if any one is set. This is tediously similar
1155     to the code above, but not close enough to try to amalgamate. */
1156    
1157 ph10 459 if (!condition && condcode == OP_NCREF)
1158     {
1159 ph10 461 int refno = offset >> 1;
1160 ph10 459 uschar *slotA = md->name_table;
1161 ph10 461
1162 ph10 459 for (i = 0; i < md->name_count; i++)
1163 ph10 461 {
1164     if (GET2(slotA, 0) == refno) break;
1165 ph10 459 slotA += md->name_entry_size;
1166     }
1167 ph10 461
1168     /* Found a name for the number - there can be only one; duplicate names
1169     for different numbers are allowed, but not vice versa. First scan down
1170 ph10 459 for duplicates. */
1171 ph10 461
1172 ph10 459 if (i < md->name_count)
1173 ph10 461 {
1174 ph10 459 uschar *slotB = slotA;
1175     while (slotB > md->name_table)
1176     {
1177     slotB -= md->name_entry_size;
1178     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1179     {
1180     offset = GET2(slotB, 0) << 1;
1181 ph10 461 condition = offset < offset_top &&
1182 ph10 459 md->offset_vector[offset] >= 0;
1183 ph10 461 if (condition) break;
1184     }
1185 ph10 459 else break;
1186 ph10 461 }
1187    
1188 ph10 459 /* Scan up for duplicates */
1189 ph10 461
1190 ph10 459 if (!condition)
1191 ph10 461 {
1192 ph10 459 slotB = slotA;
1193     for (i++; i < md->name_count; i++)
1194     {
1195     slotB += md->name_entry_size;
1196     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1197     {
1198     offset = GET2(slotB, 0) << 1;
1199 ph10 461 condition = offset < offset_top &&
1200 ph10 459 md->offset_vector[offset] >= 0;
1201 ph10 461 if (condition) break;
1202     }
1203 ph10 459 else break;
1204 ph10 461 }
1205     }
1206 ph10 459 }
1207 ph10 461 }
1208    
1209 ph10 459 /* Choose branch according to the condition */
1210    
1211 nigel 93 ecode += condition? 3 : GET(ecode, 1);
1212 nigel 77 }
1213    
1214 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1215 nigel 93 {
1216     condition = FALSE;
1217     ecode += GET(ecode, 1);
1218     }
1219    
1220 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1221 ph10 604 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1222     an assertion. */
1223 nigel 77
1224     else
1225     {
1226 ph10 604 md->match_function_type = MATCH_CONDASSERT;
1227     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1228 nigel 77 if (rrc == MATCH_MATCH)
1229     {
1230 nigel 93 condition = TRUE;
1231     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1232 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1233     }
1234 ph10 550 else if (rrc != MATCH_NOMATCH &&
1235     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1236 nigel 77 {
1237     RRETURN(rrc); /* Need braces because of following else */
1238     }
1239 nigel 93 else
1240     {
1241     condition = FALSE;
1242 ph10 399 ecode += codelink;
1243 nigel 93 }
1244     }
1245 nigel 91
1246 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
1247 ph10 609 we used to use tail recursion to avoid using another stack frame, except
1248     when there was unlimited repeat of a possibly empty group. However, that
1249     strategy no longer works because of the possibility of (*THEN) being
1250     encountered in the branch. A recursive call to match() is always required,
1251     unless the second alternative doesn't exist, in which case we can just
1252     plough on. */
1253 nigel 91
1254 nigel 93 if (condition || *ecode == OP_ALT)
1255     {
1256 ph10 609 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1257     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1258     if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1259     rrc = MATCH_NOMATCH;
1260     RRETURN(rrc);
1261 nigel 77 }
1262 ph10 395 else /* Condition false & no alternative */
1263 nigel 93 {
1264     ecode += 1 + LINK_SIZE;
1265     }
1266     break;
1267 nigel 77
1268 ph10 461
1269 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1270     to close any currently open capturing brackets. */
1271 ph10 461
1272 ph10 447 case OP_CLOSE:
1273 ph10 461 number = GET2(ecode, 1);
1274 ph10 447 offset = number << 1;
1275 ph10 461
1276 ph10 475 #ifdef PCRE_DEBUG
1277 ph10 447 printf("end bracket %d at *ACCEPT", number);
1278     printf("\n");
1279     #endif
1280 nigel 77
1281 ph10 447 md->capture_last = number;
1282     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1283     {
1284     md->offset_vector[offset] =
1285     md->offset_vector[md->offset_end - number];
1286 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1287 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1288     }
1289     ecode += 3;
1290 ph10 461 break;
1291 ph10 447
1292    
1293 ph10 608 /* End of the pattern, either real or forced. If we are in a recursion, we
1294     should restore the offsets appropriately, and if it's a top-level
1295     recursion, continue from after the call. */
1296 nigel 77
1297 ph10 210 case OP_ACCEPT:
1298 nigel 77 case OP_END:
1299 ph10 608 if (md->recursive != NULL)
1300 nigel 77 {
1301     recursion_info *rec = md->recursive;
1302     md->recursive = rec->prevrec;
1303 ph10 608 memmove(md->offset_vector, rec->offset_save,
1304 nigel 77 rec->saved_max * sizeof(int));
1305 ph10 461 offset_top = rec->save_offset_top;
1306 ph10 608 if (rec->group_num == 0)
1307     {
1308     ecode = rec->after_call;
1309     break;
1310     }
1311 nigel 77 }
1312    
1313 ph10 442 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1314     set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1315     the subject. In both cases, backtracking will then try other alternatives,
1316     if any. */
1317 ph10 443
1318 ph10 608 else if (eptr == mstart &&
1319 ph10 442 (md->notempty ||
1320 ph10 443 (md->notempty_atstart &&
1321 ph10 442 mstart == md->start_subject + md->start_offset)))
1322 ph10 510 MRRETURN(MATCH_NOMATCH);
1323 ph10 443
1324 ph10 442 /* Otherwise, we have a match. */
1325 ph10 608
1326 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1327     md->end_offset_top = offset_top; /* and how many extracts were taken */
1328 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1329 nigel 77
1330 ph10 512 /* For some reason, the macros don't work properly if an expression is
1331     given as the argument to MRRETURN when the heap is in use. */
1332    
1333     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1334     MRRETURN(rrc);
1335    
1336 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1337     matching won't pass the KET for an assertion. If any one branch matches,
1338     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1339     start of each branch to move the current point backwards, so the code at
1340 ph10 604 this level is identical to the lookahead case. When the assertion is part
1341     of a condition, we want to return immediately afterwards. The caller of
1342     this incarnation of the match() function will have set MATCH_CONDASSERT in
1343     md->match_function_type, and one of these opcodes will be the first opcode
1344     that is processed. We use a local variable that is preserved over calls to
1345     match() to remember this case. */
1346 nigel 77
1347     case OP_ASSERT:
1348     case OP_ASSERTBACK:
1349 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1350     {
1351     condassert = TRUE;
1352     md->match_function_type = 0;
1353     }
1354     else condassert = FALSE;
1355    
1356 nigel 77 do
1357     {
1358 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1359 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1360 ph10 500 {
1361     mstart = md->start_match_ptr; /* In case \K reset it */
1362     break;
1363 ph10 501 }
1364 ph10 550 if (rrc != MATCH_NOMATCH &&
1365     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1366     RRETURN(rrc);
1367 nigel 77 ecode += GET(ecode, 1);
1368     }
1369     while (*ecode == OP_ALT);
1370 ph10 604
1371 ph10 510 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1372 nigel 77
1373     /* If checking an assertion for a condition, return MATCH_MATCH. */
1374    
1375 ph10 604 if (condassert) RRETURN(MATCH_MATCH);
1376 nigel 77
1377     /* Continue from after the assertion, updating the offsets high water
1378     mark, since extracts may have been taken during the assertion. */
1379    
1380     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1381     ecode += 1 + LINK_SIZE;
1382     offset_top = md->end_offset_top;
1383     continue;
1384    
1385 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1386 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1387 ph10 473 branches. */
1388 nigel 77
1389     case OP_ASSERT_NOT:
1390     case OP_ASSERTBACK_NOT:
1391 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1392     {
1393     condassert = TRUE;
1394     md->match_function_type = 0;
1395     }
1396     else condassert = FALSE;
1397    
1398 nigel 77 do
1399     {
1400 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1401 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1402 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1403     {
1404     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1405 ph10 482 break;
1406     }
1407 ph10 550 if (rrc != MATCH_NOMATCH &&
1408     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1409     RRETURN(rrc);
1410 nigel 77 ecode += GET(ecode,1);
1411     }
1412     while (*ecode == OP_ALT);
1413    
1414 ph10 604 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1415    
1416 nigel 77 ecode += 1 + LINK_SIZE;
1417     continue;
1418    
1419     /* Move the subject pointer back. This occurs only at the start of
1420     each branch of a lookbehind assertion. If we are too close to the start to
1421     move back, this match function fails. When working with UTF-8 we move
1422     back a number of characters, not bytes. */
1423    
1424     case OP_REVERSE:
1425     #ifdef SUPPORT_UTF8
1426     if (utf8)
1427     {
1428 nigel 93 i = GET(ecode, 1);
1429     while (i-- > 0)
1430 nigel 77 {
1431     eptr--;
1432 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1433 ph10 207 BACKCHAR(eptr);
1434 nigel 77 }
1435     }
1436     else
1437     #endif
1438    
1439     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1440    
1441     {
1442 nigel 93 eptr -= GET(ecode, 1);
1443 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1444 nigel 77 }
1445    
1446 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1447 nigel 77
1448 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1449 nigel 77 ecode += 1 + LINK_SIZE;
1450     break;
1451    
1452     /* The callout item calls an external function, if one is provided, passing
1453     details of the match so far. This is mainly for debugging, though the
1454     function is able to force a failure. */
1455    
1456     case OP_CALLOUT:
1457     if (pcre_callout != NULL)
1458     {
1459     pcre_callout_block cb;
1460     cb.version = 1; /* Version 1 of the callout block */
1461     cb.callout_number = ecode[1];
1462     cb.offset_vector = md->offset_vector;
1463 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1464 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1465     cb.start_match = (int)(mstart - md->start_subject);
1466     cb.current_position = (int)(eptr - md->start_subject);
1467 nigel 77 cb.pattern_position = GET(ecode, 2);
1468     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1469     cb.capture_top = offset_top/2;
1470     cb.capture_last = md->capture_last;
1471     cb.callout_data = md->callout_data;
1472 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1473 nigel 77 if (rrc < 0) RRETURN(rrc);
1474     }
1475     ecode += 2 + 2*LINK_SIZE;
1476     break;
1477    
1478     /* Recursion either matches the current regex, or some subexpression. The
1479     offset data is the offset to the starting bracket from the start of the
1480     whole pattern. (This is so that it works from duplicated subpatterns.)
1481    
1482     If there are any capturing brackets started but not finished, we have to
1483     save their starting points and reinstate them after the recursion. However,
1484     we don't know how many such there are (offset_top records the completed
1485     total) so we just have to save all the potential data. There may be up to
1486     65535 such values, which is too large to put on the stack, but using malloc
1487     for small numbers seems expensive. As a compromise, the stack is used when
1488     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1489     is used. A problem is what to do if the malloc fails ... there is no way of
1490     returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
1491     values on the stack, and accept that the rest may be wrong.
1492    
1493     There are also other values that have to be saved. We use a chained
1494     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1495     for the original version of this logic. */
1496    
1497     case OP_RECURSE:
1498     {
1499     callpat = md->start_code + GET(ecode, 1);
1500 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1501     GET2(callpat, 1 + LINK_SIZE);
1502 nigel 77
1503     /* Add to "recursing stack" */
1504    
1505     new_recursive.prevrec = md->recursive;
1506     md->recursive = &new_recursive;
1507    
1508     /* Find where to continue from afterwards */
1509    
1510     ecode += 1 + LINK_SIZE;
1511     new_recursive.after_call = ecode;
1512    
1513     /* Now save the offset data. */
1514    
1515     new_recursive.saved_max = md->offset_end;
1516     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1517     new_recursive.offset_save = stacksave;
1518     else
1519     {
1520     new_recursive.offset_save =
1521     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1522     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1523     }
1524    
1525     memcpy(new_recursive.offset_save, md->offset_vector,
1526     new_recursive.saved_max * sizeof(int));
1527 ph10 461 new_recursive.save_offset_top = offset_top;
1528 ph10 608
1529 nigel 77 /* OK, now we can do the recursion. For each top-level alternative we
1530     restore the offset and recursion data. */
1531    
1532     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1533 ph10 604 cbegroup = (*callpat >= OP_SBRA);
1534 nigel 77 do
1535     {
1536 ph10 604 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1537 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1538 ph10 604 md, eptrb, RM6);
1539 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1540 nigel 77 {
1541 nigel 87 DPRINTF(("Recursion matched\n"));
1542 nigel 77 md->recursive = new_recursive.prevrec;
1543     if (new_recursive.offset_save != stacksave)
1544     (pcre_free)(new_recursive.offset_save);
1545 ph10 510 MRRETURN(MATCH_MATCH);
1546 nigel 77 }
1547 ph10 550 else if (rrc != MATCH_NOMATCH &&
1548     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1549 nigel 87 {
1550     DPRINTF(("Recursion gave error %d\n", rrc));
1551 ph10 400 if (new_recursive.offset_save != stacksave)
1552     (pcre_free)(new_recursive.offset_save);
1553 nigel 87 RRETURN(rrc);
1554     }
1555 nigel 77
1556     md->recursive = &new_recursive;
1557     memcpy(md->offset_vector, new_recursive.offset_save,
1558     new_recursive.saved_max * sizeof(int));
1559     callpat += GET(callpat, 1);
1560     }
1561     while (*callpat == OP_ALT);
1562    
1563     DPRINTF(("Recursion didn't match\n"));
1564     md->recursive = new_recursive.prevrec;
1565     if (new_recursive.offset_save != stacksave)
1566     (pcre_free)(new_recursive.offset_save);
1567 ph10 510 MRRETURN(MATCH_NOMATCH);
1568 nigel 77 }
1569     /* Control never reaches here */
1570    
1571     /* "Once" brackets are like assertion brackets except that after a match,
1572     the point in the subject string is not moved back. Thus there can never be
1573     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1574     Check the alternative branches in turn - the matching won't pass the KET
1575     for this kind of subpattern. If any one branch matches, we carry on as at
1576 ph10 500 the end of a normal bracket, leaving the subject pointer, but resetting
1577     the start-of-match value in case it was changed by \K. */
1578 nigel 77
1579     case OP_ONCE:
1580 nigel 91 prev = ecode;
1581     saved_eptr = eptr;
1582    
1583     do
1584 nigel 77 {
1585 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1586 ph10 511 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1587 ph10 500 {
1588     mstart = md->start_match_ptr;
1589     break;
1590 ph10 501 }
1591 ph10 550 if (rrc != MATCH_NOMATCH &&
1592     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1593     RRETURN(rrc);
1594 nigel 91 ecode += GET(ecode,1);
1595     }
1596     while (*ecode == OP_ALT);
1597 nigel 77
1598 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1599 nigel 77
1600 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1601 nigel 77
1602 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1603     mark, since extracts may have been taken. */
1604 nigel 77
1605 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1606 nigel 77
1607 nigel 91 offset_top = md->end_offset_top;
1608     eptr = md->end_match_ptr;
1609 nigel 77
1610 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1611     happens for a repeating ket if no characters were matched in the group.
1612     This is the forcible breaking of infinite loops as implemented in Perl
1613     5.005. If there is an options reset, it will get obeyed in the normal
1614     course of events. */
1615 nigel 77
1616 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1617     {
1618     ecode += 1+LINK_SIZE;
1619     break;
1620     }
1621 nigel 77
1622 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1623     preceding bracket, in the appropriate order. The second "call" of match()
1624 ph10 602 uses tail recursion, to avoid using another stack frame. */
1625 nigel 77
1626 nigel 91 if (*ecode == OP_KETRMIN)
1627     {
1628 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM8);
1629 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1630     ecode = prev;
1631     goto TAIL_RECURSE;
1632 nigel 77 }
1633 nigel 91 else /* OP_KETRMAX */
1634     {
1635 ph10 604 md->match_function_type = MATCH_CBEGROUP;
1636     RMATCH(eptr, prev, offset_top, md, eptrb, RM9);
1637 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1638     ecode += 1 + LINK_SIZE;
1639     goto TAIL_RECURSE;
1640     }
1641     /* Control never gets here */
1642 nigel 77
1643     /* An alternation is the end of a branch; scan along to find the end of the
1644     bracketed group and go to there. */
1645    
1646     case OP_ALT:
1647     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1648     break;
1649    
1650 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1651     indicating that it may occur zero times. It may repeat infinitely, or not
1652     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1653     with fixed upper repeat limits are compiled as a number of copies, with the
1654     optional ones preceded by BRAZERO or BRAMINZERO. */
1655 ph10 604
1656 nigel 77 case OP_BRAZERO:
1657 ph10 604 next = ecode + 1;
1658     RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1659     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1660     do next += GET(next, 1); while (*next == OP_ALT);
1661     ecode = next + 1 + LINK_SIZE;
1662 nigel 77 break;
1663 ph10 604
1664 nigel 77 case OP_BRAMINZERO:
1665 ph10 604 next = ecode + 1;
1666     do next += GET(next, 1); while (*next == OP_ALT);
1667     RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1668     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1669     ecode++;
1670 nigel 77 break;
1671    
1672 ph10 335 case OP_SKIPZERO:
1673 ph10 604 next = ecode+1;
1674     do next += GET(next,1); while (*next == OP_ALT);
1675     ecode = next + 1 + LINK_SIZE;
1676 ph10 335 break;
1677 ph10 604
1678     /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1679     here; just jump to the group, with allow_zero set TRUE. */
1680    
1681     case OP_BRAPOSZERO:
1682     op = *(++ecode);
1683     allow_zero = TRUE;
1684     if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1685     goto POSSESSIVE_NON_CAPTURE;
1686 ph10 335
1687 nigel 93 /* End of a group, repeated or non-repeating. */
1688 nigel 77
1689     case OP_KET:
1690     case OP_KETRMIN:
1691     case OP_KETRMAX:
1692 ph10 604 case OP_KETRPOS:
1693 nigel 91 prev = ecode - GET(ecode, 1);
1694 nigel 77
1695 nigel 93 /* If this was a group that remembered the subject start, in order to break
1696     infinite repeats of empty string matches, retrieve the subject start from
1697     the chain. Otherwise, set it NULL. */
1698 nigel 77
1699 nigel 93 if (*prev >= OP_SBRA)
1700     {
1701     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1702     eptrb = eptrb->epb_prev; /* Backup to previous group */
1703     }
1704     else saved_eptr = NULL;
1705 nigel 77
1706 ph10 500 /* If we are at the end of an assertion group or an atomic group, stop
1707     matching and return MATCH_MATCH, but record the current high water mark for
1708     use by positive assertions. We also need to record the match start in case
1709     it was changed by \K. */
1710 nigel 93
1711 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1712     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1713     *prev == OP_ONCE)
1714     {
1715     md->end_match_ptr = eptr; /* For ONCE */
1716     md->end_offset_top = offset_top;
1717 ph10 500 md->start_match_ptr = mstart;
1718 ph10 510 MRRETURN(MATCH_MATCH);
1719 nigel 91 }
1720 nigel 77
1721 nigel 93 /* For capturing groups we have to check the group number back at the start
1722     and if necessary complete handling an extraction by setting the offsets and
1723     bumping the high water mark. Note that whole-pattern recursion is coded as
1724     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1725     when the OP_END is reached. Other recursion is handled here. */
1726 nigel 77
1727 ph10 604 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1728     *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1729 nigel 91 {
1730 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1731 nigel 91 offset = number << 1;
1732 ph10 461
1733 ph10 475 #ifdef PCRE_DEBUG
1734 nigel 91 printf("end bracket %d", number);
1735     printf("\n");
1736 nigel 77 #endif
1737    
1738 nigel 93 md->capture_last = number;
1739     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1740 nigel 91 {
1741 nigel 93 md->offset_vector[offset] =
1742     md->offset_vector[md->offset_end - number];
1743 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1744 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1745     }
1746 nigel 77
1747 nigel 93 /* Handle a recursively called group. Restore the offsets
1748     appropriately and continue from after the call. */
1749 nigel 77
1750 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1751     {
1752     recursion_info *rec = md->recursive;
1753     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1754     md->recursive = rec->prevrec;
1755     memcpy(md->offset_vector, rec->offset_save,
1756     rec->saved_max * sizeof(int));
1757 ph10 461 offset_top = rec->save_offset_top;
1758 nigel 93 ecode = rec->after_call;
1759     break;
1760 nigel 77 }
1761 nigel 91 }
1762 nigel 77
1763 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1764     happens for a repeating ket if no characters were matched in the group.
1765     This is the forcible breaking of infinite loops as implemented in Perl
1766     5.005. If there is an options reset, it will get obeyed in the normal
1767     course of events. */
1768 nigel 77
1769 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1770     {
1771     ecode += 1 + LINK_SIZE;
1772     break;
1773     }
1774 ph10 604
1775     /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1776     and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1777     at a time from the outer level, thus saving stack. */
1778    
1779     if (*ecode == OP_KETRPOS)
1780     {
1781     md->end_match_ptr = eptr;
1782     md->end_offset_top = offset_top;
1783     RRETURN(MATCH_KETRPOS);
1784     }
1785 nigel 77
1786 ph10 604 /* The normal repeating kets try the rest of the pattern or restart from
1787     the preceding bracket, in the appropriate order. In the second case, we can
1788     use tail recursion to avoid using another stack frame, unless we have an
1789 ph10 197 unlimited repeat of a group that can match an empty string. */
1790 nigel 77
1791 nigel 91 if (*ecode == OP_KETRMIN)
1792     {
1793 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1794 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1795 ph10 604 if (*prev >= OP_SBRA) /* Could match an empty string */
1796 ph10 197 {
1797 ph10 604 md->match_function_type = MATCH_CBEGROUP;
1798     RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1799 ph10 197 RRETURN(rrc);
1800     }
1801 nigel 91 ecode = prev;
1802     goto TAIL_RECURSE;
1803 nigel 77 }
1804 nigel 91 else /* OP_KETRMAX */
1805     {
1806 ph10 604 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1807     RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1808 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1809     ecode += 1 + LINK_SIZE;
1810     goto TAIL_RECURSE;
1811     }
1812     /* Control never gets here */
1813 nigel 77
1814 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
1815 nigel 77
1816     case OP_CIRC:
1817 ph10 510 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1818 ph10 602
1819 nigel 77 /* Start of subject assertion */
1820    
1821     case OP_SOD:
1822 ph10 510 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1823 nigel 77 ecode++;
1824     break;
1825 ph10 602
1826     /* Multiline mode: start of subject unless notbol, or after any newline. */
1827 nigel 77
1828 ph10 602 case OP_CIRCM:
1829     if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1830     if (eptr != md->start_subject &&
1831     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1832     MRRETURN(MATCH_NOMATCH);
1833     ecode++;
1834     break;
1835    
1836 nigel 77 /* Start of match assertion */
1837    
1838     case OP_SOM:
1839 ph10 510 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1840 nigel 77 ecode++;
1841     break;
1842 ph10 172
1843 ph10 168 /* Reset the start of match point */
1844 ph10 172
1845 ph10 168 case OP_SET_SOM:
1846     mstart = eptr;
1847 ph10 172 ecode++;
1848     break;
1849 nigel 77
1850 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
1851     unless noteol is set. */
1852 nigel 77
1853 ph10 602 case OP_DOLLM:
1854     if (eptr < md->end_subject)
1855     { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1856     else
1857 nigel 77 {
1858 ph10 510 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1859 ph10 602 SCHECK_PARTIAL();
1860 nigel 77 }
1861 ph10 602 ecode++;
1862     break;
1863 ph10 579
1864 ph10 602 /* Not multiline mode: assert before a terminating newline or before end of
1865     subject unless noteol is set. */
1866    
1867     case OP_DOLL:
1868     if (md->noteol) MRRETURN(MATCH_NOMATCH);
1869     if (!md->endonly) goto ASSERT_NL_OR_EOS;
1870    
1871 nigel 91 /* ... else fall through for endonly */
1872 nigel 77
1873     /* End of subject assertion (\z) */
1874    
1875     case OP_EOD:
1876 ph10 510 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1877 ph10 553 SCHECK_PARTIAL();
1878 nigel 77 ecode++;
1879     break;
1880    
1881     /* End of subject or ending \n assertion (\Z) */
1882    
1883     case OP_EODN:
1884 ph10 553 ASSERT_NL_OR_EOS:
1885     if (eptr < md->end_subject &&
1886 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1887 ph10 510 MRRETURN(MATCH_NOMATCH);
1888 ph10 579
1889 ph10 553 /* Either at end of string or \n before end. */
1890 ph10 579
1891 ph10 553 SCHECK_PARTIAL();
1892 nigel 77 ecode++;
1893     break;
1894    
1895     /* Word boundary assertions */
1896    
1897     case OP_NOT_WORD_BOUNDARY:
1898     case OP_WORD_BOUNDARY:
1899     {
1900    
1901     /* Find out if the previous and current characters are "word" characters.
1902     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1903 ph10 443 be "non-word" characters. Remember the earliest consulted character for
1904 ph10 435 partial matching. */
1905 nigel 77
1906     #ifdef SUPPORT_UTF8
1907     if (utf8)
1908     {
1909 ph10 518 /* Get status of previous character */
1910 ph10 527
1911 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
1912     {
1913 ph10 409 USPTR lastptr = eptr - 1;
1914 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1915 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1916 nigel 77 GETCHAR(c, lastptr);
1917 ph10 527 #ifdef SUPPORT_UCP
1918 ph10 518 if (md->use_ucp)
1919     {
1920     if (c == '_') prev_is_word = TRUE; else
1921 ph10 527 {
1922 ph10 518 int cat = UCD_CATEGORY(c);
1923     prev_is_word = (cat == ucp_L || cat == ucp_N);
1924 ph10 527 }
1925     }
1926     else
1927     #endif
1928 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1929     }
1930 ph10 527
1931 ph10 518 /* Get status of next character */
1932 ph10 527
1933 ph10 443 if (eptr >= md->end_subject)
1934 nigel 77 {
1935 ph10 443 SCHECK_PARTIAL();
1936     cur_is_word = FALSE;
1937 ph10 428 }
1938     else
1939     {
1940 nigel 77 GETCHAR(c, eptr);
1941 ph10 527 #ifdef SUPPORT_UCP
1942 ph10 518 if (md->use_ucp)
1943     {
1944     if (c == '_') cur_is_word = TRUE; else
1945 ph10 527 {
1946 ph10 518 int cat = UCD_CATEGORY(c);
1947     cur_is_word = (cat == ucp_L || cat == ucp_N);
1948 ph10 527 }
1949     }
1950     else
1951     #endif
1952 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1953     }
1954     }
1955     else
1956     #endif
1957    
1958 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1959 ph10 518 consistency with the behaviour of \w we do use it in this case. */
1960 nigel 77
1961     {
1962 ph10 518 /* Get status of previous character */
1963 ph10 527
1964 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
1965     {
1966 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1967 ph10 527 #ifdef SUPPORT_UCP
1968 ph10 518 if (md->use_ucp)
1969     {
1970 ph10 527 c = eptr[-1];
1971 ph10 518 if (c == '_') prev_is_word = TRUE; else
1972 ph10 527 {
1973 ph10 518 int cat = UCD_CATEGORY(c);
1974     prev_is_word = (cat == ucp_L || cat == ucp_N);
1975 ph10 527 }
1976     }
1977     else
1978     #endif
1979 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1980     }
1981 ph10 527
1982 ph10 518 /* Get status of next character */
1983 ph10 527
1984 ph10 443 if (eptr >= md->end_subject)
1985 ph10 428 {
1986 ph10 443 SCHECK_PARTIAL();
1987     cur_is_word = FALSE;
1988 ph10 428 }
1989 ph10 527 else
1990     #ifdef SUPPORT_UCP
1991 ph10 518 if (md->use_ucp)
1992     {
1993 ph10 527 c = *eptr;
1994 ph10 518 if (c == '_') cur_is_word = TRUE; else
1995 ph10 527 {
1996 ph10 518 int cat = UCD_CATEGORY(c);
1997     cur_is_word = (cat == ucp_L || cat == ucp_N);
1998 ph10 527 }
1999     }
2000     else
2001     #endif
2002 ph10 518 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2003 nigel 77 }
2004    
2005     /* Now see if the situation is what we want */
2006    
2007     if ((*ecode++ == OP_WORD_BOUNDARY)?
2008     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2009 ph10 510 MRRETURN(MATCH_NOMATCH);
2010 nigel 77 }
2011     break;
2012    
2013     /* Match a single character type; inline for speed */
2014    
2015     case OP_ANY:
2016 ph10 510 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2017 ph10 345 /* Fall through */
2018    
2019 ph10 341 case OP_ALLANY:
2020 ph10 443 if (eptr++ >= md->end_subject)
2021 ph10 428 {
2022 ph10 443 SCHECK_PARTIAL();
2023 ph10 510 MRRETURN(MATCH_NOMATCH);
2024 ph10 443 }
2025 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2026 nigel 77 ecode++;
2027     break;
2028    
2029     /* Match a single byte, even in UTF-8 mode. This opcode really does match
2030     any byte, even newline, independent of the setting of PCRE_DOTALL. */
2031    
2032     case OP_ANYBYTE:
2033 ph10 443 if (eptr++ >= md->end_subject)
2034 ph10 428 {
2035 ph10 443 SCHECK_PARTIAL();
2036 ph10 510 MRRETURN(MATCH_NOMATCH);
2037 ph10 443 }
2038 nigel 77 ecode++;
2039     break;
2040    
2041     case OP_NOT_DIGIT:
2042 ph10 443 if (eptr >= md->end_subject)
2043 ph10 428 {
2044 ph10 443 SCHECK_PARTIAL();
2045 ph10 510 MRRETURN(MATCH_NOMATCH);
2046 ph10 443 }
2047 nigel 77 GETCHARINCTEST(c, eptr);
2048     if (
2049     #ifdef SUPPORT_UTF8
2050     c < 256 &&
2051     #endif
2052     (md->ctypes[c] & ctype_digit) != 0
2053     )
2054 ph10 510 MRRETURN(MATCH_NOMATCH);
2055 nigel 77 ecode++;
2056     break;
2057    
2058     case OP_DIGIT:
2059 ph10 443 if (eptr >= md->end_subject)
2060 ph10 428 {
2061 ph10 443 SCHECK_PARTIAL();
2062 ph10 510 MRRETURN(MATCH_NOMATCH);
2063 ph10 443 }
2064 nigel 77 GETCHARINCTEST(c, eptr);
2065     if (
2066     #ifdef SUPPORT_UTF8
2067     c >= 256 ||
2068     #endif
2069     (md->ctypes[c] & ctype_digit) == 0
2070     )
2071 ph10 510 MRRETURN(MATCH_NOMATCH);
2072 nigel 77 ecode++;
2073     break;
2074    
2075     case OP_NOT_WHITESPACE:
2076 ph10 443 if (eptr >= md->end_subject)
2077 ph10 428 {
2078 ph10 443 SCHECK_PARTIAL();
2079 ph10 510 MRRETURN(MATCH_NOMATCH);
2080 ph10 443 }
2081 nigel 77 GETCHARINCTEST(c, eptr);
2082     if (
2083     #ifdef SUPPORT_UTF8
2084     c < 256 &&
2085     #endif
2086     (md->ctypes[c] & ctype_space) != 0
2087     )
2088 ph10 510 MRRETURN(MATCH_NOMATCH);
2089 nigel 77 ecode++;
2090     break;
2091    
2092     case OP_WHITESPACE:
2093 ph10 443 if (eptr >= md->end_subject)
2094 ph10 428 {
2095 ph10 443 SCHECK_PARTIAL();
2096 ph10 510 MRRETURN(MATCH_NOMATCH);
2097 ph10 443 }
2098 nigel 77 GETCHARINCTEST(c, eptr);
2099     if (
2100     #ifdef SUPPORT_UTF8
2101     c >= 256 ||
2102     #endif
2103     (md->ctypes[c] & ctype_space) == 0
2104     )
2105 ph10 510 MRRETURN(MATCH_NOMATCH);
2106 nigel 77 ecode++;
2107     break;
2108    
2109     case OP_NOT_WORDCHAR:
2110 ph10 443 if (eptr >= md->end_subject)
2111 ph10 428 {
2112 ph10 443 SCHECK_PARTIAL();
2113 ph10 510 MRRETURN(MATCH_NOMATCH);
2114 ph10 443 }
2115 nigel 77 GETCHARINCTEST(c, eptr);
2116     if (
2117     #ifdef SUPPORT_UTF8
2118     c < 256 &&
2119     #endif
2120     (md->ctypes[c] & ctype_word) != 0
2121     )
2122 ph10 510 MRRETURN(MATCH_NOMATCH);
2123 nigel 77 ecode++;
2124     break;
2125    
2126     case OP_WORDCHAR:
2127 ph10 443 if (eptr >= md->end_subject)
2128 ph10 428 {
2129 ph10 443 SCHECK_PARTIAL();
2130 ph10 510 MRRETURN(MATCH_NOMATCH);
2131 ph10 443 }
2132 nigel 77 GETCHARINCTEST(c, eptr);
2133     if (
2134     #ifdef SUPPORT_UTF8
2135     c >= 256 ||
2136     #endif
2137     (md->ctypes[c] & ctype_word) == 0
2138     )
2139 ph10 510 MRRETURN(MATCH_NOMATCH);
2140 nigel 77 ecode++;
2141     break;
2142    
2143 nigel 93 case OP_ANYNL:
2144 ph10 443 if (eptr >= md->end_subject)
2145 ph10 428 {
2146 ph10 443 SCHECK_PARTIAL();
2147 ph10 510 MRRETURN(MATCH_NOMATCH);
2148 ph10 443 }
2149 nigel 93 GETCHARINCTEST(c, eptr);
2150     switch(c)
2151     {
2152 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2153 ph10 600
2154 nigel 93 case 0x000d:
2155     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2156     break;
2157 ph10 231
2158 nigel 93 case 0x000a:
2159 ph10 231 break;
2160    
2161 nigel 93 case 0x000b:
2162     case 0x000c:
2163     case 0x0085:
2164     case 0x2028:
2165     case 0x2029:
2166 ph10 510 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2167 nigel 93 break;
2168     }
2169     ecode++;
2170     break;
2171    
2172 ph10 178 case OP_NOT_HSPACE:
2173 ph10 443 if (eptr >= md->end_subject)
2174 ph10 428 {
2175 ph10 443 SCHECK_PARTIAL();
2176 ph10 510 MRRETURN(MATCH_NOMATCH);
2177 ph10 443 }
2178 ph10 178 GETCHARINCTEST(c, eptr);
2179     switch(c)
2180     {
2181     default: break;
2182     case 0x09: /* HT */
2183     case 0x20: /* SPACE */
2184     case 0xa0: /* NBSP */
2185     case 0x1680: /* OGHAM SPACE MARK */
2186     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2187     case 0x2000: /* EN QUAD */
2188     case 0x2001: /* EM QUAD */
2189     case 0x2002: /* EN SPACE */
2190     case 0x2003: /* EM SPACE */
2191     case 0x2004: /* THREE-PER-EM SPACE */
2192     case 0x2005: /* FOUR-PER-EM SPACE */
2193     case 0x2006: /* SIX-PER-EM SPACE */
2194     case 0x2007: /* FIGURE SPACE */
2195     case 0x2008: /* PUNCTUATION SPACE */
2196     case 0x2009: /* THIN SPACE */
2197     case 0x200A: /* HAIR SPACE */
2198     case 0x202f: /* NARROW NO-BREAK SPACE */
2199     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2200     case 0x3000: /* IDEOGRAPHIC SPACE */
2201 ph10 510 MRRETURN(MATCH_NOMATCH);
2202 ph10 178 }
2203     ecode++;
2204     break;
2205    
2206     case OP_HSPACE:
2207 ph10 443 if (eptr >= md->end_subject)
2208 ph10 428 {
2209 ph10 443 SCHECK_PARTIAL();
2210 ph10 510 MRRETURN(MATCH_NOMATCH);
2211 ph10 443 }
2212 ph10 178 GETCHARINCTEST(c, eptr);
2213     switch(c)
2214     {
2215 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2216 ph10 178 case 0x09: /* HT */
2217     case 0x20: /* SPACE */
2218     case 0xa0: /* NBSP */
2219     case 0x1680: /* OGHAM SPACE MARK */
2220     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2221     case 0x2000: /* EN QUAD */
2222     case 0x2001: /* EM QUAD */
2223     case 0x2002: /* EN SPACE */
2224     case 0x2003: /* EM SPACE */
2225     case 0x2004: /* THREE-PER-EM SPACE */
2226     case 0x2005: /* FOUR-PER-EM SPACE */
2227     case 0x2006: /* SIX-PER-EM SPACE */
2228     case 0x2007: /* FIGURE SPACE */
2229     case 0x2008: /* PUNCTUATION SPACE */
2230     case 0x2009: /* THIN SPACE */
2231     case 0x200A: /* HAIR SPACE */
2232     case 0x202f: /* NARROW NO-BREAK SPACE */
2233     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2234     case 0x3000: /* IDEOGRAPHIC SPACE */
2235     break;
2236     }
2237     ecode++;
2238     break;
2239    
2240     case OP_NOT_VSPACE:
2241 ph10 443 if (eptr >= md->end_subject)
2242 ph10 428 {
2243 ph10 443 SCHECK_PARTIAL();
2244 ph10 510 MRRETURN(MATCH_NOMATCH);
2245 ph10 443 }
2246 ph10 178 GETCHARINCTEST(c, eptr);
2247     switch(c)
2248     {
2249     default: break;
2250     case 0x0a: /* LF */
2251     case 0x0b: /* VT */
2252     case 0x0c: /* FF */
2253     case 0x0d: /* CR */
2254     case 0x85: /* NEL */
2255     case 0x2028: /* LINE SEPARATOR */
2256     case 0x2029: /* PARAGRAPH SEPARATOR */
2257 ph10 510 MRRETURN(MATCH_NOMATCH);
2258 ph10 178 }
2259     ecode++;
2260     break;
2261    
2262     case OP_VSPACE:
2263 ph10 443 if (eptr >= md->end_subject)
2264 ph10 428 {
2265 ph10 443 SCHECK_PARTIAL();
2266 ph10 510 MRRETURN(MATCH_NOMATCH);
2267 ph10 443 }
2268 ph10 178 GETCHARINCTEST(c, eptr);
2269     switch(c)
2270     {
2271 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2272 ph10 178 case 0x0a: /* LF */
2273     case 0x0b: /* VT */
2274     case 0x0c: /* FF */
2275     case 0x0d: /* CR */
2276     case 0x85: /* NEL */
2277     case 0x2028: /* LINE SEPARATOR */
2278     case 0x2029: /* PARAGRAPH SEPARATOR */
2279     break;
2280     }
2281     ecode++;
2282     break;
2283    
2284 nigel 77 #ifdef SUPPORT_UCP
2285     /* Check the next character by Unicode property. We will get here only
2286     if the support is in the binary; otherwise a compile-time error occurs. */
2287    
2288     case OP_PROP:
2289     case OP_NOTPROP:
2290 ph10 443 if (eptr >= md->end_subject)
2291 ph10 428 {
2292 ph10 443 SCHECK_PARTIAL();
2293 ph10 510 MRRETURN(MATCH_NOMATCH);
2294 ph10 443 }
2295 nigel 77 GETCHARINCTEST(c, eptr);
2296     {
2297 ph10 384 const ucd_record *prop = GET_UCD(c);
2298 nigel 77
2299 nigel 87 switch(ecode[1])
2300     {
2301     case PT_ANY:
2302 ph10 510 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2303 nigel 87 break;
2304 nigel 77
2305 nigel 87 case PT_LAMP:
2306 ph10 349 if ((prop->chartype == ucp_Lu ||
2307     prop->chartype == ucp_Ll ||
2308     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2309 ph10 510 MRRETURN(MATCH_NOMATCH);
2310 ph10 517 break;
2311 nigel 87
2312     case PT_GC:
2313 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2314 ph10 510 MRRETURN(MATCH_NOMATCH);
2315 nigel 87 break;
2316    
2317     case PT_PC:
2318 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2319 ph10 510 MRRETURN(MATCH_NOMATCH);
2320 nigel 87 break;
2321    
2322     case PT_SC:
2323 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2324 ph10 510 MRRETURN(MATCH_NOMATCH);
2325 nigel 87 break;
2326 ph10 527
2327 ph10 517 /* These are specials */
2328 ph10 527
2329 ph10 517 case PT_ALNUM:
2330     if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2331     _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2332     MRRETURN(MATCH_NOMATCH);
2333 ph10 527 break;
2334    
2335 ph10 517 case PT_SPACE: /* Perl space */
2336     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2337     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2338     == (op == OP_NOTPROP))
2339     MRRETURN(MATCH_NOMATCH);
2340 ph10 527 break;
2341    
2342 ph10 517 case PT_PXSPACE: /* POSIX space */
2343     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2344 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2345 ph10 517 c == CHAR_FF || c == CHAR_CR)
2346     == (op == OP_NOTPROP))
2347     MRRETURN(MATCH_NOMATCH);
2348 ph10 527 break;
2349 nigel 87
2350 ph10 527 case PT_WORD:
2351 ph10 517 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2352 ph10 527 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2353 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2354     MRRETURN(MATCH_NOMATCH);
2355 ph10 527 break;
2356    
2357 ph10 517 /* This should never occur */
2358    
2359 nigel 87 default:
2360     RRETURN(PCRE_ERROR_INTERNAL);
2361 nigel 77 }
2362 nigel 87
2363     ecode += 3;
2364 nigel 77 }
2365     break;
2366    
2367     /* Match an extended Unicode sequence. We will get here only if the support
2368     is in the binary; otherwise a compile-time error occurs. */
2369    
2370     case OP_EXTUNI:
2371 ph10 443 if (eptr >= md->end_subject)
2372 ph10 428 {
2373 ph10 443 SCHECK_PARTIAL();
2374 ph10 510 MRRETURN(MATCH_NOMATCH);
2375 ph10 443 }
2376 nigel 77 GETCHARINCTEST(c, eptr);
2377     {
2378 ph10 349 int category = UCD_CATEGORY(c);
2379 ph10 510 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2380 nigel 77 while (eptr < md->end_subject)
2381     {
2382     int len = 1;
2383     if (!utf8) c = *eptr; else
2384     {
2385     GETCHARLEN(c, eptr, len);
2386     }
2387 ph10 349 category = UCD_CATEGORY(c);
2388 nigel 77 if (category != ucp_M) break;
2389     eptr += len;
2390     }
2391     }
2392     ecode++;
2393     break;
2394     #endif
2395    
2396    
2397     /* Match a back reference, possibly repeatedly. Look past the end of the
2398     item to see if there is repeat information following. The code is similar
2399     to that for character classes, but repeated for efficiency. Then obey
2400     similar code to character type repeats - written out again for speed.
2401     However, if the referenced string is the empty string, always treat
2402     it as matched, any number of times (otherwise there could be infinite
2403     loops). */
2404    
2405     case OP_REF:
2406 ph10 602 case OP_REFI:
2407     caseless = op == OP_REFI;
2408 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2409     ecode += 3;
2410 ph10 345
2411 ph10 595 /* If the reference is unset, there are two possibilities:
2412 ph10 345
2413 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2414     this ensures that every attempt at a match fails. We can't just fail
2415     here, because of the possibility of quantifiers with zero minima.
2416 ph10 345
2417 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2418     so that the back reference matches an empty string.
2419 ph10 345
2420 ph10 595 Otherwise, set the length to the length of what was matched by the
2421     referenced subpattern. */
2422 ph10 345
2423 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2424     length = (md->jscript_compat)? 0 : -1;
2425     else
2426     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2427 nigel 77
2428 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2429 nigel 77
2430 ph10 595 switch (*ecode)
2431     {
2432     case OP_CRSTAR:
2433     case OP_CRMINSTAR:
2434     case OP_CRPLUS:
2435     case OP_CRMINPLUS:
2436     case OP_CRQUERY:
2437     case OP_CRMINQUERY:
2438     c = *ecode++ - OP_CRSTAR;
2439     minimize = (c & 1) != 0;
2440     min = rep_min[c]; /* Pick up values from tables; */
2441     max = rep_max[c]; /* zero for max => infinity */
2442     if (max == 0) max = INT_MAX;
2443     break;
2444 nigel 77
2445 ph10 595 case OP_CRRANGE:
2446     case OP_CRMINRANGE:
2447     minimize = (*ecode == OP_CRMINRANGE);
2448     min = GET2(ecode, 1);
2449     max = GET2(ecode, 3);
2450     if (max == 0) max = INT_MAX;
2451     ecode += 5;
2452     break;
2453 nigel 77
2454 ph10 595 default: /* No repeat follows */
2455 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2456 ph10 595 {
2457     CHECK_PARTIAL();
2458     MRRETURN(MATCH_NOMATCH);
2459 nigel 77 }
2460 ph10 595 eptr += length;
2461     continue; /* With the main loop */
2462     }
2463 nigel 77
2464 ph10 595 /* Handle repeated back references. If the length of the reference is
2465     zero, just continue with the main loop. */
2466 ph10 443
2467 ph10 595 if (length == 0) continue;
2468 nigel 77
2469 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2470     the length of the reference string explicitly rather than passing the
2471     address of eptr, so that eptr can be a register variable. */
2472 nigel 77
2473 ph10 595 for (i = 1; i <= min; i++)
2474     {
2475     int slength;
2476 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2477 nigel 77 {
2478 ph10 595 CHECK_PARTIAL();
2479     MRRETURN(MATCH_NOMATCH);
2480 nigel 77 }
2481 ph10 595 eptr += slength;
2482     }
2483 nigel 77
2484 ph10 595 /* If min = max, continue at the same level without recursion.
2485     They are not both allowed to be zero. */
2486 nigel 77
2487 ph10 595 if (min == max) continue;
2488 nigel 77
2489 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2490 nigel 77
2491 ph10 595 if (minimize)
2492     {
2493     for (fi = min;; fi++)
2494 nigel 77 {
2495 ph10 595 int slength;
2496 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2497 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2498     if (fi >= max) MRRETURN(MATCH_NOMATCH);
2499 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2500 nigel 77 {
2501 ph10 595 CHECK_PARTIAL();
2502     MRRETURN(MATCH_NOMATCH);
2503 nigel 77 }
2504 ph10 595 eptr += slength;
2505 nigel 77 }
2506 ph10 595 /* Control never gets here */
2507     }
2508 nigel 77
2509 ph10 595 /* If maximizing, find the longest string and work backwards */
2510 nigel 77
2511 ph10 595 else
2512     {
2513     pp = eptr;
2514     for (i = min; i < max; i++)
2515 nigel 77 {
2516 ph10 595 int slength;
2517 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2518 nigel 77 {
2519 ph10 595 CHECK_PARTIAL();
2520     break;
2521 nigel 77 }
2522 ph10 595 eptr += slength;
2523 nigel 77 }
2524 ph10 595 while (eptr >= pp)
2525     {
2526 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2527 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2528     eptr -= length;
2529     }
2530     MRRETURN(MATCH_NOMATCH);
2531 nigel 77 }
2532     /* Control never gets here */
2533    
2534     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2535     used when all the characters in the class have values in the range 0-255,
2536     and either the matching is caseful, or the characters are in the range
2537     0-127 when UTF-8 processing is enabled. The only difference between
2538     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2539     encountered.
2540    
2541     First, look past the end of the item to see if there is repeat information
2542     following. Then obey similar code to character type repeats - written out
2543     again for speed. */
2544    
2545     case OP_NCLASS:
2546     case OP_CLASS:
2547     {
2548     data = ecode + 1; /* Save for matching */
2549     ecode += 33; /* Advance past the item */
2550    
2551     switch (*ecode)
2552     {
2553     case OP_CRSTAR:
2554     case OP_CRMINSTAR:
2555     case OP_CRPLUS:
2556     case OP_CRMINPLUS:
2557     case OP_CRQUERY:
2558     case OP_CRMINQUERY:
2559     c = *ecode++ - OP_CRSTAR;
2560     minimize = (c & 1) != 0;
2561     min = rep_min[c]; /* Pick up values from tables; */
2562     max = rep_max[c]; /* zero for max => infinity */
2563     if (max == 0) max = INT_MAX;
2564     break;
2565    
2566     case OP_CRRANGE:
2567     case OP_CRMINRANGE:
2568     minimize = (*ecode == OP_CRMINRANGE);
2569     min = GET2(ecode, 1);
2570     max = GET2(ecode, 3);
2571     if (max == 0) max = INT_MAX;
2572     ecode += 5;
2573     break;
2574    
2575     default: /* No repeat follows */
2576     min = max = 1;
2577     break;
2578     }
2579    
2580     /* First, ensure the minimum number of matches are present. */
2581    
2582     #ifdef SUPPORT_UTF8
2583     /* UTF-8 mode */
2584     if (utf8)
2585     {
2586     for (i = 1; i <= min; i++)
2587     {
2588 ph10 427 if (eptr >= md->end_subject)
2589 ph10 426 {
2590 ph10 428 SCHECK_PARTIAL();
2591 ph10 510 MRRETURN(MATCH_NOMATCH);
2592 ph10 427 }
2593 nigel 77 GETCHARINC(c, eptr);
2594     if (c > 255)
2595     {
2596 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2597 nigel 77 }
2598     else
2599     {
2600 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2601 nigel 77 }
2602     }
2603     }
2604     else
2605     #endif
2606     /* Not UTF-8 mode */
2607     {
2608     for (i = 1; i <= min; i++)
2609     {
2610 ph10 427 if (eptr >= md->end_subject)
2611 ph10 426 {
2612 ph10 428 SCHECK_PARTIAL();
2613 ph10 510 MRRETURN(MATCH_NOMATCH);
2614 ph10 427 }
2615 nigel 77 c = *eptr++;
2616 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2617 nigel 77 }
2618     }
2619    
2620     /* If max == min we can continue with the main loop without the
2621     need to recurse. */
2622    
2623     if (min == max) continue;
2624    
2625     /* If minimizing, keep testing the rest of the expression and advancing
2626     the pointer while it matches the class. */
2627    
2628     if (minimize)
2629     {
2630     #ifdef SUPPORT_UTF8
2631     /* UTF-8 mode */
2632     if (utf8)
2633     {
2634     for (fi = min;; fi++)
2635     {
2636 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2637 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2638 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2639 ph10 427 if (eptr >= md->end_subject)
2640 ph10 426 {
2641 ph10 427 SCHECK_PARTIAL();
2642 ph10 510 MRRETURN(MATCH_NOMATCH);
2643 ph10 427 }
2644 nigel 77 GETCHARINC(c, eptr);
2645     if (c > 255)
2646     {
2647 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2648 nigel 77 }
2649     else
2650     {
2651 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2652 nigel 77 }
2653     }
2654     }
2655     else
2656     #endif
2657     /* Not UTF-8 mode */
2658     {
2659     for (fi = min;; fi++)
2660     {
2661 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2662 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2663 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2664 ph10 427 if (eptr >= md->end_subject)
2665 ph10 426 {
2666 ph10 427 SCHECK_PARTIAL();
2667 ph10 510 MRRETURN(MATCH_NOMATCH);
2668 ph10 427 }
2669 nigel 77 c = *eptr++;
2670 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2671 nigel 77 }
2672     }
2673     /* Control never gets here */
2674     }
2675    
2676     /* If maximizing, find the longest possible run, then work backwards. */
2677    
2678     else
2679     {
2680     pp = eptr;
2681    
2682     #ifdef SUPPORT_UTF8
2683     /* UTF-8 mode */
2684     if (utf8)
2685     {
2686     for (i = min; i < max; i++)
2687     {
2688     int len = 1;
2689 ph10 463 if (eptr >= md->end_subject)
2690 ph10 462 {
2691 ph10 463 SCHECK_PARTIAL();
2692 ph10 462 break;
2693 ph10 463 }
2694 nigel 77 GETCHARLEN(c, eptr, len);
2695     if (c > 255)
2696     {
2697     if (op == OP_CLASS) break;
2698     }
2699     else
2700     {
2701     if ((data[c/8] & (1 << (c&7))) == 0) break;
2702     }
2703     eptr += len;
2704     }
2705     for (;;)
2706     {
2707 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2708 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2709     if (eptr-- == pp) break; /* Stop if tried at original pos */
2710     BACKCHAR(eptr);
2711     }
2712     }
2713     else
2714     #endif
2715     /* Not UTF-8 mode */
2716     {
2717     for (i = min; i < max; i++)
2718     {
2719 ph10 463 if (eptr >= md->end_subject)
2720 ph10 462 {
2721 ph10 463 SCHECK_PARTIAL();
2722 ph10 462 break;
2723 ph10 463 }
2724 nigel 77 c = *eptr;
2725     if ((data[c/8] & (1 << (c&7))) == 0) break;
2726     eptr++;
2727     }
2728     while (eptr >= pp)
2729     {
2730 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2731 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2732 nigel 77 eptr--;
2733     }
2734     }
2735    
2736 ph10 510 MRRETURN(MATCH_NOMATCH);
2737 nigel 77 }
2738     }
2739     /* Control never gets here */
2740    
2741    
2742     /* Match an extended character class. This opcode is encountered only
2743 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2744     mode, because Unicode properties are supported in non-UTF-8 mode. */
2745 nigel 77
2746     #ifdef SUPPORT_UTF8
2747     case OP_XCLASS:
2748     {
2749     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2750     ecode += GET(ecode, 1); /* Advance past the item */
2751    
2752     switch (*ecode)
2753     {
2754     case OP_CRSTAR:
2755     case OP_CRMINSTAR:
2756     case OP_CRPLUS:
2757     case OP_CRMINPLUS:
2758     case OP_CRQUERY:
2759     case OP_CRMINQUERY:
2760     c = *ecode++ - OP_CRSTAR;
2761     minimize = (c & 1) != 0;
2762     min = rep_min[c]; /* Pick up values from tables; */
2763     max = rep_max[c]; /* zero for max => infinity */
2764     if (max == 0) max = INT_MAX;
2765     break;
2766    
2767     case OP_CRRANGE:
2768     case OP_CRMINRANGE:
2769     minimize = (*ecode == OP_CRMINRANGE);
2770     min = GET2(ecode, 1);
2771     max = GET2(ecode, 3);
2772     if (max == 0) max = INT_MAX;
2773     ecode += 5;
2774     break;
2775    
2776     default: /* No repeat follows */
2777     min = max = 1;
2778     break;
2779     }
2780    
2781     /* First, ensure the minimum number of matches are present. */
2782    
2783     for (i = 1; i <= min; i++)
2784     {
2785 ph10 427 if (eptr >= md->end_subject)
2786 ph10 426 {
2787     SCHECK_PARTIAL();
2788 ph10 510 MRRETURN(MATCH_NOMATCH);
2789 ph10 427 }
2790 ph10 384 GETCHARINCTEST(c, eptr);
2791 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2792 nigel 77 }
2793    
2794     /* If max == min we can continue with the main loop without the
2795     need to recurse. */
2796    
2797     if (min == max) continue;
2798    
2799     /* If minimizing, keep testing the rest of the expression and advancing
2800     the pointer while it matches the class. */
2801    
2802     if (minimize)
2803     {
2804     for (fi = min;; fi++)
2805     {
2806 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2807 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2808 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2809 ph10 427 if (eptr >= md->end_subject)
2810 ph10 426 {
2811 ph10 427 SCHECK_PARTIAL();
2812 ph10 510 MRRETURN(MATCH_NOMATCH);
2813 ph10 427 }
2814 ph10 384 GETCHARINCTEST(c, eptr);
2815 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2816 nigel 77 }
2817     /* Control never gets here */
2818     }
2819    
2820     /* If maximizing, find the longest possible run, then work backwards. */
2821    
2822     else
2823     {
2824     pp = eptr;
2825     for (i = min; i < max; i++)
2826     {
2827     int len = 1;
2828 ph10 463 if (eptr >= md->end_subject)
2829 ph10 462 {
2830 ph10 463 SCHECK_PARTIAL();
2831 ph10 462 break;
2832 ph10 463 }
2833 ph10 384 GETCHARLENTEST(c, eptr, len);
2834 nigel 77 if (!_pcre_xclass(c, data)) break;
2835     eptr += len;
2836     }
2837     for(;;)
2838     {
2839 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2840 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2841     if (eptr-- == pp) break; /* Stop if tried at original pos */
2842 ph10 214 if (utf8) BACKCHAR(eptr);
2843 nigel 77 }
2844 ph10 510 MRRETURN(MATCH_NOMATCH);
2845 nigel 77 }
2846    
2847     /* Control never gets here */
2848     }
2849     #endif /* End of XCLASS */
2850    
2851     /* Match a single character, casefully */
2852    
2853     case OP_CHAR:
2854     #ifdef SUPPORT_UTF8
2855     if (utf8)
2856     {
2857     length = 1;
2858     ecode++;
2859     GETCHARLEN(fc, ecode, length);
2860 ph10 443 if (length > md->end_subject - eptr)
2861 ph10 428 {
2862     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2863 ph10 510 MRRETURN(MATCH_NOMATCH);
2864 ph10 443 }
2865 ph10 510 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2866 nigel 77 }
2867     else
2868     #endif
2869    
2870     /* Non-UTF-8 mode */
2871     {
2872 ph10 443 if (md->end_subject - eptr < 1)
2873 ph10 428 {
2874     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2875 ph10 510 MRRETURN(MATCH_NOMATCH);
2876 ph10 443 }
2877 ph10 510 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2878 nigel 77 ecode += 2;
2879     }
2880     break;
2881    
2882     /* Match a single character, caselessly */
2883    
2884 ph10 602 case OP_CHARI:
2885 nigel 77 #ifdef SUPPORT_UTF8
2886     if (utf8)
2887     {
2888     length = 1;
2889     ecode++;
2890     GETCHARLEN(fc, ecode, length);
2891    
2892 ph10 443 if (length > md->end_subject - eptr)
2893 ph10 428 {
2894     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2895 ph10 510 MRRETURN(MATCH_NOMATCH);
2896 ph10 443 }
2897 nigel 77
2898     /* If the pattern character's value is < 128, we have only one byte, and
2899     can use the fast lookup table. */
2900    
2901     if (fc < 128)
2902     {
2903 ph10 510 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2904 nigel 77 }
2905    
2906     /* Otherwise we must pick up the subject character */
2907    
2908     else
2909     {
2910 nigel 93 unsigned int dc;
2911 nigel 77 GETCHARINC(dc, eptr);
2912     ecode += length;
2913    
2914     /* If we have Unicode property support, we can use it to test the other
2915 nigel 87 case of the character, if there is one. */
2916 nigel 77
2917     if (fc != dc)
2918     {
2919     #ifdef SUPPORT_UCP
2920 ph10 349 if (dc != UCD_OTHERCASE(fc))
2921 nigel 77 #endif
2922 ph10 510 MRRETURN(MATCH_NOMATCH);
2923 nigel 77 }
2924     }
2925     }
2926     else
2927     #endif /* SUPPORT_UTF8 */
2928    
2929     /* Non-UTF-8 mode */
2930     {
2931 ph10 443 if (md->end_subject - eptr < 1)
2932 ph10 428 {
2933 ph10 443 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2934 ph10 510 MRRETURN(MATCH_NOMATCH);
2935 ph10 443 }
2936 ph10 510 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2937 nigel 77 ecode += 2;
2938     }
2939     break;
2940    
2941 nigel 93 /* Match a single character repeatedly. */
2942 nigel 77
2943     case OP_EXACT:
2944 ph10 602 case OP_EXACTI:
2945 nigel 77 min = max = GET2(ecode, 1);
2946     ecode += 3;
2947     goto REPEATCHAR;
2948    
2949 nigel 93 case OP_POSUPTO:
2950 ph10 602 case OP_POSUPTOI:
2951 nigel 93 possessive = TRUE;
2952     /* Fall through */
2953    
2954 nigel 77 case OP_UPTO:
2955 ph10 602 case OP_UPTOI:
2956 nigel 77 case OP_MINUPTO:
2957 ph10 602 case OP_MINUPTOI:
2958 nigel 77 min = 0;
2959     max = GET2(ecode, 1);
2960 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2961 nigel 77 ecode += 3;
2962     goto REPEATCHAR;
2963    
2964 nigel 93 case OP_POSSTAR:
2965 ph10 602 case OP_POSSTARI:
2966 nigel 93 possessive = TRUE;
2967     min = 0;
2968     max = INT_MAX;
2969     ecode++;
2970     goto REPEATCHAR;
2971    
2972     case OP_POSPLUS:
2973 ph10 602 case OP_POSPLUSI:
2974 nigel 93 possessive = TRUE;
2975     min = 1;
2976     max = INT_MAX;
2977     ecode++;
2978     goto REPEATCHAR;
2979    
2980     case OP_POSQUERY:
2981 ph10 602 case OP_POSQUERYI:
2982 nigel 93 possessive = TRUE;
2983     min = 0;
2984     max = 1;
2985     ecode++;
2986     goto REPEATCHAR;
2987    
2988 nigel 77 case OP_STAR:
2989 ph10 602 case OP_STARI:
2990 nigel 77 case OP_MINSTAR:
2991 ph10 602 case OP_MINSTARI:
2992 nigel 77 case OP_PLUS:
2993 ph10 602 case OP_PLUSI:
2994 nigel 77 case OP_MINPLUS:
2995 ph10 602 case OP_MINPLUSI:
2996 nigel 77 case OP_QUERY:
2997 ph10 602 case OP_QUERYI:
2998 nigel 77 case OP_MINQUERY:
2999 ph10 602 case OP_MINQUERYI:
3000     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3001 nigel 77 minimize = (c & 1) != 0;
3002     min = rep_min[c]; /* Pick up values from tables; */
3003     max = rep_max[c]; /* zero for max => infinity */
3004     if (max == 0) max = INT_MAX;
3005    
3006 ph10 426 /* Common code for all repeated single-character matches. */
3007 nigel 77
3008     REPEATCHAR:
3009     #ifdef SUPPORT_UTF8
3010     if (utf8)
3011     {
3012     length = 1;
3013     charptr = ecode;
3014     GETCHARLEN(fc, ecode, length);
3015     ecode += length;
3016    
3017     /* Handle multibyte character matching specially here. There is
3018     support for caseless matching if UCP support is present. */
3019    
3020     if (length > 1)
3021     {
3022     #ifdef SUPPORT_UCP
3023 nigel 93 unsigned int othercase;
3024 ph10 602 if (op >= OP_STARI && /* Caseless */
3025 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
3026 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
3027 ph10 115 else oclength = 0;
3028 nigel 77 #endif /* SUPPORT_UCP */
3029    
3030     for (i = 1; i <= min; i++)
3031     {
3032 ph10 426 if (eptr <= md->end_subject - length &&
3033     memcmp(eptr, charptr, length) == 0) eptr += length;
3034 ph10 123 #ifdef SUPPORT_UCP
3035 ph10 426 else if (oclength > 0 &&
3036     eptr <= md->end_subject - oclength &&
3037     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3038     #endif /* SUPPORT_UCP */
3039 nigel 77 else
3040     {
3041 ph10 426 CHECK_PARTIAL();
3042 ph10 510 MRRETURN(MATCH_NOMATCH);
3043 nigel 77 }
3044     }
3045    
3046     if (min == max) continue;
3047    
3048     if (minimize)
3049     {
3050     for (fi = min;; fi++)
3051     {
3052 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3053 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3054 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3055 ph10 426 if (eptr <= md->end_subject - length &&
3056     memcmp(eptr, charptr, length) == 0) eptr += length;
3057 ph10 123 #ifdef SUPPORT_UCP
3058 ph10 426 else if (oclength > 0 &&
3059     eptr <= md->end_subject - oclength &&
3060     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3061     #endif /* SUPPORT_UCP */
3062 nigel 77 else
3063     {
3064 ph10 426 CHECK_PARTIAL();
3065 ph10 510 MRRETURN(MATCH_NOMATCH);
3066 nigel 77 }
3067     }
3068     /* Control never gets here */
3069     }
3070 nigel 93
3071     else /* Maximize */
3072 nigel 77 {
3073     pp = eptr;
3074     for (i = min; i < max; i++)
3075     {
3076 ph10 426 if (eptr <= md->end_subject - length &&
3077     memcmp(eptr, charptr, length) == 0) eptr += length;
3078 ph10 123 #ifdef SUPPORT_UCP
3079 ph10 426 else if (oclength > 0 &&
3080     eptr <= md->end_subject - oclength &&
3081     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3082     #endif /* SUPPORT_UCP */
3083 ph10 463 else
3084 ph10 462 {
3085 ph10 463 CHECK_PARTIAL();
3086 ph10 462 break;
3087 ph10 463 }
3088 nigel 77 }
3089 nigel 93
3090     if (possessive) continue;
3091 ph10 427
3092 ph10 120 for(;;)
3093 ph10 426 {
3094 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3095 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3096 ph10 510 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3097 ph10 115 #ifdef SUPPORT_UCP
3098 ph10 426 eptr--;
3099     BACKCHAR(eptr);
3100 ph10 123 #else /* without SUPPORT_UCP */
3101 ph10 426 eptr -= length;
3102 ph10 123 #endif /* SUPPORT_UCP */
3103 ph10 426 }
3104 nigel 77 }
3105     /* Control never gets here */
3106     }
3107    
3108     /* If the length of a UTF-8 character is 1, we fall through here, and
3109     obey the code as for non-UTF-8 characters below, though in this case the
3110     value of fc will always be < 128. */
3111     }
3112     else
3113     #endif /* SUPPORT_UTF8 */
3114    
3115     /* When not in UTF-8 mode, load a single-byte character. */
3116    
3117 ph10 426 fc = *ecode++;
3118 ph10 443
3119 nigel 77 /* The value of fc at this point is always less than 256, though we may or
3120     may not be in UTF-8 mode. The code is duplicated for the caseless and
3121     caseful cases, for speed, since matching characters is likely to be quite
3122     common. First, ensure the minimum number of matches are present. If min =
3123     max, continue at the same level without recursing. Otherwise, if
3124     minimizing, keep trying the rest of the expression and advancing one
3125     matching character if failing, up to the maximum. Alternatively, if
3126     maximizing, find the maximum number of characters and work backwards. */
3127    
3128     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3129     max, eptr));
3130    
3131 ph10 602 if (op >= OP_STARI) /* Caseless */
3132 nigel 77 {
3133     fc = md->lcc[fc];
3134     for (i = 1; i <= min; i++)
3135 ph10 426 {
3136     if (eptr >= md->end_subject)
3137     {
3138     SCHECK_PARTIAL();
3139 ph10 510 MRRETURN(MATCH_NOMATCH);
3140 ph10 426 }
3141 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3142 ph10 426 }
3143 nigel 77 if (min == max) continue;
3144     if (minimize)
3145     {
3146     for (fi = min;; fi++)
3147     {
3148 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3149 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3150 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3151 ph10 426 if (eptr >= md->end_subject)
3152     {
3153 ph10 427 SCHECK_PARTIAL();
3154 ph10 510 MRRETURN(MATCH_NOMATCH);
3155 ph10 426 }
3156 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3157 nigel 77 }
3158     /* Control never gets here */
3159     }
3160 nigel 93 else /* Maximize */
3161 nigel 77 {
3162     pp = eptr;
3163     for (i = min; i < max; i++)
3164     {
3165 ph10 463 if (eptr >= md->end_subject)
3166 ph10 462 {
3167     SCHECK_PARTIAL();
3168     break;
3169 ph10 463 }
3170 ph10 462 if (fc != md->lcc[*eptr]) break;
3171 nigel 77 eptr++;
3172     }
3173 ph10 427
3174 nigel 93 if (possessive) continue;
3175 ph10 427
3176 nigel 77 while (eptr >= pp)
3177     {
3178 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3179 nigel 77 eptr--;
3180     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3181     }
3182 ph10 510 MRRETURN(MATCH_NOMATCH);
3183 nigel 77 }
3184     /* Control never gets here */
3185     }
3186    
3187     /* Caseful comparisons (includes all multi-byte characters) */
3188    
3189     else
3190     {
3191 ph10 427 for (i = 1; i <= min; i++)
3192 ph10 426 {
3193     if (eptr >= md->end_subject)
3194     {
3195     SCHECK_PARTIAL();
3196 ph10 510 MRRETURN(MATCH_NOMATCH);
3197 ph10 426 }
3198 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3199 ph10 427 }
3200 ph10 443
3201 nigel 77 if (min == max) continue;
3202 ph10 443
3203 nigel 77 if (minimize)
3204     {
3205     for (fi = min;; fi++)
3206     {
3207 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3208 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3209 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3210 ph10 426 if (eptr >= md->end_subject)
3211 ph10 427 {
3212 ph10 426 SCHECK_PARTIAL();
3213 ph10 510 MRRETURN(MATCH_NOMATCH);
3214 ph10 427 }
3215 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3216 nigel 77 }
3217     /* Control never gets here */
3218     }
3219 nigel 93 else /* Maximize */
3220 nigel 77 {
3221     pp = eptr;
3222     for (i = min; i < max; i++)
3223     {
3224 ph10 463 if (eptr >= md->end_subject)
3225 ph10 462 {
3226 ph10 463 SCHECK_PARTIAL();
3227 ph10 462 break;
3228 ph10 463 }
3229 ph10 462 if (fc != *eptr) break;
3230 nigel 77 eptr++;
3231     }
3232 nigel 93 if (possessive) continue;
3233 ph10 443
3234 nigel 77 while (eptr >= pp)
3235     {
3236 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3237 nigel 77 eptr--;
3238     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3239     }
3240 ph10 510 MRRETURN(MATCH_NOMATCH);
3241 nigel 77 }
3242     }
3243     /* Control never gets here */
3244    
3245     /* Match a negated single one-byte character. The character we are
3246     checking in the subject can be multibyte. */
3247    
3248     case OP_NOT:
3249 ph10 602 case OP_NOTI:
3250 ph10 443 if (eptr >= md->end_subject)
3251 ph10 428 {
3252 ph10 443 SCHECK_PARTIAL();
3253 ph10 510 MRRETURN(MATCH_NOMATCH);
3254 ph10 443 }
3255 nigel 77 ecode++;
3256     GETCHARINCTEST(c, eptr);
3257 ph10 602 if (op == OP_NOTI) /* The caseless case */
3258 nigel 77 {
3259     #ifdef SUPPORT_UTF8
3260     if (c < 256)
3261     #endif
3262     c = md->lcc[c];
3263 ph10 510 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3264 nigel 77 }
3265 ph10 602 else /* Caseful */
3266 nigel 77 {
3267 ph10 510 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3268 nigel 77 }
3269     break;
3270    
3271     /* Match a negated single one-byte character repeatedly. This is almost a
3272     repeat of the code for a repeated single character, but I haven't found a
3273     nice way of commoning these up that doesn't require a test of the
3274     positive/negative option for each character match. Maybe that wouldn't add
3275     very much to the time taken, but character matching *is* what this is all
3276     about... */
3277    
3278     case OP_NOTEXACT:
3279 ph10 602 case OP_NOTEXACTI:
3280 nigel 77 min = max = GET2(ecode, 1);
3281     ecode += 3;
3282     goto REPEATNOTCHAR;
3283    
3284     case OP_NOTUPTO:
3285 ph10 602 case OP_NOTUPTOI:
3286 nigel 77 case OP_NOTMINUPTO:
3287 ph10 602 case OP_NOTMINUPTOI:
3288 nigel 77 min = 0;
3289     max = GET2(ecode, 1);
3290 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3291 nigel 77 ecode += 3;
3292     goto REPEATNOTCHAR;
3293    
3294 nigel 93 case OP_NOTPOSSTAR:
3295 ph10 602 case OP_NOTPOSSTARI:
3296 nigel 93 possessive = TRUE;
3297     min = 0;
3298     max = INT_MAX;
3299     ecode++;
3300     goto REPEATNOTCHAR;
3301    
3302     case OP_NOTPOSPLUS:
3303 ph10 602 case OP_NOTPOSPLUSI:
3304 nigel 93 possessive = TRUE;
3305     min = 1;
3306     max = INT_MAX;
3307     ecode++;
3308     goto REPEATNOTCHAR;
3309    
3310     case OP_NOTPOSQUERY:
3311 ph10 602 case OP_NOTPOSQUERYI:
3312 nigel 93 possessive = TRUE;
3313     min = 0;
3314     max = 1;
3315     ecode++;
3316     goto REPEATNOTCHAR;
3317    
3318     case OP_NOTPOSUPTO:
3319 ph10 602 case OP_NOTPOSUPTOI:
3320 nigel 93 possessive = TRUE;
3321     min = 0;
3322     max = GET2(ecode, 1);
3323     ecode += 3;
3324     goto REPEATNOTCHAR;
3325    
3326 nigel 77 case OP_NOTSTAR:
3327 ph10 602 case OP_NOTSTARI:
3328 nigel 77 case OP_NOTMINSTAR:
3329 ph10 602 case OP_NOTMINSTARI:
3330 nigel 77 case OP_NOTPLUS:
3331 ph10 602 case OP_NOTPLUSI:
3332 nigel 77 case OP_NOTMINPLUS:
3333 ph10 602 case OP_NOTMINPLUSI:
3334 nigel 77 case OP_NOTQUERY:
3335 ph10 602 case OP_NOTQUERYI:
3336 nigel 77 case OP_NOTMINQUERY:
3337 ph10 602 case OP_NOTMINQUERYI:
3338     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3339 nigel 77 minimize = (c & 1) != 0;
3340     min = rep_min[c]; /* Pick up values from tables; */
3341     max = rep_max[c]; /* zero for max => infinity */
3342     if (max == 0) max = INT_MAX;
3343    
3344 ph10 426 /* Common code for all repeated single-byte matches. */
3345 nigel 77
3346     REPEATNOTCHAR:
3347     fc = *ecode++;
3348    
3349     /* The code is duplicated for the caseless and caseful cases, for speed,
3350     since matching characters is likely to be quite common. First, ensure the
3351     minimum number of matches are present. If min = max, continue at the same
3352     level without recursing. Otherwise, if minimizing, keep trying the rest of
3353     the expression and advancing one matching character if failing, up to the
3354     maximum. Alternatively, if maximizing, find the maximum number of
3355     characters and work backwards. */
3356    
3357     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3358     max, eptr));
3359    
3360 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3361 nigel 77 {
3362     fc = md->lcc[fc];
3363    
3364     #ifdef SUPPORT_UTF8
3365     /* UTF-8 mode */
3366     if (utf8)
3367     {
3368 nigel 93 register unsigned int d;
3369 nigel 77 for (i = 1; i <= min; i++)
3370     {
3371 ph10 426 if (eptr >= md->end_subject)
3372     {
3373     SCHECK_PARTIAL();
3374 ph10 510 MRRETURN(MATCH_NOMATCH);
3375 ph10 427 }
3376 nigel 77 GETCHARINC(d, eptr);
3377     if (d < 256) d = md->lcc[d];
3378 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3379 nigel 77 }
3380     }
3381     else
3382     #endif
3383    
3384     /* Not UTF-8 mode */
3385     {
3386     for (i = 1; i <= min; i++)
3387 ph10 426 {
3388     if (eptr >= md->end_subject)
3389     {
3390     SCHECK_PARTIAL();
3391 ph10 510 MRRETURN(MATCH_NOMATCH);
3392 ph10 427 }
3393 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3394 ph10 427 }
3395 nigel 77 }
3396    
3397     if (min == max) continue;
3398    
3399     if (minimize)
3400     {
3401     #ifdef SUPPORT_UTF8
3402     /* UTF-8 mode */
3403     if (utf8)
3404     {
3405 nigel 93 register unsigned int d;
3406 nigel 77 for (fi = min;; fi++)
3407     {
3408 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3409 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3410 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3411 ph10 427 if (eptr >= md->end_subject)
3412 ph10 426 {
3413 ph10 427 SCHECK_PARTIAL();
3414 ph10 510 MRRETURN(MATCH_NOMATCH);
3415 ph10 427 }
3416 nigel 77 GETCHARINC(d, eptr);
3417     if (d < 256) d = md->lcc[d];
3418 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3419 nigel 77 }
3420     }
3421     else
3422     #endif
3423     /* Not UTF-8 mode */
3424     {
3425     for (fi = min;; fi++)
3426     {
3427 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3428 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3429 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3430 ph10 426 if (eptr >= md->end_subject)
3431     {
3432     SCHECK_PARTIAL();
3433 ph10 510 MRRETURN(MATCH_NOMATCH);
3434 ph10 426 }
3435 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3436 nigel 77 }
3437     }
3438     /* Control never gets here */
3439     }
3440    
3441     /* Maximize case */
3442    
3443     else
3444     {
3445     pp = eptr;
3446    
3447     #ifdef SUPPORT_UTF8
3448     /* UTF-8 mode */
3449     if (utf8)
3450     {
3451 nigel 93 register unsigned int d;
3452 nigel 77 for (i = min; i < max; i++)
3453     {
3454     int len = 1;
3455 ph10 463 if (eptr >= md->end_subject)
3456 ph10 462 {
3457 ph10 463 SCHECK_PARTIAL();
3458 ph10 462 break;
3459 ph10 463 }
3460 nigel 77 GETCHARLEN(d, eptr, len);
3461     if (d < 256) d = md->lcc[d];
3462     if (fc == d) break;
3463     eptr += len;
3464     }
3465 nigel 93 if (possessive) continue;
3466     for(;;)
3467 nigel 77 {
3468 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3469 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3470     if (eptr-- == pp) break; /* Stop if tried at original pos */
3471     BACKCHAR(eptr);
3472     }
3473     }
3474     else
3475     #endif
3476     /* Not UTF-8 mode */
3477     {
3478     for (i = min; i < max; i++)
3479     {
3480 ph10 463 if (eptr >= md->end_subject)
3481 ph10 462 {
3482     SCHECK_PARTIAL();
3483     break;
3484 ph10 463 }
3485 ph10 462 if (fc == md->lcc[*eptr]) break;
3486 nigel 77 eptr++;
3487     }
3488 nigel 93 if (possessive) continue;
3489 nigel 77 while (eptr >= pp)
3490     {
3491 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3492 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3493     eptr--;
3494     }
3495     }
3496    
3497 ph10 510 MRRETURN(MATCH_NOMATCH);
3498 nigel 77 }
3499     /* Control never gets here */
3500     }
3501    
3502     /* Caseful comparisons */
3503    
3504     else
3505     {
3506     #ifdef SUPPORT_UTF8
3507     /* UTF-8 mode */
3508     if (utf8)
3509     {
3510 nigel 93 register unsigned int d;
3511 nigel 77 for (i = 1; i <= min; i++)
3512     {
3513 ph10 426 if (eptr >= md->end_subject)
3514     {
3515     SCHECK_PARTIAL();
3516 ph10 510 MRRETURN(MATCH_NOMATCH);
3517 ph10 427 }
3518 nigel 77 GETCHARINC(d, eptr);
3519 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3520 nigel 77 }
3521     }
3522     else
3523     #endif
3524     /* Not UTF-8 mode */
3525     {
3526     for (i = 1; i <= min; i++)
3527 ph10 426 {
3528     if (eptr >= md->end_subject)
3529     {
3530     SCHECK_PARTIAL();
3531 ph10 510 MRRETURN(MATCH_NOMATCH);
3532 ph10 427 }
3533 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3534 ph10 427 }
3535 nigel 77 }
3536    
3537     if (min == max) continue;
3538    
3539     if (minimize)
3540     {
3541     #ifdef SUPPORT_UTF8
3542     /* UTF-8 mode */
3543     if (utf8)
3544     {
3545 nigel 93 register unsigned int d;
3546 nigel 77 for (fi = min;; fi++)
3547     {
3548 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3549 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3550 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3551 ph10 427 if (eptr >= md->end_subject)
3552 ph10 426 {
3553 ph10 427 SCHECK_PARTIAL();
3554 ph10 510 MRRETURN(MATCH_NOMATCH);
3555 ph10 427 }
3556 nigel 77 GETCHARINC(d, eptr);
3557 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3558 nigel 77 }
3559     }
3560     else
3561     #endif
3562     /* Not UTF-8 mode */
3563     {
3564     for (fi = min;; fi++)
3565     {
3566 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3567 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3568 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3569 ph10 426 if (eptr >= md->end_subject)
3570     {
3571     SCHECK_PARTIAL();
3572 ph10 510 MRRETURN(MATCH_NOMATCH);
3573 ph10 427 }
3574 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3575 nigel 77 }
3576     }
3577     /* Control never gets here */
3578     }
3579    
3580     /* Maximize case */
3581    
3582     else
3583     {
3584     pp = eptr;
3585    
3586     #ifdef SUPPORT_UTF8
3587     /* UTF-8 mode */
3588     if (utf8)
3589     {
3590 nigel 93 register unsigned int d;
3591 nigel 77 for (i = min; i < max; i++)
3592     {
3593     int len = 1;
3594 ph10 463 if (eptr >= md->end_subject)
3595 ph10 462 {
3596 ph10 463 SCHECK_PARTIAL();
3597 ph10 462 break;
3598 ph10 463 }
3599 nigel 77 GETCHARLEN(d, eptr, len);
3600     if (fc == d) break;
3601     eptr += len;
3602     }
3603 nigel 93 if (possessive) continue;
3604 nigel 77 for(;;)
3605     {
3606 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3607 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3608     if (eptr-- == pp) break; /* Stop if tried at original pos */
3609     BACKCHAR(eptr);
3610     }
3611     }
3612     else
3613     #endif
3614     /* Not UTF-8 mode */
3615     {
3616     for (i = min; i < max; i++)
3617     {
3618 ph10 463 if (eptr >= md->end_subject)
3619 ph10 462 {
3620 ph10 463 SCHECK_PARTIAL();
3621 ph10 462 break;
3622 ph10 463 }
3623 ph10 462 if (fc == *eptr) break;
3624 nigel 77 eptr++;
3625     }
3626 nigel 93 if (possessive) continue;
3627 nigel 77 while (eptr >= pp)
3628     {
3629 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3630 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3631     eptr--;
3632     }
3633     }
3634    
3635 ph10 510 MRRETURN(MATCH_NOMATCH);
3636 nigel 77 }
3637     }
3638     /* Control never gets here */
3639    
3640     /* Match a single character type repeatedly; several different opcodes
3641     share code. This is very similar to the code for single characters, but we
3642     repeat it in the interests of efficiency. */
3643    
3644     case OP_TYPEEXACT:
3645     min = max = GET2(ecode, 1);
3646     minimize = TRUE;
3647     ecode += 3;
3648     goto REPEATTYPE;
3649    
3650     case OP_TYPEUPTO:
3651     case OP_TYPEMINUPTO:
3652     min = 0;
3653     max = GET2(ecode, 1);
3654     minimize = *ecode == OP_TYPEMINUPTO;
3655     ecode += 3;
3656     goto REPEATTYPE;
3657    
3658 nigel 93 case OP_TYPEPOSSTAR:
3659     possessive = TRUE;
3660     min = 0;
3661     max = INT_MAX;
3662     ecode++;
3663     goto REPEATTYPE;
3664    
3665     case OP_TYPEPOSPLUS:
3666     possessive = TRUE;
3667     min = 1;
3668     max = INT_MAX;
3669     ecode++;
3670     goto REPEATTYPE;
3671    
3672     case OP_TYPEPOSQUERY:
3673     possessive = TRUE;
3674     min = 0;
3675     max = 1;
3676     ecode++;
3677     goto REPEATTYPE;
3678    
3679     case OP_TYPEPOSUPTO:
3680     possessive = TRUE;
3681     min = 0;
3682     max = GET2(ecode, 1);
3683     ecode += 3;
3684     goto REPEATTYPE;
3685    
3686 nigel 77 case OP_TYPESTAR:
3687     case OP_TYPEMINSTAR:
3688     case OP_TYPEPLUS:
3689     case OP_TYPEMINPLUS:
3690     case OP_TYPEQUERY:
3691     case OP_TYPEMINQUERY:
3692     c = *ecode++ - OP_TYPESTAR;
3693     minimize = (c & 1) != 0;
3694     min = rep_min[c]; /* Pick up values from tables; */
3695     max = rep_max[c]; /* zero for max => infinity */
3696     if (max == 0) max = INT_MAX;
3697    
3698     /* Common code for all repeated single character type matches. Note that
3699     in UTF-8 mode, '.' matches a character of any length, but for the other
3700     character types, the valid characters are all one-byte long. */
3701    
3702     REPEATTYPE:
3703     ctype = *ecode++; /* Code for the character type */
3704    
3705     #ifdef SUPPORT_UCP
3706     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3707     {
3708     prop_fail_result = ctype == OP_NOTPROP;
3709     prop_type = *ecode++;
3710 nigel 87 prop_value = *ecode++;
3711 nigel 77 }
3712     else prop_type = -1;
3713     #endif
3714    
3715     /* First, ensure the minimum number of matches are present. Use inline
3716     code for maximizing the speed, and do the type test once at the start
3717 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3718 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3719     and single-bytes. */
3720    
3721     if (min > 0)
3722     {
3723     #ifdef SUPPORT_UCP
3724 nigel 87 if (prop_type >= 0)
3725 nigel 77 {
3726 nigel 87 switch(prop_type)
3727 nigel 77 {
3728 nigel 87 case PT_ANY:
3729 ph10 510 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3730 nigel 87 for (i = 1; i <= min; i++)
3731     {
3732 ph10 427 if (eptr >= md->end_subject)
3733 ph10 426 {
3734 ph10 427 SCHECK_PARTIAL();
3735 ph10 510 MRRETURN(MATCH_NOMATCH);
3736 ph10 427 }
3737 ph10 184 GETCHARINCTEST(c, eptr);
3738 nigel 87 }
3739     break;
3740    
3741     case PT_LAMP:
3742     for (i = 1; i <= min; i++)
3743     {
3744 ph10 427 if (eptr >= md->end_subject)
3745 ph10 426 {
3746 ph10 427 SCHECK_PARTIAL();
3747 ph10 510 MRRETURN(MATCH_NOMATCH);
3748 ph10 427 }
3749 ph10 184 GETCHARINCTEST(c, eptr);
3750 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3751 nigel 87 if ((prop_chartype == ucp_Lu ||
3752     prop_chartype == ucp_Ll ||
3753     prop_chartype == ucp_Lt) == prop_fail_result)
3754 ph10 510 MRRETURN(MATCH_NOMATCH);
3755 nigel 87 }
3756     break;
3757    
3758     case PT_GC:
3759     for (i = 1; i <= min; i++)
3760     {
3761 ph10 427 if (eptr >= md->end_subject)
3762 ph10 426 {
3763 ph10 427 SCHECK_PARTIAL();
3764 ph10 510 MRRETURN(MATCH_NOMATCH);
3765 ph10 427 }
3766 ph10 184 GETCHARINCTEST(c, eptr);
3767 ph10 349 prop_category = UCD_CATEGORY(c);
3768 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3769 ph10 510 MRRETURN(MATCH_NOMATCH);
3770 nigel 87 }
3771     break;
3772    
3773     case PT_PC:
3774     for (i = 1; i <= min; i++)
3775     {
3776 ph10 427 if (eptr >= md->end_subject)
3777 ph10 426 {
3778 ph10 427 SCHECK_PARTIAL();
3779 ph10 510 MRRETURN(MATCH_NOMATCH);
3780 ph10 427 }
3781 ph10 184 GETCHARINCTEST(c, eptr);
3782 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3783 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3784 ph10 510 MRRETURN(MATCH_NOMATCH);
3785 nigel 87 }
3786     break;
3787    
3788     case PT_SC:
3789     for (i = 1; i <= min; i++)
3790     {
3791 ph10 427 if (eptr >= md->end_subject)
3792 ph10 426 {
3793 ph10 427 SCHECK_PARTIAL();
3794 ph10 510 MRRETURN(MATCH_NOMATCH);
3795 ph10 427 }
3796 ph10 184 GETCHARINCTEST(c, eptr);
3797 ph10 349 prop_script = UCD_SCRIPT(c);
3798 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3799 ph10 510 MRRETURN(MATCH_NOMATCH);
3800 nigel 87 }
3801     break;
3802 ph10 527
3803 ph10 517 case PT_ALNUM:
3804     for (i = 1; i <= min; i++)
3805     {
3806     if (eptr >= md->end_subject)
3807     {
3808     SCHECK_PARTIAL();
3809     MRRETURN(MATCH_NOMATCH);
3810     }
3811     GETCHARINCTEST(c, eptr);
3812 ph10 527 prop_category = UCD_CATEGORY(c);
3813     if ((prop_category == ucp_L || prop_category == ucp_N)
3814 ph10 517 == prop_fail_result)
3815     MRRETURN(MATCH_NOMATCH);
3816     }
3817     break;
3818 ph10 527
3819 ph10 517 case PT_SPACE: /* Perl space */
3820     for (i = 1; i <= min; i++)
3821     {
3822     if (eptr >= md->end_subject)
3823     {
3824     SCHECK_PARTIAL();
3825     MRRETURN(MATCH_NOMATCH);
3826     }
3827     GETCHARINCTEST(c, eptr);
3828 ph10 527 prop_category = UCD_CATEGORY(c);
3829     if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3830     c == CHAR_FF || c == CHAR_CR)
3831 ph10 517 == prop_fail_result)
3832     MRRETURN(MATCH_NOMATCH);
3833     }
3834     break;
3835 ph10 527
3836 ph10 517 case PT_PXSPACE: /* POSIX space */
3837     for (i = 1; i <= min; i++)
3838     {
3839     if (eptr >= md->end_subject)
3840     {
3841     SCHECK_PARTIAL();
3842     MRRETURN(MATCH_NOMATCH);
3843     }
3844     GETCHARINCTEST(c, eptr);
3845 ph10 527 prop_category = UCD_CATEGORY(c);
3846     if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3847     c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3848 ph10 517 == prop_fail_result)
3849     MRRETURN(MATCH_NOMATCH);
3850     }
3851     break;
3852 ph10 527
3853     case PT_WORD:
3854 ph10 517 for (i = 1; i <= min; i++)
3855     {
3856     if (eptr >= md->end_subject)
3857     {
3858     SCHECK_PARTIAL();
3859     MRRETURN(MATCH_NOMATCH);
3860     }
3861     GETCHARINCTEST(c, eptr);
3862 ph10 527 prop_category = UCD_CATEGORY(c);
3863 ph10 517 if ((prop_category == ucp_L || prop_category == ucp_N ||
3864 ph10 527 c == CHAR_UNDERSCORE)
3865 ph10 517 == prop_fail_result)
3866     MRRETURN(MATCH_NOMATCH);
3867     }
3868     break;
3869 ph10 527
3870 ph10 517 /* This should not occur */
3871 nigel 87
3872     default:
3873     RRETURN(PCRE_ERROR_INTERNAL);
3874 nigel 77 }
3875     }
3876    
3877     /* Match extended Unicode sequences. We will get here only if the
3878     support is in the binary; otherwise a compile-time error occurs. */
3879    
3880     else if (ctype == OP_EXTUNI)
3881     {
3882     for (i = 1; i <= min; i++)
3883     {
3884 ph10 427 if (eptr >= md->end_subject)
3885 ph10 426 {
3886 ph10 427 SCHECK_PARTIAL();
3887 ph10 510 MRRETURN(MATCH_NOMATCH);
3888 ph10 427 }
3889 nigel 77 GETCHARINCTEST(c, eptr);
3890 ph10 349 prop_category = UCD_CATEGORY(c);
3891 ph10 510 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3892 nigel 77 while (eptr < md->end_subject)
3893     {
3894     int len = 1;
3895 ph10 426 if (!utf8) c = *eptr;
3896     else { GETCHARLEN