/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 608 - (hide annotations) (download)
Sun Jun 12 16:25:55 2011 UTC (3 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 194678 byte(s)
Fix problems with capturing parens and *ACCEPT with recursion.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 ph10 604 /* Values for setting in md->match_function_type to indicate two special types
61     of call to match(). We do it this way to save on using another stack variable,
62     as stack usage is to be discouraged. */
63 nigel 77
64 ph10 604 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65     #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
74 ph10 210 negative to avoid the external error codes. */
75    
76 ph10 511 #define MATCH_ACCEPT (-999)
77     #define MATCH_COMMIT (-998)
78 ph10 604 #define MATCH_KETRPOS (-997)
79     #define MATCH_PRUNE (-996)
80     #define MATCH_SKIP (-995)
81     #define MATCH_SKIP_ARG (-994)
82     #define MATCH_THEN (-993)
83 ph10 210
84 ph10 510 /* This is a convenience macro for code that occurs many times: it copies the
current mark pointer (markptr) into md->mark and then returns "ra" by way of
RRETURN. It expands to a brace-enclosed block and relies on the names md and
markptr being in scope at the place where it is used. */
85    
86     #define MRRETURN(ra) \
87     { \
88     md->mark = markptr; \
89     RRETURN(ra); \
90     }
91    
92 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
93     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
94     because the offset vector is always a multiple of 3 long. */
95    
96     #define REC_STACK_SAVE_MAX 30
97    
98     /* Min and max values for the common repeats; for the maxima, 0 => infinity.
The two tables are parallel and have six entries each, so the same index must
be used for both. NOTE(review): the entry order presumably corresponds to the
repeats *, *?, +, +?, ?, ?? - confirm against the code that indexes them. */
99    
100     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
101     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
102    
103    
104    
105 ph10 475 #ifdef PCRE_DEBUG
106 nigel 77 /*************************************************
107     * Debugging function to print chars *
108     *************************************************/
109    
110     /* Print a sequence of chars in printable format, stopping at the end of the
111     subject if the requested.
112    
113     Arguments:
114     p points to characters
115     length number to print
116     is_subject TRUE if printing from within md->start_subject
117     md pointer to matching data block, if is_subject is TRUE
118    
119     Returns: nothing
120     */
121    
122     static void
123     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
124     {
125 nigel 93 unsigned int c;
126 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
127     while (length-- > 0)
128     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
129     }
130     #endif
131    
132    
133    
134     /*************************************************
135     * Match a back-reference *
136     *************************************************/
137    
138 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
139     negative, so the match always fails. However, in JavaScript compatibility mode,
140     the length passed is zero. Note that in caseless UTF-8 mode, the number of
141     subject bytes matched may be different to the number of reference bytes.
142 nigel 77
143     Arguments:
144     offset index into the offset vector
145 ph10 595 eptr pointer into the subject
146     length length of reference to be matched (number of bytes)
147 nigel 77 md points to match data block
148 ph10 602 caseless TRUE if caseless
149 nigel 77
150 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
151 nigel 77 */
152    
153 ph10 595 static int
154 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
155 ph10 602 BOOL caseless)
156 nigel 77 {
157 ph10 595 USPTR eptr_start = eptr;
158     register USPTR p = md->start_subject + md->offset_vector[offset];
159 nigel 77
160 ph10 475 #ifdef PCRE_DEBUG
161 nigel 77 if (eptr >= md->end_subject)
162     printf("matching subject <null>");
163     else
164     {
165     printf("matching subject ");
166     pchars(eptr, length, TRUE, md);
167     }
168     printf(" against backref ");
169     pchars(p, length, FALSE, md);
170     printf("\n");
171     #endif
172    
173 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
174 nigel 77
175 ph10 595 if (length < 0) return -1;
176 nigel 77
177 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
178     properly if Unicode properties are supported. Otherwise, we can check only
179     ASCII characters. */
180 nigel 77
181 ph10 602 if (caseless)
182 nigel 77 {
183 ph10 354 #ifdef SUPPORT_UTF8
184     #ifdef SUPPORT_UCP
185     if (md->utf8)
186     {
187 ph10 595 /* Match characters up to the end of the reference. NOTE: the number of
188     bytes matched may differ, because there are some characters whose upper and
189     lower case versions code as different numbers of bytes. For example, U+023A
190     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
191     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
192     the latter. It is important, therefore, to check the length along the
193     reference, not along the subject (earlier code did this wrong). */
194    
195     USPTR endptr = p + length;
196     while (p < endptr)
197 ph10 354 {
198 ph10 358 int c, d;
199 ph10 597 if (eptr >= md->end_subject) return -1;
200 ph10 354 GETCHARINC(c, eptr);
201     GETCHARINC(d, p);
202 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
203 ph10 358 }
204     }
205 ph10 354 else
206     #endif
207     #endif
208    
209     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
210     is no UCP support. */
211 ph10 597 {
212     if (eptr + length > md->end_subject) return -1;
213     while (length-- > 0)
214     { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
215     }
216 nigel 77 }
217 ph10 358
218 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
219     are in UTF-8 mode. */
220 ph10 358
221 nigel 77 else
222 ph10 597 {
223     if (eptr + length > md->end_subject) return -1;
224     while (length-- > 0) if (*p++ != *eptr++) return -1;
225     }
226 nigel 77
227 ph10 595 return eptr - eptr_start;
228 nigel 77 }
229    
230    
231    
232     /***************************************************************************
233     ****************************************************************************
234     RECURSION IN THE match() FUNCTION
235    
236 nigel 87 The match() function is highly recursive, though not every recursive call
237     increases the recursive depth. Nevertheless, some regular expressions can cause
238     it to recurse to a great depth. I was writing for Unix, so I just let it call
239     itself recursively. This uses the stack for saving everything that has to be
240     saved for a recursive call. On Unix, the stack can be large, and this works
241     fine.
242 nigel 77
243 nigel 87 It turns out that on some non-Unix-like systems there are problems with
244     programs that use a lot of stack. (This despite the fact that every last chip
245     has oodles of memory these days, and techniques for extending the stack have
246     been known for decades.) So....
247 nigel 77
248     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
249     calls by keeping local variables that need to be preserved in blocks of memory
250 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
251 nigel 77 achieve this so that the actual code doesn't look very different to what it
252     always used to.
253 ph10 164
254 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
255 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
256     Switzer, the use of longjmp() has been abolished, at the cost of having to
257     provide a unique number for each call to RMATCH. There is no way of generating
258     a sequence of numbers at compile time in C. I have given them names, to make
259     them stand out more clearly.
260    
261     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
262     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
263 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
264     don't have indeterminate values; this has meant that the frame size can be
265 ph10 164 reduced because the result can be "passed back" by straight setting of the
266     variable instead of being passed in the frame.
267 nigel 77 ****************************************************************************
268     ***************************************************************************/
269    
270 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
271     below must be updated in sync. */
272 nigel 77
273 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
274     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
275     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
276     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
277 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
278 ph10 527 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
279 ph10 604 RM61, RM62, RM63, RM64 };
280 ph10 164
281 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
282 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
283 ph10 501 actually used in this definition. */
284 nigel 77
285     #ifndef NO_RECURSE
286     #define REGISTER register
287 ph10 164
288 ph10 475 #ifdef PCRE_DEBUG
289 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
290 nigel 87 { \
291     printf("match() called in line %d\n", __LINE__); \
292 ph10 604 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
293 nigel 87 printf("to line %d\n", __LINE__); \
294     }
295     #define RRETURN(ra) \
296     { \
297     printf("match() returned %d from line %d ", ra, __LINE__); \
298     return ra; \
299     }
300     #else
301 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
302     rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
303 nigel 77 #define RRETURN(ra) return ra
304 nigel 87 #endif
305    
306 nigel 77 #else
307    
308    
309 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
310     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
311     argument of match(), which never changes. */
312 nigel 77
313     #define REGISTER
314    
315 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw)\
316 nigel 77 {\
317 ph10 563 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
318 ph10 534 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
319 ph10 164 frame->Xwhere = rw; \
320     newframe->Xeptr = ra;\
321     newframe->Xecode = rb;\
322 ph10 168 newframe->Xmstart = mstart;\
323 ph10 501 newframe->Xmarkptr = markptr;\
324 ph10 164 newframe->Xoffset_top = rc;\
325 ph10 602 newframe->Xeptrb = re;\
326 ph10 164 newframe->Xrdepth = frame->Xrdepth + 1;\
327     newframe->Xprevframe = frame;\
328     frame = newframe;\
329     DPRINTF(("restarting from line %d\n", __LINE__));\
330     goto HEAP_RECURSE;\
331     L_##rw:\
332     DPRINTF(("jumped back to line %d\n", __LINE__));\
333 nigel 77 }
334    
335     #define RRETURN(ra)\
336     {\
337 ph10 527 heapframe *oldframe = frame;\
338     frame = oldframe->Xprevframe;\
339     (pcre_stack_free)(oldframe);\
340 nigel 77 if (frame != NULL)\
341     {\
342 ph10 164 rrc = ra;\
343     goto HEAP_RETURN;\
344 nigel 77 }\
345     return ra;\
346     }
347    
348    
349     /* Structure for remembering the local variables in a private frame. One
frame is obtained with pcre_stack_malloc for each simulated recursive call
made by RMATCH, and it is released again by RRETURN. The md argument of match()
is not stored, because it never changes. */
350    
351     typedef struct heapframe {
352     struct heapframe *Xprevframe; /* Frame to go back to; NULL marks the top level */
353    
354     /* Function arguments that may change */
355    
356 ph10 409 USPTR Xeptr;
357 nigel 77 const uschar *Xecode;
358 ph10 409 USPTR Xmstart;
359 ph10 501 USPTR Xmarkptr;
360 nigel 77 int Xoffset_top;
361     eptrblock *Xeptrb;
362 nigel 91 unsigned int Xrdepth; /* RMATCH sets this to the caller's value plus one */
363 nigel 77
364     /* Function local variables */
365    
366 ph10 409 USPTR Xcallpat;
367 ph10 406 #ifdef SUPPORT_UTF8
368 ph10 409 USPTR Xcharptr;
369 ph10 406 #endif
370 ph10 409 USPTR Xdata;
371     USPTR Xnext;
372     USPTR Xpp;
373     USPTR Xprev;
374     USPTR Xsaved_eptr;
375 nigel 77
376     recursion_info Xnew_recursive;
377    
378     BOOL Xcur_is_word; /* Also used under the name allow_zero */
379     BOOL Xcondition; /* Also used under the names cbegroup and condassert */
380     BOOL Xprev_is_word; /* Also used under the name matched_once */
381    
382     #ifdef SUPPORT_UCP
383     int Xprop_type;
384 nigel 87 int Xprop_value;
385 nigel 77 int Xprop_fail_result;
386     int Xprop_category;
387     int Xprop_chartype;
388 nigel 87 int Xprop_script;
389 ph10 123 int Xoclength;
390     uschar Xocchars[8];
391 nigel 77 #endif
392    
393 ph10 403 int Xcodelink; /* Also used under the name code_offset */
394 nigel 77 int Xctype;
395 nigel 93 unsigned int Xfc;
396 nigel 77 int Xfi;
397     int Xlength;
398     int Xmax;
399     int Xmin;
400     int Xnumber;
401     int Xoffset;
402     int Xop;
403     int Xsave_capture_last;
404     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
405     int Xstacksave[REC_STACK_SAVE_MAX];
406    
407     eptrblock Xnewptrb;
408    
409 ph10 164 /* Where to jump back to. RMATCH stores its "rw" argument (one of the RMn
numbers) here in the calling frame before switching to the new frame. */
410 nigel 77
411 ph10 164 int Xwhere;
412 ph10 165
413 nigel 77 } heapframe;
414    
415     #endif
416    
417    
418     /***************************************************************************
419     ***************************************************************************/
420    
421    
422    
423     /*************************************************
424     * Match from current position *
425     *************************************************/
426    
427 nigel 93 /* This function is called recursively in many circumstances. Whenever it
428 nigel 77 returns a negative (error) response, the outer incarnation must also return the
429 ph10 426 same response. */
430 nigel 77
431 ph10 426 /* These macros pack up tests that are used for partial matching, and which
432     appear several times in the code. We set the "hit end" flag if the pointer is
433     at the end of the subject and also past the start of the subject (i.e.
434 ph10 427 something has been matched). For hard partial matching, we then return
435     immediately. The second one is used when we already know we are past the end of
436     the subject. */
437 ph10 426
438     #define CHECK_PARTIAL()\
439 ph10 553 if (md->partial != 0 && eptr >= md->end_subject && \
440     eptr > md->start_used_ptr) \
441     { \
442     md->hitend = TRUE; \
443     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
444 ph10 427 }
445 ph10 426
446     #define SCHECK_PARTIAL()\
447 ph10 553 if (md->partial != 0 && eptr > md->start_used_ptr) \
448     { \
449     md->hitend = TRUE; \
450     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
451 ph10 427 }
452 ph10 426
453 ph10 427
454 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
455     the md structure (e.g. utf8, end_subject) into individual variables to improve
456 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
457     made performance worse.
458    
459     Arguments:
460 nigel 93 eptr pointer to current character in subject
461     ecode pointer to current position in compiled code
462 ph10 168 mstart pointer to the current match start position (can be modified
463 ph10 172 by encountering \K)
464 ph10 501 markptr pointer to the most recent MARK name, or NULL
465 nigel 77 offset_top current top pointer
466     md pointer to "static" info for the match
467     eptrb pointer to chain of blocks containing eptr at start of
468     brackets - for testing for empty matches
469 nigel 87 rdepth the recursion depth
470 nigel 77
471     Returns: MATCH_MATCH if matched ) these values are >= 0
472     MATCH_NOMATCH if failed to match )
473 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
474 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
475 nigel 87 (e.g. stopped by repeated call or recursion limit)
476 nigel 77 */
477    
478     static int
479 ph10 510 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
480 ph10 602 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
481 ph10 604 unsigned int rdepth)
482 nigel 77 {
483     /* These variables do not need to be preserved over recursion in this function,
484 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
485     "register" because they are used a lot in loops. */
486 nigel 77
487 nigel 91 register int rrc; /* Returns from recursive calls */
488     register int i; /* Used for loops not involving calls to RMATCH() */
489 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
490 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
491 nigel 77
492 nigel 93 BOOL minimize, possessive; /* Quantifier options */
493 ph10 602 BOOL caseless;
494 ph10 403 int condcode;
495 nigel 93
496 nigel 77 /* When recursion is not being used, all "local" variables that have to be
497     preserved over calls to RMATCH() are part of a "frame" which is obtained from
498     heap storage. Set up the top-level frame here; others are obtained from the
499     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
500    
501     #ifdef NO_RECURSE
502 ph10 563 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
503 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
504 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
505    
506     /* Copy in the original argument variables */
507    
508     frame->Xeptr = eptr;
509     frame->Xecode = ecode;
510 ph10 168 frame->Xmstart = mstart;
511 ph10 501 frame->Xmarkptr = markptr;
512 nigel 77 frame->Xoffset_top = offset_top;
513     frame->Xeptrb = eptrb;
514 nigel 87 frame->Xrdepth = rdepth;
515 nigel 77
516     /* This is where control jumps back to to effect "recursion" */
517    
518     HEAP_RECURSE:
519    
520     /* Macros make the argument variables come from the current frame */
521    
522     #define eptr frame->Xeptr
523     #define ecode frame->Xecode
524 ph10 168 #define mstart frame->Xmstart
525 ph10 501 #define markptr frame->Xmarkptr
526 nigel 77 #define offset_top frame->Xoffset_top
527     #define eptrb frame->Xeptrb
528 nigel 87 #define rdepth frame->Xrdepth
529 nigel 77
530     /* Ditto for the local variables */
531    
532     #ifdef SUPPORT_UTF8
533     #define charptr frame->Xcharptr
534     #endif
535     #define callpat frame->Xcallpat
536 ph10 403 #define codelink frame->Xcodelink
537 nigel 77 #define data frame->Xdata
538     #define next frame->Xnext
539     #define pp frame->Xpp
540     #define prev frame->Xprev
541     #define saved_eptr frame->Xsaved_eptr
542    
543     #define new_recursive frame->Xnew_recursive
544    
545     #define cur_is_word frame->Xcur_is_word
546     #define condition frame->Xcondition
547     #define prev_is_word frame->Xprev_is_word
548    
549     #ifdef SUPPORT_UCP
550     #define prop_type frame->Xprop_type
551 nigel 87 #define prop_value frame->Xprop_value
552 nigel 77 #define prop_fail_result frame->Xprop_fail_result
553     #define prop_category frame->Xprop_category
554     #define prop_chartype frame->Xprop_chartype
555 nigel 87 #define prop_script frame->Xprop_script
556 ph10 115 #define oclength frame->Xoclength
557     #define occhars frame->Xocchars
558 nigel 77 #endif
559    
560     #define ctype frame->Xctype
561     #define fc frame->Xfc
562     #define fi frame->Xfi
563     #define length frame->Xlength
564     #define max frame->Xmax
565     #define min frame->Xmin
566     #define number frame->Xnumber
567     #define offset frame->Xoffset
568     #define op frame->Xop
569     #define save_capture_last frame->Xsave_capture_last
570     #define save_offset1 frame->Xsave_offset1
571     #define save_offset2 frame->Xsave_offset2
572     #define save_offset3 frame->Xsave_offset3
573     #define stacksave frame->Xstacksave
574    
575     #define newptrb frame->Xnewptrb
576    
577     /* When recursion is being used, local variables are allocated on the stack and
578     get preserved during recursion in the normal way. In this environment, fi and
579     i, and fc and c, can be the same variables. */
580    
581 nigel 93 #else /* NO_RECURSE not defined */
582 nigel 77 #define fi i
583     #define fc c
584    
585 ph10 604 /* Many of the following variables are used only in small blocks of the code.
586     My normal style of coding would have declared them within each of those blocks.
587     However, in order to accommodate the version of this code that uses an external
588     "stack" implemented on the heap, it is easier to declare them all here, so the
589     declarations can be cut out in a block. The only declarations within blocks
590     below are for variables that do not have to be preserved over a recursive call
591     to RMATCH(). */
592 nigel 77
593 ph10 604 #ifdef SUPPORT_UTF8
594     const uschar *charptr;
595     #endif
596     const uschar *callpat;
597     const uschar *data;
598     const uschar *next;
599     USPTR pp;
600     const uschar *prev;
601     USPTR saved_eptr;
602    
603     recursion_info new_recursive;
604    
605     BOOL cur_is_word;
606 nigel 87 BOOL condition;
607 nigel 77 BOOL prev_is_word;
608    
609     #ifdef SUPPORT_UCP
610     int prop_type;
611 nigel 87 int prop_value;
612 nigel 77 int prop_fail_result;
613     int prop_category;
614     int prop_chartype;
615 nigel 87 int prop_script;
616 ph10 115 int oclength;
617     uschar occhars[8];
618 nigel 77 #endif
619    
620 ph10 399 int codelink;
621 nigel 77 int ctype;
622     int length;
623     int max;
624     int min;
625     int number;
626     int offset;
627     int op;
628     int save_capture_last;
629     int save_offset1, save_offset2, save_offset3;
630     int stacksave[REC_STACK_SAVE_MAX];
631    
632     eptrblock newptrb;
633 nigel 93 #endif /* NO_RECURSE */
634 nigel 77
635 ph10 604 /* To save space on the stack and in the heap frame, I have doubled up on some
636     of the local variables that are used only in localised parts of the code, but
637     still need to be preserved over recursive calls of match(). These macros define
638     the alternative names that are used. */
639    
640     #define allow_zero cur_is_word
641     #define cbegroup condition
642     #define code_offset codelink
643     #define condassert condition
644     #define matched_once prev_is_word
645    
646 nigel 77 /* These statements are here to stop the compiler complaining about uninitialized
647     variables. */
648    
649     #ifdef SUPPORT_UCP
650 nigel 87 prop_value = 0;
651 nigel 77 prop_fail_result = 0;
652     #endif
653    
654 nigel 93
655 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
656     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
657     used. Thanks to Ian Taylor for noticing this possibility and sending the
658     original patch. */
659    
660     TAIL_RECURSE:
661    
662 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
663     are specified by the macro RMATCH and RRETURN is used to return. When
664     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
665 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
666 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
667     complicated macro. It has to be used in one particular way. This shouldn't,
668     however, impact performance when true recursion is being used. */
669 nigel 77
670 ph10 164 #ifdef SUPPORT_UTF8
671     utf8 = md->utf8; /* Local copy of the flag */
672     #else
673     utf8 = FALSE;
674     #endif
675    
676 nigel 87 /* First check that we haven't called match() too many times, or that we
677     haven't exceeded the recursive call limit. */
678    
679 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
680 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
681 nigel 77
682 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
683 ph10 604 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
684     done this way to save having to use another function argument, which would take
685     up space on the stack. See also MATCH_CONDASSERT below.
686 nigel 77
687 ph10 604 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
688     such remembered pointers, to be checked when we hit the closing ket, in order
689     to break infinite loops that match no characters. When match() is called in
690     other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
691     NOT be used with tail recursion, because the memory block that is used is on
692     the stack, so a new one may be required for each match(). */
693    
694     if (md->match_function_type == MATCH_CBEGROUP)
695 nigel 77 {
696 ph10 197 newptrb.epb_saved_eptr = eptr;
697     newptrb.epb_prev = eptrb;
698     eptrb = &newptrb;
699 ph10 604 md->match_function_type = 0;
700 nigel 77 }
701    
702 nigel 93 /* Now start processing the opcodes. */
703 nigel 77
704     for (;;)
705     {
706 nigel 93 minimize = possessive = FALSE;
707 nigel 77 op = *ecode;
708 ph10 604
709 nigel 93 switch(op)
710     {
711 ph10 510 case OP_MARK:
712     markptr = ecode + 2;
713     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
714 ph10 604 eptrb, RM55);
715 ph10 512
716     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
717     argument, and we must check whether that argument matches this MARK's
718     argument. It is passed back in md->start_match_ptr (an overloading of that
719     variable). If it does match, we reset that variable to the current subject
720     position and return MATCH_SKIP. Otherwise, pass back the return code
721 ph10 510 unaltered. */
722 ph10 512
723     if (rrc == MATCH_SKIP_ARG &&
724 ph10 510 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
725     {
726     md->start_match_ptr = eptr;
727     RRETURN(MATCH_SKIP);
728     }
729    
730 ph10 512 if (md->mark == NULL) md->mark = markptr;
731 ph10 510 RRETURN(rrc);
732    
733 ph10 210 case OP_FAIL:
734 ph10 510 MRRETURN(MATCH_NOMATCH);
735 ph10 211
736 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
737 ph10 553
738 ph10 510 case OP_COMMIT:
739     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
740 ph10 604 eptrb, RM52);
741 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
742 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
743     rrc != MATCH_THEN)
744 ph10 551 RRETURN(rrc);
745 ph10 510 MRRETURN(MATCH_COMMIT);
746    
747 ph10 551 /* PRUNE overrides THEN */
748 ph10 553
749 ph10 210 case OP_PRUNE:
750     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
751 ph10 604 eptrb, RM51);
752 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
753 ph10 510 MRRETURN(MATCH_PRUNE);
754 ph10 211
755 ph10 510 case OP_PRUNE_ARG:
756     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 ph10 604 eptrb, RM56);
758 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
759 ph10 510 md->mark = ecode + 2;
760     RRETURN(MATCH_PRUNE);
761 ph10 211
762 ph10 551 /* SKIP overrides PRUNE and THEN */
763 ph10 553
764 ph10 210 case OP_SKIP:
765     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
766 ph10 604 eptrb, RM53);
767 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 ph10 551 RRETURN(rrc);
769 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
770 ph10 510 MRRETURN(MATCH_SKIP);
771 ph10 211
772 ph10 510 case OP_SKIP_ARG:
773     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
774 ph10 604 eptrb, RM57);
775 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
776 ph10 551 RRETURN(rrc);
777 ph10 512
778     /* Pass back the current skip name by overloading md->start_match_ptr and
779     returning the special MATCH_SKIP_ARG return code. This will either be
780     caught by a matching MARK, or get to the top, where it is treated the same
781 ph10 510 as PRUNE. */
782 ph10 512
783 ph10 510 md->start_match_ptr = ecode + 2;
784 ph10 512 RRETURN(MATCH_SKIP_ARG);
785 ph10 553
786 ph10 550 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
787 ph10 553 the alt that is at the start of the current branch. This makes it possible
788     to skip back past alternatives that precede the THEN within the current
789     branch. */
790 ph10 512
791 ph10 210 case OP_THEN:
792     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
793 ph10 604 eptrb, RM54);
794 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
795 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
796 ph10 510 MRRETURN(MATCH_THEN);
797    
798     case OP_THEN_ARG:
799 ph10 553 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
800 ph10 604 offset_top, md, eptrb, RM58);
801 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
803     md->mark = ecode + LINK_SIZE + 2;
804 ph10 212 RRETURN(MATCH_THEN);
805 ph10 211
806 ph10 604 /* Handle a capturing bracket, other than those that are possessive with an
807     unlimited repeat. If there is space in the offset vector, save the current
808     subject position in the working slot at the top of the vector. We mustn't
809     change the current values of the data slot, because they may be set from a
810     previous iteration of this group, and be referred to by a reference inside
811     the group. If we fail to match, we need to restore this value and also the
812 nigel 93 values of the final offsets, in case they were set by a previous iteration
813     of the same bracket.
814 nigel 77
815 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
816     a non-capturing bracket. Don't worry about setting the flag for the error
817     case here; that is handled in the code for KET. */
818 nigel 77
819 nigel 93 case OP_CBRA:
820     case OP_SCBRA:
821     number = GET2(ecode, 1+LINK_SIZE);
822 nigel 77 offset = number << 1;
823 ph10 604
824 ph10 475 #ifdef PCRE_DEBUG
825 nigel 93 printf("start bracket %d\n", number);
826     printf("subject=");
827 nigel 77 pchars(eptr, 16, TRUE, md);
828     printf("\n");
829     #endif
830    
831     if (offset < md->offset_max)
832     {
833     save_offset1 = md->offset_vector[offset];
834     save_offset2 = md->offset_vector[offset+1];
835     save_offset3 = md->offset_vector[md->offset_end - number];
836     save_capture_last = md->capture_last;
837    
838     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
839 ph10 531 md->offset_vector[md->offset_end - number] =
840 ph10 530 (int)(eptr - md->start_subject);
841 nigel 77
842 ph10 604 for (;;)
843 nigel 77 {
844 ph10 604 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
845     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
846     eptrb, RM1);
847 ph10 550 if (rrc != MATCH_NOMATCH &&
848     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
849     RRETURN(rrc);
850 nigel 77 md->capture_last = save_capture_last;
851     ecode += GET(ecode, 1);
852 ph10 604 if (*ecode != OP_ALT) break;
853 nigel 77 }
854    
855     DPRINTF(("bracket %d failed\n", number));
856    
857     md->offset_vector[offset] = save_offset1;
858     md->offset_vector[offset+1] = save_offset2;
859     md->offset_vector[md->offset_end - number] = save_offset3;
860    
861 ph10 510 if (rrc != MATCH_THEN) md->mark = markptr;
862 nigel 77 RRETURN(MATCH_NOMATCH);
863     }
864    
865 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
866     as a non-capturing bracket. */
867 nigel 77
868 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
869     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870    
871 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
872 nigel 77
873 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
874     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
875    
876 ph10 604 /* Non-capturing bracket, except for possessive with unlimited repeat. Loop
877     for all the alternatives. When we get to the final alternative within the
878     brackets, we would return the result of a recursive call to match()
879     whatever happened. We can reduce stack usage by turning this into a tail
880     recursion, except in the case of a possibly empty group.*/
881 nigel 77
882 nigel 93 case OP_BRA:
883     case OP_SBRA:
884     DPRINTF(("start non-capturing bracket\n"));
885 nigel 91 for (;;)
886 nigel 77 {
887 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
888 nigel 93 {
889 ph10 604 if (op >= OP_SBRA) /* Possibly empty group */
890 ph10 197 {
891 ph10 604 md->match_function_type = MATCH_CBEGROUP;
892     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
893     RM48);
894     if (rrc == MATCH_NOMATCH) md->mark = markptr;
895     RRETURN(rrc);
896     }
897     /* Not a possibly empty group; use tail recursion */
898     ecode += _pcre_OP_lengths[*ecode];
899     DPRINTF(("bracket 0 tail recursion\n"));
900     goto TAIL_RECURSE;
901 nigel 93 }
902 nigel 91
903     /* For non-final alternatives, continue the loop for a NOMATCH result;
904     otherwise return. */
905    
906 ph10 604 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
907 ph10 602 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
908 ph10 604 RM2);
909 ph10 550 if (rrc != MATCH_NOMATCH &&
910     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
911     RRETURN(rrc);
912 nigel 77 ecode += GET(ecode, 1);
913     }
914 nigel 91 /* Control never reaches here. */
915 nigel 77
916 ph10 604 /* Handle possessive capturing brackets with an unlimited repeat. We come
917     here from BRAZERO with allow_zero set TRUE. The offset_vector values are
918     handled similarly to the normal case above. However, the matching is
919     different. The end of these brackets will always be OP_KETRPOS, which
920     returns MATCH_KETRPOS without going further in the pattern. By this means
921     we can handle the group by iteration rather than recursion, thereby
922     reducing the amount of stack needed. */
923    
924     case OP_CBRAPOS:
925     case OP_SCBRAPOS:
926     allow_zero = FALSE;
927    
928     POSSESSIVE_CAPTURE:
929     number = GET2(ecode, 1+LINK_SIZE);
930     offset = number << 1;
931    
932     #ifdef PCRE_DEBUG
933     printf("start possessive bracket %d\n", number);
934     printf("subject=");
935     pchars(eptr, 16, TRUE, md);
936     printf("\n");
937     #endif
938    
939     if (offset < md->offset_max)
940     {
941     matched_once = FALSE;
942     code_offset = ecode - md->start_code;
943    
944     save_offset1 = md->offset_vector[offset];
945     save_offset2 = md->offset_vector[offset+1];
946     save_offset3 = md->offset_vector[md->offset_end - number];
947     save_capture_last = md->capture_last;
948    
949     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
950    
951     /* Each time round the loop, save the current subject position for use
952     when the group matches. For MATCH_MATCH, the group has matched, so we
953     restart it with a new subject starting position, remembering that we had
954     at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
955     usual. If we haven't matched any alternatives in any iteration, check to
956     see if a previous iteration matched. If so, the group has matched;
957     continue from afterwards. Otherwise it has failed; restore the previous
958     capture values before returning NOMATCH. */
959    
960     for (;;)
961     {
962     md->offset_vector[md->offset_end - number] =
963     (int)(eptr - md->start_subject);
964     if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
965     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
966     eptrb, RM63);
967     if (rrc == MATCH_KETRPOS)
968     {
969     offset_top = md->end_offset_top;
970     eptr = md->end_match_ptr;
971     ecode = md->start_code + code_offset;
972     save_capture_last = md->capture_last;
973     matched_once = TRUE;
974     continue;
975     }
976     if (rrc != MATCH_NOMATCH &&
977     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
978     RRETURN(rrc);
979     md->capture_last = save_capture_last;
980     ecode += GET(ecode, 1);
981     if (*ecode != OP_ALT) break;
982     }
983    
984     if (!matched_once)
985     {
986     md->offset_vector[offset] = save_offset1;
987     md->offset_vector[offset+1] = save_offset2;
988     md->offset_vector[md->offset_end - number] = save_offset3;
989     }
990    
991     if (rrc != MATCH_THEN) md->mark = markptr;
992     if (allow_zero || matched_once)
993     {
994     ecode += 1 + LINK_SIZE;
995     break;
996     }
997    
998     RRETURN(MATCH_NOMATCH);
999     }
1000    
1001     /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1002     as a non-capturing bracket. */
1003    
1004     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1005     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1006    
1007     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1008    
1009     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1010     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1011    
1012     /* Non-capturing possessive bracket with unlimited repeat. We come here
1013     from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1014     without the capturing complication. It is written out separately for speed
1015     and cleanliness. */
1016    
1017     case OP_BRAPOS:
1018     case OP_SBRAPOS:
1019     allow_zero = FALSE;
1020    
1021     POSSESSIVE_NON_CAPTURE:
1022     matched_once = FALSE;
1023     code_offset = ecode - md->start_code;
1024    
1025     for (;;)
1026     {
1027     if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1028     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1029     eptrb, RM64);
1030     if (rrc == MATCH_KETRPOS)
1031     {
1032     eptr = md->end_match_ptr;
1033     ecode = md->start_code + code_offset;
1034     matched_once = TRUE;
1035     continue;
1036     }
1037     if (rrc != MATCH_NOMATCH &&
1038     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1039     RRETURN(rrc);
1040     ecode += GET(ecode, 1);
1041     if (*ecode != OP_ALT) break;
1042     }
1043    
1044     if (matched_once || allow_zero)
1045     {
1046     ecode += 1 + LINK_SIZE;
1047     break;
1048     }
1049     RRETURN(MATCH_NOMATCH);
1050    
1051     /* Control never reaches here. */
1052    
1053 nigel 77 /* Conditional group: compilation checked that there are no more than
1054     two branches. If the condition is false, skipping the first branch takes us
1055     past the end if there is only one branch, but that's OK because that is
1056 nigel 91 exactly what going to the ket would do. As there is only one branch to be
1057     obeyed, we can use tail recursion to avoid using another stack frame. */
1058 nigel 77
1059     case OP_COND:
1060 nigel 93 case OP_SCOND:
1061 ph10 604 codelink = GET(ecode, 1);
1062 ph10 406
1063 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
1064     inserted between OP_COND and an assertion condition. */
1065 ph10 392
1066 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1067     {
1068     if (pcre_callout != NULL)
1069     {
1070     pcre_callout_block cb;
1071     cb.version = 1; /* Version 1 of the callout block */
1072     cb.callout_number = ecode[LINK_SIZE+2];
1073     cb.offset_vector = md->offset_vector;
1074     cb.subject = (PCRE_SPTR)md->start_subject;
1075 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1076     cb.start_match = (int)(mstart - md->start_subject);
1077     cb.current_position = (int)(eptr - md->start_subject);
1078 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1079     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1080     cb.capture_top = offset_top/2;
1081     cb.capture_last = md->capture_last;
1082     cb.callout_data = md->callout_data;
1083 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1084 ph10 381 if (rrc < 0) RRETURN(rrc);
1085     }
1086     ecode += _pcre_OP_lengths[OP_CALLOUT];
1087     }
1088 ph10 392
1089 ph10 399 condcode = ecode[LINK_SIZE+1];
1090 ph10 406
1091 ph10 381 /* Now see what the actual condition is */
1092 ph10 392
1093 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1094 nigel 77 {
1095 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
1096     {
1097 ph10 461 condition = FALSE;
1098     ecode += GET(ecode, 1);
1099     }
1100 ph10 459 else
1101 ph10 461 {
1102 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1103     condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1104 ph10 461
1105 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
1106     false, but the test was set up by name, scan the table to see if the
1107     name refers to any other numbers, and test them. The condition is true
1108     if any one is set. */
1109 ph10 461
1110 ph10 459 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1111     {
1112     uschar *slotA = md->name_table;
1113     for (i = 0; i < md->name_count; i++)
1114 ph10 461 {
1115     if (GET2(slotA, 0) == recno) break;
1116 ph10 459 slotA += md->name_entry_size;
1117     }
1118 ph10 461
1119 ph10 459 /* Found a name for the number - there can be only one; duplicate
1120     names for different numbers are allowed, but not vice versa. First
1121     scan down for duplicates. */
1122 ph10 461
1123 ph10 459 if (i < md->name_count)
1124 ph10 461 {
1125 ph10 459 uschar *slotB = slotA;
1126     while (slotB > md->name_table)
1127     {
1128     slotB -= md->name_entry_size;
1129     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1130     {
1131     condition = GET2(slotB, 0) == md->recursive->group_num;
1132 ph10 461 if (condition) break;
1133     }
1134 ph10 459 else break;
1135 ph10 461 }
1136    
1137 ph10 459 /* Scan up for duplicates */
1138 ph10 461
1139 ph10 459 if (!condition)
1140 ph10 461 {
1141 ph10 459 slotB = slotA;
1142     for (i++; i < md->name_count; i++)
1143     {
1144     slotB += md->name_entry_size;
1145     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1146     {
1147     condition = GET2(slotB, 0) == md->recursive->group_num;
1148     if (condition) break;
1149 ph10 461 }
1150 ph10 459 else break;
1151 ph10 461 }
1152     }
1153 ph10 459 }
1154 ph10 461 }
1155    
1156 ph10 459 /* Choose branch according to the condition */
1157 ph10 461
1158 ph10 459 ecode += condition? 3 : GET(ecode, 1);
1159     }
1160 ph10 461 }
1161 nigel 93
1162 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1163 nigel 93 {
1164 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1165 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1166 ph10 461
1167 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1168 ph10 461 scan the table to see if the name refers to any other numbers, and test
1169     them. The condition is true if any one is set. This is tediously similar
1170     to the code above, but not close enough to try to amalgamate. */
1171    
1172 ph10 459 if (!condition && condcode == OP_NCREF)
1173     {
1174 ph10 461 int refno = offset >> 1;
1175 ph10 459 uschar *slotA = md->name_table;
1176 ph10 461
1177 ph10 459 for (i = 0; i < md->name_count; i++)
1178 ph10 461 {
1179     if (GET2(slotA, 0) == refno) break;
1180 ph10 459 slotA += md->name_entry_size;
1181     }
1182 ph10 461
1183     /* Found a name for the number - there can be only one; duplicate names
1184     for different numbers are allowed, but not vice versa. First scan down
1185 ph10 459 for duplicates. */
1186 ph10 461
1187 ph10 459 if (i < md->name_count)
1188 ph10 461 {
1189 ph10 459 uschar *slotB = slotA;
1190     while (slotB > md->name_table)
1191     {
1192     slotB -= md->name_entry_size;
1193     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1194     {
1195     offset = GET2(slotB, 0) << 1;
1196 ph10 461 condition = offset < offset_top &&
1197 ph10 459 md->offset_vector[offset] >= 0;
1198 ph10 461 if (condition) break;
1199     }
1200 ph10 459 else break;
1201 ph10 461 }
1202    
1203 ph10 459 /* Scan up for duplicates */
1204 ph10 461
1205 ph10 459 if (!condition)
1206 ph10 461 {
1207 ph10 459 slotB = slotA;
1208     for (i++; i < md->name_count; i++)
1209     {
1210     slotB += md->name_entry_size;
1211     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1212     {
1213     offset = GET2(slotB, 0) << 1;
1214 ph10 461 condition = offset < offset_top &&
1215 ph10 459 md->offset_vector[offset] >= 0;
1216 ph10 461 if (condition) break;
1217     }
1218 ph10 459 else break;
1219 ph10 461 }
1220     }
1221 ph10 459 }
1222 ph10 461 }
1223    
1224 ph10 459 /* Choose branch according to the condition */
1225    
1226 nigel 93 ecode += condition? 3 : GET(ecode, 1);
1227 nigel 77 }
1228    
1229 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1230 nigel 93 {
1231     condition = FALSE;
1232     ecode += GET(ecode, 1);
1233     }
1234    
1235 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1236 ph10 604 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1237     an assertion. */
1238 nigel 77
1239     else
1240     {
1241 ph10 604 md->match_function_type = MATCH_CONDASSERT;
1242     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1243 nigel 77 if (rrc == MATCH_MATCH)
1244     {
1245 nigel 93 condition = TRUE;
1246     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1247 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1248     }
1249 ph10 550 else if (rrc != MATCH_NOMATCH &&
1250     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1251 nigel 77 {
1252     RRETURN(rrc); /* Need braces because of following else */
1253     }
1254 nigel 93 else
1255     {
1256     condition = FALSE;
1257 ph10 399 ecode += codelink;
1258 nigel 93 }
1259     }
1260 nigel 91
1261 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
1262 ph10 197 we can use tail recursion to avoid using another stack frame, except when
1263 ph10 604 we have an unlimited repeat of a possibly empty group. If the second
1264     alternative doesn't exist, we can just plough on. */
1265 nigel 91
1266 nigel 93 if (condition || *ecode == OP_ALT)
1267     {
1268 nigel 91 ecode += 1 + LINK_SIZE;
1269 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
1270     {
1271 ph10 604 md->match_function_type = MATCH_CBEGROUP;
1272     RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1273 ph10 197 RRETURN(rrc);
1274     }
1275 ph10 604 else goto TAIL_RECURSE;
1276 nigel 77 }
1277 ph10 395 else /* Condition false & no alternative */
1278 nigel 93 {
1279     ecode += 1 + LINK_SIZE;
1280     }
1281     break;
1282 nigel 77
1283 ph10 461
1284 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1285     to close any currently open capturing brackets. */
1286 ph10 461
1287 ph10 447 case OP_CLOSE:
1288 ph10 461 number = GET2(ecode, 1);
1289 ph10 447 offset = number << 1;
1290 ph10 461
1291 ph10 475 #ifdef PCRE_DEBUG
1292 ph10 447 printf("end bracket %d at *ACCEPT", number);
1293     printf("\n");
1294     #endif
1295 nigel 77
1296 ph10 447 md->capture_last = number;
1297     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1298     {
1299     md->offset_vector[offset] =
1300     md->offset_vector[md->offset_end - number];
1301 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1302 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1303     }
1304     ecode += 3;
1305 ph10 461 break;
1306 ph10 447
1307    
1308 ph10 608 /* End of the pattern, either real or forced. If we are in a recursion, we
1309     should restore the offsets appropriately, and if it's a top-level
1310     recursion, continue from after the call. */
1311 nigel 77
1312 ph10 210 case OP_ACCEPT:
1313 nigel 77 case OP_END:
1314 ph10 608 if (md->recursive != NULL)
1315 nigel 77 {
1316     recursion_info *rec = md->recursive;
1317     md->recursive = rec->prevrec;
1318 ph10 608 memmove(md->offset_vector, rec->offset_save,
1319 nigel 77 rec->saved_max * sizeof(int));
1320 ph10 461 offset_top = rec->save_offset_top;
1321 ph10 608 if (rec->group_num == 0)
1322     {
1323     ecode = rec->after_call;
1324     break;
1325     }
1326 nigel 77 }
1327    
1328 ph10 442 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1329     set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1330     the subject. In both cases, backtracking will then try other alternatives,
1331     if any. */
1332 ph10 443
1333 ph10 608 else if (eptr == mstart &&
1334 ph10 442 (md->notempty ||
1335 ph10 443 (md->notempty_atstart &&
1336 ph10 442 mstart == md->start_subject + md->start_offset)))
1337 ph10 510 MRRETURN(MATCH_NOMATCH);
1338 ph10 443
1339 ph10 442 /* Otherwise, we have a match. */
1340 ph10 608
1341 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1342     md->end_offset_top = offset_top; /* and how many extracts were taken */
1343 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1344 nigel 77
1345 ph10 512 /* For some reason, the macros don't work properly if an expression is
1346     given as the argument to MRRETURN when the heap is in use. */
1347    
1348     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1349     MRRETURN(rrc);
1350    
1351 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1352     matching won't pass the KET for an assertion. If any one branch matches,
1353     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1354     start of each branch to move the current point backwards, so the code at
1355 ph10 604 this level is identical to the lookahead case. When the assertion is part
1356     of a condition, we want to return immediately afterwards. The caller of
1357     this incarnation of the match() function will have set MATCH_CONDASSERT in
1358     md->match_function_type, and one of these opcodes will be the first opcode
1359     that is processed. We use a local variable that is preserved over calls to
1360     match() to remember this case. */
1361 nigel 77
1362     case OP_ASSERT:
1363     case OP_ASSERTBACK:
1364 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1365     {
1366     condassert = TRUE;
1367     md->match_function_type = 0;
1368     }
1369     else condassert = FALSE;
1370    
1371 nigel 77 do
1372     {
1373 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1374 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1375 ph10 500 {
1376     mstart = md->start_match_ptr; /* In case \K reset it */
1377     break;
1378 ph10 501 }
1379 ph10 550 if (rrc != MATCH_NOMATCH &&
1380     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1381     RRETURN(rrc);
1382 nigel 77 ecode += GET(ecode, 1);
1383     }
1384     while (*ecode == OP_ALT);
1385 ph10 604
1386 ph10 510 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1387 nigel 77
1388     /* If checking an assertion for a condition, return MATCH_MATCH. */
1389    
1390 ph10 604 if (condassert) RRETURN(MATCH_MATCH);
1391 nigel 77
1392     /* Continue from after the assertion, updating the offsets high water
1393     mark, since extracts may have been taken during the assertion. */
1394    
1395     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1396     ecode += 1 + LINK_SIZE;
1397     offset_top = md->end_offset_top;
1398     continue;
1399    
1400 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1401 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1402 ph10 473 branches. */
1403 nigel 77
1404     case OP_ASSERT_NOT:
1405     case OP_ASSERTBACK_NOT:
1406 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1407     {
1408     condassert = TRUE;
1409     md->match_function_type = 0;
1410     }
1411     else condassert = FALSE;
1412    
1413 nigel 77 do
1414     {
1415 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1416 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1417 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1418     {
1419     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1420 ph10 482 break;
1421     }
1422 ph10 550 if (rrc != MATCH_NOMATCH &&
1423     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1424     RRETURN(rrc);
1425 nigel 77 ecode += GET(ecode,1);
1426     }
1427     while (*ecode == OP_ALT);
1428    
1429 ph10 604 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1430    
1431 nigel 77 ecode += 1 + LINK_SIZE;
1432     continue;
1433    
1434     /* Move the subject pointer back. This occurs only at the start of
1435     each branch of a lookbehind assertion. If we are too close to the start to
1436     move back, this match function fails. When working with UTF-8 we move
1437     back a number of characters, not bytes. */
1438    
1439     case OP_REVERSE:
1440     #ifdef SUPPORT_UTF8
1441     if (utf8)
1442     {
1443 nigel 93 i = GET(ecode, 1);
1444     while (i-- > 0)
1445 nigel 77 {
1446     eptr--;
1447 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1448 ph10 207 BACKCHAR(eptr);
1449 nigel 77 }
1450     }
1451     else
1452     #endif
1453    
1454     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1455    
1456     {
1457 nigel 93 eptr -= GET(ecode, 1);
1458 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1459 nigel 77 }
1460    
1461 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1462 nigel 77
1463 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1464 nigel 77 ecode += 1 + LINK_SIZE;
1465     break;
1466    
1467     /* The callout item calls an external function, if one is provided, passing
1468     details of the match so far. This is mainly for debugging, though the
1469     function is able to force a failure. */
1470    
1471     case OP_CALLOUT:
1472     if (pcre_callout != NULL)
1473     {
1474     pcre_callout_block cb;
1475     cb.version = 1; /* Version 1 of the callout block */
1476     cb.callout_number = ecode[1];
1477     cb.offset_vector = md->offset_vector;
1478 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1479 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1480     cb.start_match = (int)(mstart - md->start_subject);
1481     cb.current_position = (int)(eptr - md->start_subject);
1482 nigel 77 cb.pattern_position = GET(ecode, 2);
1483     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1484     cb.capture_top = offset_top/2;
1485     cb.capture_last = md->capture_last;
1486     cb.callout_data = md->callout_data;
1487 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1488 nigel 77 if (rrc < 0) RRETURN(rrc);
1489     }
1490     ecode += 2 + 2*LINK_SIZE;
1491     break;
1492    
1493     /* Recursion either matches the current regex, or some subexpression. The
1494     offset data is the offset to the starting bracket from the start of the
1495     whole pattern. (This is so that it works from duplicated subpatterns.)
1496    
1497     If there are any capturing brackets started but not finished, we have to
1498     save their starting points and reinstate them after the recursion. However,
1499     we don't know how many such there are (offset_top records the completed
1500     total) so we just have to save all the potential data. There may be up to
1501     65535 such values, which is too large to put on the stack, but using malloc
1502     for small numbers seems expensive. As a compromise, the stack is used when
1503     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1504     is used. If the malloc fails, the recursion is not attempted: the code
1505     below abandons the match at this point and passes PCRE_ERROR_NOMEMORY
1506     back to the caller via RRETURN.
1507    
1508     There are also other values that have to be saved. We use a chained
1509     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1510     for the original version of this logic. */
1511    
1512     case OP_RECURSE:
1513     {
1514     callpat = md->start_code + GET(ecode, 1);
1515 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1516     GET2(callpat, 1 + LINK_SIZE);
1517 nigel 77
1518     /* Add to "recursing stack" */
1519    
1520     new_recursive.prevrec = md->recursive;
1521     md->recursive = &new_recursive;
1522    
1523     /* Find where to continue from afterwards */
1524    
1525     ecode += 1 + LINK_SIZE;
1526     new_recursive.after_call = ecode;
1527    
1528     /* Now save the offset data. */
1529    
1530     new_recursive.saved_max = md->offset_end;
1531     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1532     new_recursive.offset_save = stacksave;
1533     else
1534     {
1535     new_recursive.offset_save =
1536     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1537     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1538     }
1539    
1540     memcpy(new_recursive.offset_save, md->offset_vector,
1541     new_recursive.saved_max * sizeof(int));
1542 ph10 461 new_recursive.save_offset_top = offset_top;
1543 ph10 608
1544 nigel 77 /* OK, now we can do the recursion. For each top-level alternative we
1545     restore the offset and recursion data. */
1546    
1547     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1548 ph10 604 cbegroup = (*callpat >= OP_SBRA);
1549 nigel 77 do
1550     {
1551 ph10 604 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1552 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1553 ph10 604 md, eptrb, RM6);
1554 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1555 nigel 77 {
1556 nigel 87 DPRINTF(("Recursion matched\n"));
1557 nigel 77 md->recursive = new_recursive.prevrec;
1558     if (new_recursive.offset_save != stacksave)
1559     (pcre_free)(new_recursive.offset_save);
1560 ph10 510 MRRETURN(MATCH_MATCH);
1561 nigel 77 }
1562 ph10 550 else if (rrc != MATCH_NOMATCH &&
1563     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1564 nigel 87 {
1565     DPRINTF(("Recursion gave error %d\n", rrc));
1566 ph10 400 if (new_recursive.offset_save != stacksave)
1567     (pcre_free)(new_recursive.offset_save);
1568 nigel 87 RRETURN(rrc);
1569     }
1570 nigel 77
1571     md->recursive = &new_recursive;
1572     memcpy(md->offset_vector, new_recursive.offset_save,
1573     new_recursive.saved_max * sizeof(int));
1574     callpat += GET(callpat, 1);
1575     }
1576     while (*callpat == OP_ALT);
1577    
1578     DPRINTF(("Recursion didn't match\n"));
1579     md->recursive = new_recursive.prevrec;
1580     if (new_recursive.offset_save != stacksave)
1581     (pcre_free)(new_recursive.offset_save);
1582 ph10 510 MRRETURN(MATCH_NOMATCH);
1583 nigel 77 }
1584     /* Control never reaches here */
1585    
1586     /* "Once" brackets are like assertion brackets except that after a match,
1587     the point in the subject string is not moved back. Thus there can never be
1588     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1589     Check the alternative branches in turn - the matching won't pass the KET
1590     for this kind of subpattern. If any one branch matches, we carry on as at
1591 ph10 500 the end of a normal bracket, leaving the subject pointer, but resetting
1592     the start-of-match value in case it was changed by \K. */
1593 nigel 77
1594     case OP_ONCE:
1595 nigel 91 prev = ecode;
1596     saved_eptr = eptr;
1597    
1598     do
1599 nigel 77 {
1600 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1601 ph10 511 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1602 ph10 500 {
1603     mstart = md->start_match_ptr;
1604     break;
1605 ph10 501 }
1606 ph10 550 if (rrc != MATCH_NOMATCH &&
1607     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1608     RRETURN(rrc);
1609 nigel 91 ecode += GET(ecode,1);
1610     }
1611     while (*ecode == OP_ALT);
1612 nigel 77
1613 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1614 nigel 77
1615 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1616 nigel 77
1617 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1618     mark, since extracts may have been taken. */
1619 nigel 77
1620 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1621 nigel 77
1622 nigel 91 offset_top = md->end_offset_top;
1623     eptr = md->end_match_ptr;
1624 nigel 77
1625 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1626     happens for a repeating ket if no characters were matched in the group.
1627     This is the forcible breaking of infinite loops as implemented in Perl
1628     5.005. If there is an options reset, it will get obeyed in the normal
1629     course of events. */
1630 nigel 77
1631 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1632     {
1633     ecode += 1+LINK_SIZE;
1634     break;
1635     }
1636 nigel 77
1637 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1638     preceding bracket, in the appropriate order. The second "call" of match()
1639 ph10 602 uses tail recursion, to avoid using another stack frame. */
1640 nigel 77
1641 nigel 91 if (*ecode == OP_KETRMIN)
1642     {
1643 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM8);
1644 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1645     ecode = prev;
1646     goto TAIL_RECURSE;
1647 nigel 77 }
1648 nigel 91 else /* OP_KETRMAX */
1649     {
1650 ph10 604 md->match_function_type = MATCH_CBEGROUP;
1651     RMATCH(eptr, prev, offset_top, md, eptrb, RM9);
1652 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1653     ecode += 1 + LINK_SIZE;
1654     goto TAIL_RECURSE;
1655     }
1656     /* Control never gets here */
1657 nigel 77
1658     /* An alternation is the end of a branch; scan along to find the end of the
1659     bracketed group and go to there. */
1660    
1661     case OP_ALT:
1662     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1663     break;
1664    
1665 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1666     indicating that it may occur zero times. It may repeat infinitely, or not
1667     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1668     with fixed upper repeat limits are compiled as a number of copies, with the
1669     optional ones preceded by BRAZERO or BRAMINZERO. */
1670 ph10 604
1671 nigel 77 case OP_BRAZERO:
1672 ph10 604 next = ecode + 1;
1673     RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1674     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1675     do next += GET(next, 1); while (*next == OP_ALT);
1676     ecode = next + 1 + LINK_SIZE;
1677 nigel 77 break;
1678 ph10 604
1679 nigel 77 case OP_BRAMINZERO:
1680 ph10 604 next = ecode + 1;
1681     do next += GET(next, 1); while (*next == OP_ALT);
1682     RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1683     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1684     ecode++;
1685 nigel 77 break;
1686    
1687 ph10 335 case OP_SKIPZERO:
1688 ph10 604 next = ecode+1;
1689     do next += GET(next,1); while (*next == OP_ALT);
1690     ecode = next + 1 + LINK_SIZE;
1691 ph10 335 break;
1692 ph10 604
1693     /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1694     here; just jump to the group, with allow_zero set TRUE. */
1695    
1696     case OP_BRAPOSZERO:
1697     op = *(++ecode);
1698     allow_zero = TRUE;
1699     if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1700     goto POSSESSIVE_NON_CAPTURE;
1701 ph10 335
1702 nigel 93 /* End of a group, repeated or non-repeating. */
1703 nigel 77
1704     case OP_KET:
1705     case OP_KETRMIN:
1706     case OP_KETRMAX:
1707 ph10 604 case OP_KETRPOS:
1708 nigel 91 prev = ecode - GET(ecode, 1);
1709 nigel 77
1710 nigel 93 /* If this was a group that remembered the subject start, in order to break
1711     infinite repeats of empty string matches, retrieve the subject start from
1712     the chain. Otherwise, set it NULL. */
1713 nigel 77
1714 nigel 93 if (*prev >= OP_SBRA)
1715     {
1716     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1717     eptrb = eptrb->epb_prev; /* Backup to previous group */
1718     }
1719     else saved_eptr = NULL;
1720 nigel 77
1721 ph10 500 /* If we are at the end of an assertion group or an atomic group, stop
1722     matching and return MATCH_MATCH, but record the current high water mark for
1723     use by positive assertions. We also need to record the match start in case
1724     it was changed by \K. */
1725 nigel 93
1726 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1727     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1728     *prev == OP_ONCE)
1729     {
1730     md->end_match_ptr = eptr; /* For ONCE */
1731     md->end_offset_top = offset_top;
1732 ph10 500 md->start_match_ptr = mstart;
1733 ph10 510 MRRETURN(MATCH_MATCH);
1734 nigel 91 }
1735 nigel 77
1736 nigel 93 /* For capturing groups we have to check the group number back at the start
1737     and if necessary complete handling an extraction by setting the offsets and
1738     bumping the high water mark. Note that whole-pattern recursion is coded as
1739     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1740     when the OP_END is reached. Other recursion is handled here. */
1741 nigel 77
1742 ph10 604 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1743     *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1744 nigel 91 {
1745 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1746 nigel 91 offset = number << 1;
1747 ph10 461
1748 ph10 475 #ifdef PCRE_DEBUG
1749 nigel 91 printf("end bracket %d", number);
1750     printf("\n");
1751 nigel 77 #endif
1752    
1753 nigel 93 md->capture_last = number;
1754     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1755 nigel 91 {
1756 nigel 93 md->offset_vector[offset] =
1757     md->offset_vector[md->offset_end - number];
1758 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1759 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1760     }
1761 nigel 77
1762 nigel 93 /* Handle a recursively called group. Restore the offsets
1763     appropriately and continue from after the call. */
1764 nigel 77
1765 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1766     {
1767     recursion_info *rec = md->recursive;
1768     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1769     md->recursive = rec->prevrec;
1770     memcpy(md->offset_vector, rec->offset_save,
1771     rec->saved_max * sizeof(int));
1772 ph10 461 offset_top = rec->save_offset_top;
1773 nigel 93 ecode = rec->after_call;
1774     break;
1775 nigel 77 }
1776 nigel 91 }
1777 nigel 77
1778 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1779     happens for a repeating ket if no characters were matched in the group.
1780     This is the forcible breaking of infinite loops as implemented in Perl
1781     5.005. If there is an options reset, it will get obeyed in the normal
1782     course of events. */
1783 nigel 77
1784 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1785     {
1786     ecode += 1 + LINK_SIZE;
1787     break;
1788     }
1789 ph10 604
1790     /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1791     and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1792     at a time from the outer level, thus saving stack. */
1793    
1794     if (*ecode == OP_KETRPOS)
1795     {
1796     md->end_match_ptr = eptr;
1797     md->end_offset_top = offset_top;
1798     RRETURN(MATCH_KETRPOS);
1799     }
1800 nigel 77
1801 ph10 604 /* The normal repeating kets try the rest of the pattern or restart from
1802     the preceding bracket, in the appropriate order. In the second case, we can
1803     use tail recursion to avoid using another stack frame, unless we have an
1804 ph10 197 unlimited repeat of a group that can match an empty string. */
1805 nigel 77
1806 nigel 91 if (*ecode == OP_KETRMIN)
1807     {
1808 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1809 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1810 ph10 604 if (*prev >= OP_SBRA) /* Could match an empty string */
1811 ph10 197 {
1812 ph10 604 md->match_function_type = MATCH_CBEGROUP;
1813     RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1814 ph10 197 RRETURN(rrc);
1815     }
1816 nigel 91 ecode = prev;
1817     goto TAIL_RECURSE;
1818 nigel 77 }
1819 nigel 91 else /* OP_KETRMAX */
1820     {
1821 ph10 604 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1822     RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1823 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1824     ecode += 1 + LINK_SIZE;
1825     goto TAIL_RECURSE;
1826     }
1827     /* Control never gets here */
1828 nigel 77
1829 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
1830 nigel 77
1831     case OP_CIRC:
1832 ph10 510 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1833 ph10 602
1834 nigel 77 /* Start of subject assertion */
1835    
1836     case OP_SOD:
1837 ph10 510 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1838 nigel 77 ecode++;
1839     break;
1840 ph10 602
1841     /* Multiline mode: start of subject unless notbol, or after any newline. */
1842 nigel 77
1843 ph10 602 case OP_CIRCM:
1844     if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1845     if (eptr != md->start_subject &&
1846     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1847     MRRETURN(MATCH_NOMATCH);
1848     ecode++;
1849     break;
1850    
1851 nigel 77 /* Start of match assertion */
1852    
1853     case OP_SOM:
1854 ph10 510 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1855 nigel 77 ecode++;
1856     break;
1857 ph10 172
1858 ph10 168 /* Reset the start of match point */
1859 ph10 172
1860 ph10 168 case OP_SET_SOM:
1861     mstart = eptr;
1862 ph10 172 ecode++;
1863     break;
1864 nigel 77
1865 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
1866     unless noteol is set. */
1867 nigel 77
1868 ph10 602 case OP_DOLLM:
1869     if (eptr < md->end_subject)
1870     { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1871     else
1872 nigel 77 {
1873 ph10 510 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1874 ph10 602 SCHECK_PARTIAL();
1875 nigel 77 }
1876 ph10 602 ecode++;
1877     break;
1878 ph10 579
1879 ph10 602 /* Not multiline mode: assert before a terminating newline or before end of
1880     subject unless noteol is set. */
1881    
1882     case OP_DOLL:
1883     if (md->noteol) MRRETURN(MATCH_NOMATCH);
1884     if (!md->endonly) goto ASSERT_NL_OR_EOS;
1885    
1886 nigel 91 /* ... else fall through for endonly */
1887 nigel 77
1888     /* End of subject assertion (\z) */
1889    
1890     case OP_EOD:
1891 ph10 510 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1892 ph10 553 SCHECK_PARTIAL();
1893 nigel 77 ecode++;
1894     break;
1895    
1896     /* End of subject or ending \n assertion (\Z) */
1897    
1898     case OP_EODN:
1899 ph10 553 ASSERT_NL_OR_EOS:
1900     if (eptr < md->end_subject &&
1901 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1902 ph10 510 MRRETURN(MATCH_NOMATCH);
1903 ph10 579
1904 ph10 553 /* Either at end of string or \n before end. */
1905 ph10 579
1906 ph10 553 SCHECK_PARTIAL();
1907 nigel 77 ecode++;
1908     break;
1909    
1910     /* Word boundary assertions */
1911    
1912     case OP_NOT_WORD_BOUNDARY:
1913     case OP_WORD_BOUNDARY:
1914     {
1915    
1916     /* Find out if the previous and current characters are "word" characters.
1917     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1918 ph10 443 be "non-word" characters. Remember the earliest consulted character for
1919 ph10 435 partial matching. */
1920 nigel 77
1921     #ifdef SUPPORT_UTF8
1922     if (utf8)
1923     {
1924 ph10 518 /* Get status of previous character */
1925 ph10 527
1926 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
1927     {
1928 ph10 409 USPTR lastptr = eptr - 1;
1929 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1930 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1931 nigel 77 GETCHAR(c, lastptr);
1932 ph10 527 #ifdef SUPPORT_UCP
1933 ph10 518 if (md->use_ucp)
1934     {
1935     if (c == '_') prev_is_word = TRUE; else
1936 ph10 527 {
1937 ph10 518 int cat = UCD_CATEGORY(c);
1938     prev_is_word = (cat == ucp_L || cat == ucp_N);
1939 ph10 527 }
1940     }
1941     else
1942     #endif
1943 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1944     }
1945 ph10 527
1946 ph10 518 /* Get status of next character */
1947 ph10 527
1948 ph10 443 if (eptr >= md->end_subject)
1949 nigel 77 {
1950 ph10 443 SCHECK_PARTIAL();
1951     cur_is_word = FALSE;
1952 ph10 428 }
1953     else
1954     {
1955 nigel 77 GETCHAR(c, eptr);
1956 ph10 527 #ifdef SUPPORT_UCP
1957 ph10 518 if (md->use_ucp)
1958     {
1959     if (c == '_') cur_is_word = TRUE; else
1960 ph10 527 {
1961 ph10 518 int cat = UCD_CATEGORY(c);
1962     cur_is_word = (cat == ucp_L || cat == ucp_N);
1963 ph10 527 }
1964     }
1965     else
1966     #endif
1967 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1968     }
1969     }
1970     else
1971     #endif
1972    
1973 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1974 ph10 518 consistency with the behaviour of \w we do use it in this case. */
1975 nigel 77
1976     {
1977 ph10 518 /* Get status of previous character */
1978 ph10 527
1979 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
1980     {
1981 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1982 ph10 527 #ifdef SUPPORT_UCP
1983 ph10 518 if (md->use_ucp)
1984     {
1985 ph10 527 c = eptr[-1];
1986 ph10 518 if (c == '_') prev_is_word = TRUE; else
1987 ph10 527 {
1988 ph10 518 int cat = UCD_CATEGORY(c);
1989     prev_is_word = (cat == ucp_L || cat == ucp_N);
1990 ph10 527 }
1991     }
1992     else
1993     #endif
1994 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1995     }
1996 ph10 527
1997 ph10 518 /* Get status of next character */
1998 ph10 527
1999 ph10 443 if (eptr >= md->end_subject)
2000 ph10 428 {
2001 ph10 443 SCHECK_PARTIAL();
2002     cur_is_word = FALSE;
2003 ph10 428 }
2004 ph10 527 else
2005     #ifdef SUPPORT_UCP
2006 ph10 518 if (md->use_ucp)
2007     {
2008 ph10 527 c = *eptr;
2009 ph10 518 if (c == '_') cur_is_word = TRUE; else
2010 ph10 527 {
2011 ph10 518 int cat = UCD_CATEGORY(c);
2012     cur_is_word = (cat == ucp_L || cat == ucp_N);
2013 ph10 527 }
2014     }
2015     else
2016     #endif
2017 ph10 518 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2018 nigel 77 }
2019    
2020     /* Now see if the situation is what we want */
2021    
2022     if ((*ecode++ == OP_WORD_BOUNDARY)?
2023     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2024 ph10 510 MRRETURN(MATCH_NOMATCH);
2025 nigel 77 }
2026     break;
2027    
2028     /* Match a single character type; inline for speed */
2029    
2030     case OP_ANY:
2031 ph10 510 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2032 ph10 345 /* Fall through */
2033    
2034 ph10 341 case OP_ALLANY:
2035 ph10 443 if (eptr++ >= md->end_subject)
2036 ph10 428 {
2037 ph10 443 SCHECK_PARTIAL();
2038 ph10 510 MRRETURN(MATCH_NOMATCH);
2039 ph10 443 }
2040 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2041 nigel 77 ecode++;
2042     break;
2043    
2044     /* Match a single byte, even in UTF-8 mode. This opcode really does match
2045     any byte, even newline, independent of the setting of PCRE_DOTALL. */
2046    
2047     case OP_ANYBYTE:
2048 ph10 443 if (eptr++ >= md->end_subject)
2049 ph10 428 {
2050 ph10 443 SCHECK_PARTIAL();
2051 ph10 510 MRRETURN(MATCH_NOMATCH);
2052 ph10 443 }
2053 nigel 77 ecode++;
2054     break;
2055    
2056     case OP_NOT_DIGIT:
2057 ph10 443 if (eptr >= md->end_subject)
2058 ph10 428 {
2059 ph10 443 SCHECK_PARTIAL();
2060 ph10 510 MRRETURN(MATCH_NOMATCH);
2061 ph10 443 }
2062 nigel 77 GETCHARINCTEST(c, eptr);
2063     if (
2064     #ifdef SUPPORT_UTF8
2065     c < 256 &&
2066     #endif
2067     (md->ctypes[c] & ctype_digit) != 0
2068     )
2069 ph10 510 MRRETURN(MATCH_NOMATCH);
2070 nigel 77 ecode++;
2071     break;
2072    
2073     case OP_DIGIT:
2074 ph10 443 if (eptr >= md->end_subject)
2075 ph10 428 {
2076 ph10 443 SCHECK_PARTIAL();
2077 ph10 510 MRRETURN(MATCH_NOMATCH);
2078 ph10 443 }
2079 nigel 77 GETCHARINCTEST(c, eptr);
2080     if (
2081     #ifdef SUPPORT_UTF8
2082     c >= 256 ||
2083     #endif
2084     (md->ctypes[c] & ctype_digit) == 0
2085     )
2086 ph10 510 MRRETURN(MATCH_NOMATCH);
2087 nigel 77 ecode++;
2088     break;
2089    
2090     case OP_NOT_WHITESPACE:
2091 ph10 443 if (eptr >= md->end_subject)
2092 ph10 428 {
2093 ph10 443 SCHECK_PARTIAL();
2094 ph10 510 MRRETURN(MATCH_NOMATCH);
2095 ph10 443 }
2096 nigel 77 GETCHARINCTEST(c, eptr);
2097     if (
2098     #ifdef SUPPORT_UTF8
2099     c < 256 &&
2100     #endif
2101     (md->ctypes[c] & ctype_space) != 0
2102     )
2103 ph10 510 MRRETURN(MATCH_NOMATCH);
2104 nigel 77 ecode++;
2105     break;
2106    
2107     case OP_WHITESPACE:
2108 ph10 443 if (eptr >= md->end_subject)
2109 ph10 428 {
2110 ph10 443 SCHECK_PARTIAL();
2111 ph10 510 MRRETURN(MATCH_NOMATCH);
2112 ph10 443 }
2113 nigel 77 GETCHARINCTEST(c, eptr);
2114     if (
2115     #ifdef SUPPORT_UTF8
2116     c >= 256 ||
2117     #endif
2118     (md->ctypes[c] & ctype_space) == 0
2119     )
2120 ph10 510 MRRETURN(MATCH_NOMATCH);
2121 nigel 77 ecode++;
2122     break;
2123    
2124     case OP_NOT_WORDCHAR:
2125 ph10 443 if (eptr >= md->end_subject)
2126 ph10 428 {
2127 ph10 443 SCHECK_PARTIAL();
2128 ph10 510 MRRETURN(MATCH_NOMATCH);
2129 ph10 443 }
2130 nigel 77 GETCHARINCTEST(c, eptr);
2131     if (
2132     #ifdef SUPPORT_UTF8
2133     c < 256 &&
2134     #endif
2135     (md->ctypes[c] & ctype_word) != 0
2136     )
2137 ph10 510 MRRETURN(MATCH_NOMATCH);
2138 nigel 77 ecode++;
2139     break;
2140    
2141     case OP_WORDCHAR:
2142 ph10 443 if (eptr >= md->end_subject)
2143 ph10 428 {
2144 ph10 443 SCHECK_PARTIAL();
2145 ph10 510 MRRETURN(MATCH_NOMATCH);
2146 ph10 443 }
2147 nigel 77 GETCHARINCTEST(c, eptr);
2148     if (
2149     #ifdef SUPPORT_UTF8
2150     c >= 256 ||
2151     #endif
2152     (md->ctypes[c] & ctype_word) == 0
2153     )
2154 ph10 510 MRRETURN(MATCH_NOMATCH);
2155 nigel 77 ecode++;
2156     break;
2157    
2158 nigel 93 case OP_ANYNL:
2159 ph10 443 if (eptr >= md->end_subject)
2160 ph10 428 {
2161 ph10 443 SCHECK_PARTIAL();
2162 ph10 510 MRRETURN(MATCH_NOMATCH);
2163 ph10 443 }
2164 nigel 93 GETCHARINCTEST(c, eptr);
2165     switch(c)
2166     {
2167 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2168 ph10 600
2169 nigel 93 case 0x000d:
2170     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2171     break;
2172 ph10 231
2173 nigel 93 case 0x000a:
2174 ph10 231 break;
2175    
2176 nigel 93 case 0x000b:
2177     case 0x000c:
2178     case 0x0085:
2179     case 0x2028:
2180     case 0x2029:
2181 ph10 510 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2182 nigel 93 break;
2183     }
2184     ecode++;
2185     break;
2186    
2187 ph10 178 case OP_NOT_HSPACE:
2188 ph10 443 if (eptr >= md->end_subject)
2189 ph10 428 {
2190 ph10 443 SCHECK_PARTIAL();
2191 ph10 510 MRRETURN(MATCH_NOMATCH);
2192 ph10 443 }
2193 ph10 178 GETCHARINCTEST(c, eptr);
2194     switch(c)
2195     {
2196     default: break;
2197     case 0x09: /* HT */
2198     case 0x20: /* SPACE */
2199     case 0xa0: /* NBSP */
2200     case 0x1680: /* OGHAM SPACE MARK */
2201     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2202     case 0x2000: /* EN QUAD */
2203     case 0x2001: /* EM QUAD */
2204     case 0x2002: /* EN SPACE */
2205     case 0x2003: /* EM SPACE */
2206     case 0x2004: /* THREE-PER-EM SPACE */
2207     case 0x2005: /* FOUR-PER-EM SPACE */
2208     case 0x2006: /* SIX-PER-EM SPACE */
2209     case 0x2007: /* FIGURE SPACE */
2210     case 0x2008: /* PUNCTUATION SPACE */
2211     case 0x2009: /* THIN SPACE */
2212     case 0x200A: /* HAIR SPACE */
2213     case 0x202f: /* NARROW NO-BREAK SPACE */
2214     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2215     case 0x3000: /* IDEOGRAPHIC SPACE */
2216 ph10 510 MRRETURN(MATCH_NOMATCH);
2217 ph10 178 }
2218     ecode++;
2219     break;
2220    
2221     case OP_HSPACE:
2222 ph10 443 if (eptr >= md->end_subject)
2223 ph10 428 {
2224 ph10 443 SCHECK_PARTIAL();
2225 ph10 510 MRRETURN(MATCH_NOMATCH);
2226 ph10 443 }
2227 ph10 178 GETCHARINCTEST(c, eptr);
2228     switch(c)
2229     {
2230 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2231 ph10 178 case 0x09: /* HT */
2232     case 0x20: /* SPACE */
2233     case 0xa0: /* NBSP */
2234     case 0x1680: /* OGHAM SPACE MARK */
2235     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2236     case 0x2000: /* EN QUAD */
2237     case 0x2001: /* EM QUAD */
2238     case 0x2002: /* EN SPACE */
2239     case 0x2003: /* EM SPACE */
2240     case 0x2004: /* THREE-PER-EM SPACE */
2241     case 0x2005: /* FOUR-PER-EM SPACE */
2242     case 0x2006: /* SIX-PER-EM SPACE */
2243     case 0x2007: /* FIGURE SPACE */
2244     case 0x2008: /* PUNCTUATION SPACE */
2245     case 0x2009: /* THIN SPACE */
2246     case 0x200A: /* HAIR SPACE */
2247     case 0x202f: /* NARROW NO-BREAK SPACE */
2248     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2249     case 0x3000: /* IDEOGRAPHIC SPACE */
2250     break;
2251     }
2252     ecode++;
2253     break;
2254    
2255     case OP_NOT_VSPACE:
2256 ph10 443 if (eptr >= md->end_subject)
2257 ph10 428 {
2258 ph10 443 SCHECK_PARTIAL();
2259 ph10 510 MRRETURN(MATCH_NOMATCH);
2260 ph10 443 }
2261 ph10 178 GETCHARINCTEST(c, eptr);
2262     switch(c)
2263     {
2264     default: break;
2265     case 0x0a: /* LF */
2266     case 0x0b: /* VT */
2267     case 0x0c: /* FF */
2268     case 0x0d: /* CR */
2269     case 0x85: /* NEL */
2270     case 0x2028: /* LINE SEPARATOR */
2271     case 0x2029: /* PARAGRAPH SEPARATOR */
2272 ph10 510 MRRETURN(MATCH_NOMATCH);
2273 ph10 178 }
2274     ecode++;
2275     break;
2276    
2277     case OP_VSPACE:
2278 ph10 443 if (eptr >= md->end_subject)
2279 ph10 428 {
2280 ph10 443 SCHECK_PARTIAL();
2281 ph10 510 MRRETURN(MATCH_NOMATCH);
2282 ph10 443 }
2283 ph10 178 GETCHARINCTEST(c, eptr);
2284     switch(c)
2285     {
2286 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2287 ph10 178 case 0x0a: /* LF */
2288     case 0x0b: /* VT */
2289     case 0x0c: /* FF */
2290     case 0x0d: /* CR */
2291     case 0x85: /* NEL */
2292     case 0x2028: /* LINE SEPARATOR */
2293     case 0x2029: /* PARAGRAPH SEPARATOR */
2294     break;
2295     }
2296     ecode++;
2297     break;
2298    
2299 nigel 77 #ifdef SUPPORT_UCP
2300     /* Check the next character by Unicode property. We will get here only
2301     if the support is in the binary; otherwise a compile-time error occurs. */
2302    
2303     case OP_PROP:
2304     case OP_NOTPROP:
2305 ph10 443 if (eptr >= md->end_subject)
2306 ph10 428 {
2307 ph10 443 SCHECK_PARTIAL();
2308 ph10 510 MRRETURN(MATCH_NOMATCH);
2309 ph10 443 }
2310 nigel 77 GETCHARINCTEST(c, eptr);
2311     {
2312 ph10 384 const ucd_record *prop = GET_UCD(c);
2313 nigel 77
2314 nigel 87 switch(ecode[1])
2315     {
2316     case PT_ANY:
2317 ph10 510 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2318 nigel 87 break;
2319 nigel 77
2320 nigel 87 case PT_LAMP:
2321 ph10 349 if ((prop->chartype == ucp_Lu ||
2322     prop->chartype == ucp_Ll ||
2323     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2324 ph10 510 MRRETURN(MATCH_NOMATCH);
2325 ph10 517 break;
2326 nigel 87
2327     case PT_GC:
2328 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2329 ph10 510 MRRETURN(MATCH_NOMATCH);
2330 nigel 87 break;
2331    
2332     case PT_PC:
2333 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2334 ph10 510 MRRETURN(MATCH_NOMATCH);
2335 nigel 87 break;
2336    
2337     case PT_SC:
2338 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2339 ph10 510 MRRETURN(MATCH_NOMATCH);
2340 nigel 87 break;
2341 ph10 527
2342 ph10 517 /* These are specials */
2343 ph10 527
2344 ph10 517 case PT_ALNUM:
2345     if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2346     _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2347     MRRETURN(MATCH_NOMATCH);
2348 ph10 527 break;
2349    
2350 ph10 517 case PT_SPACE: /* Perl space */
2351     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2352     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2353     == (op == OP_NOTPROP))
2354     MRRETURN(MATCH_NOMATCH);
2355 ph10 527 break;
2356    
2357 ph10 517 case PT_PXSPACE: /* POSIX space */
2358     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2359 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2360 ph10 517 c == CHAR_FF || c == CHAR_CR)
2361     == (op == OP_NOTPROP))
2362     MRRETURN(MATCH_NOMATCH);
2363 ph10 527 break;
2364 nigel 87
2365 ph10 527 case PT_WORD:
2366 ph10 517 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2367 ph10 527 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2368 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2369     MRRETURN(MATCH_NOMATCH);
2370 ph10 527 break;
2371    
2372 ph10 517 /* This should never occur */
2373    
2374 nigel 87 default:
2375     RRETURN(PCRE_ERROR_INTERNAL);
2376 nigel 77 }
2377 nigel 87
2378     ecode += 3;
2379 nigel 77 }
2380     break;
2381    
2382     /* Match an extended Unicode sequence. We will get here only if the support
2383     is in the binary; otherwise a compile-time error occurs. */
2384    
2385     case OP_EXTUNI:
2386 ph10 443 if (eptr >= md->end_subject)
2387 ph10 428 {
2388 ph10 443 SCHECK_PARTIAL();
2389 ph10 510 MRRETURN(MATCH_NOMATCH);
2390 ph10 443 }
2391 nigel 77 GETCHARINCTEST(c, eptr);
2392     {
2393 ph10 349 int category = UCD_CATEGORY(c);
2394 ph10 510 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2395 nigel 77 while (eptr < md->end_subject)
2396     {
2397     int len = 1;
2398     if (!utf8) c = *eptr; else
2399     {
2400     GETCHARLEN(c, eptr, len);
2401     }
2402 ph10 349 category = UCD_CATEGORY(c);
2403 nigel 77 if (category != ucp_M) break;
2404     eptr += len;
2405     }
2406     }
2407     ecode++;
2408     break;
2409     #endif
2410    
2411    
2412     /* Match a back reference, possibly repeatedly. Look past the end of the
2413     item to see if there is repeat information following. The code is similar
2414     to that for character classes, but repeated for efficiency. Then obey
2415     similar code to character type repeats - written out again for speed.
2416     However, if the referenced string is the empty string, always treat
2417     it as matched, any number of times (otherwise there could be infinite
2418     loops). */
2419    
2420     case OP_REF:
2421 ph10 602 case OP_REFI:
2422     caseless = op == OP_REFI;
2423 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2424     ecode += 3;
2425 ph10 345
2426 ph10 595 /* If the reference is unset, there are two possibilities:
2427 ph10 345
2428 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2429     this ensures that every attempt at a match fails. We can't just fail
2430     here, because of the possibility of quantifiers with zero minima.
2431 ph10 345
2432 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2433     so that the back reference matches an empty string.
2434 ph10 345
2435 ph10 595 Otherwise, set the length to the length of what was matched by the
2436     referenced subpattern. */
2437 ph10 345
2438 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2439     length = (md->jscript_compat)? 0 : -1;
2440     else
2441     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2442 nigel 77
2443 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2444 nigel 77
2445 ph10 595 switch (*ecode)
2446     {
2447     case OP_CRSTAR:
2448     case OP_CRMINSTAR:
2449     case OP_CRPLUS:
2450     case OP_CRMINPLUS:
2451     case OP_CRQUERY:
2452     case OP_CRMINQUERY:
2453     c = *ecode++ - OP_CRSTAR;
2454     minimize = (c & 1) != 0;
2455     min = rep_min[c]; /* Pick up values from tables; */
2456     max = rep_max[c]; /* zero for max => infinity */
2457     if (max == 0) max = INT_MAX;
2458     break;
2459 nigel 77
2460 ph10 595 case OP_CRRANGE:
2461     case OP_CRMINRANGE:
2462     minimize = (*ecode == OP_CRMINRANGE);
2463     min = GET2(ecode, 1);
2464     max = GET2(ecode, 3);
2465     if (max == 0) max = INT_MAX;
2466     ecode += 5;
2467     break;
2468 nigel 77
2469 ph10 595 default: /* No repeat follows */
2470 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2471 ph10 595 {
2472     CHECK_PARTIAL();
2473     MRRETURN(MATCH_NOMATCH);
2474 nigel 77 }
2475 ph10 595 eptr += length;
2476     continue; /* With the main loop */
2477     }
2478 nigel 77
2479 ph10 595 /* Handle repeated back references. If the length of the reference is
2480     zero, just continue with the main loop. */
2481 ph10 443
2482 ph10 595 if (length == 0) continue;
2483 nigel 77
2484 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2485     the length of the reference string explicitly rather than passing the
2486     address of eptr, so that eptr can be a register variable. */
2487 nigel 77
2488 ph10 595 for (i = 1; i <= min; i++)
2489     {
2490     int slength;
2491 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2492 nigel 77 {
2493 ph10 595 CHECK_PARTIAL();
2494     MRRETURN(MATCH_NOMATCH);
2495 nigel 77 }
2496 ph10 595 eptr += slength;
2497     }
2498 nigel 77
2499 ph10 595 /* If min = max, continue at the same level without recursion.
2500     They are not both allowed to be zero. */
2501 nigel 77
2502 ph10 595 if (min == max) continue;
2503 nigel 77
2504 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2505 nigel 77
2506 ph10 595 if (minimize)
2507     {
2508     for (fi = min;; fi++)
2509 nigel 77 {
2510 ph10 595 int slength;
2511 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2512 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2513     if (fi >= max) MRRETURN(MATCH_NOMATCH);
2514 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2515 nigel 77 {
2516 ph10 595 CHECK_PARTIAL();
2517     MRRETURN(MATCH_NOMATCH);
2518 nigel 77 }
2519 ph10 595 eptr += slength;
2520 nigel 77 }
2521 ph10 595 /* Control never gets here */
2522     }
2523 nigel 77
2524 ph10 595 /* If maximizing, find the longest string and work backwards */
2525 nigel 77
2526 ph10 595 else
2527     {
2528     pp = eptr;
2529     for (i = min; i < max; i++)
2530 nigel 77 {
2531 ph10 595 int slength;
2532 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2533 nigel 77 {
2534 ph10 595 CHECK_PARTIAL();
2535     break;
2536 nigel 77 }
2537 ph10 595 eptr += slength;
2538 nigel 77 }
2539 ph10 595 while (eptr >= pp)
2540     {
2541 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2542 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2543     eptr -= length;
2544     }
2545     MRRETURN(MATCH_NOMATCH);
2546 nigel 77 }
2547     /* Control never gets here */
2548    
2549     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2550     used when all the characters in the class have values in the range 0-255,
2551     and either the matching is caseful, or the characters are in the range
2552     0-127 when UTF-8 processing is enabled. The only difference between
2553     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2554     encountered.
2555    
2556     First, look past the end of the item to see if there is repeat information
2557     following. Then obey similar code to character type repeats - written out
2558     again for speed. */
2559    
2560     case OP_NCLASS:
2561     case OP_CLASS:
2562     {
2563     data = ecode + 1; /* Save for matching */
2564     ecode += 33; /* Advance past the item */
2565    
2566     switch (*ecode)
2567     {
2568     case OP_CRSTAR:
2569     case OP_CRMINSTAR:
2570     case OP_CRPLUS:
2571     case OP_CRMINPLUS:
2572     case OP_CRQUERY:
2573     case OP_CRMINQUERY:
2574     c = *ecode++ - OP_CRSTAR;
2575     minimize = (c & 1) != 0;
2576     min = rep_min[c]; /* Pick up values from tables; */
2577     max = rep_max[c]; /* zero for max => infinity */
2578     if (max == 0) max = INT_MAX;
2579     break;
2580    
2581     case OP_CRRANGE:
2582     case OP_CRMINRANGE:
2583     minimize = (*ecode == OP_CRMINRANGE);
2584     min = GET2(ecode, 1);
2585     max = GET2(ecode, 3);
2586     if (max == 0) max = INT_MAX;
2587     ecode += 5;
2588     break;
2589    
2590     default: /* No repeat follows */
2591     min = max = 1;
2592     break;
2593     }
2594    
2595     /* First, ensure the minimum number of matches are present. */
2596    
2597     #ifdef SUPPORT_UTF8
2598     /* UTF-8 mode */
2599     if (utf8)
2600     {
2601     for (i = 1; i <= min; i++)
2602     {
2603 ph10 427 if (eptr >= md->end_subject)
2604 ph10 426 {
2605 ph10 428 SCHECK_PARTIAL();
2606 ph10 510 MRRETURN(MATCH_NOMATCH);
2607 ph10 427 }
2608 nigel 77 GETCHARINC(c, eptr);
2609     if (c > 255)
2610     {
2611 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2612 nigel 77 }
2613     else
2614     {
2615 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2616 nigel 77 }
2617     }
2618     }
2619     else
2620     #endif
2621     /* Not UTF-8 mode */
2622     {
2623     for (i = 1; i <= min; i++)
2624     {
2625 ph10 427 if (eptr >= md->end_subject)
2626 ph10 426 {
2627 ph10 428 SCHECK_PARTIAL();
2628 ph10 510 MRRETURN(MATCH_NOMATCH);
2629 ph10 427 }
2630 nigel 77 c = *eptr++;
2631 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2632 nigel 77 }
2633     }
2634    
2635     /* If max == min we can continue with the main loop without the
2636     need to recurse. */
2637    
2638     if (min == max) continue;
2639    
2640     /* If minimizing, keep testing the rest of the expression and advancing
2641     the pointer while it matches the class. */
2642    
2643     if (minimize)
2644     {
2645     #ifdef SUPPORT_UTF8
2646     /* UTF-8 mode */
2647     if (utf8)
2648     {
2649     for (fi = min;; fi++)
2650     {
2651 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2652 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2653 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2654 ph10 427 if (eptr >= md->end_subject)
2655 ph10 426 {
2656 ph10 427 SCHECK_PARTIAL();
2657 ph10 510 MRRETURN(MATCH_NOMATCH);
2658 ph10 427 }
2659 nigel 77 GETCHARINC(c, eptr);
2660     if (c > 255)
2661     {
2662 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2663 nigel 77 }
2664     else
2665     {
2666 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2667 nigel 77 }
2668     }
2669     }
2670     else
2671     #endif
2672     /* Not UTF-8 mode */
2673     {
2674     for (fi = min;; fi++)
2675     {
2676 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2677 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2678 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2679 ph10 427 if (eptr >= md->end_subject)
2680 ph10 426 {
2681 ph10 427 SCHECK_PARTIAL();
2682 ph10 510 MRRETURN(MATCH_NOMATCH);
2683 ph10 427 }
2684 nigel 77 c = *eptr++;
2685 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2686 nigel 77 }
2687     }
2688     /* Control never gets here */
2689     }
2690    
2691     /* If maximizing, find the longest possible run, then work backwards. */
2692    
2693     else
2694     {
2695     pp = eptr;
2696    
2697     #ifdef SUPPORT_UTF8
2698     /* UTF-8 mode */
2699     if (utf8)
2700     {
2701     for (i = min; i < max; i++)
2702     {
2703     int len = 1;
2704 ph10 463 if (eptr >= md->end_subject)
2705 ph10 462 {
2706 ph10 463 SCHECK_PARTIAL();
2707 ph10 462 break;
2708 ph10 463 }
2709 nigel 77 GETCHARLEN(c, eptr, len);
2710     if (c > 255)
2711     {
2712     if (op == OP_CLASS) break;
2713     }
2714     else
2715     {
2716     if ((data[c/8] & (1 << (c&7))) == 0) break;
2717     }
2718     eptr += len;
2719     }
2720     for (;;)
2721     {
2722 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2723 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2724     if (eptr-- == pp) break; /* Stop if tried at original pos */
2725     BACKCHAR(eptr);
2726     }
2727     }
2728     else
2729     #endif
2730     /* Not UTF-8 mode */
2731     {
2732     for (i = min; i < max; i++)
2733     {
2734 ph10 463 if (eptr >= md->end_subject)
2735 ph10 462 {
2736 ph10 463 SCHECK_PARTIAL();
2737 ph10 462 break;
2738 ph10 463 }
2739 nigel 77 c = *eptr;
2740     if ((data[c/8] & (1 << (c&7))) == 0) break;
2741     eptr++;
2742     }
2743     while (eptr >= pp)
2744     {
2745 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2746 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2747 nigel 77 eptr--;
2748     }
2749     }
2750    
2751 ph10 510 MRRETURN(MATCH_NOMATCH);
2752 nigel 77 }
2753     }
2754     /* Control never gets here */
2755    
2756    
2757     /* Match an extended character class. This opcode is encountered only
2758 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2759     mode, because Unicode properties are supported in non-UTF-8 mode. */
2760 nigel 77
2761     #ifdef SUPPORT_UTF8
2762     case OP_XCLASS:
2763     {
2764     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2765     ecode += GET(ecode, 1); /* Advance past the item */
2766    
2767     switch (*ecode)
2768     {
2769     case OP_CRSTAR:
2770     case OP_CRMINSTAR:
2771     case OP_CRPLUS:
2772     case OP_CRMINPLUS:
2773     case OP_CRQUERY:
2774     case OP_CRMINQUERY:
2775     c = *ecode++ - OP_CRSTAR;
2776     minimize = (c & 1) != 0;
2777     min = rep_min[c]; /* Pick up values from tables; */
2778     max = rep_max[c]; /* zero for max => infinity */
2779     if (max == 0) max = INT_MAX;
2780     break;
2781    
2782     case OP_CRRANGE:
2783     case OP_CRMINRANGE:
2784     minimize = (*ecode == OP_CRMINRANGE);
2785     min = GET2(ecode, 1);
2786     max = GET2(ecode, 3);
2787     if (max == 0) max = INT_MAX;
2788     ecode += 5;
2789     break;
2790    
2791     default: /* No repeat follows */
2792     min = max = 1;
2793     break;
2794     }
2795    
2796     /* First, ensure the minimum number of matches are present. */
2797    
2798     for (i = 1; i <= min; i++)
2799     {
2800 ph10 427 if (eptr >= md->end_subject)
2801 ph10 426 {
2802     SCHECK_PARTIAL();
2803 ph10 510 MRRETURN(MATCH_NOMATCH);
2804 ph10 427 }
2805 ph10 384 GETCHARINCTEST(c, eptr);
2806 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2807 nigel 77 }
2808    
2809     /* If max == min we can continue with the main loop without the
2810     need to recurse. */
2811    
2812     if (min == max) continue;
2813    
2814     /* If minimizing, keep testing the rest of the expression and advancing
2815     the pointer while it matches the class. */
2816    
2817     if (minimize)
2818     {
2819     for (fi = min;; fi++)
2820     {
2821 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2822 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2823 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2824 ph10 427 if (eptr >= md->end_subject)
2825 ph10 426 {
2826 ph10 427 SCHECK_PARTIAL();
2827 ph10 510 MRRETURN(MATCH_NOMATCH);
2828 ph10 427 }
2829 ph10 384 GETCHARINCTEST(c, eptr);
2830 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2831 nigel 77 }
2832     /* Control never gets here */
2833     }
2834    
2835     /* If maximizing, find the longest possible run, then work backwards. */
2836    
2837     else
2838     {
2839     pp = eptr;
2840     for (i = min; i < max; i++)
2841     {
2842     int len = 1;
2843 ph10 463 if (eptr >= md->end_subject)
2844 ph10 462 {
2845 ph10 463 SCHECK_PARTIAL();
2846 ph10 462 break;
2847 ph10 463 }
2848 ph10 384 GETCHARLENTEST(c, eptr, len);
2849 nigel 77 if (!_pcre_xclass(c, data)) break;
2850     eptr += len;
2851     }
2852     for(;;)
2853     {
2854 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2855 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2856     if (eptr-- == pp) break; /* Stop if tried at original pos */
2857 ph10 214 if (utf8) BACKCHAR(eptr);
2858 nigel 77 }
2859 ph10 510 MRRETURN(MATCH_NOMATCH);
2860 nigel 77 }
2861    
2862     /* Control never gets here */
2863     }
2864     #endif /* End of XCLASS */
2865    
2866     /* Match a single character, casefully */
2867    
2868     case OP_CHAR:
2869     #ifdef SUPPORT_UTF8
2870     if (utf8)
2871     {
2872     length = 1;
2873     ecode++;
2874     GETCHARLEN(fc, ecode, length);
2875 ph10 443 if (length > md->end_subject - eptr)
2876 ph10 428 {
2877     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2878 ph10 510 MRRETURN(MATCH_NOMATCH);
2879 ph10 443 }
2880 ph10 510 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2881 nigel 77 }
2882     else
2883     #endif
2884    
2885     /* Non-UTF-8 mode */
2886     {
2887 ph10 443 if (md->end_subject - eptr < 1)
2888 ph10 428 {
2889     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2890 ph10 510 MRRETURN(MATCH_NOMATCH);
2891 ph10 443 }
2892 ph10 510 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2893 nigel 77 ecode += 2;
2894     }
2895     break;
2896    
2897     /* Match a single character, caselessly */
2898    
2899 ph10 602 case OP_CHARI:
2900 nigel 77 #ifdef SUPPORT_UTF8
2901     if (utf8)
2902     {
2903     length = 1;
2904     ecode++;
2905     GETCHARLEN(fc, ecode, length);
2906    
2907 ph10 443 if (length > md->end_subject - eptr)
2908 ph10 428 {
2909     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2910 ph10 510 MRRETURN(MATCH_NOMATCH);
2911 ph10 443 }
2912 nigel 77
2913     /* If the pattern character's value is < 128, we have only one byte, and
2914     can use the fast lookup table. */
2915    
2916     if (fc < 128)
2917     {
2918 ph10 510 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2919 nigel 77 }
2920    
2921     /* Otherwise we must pick up the subject character */
2922    
2923     else
2924     {
2925 nigel 93 unsigned int dc;
2926 nigel 77 GETCHARINC(dc, eptr);
2927     ecode += length;
2928    
2929     /* If we have Unicode property support, we can use it to test the other
2930 nigel 87 case of the character, if there is one. */
2931 nigel 77
2932     if (fc != dc)
2933     {
2934     #ifdef SUPPORT_UCP
2935 ph10 349 if (dc != UCD_OTHERCASE(fc))
2936 nigel 77 #endif
2937 ph10 510 MRRETURN(MATCH_NOMATCH);
2938 nigel 77 }
2939     }
2940     }
2941     else
2942     #endif /* SUPPORT_UTF8 */
2943    
2944     /* Non-UTF-8 mode */
2945     {
2946 ph10 443 if (md->end_subject - eptr < 1)
2947 ph10 428 {
2948 ph10 443 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2949 ph10 510 MRRETURN(MATCH_NOMATCH);
2950 ph10 443 }
2951 ph10 510 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2952 nigel 77 ecode += 2;
2953     }
2954     break;
2955    
2956 nigel 93 /* Match a single character repeatedly. */
2957 nigel 77
2958     case OP_EXACT:
2959 ph10 602 case OP_EXACTI:
2960 nigel 77 min = max = GET2(ecode, 1);
2961     ecode += 3;
2962     goto REPEATCHAR;
2963    
2964 nigel 93 case OP_POSUPTO:
2965 ph10 602 case OP_POSUPTOI:
2966 nigel 93 possessive = TRUE;
2967     /* Fall through */
2968    
2969 nigel 77 case OP_UPTO:
2970 ph10 602 case OP_UPTOI:
2971 nigel 77 case OP_MINUPTO:
2972 ph10 602 case OP_MINUPTOI:
2973 nigel 77 min = 0;
2974     max = GET2(ecode, 1);
2975 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2976 nigel 77 ecode += 3;
2977     goto REPEATCHAR;
2978    
2979 nigel 93 case OP_POSSTAR:
2980 ph10 602 case OP_POSSTARI:
2981 nigel 93 possessive = TRUE;
2982     min = 0;
2983     max = INT_MAX;
2984     ecode++;
2985     goto REPEATCHAR;
2986    
2987     case OP_POSPLUS:
2988 ph10 602 case OP_POSPLUSI:
2989 nigel 93 possessive = TRUE;
2990     min = 1;
2991     max = INT_MAX;
2992     ecode++;
2993     goto REPEATCHAR;
2994    
2995     case OP_POSQUERY:
2996 ph10 602 case OP_POSQUERYI:
2997 nigel 93 possessive = TRUE;
2998     min = 0;
2999     max = 1;
3000     ecode++;
3001     goto REPEATCHAR;
3002    
3003 nigel 77 case OP_STAR:
3004 ph10 602 case OP_STARI:
3005 nigel 77 case OP_MINSTAR:
3006 ph10 602 case OP_MINSTARI:
3007 nigel 77 case OP_PLUS:
3008 ph10 602 case OP_PLUSI:
3009 nigel 77 case OP_MINPLUS:
3010 ph10 602 case OP_MINPLUSI:
3011 nigel 77 case OP_QUERY:
3012 ph10 602 case OP_QUERYI:
3013 nigel 77 case OP_MINQUERY:
3014 ph10 602 case OP_MINQUERYI:
3015     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3016 nigel 77 minimize = (c & 1) != 0;
3017     min = rep_min[c]; /* Pick up values from tables; */
3018     max = rep_max[c]; /* zero for max => infinity */
3019     if (max == 0) max = INT_MAX;
3020    
3021 ph10 426 /* Common code for all repeated single-character matches. */
3022 nigel 77
3023     REPEATCHAR:
3024     #ifdef SUPPORT_UTF8
3025     if (utf8)
3026     {
3027     length = 1;
3028     charptr = ecode;
3029     GETCHARLEN(fc, ecode, length);
3030     ecode += length;
3031    
3032     /* Handle multibyte character matching specially here. There is
3033     support for caseless matching if UCP support is present. */
3034    
3035     if (length > 1)
3036     {
3037     #ifdef SUPPORT_UCP
3038 nigel 93 unsigned int othercase;
3039 ph10 602 if (op >= OP_STARI && /* Caseless */
3040 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
3041 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
3042 ph10 115 else oclength = 0;
3043 nigel 77 #endif /* SUPPORT_UCP */
3044    
3045     for (i = 1; i <= min; i++)
3046     {
3047 ph10 426 if (eptr <= md->end_subject - length &&
3048     memcmp(eptr, charptr, length) == 0) eptr += length;
3049 ph10 123 #ifdef SUPPORT_UCP
3050 ph10 426 else if (oclength > 0 &&
3051     eptr <= md->end_subject - oclength &&
3052     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3053     #endif /* SUPPORT_UCP */
3054 nigel 77 else
3055     {
3056 ph10 426 CHECK_PARTIAL();
3057 ph10 510 MRRETURN(MATCH_NOMATCH);
3058 nigel 77 }
3059     }
3060    
3061     if (min == max) continue;
3062    
3063     if (minimize)
3064     {
3065     for (fi = min;; fi++)
3066     {
3067 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3068 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3069 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3070 ph10 426 if (eptr <= md->end_subject - length &&
3071     memcmp(eptr, charptr, length) == 0) eptr += length;
3072 ph10 123 #ifdef SUPPORT_UCP
3073 ph10 426 else if (oclength > 0 &&
3074     eptr <= md->end_subject - oclength &&
3075     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3076     #endif /* SUPPORT_UCP */
3077 nigel 77 else
3078     {
3079 ph10 426 CHECK_PARTIAL();
3080 ph10 510 MRRETURN(MATCH_NOMATCH);
3081 nigel 77 }
3082     }
3083     /* Control never gets here */
3084     }
3085 nigel 93
3086     else /* Maximize */
3087 nigel 77 {
3088     pp = eptr;
3089     for (i = min; i < max; i++)
3090     {
3091 ph10 426 if (eptr <= md->end_subject - length &&
3092     memcmp(eptr, charptr, length) == 0) eptr += length;
3093 ph10 123 #ifdef SUPPORT_UCP
3094 ph10 426 else if (oclength > 0 &&
3095     eptr <= md->end_subject - oclength &&
3096     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3097     #endif /* SUPPORT_UCP */
3098 ph10 463 else
3099 ph10 462 {
3100 ph10 463 CHECK_PARTIAL();
3101 ph10 462 break;
3102 ph10 463 }
3103 nigel 77 }
3104 nigel 93
3105     if (possessive) continue;
3106 ph10 427
3107 ph10 120 for(;;)
3108 ph10 426 {
3109 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3110 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3111 ph10 510 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3112 ph10 115 #ifdef SUPPORT_UCP
3113 ph10 426 eptr--;
3114     BACKCHAR(eptr);
3115 ph10 123 #else /* without SUPPORT_UCP */
3116 ph10 426 eptr -= length;
3117 ph10 123 #endif /* SUPPORT_UCP */
3118 ph10 426 }
3119 nigel 77 }
3120     /* Control never gets here */
3121     }
3122    
3123     /* If the length of a UTF-8 character is 1, we fall through here, and
3124     obey the code as for non-UTF-8 characters below, though in this case the
3125     value of fc will always be < 128. */
3126     }
3127     else
3128     #endif /* SUPPORT_UTF8 */
3129    
3130     /* When not in UTF-8 mode, load a single-byte character. */
3131    
3132 ph10 426 fc = *ecode++;
3133 ph10 443
3134 nigel 77 /* The value of fc at this point is always less than 256, though we may or
3135     may not be in UTF-8 mode. The code is duplicated for the caseless and
3136     caseful cases, for speed, since matching characters is likely to be quite
3137     common. First, ensure the minimum number of matches are present. If min =
3138     max, continue at the same level without recursing. Otherwise, if
3139     minimizing, keep trying the rest of the expression and advancing one
3140     matching character if failing, up to the maximum. Alternatively, if
3141     maximizing, find the maximum number of characters and work backwards. */
3142    
3143     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3144     max, eptr));
3145    
3146 ph10 602 if (op >= OP_STARI) /* Caseless */
3147 nigel 77 {
3148     fc = md->lcc[fc];
3149     for (i = 1; i <= min; i++)
3150 ph10 426 {
3151     if (eptr >= md->end_subject)
3152     {
3153     SCHECK_PARTIAL();
3154 ph10 510 MRRETURN(MATCH_NOMATCH);
3155 ph10 426 }
3156 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3157 ph10 426 }
3158 nigel 77 if (min == max) continue;
3159     if (minimize)
3160     {
3161     for (fi = min;; fi++)
3162     {
3163 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3164 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3165 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3166 ph10 426 if (eptr >= md->end_subject)
3167     {
3168 ph10 427 SCHECK_PARTIAL();
3169 ph10 510 MRRETURN(MATCH_NOMATCH);
3170 ph10 426 }
3171 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3172 nigel 77 }
3173     /* Control never gets here */
3174     }
3175 nigel 93 else /* Maximize */
3176 nigel 77 {
3177     pp = eptr;
3178     for (i = min; i < max; i++)
3179     {
3180 ph10 463 if (eptr >= md->end_subject)
3181 ph10 462 {
3182     SCHECK_PARTIAL();
3183     break;
3184 ph10 463 }
3185 ph10 462 if (fc != md->lcc[*eptr]) break;
3186 nigel 77 eptr++;
3187     }
3188 ph10 427
3189 nigel 93 if (possessive) continue;
3190 ph10 427
3191 nigel 77 while (eptr >= pp)
3192     {
3193 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3194 nigel 77 eptr--;
3195     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3196     }
3197 ph10 510 MRRETURN(MATCH_NOMATCH);
3198 nigel 77 }
3199     /* Control never gets here */
3200     }
3201    
3202     /* Caseful comparisons (includes all multi-byte characters) */
3203    
3204     else
3205     {
3206 ph10 427 for (i = 1; i <= min; i++)
3207 ph10 426 {
3208     if (eptr >= md->end_subject)
3209     {
3210     SCHECK_PARTIAL();
3211 ph10 510 MRRETURN(MATCH_NOMATCH);
3212 ph10 426 }
3213 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3214 ph10 427 }
3215 ph10 443
3216 nigel 77 if (min == max) continue;
3217 ph10 443
3218 nigel 77 if (minimize)
3219     {
3220     for (fi = min;; fi++)
3221     {
3222 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3223 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3224 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3225 ph10 426 if (eptr >= md->end_subject)
3226 ph10 427 {
3227 ph10 426 SCHECK_PARTIAL();
3228 ph10 510 MRRETURN(MATCH_NOMATCH);
3229 ph10 427 }
3230 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3231 nigel 77 }
3232     /* Control never gets here */
3233     }
3234 nigel 93 else /* Maximize */
3235 nigel 77 {
3236     pp = eptr;
3237     for (i = min; i < max; i++)
3238     {
3239 ph10 463 if (eptr >= md->end_subject)
3240 ph10 462 {
3241 ph10 463 SCHECK_PARTIAL();
3242 ph10 462 break;
3243 ph10 463 }
3244 ph10 462 if (fc != *eptr) break;
3245 nigel 77 eptr++;
3246     }
3247 nigel 93 if (possessive) continue;
3248 ph10 443
3249 nigel 77 while (eptr >= pp)
3250     {
3251 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3252 nigel 77 eptr--;
3253     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3254     }
3255 ph10 510 MRRETURN(MATCH_NOMATCH);
3256 nigel 77 }
3257     }
3258     /* Control never gets here */
3259    
3260     /* Match a negated single one-byte character. The character we are
3261     checking can be multibyte. */
3262    
3263     case OP_NOT:
3264 ph10 602 case OP_NOTI:
3265 ph10 443 if (eptr >= md->end_subject)
3266 ph10 428 {
3267 ph10 443 SCHECK_PARTIAL();
3268 ph10 510 MRRETURN(MATCH_NOMATCH);
3269 ph10 443 }
3270 nigel 77 ecode++;
3271     GETCHARINCTEST(c, eptr);
3272 ph10 602 if (op == OP_NOTI) /* The caseless case */
3273 nigel 77 {
3274     #ifdef SUPPORT_UTF8
3275     if (c < 256)
3276     #endif
3277     c = md->lcc[c];
3278 ph10 510 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3279 nigel 77 }
3280 ph10 602 else /* Caseful */
3281 nigel 77 {
3282 ph10 510 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3283 nigel 77 }
3284     break;
3285    
3286     /* Match a negated single one-byte character repeatedly. This is almost a
3287     repeat of the code for a repeated single character, but I haven't found a
3288     nice way of commoning these up that doesn't require a test of the
3289     positive/negative option for each character match. Maybe that wouldn't add
3290     very much to the time taken, but character matching *is* what this is all
3291     about... */
3292    
3293     case OP_NOTEXACT:
3294 ph10 602 case OP_NOTEXACTI:
3295 nigel 77 min = max = GET2(ecode, 1);
3296     ecode += 3;
3297     goto REPEATNOTCHAR;
3298    
3299     case OP_NOTUPTO:
3300 ph10 602 case OP_NOTUPTOI:
3301 nigel 77 case OP_NOTMINUPTO:
3302 ph10 602 case OP_NOTMINUPTOI:
3303 nigel 77 min = 0;
3304     max = GET2(ecode, 1);
3305 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3306 nigel 77 ecode += 3;
3307     goto REPEATNOTCHAR;
3308    
3309 nigel 93 case OP_NOTPOSSTAR:
3310 ph10 602 case OP_NOTPOSSTARI:
3311 nigel 93 possessive = TRUE;
3312     min = 0;
3313     max = INT_MAX;
3314     ecode++;
3315     goto REPEATNOTCHAR;
3316    
3317     case OP_NOTPOSPLUS:
3318 ph10 602 case OP_NOTPOSPLUSI:
3319 nigel 93 possessive = TRUE;
3320     min = 1;
3321     max = INT_MAX;
3322     ecode++;
3323     goto REPEATNOTCHAR;
3324    
3325     case OP_NOTPOSQUERY:
3326 ph10 602 case OP_NOTPOSQUERYI:
3327 nigel 93 possessive = TRUE;
3328     min = 0;
3329     max = 1;
3330     ecode++;
3331     goto REPEATNOTCHAR;
3332    
3333     case OP_NOTPOSUPTO:
3334 ph10 602 case OP_NOTPOSUPTOI:
3335 nigel 93 possessive = TRUE;
3336     min = 0;
3337     max = GET2(ecode, 1);
3338     ecode += 3;
3339     goto REPEATNOTCHAR;
3340    
3341 nigel 77 case OP_NOTSTAR:
3342 ph10 602 case OP_NOTSTARI:
3343 nigel 77 case OP_NOTMINSTAR:
3344 ph10 602 case OP_NOTMINSTARI:
3345 nigel 77 case OP_NOTPLUS:
3346 ph10 602 case OP_NOTPLUSI:
3347 nigel 77 case OP_NOTMINPLUS:
3348 ph10 602 case OP_NOTMINPLUSI:
3349 nigel 77 case OP_NOTQUERY:
3350 ph10 602 case OP_NOTQUERYI:
3351 nigel 77 case OP_NOTMINQUERY:
3352 ph10 602 case OP_NOTMINQUERYI:
3353     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3354 nigel 77 minimize = (c & 1) != 0;
3355     min = rep_min[c]; /* Pick up values from tables; */
3356     max = rep_max[c]; /* zero for max => infinity */
3357     if (max == 0) max = INT_MAX;
3358    
3359 ph10 426 /* Common code for all repeated single-byte matches. */
3360 nigel 77
3361     REPEATNOTCHAR:
3362     fc = *ecode++;
3363    
3364     /* The code is duplicated for the caseless and caseful cases, for speed,
3365     since matching characters is likely to be quite common. First, ensure the
3366     minimum number of matches are present. If min = max, continue at the same
3367     level without recursing. Otherwise, if minimizing, keep trying the rest of
3368     the expression and advancing one matching character if failing, up to the
3369     maximum. Alternatively, if maximizing, find the maximum number of
3370     characters and work backwards. */
3371    
3372     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3373     max, eptr));
3374    
3375 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3376 nigel 77 {
3377     fc = md->lcc[fc];
3378    
3379     #ifdef SUPPORT_UTF8
3380     /* UTF-8 mode */
3381     if (utf8)
3382     {
3383 nigel 93 register unsigned int d;
3384 nigel 77 for (i = 1; i <= min; i++)
3385     {
3386 ph10 426 if (eptr >= md->end_subject)
3387     {
3388     SCHECK_PARTIAL();
3389 ph10 510 MRRETURN(MATCH_NOMATCH);
3390 ph10 427 }
3391 nigel 77 GETCHARINC(d, eptr);
3392     if (d < 256) d = md->lcc[d];
3393 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3394 nigel 77 }
3395     }
3396     else
3397     #endif
3398    
3399     /* Not UTF-8 mode */
3400     {
3401     for (i = 1; i <= min; i++)
3402 ph10 426 {
3403     if (eptr >= md->end_subject)
3404     {
3405     SCHECK_PARTIAL();
3406 ph10 510 MRRETURN(MATCH_NOMATCH);
3407 ph10 427 }
3408 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3409 ph10 427 }
3410 nigel 77 }
3411    
3412     if (min == max) continue;
3413    
3414     if (minimize)
3415     {
3416     #ifdef SUPPORT_UTF8
3417     /* UTF-8 mode */
3418     if (utf8)
3419     {
3420 nigel 93 register unsigned int d;
3421 nigel 77 for (fi = min;; fi++)
3422     {
3423 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3424 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3425 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3426 ph10 427 if (eptr >= md->end_subject)
3427 ph10 426 {
3428 ph10 427 SCHECK_PARTIAL();
3429 ph10 510 MRRETURN(MATCH_NOMATCH);
3430 ph10 427 }
3431 nigel 77 GETCHARINC(d, eptr);
3432     if (d < 256) d = md->lcc[d];
3433 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3434 nigel 77 }
3435     }
3436     else
3437     #endif
3438     /* Not UTF-8 mode */
3439     {
3440     for (fi = min;; fi++)
3441     {
3442 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3443 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3444 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3445 ph10 426 if (eptr >= md->end_subject)
3446     {
3447     SCHECK_PARTIAL();
3448 ph10 510 MRRETURN(MATCH_NOMATCH);
3449 ph10 426 }
3450 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3451 nigel 77 }
3452     }
3453     /* Control never gets here */
3454     }
3455    
3456     /* Maximize case */
3457    
3458     else
3459     {
3460     pp = eptr;
3461    
3462     #ifdef SUPPORT_UTF8
3463     /* UTF-8 mode */
3464     if (utf8)
3465     {
3466 nigel 93 register unsigned int d;
3467 nigel 77 for (i = min; i < max; i++)
3468     {
3469     int len = 1;
3470 ph10 463 if (eptr >= md->end_subject)
3471 ph10 462 {
3472 ph10 463 SCHECK_PARTIAL();
3473 ph10 462 break;
3474 ph10 463 }
3475 nigel 77 GETCHARLEN(d, eptr, len);
3476     if (d < 256) d = md->lcc[d];
3477     if (fc == d) break;
3478     eptr += len;
3479     }
3480 nigel 93 if (possessive) continue;
3481     for(;;)
3482 nigel 77 {
3483 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3484 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3485     if (eptr-- == pp) break; /* Stop if tried at original pos */
3486     BACKCHAR(eptr);
3487     }
3488     }
3489     else
3490     #endif
3491     /* Not UTF-8 mode */
3492     {
3493     for (i = min; i < max; i++)
3494     {
3495 ph10 463 if (eptr >= md->end_subject)
3496 ph10 462 {
3497     SCHECK_PARTIAL();
3498     break;
3499 ph10 463 }
3500 ph10 462 if (fc == md->lcc[*eptr]) break;
3501 nigel 77 eptr++;
3502     }
3503 nigel 93 if (possessive) continue;
3504 nigel 77 while (eptr >= pp)
3505     {
3506 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3507 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3508     eptr--;
3509     }
3510     }
3511    
3512 ph10 510 MRRETURN(MATCH_NOMATCH);
3513 nigel 77 }
3514     /* Control never gets here */
3515     }
3516    
3517     /* Caseful comparisons */
3518    
3519     else
3520     {
3521     #ifdef SUPPORT_UTF8
3522     /* UTF-8 mode */
3523     if (utf8)
3524     {
3525 nigel 93 register unsigned int d;
3526 nigel 77 for (i = 1; i <= min; i++)
3527     {
3528 ph10 426 if (eptr >= md->end_subject)
3529     {
3530     SCHECK_PARTIAL();
3531 ph10 510 MRRETURN(MATCH_NOMATCH);
3532 ph10 427 }
3533 nigel 77 GETCHARINC(d, eptr);
3534 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3535 nigel 77 }
3536     }
3537     else
3538     #endif
3539     /* Not UTF-8 mode */
3540     {
3541     for (i = 1; i <= min; i++)
3542 ph10 426 {
3543     if (eptr >= md->end_subject)
3544     {
3545     SCHECK_PARTIAL();
3546 ph10 510 MRRETURN(MATCH_NOMATCH);
3547 ph10 427 }
3548 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3549 ph10 427 }
3550 nigel 77 }
3551    
3552     if (min == max) continue;
3553    
3554     if (minimize)
3555     {
3556     #ifdef SUPPORT_UTF8
3557     /* UTF-8 mode */
3558     if (utf8)
3559     {
3560 nigel 93 register unsigned int d;
3561 nigel 77 for (fi = min;; fi++)
3562     {
3563 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3564 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3565 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3566 ph10 427 if (eptr >= md->end_subject)
3567 ph10 426 {
3568 ph10 427 SCHECK_PARTIAL();
3569 ph10 510 MRRETURN(MATCH_NOMATCH);
3570 ph10 427 }
3571 nigel 77 GETCHARINC(d, eptr);
3572 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3573 nigel 77 }
3574     }
3575     else
3576     #endif
3577     /* Not UTF-8 mode */
3578     {
3579     for (fi = min;; fi++)
3580     {
3581 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3582 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3583 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3584 ph10 426 if (eptr >= md->end_subject)
3585     {
3586     SCHECK_PARTIAL();
3587 ph10 510 MRRETURN(MATCH_NOMATCH);
3588 ph10 427 }
3589 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3590 nigel 77 }
3591     }
3592     /* Control never gets here */
3593     }
3594    
3595     /* Maximize case */
3596    
3597     else
3598     {
3599     pp = eptr;
3600    
3601     #ifdef SUPPORT_UTF8
3602     /* UTF-8 mode */
3603     if (utf8)
3604     {
3605 nigel 93 register unsigned int d;
3606 nigel 77 for (i = min; i < max; i++)
3607     {
3608     int len = 1;
3609 ph10 463 if (eptr >= md->end_subject)
3610 ph10 462 {
3611 ph10 463 SCHECK_PARTIAL();
3612 ph10 462 break;
3613 ph10 463 }
3614 nigel 77 GETCHARLEN(d, eptr, len);
3615     if (fc == d) break;
3616     eptr += len;
3617     }
3618 nigel 93 if (possessive) continue;
3619 nigel 77 for(;;)
3620     {
3621 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3622 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3623     if (eptr-- == pp) break; /* Stop if tried at original pos */
3624     BACKCHAR(eptr);
3625     }
3626     }
3627     else
3628     #endif
3629     /* Not UTF-8 mode */
3630     {
3631     for (i = min; i < max; i++)
3632     {
3633 ph10 463 if (eptr >= md->end_subject)
3634 ph10 462 {
3635 ph10 463 SCHECK_PARTIAL();
3636 ph10 462 break;
3637 ph10 463 }
3638 ph10 462 if (fc == *eptr) break;
3639 nigel 77 eptr++;
3640     }
3641 nigel 93 if (possessive) continue;
3642 nigel 77 while (eptr >= pp)
3643     {
3644 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3645 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3646     eptr--;
3647     }
3648     }
3649    
3650 ph10 510 MRRETURN(MATCH_NOMATCH);
3651 nigel 77 }
3652     }
3653     /* Control never gets here */
3654    
3655     /* Match a single character type repeatedly; several different opcodes
3656     share code. This is very similar to the code for single characters, but we
3657     repeat it in the interests of efficiency. */
3658    
3659     case OP_TYPEEXACT:
3660     min = max = GET2(ecode, 1);
3661     minimize = TRUE;
3662     ecode += 3;
3663     goto REPEATTYPE;
3664    
3665     case OP_TYPEUPTO:
3666     case OP_TYPEMINUPTO:
3667     min = 0;
3668     max = GET2(ecode, 1);
3669     minimize = *ecode == OP_TYPEMINUPTO;
3670     ecode += 3;
3671     goto REPEATTYPE;
3672    
3673 nigel 93 case OP_TYPEPOSSTAR:
3674     possessive = TRUE;
3675     min = 0;
3676     max = INT_MAX;
3677     ecode++;
3678     goto REPEATTYPE;
3679    
3680     case OP_TYPEPOSPLUS:
3681     possessive = TRUE;
3682     min = 1;
3683     max = INT_MAX;
3684     ecode++;
3685     goto REPEATTYPE;
3686    
3687     case OP_TYPEPOSQUERY:
3688     possessive = TRUE;
3689     min = 0;
3690     max = 1;
3691     ecode++;
3692     goto REPEATTYPE;
3693    
3694     case OP_TYPEPOSUPTO:
3695     possessive = TRUE;
3696     min = 0;
3697     max = GET2(ecode, 1);
3698     ecode += 3;
3699     goto REPEATTYPE;
3700    
3701 nigel 77 case OP_TYPESTAR:
3702     case OP_TYPEMINSTAR:
3703     case OP_TYPEPLUS:
3704     case OP_TYPEMINPLUS:
3705     case OP_TYPEQUERY:
3706     case OP_TYPEMINQUERY:
3707     c = *ecode++ - OP_TYPESTAR;
3708     minimize = (c & 1) != 0;
3709     min = rep_min[c]; /* Pick up values from tables; */
3710     max = rep_max[c]; /* zero for max => infinity */
3711     if (max == 0) max = INT_MAX;
3712    
3713     /* Common code for all repeated single character type matches. Note that
3714     in UTF-8 mode, '.' matches a character of any length, but for the other
3715     character types, the valid characters are all one-byte long. */
3716    
3717     REPEATTYPE:
3718     ctype = *ecode++; /* Code for the character type */
3719    
3720     #ifdef SUPPORT_UCP
3721     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3722     {
3723     prop_fail_result = ctype == OP_NOTPROP;
3724     prop_type = *ecode++;
3725 nigel 87 prop_value = *ecode++;
3726 nigel 77 }
3727     else prop_type = -1;
3728     #endif
3729    
3730     /* First, ensure the minimum number of matches are present. Use inline
3731     code for maximizing the speed, and do the type test once at the start
3732 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3733 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3734     and single-bytes. */
3735    
3736     if (min > 0)
3737     {
3738     #ifdef SUPPORT_UCP
3739 nigel 87 if (prop_type >= 0)
3740 nigel 77 {
3741 nigel 87 switch(prop_type)
3742 nigel 77 {
3743 nigel 87 case PT_ANY:
3744 ph10 510 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3745 nigel 87 for (i = 1; i <= min; i++)
3746     {
3747 ph10 427 if (eptr >= md->end_subject)
3748 ph10 426 {
3749 ph10 427 SCHECK_PARTIAL();
3750 ph10 510 MRRETURN(MATCH_NOMATCH);
3751 ph10 427 }
3752 ph10 184 GETCHARINCTEST(c, eptr);
3753 nigel 87 }
3754     break;
3755    
3756     case PT_LAMP:
3757     for (i = 1; i <= min; i++)
3758     {
3759 ph10 427 if (eptr >= md->end_subject)
3760 ph10 426 {
3761 ph10 427 SCHECK_PARTIAL();
3762 ph10 510 MRRETURN(MATCH_NOMATCH);
3763 ph10 427 }
3764 ph10 184 GETCHARINCTEST(c, eptr);
3765 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3766 nigel 87 if ((prop_chartype == ucp_Lu ||
3767     prop_chartype == ucp_Ll ||
3768     prop_chartype == ucp_Lt) == prop_fail_result)
3769 ph10 510 MRRETURN(MATCH_NOMATCH);
3770 nigel 87 }
3771     break;
3772    
3773     case PT_GC:
3774     for (i = 1; i <= min; i++)
3775     {
3776 ph10 427 if (eptr >= md->end_subject)
3777 ph10 426 {
3778 ph10 427 SCHECK_PARTIAL();
3779 ph10 510 MRRETURN(MATCH_NOMATCH);
3780 ph10 427 }
3781 ph10 184 GETCHARINCTEST(c, eptr);
3782 ph10 349 prop_category = UCD_CATEGORY(c);
3783 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3784 ph10 510 MRRETURN(MATCH_NOMATCH);
3785 nigel 87 }
3786     break;
3787    
3788     case PT_PC:
3789     for (i = 1; i <= min; i++)
3790     {
3791 ph10 427 if (eptr >= md->end_subject)
3792 ph10 426 {
3793 ph10 427 SCHECK_PARTIAL();
3794 ph10 510 MRRETURN(MATCH_NOMATCH);
3795 ph10 427 }
3796 ph10 184 GETCHARINCTEST(c, eptr);
3797 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3798 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3799 ph10 510 MRRETURN(MATCH_NOMATCH);
3800 nigel 87 }
3801     break;
3802    
3803     case PT_SC:
3804     for (i = 1; i <= min; i++)
3805     {
3806 ph10 427 if (eptr >= md->end_subject)
3807 ph10 426 {
3808 ph10 427 SCHECK_PARTIAL();
3809 ph10 510 MRRETURN(MATCH_NOMATCH);
3810 ph10 427 }
3811 ph10 184 GETCHARINCTEST(c, eptr);
3812 ph10 349 prop_script = UCD_SCRIPT(c);
3813 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3814 ph10 510 MRRETURN(MATCH_NOMATCH);
3815 nigel 87 }
3816     break;
3817 ph10 527
3818 ph10 517 case PT_ALNUM:
3819     for (i = 1; i <= min; i++)
3820     {
3821     if (eptr >= md->end_subject)
3822     {
3823     SCHECK_PARTIAL();
3824     MRRETURN(MATCH_NOMATCH);
3825     }
3826     GETCHARINCTEST(c, eptr);
3827 ph10 527 prop_category = UCD_CATEGORY(c);
3828     if ((prop_category == ucp_L || prop_category == ucp_N)
3829 ph10 517 == prop_fail_result)
3830     MRRETURN(MATCH_NOMATCH);
3831     }
3832     break;
3833 ph10 527
3834 ph10 517 case PT_SPACE: /* Perl space */
3835     for (i = 1; i <= min; i++)
3836     {
3837     if (eptr >= md->end_subject)
3838     {
3839     SCHECK_PARTIAL();
3840     MRRETURN(MATCH_NOMATCH);
3841     }
3842     GETCHARINCTEST(c, eptr);
3843 ph10 527 prop_category = UCD_CATEGORY(c);
3844     if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3845     c == CHAR_FF || c == CHAR_CR)
3846 ph10 517 == prop_fail_result)
3847     MRRETURN(MATCH_NOMATCH);
3848     }
3849     break;
3850 ph10 527
3851 ph10 517 case PT_PXSPACE: /* POSIX space */
3852     for (i = 1; i <= min; i++)
3853     {
3854     if (eptr >= md->end_subject)
3855     {
3856     SCHECK_PARTIAL();
3857     MRRETURN(MATCH_NOMATCH);
3858     }
3859     GETCHARINCTEST(c, eptr);
3860 ph10 527 prop_category = UCD_CATEGORY(c);
3861     if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3862     c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3863 ph10 517 == prop_fail_result)
3864     MRRETURN(MATCH_NOMATCH);
3865     }
3866     break;
3867 ph10 527
3868     case PT_WORD:
3869 ph10 517 for (i = 1; i <= min; i++)
3870     {
3871     if (eptr >= md->end_subject)
3872     {
3873     SCHECK_PARTIAL();
3874     MRRETURN(MATCH_NOMATCH);
3875     }
3876     GETCHARINCTEST(c, eptr);
3877 ph10 527 prop_category = UCD_CATEGORY(c);
3878 ph10 517 if ((prop_category == ucp_L || prop_category == ucp_N ||
3879 ph10 527 c == CHAR_UNDERSCORE)
3880 ph10 517 == prop_fail_result)
3881     MRRETURN(MATCH_NOMATCH);
3882     }
3883     break;
3884 ph10 527
3885 ph10 517 /* This should not occur */
3886 nigel 87
3887     default:
3888     RRETURN(PCRE_ERROR_INTERNAL);
3889 nigel 77 }
3890     }
3891    
3892     /* Match extended Unicode sequences. We will get here only if the
3893     support is in the binary; otherwise a compile-time error occurs. */
3894    
3895     else if (ctype == OP_EXTUNI)
3896     {
3897     for (i = 1; i <= min; i++)
3898     {
3899 ph10 427 if (eptr >= md->end_subject)
3900 ph10 426 {
3901 ph10 427 SCHECK_PARTIAL();
3902 ph10 510 MRRETURN(MATCH_NOMATCH);
3903 ph10 427 }
3904 nigel 77 GETCHARINCTEST(c, eptr);
3905 ph10 349 prop_category = UCD_CATEGORY(c);
3906 ph10 510 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3907 nigel 77 while (eptr < md->end_subject)
3908     {
3909     int len = 1;
3910 ph10 426 if (!utf8) c = *eptr;
3911     else { GETCHARLEN(c, eptr, len); }
3912 ph10 349 prop_category = UCD_CATEGORY(c);
3913 nigel 77 if (prop_category != ucp_M) break;
3914     eptr += len;
3915     }
3916     }
3917     }
3918    
3919     else
3920     #endif /* SUPPORT_UCP */
3921    
3922     /* Handle all other cases when the coding is UTF-8 */
3923    
3924     #ifdef SUPPORT_UTF8
3925     if (utf8) switch(ctype)
3926     {
3927     case OP_ANY:
3928     for (i = 1; i <= min; i++)
3929     {
3930 ph10 426 if (eptr >= md->end_subject)
3931     {
3932 ph10 427 SCHECK_PARTIAL();
3933 ph10 510 MRRETURN(MATCH_NOMATCH);
3934 ph10 427 }
3935 ph10 510 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3936 nigel 91 eptr++;
3937 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3938     }
3939     break;
3940    
3941 ph10 341 case OP_ALLANY:
3942     for (i = 1; i <= min; i++)
3943     {
3944 ph10 427 if (eptr >= md->end_subject)
3945 ph10 426 {
3946     SCHECK_PARTIAL();
3947 ph10 510 MRRETURN(MATCH_NOMATCH);
3948 ph10 427 }
3949 ph10 341 eptr++;
3950     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3951     }
3952     break;
3953    
3954 nigel 77 case OP_ANYBYTE:
3955 ph10 510 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3956 nigel 77 eptr += min;