--- code/trunk/pcre_exec.c 2007/06/13 15:09:54 182 +++ code/trunk/pcre_exec.c 2009/03/23 12:05:43 406 @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2007 University of Cambridge + Copyright (c) 1997-2009 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -42,6 +42,10 @@ pattern matching using an NFA algorithm, trying to mimic Perl as closely as possible. There are also some static supporting functions. */ +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + #define NLBLOCK md /* Block containing newline information */ #define PSSTART start_subject /* Field containing processed string start */ #define PSEND end_subject /* Field containing processed string end */ @@ -53,16 +57,10 @@ #undef min #undef max -/* The chain of eptrblocks for tail recursions uses memory in stack workspace, -obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */ - -#define EPTR_WORK_SIZE (1000) - /* Flag bits for the match() function */ #define match_condassert 0x01 /* Called to check a condition assertion */ #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */ -#define match_tail_recursed 0x04 /* Tail recursive call */ /* Non-error returns from the match() function. Error returns are externally defined PCRE_ERROR_xxx codes, which are all negative. */ @@ -70,6 +68,14 @@ #define MATCH_MATCH 1 #define MATCH_NOMATCH 0 +/* Special internal returns from the match() function. Make them sufficiently +negative to avoid the external error codes. */ + +#define MATCH_COMMIT (-999) +#define MATCH_PRUNE (-998) +#define MATCH_SKIP (-997) +#define MATCH_THEN (-996) + /* Maximum number of ints of offset to save on the stack for recursive calls. If the offset vector is bigger, malloc is used. This should be a multiple of 3, because the offset vector is always a multiple of 3 long. */ @@ -152,13 +158,39 @@ if (length > md->end_subject - eptr) return FALSE; -/* Separate the caselesss case for speed */ +/* Separate the caseless case for speed. In UTF-8 mode we can only do this +properly if Unicode properties are supported. Otherwise, we can check only +ASCII characters. */ if ((ims & PCRE_CASELESS) != 0) { +#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UCP + if (md->utf8) + { + USPTR endptr = eptr + length; + while (eptr < endptr) + { + int c, d; + GETCHARINC(c, eptr); + GETCHARINC(d, p); + if (c != d && c != UCD_OTHERCASE(d)) return FALSE; + } + } + else +#endif +#endif + + /* The same code works when not in UTF-8 mode and in UTF-8 mode when there + is no UCP support. */ + while (length-- > 0) - if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; + { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; } } + +/* In the caseful case, we can just compare the bytes, whether or not we +are in UTF-8 mode. */ + else { while (length-- > 0) if (*p++ != *eptr++) return FALSE; } @@ -205,15 +237,15 @@ **************************************************************************** ***************************************************************************/ - -/* Numbers for RMATCH calls */ +/* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN +below must be updated in sync. */ enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40, - RM41, RM42, RM43, RM44, RM45, RM46, RM47 }; - + RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50, + RM51, RM52, RM53, RM54 }; /* These versions of the macros use the stack, as normal. There are debugging versions and production versions. Note that the "rw" argument of RMATCH isn't @@ -302,7 +334,9 @@ /* Function local variables */ const uschar *Xcallpat; +#ifdef SUPPORT_UTF8 const uschar *Xcharptr; +#endif const uschar *Xdata; const uschar *Xnext; const uschar *Xpp; @@ -328,6 +362,7 @@ uschar Xocchars[8]; #endif + int Xcodelink; int Xctype; unsigned int Xfc; int Xfi; @@ -384,7 +419,6 @@ match_condassert - this is an assertion condition match_cbegroup - this is the start of an unlimited repeat group that can match an empty string - match_tail_recursed - this is a tail_recursed group rdepth the recursion depth Returns: MATCH_MATCH if matched ) these values are >= 0 @@ -408,6 +442,7 @@ register BOOL utf8; /* Local copy of UTF-8 flag for speed */ BOOL minimize, possessive; /* Quantifier options */ +int condcode; /* When recursion is not being used, all "local" variables that have to be preserved over calls to RMATCH() are part of a "frame" which is obtained from @@ -450,6 +485,7 @@ #define charptr frame->Xcharptr #endif #define callpat frame->Xcallpat +#define codelink frame->Xcodelink #define data frame->Xdata #define next frame->Xnext #define pp frame->Xpp @@ -530,6 +566,7 @@ uschar occhars[8]; #endif +int codelink; int ctype; int length; int max; @@ -586,22 +623,16 @@ string, the match_cbegroup flag is set. When this is the case, add the current subject pointer to the chain of such remembered pointers, to be checked when we hit the closing ket, in order to break infinite loops that match no characters. -When match() is called in other circumstances, don't add to the chain. If this -is a tail recursion, use a block from the workspace, as the one on the stack is -already used. */ +When match() is called in other circumstances, don't add to the chain. The +match_cbegroup flag must NOT be used with tail recursion, because the memory +block that is used is on the stack, so a new one may be required for each +match(). */ if ((flags & match_cbegroup) != 0) { - eptrblock *p; - if ((flags & match_tail_recursed) != 0) - { - if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT); - p = md->eptrchain + md->eptrn++; - } - else p = &newptrb; - p->epb_saved_eptr = eptr; - p->epb_prev = eptrb; - eptrb = p; + newptrb.epb_saved_eptr = eptr; + newptrb.epb_prev = eptrb; + eptrb = &newptrb; } /* Now start processing the opcodes. */ @@ -621,6 +652,34 @@ switch(op) { + case OP_FAIL: + RRETURN(MATCH_NOMATCH); + + case OP_PRUNE: + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + ims, eptrb, flags, RM51); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + RRETURN(MATCH_PRUNE); + + case OP_COMMIT: + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + ims, eptrb, flags, RM52); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + RRETURN(MATCH_COMMIT); + + case OP_SKIP: + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + ims, eptrb, flags, RM53); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + md->start_match_ptr = eptr; /* Pass back current position */ + RRETURN(MATCH_SKIP); + + case OP_THEN: + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, + ims, eptrb, flags, RM54); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + RRETURN(MATCH_THEN); + /* Handle a capturing bracket. If there is space in the offset vector, save the current subject position in the working slot at the top of the vector. We mustn't change the current values of the data slot, because they may be @@ -662,7 +721,7 @@ { RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM1); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); md->capture_last = save_capture_last; ecode += GET(ecode, 1); } @@ -677,15 +736,22 @@ RRETURN(MATCH_NOMATCH); } - /* Insufficient room for saving captured contents. Treat as a non-capturing - bracket. */ + /* FALL THROUGH ... Insufficient room for saving captured contents. Treat + as a non-capturing bracket. */ + + /* VVVVVVVVVVVVVVVVVVVVVVVVV */ + /* VVVVVVVVVVVVVVVVVVVVVVVVV */ DPRINTF(("insufficient capture room: treat as non-capturing\n")); + /* VVVVVVVVVVVVVVVVVVVVVVVVV */ + /* VVVVVVVVVVVVVVVVVVVVVVVVV */ + /* Non-capturing bracket. Loop for all the alternatives. When we get to the final alternative within the brackets, we would return the result of a recursive call to match() whatever happened. We can reduce stack usage by - turning this into a tail recursion. */ + turning this into a tail recursion, except in the case when match_cbegroup + is set.*/ case OP_BRA: case OP_SBRA: @@ -693,12 +759,20 @@ flags = (op >= OP_SBRA)? match_cbegroup : 0; for (;;) { - if (ecode[GET(ecode, 1)] != OP_ALT) + if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */ { - ecode += _pcre_OP_lengths[*ecode]; - flags |= match_tail_recursed; - DPRINTF(("bracket 0 tail recursion\n")); - goto TAIL_RECURSE; + if (flags == 0) /* Not a possibly empty group */ + { + ecode += _pcre_OP_lengths[*ecode]; + DPRINTF(("bracket 0 tail recursion\n")); + goto TAIL_RECURSE; + } + + /* Possibly empty group; can't use tail recursion. */ + + RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, + eptrb, flags, RM48); + RRETURN(rrc); } /* For non-final alternatives, continue the loop for a NOMATCH result; @@ -706,7 +780,7 @@ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM2); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); ecode += GET(ecode, 1); } /* Control never reaches here. */ @@ -719,7 +793,39 @@ case OP_COND: case OP_SCOND: - if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */ + codelink= GET(ecode, 1); + + /* Because of the way auto-callout works during compile, a callout item is + inserted between OP_COND and an assertion condition. */ + + if (ecode[LINK_SIZE+1] == OP_CALLOUT) + { + if (pcre_callout != NULL) + { + pcre_callout_block cb; + cb.version = 1; /* Version 1 of the callout block */ + cb.callout_number = ecode[LINK_SIZE+2]; + cb.offset_vector = md->offset_vector; + cb.subject = (PCRE_SPTR)md->start_subject; + cb.subject_length = md->end_subject - md->start_subject; + cb.start_match = mstart - md->start_subject; + cb.current_position = eptr - md->start_subject; + cb.pattern_position = GET(ecode, LINK_SIZE + 3); + cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE); + cb.capture_top = offset_top/2; + cb.capture_last = md->capture_last; + cb.callout_data = md->callout_data; + if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); + if (rrc < 0) RRETURN(rrc); + } + ecode += _pcre_OP_lengths[OP_CALLOUT]; + } + + condcode = ecode[LINK_SIZE+1]; + + /* Now see what the actual condition is */ + + if (condcode == OP_RREF) /* Recursion test */ { offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ condition = md->recursive != NULL && @@ -727,14 +833,14 @@ ecode += condition? 3 : GET(ecode, 1); } - else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */ + else if (condcode == OP_CREF) /* Group used test */ { offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ condition = offset < offset_top && md->offset_vector[offset] >= 0; ecode += condition? 3 : GET(ecode, 1); } - else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */ + else if (condcode == OP_DEF) /* DEFINE - always false */ { condition = FALSE; ecode += GET(ecode, 1); @@ -754,37 +860,48 @@ ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); while (*ecode == OP_ALT) ecode += GET(ecode, 1); } - else if (rrc != MATCH_NOMATCH) + else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) { RRETURN(rrc); /* Need braces because of following else */ } else { condition = FALSE; - ecode += GET(ecode, 1); + ecode += codelink; } } /* We are now at the branch that is to be obeyed. As there is only one, - we can use tail recursion to avoid using another stack frame. If the second - alternative doesn't exist, we can just plough on. */ + we can use tail recursion to avoid using another stack frame, except when + match_cbegroup is required for an unlimited repeat of a possibly empty + group. If the second alternative doesn't exist, we can just plough on. */ if (condition || *ecode == OP_ALT) { ecode += 1 + LINK_SIZE; - flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0); - goto TAIL_RECURSE; + if (op == OP_SCOND) /* Possibly empty group */ + { + RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49); + RRETURN(rrc); + } + else /* Group must match something */ + { + flags = 0; + goto TAIL_RECURSE; + } } - else + else /* Condition false & no alternative */ { ecode += 1 + LINK_SIZE; } break; - /* End of the pattern. If we are in a top-level recursion, we should - restore the offsets appropriately and continue from after the call. */ + /* End of the pattern, either real or forced. If we are in a top-level + recursion, we should restore the offsets appropriately and continue from + after the call. */ + case OP_ACCEPT: case OP_END: if (md->recursive != NULL && md->recursive->group_num == 0) { @@ -805,7 +922,7 @@ if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH); md->end_match_ptr = eptr; /* Record where we ended */ md->end_offset_top = offset_top; /* and how many extracts were taken */ - md->start_match_ptr = mstart; /* and the start (\K can modify) */ + md->start_match_ptr = mstart; /* and the start (\K can modify) */ RRETURN(MATCH_MATCH); /* Change option settings */ @@ -829,7 +946,7 @@ RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, RM4); if (rrc == MATCH_MATCH) break; - if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); ecode += GET(ecode, 1); } while (*ecode == OP_ALT); @@ -856,7 +973,7 @@ RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, RM5); if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH); - if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); ecode += GET(ecode,1); } while (*ecode == OP_ALT); @@ -880,7 +997,7 @@ { eptr--; if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); - BACKCHAR(eptr) + BACKCHAR(eptr); } } else @@ -993,9 +1110,11 @@ (pcre_free)(new_recursive.offset_save); RRETURN(MATCH_MATCH); } - else if (rrc != MATCH_NOMATCH) + else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) { DPRINTF(("Recursion gave error %d\n", rrc)); + if (new_recursive.offset_save != stacksave) + (pcre_free)(new_recursive.offset_save); RRETURN(rrc); } @@ -1027,10 +1146,9 @@ do { - RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, - eptrb, 0, RM7); + RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7); if (rrc == MATCH_MATCH) break; - if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); ecode += GET(ecode,1); } while (*ecode == OP_ALT); @@ -1073,11 +1191,10 @@ if (*ecode == OP_KETRMIN) { - RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, - RM8); + RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode = prev; - flags = match_tail_recursed; + flags = 0; goto TAIL_RECURSE; } else /* OP_KETRMAX */ @@ -1085,7 +1202,7 @@ RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += 1 + LINK_SIZE; - flags = match_tail_recursed; + flags = 0; goto TAIL_RECURSE; } /* Control never gets here */ @@ -1097,11 +1214,11 @@ do ecode += GET(ecode,1); while (*ecode == OP_ALT); break; - /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating - that it may occur zero times. It may repeat infinitely, or not at all - - i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper - repeat limits are compiled as a number of copies, with the optional ones - preceded by BRAZERO or BRAMINZERO. */ + /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group, + indicating that it may occur zero times. It may repeat infinitely, or not + at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets + with fixed upper repeat limits are compiled as a number of copies, with the + optional ones preceded by BRAZERO or BRAMINZERO. */ case OP_BRAZERO: { @@ -1123,6 +1240,14 @@ } break; + case OP_SKIPZERO: + { + next = ecode+1; + do next += GET(next,1); while (*next == OP_ALT); + ecode = next + 1 + LINK_SIZE; + } + break; + /* End of a group, repeated or non-repeating. */ case OP_KET: @@ -1216,17 +1341,21 @@ /* The repeating kets try the rest of the pattern or restart from the preceding bracket, in the appropriate order. In the second case, we can use - tail recursion to avoid using another stack frame. */ + tail recursion to avoid using another stack frame, unless we have an + unlimited repeat of a group that can match an empty string. */ flags = (*prev >= OP_SBRA)? match_cbegroup : 0; if (*ecode == OP_KETRMIN) { - RMATCH(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, - RM12); + RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12); if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (flags != 0) /* Could match an empty string */ + { + RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50); + RRETURN(rrc); + } ecode = prev; - flags |= match_tail_recursed; goto TAIL_RECURSE; } else /* OP_KETRMAX */ @@ -1234,7 +1363,7 @@ RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += 1 + LINK_SIZE; - flags = match_tail_recursed; + flags = 0; goto TAIL_RECURSE; } /* Control never gets here */ @@ -1366,13 +1495,12 @@ /* Match a single character type; inline for speed */ case OP_ANY: - if ((ims & PCRE_DOTALL) == 0) - { - if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); - } + if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + /* Fall through */ + + case OP_ALLANY: if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); - if (utf8) - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; ecode++; break; @@ -1471,12 +1599,16 @@ case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; + case 0x000a: + break; + case 0x000b: case 0x000c: case 0x0085: case 0x2028: case 0x2029: + if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } ecode++; @@ -1587,8 +1719,7 @@ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); { - int chartype, script; - int category = _pcre_ucp_findprop(c, &chartype, &script); + const ucd_record *prop = GET_UCD(c); switch(ecode[1]) { @@ -1597,24 +1728,24 @@ break; case PT_LAMP: - if ((chartype == ucp_Lu || - chartype == ucp_Ll || - chartype == ucp_Lt) == (op == OP_NOTPROP)) + if ((prop->chartype == ucp_Lu || + prop->chartype == ucp_Ll || + prop->chartype == ucp_Lt) == (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); break; case PT_GC: - if ((ecode[2] != category) == (op == OP_PROP)) + if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; case PT_PC: - if ((ecode[2] != chartype) == (op == OP_PROP)) + if ((ecode[2] != prop->chartype) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; case PT_SC: - if ((ecode[2] != script) == (op == OP_PROP)) + if ((ecode[2] != prop->script) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; @@ -1633,8 +1764,7 @@ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); { - int chartype, script; - int category = _pcre_ucp_findprop(c, &chartype, &script); + int category = UCD_CATEGORY(c); if (category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { @@ -1643,7 +1773,7 @@ { GETCHARLEN(c, eptr, len); } - category = _pcre_ucp_findprop(c, &chartype, &script); + category = UCD_CATEGORY(c); if (category != ucp_M) break; eptr += len; } @@ -1664,16 +1794,25 @@ case OP_REF: { offset = GET2(ecode, 1) << 1; /* Doubled ref number */ - ecode += 3; /* Advance past item */ + ecode += 3; + + /* If the reference is unset, there are two possibilities: + + (a) In the default, Perl-compatible state, set the length to be longer + than the amount of subject left; this ensures that every attempt at a + match fails. We can't just fail here, because of the possibility of + quantifiers with zero minima. - /* If the reference is unset, set the length to be longer than the amount - of subject left; this ensures that every attempt at a match fails. We - can't just fail here, because of the possibility of quantifiers with zero - minima. */ - - length = (offset >= offset_top || md->offset_vector[offset] < 0)? - md->end_subject - eptr + 1 : - md->offset_vector[offset+1] - md->offset_vector[offset]; + (b) If the JavaScript compatibility flag is set, set the length to zero + so that the back reference matches an empty string. + + Otherwise, set the length to the length of what was matched by the + referenced subpattern. */ + + if (offset >= offset_top || md->offset_vector[offset] < 0) + length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1; + else + length = md->offset_vector[offset+1] - md->offset_vector[offset]; /* Set up for repetition, or handle the non-repeated case */ @@ -1948,7 +2087,8 @@ /* Match an extended character class. This opcode is encountered only - in UTF-8 mode, because that's the only time it is compiled. */ + when UTF-8 mode mode is supported. Nevertheless, we may not be in UTF-8 + mode, because Unicode properties are supported in non-UTF-8 mode. */ #ifdef SUPPORT_UTF8 case OP_XCLASS: @@ -1990,7 +2130,7 @@ for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); + GETCHARINCTEST(c, eptr); if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); } @@ -2009,7 +2149,7 @@ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); + GETCHARINCTEST(c, eptr); if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ @@ -2024,7 +2164,7 @@ { int len = 1; if (eptr >= md->end_subject) break; - GETCHARLEN(c, eptr, len); + GETCHARLENTEST(c, eptr, len); if (!_pcre_xclass(c, data)) break; eptr += len; } @@ -2033,7 +2173,7 @@ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; /* Stop if tried at original pos */ - BACKCHAR(eptr) + if (utf8) BACKCHAR(eptr); } RRETURN(MATCH_NOMATCH); } @@ -2099,7 +2239,7 @@ if (fc != dc) { #ifdef SUPPORT_UCP - if (dc != _pcre_ucp_othercase(fc)) + if (dc != UCD_OTHERCASE(fc)) #endif RRETURN(MATCH_NOMATCH); } @@ -2190,7 +2330,7 @@ #ifdef SUPPORT_UCP unsigned int othercase; if ((ims & PCRE_CASELESS) != 0 && - (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR) + (othercase = UCD_OTHERCASE(fc)) != fc) oclength = _pcre_ord2utf8(othercase, occhars); else oclength = 0; #endif /* SUPPORT_UCP */ @@ -2510,10 +2650,11 @@ { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28); if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(d, eptr); if (d < 256) d = md->lcc[d]; - if (fi >= max || eptr >= md->end_subject || fc == d) - RRETURN(MATCH_NOMATCH); + if (fc == d) RRETURN(MATCH_NOMATCH); + } } else @@ -2619,9 +2760,9 @@ { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32); if (rrc != MATCH_NOMATCH) RRETURN(rrc); + if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(d, eptr); - if (fi >= max || eptr >= md->end_subject || fc == d) - RRETURN(MATCH_NOMATCH); + if (fc == d) RRETURN(MATCH_NOMATCH); } } else @@ -2786,7 +2927,7 @@ for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); + GETCHARINCTEST(c, eptr); } break; @@ -2794,8 +2935,8 @@ for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + GETCHARINCTEST(c, eptr); + prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || prop_chartype == ucp_Lt) == prop_fail_result) @@ -2807,8 +2948,8 @@ for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + GETCHARINCTEST(c, eptr); + prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -2818,8 +2959,8 @@ for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + GETCHARINCTEST(c, eptr); + prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -2829,8 +2970,8 @@ for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); - GETCHARINC(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + GETCHARINCTEST(c, eptr); + prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -2849,7 +2990,7 @@ for (i = 1; i <= min; i++) { GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { @@ -2858,7 +2999,7 @@ { GETCHARLEN(c, eptr, len); } - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr += len; } @@ -2876,14 +3017,22 @@ case OP_ANY: for (i = 1; i <= min; i++) { - if (eptr >= md->end_subject || - ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr))) + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } break; + case OP_ALLANY: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + eptr++; + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + } + break; + case OP_ANYBYTE: eptr += min; break; @@ -2899,12 +3048,16 @@ case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; + case 0x000a: + break; + case 0x000b: case 0x000c: case 0x0085: case 0x2028: case 0x2029: + if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } } @@ -3038,9 +3191,9 @@ for (i = 1; i <= min; i++) { if (eptr >= md->end_subject || - (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0)) + (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)) RRETURN(MATCH_NOMATCH); - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); } break; @@ -3058,9 +3211,9 @@ for (i = 1; i <= min; i++) { if (eptr >= md->end_subject || - (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0)) + (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)) RRETURN(MATCH_NOMATCH); - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); } break; @@ -3088,15 +3241,15 @@ switch(ctype) { case OP_ANY: - if ((ims & PCRE_DOTALL) == 0) + for (i = 1; i <= min; i++) { - for (i = 1; i <= min; i++) - { - if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); - eptr++; - } + if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); + eptr++; } - else eptr += min; + break; + + case OP_ALLANY: + eptr += min; break; case OP_ANYBYTE: @@ -3117,9 +3270,12 @@ if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; case 0x000a: + break; + case 0x000b: case 0x000c: case 0x0085: + if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } } @@ -3259,7 +3415,7 @@ if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || prop_chartype == ucp_Lt) == prop_fail_result) @@ -3274,7 +3430,7 @@ if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -3287,7 +3443,7 @@ if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -3300,7 +3456,7 @@ if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } @@ -3322,7 +3478,7 @@ if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { @@ -3331,7 +3487,7 @@ { GETCHARLEN(c, eptr, len); } - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr += len; } @@ -3350,16 +3506,14 @@ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject || - (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 && - IS_NEWLINE(eptr))) + (ctype == OP_ANY && IS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(ctype) { - case OP_ANY: /* This is the DOTALL case */ - break; - + case OP_ANY: /* This is the non-NL case */ + case OP_ALLANY: case OP_ANYBYTE: break; @@ -3371,11 +3525,14 @@ if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; case 0x000a: + break; + case 0x000b: case 0x000c: case 0x0085: case 0x2028: case 0x2029: + if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } break; @@ -3508,15 +3665,14 @@ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject || - ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr))) + (ctype == OP_ANY && IS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); c = *eptr++; switch(ctype) { - case OP_ANY: /* This is the DOTALL case */ - break; - + case OP_ANY: /* This is the non-NL case */ + case OP_ALLANY: case OP_ANYBYTE: break; @@ -3527,10 +3683,14 @@ case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; + case 0x000a: + break; + case 0x000b: case 0x000c: case 0x0085: + if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } break; @@ -3645,7 +3805,7 @@ int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || prop_chartype == ucp_Lt) == prop_fail_result) @@ -3660,7 +3820,7 @@ int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) break; eptr+= len; @@ -3673,7 +3833,7 @@ int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) break; eptr+= len; @@ -3686,7 +3846,7 @@ int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) break; eptr+= len; @@ -3702,7 +3862,7 @@ RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; /* Stop if tried at original pos */ - BACKCHAR(eptr); + if (utf8) BACKCHAR(eptr); } } @@ -3715,7 +3875,7 @@ { if (eptr >= md->end_subject) break; GETCHARINCTEST(c, eptr); - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if (prop_category == ucp_M) break; while (eptr < md->end_subject) { @@ -3724,7 +3884,7 @@ { GETCHARLEN(c, eptr, len); } - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr += len; } @@ -3741,12 +3901,12 @@ for (;;) /* Move back over one extended */ { int len = 1; - BACKCHAR(eptr); if (!utf8) c = *eptr; else { + BACKCHAR(eptr); GETCHARLEN(c, eptr, len); } - prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script); + prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr--; } @@ -3764,30 +3924,13 @@ switch(ctype) { case OP_ANY: - - /* Special code is required for UTF8, but when the maximum is - unlimited we don't need it, so we repeat the non-UTF8 code. This is - probably worth it, because .* is quite a common idiom. */ - if (max < INT_MAX) { - if ((ims & PCRE_DOTALL) == 0) - { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; - eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; - } - } - else + for (i = min; i < max; i++) { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject) break; - eptr++; - while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; - } + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + eptr++; + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } } @@ -3795,23 +3938,26 @@ else { - if ((ims & PCRE_DOTALL) == 0) + for (i = min; i < max; i++) { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; - eptr++; - } - break; + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + eptr++; + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } - else + } + break; + + case OP_ALLANY: + if (max < INT_MAX) + { + for (i = min; i < max; i++) { - c = max - min; - if (c > (unsigned int)(md->end_subject - eptr)) - c = md->end_subject - eptr; - eptr += c; + if (eptr >= md->end_subject) break; + eptr++; + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } } + else eptr = md->end_subject; /* Unlimited UTF-8 repeat */ break; /* The byte case is the same as non-UTF8 */ @@ -3836,8 +3982,10 @@ } else { - if (c != 0x000a && c != 0x000b && c != 0x000c && - c != 0x0085 && c != 0x2028 && c != 0x2029) + if (c != 0x000a && + (md->bsr_anycrlf || + (c != 0x000b && c != 0x000c && + c != 0x0085 && c != 0x2028 && c != 0x2029))) break; eptr += len; } @@ -3990,24 +4138,21 @@ } } else -#endif +#endif /* SUPPORT_UTF8 */ /* Not UTF-8 mode */ { switch(ctype) { case OP_ANY: - if ((ims & PCRE_DOTALL) == 0) + for (i = min; i < max; i++) { - for (i = min; i < max; i++) - { - if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; - eptr++; - } - break; + if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; + eptr++; } - /* For DOTALL case, fall through and treat as \C */ + break; + case OP_ALLANY: case OP_ANYBYTE: c = max - min; if (c > (unsigned int)(md->end_subject - eptr)) @@ -4027,7 +4172,9 @@ } else { - if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085) + if (c != 0x000a && + (md->bsr_anycrlf || + (c != 0x000b && c != 0x000c && c != 0x0085))) break; eptr++; } @@ -4177,11 +4324,17 @@ switch (frame->Xwhere) { LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) - LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16) - LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24) - LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32) - LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) - LBL(41) LBL(42) LBL(43) LBL(44) LBL(45) LBL(46) LBL(47) + LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) + LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33) + LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) + LBL(53) LBL(54) +#ifdef SUPPORT_UTF8 + LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30) + LBL(32) LBL(34) LBL(42) LBL(46) +#ifdef SUPPORT_UCP + LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) +#endif /* SUPPORT_UCP */ +#endif /* SUPPORT_UTF8 */ default: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere)); return PCRE_ERROR_INTERNAL; @@ -4273,7 +4426,7 @@ < -1 => some kind of unexpected problem */ -PCRE_EXP_DEFN int +PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, int offsetcount) @@ -4298,7 +4451,6 @@ USPTR start_match = (USPTR)subject + start_offset; USPTR end_subject; USPTR req_byte_ptr = start_match - 1; -eptrblock eptrchain[EPTR_WORK_SIZE]; pcre_study_data internal_study; const pcre_study_data *study; @@ -4361,7 +4513,7 @@ /* Set up other data */ anchored = ((re->options | options) & PCRE_ANCHORED) != 0; -startline = (re->options & PCRE_STARTLINE) != 0; +startline = (re->flags & PCRE_STARTLINE) != 0; firstline = (re->options & PCRE_FIRSTLINE) != 0; /* The code starts after the real_pcre block and the capture name table. */ @@ -4376,6 +4528,7 @@ md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; +md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; md->notbol = (options & PCRE_NOTBOL) != 0; md->noteol = (options & PCRE_NOTEOL) != 0; @@ -4384,22 +4537,47 @@ md->hitend = FALSE; md->recursive = NULL; /* No recursion at top level */ -md->eptrchain = eptrchain; /* Make workspace generally available */ md->lcc = tables + lcc_offset; md->ctypes = tables + ctypes_offset; +/* Handle different \R options. */ + +switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) + { + case 0: + if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0) + md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0; + else +#ifdef BSR_ANYCRLF + md->bsr_anycrlf = TRUE; +#else + md->bsr_anycrlf = FALSE; +#endif + break; + + case PCRE_BSR_ANYCRLF: + md->bsr_anycrlf = TRUE; + break; + + case PCRE_BSR_UNICODE: + md->bsr_anycrlf = FALSE; + break; + + default: return PCRE_ERROR_BADNEWLINE; + } + /* Handle different types of newline. The three bits give eight cases. If nothing is set at run time, whatever was used at compile time applies. */ -switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) & - PCRE_NEWLINE_BITS) +switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : + (pcre_uint32)options) & PCRE_NEWLINE_BITS) { case 0: newline = NEWLINE; break; /* Compile-time default */ - case PCRE_NEWLINE_CR: newline = '\r'; break; - case PCRE_NEWLINE_LF: newline = '\n'; break; + case PCRE_NEWLINE_CR: newline = CHAR_CR; break; + case PCRE_NEWLINE_LF: newline = CHAR_NL; break; case PCRE_NEWLINE_CR+ - PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; + PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; case PCRE_NEWLINE_ANY: newline = -1; break; case PCRE_NEWLINE_ANYCRLF: newline = -2; break; default: return PCRE_ERROR_BADNEWLINE; @@ -4432,7 +4610,7 @@ /* Partial matching is supported only for a restricted set of regexes at the moment. */ -if (md->partial && (re->options & PCRE_NOPARTIAL) != 0) +if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0) return PCRE_ERROR_BADPARTIAL; /* Check a UTF-8 string if required. Unfortunately there's no way of passing @@ -4509,7 +4687,7 @@ if (!anchored) { - if ((re->options & PCRE_FIRSTSET) != 0) + if ((re->flags & PCRE_FIRSTSET) != 0) { first_byte = re->first_byte & 255; if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) @@ -4524,7 +4702,7 @@ /* For anchored or unanchored matches, there may be a "last known required character" set. */ -if ((re->options & PCRE_REQCHSET) != 0) +if ((re->flags & PCRE_REQCHSET) != 0) { req_byte = re->req_byte & 255; req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; @@ -4540,6 +4718,7 @@ for(;;) { USPTR save_end_subject = end_subject; + USPTR new_start_match; /* Reset the maximum number of extractions we might see. */ @@ -4550,63 +4729,94 @@ while (iptr < iend) *iptr++ = -1; } - /* Advance to a unique first char if possible. If firstline is TRUE, the - start of the match is constrained to the first line of a multiline string. - That is, the match must be before or at the first newline. Implement this by - temporarily adjusting end_subject so that we stop scanning at a newline. If - the match fails at the newline, later code breaks this loop. */ + /* If firstline is TRUE, the start of the match is constrained to the first + line of a multiline string. That is, the match must be before or at the first + newline. Implement this by temporarily adjusting end_subject so that we stop + scanning at a newline. If the match fails at the newline, later code breaks + this loop. */ if (firstline) { USPTR t = start_match; +#ifdef SUPPORT_UTF8 + if (utf8) + { + while (t < md->end_subject && !IS_NEWLINE(t)) + { + t++; + while (t < end_subject && (*t & 0xc0) == 0x80) t++; + } + } + else +#endif while (t < md->end_subject && !IS_NEWLINE(t)) t++; end_subject = t; } - /* Now test for a unique first byte */ + /* There are some optimizations that avoid running the match if a known + starting point is not found, or if a known later character is not present. + However, there is an option that disables these, for testing and for ensuring + that all callouts do actually occur. */ - if (first_byte >= 0) + if ((options & PCRE_NO_START_OPTIMIZE) == 0) { - if (first_byte_caseless) - while (start_match < end_subject && - md->lcc[*start_match] != first_byte) - start_match++; - else - while (start_match < end_subject && *start_match != first_byte) - start_match++; - } + /* Advance to a unique first byte if there is one. */ - /* Or to just after a linebreak for a multiline match if possible */ + if (first_byte >= 0) + { + if (first_byte_caseless) + while (start_match < end_subject && md->lcc[*start_match] != first_byte) + start_match++; + else + while (start_match < end_subject && *start_match != first_byte) + start_match++; + } - else if (startline) - { - if (start_match > md->start_subject + start_offset) + /* Or to just after a linebreak for a multiline match */ + + else if (startline) { - while (start_match <= end_subject && !WAS_NEWLINE(start_match)) - start_match++; + if (start_match > md->start_subject + start_offset) + { +#ifdef SUPPORT_UTF8 + if (utf8) + { + while (start_match < end_subject && !WAS_NEWLINE(start_match)) + { + start_match++; + while(start_match < end_subject && (*start_match & 0xc0) == 0x80) + start_match++; + } + } + else +#endif + while (start_match < end_subject && !WAS_NEWLINE(start_match)) + start_match++; - /* If we have just passed a CR and the newline option is ANY or ANYCRLF, - and we are now at a LF, advance the match position by one more character. - */ - - if (start_match[-1] == '\r' && - (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && - start_match < end_subject && - *start_match == '\n') - start_match++; + /* If we have just passed a CR and the newline option is ANY or ANYCRLF, + and we are now at a LF, advance the match position by one more character. + */ + + if (start_match[-1] == CHAR_CR && + (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && + start_match < end_subject && + *start_match == CHAR_NL) + start_match++; + } } - } - /* Or to a non-unique first char after study */ + /* Or to a non-unique first byte after study */ - else if (start_bits != NULL) - { - while (start_match < end_subject) + else if (start_bits != NULL) { - register unsigned int c = *start_match; - if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break; + while (start_match < end_subject) + { + register unsigned int c = *start_match; + if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; + else break; + } } - } + } /* Starting optimizations */ /* Restore fudged end_subject */ @@ -4618,23 +4828,25 @@ printf("\n"); #endif - /* If req_byte is set, we know that that character must appear in the subject - for the match to succeed. If the first character is set, req_byte must be - later in the subject; otherwise the test starts at the match point. This - optimization can save a huge amount of backtracking in patterns with nested - unlimited repeats that aren't going to match. Writing separate code for - cased/caseless versions makes it go faster, as does using an autoincrement - and backing off on a match. - - HOWEVER: when the subject string is very, very long, searching to its end can - take a long time, and give bad performance on quite ordinary patterns. This - showed up when somebody was matching something like /^\d+C/ on a 32-megabyte - string... so we don't do this when the string is sufficiently long. + /* If req_byte is set, we know that that character must appear in the + subject for the match to succeed. If the first character is set, req_byte + must be later in the subject; otherwise the test starts at the match point. + This optimization can save a huge amount of backtracking in patterns with + nested unlimited repeats that aren't going to match. Writing separate code + for cased/caseless versions makes it go faster, as does using an + autoincrement and backing off on a match. + + HOWEVER: when the subject string is very, very long, searching to its end + can take a long time, and give bad performance on quite ordinary patterns. + This showed up when somebody was matching something like /^\d+C/ on a + 32-megabyte string... so we don't do this when the string is sufficiently + long. - ALSO: this processing is disabled when partial matching is requested. - */ + ALSO: this processing is disabled when partial matching is requested, or if + disabling is explicitly requested. */ - if (req_byte >= 0 && + if ((options & PCRE_NO_START_OPTIMIZE) == 0 && + req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX && !md->partial) { @@ -4680,15 +4892,48 @@ /* OK, we can now run the match. */ - md->start_match_ptr = start_match; /* Insurance */ + md->start_match_ptr = start_match; md->match_call_count = 0; - md->eptrn = 0; /* Next free eptrchain slot */ - rc = match(start_match, md->start_code, start_match, 2, md, - ims, NULL, 0, 0); + rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0); + + switch(rc) + { + /* NOMATCH and PRUNE advance by one character. THEN at this level acts + exactly like PRUNE. */ + + case MATCH_NOMATCH: + case MATCH_PRUNE: + case MATCH_THEN: + new_start_match = start_match + 1; +#ifdef SUPPORT_UTF8 + if (utf8) + while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80) + new_start_match++; +#endif + break; + + /* SKIP passes back the next starting point explicitly. */ + + case MATCH_SKIP: + new_start_match = md->start_match_ptr; + break; + + /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */ + + case MATCH_COMMIT: + rc = MATCH_NOMATCH; + goto ENDLOOP; - /* Any return other than MATCH_NOMATCH breaks the loop. */ + /* Any other return is some kind of error. */ - if (rc != MATCH_NOMATCH) break; + default: + goto ENDLOOP; + } + + /* Control reaches here for the various types of "no match at this point" + result. Reset the code to MATCH_NOMATCH for subsequent checking. */ + + rc = MATCH_NOMATCH; /* If PCRE_FIRSTLINE is set, the match must happen before or at the first newline in the subject (though it may continue over the newline). Therefore, @@ -4696,30 +4941,26 @@ if (firstline && IS_NEWLINE(start_match)) break; - /* Advance the match position by one character. */ + /* Advance to new matching position */ - start_match++; -#ifdef SUPPORT_UTF8 - if (utf8) - while(start_match < end_subject && (*start_match & 0xc0) == 0x80) - start_match++; -#endif + start_match = new_start_match; /* Break the loop if the pattern is anchored or if we have passed the end of the subject. */ if (anchored || start_match > end_subject) break; - /* If we have just passed a CR and the newline option is CRLF or ANY or - ANYCRLF, and we are now at a LF, advance the match position by one more - character. */ - - if (start_match[-1] == '\r' && - (md->nltype == NLTYPE_ANY || - md->nltype == NLTYPE_ANYCRLF || - md->nllen == 2) && - start_match < end_subject && - *start_match == '\n') + /* If we have just passed a CR and we are now at a LF, and the pattern does + not contain any explicit matches for \r or \n, and the newline option is CRLF + or ANY or ANYCRLF, advance the match position by one more character. */ + + if (start_match[-1] == CHAR_CR && + start_match < end_subject && + *start_match == CHAR_NL && + (re->flags & PCRE_HASCRORLF) == 0 && + (md->nltype == NLTYPE_ANY || + md->nltype == NLTYPE_ANYCRLF || + md->nllen == 2)) start_match++; } /* End of for(;;) "bumpalong" loop */ @@ -4729,7 +4970,7 @@ /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping conditions is true: -(1) The pattern is anchored; +(1) The pattern is anchored or the match was failed by (*COMMIT); (2) We are past the end of the subject; @@ -4744,6 +4985,8 @@ certain parts of the pattern were not used, even though there are more capturing parentheses than vector slots. */ +ENDLOOP: + if (rc == MATCH_MATCH) { if (using_temporary_offsets)