--- code/trunk/pcre_compile.c 2011/06/02 19:04:54 604 +++ code/trunk/pcre_compile.c 2011/07/23 16:19:50 635 @@ -409,6 +409,7 @@ "(*MARK) must have an argument\0" "this version of PCRE is not compiled with PCRE_UCP support\0" "\\c must be followed by an ASCII character\0" + "\\k is not followed by a braced, angle-bracketed, or quoted name\0" ; /* Table to identify digits and hex digits. This is used when compiling @@ -1694,6 +1695,7 @@ for (;;) { register int c = *code; + if (c == OP_END) return NULL; /* XCLASS is used for classes that cannot be represented just by a bit @@ -1974,13 +1976,30 @@ } /* For a recursion/subroutine call, if its end has been reached, which - implies a subroutine call, we can scan it. */ + implies a backward reference subroutine call, we can scan it. If it's a + forward reference subroutine call, we can't. To detect forward reference + we have to scan up the list that is kept in the workspace. This function is + called only when doing the real compile, not during the pre-compile that + measures the size of the compiled pattern. */ if (c == OP_RECURSE) { - BOOL empty_branch = FALSE; - const uschar *scode = cd->start_code + GET(code, 1); + const uschar *scode; + BOOL empty_branch; + + /* Test for forward reference */ + + for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE) + if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE; + + /* Not a forward reference, test for completed backward reference */ + + empty_branch = FALSE; + scode = cd->start_code + GET(code, 1); if (GET(scode, 1) == 0) return TRUE; /* Unclosed */ + + /* Completed backwards reference */ + do { if (could_be_empty_branch(scode, endcode, utf8, cd)) @@ -1991,6 +2010,7 @@ scode += GET(scode, 1); } while (*scode == OP_ALT); + if (!empty_branch) return FALSE; /* All branches are non-empty */ continue; } @@ -2216,6 +2236,8 @@ the current branch of the current pattern to see if it could match the empty string. If it could, we must look outwards for branches at other levels, stopping when we pass beyond the bracket which is the subject of the recursion. +This function is called only during the real compile, not during the +pre-compile. Arguments: code points to start of the recursion @@ -3017,7 +3039,7 @@ int firstbyte, reqbyte; int zeroreqbyte, zerofirstbyte; int req_caseopt, reqvary, tempreqvary; -int options = *optionsptr; +int options = *optionsptr; /* May change dynamically */ int after_manual_callout = 0; int length_prevgroup = 0; register int c; @@ -3035,6 +3057,10 @@ uschar *save_hwm = NULL; uschar classbits[32]; +/* We can fish out the UTF-8 setting once and for all into a BOOL, but we +must not do this for other options (e.g. PCRE_EXTENDED) because they may change +dynamically as we process the pattern. */ + #ifdef SUPPORT_UTF8 BOOL class_utf8; BOOL utf8 = (options & PCRE_UTF8) != 0; @@ -3215,7 +3241,7 @@ previous_callout = NULL; } - /* In extended mode, skip white space and comments */ + /* In extended mode, skip white space and comments. */ if ((options & PCRE_EXTENDED) != 0) { @@ -4207,6 +4233,35 @@ ptr++; } else repeat_type = greedy_default; + + /* If previous was a recursion call, wrap it in atomic brackets so that + previous becomes the atomic group. All recursions were so wrapped in the + past, but it no longer happens for non-repeated recursions. In fact, the + repeated ones could be re-implemented independently so as not to need this, + but for the moment we rely on the code for repeating groups. */ + + if (*previous == OP_RECURSE) + { + memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE); + *previous = OP_ONCE; + PUT(previous, 1, 2 + 2*LINK_SIZE); + previous[2 + 2*LINK_SIZE] = OP_KET; + PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE); + code += 2 + 2 * LINK_SIZE; + length_prevgroup = 3 + 3*LINK_SIZE; + + /* When actually compiling, we need to check whether this was a forward + reference, and if so, adjust the offset. */ + + if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE) + { + int offset = GET(cd->hwm, -LINK_SIZE); + if (offset == previous + 1 - cd->start_code) + PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE); + } + } + + /* Now handle repetition for the different types of item. */ /* If previous was a character match, abolish the item and generate a repeat item instead. If a char item has a minumum of more than one, ensure @@ -4510,7 +4565,7 @@ int len = (int)(code - previous); uschar *bralink = NULL; uschar *brazeroptr = NULL; - + /* Repeating a DEFINE group is pointless */ if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF) @@ -4726,14 +4781,19 @@ } /* If the maximum is unlimited, set a repeater in the final copy. For - ONCE brackets, that's all we need to do. + ONCE brackets, that's all we need to do. However, possessively repeated + ONCE brackets can be converted into non-capturing brackets, as the + behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to + deal with possessive ONCEs specially. Otherwise, if the quantifier was possessive, we convert the BRA code to the POS form, and the KET code to KETRPOS. (It turns out to be convenient at runtime to detect this kind of subpattern at both the start and at the - end.) If the group is preceded by OP_BRAZERO, convert this to - OP_BRAPOSZERO. Then cancel the possessive flag so that the default action - below, of wrapping everything inside atomic brackets, does not happen. + end.) The use of special opcodes makes it possible to reduce greatly the + stack usage in pcre_exec(). If the group is preceded by OP_BRAZERO, + convert this to OP_BRAPOSZERO. Then cancel the possessive flag so that + the default action below, of wrapping everything inside atomic brackets, + does not happen. Then, when we are doing the actual compile phase, check to see whether this group is one that could match an empty string. If so, convert the @@ -4745,8 +4805,9 @@ { uschar *ketcode = code - 1 - LINK_SIZE; uschar *bracode = ketcode - GET(ketcode, 1); - - if (*bracode == OP_ONCE) + + if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA; + if (*bracode == OP_ONCE) *ketcode = OP_KETRMAX + repeat_type; else { @@ -4793,13 +4854,18 @@ } /* If the character following a repeat is '+', or if certain optimization - tests above succeeded, possessive_quantifier is TRUE. For some of the - simpler opcodes, there is an special alternative opcode for this. For - anything else, we wrap the entire repeated item inside OP_ONCE brackets. - The '+' notation is just syntactic sugar, taken from Sun's Java package, - but the special opcodes can optimize it a bit. The repeated item starts at - tempcode, not at previous, which might be the first part of a string whose - (former) last char we repeated. + tests above succeeded, possessive_quantifier is TRUE. For some opcodes, + there are special alternative opcodes for this case. For anything else, we + wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+' + notation is just syntactic sugar, taken from Sun's Java package, but the + special opcodes can optimize it. + + Possessively repeated subpatterns have already been handled in the code + just above, so possessive_quantifier is always FALSE for them at this + stage. + + Note that the repeated item starts at tempcode, not at previous, which + might be the first part of a string whose (former) last char we repeated. Possessifying an 'exact' quantifier has no effect, so we can ignore it. But an 'upto' may follow. We skip over an 'exact' item, and then test the @@ -4924,22 +4990,29 @@ if (namelen == verbs[i].len && strncmp((char *)name, vn, namelen) == 0) { - /* Check for open captures before ACCEPT */ + /* Check for open captures before ACCEPT and convert it to + ASSERT_ACCEPT if in an assertion. */ if (verbs[i].op == OP_ACCEPT) { open_capitem *oc; + if (arglen != 0) + { + *errorcodeptr = ERR59; + goto FAILED; + } cd->had_accept = TRUE; for (oc = cd->open_caps; oc != NULL; oc = oc->next) { *code++ = OP_CLOSE; PUT2INC(code, 0, oc->number); } + *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT; } - /* Handle the cases with/without an argument */ + /* Handle other cases with/without an argument */ - if (arglen == 0) + else if (arglen == 0) { if (verbs[i].op < 0) /* Argument is mandatory */ { @@ -5228,6 +5301,7 @@ /* ------------------------------------------------------------ */ case CHAR_EQUALS_SIGN: /* Positive lookahead */ bravalue = OP_ASSERT; + cd->assert_depth += 1; ptr++; break; @@ -5242,6 +5316,7 @@ continue; } bravalue = OP_ASSERT_NOT; + cd->assert_depth += 1; break; @@ -5251,11 +5326,13 @@ { case CHAR_EQUALS_SIGN: /* Positive lookbehind */ bravalue = OP_ASSERTBACK; + cd->assert_depth += 1; ptr += 2; break; case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */ bravalue = OP_ASSERTBACK_NOT; + cd->assert_depth += 1; ptr += 2; break; @@ -5499,7 +5576,7 @@ temp = cd->end_pattern; cd->end_pattern = ptr; - recno = find_parens(cd, name, namelen, + recno = find_parens(cd, name, namelen, (options & PCRE_EXTENDED) != 0, utf8); cd->end_pattern = temp; if (recno < 0) recno = 0; /* Forward ref; set dummy number */ @@ -5646,10 +5723,10 @@ /* Fudge the value of "called" so that when it is inserted as an offset below, what it actually inserted is the reference number - of the group. */ + of the group. Then remember the forward reference. */ called = cd->start_code + recno; - PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code)); + PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code)); } /* If not a forward reference, and the subpattern is still open, @@ -5664,23 +5741,11 @@ } } - /* Insert the recursion/subroutine item, automatically wrapped inside - "once" brackets. Set up a "previous group" length so that a - subsequent quantifier will work. */ - - *code = OP_ONCE; - PUT(code, 1, 2 + 2*LINK_SIZE); - code += 1 + LINK_SIZE; - + /* Insert the recursion/subroutine item. */ + *code = OP_RECURSE; PUT(code, 1, (int)(called - cd->start_code)); code += 1 + LINK_SIZE; - - *code = OP_KET; - PUT(code, 1, 2 + 2*LINK_SIZE); - code += 1 + LINK_SIZE; - - length_prevgroup = 3 + 3*LINK_SIZE; } /* Can't determine a first byte now */ @@ -5823,6 +5888,9 @@ &length_prevgroup /* Pre-compile phase */ )) goto FAILED; + + if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT) + cd->assert_depth -= 1; /* At the end of compiling, code is still pointing to the start of the group, while tempcode has been updated to point past the end of the group @@ -5894,7 +5962,7 @@ goto FAILED; } *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; - *code++ = OP_BRA; + code++; /* This already contains bravalue */ PUTINC(code, 0, 1 + LINK_SIZE); *code++ = OP_KET; PUTINC(code, 0, 1 + LINK_SIZE); @@ -6062,17 +6130,22 @@ } /* \k or \k'name' is a back reference by name (Perl syntax). - We also support \k{name} (.NET syntax) */ + We also support \k{name} (.NET syntax). */ - if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN || - ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET)) + if (-c == ESC_k) { + if ((ptr[1] != CHAR_LESS_THAN_SIGN && + ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET)) + { + *errorcodeptr = ERR69; + break; + } is_recurse = FALSE; terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)? CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET; goto NAMED_REF_OR_RECURSE; - } + } /* Back references are handled specially; must disable firstbyte if not set to cope with cases like (?=(\w+))\1: which would otherwise set @@ -6969,13 +7042,12 @@ /* Can't support UTF8 unless PCRE has been compiled to include the code. The return of an error code from _pcre_valid_utf8() is a new feature, introduced in -release 8.13. The only use we make of it here is to adjust the offset value to -the end of the string for a short string error, for compatibility with previous -versions. */ +release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is +not used here. */ #ifdef SUPPORT_UTF8 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 && - (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1, &errorcode)) >= 0) + (errorcode = _pcre_valid_utf8((USPTR)pattern, -1, erroroffset)) != 0) { errorcode = ERR44; goto PCRE_EARLY_ERROR_RETURN2; @@ -7146,6 +7218,7 @@ */ cd->final_bracount = cd->bracount; /* Save for checking forward references */ +cd->assert_depth = 0; cd->bracount = 0; cd->names_found = 0; cd->name_table = (uschar *)re + re->name_table_offset;