--- code/trunk/pcre_compile.c 2007/08/15 11:34:14 213 +++ code/trunk/pcre_compile.c 2007/08/17 09:25:08 221 @@ -283,7 +283,7 @@ "(*VERB) with an argument is not supported", /* 60 */ "(*VERB) not recognized", - "number is too big" + "number is too big" }; @@ -524,12 +524,12 @@ c = 0; while ((digitab[ptr[1]] & ctype_digit) != 0) c = c * 10 + *(++ptr) - '0'; - + if (c < 0) { *errorcodeptr = ERR61; break; - } + } if (c == 0 || (braced && *(++ptr) != '}')) { @@ -574,8 +574,8 @@ if (c < 0) { *errorcodeptr = ERR61; - break; - } + break; + } if (c < 10 || c <= bracount) { c = -(ESC_REF + c); @@ -950,6 +950,7 @@ { while (*(++ptr) != ']') { + if (*ptr == 0) return -1; if (*ptr == '\\') { if (*(++ptr) == 0) return -1; @@ -1105,7 +1106,6 @@ { int d; register int op = *cc; - switch (op) { case OP_CBRA: @@ -1194,6 +1194,7 @@ case OP_TYPEEXACT: branchlength += GET2(cc,1); + if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2; cc += 4; break; @@ -1302,13 +1303,42 @@ code += _pcre_OP_lengths[c]; } - /* In UTF-8 mode, opcodes that are followed by a character may be followed by - a multi-byte character. The length in the table is a minimum, so we have to - arrange to skip the extra bytes. */ + /* Otherwise, we can get the item's length from the table, except that for + repeated character types, we have to test for \p and \P, which have an extra + two bytes of parameters. */ else { + switch(c) + { + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSSTAR: + case OP_TYPEPOSPLUS: + case OP_TYPEPOSQUERY: + if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; + break; + + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEEXACT: + case OP_TYPEPOSUPTO: + if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; + break; + } + + /* Add in the fixed length from the table */ + code += _pcre_OP_lengths[c]; + + /* In UTF-8 mode, opcodes that are followed by a character may be followed by + a multi-byte character. The length in the table is a minimum, so we have to + arrange to skip the extra bytes. */ + #ifdef SUPPORT_UTF8 if (utf8) switch(c) { @@ -1366,14 +1396,42 @@ if (c == OP_XCLASS) code += GET(code, 1); - /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes - that are followed by a character may be followed by a multi-byte character. - The length in the table is a minimum, so we have to arrange to skip the extra - bytes. */ + /* Otherwise, we can get the item's length from the table, except that for + repeated character types, we have to test for \p and \P, which have an extra + two bytes of parameters. */ else { + switch(c) + { + case OP_TYPESTAR: + case OP_TYPEMINSTAR: + case OP_TYPEPLUS: + case OP_TYPEMINPLUS: + case OP_TYPEQUERY: + case OP_TYPEMINQUERY: + case OP_TYPEPOSSTAR: + case OP_TYPEPOSPLUS: + case OP_TYPEPOSQUERY: + if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; + break; + + case OP_TYPEPOSUPTO: + case OP_TYPEUPTO: + case OP_TYPEMINUPTO: + case OP_TYPEEXACT: + if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; + break; + } + + /* Add in the fixed length from the table */ + code += _pcre_OP_lengths[c]; + + /* In UTF-8 mode, opcodes that are followed by a character may be followed + by a multi-byte character. The length in the table is a minimum, so we have + to arrange to skip the extra bytes. */ + #ifdef SUPPORT_UTF8 if (utf8) switch(c) { @@ -1469,11 +1527,15 @@ switch (c) { - /* Check for quantifiers after a class */ + /* Check for quantifiers after a class. XCLASS is used for classes that + cannot be represented just by a bit map. This includes negated single + high-valued characters. The length in _pcre_OP_lengths[] is zero; the + actual length is stored in the compiled code, so we must update "code" + here. */ #ifdef SUPPORT_UTF8 case OP_XCLASS: - ccode = code + GET(code, 1); + ccode = code += GET(code, 1); goto CHECK_CLASS_REPEAT; #endif @@ -2666,6 +2728,7 @@ else inescq = TRUE; continue; } + else if (-c == ESC_E) continue; /* Ignore orphan \E */ if (c < 0) { @@ -4740,10 +4803,10 @@ goto FAILED; } - /* In the pre-compile phase, update the length by the length of the nested - group, less the brackets at either end. Then reduce the compiled code to - just the brackets so that it doesn't use much memory if it is duplicated by - a quantifier. */ + /* In the pre-compile phase, update the length by the length of the group, + less the brackets at either end. Then reduce the compiled code to just a + set of non-capturing brackets so that it doesn't use much memory if it is + duplicated by a quantifier.*/ if (lengthptr != NULL) { @@ -4753,15 +4816,16 @@ goto FAILED; } *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; - code++; + *code++ = OP_BRA; PUTINC(code, 0, 1 + LINK_SIZE); *code++ = OP_KET; PUTINC(code, 0, 1 + LINK_SIZE); + break; /* No need to waste time with special character handling */ } /* Otherwise update the main code pointer to the end of the group. */ - else code = tempcode; + code = tempcode; /* For a DEFINE group, required and first character settings are not relevant. */