--- code/trunk/pcre_study.c 2011/06/02 19:04:54 604 +++ code/trunk/pcre_study.c 2011/06/03 18:18:30 605 @@ -52,7 +52,7 @@ /* Returns from set_start_bits() */ -enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE }; +enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN }; @@ -73,7 +73,7 @@ Returns: the minimum length -1 if \C was encountered -2 internal error (missing capturing bracket) - -3 internal error (opcode not listed) + -3 internal error (opcode not listed) */ static int @@ -140,7 +140,7 @@ case OP_KET: case OP_KETRMAX: case OP_KETRMIN: - case OP_KETRPOS: + case OP_KETRPOS: case OP_END: if (length < 0 || (!had_recurse && branchlength < length)) length = branchlength; @@ -382,13 +382,13 @@ min = 0; cc++; break; - - case OP_CRPLUS: - case OP_CRMINPLUS: - min = 1; - cc++; - break; - + + case OP_CRPLUS: + case OP_CRMINPLUS: + min = 1; + cc++; + break; + case OP_CRRANGE: case OP_CRMINRANGE: min = GET2(cc, 1); @@ -419,7 +419,7 @@ of a character, we must take special action for UTF-8 characters. As it happens, the "NOT" versions of these opcodes are used at present only for ASCII characters, so they could be omitted from this list. However, in - future that may change, so we include them here so as not to leave a + future that may change, so we include them here so as not to leave a gotcha for a future maintainer. */ case OP_UPTO: @@ -488,10 +488,10 @@ case OP_PRUNE: case OP_SET_SOM: case OP_SKIP: - case OP_THEN: + case OP_THEN: cc += _pcre_OP_lengths[op]; break; - + /* This should not occur: we list all opcodes explicitly so that when new ones get added they are properly considered. */ @@ -648,10 +648,11 @@ Returns: SSB_FAIL => Failed to find any starting bytes SSB_DONE => Found mandatory starting bytes SSB_CONTINUE => Found optional starting bytes + SSB_UNKNOWN => Hit an unrecognized opcode */ static int -set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8, +set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8, compile_data *cd) { register int c; @@ -679,7 +680,7 @@ { BOOL try_next = TRUE; const uschar *tcode = code + 1 + LINK_SIZE; - + if (*code == OP_CBRA || *code == OP_SCBRA || *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2; @@ -688,9 +689,86 @@ int rc; switch(*tcode) { - /* Fail if we reach something we don't understand */ + /* If we reach something we don't understand, it means a new opcode has + been created that hasn't been added to this code. Hopefully this problem + will be discovered during testing. */ default: + return SSB_UNKNOWN; + + /* Fail for a valid opcode that implies no starting bits. */ + + case OP_ACCEPT: + case OP_ALLANY: + case OP_ANY: + case OP_ANYBYTE: + case OP_CIRC: + case OP_CIRCM: + case OP_CLOSE: + case OP_COMMIT: + case OP_COND: + case OP_CREF: + case OP_DEF: + case OP_DOLL: + case OP_DOLLM: + case OP_END: + case OP_EOD: + case OP_EODN: + case OP_EXTUNI: + case OP_FAIL: + case OP_MARK: + case OP_NCREF: + case OP_NOT: + case OP_NOTEXACT: + case OP_NOTEXACTI: + case OP_NOTI: + case OP_NOTMINPLUS: + case OP_NOTMINPLUSI: + case OP_NOTMINQUERY: + case OP_NOTMINQUERYI: + case OP_NOTMINSTAR: + case OP_NOTMINSTARI: + case OP_NOTMINUPTO: + case OP_NOTMINUPTOI: + case OP_NOTPLUS: + case OP_NOTPLUSI: + case OP_NOTPOSPLUS: + case OP_NOTPOSPLUSI: + case OP_NOTPOSQUERY: + case OP_NOTPOSQUERYI: + case OP_NOTPOSSTAR: + case OP_NOTPOSSTARI: + case OP_NOTPOSUPTO: + case OP_NOTPOSUPTOI: + case OP_NOTPROP: + case OP_NOTQUERY: + case OP_NOTQUERYI: + case OP_NOTSTAR: + case OP_NOTSTARI: + case OP_NOTUPTO: + case OP_NOTUPTOI: + case OP_NOT_HSPACE: + case OP_NOT_VSPACE: + case OP_NOT_WORD_BOUNDARY: + case OP_NRREF: + case OP_PROP: + case OP_PRUNE: + case OP_PRUNE_ARG: + case OP_RECURSE: + case OP_REF: + case OP_REFI: + case OP_REVERSE: + case OP_RREF: + case OP_SCOND: + case OP_SET_SOM: + case OP_SKIP: + case OP_SKIP_ARG: + case OP_SOD: + case OP_SOM: + case OP_THEN: + case OP_THEN_ARG: + case OP_WORD_BOUNDARY: + case OP_XCLASS: return SSB_FAIL; /* If we hit a bracket or a positive lookahead assertion, recurse to set @@ -709,7 +787,7 @@ case OP_ONCE: case OP_ASSERT: rc = set_start_bits(tcode, start_bits, utf8, cd); - if (rc == SSB_FAIL) return SSB_FAIL; + if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; if (rc == SSB_DONE) try_next = FALSE; else { do tcode += GET(tcode, 1); while (*tcode == OP_ALT); @@ -732,7 +810,7 @@ case OP_KET: case OP_KETRMAX: case OP_KETRMIN: - case OP_KETRPOS: + case OP_KETRPOS: return SSB_CONTINUE; /* Skip over callout */ @@ -755,8 +833,8 @@ case OP_BRAZERO: case OP_BRAMINZERO: case OP_BRAPOSZERO: - if (set_start_bits(++tcode, start_bits, utf8, cd) == SSB_FAIL) - return SSB_FAIL; + rc = set_start_bits(++tcode, start_bits, utf8, cd); + if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; /* ========================================================================= See the comment at the head of this function concerning the next line, which was an old fudge for the benefit of OS/2. @@ -1058,7 +1136,8 @@ for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; } - /* Advance past the bit map, and act on what follows */ + /* Advance past the bit map, and act on what follows. For a zero + minimum repeat, continue; otherwise stop processing. */ tcode += 32; switch (*tcode) @@ -1075,7 +1154,7 @@ if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5; else try_next = FALSE; break; - + default: try_next = FALSE; break; @@ -1094,6 +1173,8 @@ + + /************************************************* * Study a compiled expression * *************************************************/ @@ -1150,6 +1231,8 @@ if ((re->options & PCRE_ANCHORED) == 0 && (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0) { + int rc; + /* Set the character tables in the block that is passed around */ tables = re->tables; @@ -1165,8 +1248,10 @@ /* See if we can find a fixed set of initial characters for the pattern. */ memset(start_bits, 0, 32 * sizeof(uschar)); - bits_set = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0, - &compile_block) == SSB_DONE; + rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0, + &compile_block); + bits_set = rc == SSB_DONE; + if (rc == SSB_UNKNOWN) *errorptr = "internal error: opcode not recognized"; } /* Find the minimum length of subject string. */ @@ -1174,13 +1259,13 @@ switch(min = find_minlength(code, code, re->options)) { case -2: *errorptr = "internal error: missing capturing bracket"; break; - case -3: *errorptr = "internal error: opcode not recognized"; break; + case -3: *errorptr = "internal error: opcode not recognized"; break; default: break; - } + } -/* Return NULL if no optimization is possible. */ +/* Return NULL if there's been an error or if no optimization is possible. */ -if (!bits_set && min < 0) return NULL; +if (*errorptr != NULL || (!bits_set && min < 0)) return NULL; /* Get a pcre_extra block and a pcre_study_data block. The study data is put in the latter, which is pointed to by the former, which may also get additional