--- code/trunk/pcrecpp.cc 2007/09/11 12:57:06 236 +++ code/tags/pcre-8.01/pcrecpp.cc 2010/01/19 16:45:59 490 @@ -1,4 +1,4 @@ -// Copyright (c) 2005, Google Inc. +// Copyright (c) 2010, Google Inc. // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -33,12 +33,6 @@ #include "config.h" #endif -#ifdef _WIN32 -#define HAVE_STRTOQ 1 -#define strtoll _strtoui64 -#define strtoull _strtoi64 -#endif - #include #include #include @@ -61,7 +55,23 @@ static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace // Special object that stands-in for no argument -PCRECPP_EXP_DEFN Arg no_arg((void*)NULL); +Arg RE::no_arg((void*)NULL); + +// This is for ABI compatibility with old versions of pcre (pre-7.6), +// which defined a global no_arg variable instead of putting it in the +// RE class. This works on GCC >= 3, at least. It definitely works +// for ELF, but may not for other object formats (Mach-O, for +// instance, does not support aliases.) We could probably have a more +// inclusive test if we ever needed it. (Note that not only the +// __attribute__ syntax, but also __USER_LABEL_PREFIX__, are +// gnu-specific.) +#if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__) +# define ULP_AS_STRING(x) ULP_AS_STRING_INTERNAL(x) +# define ULP_AS_STRING_INTERNAL(x) #x +# define USER_LABEL_PREFIX_STR ULP_AS_STRING(__USER_LABEL_PREFIX__) +extern Arg no_arg + __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE"))); +#endif // If a regular expression has no error, its error_ field points here static const string empty_string; @@ -321,7 +331,7 @@ bool RE::Replace(const StringPiece& rewrite, string *str) const { int vec[kVecSize]; - int matches = TryMatch(*str, 0, UNANCHORED, vec, kVecSize); + int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize); if (matches == 0) return false; @@ -337,13 +347,17 @@ // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF. // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF. +// Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF. + static int NewlineMode(int pcre_options) { // TODO: if we can make it threadsafe, cache this var int newline_mode = 0; /* if (newline_mode) return newline_mode; */ // do this once it's cached - if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)) { + if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF| + PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) { newline_mode = (pcre_options & - (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)); + (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF| + PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)); } else { int newline; pcre_config(PCRE_CONFIG_NEWLINE, &newline); @@ -353,8 +367,12 @@ newline_mode = PCRE_NEWLINE_CR; else if (newline == 3338) newline_mode = PCRE_NEWLINE_CRLF; + else if (newline == -1) + newline_mode = PCRE_NEWLINE_ANY; + else if (newline == -2) + newline_mode = PCRE_NEWLINE_ANYCRLF; else - assert("" == "Unexpected return value from pcre_config(NEWLINE)"); + assert(NULL == "Unexpected return value from pcre_config(NEWLINE)"); } return newline_mode; } @@ -366,45 +384,64 @@ string out; int start = 0; int lastend = -1; + bool last_match_was_empty_string = false; - for (; start <= static_cast(str->length()); count++) { - int matches = TryMatch(*str, start, UNANCHORED, vec, kVecSize); - if (matches <= 0) - break; - int matchstart = vec[0], matchend = vec[1]; - assert(matchstart >= start); - assert(matchend >= matchstart); - if (matchstart == matchend && matchstart == lastend) { - // advance one character if we matched an empty string at the same - // place as the last match occurred - matchend = start + 1; - // If the current char is CR and we're in CRLF mode, skip LF too. - // Note it's better to call pcre_fullinfo() than to examine - // all_options(), since options_ could have changed bewteen - // compile-time and now, but this is simpler and safe enough. - if (start+1 < static_cast(str->length()) && - (*str)[start] == '\r' && (*str)[start+1] == '\n' && - NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF) { - matchend++; - } - // We also need to advance more than one char if we're in utf8 mode. -#ifdef SUPPORT_UTF8 - if (options_.utf8()) { - while (matchend < static_cast(str->length()) && - ((*str)[matchend] & 0xc0) == 0x80) + while (start <= static_cast(str->length())) { + // If the previous match was for the empty string, we shouldn't + // just match again: we'll match in the same way and get an + // infinite loop. Instead, we do the match in a special way: + // anchored -- to force another try at the same position -- + // and with a flag saying that this time, ignore empty matches. + // If this special match returns, that means there's a non-empty + // match at this position as well, and we can continue. If not, + // we do what perl does, and just advance by one. + // Notice that perl prints '@@@' for this; + // perl -le '$_ = "aa"; s/b*|aa/@/g; print' + int matches; + if (last_match_was_empty_string) { + matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize); + if (matches <= 0) { + int matchend = start + 1; // advance one character. + // If the current char is CR and we're in CRLF mode, skip LF too. + // Note it's better to call pcre_fullinfo() than to examine + // all_options(), since options_ could have changed bewteen + // compile-time and now, but this is simpler and safe enough. + // Modified by PH to add ANY and ANYCRLF. + if (matchend < static_cast(str->length()) && + (*str)[start] == '\r' && (*str)[matchend] == '\n' && + (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF || + NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY || + NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) { matchend++; - } + } + // We also need to advance more than one char if we're in utf8 mode. +#ifdef SUPPORT_UTF8 + if (options_.utf8()) { + while (matchend < static_cast(str->length()) && + ((*str)[matchend] & 0xc0) == 0x80) + matchend++; + } #endif - if (matchend <= static_cast(str->length())) - out.append(*str, start, matchend - start); - start = matchend; + if (start < static_cast(str->length())) + out.append(*str, start, matchend - start); + start = matchend; + last_match_was_empty_string = false; + continue; + } } else { - out.append(*str, start, matchstart - start); - Rewrite(&out, rewrite, *str, vec, matches); - start = matchend; - lastend = matchend; - count++; + matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize); + if (matches <= 0) + break; } + int matchstart = vec[0], matchend = vec[1]; + assert(matchstart >= start); + assert(matchend >= matchstart); + out.append(*str, start, matchstart - start); + Rewrite(&out, rewrite, *str, vec, matches); + start = matchend; + lastend = matchend; + count++; + last_match_was_empty_string = (matchstart == matchend); } if (count == 0) @@ -420,7 +457,7 @@ const StringPiece& text, string *out) const { int vec[kVecSize]; - int matches = TryMatch(text, 0, UNANCHORED, vec, kVecSize); + int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize); if (matches == 0) return false; out->erase(); @@ -435,21 +472,27 @@ // Note that it's legal to escape a character even if it has no // special meaning in a regular expression -- so this function does // that. (This also makes it identical to the perl function of the - // same name; see `perldoc -f quotemeta`.) + // same name; see `perldoc -f quotemeta`.) The one exception is + // escaping NUL: rather than doing backslash + NUL, like perl does, + // we do '\0', because pcre itself doesn't take embedded NUL chars. for (int ii = 0; ii < unquoted.size(); ++ii) { // Note that using 'isalnum' here raises the benchmark time from // 32ns to 58ns: - if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && - (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && - (unquoted[ii] < '0' || unquoted[ii] > '9') && - unquoted[ii] != '_' && - // If this is the part of a UTF8 or Latin1 character, we need - // to copy this byte without escaping. Experimentally this is - // what works correctly with the regexp library. - !(unquoted[ii] & 128)) { + if (unquoted[ii] == '\0') { + result += "\\0"; + } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && + (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && + (unquoted[ii] < '0' || unquoted[ii] > '9') && + unquoted[ii] != '_' && + // If this is the part of a UTF8 or Latin1 character, we need + // to copy this byte without escaping. Experimentally this is + // what works correctly with the regexp library. + !(unquoted[ii] & 128)) { result += '\\'; + result += unquoted[ii]; + } else { + result += unquoted[ii]; } - result += unquoted[ii]; } return result; @@ -460,6 +503,7 @@ int RE::TryMatch(const StringPiece& text, int startpos, Anchor anchor, + bool empty_ok, int *vec, int vecsize) const { pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_; @@ -477,12 +521,19 @@ extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION; extra.match_limit_recursion = options_.match_limit_recursion(); } + + int options = 0; + if (anchor != UNANCHORED) + options |= PCRE_ANCHORED; + if (!empty_ok) + options |= PCRE_NOTEMPTY; + int rc = pcre_exec(re, // The regular expression object &extra, (text.data() == NULL) ? "" : text.data(), text.size(), startpos, - (anchor == UNANCHORED) ? 0 : PCRE_ANCHORED, + options, vec, vecsize); @@ -512,7 +563,7 @@ int* vec, int vecsize) const { assert((1 + n) * 3 <= vecsize); // results + PCRE workspace - int matches = TryMatch(text, 0, anchor, vec, vecsize); + int matches = TryMatch(text, 0, anchor, true, vec, vecsize); assert(matches >= 0); // TryMatch never returns negatives if (matches == 0) return false; @@ -577,14 +628,14 @@ if (start >= 0) out->append(text.data() + start, vec[2 * n + 1] - start); } else if (c == '\\') { - out->push_back('\\'); + *out += '\\'; } else { //fprintf(stderr, "invalid rewrite pattern: %.*s\n", // rewrite.size(), rewrite.data()); return false; } } else { - out->push_back(c); + *out += c; } } return true; @@ -612,23 +663,27 @@ } bool Arg::parse_string(const char* str, int n, void* dest) { + if (dest == NULL) return true; reinterpret_cast(dest)->assign(str, n); return true; } bool Arg::parse_stringpiece(const char* str, int n, void* dest) { + if (dest == NULL) return true; reinterpret_cast(dest)->set(str, n); return true; } bool Arg::parse_char(const char* str, int n, void* dest) { if (n != 1) return false; + if (dest == NULL) return true; *(reinterpret_cast(dest)) = str[0]; return true; } bool Arg::parse_uchar(const char* str, int n, void* dest) { if (n != 1) return false; + if (dest == NULL) return true; *(reinterpret_cast(dest)) = str[0]; return true; } @@ -677,6 +732,7 @@ long r = strtol(str, &end, radix); if (end != str + n) return false; // Leftover junk if (errno) return false; + if (dest == NULL) return true; *(reinterpret_cast(dest)) = r; return true; } @@ -694,6 +750,7 @@ unsigned long r = strtoul(str, &end, radix); if (end != str + n) return false; // Leftover junk if (errno) return false; + if (dest == NULL) return true; *(reinterpret_cast(dest)) = r; return true; } @@ -705,7 +762,8 @@ long r; if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse if (r < SHRT_MIN || r > SHRT_MAX) return false; // Out of range - *(reinterpret_cast(dest)) = r; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = static_cast(r); return true; } @@ -716,7 +774,8 @@ unsigned long r; if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse if (r > USHRT_MAX) return false; // Out of range - *(reinterpret_cast(dest)) = r; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = static_cast(r); return true; } @@ -727,6 +786,7 @@ long r; if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse if (r < INT_MIN || r > INT_MAX) return false; // Out of range + if (dest == NULL) return true; *(reinterpret_cast(dest)) = r; return true; } @@ -738,6 +798,7 @@ unsigned long r; if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse if (r > UINT_MAX) return false; // Out of range + if (dest == NULL) return true; *(reinterpret_cast(dest)) = r; return true; } @@ -758,11 +819,16 @@ long long r = strtoq(str, &end, radix); #elif defined HAVE_STRTOLL long long r = strtoll(str, &end, radix); +#elif defined HAVE__STRTOI64 + long long r = _strtoi64(str, &end, radix); +#elif defined HAVE_STRTOIMAX + long long r = strtoimax(str, &end, radix); #else #error parse_longlong_radix: cannot convert input to a long-long #endif if (end != str + n) return false; // Leftover junk if (errno) return false; + if (dest == NULL) return true; *(reinterpret_cast(dest)) = r; return true; #endif /* HAVE_LONG_LONG */ @@ -785,11 +851,16 @@ unsigned long long r = strtouq(str, &end, radix); #elif defined HAVE_STRTOLL unsigned long long r = strtoull(str, &end, radix); +#elif defined HAVE__STRTOI64 + unsigned long long r = _strtoui64(str, &end, radix); +#elif defined HAVE_STRTOIMAX + unsigned long long r = strtoumax(str, &end, radix); #else #error parse_ulonglong_radix: cannot convert input to a long-long #endif if (end != str + n) return false; // Leftover junk if (errno) return false; + if (dest == NULL) return true; *(reinterpret_cast(dest)) = r; return true; #endif /* HAVE_UNSIGNED_LONG_LONG */ @@ -807,6 +878,7 @@ double r = strtod(buf, &end); if (end != buf + n) return false; // Leftover junk if (errno) return false; + if (dest == NULL) return true; *(reinterpret_cast(dest)) = r; return true; } @@ -814,6 +886,7 @@ bool Arg::parse_float(const char* str, int n, void* dest) { double r; if (!parse_double(str, n, &r)) return false; + if (dest == NULL) return true; *(reinterpret_cast(dest)) = static_cast(r); return true; }