| 61 |
// If the user doesn't ask for any options, we just use this one |
// If the user doesn't ask for any options, we just use this one |
| 62 |
static RE_Options default_options; |
static RE_Options default_options; |
| 63 |
|
|
| 64 |
void RE::Init(const char* pat, const RE_Options* options) { |
void RE::Init(const string& pat, const RE_Options* options) { |
| 65 |
pattern_ = pat; |
pattern_ = pat; |
| 66 |
if (options == NULL) { |
if (options == NULL) { |
| 67 |
options_ = default_options; |
options_ = default_options; |
| 78 |
// conservative in that it may treat some "simple" patterns |
// conservative in that it may treat some "simple" patterns |
| 79 |
// as "complex" (e.g., if the vertical bar is in a character |
// as "complex" (e.g., if the vertical bar is in a character |
| 80 |
// class or is escaped). But it seems good enough. |
// class or is escaped). But it seems good enough. |
| 81 |
if (strchr(pat, '|') == NULL) { |
if (strchr(pat.c_str(), '|') == NULL) { |
| 82 |
// Simple pattern: we can use position-based checks to perform |
// Simple pattern: we can use position-based checks to perform |
| 83 |
// fully anchored matches |
// fully anchored matches |
| 84 |
re_full_ = re_partial_; |
re_full_ = re_partial_; |
| 89 |
} |
} |
| 90 |
} |
} |
| 91 |
|
|
| 92 |
RE::~RE() { |
void RE::Cleanup() { |
| 93 |
if (re_full_ != NULL && re_full_ != re_partial_) (*pcre_free)(re_full_); |
if (re_full_ != NULL && re_full_ != re_partial_) (*pcre_free)(re_full_); |
| 94 |
if (re_partial_ != NULL) (*pcre_free)(re_partial_); |
if (re_partial_ != NULL) (*pcre_free)(re_partial_); |
| 95 |
if (error_ != &empty_string) delete error_; |
if (error_ != &empty_string) delete error_; |
| 96 |
} |
} |
| 97 |
|
|
| 98 |
|
|
| 99 |
|
// Destructor: releases everything this RE holds by delegating to Cleanup().
RE::~RE() {
  this->Cleanup();
}
| 102 |
|
|
| 103 |
|
|
| 104 |
pcre* RE::Compile(Anchor anchor) { |
pcre* RE::Compile(Anchor anchor) { |
| 105 |
// First, convert RE_Options into pcre options |
// First, convert RE_Options into pcre options |
| 106 |
int pcre_options = 0; |
int pcre_options = 0; |
| 338 |
return true; |
return true; |
| 339 |
} |
} |
| 340 |
|
|
| 341 |
|
// Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF. |
| 342 |
|
// Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF. |
| 343 |
|
static int NewlineMode(int pcre_options) { |
| 344 |
|
// TODO: if we can make it threadsafe, cache this var |
| 345 |
|
int newline_mode = 0; |
| 346 |
|
/* if (newline_mode) return newline_mode; */ // do this once it's cached |
| 347 |
|
if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)) { |
| 348 |
|
newline_mode = (pcre_options & |
| 349 |
|
(PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)); |
| 350 |
|
} else { |
| 351 |
|
int newline; |
| 352 |
|
pcre_config(PCRE_CONFIG_NEWLINE, &newline); |
| 353 |
|
if (newline == 10) |
| 354 |
|
newline_mode = PCRE_NEWLINE_LF; |
| 355 |
|
else if (newline == 13) |
| 356 |
|
newline_mode = PCRE_NEWLINE_CR; |
| 357 |
|
else if (newline == 3338) |
| 358 |
|
newline_mode = PCRE_NEWLINE_CRLF; |
| 359 |
|
else |
| 360 |
|
assert("" == "Unexpected return value from pcre_config(NEWLINE)"); |
| 361 |
|
} |
| 362 |
|
return newline_mode; |
| 363 |
|
} |
| 364 |
|
|
| 365 |
int RE::GlobalReplace(const StringPiece& rewrite, |
int RE::GlobalReplace(const StringPiece& rewrite, |
| 366 |
string *str) const { |
string *str) const { |
| 367 |
int count = 0; |
int count = 0; |
| 380 |
if (matchstart == matchend && matchstart == lastend) { |
if (matchstart == matchend && matchstart == lastend) { |
| 381 |
// advance one character if we matched an empty string at the same |
// advance one character if we matched an empty string at the same |
| 382 |
// place as the last match occurred |
// place as the last match occurred |
| 383 |
if (start < static_cast<int>(str->length())) |
matchend = start + 1; |
| 384 |
out.push_back((*str)[start]); |
// If the current char is CR and we're in CRLF mode, skip LF too. |
| 385 |
start++; |
// Note it's better to call pcre_fullinfo() than to examine |
| 386 |
|
// all_options(), since options_ could have changed between |
| 387 |
|
// compile-time and now, but this is simpler and safe enough. |
| 388 |
|
if (start+1 < static_cast<int>(str->length()) && |
| 389 |
|
(*str)[start] == '\r' && (*str)[start+1] == '\n' && |
| 390 |
|
NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF) { |
| 391 |
|
matchend++; |
| 392 |
|
} |
| 393 |
|
// We also need to advance more than one char if we're in utf8 mode. |
| 394 |
|
#ifdef SUPPORT_UTF8 |
| 395 |
|
if (options_.utf8()) { |
| 396 |
|
while (matchend < static_cast<int>(str->length()) && |
| 397 |
|
((*str)[matchend] & 0xc0) == 0x80) |
| 398 |
|
matchend++; |
| 399 |
|
} |
| 400 |
|
#endif |
| 401 |
|
if (matchend <= static_cast<int>(str->length())) |
| 402 |
|
out.append(*str, start, matchend - start); |
| 403 |
|
start = matchend; |
| 404 |
} else { |
} else { |
| 405 |
out.append(*str, start, matchstart - start); |
out.append(*str, start, matchstart - start); |
| 406 |
Rewrite(&out, rewrite, *str, vec, matches); |
Rewrite(&out, rewrite, *str, vec, matches); |
| 430 |
return Rewrite(out, rewrite, text, vec, matches); |
return Rewrite(out, rewrite, text, vec, matches); |
| 431 |
} |
} |
| 432 |
|
|
| 433 |
|
// Returns a copy of `unquoted` in which every regexp-significant byte is
// escaped, so that the result matches the original text literally.
/*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
  string result;

  // Escape any ascii character not in [A-Za-z_0-9].
  //
  // Note that it's legal to escape a character even if it has no
  // special meaning in a regular expression -- so this function does
  // that.  (This also makes it identical to the perl function of the
  // same name; see `perldoc -f quotemeta`.)
  for (int ii = 0; ii < unquoted.size(); ++ii) {
    const char c = unquoted[ii];

    if (c == '\0') {
      // A backslash followed by a raw NUL would end the pattern early once
      // it is handed to PCRE as a C string, so spell the byte out instead.
      // "\\x00" is used rather than "\\0" because a digit following "\\0"
      // would be absorbed into the octal escape.
      result += "\\x00";
      continue;
    }

    // Note that using 'isalnum' here raises the benchmark time from
    // 32ns to 58ns:
    const bool is_word_char = (c >= 'a' && c <= 'z') ||
                              (c >= 'A' && c <= 'Z') ||
                              (c >= '0' && c <= '9') ||
                              c == '_';
    // If this is the part of a UTF8 or Latin1 character, we need
    // to copy this byte without escaping.  Experimentally this is
    // what works correctly with the regexp library.
    const bool has_high_bit = (c & 128) != 0;

    if (!is_word_char && !has_high_bit) {
      result += '\\';
    }
    result += c;
  }

  return result;
}
| 460 |
|
|
| 461 |
/***** Actual matching and rewriting code *****/ |
/***** Actual matching and rewriting code *****/ |
| 462 |
|
|
| 463 |
int RE::TryMatch(const StringPiece& text, |
int RE::TryMatch(const StringPiece& text, |
| 843 |
return parse_##name##_radix(str, n, dest, 0); \ |
return parse_##name##_radix(str, n, dest, 0); \ |
| 844 |
} |
} |
| 845 |
|
|
| 846 |
DEFINE_INTEGER_PARSERS(short); |
DEFINE_INTEGER_PARSERS(short) /* */ |
| 847 |
DEFINE_INTEGER_PARSERS(ushort); |
DEFINE_INTEGER_PARSERS(ushort) /* */ |
| 848 |
DEFINE_INTEGER_PARSERS(int); |
DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */ |
| 849 |
DEFINE_INTEGER_PARSERS(uint); |
DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */ |
| 850 |
DEFINE_INTEGER_PARSERS(long); |
DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */ |
| 851 |
DEFINE_INTEGER_PARSERS(ulong); |
DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */ |
| 852 |
DEFINE_INTEGER_PARSERS(longlong); |
DEFINE_INTEGER_PARSERS(longlong) /* */ |
| 853 |
DEFINE_INTEGER_PARSERS(ulonglong); |
DEFINE_INTEGER_PARSERS(ulonglong) /* */ |
| 854 |
|
|
| 855 |
#undef DEFINE_INTEGER_PARSERS |
#undef DEFINE_INTEGER_PARSERS |
| 856 |
|
|