| 29 |
// |
// |
| 30 |
// Author: Sanjay Ghemawat |
// Author: Sanjay Ghemawat |
| 31 |
|
|
| 32 |
|
#ifdef HAVE_CONFIG_H |
| 33 |
|
#include "config.h" |
| 34 |
|
#endif |
| 35 |
|
|
| 36 |
#include <stdlib.h> |
#include <stdlib.h> |
| 37 |
#include <stdio.h> |
#include <stdio.h> |
| 38 |
#include <ctype.h> |
#include <ctype.h> |
| 41 |
#include <errno.h> |
#include <errno.h> |
| 42 |
#include <string> |
#include <string> |
| 43 |
#include <algorithm> |
#include <algorithm> |
| 44 |
#include "config.h" |
|
| 45 |
// We need this to compile the proper dll on windows/msys. This is copied |
#include "pcrecpp_internal.h" |
|
// from pcre_internal.h. It would probably be better just to include that. |
|
|
#define PCRE_DEFINITION /* Win32 __declspec(export) trigger for .dll */ |
|
| 46 |
#include "pcre.h" |
#include "pcre.h" |
|
#include "pcre_stringpiece.h" |
|
| 47 |
#include "pcrecpp.h" |
#include "pcrecpp.h" |
| 48 |
|
#include "pcre_stringpiece.h" |
| 49 |
|
|
| 50 |
|
|
| 51 |
namespace pcrecpp { |
namespace pcrecpp { |
| 55 |
static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace |
static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace |
| 56 |
|
|
| 57 |
// Special object that stands-in for no argument |
// Special object that stands-in for no argument |
| 58 |
Arg no_arg((void*)NULL); |
Arg RE::no_arg((void*)NULL); |
| 59 |
|
|
| 60 |
|
// This is for ABI compatibility with old versions of pcre (pre-7.6), |
| 61 |
|
// which defined a global no_arg variable instead of putting it in the |
| 62 |
|
// RE class. This works on GCC >= 3, at least. It definitely works |
| 63 |
|
// for ELF, but may not for other object formats (Mach-O, for |
| 64 |
|
// instance, does not support aliases.) We could probably have a more |
| 65 |
|
// inclusive test if we ever needed it. (Note that not only the |
| 66 |
|
// __attribute__ syntax, but also __USER_LABEL_PREFIX__, are |
| 67 |
|
// gnu-specific.) |
| 68 |
|
#if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__) |
| 69 |
|
# define ULP_AS_STRING(x) ULP_AS_STRING_INTERNAL(x) |
| 70 |
|
# define ULP_AS_STRING_INTERNAL(x) #x |
| 71 |
|
# define USER_LABEL_PREFIX_STR ULP_AS_STRING(__USER_LABEL_PREFIX__) |
| 72 |
|
extern Arg no_arg |
| 73 |
|
__attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE"))); |
| 74 |
|
#endif |
| 75 |
|
|
| 76 |
// If a regular expression has no error, its error_ field points here |
// If a regular expression has no error, its error_ field points here |
| 77 |
static const string empty_string; |
static const string empty_string; |
| 92 |
|
|
| 93 |
re_partial_ = Compile(UNANCHORED); |
re_partial_ = Compile(UNANCHORED); |
| 94 |
if (re_partial_ != NULL) { |
if (re_partial_ != NULL) { |
| 95 |
// Check for complicated patterns. The following change is |
re_full_ = Compile(ANCHOR_BOTH); |
|
// conservative in that it may treat some "simple" patterns |
|
|
// as "complex" (e.g., if the vertical bar is in a character |
|
|
// class or is escaped). But it seems good enough. |
|
|
if (strchr(pat.c_str(), '|') == NULL) { |
|
|
// Simple pattern: we can use position-based checks to perform |
|
|
// fully anchored matches |
|
|
re_full_ = re_partial_; |
|
|
} else { |
|
|
// We need a special pattern for anchored matches |
|
|
re_full_ = Compile(ANCHOR_BOTH); |
|
|
} |
|
| 96 |
} |
} |
| 97 |
} |
} |
| 98 |
|
|
| 99 |
// RE::Cleanup: releases what this RE object holds -- the compiled pattern
// pointers re_full_ and re_partial_ (via the pcre_free function pointer) and
// the error_ string.
//
// NOTE(review): this region is a two-column diff rendering. Each statement is
// listed twice (first column, then second column) with `| N |` line-number
// rows in between, so the text is not compilable as written.
void RE::Cleanup() { |
void RE::Cleanup() { |
| 100 |
// The two columns differ here. The first copy skips the free when re_full_ is
// the same pointer as re_partial_, so that pointer is only freed once (by the
// re_partial_ line below). The second copy frees re_full_ whenever it is
// non-NULL, which is only safe if the two pointers are never shared.
// Presumably each copy pairs with the matching column of the initialisation
// code -- TODO confirm.
if (re_full_ != NULL && re_full_ != re_partial_) (*pcre_free)(re_full_); |
if (re_full_ != NULL) (*pcre_free)(re_full_); |
| 101 |
if (re_partial_ != NULL) (*pcre_free)(re_partial_); |
if (re_partial_ != NULL) (*pcre_free)(re_partial_); |
| 102 |
// error_ is deleted unless it points at the file-level empty_string object.
if (error_ != &empty_string) delete error_; |
if (error_ != &empty_string) delete error_; |
| 103 |
} |
} |
| 104 |
|
|
| 105 |
|
|
| 347 |
|
|
| 348 |
// Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF. |
// Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF. |
| 349 |
// Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF. |
// Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF. |
| 350 |
|
// Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF. |
| 351 |
|
|
| 352 |
static int NewlineMode(int pcre_options) { |
static int NewlineMode(int pcre_options) { |
| 353 |
// TODO: if we can make it threadsafe, cache this var |
// TODO: if we can make it threadsafe, cache this var |
| 354 |
int newline_mode = 0; |
int newline_mode = 0; |
| 355 |
/* if (newline_mode) return newline_mode; */ // do this once it's cached |
/* if (newline_mode) return newline_mode; */ // do this once it's cached |
| 356 |
if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)) { |
if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF| |
| 357 |
|
PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) { |
| 358 |
newline_mode = (pcre_options & |
newline_mode = (pcre_options & |
| 359 |
(PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)); |
(PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF| |
| 360 |
|
PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)); |
| 361 |
} else { |
} else { |
| 362 |
int newline; |
int newline; |
| 363 |
pcre_config(PCRE_CONFIG_NEWLINE, &newline); |
pcre_config(PCRE_CONFIG_NEWLINE, &newline); |
| 367 |
newline_mode = PCRE_NEWLINE_CR; |
newline_mode = PCRE_NEWLINE_CR; |
| 368 |
else if (newline == 3338) |
else if (newline == 3338) |
| 369 |
newline_mode = PCRE_NEWLINE_CRLF; |
newline_mode = PCRE_NEWLINE_CRLF; |
| 370 |
|
else if (newline == -1) |
| 371 |
|
newline_mode = PCRE_NEWLINE_ANY; |
| 372 |
|
else if (newline == -2) |
| 373 |
|
newline_mode = PCRE_NEWLINE_ANYCRLF; |
| 374 |
else |
else |
| 375 |
assert("" == "Unexpected return value from pcre_config(NEWLINE)"); |
assert("" == "Unexpected return value from pcre_config(NEWLINE)"); |
| 376 |
} |
} |
| 385 |
int start = 0; |
int start = 0; |
| 386 |
int lastend = -1; |
int lastend = -1; |
| 387 |
|
|
| 388 |
for (; start <= static_cast<int>(str->length()); count++) { |
while (start <= static_cast<int>(str->length())) { |
| 389 |
int matches = TryMatch(*str, start, UNANCHORED, vec, kVecSize); |
int matches = TryMatch(*str, start, UNANCHORED, vec, kVecSize); |
| 390 |
if (matches <= 0) |
if (matches <= 0) |
| 391 |
break; |
break; |
| 400 |
// Note it's better to call pcre_fullinfo() than to examine |
// Note it's better to call pcre_fullinfo() than to examine |
| 401 |
// all_options(), since options_ could have changed between |
// all_options(), since options_ could have changed between |
| 402 |
// compile-time and now, but this is simpler and safe enough. |
// compile-time and now, but this is simpler and safe enough. |
| 403 |
|
// Modified by PH to add ANY and ANYCRLF. |
| 404 |
if (start+1 < static_cast<int>(str->length()) && |
if (start+1 < static_cast<int>(str->length()) && |
| 405 |
(*str)[start] == '\r' && (*str)[start+1] == '\n' && |
(*str)[start] == '\r' && (*str)[start+1] == '\n' && |
| 406 |
NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF) { |
(NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF || |
| 407 |
|
NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY || |
| 408 |
|
NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF) |
| 409 |
|
) { |
| 410 |
matchend++; |
matchend++; |
| 411 |
} |
} |
| 412 |
// We also need to advance more than one char if we're in utf8 mode. |
// We also need to advance more than one char if we're in utf8 mode. |
| 457 |
// Note that it's legal to escape a character even if it has no |
// Note that it's legal to escape a character even if it has no |
| 458 |
// special meaning in a regular expression -- so this function does |
// special meaning in a regular expression -- so this function does |
| 459 |
// that. (This also makes it identical to the perl function of the |
// that. (This also makes it identical to the perl function of the |
| 460 |
// same name; see `perldoc -f quotemeta`.) |
// same name; see `perldoc -f quotemeta`.) The one exception is |
| 461 |
|
// escaping NUL: rather than doing backslash + NUL, like perl does, |
| 462 |
|
// we do '\0', because pcre itself doesn't take embedded NUL chars. |
| 463 |
for (int ii = 0; ii < unquoted.size(); ++ii) { |
for (int ii = 0; ii < unquoted.size(); ++ii) { |
| 464 |
// Note that using 'isalnum' here raises the benchmark time from |
// Note that using 'isalnum' here raises the benchmark time from |
| 465 |
// 32ns to 58ns: |
// 32ns to 58ns: |
| 466 |
if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && |
if (unquoted[ii] == '\0') { |
| 467 |
(unquoted[ii] < 'A' || unquoted[ii] > 'Z') && |
result += "\\0"; |
| 468 |
(unquoted[ii] < '0' || unquoted[ii] > '9') && |
} else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && |
| 469 |
unquoted[ii] != '_' && |
(unquoted[ii] < 'A' || unquoted[ii] > 'Z') && |
| 470 |
// If this is the part of a UTF8 or Latin1 character, we need |
(unquoted[ii] < '0' || unquoted[ii] > '9') && |
| 471 |
// to copy this byte without escaping. Experimentally this is |
unquoted[ii] != '_' && |
| 472 |
// what works correctly with the regexp library. |
// If this is the part of a UTF8 or Latin1 character, we need |
| 473 |
!(unquoted[ii] & 128)) { |
// to copy this byte without escaping. Experimentally this is |
| 474 |
|
// what works correctly with the regexp library. |
| 475 |
|
!(unquoted[ii] & 128)) { |
| 476 |
result += '\\'; |
result += '\\'; |
| 477 |
|
result += unquoted[ii]; |
| 478 |
|
} else { |
| 479 |
|
result += unquoted[ii]; |
| 480 |
} |
} |
|
result += unquoted[ii]; |
|
| 481 |
} |
} |
| 482 |
|
|
| 483 |
return result; |
return result; |
| 496 |
return 0; |
return 0; |
| 497 |
} |
} |
| 498 |
|
|
| 499 |
pcre_extra extra = { 0 }; |
pcre_extra extra = { 0, 0, 0, 0, 0, 0 }; |
| 500 |
if (options_.match_limit() > 0) { |
if (options_.match_limit() > 0) { |
| 501 |
extra.flags |= PCRE_EXTRA_MATCH_LIMIT; |
extra.flags |= PCRE_EXTRA_MATCH_LIMIT; |
| 502 |
extra.match_limit = options_.match_limit(); |
extra.match_limit = options_.match_limit(); |
| 529 |
rc = vecsize / 2; |
rc = vecsize / 2; |
| 530 |
} |
} |
| 531 |
|
|
|
if ((anchor == ANCHOR_BOTH) && (re_full_ == re_partial_)) { |
|
|
// We need an extra check to make sure that the match extended |
|
|
// to the end of the input string |
|
|
assert(vec[0] == 0); // PCRE_ANCHORED forces starting match |
|
|
if (vec[1] != text.size()) return 0; // Did not get ending match |
|
|
} |
|
|
|
|
| 532 |
return rc; |
return rc; |
| 533 |
} |
} |
| 534 |
|
|
| 640 |
} |
} |
| 641 |
|
|
| 642 |
// Arg::parse_string: copies the n characters starting at str into the string
// object that dest points to, and always returns true.
//
// NOTE(review): two-column diff rendering -- every statement appears twice and
// `| N |` rows are line-number markers. Only the second column contains the
// `dest == NULL` guard, which returns true without storing anything; the
// first column dereferences dest unconditionally.
bool Arg::parse_string(const char* str, int n, void* dest) { |
bool Arg::parse_string(const char* str, int n, void* dest) { |
| 643 |
|
if (dest == NULL) return true; |
| 644 |
reinterpret_cast<string*>(dest)->assign(str, n); |
reinterpret_cast<string*>(dest)->assign(str, n); |
| 645 |
return true; |
return true; |
| 646 |
} |
} |
| 647 |
|
|
| 648 |
// Arg::parse_stringpiece: calls set(str, n) on the StringPiece that dest
// points to, and always returns true.
//
// NOTE(review): two-column diff rendering -- every statement appears twice and
// `| N |` rows are line-number markers. Only the second column contains the
// `dest == NULL` guard, which returns true without touching dest.
bool Arg::parse_stringpiece(const char* str, int n, void* dest) { |
bool Arg::parse_stringpiece(const char* str, int n, void* dest) { |
| 649 |
|
if (dest == NULL) return true; |
| 650 |
reinterpret_cast<StringPiece*>(dest)->set(str, n); |
reinterpret_cast<StringPiece*>(dest)->set(str, n); |
| 651 |
return true; |
return true; |
| 652 |
} |
} |
| 653 |
|
|
| 654 |
// Arg::parse_char: accepts only input of length exactly 1. Returns false for
// any other n; otherwise writes str[0] through dest (treated as char*) and
// returns true.
//
// NOTE(review): two-column diff rendering -- every statement appears twice and
// `| N |` rows are line-number markers. Only the second column contains the
// `dest == NULL` guard; it sits after the length check, so a NULL dest still
// reports false when n != 1.
bool Arg::parse_char(const char* str, int n, void* dest) { |
bool Arg::parse_char(const char* str, int n, void* dest) { |
| 655 |
if (n != 1) return false; |
if (n != 1) return false; |
| 656 |
|
if (dest == NULL) return true; |
| 657 |
*(reinterpret_cast<char*>(dest)) = str[0]; |
*(reinterpret_cast<char*>(dest)) = str[0]; |
| 658 |
return true; |
return true; |
| 659 |
} |
} |
| 660 |
|
|
| 661 |
// Arg::parse_uchar: same shape as the char parser, but the byte is stored
// through dest treated as unsigned char*. Returns false unless n == 1.
//
// NOTE(review): two-column diff rendering -- every statement appears twice and
// `| N |` rows are line-number markers. Only the second column contains the
// `dest == NULL` guard, placed after the length check.
bool Arg::parse_uchar(const char* str, int n, void* dest) { |
bool Arg::parse_uchar(const char* str, int n, void* dest) { |
| 662 |
if (n != 1) return false; |
if (n != 1) return false; |
| 663 |
|
if (dest == NULL) return true; |
| 664 |
// str[0] is a plain char; the assignment converts it to unsigned char.
*(reinterpret_cast<unsigned char*>(dest)) = str[0]; |
*(reinterpret_cast<unsigned char*>(dest)) = str[0]; |
| 665 |
return true; |
return true; |
| 666 |
} |
} |
| 709 |
long r = strtol(str, &end, radix); |
long r = strtol(str, &end, radix); |
| 710 |
if (end != str + n) return false; // Leftover junk |
if (end != str + n) return false; // Leftover junk |
| 711 |
if (errno) return false; |
if (errno) return false; |
| 712 |
|
if (dest == NULL) return true; |
| 713 |
*(reinterpret_cast<long*>(dest)) = r; |
*(reinterpret_cast<long*>(dest)) = r; |
| 714 |
return true; |
return true; |
| 715 |
} |
} |
| 727 |
unsigned long r = strtoul(str, &end, radix); |
unsigned long r = strtoul(str, &end, radix); |
| 728 |
if (end != str + n) return false; // Leftover junk |
if (end != str + n) return false; // Leftover junk |
| 729 |
if (errno) return false; |
if (errno) return false; |
| 730 |
|
if (dest == NULL) return true; |
| 731 |
*(reinterpret_cast<unsigned long*>(dest)) = r; |
*(reinterpret_cast<unsigned long*>(dest)) = r; |
| 732 |
return true; |
return true; |
| 733 |
} |
} |
| 739 |
long r; |
long r; |
| 740 |
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
| 741 |
if (r < SHRT_MIN || r > SHRT_MAX) return false; // Out of range |
if (r < SHRT_MIN || r > SHRT_MAX) return false; // Out of range |
| 742 |
*(reinterpret_cast<short*>(dest)) = r; |
if (dest == NULL) return true; |
| 743 |
|
*(reinterpret_cast<short*>(dest)) = static_cast<short>(r); |
| 744 |
return true; |
return true; |
| 745 |
} |
} |
| 746 |
|
|
| 751 |
unsigned long r; |
unsigned long r; |
| 752 |
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
| 753 |
if (r > USHRT_MAX) return false; // Out of range |
if (r > USHRT_MAX) return false; // Out of range |
| 754 |
*(reinterpret_cast<unsigned short*>(dest)) = r; |
if (dest == NULL) return true; |
| 755 |
|
*(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r); |
| 756 |
return true; |
return true; |
| 757 |
} |
} |
| 758 |
|
|
| 763 |
long r; |
long r; |
| 764 |
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
| 765 |
if (r < INT_MIN || r > INT_MAX) return false; // Out of range |
if (r < INT_MIN || r > INT_MAX) return false; // Out of range |
| 766 |
|
if (dest == NULL) return true; |
| 767 |
*(reinterpret_cast<int*>(dest)) = r; |
*(reinterpret_cast<int*>(dest)) = r; |
| 768 |
return true; |
return true; |
| 769 |
} |
} |
| 775 |
unsigned long r; |
unsigned long r; |
| 776 |
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
| 777 |
if (r > UINT_MAX) return false; // Out of range |
if (r > UINT_MAX) return false; // Out of range |
| 778 |
|
if (dest == NULL) return true; |
| 779 |
*(reinterpret_cast<unsigned int*>(dest)) = r; |
*(reinterpret_cast<unsigned int*>(dest)) = r; |
| 780 |
return true; |
return true; |
| 781 |
} |
} |
| 796 |
long long r = strtoq(str, &end, radix); |
long long r = strtoq(str, &end, radix); |
| 797 |
#elif defined HAVE_STRTOLL |
#elif defined HAVE_STRTOLL |
| 798 |
long long r = strtoll(str, &end, radix); |
long long r = strtoll(str, &end, radix); |
| 799 |
|
#elif defined HAVE__STRTOI64 |
| 800 |
|
long long r = _strtoi64(str, &end, radix); |
| 801 |
#else |
#else |
| 802 |
#error parse_longlong_radix: cannot convert input to a long-long |
#error parse_longlong_radix: cannot convert input to a long-long |
| 803 |
#endif |
#endif |
| 804 |
if (end != str + n) return false; // Leftover junk |
if (end != str + n) return false; // Leftover junk |
| 805 |
if (errno) return false; |
if (errno) return false; |
| 806 |
|
if (dest == NULL) return true; |
| 807 |
*(reinterpret_cast<long long*>(dest)) = r; |
*(reinterpret_cast<long long*>(dest)) = r; |
| 808 |
return true; |
return true; |
| 809 |
#endif /* HAVE_LONG_LONG */ |
#endif /* HAVE_LONG_LONG */ |
| 826 |
unsigned long long r = strtouq(str, &end, radix); |
unsigned long long r = strtouq(str, &end, radix); |
| 827 |
#elif defined HAVE_STRTOLL |
#elif defined HAVE_STRTOLL |
| 828 |
unsigned long long r = strtoull(str, &end, radix); |
unsigned long long r = strtoull(str, &end, radix); |
| 829 |
|
#elif defined HAVE__STRTOI64 |
| 830 |
|
unsigned long long r = _strtoui64(str, &end, radix); |
| 831 |
#else |
#else |
| 832 |
#error parse_ulonglong_radix: cannot convert input to a long-long |
#error parse_ulonglong_radix: cannot convert input to a long-long |
| 833 |
#endif |
#endif |
| 834 |
if (end != str + n) return false; // Leftover junk |
if (end != str + n) return false; // Leftover junk |
| 835 |
if (errno) return false; |
if (errno) return false; |
| 836 |
|
if (dest == NULL) return true; |
| 837 |
*(reinterpret_cast<unsigned long long*>(dest)) = r; |
*(reinterpret_cast<unsigned long long*>(dest)) = r; |
| 838 |
return true; |
return true; |
| 839 |
#endif /* HAVE_UNSIGNED_LONG_LONG */ |
#endif /* HAVE_UNSIGNED_LONG_LONG */ |
| 851 |
double r = strtod(buf, &end); |
double r = strtod(buf, &end); |
| 852 |
if (end != buf + n) return false; // Leftover junk |
if (end != buf + n) return false; // Leftover junk |
| 853 |
if (errno) return false; |
if (errno) return false; |
| 854 |
|
if (dest == NULL) return true; |
| 855 |
*(reinterpret_cast<double*>(dest)) = r; |
*(reinterpret_cast<double*>(dest)) = r; |
| 856 |
return true; |
return true; |
| 857 |
} |
} |
| 859 |
// Arg::parse_float: parses the text by delegating to parse_double with a
// local double as the destination, then stores the result through dest
// (treated as float*) after a static_cast to float. Returns false when
// parse_double fails, true otherwise.
//
// NOTE(review): two-column diff rendering -- every statement appears twice and
// `| N |` rows are line-number markers. Only the second column contains the
// `dest == NULL` guard; it runs after parse_double, so the text is still
// validated when dest is NULL.
bool Arg::parse_float(const char* str, int n, void* dest) { |
bool Arg::parse_float(const char* str, int n, void* dest) { |
| 860 |
double r; |
double r; |
| 861 |
if (!parse_double(str, n, &r)) return false; |
if (!parse_double(str, n, &r)) return false; |
| 862 |
|
if (dest == NULL) return true; |
| 863 |
// Narrowing double -> float; no range check is performed here.
*(reinterpret_cast<float*>(dest)) = static_cast<float>(r); |
*(reinterpret_cast<float*>(dest)) = static_cast<float>(r); |
| 864 |
return true; |
return true; |
| 865 |
} |
} |