| 1 |
// -*- coding: utf-8 -*- |
// -*- coding: utf-8 -*- |
| 2 |
// |
// |
| 3 |
// Copyright (c) 2005 - 2006, Google Inc. |
// Copyright (c) 2005 - 2010, Google Inc. |
| 4 |
// All rights reserved. |
// All rights reserved. |
| 5 |
// |
// |
| 6 |
// Redistribution and use in source and binary forms, with or without |
// Redistribution and use in source and binary forms, with or without |
| 33 |
// |
// |
| 34 |
// TODO: Test extractions for PartialMatch/Consume |
// TODO: Test extractions for PartialMatch/Consume |
| 35 |
|
|
| 36 |
|
#ifdef HAVE_CONFIG_H |
| 37 |
|
#include "config.h" |
| 38 |
|
#endif |
| 39 |
|
|
| 40 |
#include <stdio.h> |
#include <stdio.h> |
| 41 |
|
#include <string.h> /* for memset and strcmp */ |
| 42 |
#include <cassert> |
#include <cassert> |
| 43 |
#include <vector> |
#include <vector> |
|
#include "config.h" |
|
| 44 |
#include "pcrecpp.h" |
#include "pcrecpp.h" |
| 45 |
|
|
| 46 |
using pcrecpp::StringPiece; |
using pcrecpp::StringPiece; |
| 111 |
initial_size = VirtualProcessSize(); |
initial_size = VirtualProcessSize(); |
| 112 |
printf("Size after 50000: %llu\n", initial_size); |
printf("Size after 50000: %llu\n", initial_size); |
| 113 |
} |
} |
| 114 |
char buf[100]; |
char buf[100]; // definitely big enough |
| 115 |
snprintf(buf, sizeof(buf), "pat%09d", i); |
sprintf(buf, "pat%09d", i); |
| 116 |
RE newre(buf); |
RE newre(buf); |
| 117 |
} |
} |
| 118 |
uint64 final_size = VirtualProcessSize(); |
uint64 final_size = VirtualProcessSize(); |
| 214 |
const char *original; |
const char *original; |
| 215 |
const char *single; |
const char *single; |
| 216 |
const char *global; |
const char *global; |
| 217 |
|
int global_count; // the expected return value from ReplaceAll |
| 218 |
}; |
}; |
| 219 |
static const ReplaceTest tests[] = { |
static const ReplaceTest tests[] = { |
| 220 |
{ "(qu|[b-df-hj-np-tv-z]*)([a-z]+)", |
{ "(qu|[b-df-hj-np-tv-z]*)([a-z]+)", |
| 221 |
"\\2\\1ay", |
"\\2\\1ay", |
| 222 |
"the quick brown fox jumps over the lazy dogs.", |
"the quick brown fox jumps over the lazy dogs.", |
| 223 |
"ethay quick brown fox jumps over the lazy dogs.", |
"ethay quick brown fox jumps over the lazy dogs.", |
| 224 |
"ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday." }, |
"ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.", |
| 225 |
|
9 }, |
| 226 |
{ "\\w+", |
{ "\\w+", |
| 227 |
"\\0-NOSPAM", |
"\\0-NOSPAM", |
| 228 |
"paul.haahr@google.com", |
"paul.haahr@google.com", |
| 229 |
"paul-NOSPAM.haahr@google.com", |
"paul-NOSPAM.haahr@google.com", |
| 230 |
"paul-NOSPAM.haahr-NOSPAM@google-NOSPAM.com-NOSPAM" }, |
"paul-NOSPAM.haahr-NOSPAM@google-NOSPAM.com-NOSPAM", |
| 231 |
|
4 }, |
| 232 |
{ "^", |
{ "^", |
| 233 |
"(START)", |
"(START)", |
| 234 |
"foo", |
"foo", |
| 235 |
"(START)foo", |
"(START)foo", |
| 236 |
"(START)foo" }, |
"(START)foo", |
| 237 |
|
1 }, |
| 238 |
{ "^", |
{ "^", |
| 239 |
"(START)", |
"(START)", |
| 240 |
"", |
"", |
| 241 |
"(START)", |
"(START)", |
| 242 |
"(START)" }, |
"(START)", |
| 243 |
|
1 }, |
| 244 |
{ "$", |
{ "$", |
| 245 |
"(END)", |
"(END)", |
| 246 |
"", |
"", |
| 247 |
"(END)", |
"(END)", |
| 248 |
"(END)" }, |
"(END)", |
| 249 |
|
1 }, |
| 250 |
{ "b", |
{ "b", |
| 251 |
"bb", |
"bb", |
| 252 |
"ababababab", |
"ababababab", |
| 253 |
"abbabababab", |
"abbabababab", |
| 254 |
"abbabbabbabbabb" }, |
"abbabbabbabbabb", |
| 255 |
|
5 }, |
| 256 |
{ "b", |
{ "b", |
| 257 |
"bb", |
"bb", |
| 258 |
"bbbbbb", |
"bbbbbb", |
| 259 |
"bbbbbbb", |
"bbbbbbb", |
| 260 |
"bbbbbbbbbbbb" }, |
"bbbbbbbbbbbb", |
| 261 |
|
6 }, |
| 262 |
{ "b+", |
{ "b+", |
| 263 |
"bb", |
"bb", |
| 264 |
"bbbbbb", |
"bbbbbb", |
| 265 |
"bb", |
"bb", |
| 266 |
"bb" }, |
"bb", |
| 267 |
|
1 }, |
| 268 |
{ "b*", |
{ "b*", |
| 269 |
"bb", |
"bb", |
| 270 |
"bbbbbb", |
"bbbbbb", |
| 271 |
"bb", |
"bb", |
| 272 |
"bb" }, |
"bbbb", |
| 273 |
|
2 }, |
| 274 |
{ "b*", |
{ "b*", |
| 275 |
"bb", |
"bb", |
| 276 |
"aaaaa", |
"aaaaa", |
| 277 |
"bbaaaaa", |
"bbaaaaa", |
| 278 |
"bbabbabbabbabbabb" }, |
"bbabbabbabbabbabb", |
| 279 |
|
6 }, |
| 280 |
{ "b*", |
{ "b*", |
| 281 |
"bb", |
"bb", |
| 282 |
"aa\naa\n", |
"aa\naa\n", |
| 283 |
"bbaa\naa\n", |
"bbaa\naa\n", |
| 284 |
"bbabbabb\nbbabbabb\nbb" }, |
"bbabbabb\nbbabbabb\nbb", |
| 285 |
|
7 }, |
| 286 |
{ "b*", |
{ "b*", |
| 287 |
"bb", |
"bb", |
| 288 |
"aa\raa\r", |
"aa\raa\r", |
| 289 |
"bbaa\raa\r", |
"bbaa\raa\r", |
| 290 |
"bbabbabb\rbbabbabb\rbb" }, |
"bbabbabb\rbbabbabb\rbb", |
| 291 |
|
7 }, |
| 292 |
{ "b*", |
{ "b*", |
| 293 |
"bb", |
"bb", |
| 294 |
"aa\r\naa\r\n", |
"aa\r\naa\r\n", |
| 295 |
"bbaa\r\naa\r\n", |
"bbaa\r\naa\r\n", |
| 296 |
"bbabbabb\r\nbbabbabb\r\nbb" }, |
"bbabbabb\r\nbbabbabb\r\nbb", |
| 297 |
|
7 }, |
| 298 |
|
// Check empty-string matching (it's tricky!) |
| 299 |
|
{ "aa|b*", |
| 300 |
|
"@", |
| 301 |
|
"aa", |
| 302 |
|
"@", |
| 303 |
|
"@@", |
| 304 |
|
2 }, |
| 305 |
|
{ "b*|aa", |
| 306 |
|
"@", |
| 307 |
|
"aa", |
| 308 |
|
"@aa", |
| 309 |
|
"@@@", |
| 310 |
|
3 }, |
| 311 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 312 |
{ "b*", |
{ "b*", |
| 313 |
"bb", |
"bb", |
| 314 |
"\xE3\x83\x9B\xE3\x83\xBC\xE3\x83\xA0\xE3\x81\xB8", // utf8 |
"\xE3\x83\x9B\xE3\x83\xBC\xE3\x83\xA0\xE3\x81\xB8", // utf8 |
| 315 |
"bb\xE3\x83\x9B\xE3\x83\xBC\xE3\x83\xA0\xE3\x81\xB8", |
"bb\xE3\x83\x9B\xE3\x83\xBC\xE3\x83\xA0\xE3\x81\xB8", |
| 316 |
"bb\xE3\x83\x9B""bb""\xE3\x83\xBC""bb""\xE3\x83\xA0""bb""\xE3\x81\xB8""bb" }, |
"bb\xE3\x83\x9B""bb""\xE3\x83\xBC""bb""\xE3\x83\xA0""bb""\xE3\x81\xB8""bb", |
| 317 |
|
5 }, |
| 318 |
{ "b*", |
{ "b*", |
| 319 |
"bb", |
"bb", |
| 320 |
"\xE3\x83\x9B\r\n\xE3\x83\xBC\r\xE3\x83\xA0\n\xE3\x81\xB8\r\n", // utf8 |
"\xE3\x83\x9B\r\n\xE3\x83\xBC\r\xE3\x83\xA0\n\xE3\x81\xB8\r\n", // utf8 |
| 321 |
"bb\xE3\x83\x9B\r\n\xE3\x83\xBC\r\xE3\x83\xA0\n\xE3\x81\xB8\r\n", |
"bb\xE3\x83\x9B\r\n\xE3\x83\xBC\r\xE3\x83\xA0\n\xE3\x81\xB8\r\n", |
| 322 |
("bb\xE3\x83\x9B""bb\r\nbb""\xE3\x83\xBC""bb\rbb""\xE3\x83\xA0" |
("bb\xE3\x83\x9B""bb\r\nbb""\xE3\x83\xBC""bb\rbb""\xE3\x83\xA0" |
| 323 |
"bb\nbb""\xE3\x81\xB8""bb\r\nbb") }, |
"bb\nbb""\xE3\x81\xB8""bb\r\nbb"), |
| 324 |
|
9 }, |
| 325 |
#endif |
#endif |
| 326 |
{ "", NULL, NULL, NULL, NULL } |
{ "", NULL, NULL, NULL, NULL, 0 } |
| 327 |
}; |
}; |
| 328 |
|
|
| 329 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
| 339 |
CHECK(re.Replace(t->rewrite, &one)); |
CHECK(re.Replace(t->rewrite, &one)); |
| 340 |
CHECK_EQ(one, t->single); |
CHECK_EQ(one, t->single); |
| 341 |
string all(t->original); |
string all(t->original); |
| 342 |
CHECK(re.GlobalReplace(t->rewrite, &all) > 0); |
const int replace_count = re.GlobalReplace(t->rewrite, &all); |
| 343 |
CHECK_EQ(all, t->global); |
CHECK_EQ(all, t->global); |
| 344 |
|
CHECK_EQ(replace_count, t->global_count); |
| 345 |
} |
} |
| 346 |
|
|
| 347 |
// One final test: test \r\n replacement when we're not in CRLF mode |
// One final test: test \r\n replacement when we're not in CRLF mode |
| 349 |
RE re("b*", RE_Options(PCRE_NEWLINE_CR).set_utf8(support_utf8)); |
RE re("b*", RE_Options(PCRE_NEWLINE_CR).set_utf8(support_utf8)); |
| 350 |
assert(re.error().empty()); |
assert(re.error().empty()); |
| 351 |
string all("aa\r\naa\r\n"); |
string all("aa\r\naa\r\n"); |
| 352 |
CHECK(re.GlobalReplace("bb", &all) > 0); |
CHECK_EQ(re.GlobalReplace("bb", &all), 9); |
| 353 |
CHECK_EQ(all, string("bbabbabb\rbb\nbbabbabb\rbb\nbb")); |
CHECK_EQ(all, string("bbabbabb\rbb\nbbabbabb\rbb\nbb")); |
| 354 |
} |
} |
| 355 |
{ |
{ |
| 356 |
RE re("b*", RE_Options(PCRE_NEWLINE_LF).set_utf8(support_utf8)); |
RE re("b*", RE_Options(PCRE_NEWLINE_LF).set_utf8(support_utf8)); |
| 357 |
assert(re.error().empty()); |
assert(re.error().empty()); |
| 358 |
string all("aa\r\naa\r\n"); |
string all("aa\r\naa\r\n"); |
| 359 |
CHECK(re.GlobalReplace("bb", &all) > 0); |
CHECK_EQ(re.GlobalReplace("bb", &all), 9); |
| 360 |
CHECK_EQ(all, string("bbabbabb\rbb\nbbabbabb\rbb\nbb")); |
CHECK_EQ(all, string("bbabbabb\rbb\nbbabbabb\rbb\nbb")); |
| 361 |
} |
} |
| 362 |
// TODO: test what happens when no PCRE_NEWLINE_* flag is set. |
// TODO: test what happens when no PCRE_NEWLINE_* flag is set. |
| 413 |
} |
} |
| 414 |
|
|
| 415 |
static void TestMatchNumberPeculiarity() { |
static void TestMatchNumberPeculiarity() { |
| 416 |
printf("Testing match-number peculiaraity\n"); |
printf("Testing match-number peculiarity\n"); |
| 417 |
|
|
| 418 |
string word1; |
string word1; |
| 419 |
string word2; |
string word2; |
| 511 |
TestQuoteMeta("((a|b)c?d*e+[f-h]i)"); |
TestQuoteMeta("((a|b)c?d*e+[f-h]i)"); |
| 512 |
TestQuoteMeta("((?!)xxx).*yyy"); |
TestQuoteMeta("((?!)xxx).*yyy"); |
| 513 |
TestQuoteMeta("(["); |
TestQuoteMeta("(["); |
| 514 |
|
TestQuoteMeta(string("foo\0bar", 7)); |
| 515 |
} |
} |
| 516 |
|
|
| 517 |
static void TestQuoteMetaSimpleNegative() { |
static void TestQuoteMetaSimpleNegative() { |
| 833 |
return 0; |
return 0; |
| 834 |
} |
} |
| 835 |
|
|
| 836 |
|
printf("PCRE C++ wrapper tests\n"); |
| 837 |
printf("Testing FullMatch\n"); |
printf("Testing FullMatch\n"); |
| 838 |
|
|
| 839 |
int i; |
int i; |
| 890 |
CHECK_EQ(s, string("ruby")); |
CHECK_EQ(s, string("ruby")); |
| 891 |
CHECK_EQ(i, 1234); |
CHECK_EQ(i, 1234); |
| 892 |
|
|
| 893 |
|
// Ignore non-void* NULL arg |
| 894 |
|
CHECK(RE("he(.*)lo").FullMatch("hello", (char*)NULL)); |
| 895 |
|
CHECK(RE("h(.*)o").FullMatch("hello", (string*)NULL)); |
| 896 |
|
CHECK(RE("h(.*)o").FullMatch("hello", (StringPiece*)NULL)); |
| 897 |
|
CHECK(RE("(.*)").FullMatch("1234", (int*)NULL)); |
| 898 |
|
#ifdef HAVE_LONG_LONG |
| 899 |
|
CHECK(RE("(.*)").FullMatch("1234567890123456", (long long*)NULL)); |
| 900 |
|
#endif |
| 901 |
|
CHECK(RE("(.*)").FullMatch("123.4567890123456", (double*)NULL)); |
| 902 |
|
CHECK(RE("(.*)").FullMatch("123.4567890123456", (float*)NULL)); |
| 903 |
|
|
| 904 |
|
// Fail on non-void* NULL arg if the match doesn't parse for the given type. |
| 905 |
|
CHECK(!RE("h(.*)lo").FullMatch("hello", &s, (char*)NULL)); |
| 906 |
|
CHECK(!RE("(.*)").FullMatch("hello", (int*)NULL)); |
| 907 |
|
CHECK(!RE("(.*)").FullMatch("1234567890123456", (int*)NULL)); |
| 908 |
|
CHECK(!RE("(.*)").FullMatch("hello", (double*)NULL)); |
| 909 |
|
CHECK(!RE("(.*)").FullMatch("hello", (float*)NULL)); |
| 910 |
|
|
| 911 |
// Ignored arg |
// Ignored arg |
| 912 |
CHECK(RE("(\\w+)(:)(\\d+)").FullMatch("ruby:1234", &s, (void*)NULL, &i)); |
CHECK(RE("(\\w+)(:)(\\d+)").FullMatch("ruby:1234", &s, (void*)NULL, &i)); |
| 913 |
CHECK_EQ(s, string("ruby")); |
CHECK_EQ(s, string("ruby")); |
| 959 |
CHECK(!RE("(\\d+)").FullMatch("4294967296", &v)); |
CHECK(!RE("(\\d+)").FullMatch("4294967296", &v)); |
| 960 |
} |
} |
| 961 |
#ifdef HAVE_LONG_LONG |
#ifdef HAVE_LONG_LONG |
| 962 |
|
# if defined(__MINGW__) || defined(__MINGW32__) |
| 963 |
|
# define LLD "%I64d" |
| 964 |
|
# define LLU "%I64u" |
| 965 |
|
# else |
| 966 |
|
# define LLD "%lld" |
| 967 |
|
# define LLU "%llu" |
| 968 |
|
# endif |
| 969 |
{ |
{ |
| 970 |
long long v; |
long long v; |
| 971 |
static const long long max_value = 0x7fffffffffffffffLL; |
static const long long max_value = 0x7fffffffffffffffLL; |
| 972 |
static const long long min_value = -max_value - 1; |
static const long long min_value = -max_value - 1; |
| 973 |
char buf[32]; |
char buf[32]; // definitely big enough for a long long |
| 974 |
|
|
| 975 |
CHECK(RE("(-?\\d+)").FullMatch("100", &v)); CHECK_EQ(v, 100); |
CHECK(RE("(-?\\d+)").FullMatch("100", &v)); CHECK_EQ(v, 100); |
| 976 |
CHECK(RE("(-?\\d+)").FullMatch("-100",&v)); CHECK_EQ(v, -100); |
CHECK(RE("(-?\\d+)").FullMatch("-100",&v)); CHECK_EQ(v, -100); |
| 977 |
|
|
| 978 |
snprintf(buf, sizeof(buf), "%lld", max_value); |
sprintf(buf, LLD, max_value); |
| 979 |
CHECK(RE("(-?\\d+)").FullMatch(buf,&v)); CHECK_EQ(v, max_value); |
CHECK(RE("(-?\\d+)").FullMatch(buf,&v)); CHECK_EQ(v, max_value); |
| 980 |
|
|
| 981 |
snprintf(buf, sizeof(buf), "%lld", min_value); |
sprintf(buf, LLD, min_value); |
| 982 |
CHECK(RE("(-?\\d+)").FullMatch(buf,&v)); CHECK_EQ(v, min_value); |
CHECK(RE("(-?\\d+)").FullMatch(buf,&v)); CHECK_EQ(v, min_value); |
| 983 |
|
|
| 984 |
snprintf(buf, sizeof(buf), "%lld", max_value); |
sprintf(buf, LLD, max_value); |
| 985 |
assert(buf[strlen(buf)-1] != '9'); |
assert(buf[strlen(buf)-1] != '9'); |
| 986 |
buf[strlen(buf)-1]++; |
buf[strlen(buf)-1]++; |
| 987 |
CHECK(!RE("(-?\\d+)").FullMatch(buf, &v)); |
CHECK(!RE("(-?\\d+)").FullMatch(buf, &v)); |
| 988 |
|
|
| 989 |
snprintf(buf, sizeof(buf), "%lld", min_value); |
sprintf(buf, LLD, min_value); |
| 990 |
assert(buf[strlen(buf)-1] != '9'); |
assert(buf[strlen(buf)-1] != '9'); |
| 991 |
buf[strlen(buf)-1]++; |
buf[strlen(buf)-1]++; |
| 992 |
CHECK(!RE("(-?\\d+)").FullMatch(buf, &v)); |
CHECK(!RE("(-?\\d+)").FullMatch(buf, &v)); |
| 997 |
unsigned long long v; |
unsigned long long v; |
| 998 |
long long v2; |
long long v2; |
| 999 |
static const unsigned long long max_value = 0xffffffffffffffffULL; |
static const unsigned long long max_value = 0xffffffffffffffffULL; |
| 1000 |
char buf[32]; |
char buf[32]; // definitely big enough for an unsigned long long |
| 1001 |
|
|
| 1002 |
CHECK(RE("(-?\\d+)").FullMatch("100",&v)); CHECK_EQ(v, 100); |
CHECK(RE("(-?\\d+)").FullMatch("100",&v)); CHECK_EQ(v, 100); |
| 1003 |
CHECK(RE("(-?\\d+)").FullMatch("-100",&v2)); CHECK_EQ(v2, -100); |
CHECK(RE("(-?\\d+)").FullMatch("-100",&v2)); CHECK_EQ(v2, -100); |
| 1004 |
|
|
| 1005 |
snprintf(buf, sizeof(buf), "%llu", max_value); |
sprintf(buf, LLU, max_value); |
| 1006 |
CHECK(RE("(-?\\d+)").FullMatch(buf,&v)); CHECK_EQ(v, max_value); |
CHECK(RE("(-?\\d+)").FullMatch(buf,&v)); CHECK_EQ(v, max_value); |
| 1007 |
|
|
| 1008 |
assert(buf[strlen(buf)-1] != '9'); |
assert(buf[strlen(buf)-1] != '9'); |
| 1183 |
printf("Testing UTF-8 handling\n"); |
printf("Testing UTF-8 handling\n"); |
| 1184 |
|
|
| 1185 |
// Three Japanese characters (nihongo) |
// Three Japanese characters (nihongo) |
| 1186 |
const char utf8_string[] = { |
const unsigned char utf8_string[] = { |
| 1187 |
0xe6, 0x97, 0xa5, // 65e5 |
0xe6, 0x97, 0xa5, // 65e5 |
| 1188 |
0xe6, 0x9c, 0xac, // 672c |
0xe6, 0x9c, 0xac, // 672c |
| 1189 |
0xe8, 0xaa, 0x9e, // 8a9e |
0xe8, 0xaa, 0x9e, // 8a9e |
| 1190 |
0 |
0 |
| 1191 |
}; |
}; |
| 1192 |
const char utf8_pattern[] = { |
const unsigned char utf8_pattern[] = { |
| 1193 |
'.', |
'.', |
| 1194 |
0xe6, 0x9c, 0xac, // 672c |
0xe6, 0x9c, 0xac, // 672c |
| 1195 |
'.', |
'.', |