--- code/trunk/pcretest.c 2007/03/30 15:46:27 141 +++ code/trunk/pcretest.c 2009/09/26 19:12:32 455 @@ -36,6 +36,10 @@ */ +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + #include #include #include @@ -44,6 +48,14 @@ #include #include +#ifdef SUPPORT_LIBREADLINE +#ifdef HAVE_UNISTD_H +#include +#endif +#include +#include +#endif + /* A number of things vary for Windows builds. Originally, pcretest opened its input and output without "b"; then I was told that "b" was needed in some @@ -59,6 +71,14 @@ #define INPUT_MODE "r" #define OUTPUT_MODE "wb" +#ifndef isatty +#define isatty _isatty /* This is what Windows calls them, I'm told, */ +#endif /* though in some environments they seem to */ + /* be already defined, hence the #ifndefs. */ +#ifndef fileno +#define fileno _fileno +#endif + #else #include /* These two includes are needed */ #include /* for setrlimit(). */ @@ -67,19 +87,23 @@ #endif -#define PCRE_SPY /* For Win32 build, import data, not export */ - -/* We include pcre_internal.h because we need the internal info for displaying -the results of pcre_study() and we also need to know about the internal -macros, structures, and other internal data values; pcretest has "inside -information" compared to a program that strictly follows the PCRE API. */ +/* We have to include pcre_internal.h because we need the internal info for +displaying the results of pcre_study() and we also need to know about the +internal macros, structures, and other internal data values; pcretest has +"inside information" compared to a program that strictly follows the PCRE API. + +Although pcre_internal.h does itself include pcre.h, we explicitly include it +here before pcre_internal.h so that the PCRE_EXP_xxx macros get set +appropriately for an application, not for building PCRE. */ +#include "pcre.h" #include "pcre_internal.h" -/* We need access to the data tables that PCRE uses. So as not to have to keep -two copies, we include the source file here, changing the names of the external -symbols to prevent clashes. */ +/* We need access to some of the data tables that PCRE uses. So as not to have +to keep two copies, we include the source file here, changing the names of the +external symbols to prevent clashes. */ +#define _pcre_ucp_gentype ucp_gentype #define _pcre_utf8_table1 utf8_table1 #define _pcre_utf8_table1_size utf8_table1_size #define _pcre_utf8_table2 utf8_table2 @@ -87,6 +111,7 @@ #define _pcre_utf8_table4 utf8_table4 #define _pcre_utt utt #define _pcre_utt_size utt_size +#define _pcre_utt_names utt_names #define _pcre_OP_lengths OP_lengths #include "pcre_tables.c" @@ -149,6 +174,7 @@ static int callout_extra; static int callout_fail_count; static int callout_fail_id; +static int debug_lengths; static int first_callout; static int locale_set = 0; static int show_malloc; @@ -180,6 +206,7 @@ Arguments: f the file to read start where in buffer to start (this *must* be within buffer) + prompt for stdin or readline() Returns: pointer to the start of new data could be a copy of start, or could be moved @@ -187,7 +214,7 @@ */ static uschar * -extend_inputline(FILE *f, uschar *start) +extend_inputline(FILE *f, uschar *start, const char *prompt) { uschar *here = start; @@ -198,8 +225,36 @@ if (rlen > 1000) { int dlen; - if (fgets((char *)here, rlen, f) == NULL) - return (here == start)? NULL : start; + + /* If libreadline support is required, use readline() to read a line if the + input is a terminal. Note that readline() removes the trailing newline, so + we must put it back again, to be compatible with fgets(). */ + +#ifdef SUPPORT_LIBREADLINE + if (isatty(fileno(f))) + { + size_t len; + char *s = readline(prompt); + if (s == NULL) return (here == start)? NULL : start; + len = strlen(s); + if (len > 0) add_history(s); + if (len > rlen - 1) len = rlen - 1; + memcpy(here, s, len); + here[len] = '\n'; + here[len+1] = 0; + free(s); + } + else +#endif + + /* Read the next line by normal means, prompting if the file is stdin. */ + + { + if (f == stdin) printf(prompt); + if (fgets((char *)here, rlen, f) == NULL) + return (here == start)? NULL : start; + } + dlen = (int)strlen((char *)here); if (dlen > 0 && here[dlen - 1] == '\n') return start; here += dlen; @@ -656,11 +711,38 @@ /************************************************* +* Case-independent strncmp() function * +*************************************************/ + +/* +Arguments: + s first string + t second string + n number of characters to compare + +Returns: < 0, = 0, or > 0, according to the comparison +*/ + +static int +strncmpic(uschar *s, uschar *t, int n) +{ +while (n--) + { + int c = tolower(*s++) - tolower(*t++); + if (c) return c; + } +return 0; +} + + + +/************************************************* * Check newline indicator * *************************************************/ /* This is used both at compile and run-time to check for escapes, where -xxx is LF, CR, CRLF, or ANY. Print a message and return 0 if there is no match. +xxx is LF, CR, CRLF, ANYCRLF, or ANY. Print a message and return 0 if there is +no match. Arguments: p points after the leading '<' @@ -672,10 +754,13 @@ static int check_newline(uschar *p, FILE *f) { -if (strncmp((char *)p, "cr>", 3) == 0) return PCRE_NEWLINE_CR; -if (strncmp((char *)p, "lf>", 3) == 0) return PCRE_NEWLINE_LF; -if (strncmp((char *)p, "crlf>", 5) == 0) return PCRE_NEWLINE_CRLF; -if (strncmp((char *)p, "any>", 4) == 0) return PCRE_NEWLINE_ANY; +if (strncmpic(p, (uschar *)"cr>", 3) == 0) return PCRE_NEWLINE_CR; +if (strncmpic(p, (uschar *)"lf>", 3) == 0) return PCRE_NEWLINE_LF; +if (strncmpic(p, (uschar *)"crlf>", 5) == 0) return PCRE_NEWLINE_CRLF; +if (strncmpic(p, (uschar *)"anycrlf>", 8) == 0) return PCRE_NEWLINE_ANYCRLF; +if (strncmpic(p, (uschar *)"any>", 4) == 0) return PCRE_NEWLINE_ANY; +if (strncmpic(p, (uschar *)"bsr_anycrlf>", 12) == 0) return PCRE_BSR_ANYCRLF; +if (strncmpic(p, (uschar *)"bsr_unicode>", 12) == 0) return PCRE_BSR_UNICODE; fprintf(f, "Unknown newline type at: <%s\n", p); return 0; } @@ -689,7 +774,14 @@ static void usage(void) { -printf("Usage: pcretest [options] [ []]\n"); +printf("Usage: pcretest [options] [ []]\n\n"); +printf("Input and output default to stdin and stdout.\n"); +#ifdef SUPPORT_LIBREADLINE +printf("If input is a terminal, readline() is used to read from it.\n"); +#else +printf("This version of pcretest is not linked with readline().\n"); +#endif +printf("\nOptions:\n"); printf(" -b show compiled code (bytecode)\n"); printf(" -C show PCRE compile-time options and exit\n"); printf(" -d debug: show compiled code and information (-b and -i)\n"); @@ -698,6 +790,7 @@ #endif printf(" -help show usage information\n"); printf(" -i show information about compiled patterns\n" + " -M find MATCH_LIMIT minimum for each subject\n" " -m output memory used information\n" " -o set size of offsets vector to \n"); #if !defined NOPOSIX @@ -727,6 +820,7 @@ FILE *infile = stdin; int options = 0; int study_options = 0; +int default_find_match_limit = FALSE; int op = 1; int timeit = 0; int timeitm = 0; @@ -786,6 +880,7 @@ else if (strcmp(argv[op], "-b") == 0) debug = 1; else if (strcmp(argv[op], "-i") == 0) showinfo = 1; else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1; + else if (strcmp(argv[op], "-M") == 0) default_find_match_limit = TRUE; #if !defined NODFA else if (strcmp(argv[op], "-dfa") == 0) all_use_dfa = 1; #endif @@ -838,6 +933,7 @@ else if (strcmp(argv[op], "-C") == 0) { int rc; + unsigned long int lrc; printf("PCRE version %s\n", pcre_version()); printf("Compiled with\n"); (void)pcre_config(PCRE_CONFIG_UTF8, &rc); @@ -845,17 +941,23 @@ (void)pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &rc); printf(" %sUnicode properties support\n", rc? "" : "No "); (void)pcre_config(PCRE_CONFIG_NEWLINE, &rc); - printf(" Newline sequence is %s\n", (rc == '\r')? "CR" : - (rc == '\n')? "LF" : (rc == ('\r'<<8 | '\n'))? "CRLF" : + /* Note that these values are always the ASCII values, even + in EBCDIC environments. CR is 13 and NL is 10. */ + printf(" Newline sequence is %s\n", (rc == 13)? "CR" : + (rc == 10)? "LF" : (rc == (13<<8 | 10))? "CRLF" : + (rc == -2)? "ANYCRLF" : (rc == -1)? "ANY" : "???"); + (void)pcre_config(PCRE_CONFIG_BSR, &rc); + printf(" \\R matches %s\n", rc? "CR, LF, or CRLF only" : + "all Unicode newlines"); (void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc); printf(" Internal link size = %d\n", rc); (void)pcre_config(PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, &rc); printf(" POSIX malloc threshold = %d\n", rc); - (void)pcre_config(PCRE_CONFIG_MATCH_LIMIT, &rc); - printf(" Default match limit = %d\n", rc); - (void)pcre_config(PCRE_CONFIG_MATCH_LIMIT_RECURSION, &rc); - printf(" Default recursion depth limit = %d\n", rc); + (void)pcre_config(PCRE_CONFIG_MATCH_LIMIT, &lrc); + printf(" Default match limit = %ld\n", lrc); + (void)pcre_config(PCRE_CONFIG_MATCH_LIMIT_RECURSION, &lrc); + printf(" Default recursion depth limit = %ld\n", lrc); (void)pcre_config(PCRE_CONFIG_STACKRECURSE, &rc); printf(" Match recursion uses %s\n", rc? "stack" : "heap"); goto EXIT; @@ -884,7 +986,7 @@ if (offsets == NULL) { printf("** Failed to get %d bytes of memory for offsets vector\n", - size_offsets_max * sizeof(int)); + (int)(size_offsets_max * sizeof(int))); yield = 1; goto EXIT; } @@ -944,7 +1046,6 @@ size_t size, regex_gotten_store; int do_study = 0; int do_debug = debug; - int debug_lengths = 1; int do_G = 0; int do_g = 0; int do_showinfo = showinfo; @@ -953,9 +1054,9 @@ int erroroffset, len, delimiter, poffset; use_utf8 = 0; + debug_lengths = 1; - if (infile == stdin) printf(" re> "); - if (extend_inputline(infile, buffer) == NULL) break; + if (extend_inputline(infile, buffer, " re> ") == NULL) break; if (infile != stdin) fprintf(outfile, "%s", (char *)buffer); fflush(outfile); @@ -1055,7 +1156,7 @@ if (isalnum(delimiter) || delimiter == '\\') { - fprintf(outfile, "** Delimiter must not be alphameric or \\\n"); + fprintf(outfile, "** Delimiter must not be alphanumeric or \\\n"); goto SKIP_DATA; } @@ -1071,8 +1172,7 @@ pp++; } if (*pp != 0) break; - if (infile == stdin) printf(" > "); - if ((pp = extend_inputline(infile, pp)) == NULL) + if ((pp = extend_inputline(infile, pp, " > ")) == NULL) { fprintf(outfile, "** Unexpected EOF\n"); done = 1; @@ -1164,10 +1264,18 @@ case '<': { - int x = check_newline(pp, outfile); - if (x == 0) goto SKIP_DATA; - options |= x; - while (*pp++ != '>'); + if (strncmp((char *)pp, "JS>", 3) == 0) + { + options |= PCRE_JAVASCRIPT_COMPAT; + pp += 3; + } + else + { + int x = check_newline(pp, outfile); + if (x == 0) goto SKIP_DATA; + options |= x; + while (*pp++ != '>'); + } } break; @@ -1197,6 +1305,7 @@ if ((options & PCRE_DOTALL) != 0) cflags |= REG_DOTALL; if ((options & PCRE_NO_AUTO_CAPTURE) != 0) cflags |= REG_NOSUB; if ((options & PCRE_UTF8) != 0) cflags |= REG_UTF8; + if ((options & PCRE_UNGREEDY) != 0) cflags |= REG_UNGREEDY; rc = regcomp(&preg, (char *)p, cflags); @@ -1217,6 +1326,8 @@ #endif /* !defined NOPOSIX */ { + unsigned long int get_options; + if (timeit > 0) { register int i; @@ -1246,7 +1357,7 @@ { for (;;) { - if (extend_inputline(infile, buffer) == NULL) + if (extend_inputline(infile, buffer, NULL) == NULL) { done = 1; goto CONTINUE; @@ -1260,9 +1371,16 @@ goto CONTINUE; } - /* Compilation succeeded; print data if required. There are now two - info-returning functions. The old one has a limited interface and - returns only limited data. Check that it agrees with the newer one. */ + /* Compilation succeeded. It is now possible to set the UTF-8 option from + within the regex; check for this so that we know how to process the data + lines. */ + + new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options); + if ((get_options & PCRE_UTF8) != 0) use_utf8 = 1; + + /* Print information if required. There are now two info-returning + functions. The old one has a limited interface and returns only limited + data. Check that it agrees with the newer one. */ if (log_store) fprintf(outfile, "Memory allocation (code space): %d\n", @@ -1309,24 +1427,32 @@ if (do_flip) { real_pcre *rre = (real_pcre *)re; - rre->magic_number = byteflip(rre->magic_number, sizeof(rre->magic_number)); + rre->magic_number = + byteflip(rre->magic_number, sizeof(rre->magic_number)); rre->size = byteflip(rre->size, sizeof(rre->size)); rre->options = byteflip(rre->options, sizeof(rre->options)); - rre->top_bracket = byteflip(rre->top_bracket, sizeof(rre->top_bracket)); - rre->top_backref = byteflip(rre->top_backref, sizeof(rre->top_backref)); - rre->first_byte = byteflip(rre->first_byte, sizeof(rre->first_byte)); - rre->req_byte = byteflip(rre->req_byte, sizeof(rre->req_byte)); - rre->name_table_offset = byteflip(rre->name_table_offset, + rre->flags = (pcre_uint16)byteflip(rre->flags, sizeof(rre->flags)); + rre->top_bracket = + (pcre_uint16)byteflip(rre->top_bracket, sizeof(rre->top_bracket)); + rre->top_backref = + (pcre_uint16)byteflip(rre->top_backref, sizeof(rre->top_backref)); + rre->first_byte = + (pcre_uint16)byteflip(rre->first_byte, sizeof(rre->first_byte)); + rre->req_byte = + (pcre_uint16)byteflip(rre->req_byte, sizeof(rre->req_byte)); + rre->name_table_offset = (pcre_uint16)byteflip(rre->name_table_offset, sizeof(rre->name_table_offset)); - rre->name_entry_size = byteflip(rre->name_entry_size, + rre->name_entry_size = (pcre_uint16)byteflip(rre->name_entry_size, sizeof(rre->name_entry_size)); - rre->name_count = byteflip(rre->name_count, sizeof(rre->name_count)); + rre->name_count = (pcre_uint16)byteflip(rre->name_count, + sizeof(rre->name_count)); if (extra != NULL) { pcre_study_data *rsd = (pcre_study_data *)(extra->study_data); rsd->size = byteflip(rsd->size, sizeof(rsd->size)); - rsd->options = byteflip(rsd->options, sizeof(rsd->options)); + rsd->flags = byteflip(rsd->flags, sizeof(rsd->flags)); + rsd->minlength = byteflip(rsd->minlength, sizeof(rsd->minlength)); } } @@ -1340,17 +1466,19 @@ pcre_printint(re, outfile, debug_lengths); } + /* We already have the options in get_options (see above) */ + if (do_showinfo) { - unsigned long int get_options, all_options; + unsigned long int all_options; #if !defined NOINFOCHECK int old_first_char, old_options, old_count; #endif - int count, backrefmax, first_char, need_char; + int count, backrefmax, first_char, need_char, okpartial, jchanged, + hascrorlf; int nameentrysize, namecount; const uschar *nametable; - new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options); new_info(re, NULL, PCRE_INFO_SIZE, &size); new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count); new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax); @@ -1359,6 +1487,9 @@ new_info(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize); new_info(re, NULL, PCRE_INFO_NAMECOUNT, &namecount); new_info(re, NULL, PCRE_INFO_NAMETABLE, (void *)&nametable); + new_info(re, NULL, PCRE_INFO_OKPARTIAL, &okpartial); + new_info(re, NULL, PCRE_INFO_JCHANGED, &jchanged); + new_info(re, NULL, PCRE_INFO_HASCRORLF, &hascrorlf); #if !defined NOINFOCHECK old_count = pcre_info(re, &old_options, &old_first_char); @@ -1400,26 +1531,22 @@ } } - /* The NOPARTIAL bit is a private bit in the options, so we have - to fish it out via out back door */ + if (!okpartial) fprintf(outfile, "Partial matching not supported\n"); + if (hascrorlf) fprintf(outfile, "Contains explicit CR or LF match\n"); all_options = ((real_pcre *)re)->options; - if (do_flip) - { - all_options = byteflip(all_options, sizeof(all_options)); - } - - if ((all_options & PCRE_NOPARTIAL) != 0) - fprintf(outfile, "Partial matching not supported\n"); + if (do_flip) all_options = byteflip(all_options, sizeof(all_options)); if (get_options == 0) fprintf(outfile, "No options\n"); - else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", ((get_options & PCRE_ANCHORED) != 0)? " anchored" : "", ((get_options & PCRE_CASELESS) != 0)? " caseless" : "", ((get_options & PCRE_EXTENDED) != 0)? " extended" : "", ((get_options & PCRE_MULTILINE) != 0)? " multiline" : "", ((get_options & PCRE_FIRSTLINE) != 0)? " firstline" : "", ((get_options & PCRE_DOTALL) != 0)? " dotall" : "", + ((get_options & PCRE_BSR_ANYCRLF) != 0)? " bsr_anycrlf" : "", + ((get_options & PCRE_BSR_UNICODE) != 0)? " bsr_unicode" : "", ((get_options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "", ((get_options & PCRE_EXTRA) != 0)? " extra" : "", ((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "", @@ -1428,6 +1555,8 @@ ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "", ((get_options & PCRE_DUPNAMES) != 0)? " dupnames" : ""); + if (jchanged) fprintf(outfile, "Duplicate name status changes\n"); + switch (get_options & PCRE_NEWLINE_BITS) { case PCRE_NEWLINE_CR: @@ -1442,6 +1571,10 @@ fprintf(outfile, "Forced newline sequence: CRLF\n"); break; + case PCRE_NEWLINE_ANYCRLF: + fprintf(outfile, "Forced newline sequence: ANYCRLF\n"); + break; + case PCRE_NEWLINE_ANY: fprintf(outfile, "Forced newline sequence: ANY\n"); break; @@ -1496,10 +1629,14 @@ else { uschar *start_bits = NULL; + int minlength; + + new_info(re, extra, PCRE_INFO_MINLENGTH, &minlength); + fprintf(outfile, "Subject length lower bound = %d\n", minlength); + new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits); - if (start_bits == NULL) - fprintf(outfile, "No starting byte set\n"); + fprintf(outfile, "No set of starting bytes\n"); else { int i; @@ -1546,15 +1683,15 @@ else { uschar sbuf[8]; - sbuf[0] = (true_size >> 24) & 255; - sbuf[1] = (true_size >> 16) & 255; - sbuf[2] = (true_size >> 8) & 255; - sbuf[3] = (true_size) & 255; - - sbuf[4] = (true_study_size >> 24) & 255; - sbuf[5] = (true_study_size >> 16) & 255; - sbuf[6] = (true_study_size >> 8) & 255; - sbuf[7] = (true_study_size) & 255; + sbuf[0] = (uschar)((true_size >> 24) & 255); + sbuf[1] = (uschar)((true_size >> 16) & 255); + sbuf[2] = (uschar)((true_size >> 8) & 255); + sbuf[3] = (uschar)((true_size) & 255); + + sbuf[4] = (uschar)((true_study_size >> 24) & 255); + sbuf[5] = (uschar)((true_study_size >> 16) & 255); + sbuf[6] = (uschar)((true_study_size >> 8) & 255); + sbuf[7] = (uschar)((true_study_size) & 255); if (fwrite(sbuf, 1, 8, f) < 8 || fwrite(re, 1, true_size, f) < true_size) @@ -1591,14 +1728,14 @@ for (;;) { uschar *q; - uschar *bptr = dbuffer; + uschar *bptr; int *use_offsets = offsets; int use_size_offsets = size_offsets; int callout_data = 0; int callout_data_set = 0; int count, c; int copystrings = 0; - int find_match_limit = 0; + int find_match_limit = default_find_match_limit; int getstrings = 0; int getlist = 0; int gmatched = 0; @@ -1628,8 +1765,7 @@ len = 0; for (;;) { - if (infile == stdin) printf("data> "); - if (extend_inputline(infile, buffer + len) == NULL) + if (extend_inputline(infile, buffer + len, "data> ") == NULL) { if (len > 0) break; done = 1; @@ -1647,7 +1783,7 @@ p = buffer; while (isspace(*p)) p++; - q = dbuffer; + bptr = q = dbuffer; while ((c = *p++) != 0) { int i = 0; @@ -1697,9 +1833,19 @@ { unsigned char buff8[8]; int ii, utn; - utn = ord2utf8(c, buff8); - for (ii = 0; ii < utn - 1; ii++) *q++ = buff8[ii]; - c = buff8[ii]; /* Last byte */ + if (use_utf8) + { + utn = ord2utf8(c, buff8); + for (ii = 0; ii < utn - 1; ii++) *q++ = buff8[ii]; + c = buff8[ii]; /* Last byte */ + } + else + { + if (c > 255) + fprintf(outfile, "** Character \\x{%x} is greater than 255 and " + "UTF-8 mode is not enabled.\n" + "** Truncation will probably give the wrong result.\n", c); + } p = pt + 1; break; } @@ -1829,7 +1975,10 @@ continue; case 'N': - options |= PCRE_NOTEMPTY; + if ((options & PCRE_NOTEMPTY) != 0) + options = (options & ~PCRE_NOTEMPTY) | PCRE_NOTEMPTY_ATSTART; + else + options |= PCRE_NOTEMPTY; continue; case 'O': @@ -1842,7 +1991,7 @@ if (offsets == NULL) { printf("** Failed to get %d bytes of memory for offsets vector\n", - size_offsets_max * sizeof(int)); + (int)(size_offsets_max * sizeof(int))); yield = 1; goto EXIT; } @@ -1852,7 +2001,8 @@ continue; case 'P': - options |= PCRE_PARTIAL; + options |= ((options & PCRE_PARTIAL_SOFT) == 0)? + PCRE_PARTIAL_SOFT : PCRE_PARTIAL_HARD; continue; case 'Q': @@ -1887,6 +2037,10 @@ show_malloc = 1; continue; + case 'Y': + options |= PCRE_NO_START_OPTIMIZE; + continue; + case 'Z': options |= PCRE_NOTEOL; continue; @@ -1909,6 +2063,23 @@ *q = 0; len = q - dbuffer; + /* Move the data to the end of the buffer so that a read over the end of + the buffer will be seen by valgrind, even if it doesn't cause a crash. If + we are using the POSIX interface, we must include the terminating zero. */ + +#if !defined NOPOSIX + if (posix || do_posix) + { + memmove(bptr + buffer_size - len - 1, bptr, len + 1); + bptr += buffer_size - len - 1; + } + else +#endif + { + memmove(bptr + buffer_size - len, bptr, len); + bptr += buffer_size - len; + } + if ((all_use_dfa || use_dfa) && find_match_limit) { printf("**Match limit not relevant for DFA matching: ignored\n"); @@ -1928,6 +2099,7 @@ pmatch = (regmatch_t *)malloc(sizeof(regmatch_t) * use_size_offsets); if ((options & PCRE_NOTBOL) != 0) eflags |= REG_NOTBOL; if ((options & PCRE_NOTEOL) != 0) eflags |= REG_NOTEOL; + if ((options & PCRE_NOTEMPTY) != 0) eflags |= REG_NOTEMPTY; rc = regexec(&preg, (const char *)bptr, use_size_offsets, pmatch, eflags); @@ -1972,7 +2144,6 @@ for (;; gmatched++) /* Loop for /g or /G */ { - int gany_fudge; if (timeitm > 0) { register int i; @@ -1984,7 +2155,7 @@ { int workspace[1000]; for (i = 0; i < timeitm; i++) - count = pcre_dfa_exec(re, NULL, (char *)bptr, len, start_offset, + count = pcre_dfa_exec(re, extra, (char *)bptr, len, start_offset, options | g_notempty, use_offsets, use_size_offsets, workspace, sizeof(workspace)/sizeof(int)); } @@ -2047,7 +2218,7 @@ else if (all_use_dfa || use_dfa) { int workspace[1000]; - count = pcre_dfa_exec(re, NULL, (char *)bptr, len, start_offset, + count = pcre_dfa_exec(re, extra, (char *)bptr, len, start_offset, options | g_notempty, use_offsets, use_size_offsets, workspace, sizeof(workspace)/sizeof(int)); if (count == 0) @@ -2202,29 +2373,57 @@ else if (count == PCRE_ERROR_PARTIAL) { fprintf(outfile, "Partial match"); -#if !defined NODFA - if ((all_use_dfa || use_dfa) && use_size_offsets > 2) - fprintf(outfile, ": %.*s", use_offsets[1] - use_offsets[0], - bptr + use_offsets[0]); -#endif + if (use_size_offsets > 1) + { + fprintf(outfile, ": "); + pchars(bptr + use_offsets[0], use_offsets[1] - use_offsets[0], + outfile); + } fprintf(outfile, "\n"); break; /* Out of the /g loop */ } /* Failed to match. If this is a /g or /G loop and we previously set - g_notempty after a null match, this is not necessarily the end. - We want to advance the start offset, and continue. In the case of UTF-8 - matching, the advance must be one character, not one byte. Fudge the - offset values to achieve this. We won't be at the end of the string - - that was checked before setting g_notempty. */ + g_notempty after a null match, this is not necessarily the end. We want + to advance the start offset, and continue. We won't be at the end of the + string - that was checked before setting g_notempty. + + Complication arises in the case when the newline option is "any" or + "anycrlf". If the previous match was at the end of a line terminated by + CRLF, an advance of one character just passes the \r, whereas we should + prefer the longer newline sequence, as does the code in pcre_exec(). + Fudge the offset value to achieve this. + + Otherwise, in the case of UTF-8 matching, the advance must be one + character, not one byte. */ else { if (g_notempty != 0) { int onechar = 1; + unsigned int obits = ((real_pcre *)re)->options; use_offsets[0] = start_offset; - if (use_utf8) + if ((obits & PCRE_NEWLINE_BITS) == 0) + { + int d; + (void)pcre_config(PCRE_CONFIG_NEWLINE, &d); + /* Note that these values are always the ASCII ones, even in + EBCDIC environments. CR = 13, NL = 10. */ + obits = (d == 13)? PCRE_NEWLINE_CR : + (d == 10)? PCRE_NEWLINE_LF : + (d == (13<<8 | 10))? PCRE_NEWLINE_CRLF : + (d == -2)? PCRE_NEWLINE_ANYCRLF : + (d == -1)? PCRE_NEWLINE_ANY : 0; + } + if (((obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANY || + (obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANYCRLF) + && + start_offset < len - 1 && + bptr[start_offset] == '\r' && + bptr[start_offset+1] == '\n') + onechar++; + else if (use_utf8) { while (start_offset + onechar < len) { @@ -2252,43 +2451,30 @@ if (!do_g && !do_G) break; /* If we have matched an empty string, first check to see if we are at - the end of the subject. If so, the /g loop is over. Otherwise, mimic - what Perl's /g options does. This turns out to be rather cunning. First - we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the + the end of the subject. If so, the /g loop is over. Otherwise, mimic what + Perl's /g options does. This turns out to be rather cunning. First we set + PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED and try the match again at the same point. If this fails (picked up above) we advance to the next - character. - - Yet more complication arises in the case when the newline option is - "any" and a pattern in multiline mode has to match at the start of a - line. If a previous match was at the end of a line, and advance of one - character just passes the \r, whereas we should prefer the longer newline - sequence, as does the code in pcre_exec(). So we fudge it. */ + character. */ g_notempty = 0; - gany_fudge = 0; - + if (use_offsets[0] == use_offsets[1]) { if (use_offsets[0] == len) break; - g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED; - if ((((real_pcre *)re)->options & PCRE_STARTLINE) != 0 && - (((real_pcre *)re)->options & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANY && - use_offsets[0] < len - 1 && - bptr[use_offsets[0]] == '\r' && - bptr[use_offsets[0]+1] == '\n') - gany_fudge = 1; + g_notempty = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED; } /* For /g, update the start offset, leaving the rest alone */ - if (do_g) start_offset = use_offsets[1] + gany_fudge; + if (do_g) start_offset = use_offsets[1]; /* For /G, update the pointer and length */ else { - bptr += use_offsets[1] + gany_fudge; - len -= use_offsets[1] + gany_fudge; + bptr += use_offsets[1]; + len -= use_offsets[1]; } } /* End of loop for /g and /G */