--- code/trunk/pcregrep.c 2007/03/05 12:36:47 97 +++ code/trunk/pcregrep.c 2007/09/11 12:57:06 236 @@ -6,7 +6,7 @@ its pattern matching. On a Unix or Win32 system it can recurse into directories. - Copyright (c) 1997-2006 University of Cambridge + Copyright (c) 1997-2007 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -38,7 +38,7 @@ */ #ifdef HAVE_CONFIG_H -# include <config.h> +#include "config.h" #endif #include @@ -50,7 +50,10 @@ #include #include + +#ifdef HAVE_UNISTD_H #include <unistd.h> +#endif #include "pcre.h" @@ -86,7 +89,7 @@ /* Line ending types */ -enum { EL_LF, EL_CR, EL_CRLF, EL_ANY }; +enum { EL_LF, EL_CR, EL_CRLF, EL_ANY, EL_ANYCRLF }; @@ -119,8 +122,8 @@ static const unsigned char *pcretables = NULL; static int pattern_count = 0; -static pcre **pattern_list; -static pcre_extra **hints_list; +static pcre **pattern_list = NULL; +static pcre_extra **hints_list = NULL; static char *include_pattern = NULL; static char *exclude_pattern = NULL; @@ -194,7 +197,7 @@ { OP_STRING, N_LABEL, &stdin_name, "label=name", "set name for standard input" }, { OP_STRING, N_LOCALE, &locale, "locale=locale", "use the named locale" }, { OP_NODATA, 'M', NULL, "multiline", "run in multiline mode" }, - { OP_STRING, 'N', &newline, "newline=type", "specify newline type (CR, LR, CRLF)" }, + { OP_STRING, 'N', &newline, "newline=type", "specify newline type (CR, LF, CRLF, ANYCRLF or ANY)" }, { OP_NODATA, 'n', NULL, "line-number", "print line number with output lines" }, { OP_NODATA, 'o', NULL, "only-matching", "show only the part of the line that matched" }, { OP_NODATA, 'q', NULL, "quiet", "suppress output, just set return code" }, @@ -224,7 +227,7 @@ static const char *suffix[] = { "", "\\b", ")$", ")$", "\\E", "\\E\\b", "\\E)$", "\\E)$" }; -/* UTF-8 tables - used only when the newline setting is "all". */ +/* UTF-8 tables - used only when the newline setting is "any".
*/ const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; @@ -278,7 +281,7 @@ if (strcmp(dent->d_name, ".") != 0 && strcmp(dent->d_name, "..") != 0) return dent->d_name; } -return NULL; /* Keep compiler happy; never executed */ +/* Control never reaches here */ } static void @@ -463,7 +466,7 @@ -#if ! HAVE_STRERROR +#ifndef HAVE_STRERROR /************************************************* * Provide strerror() for non-ANSI libraries * *************************************************/ @@ -543,6 +546,50 @@ } break; + case EL_ANYCRLF: + while (p < endptr) + { + int extra = 0; + register int c = *((unsigned char *)p); + + if (utf8 && c >= 0xc0) + { + int gcii, gcss; + extra = utf8_table4[c & 0x3f]; /* Number of additional bytes */ + gcss = 6*extra; + c = (c & utf8_table3[extra]) << gcss; + for (gcii = 1; gcii <= extra; gcii++) + { + gcss -= 6; + c |= (p[gcii] & 0x3f) << gcss; + } + } + + p += 1 + extra; + + switch (c) + { + case 0x0a: /* LF */ + *lenptr = 1; + return p; + + case 0x0d: /* CR */ + if (p < endptr && *p == 0x0a) + { + *lenptr = 2; + p++; + } + else *lenptr = 1; + return p; + + default: + break; + } + } /* End of loop for ANYCRLF case */ + + *lenptr = 0; /* Must have hit the end */ + return endptr; + case EL_ANY: while (p < endptr) { @@ -641,6 +688,7 @@ return p; /* But control should never get here */ case EL_ANY: + case EL_ANYCRLF: if (*(--p) == '\n' && p > startptr && p[-1] == '\r') p--; if (utf8) while ((*p & 0xc0) == 0x80) p--; @@ -669,7 +717,17 @@ } else c = *((unsigned char *)pp); - switch (c) + if (endlinetype == EL_ANYCRLF) switch (c) + { + case 0x0a: /* LF */ + case 0x0d: /* CR */ + return p; + + default: + break; + } + + else switch (c) { case 0x0a: /* LF */ case 0x0b: /* VT */ @@ -798,7 +856,7 @@ t = end_of_line(t, endptr, &endlinelength); linelength = t - ptr - endlinelength; - length = multiline? endptr - ptr : linelength; + length = multiline? (size_t)(endptr - ptr) : linelength; /* Extra processing for Jeffrey Friedl's debugging. 
*/ @@ -1006,18 +1064,23 @@ /* In multiline mode, we want to print to the end of the line in which the end of the matched string is found, so we adjust linelength and the - line number appropriately. Because the PCRE_FIRSTLINE option is set, the - start of the match will always be before the first newline sequence. */ + line number appropriately, but only when there actually was a match + (invert not set). Because the PCRE_FIRSTLINE option is set, the start of + the match will always be before the first newline sequence. */ if (multiline) { int ellength; - char *endmatch = ptr + offsets[1]; - t = ptr; - while (t < endmatch) + char *endmatch = ptr; + if (!invert) { - t = end_of_line(t, endptr, &ellength); - if (t <= endmatch) linenumber++; else break; + endmatch += offsets[1]; + t = ptr; + while (t < endmatch) + { + t = end_of_line(t, endptr, &ellength); + if (t <= endmatch) linenumber++; else break; + } } endmatch = end_of_line(endmatch, endptr, &ellength); linelength = endmatch - ptr - ellength; @@ -1066,6 +1129,24 @@ lastmatchnumber = linenumber + 1; } + /* For a match in multiline inverted mode (which of course did not cause + anything to be printed), we have to move on to the end of the match before + proceeding. */ + + if (multiline && invert && match) + { + int ellength; + char *endmatch = ptr + offsets[1]; + t = ptr; + while (t < endmatch) + { + t = end_of_line(t, endptr, &ellength); + if (t <= endmatch) linenumber++; else break; + } + endmatch = end_of_line(endmatch, endptr, &ellength); + linelength = endmatch - ptr - ellength; + } + /* Advance to after the newline and increment the line number. 
*/ ptr += linelength + endlinelength; @@ -1406,7 +1487,11 @@ suffix[process_options]); pattern_list[pattern_count] = pcre_compile(buffer, options, &error, &errptr, pcretables); -if (pattern_list[pattern_count++] != NULL) return TRUE; +if (pattern_list[pattern_count] != NULL) + { + pattern_count++; + return TRUE; + } /* Handle compile errors */ @@ -1464,7 +1549,7 @@ char *p = end_of_line(pattern, eop, &ellength); if (ellength == 0) return compile_single_pattern(pattern, options, filename, count); - sprintf(buffer, "%.*s", p - pattern - ellength, pattern); + sprintf(buffer, "%.*s", (int)(p - pattern - ellength), pattern); pattern = p; if (!compile_single_pattern(buffer, options, filename, count)) return FALSE; @@ -1488,6 +1573,7 @@ int rc = 1; int pcre_options = 0; int cmd_pattern_count = 0; +int hint_count = 0; int errptr; BOOL only_one_at_top; char *patterns[MAX_PATTERN_COUNT]; @@ -1505,6 +1591,7 @@ case '\r': newline = (char *)"cr"; break; case ('\r' << 8) | '\n': newline = (char *)"crlf"; break; case -1: newline = (char *)"any"; break; + case -2: newline = (char *)"anycrlf"; break; } /* Process the options */ @@ -1562,7 +1649,7 @@ else /* Special case xxx=data */ { int oplen = equals - op->long_name; - int arglen = (argequals == NULL)? strlen(arg) : argequals - arg; + int arglen = (argequals == NULL)? 
(int)strlen(arg) : argequals - arg; if (oplen == arglen && strncmp(arg, op->long_name, oplen) == 0) { option_data = arg + arglen; @@ -1581,8 +1668,8 @@ char buff2[24]; int baselen = opbra - op->long_name; sprintf(buff1, "%.*s", baselen, op->long_name); - sprintf(buff2, "%s%.*s", buff1, strlen(op->long_name) - baselen - 2, - opbra + 1); + sprintf(buff2, "%s%.*s", buff1, + (int)strlen(op->long_name) - baselen - 2, opbra + 1); if (strcmp(arg, buff1) == 0 || strcmp(arg, buff2) == 0) break; } @@ -1812,6 +1899,11 @@ pcre_options |= PCRE_NEWLINE_ANY; endlinetype = EL_ANY; } +else if (strcmp(newline, "anycrlf") == 0 || strcmp(newline, "ANYCRLF") == 0) + { + pcre_options |= PCRE_NEWLINE_ANYCRLF; + endlinetype = EL_ANYCRLF; + } else { fprintf(stderr, "pcregrep: Invalid newline specifier \"%s\"\n", newline); @@ -1866,7 +1958,7 @@ if (pattern_list == NULL || hints_list == NULL) { fprintf(stderr, "pcregrep: malloc failed\n"); - return 2; + goto EXIT2; } /* If no patterns were provided by -e, and there is no file provided by -f, @@ -1885,7 +1977,7 @@ { if (!compile_pattern(patterns[j], pcre_options, NULL, (j == 0 && cmd_pattern_count == 1)? 0 : j + 1)) - return 2; + goto EXIT2; } /* Compile the regular expressions that are provided in a file. */ @@ -1909,7 +2001,7 @@ { fprintf(stderr, "pcregrep: Failed to open %s: %s\n", pattern_filename, strerror(errno)); - return 2; + goto EXIT2; } filename = pattern_filename; } @@ -1922,7 +2014,7 @@ linenumber++; if (buffer[0] == 0) continue; /* Skip blank lines */ if (!compile_pattern(buffer, pcre_options, filename, linenumber)) - return 2; + goto EXIT2; } if (f != stdin) fclose(f); @@ -1938,8 +2030,9 @@ char s[16]; if (pattern_count == 1) s[0] = 0; else sprintf(s, " number %d", j); fprintf(stderr, "pcregrep: Error while studying regex%s: %s\n", s, error); - return 2; + goto EXIT2; } + hint_count++; } /* If there are include or exclude patterns, compile them. 
*/ @@ -1952,7 +2045,7 @@ { fprintf(stderr, "pcregrep: Error in 'exclude' regex at offset %d: %s\n", errptr, error); - return 2; + goto EXIT2; } } @@ -1964,14 +2057,17 @@ { fprintf(stderr, "pcregrep: Error in 'include' regex at offset %d: %s\n", errptr, error); - return 2; + goto EXIT2; } } /* If there are no further arguments, do the business on stdin and exit. */ if (i >= argc) - return pcregrep(stdin, (filenames > FN_DEFAULT)? stdin_name : NULL); + { + rc = pcregrep(stdin, (filenames > FN_DEFAULT)? stdin_name : NULL); + goto EXIT; + } /* Otherwise, work through the remaining arguments as files or directories. Pass in the fact that there is only one argument at top level - this suppresses @@ -1988,7 +2084,22 @@ else if (frc == 0 && rc == 1) rc = 0; } +EXIT: +if (pattern_list != NULL) + { + for (i = 0; i < pattern_count; i++) free(pattern_list[i]); + free(pattern_list); + } +if (hints_list != NULL) + { + for (i = 0; i < hint_count; i++) free(hints_list[i]); + free(hints_list); + } return rc; + +EXIT2: +rc = 2; +goto EXIT; } /* End of pcregrep */