--- code/trunk/pcretest.c 2007/02/24 21:38:49 27
+++ code/trunk/pcretest.c 2007/02/24 21:39:50 57
@@ -12,7 +12,14 @@
/* Use the internal info for displaying the results of pcre_study(). */
#include "internal.h"
+
+/* It is possible to compile this test program without including support for
+testing the POSIX interface, though this is not available via the standard
+Makefile. */
+
+#if !defined NOPOSIX
#include "pcreposix.h"
+#endif
#ifndef CLOCKS_PER_SEC
#ifdef CLK_TCK
@@ -27,6 +34,114 @@
static FILE *outfile;
static int log_store = 0;
+static size_t gotten_store;
+
+
+
+static int utf8_table1[] = { /* largest value that ord2utf8 encodes in (index + 1) bytes */
+ 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
+
+static int utf8_table2[] = { /* bits OR-ed into the first byte; index = number of additional bytes */
+ 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
+
+static int utf8_table3[] = { /* mask selecting the data bits of the first byte; same index */
+ 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
+
+
+/*************************************************
+* Convert character value to UTF-8 *
+*************************************************/
+
+/* This function takes an integer value in the range 0 - 0x7fffffff
+and encodes it as a multi-byte character in 1 to 6 bytes. Nothing is written
+to the buffer when the return value is 0 or -1.
+
+NOTE(review): the first byte written holds the low-order bits of the value and
+each following byte holds the next higher 6 bits. Standard UTF-8 stores the
+high-order bits first, so confirm that this byte order is intended. utf82ord()
+in this file decodes the same layout.
+
+Arguments:
+ cvalue the character value
+ buffer pointer to buffer for result - at least 6 bytes long
+
+Returns: number of bytes placed in the buffer
+ -1 if input character is negative
+ 0 if input character is positive but too big (only when
+ int is longer than 32 bits)
+*/
+
+static int
+ord2utf8(int cvalue, unsigned char *buffer)
+{
+register int i, j;
+for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++) /* first table entry that can hold cvalue */
+ if (cvalue <= utf8_table1[i]) break;
+if (i >= sizeof(utf8_table1)/sizeof(int)) return 0; /* larger than every table entry */
+if (cvalue < 0) return -1; /* a negative value stops the search above at i == 0 */
+*buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]); /* first byte: table2 bits plus the low-order data bits */
+cvalue >>= 6 - i; /* for i > 0 this drops exactly the bits stored in the first byte */
+for (j = 0; j < i; j++) /* i further bytes of 6 bits each, lowest remaining bits first */
+ {
+ *buffer++ = 0x80 | (cvalue & 0x3f);
+ cvalue >>= 6;
+ }
+return i + 1; /* one first byte plus i additional bytes */
+}
+
+
+/*************************************************
+* Convert UTF-8 string to value *
+*************************************************/
+
+/* This function takes one or more bytes that represent a UTF-8 character,
+and stores the value of the character through vptr. A first byte whose top bit
+is clear is stored unchanged. Otherwise the number of additional bytes is one
+less than the number of leading 1 bits in the first byte, and each additional
+byte must have the form 10xxxxxx. *vptr is left untouched whenever the return
+value is <= 0.
+
+NOTE(review): the masked bits of the first byte become the low-order bits of
+the value and each later byte contributes the next higher 6 bits, which is the
+layout that ord2utf8() in this file writes. Standard UTF-8 puts the high-order
+bits first - confirm that this byte order is intended.
+
+NOTE(review): unlike ord2utf8(), this function is not declared static.
+
+Arguments:
+ buffer a pointer to the byte vector
+ vptr a pointer to an int to receive the value
+
+Returns: > 0 => the number of bytes consumed
+ 0 => the first byte cannot start a character (it is 10xxxxxx,
+ or it has seven or more leading 1 bits)
+ -1 to -5 => the byte at offset (-return) is not of the form 10xxxxxx
+ -2 to -6 => every byte was well formed, but the decoded value does
+ not need exactly this many bytes; (-return) is the
+ number of bytes that were examined
+*/
+
+int
+utf82ord(unsigned char *buffer, int *vptr)
+{
+int c = *buffer++; /* first byte */
+int d = c; /* scratch copy that is shifted to count leading 1 bits */
+int i, j, s;
+
+for (i = -1; i < 6; i++) /* i is number of additional bytes */
+ {
+ if ((d & 0x80) == 0) break;
+ d <<= 1;
+ }
+
+if (i == -1) { *vptr = c; return 1; } /* ascii character */
+if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
+
+/* i now has a value in the range 1-5 */
+
+d = c & utf8_table3[i]; /* data bits carried by the first byte */
+s = 6 - i; /* the next byte's bits go just above those taken from the first byte */
+
+for (j = 0; j < i; j++)
+ {
+ c = *buffer++;
+ if ((c & 0xc0) != 0x80) return -(j+1); /* not a 10xxxxxx byte; j+1 is its offset from the first byte */
+ d |= (c & 0x3f) << s;
+ s += 6;
+ }
+
+/* Check that encoding was the correct unique one */
+
+for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++) /* same table search that ord2utf8() performs */
+ if (d <= utf8_table1[j]) break;
+if (j != i) return -(i+1);
+
+/* Valid value */
+
+*vptr = d;
+return i+1;
+}
+
+
+
@@ -41,14 +156,14 @@
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{",
- "class", "Ref",
+ "class", "Ref", "Recurse",
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
"AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
- "Brazero", "Braminzero", "Bra"
+ "Brazero", "Braminzero", "Branumber", "Bra"
};
-static void print_internals(pcre *re, FILE *outfile)
+static void print_internals(pcre *re)
{
unsigned char *code = ((real_pcre *)re)->code;
@@ -63,7 +178,10 @@
if (*code >= OP_BRA)
{
- fprintf(outfile, "%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
+ if (*code - OP_BRA > EXTRACT_BASIC_MAX)
+ fprintf(outfile, "%3d Bra extra", (code[1] << 8) + code[2]);
+ else
+ fprintf(outfile, "%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
code += 2;
}
@@ -79,16 +197,6 @@
code++;
break;
- case OP_COND:
- fprintf(outfile, "%3d Cond", (code[1] << 8) + code[2]);
- code += 2;
- break;
-
- case OP_CREF:
- fprintf(outfile, " %.2d %s", code[1], OP_names[*code]);
- code++;
- break;
-
case OP_CHARS:
charlength = *(++code);
fprintf(outfile, "%3d ", charlength);
@@ -106,11 +214,10 @@
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
case OP_ONCE:
- fprintf(outfile, "%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
- code += 2;
- break;
-
+ case OP_COND:
+ case OP_BRANUMBER:
case OP_REVERSE:
+ case OP_CREF:
fprintf(outfile, "%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
code += 2;
break;
@@ -183,8 +290,8 @@
break;
case OP_REF:
- fprintf(outfile, " \\%d", *(++code));
- code++;
+ fprintf(outfile, " \\%d", (code[1] << 8) | code[2]);
+ code += 3;
goto CLASS_REF_REPEAT;
case OP_CLASS:
@@ -257,14 +364,31 @@
-/* Character string printing function. */
+/* Character string printing function. A "normal" and a UTF-8 version.
+Writes length bytes starting at p to outfile. When utf8 is non-zero each
+character is first decoded with utf82ord(); a decoded value below 256 that
+passes isprint() is written as itself and any other value as \x{...} in
+hexadecimal. When utf8 is zero, or utf82ord() reports a malformed character,
+one byte is written, either as itself or as \x followed by two hex digits.
+
+NOTE(review): utf82ord() is not told how many bytes remain, so a character
+that starts near the end of the data may be decoded from bytes beyond
+length - confirm that callers always pass a buffer that is readable there. */
-static void pchars(unsigned char *p, int length)
+static void pchars(unsigned char *p, int length, int utf8)
{
int c;
while (length-- > 0)
+ {
+ if (utf8)
+ {
+ int rc = utf82ord(p, &c); /* rc > 0 is the number of bytes decoded */
+ if (rc > 0)
+ {
+ length -= rc - 1; /* the loop test has already counted one of the rc bytes */
+ p += rc;
+ if (c < 256 && isprint(c)) fprintf(outfile, "%c", c);
+ else fprintf(outfile, "\\x{%02x}", c);
+ continue;
+ }
+ }
+
+ /* Not UTF-8, or malformed UTF-8 */
+
if (isprint(c = *(p++))) fprintf(outfile, "%c", c);
else fprintf(outfile, "\\x%02x", c);
+ }
}
@@ -274,12 +398,28 @@
static void *new_malloc(size_t size) /* malloc() wrapper that records, and optionally logs, each request */
{
-if (log_store) fprintf(outfile, "Store size request: %d\n", (int)size);
+gotten_store = size; /* keep the size of the latest request in a file-level variable */
+if (log_store)
+ fprintf(outfile, "Memory allocation (code space): %d\n",
+ (int)((int)size - offsetof(real_pcre, code[0]))); /* request size minus the offset of code[] within real_pcre */
return malloc(size);
}
+
+/* Get one piece of information from the pcre_fullinfo() function. All four
+arguments are passed straight through to pcre_fullinfo(). A negative result
+is reported on outfile together with the option number. Nothing is returned,
+so the caller is not told about the failure. */
+
+static void new_info(pcre *re, pcre_extra *study, int option, void *ptr)
+{
+int rc;
+if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
+ fprintf(outfile, "Error %d from pcre_fullinfo(%d)\n", rc, option);
+}
+
+
+
+
/* Read lines from named file or stdin and write to named file or stdout; lines
consist of a regular expression, in delimiters and optionally followed by
options, followed by a set of test data, terminated by an empty line. */
@@ -292,7 +432,13 @@
int op = 1;
int timeit = 0;
int showinfo = 0;
+int showstore = 0;
+int size_offsets = 45;
+int size_offsets_max;
+int *offsets;
+#if !defined NOPOSIX
int posix = 0;
+#endif
int debug = 0;
int done = 0;
unsigned char buffer[30000];
@@ -306,26 +452,51 @@
while (argc > 1 && argv[op][0] == '-')
{
- if (strcmp(argv[op], "-s") == 0) log_store = 1;
+ char *endptr;
+
+ if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
+ showstore = 1;
else if (strcmp(argv[op], "-t") == 0) timeit = 1;
else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
+ else if (strcmp(argv[op], "-o") == 0 && argc > 2 &&
+ ((size_offsets = strtoul(argv[op+1], &endptr, 10)), *endptr == 0))
+ {
+ op++;
+ argc--;
+ }
+#if !defined NOPOSIX
else if (strcmp(argv[op], "-p") == 0) posix = 1;
+#endif
else
{
- printf("*** Unknown option %s\n", argv[op]);
- printf("Usage: pcretest [-d] [-i] [-p] [-s] [-t] [ [