/[pcre]/code/trunk/pcretest.c
ViewVC logotype

Contents of /code/trunk/pcretest.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 73 - (show annotations) (download)
Sat Feb 24 21:40:30 2007 UTC (7 years, 2 months ago) by nigel
File MIME type: text/plain
File size: 41274 byte(s)
Load pcre-4.5 into code/trunk.

1 /*************************************************
2 * PCRE testing program *
3 *************************************************/
4
5 /* This program was hacked up as a tester for PCRE. I really should have
6 written it more tidily in the first place. Will I ever learn? It has grown and
7 been extended and consequently is now rather untidy in places. */
8
#include <ctype.h>
#include <limits.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
15
16 /* We need the internal info for displaying the results of pcre_study(). Also
17 for getting the opcodes for showing compiled code. */
18
19 #define PCRE_SPY /* For Win32 build, import data, not export */
20 #include "internal.h"
21
22 /* It is possible to compile this test program without including support for
23 testing the POSIX interface, though this is not available via the standard
24 Makefile. */
25
26 #if !defined NOPOSIX
27 #include "pcreposix.h"
28 #endif
29
/* Fallback clock tick rate for systems whose <time.h> does not define
CLOCKS_PER_SEC: use CLK_TCK when that exists, otherwise assume 100. */

#if !defined CLOCKS_PER_SEC
#  if defined CLK_TCK
#    define CLOCKS_PER_SEC CLK_TCK
#  else
#    define CLOCKS_PER_SEC 100
#  endif
#endif

/* Number of repetitions of each call that is being timed. */

#define LOOPREPEAT 50000

/* Size of the input line buffer; the data buffer is made the same size. */

#define BUFFER_SIZE 30000
#define DBUFFER_SIZE BUFFER_SIZE
42
43
/* State shared between main() and the helper and callback functions. */

static FILE *outfile;            /* stream that receives the test output */
static size_t gotten_store;      /* size given to the latest new_malloc() call */
static int log_store = 0;        /* per-pattern copy of the -s/-m setting, or /M */
static int show_malloc;          /* non-zero => the allocators log each call */
static int use_utf8;             /* non-zero => pchars() decodes UTF-8 */

/* Settings and counters used by the callout() function. */

static int first_callout;        /* non-zero until the first callout is shown */
static int callout_extra;        /* non-zero => show capture details as well */
static int callout_count;        /* callouts seen so far with the failing id */
static int callout_fail_count;   /* count at which callout() starts returning 1 */
static int callout_fail_id;      /* callout number that is counted */
54
55
/* Largest character value that can be encoded in (index + 1) bytes. */

static const int utf8_table1[] = {
  0x0000007f,
  0x000007ff,
  0x0000ffff,
  0x001fffff,
  0x03ffffff,
  0x7fffffff
};

/* Bits that are ORed into the first byte of an encoding that has (index)
additional bytes. */

static const int utf8_table2[] = {
  0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
};

/* Mask that extracts the data bits from the first byte of an encoding that
has (index) additional bytes. */

static const int utf8_table3[] = {
  0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01
};
64
65
66
67 /*************************************************
68 * Print compiled regex *
69 *************************************************/
70
71 /* The code for doing this is held in a separate file that is also included in
72 pcre.c when it is compiled with the debug switch. It defines a function called
73 print_internals(), which uses a table of opcode lengths defined by the macro
74 OP_LENGTHS, whose name must be OP_lengths. */
75
76 static uschar OP_lengths[] = { OP_LENGTHS };
77
78 #include "printint.c"
79
80
81
82 /*************************************************
83 * Read number from string *
84 *************************************************/
85
/* We don't use strtoul() because SunOS4 doesn't have it. Rather than mess
around with conditional compilation, just do the job by hand. It is only used
for unpicking the -o argument, so just keep it simple.

Leading white space is skipped, and then a run of decimal digits is converted.
If the number is too big for an int, all the digits are still consumed, but
the result is clamped to INT_MAX, because letting a signed int overflow is
undefined behaviour.

Arguments:
  str         string to be converted
  endptr      where to put the end pointer (the first unconsumed character)

Returns:      the value as a non-negative int; zero if there are no digits
*/

static int
get_value(unsigned char *str, unsigned char **endptr)
{
  int result = 0;

  while (isspace(*str)) str++;       /* isspace(0) is false, so this stops at NUL */

  while (isdigit(*str))
    {
    int digit = *str++ - '0';
    if (result > (INT_MAX - digit) / 10)
      result = INT_MAX;              /* would overflow: saturate */
    else
      result = result * 10 + digit;
    }

  *endptr = str;
  return result;
}
106
107
108
109 /*************************************************
110 * Convert character value to UTF-8 *
111 *************************************************/
112
113 /* This function takes an integer value in the range 0 - 0x7fffffff
114 and encodes it as a UTF-8 character in 0 to 6 bytes.
115
116 Arguments:
117 cvalue the character value
118 buffer pointer to buffer for result - at least 6 bytes long
119
120 Returns: number of characters placed in the buffer
121 -1 if input character is negative
122 0 if input character is positive but too big (only when
123 int is longer than 32 bits)
124 */
125
126 static int
127 ord2utf8(int cvalue, unsigned char *buffer)
128 {
129 register int i, j;
130 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
131 if (cvalue <= utf8_table1[i]) break;
132 if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
133 if (cvalue < 0) return -1;
134
135 buffer += i;
136 for (j = i; j > 0; j--)
137 {
138 *buffer-- = 0x80 | (cvalue & 0x3f);
139 cvalue >>= 6;
140 }
141 *buffer = utf8_table2[i] | cvalue;
142 return i + 1;
143 }
144
145
146 /*************************************************
147 * Convert UTF-8 string to value *
148 *************************************************/
149
150 /* This function takes one or more bytes that represents a UTF-8 character,
151 and returns the value of the character.
152
153 Argument:
154 buffer a pointer to the byte vector
155 vptr a pointer to an int to receive the value
156
157 Returns: > 0 => the number of bytes consumed
158 -6 to 0 => malformed UTF-8 character at offset = (-return)
159 */
160
161 static int
162 utf82ord(unsigned char *buffer, int *vptr)
163 {
164 int c = *buffer++;
165 int d = c;
166 int i, j, s;
167
168 for (i = -1; i < 6; i++) /* i is number of additional bytes */
169 {
170 if ((d & 0x80) == 0) break;
171 d <<= 1;
172 }
173
174 if (i == -1) { *vptr = c; return 1; } /* ascii character */
175 if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
176
177 /* i now has a value in the range 1-5 */
178
179 s = 6*i;
180 d = (c & utf8_table3[i]) << s;
181
182 for (j = 0; j < i; j++)
183 {
184 c = *buffer++;
185 if ((c & 0xc0) != 0x80) return -(j+1);
186 s -= 6;
187 d |= (c & 0x3f) << s;
188 }
189
190 /* Check that encoding was the correct unique one */
191
192 for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
193 if (d <= utf8_table1[j]) break;
194 if (j != i) return -(i+1);
195
196 /* Valid value */
197
198 *vptr = d;
199 return i+1;
200 }
201
202
203
204 /*************************************************
205 * Print character string *
206 *************************************************/
207
208 /* Character string printing function. Must handle UTF-8 strings in utf8
209 mode. Yields number of characters printed. If handed a NULL file, just counts
210 chars without printing. */
211
212 static int pchars(unsigned char *p, int length, FILE *f)
213 {
214 int c;
215 int yield = 0;
216
217 while (length-- > 0)
218 {
219 if (use_utf8)
220 {
221 int rc = utf82ord(p, &c);
222
223 if (rc > 0 && rc <= length + 1) /* Mustn't run over the end */
224 {
225 length -= rc - 1;
226 p += rc;
227 if (c < 256 && isprint(c))
228 {
229 if (f != NULL) fprintf(f, "%c", c);
230 yield++;
231 }
232 else
233 {
234 int n;
235 if (f != NULL) fprintf(f, "\\x{%02x}%n", c, &n);
236 yield += n;
237 }
238 continue;
239 }
240 }
241
242 /* Not UTF-8, or malformed UTF-8 */
243
244 if (isprint(c = *(p++)))
245 {
246 if (f != NULL) fprintf(f, "%c", c);
247 yield++;
248 }
249 else
250 {
251 if (f != NULL) fprintf(f, "\\x%02x", c);
252 yield += 4;
253 }
254 }
255
256 return yield;
257 }
258
259
260
261 /*************************************************
262 * Callout function *
263 *************************************************/
264
265 /* Called from PCRE as a result of the (?C) item. We print out where we are in
266 the match. Yield zero unless more callouts than the fail count, or the callout
267 data is not zero. */
268
269 static int callout(pcre_callout_block *cb)
270 {
271 FILE *f = (first_callout | callout_extra)? outfile : NULL;
272 int i, pre_start, post_start;
273
274 if (callout_extra)
275 {
276 fprintf(f, "Callout %d: last capture = %d\n",
277 cb->callout_number, cb->capture_last);
278
279 for (i = 0; i < cb->capture_top * 2; i += 2)
280 {
281 if (cb->offset_vector[i] < 0)
282 fprintf(f, "%2d: <unset>\n", i/2);
283 else
284 {
285 fprintf(f, "%2d: ", i/2);
286 (void)pchars((unsigned char *)cb->subject + cb->offset_vector[i],
287 cb->offset_vector[i+1] - cb->offset_vector[i], f);
288 fprintf(f, "\n");
289 }
290 }
291 }
292
293 /* Re-print the subject in canonical form, the first time or if giving full
294 datails. On subsequent calls in the same match, we use pchars just to find the
295 printed lengths of the substrings. */
296
297 if (f != NULL) fprintf(f, "--->");
298
299 pre_start = pchars((unsigned char *)cb->subject, cb->start_match, f);
300 post_start = pchars((unsigned char *)(cb->subject + cb->start_match),
301 cb->current_position - cb->start_match, f);
302
303 (void)pchars((unsigned char *)(cb->subject + cb->current_position),
304 cb->subject_length - cb->current_position, f);
305
306 if (f != NULL) fprintf(f, "\n");
307
308 /* Always print appropriate indicators, with callout number if not already
309 shown */
310
311 if (callout_extra) fprintf(outfile, " ");
312 else fprintf(outfile, "%3d ", cb->callout_number);
313
314 for (i = 0; i < pre_start; i++) fprintf(outfile, " ");
315 fprintf(outfile, "^");
316
317 if (post_start > 0)
318 {
319 for (i = 0; i < post_start - 1; i++) fprintf(outfile, " ");
320 fprintf(outfile, "^");
321 }
322
323 fprintf(outfile, "\n");
324 first_callout = 0;
325
326 if (cb->callout_data != NULL)
327 {
328 int callout_data = *((int *)(cb->callout_data));
329 if (callout_data != 0)
330 {
331 fprintf(outfile, "Callout data = %d\n", callout_data);
332 return callout_data;
333 }
334 }
335
336 return (cb->callout_number != callout_fail_id)? 0 :
337 (++callout_count >= callout_fail_count)? 1 : 0;
338 }
339
340
341 /*************************************************
342 * Local malloc functions *
343 *************************************************/
344
345 /* Alternative malloc function, to test functionality and show the size of the
346 compiled re. */
347
348 static void *new_malloc(size_t size)
349 {
350 void *block = malloc(size);
351 gotten_store = size;
352 if (show_malloc)
353 fprintf(outfile, "malloc %3d %p\n", size, block);
354 return block;
355 }
356
357 static void new_free(void *block)
358 {
359 if (show_malloc)
360 fprintf(outfile, "free %p\n", block);
361 free(block);
362 }
363
364
365 /* For recursion malloc/free, to test stacking calls */
366
367 static void *stack_malloc(size_t size)
368 {
369 void *block = malloc(size);
370 if (show_malloc)
371 fprintf(outfile, "stack_malloc %3d %p\n", size, block);
372 return block;
373 }
374
375 static void stack_free(void *block)
376 {
377 if (show_malloc)
378 fprintf(outfile, "stack_free %p\n", block);
379 free(block);
380 }
381
382
383 /*************************************************
384 * Call pcre_fullinfo() *
385 *************************************************/
386
387 /* Get one piece of information from the pcre_fullinfo() function */
388
389 static void new_info(pcre *re, pcre_extra *study, int option, void *ptr)
390 {
391 int rc;
392 if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
393 fprintf(outfile, "Error %d from pcre_fullinfo(%d)\n", rc, option);
394 }
395
396
397
398 /*************************************************
399 * Main Program *
400 *************************************************/
401
402 /* Read lines from named file or stdin and write to named file or stdout; lines
403 consist of a regular expression, in delimiters and optionally followed by
404 options, followed by a set of test data, terminated by an empty line. */
405
406 int main(int argc, char **argv)
407 {
408 FILE *infile = stdin;
409 int options = 0;
410 int study_options = 0;
411 int op = 1;
412 int timeit = 0;
413 int showinfo = 0;
414 int showstore = 0;
415 int size_offsets = 45;
416 int size_offsets_max;
417 int *offsets;
418 #if !defined NOPOSIX
419 int posix = 0;
420 #endif
421 int debug = 0;
422 int done = 0;
423
424 unsigned char *buffer;
425 unsigned char *dbuffer;
426
427 /* Get buffers from malloc() so that Electric Fence will check their misuse
428 when I am debugging. */
429
430 buffer = (unsigned char *)malloc(BUFFER_SIZE);
431 dbuffer = (unsigned char *)malloc(DBUFFER_SIZE);
432
433 /* Static so that new_malloc can use it. */
434
435 outfile = stdout;
436
437 /* Scan options */
438
439 while (argc > 1 && argv[op][0] == '-')
440 {
441 unsigned char *endptr;
442
443 if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
444 showstore = 1;
445 else if (strcmp(argv[op], "-t") == 0) timeit = 1;
446 else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
447 else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
448 else if (strcmp(argv[op], "-o") == 0 && argc > 2 &&
449 ((size_offsets = get_value((unsigned char *)argv[op+1], &endptr)),
450 *endptr == 0))
451 {
452 op++;
453 argc--;
454 }
455 #if !defined NOPOSIX
456 else if (strcmp(argv[op], "-p") == 0) posix = 1;
457 #endif
458 else if (strcmp(argv[op], "-C") == 0)
459 {
460 int rc;
461 printf("PCRE version %s\n", pcre_version());
462 printf("Compiled with\n");
463 (void)pcre_config(PCRE_CONFIG_UTF8, &rc);
464 printf(" %sUTF-8 support\n", rc? "" : "No ");
465 (void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
466 printf(" Newline character is %s\n", (rc == '\r')? "CR" : "LF");
467 (void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
468 printf(" Internal link size = %d\n", rc);
469 (void)pcre_config(PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, &rc);
470 printf(" POSIX malloc threshold = %d\n", rc);
471 (void)pcre_config(PCRE_CONFIG_MATCH_LIMIT, &rc);
472 printf(" Default match limit = %d\n", rc);
473 (void)pcre_config(PCRE_CONFIG_STACKRECURSE, &rc);
474 printf(" Match recursion uses %s\n", rc? "stack" : "heap");
475 exit(0);
476 }
477 else
478 {
479 printf("** Unknown or malformed option %s\n", argv[op]);
480 printf("Usage: pcretest [-d] [-i] [-o <n>] [-p] [-s] [-t] [<input> [<output>]]\n");
481 printf(" -C show PCRE compile-time options and exit\n");
482 printf(" -d debug: show compiled code; implies -i\n"
483 " -i show information about compiled pattern\n"
484 " -o <n> set size of offsets vector to <n>\n");
485 #if !defined NOPOSIX
486 printf(" -p use POSIX interface\n");
487 #endif
488 printf(" -s output store information\n"
489 " -t time compilation and execution\n");
490 return 1;
491 }
492 op++;
493 argc--;
494 }
495
496 /* Get the store for the offsets vector, and remember what it was */
497
498 size_offsets_max = size_offsets;
499 offsets = (int *)malloc(size_offsets_max * sizeof(int));
500 if (offsets == NULL)
501 {
502 printf("** Failed to get %d bytes of memory for offsets vector\n",
503 size_offsets_max * sizeof(int));
504 return 1;
505 }
506
507 /* Sort out the input and output files */
508
509 if (argc > 1)
510 {
511 infile = fopen(argv[op], "r");
512 if (infile == NULL)
513 {
514 printf("** Failed to open %s\n", argv[op]);
515 return 1;
516 }
517 }
518
519 if (argc > 2)
520 {
521 outfile = fopen(argv[op+1], "w");
522 if (outfile == NULL)
523 {
524 printf("** Failed to open %s\n", argv[op+1]);
525 return 1;
526 }
527 }
528
529 /* Set alternative malloc function */
530
531 pcre_malloc = new_malloc;
532 pcre_free = new_free;
533 pcre_stack_malloc = stack_malloc;
534 pcre_stack_free = stack_free;
535
536 /* Heading line, then prompt for first regex if stdin */
537
538 fprintf(outfile, "PCRE version %s\n\n", pcre_version());
539
540 /* Main loop */
541
542 while (!done)
543 {
544 pcre *re = NULL;
545 pcre_extra *extra = NULL;
546
547 #if !defined NOPOSIX /* There are still compilers that require no indent */
548 regex_t preg;
549 int do_posix = 0;
550 #endif
551
552 const char *error;
553 unsigned char *p, *pp, *ppp;
554 const unsigned char *tables = NULL;
555 int do_study = 0;
556 int do_debug = debug;
557 int do_G = 0;
558 int do_g = 0;
559 int do_showinfo = showinfo;
560 int do_showrest = 0;
561 int erroroffset, len, delimiter;
562
563 use_utf8 = 0;
564
565 if (infile == stdin) printf(" re> ");
566 if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL) break;
567 if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
568 fflush(outfile);
569
570 p = buffer;
571 while (isspace(*p)) p++;
572 if (*p == 0) continue;
573
574 /* Get the delimiter and seek the end of the pattern; if it isn't
575 complete, read more. */
576
577 delimiter = *p++;
578
579 if (isalnum(delimiter) || delimiter == '\\')
580 {
581 fprintf(outfile, "** Delimiter must not be alphameric or \\\n");
582 goto SKIP_DATA;
583 }
584
585 pp = p;
586
587 for(;;)
588 {
589 while (*pp != 0)
590 {
591 if (*pp == '\\' && pp[1] != 0) pp++;
592 else if (*pp == delimiter) break;
593 pp++;
594 }
595 if (*pp != 0) break;
596
597 len = BUFFER_SIZE - (pp - buffer);
598 if (len < 256)
599 {
600 fprintf(outfile, "** Expression too long - missing delimiter?\n");
601 goto SKIP_DATA;
602 }
603
604 if (infile == stdin) printf(" > ");
605 if (fgets((char *)pp, len, infile) == NULL)
606 {
607 fprintf(outfile, "** Unexpected EOF\n");
608 done = 1;
609 goto CONTINUE;
610 }
611 if (infile != stdin) fprintf(outfile, "%s", (char *)pp);
612 }
613
614 /* If the first character after the delimiter is backslash, make
615 the pattern end with backslash. This is purely to provide a way
616 of testing for the error message when a pattern ends with backslash. */
617
618 if (pp[1] == '\\') *pp++ = '\\';
619
620 /* Terminate the pattern at the delimiter */
621
622 *pp++ = 0;
623
624 /* Look for options after final delimiter */
625
626 options = 0;
627 study_options = 0;
628 log_store = showstore; /* default from command line */
629
630 while (*pp != 0)
631 {
632 switch (*pp++)
633 {
634 case 'g': do_g = 1; break;
635 case 'i': options |= PCRE_CASELESS; break;
636 case 'm': options |= PCRE_MULTILINE; break;
637 case 's': options |= PCRE_DOTALL; break;
638 case 'x': options |= PCRE_EXTENDED; break;
639
640 case '+': do_showrest = 1; break;
641 case 'A': options |= PCRE_ANCHORED; break;
642 case 'D': do_debug = do_showinfo = 1; break;
643 case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
644 case 'G': do_G = 1; break;
645 case 'I': do_showinfo = 1; break;
646 case 'M': log_store = 1; break;
647 case 'N': options |= PCRE_NO_AUTO_CAPTURE; break;
648
649 #if !defined NOPOSIX
650 case 'P': do_posix = 1; break;
651 #endif
652
653 case 'S': do_study = 1; break;
654 case 'U': options |= PCRE_UNGREEDY; break;
655 case 'X': options |= PCRE_EXTRA; break;
656 case '8': options |= PCRE_UTF8; use_utf8 = 1; break;
657 case '?': options |= PCRE_NO_UTF8_CHECK; break;
658
659 case 'L':
660 ppp = pp;
661 while (*ppp != '\n' && *ppp != ' ') ppp++;
662 *ppp = 0;
663 if (setlocale(LC_CTYPE, (const char *)pp) == NULL)
664 {
665 fprintf(outfile, "** Failed to set locale \"%s\"\n", pp);
666 goto SKIP_DATA;
667 }
668 tables = pcre_maketables();
669 pp = ppp;
670 break;
671
672 case '\n': case ' ': break;
673 default:
674 fprintf(outfile, "** Unknown option '%c'\n", pp[-1]);
675 goto SKIP_DATA;
676 }
677 }
678
679 /* Handle compiling via the POSIX interface, which doesn't support the
680 timing, showing, or debugging options, nor the ability to pass over
681 local character tables. */
682
683 #if !defined NOPOSIX
684 if (posix || do_posix)
685 {
686 int rc;
687 int cflags = 0;
688 if ((options & PCRE_CASELESS) != 0) cflags |= REG_ICASE;
689 if ((options & PCRE_MULTILINE) != 0) cflags |= REG_NEWLINE;
690 rc = regcomp(&preg, (char *)p, cflags);
691
692 /* Compilation failed; go back for another re, skipping to blank line
693 if non-interactive. */
694
695 if (rc != 0)
696 {
697 (void)regerror(rc, &preg, (char *)buffer, BUFFER_SIZE);
698 fprintf(outfile, "Failed: POSIX code %d: %s\n", rc, buffer);
699 goto SKIP_DATA;
700 }
701 }
702
703 /* Handle compiling via the native interface */
704
705 else
706 #endif /* !defined NOPOSIX */
707
708 {
709 if (timeit)
710 {
711 register int i;
712 clock_t time_taken;
713 clock_t start_time = clock();
714 for (i = 0; i < LOOPREPEAT; i++)
715 {
716 re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
717 if (re != NULL) free(re);
718 }
719 time_taken = clock() - start_time;
720 fprintf(outfile, "Compile time %.3f milliseconds\n",
721 (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
722 (double)CLOCKS_PER_SEC);
723 }
724
725 re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
726
727 /* Compilation failed; go back for another re, skipping to blank line
728 if non-interactive. */
729
730 if (re == NULL)
731 {
732 fprintf(outfile, "Failed: %s at offset %d\n", error, erroroffset);
733 SKIP_DATA:
734 if (infile != stdin)
735 {
736 for (;;)
737 {
738 if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL)
739 {
740 done = 1;
741 goto CONTINUE;
742 }
743 len = (int)strlen((char *)buffer);
744 while (len > 0 && isspace(buffer[len-1])) len--;
745 if (len == 0) break;
746 }
747 fprintf(outfile, "\n");
748 }
749 goto CONTINUE;
750 }
751
752 /* Compilation succeeded; print data if required. There are now two
753 info-returning functions. The old one has a limited interface and
754 returns only limited data. Check that it agrees with the newer one. */
755
756 if (log_store)
757 fprintf(outfile, "Memory allocation (code space): %d\n",
758 (int)(gotten_store -
759 sizeof(real_pcre) -
760 ((real_pcre *)re)->name_count * ((real_pcre *)re)->name_entry_size));
761
762 if (do_showinfo)
763 {
764 unsigned long int get_options;
765 int old_first_char, old_options, old_count;
766 int count, backrefmax, first_char, need_char;
767 int nameentrysize, namecount;
768 const uschar *nametable;
769 size_t size;
770
771 if (do_debug)
772 {
773 fprintf(outfile, "------------------------------------------------------------------\n");
774 print_internals(re, outfile);
775 }
776
777 new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
778 new_info(re, NULL, PCRE_INFO_SIZE, &size);
779 new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
780 new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax);
781 new_info(re, NULL, PCRE_INFO_FIRSTBYTE, &first_char);
782 new_info(re, NULL, PCRE_INFO_LASTLITERAL, &need_char);
783 new_info(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
784 new_info(re, NULL, PCRE_INFO_NAMECOUNT, &namecount);
785 new_info(re, NULL, PCRE_INFO_NAMETABLE, (void *)&nametable);
786
787 old_count = pcre_info(re, &old_options, &old_first_char);
788 if (count < 0) fprintf(outfile,
789 "Error %d from pcre_info()\n", count);
790 else
791 {
792 if (old_count != count) fprintf(outfile,
793 "Count disagreement: pcre_fullinfo=%d pcre_info=%d\n", count,
794 old_count);
795
796 if (old_first_char != first_char) fprintf(outfile,
797 "First char disagreement: pcre_fullinfo=%d pcre_info=%d\n",
798 first_char, old_first_char);
799
800 if (old_options != (int)get_options) fprintf(outfile,
801 "Options disagreement: pcre_fullinfo=%ld pcre_info=%d\n",
802 get_options, old_options);
803 }
804
805 if (size != gotten_store) fprintf(outfile,
806 "Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
807 size, gotten_store);
808
809 fprintf(outfile, "Capturing subpattern count = %d\n", count);
810 if (backrefmax > 0)
811 fprintf(outfile, "Max back reference = %d\n", backrefmax);
812
813 if (namecount > 0)
814 {
815 fprintf(outfile, "Named capturing subpatterns:\n");
816 while (namecount-- > 0)
817 {
818 fprintf(outfile, " %s %*s%3d\n", nametable + 2,
819 nameentrysize - 3 - (int)strlen((char *)nametable + 2), "",
820 GET2(nametable, 0));
821 nametable += nameentrysize;
822 }
823 }
824
825 if (get_options == 0) fprintf(outfile, "No options\n");
826 else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s\n",
827 ((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
828 ((get_options & PCRE_CASELESS) != 0)? " caseless" : "",
829 ((get_options & PCRE_EXTENDED) != 0)? " extended" : "",
830 ((get_options & PCRE_MULTILINE) != 0)? " multiline" : "",
831 ((get_options & PCRE_DOTALL) != 0)? " dotall" : "",
832 ((get_options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
833 ((get_options & PCRE_EXTRA) != 0)? " extra" : "",
834 ((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
835 ((get_options & PCRE_UTF8) != 0)? " utf8" : "",
836 ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "");
837
838 if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
839 fprintf(outfile, "Case state changes\n");
840
841 if (first_char == -1)
842 {
843 fprintf(outfile, "First char at start or follows \\n\n");
844 }
845 else if (first_char < 0)
846 {
847 fprintf(outfile, "No first char\n");
848 }
849 else
850 {
851 int ch = first_char & 255;
852 const char *caseless = ((first_char & REQ_CASELESS) == 0)?
853 "" : " (caseless)";
854 if (isprint(ch))
855 fprintf(outfile, "First char = \'%c\'%s\n", ch, caseless);
856 else
857 fprintf(outfile, "First char = %d%s\n", ch, caseless);
858 }
859
860 if (need_char < 0)
861 {
862 fprintf(outfile, "No need char\n");
863 }
864 else
865 {
866 int ch = need_char & 255;
867 const char *caseless = ((need_char & REQ_CASELESS) == 0)?
868 "" : " (caseless)";
869 if (isprint(ch))
870 fprintf(outfile, "Need char = \'%c\'%s\n", ch, caseless);
871 else
872 fprintf(outfile, "Need char = %d%s\n", ch, caseless);
873 }
874 }
875
876 /* If /S was present, study the regexp to generate additional info to
877 help with the matching. */
878
879 if (do_study)
880 {
881 if (timeit)
882 {
883 register int i;
884 clock_t time_taken;
885 clock_t start_time = clock();
886 for (i = 0; i < LOOPREPEAT; i++)
887 extra = pcre_study(re, study_options, &error);
888 time_taken = clock() - start_time;
889 if (extra != NULL) free(extra);
890 fprintf(outfile, " Study time %.3f milliseconds\n",
891 (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
892 (double)CLOCKS_PER_SEC);
893 }
894
895 extra = pcre_study(re, study_options, &error);
896 if (error != NULL)
897 fprintf(outfile, "Failed to study: %s\n", error);
898 else if (extra == NULL)
899 fprintf(outfile, "Study returned NULL\n");
900
901 /* Don't output study size; at present it is in any case a fixed
902 value, but it varies, depending on the computer architecture, and
903 so messes up the test suite. */
904
905 else if (do_showinfo)
906 {
907 size_t size;
908 uschar *start_bits = NULL;
909 new_info(re, extra, PCRE_INFO_STUDYSIZE, &size);
910 new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
911 /* fprintf(outfile, "Study size = %d\n", size); */
912 if (start_bits == NULL)
913 fprintf(outfile, "No starting character set\n");
914 else
915 {
916 int i;
917 int c = 24;
918 fprintf(outfile, "Starting character set: ");
919 for (i = 0; i < 256; i++)
920 {
921 if ((start_bits[i/8] & (1<<(i%8))) != 0)
922 {
923 if (c > 75)
924 {
925 fprintf(outfile, "\n ");
926 c = 2;
927 }
928 if (isprint(i) && i != ' ')
929 {
930 fprintf(outfile, "%c ", i);
931 c += 2;
932 }
933 else
934 {
935 fprintf(outfile, "\\x%02x ", i);
936 c += 5;
937 }
938 }
939 }
940 fprintf(outfile, "\n");
941 }
942 }
943 }
944 }
945
946 /* Read data lines and test them */
947
948 for (;;)
949 {
950 unsigned char *q;
951 unsigned char *bptr = dbuffer;
952 int *use_offsets = offsets;
953 int use_size_offsets = size_offsets;
954 int callout_data = 0;
955 int callout_data_set = 0;
956 int count, c;
957 int copystrings = 0;
958 int find_match_limit = 0;
959 int getstrings = 0;
960 int getlist = 0;
961 int gmatched = 0;
962 int start_offset = 0;
963 int g_notempty = 0;
964
965 options = 0;
966
967 pcre_callout = callout;
968 first_callout = 1;
969 callout_extra = 0;
970 callout_count = 0;
971 callout_fail_count = 999999;
972 callout_fail_id = -1;
973 show_malloc = 0;
974
975 if (infile == stdin) printf("data> ");
976 if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL)
977 {
978 done = 1;
979 goto CONTINUE;
980 }
981 if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
982
983 len = (int)strlen((char *)buffer);
984 while (len > 0 && isspace(buffer[len-1])) len--;
985 buffer[len] = 0;
986 if (len == 0) break;
987
988 p = buffer;
989 while (isspace(*p)) p++;
990
991 q = dbuffer;
992 while ((c = *p++) != 0)
993 {
994 int i = 0;
995 int n = 0;
996
997 if (c == '\\') switch ((c = *p++))
998 {
999 case 'a': c = 7; break;
1000 case 'b': c = '\b'; break;
1001 case 'e': c = 27; break;
1002 case 'f': c = '\f'; break;
1003 case 'n': c = '\n'; break;
1004 case 'r': c = '\r'; break;
1005 case 't': c = '\t'; break;
1006 case 'v': c = '\v'; break;
1007
1008 case '0': case '1': case '2': case '3':
1009 case '4': case '5': case '6': case '7':
1010 c -= '0';
1011 while (i++ < 2 && isdigit(*p) && *p != '8' && *p != '9')
1012 c = c * 8 + *p++ - '0';
1013 break;
1014
1015 case 'x':
1016
1017 /* Handle \x{..} specially - new Perl thing for utf8 */
1018
1019 if (*p == '{')
1020 {
1021 unsigned char *pt = p;
1022 c = 0;
1023 while (isxdigit(*(++pt)))
1024 c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'W');
1025 if (*pt == '}')
1026 {
1027 unsigned char buff8[8];
1028 int ii, utn;
1029 utn = ord2utf8(c, buff8);
1030 for (ii = 0; ii < utn - 1; ii++) *q++ = buff8[ii];
1031 c = buff8[ii]; /* Last byte */
1032 p = pt + 1;
1033 break;
1034 }
1035 /* Not correct form; fall through */
1036 }
1037
1038 /* Ordinary \x */
1039
1040 c = 0;
1041 while (i++ < 2 && isxdigit(*p))
1042 {
1043 c = c * 16 + tolower(*p) - ((isdigit(*p))? '0' : 'W');
1044 p++;
1045 }
1046 break;
1047
1048 case 0: /* Allows for an empty line */
1049 p--;
1050 continue;
1051
1052 case 'A': /* Option setting */
1053 options |= PCRE_ANCHORED;
1054 continue;
1055
1056 case 'B':
1057 options |= PCRE_NOTBOL;
1058 continue;
1059
1060 case 'C':
1061 if (isdigit(*p)) /* Set copy string */
1062 {
1063 while(isdigit(*p)) n = n * 10 + *p++ - '0';
1064 copystrings |= 1 << n;
1065 }
1066 else if (isalnum(*p))
1067 {
1068 uschar name[256];
1069 uschar *npp = name;
1070 while (isalnum(*p)) *npp++ = *p++;
1071 *npp = 0;
1072 n = pcre_get_stringnumber(re, (char *)name);
1073 if (n < 0)
1074 fprintf(outfile, "no parentheses with name \"%s\"\n", name);
1075 else copystrings |= 1 << n;
1076 }
1077 else if (*p == '+')
1078 {
1079 callout_extra = 1;
1080 p++;
1081 }
1082 else if (*p == '-')
1083 {
1084 pcre_callout = NULL;
1085 p++;
1086 }
1087 else if (*p == '!')
1088 {
1089 callout_fail_id = 0;
1090 p++;
1091 while(isdigit(*p))
1092 callout_fail_id = callout_fail_id * 10 + *p++ - '0';
1093 callout_fail_count = 0;
1094 if (*p == '!')
1095 {
1096 p++;
1097 while(isdigit(*p))
1098 callout_fail_count = callout_fail_count * 10 + *p++ - '0';
1099 }
1100 }
1101 else if (*p == '*')
1102 {
1103 int sign = 1;
1104 callout_data = 0;
1105 if (*(++p) == '-') { sign = -1; p++; }
1106 while(isdigit(*p))
1107 callout_data = callout_data * 10 + *p++ - '0';
1108 callout_data *= sign;
1109 callout_data_set = 1;
1110 }
1111 continue;
1112
1113 case 'G':
1114 if (isdigit(*p))
1115 {
1116 while(isdigit(*p)) n = n * 10 + *p++ - '0';
1117 getstrings |= 1 << n;
1118 }
1119 else if (isalnum(*p))
1120 {
1121 uschar name[256];
1122 uschar *npp = name;
1123 while (isalnum(*p)) *npp++ = *p++;
1124 *npp = 0;
1125 n = pcre_get_stringnumber(re, (char *)name);
1126 if (n < 0)
1127 fprintf(outfile, "no parentheses with name \"%s\"\n", name);
1128 else getstrings |= 1 << n;
1129 }
1130 continue;
1131
1132 case 'L':
1133 getlist = 1;
1134 continue;
1135
1136 case 'M':
1137 find_match_limit = 1;
1138 continue;
1139
1140 case 'N':
1141 options |= PCRE_NOTEMPTY;
1142 continue;
1143
1144 case 'O':
1145 while(isdigit(*p)) n = n * 10 + *p++ - '0';
1146 if (n > size_offsets_max)
1147 {
1148 size_offsets_max = n;
1149 free(offsets);
1150 use_offsets = offsets = (int *)malloc(size_offsets_max * sizeof(int));
1151 if (offsets == NULL)
1152 {
1153 printf("** Failed to get %d bytes of memory for offsets vector\n",
1154 size_offsets_max * sizeof(int));
1155 return 1;
1156 }
1157 }
1158 use_size_offsets = n;
1159 if (n == 0) use_offsets = NULL; /* Ensures it can't write to it */
1160 continue;
1161
1162 case 'S':
1163 show_malloc = 1;
1164 continue;
1165
1166 case 'Z':
1167 options |= PCRE_NOTEOL;
1168 continue;
1169
1170 case '?':
1171 options |= PCRE_NO_UTF8_CHECK;
1172 continue;
1173 }
1174 *q++ = c;
1175 }
1176 *q = 0;
1177 len = q - dbuffer;
1178
1179 /* Handle matching via the POSIX interface, which does not
1180 support timing or playing with the match limit or callout data. */
1181
1182 #if !defined NOPOSIX
1183 if (posix || do_posix)
1184 {
1185 int rc;
1186 int eflags = 0;
1187 regmatch_t *pmatch = NULL;
1188 if (use_size_offsets > 0)
1189 pmatch = (regmatch_t *)malloc(sizeof(regmatch_t) * use_size_offsets);
1190 if ((options & PCRE_NOTBOL) != 0) eflags |= REG_NOTBOL;
1191 if ((options & PCRE_NOTEOL) != 0) eflags |= REG_NOTEOL;
1192
1193 rc = regexec(&preg, (const char *)bptr, use_size_offsets, pmatch, eflags);
1194
1195 if (rc != 0)
1196 {
1197 (void)regerror(rc, &preg, (char *)buffer, BUFFER_SIZE);
1198 fprintf(outfile, "No match: POSIX code %d: %s\n", rc, buffer);
1199 }
1200 else
1201 {
1202 size_t i;
1203 for (i = 0; i < (size_t)use_size_offsets; i++)
1204 {
1205 if (pmatch[i].rm_so >= 0)
1206 {
1207 fprintf(outfile, "%2d: ", (int)i);
1208 (void)pchars(dbuffer + pmatch[i].rm_so,
1209 pmatch[i].rm_eo - pmatch[i].rm_so, outfile);
1210 fprintf(outfile, "\n");
1211 if (i == 0 && do_showrest)
1212 {
1213 fprintf(outfile, " 0+ ");
1214 (void)pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo,
1215 outfile);
1216 fprintf(outfile, "\n");
1217 }
1218 }
1219 }
1220 }
1221 free(pmatch);
1222 }
1223
1224 /* Handle matching via the native interface - repeats for /g and /G */
1225
1226 else
1227 #endif /* !defined NOPOSIX */
1228
1229 for (;; gmatched++) /* Loop for /g or /G */
1230 {
1231 if (timeit)
1232 {
1233 register int i;
1234 clock_t time_taken;
1235 clock_t start_time = clock();
1236 for (i = 0; i < LOOPREPEAT; i++)
1237 count = pcre_exec(re, extra, (char *)bptr, len,
1238 start_offset, options | g_notempty, use_offsets, use_size_offsets);
1239 time_taken = clock() - start_time;
1240 fprintf(outfile, "Execute time %.3f milliseconds\n",
1241 (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
1242 (double)CLOCKS_PER_SEC);
1243 }
1244
1245 /* If find_match_limit is set, we want to do repeated matches with
1246 varying limits in order to find the minimum value. */
1247
1248 if (find_match_limit)
1249 {
1250 int min = 0;
1251 int mid = 64;
1252 int max = -1;
1253
1254 if (extra == NULL)
1255 {
1256 extra = (pcre_extra *)malloc(sizeof(pcre_extra));
1257 extra->flags = 0;
1258 }
1259 extra->flags |= PCRE_EXTRA_MATCH_LIMIT;
1260
1261 for (;;)
1262 {
1263 extra->match_limit = mid;
1264 count = pcre_exec(re, extra, (char *)bptr, len, start_offset,
1265 options | g_notempty, use_offsets, use_size_offsets);
1266 if (count == PCRE_ERROR_MATCHLIMIT)
1267 {
1268 /* fprintf(outfile, "Testing match limit = %d\n", mid); */
1269 min = mid;
1270 mid = (mid == max - 1)? max : (max > 0)? (min + max)/2 : mid*2;
1271 }
1272 else if (count >= 0 || count == PCRE_ERROR_NOMATCH)
1273 {
1274 if (mid == min + 1)
1275 {
1276 fprintf(outfile, "Minimum match limit = %d\n", mid);
1277 break;
1278 }
1279 /* fprintf(outfile, "Testing match limit = %d\n", mid); */
1280 max = mid;
1281 mid = (min + mid)/2;
1282 }
1283 else break; /* Some other error */
1284 }
1285
1286 extra->flags &= ~PCRE_EXTRA_MATCH_LIMIT;
1287 }
1288
1289 /* If callout_data is set, use the interface with additional data */
1290
1291 else if (callout_data_set)
1292 {
1293 if (extra == NULL)
1294 {
1295 extra = (pcre_extra *)malloc(sizeof(pcre_extra));
1296 extra->flags = 0;
1297 }
1298 extra->flags |= PCRE_EXTRA_CALLOUT_DATA;
1299 extra->callout_data = &callout_data;
1300 count = pcre_exec(re, extra, (char *)bptr, len, start_offset,
1301 options | g_notempty, use_offsets, use_size_offsets);
1302 extra->flags &= ~PCRE_EXTRA_CALLOUT_DATA;
1303 }
1304
1305 /* The normal case is just to do the match once, with the default
1306 value of match_limit. */
1307
1308 else count = pcre_exec(re, extra, (char *)bptr, len,
1309 start_offset, options | g_notempty, use_offsets, use_size_offsets);
1310
1311 if (count == 0)
1312 {
1313 fprintf(outfile, "Matched, but too many substrings\n");
1314 count = use_size_offsets/3;
1315 }
1316
1317 /* Matched */
1318
1319 if (count >= 0)
1320 {
1321 int i;
1322 for (i = 0; i < count * 2; i += 2)
1323 {
1324 if (use_offsets[i] < 0)
1325 fprintf(outfile, "%2d: <unset>\n", i/2);
1326 else
1327 {
1328 fprintf(outfile, "%2d: ", i/2);
1329 (void)pchars(bptr + use_offsets[i],
1330 use_offsets[i+1] - use_offsets[i], outfile);
1331 fprintf(outfile, "\n");
1332 if (i == 0)
1333 {
1334 if (do_showrest)
1335 {
1336 fprintf(outfile, " 0+ ");
1337 (void)pchars(bptr + use_offsets[i+1], len - use_offsets[i+1],
1338 outfile);
1339 fprintf(outfile, "\n");
1340 }
1341 }
1342 }
1343 }
1344
1345 for (i = 0; i < 32; i++)
1346 {
1347 if ((copystrings & (1 << i)) != 0)
1348 {
1349 char copybuffer[16];
1350 int rc = pcre_copy_substring((char *)bptr, use_offsets, count,
1351 i, copybuffer, sizeof(copybuffer));
1352 if (rc < 0)
1353 fprintf(outfile, "copy substring %d failed %d\n", i, rc);
1354 else
1355 fprintf(outfile, "%2dC %s (%d)\n", i, copybuffer, rc);
1356 }
1357 }
1358
1359 for (i = 0; i < 32; i++)
1360 {
1361 if ((getstrings & (1 << i)) != 0)
1362 {
1363 const char *substring;
1364 int rc = pcre_get_substring((char *)bptr, use_offsets, count,
1365 i, &substring);
1366 if (rc < 0)
1367 fprintf(outfile, "get substring %d failed %d\n", i, rc);
1368 else
1369 {
1370 fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
1371 /* free((void *)substring); */
1372 pcre_free_substring(substring);
1373 }
1374 }
1375 }
1376
1377 if (getlist)
1378 {
1379 const char **stringlist;
1380 int rc = pcre_get_substring_list((char *)bptr, use_offsets, count,
1381 &stringlist);
1382 if (rc < 0)
1383 fprintf(outfile, "get substring list failed %d\n", rc);
1384 else
1385 {
1386 for (i = 0; i < count; i++)
1387 fprintf(outfile, "%2dL %s\n", i, stringlist[i]);
1388 if (stringlist[i] != NULL)
1389 fprintf(outfile, "string list not terminated by NULL\n");
1390 /* free((void *)stringlist); */
1391 pcre_free_substring_list(stringlist);
1392 }
1393 }
1394 }
1395
1396 /* Failed to match. If this is a /g or /G loop and we previously set
1397 g_notempty after a null match, this is not necessarily the end.
1398 We want to advance the start offset, and continue. In the case of UTF-8
1399 matching, the advance must be one character, not one byte. Fudge the
1400 offset values to achieve this. We won't be at the end of the string -
1401 that was checked before setting g_notempty. */
1402
1403 else
1404 {
1405 if (g_notempty != 0)
1406 {
1407 int onechar = 1;
1408 use_offsets[0] = start_offset;
1409 if (use_utf8)
1410 {
1411 while (start_offset + onechar < len)
1412 {
1413 int tb = bptr[start_offset+onechar];
1414 if (tb <= 127) break;
1415 tb &= 0xc0;
1416 if (tb != 0 && tb != 0xc0) onechar++;
1417 }
1418 }
1419 use_offsets[1] = start_offset + onechar;
1420 }
1421 else
1422 {
1423 if (count == PCRE_ERROR_NOMATCH)
1424 {
1425 if (gmatched == 0) fprintf(outfile, "No match\n");
1426 }
1427 else fprintf(outfile, "Error %d\n", count);
1428 break; /* Out of the /g loop */
1429 }
1430 }
1431
1432 /* If not /g or /G we are done */
1433
1434 if (!do_g && !do_G) break;
1435
1436 /* If we have matched an empty string, first check to see if we are at
1437 the end of the subject. If so, the /g loop is over. Otherwise, mimic
1438 what Perl's /g option does. This turns out to be rather cunning. First
1439 we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the
1440 same point. If this fails (picked up above) we advance to the next
1441 character. */
1442
1443 g_notempty = 0;
1444 if (use_offsets[0] == use_offsets[1])
1445 {
1446 if (use_offsets[0] == len) break;
1447 g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;
1448 }
1449
1450 /* For /g, update the start offset, leaving the rest alone */
1451
1452 if (do_g) start_offset = use_offsets[1];
1453
1454 /* For /G, update the pointer and length */
1455
1456 else
1457 {
1458 bptr += use_offsets[1];
1459 len -= use_offsets[1];
1460 }
1461 } /* End of loop for /g and /G */
1462 } /* End of loop for data lines */
1463
1464 CONTINUE:
1465
1466 #if !defined NOPOSIX
1467 if (posix || do_posix) regfree(&preg);
1468 #endif
1469
1470 if (re != NULL) free(re);
1471 if (extra != NULL) free(extra);
1472 if (tables != NULL)
1473 {
1474 free((void *)tables);
1475 setlocale(LC_CTYPE, "C");
1476 }
1477 }
1478
1479 if (infile == stdin) fprintf(outfile, "\n");
1480 return 0;
1481 }
1482
1483 /* End */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12