/[pcre]/code/branches/pcre16/pcretest.c
ViewVC logotype

Contents of /code/branches/pcre16/pcretest.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 67 - (show annotations) (download)
Sat Feb 24 21:40:13 2007 UTC (7 years, 6 months ago) by nigel
Original Path: code/trunk/pcretest.c
File MIME type: text/plain
File size: 39234 byte(s)
Load pcre-4.2 into code/trunk.

1 /*************************************************
2 * PCRE testing program *
3 *************************************************/
4
5 /* This program was hacked up as a tester for PCRE. I really should have
6 written it more tidily in the first place. Will I ever learn? It has grown and
7 been extended and consequently is now rather untidy in places. */
8
9 #include <ctype.h>
10 #include <stdio.h>
11 #include <string.h>
12 #include <stdlib.h>
13 #include <time.h>
14 #include <locale.h>
15
16 /* We need the internal info for displaying the results of pcre_study(). Also
17 for getting the opcodes for showing compiled code. */
18
19 #define PCRE_SPY /* For Win32 build, import data, not export */
20 #include "internal.h"
21
22 /* It is possible to compile this test program without including support for
23 testing the POSIX interface, though this is not available via the standard
24 Makefile. */
25
26 #if !defined NOPOSIX
27 #include "pcreposix.h"
28 #endif
29
30 #ifndef CLOCKS_PER_SEC
31 #ifdef CLK_TCK
32 #define CLOCKS_PER_SEC CLK_TCK
33 #else
34 #define CLOCKS_PER_SEC 100
35 #endif
36 #endif
37
38 #define LOOPREPEAT 50000
39
40
41 static FILE *outfile; /* Stream that receives the test output; main() points it at stdout or at the named output file */
42 static int log_store = 0; /* Non-zero => report the code-space memory allocation after a pattern is compiled */
43 static int callout_count; /* Number of callouts seen so far whose number equals callout_fail_id */
44 static int callout_extra; /* Non-zero => callout() also lists the captured substrings */
45 static int callout_fail_count; /* callout() yields 1 once callout_count has reached this value */
46 static int callout_fail_id; /* Callout number that is counted against callout_fail_count */
47 static int first_callout; /* Non-zero until callout() has been run once for the current data line */
48 static int use_utf8; /* Non-zero => pchars() decodes the string it prints as UTF-8 */
49 static size_t gotten_store; /* Size requested in the most recent call to new_malloc() */
50
51
52
53 static int utf8_table1[] = {
54 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff}; /* utf8_table1[i] is the largest character value that can be encoded with i additional bytes */
55
56 static int utf8_table2[] = {
57 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; /* utf8_table2[i] is ORed into the first byte of a character that has i additional bytes */
58
59 static int utf8_table3[] = {
60 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; /* utf8_table3[i] masks the data bits in the first byte of a character that has i additional bytes */
61
62
63
64 /*************************************************
65 * Print compiled regex *
66 *************************************************/
67
68 /* The code for doing this is held in a separate file that is also included in
69 pcre.c when it is compiled with the debug switch. It defines a function called
70 print_internals(), which uses a table of opcode lengths defined by the macro
71 OP_LENGTHS, whose name must be OP_lengths. */
72
73 static uschar OP_lengths[] = { OP_LENGTHS }; /* Opcode length table; print_internals() requires it to have exactly this name */
74
75 #include "printint.c" /* Supplies the definition of print_internals() */
76
77
78
79 /*************************************************
80 * Read number from string *
81 *************************************************/
82
/* We don't use strtoul() because SunOS4 doesn't have it. Rather than mess
around with conditional compilation, just do the job by hand. It is only used
for unpicking the -o argument, so just keep it simple.

Leading white space is skipped and then decimal digits are accumulated.
Conversion stops at the first character that is not a digit, or at the first
digit that would take the result beyond the largest int. In both cases the end
pointer is left at the first unconsumed character, so a caller that insists on
*endptr being zero rejects over-long numbers as well as trailing junk.

Arguments:
  str        string to be converted
  endptr     where to put the end pointer (first unconsumed character)

Returns:     the converted value as a non-negative int (0 if no digits)
*/

static int
get_value(unsigned char *str, unsigned char **endptr)
{
  /* Largest int, obtained without <limits.h>; this assumes that int has no
  padding bits. */
  const int limit = (int)(~0u >> 1);
  int result = 0;

  while (*str != 0 && isspace(*str)) str++;

  while (isdigit(*str)) {
    int digit = *str - '0';

    /* result * 10 + digit would exceed the limit: stop before the signed
    overflow (undefined behaviour) can happen. */
    if (result > (limit - digit) / 10) break;

    result = result * 10 + digit;
    str++;
  }

  *endptr = str;
  return result;
}
103
104
105
106 /*************************************************
107 * Convert character value to UTF-8 *
108 *************************************************/
109
110 /* This function takes an integer value in the range 0 - 0x7fffffff
111 and encodes it as a UTF-8 character in 0 to 6 bytes.
112
113 Arguments:
114 cvalue the character value
115 buffer pointer to buffer for result - at least 6 bytes long
116
117 Returns: number of characters placed in the buffer
118 -1 if input character is negative
119 0 if input character is positive but too big (only when
120 int is longer than 32 bits)
121 */
122
123 static int
124 ord2utf8(int cvalue, unsigned char *buffer)
125 {
126 register int i, j;
127 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
128 if (cvalue <= utf8_table1[i]) break;
129 if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
130 if (cvalue < 0) return -1;
131
132 buffer += i;
133 for (j = i; j > 0; j--)
134 {
135 *buffer-- = 0x80 | (cvalue & 0x3f);
136 cvalue >>= 6;
137 }
138 *buffer = utf8_table2[i] | cvalue;
139 return i + 1;
140 }
141
142
143 /*************************************************
144 * Convert UTF-8 string to value *
145 *************************************************/
146
147 /* This function takes one or more bytes that represents a UTF-8 character,
148 and returns the value of the character.
149
150 Argument:
151 buffer a pointer to the byte vector
152 vptr a pointer to an int to receive the value
153
154 Returns: > 0 => the number of bytes consumed
155 -6 to 0 => malformed UTF-8 character at offset = (-return)
156 */
157
158 static int
159 utf82ord(unsigned char *buffer, int *vptr)
160 {
161 int c = *buffer++;
162 int d = c;
163 int i, j, s;
164
165 for (i = -1; i < 6; i++) /* i is number of additional bytes */
166 {
167 if ((d & 0x80) == 0) break;
168 d <<= 1;
169 }
170
171 if (i == -1) { *vptr = c; return 1; } /* ascii character */
172 if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
173
174 /* i now has a value in the range 1-5 */
175
176 s = 6*i;
177 d = (c & utf8_table3[i]) << s;
178
179 for (j = 0; j < i; j++)
180 {
181 c = *buffer++;
182 if ((c & 0xc0) != 0x80) return -(j+1);
183 s -= 6;
184 d |= (c & 0x3f) << s;
185 }
186
187 /* Check that encoding was the correct unique one */
188
189 for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
190 if (d <= utf8_table1[j]) break;
191 if (j != i) return -(i+1);
192
193 /* Valid value */
194
195 *vptr = d;
196 return i+1;
197 }
198
199
200
201 /*************************************************
202 * Print character string *
203 *************************************************/
204
205 /* Character string printing function. Must handle UTF-8 strings in utf8
206 mode. Yields number of characters printed. If handed a NULL file, just counts
207 chars without printing. */
208
209 static int pchars(unsigned char *p, int length, FILE *f)
210 {
211 int c;
212 int yield = 0;
213
214 while (length-- > 0)
215 {
216 if (use_utf8)
217 {
218 int rc = utf82ord(p, &c);
219
220 if (rc > 0 && rc <= length + 1) /* Mustn't run over the end */
221 {
222 length -= rc - 1;
223 p += rc;
224 if (c < 256 && isprint(c))
225 {
226 if (f != NULL) fprintf(f, "%c", c);
227 yield++;
228 }
229 else
230 {
231 int n;
232 if (f != NULL) fprintf(f, "\\x{%02x}%n", c, &n);
233 yield += n;
234 }
235 continue;
236 }
237 }
238
239 /* Not UTF-8, or malformed UTF-8 */
240
241 if (isprint(c = *(p++)))
242 {
243 if (f != NULL) fprintf(f, "%c", c);
244 yield++;
245 }
246 else
247 {
248 if (f != NULL) fprintf(f, "\\x%02x", c);
249 yield += 4;
250 }
251 }
252
253 return yield;
254 }
255
256
257
258 /*************************************************
259 * Callout function *
260 *************************************************/
261
262 /* Called from PCRE as a result of the (?C) item. We print out where we are in
263 the match. Yield zero unless more callouts than the fail count, or the callout
264 data is not zero. */
265
266 static int callout(pcre_callout_block *cb)
267 {
268 FILE *f = (first_callout | callout_extra)? outfile : NULL;
269 int i, pre_start, post_start;
270
271 if (callout_extra)
272 {
273 fprintf(f, "Callout %d: last capture = %d\n",
274 cb->callout_number, cb->capture_last);
275
276 for (i = 0; i < cb->capture_top * 2; i += 2)
277 {
278 if (cb->offset_vector[i] < 0)
279 fprintf(f, "%2d: <unset>\n", i/2);
280 else
281 {
282 fprintf(f, "%2d: ", i/2);
283 (void)pchars((unsigned char *)cb->subject + cb->offset_vector[i],
284 cb->offset_vector[i+1] - cb->offset_vector[i], f);
285 fprintf(f, "\n");
286 }
287 }
288 }
289
290 /* Re-print the subject in canonical form, the first time or if giving full
291 datails. On subsequent calls in the same match, we use pchars just to find the
292 printed lengths of the substrings. */
293
294 if (f != NULL) fprintf(f, "--->");
295
296 pre_start = pchars((unsigned char *)cb->subject, cb->start_match, f);
297 post_start = pchars((unsigned char *)(cb->subject + cb->start_match),
298 cb->current_position - cb->start_match, f);
299
300 (void)pchars((unsigned char *)(cb->subject + cb->current_position),
301 cb->subject_length - cb->current_position, f);
302
303 if (f != NULL) fprintf(f, "\n");
304
305 /* Always print appropriate indicators, with callout number if not already
306 shown */
307
308 if (callout_extra) fprintf(outfile, " ");
309 else fprintf(outfile, "%3d ", cb->callout_number);
310
311 for (i = 0; i < pre_start; i++) fprintf(outfile, " ");
312 fprintf(outfile, "^");
313
314 if (post_start > 0)
315 {
316 for (i = 0; i < post_start - 1; i++) fprintf(outfile, " ");
317 fprintf(outfile, "^");
318 }
319
320 fprintf(outfile, "\n");
321
322 first_callout = 0;
323
324 if ((int)(cb->callout_data) != 0)
325 {
326 fprintf(outfile, "Callout data = %d\n", (int)(cb->callout_data));
327 return (int)(cb->callout_data);
328 }
329
330 return (cb->callout_number != callout_fail_id)? 0 :
331 (++callout_count >= callout_fail_count)? 1 : 0;
332 }
333
334
335 /*************************************************
336 * Local malloc function *
337 *************************************************/
338
339 /* Alternative malloc function, to test functionality and show the size of the
340 compiled re. */
341
342 static void *new_malloc(size_t size)
343 {
344 gotten_store = size;
345 return malloc(size);
346 }
347
348
349
350 /*************************************************
351 * Call pcre_fullinfo() *
352 *************************************************/
353
354 /* Get one piece of information from the pcre_fullinfo() function */
355
356 static void new_info(pcre *re, pcre_extra *study, int option, void *ptr)
357 {
358 int rc;
359 if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
360 fprintf(outfile, "Error %d from pcre_fullinfo(%d)\n", rc, option);
361 }
362
363
364
365 /*************************************************
366 * Main Program *
367 *************************************************/
368
369 /* Read lines from named file or stdin and write to named file or stdout; lines
370 consist of a regular expression, in delimiters and optionally followed by
371 options, followed by a set of test data, terminated by an empty line. */
372
373 int main(int argc, char **argv)
374 {
375 FILE *infile = stdin;
376 int options = 0;
377 int study_options = 0;
378 int op = 1;
379 int timeit = 0;
380 int showinfo = 0;
381 int showstore = 0;
382 int size_offsets = 45;
383 int size_offsets_max;
384 int *offsets;
385 #if !defined NOPOSIX
386 int posix = 0;
387 #endif
388 int debug = 0;
389 int done = 0;
390 unsigned char buffer[30000];
391 unsigned char dbuffer[1024];
392
393 /* Static so that new_malloc can use it. */
394
395 outfile = stdout;
396
397 /* Scan options */
398
399 while (argc > 1 && argv[op][0] == '-')
400 {
401 unsigned char *endptr;
402
403 if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
404 showstore = 1;
405 else if (strcmp(argv[op], "-t") == 0) timeit = 1;
406 else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
407 else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
408 else if (strcmp(argv[op], "-o") == 0 && argc > 2 &&
409 ((size_offsets = get_value((unsigned char *)argv[op+1], &endptr)),
410 *endptr == 0))
411 {
412 op++;
413 argc--;
414 }
415 #if !defined NOPOSIX
416 else if (strcmp(argv[op], "-p") == 0) posix = 1;
417 #endif
418 else if (strcmp(argv[op], "-C") == 0)
419 {
420 int rc;
421 printf("PCRE version %s\n", pcre_version());
422 printf("Compiled with\n");
423 (void)pcre_config(PCRE_CONFIG_UTF8, &rc);
424 printf(" %sUTF-8 support\n", rc? "" : "No ");
425 (void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
426 printf(" Newline character is %s\n", (rc == '\r')? "CR" : "LF");
427 (void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
428 printf(" Internal link size = %d\n", rc);
429 (void)pcre_config(PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, &rc);
430 printf(" POSIX malloc threshold = %d\n", rc);
431 (void)pcre_config(PCRE_CONFIG_MATCH_LIMIT, &rc);
432 printf(" Default match limit = %d\n", rc);
433 exit(0);
434 }
435 else
436 {
437 printf("** Unknown or malformed option %s\n", argv[op]);
438 printf("Usage: pcretest [-d] [-i] [-o <n>] [-p] [-s] [-t] [<input> [<output>]]\n");
439 printf(" -C show PCRE compile-time options and exit\n");
440 printf(" -d debug: show compiled code; implies -i\n"
441 " -i show information about compiled pattern\n"
442 " -o <n> set size of offsets vector to <n>\n");
443 #if !defined NOPOSIX
444 printf(" -p use POSIX interface\n");
445 #endif
446 printf(" -s output store information\n"
447 " -t time compilation and execution\n");
448 return 1;
449 }
450 op++;
451 argc--;
452 }
453
454 /* Get the store for the offsets vector, and remember what it was */
455
456 size_offsets_max = size_offsets;
457 offsets = malloc(size_offsets_max * sizeof(int));
458 if (offsets == NULL)
459 {
460 printf("** Failed to get %d bytes of memory for offsets vector\n",
461 size_offsets_max * sizeof(int));
462 return 1;
463 }
464
465 /* Sort out the input and output files */
466
467 if (argc > 1)
468 {
469 infile = fopen(argv[op], "r");
470 if (infile == NULL)
471 {
472 printf("** Failed to open %s\n", argv[op]);
473 return 1;
474 }
475 }
476
477 if (argc > 2)
478 {
479 outfile = fopen(argv[op+1], "w");
480 if (outfile == NULL)
481 {
482 printf("** Failed to open %s\n", argv[op+1]);
483 return 1;
484 }
485 }
486
487 /* Set alternative malloc function */
488
489 pcre_malloc = new_malloc;
490
491 /* Heading line, then prompt for first regex if stdin */
492
493 fprintf(outfile, "PCRE version %s\n\n", pcre_version());
494
495 /* Main loop */
496
497 while (!done)
498 {
499 pcre *re = NULL;
500 pcre_extra *extra = NULL;
501
502 #if !defined NOPOSIX /* There are still compilers that require no indent */
503 regex_t preg;
504 int do_posix = 0;
505 #endif
506
507 const char *error;
508 unsigned char *p, *pp, *ppp;
509 const unsigned char *tables = NULL;
510 int do_study = 0;
511 int do_debug = debug;
512 int do_G = 0;
513 int do_g = 0;
514 int do_showinfo = showinfo;
515 int do_showrest = 0;
516 int erroroffset, len, delimiter;
517
518 use_utf8 = 0;
519
520 if (infile == stdin) printf(" re> ");
521 if (fgets((char *)buffer, sizeof(buffer), infile) == NULL) break;
522 if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
523 fflush(outfile);
524
525 p = buffer;
526 while (isspace(*p)) p++;
527 if (*p == 0) continue;
528
529 /* Get the delimiter and seek the end of the pattern; if it isn't
530 complete, read more. */
531
532 delimiter = *p++;
533
534 if (isalnum(delimiter) || delimiter == '\\')
535 {
536 fprintf(outfile, "** Delimiter must not be alphameric or \\\n");
537 goto SKIP_DATA;
538 }
539
540 pp = p;
541
542 for(;;)
543 {
544 while (*pp != 0)
545 {
546 if (*pp == '\\' && pp[1] != 0) pp++;
547 else if (*pp == delimiter) break;
548 pp++;
549 }
550 if (*pp != 0) break;
551
552 len = sizeof(buffer) - (pp - buffer);
553 if (len < 256)
554 {
555 fprintf(outfile, "** Expression too long - missing delimiter?\n");
556 goto SKIP_DATA;
557 }
558
559 if (infile == stdin) printf(" > ");
560 if (fgets((char *)pp, len, infile) == NULL)
561 {
562 fprintf(outfile, "** Unexpected EOF\n");
563 done = 1;
564 goto CONTINUE;
565 }
566 if (infile != stdin) fprintf(outfile, "%s", (char *)pp);
567 }
568
569 /* If the first character after the delimiter is backslash, make
570 the pattern end with backslash. This is purely to provide a way
571 of testing for the error message when a pattern ends with backslash. */
572
573 if (pp[1] == '\\') *pp++ = '\\';
574
575 /* Terminate the pattern at the delimiter */
576
577 *pp++ = 0;
578
579 /* Look for options after final delimiter */
580
581 options = 0;
582 study_options = 0;
583 log_store = showstore; /* default from command line */
584
585 while (*pp != 0)
586 {
587 switch (*pp++)
588 {
589 case 'g': do_g = 1; break;
590 case 'i': options |= PCRE_CASELESS; break;
591 case 'm': options |= PCRE_MULTILINE; break;
592 case 's': options |= PCRE_DOTALL; break;
593 case 'x': options |= PCRE_EXTENDED; break;
594
595 case '+': do_showrest = 1; break;
596 case 'A': options |= PCRE_ANCHORED; break;
597 case 'D': do_debug = do_showinfo = 1; break;
598 case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
599 case 'G': do_G = 1; break;
600 case 'I': do_showinfo = 1; break;
601 case 'M': log_store = 1; break;
602 case 'N': options |= PCRE_NO_AUTO_CAPTURE; break;
603
604 #if !defined NOPOSIX
605 case 'P': do_posix = 1; break;
606 #endif
607
608 case 'S': do_study = 1; break;
609 case 'U': options |= PCRE_UNGREEDY; break;
610 case 'X': options |= PCRE_EXTRA; break;
611 case '8': options |= PCRE_UTF8; use_utf8 = 1; break;
612
613 case 'L':
614 ppp = pp;
615 while (*ppp != '\n' && *ppp != ' ') ppp++;
616 *ppp = 0;
617 if (setlocale(LC_CTYPE, (const char *)pp) == NULL)
618 {
619 fprintf(outfile, "** Failed to set locale \"%s\"\n", pp);
620 goto SKIP_DATA;
621 }
622 tables = pcre_maketables();
623 pp = ppp;
624 break;
625
626 case '\n': case ' ': break;
627 default:
628 fprintf(outfile, "** Unknown option '%c'\n", pp[-1]);
629 goto SKIP_DATA;
630 }
631 }
632
633 /* Handle compiling via the POSIX interface, which doesn't support the
634 timing, showing, or debugging options, nor the ability to pass over
635 local character tables. */
636
637 #if !defined NOPOSIX
638 if (posix || do_posix)
639 {
640 int rc;
641 int cflags = 0;
642 if ((options & PCRE_CASELESS) != 0) cflags |= REG_ICASE;
643 if ((options & PCRE_MULTILINE) != 0) cflags |= REG_NEWLINE;
644 rc = regcomp(&preg, (char *)p, cflags);
645
646 /* Compilation failed; go back for another re, skipping to blank line
647 if non-interactive. */
648
649 if (rc != 0)
650 {
651 (void)regerror(rc, &preg, (char *)buffer, sizeof(buffer));
652 fprintf(outfile, "Failed: POSIX code %d: %s\n", rc, buffer);
653 goto SKIP_DATA;
654 }
655 }
656
657 /* Handle compiling via the native interface */
658
659 else
660 #endif /* !defined NOPOSIX */
661
662 {
663 if (timeit)
664 {
665 register int i;
666 clock_t time_taken;
667 clock_t start_time = clock();
668 for (i = 0; i < LOOPREPEAT; i++)
669 {
670 re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
671 if (re != NULL) free(re);
672 }
673 time_taken = clock() - start_time;
674 fprintf(outfile, "Compile time %.3f milliseconds\n",
675 (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
676 (double)CLOCKS_PER_SEC);
677 }
678
679 re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
680
681 /* Compilation failed; go back for another re, skipping to blank line
682 if non-interactive. */
683
684 if (re == NULL)
685 {
686 fprintf(outfile, "Failed: %s at offset %d\n", error, erroroffset);
687 SKIP_DATA:
688 if (infile != stdin)
689 {
690 for (;;)
691 {
692 if (fgets((char *)buffer, sizeof(buffer), infile) == NULL)
693 {
694 done = 1;
695 goto CONTINUE;
696 }
697 len = (int)strlen((char *)buffer);
698 while (len > 0 && isspace(buffer[len-1])) len--;
699 if (len == 0) break;
700 }
701 fprintf(outfile, "\n");
702 }
703 goto CONTINUE;
704 }
705
706 /* Compilation succeeded; print data if required. There are now two
707 info-returning functions. The old one has a limited interface and
708 returns only limited data. Check that it agrees with the newer one. */
709
710 if (log_store)
711 fprintf(outfile, "Memory allocation (code space): %d\n",
712 (int)(gotten_store -
713 sizeof(real_pcre) -
714 ((real_pcre *)re)->name_count * ((real_pcre *)re)->name_entry_size));
715
716 if (do_showinfo)
717 {
718 unsigned long int get_options;
719 int old_first_char, old_options, old_count;
720 int count, backrefmax, first_char, need_char;
721 int nameentrysize, namecount;
722 const uschar *nametable;
723 size_t size;
724
725 if (do_debug)
726 {
727 fprintf(outfile, "------------------------------------------------------------------\n");
728 print_internals(re, outfile);
729 }
730
731 new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
732 new_info(re, NULL, PCRE_INFO_SIZE, &size);
733 new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
734 new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax);
735 new_info(re, NULL, PCRE_INFO_FIRSTBYTE, &first_char);
736 new_info(re, NULL, PCRE_INFO_LASTLITERAL, &need_char);
737 new_info(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
738 new_info(re, NULL, PCRE_INFO_NAMECOUNT, &namecount);
739 new_info(re, NULL, PCRE_INFO_NAMETABLE, (void *)&nametable);
740
741 old_count = pcre_info(re, &old_options, &old_first_char);
742 if (count < 0) fprintf(outfile,
743 "Error %d from pcre_info()\n", count);
744 else
745 {
746 if (old_count != count) fprintf(outfile,
747 "Count disagreement: pcre_fullinfo=%d pcre_info=%d\n", count,
748 old_count);
749
750 if (old_first_char != first_char) fprintf(outfile,
751 "First char disagreement: pcre_fullinfo=%d pcre_info=%d\n",
752 first_char, old_first_char);
753
754 if (old_options != (int)get_options) fprintf(outfile,
755 "Options disagreement: pcre_fullinfo=%ld pcre_info=%d\n",
756 get_options, old_options);
757 }
758
759 if (size != gotten_store) fprintf(outfile,
760 "Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
761 size, gotten_store);
762
763 fprintf(outfile, "Capturing subpattern count = %d\n", count);
764 if (backrefmax > 0)
765 fprintf(outfile, "Max back reference = %d\n", backrefmax);
766
767 if (namecount > 0)
768 {
769 fprintf(outfile, "Named capturing subpatterns:\n");
770 while (namecount-- > 0)
771 {
772 fprintf(outfile, " %s %*s%3d\n", nametable + 2,
773 nameentrysize - 3 - (int)strlen((char *)nametable + 2), "",
774 GET2(nametable, 0));
775 nametable += nameentrysize;
776 }
777 }
778
779 if (get_options == 0) fprintf(outfile, "No options\n");
780 else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s\n",
781 ((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
782 ((get_options & PCRE_CASELESS) != 0)? " caseless" : "",
783 ((get_options & PCRE_EXTENDED) != 0)? " extended" : "",
784 ((get_options & PCRE_MULTILINE) != 0)? " multiline" : "",
785 ((get_options & PCRE_DOTALL) != 0)? " dotall" : "",
786 ((get_options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
787 ((get_options & PCRE_EXTRA) != 0)? " extra" : "",
788 ((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
789 ((get_options & PCRE_UTF8) != 0)? " utf8" : "");
790
791 if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
792 fprintf(outfile, "Case state changes\n");
793
794 if (first_char == -1)
795 {
796 fprintf(outfile, "First char at start or follows \\n\n");
797 }
798 else if (first_char < 0)
799 {
800 fprintf(outfile, "No first char\n");
801 }
802 else
803 {
804 int ch = first_char & 255;
805 const char *caseless = ((first_char & REQ_CASELESS) == 0)?
806 "" : " (caseless)";
807 if (isprint(ch))
808 fprintf(outfile, "First char = \'%c\'%s\n", ch, caseless);
809 else
810 fprintf(outfile, "First char = %d%s\n", ch, caseless);
811 }
812
813 if (need_char < 0)
814 {
815 fprintf(outfile, "No need char\n");
816 }
817 else
818 {
819 int ch = need_char & 255;
820 const char *caseless = ((need_char & REQ_CASELESS) == 0)?
821 "" : " (caseless)";
822 if (isprint(ch))
823 fprintf(outfile, "Need char = \'%c\'%s\n", ch, caseless);
824 else
825 fprintf(outfile, "Need char = %d%s\n", ch, caseless);
826 }
827 }
828
829 /* If /S was present, study the regexp to generate additional info to
830 help with the matching. */
831
832 if (do_study)
833 {
834 if (timeit)
835 {
836 register int i;
837 clock_t time_taken;
838 clock_t start_time = clock();
839 for (i = 0; i < LOOPREPEAT; i++)
840 extra = pcre_study(re, study_options, &error);
841 time_taken = clock() - start_time;
842 if (extra != NULL) free(extra);
843 fprintf(outfile, " Study time %.3f milliseconds\n",
844 (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
845 (double)CLOCKS_PER_SEC);
846 }
847
848 extra = pcre_study(re, study_options, &error);
849 if (error != NULL)
850 fprintf(outfile, "Failed to study: %s\n", error);
851 else if (extra == NULL)
852 fprintf(outfile, "Study returned NULL\n");
853
854 else if (do_showinfo)
855 {
856 size_t size;
857 uschar *start_bits = NULL;
858 new_info(re, extra, PCRE_INFO_STUDYSIZE, &size);
859 new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
860 fprintf(outfile, "Study size = %d\n", size);
861 if (start_bits == NULL)
862 fprintf(outfile, "No starting character set\n");
863 else
864 {
865 int i;
866 int c = 24;
867 fprintf(outfile, "Starting character set: ");
868 for (i = 0; i < 256; i++)
869 {
870 if ((start_bits[i/8] & (1<<(i%8))) != 0)
871 {
872 if (c > 75)
873 {
874 fprintf(outfile, "\n ");
875 c = 2;
876 }
877 if (isprint(i) && i != ' ')
878 {
879 fprintf(outfile, "%c ", i);
880 c += 2;
881 }
882 else
883 {
884 fprintf(outfile, "\\x%02x ", i);
885 c += 5;
886 }
887 }
888 }
889 fprintf(outfile, "\n");
890 }
891 }
892 }
893 }
894
895 /* Read data lines and test them */
896
897 for (;;)
898 {
899 unsigned char *q;
900 unsigned char *bptr = dbuffer;
901 int *use_offsets = offsets;
902 int use_size_offsets = size_offsets;
903 int callout_data = 0;
904 int callout_data_set = 0;
905 int count, c;
906 int copystrings = 0;
907 int find_match_limit = 0;
908 int getstrings = 0;
909 int getlist = 0;
910 int gmatched = 0;
911 int start_offset = 0;
912 int g_notempty = 0;
913
914 options = 0;
915
916 pcre_callout = callout;
917 first_callout = 1;
918 callout_extra = 0;
919 callout_count = 0;
920 callout_fail_count = 999999;
921 callout_fail_id = -1;
922
923 if (infile == stdin) printf("data> ");
924 if (fgets((char *)buffer, sizeof(buffer), infile) == NULL)
925 {
926 done = 1;
927 goto CONTINUE;
928 }
929 if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
930
931 len = (int)strlen((char *)buffer);
932 while (len > 0 && isspace(buffer[len-1])) len--;
933 buffer[len] = 0;
934 if (len == 0) break;
935
936 p = buffer;
937 while (isspace(*p)) p++;
938
939 q = dbuffer;
940 while ((c = *p++) != 0)
941 {
942 int i = 0;
943 int n = 0;
944
945 if (c == '\\') switch ((c = *p++))
946 {
947 case 'a': c = 7; break;
948 case 'b': c = '\b'; break;
949 case 'e': c = 27; break;
950 case 'f': c = '\f'; break;
951 case 'n': c = '\n'; break;
952 case 'r': c = '\r'; break;
953 case 't': c = '\t'; break;
954 case 'v': c = '\v'; break;
955
956 case '0': case '1': case '2': case '3':
957 case '4': case '5': case '6': case '7':
958 c -= '0';
959 while (i++ < 2 && isdigit(*p) && *p != '8' && *p != '9')
960 c = c * 8 + *p++ - '0';
961 break;
962
963 case 'x':
964
965 /* Handle \x{..} specially - new Perl thing for utf8 */
966
967 if (*p == '{')
968 {
969 unsigned char *pt = p;
970 c = 0;
971 while (isxdigit(*(++pt)))
972 c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'W');
973 if (*pt == '}')
974 {
975 unsigned char buff8[8];
976 int ii, utn;
977 utn = ord2utf8(c, buff8);
978 for (ii = 0; ii < utn - 1; ii++) *q++ = buff8[ii];
979 c = buff8[ii]; /* Last byte */
980 p = pt + 1;
981 break;
982 }
983 /* Not correct form; fall through */
984 }
985
986 /* Ordinary \x */
987
988 c = 0;
989 while (i++ < 2 && isxdigit(*p))
990 {
991 c = c * 16 + tolower(*p) - ((isdigit(*p))? '0' : 'W');
992 p++;
993 }
994 break;
995
996 case 0: /* Allows for an empty line */
997 p--;
998 continue;
999
1000 case 'A': /* Option setting */
1001 options |= PCRE_ANCHORED;
1002 continue;
1003
1004 case 'B':
1005 options |= PCRE_NOTBOL;
1006 continue;
1007
1008 case 'C':
1009 if (isdigit(*p)) /* Set copy string */
1010 {
1011 while(isdigit(*p)) n = n * 10 + *p++ - '0';
1012 copystrings |= 1 << n;
1013 }
1014 else if (isalnum(*p))
1015 {
1016 uschar name[256];
1017 uschar *npp = name;
1018 while (isalnum(*p)) *npp++ = *p++;
1019 *npp = 0;
1020 n = pcre_get_stringnumber(re, (char *)name);
1021 if (n < 0)
1022 fprintf(outfile, "no parentheses with name \"%s\"\n", name);
1023 else copystrings |= 1 << n;
1024 }
1025 else if (*p == '+')
1026 {
1027 callout_extra = 1;
1028 p++;
1029 }
1030 else if (*p == '-')
1031 {
1032 pcre_callout = NULL;
1033 p++;
1034 }
1035 else if (*p == '!')
1036 {
1037 callout_fail_id = 0;
1038 p++;
1039 while(isdigit(*p))
1040 callout_fail_id = callout_fail_id * 10 + *p++ - '0';
1041 callout_fail_count = 0;
1042 if (*p == '!')
1043 {
1044 p++;
1045 while(isdigit(*p))
1046 callout_fail_count = callout_fail_count * 10 + *p++ - '0';
1047 }
1048 }
1049 else if (*p == '*')
1050 {
1051 int sign = 1;
1052 callout_data = 0;
1053 if (*(++p) == '-') { sign = -1; p++; }
1054 while(isdigit(*p))
1055 callout_data = callout_data * 10 + *p++ - '0';
1056 callout_data *= sign;
1057 callout_data_set = 1;
1058 }
1059 continue;
1060
1061 case 'G':
1062 if (isdigit(*p))
1063 {
1064 while(isdigit(*p)) n = n * 10 + *p++ - '0';
1065 getstrings |= 1 << n;
1066 }
1067 else if (isalnum(*p))
1068 {
1069 uschar name[256];
1070 uschar *npp = name;
1071 while (isalnum(*p)) *npp++ = *p++;
1072 *npp = 0;
1073 n = pcre_get_stringnumber(re, (char *)name);
1074 if (n < 0)
1075 fprintf(outfile, "no parentheses with name \"%s\"\n", name);
1076 else getstrings |= 1 << n;
1077 }
1078 continue;
1079
1080 case 'L':
1081 getlist = 1;
1082 continue;
1083
1084 case 'M':
1085 find_match_limit = 1;
1086 continue;
1087
1088 case 'N':
1089 options |= PCRE_NOTEMPTY;
1090 continue;
1091
1092 case 'O':
1093 while(isdigit(*p)) n = n * 10 + *p++ - '0';
1094 if (n > size_offsets_max)
1095 {
1096 size_offsets_max = n;
1097 free(offsets);
1098 use_offsets = offsets = malloc(size_offsets_max * sizeof(int));
1099 if (offsets == NULL)
1100 {
1101 printf("** Failed to get %d bytes of memory for offsets vector\n",
1102 size_offsets_max * sizeof(int));
1103 return 1;
1104 }
1105 }
1106 use_size_offsets = n;
1107 if (n == 0) use_offsets = NULL; /* Ensures it can't write to it */
1108 continue;
1109
1110 case 'Z':
1111 options |= PCRE_NOTEOL;
1112 continue;
1113 }
1114 *q++ = c;
1115 }
1116 *q = 0;
1117 len = q - dbuffer;
1118
1119 /* Handle matching via the POSIX interface, which does not
1120 support timing or playing with the match limit or callout data. */
1121
1122 #if !defined NOPOSIX
1123 if (posix || do_posix)
1124 {
1125 int rc;
1126 int eflags = 0;
1127 regmatch_t *pmatch = NULL;
1128 if (use_size_offsets > 0)
1129 pmatch = malloc(sizeof(regmatch_t) * use_size_offsets);
1130 if ((options & PCRE_NOTBOL) != 0) eflags |= REG_NOTBOL;
1131 if ((options & PCRE_NOTEOL) != 0) eflags |= REG_NOTEOL;
1132
1133 rc = regexec(&preg, (const char *)bptr, use_size_offsets, pmatch, eflags);
1134
1135 if (rc != 0)
1136 {
1137 (void)regerror(rc, &preg, (char *)buffer, sizeof(buffer));
1138 fprintf(outfile, "No match: POSIX code %d: %s\n", rc, buffer);
1139 }
1140 else
1141 {
1142 size_t i;
1143 for (i = 0; i < (size_t)use_size_offsets; i++)
1144 {
1145 if (pmatch[i].rm_so >= 0)
1146 {
1147 fprintf(outfile, "%2d: ", (int)i);
1148 (void)pchars(dbuffer + pmatch[i].rm_so,
1149 pmatch[i].rm_eo - pmatch[i].rm_so, outfile);
1150 fprintf(outfile, "\n");
1151 if (i == 0 && do_showrest)
1152 {
1153 fprintf(outfile, " 0+ ");
1154 (void)pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo,
1155 outfile);
1156 fprintf(outfile, "\n");
1157 }
1158 }
1159 }
1160 }
1161 free(pmatch);
1162 }
1163
1164 /* Handle matching via the native interface - repeats for /g and /G */
1165
1166 else
1167 #endif /* !defined NOPOSIX */
1168
1169 for (;; gmatched++) /* Loop for /g or /G */
1170 {
1171 if (timeit)
1172 {
1173 register int i;
1174 clock_t time_taken;
1175 clock_t start_time = clock();
1176 for (i = 0; i < LOOPREPEAT; i++)
1177 count = pcre_exec(re, extra, (char *)bptr, len,
1178 start_offset, options | g_notempty, use_offsets, use_size_offsets);
1179 time_taken = clock() - start_time;
1180 fprintf(outfile, "Execute time %.3f milliseconds\n",
1181 (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
1182 (double)CLOCKS_PER_SEC);
1183 }
1184
1185 /* If find_match_limit is set, we want to do repeated matches with
1186 varying limits in order to find the minimum value. */
1187
1188 if (find_match_limit)
1189 {
1190 int min = 0;
1191 int mid = 64;
1192 int max = -1;
1193
1194 if (extra == NULL)
1195 {
1196 extra = malloc(sizeof(pcre_extra));
1197 extra->flags = 0;
1198 }
1199 extra->flags |= PCRE_EXTRA_MATCH_LIMIT;
1200
1201 for (;;)
1202 {
1203 extra->match_limit = mid;
1204 count = pcre_exec(re, extra, (char *)bptr, len, start_offset,
1205 options | g_notempty, use_offsets, use_size_offsets);
1206 if (count == PCRE_ERROR_MATCHLIMIT)
1207 {
1208 /* fprintf(outfile, "Testing match limit = %d\n", mid); */
1209 min = mid;
1210 mid = (mid == max - 1)? max : (max > 0)? (min + max)/2 : mid*2;
1211 }
1212 else if (count >= 0 || count == PCRE_ERROR_NOMATCH)
1213 {
1214 if (mid == min + 1)
1215 {
1216 fprintf(outfile, "Minimum match limit = %d\n", mid);
1217 break;
1218 }
1219 /* fprintf(outfile, "Testing match limit = %d\n", mid); */
1220 max = mid;
1221 mid = (min + mid)/2;
1222 }
1223 else break; /* Some other error */
1224 }
1225
1226 extra->flags &= ~PCRE_EXTRA_MATCH_LIMIT;
1227 }
1228
1229 /* If callout_data is set, use the interface with additional data */
1230
1231 else if (callout_data_set)
1232 {
1233 if (extra == NULL)
1234 {
1235 extra = malloc(sizeof(pcre_extra));
1236 extra->flags = 0;
1237 }
1238 extra->flags |= PCRE_EXTRA_CALLOUT_DATA;
1239 extra->callout_data = (void *)callout_data;
1240 count = pcre_exec(re, extra, (char *)bptr, len, start_offset,
1241 options | g_notempty, use_offsets, use_size_offsets);
1242 extra->flags &= ~PCRE_EXTRA_CALLOUT_DATA;
1243 }
1244
1245 /* The normal case is just to do the match once, with the default
1246 value of match_limit. */
1247
1248 else count = pcre_exec(re, extra, (char *)bptr, len,
1249 start_offset, options | g_notempty, use_offsets, use_size_offsets);
1250
1251 if (count == 0)
1252 {
1253 fprintf(outfile, "Matched, but too many substrings\n");
1254 count = use_size_offsets/3;
1255 }
1256
1257 /* Matched */
1258
1259 if (count >= 0)
1260 {
1261 int i;
1262 for (i = 0; i < count * 2; i += 2)
1263 {
1264 if (use_offsets[i] < 0)
1265 fprintf(outfile, "%2d: <unset>\n", i/2);
1266 else
1267 {
1268 fprintf(outfile, "%2d: ", i/2);
1269 (void)pchars(bptr + use_offsets[i],
1270 use_offsets[i+1] - use_offsets[i], outfile);
1271 fprintf(outfile, "\n");
1272 if (i == 0)
1273 {
1274 if (do_showrest)
1275 {
1276 fprintf(outfile, " 0+ ");
1277 (void)pchars(bptr + use_offsets[i+1], len - use_offsets[i+1],
1278 outfile);
1279 fprintf(outfile, "\n");
1280 }
1281 }
1282 }
1283 }
1284
1285 for (i = 0; i < 32; i++)
1286 {
1287 if ((copystrings & (1 << i)) != 0)
1288 {
1289 char copybuffer[16];
1290 int rc = pcre_copy_substring((char *)bptr, use_offsets, count,
1291 i, copybuffer, sizeof(copybuffer));
1292 if (rc < 0)
1293 fprintf(outfile, "copy substring %d failed %d\n", i, rc);
1294 else
1295 fprintf(outfile, "%2dC %s (%d)\n", i, copybuffer, rc);
1296 }
1297 }
1298
1299 for (i = 0; i < 32; i++)
1300 {
1301 if ((getstrings & (1 << i)) != 0)
1302 {
1303 const char *substring;
1304 int rc = pcre_get_substring((char *)bptr, use_offsets, count,
1305 i, &substring);
1306 if (rc < 0)
1307 fprintf(outfile, "get substring %d failed %d\n", i, rc);
1308 else
1309 {
1310 fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
1311 /* free((void *)substring); */
1312 pcre_free_substring(substring);
1313 }
1314 }
1315 }
1316
1317 if (getlist)
1318 {
1319 const char **stringlist;
1320 int rc = pcre_get_substring_list((char *)bptr, use_offsets, count,
1321 &stringlist);
1322 if (rc < 0)
1323 fprintf(outfile, "get substring list failed %d\n", rc);
1324 else
1325 {
1326 for (i = 0; i < count; i++)
1327 fprintf(outfile, "%2dL %s\n", i, stringlist[i]);
1328 if (stringlist[i] != NULL)
1329 fprintf(outfile, "string list not terminated by NULL\n");
1330 /* free((void *)stringlist); */
1331 pcre_free_substring_list(stringlist);
1332 }
1333 }
1334 }
1335
1336 /* Failed to match. If this is a /g or /G loop and we previously set
1337 g_notempty after a null match, this is not necessarily the end.
1338 We want to advance the start offset, and continue. Fudge the offset
1339 values to achieve this. We won't be at the end of the string - that
1340 was checked before setting g_notempty. */
1341
1342 else
1343 {
1344 if (g_notempty != 0)
1345 {
1346 use_offsets[0] = start_offset;
1347 use_offsets[1] = start_offset + 1;
1348 }
1349 else
1350 {
1351 if (gmatched == 0) /* Error if no previous matches */
1352 {
1353 if (count == -1) fprintf(outfile, "No match\n");
1354 else fprintf(outfile, "Error %d\n", count);
1355 }
1356 break; /* Out of the /g loop */
1357 }
1358 }
1359
1360 /* If not /g or /G we are done */
1361
1362 if (!do_g && !do_G) break;
1363
1364 /* If we have matched an empty string, first check to see if we are at
1365 the end of the subject. If so, the /g loop is over. Otherwise, mimic
1366 what Perl's /g option does. This turns out to be rather cunning. First
1367 we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the
1368 same point. If this fails (picked up above) we advance to the next
1369 character. */
1370
1371 g_notempty = 0;
1372 if (use_offsets[0] == use_offsets[1])
1373 {
1374 if (use_offsets[0] == len) break;
1375 g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;
1376 }
1377
1378 /* For /g, update the start offset, leaving the rest alone */
1379
1380 if (do_g) start_offset = use_offsets[1];
1381
1382 /* For /G, update the pointer and length */
1383
1384 else
1385 {
1386 bptr += use_offsets[1];
1387 len -= use_offsets[1];
1388 }
1389 } /* End of loop for /g and /G */
1390 } /* End of loop for data lines */
1391
1392 CONTINUE:
1393
1394 #if !defined NOPOSIX
1395 if (posix || do_posix) regfree(&preg);
1396 #endif
1397
1398 if (re != NULL) free(re);
1399 if (extra != NULL) free(extra);
1400 if (tables != NULL)
1401 {
1402 free((void *)tables);
1403 setlocale(LC_CTYPE, "C");
1404 }
1405 }
1406
1407 fprintf(outfile, "\n");
1408 return 0;
1409 }
1410
1411 /* End */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12