/[pcre]/code/trunk/pcretest.c
ViewVC logotype

Contents of /code/trunk/pcretest.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 55 - (show annotations) (download)
Sat Feb 24 21:39:46 2007 UTC (7 years, 9 months ago) by nigel
File MIME type: text/plain
File size: 33746 byte(s)
Load pcre-3.6 into code/trunk.

1 /*************************************************
2 * PCRE testing program *
3 *************************************************/
4
5 #include <ctype.h>
6 #include <stdio.h>
7 #include <string.h>
8 #include <stdlib.h>
9 #include <time.h>
10 #include <locale.h>
11
12 /* Use the internal info for displaying the results of pcre_study(). */
13
14 #include "internal.h"
15
16 /* It is possible to compile this test program without including support for
17 testing the POSIX interface, though this is not available via the standard
18 Makefile. */
19
20 #if !defined NOPOSIX
21 #include "pcreposix.h"
22 #endif
23
24 #ifndef CLOCKS_PER_SEC
25 #ifdef CLK_TCK
26 #define CLOCKS_PER_SEC CLK_TCK
27 #else
28 #define CLOCKS_PER_SEC 100
29 #endif
30 #endif
31
32 #define LOOPREPEAT 20000
33
34
35 static FILE *outfile;
36 static int log_store = 0;
37 static size_t gotten_store;
38
39
40
/* Tables used by the UTF-8 conversion functions below. Each is indexed by the
number of additional (continuation) bytes in the encoded character, 0 to 5.

  utf8_table1  largest character value that fits in that many bytes
  utf8_table2  marker bits that are set in the leading byte
  utf8_table3  mask for the data bits held in the leading byte
*/

static const int utf8_table1[] = {
  0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};

static const int utf8_table2[] = {
  0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};

static const int utf8_table3[] = {
  0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};


/*************************************************
*       Convert character value to UTF-8         *
*************************************************/

/* This function takes an integer value in the range 0 - 0x7fffffff
and encodes it as a UTF-8 character in 1 to 6 bytes. The most significant
bits go into the leading byte and each following byte carries the next six
bits, in descending order of significance.

Arguments:
  cvalue     the character value
  buffer     pointer to buffer for result - at least 6 bytes long

Returns:     number of bytes placed in the buffer
             -1 if input character is negative
              0 if input character is positive but too big (only when
                int is longer than 32 bits)
*/

static int
ord2utf8(int cvalue, unsigned char *buffer)
{
const int table_size = (int)(sizeof(utf8_table1)/sizeof(utf8_table1[0]));
int i, j;

if (cvalue < 0) return -1;

/* Find the shortest encoding that can hold the value */

for (i = 0; i < table_size; i++)
  if (cvalue <= utf8_table1[i]) break;
if (i >= table_size) return 0;

/* Fill in the continuation bytes from the last one backwards, taking six
bits at a time from the low end of the value. Whatever is left over belongs
in the leading byte. */

buffer += i;
for (j = i; j > 0; j--)
  {
  *buffer-- = (unsigned char)(0x80 | (cvalue & 0x3f));
  cvalue >>= 6;
  }
*buffer = (unsigned char)(utf8_table2[i] | cvalue);
return i + 1;
}


/*************************************************
*         Convert UTF-8 string to value          *
*************************************************/

/* This function takes one or more bytes that represent a UTF-8 character,
and returns the value of the character. The bytes are read starting at buffer;
reading stops at the first byte that is not a valid continuation byte, so a
zero-terminated buffer is never overrun.

Arguments:
  buffer   a pointer to the byte vector
  vptr     a pointer to an int to receive the value; it is written only
             when the return value is greater than zero

Returns:   >  0 => the number of bytes consumed
           -6 to 0 => malformed UTF-8 character at offset = (-return)
*/

int
utf82ord(unsigned char *buffer, int *vptr)
{
int c = *buffer++;
int d = c;
int i, j, s;

/* Count the leading one bits of the first byte */

for (i = -1; i < 6; i++)               /* i is number of additional bytes */
  {
  if ((d & 0x80) == 0) break;
  d <<= 1;
  }

if (i == -1) { *vptr = c; return 1; }  /* ascii character */
if (i == 0 || i == 6) return 0;        /* invalid UTF-8 */

/* i now has a value in the range 1-5. The leading byte holds the most
significant bits; each continuation byte supplies the next six bits down. */

s = 6*i;
d = (c & utf8_table3[i]) << s;

for (j = 0; j < i; j++)
  {
  c = *buffer++;
  if ((c & 0xc0) != 0x80) return -(j+1);
  s -= 6;
  d |= (c & 0x3f) << s;
  }

/* Check that encoding was the correct unique one: the value must need
exactly this many bytes, no fewer. */

for (j = 0; j < (int)(sizeof(utf8_table1)/sizeof(utf8_table1[0])); j++)
  if (d <= utf8_table1[j]) break;
if (j != i) return -(i+1);

/* Valid value */

*vptr = d;
return i+1;
}
142
143
144
145
146
147
148 /* Debugging function to print the internal form of the regex. This is the same
149 code as contained in pcre.c under the DEBUG macro. */
150
151 static const char *OP_names[] = {
152 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
153 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
154 "Opt", "^", "$", "Any", "chars", "not",
155 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
156 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
157 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
158 "*", "*?", "+", "+?", "?", "??", "{", "{",
159 "class", "Ref", "Recurse",
160 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
161 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
162 "Brazero", "Braminzero", "Branumber", "Bra"
163 };
164
165
166 static void print_internals(pcre *re)
167 {
168 unsigned char *code = ((real_pcre *)re)->code;
169
170 fprintf(outfile, "------------------------------------------------------------------\n");
171
172 for(;;)
173 {
174 int c;
175 int charlength;
176
177 fprintf(outfile, "%3d ", (int)(code - ((real_pcre *)re)->code));
178
179 if (*code >= OP_BRA)
180 {
181 if (*code - OP_BRA > EXTRACT_BASIC_MAX)
182 fprintf(outfile, "%3d Bra extra", (code[1] << 8) + code[2]);
183 else
184 fprintf(outfile, "%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
185 code += 2;
186 }
187
188 else switch(*code)
189 {
190 case OP_END:
191 fprintf(outfile, " %s\n", OP_names[*code]);
192 fprintf(outfile, "------------------------------------------------------------------\n");
193 return;
194
195 case OP_OPT:
196 fprintf(outfile, " %.2x %s", code[1], OP_names[*code]);
197 code++;
198 break;
199
200 case OP_CHARS:
201 charlength = *(++code);
202 fprintf(outfile, "%3d ", charlength);
203 while (charlength-- > 0)
204 if (isprint(c = *(++code))) fprintf(outfile, "%c", c);
205 else fprintf(outfile, "\\x%02x", c);
206 break;
207
208 case OP_KETRMAX:
209 case OP_KETRMIN:
210 case OP_ALT:
211 case OP_KET:
212 case OP_ASSERT:
213 case OP_ASSERT_NOT:
214 case OP_ASSERTBACK:
215 case OP_ASSERTBACK_NOT:
216 case OP_ONCE:
217 case OP_COND:
218 case OP_BRANUMBER:
219 case OP_REVERSE:
220 case OP_CREF:
221 fprintf(outfile, "%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
222 code += 2;
223 break;
224
225 case OP_STAR:
226 case OP_MINSTAR:
227 case OP_PLUS:
228 case OP_MINPLUS:
229 case OP_QUERY:
230 case OP_MINQUERY:
231 case OP_TYPESTAR:
232 case OP_TYPEMINSTAR:
233 case OP_TYPEPLUS:
234 case OP_TYPEMINPLUS:
235 case OP_TYPEQUERY:
236 case OP_TYPEMINQUERY:
237 if (*code >= OP_TYPESTAR)
238 fprintf(outfile, " %s", OP_names[code[1]]);
239 else if (isprint(c = code[1])) fprintf(outfile, " %c", c);
240 else fprintf(outfile, " \\x%02x", c);
241 fprintf(outfile, "%s", OP_names[*code++]);
242 break;
243
244 case OP_EXACT:
245 case OP_UPTO:
246 case OP_MINUPTO:
247 if (isprint(c = code[3])) fprintf(outfile, " %c{", c);
248 else fprintf(outfile, " \\x%02x{", c);
249 if (*code != OP_EXACT) fprintf(outfile, ",");
250 fprintf(outfile, "%d}", (code[1] << 8) + code[2]);
251 if (*code == OP_MINUPTO) fprintf(outfile, "?");
252 code += 3;
253 break;
254
255 case OP_TYPEEXACT:
256 case OP_TYPEUPTO:
257 case OP_TYPEMINUPTO:
258 fprintf(outfile, " %s{", OP_names[code[3]]);
259 if (*code != OP_TYPEEXACT) fprintf(outfile, "0,");
260 fprintf(outfile, "%d}", (code[1] << 8) + code[2]);
261 if (*code == OP_TYPEMINUPTO) fprintf(outfile, "?");
262 code += 3;
263 break;
264
265 case OP_NOT:
266 if (isprint(c = *(++code))) fprintf(outfile, " [^%c]", c);
267 else fprintf(outfile, " [^\\x%02x]", c);
268 break;
269
270 case OP_NOTSTAR:
271 case OP_NOTMINSTAR:
272 case OP_NOTPLUS:
273 case OP_NOTMINPLUS:
274 case OP_NOTQUERY:
275 case OP_NOTMINQUERY:
276 if (isprint(c = code[1])) fprintf(outfile, " [^%c]", c);
277 else fprintf(outfile, " [^\\x%02x]", c);
278 fprintf(outfile, "%s", OP_names[*code++]);
279 break;
280
281 case OP_NOTEXACT:
282 case OP_NOTUPTO:
283 case OP_NOTMINUPTO:
284 if (isprint(c = code[3])) fprintf(outfile, " [^%c]{", c);
285 else fprintf(outfile, " [^\\x%02x]{", c);
286 if (*code != OP_NOTEXACT) fprintf(outfile, ",");
287 fprintf(outfile, "%d}", (code[1] << 8) + code[2]);
288 if (*code == OP_NOTMINUPTO) fprintf(outfile, "?");
289 code += 3;
290 break;
291
292 case OP_REF:
293 fprintf(outfile, " \\%d", (code[1] << 8) | code[2]);
294 code += 3;
295 goto CLASS_REF_REPEAT;
296
297 case OP_CLASS:
298 {
299 int i, min, max;
300 code++;
301 fprintf(outfile, " [");
302
303 for (i = 0; i < 256; i++)
304 {
305 if ((code[i/8] & (1 << (i&7))) != 0)
306 {
307 int j;
308 for (j = i+1; j < 256; j++)
309 if ((code[j/8] & (1 << (j&7))) == 0) break;
310 if (i == '-' || i == ']') fprintf(outfile, "\\");
311 if (isprint(i)) fprintf(outfile, "%c", i); else fprintf(outfile, "\\x%02x", i);
312 if (--j > i)
313 {
314 fprintf(outfile, "-");
315 if (j == '-' || j == ']') fprintf(outfile, "\\");
316 if (isprint(j)) fprintf(outfile, "%c", j); else fprintf(outfile, "\\x%02x", j);
317 }
318 i = j;
319 }
320 }
321 fprintf(outfile, "]");
322 code += 32;
323
324 CLASS_REF_REPEAT:
325
326 switch(*code)
327 {
328 case OP_CRSTAR:
329 case OP_CRMINSTAR:
330 case OP_CRPLUS:
331 case OP_CRMINPLUS:
332 case OP_CRQUERY:
333 case OP_CRMINQUERY:
334 fprintf(outfile, "%s", OP_names[*code]);
335 break;
336
337 case OP_CRRANGE:
338 case OP_CRMINRANGE:
339 min = (code[1] << 8) + code[2];
340 max = (code[3] << 8) + code[4];
341 if (max == 0) fprintf(outfile, "{%d,}", min);
342 else fprintf(outfile, "{%d,%d}", min, max);
343 if (*code == OP_CRMINRANGE) fprintf(outfile, "?");
344 code += 4;
345 break;
346
347 default:
348 code--;
349 }
350 }
351 break;
352
353 /* Anything else is just a one-node item */
354
355 default:
356 fprintf(outfile, " %s", OP_names[*code]);
357 break;
358 }
359
360 code++;
361 fprintf(outfile, "\n");
362 }
363 }
364
365
366
367 /* Character string printing function. A "normal" and a UTF-8 version. */
368
369 static void pchars(unsigned char *p, int length, int utf8)
370 {
371 int c;
372 while (length-- > 0)
373 {
374 if (utf8)
375 {
376 int rc = utf82ord(p, &c);
377 if (rc > 0)
378 {
379 length -= rc - 1;
380 p += rc;
381 if (c < 256 && isprint(c)) fprintf(outfile, "%c", c);
382 else fprintf(outfile, "\\x{%02x}", c);
383 continue;
384 }
385 }
386
387 /* Not UTF-8, or malformed UTF-8 */
388
389 if (isprint(c = *(p++))) fprintf(outfile, "%c", c);
390 else fprintf(outfile, "\\x%02x", c);
391 }
392 }
393
394
395
396 /* Alternative malloc function, to test functionality and show the size of the
397 compiled re. */
398
399 static void *new_malloc(size_t size)
400 {
401 gotten_store = size;
402 if (log_store)
403 fprintf(outfile, "Memory allocation (code space): %d\n",
404 (int)((int)size - offsetof(real_pcre, code[0])));
405 return malloc(size);
406 }
407
408
409
410
411 /* Get one piece of information from the pcre_fullinfo() function */
412
413 static void new_info(pcre *re, pcre_extra *study, int option, void *ptr)
414 {
415 int rc;
416 if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
417 fprintf(outfile, "Error %d from pcre_fullinfo(%d)\n", rc, option);
418 }
419
420
421
422
423 /* Read lines from named file or stdin and write to named file or stdout; lines
424 consist of a regular expression, in delimiters and optionally followed by
425 options, followed by a set of test data, terminated by an empty line. */
426
427 int main(int argc, char **argv)
428 {
429 FILE *infile = stdin;
430 int options = 0;
431 int study_options = 0;
432 int op = 1;
433 int timeit = 0;
434 int showinfo = 0;
435 int showstore = 0;
436 int size_offsets = 45;
437 int size_offsets_max;
438 int *offsets;
439 #if !defined NOPOSIX
440 int posix = 0;
441 #endif
442 int debug = 0;
443 int done = 0;
444 unsigned char buffer[30000];
445 unsigned char dbuffer[1024];
446
447 /* Static so that new_malloc can use it. */
448
449 outfile = stdout;
450
451 /* Scan options */
452
453 while (argc > 1 && argv[op][0] == '-')
454 {
455 char *endptr;
456
457 if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
458 showstore = 1;
459 else if (strcmp(argv[op], "-t") == 0) timeit = 1;
460 else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
461 else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
462 else if (strcmp(argv[op], "-o") == 0 && argc > 2 &&
463 ((size_offsets = strtoul(argv[op+1], &endptr, 10)), *endptr == 0))
464 {
465 op++;
466 argc--;
467 }
468 #if !defined NOPOSIX
469 else if (strcmp(argv[op], "-p") == 0) posix = 1;
470 #endif
471 else
472 {
473 printf("** Unknown or malformed option %s\n", argv[op]);
474 printf("Usage: pcretest [-d] [-i] [-o <n>] [-p] [-s] [-t] [<input> [<output>]]\n");
475 printf(" -d debug: show compiled code; implies -i\n"
476 " -i show information about compiled pattern\n"
477 " -o <n> set size of offsets vector to <n>\n");
478 #if !defined NOPOSIX
479 printf(" -p use POSIX interface\n");
480 #endif
481 printf(" -s output store information\n"
482 " -t time compilation and execution\n");
483 return 1;
484 }
485 op++;
486 argc--;
487 }
488
489 /* Get the store for the offsets vector, and remember what it was */
490
491 size_offsets_max = size_offsets;
492 offsets = malloc(size_offsets_max * sizeof(int));
493 if (offsets == NULL)
494 {
495 printf("** Failed to get %d bytes of memory for offsets vector\n",
496 size_offsets_max * sizeof(int));
497 return 1;
498 }
499
500 /* Sort out the input and output files */
501
502 if (argc > 1)
503 {
504 infile = fopen(argv[op], "r");
505 if (infile == NULL)
506 {
507 printf("** Failed to open %s\n", argv[op]);
508 return 1;
509 }
510 }
511
512 if (argc > 2)
513 {
514 outfile = fopen(argv[op+1], "w");
515 if (outfile == NULL)
516 {
517 printf("** Failed to open %s\n", argv[op+1]);
518 return 1;
519 }
520 }
521
522 /* Set alternative malloc function */
523
524 pcre_malloc = new_malloc;
525
526 /* Heading line, then prompt for first regex if stdin */
527
528 fprintf(outfile, "PCRE version %s\n\n", pcre_version());
529
530 /* Main loop */
531
532 while (!done)
533 {
534 pcre *re = NULL;
535 pcre_extra *extra = NULL;
536
537 #if !defined NOPOSIX /* There are still compilers that require no indent */
538 regex_t preg;
539 int do_posix = 0;
540 #endif
541
542 const char *error;
543 unsigned char *p, *pp, *ppp;
544 const unsigned char *tables = NULL;
545 int do_study = 0;
546 int do_debug = debug;
547 int do_G = 0;
548 int do_g = 0;
549 int do_showinfo = showinfo;
550 int do_showrest = 0;
551 int utf8 = 0;
552 int erroroffset, len, delimiter;
553
554 if (infile == stdin) printf(" re> ");
555 if (fgets((char *)buffer, sizeof(buffer), infile) == NULL) break;
556 if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
557
558 p = buffer;
559 while (isspace(*p)) p++;
560 if (*p == 0) continue;
561
562 /* Get the delimiter and seek the end of the pattern; if is isn't
563 complete, read more. */
564
565 delimiter = *p++;
566
567 if (isalnum(delimiter) || delimiter == '\\')
568 {
569 fprintf(outfile, "** Delimiter must not be alphameric or \\\n");
570 goto SKIP_DATA;
571 }
572
573 pp = p;
574
575 for(;;)
576 {
577 while (*pp != 0)
578 {
579 if (*pp == '\\' && pp[1] != 0) pp++;
580 else if (*pp == delimiter) break;
581 pp++;
582 }
583 if (*pp != 0) break;
584
585 len = sizeof(buffer) - (pp - buffer);
586 if (len < 256)
587 {
588 fprintf(outfile, "** Expression too long - missing delimiter?\n");
589 goto SKIP_DATA;
590 }
591
592 if (infile == stdin) printf(" > ");
593 if (fgets((char *)pp, len, infile) == NULL)
594 {
595 fprintf(outfile, "** Unexpected EOF\n");
596 done = 1;
597 goto CONTINUE;
598 }
599 if (infile != stdin) fprintf(outfile, "%s", (char *)pp);
600 }
601
602 /* If the first character after the delimiter is backslash, make
603 the pattern end with backslash. This is purely to provide a way
604 of testing for the error message when a pattern ends with backslash. */
605
606 if (pp[1] == '\\') *pp++ = '\\';
607
608 /* Terminate the pattern at the delimiter */
609
610 *pp++ = 0;
611
612 /* Look for options after final delimiter */
613
614 options = 0;
615 study_options = 0;
616 log_store = showstore; /* default from command line */
617
618 while (*pp != 0)
619 {
620 switch (*pp++)
621 {
622 case 'g': do_g = 1; break;
623 case 'i': options |= PCRE_CASELESS; break;
624 case 'm': options |= PCRE_MULTILINE; break;
625 case 's': options |= PCRE_DOTALL; break;
626 case 'x': options |= PCRE_EXTENDED; break;
627
628 case '+': do_showrest = 1; break;
629 case 'A': options |= PCRE_ANCHORED; break;
630 case 'D': do_debug = do_showinfo = 1; break;
631 case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
632 case 'G': do_G = 1; break;
633 case 'I': do_showinfo = 1; break;
634 case 'M': log_store = 1; break;
635
636 #if !defined NOPOSIX
637 case 'P': do_posix = 1; break;
638 #endif
639
640 case 'S': do_study = 1; break;
641 case 'U': options |= PCRE_UNGREEDY; break;
642 case 'X': options |= PCRE_EXTRA; break;
643 case '8': options |= PCRE_UTF8; utf8 = 1; break;
644
645 case 'L':
646 ppp = pp;
647 while (*ppp != '\n' && *ppp != ' ') ppp++;
648 *ppp = 0;
649 if (setlocale(LC_CTYPE, (const char *)pp) == NULL)
650 {
651 fprintf(outfile, "** Failed to set locale \"%s\"\n", pp);
652 goto SKIP_DATA;
653 }
654 tables = pcre_maketables();
655 pp = ppp;
656 break;
657
658 case '\n': case ' ': break;
659 default:
660 fprintf(outfile, "** Unknown option '%c'\n", pp[-1]);
661 goto SKIP_DATA;
662 }
663 }
664
665 /* Handle compiling via the POSIX interface, which doesn't support the
666 timing, showing, or debugging options, nor the ability to pass over
667 local character tables. */
668
669 #if !defined NOPOSIX
670 if (posix || do_posix)
671 {
672 int rc;
673 int cflags = 0;
674 if ((options & PCRE_CASELESS) != 0) cflags |= REG_ICASE;
675 if ((options & PCRE_MULTILINE) != 0) cflags |= REG_NEWLINE;
676 rc = regcomp(&preg, (char *)p, cflags);
677
678 /* Compilation failed; go back for another re, skipping to blank line
679 if non-interactive. */
680
681 if (rc != 0)
682 {
683 (void)regerror(rc, &preg, (char *)buffer, sizeof(buffer));
684 fprintf(outfile, "Failed: POSIX code %d: %s\n", rc, buffer);
685 goto SKIP_DATA;
686 }
687 }
688
689 /* Handle compiling via the native interface */
690
691 else
692 #endif /* !defined NOPOSIX */
693
694 {
695 if (timeit)
696 {
697 register int i;
698 clock_t time_taken;
699 clock_t start_time = clock();
700 for (i = 0; i < LOOPREPEAT; i++)
701 {
702 re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
703 if (re != NULL) free(re);
704 }
705 time_taken = clock() - start_time;
706 fprintf(outfile, "Compile time %.3f milliseconds\n",
707 ((double)time_taken * 1000.0) /
708 ((double)LOOPREPEAT * (double)CLOCKS_PER_SEC));
709 }
710
711 re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
712
713 /* Compilation failed; go back for another re, skipping to blank line
714 if non-interactive. */
715
716 if (re == NULL)
717 {
718 fprintf(outfile, "Failed: %s at offset %d\n", error, erroroffset);
719 SKIP_DATA:
720 if (infile != stdin)
721 {
722 for (;;)
723 {
724 if (fgets((char *)buffer, sizeof(buffer), infile) == NULL)
725 {
726 done = 1;
727 goto CONTINUE;
728 }
729 len = (int)strlen((char *)buffer);
730 while (len > 0 && isspace(buffer[len-1])) len--;
731 if (len == 0) break;
732 }
733 fprintf(outfile, "\n");
734 }
735 goto CONTINUE;
736 }
737
738 /* Compilation succeeded; print data if required. There are now two
739 info-returning functions. The old one has a limited interface and
740 returns only limited data. Check that it agrees with the newer one. */
741
742 if (do_showinfo)
743 {
744 unsigned long int get_options;
745 int old_first_char, old_options, old_count;
746 int count, backrefmax, first_char, need_char;
747 size_t size;
748
749 if (do_debug) print_internals(re);
750
751 new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
752 new_info(re, NULL, PCRE_INFO_SIZE, &size);
753 new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
754 new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax);
755 new_info(re, NULL, PCRE_INFO_FIRSTCHAR, &first_char);
756 new_info(re, NULL, PCRE_INFO_LASTLITERAL, &need_char);
757
758 old_count = pcre_info(re, &old_options, &old_first_char);
759 if (count < 0) fprintf(outfile,
760 "Error %d from pcre_info()\n", count);
761 else
762 {
763 if (old_count != count) fprintf(outfile,
764 "Count disagreement: pcre_fullinfo=%d pcre_info=%d\n", count,
765 old_count);
766
767 if (old_first_char != first_char) fprintf(outfile,
768 "First char disagreement: pcre_fullinfo=%d pcre_info=%d\n",
769 first_char, old_first_char);
770
771 if (old_options != (int)get_options) fprintf(outfile,
772 "Options disagreement: pcre_fullinfo=%ld pcre_info=%d\n",
773 get_options, old_options);
774 }
775
776 if (size != gotten_store) fprintf(outfile,
777 "Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
778 size, gotten_store);
779
780 fprintf(outfile, "Capturing subpattern count = %d\n", count);
781 if (backrefmax > 0)
782 fprintf(outfile, "Max back reference = %d\n", backrefmax);
783 if (get_options == 0) fprintf(outfile, "No options\n");
784 else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s\n",
785 ((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
786 ((get_options & PCRE_CASELESS) != 0)? " caseless" : "",
787 ((get_options & PCRE_EXTENDED) != 0)? " extended" : "",
788 ((get_options & PCRE_MULTILINE) != 0)? " multiline" : "",
789 ((get_options & PCRE_DOTALL) != 0)? " dotall" : "",
790 ((get_options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
791 ((get_options & PCRE_EXTRA) != 0)? " extra" : "",
792 ((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
793 ((get_options & PCRE_UTF8) != 0)? " utf8" : "");
794
795 if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
796 fprintf(outfile, "Case state changes\n");
797
798 if (first_char == -1)
799 {
800 fprintf(outfile, "First char at start or follows \\n\n");
801 }
802 else if (first_char < 0)
803 {
804 fprintf(outfile, "No first char\n");
805 }
806 else
807 {
808 if (isprint(first_char))
809 fprintf(outfile, "First char = \'%c\'\n", first_char);
810 else
811 fprintf(outfile, "First char = %d\n", first_char);
812 }
813
814 if (need_char < 0)
815 {
816 fprintf(outfile, "No need char\n");
817 }
818 else
819 {
820 if (isprint(need_char))
821 fprintf(outfile, "Need char = \'%c\'\n", need_char);
822 else
823 fprintf(outfile, "Need char = %d\n", need_char);
824 }
825 }
826
827 /* If /S was present, study the regexp to generate additional info to
828 help with the matching. */
829
830 if (do_study)
831 {
832 if (timeit)
833 {
834 register int i;
835 clock_t time_taken;
836 clock_t start_time = clock();
837 for (i = 0; i < LOOPREPEAT; i++)
838 extra = pcre_study(re, study_options, &error);
839 time_taken = clock() - start_time;
840 if (extra != NULL) free(extra);
841 fprintf(outfile, " Study time %.3f milliseconds\n",
842 ((double)time_taken * 1000.0)/
843 ((double)LOOPREPEAT * (double)CLOCKS_PER_SEC));
844 }
845
846 extra = pcre_study(re, study_options, &error);
847 if (error != NULL)
848 fprintf(outfile, "Failed to study: %s\n", error);
849 else if (extra == NULL)
850 fprintf(outfile, "Study returned NULL\n");
851
852 else if (do_showinfo)
853 {
854 uschar *start_bits = NULL;
855 new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
856 if (start_bits == NULL)
857 fprintf(outfile, "No starting character set\n");
858 else
859 {
860 int i;
861 int c = 24;
862 fprintf(outfile, "Starting character set: ");
863 for (i = 0; i < 256; i++)
864 {
865 if ((start_bits[i/8] & (1<<(i%8))) != 0)
866 {
867 if (c > 75)
868 {
869 fprintf(outfile, "\n ");
870 c = 2;
871 }
872 if (isprint(i) && i != ' ')
873 {
874 fprintf(outfile, "%c ", i);
875 c += 2;
876 }
877 else
878 {
879 fprintf(outfile, "\\x%02x ", i);
880 c += 5;
881 }
882 }
883 }
884 fprintf(outfile, "\n");
885 }
886 }
887 }
888 }
889
890 /* Read data lines and test them */
891
892 for (;;)
893 {
894 unsigned char *q;
895 unsigned char *bptr = dbuffer;
896 int use_size_offsets = size_offsets;
897 int count, c;
898 int copystrings = 0;
899 int getstrings = 0;
900 int getlist = 0;
901 int gmatched = 0;
902 int start_offset = 0;
903 int g_notempty = 0;
904
905 options = 0;
906
907 if (infile == stdin) printf("data> ");
908 if (fgets((char *)buffer, sizeof(buffer), infile) == NULL)
909 {
910 done = 1;
911 goto CONTINUE;
912 }
913 if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
914
915 len = (int)strlen((char *)buffer);
916 while (len > 0 && isspace(buffer[len-1])) len--;
917 buffer[len] = 0;
918 if (len == 0) break;
919
920 p = buffer;
921 while (isspace(*p)) p++;
922
923 q = dbuffer;
924 while ((c = *p++) != 0)
925 {
926 int i = 0;
927 int n = 0;
928 if (c == '\\') switch ((c = *p++))
929 {
930 case 'a': c = 7; break;
931 case 'b': c = '\b'; break;
932 case 'e': c = 27; break;
933 case 'f': c = '\f'; break;
934 case 'n': c = '\n'; break;
935 case 'r': c = '\r'; break;
936 case 't': c = '\t'; break;
937 case 'v': c = '\v'; break;
938
939 case '0': case '1': case '2': case '3':
940 case '4': case '5': case '6': case '7':
941 c -= '0';
942 while (i++ < 2 && isdigit(*p) && *p != '8' && *p != '9')
943 c = c * 8 + *p++ - '0';
944 break;
945
946 case 'x':
947
948 /* Handle \x{..} specially - new Perl thing for utf8 */
949
950 if (*p == '{')
951 {
952 unsigned char *pt = p;
953 c = 0;
954 while (isxdigit(*(++pt)))
955 c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'W');
956 if (*pt == '}')
957 {
958 unsigned char buffer[8];
959 int ii, utn;
960 utn = ord2utf8(c, buffer);
961 for (ii = 0; ii < utn - 1; ii++) *q++ = buffer[ii];
962 c = buffer[ii]; /* Last byte */
963 p = pt + 1;
964 break;
965 }
966 /* Not correct form; fall through */
967 }
968
969 /* Ordinary \x */
970
971 c = 0;
972 while (i++ < 2 && isxdigit(*p))
973 {
974 c = c * 16 + tolower(*p) - ((isdigit(*p))? '0' : 'W');
975 p++;
976 }
977 break;
978
979 case 0: /* Allows for an empty line */
980 p--;
981 continue;
982
983 case 'A': /* Option setting */
984 options |= PCRE_ANCHORED;
985 continue;
986
987 case 'B':
988 options |= PCRE_NOTBOL;
989 continue;
990
991 case 'C':
992 while(isdigit(*p)) n = n * 10 + *p++ - '0';
993 copystrings |= 1 << n;
994 continue;
995
996 case 'G':
997 while(isdigit(*p)) n = n * 10 + *p++ - '0';
998 getstrings |= 1 << n;
999 continue;
1000
1001 case 'L':
1002 getlist = 1;
1003 continue;
1004
1005 case 'N':
1006 options |= PCRE_NOTEMPTY;
1007 continue;
1008
1009 case 'O':
1010 while(isdigit(*p)) n = n * 10 + *p++ - '0';
1011 if (n > size_offsets_max)
1012 {
1013
1014 if (offsets != NULL)
1015
1016 free(offsets);
1017 size_offsets_max = n;
1018 offsets = malloc(size_offsets_max * sizeof(int));
1019 if (offsets == NULL)
1020 {
1021 printf("** Failed to get %d bytes of memory for offsets vector\n",
1022 size_offsets_max * sizeof(int));
1023 return 1;
1024 }
1025 }
1026 use_size_offsets = n;
1027
1028 if (n == 0)
1029 {
1030 free(offsets);
1031 offsets = NULL;
1032 size_offsets_max = 0;
1033 }
1034
1035 continue;
1036
1037 case 'Z':
1038 options |= PCRE_NOTEOL;
1039 continue;
1040 }
1041 *q++ = c;
1042 }
1043 *q = 0;
1044 len = q - dbuffer;
1045
1046 /* Handle matching via the POSIX interface, which does not
1047 support timing. */
1048
1049 #if !defined NOPOSIX
1050 if (posix || do_posix)
1051 {
1052 int rc;
1053 int eflags = 0;
1054 regmatch_t *pmatch = malloc(sizeof(regmatch_t) * use_size_offsets);
1055 if ((options & PCRE_NOTBOL) != 0) eflags |= REG_NOTBOL;
1056 if ((options & PCRE_NOTEOL) != 0) eflags |= REG_NOTEOL;
1057
1058 rc = regexec(&preg, (const char *)bptr, use_size_offsets, pmatch, eflags);
1059
1060 if (rc != 0)
1061 {
1062 (void)regerror(rc, &preg, (char *)buffer, sizeof(buffer));
1063 fprintf(outfile, "No match: POSIX code %d: %s\n", rc, buffer);
1064 }
1065 else
1066 {
1067 size_t i;
1068 for (i = 0; i < use_size_offsets; i++)
1069 {
1070 if (pmatch[i].rm_so >= 0)
1071 {
1072 fprintf(outfile, "%2d: ", (int)i);
1073 pchars(dbuffer + pmatch[i].rm_so,
1074 pmatch[i].rm_eo - pmatch[i].rm_so, utf8);
1075 fprintf(outfile, "\n");
1076 if (i == 0 && do_showrest)
1077 {
1078 fprintf(outfile, " 0+ ");
1079 pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo, utf8);
1080 fprintf(outfile, "\n");
1081 }
1082 }
1083 }
1084 }
1085 free(pmatch);
1086 }
1087
1088 /* Handle matching via the native interface - repeats for /g and /G */
1089
1090 else
1091 #endif /* !defined NOPOSIX */
1092
1093 for (;; gmatched++) /* Loop for /g or /G */
1094 {
1095 if (timeit)
1096 {
1097 register int i;
1098 clock_t time_taken;
1099 clock_t start_time = clock();
1100 for (i = 0; i < LOOPREPEAT; i++)
1101 count = pcre_exec(re, extra, (char *)bptr, len,
1102 start_offset, options | g_notempty, offsets, use_size_offsets);
1103 time_taken = clock() - start_time;
1104 fprintf(outfile, "Execute time %.3f milliseconds\n",
1105 ((double)time_taken * 1000.0)/
1106 ((double)LOOPREPEAT * (double)CLOCKS_PER_SEC));
1107 }
1108
1109 count = pcre_exec(re, extra, (char *)bptr, len,
1110 start_offset, options | g_notempty, offsets, use_size_offsets);
1111
1112 if (count == 0)
1113 {
1114 fprintf(outfile, "Matched, but too many substrings\n");
1115 count = use_size_offsets/3;
1116 }
1117
1118 /* Matched */
1119
1120 if (count >= 0)
1121 {
1122 int i;
1123 for (i = 0; i < count * 2; i += 2)
1124 {
1125 if (offsets[i] < 0)
1126 fprintf(outfile, "%2d: <unset>\n", i/2);
1127 else
1128 {
1129 fprintf(outfile, "%2d: ", i/2);
1130 pchars(bptr + offsets[i], offsets[i+1] - offsets[i], utf8);
1131 fprintf(outfile, "\n");
1132 if (i == 0)
1133 {
1134 if (do_showrest)
1135 {
1136 fprintf(outfile, " 0+ ");
1137 pchars(bptr + offsets[i+1], len - offsets[i+1], utf8);
1138 fprintf(outfile, "\n");
1139 }
1140 }
1141 }
1142 }
1143
1144 for (i = 0; i < 32; i++)
1145 {
1146 if ((copystrings & (1 << i)) != 0)
1147 {
1148 char copybuffer[16];
1149 int rc = pcre_copy_substring((char *)bptr, offsets, count,
1150 i, copybuffer, sizeof(copybuffer));
1151 if (rc < 0)
1152 fprintf(outfile, "copy substring %d failed %d\n", i, rc);
1153 else
1154 fprintf(outfile, "%2dC %s (%d)\n", i, copybuffer, rc);
1155 }
1156 }
1157
1158 for (i = 0; i < 32; i++)
1159 {
1160 if ((getstrings & (1 << i)) != 0)
1161 {
1162 const char *substring;
1163 int rc = pcre_get_substring((char *)bptr, offsets, count,
1164 i, &substring);
1165 if (rc < 0)
1166 fprintf(outfile, "get substring %d failed %d\n", i, rc);
1167 else
1168 {
1169 fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
1170 /* free((void *)substring); */
1171 pcre_free_substring(substring);
1172 }
1173 }
1174 }
1175
1176 if (getlist)
1177 {
1178 const char **stringlist;
1179 int rc = pcre_get_substring_list((char *)bptr, offsets, count,
1180 &stringlist);
1181 if (rc < 0)
1182 fprintf(outfile, "get substring list failed %d\n", rc);
1183 else
1184 {
1185 for (i = 0; i < count; i++)
1186 fprintf(outfile, "%2dL %s\n", i, stringlist[i]);
1187 if (stringlist[i] != NULL)
1188 fprintf(outfile, "string list not terminated by NULL\n");
1189 /* free((void *)stringlist); */
1190 pcre_free_substring_list(stringlist);
1191 }
1192 }
1193 }
1194
1195 /* Failed to match. If this is a /g or /G loop and we previously set
1196 g_notempty after a null match, this is not necessarily the end.
1197 We want to advance the start offset, and continue. Fudge the offset
1198 values to achieve this. We won't be at the end of the string - that
1199 was checked before setting g_notempty. */
1200
1201 else
1202 {
1203 if (g_notempty != 0)
1204 {
1205 offsets[0] = start_offset;
1206 offsets[1] = start_offset + 1;
1207 }
1208 else
1209 {
1210 if (gmatched == 0) /* Error if no previous matches */
1211 {
1212 if (count == -1) fprintf(outfile, "No match\n");
1213 else fprintf(outfile, "Error %d\n", count);
1214 }
1215 break; /* Out of the /g loop */
1216 }
1217 }
1218
1219 /* If not /g or /G we are done */
1220
1221 if (!do_g && !do_G) break;
1222
1223 /* If we have matched an empty string, first check to see if we are at
1224 the end of the subject. If so, the /g loop is over. Otherwise, mimic
1225 what Perl's /g options does. This turns out to be rather cunning. First
1226 we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the
1227 same point. If this fails (picked up above) we advance to the next
1228 character. */
1229
1230 g_notempty = 0;
1231 if (offsets[0] == offsets[1])
1232 {
1233 if (offsets[0] == len) break;
1234 g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;
1235 }
1236
1237 /* For /g, update the start offset, leaving the rest alone */
1238
1239 if (do_g) start_offset = offsets[1];
1240
1241 /* For /G, update the pointer and length */
1242
1243 else
1244 {
1245 bptr += offsets[1];
1246 len -= offsets[1];
1247 }
1248 } /* End of loop for /g and /G */
1249 } /* End of loop for data lines */
1250
1251 CONTINUE:
1252
1253 #if !defined NOPOSIX
1254 if (posix || do_posix) regfree(&preg);
1255 #endif
1256
1257 if (re != NULL) free(re);
1258 if (extra != NULL) free(extra);
1259 if (tables != NULL)
1260 {
1261 free((void *)tables);
1262 setlocale(LC_CTYPE, "C");
1263 }
1264 }
1265
1266 fprintf(outfile, "\n");
1267 return 0;
1268 }
1269
1270 /* End */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12