/[pcre]/code/trunk/pcretest.c
ViewVC logotype

Contents of /code/trunk/pcretest.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 49 - (show annotations) (download)
Sat Feb 24 21:39:33 2007 UTC (7 years, 8 months ago) by nigel
File MIME type: text/plain
File size: 32558 byte(s)
Load pcre-3.3 into code/trunk.

1 /*************************************************
2 * PCRE testing program *
3 *************************************************/
4
5 #include <ctype.h>
6 #include <stdio.h>
7 #include <string.h>
8 #include <stdlib.h>
9 #include <time.h>
10 #include <locale.h>
11
12 /* Use the internal info for displaying the results of pcre_study(). */
13
14 #include "internal.h"
15
16 /* It is possible to compile this test program without including support for
17 testing the POSIX interface, though this is not available via the standard
18 Makefile. */
19
20 #if !defined NOPOSIX
21 #include "pcreposix.h"
22 #endif
23
24 #ifndef CLOCKS_PER_SEC
25 #ifdef CLK_TCK
26 #define CLOCKS_PER_SEC CLK_TCK
27 #else
28 #define CLOCKS_PER_SEC 100
29 #endif
30 #endif
31
32 #define LOOPREPEAT 20000 /* number of iterations of each timed loop (compile, study, execute) */
33
34
35 static FILE *outfile; /* destination for all test output; file-scope so that new_malloc() can use it */
36 static int log_store = 0; /* when non-zero, new_malloc() reports each allocation to outfile */
37 static size_t gotten_store; /* size passed to the most recent new_malloc() call */
38
39
40
/* Tables shared by the two UTF-8 conversion functions below. For a character
that occupies i+1 bytes:

  utf8_table1[i]  is the largest value that fits in i+1 bytes
  utf8_table2[i]  holds the marker bits of the lead byte
  utf8_table3[i]  masks the data bits of the lead byte */

static const int utf8_table1[] = {
  0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};

static const int utf8_table2[] = {
  0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};

static const int utf8_table3[] = {
  0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};


/*************************************************
*       Convert character value to UTF-8         *
*************************************************/

/* This function takes an integer value in the range 0 - 0x7fffffff
and encodes it as a UTF-8 character in 1 to 6 bytes. The most significant
bits of the value go into the lead byte and each following byte carries
the next six bits, as the UTF-8 definition requires. (The previous version
emitted the low-order bits first, which is not UTF-8.)

NOTE(review): anything that decodes these bytes must use the same order;
confirm that the PCRE library being tested does.

Arguments:
  cvalue     the character value
  buffer     pointer to buffer for result - at least 6 bytes long

Returns:     number of characters placed in the buffer
             -1 if input character is negative
             0 if input character is positive but too big (only when
             int is longer than 32 bits)
*/

static int
ord2utf8(int cvalue, unsigned char *buffer)
{
const int table_size = (int)(sizeof(utf8_table1)/sizeof(utf8_table1[0]));
int i, j;

if (cvalue < 0) return -1;

/* Find the shortest encoding that can hold the value; i is the number of
continuation bytes that will follow the lead byte. */

for (i = 0; i < table_size; i++)
  if (cvalue <= utf8_table1[i]) break;
if (i >= table_size) return 0;

/* Fill in the continuation bytes from the last one backwards, taking six
bits at a time from the low-order end; whatever is left over belongs in the
lead byte. */

buffer += i;
for (j = i; j > 0; j--)
  {
  *buffer-- = (unsigned char)(0x80 | (cvalue & 0x3f));
  cvalue >>= 6;
  }
*buffer = (unsigned char)(utf8_table2[i] | cvalue);
return i + 1;
}


/*************************************************
*            Convert UTF-8 string to value       *
*************************************************/

/* This function takes one or more bytes that represents a UTF-8 character,
and returns the value of the character. The bytes are read in standard UTF-8
order: lead byte first, carrying the most significant bits. No length is
passed, so the caller must make sure that the bytes of a complete character
are readable.

Argument:
  buffer   a pointer to the byte vector
  vptr     a pointer to an int to receive the value; it is written only
           when the return value is > 0

Returns:   >  0 => the number of bytes consumed
           -6 to 0 => malformed UTF-8 character at offset = (-return)
*/

int
utf82ord(unsigned char *buffer, int *vptr)
{
const int table_size = (int)(sizeof(utf8_table1)/sizeof(utf8_table1[0]));
int c = *buffer++;
int d = c;
int i, j, s;

/* Count the leading 1-bits of the first byte */

for (i = -1; i < 6; i++)               /* i is number of additional bytes */
  {
  if ((d & 0x80) == 0) break;
  d <<= 1;
  }

if (i == -1) { *vptr = c; return 1; }  /* ascii character */
if (i == 0 || i == 6) return 0;        /* invalid UTF-8 */

/* i now has a value in the range 1-5. The data bits of the lead byte are the
most significant ones, so they are shifted up past all the continuation
bits. */

s = 6*i;
d = (c & utf8_table3[i]) << s;

for (j = 0; j < i; j++)
  {
  c = *buffer++;
  if ((c & 0xc0) != 0x80) return -(j+1);
  s -= 6;
  d |= (c & 0x3f) << s;
  }

/* Check that encoding was the correct unique one: the value must not fit
into a shorter sequence. */

for (j = 0; j < table_size; j++)
  if (d <= utf8_table1[j]) break;
if (j != i) return -(i+1);

/* Valid value */

*vptr = d;
return i+1;
}
142
143
144
145
146
147
148 /* Debugging function to print the internal form of the regex. This is the same
149 code as contained in pcre.c under the DEBUG macro. */
150 /* OP_names holds the printable name of each opcode and is indexed directly by opcode value (OP_names[*code]). */
151 static const char *OP_names[] = {
152 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
153 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
154 "Opt", "^", "$", "Any", "chars", "not",
155 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
156 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
157 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
158 "*", "*?", "+", "+?", "?", "??", "{", "{",
159 "class", "Ref", "Recurse",
160 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
161 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
162 "Brazero", "Braminzero", "Bra"
163 };
164 /* print_internals() writes the compiled code of re to outfile, one line per item, each prefixed by the item's byte offset from the start of the code. It returns after printing OP_END. */
165
166 static void print_internals(pcre *re)
167 {
168 unsigned char *code = ((real_pcre *)re)->code;
169
170 fprintf(outfile, "------------------------------------------------------------------\n");
171
172 for(;;)
173 {
174 int c;
175 int charlength;
176
177 fprintf(outfile, "%3d ", (int)(code - ((real_pcre *)re)->code)); /* offset of this item */
178
179 if (*code >= OP_BRA) /* every opcode at or above OP_BRA is a bracket; the excess over OP_BRA is printed after "Bra" */
180 {
181 fprintf(outfile, "%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA); /* the two bytes after the opcode form a 16-bit number, high byte first */
182 code += 2;
183 }
184
185 else switch(*code)
186 {
187 case OP_END:
188 fprintf(outfile, " %s\n", OP_names[*code]);
189 fprintf(outfile, "------------------------------------------------------------------\n");
190 return;
191
192 case OP_OPT: /* one data byte, shown in hex */
193 fprintf(outfile, " %.2x %s", code[1], OP_names[*code]);
194 code++;
195 break;
196
197 case OP_COND:
198 fprintf(outfile, "%3d Cond", (code[1] << 8) + code[2]);
199 code += 2;
200 break;
201
202 case OP_CREF: /* one data byte, shown in decimal */
203 fprintf(outfile, " %.2d %s", code[1], OP_names[*code]);
204 code++;
205 break;
206
207 case OP_CHARS: /* a length byte followed by that many literal bytes */
208 charlength = *(++code);
209 fprintf(outfile, "%3d ", charlength);
210 while (charlength-- > 0)
211 if (isprint(c = *(++code))) fprintf(outfile, "%c", c);
212 else fprintf(outfile, "\\x%02x", c);
213 break;
214
215 case OP_KETRMAX:
216 case OP_KETRMIN:
217 case OP_ALT:
218 case OP_KET:
219 case OP_ASSERT:
220 case OP_ASSERT_NOT:
221 case OP_ASSERTBACK:
222 case OP_ASSERTBACK_NOT:
223 case OP_ONCE:
224 fprintf(outfile, "%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
225 code += 2;
226 break;
227
228 case OP_REVERSE: /* printed exactly like the group of cases above */
229 fprintf(outfile, "%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
230 code += 2;
231 break;
232
233 case OP_STAR:
234 case OP_MINSTAR:
235 case OP_PLUS:
236 case OP_MINPLUS:
237 case OP_QUERY:
238 case OP_MINQUERY:
239 case OP_TYPESTAR:
240 case OP_TYPEMINSTAR:
241 case OP_TYPEPLUS:
242 case OP_TYPEMINPLUS:
243 case OP_TYPEQUERY:
244 case OP_TYPEMINQUERY:
245 if (*code >= OP_TYPESTAR) /* for the TYPE forms the operand byte is itself an opcode and is shown by name */
246 fprintf(outfile, " %s", OP_names[code[1]]);
247 else if (isprint(c = code[1])) fprintf(outfile, " %c", c);
248 else fprintf(outfile, " \\x%02x", c);
249 fprintf(outfile, "%s", OP_names[*code++]); /* the post-increment steps over the operand byte */
250 break;
251
252 case OP_EXACT:
253 case OP_UPTO:
254 case OP_MINUPTO: /* 16-bit count in code[1..2], the character in code[3] */
255 if (isprint(c = code[3])) fprintf(outfile, " %c{", c);
256 else fprintf(outfile, " \\x%02x{", c);
257 if (*code != OP_EXACT) fprintf(outfile, ",");
258 fprintf(outfile, "%d}", (code[1] << 8) + code[2]);
259 if (*code == OP_MINUPTO) fprintf(outfile, "?");
260 code += 3;
261 break;
262
263 case OP_TYPEEXACT:
264 case OP_TYPEUPTO:
265 case OP_TYPEMINUPTO: /* as above, but code[3] is an opcode shown by name */
266 fprintf(outfile, " %s{", OP_names[code[3]]);
267 if (*code != OP_TYPEEXACT) fprintf(outfile, "0,");
268 fprintf(outfile, "%d}", (code[1] << 8) + code[2]);
269 if (*code == OP_TYPEMINUPTO) fprintf(outfile, "?");
270 code += 3;
271 break;
272
273 case OP_NOT:
274 if (isprint(c = *(++code))) fprintf(outfile, " [^%c]", c);
275 else fprintf(outfile, " [^\\x%02x]", c);
276 break;
277
278 case OP_NOTSTAR:
279 case OP_NOTMINSTAR:
280 case OP_NOTPLUS:
281 case OP_NOTMINPLUS:
282 case OP_NOTQUERY:
283 case OP_NOTMINQUERY:
284 if (isprint(c = code[1])) fprintf(outfile, " [^%c]", c);
285 else fprintf(outfile, " [^\\x%02x]", c);
286 fprintf(outfile, "%s", OP_names[*code++]);
287 break;
288
289 case OP_NOTEXACT:
290 case OP_NOTUPTO:
291 case OP_NOTMINUPTO:
292 if (isprint(c = code[3])) fprintf(outfile, " [^%c]{", c);
293 else fprintf(outfile, " [^\\x%02x]{", c);
294 if (*code != OP_NOTEXACT) fprintf(outfile, ",");
295 fprintf(outfile, "%d}", (code[1] << 8) + code[2]);
296 if (*code == OP_NOTMINUPTO) fprintf(outfile, "?");
297 code += 3;
298 break;
299
300 case OP_REF:
301 fprintf(outfile, " \\%d", *(++code));
302 code++;
303 goto CLASS_REF_REPEAT; /* jumps into the OP_CLASS block below to share its repeat-printing code */
304
305 case OP_CLASS:
306 {
307 int i, min, max;
308 code++;
309 fprintf(outfile, " [");
310
311 for (i = 0; i < 256; i++) /* the class is a bit map with one bit per character value */
312 {
313 if ((code[i/8] & (1 << (i&7))) != 0)
314 {
315 int j;
316 for (j = i+1; j < 256; j++) /* find the end of this run of set bits so that it prints as a range */
317 if ((code[j/8] & (1 << (j&7))) == 0) break;
318 if (i == '-' || i == ']') fprintf(outfile, "\\");
319 if (isprint(i)) fprintf(outfile, "%c", i); else fprintf(outfile, "\\x%02x", i);
320 if (--j > i)
321 {
322 fprintf(outfile, "-");
323 if (j == '-' || j == ']') fprintf(outfile, "\\");
324 if (isprint(j)) fprintf(outfile, "%c", j); else fprintf(outfile, "\\x%02x", j);
325 }
326 i = j; /* resume scanning after the run just printed */
327 }
328 }
329 fprintf(outfile, "]");
330 code += 32; /* step over the 256-bit map */
331
332 CLASS_REF_REPEAT:
333
334 switch(*code)
335 {
336 case OP_CRSTAR:
337 case OP_CRMINSTAR:
338 case OP_CRPLUS:
339 case OP_CRMINPLUS:
340 case OP_CRQUERY:
341 case OP_CRMINQUERY:
342 fprintf(outfile, "%s", OP_names[*code]);
343 break;
344
345 case OP_CRRANGE:
346 case OP_CRMINRANGE:
347 min = (code[1] << 8) + code[2];
348 max = (code[3] << 8) + code[4];
349 if (max == 0) fprintf(outfile, "{%d,}", min); /* a zero maximum is printed as an open-ended range */
350 else fprintf(outfile, "{%d,%d}", min, max);
351 if (*code == OP_CRMINRANGE) fprintf(outfile, "?");
352 code += 4;
353 break;
354
355 default:
356 code--; /* no repeat item follows: step back so that the code++ at the bottom of the loop lands on this opcode */
357 }
358 }
359 break;
360
361 /* Anything else is just a one-node item */
362
363 default:
364 fprintf(outfile, " %s", OP_names[*code]);
365 break;
366 }
367
368 code++;
369 fprintf(outfile, "\n");
370 }
371 }
372
373
374
375 /* Character string printing function. A "normal" and a UTF-8 version. */
376
377 static void pchars(unsigned char *p, int length, int utf8)
378 {
379 int c;
380 while (length-- > 0)
381 {
382 if (utf8)
383 {
384 int rc = utf82ord(p, &c);
385 if (rc > 0)
386 {
387 length -= rc - 1;
388 p += rc;
389 if (c < 256 && isprint(c)) fprintf(outfile, "%c", c);
390 else fprintf(outfile, "\\x{%02x}", c);
391 continue;
392 }
393 }
394
395 /* Not UTF-8, or malformed UTF-8 */
396
397 if (isprint(c = *(p++))) fprintf(outfile, "%c", c);
398 else fprintf(outfile, "\\x%02x", c);
399 }
400 }
401
402
403
404 /* Alternative malloc function, to test functionality and show the size of the
405 compiled re. */
406
407 static void *new_malloc(size_t size)
408 {
409 gotten_store = size;
410 if (log_store)
411 fprintf(outfile, "Memory allocation (code space): %d\n",
412 (int)((int)size - offsetof(real_pcre, code[0])));
413 return malloc(size);
414 }
415
416
417
418
419 /* Get one piece of information from the pcre_fullinfo() function */
420
421 static void new_info(pcre *re, pcre_extra *study, int option, void *ptr)
422 {
423 int rc;
424 if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
425 fprintf(outfile, "Error %d from pcre_fullinfo(%d)\n", rc, option);
426 }
427
428
429
430
431 /* Read lines from named file or stdin and write to named file or stdout; lines
432 consist of a regular expression, in delimiters and optionally followed by
433 options, followed by a set of test data, terminated by an empty line. */
434
435 int main(int argc, char **argv)
436 {
437 FILE *infile = stdin;
438 int options = 0;
439 int study_options = 0;
440 int op = 1;
441 int timeit = 0;
442 int showinfo = 0;
443 int showstore = 0;
444 int posix = 0;
445 int debug = 0;
446 int done = 0;
447 unsigned char buffer[30000];
448 unsigned char dbuffer[1024];
449
450 /* Static so that new_malloc can use it. */
451
452 outfile = stdout;
453
454 /* Scan options */
455
456 while (argc > 1 && argv[op][0] == '-')
457 {
458 if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
459 showstore = 1;
460 else if (strcmp(argv[op], "-t") == 0) timeit = 1;
461 else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
462 else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
463 else if (strcmp(argv[op], "-p") == 0) posix = 1;
464 else
465 {
466 printf("*** Unknown option %s\n", argv[op]);
467 printf("Usage: pcretest [-d] [-i] [-p] [-s] [-t] [<input> [<output>]]\n");
468 printf(" -d debug: show compiled code; implies -i\n"
469 " -i show information about compiled pattern\n"
470 " -p use POSIX interface\n"
471 " -s output store information\n"
472 " -t time compilation and execution\n");
473 return 1;
474 }
475 op++;
476 argc--;
477 }
478
479 /* Sort out the input and output files */
480
481 if (argc > 1)
482 {
483 infile = fopen(argv[op], "r");
484 if (infile == NULL)
485 {
486 printf("** Failed to open %s\n", argv[op]);
487 return 1;
488 }
489 }
490
491 if (argc > 2)
492 {
493 outfile = fopen(argv[op+1], "w");
494 if (outfile == NULL)
495 {
496 printf("** Failed to open %s\n", argv[op+1]);
497 return 1;
498 }
499 }
500
501 /* Set alternative malloc function */
502
503 pcre_malloc = new_malloc;
504
505 /* Heading line, then prompt for first regex if stdin */
506
507 fprintf(outfile, "PCRE version %s\n\n", pcre_version());
508
509 /* Main loop */
510
511 while (!done)
512 {
513 pcre *re = NULL;
514 pcre_extra *extra = NULL;
515
516 #if !defined NOPOSIX /* There are still compilers that require no indent */
517 regex_t preg;
518 int do_posix = 0;
519 #endif
520
521 const char *error;
522 unsigned char *p, *pp, *ppp;
523 unsigned const char *tables = NULL;
524 int do_study = 0;
525 int do_debug = debug;
526 int do_G = 0;
527 int do_g = 0;
528 int do_showinfo = showinfo;
529 int do_showrest = 0;
530 int utf8 = 0;
531 int erroroffset, len, delimiter;
532
533 if (infile == stdin) printf(" re> ");
534 if (fgets((char *)buffer, sizeof(buffer), infile) == NULL) break;
535 if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
536
537 p = buffer;
538 while (isspace(*p)) p++;
539 if (*p == 0) continue;
540
541 /* Get the delimiter and seek the end of the pattern; if is isn't
542 complete, read more. */
543
544 delimiter = *p++;
545
546 if (isalnum(delimiter) || delimiter == '\\')
547 {
548 fprintf(outfile, "** Delimiter must not be alphameric or \\\n");
549 goto SKIP_DATA;
550 }
551
552 pp = p;
553
554 for(;;)
555 {
556 while (*pp != 0)
557 {
558 if (*pp == '\\' && pp[1] != 0) pp++;
559 else if (*pp == delimiter) break;
560 pp++;
561 }
562 if (*pp != 0) break;
563
564 len = sizeof(buffer) - (pp - buffer);
565 if (len < 256)
566 {
567 fprintf(outfile, "** Expression too long - missing delimiter?\n");
568 goto SKIP_DATA;
569 }
570
571 if (infile == stdin) printf(" > ");
572 if (fgets((char *)pp, len, infile) == NULL)
573 {
574 fprintf(outfile, "** Unexpected EOF\n");
575 done = 1;
576 goto CONTINUE;
577 }
578 if (infile != stdin) fprintf(outfile, "%s", (char *)pp);
579 }
580
581 /* If the first character after the delimiter is backslash, make
582 the pattern end with backslash. This is purely to provide a way
583 of testing for the error message when a pattern ends with backslash. */
584
585 if (pp[1] == '\\') *pp++ = '\\';
586
587 /* Terminate the pattern at the delimiter */
588
589 *pp++ = 0;
590
591 /* Look for options after final delimiter */
592
593 options = 0;
594 study_options = 0;
595 log_store = showstore; /* default from command line */
596
597 while (*pp != 0)
598 {
599 switch (*pp++)
600 {
601 case 'g': do_g = 1; break;
602 case 'i': options |= PCRE_CASELESS; break;
603 case 'm': options |= PCRE_MULTILINE; break;
604 case 's': options |= PCRE_DOTALL; break;
605 case 'x': options |= PCRE_EXTENDED; break;
606
607 case '+': do_showrest = 1; break;
608 case 'A': options |= PCRE_ANCHORED; break;
609 case 'D': do_debug = do_showinfo = 1; break;
610 case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
611 case 'G': do_G = 1; break;
612 case 'I': do_showinfo = 1; break;
613 case 'M': log_store = 1; break;
614
615 #if !defined NOPOSIX
616 case 'P': do_posix = 1; break;
617 #endif
618
619 case 'S': do_study = 1; break;
620 case 'U': options |= PCRE_UNGREEDY; break;
621 case 'X': options |= PCRE_EXTRA; break;
622 case '8': options |= PCRE_UTF8; utf8 = 1; break;
623
624 case 'L':
625 ppp = pp;
626 while (*ppp != '\n' && *ppp != ' ') ppp++;
627 *ppp = 0;
628 if (setlocale(LC_CTYPE, (const char *)pp) == NULL)
629 {
630 fprintf(outfile, "** Failed to set locale \"%s\"\n", pp);
631 goto SKIP_DATA;
632 }
633 tables = pcre_maketables();
634 pp = ppp;
635 break;
636
637 case '\n': case ' ': break;
638 default:
639 fprintf(outfile, "** Unknown option '%c'\n", pp[-1]);
640 goto SKIP_DATA;
641 }
642 }
643
644 /* Handle compiling via the POSIX interface, which doesn't support the
645 timing, showing, or debugging options, nor the ability to pass over
646 local character tables. */
647
648 #if !defined NOPOSIX
649 if (posix || do_posix)
650 {
651 int rc;
652 int cflags = 0;
653 if ((options & PCRE_CASELESS) != 0) cflags |= REG_ICASE;
654 if ((options & PCRE_MULTILINE) != 0) cflags |= REG_NEWLINE;
655 rc = regcomp(&preg, (char *)p, cflags);
656
657 /* Compilation failed; go back for another re, skipping to blank line
658 if non-interactive. */
659
660 if (rc != 0)
661 {
662 (void)regerror(rc, &preg, (char *)buffer, sizeof(buffer));
663 fprintf(outfile, "Failed: POSIX code %d: %s\n", rc, buffer);
664 goto SKIP_DATA;
665 }
666 }
667
668 /* Handle compiling via the native interface */
669
670 else
671 #endif /* !defined NOPOSIX */
672
673 {
674 if (timeit)
675 {
676 register int i;
677 clock_t time_taken;
678 clock_t start_time = clock();
679 for (i = 0; i < LOOPREPEAT; i++)
680 {
681 re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
682 if (re != NULL) free(re);
683 }
684 time_taken = clock() - start_time;
685 fprintf(outfile, "Compile time %.3f milliseconds\n",
686 ((double)time_taken * 1000.0) /
687 ((double)LOOPREPEAT * (double)CLOCKS_PER_SEC));
688 }
689
690 re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
691
692 /* Compilation failed; go back for another re, skipping to blank line
693 if non-interactive. */
694
695 if (re == NULL)
696 {
697 fprintf(outfile, "Failed: %s at offset %d\n", error, erroroffset);
698 SKIP_DATA:
699 if (infile != stdin)
700 {
701 for (;;)
702 {
703 if (fgets((char *)buffer, sizeof(buffer), infile) == NULL)
704 {
705 done = 1;
706 goto CONTINUE;
707 }
708 len = (int)strlen((char *)buffer);
709 while (len > 0 && isspace(buffer[len-1])) len--;
710 if (len == 0) break;
711 }
712 fprintf(outfile, "\n");
713 }
714 goto CONTINUE;
715 }
716
717 /* Compilation succeeded; print data if required. There are now two
718 info-returning functions. The old one has a limited interface and
719 returns only limited data. Check that it agrees with the newer one. */
720
721 if (do_showinfo)
722 {
723 int old_first_char, old_options, old_count;
724 int count, backrefmax, first_char, need_char;
725 size_t size;
726
727 if (do_debug) print_internals(re);
728
729 new_info(re, NULL, PCRE_INFO_OPTIONS, &options);
730 new_info(re, NULL, PCRE_INFO_SIZE, &size);
731 new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
732 new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax);
733 new_info(re, NULL, PCRE_INFO_FIRSTCHAR, &first_char);
734 new_info(re, NULL, PCRE_INFO_LASTLITERAL, &need_char);
735
736 old_count = pcre_info(re, &old_options, &old_first_char);
737 if (count < 0) fprintf(outfile,
738 "Error %d from pcre_info()\n", count);
739 else
740 {
741 if (old_count != count) fprintf(outfile,
742 "Count disagreement: pcre_fullinfo=%d pcre_info=%d\n", count,
743 old_count);
744
745 if (old_first_char != first_char) fprintf(outfile,
746 "First char disagreement: pcre_fullinfo=%d pcre_info=%d\n",
747 first_char, old_first_char);
748
749 if (old_options != options) fprintf(outfile,
750 "Options disagreement: pcre_fullinfo=%d pcre_info=%d\n", options,
751 old_options);
752 }
753
754 if (size != gotten_store) fprintf(outfile,
755 "Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
756 size, gotten_store);
757
758 fprintf(outfile, "Capturing subpattern count = %d\n", count);
759 if (backrefmax > 0)
760 fprintf(outfile, "Max back reference = %d\n", backrefmax);
761 if (options == 0) fprintf(outfile, "No options\n");
762 else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s\n",
763 ((options & PCRE_ANCHORED) != 0)? " anchored" : "",
764 ((options & PCRE_CASELESS) != 0)? " caseless" : "",
765 ((options & PCRE_EXTENDED) != 0)? " extended" : "",
766 ((options & PCRE_MULTILINE) != 0)? " multiline" : "",
767 ((options & PCRE_DOTALL) != 0)? " dotall" : "",
768 ((options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
769 ((options & PCRE_EXTRA) != 0)? " extra" : "",
770 ((options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
771 ((options & PCRE_UTF8) != 0)? " utf8" : "");
772
773 if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
774 fprintf(outfile, "Case state changes\n");
775
776 if (first_char == -1)
777 {
778 fprintf(outfile, "First char at start or follows \\n\n");
779 }
780 else if (first_char < 0)
781 {
782 fprintf(outfile, "No first char\n");
783 }
784 else
785 {
786 if (isprint(first_char))
787 fprintf(outfile, "First char = \'%c\'\n", first_char);
788 else
789 fprintf(outfile, "First char = %d\n", first_char);
790 }
791
792 if (need_char < 0)
793 {
794 fprintf(outfile, "No need char\n");
795 }
796 else
797 {
798 if (isprint(need_char))
799 fprintf(outfile, "Need char = \'%c\'\n", need_char);
800 else
801 fprintf(outfile, "Need char = %d\n", need_char);
802 }
803 }
804
805 /* If /S was present, study the regexp to generate additional info to
806 help with the matching. */
807
808 if (do_study)
809 {
810 if (timeit)
811 {
812 register int i;
813 clock_t time_taken;
814 clock_t start_time = clock();
815 for (i = 0; i < LOOPREPEAT; i++)
816 extra = pcre_study(re, study_options, &error);
817 time_taken = clock() - start_time;
818 if (extra != NULL) free(extra);
819 fprintf(outfile, " Study time %.3f milliseconds\n",
820 ((double)time_taken * 1000.0)/
821 ((double)LOOPREPEAT * (double)CLOCKS_PER_SEC));
822 }
823
824 extra = pcre_study(re, study_options, &error);
825 if (error != NULL)
826 fprintf(outfile, "Failed to study: %s\n", error);
827 else if (extra == NULL)
828 fprintf(outfile, "Study returned NULL\n");
829
830 else if (do_showinfo)
831 {
832 uschar *start_bits = NULL;
833 new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
834 if (start_bits == NULL)
835 fprintf(outfile, "No starting character set\n");
836 else
837 {
838 int i;
839 int c = 24;
840 fprintf(outfile, "Starting character set: ");
841 for (i = 0; i < 256; i++)
842 {
843 if ((start_bits[i/8] & (1<<(i%8))) != 0)
844 {
845 if (c > 75)
846 {
847 fprintf(outfile, "\n ");
848 c = 2;
849 }
850 if (isprint(i) && i != ' ')
851 {
852 fprintf(outfile, "%c ", i);
853 c += 2;
854 }
855 else
856 {
857 fprintf(outfile, "\\x%02x ", i);
858 c += 5;
859 }
860 }
861 }
862 fprintf(outfile, "\n");
863 }
864 }
865 }
866 }
867
868 /* Read data lines and test them */
869
870 for (;;)
871 {
872 unsigned char *q;
873 unsigned char *bptr = dbuffer;
874 int count, c;
875 int copystrings = 0;
876 int getstrings = 0;
877 int getlist = 0;
878 int gmatched = 0;
879 int start_offset = 0;
880 int g_notempty = 0;
881 int offsets[45];
882 int size_offsets = sizeof(offsets)/sizeof(int);
883
884 options = 0;
885
886 if (infile == stdin) printf("data> ");
887 if (fgets((char *)buffer, sizeof(buffer), infile) == NULL)
888 {
889 done = 1;
890 goto CONTINUE;
891 }
892 if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
893
894 len = (int)strlen((char *)buffer);
895 while (len > 0 && isspace(buffer[len-1])) len--;
896 buffer[len] = 0;
897 if (len == 0) break;
898
899 p = buffer;
900 while (isspace(*p)) p++;
901
902 q = dbuffer;
903 while ((c = *p++) != 0)
904 {
905 int i = 0;
906 int n = 0;
907 if (c == '\\') switch ((c = *p++))
908 {
909 case 'a': c = 7; break;
910 case 'b': c = '\b'; break;
911 case 'e': c = 27; break;
912 case 'f': c = '\f'; break;
913 case 'n': c = '\n'; break;
914 case 'r': c = '\r'; break;
915 case 't': c = '\t'; break;
916 case 'v': c = '\v'; break;
917
918 case '0': case '1': case '2': case '3':
919 case '4': case '5': case '6': case '7':
920 c -= '0';
921 while (i++ < 2 && isdigit(*p) && *p != '8' && *p != '9')
922 c = c * 8 + *p++ - '0';
923 break;
924
925 case 'x':
926
927 /* Handle \x{..} specially - new Perl thing for utf8 */
928
929 if (*p == '{')
930 {
931 unsigned char *pt = p;
932 c = 0;
933 while (isxdigit(*(++pt)))
934 c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'W');
935 if (*pt == '}')
936 {
937 unsigned char buffer[8];
938 int ii, utn;
939 utn = ord2utf8(c, buffer);
940 for (ii = 0; ii < utn - 1; ii++) *q++ = buffer[ii];
941 c = buffer[ii]; /* Last byte */
942 p = pt + 1;
943 break;
944 }
945 /* Not correct form; fall through */
946 }
947
948 /* Ordinary \x */
949
950 c = 0;
951 while (i++ < 2 && isxdigit(*p))
952 {
953 c = c * 16 + tolower(*p) - ((isdigit(*p))? '0' : 'W');
954 p++;
955 }
956 break;
957
958 case 0: /* Allows for an empty line */
959 p--;
960 continue;
961
962 case 'A': /* Option setting */
963 options |= PCRE_ANCHORED;
964 continue;
965
966 case 'B':
967 options |= PCRE_NOTBOL;
968 continue;
969
970 case 'C':
971 while(isdigit(*p)) n = n * 10 + *p++ - '0';
972 copystrings |= 1 << n;
973 continue;
974
975 case 'G':
976 while(isdigit(*p)) n = n * 10 + *p++ - '0';
977 getstrings |= 1 << n;
978 continue;
979
980 case 'L':
981 getlist = 1;
982 continue;
983
984 case 'N':
985 options |= PCRE_NOTEMPTY;
986 continue;
987
988 case 'O':
989 while(isdigit(*p)) n = n * 10 + *p++ - '0';
990 if (n <= (int)(sizeof(offsets)/sizeof(int))) size_offsets = n;
991 continue;
992
993 case 'Z':
994 options |= PCRE_NOTEOL;
995 continue;
996 }
997 *q++ = c;
998 }
999 *q = 0;
1000 len = q - dbuffer;
1001
1002 /* Handle matching via the POSIX interface, which does not
1003 support timing. */
1004
1005 #if !defined NOPOSIX
1006 if (posix || do_posix)
1007 {
1008 int rc;
1009 int eflags = 0;
1010 regmatch_t pmatch[sizeof(offsets)/sizeof(int)];
1011 if ((options & PCRE_NOTBOL) != 0) eflags |= REG_NOTBOL;
1012 if ((options & PCRE_NOTEOL) != 0) eflags |= REG_NOTEOL;
1013
1014 rc = regexec(&preg, (const char *)bptr, size_offsets, pmatch, eflags);
1015
1016 if (rc != 0)
1017 {
1018 (void)regerror(rc, &preg, (char *)buffer, sizeof(buffer));
1019 fprintf(outfile, "No match: POSIX code %d: %s\n", rc, buffer);
1020 }
1021 else
1022 {
1023 size_t i;
1024 for (i = 0; i < size_offsets; i++)
1025 {
1026 if (pmatch[i].rm_so >= 0)
1027 {
1028 fprintf(outfile, "%2d: ", (int)i);
1029 pchars(dbuffer + pmatch[i].rm_so,
1030 pmatch[i].rm_eo - pmatch[i].rm_so, utf8);
1031 fprintf(outfile, "\n");
1032 if (i == 0 && do_showrest)
1033 {
1034 fprintf(outfile, " 0+ ");
1035 pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo, utf8);
1036 fprintf(outfile, "\n");
1037 }
1038 }
1039 }
1040 }
1041 }
1042
1043 /* Handle matching via the native interface - repeats for /g and /G */
1044
1045 else
1046 #endif /* !defined NOPOSIX */
1047
1048 for (;; gmatched++) /* Loop for /g or /G */
1049 {
1050 if (timeit)
1051 {
1052 register int i;
1053 clock_t time_taken;
1054 clock_t start_time = clock();
1055 for (i = 0; i < LOOPREPEAT; i++)
1056 count = pcre_exec(re, extra, (char *)bptr, len,
1057 start_offset, options | g_notempty, offsets, size_offsets);
1058 time_taken = clock() - start_time;
1059 fprintf(outfile, "Execute time %.3f milliseconds\n",
1060 ((double)time_taken * 1000.0)/
1061 ((double)LOOPREPEAT * (double)CLOCKS_PER_SEC));
1062 }
1063
1064 count = pcre_exec(re, extra, (char *)bptr, len,
1065 start_offset, options | g_notempty, offsets, size_offsets);
1066
1067 if (count == 0)
1068 {
1069 fprintf(outfile, "Matched, but too many substrings\n");
1070 count = size_offsets/3;
1071 }
1072
1073 /* Matched */
1074
1075 if (count >= 0)
1076 {
1077 int i;
1078 for (i = 0; i < count * 2; i += 2)
1079 {
1080 if (offsets[i] < 0)
1081 fprintf(outfile, "%2d: <unset>\n", i/2);
1082 else
1083 {
1084 fprintf(outfile, "%2d: ", i/2);
1085 pchars(bptr + offsets[i], offsets[i+1] - offsets[i], utf8);
1086 fprintf(outfile, "\n");
1087 if (i == 0)
1088 {
1089 if (do_showrest)
1090 {
1091 fprintf(outfile, " 0+ ");
1092 pchars(bptr + offsets[i+1], len - offsets[i+1], utf8);
1093 fprintf(outfile, "\n");
1094 }
1095 }
1096 }
1097 }
1098
1099 for (i = 0; i < 32; i++)
1100 {
1101 if ((copystrings & (1 << i)) != 0)
1102 {
1103 char copybuffer[16];
1104 int rc = pcre_copy_substring((char *)bptr, offsets, count,
1105 i, copybuffer, sizeof(copybuffer));
1106 if (rc < 0)
1107 fprintf(outfile, "copy substring %d failed %d\n", i, rc);
1108 else
1109 fprintf(outfile, "%2dC %s (%d)\n", i, copybuffer, rc);
1110 }
1111 }
1112
1113 for (i = 0; i < 32; i++)
1114 {
1115 if ((getstrings & (1 << i)) != 0)
1116 {
1117 const char *substring;
1118 int rc = pcre_get_substring((char *)bptr, offsets, count,
1119 i, &substring);
1120 if (rc < 0)
1121 fprintf(outfile, "get substring %d failed %d\n", i, rc);
1122 else
1123 {
1124 fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
1125 /* free((void *)substring); */
1126 pcre_free_substring(substring);
1127 }
1128 }
1129 }
1130
1131 if (getlist)
1132 {
1133 const char **stringlist;
1134 int rc = pcre_get_substring_list((char *)bptr, offsets, count,
1135 &stringlist);
1136 if (rc < 0)
1137 fprintf(outfile, "get substring list failed %d\n", rc);
1138 else
1139 {
1140 for (i = 0; i < count; i++)
1141 fprintf(outfile, "%2dL %s\n", i, stringlist[i]);
1142 if (stringlist[i] != NULL)
1143 fprintf(outfile, "string list not terminated by NULL\n");
1144 /* free((void *)stringlist); */
1145 pcre_free_substring_list(stringlist);
1146 }
1147 }
1148 }
1149
1150 /* Failed to match. If this is a /g or /G loop and we previously set
1151 g_notempty after a null match, this is not necessarily the end.
1152 We want to advance the start offset, and continue. Fudge the offset
1153 values to achieve this. We won't be at the end of the string - that
1154 was checked before setting g_notempty. */
1155
1156 else
1157 {
1158 if (g_notempty != 0)
1159 {
1160 offsets[0] = start_offset;
1161 offsets[1] = start_offset + 1;
1162 }
1163 else
1164 {
1165 if (gmatched == 0) /* Error if no previous matches */
1166 {
1167 if (count == -1) fprintf(outfile, "No match\n");
1168 else fprintf(outfile, "Error %d\n", count);
1169 }
1170 break; /* Out of the /g loop */
1171 }
1172 }
1173
1174 /* If not /g or /G we are done */
1175
1176 if (!do_g && !do_G) break;
1177
1178 /* If we have matched an empty string, first check to see if we are at
1179 the end of the subject. If so, the /g loop is over. Otherwise, mimic
1180 what Perl's /g options does. This turns out to be rather cunning. First
1181 we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the
1182 same point. If this fails (picked up above) we advance to the next
1183 character. */
1184
1185 g_notempty = 0;
1186 if (offsets[0] == offsets[1])
1187 {
1188 if (offsets[0] == len) break;
1189 g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;
1190 }
1191
1192 /* For /g, update the start offset, leaving the rest alone */
1193
1194 if (do_g) start_offset = offsets[1];
1195
1196 /* For /G, update the pointer and length */
1197
1198 else
1199 {
1200 bptr += offsets[1];
1201 len -= offsets[1];
1202 }
1203 } /* End of loop for /g and /G */
1204 } /* End of loop for data lines */
1205
1206 CONTINUE:
1207
1208 #if !defined NOPOSIX
1209 if (posix || do_posix) regfree(&preg);
1210 #endif
1211
1212 if (re != NULL) free(re);
1213 if (extra != NULL) free(extra);
1214 if (tables != NULL)
1215 {
1216 free((void *)tables);
1217 setlocale(LC_CTYPE, "C");
1218 }
1219 }
1220
1221 fprintf(outfile, "\n");
1222 return 0;
1223 }
1224
1225 /* End */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12