/[pcre]/code/trunk/pcretest.c
ViewVC logotype

Diff of /code/trunk/pcretest.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 27 by nigel, Sat Feb 24 21:38:49 2007 UTC revision 57 by nigel, Sat Feb 24 21:39:50 2007 UTC
# Line 12  Line 12 
12  /* Use the internal info for displaying the results of pcre_study(). */  /* Use the internal info for displaying the results of pcre_study(). */
13    
14  #include "internal.h"  #include "internal.h"
15    
16    /* It is possible to compile this test program without including support for
17    testing the POSIX interface, though this is not available via the standard
18    Makefile. */
19    
20    #if !defined NOPOSIX
21  #include "pcreposix.h"  #include "pcreposix.h"
22    #endif
23    
24  #ifndef CLOCKS_PER_SEC  #ifndef CLOCKS_PER_SEC
25  #ifdef CLK_TCK  #ifdef CLK_TCK
# Line 27  Line 34 
34    
35  static FILE *outfile;  static FILE *outfile;
36  static int log_store = 0;  static int log_store = 0;
37    static size_t gotten_store;
38    
39    
40    
41    static int utf8_table1[] = {
42      0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
43    
44    static int utf8_table2[] = {
45      0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
46    
47    static int utf8_table3[] = {
48      0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
49    
50    
51    /*************************************************
52    *       Convert character value to UTF-8         *
53    *************************************************/
54    
55    /* This function takes an integer value in the range 0 - 0x7fffffff
56    and encodes it as a UTF-8 character in 0 to 6 bytes.
57    
58    Arguments:
59      cvalue     the character value
60      buffer     pointer to buffer for result - at least 6 bytes long
61    
62    Returns:     number of bytes placed in the buffer
63                 -1 if input character is negative
64                 0 if input character is positive but too big (only when
65                 int is longer than 32 bits)
66    */
67    
68    static int
69    ord2utf8(int cvalue, unsigned char *buffer)
70    {
71    register int i, j;
72    for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
73      if (cvalue <= utf8_table1[i]) break;
74    if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
75    if (cvalue < 0) return -1;
76    *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
77    cvalue >>= 6 - i;
78    for (j = 0; j < i; j++)
79      {
80      *buffer++ = 0x80 | (cvalue & 0x3f);
81      cvalue >>= 6;
82      }
83    return i + 1;
84    }
85    
86    
87    /*************************************************
88    *            Convert UTF-8 string to value       *
89    *************************************************/
90    
91    /* This function takes one or more bytes that represent a UTF-8 character,
92    and returns the value of the character.
93    
94    Argument:
95      buffer   a pointer to the byte vector
96      vptr     a pointer to an int to receive the value
97    
98    Returns:   >  0 => the number of bytes consumed
99               -6 to 0 => malformed UTF-8 character at offset = (-return)
100    */
101    
102    int
103    utf82ord(unsigned char *buffer, int *vptr)
104    {
105    int c = *buffer++;
106    int d = c;
107    int i, j, s;
108    
109    for (i = -1; i < 6; i++)               /* i is number of additional bytes */
110      {
111      if ((d & 0x80) == 0) break;
112      d <<= 1;
113      }
114    
115    if (i == -1) { *vptr = c; return 1; }  /* ascii character */
116    if (i == 0 || i == 6) return 0;        /* invalid UTF-8 */
117    
118    /* i now has a value in the range 1-5 */
119    
120    d = c & utf8_table3[i];
121    s = 6 - i;
122    
123    for (j = 0; j < i; j++)
124      {
125      c = *buffer++;
126      if ((c & 0xc0) != 0x80) return -(j+1);
127      d |= (c & 0x3f) << s;
128      s += 6;
129      }
130    
131    /* Check that encoding was the correct unique one */
132    
133    for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
134      if (d <= utf8_table1[j]) break;
135    if (j != i) return -(i+1);
136    
137    /* Valid value */
138    
139    *vptr = d;
140    return i+1;
141    }
142    
143    
144    
145    
146    
147    
# Line 41  static const char *OP_names[] = { Line 156  static const char *OP_names[] = {
156    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
157    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
158    "*", "*?", "+", "+?", "?", "??", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{",
159    "class", "Ref",    "class", "Ref", "Recurse",
160    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
161    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
162    "Brazero", "Braminzero", "Bra"    "Brazero", "Braminzero", "Branumber", "Bra"
163  };  };
164    
165    
166  static void print_internals(pcre *re, FILE *outfile)  static void print_internals(pcre *re)
167  {  {
168  unsigned char *code = ((real_pcre *)re)->code;  unsigned char *code = ((real_pcre *)re)->code;
169    
# Line 63  for(;;) Line 178  for(;;)
178    
179    if (*code >= OP_BRA)    if (*code >= OP_BRA)
180      {      {
181      fprintf(outfile, "%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);      if (*code - OP_BRA > EXTRACT_BASIC_MAX)
182          fprintf(outfile, "%3d Bra extra", (code[1] << 8) + code[2]);
183        else
184          fprintf(outfile, "%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
185      code += 2;      code += 2;
186      }      }
187    
# Line 79  for(;;) Line 197  for(;;)
197      code++;      code++;
198      break;      break;
199    
     case OP_COND:  
     fprintf(outfile, "%3d Cond", (code[1] << 8) + code[2]);  
     code += 2;  
     break;  
   
     case OP_CREF:  
     fprintf(outfile, " %.2d %s", code[1], OP_names[*code]);  
     code++;  
     break;  
   
200      case OP_CHARS:      case OP_CHARS:
201      charlength = *(++code);      charlength = *(++code);
202      fprintf(outfile, "%3d ", charlength);      fprintf(outfile, "%3d ", charlength);
# Line 106  for(;;) Line 214  for(;;)
214      case OP_ASSERTBACK:      case OP_ASSERTBACK:
215      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
216      case OP_ONCE:      case OP_ONCE:
217      fprintf(outfile, "%3d %s", (code[1] << 8) + code[2], OP_names[*code]);      case OP_COND:
218      code += 2;      case OP_BRANUMBER:
     break;  
   
219      case OP_REVERSE:      case OP_REVERSE:
220        case OP_CREF:
221      fprintf(outfile, "%3d %s", (code[1] << 8) + code[2], OP_names[*code]);      fprintf(outfile, "%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
222      code += 2;      code += 2;
223      break;      break;
# Line 183  for(;;) Line 290  for(;;)
290      break;      break;
291    
292      case OP_REF:      case OP_REF:
293      fprintf(outfile, "    \\%d", *(++code));      fprintf(outfile, "    \\%d", (code[1] << 8) | code[2]);
294      code++;      code += 3;
295      goto CLASS_REF_REPEAT;      goto CLASS_REF_REPEAT;
296    
297      case OP_CLASS:      case OP_CLASS:
# Line 257  for(;;) Line 364  for(;;)
364    
365    
366    
367  /* Character string printing function. */  /* Character string printing function. A "normal" and a UTF-8 version. */
368    
369  static void pchars(unsigned char *p, int length)  static void pchars(unsigned char *p, int length, int utf8)
370  {  {
371  int c;  int c;
372  while (length-- > 0)  while (length-- > 0)
373      {
374      if (utf8)
375        {
376        int rc = utf82ord(p, &c);
377        if (rc > 0)
378          {
379          length -= rc - 1;
380          p += rc;
381          if (c < 256 && isprint(c)) fprintf(outfile, "%c", c);
382            else fprintf(outfile, "\\x{%02x}", c);
383          continue;
384          }
385        }
386    
387       /* Not UTF-8, or malformed UTF-8  */
388    
389    if (isprint(c = *(p++))) fprintf(outfile, "%c", c);    if (isprint(c = *(p++))) fprintf(outfile, "%c", c);
390      else fprintf(outfile, "\\x%02x", c);      else fprintf(outfile, "\\x%02x", c);
391      }
392  }  }
393    
394    
# Line 274  compiled re. */ Line 398  compiled re. */
398    
399  static void *new_malloc(size_t size)  static void *new_malloc(size_t size)
400  {  {
401  if (log_store) fprintf(outfile, "Store size request: %d\n", (int)size);  gotten_store = size;
402    if (log_store)
403      fprintf(outfile, "Memory allocation (code space): %d\n",
404        (int)((int)size - offsetof(real_pcre, code[0])));
405  return malloc(size);  return malloc(size);
406  }  }
407    
408    
409    
410    
411    /* Get one piece of information from the pcre_fullinfo() function */
412    
413    static void new_info(pcre *re, pcre_extra *study, int option, void *ptr)
414    {
415    int rc;
416    if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
417      fprintf(outfile, "Error %d from pcre_fullinfo(%d)\n", rc, option);
418    }
419    
420    
421    
422    
423  /* Read lines from named file or stdin and write to named file or stdout; lines  /* Read lines from named file or stdin and write to named file or stdout; lines
424  consist of a regular expression, in delimiters and optionally followed by  consist of a regular expression, in delimiters and optionally followed by
425  options, followed by a set of test data, terminated by an empty line. */  options, followed by a set of test data, terminated by an empty line. */
# Line 292  int study_options = 0; Line 432  int study_options = 0;
432  int op = 1;  int op = 1;
433  int timeit = 0;  int timeit = 0;
434  int showinfo = 0;  int showinfo = 0;
435    int showstore = 0;
436    int size_offsets = 45;
437    int size_offsets_max;
438    int *offsets;
439    #if !defined NOPOSIX
440  int posix = 0;  int posix = 0;
441    #endif
442  int debug = 0;  int debug = 0;
443  int done = 0;  int done = 0;
444  unsigned char buffer[30000];  unsigned char buffer[30000];
# Line 306  outfile = stdout; Line 452  outfile = stdout;
452    
453  while (argc > 1 && argv[op][0] == '-')  while (argc > 1 && argv[op][0] == '-')
454    {    {
455    if (strcmp(argv[op], "-s") == 0) log_store = 1;    char *endptr;
456    
457      if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
458        showstore = 1;
459    else if (strcmp(argv[op], "-t") == 0) timeit = 1;    else if (strcmp(argv[op], "-t") == 0) timeit = 1;
460    else if (strcmp(argv[op], "-i") == 0) showinfo = 1;    else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
461    else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;    else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
462      else if (strcmp(argv[op], "-o") == 0 && argc > 2 &&
463          ((size_offsets = strtoul(argv[op+1], &endptr, 10)), *endptr == 0))
464        {
465        op++;
466        argc--;
467        }
468    #if !defined NOPOSIX
469    else if (strcmp(argv[op], "-p") == 0) posix = 1;    else if (strcmp(argv[op], "-p") == 0) posix = 1;
470    #endif
471    else    else
472      {      {
473      printf("*** Unknown option %s\n", argv[op]);      printf("** Unknown or malformed option %s\n", argv[op]);
474      printf("Usage: pcretest [-d] [-i] [-p] [-s] [-t] [<input> [<output>]]\n");      printf("Usage:   pcretest [-d] [-i] [-o <n>] [-p] [-s] [-t] [<input> [<output>]]\n");
475      printf("  -d   debug: show compiled code; implies -i\n"      printf("  -d     debug: show compiled code; implies -i\n"
476             "  -i   show information about compiled pattern\n"             "  -i     show information about compiled pattern\n"
477             "  -p   use POSIX interface\n"             "  -o <n> set size of offsets vector to <n>\n");
478             "  -s   output store information\n"  #if !defined NOPOSIX
479             "  -t   time compilation and execution\n");      printf("  -p     use POSIX interface\n");
480    #endif
481        printf("  -s     output store information\n"
482               "  -t     time compilation and execution\n");
483      return 1;      return 1;
484      }      }
485    op++;    op++;
486    argc--;    argc--;
487    }    }
488    
489    /* Get the store for the offsets vector, and remember what it was */
490    
491    size_offsets_max = size_offsets;
492    offsets = malloc(size_offsets_max * sizeof(int));
493    if (offsets == NULL)
494      {
495      printf("** Failed to get %d bytes of memory for offsets vector\n",
496        size_offsets_max * sizeof(int));
497      return 1;
498      }
499    
500  /* Sort out the input and output files */  /* Sort out the input and output files */
501    
502  if (argc > 1)  if (argc > 1)
# Line 362  while (!done) Line 533  while (!done)
533    {    {
534    pcre *re = NULL;    pcre *re = NULL;
535    pcre_extra *extra = NULL;    pcre_extra *extra = NULL;
536    
537    #if !defined NOPOSIX  /* There are still compilers that require no indent */
538    regex_t preg;    regex_t preg;
539      int do_posix = 0;
540    #endif
541    
542    const char *error;    const char *error;
543    unsigned char *p, *pp, *ppp;    unsigned char *p, *pp, *ppp;
544    unsigned const char *tables = NULL;    const unsigned char *tables = NULL;
545    int do_study = 0;    int do_study = 0;
546    int do_debug = debug;    int do_debug = debug;
547      int do_G = 0;
548      int do_g = 0;
549    int do_showinfo = showinfo;    int do_showinfo = showinfo;
550    int do_posix = 0;    int do_showrest = 0;
551      int utf8 = 0;
552    int erroroffset, len, delimiter;    int erroroffset, len, delimiter;
553    
554    if (infile == stdin) printf("  re> ");    if (infile == stdin) printf("  re> ");
# Line 385  while (!done) Line 564  while (!done)
564    
565    delimiter = *p++;    delimiter = *p++;
566    
567    if (isalnum(delimiter))    if (isalnum(delimiter) || delimiter == '\\')
568      {      {
569      fprintf(outfile, "** Delimiter must not be alphameric\n");      fprintf(outfile, "** Delimiter must not be alphameric or \\\n");
570      goto SKIP_DATA;      goto SKIP_DATA;
571      }      }
572    
# Line 395  while (!done) Line 574  while (!done)
574    
575    for(;;)    for(;;)
576      {      {
577      while (*pp != 0 && *pp != delimiter) pp++;      while (*pp != 0)
578          {
579          if (*pp == '\\' && pp[1] != 0) pp++;
580            else if (*pp == delimiter) break;
581          pp++;
582          }
583      if (*pp != 0) break;      if (*pp != 0) break;
584    
585      len = sizeof(buffer) - (pp - buffer);      len = sizeof(buffer) - (pp - buffer);
# Line 415  while (!done) Line 599  while (!done)
599      if (infile != stdin) fprintf(outfile, "%s", (char *)pp);      if (infile != stdin) fprintf(outfile, "%s", (char *)pp);
600      }      }
601    
602      /* If the first character after the delimiter is backslash, make
603      the pattern end with backslash. This is purely to provide a way
604      of testing for the error message when a pattern ends with backslash. */
605    
606      if (pp[1] == '\\') *pp++ = '\\';
607    
608    /* Terminate the pattern at the delimiter */    /* Terminate the pattern at the delimiter */
609    
610    *pp++ = 0;    *pp++ = 0;
# Line 423  while (!done) Line 613  while (!done)
613    
614    options = 0;    options = 0;
615    study_options = 0;    study_options = 0;
616      log_store = showstore;  /* default from command line */
617    
618    while (*pp != 0)    while (*pp != 0)
619      {      {
620      switch (*pp++)      switch (*pp++)
621        {        {
622          case 'g': do_g = 1; break;
623        case 'i': options |= PCRE_CASELESS; break;        case 'i': options |= PCRE_CASELESS; break;
624        case 'm': options |= PCRE_MULTILINE; break;        case 'm': options |= PCRE_MULTILINE; break;
625        case 's': options |= PCRE_DOTALL; break;        case 's': options |= PCRE_DOTALL; break;
626        case 'x': options |= PCRE_EXTENDED; break;        case 'x': options |= PCRE_EXTENDED; break;
627    
628          case '+': do_showrest = 1; break;
629        case 'A': options |= PCRE_ANCHORED; break;        case 'A': options |= PCRE_ANCHORED; break;
630        case 'D': do_debug = do_showinfo = 1; break;        case 'D': do_debug = do_showinfo = 1; break;
631        case 'E': options |= PCRE_DOLLAR_ENDONLY; break;        case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
632          case 'G': do_G = 1; break;
633        case 'I': do_showinfo = 1; break;        case 'I': do_showinfo = 1; break;
634          case 'M': log_store = 1; break;
635    
636    #if !defined NOPOSIX
637        case 'P': do_posix = 1; break;        case 'P': do_posix = 1; break;
638    #endif
639    
640        case 'S': do_study = 1; break;        case 'S': do_study = 1; break;
641        case 'U': options |= PCRE_UNGREEDY; break;        case 'U': options |= PCRE_UNGREEDY; break;
642        case 'X': options |= PCRE_EXTRA; break;        case 'X': options |= PCRE_EXTRA; break;
643          case '8': options |= PCRE_UTF8; utf8 = 1; break;
644    
645        case 'L':        case 'L':
646        ppp = pp;        ppp = pp;
# Line 465  while (!done) Line 666  while (!done)
666    timing, showing, or debugging options, nor the ability to pass over    timing, showing, or debugging options, nor the ability to pass over
667    local character tables. */    local character tables. */
668    
669    #if !defined NOPOSIX
670    if (posix || do_posix)    if (posix || do_posix)
671      {      {
672      int rc;      int rc;
# Line 487  while (!done) Line 689  while (!done)
689    /* Handle compiling via the native interface */    /* Handle compiling via the native interface */
690    
691    else    else
692    #endif  /* !defined NOPOSIX */
693    
694      {      {
695      if (timeit)      if (timeit)
696        {        {
# Line 531  while (!done) Line 735  while (!done)
735        goto CONTINUE;        goto CONTINUE;
736        }        }
737    
738      /* Compilation succeeded; print data if required */      /* Compilation succeeded; print data if required. There are now two
739        info-returning functions. The old one has a limited interface and
740        returns only limited data. Check that it agrees with the newer one. */
741    
742      if (do_showinfo)      if (do_showinfo)
743        {        {
744        int first_char, count;        unsigned long int get_options;
745          int old_first_char, old_options, old_count;
746          int count, backrefmax, first_char, need_char;
747          size_t size;
748    
749          if (do_debug) print_internals(re);
750    
751          new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
752          new_info(re, NULL, PCRE_INFO_SIZE, &size);
753          new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
754          new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax);
755          new_info(re, NULL, PCRE_INFO_FIRSTCHAR, &first_char);
756          new_info(re, NULL, PCRE_INFO_LASTLITERAL, &need_char);
757    
758        if (do_debug) print_internals(re, outfile);        old_count = pcre_info(re, &old_options, &old_first_char);
   
       count = pcre_info(re, &options, &first_char);  
759        if (count < 0) fprintf(outfile,        if (count < 0) fprintf(outfile,
760          "Error %d while reading info\n", count);          "Error %d from pcre_info()\n", count);
761        else        else
762          {          {
763          fprintf(outfile, "Identifying subpattern count = %d\n", count);          if (old_count != count) fprintf(outfile,
764          if (options == 0) fprintf(outfile, "No options\n");            "Count disagreement: pcre_fullinfo=%d pcre_info=%d\n", count,
765            else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s\n",              old_count);
766              ((options & PCRE_ANCHORED) != 0)? " anchored" : "",  
767              ((options & PCRE_CASELESS) != 0)? " caseless" : "",          if (old_first_char != first_char) fprintf(outfile,
768              ((options & PCRE_EXTENDED) != 0)? " extended" : "",            "First char disagreement: pcre_fullinfo=%d pcre_info=%d\n",
769              ((options & PCRE_MULTILINE) != 0)? " multiline" : "",              first_char, old_first_char);
770              ((options & PCRE_DOTALL) != 0)? " dotall" : "",  
771              ((options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",          if (old_options != (int)get_options) fprintf(outfile,
772              ((options & PCRE_EXTRA) != 0)? " extra" : "",            "Options disagreement: pcre_fullinfo=%ld pcre_info=%d\n",
773              ((options & PCRE_UNGREEDY) != 0)? " ungreedy" : "");              get_options, old_options);
774          if (first_char == -1)          }
775            {  
776            fprintf(outfile, "First char at start or follows \\n\n");        if (size != gotten_store) fprintf(outfile,
777            }          "Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
778          else if (first_char < 0)          size, gotten_store);
779            {  
780            fprintf(outfile, "No first char\n");        fprintf(outfile, "Capturing subpattern count = %d\n", count);
781            }        if (backrefmax > 0)
782            fprintf(outfile, "Max back reference = %d\n", backrefmax);
783          if (get_options == 0) fprintf(outfile, "No options\n");
784            else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s\n",
785              ((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
786              ((get_options & PCRE_CASELESS) != 0)? " caseless" : "",
787              ((get_options & PCRE_EXTENDED) != 0)? " extended" : "",
788              ((get_options & PCRE_MULTILINE) != 0)? " multiline" : "",
789              ((get_options & PCRE_DOTALL) != 0)? " dotall" : "",
790              ((get_options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
791              ((get_options & PCRE_EXTRA) != 0)? " extra" : "",
792              ((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
793              ((get_options & PCRE_UTF8) != 0)? " utf8" : "");
794    
795          if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
796            fprintf(outfile, "Case state changes\n");
797    
798          if (first_char == -1)
799            {
800            fprintf(outfile, "First char at start or follows \\n\n");
801            }
802          else if (first_char < 0)
803            {
804            fprintf(outfile, "No first char\n");
805            }
806          else
807            {
808            if (isprint(first_char))
809              fprintf(outfile, "First char = \'%c\'\n", first_char);
810          else          else
811            {            fprintf(outfile, "First char = %d\n", first_char);
812            if (isprint(first_char))          }
813              fprintf(outfile, "First char = \'%c\'\n", first_char);  
814            else        if (need_char < 0)
815              fprintf(outfile, "First char = %d\n", first_char);          {
816            }          fprintf(outfile, "No need char\n");
817            }
818          else
819            {
820            if (isprint(need_char))
821              fprintf(outfile, "Need char = \'%c\'\n", need_char);
822            else
823              fprintf(outfile, "Need char = %d\n", need_char);
824          }          }
825        }        }
826    
# Line 598  while (!done) Line 849  while (!done)
849        else if (extra == NULL)        else if (extra == NULL)
850          fprintf(outfile, "Study returned NULL\n");          fprintf(outfile, "Study returned NULL\n");
851    
       /* This looks at internal information. A bit kludgy to do it this  
       way, but it is useful for testing. */  
   
852        else if (do_showinfo)        else if (do_showinfo)
853          {          {
854          real_pcre_extra *xx = (real_pcre_extra *)extra;          uschar *start_bits = NULL;
855          if ((xx->options & PCRE_STUDY_MAPPED) == 0)          new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
856            if (start_bits == NULL)
857            fprintf(outfile, "No starting character set\n");            fprintf(outfile, "No starting character set\n");
858          else          else
859            {            {
# Line 613  while (!done) Line 862  while (!done)
862            fprintf(outfile, "Starting character set: ");            fprintf(outfile, "Starting character set: ");
863            for (i = 0; i < 256; i++)            for (i = 0; i < 256; i++)
864              {              {
865              if ((xx->start_bits[i/8] & (1<<(i%8))) != 0)              if ((start_bits[i/8] & (1<<(i%8))) != 0)
866                {                {
867                if (c > 75)                if (c > 75)
868                  {                  {
# Line 643  while (!done) Line 892  while (!done)
892    for (;;)    for (;;)
893      {      {
894      unsigned char *q;      unsigned char *q;
895        unsigned char *bptr = dbuffer;
896        int *use_offsets = offsets;
897        int use_size_offsets = size_offsets;
898      int count, c;      int count, c;
899      int offsets[45];      int copystrings = 0;
900      int size_offsets = sizeof(offsets)/sizeof(int);      int getstrings = 0;
901        int getlist = 0;
902        int gmatched = 0;
903        int start_offset = 0;
904        int g_notempty = 0;
905    
906      options = 0;      options = 0;
907    
908      if (infile == stdin) printf("  data> ");      if (infile == stdin) printf("data> ");
909      if (fgets((char *)buffer, sizeof(buffer), infile) == NULL)      if (fgets((char *)buffer, sizeof(buffer), infile) == NULL)
910        {        {
911        done = 1;        done = 1;
# Line 689  while (!done) Line 945  while (!done)
945          break;          break;
946    
947          case 'x':          case 'x':
948    
949            /* Handle \x{..} specially - new Perl thing for utf8 */
950    
951            if (*p == '{')
952              {
953              unsigned char *pt = p;
954              c = 0;
955              while (isxdigit(*(++pt)))
956                c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'W');
957              if (*pt == '}')
958                {
959                unsigned char buffer[8];
960                int ii, utn;
961                utn = ord2utf8(c, buffer);
962                for (ii = 0; ii < utn - 1; ii++) *q++ = buffer[ii];
963                c = buffer[ii];   /* Last byte */
964                p = pt + 1;
965                break;
966                }
967              /* Not correct form; fall through */
968              }
969    
970            /* Ordinary \x */
971    
972          c = 0;          c = 0;
973          while (i++ < 2 && isxdigit(*p))          while (i++ < 2 && isxdigit(*p))
974            {            {
# Line 709  while (!done) Line 989  while (!done)
989          options |= PCRE_NOTBOL;          options |= PCRE_NOTBOL;
990          continue;          continue;
991    
992            case 'C':
993            while(isdigit(*p)) n = n * 10 + *p++ - '0';
994            copystrings |= 1 << n;
995            continue;
996    
997            case 'G':
998            while(isdigit(*p)) n = n * 10 + *p++ - '0';
999            getstrings |= 1 << n;
1000            continue;
1001    
1002            case 'L':
1003            getlist = 1;
1004            continue;
1005    
1006            case 'N':
1007            options |= PCRE_NOTEMPTY;
1008            continue;
1009    
1010          case 'O':          case 'O':
1011          while(isdigit(*p)) n = n * 10 + *p++ - '0';          while(isdigit(*p)) n = n * 10 + *p++ - '0';
1012          if (n <= (int)(sizeof(offsets)/sizeof(int))) size_offsets = n;          if (n > size_offsets_max)
1013              {
1014              size_offsets_max = n;
1015              free(offsets);
1016              use_offsets = offsets = malloc(size_offsets_max * sizeof(int));
1017              if (offsets == NULL)
1018                {
1019                printf("** Failed to get %d bytes of memory for offsets vector\n",
1020                  size_offsets_max * sizeof(int));
1021                return 1;
1022                }
1023              }
1024            use_size_offsets = n;
1025            if (n == 0) use_offsets = NULL;
1026          continue;          continue;
1027    
1028          case 'Z':          case 'Z':
# Line 726  while (!done) Line 1037  while (!done)
1037      /* Handle matching via the POSIX interface, which does not      /* Handle matching via the POSIX interface, which does not
1038      support timing. */      support timing. */
1039    
1040    #if !defined NOPOSIX
1041      if (posix || do_posix)      if (posix || do_posix)
1042        {        {
1043        int rc;        int rc;
1044        int eflags = 0;        int eflags = 0;
1045        regmatch_t pmatch[30];        regmatch_t *pmatch = malloc(sizeof(regmatch_t) * use_size_offsets);
1046        if ((options & PCRE_NOTBOL) != 0) eflags |= REG_NOTBOL;        if ((options & PCRE_NOTBOL) != 0) eflags |= REG_NOTBOL;
1047        if ((options & PCRE_NOTEOL) != 0) eflags |= REG_NOTEOL;        if ((options & PCRE_NOTEOL) != 0) eflags |= REG_NOTEOL;
1048    
1049        rc = regexec(&preg, (char *)dbuffer, sizeof(pmatch)/sizeof(regmatch_t),        rc = regexec(&preg, (const char *)bptr, use_size_offsets, pmatch, eflags);
         pmatch, eflags);  
1050    
1051        if (rc != 0)        if (rc != 0)
1052          {          {
# Line 745  while (!done) Line 1056  while (!done)
1056        else        else
1057          {          {
1058          size_t i;          size_t i;
1059          for (i = 0; i < sizeof(pmatch)/sizeof(regmatch_t); i++)          for (i = 0; i < use_size_offsets; i++)
1060            {            {
1061            if (pmatch[i].rm_so >= 0)            if (pmatch[i].rm_so >= 0)
1062              {              {
1063              fprintf(outfile, "%2d: ", (int)i);              fprintf(outfile, "%2d: ", (int)i);
1064              pchars(dbuffer + pmatch[i].rm_so,              pchars(dbuffer + pmatch[i].rm_so,
1065                pmatch[i].rm_eo - pmatch[i].rm_so);                pmatch[i].rm_eo - pmatch[i].rm_so, utf8);
1066              fprintf(outfile, "\n");              fprintf(outfile, "\n");
1067                if (i == 0 && do_showrest)
1068                  {
1069                  fprintf(outfile, " 0+ ");
1070                  pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo, utf8);
1071                  fprintf(outfile, "\n");
1072                  }
1073              }              }
1074            }            }
1075          }          }
1076          free(pmatch);
1077        }        }
1078    
1079      /* Handle matching via the native interface */      /* Handle matching via the native interface - repeats for /g and /G */
1080    
1081      else      else
1082    #endif  /* !defined NOPOSIX */
1083    
1084        for (;; gmatched++)    /* Loop for /g or /G */
1085        {        {
1086        if (timeit)        if (timeit)
1087          {          {
# Line 768  while (!done) Line 1089  while (!done)
1089          clock_t time_taken;          clock_t time_taken;
1090          clock_t start_time = clock();          clock_t start_time = clock();
1091          for (i = 0; i < LOOPREPEAT; i++)          for (i = 0; i < LOOPREPEAT; i++)
1092            count = pcre_exec(re, extra, (char *)dbuffer, len, options, offsets,            count = pcre_exec(re, extra, (char *)bptr, len,
1093              size_offsets);              start_offset, options | g_notempty, use_offsets, use_size_offsets);
1094          time_taken = clock() - start_time;          time_taken = clock() - start_time;
1095          fprintf(outfile, "Execute time %.3f milliseconds\n",          fprintf(outfile, "Execute time %.3f milliseconds\n",
1096            ((double)time_taken * 1000.0)/            ((double)time_taken * 1000.0)/
1097            ((double)LOOPREPEAT * (double)CLOCKS_PER_SEC));            ((double)LOOPREPEAT * (double)CLOCKS_PER_SEC));
1098          }          }
1099    
1100        count = pcre_exec(re, extra, (char *)dbuffer, len, options, offsets,        count = pcre_exec(re, extra, (char *)bptr, len,
1101          size_offsets);          start_offset, options | g_notempty, use_offsets, use_size_offsets);
1102    
1103        if (count == 0)        if (count == 0)
1104          {          {
1105          fprintf(outfile, "Matched, but too many substrings\n");          fprintf(outfile, "Matched, but too many substrings\n");
1106          count = size_offsets/3;          count = use_size_offsets/3;
1107          }          }
1108    
1109          /* Matched */
1110    
1111        if (count >= 0)        if (count >= 0)
1112          {          {
1113          int i;          int i;
1114          count *= 2;          for (i = 0; i < count * 2; i += 2)
         for (i = 0; i < count; i += 2)  
1115            {            {
1116            if (offsets[i] < 0)            if (use_offsets[i] < 0)
1117              fprintf(outfile, "%2d: <unset>\n", i/2);              fprintf(outfile, "%2d: <unset>\n", i/2);
1118            else            else
1119              {              {
1120              fprintf(outfile, "%2d: ", i/2);              fprintf(outfile, "%2d: ", i/2);
1121              pchars(dbuffer + offsets[i], offsets[i+1] - offsets[i]);              pchars(bptr + use_offsets[i], use_offsets[i+1] - use_offsets[i], utf8);
1122              fprintf(outfile, "\n");              fprintf(outfile, "\n");
1123                if (i == 0)
1124                  {
1125                  if (do_showrest)
1126                    {
1127                    fprintf(outfile, " 0+ ");
1128                    pchars(bptr + use_offsets[i+1], len - use_offsets[i+1], utf8);
1129                    fprintf(outfile, "\n");
1130                    }
1131                  }
1132                }
1133              }
1134    
1135            for (i = 0; i < 32; i++)
1136              {
1137              if ((copystrings & (1 << i)) != 0)
1138                {
1139                char copybuffer[16];
1140                int rc = pcre_copy_substring((char *)bptr, use_offsets, count,
1141                  i, copybuffer, sizeof(copybuffer));
1142                if (rc < 0)
1143                  fprintf(outfile, "copy substring %d failed %d\n", i, rc);
1144                else
1145                  fprintf(outfile, "%2dC %s (%d)\n", i, copybuffer, rc);
1146                }
1147              }
1148    
1149            for (i = 0; i < 32; i++)
1150              {
1151              if ((getstrings & (1 << i)) != 0)
1152                {
1153                const char *substring;
1154                int rc = pcre_get_substring((char *)bptr, use_offsets, count,
1155                  i, &substring);
1156                if (rc < 0)
1157                  fprintf(outfile, "get substring %d failed %d\n", i, rc);
1158                else
1159                  {
1160                  fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
1161                  /* free((void *)substring); */
1162                  pcre_free_substring(substring);
1163                  }
1164                }
1165              }
1166    
1167            if (getlist)
1168              {
1169              const char **stringlist;
1170              int rc = pcre_get_substring_list((char *)bptr, use_offsets, count,
1171                &stringlist);
1172              if (rc < 0)
1173                fprintf(outfile, "get substring list failed %d\n", rc);
1174              else
1175                {
1176                for (i = 0; i < count; i++)
1177                  fprintf(outfile, "%2dL %s\n", i, stringlist[i]);
1178                if (stringlist[i] != NULL)
1179                  fprintf(outfile, "string list not terminated by NULL\n");
1180                /* free((void *)stringlist); */
1181                pcre_free_substring_list(stringlist);
1182              }              }
1183            }            }
1184          }          }
1185    
1186          /* Failed to match. If this is a /g or /G loop and we previously set
1187          g_notempty after a null match, this is not necessarily the end.
1188          We want to advance the start offset, and continue. Fudge the offset
1189          values to achieve this. We won't be at the end of the string - that
1190          was checked before setting g_notempty. */
1191    
1192        else        else
1193          {          {
1194          if (count == -1) fprintf(outfile, "No match\n");          if (g_notempty != 0)
1195            else fprintf(outfile, "Error %d\n", count);            {
1196              use_offsets[0] = start_offset;
1197              use_offsets[1] = start_offset + 1;
1198              }
1199            else
1200              {
1201              if (gmatched == 0)   /* Error if no previous matches */
1202                {
1203                if (count == -1) fprintf(outfile, "No match\n");
1204                  else fprintf(outfile, "Error %d\n", count);
1205                }
1206              break;  /* Out of the /g loop */
1207              }
1208          }          }
1209        }  
1210      }        /* If not /g or /G we are done */
1211    
1212          if (!do_g && !do_G) break;
1213    
1214          /* If we have matched an empty string, first check to see if we are at
1215          the end of the subject. If so, the /g loop is over. Otherwise, mimic
1216          what Perl's /g option does. This turns out to be rather cunning. First
1217          we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the
1218          same point. If this fails (picked up above) we advance to the next
1219          character. */
1220    
1221          g_notempty = 0;
1222          if (use_offsets[0] == use_offsets[1])
1223            {
1224            if (use_offsets[0] == len) break;
1225            g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;
1226            }
1227    
1228          /* For /g, update the start offset, leaving the rest alone */
1229    
1230          if (do_g) start_offset = use_offsets[1];
1231    
1232          /* For /G, update the pointer and length */
1233    
1234          else
1235            {
1236            bptr += use_offsets[1];
1237            len -= use_offsets[1];
1238            }
1239          }  /* End of loop for /g and /G */
1240        }    /* End of loop for data lines */
1241    
1242    CONTINUE:    CONTINUE:
1243    
1244    #if !defined NOPOSIX
1245    if (posix || do_posix) regfree(&preg);    if (posix || do_posix) regfree(&preg);
1246    #endif
1247    
1248    if (re != NULL) free(re);    if (re != NULL) free(re);
1249    if (extra != NULL) free(extra);    if (extra != NULL) free(extra);
1250    if (tables != NULL)    if (tables != NULL)

Legend:
Removed from v.27  
changed lines
  Added in v.57

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12