/[pcre]/code/trunk/pcretest.c
ViewVC logotype

Diff of /code/trunk/pcretest.c

Parent Directory | Revision Log | View Patch

revision 141 by ph10, Fri Mar 30 15:46:27 2007 UTC | revision 240 by ph10, Tue Sep 11 15:47:20 2007 UTC
# Line 36  POSSIBILITY OF SUCH DAMAGE. Line 36  POSSIBILITY OF SUCH DAMAGE.
36  */  */
37    
38    
39    #ifdef HAVE_CONFIG_H
40    #include "config.h"
41    #endif
42    
43  #include <ctype.h>  #include <ctype.h>
44  #include <stdio.h>  #include <stdio.h>
45  #include <string.h>  #include <string.h>
# Line 67  input mode under Windows. */ Line 71  input mode under Windows. */
71  #endif  #endif
72    
73    
74  #define PCRE_SPY        /* For Win32 build, import data, not export */  /* We have to include pcre_internal.h because we need the internal info for
75    displaying the results of pcre_study() and we also need to know about the
76  /* We include pcre_internal.h because we need the internal info for displaying  internal macros, structures, and other internal data values; pcretest has
77  the results of pcre_study() and we also need to know about the internal  "inside information" compared to a program that strictly follows the PCRE API.
78  macros, structures, and other internal data values; pcretest has "inside  
79  information" compared to a program that strictly follows the PCRE API. */  Although pcre_internal.h does itself include pcre.h, we explicitly include it
80    here before pcre_internal.h so that the PCRE_EXP_xxx macros get set
81    appropriately for an application, not for building PCRE. */
82    
83    #include "pcre.h"
84  #include "pcre_internal.h"  #include "pcre_internal.h"
85    
86  /* We need access to the data tables that PCRE uses. So as not to have to keep  /* We need access to the data tables that PCRE uses. So as not to have to keep
# Line 87  symbols to prevent clashes. */ Line 94  symbols to prevent clashes. */
94  #define _pcre_utf8_table4      utf8_table4  #define _pcre_utf8_table4      utf8_table4
95  #define _pcre_utt              utt  #define _pcre_utt              utt
96  #define _pcre_utt_size         utt_size  #define _pcre_utt_size         utt_size
97    #define _pcre_utt_names        utt_names
98  #define _pcre_OP_lengths       OP_lengths  #define _pcre_OP_lengths       OP_lengths
99    
100  #include "pcre_tables.c"  #include "pcre_tables.c"
# Line 149  static int callout_count; Line 157  static int callout_count;
157  static int callout_extra;  static int callout_extra;
158  static int callout_fail_count;  static int callout_fail_count;
159  static int callout_fail_id;  static int callout_fail_id;
160    static int debug_lengths;
161  static int first_callout;  static int first_callout;
162  static int locale_set = 0;  static int locale_set = 0;
163  static int show_malloc;  static int show_malloc;
# Line 656  return count; Line 665  return count;
665    
666    
667  /*************************************************  /*************************************************
668    *         Case-independent strncmp() function    *
669    *************************************************/
670    
671    /*
672    Arguments:
673      s         first string
674      t         second string
675      n         number of characters to compare
676    
677    Returns:    < 0, = 0, or > 0, according to the comparison
678    */
679    
680    static int
681    strncmpic(uschar *s, uschar *t, int n)
682    {
683    while (n--)
684      {
685      int c = tolower(*s++) - tolower(*t++);
686      if (c) return c;
687      }
688    return 0;
689    }
690    
691    
692    
693    /*************************************************
694  *         Check newline indicator                *  *         Check newline indicator                *
695  *************************************************/  *************************************************/
696    
697  /* This is used both at compile and run-time to check for <xxx> escapes, where  /* This is used both at compile and run-time to check for <xxx> escapes, where
698  xxx is LF, CR, CRLF, or ANY. Print a message and return 0 if there is no match.  xxx is LF, CR, CRLF, ANYCRLF, or ANY. Print a message and return 0 if there is
699    no match.
700    
701  Arguments:  Arguments:
702    p           points after the leading '<'    p           points after the leading '<'
# Line 672  Returns: appropriate PCRE_NEWLINE_x Line 708  Returns: appropriate PCRE_NEWLINE_x
708  static int  static int
709  check_newline(uschar *p, FILE *f)  check_newline(uschar *p, FILE *f)
710  {  {
711  if (strncmp((char *)p, "cr>", 3) == 0) return PCRE_NEWLINE_CR;  if (strncmpic(p, (uschar *)"cr>", 3) == 0) return PCRE_NEWLINE_CR;
712  if (strncmp((char *)p, "lf>", 3) == 0) return PCRE_NEWLINE_LF;  if (strncmpic(p, (uschar *)"lf>", 3) == 0) return PCRE_NEWLINE_LF;
713  if (strncmp((char *)p, "crlf>", 5) == 0) return PCRE_NEWLINE_CRLF;  if (strncmpic(p, (uschar *)"crlf>", 5) == 0) return PCRE_NEWLINE_CRLF;
714  if (strncmp((char *)p, "any>", 4) == 0) return PCRE_NEWLINE_ANY;  if (strncmpic(p, (uschar *)"anycrlf>", 8) == 0) return PCRE_NEWLINE_ANYCRLF;
715    if (strncmpic(p, (uschar *)"any>", 4) == 0) return PCRE_NEWLINE_ANY;
716    if (strncmpic(p, (uschar *)"bsr_anycrlf>", 12) == 0) return PCRE_BSR_ANYCRLF;
717    if (strncmpic(p, (uschar *)"bsr_unicode>", 12) == 0) return PCRE_BSR_UNICODE;
718  fprintf(f, "Unknown newline type at: <%s\n", p);  fprintf(f, "Unknown newline type at: <%s\n", p);
719  return 0;  return 0;
720  }  }
# Line 847  while (argc > 1 && argv[op][0] == '-') Line 886  while (argc > 1 && argv[op][0] == '-')
886      (void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);      (void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
887      printf("  Newline sequence is %s\n", (rc == '\r')? "CR" :      printf("  Newline sequence is %s\n", (rc == '\r')? "CR" :
888        (rc == '\n')? "LF" : (rc == ('\r'<<8 | '\n'))? "CRLF" :        (rc == '\n')? "LF" : (rc == ('\r'<<8 | '\n'))? "CRLF" :
889          (rc == -2)? "ANYCRLF" :
890        (rc == -1)? "ANY" : "???");        (rc == -1)? "ANY" : "???");
891        (void)pcre_config(PCRE_CONFIG_BSR, &rc);
892        printf("  \\R matches %s\n", rc? "CR, LF, or CRLF only" :
893                                         "all Unicode newlines");
894      (void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);      (void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
895      printf("  Internal link size = %d\n", rc);      printf("  Internal link size = %d\n", rc);
896      (void)pcre_config(PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, &rc);      (void)pcre_config(PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, &rc);
# Line 884  offsets = (int *)malloc(size_offsets_max Line 927  offsets = (int *)malloc(size_offsets_max
927  if (offsets == NULL)  if (offsets == NULL)
928    {    {
929    printf("** Failed to get %d bytes of memory for offsets vector\n",    printf("** Failed to get %d bytes of memory for offsets vector\n",
930      size_offsets_max * sizeof(int));      (int)(size_offsets_max * sizeof(int)));
931    yield = 1;    yield = 1;
932    goto EXIT;    goto EXIT;
933    }    }
# Line 944  while (!done) Line 987  while (!done)
987    size_t size, regex_gotten_store;    size_t size, regex_gotten_store;
988    int do_study = 0;    int do_study = 0;
989    int do_debug = debug;    int do_debug = debug;
   int debug_lengths = 1;  
990    int do_G = 0;    int do_G = 0;
991    int do_g = 0;    int do_g = 0;
992    int do_showinfo = showinfo;    int do_showinfo = showinfo;
# Line 953  while (!done) Line 995  while (!done)
995    int erroroffset, len, delimiter, poffset;    int erroroffset, len, delimiter, poffset;
996    
997    use_utf8 = 0;    use_utf8 = 0;
998      debug_lengths = 1;
999    
1000    if (infile == stdin) printf("  re> ");    if (infile == stdin) printf("  re> ");
1001    if (extend_inputline(infile, buffer) == NULL) break;    if (extend_inputline(infile, buffer) == NULL) break;
# Line 1312  while (!done) Line 1355  while (!done)
1355        rre->magic_number = byteflip(rre->magic_number, sizeof(rre->magic_number));        rre->magic_number = byteflip(rre->magic_number, sizeof(rre->magic_number));
1356        rre->size = byteflip(rre->size, sizeof(rre->size));        rre->size = byteflip(rre->size, sizeof(rre->size));
1357        rre->options = byteflip(rre->options, sizeof(rre->options));        rre->options = byteflip(rre->options, sizeof(rre->options));
1358          rre->flags = byteflip(rre->flags, sizeof(rre->flags));
1359        rre->top_bracket = byteflip(rre->top_bracket, sizeof(rre->top_bracket));        rre->top_bracket = byteflip(rre->top_bracket, sizeof(rre->top_bracket));
1360        rre->top_backref = byteflip(rre->top_backref, sizeof(rre->top_backref));        rre->top_backref = byteflip(rre->top_backref, sizeof(rre->top_backref));
1361        rre->first_byte = byteflip(rre->first_byte, sizeof(rre->first_byte));        rre->first_byte = byteflip(rre->first_byte, sizeof(rre->first_byte));
# Line 1346  while (!done) Line 1390  while (!done)
1390  #if !defined NOINFOCHECK  #if !defined NOINFOCHECK
1391        int old_first_char, old_options, old_count;        int old_first_char, old_options, old_count;
1392  #endif  #endif
1393        int count, backrefmax, first_char, need_char;        int count, backrefmax, first_char, need_char, okpartial, jchanged,
1394            hascrorlf;
1395        int nameentrysize, namecount;        int nameentrysize, namecount;
1396        const uschar *nametable;        const uschar *nametable;
1397    
# Line 1359  while (!done) Line 1404  while (!done)
1404        new_info(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);        new_info(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
1405        new_info(re, NULL, PCRE_INFO_NAMECOUNT, &namecount);        new_info(re, NULL, PCRE_INFO_NAMECOUNT, &namecount);
1406        new_info(re, NULL, PCRE_INFO_NAMETABLE, (void *)&nametable);        new_info(re, NULL, PCRE_INFO_NAMETABLE, (void *)&nametable);
1407          new_info(re, NULL, PCRE_INFO_OKPARTIAL, &okpartial);
1408          new_info(re, NULL, PCRE_INFO_JCHANGED, &jchanged);
1409          new_info(re, NULL, PCRE_INFO_HASCRORLF, &hascrorlf);
1410    
1411  #if !defined NOINFOCHECK  #if !defined NOINFOCHECK
1412        old_count = pcre_info(re, &old_options, &old_first_char);        old_count = pcre_info(re, &old_options, &old_first_char);
# Line 1400  while (!done) Line 1448  while (!done)
1448            }            }
1449          }          }
1450    
1451        /* The NOPARTIAL bit is a private bit in the options, so we have        if (!okpartial) fprintf(outfile, "Partial matching not supported\n");
1452        to fish it out via out back door */        if (hascrorlf) fprintf(outfile, "Contains explicit CR or LF match\n");
1453    
1454        all_options = ((real_pcre *)re)->options;        all_options = ((real_pcre *)re)->options;
1455        if (do_flip)        if (do_flip) all_options = byteflip(all_options, sizeof(all_options));
         {  
         all_options = byteflip(all_options, sizeof(all_options));  
          }  
   
       if ((all_options & PCRE_NOPARTIAL) != 0)  
         fprintf(outfile, "Partial matching not supported\n");  
1456    
1457        if (get_options == 0) fprintf(outfile, "No options\n");        if (get_options == 0) fprintf(outfile, "No options\n");
1458          else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s%s%s%s\n",          else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
1459            ((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",            ((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
1460            ((get_options & PCRE_CASELESS) != 0)? " caseless" : "",            ((get_options & PCRE_CASELESS) != 0)? " caseless" : "",
1461            ((get_options & PCRE_EXTENDED) != 0)? " extended" : "",            ((get_options & PCRE_EXTENDED) != 0)? " extended" : "",
1462            ((get_options & PCRE_MULTILINE) != 0)? " multiline" : "",            ((get_options & PCRE_MULTILINE) != 0)? " multiline" : "",
1463            ((get_options & PCRE_FIRSTLINE) != 0)? " firstline" : "",            ((get_options & PCRE_FIRSTLINE) != 0)? " firstline" : "",
1464            ((get_options & PCRE_DOTALL) != 0)? " dotall" : "",            ((get_options & PCRE_DOTALL) != 0)? " dotall" : "",
1465              ((get_options & PCRE_BSR_ANYCRLF) != 0)? " bsr_anycrlf" : "",
1466              ((get_options & PCRE_BSR_UNICODE) != 0)? " bsr_unicode" : "",
1467            ((get_options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",            ((get_options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
1468            ((get_options & PCRE_EXTRA) != 0)? " extra" : "",            ((get_options & PCRE_EXTRA) != 0)? " extra" : "",
1469            ((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",            ((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
# Line 1428  while (!done) Line 1472  while (!done)
1472            ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "",            ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "",
1473            ((get_options & PCRE_DUPNAMES) != 0)? " dupnames" : "");            ((get_options & PCRE_DUPNAMES) != 0)? " dupnames" : "");
1474    
1475          if (jchanged) fprintf(outfile, "Duplicate name status changes\n");
1476    
1477        switch (get_options & PCRE_NEWLINE_BITS)        switch (get_options & PCRE_NEWLINE_BITS)
1478          {          {
1479          case PCRE_NEWLINE_CR:          case PCRE_NEWLINE_CR:
# Line 1442  while (!done) Line 1488  while (!done)
1488          fprintf(outfile, "Forced newline sequence: CRLF\n");          fprintf(outfile, "Forced newline sequence: CRLF\n");
1489          break;          break;
1490    
1491            case PCRE_NEWLINE_ANYCRLF:
1492            fprintf(outfile, "Forced newline sequence: ANYCRLF\n");
1493            break;
1494    
1495          case PCRE_NEWLINE_ANY:          case PCRE_NEWLINE_ANY:
1496          fprintf(outfile, "Forced newline sequence: ANY\n");          fprintf(outfile, "Forced newline sequence: ANY\n");
1497          break;          break;
# Line 1591  while (!done) Line 1641  while (!done)
1641    for (;;)    for (;;)
1642      {      {
1643      uschar *q;      uschar *q;
1644      uschar *bptr = dbuffer;      uschar *bptr;
1645      int *use_offsets = offsets;      int *use_offsets = offsets;
1646      int use_size_offsets = size_offsets;      int use_size_offsets = size_offsets;
1647      int callout_data = 0;      int callout_data = 0;
# Line 1647  while (!done) Line 1697  while (!done)
1697      p = buffer;      p = buffer;
1698      while (isspace(*p)) p++;      while (isspace(*p)) p++;
1699    
1700      q = dbuffer;      bptr = q = dbuffer;
1701      while ((c = *p++) != 0)      while ((c = *p++) != 0)
1702        {        {
1703        int i = 0;        int i = 0;
# Line 1842  while (!done) Line 1892  while (!done)
1892            if (offsets == NULL)            if (offsets == NULL)
1893              {              {
1894              printf("** Failed to get %d bytes of memory for offsets vector\n",              printf("** Failed to get %d bytes of memory for offsets vector\n",
1895                size_offsets_max * sizeof(int));                (int)(size_offsets_max * sizeof(int)));
1896              yield = 1;              yield = 1;
1897              goto EXIT;              goto EXIT;
1898              }              }
# Line 1972  while (!done) Line 2022  while (!done)
2022    
2023      for (;; gmatched++)    /* Loop for /g or /G */      for (;; gmatched++)    /* Loop for /g or /G */
2024        {        {
       int gany_fudge;  
2025        if (timeitm > 0)        if (timeitm > 0)
2026          {          {
2027          register int i;          register int i;
# Line 2212  while (!done) Line 2261  while (!done)
2261          }          }
2262    
2263        /* Failed to match. If this is a /g or /G loop and we previously set        /* Failed to match. If this is a /g or /G loop and we previously set
2264        g_notempty after a null match, this is not necessarily the end.        g_notempty after a null match, this is not necessarily the end. We want
2265        We want to advance the start offset, and continue. In the case of UTF-8        to advance the start offset, and continue. We won't be at the end of the
2266        matching, the advance must be one character, not one byte. Fudge the        string - that was checked before setting g_notempty.
2267        offset values to achieve this. We won't be at the end of the string -  
2268        that was checked before setting g_notempty. */        Complication arises in the case when the newline option is "any" or
2269          "anycrlf". If the previous match was at the end of a line terminated by
2270          CRLF, an advance of one character just passes the \r, whereas we should
2271          prefer the longer newline sequence, as does the code in pcre_exec().
2272          Fudge the offset value to achieve this.
2273    
2274          Otherwise, in the case of UTF-8 matching, the advance must be one
2275          character, not one byte. */
2276    
2277        else        else
2278          {          {
2279          if (g_notempty != 0)          if (g_notempty != 0)
2280            {            {
2281            int onechar = 1;            int onechar = 1;
2282              unsigned int obits = ((real_pcre *)re)->options;
2283            use_offsets[0] = start_offset;            use_offsets[0] = start_offset;
2284            if (use_utf8)            if ((obits & PCRE_NEWLINE_BITS) == 0)
2285                {
2286                int d;
2287                (void)pcre_config(PCRE_CONFIG_NEWLINE, &d);
2288                obits = (d == '\r')? PCRE_NEWLINE_CR :
2289                        (d == '\n')? PCRE_NEWLINE_LF :
2290                        (d == ('\r'<<8 | '\n'))? PCRE_NEWLINE_CRLF :
2291                        (d == -2)? PCRE_NEWLINE_ANYCRLF :
2292                        (d == -1)? PCRE_NEWLINE_ANY : 0;
2293                }
2294              if (((obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANY ||
2295                   (obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANYCRLF)
2296                  &&
2297                  start_offset < len - 1 &&
2298                  bptr[start_offset] == '\r' &&
2299                  bptr[start_offset+1] == '\n')
2300                onechar++;
2301              else if (use_utf8)
2302              {              {
2303              while (start_offset + onechar < len)              while (start_offset + onechar < len)
2304                {                {
# Line 2256  while (!done) Line 2330  while (!done)
2330        what Perl's /g options does. This turns out to be rather cunning. First        what Perl's /g options does. This turns out to be rather cunning. First
2331        we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the        we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the
2332        same point. If this fails (picked up above) we advance to the next        same point. If this fails (picked up above) we advance to the next
2333        character.        character. */
   
       Yet more complication arises in the case when the newline option is  
       "any" and a pattern in multiline mode has to match at the start of a  
       line. If a previous match was at the end of a line, and advance of one  
       character just passes the \r, whereas we should prefer the longer newline  
       sequence, as does the code in pcre_exec(). So we fudge it. */  
2334    
2335        g_notempty = 0;        g_notempty = 0;
2336        gany_fudge = 0;  
   
2337        if (use_offsets[0] == use_offsets[1])        if (use_offsets[0] == use_offsets[1])
2338          {          {
2339          if (use_offsets[0] == len) break;          if (use_offsets[0] == len) break;
2340          g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;          g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;
         if ((((real_pcre *)re)->options & PCRE_STARTLINE) != 0 &&  
             (((real_pcre *)re)->options & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANY &&  
             use_offsets[0] < len - 1 &&  
             bptr[use_offsets[0]] == '\r' &&  
             bptr[use_offsets[0]+1] == '\n')  
           gany_fudge = 1;  
2341          }          }
2342    
2343        /* For /g, update the start offset, leaving the rest alone */        /* For /g, update the start offset, leaving the rest alone */
2344    
2345        if (do_g) start_offset = use_offsets[1] + gany_fudge;        if (do_g) start_offset = use_offsets[1];
2346    
2347        /* For /G, update the pointer and length */        /* For /G, update the pointer and length */
2348    
2349        else        else
2350          {          {
2351          bptr += use_offsets[1] + gany_fudge;          bptr += use_offsets[1];
2352          len -= use_offsets[1] + gany_fudge;          len -= use_offsets[1];
2353          }          }
2354        }  /* End of loop for /g and /G */        }  /* End of loop for /g and /G */
2355    

Legend:
Removed from v.141  
changed lines
  Added in v.240

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12