/[pcre]/code/branches/pcre16/pcretest.c
ViewVC logotype

Diff of /code/branches/pcre16/pcretest.c

Parent Directory | Revision Log | View Patch

revision 47 by nigel, Sat Feb 24 21:39:29 2007 UTC revision 49 by nigel, Sat Feb 24 21:39:33 2007 UTC
# Line 38  static size_t gotten_store; Line 38  static size_t gotten_store;
38    
39    
40    
/* Tables used by the UTF-8 conversion functions below. Entry n of each table
describes a character that is encoded in n+1 bytes:

  utf8_table1[n]   the largest character value that fits in n+1 bytes
  utf8_table2[n]   the marker bits that are set in the first byte
  utf8_table3[n]   the mask for the data bits held in the first byte
*/

static int utf8_table1[] = {
  0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};

static int utf8_table2[] = {
  0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};

static int utf8_table3[] = {
  0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};


/*************************************************
*       Convert character value to UTF-8         *
*************************************************/

/* This function takes an integer value in the range 0 - 0x7fffffff
and encodes it as a UTF-8 character in 1 to 6 bytes. The most significant
bits of the value go into the first byte and each following byte carries the
next six bits, so the least significant six bits end up in the last byte.

Arguments:
  cvalue     the character value
  buffer     pointer to buffer for result - at least 6 bytes long

Returns:     number of characters placed in the buffer
             -1 if input character is negative
             0 if input character is positive but too big (only when
             int is longer than 32 bits)

Nothing is written to the buffer when the result is -1 or 0.
*/

static int
ord2utf8(int cvalue, unsigned char *buffer)
{
const int table_size = (int)(sizeof(utf8_table1)/sizeof(utf8_table1[0]));
int i, j;

if (cvalue < 0) return -1;

/* Find the shortest encoding that can hold the value; i ends up as the
number of continuation bytes. */

for (i = 0; i < table_size; i++)
  if (cvalue <= utf8_table1[i]) break;
if (i >= table_size) return 0;

/* Write the continuation bytes from the last one backwards, peeling six bits
off the bottom of the value each time. What is left over afterwards is exactly
the set of bits that belongs in the first byte. */

buffer += i;
for (j = i; j > 0; j--)
  {
  *buffer-- = (unsigned char)(0x80 | (cvalue & 0x3f));
  cvalue >>= 6;
  }
*buffer = (unsigned char)(utf8_table2[i] | cvalue);
return i + 1;
}


/*************************************************
*            Convert UTF-8 string to value       *
*************************************************/

/* This function takes one or more bytes that represents a UTF-8 character,
and returns the value of the character. The first byte supplies the most
significant bits and each continuation byte supplies the next six bits.
Encodings that are longer than necessary for their value are rejected.

Argument:
  buffer   a pointer to the byte vector
  vptr     a pointer to an int to receive the value

Returns:   >  0 => the number of bytes consumed
           -6 to 0 => malformed UTF-8 character at offset = (-return)

*vptr is set only when the return value is greater than zero.
*/

int
utf82ord(unsigned char *buffer, int *vptr)
{
const int table_size = (int)(sizeof(utf8_table1)/sizeof(utf8_table1[0]));
int c = *buffer++;
int d = c;
int i, j, s;

/* Count the leading 1-bits of the first byte. */

for (i = -1; i < 6; i++)               /* i is number of additional bytes */
  {
  if ((d & 0x80) == 0) break;
  d <<= 1;
  }

if (i == -1) { *vptr = c; return 1; }  /* ascii character */
if (i == 0 || i == 6) return 0;        /* invalid UTF-8 */

/* i now has a value in the range 1-5. The data bits of the first byte are
the most significant ones, so they are shifted up above the 6*i bits that
the continuation bytes will supply. */

s = 6*i;
d = (c & utf8_table3[i]) << s;

for (j = 0; j < i; j++)
  {
  c = *buffer++;
  if ((c & 0xc0) != 0x80) return -(j+1);
  s -= 6;
  d |= (c & 0x3f) << s;
  }

/* Check that encoding was the correct unique one */

for (j = 0; j < table_size; j++)
  if (d <= utf8_table1[j]) break;
if (j != i) return -(i+1);

/* Valid value */

*vptr = d;
return i+1;
}
142    
143    
144    
145    
146    
147    
148  /* Debugging function to print the internal form of the regex. This is the same  /* Debugging function to print the internal form of the regex. This is the same
149  code as contained in pcre.c under the DEBUG macro. */  code as contained in pcre.c under the DEBUG macro. */
150    
# Line 265  for(;;) Line 372  for(;;)
372    
373    
374    
375  /* Character string printing function. */  /* Character string printing function. A "normal" and a UTF-8 version. */
376    
377  static void pchars(unsigned char *p, int length)  static void pchars(unsigned char *p, int length, int utf8)
378  {  {
379  int c;  int c;
380  while (length-- > 0)  while (length-- > 0)
381      {
382      if (utf8)
383        {
384        int rc = utf82ord(p, &c);
385        if (rc > 0)
386          {
387          length -= rc - 1;
388          p += rc;
389          if (c < 256 && isprint(c)) fprintf(outfile, "%c", c);
390            else fprintf(outfile, "\\x{%02x}", c);
391          continue;
392          }
393        }
394    
395       /* Not UTF-8, or malformed UTF-8  */
396    
397    if (isprint(c = *(p++))) fprintf(outfile, "%c", c);    if (isprint(c = *(p++))) fprintf(outfile, "%c", c);
398      else fprintf(outfile, "\\x%02x", c);      else fprintf(outfile, "\\x%02x", c);
399      }
400  }  }
401    
402    
# Line 403  while (!done) Line 527  while (!done)
527    int do_g = 0;    int do_g = 0;
528    int do_showinfo = showinfo;    int do_showinfo = showinfo;
529    int do_showrest = 0;    int do_showrest = 0;
530      int utf8 = 0;
531    int erroroffset, len, delimiter;    int erroroffset, len, delimiter;
532    
533    if (infile == stdin) printf("  re> ");    if (infile == stdin) printf("  re> ");
# Line 494  while (!done) Line 619  while (!done)
619        case 'S': do_study = 1; break;        case 'S': do_study = 1; break;
620        case 'U': options |= PCRE_UNGREEDY; break;        case 'U': options |= PCRE_UNGREEDY; break;
621        case 'X': options |= PCRE_EXTRA; break;        case 'X': options |= PCRE_EXTRA; break;
622          case '8': options |= PCRE_UTF8; utf8 = 1; break;
623    
624        case 'L':        case 'L':
625        ppp = pp;        ppp = pp;
# Line 633  while (!done) Line 759  while (!done)
759        if (backrefmax > 0)        if (backrefmax > 0)
760          fprintf(outfile, "Max back reference = %d\n", backrefmax);          fprintf(outfile, "Max back reference = %d\n", backrefmax);
761        if (options == 0) fprintf(outfile, "No options\n");        if (options == 0) fprintf(outfile, "No options\n");
762          else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s\n",          else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s\n",
763            ((options & PCRE_ANCHORED) != 0)? " anchored" : "",            ((options & PCRE_ANCHORED) != 0)? " anchored" : "",
764            ((options & PCRE_CASELESS) != 0)? " caseless" : "",            ((options & PCRE_CASELESS) != 0)? " caseless" : "",
765            ((options & PCRE_EXTENDED) != 0)? " extended" : "",            ((options & PCRE_EXTENDED) != 0)? " extended" : "",
# Line 641  while (!done) Line 767  while (!done)
767            ((options & PCRE_DOTALL) != 0)? " dotall" : "",            ((options & PCRE_DOTALL) != 0)? " dotall" : "",
768            ((options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",            ((options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
769            ((options & PCRE_EXTRA) != 0)? " extra" : "",            ((options & PCRE_EXTRA) != 0)? " extra" : "",
770            ((options & PCRE_UNGREEDY) != 0)? " ungreedy" : "");            ((options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
771              ((options & PCRE_UTF8) != 0)? " utf8" : "");
772    
773        if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)        if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
774          fprintf(outfile, "Case state changes\n");          fprintf(outfile, "Case state changes\n");
# Line 796  while (!done) Line 923  while (!done)
923          break;          break;
924    
925          case 'x':          case 'x':
926    
927            /* Handle \x{..} specially - new Perl thing for utf8 */
928    
929            if (*p == '{')
930              {
931              unsigned char *pt = p;
932              c = 0;
933              while (isxdigit(*(++pt)))
934                c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'W');
935              if (*pt == '}')
936                {
937                unsigned char buffer[8];
938                int ii, utn;
939                utn = ord2utf8(c, buffer);
940                for (ii = 0; ii < utn - 1; ii++) *q++ = buffer[ii];
941                c = buffer[ii];   /* Last byte */
942                p = pt + 1;
943                break;
944                }
945              /* Not correct form; fall through */
946              }
947    
948            /* Ordinary \x */
949    
950          c = 0;          c = 0;
951          while (i++ < 2 && isxdigit(*p))          while (i++ < 2 && isxdigit(*p))
952            {            {
# Line 876  while (!done) Line 1027  while (!done)
1027              {              {
1028              fprintf(outfile, "%2d: ", (int)i);              fprintf(outfile, "%2d: ", (int)i);
1029              pchars(dbuffer + pmatch[i].rm_so,              pchars(dbuffer + pmatch[i].rm_so,
1030                pmatch[i].rm_eo - pmatch[i].rm_so);                pmatch[i].rm_eo - pmatch[i].rm_so, utf8);
1031              fprintf(outfile, "\n");              fprintf(outfile, "\n");
1032              if (i == 0 && do_showrest)              if (i == 0 && do_showrest)
1033                {                {
1034                fprintf(outfile, " 0+ ");                fprintf(outfile, " 0+ ");
1035                pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo);                pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo, utf8);
1036                fprintf(outfile, "\n");                fprintf(outfile, "\n");
1037                }                }
1038              }              }
# Line 931  while (!done) Line 1082  while (!done)
1082            else            else
1083              {              {
1084              fprintf(outfile, "%2d: ", i/2);              fprintf(outfile, "%2d: ", i/2);
1085              pchars(bptr + offsets[i], offsets[i+1] - offsets[i]);              pchars(bptr + offsets[i], offsets[i+1] - offsets[i], utf8);
1086              fprintf(outfile, "\n");              fprintf(outfile, "\n");
1087              if (i == 0)              if (i == 0)
1088                {                {
1089                if (do_showrest)                if (do_showrest)
1090                  {                  {
1091                  fprintf(outfile, " 0+ ");                  fprintf(outfile, " 0+ ");
1092                  pchars(bptr + offsets[i+1], len - offsets[i+1]);                  pchars(bptr + offsets[i+1], len - offsets[i+1], utf8);
1093                  fprintf(outfile, "\n");                  fprintf(outfile, "\n");
1094                  }                  }
1095                }                }
# Line 971  while (!done) Line 1122  while (!done)
1122              else              else
1123                {                {
1124                fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);                fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
1125                free((void *)substring);                /* free((void *)substring); */
1126                  pcre_free_substring(substring);
1127                }                }
1128              }              }
1129            }            }
# Line 989  while (!done) Line 1141  while (!done)
1141                fprintf(outfile, "%2dL %s\n", i, stringlist[i]);                fprintf(outfile, "%2dL %s\n", i, stringlist[i]);
1142              if (stringlist[i] != NULL)              if (stringlist[i] != NULL)
1143                fprintf(outfile, "string list not terminated by NULL\n");                fprintf(outfile, "string list not terminated by NULL\n");
1144              free((void *)stringlist);              /* free((void *)stringlist); */
1145                pcre_free_substring_list(stringlist);
1146              }              }
1147            }            }
1148          }          }

Legend:
Removed from v.47  
changed lines
  Added in v.49

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12