/[pcre]/code/trunk/maint/utf8.c
ViewVC logotype

Contents of /code/trunk/maint/utf8.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 946 - (hide annotations) (download)
Wed Feb 29 18:00:55 2012 UTC (2 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 6017 byte(s)
Tidy up this developer's test program and add more descriptive comments.

1 ph10 946 /* A test program for converting characters to UTF-8 and vice versa. Note that
2     this program conforms to the original definition of UTF-8, which allows
3     codepoints up to 7fffffff. The more recent definition limits the validity of
4     UTF-8 codepoints to a maximum of 10ffff.
5 ph10 97
6 ph10 946 The arguments are either single codepoint values, written as 0xhhhh, for
7     conversion to UTF-8, or sequences of hex values, written without 0x and
8     optionally including spaces (but such arguments must be quoted), for conversion
9     from UTF-8 to codepoints. For example:
10    
11     ./utf8 0x1234
12     0x00001234 => e1 88 b4
13    
14     ./utf8 "e1 88 b4"
15     0x00001234 <= e1 88 b4
16    
17     In the second case, a number of characters can be present in one argument:
18    
19     ./utf8 "65 e188b4 77"
20     0x00000065 <= 65
21     0x00001234 <= e1 88 b4
22     0x00000077 <= 77
23    
24     If the option -s is given, the sequence of UTF-8 bytes is written out between
25     angle brackets at the end of the line. On a UTF-8 terminal, this will show the
26     appropriate graphic for the codepoint. */
27    
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
31    
32     /* The valid ranges for UTF-8 characters are:
33    
34     0000 0000 to 0000 007f 1 byte (ascii)
35     0000 0080 to 0000 07ff 2 bytes
36     0000 0800 to 0000 ffff 3 bytes
37     0001 0000 to 001f ffff 4 bytes
38     0020 0000 to 03ff ffff 5 bytes
39     0400 0000 to 7fff ffff 6 bytes
40     */
41    
42    
43     static const int utf8_table1[] = {
44     0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
45    
46     static const int utf8_table2[] = {
47     0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
48    
49     static const int utf8_table3[] = {
50     0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
51    
52     static const unsigned char utf8_table4[] = {
53     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
54     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
55     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
56     3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,6 };
57    
58    
59     /*************************************************
60     * Convert character value to UTF-8 *
61     *************************************************/
62    
63     /* This function takes an integer value in the range 0 - 0x7fffffff
64 ph10 522 and encodes it as a UTF-8 character in 1 to 6 bytes.
65 ph10 97
66     Arguments:
67     cvalue the character value
68     buffer pointer to buffer for result - at least 6 bytes long
69    
70     Returns: number of characters placed in the buffer
71     -1 if input character is negative
72     0 if input character is positive but too big (only when
73     int is longer than 32 bits)
74     */
75    
76     int
77     ord2utf8(int cvalue, unsigned char *buffer)
78     {
79     register int i, j;
80     for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
81     if (cvalue <= utf8_table1[i]) break;
82     if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
83     if (cvalue < 0) return -1;
84     buffer += i;
85     for (j = i; j > 0; j--)
86     {
87     *buffer-- = 0x80 | (cvalue & 0x3f);
88     cvalue >>= 6;
89     }
90     *buffer = utf8_table2[i] | cvalue;
91     return i + 1;
92     }
93    
94    
95    
96     /*************************************************
97     * Convert UTF-8 string to value *
98     *************************************************/
99    
100     /* This function takes one or more bytes that represents a UTF-8 character,
101     and returns the value of the character.
102    
103     Argument:
104     buffer a pointer to the byte vector
105     vptr a pointer to an int to receive the value
106    
107     Returns: > 0 => the number of bytes consumed
108     -6 to 0 => malformed UTF-8 character at offset = (-return)
109     */
110    
111     int
112     utf82ord(unsigned char *buffer, int *vptr)
113     {
114     int c = *buffer++;
115     int d = c;
116     int i, j, s;
117    
118     for (i = -1; i < 6; i++) /* i is number of additional bytes */
119     {
120     if ((d & 0x80) == 0) break;
121     d <<= 1;
122     }
123    
124     if (i == -1) { *vptr = c; return 1; } /* ascii character */
125     if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
126    
127     /* i now has a value in the range 1-5 */
128    
129     s = 6*i;
130     d = (c & utf8_table3[i]) << s;
131    
132     for (j = 0; j < i; j++)
133     {
134     c = *buffer++;
135     if ((c & 0xc0) != 0x80) return -(j+1);
136     s -= 6;
137     d |= (c & 0x3f) << s;
138     }
139    
140     /* Check that encoding was the correct unique one */
141    
142     for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
143     if (d <= utf8_table1[j]) break;
144     if (j != i) return -(i+1);
145    
146     /* Valid value */
147    
148     *vptr = d;
149     return i+1;
150     }
151    
152    
153    
154    
155     /*************************************************
156     * Main Program *
157     *************************************************/
158    
159    
160     int
161     main(int argc, char **argv)
162     {
163 ph10 592 int i = 1;
164     int show = 0;
165 ph10 946 unsigned char buffer[64];
166 ph10 592
167 ph10 946 if (argc > 1 && strcmp(argv[1], "-s") == 0)
168 ph10 97 {
169 ph10 592 show = 1;
170     i = 2;
171     }
172    
173     for (; i < argc; i++)
174     {
175 ph10 97 unsigned char *x = argv[i];
176     if (strncmp(x, "0x", 2) == 0)
177     {
178     int j;
179     int d = strtol(x+2, NULL, 16);
180     int rc = ord2utf8(d, buffer);
181     printf("0x%08x => ", d);
182 ph10 592 if (rc <= 0) printf("*** Error %d ***", rc); else
183     {
184     for (j = 0; j < rc; j++) printf("%02x ", buffer[j]);
185     if (show)
186     {
187     printf(">");
188     for (j = 0; j < rc; j++) printf("%c", buffer[j]);
189     printf("<");
190     }
191     }
192 ph10 97 printf("\n");
193     }
194     else
195     {
196     int d, rc;
197     int j = 0;
198     int y = 0;
199 ph10 946 int z = 0;
200     unsigned char *bptr;
201    
202 ph10 97 for (;;)
203     {
204     while (*x == ' ') x++;
205     if (*x == 0 && !z) break;
206     if (!isxdigit(*x))
207     {
208     printf("Malformed hex string: %s\n", argv[i]);
209     j = -1;
210     break;
211     }
212     y = y * 16 + tolower(*x) - ((isdigit(*x))? '0' : 'W');
213     x++;
214     if (z)
215     {
216     buffer[j++] = y;
217     y = 0;
218     }
219     z ^= 1;
220     }
221 ph10 946 buffer[j] = 0;
222     bptr = buffer;
223    
224     while (*bptr != 0)
225     {
226     rc = utf82ord(bptr, &d);
227     if (rc > 0)
228     {
229     printf("0x%08x <= ", d);
230     for (j = 0; j < rc; j++) printf("%02x ", bptr[j]);
231     if (show)
232     {
233     printf(">");
234     for (j = 0; j < rc; j++) printf("%c", bptr[j]);
235     printf("<");
236     }
237     printf("\n");
238     bptr += rc;
239     }
240     else
241     {
242     printf("Malformed UTF-8 at offset %d <= ", -rc);
243     while (*bptr != 0) printf("%02x ", *bptr++);
244     printf("\n");
245     break;
246     }
247     }
248 ph10 97 }
249     }
250     return 0;
251     }
252    
253     /* End */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12