/[pcre]/code/trunk/maint/utf8.c
ViewVC logotype

Contents of /code/trunk/maint/utf8.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 946 - (show annotations) (download)
Wed Feb 29 18:00:55 2012 UTC (2 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 6017 byte(s)
Tidy up this developer's test program and add more descriptive comments.

1 /* A test program for converting characters to UTF-8 and vice versa. Note that
2 this program conforms to the original definition of UTF-8, which allows
3 codepoints up to 7fffffff. The more recent definition limits the validity of
4 UTF-8 codepoints to a maximum of 10ffff.
5
6 The arguments are either single codepoint values, written as 0xhhhh, for
7 conversion to UTF-8, or sequences of hex values, written without 0x and
8 optionally including spaces (but such arguments must be quoted), for conversion
9 from UTF-8 to codepoints. For example:
10
11 ./utf8 0x1234
12 0x00001234 => e1 88 b4
13
14 ./utf8 "e1 88 b4"
15 0x00001234 <= e1 88 b4
16
17 In the second case, a number of characters can be present in one argument:
18
19 ./utf8 "65 e188b4 77"
20 0x00000065 <= 65
21 0x00001234 <= e1 88 b4
22 0x00000077 <= 77
23
24 If the option -s is given, the sequence of UTF-8 bytes is written out between
25 angle brackets at the end of the line. On a UTF-8 terminal, this will show the
26 appropriate graphic for the codepoint. */
27
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
31
32 /* The valid ranges for UTF-8 characters are:
33
34 0000 0000 to 0000 007f 1 byte (ascii)
35 0000 0080 to 0000 07ff 2 bytes
36 0000 0800 to 0000 ffff 3 bytes
37 0001 0000 to 001f ffff 4 bytes
38 0020 0000 to 03ff ffff 5 bytes
39 0400 0000 to 7fff ffff 6 bytes
40 */
41
42
/* Upper limits of the code point ranges. Entry n is the largest value that
ord2utf8() will encode in n+1 bytes, and utf82ord() uses the same limits to
reject encodings that are longer than necessary. */

static const int utf8_table1[] = {
  0x0000007f,    /* 1 byte  */
  0x000007ff,    /* 2 bytes */
  0x0000ffff,    /* 3 bytes */
  0x001fffff,    /* 4 bytes */
  0x03ffffff,    /* 5 bytes */
  0x7fffffff     /* 6 bytes */
};

/* Marker bits that ord2utf8() ORs into the first byte of a sequence, indexed
by the number of additional bytes that follow it. */

static const int utf8_table2[] = {
  0,             /* no additional bytes */
  0xc0,          /* 1 additional byte   */
  0xe0,          /* 2 additional bytes  */
  0xf0,          /* 3 additional bytes  */
  0xf8,          /* 4 additional bytes  */
  0xfc           /* 5 additional bytes  */
};

/* Masks that utf82ord() applies to the first byte of a sequence to extract
its data bits, indexed by the number of additional bytes that follow it. */

static const int utf8_table3[] = {
  0xff,          /* no additional bytes */
  0x1f,          /* 1 additional byte   */
  0x0f,          /* 2 additional bytes  */
  0x07,          /* 3 additional bytes  */
  0x03,          /* 4 additional bytes  */
  0x01           /* 5 additional bytes  */
};

/* This table is not referenced by the code in this file.
NOTE(review): the layout (64 entries) suggests it is meant to be indexed by the
low six bits of a lead byte to give a count of following bytes - confirm
before relying on it. */

static const unsigned char utf8_table4[] = {
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
  3,3,3,3, 3,3,3,3, 4,4,4,4, 5,5,6,6
};
57
58
59 /*************************************************
60 * Convert character value to UTF-8 *
61 *************************************************/
62
63 /* This function takes an integer value in the range 0 - 0x7fffffff
64 and encodes it as a UTF-8 character in 1 to 6 bytes.
65
66 Arguments:
67 cvalue the character value
68 buffer pointer to buffer for result - at least 6 bytes long
69
70 Returns: number of characters placed in the buffer
71 -1 if input character is negative
72 0 if input character is positive but too big (only when
73 int is longer than 32 bits)
74 */
75
76 int
77 ord2utf8(int cvalue, unsigned char *buffer)
78 {
79 register int i, j;
80 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
81 if (cvalue <= utf8_table1[i]) break;
82 if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
83 if (cvalue < 0) return -1;
84 buffer += i;
85 for (j = i; j > 0; j--)
86 {
87 *buffer-- = 0x80 | (cvalue & 0x3f);
88 cvalue >>= 6;
89 }
90 *buffer = utf8_table2[i] | cvalue;
91 return i + 1;
92 }
93
94
95
96 /*************************************************
97 * Convert UTF-8 string to value *
98 *************************************************/
99
100 /* This function takes one or more bytes that represents a UTF-8 character,
101 and returns the value of the character.
102
103 Argument:
104 buffer a pointer to the byte vector
105 vptr a pointer to an int to receive the value
106
107 Returns: > 0 => the number of bytes consumed
108 -6 to 0 => malformed UTF-8 character at offset = (-return)
109 */
110
111 int
112 utf82ord(unsigned char *buffer, int *vptr)
113 {
114 int c = *buffer++;
115 int d = c;
116 int i, j, s;
117
118 for (i = -1; i < 6; i++) /* i is number of additional bytes */
119 {
120 if ((d & 0x80) == 0) break;
121 d <<= 1;
122 }
123
124 if (i == -1) { *vptr = c; return 1; } /* ascii character */
125 if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
126
127 /* i now has a value in the range 1-5 */
128
129 s = 6*i;
130 d = (c & utf8_table3[i]) << s;
131
132 for (j = 0; j < i; j++)
133 {
134 c = *buffer++;
135 if ((c & 0xc0) != 0x80) return -(j+1);
136 s -= 6;
137 d |= (c & 0x3f) << s;
138 }
139
140 /* Check that encoding was the correct unique one */
141
142 for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
143 if (d <= utf8_table1[j]) break;
144 if (j != i) return -(i+1);
145
146 /* Valid value */
147
148 *vptr = d;
149 return i+1;
150 }
151
152
153
154
/*************************************************
*                Main Program                    *
*************************************************/

/* Each command line argument is handled independently. An argument that
starts with "0x" is taken as a single code point and is converted to UTF-8 by
ord2utf8(). Any other argument is taken as a string of hex digit pairs (spaces
are ignored) giving a sequence of bytes, which is decoded one character at a
time by utf82ord(). If the first argument is -s, the raw bytes of each
character are also written between angle brackets.

A hex string that is malformed, or that holds more bytes than fit in the
working buffer, is reported and then skipped without any decoding being
attempted. Decoding is driven by the number of bytes that were read, so a zero
byte ("00") in the input is decoded like any other byte.

Arguments:
  argc       the argument count
  argv       the argument vector

Returns:     0 always
*/

int
main(int argc, char **argv)
{
  int i = 1;
  int show = 0;
  unsigned char buffer[64];

  if (argc > 1 && strcmp(argv[1], "-s") == 0)
    {
    show = 1;
    i = 2;
    }

  for (; i < argc; i++)
    {
    const char *arg = argv[i];

    /* Code point to UTF-8 */

    if (strncmp(arg, "0x", 2) == 0)
      {
      int j;
      int d = (int)strtol(arg + 2, NULL, 16);
      int rc = ord2utf8(d, buffer);

      printf("0x%08x => ", (unsigned int)d);
      if (rc <= 0) printf("*** Error %d ***", rc); else
        {
        for (j = 0; j < rc; j++) printf("%02x ", buffer[j]);
        if (show)
          {
          printf(">");
          for (j = 0; j < rc; j++) printf("%c", buffer[j]);
          printf("<");
          }
        }
      printf("\n");
      }

    /* UTF-8 bytes to code points */

    else
      {
      /* The ctype functions need unsigned char values, so scan the argument
      through an unsigned pointer. One byte of the buffer is kept back for
      the terminating zero. */

      const unsigned char *x = (const unsigned char *)arg;
      const int capacity = (int)sizeof(buffer) - 1;
      const unsigned char *end;
      unsigned char *bptr;
      int d, rc, j;
      int len = 0;              /* number of bytes stored in buffer */
      int y = 0;                /* value of the byte being assembled */
      int z = 0;                /* non-zero when half a byte has been read */
      int ok = 1;

      for (;;)
        {
        while (*x == ' ') x++;
        if (*x == 0 && !z) break;
        if (!isxdigit(*x))      /* also catches an odd number of digits */
          {
          printf("Malformed hex string: %s\n", argv[i]);
          ok = 0;
          break;
          }

        /* In ASCII, 'a' - 'W' is 10, so one subtraction handles a-f. */

        y = y * 16 + tolower(*x) - ((isdigit(*x))? '0' : 'W');
        x++;
        if (z)
          {
          if (len >= capacity)
            {
            printf("Hex string too long (more than %d bytes): %s\n", capacity,
              argv[i]);
            ok = 0;
            break;
            }
          buffer[len++] = (unsigned char)y;
          y = 0;
          }
        z ^= 1;
        }

      if (!ok) continue;

      /* The zero that follows the data is not a continuation byte, so it
      stops utf82ord() from reading past the end when the final character is
      truncated. */

      buffer[len] = 0;
      bptr = buffer;
      end = buffer + len;

      while (bptr < end)
        {
        rc = utf82ord(bptr, &d);
        if (rc > 0)
          {
          printf("0x%08x <= ", (unsigned int)d);
          for (j = 0; j < rc; j++) printf("%02x ", bptr[j]);
          if (show)
            {
            printf(">");
            for (j = 0; j < rc; j++) printf("%c", bptr[j]);
            printf("<");
            }
          printf("\n");
          bptr += rc;
          }
        else
          {
          printf("Malformed UTF-8 at offset %d <= ", -rc);
          while (bptr < end) printf("%02x ", *bptr++);
          printf("\n");
          break;
          }
        }
      }
    }

  return 0;
}
252
253 /* End */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12