/[pcre]/code/trunk/maint/utf8.c
ViewVC logotype

Contents of /code/trunk/maint/utf8.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 114 - (show annotations) (download)
Fri Mar 9 10:15:12 2007 UTC (7 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 4396 byte(s)
Rename "maintain" as "maint".

1 /* A program for converting characters to UTF-8 and vice versa */
2
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
6
7 /* The valid ranges for UTF-8 characters are:
8
9 0000 0000 to 0000 007f 1 byte (ascii)
10 0000 0080 to 0000 07ff 2 bytes
11 0000 0800 to 0000 ffff 3 bytes
12 0001 0000 to 001f ffff 4 bytes
13 0020 0000 to 03ff ffff 5 bytes
14 0400 0000 to 7fff ffff 6 bytes
15 */
16
17
/* Largest character value that fits in an encoding with N additional bytes,
indexed by N (so entry 0 is the 1-byte limit, entry 5 the 6-byte limit). */

static const int utf8_table1[] = {
  0x0000007f,
  0x000007ff,
  0x0000ffff,
  0x001fffff,
  0x03ffffff,
  0x7fffffff
};

/* Marker bits that are ORed into the first byte of an encoding, indexed by
the number of additional bytes. */

static const int utf8_table2[] = {
  0,
  0xc0,
  0xe0,
  0xf0,
  0xf8,
  0xfc
};

/* Mask that extracts the data bits from the first byte of an encoding,
indexed by the number of additional bytes. */

static const int utf8_table3[] = {
  0xff,
  0x1f,
  0x0f,
  0x07,
  0x03,
  0x01
};

/* NOTE(review): this table is not referenced by the code visible in this
file. It looks like a byte count indexed by the low six bits of a lead byte
(0xc0 upwards) - confirm before relying on it. */

static const unsigned char utf8_table4[] = {
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,
  3, 3, 3, 3, 3, 3, 3, 3,   4, 4, 4, 4, 5, 5, 6, 6
};
32
33
34 /*************************************************
35 * Convert character value to UTF-8 *
36 *************************************************/
37
38 /* This function takes an integer value in the range 0 - 0x7fffffff
39 and encodes it as a UTF-8 character in 0 to 6 bytes.
40
41 Arguments:
42 cvalue the character value
43 buffer pointer to buffer for result - at least 6 bytes long
44
45 Returns: number of characters placed in the buffer
46 -1 if input character is negative
47 0 if input character is positive but too big (only when
48 int is longer than 32 bits)
49 */
50
51 int
52 ord2utf8(int cvalue, unsigned char *buffer)
53 {
54 register int i, j;
55 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
56 if (cvalue <= utf8_table1[i]) break;
57 if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
58 if (cvalue < 0) return -1;
59 buffer += i;
60 for (j = i; j > 0; j--)
61 {
62 *buffer-- = 0x80 | (cvalue & 0x3f);
63 cvalue >>= 6;
64 }
65 *buffer = utf8_table2[i] | cvalue;
66 return i + 1;
67 }
68
69
70
71 /*************************************************
72 * Convert UTF-8 string to value *
73 *************************************************/
74
75 /* This function takes one or more bytes that represents a UTF-8 character,
76 and returns the value of the character.
77
78 Argument:
79 buffer a pointer to the byte vector
80 vptr a pointer to an int to receive the value
81
82 Returns: > 0 => the number of bytes consumed
83 -6 to 0 => malformed UTF-8 character at offset = (-return)
84 */
85
86 int
87 utf82ord(unsigned char *buffer, int *vptr)
88 {
89 int c = *buffer++;
90 int d = c;
91 int i, j, s;
92
93 for (i = -1; i < 6; i++) /* i is number of additional bytes */
94 {
95 if ((d & 0x80) == 0) break;
96 d <<= 1;
97 }
98
99 if (i == -1) { *vptr = c; return 1; } /* ascii character */
100 if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
101
102 /* i now has a value in the range 1-5 */
103
104 s = 6*i;
105 d = (c & utf8_table3[i]) << s;
106
107 for (j = 0; j < i; j++)
108 {
109 c = *buffer++;
110 if ((c & 0xc0) != 0x80) return -(j+1);
111 s -= 6;
112 d |= (c & 0x3f) << s;
113 }
114
115 /* Check that encoding was the correct unique one */
116
117 for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
118 if (d <= utf8_table1[j]) break;
119 if (j != i) return -(i+1);
120
121 /* Valid value */
122
123 *vptr = d;
124 return i+1;
125 }
126
127
128
129
130 /*************************************************
131 * Main Program *
132 *************************************************/
133
134
/* Command-line driver. Each argument is processed independently:

  . An argument that starts with "0x" is read as a hexadecimal character
    value; its UTF-8 encoding is printed as a sequence of hex bytes.

  . Any other argument is read as a string of hex digits (spaces are skipped)
    giving the bytes of a UTF-8 sequence; the value of the first character in
    that sequence is printed.

All diagnostics are written to stdout.

Returns:     0 always
*/

int
main(int argc, char **argv)
{
  int i;
  unsigned char buffer[8];   /* room for the longest sequence plus a zero */

  for (i = 1; i < argc; i++)
    {
    const char *arg = argv[i];

    if (strncmp(arg, "0x", 2) == 0)
      {
      char *end;
      long value;
      int d, rc, j;

      /* Insist on at least one digit and nothing after the number, and catch
      values that do not fit before narrowing to int, so that junk or
      oversized input is reported instead of silently encoding the wrong
      character. */

      errno = 0;
      value = strtol(arg + 2, &end, 16);
      if (end == arg + 2 || *end != 0)
        {
        printf("Malformed hex number: %s\n", arg);
        continue;
        }
      if (errno == ERANGE || value > 0x7fffffffL || value < -0x7fffffffL - 1)
        {
        printf("Hex number out of range: %s\n", arg);
        continue;
        }

      d = (int)value;
      rc = ord2utf8(d, buffer);
      printf("0x%08x => ", (unsigned int)d);
      if (rc <= 0) printf("*** Error %d ***", rc);
        else for (j = 0; j < rc; j++) printf("%02x ", buffer[j]);
      printf("\n");
      }

    else
      {
      /* The ctype functions need unsigned char values */
      const unsigned char *x = (const unsigned char *)arg;
      int d = 0;
      int rc;
      int j = 0;    /* number of bytes stored, or -1 after an error */
      int y = 0;    /* the byte being assembled */
      int z = 0;    /* non-zero when half a byte has been read */

      for (;;)
        {
        while (*x == ' ') x++;
        if (*x == 0 && !z) break;
        if (!isxdigit(*x))     /* also catches an odd number of digits */
          {
          printf("Malformed hex string: %s\n", arg);
          j = -1;
          break;
          }
        y = y * 16 + (isdigit(*x)? *x - '0' : tolower(*x) - 'a' + 10);
        x++;
        if (z)
          {
          /* Keep the rest of the string for syntax checking only once the
          buffer is full: the decoder reads at most six bytes, and an
          unchecked store here would run off the end of the buffer. */

          if (j < (int)sizeof(buffer) - 1) buffer[j++] = (unsigned char)y;
          y = 0;
          }
        z ^= 1;
        }

      if (j < 0) continue;

      /* A zero byte is never a continuation byte, so this stops the decoder
      from reading beyond the bytes that were supplied. */

      buffer[j] = 0;
      rc = utf82ord(buffer, &d);
      if (rc > 0) printf("0x%08x <= %s\n", (unsigned int)d, arg);
        else printf("Error %d <= %s\n", rc, arg);
      }
    }

  return 0;
}
187
188 /* End */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12