/[pcre]/code/trunk/maint/utf8.c
ViewVC logotype

Contents of /code/trunk/maint/utf8.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 114 - (hide annotations) (download)
Fri Mar 9 10:15:12 2007 UTC (7 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 4396 byte(s)
Rename "maintain" as "maint".

1 ph10 97 /* A program for converting characters to UTF-8 and vice versa */
2    
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
6    
7     /* The valid ranges for UTF-8 characters are:
8    
9     0000 0000 to 0000 007f 1 byte (ascii)
10     0000 0080 to 0000 07ff 2 bytes
11     0000 0800 to 0000 ffff 3 bytes
12     0001 0000 to 001f ffff 4 bytes
13     0020 0000 to 03ff ffff 5 bytes
14     0400 0000 to 7fff ffff 6 bytes
15     */
16    
17    
/* Largest character value that fits in an encoding with N additional bytes,
where N is the index into the table. */

static const int utf8_table1[] = {
  0x0000007f,     /* 1 byte in total */
  0x000007ff,     /* 2 bytes */
  0x0000ffff,     /* 3 bytes */
  0x001fffff,     /* 4 bytes */
  0x03ffffff,     /* 5 bytes */
  0x7fffffff      /* 6 bytes */
};

/* Marker bits that are ORed into the first byte of an encoding, indexed by
the number of additional bytes. */

static const int utf8_table2[] = {
  0x00,
  0xc0,
  0xe0,
  0xf0,
  0xf8,
  0xfc
};

/* Mask that extracts the data bits from the first byte of an encoding,
indexed by the number of additional bytes. */

static const int utf8_table3[] = {
  0xff,
  0x1f,
  0x0f,
  0x07,
  0x03,
  0x01
};

/* This table is not referenced by any code in this file. NOTE(review): it
looks like a byte-count lookup keyed on the low 6 bits of a lead byte - confirm
before relying on it. */

static const unsigned char utf8_table4[] = {
  1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,
  2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,
  4,4,4,4,5,5,6,6
};
32    
33    
34     /*************************************************
35     * Convert character value to UTF-8 *
36     *************************************************/
37    
38     /* This function takes an integer value in the range 0 - 0x7fffffff
39     and encodes it as a UTF-8 character in 0 to 6 bytes.
40    
41     Arguments:
42     cvalue the character value
43     buffer pointer to buffer for result - at least 6 bytes long
44    
45     Returns: number of characters placed in the buffer
46     -1 if input character is negative
47     0 if input character is positive but too big (only when
48     int is longer than 32 bits)
49     */
50    
51     int
52     ord2utf8(int cvalue, unsigned char *buffer)
53     {
54     register int i, j;
55     for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
56     if (cvalue <= utf8_table1[i]) break;
57     if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
58     if (cvalue < 0) return -1;
59     buffer += i;
60     for (j = i; j > 0; j--)
61     {
62     *buffer-- = 0x80 | (cvalue & 0x3f);
63     cvalue >>= 6;
64     }
65     *buffer = utf8_table2[i] | cvalue;
66     return i + 1;
67     }
68    
69    
70    
71     /*************************************************
72     * Convert UTF-8 string to value *
73     *************************************************/
74    
75     /* This function takes one or more bytes that represents a UTF-8 character,
76     and returns the value of the character.
77    
78     Argument:
79     buffer a pointer to the byte vector
80     vptr a pointer to an int to receive the value
81    
82     Returns: > 0 => the number of bytes consumed
83     -6 to 0 => malformed UTF-8 character at offset = (-return)
84     */
85    
86     int
87     utf82ord(unsigned char *buffer, int *vptr)
88     {
89     int c = *buffer++;
90     int d = c;
91     int i, j, s;
92    
93     for (i = -1; i < 6; i++) /* i is number of additional bytes */
94     {
95     if ((d & 0x80) == 0) break;
96     d <<= 1;
97     }
98    
99     if (i == -1) { *vptr = c; return 1; } /* ascii character */
100     if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
101    
102     /* i now has a value in the range 1-5 */
103    
104     s = 6*i;
105     d = (c & utf8_table3[i]) << s;
106    
107     for (j = 0; j < i; j++)
108     {
109     c = *buffer++;
110     if ((c & 0xc0) != 0x80) return -(j+1);
111     s -= 6;
112     d |= (c & 0x3f) << s;
113     }
114    
115     /* Check that encoding was the correct unique one */
116    
117     for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
118     if (d <= utf8_table1[j]) break;
119     if (j != i) return -(i+1);
120    
121     /* Valid value */
122    
123     *vptr = d;
124     return i+1;
125     }
126    
127    
128    
129    
/*************************************************
*                Main Program                    *
*************************************************/

/* The largest number of data bytes accepted from a hex byte string. The
conversion buffer is one byte longer so that a zero byte can always follow the
data; a zero is not a continuation byte, so it stops utf82ord() from reading
past the data when the first byte asks for more bytes than were supplied. */

#define MAX_HEX_BYTES 7


/* Convert a string of hex digits into bytes, two digits per byte. Spaces are
skipped before every digit.

Arguments:
  text       the zero-terminated hex string
  out        where to put the bytes - at least MAX_HEX_BYTES + 1 bytes long

Returns:     >= 0 => the number of bytes stored; a zero byte follows them
             -1   => a character that is not a hex digit was found, or the
                     string ended in the middle of a byte
             -2   => the string holds more than MAX_HEX_BYTES bytes
*/

static int
hex2bytes(const char *text, unsigned char *out)
{
int count = 0;
int value = 0;
int half = 0;       /* set when the first digit of a pair has been read */

for (;;)
  {
  int c;
  while (*text == ' ') text++;
  c = (unsigned char)*text++;
  if (c == 0 && !half) break;
  if (!isxdigit(c)) return -1;    /* also catches the end after one digit */
  value = value * 16 + (isdigit(c)? c - '0' : tolower(c) - 'a' + 10);
  if (half)
    {
    if (count >= MAX_HEX_BYTES) return -2;
    out[count++] = (unsigned char)value;
    value = 0;
    }
  half = !half;
  }

out[count] = 0;
return count;
}


/* Each command line argument is converted independently. An argument that
starts with "0x" is a character value, and its UTF-8 encoding is printed as hex
bytes. Any other argument is a string of hex bytes, and the value of the UTF-8
character at its start is printed. Problems are reported on the standard output
and do not stop the remaining arguments being processed.

Arguments:
  argc       the argument count
  argv       the argument vector

Returns:     0 always
*/

int
main(int argc, char **argv)
{
int i;
unsigned char buffer[MAX_HEX_BYTES + 1];

for (i = 1; i < argc; i++)
  {
  const char *arg = argv[i];

  if (strncmp(arg, "0x", 2) == 0)
    {
    char *end;
    unsigned long value;
    int j, rc;

    /* Insist on a digit straight after the prefix, because strtoul() on its
    own would accept leading white space or a sign, and check that the whole
    of the argument was used. */

    if (!isxdigit((unsigned char)arg[2]))
      {
      printf("Malformed hex number: %s\n", arg);
      continue;
      }
    value = strtoul(arg + 2, &end, 16);
    if (*end != 0)
      {
      printf("Malformed hex number: %s\n", arg);
      continue;
      }

    /* strtoul() yields ULONG_MAX on overflow, so this one test also covers
    numbers that do not fit in an unsigned long. */

    if (value > 0x7fffffffUL)
      {
      printf("Hex number too big: %s\n", arg);
      continue;
      }

    rc = ord2utf8((int)value, buffer);
    printf("0x%08x => ", (unsigned int)value);
    if (rc <= 0) printf("*** Error %d ***", rc);
    else for (j = 0; j < rc; j++) printf("%02x ", (unsigned int)buffer[j]);
    printf("\n");
    }

  else
    {
    int d, rc;
    int count = hex2bytes(arg, buffer);

    if (count == -2)
      {
      printf("Hex string too long: %s\n", arg);
      continue;
      }

    /* An argument that contains no bytes at all is treated as malformed
    rather than being decoded as character zero. */

    if (count <= 0)
      {
      printf("Malformed hex string: %s\n", arg);
      continue;
      }

    rc = utf82ord(buffer, &d);
    if (rc > 0) printf("0x%08x <= %s\n", (unsigned int)d, arg);
    else printf("Error %d <= %s\n", rc, arg);
    }
  }

return 0;
}
187    
188     /* End */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12