/[pcre]/code/trunk/maint/ucptest.c
ViewVC logotype

Contents of /code/trunk/maint/ucptest.c

Parent Directory | Revision Log


Revision 491 - (show annotations) (download)
Mon Mar 1 17:45:08 2010 UTC (4 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 9991 byte(s)
Update Unicode tables to Unicode version 5.2.0.

1 /***************************************************
2 * A program for testing the Unicode property table *
3 ***************************************************/
4
5 /* Copyright (c) University of Cambridge 2008 */
6
7 /* Compile thus:
8 gcc -DHAVE_CONFIG_H -o ucptest ucptest.c ../pcre_ucd.c ../pcre_tables.c
9 */
10
11 /* The program expects to read commands on stdin, and it writes output
12 to stdout. There is only one command, "findprop", followed by a list of Unicode
13 code points as hex numbers (without any prefixes). The output is one line per
14 character, giving its Unicode properties followed by its other case if there is
15 one. */
16
17 #ifdef HAVE_CONFIG_H
18 #include "../config.h"
19 #endif
20
21 #include <ctype.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include "../pcre_internal.h"
26 #include "../ucp.h"
27
28
29 /* -------------------------------------------------------------------*/
30
/* Prefix-cast shorthands: each macro expands to a parenthesised pointer type,
so writing e.g. "CS buffer" casts buffer to (char *). The "C" forms add const,
the "U" forms use unsigned char, and the trailing-"S" forms are the
pointer-to-pointer variants. */
31 #define CS (char *)
32 #define CCS (const char *)
33 #define CSS (char **)
34 #define US (unsigned char *)
35 #define CUS (const unsigned char *)
36 #define USS (unsigned char **)
37
38 /* -------------------------------------------------------------------*/
39
40
41
42
43 /*************************************************
44 * Print Unicode property info for a char *
45 *************************************************/
46
47 static void
48 print_prop(int c)
49 {
50 int type = UCD_CATEGORY(c);
51 int fulltype = UCD_CHARTYPE(c);
52 int script = UCD_SCRIPT(c);
53 int othercase = UCD_OTHERCASE(c);
54
55 uschar *fulltypename = US"??";
56 uschar *typename = US"??";
57 uschar *scriptname = US"??";
58
59 switch (type)
60 {
61 case ucp_C: typename = US"Control"; break;
62 case ucp_L: typename = US"Letter"; break;
63 case ucp_M: typename = US"Mark"; break;
64 case ucp_N: typename = US"Number"; break;
65 case ucp_P: typename = US"Punctuation"; break;
66 case ucp_S: typename = US"Symbol"; break;
67 case ucp_Z: typename = US"Separator"; break;
68 }
69
70 switch (fulltype)
71 {
72 case ucp_Cc: fulltypename = US"Control"; break;
73 case ucp_Cf: fulltypename = US"Format"; break;
74 case ucp_Cn: fulltypename = US"Unassigned"; break;
75 case ucp_Co: fulltypename = US"Private use"; break;
76 case ucp_Cs: fulltypename = US"Surrogate"; break;
77 case ucp_Ll: fulltypename = US"Lower case letter"; break;
78 case ucp_Lm: fulltypename = US"Modifier letter"; break;
79 case ucp_Lo: fulltypename = US"Other letter"; break;
80 case ucp_Lt: fulltypename = US"Title case letter"; break;
81 case ucp_Lu: fulltypename = US"Upper case letter"; break;
82 case ucp_Mc: fulltypename = US"Spacing mark"; break;
83 case ucp_Me: fulltypename = US"Enclosing mark"; break;
84 case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
85 case ucp_Nd: fulltypename = US"Decimal number"; break;
86 case ucp_Nl: fulltypename = US"Letter number"; break;
87 case ucp_No: fulltypename = US"Other number"; break;
88 case ucp_Pc: fulltypename = US"Connector punctuation"; break;
89 case ucp_Pd: fulltypename = US"Dash punctuation"; break;
90 case ucp_Pe: fulltypename = US"Close punctuation"; break;
91 case ucp_Pf: fulltypename = US"Final punctuation"; break;
92 case ucp_Pi: fulltypename = US"Initial punctuation"; break;
93 case ucp_Po: fulltypename = US"Other punctuation"; break;
94 case ucp_Ps: fulltypename = US"Open punctuation"; break;
95 case ucp_Sc: fulltypename = US"Currency symbol"; break;
96 case ucp_Sk: fulltypename = US"Modifier symbol"; break;
97 case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
98 case ucp_So: fulltypename = US"Other symbol"; break;
99 case ucp_Zl: fulltypename = US"Line separator"; break;
100 case ucp_Zp: fulltypename = US"Paragraph separator"; break;
101 case ucp_Zs: fulltypename = US"Space separator"; break;
102 }
103
104 switch(script)
105 {
106 case ucp_Arabic: scriptname = US"Arabic"; break;
107 case ucp_Armenian: scriptname = US"Armenian"; break;
108 case ucp_Balinese: scriptname = US"Balinese"; break;
109 case ucp_Bengali: scriptname = US"Bengali"; break;
110 case ucp_Bopomofo: scriptname = US"Bopomofo"; break;
111 case ucp_Braille: scriptname = US"Braille"; break;
112 case ucp_Buginese: scriptname = US"Buginese"; break;
113 case ucp_Buhid: scriptname = US"Buhid"; break;
114 case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
115 case ucp_Cherokee: scriptname = US"Cherokee"; break;
116 case ucp_Common: scriptname = US"Common"; break;
117 case ucp_Coptic: scriptname = US"Coptic"; break;
118 case ucp_Cuneiform: scriptname = US"Cuneiform"; break;
119 case ucp_Cypriot: scriptname = US"Cypriot"; break;
120 case ucp_Cyrillic: scriptname = US"Cyrillic"; break;
121 case ucp_Deseret: scriptname = US"Deseret"; break;
122 case ucp_Devanagari: scriptname = US"Devanagari"; break;
123 case ucp_Ethiopic: scriptname = US"Ethiopic"; break;
124 case ucp_Georgian: scriptname = US"Georgian"; break;
125 case ucp_Glagolitic: scriptname = US"Glagolitic"; break;
126 case ucp_Gothic: scriptname = US"Gothic"; break;
127 case ucp_Greek: scriptname = US"Greek"; break;
128 case ucp_Gujarati: scriptname = US"Gujarati"; break;
129 case ucp_Gurmukhi: scriptname = US"Gurmukhi"; break;
130 case ucp_Han: scriptname = US"Han"; break;
131 case ucp_Hangul: scriptname = US"Hangul"; break;
132 case ucp_Hanunoo: scriptname = US"Hanunoo"; break;
133 case ucp_Hebrew: scriptname = US"Hebrew"; break;
134 case ucp_Hiragana: scriptname = US"Hiragana"; break;
135 case ucp_Inherited: scriptname = US"Inherited"; break;
136 case ucp_Kannada: scriptname = US"Kannada"; break;
137 case ucp_Katakana: scriptname = US"Katakana"; break;
138 case ucp_Kharoshthi: scriptname = US"Kharoshthi"; break;
139 case ucp_Khmer: scriptname = US"Khmer"; break;
140 case ucp_Lao: scriptname = US"Lao"; break;
141 case ucp_Latin: scriptname = US"Latin"; break;
142 case ucp_Limbu: scriptname = US"Limbu"; break;
143 case ucp_Linear_B: scriptname = US"Linear_B"; break;
144 case ucp_Malayalam: scriptname = US"Malayalam"; break;
145 case ucp_Mongolian: scriptname = US"Mongolian"; break;
146 case ucp_Myanmar: scriptname = US"Myanmar"; break;
147 case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
148 case ucp_Nko: scriptname = US"Nko"; break;
149 case ucp_Ogham: scriptname = US"Ogham"; break;
150 case ucp_Old_Italic: scriptname = US"Old_Italic"; break;
151 case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
152 case ucp_Oriya: scriptname = US"Oriya"; break;
153 case ucp_Osmanya: scriptname = US"Osmanya"; break;
154 case ucp_Phags_Pa: scriptname = US"Phags_Pa"; break;
155 case ucp_Phoenician: scriptname = US"Phoenician"; break;
156 case ucp_Runic: scriptname = US"Runic"; break;
157 case ucp_Shavian: scriptname = US"Shavian"; break;
158 case ucp_Sinhala: scriptname = US"Sinhala"; break;
159 case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
160 case ucp_Syriac: scriptname = US"Syriac"; break;
161 case ucp_Tagalog: scriptname = US"Tagalog"; break;
162 case ucp_Tagbanwa: scriptname = US"Tagbanwa"; break;
163 case ucp_Tai_Le: scriptname = US"Tai_Le"; break;
164 case ucp_Tamil: scriptname = US"Tamil"; break;
165 case ucp_Telugu: scriptname = US"Telugu"; break;
166 case ucp_Thaana: scriptname = US"Thaana"; break;
167 case ucp_Thai: scriptname = US"Thai"; break;
168 case ucp_Tibetan: scriptname = US"Tibetan"; break;
169 case ucp_Tifinagh: scriptname = US"Tifinagh"; break;
170 case ucp_Ugaritic: scriptname = US"Ugaritic"; break;
171 case ucp_Yi: scriptname = US"Yi"; break;
172 /* New for Unicode 5.1: */
173 case ucp_Carian: scriptname = US"Carian"; break;
174 case ucp_Cham: scriptname = US"Cham"; break;
175 case ucp_Kayah_Li: scriptname = US"Kayah_Li"; break;
176 case ucp_Lepcha: scriptname = US"Lepcha"; break;
177 case ucp_Lycian: scriptname = US"Lycian"; break;
178 case ucp_Lydian: scriptname = US"Lydian"; break;
179 case ucp_Ol_Chiki: scriptname = US"Ol_Chiki"; break;
180 case ucp_Rejang: scriptname = US"Rejang"; break;
181 case ucp_Saurashtra: scriptname = US"Saurashtra"; break;
182 case ucp_Sundanese: scriptname = US"Sundanese"; break;
183 case ucp_Vai: scriptname = US"Vai"; break;
184 /* New for Unicode 5.2: */
185 case ucp_Avestan: scriptname = US"Avestan"; break;
186 case ucp_Bamum: scriptname = US"Bamum"; break;
187 case ucp_Egyptian_Hieroglyphs: scriptname = US"Egyptian_Hieroglyphs"; break;
188 case ucp_Imperial_Aramaic: scriptname = US"Imperial_Aramaic"; break;
189 case ucp_Inscriptional_Pahlavi: scriptname = US"Inscriptional_Pahlavi"; break;
190 case ucp_Inscriptional_Parthian: scriptname = US"Inscriptional_Parthian"; break;
191 case ucp_Javanese: scriptname = US"Javanese"; break;
192 case ucp_Kaithi: scriptname = US"Kaithi"; break;
193 case ucp_Lisu: scriptname = US"Lisu"; break;
194 case ucp_Meetei_Mayek: scriptname = US"Meetei_Mayek"; break;
195 case ucp_Old_South_Arabian: scriptname = US"Old_South_Arabian"; break;
196 case ucp_Old_Turkic: scriptname = US"Old_Turkic"; break;
197 case ucp_Samaritan: scriptname = US"Samaritan"; break;
198 case ucp_Tai_Tham: scriptname = US"Tai_Tham"; break;
199 case ucp_Tai_Viet: scriptname = US"Tai_Viet"; break;
200 }
201
202 printf("%04x %s: %s %s", c, typename, fulltypename, scriptname);
203 if (othercase != c) printf(" %04x", othercase);
204 printf("\n");
205 }
206
207
208
209 /*************************************************
210 * Main program *
211 *************************************************/
212
213 int
214 main(void)
215 {
216 uschar buffer[1024];
217 while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
218 {
219 uschar name[24];
220 uschar *s, *t;
221
222 printf("%s", buffer);
223 s = buffer;
224 while (isspace(*s)) s++;
225 if (*s == 0) continue;
226
227 for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
228 *t = 0;
229 while (isspace(*s)) s++;
230
231 if (strcmp(CS name, "findprop") == 0)
232 {
233 while (*s != 0)
234 {
235 uschar *endptr;
236 int c = strtoul(CS s, CSS(&endptr), 16);
237 print_prop(c);
238 s = endptr;
239 while (isspace(*s)) s++;
240 }
241 }
242
243 else printf("Unknown test command %s\n", name);
244 }
245
246 return 0;
247 }
248
249 /* End */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12