/[pcre]/code/trunk/maint/ucptest.c
ViewVC logotype

Contents of /code/trunk/maint/ucptest.c

Parent Directory | Revision Log


Revision 351 - (hide annotations) (download)
Fri Jul 4 18:27:16 2008 UTC (6 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 8346 byte(s)
Final tidies for new Unicode property code; upgrade to Unicode 5.1.0.

1 ph10 97 /***************************************************
2     * A program for testing the Unicode property table *
3     ***************************************************/
4    
5 ph10 351 /* Copyright (c) University of Cambridge 2008 */
6 ph10 97
7     /* Compile thus:
8 ph10 351 gcc -DHAVE_CONFIG_H -o ucptest ucptest.c ../pcre_ucd.c ../pcre_tables.c
9 ph10 97 */
10    
11 ph10 351 /* The program expects to read commands on stdin, and it writes output
12     to stdout. There is only one command, "findprop", followed by a list of Unicode
13     code points as hex numbers (without any prefixes). The output is one line per
14     character, giving its Unicode properties followed by its other case if there is
15     one. */
16    
17     #ifdef HAVE_CONFIG_H
18     #include "../config.h"
19     #endif
20    
21 ph10 97 #include <ctype.h>
22     #include <stdio.h>
23     #include <stdlib.h>
24     #include <string.h>
25 ph10 98 #include "../pcre_internal.h"
26     #include "../ucp.h"
27 ph10 97
28    
29     /* -------------------------------------------------------------------*/
30    
31     #define CS (char *)   /* cast to char *; applied to uschar buffers passed to fgets, strcmp and strtoul */
32     #define CCS (const char *)   /* cast to const char *; not referenced in the code shown here */
33     #define CSS (char **)   /* cast to char **; applied to the address of the strtoul end pointer */
34     #define US (unsigned char *)   /* cast to unsigned char *; applied to string literals assigned to uschar pointers */
35     #define CUS (const unsigned char *)   /* cast to const unsigned char *; not referenced in the code shown here */
36     #define USS (unsigned char **)   /* cast to unsigned char **; not referenced in the code shown here */
37    
38     /* -------------------------------------------------------------------*/
39    
40    
41    
42    
43     /*************************************************
44     * Print Unicode property info for a char *
45     *************************************************/
46    
47     static void
48     print_prop(int c)
49     {
50 ph10 351 int type = UCD_CATEGORY(c);
51     int fulltype = UCD_CHARTYPE(c);
52     int script = UCD_SCRIPT(c);
53     int othercase = UCD_OTHERCASE(c);
54 ph10 97
55 ph10 351 uschar *fulltypename = US"??";
56     uschar *typename = US"??";
57     uschar *scriptname = US"??";
58    
59     switch (type)
60 ph10 97 {
61 ph10 351 case ucp_C: typename = US"Control"; break;
62     case ucp_L: typename = US"Letter"; break;
63     case ucp_M: typename = US"Mark"; break;
64     case ucp_N: typename = US"Number"; break;
65     case ucp_P: typename = US"Punctuation"; break;
66     case ucp_S: typename = US"Symbol"; break;
67     case ucp_Z: typename = US"Separator"; break;
68     }
69 ph10 97
70 ph10 351 switch (fulltype)
71     {
72     case ucp_Cc: fulltypename = US"Control"; break;
73     case ucp_Cf: fulltypename = US"Format"; break;
74     case ucp_Cn: fulltypename = US"Unassigned"; break;
75     case ucp_Co: fulltypename = US"Private use"; break;
76     case ucp_Cs: fulltypename = US"Surrogate"; break;
77     case ucp_Ll: fulltypename = US"Lower case letter"; break;
78     case ucp_Lm: fulltypename = US"Modifier letter"; break;
79     case ucp_Lo: fulltypename = US"Other letter"; break;
80     case ucp_Lt: fulltypename = US"Title case letter"; break;
81     case ucp_Lu: fulltypename = US"Upper case letter"; break;
82     case ucp_Mc: fulltypename = US"Spacing mark"; break;
83     case ucp_Me: fulltypename = US"Enclosing mark"; break;
84     case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
85     case ucp_Nd: fulltypename = US"Decimal number"; break;
86     case ucp_Nl: fulltypename = US"Letter number"; break;
87     case ucp_No: fulltypename = US"Other number"; break;
88     case ucp_Pc: fulltypename = US"Connector punctuation"; break;
89     case ucp_Pd: fulltypename = US"Dash punctuation"; break;
90     case ucp_Pe: fulltypename = US"Close punctuation"; break;
91     case ucp_Pf: fulltypename = US"Final punctuation"; break;
92     case ucp_Pi: fulltypename = US"Initial punctuation"; break;
93     case ucp_Po: fulltypename = US"Other punctuation"; break;
94     case ucp_Ps: fulltypename = US"Open punctuation"; break;
95     case ucp_Sc: fulltypename = US"Currency symbol"; break;
96     case ucp_Sk: fulltypename = US"Modifier symbol"; break;
97     case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
98     case ucp_So: fulltypename = US"Other symbol"; break;
99     case ucp_Zl: fulltypename = US"Line separator"; break;
100     case ucp_Zp: fulltypename = US"Paragraph separator"; break;
101     case ucp_Zs: fulltypename = US"Space separator"; break;
102 ph10 97 }
103 ph10 351
104     switch(script)
105     {
106     case ucp_Arabic: scriptname = US"Arabic"; break;
107     case ucp_Armenian: scriptname = US"Armenian"; break;
108     case ucp_Balinese: scriptname = US"Balinese"; break;
109     case ucp_Bengali: scriptname = US"Bengali"; break;
110     case ucp_Bopomofo: scriptname = US"Bopomofo"; break;
111     case ucp_Braille: scriptname = US"Braille"; break;
112     case ucp_Buginese: scriptname = US"Buginese"; break;
113     case ucp_Buhid: scriptname = US"Buhid"; break;
114     case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
115     case ucp_Cherokee: scriptname = US"Cherokee"; break;
116     case ucp_Common: scriptname = US"Common"; break;
117     case ucp_Coptic: scriptname = US"Coptic"; break;
118     case ucp_Cuneiform: scriptname = US"Cuneiform"; break;
119     case ucp_Cypriot: scriptname = US"Cypriot"; break;
120     case ucp_Cyrillic: scriptname = US"Cyrillic"; break;
121     case ucp_Deseret: scriptname = US"Deseret"; break;
122     case ucp_Devanagari: scriptname = US"Devanagari"; break;
123     case ucp_Ethiopic: scriptname = US"Ethiopic"; break;
124     case ucp_Georgian: scriptname = US"Georgian"; break;
125     case ucp_Glagolitic: scriptname = US"Glagolitic"; break;
126     case ucp_Gothic: scriptname = US"Gothic"; break;
127     case ucp_Greek: scriptname = US"Greek"; break;
128     case ucp_Gujarati: scriptname = US"Gujarati"; break;
129     case ucp_Gurmukhi: scriptname = US"Gurmukhi"; break;
130     case ucp_Han: scriptname = US"Han"; break;
131     case ucp_Hangul: scriptname = US"Hangul"; break;
132     case ucp_Hanunoo: scriptname = US"Hanunoo"; break;
133     case ucp_Hebrew: scriptname = US"Hebrew"; break;
134     case ucp_Hiragana: scriptname = US"Hiragana"; break;
135     case ucp_Inherited: scriptname = US"Inherited"; break;
136     case ucp_Kannada: scriptname = US"Kannada"; break;
137     case ucp_Katakana: scriptname = US"Katakana"; break;
138     case ucp_Kharoshthi: scriptname = US"Kharoshthi"; break;
139     case ucp_Khmer: scriptname = US"Khmer"; break;
140     case ucp_Lao: scriptname = US"Lao"; break;
141     case ucp_Latin: scriptname = US"Latin"; break;
142     case ucp_Limbu: scriptname = US"Limbu"; break;
143     case ucp_Linear_B: scriptname = US"Linear_B"; break;
144     case ucp_Malayalam: scriptname = US"Malayalam"; break;
145     case ucp_Mongolian: scriptname = US"Mongolian"; break;
146     case ucp_Myanmar: scriptname = US"Myanmar"; break;
147     case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
148     case ucp_Nko: scriptname = US"Nko"; break;
149     case ucp_Ogham: scriptname = US"Ogham"; break;
150     case ucp_Old_Italic: scriptname = US"Old_Italic"; break;
151     case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
152     case ucp_Oriya: scriptname = US"Oriya"; break;
153     case ucp_Osmanya: scriptname = US"Osmanya"; break;
154     case ucp_Phags_Pa: scriptname = US"Phags_Pa"; break;
155     case ucp_Phoenician: scriptname = US"Phoenician"; break;
156     case ucp_Runic: scriptname = US"Runic"; break;
157     case ucp_Shavian: scriptname = US"Shavian"; break;
158     case ucp_Sinhala: scriptname = US"Sinhala"; break;
159     case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
160     case ucp_Syriac: scriptname = US"Syriac"; break;
161     case ucp_Tagalog: scriptname = US"Tagalog"; break;
162     case ucp_Tagbanwa: scriptname = US"Tagbanwa"; break;
163     case ucp_Tai_Le: scriptname = US"Tai_Le"; break;
164     case ucp_Tamil: scriptname = US"Tamil"; break;
165     case ucp_Telugu: scriptname = US"Telugu"; break;
166     case ucp_Thaana: scriptname = US"Thaana"; break;
167     case ucp_Thai: scriptname = US"Thai"; break;
168     case ucp_Tibetan: scriptname = US"Tibetan"; break;
169     case ucp_Tifinagh: scriptname = US"Tifinagh"; break;
170     case ucp_Ugaritic: scriptname = US"Ugaritic"; break;
171     case ucp_Yi: scriptname = US"Yi"; break;
172     }
173    
174     printf("%04x %s: %s %s", c, typename, fulltypename, scriptname);
175     if (othercase != c) printf(" %04x", othercase);
176     printf("\n");
177 ph10 97 }
178    
179    
180    
181     /*************************************************
182     * Main program *
183     *************************************************/
184    
185     int
186     main(void)
187     {
188     uschar buffer[1024];
189     while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
190     {
191     uschar name[24];
192     uschar *s, *t;
193    
194     printf("%s", buffer);
195     s = buffer;
196     while (isspace(*s)) s++;
197     if (*s == 0) continue;
198    
199     for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
200     *t = 0;
201     while (isspace(*s)) s++;
202    
203     if (strcmp(CS name, "findprop") == 0)
204     {
205     while (*s != 0)
206     {
207     uschar *endptr;
208     int c = strtoul(CS s, CSS(&endptr), 16);
209     print_prop(c);
210     s = endptr;
211     while (isspace(*s)) s++;
212     }
213     }
214    
215     else printf("Unknown test command %s\n", name);
216     }
217    
218     return 0;
219     }
220    
221     /* End */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12