/***************************************************
* A program for testing the Unicode property table *
***************************************************/

/* Copyright (c) University of Cambridge 2008 */

/* Compile thus:
   gcc -DHAVE_CONFIG_H -o ucptest ucptest.c ../pcre_ucd.c ../pcre_tables.c
*/

/* The program expects to read commands on stdin, and it writes output
to stdout. There is only one command, "findprop", followed by a list of Unicode
code points as hex numbers (without any prefixes). The output is one line per
character, giving its Unicode properties followed by its other case if there is
one. */

| 17 |
#ifdef HAVE_CONFIG_H
|
| 18 |
#include "../config.h"
|
| 19 |
#endif
|
| 20 |
|
| 21 |
#include <ctype.h>
|
| 22 |
#include <stdio.h>
|
| 23 |
#include <stdlib.h>
|
| 24 |
#include <string.h>
|
| 25 |
#include "../pcre_internal.h"
|
| 26 |
#include "../ucp.h"
|
| 27 |
|
| 28 |
|
/* -------------------------------------------------------------------*/

/* Cast shorthands. Each macro expands to a bare parenthesized cast and is
written directly in front of the expression it converts, for example
"CS buffer" or "US"??"". They are used to move between plain char and
unsigned char strings when calling the standard library. */

#define CS   (char *)
#define CCS  (const char *)
#define CSS  (char **)
#define US   (unsigned char *)
#define CUS  (const unsigned char *)
#define USS  (unsigned char **)

/* -------------------------------------------------------------------*/




| 43 |
/*************************************************
|
| 44 |
* Print Unicode property info for a char *
|
| 45 |
*************************************************/
|
| 46 |
|
| 47 |
static void
|
| 48 |
print_prop(int c)
|
| 49 |
{
|
| 50 |
int type = UCD_CATEGORY(c);
|
| 51 |
int fulltype = UCD_CHARTYPE(c);
|
| 52 |
int script = UCD_SCRIPT(c);
|
| 53 |
int othercase = UCD_OTHERCASE(c);
|
| 54 |
|
| 55 |
uschar *fulltypename = US"??";
|
| 56 |
uschar *typename = US"??";
|
| 57 |
uschar *scriptname = US"??";
|
| 58 |
|
| 59 |
switch (type)
|
| 60 |
{
|
| 61 |
case ucp_C: typename = US"Control"; break;
|
| 62 |
case ucp_L: typename = US"Letter"; break;
|
| 63 |
case ucp_M: typename = US"Mark"; break;
|
| 64 |
case ucp_N: typename = US"Number"; break;
|
| 65 |
case ucp_P: typename = US"Punctuation"; break;
|
| 66 |
case ucp_S: typename = US"Symbol"; break;
|
| 67 |
case ucp_Z: typename = US"Separator"; break;
|
| 68 |
}
|
| 69 |
|
| 70 |
switch (fulltype)
|
| 71 |
{
|
| 72 |
case ucp_Cc: fulltypename = US"Control"; break;
|
| 73 |
case ucp_Cf: fulltypename = US"Format"; break;
|
| 74 |
case ucp_Cn: fulltypename = US"Unassigned"; break;
|
| 75 |
case ucp_Co: fulltypename = US"Private use"; break;
|
| 76 |
case ucp_Cs: fulltypename = US"Surrogate"; break;
|
| 77 |
case ucp_Ll: fulltypename = US"Lower case letter"; break;
|
| 78 |
case ucp_Lm: fulltypename = US"Modifier letter"; break;
|
| 79 |
case ucp_Lo: fulltypename = US"Other letter"; break;
|
| 80 |
case ucp_Lt: fulltypename = US"Title case letter"; break;
|
| 81 |
case ucp_Lu: fulltypename = US"Upper case letter"; break;
|
| 82 |
case ucp_Mc: fulltypename = US"Spacing mark"; break;
|
| 83 |
case ucp_Me: fulltypename = US"Enclosing mark"; break;
|
| 84 |
case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
|
| 85 |
case ucp_Nd: fulltypename = US"Decimal number"; break;
|
| 86 |
case ucp_Nl: fulltypename = US"Letter number"; break;
|
| 87 |
case ucp_No: fulltypename = US"Other number"; break;
|
| 88 |
case ucp_Pc: fulltypename = US"Connector punctuation"; break;
|
| 89 |
case ucp_Pd: fulltypename = US"Dash punctuation"; break;
|
| 90 |
case ucp_Pe: fulltypename = US"Close punctuation"; break;
|
| 91 |
case ucp_Pf: fulltypename = US"Final punctuation"; break;
|
| 92 |
case ucp_Pi: fulltypename = US"Initial punctuation"; break;
|
| 93 |
case ucp_Po: fulltypename = US"Other punctuation"; break;
|
| 94 |
case ucp_Ps: fulltypename = US"Open punctuation"; break;
|
| 95 |
case ucp_Sc: fulltypename = US"Currency symbol"; break;
|
| 96 |
case ucp_Sk: fulltypename = US"Modifier symbol"; break;
|
| 97 |
case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
|
| 98 |
case ucp_So: fulltypename = US"Other symbol"; break;
|
| 99 |
case ucp_Zl: fulltypename = US"Line separator"; break;
|
| 100 |
case ucp_Zp: fulltypename = US"Paragraph separator"; break;
|
| 101 |
case ucp_Zs: fulltypename = US"Space separator"; break;
|
| 102 |
}
|
| 103 |
|
| 104 |
switch(script)
|
| 105 |
{
|
| 106 |
case ucp_Arabic: scriptname = US"Arabic"; break;
|
| 107 |
case ucp_Armenian: scriptname = US"Armenian"; break;
|
| 108 |
case ucp_Balinese: scriptname = US"Balinese"; break;
|
| 109 |
case ucp_Bengali: scriptname = US"Bengali"; break;
|
| 110 |
case ucp_Bopomofo: scriptname = US"Bopomofo"; break;
|
| 111 |
case ucp_Braille: scriptname = US"Braille"; break;
|
| 112 |
case ucp_Buginese: scriptname = US"Buginese"; break;
|
| 113 |
case ucp_Buhid: scriptname = US"Buhid"; break;
|
| 114 |
case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
|
| 115 |
case ucp_Cherokee: scriptname = US"Cherokee"; break;
|
| 116 |
case ucp_Common: scriptname = US"Common"; break;
|
| 117 |
case ucp_Coptic: scriptname = US"Coptic"; break;
|
| 118 |
case ucp_Cuneiform: scriptname = US"Cuneiform"; break;
|
| 119 |
case ucp_Cypriot: scriptname = US"Cypriot"; break;
|
| 120 |
case ucp_Cyrillic: scriptname = US"Cyrillic"; break;
|
| 121 |
case ucp_Deseret: scriptname = US"Deseret"; break;
|
| 122 |
case ucp_Devanagari: scriptname = US"Devanagari"; break;
|
| 123 |
case ucp_Ethiopic: scriptname = US"Ethiopic"; break;
|
| 124 |
case ucp_Georgian: scriptname = US"Georgian"; break;
|
| 125 |
case ucp_Glagolitic: scriptname = US"Glagolitic"; break;
|
| 126 |
case ucp_Gothic: scriptname = US"Gothic"; break;
|
| 127 |
case ucp_Greek: scriptname = US"Greek"; break;
|
| 128 |
case ucp_Gujarati: scriptname = US"Gujarati"; break;
|
| 129 |
case ucp_Gurmukhi: scriptname = US"Gurmukhi"; break;
|
| 130 |
case ucp_Han: scriptname = US"Han"; break;
|
| 131 |
case ucp_Hangul: scriptname = US"Hangul"; break;
|
| 132 |
case ucp_Hanunoo: scriptname = US"Hanunoo"; break;
|
| 133 |
case ucp_Hebrew: scriptname = US"Hebrew"; break;
|
| 134 |
case ucp_Hiragana: scriptname = US"Hiragana"; break;
|
| 135 |
case ucp_Inherited: scriptname = US"Inherited"; break;
|
| 136 |
case ucp_Kannada: scriptname = US"Kannada"; break;
|
| 137 |
case ucp_Katakana: scriptname = US"Katakana"; break;
|
| 138 |
case ucp_Kharoshthi: scriptname = US"Kharoshthi"; break;
|
| 139 |
case ucp_Khmer: scriptname = US"Khmer"; break;
|
| 140 |
case ucp_Lao: scriptname = US"Lao"; break;
|
| 141 |
case ucp_Latin: scriptname = US"Latin"; break;
|
| 142 |
case ucp_Limbu: scriptname = US"Limbu"; break;
|
| 143 |
case ucp_Linear_B: scriptname = US"Linear_B"; break;
|
| 144 |
case ucp_Malayalam: scriptname = US"Malayalam"; break;
|
| 145 |
case ucp_Mongolian: scriptname = US"Mongolian"; break;
|
| 146 |
case ucp_Myanmar: scriptname = US"Myanmar"; break;
|
| 147 |
case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
|
| 148 |
case ucp_Nko: scriptname = US"Nko"; break;
|
| 149 |
case ucp_Ogham: scriptname = US"Ogham"; break;
|
| 150 |
case ucp_Old_Italic: scriptname = US"Old_Italic"; break;
|
| 151 |
case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
|
| 152 |
case ucp_Oriya: scriptname = US"Oriya"; break;
|
| 153 |
case ucp_Osmanya: scriptname = US"Osmanya"; break;
|
| 154 |
case ucp_Phags_Pa: scriptname = US"Phags_Pa"; break;
|
| 155 |
case ucp_Phoenician: scriptname = US"Phoenician"; break;
|
| 156 |
case ucp_Runic: scriptname = US"Runic"; break;
|
| 157 |
case ucp_Shavian: scriptname = US"Shavian"; break;
|
| 158 |
case ucp_Sinhala: scriptname = US"Sinhala"; break;
|
| 159 |
case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
|
| 160 |
case ucp_Syriac: scriptname = US"Syriac"; break;
|
| 161 |
case ucp_Tagalog: scriptname = US"Tagalog"; break;
|
| 162 |
case ucp_Tagbanwa: scriptname = US"Tagbanwa"; break;
|
| 163 |
case ucp_Tai_Le: scriptname = US"Tai_Le"; break;
|
| 164 |
case ucp_Tamil: scriptname = US"Tamil"; break;
|
| 165 |
case ucp_Telugu: scriptname = US"Telugu"; break;
|
| 166 |
case ucp_Thaana: scriptname = US"Thaana"; break;
|
| 167 |
case ucp_Thai: scriptname = US"Thai"; break;
|
| 168 |
case ucp_Tibetan: scriptname = US"Tibetan"; break;
|
| 169 |
case ucp_Tifinagh: scriptname = US"Tifinagh"; break;
|
| 170 |
case ucp_Ugaritic: scriptname = US"Ugaritic"; break;
|
| 171 |
case ucp_Yi: scriptname = US"Yi"; break;
|
| 172 |
}
|
| 173 |
|
| 174 |
printf("%04x %s: %s %s", c, typename, fulltypename, scriptname);
|
| 175 |
if (othercase != c) printf(" %04x", othercase);
|
| 176 |
printf("\n");
|
| 177 |
}
|
| 178 |
|
| 179 |
|
| 180 |
|
| 181 |
/*************************************************
|
| 182 |
* Main program *
|
| 183 |
*************************************************/
|
| 184 |
|
| 185 |
int
|
| 186 |
main(void)
|
| 187 |
{
|
| 188 |
uschar buffer[1024];
|
| 189 |
while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
|
| 190 |
{
|
| 191 |
uschar name[24];
|
| 192 |
uschar *s, *t;
|
| 193 |
|
| 194 |
printf("%s", buffer);
|
| 195 |
s = buffer;
|
| 196 |
while (isspace(*s)) s++;
|
| 197 |
if (*s == 0) continue;
|
| 198 |
|
| 199 |
for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
| 200 |
*t = 0;
|
| 201 |
while (isspace(*s)) s++;
|
| 202 |
|
| 203 |
if (strcmp(CS name, "findprop") == 0)
|
| 204 |
{
|
| 205 |
while (*s != 0)
|
| 206 |
{
|
| 207 |
uschar *endptr;
|
| 208 |
int c = strtoul(CS s, CSS(&endptr), 16);
|
| 209 |
print_prop(c);
|
| 210 |
s = endptr;
|
| 211 |
while (isspace(*s)) s++;
|
| 212 |
}
|
| 213 |
}
|
| 214 |
|
| 215 |
else printf("Unknown test command %s\n", name);
|
| 216 |
}
|
| 217 |
|
| 218 |
return 0;
|
| 219 |
}
|

/* End */