| 1 |
#! /usr/bin/perl -w
|
| 2 |
|
| 3 |
# This is a Perl script to create the table of character properties. For
|
| 4 |
# information on the format, see ucpinternal.h. The Unicode files are
|
| 5 |
# expected to be in maintain/Unicode.tables/{Scripts,UnicodeData}.txt. The
|
| 6 |
# ucp.h file is also required. The table is written to the standard output.
|
| 7 |
|
| 8 |
# The script is rather slow because it just searches linearly through the
|
| 9 |
# Scripts data in order to find the script for each character or character
|
| 10 |
# range. It could be made faster by sorting that data, or something, but hey,
|
| 11 |
# it is only ever run once in a blue moon.
|
| 12 |
|
| 13 |
# Subroutine: Given a character number, return the script number. The
|
| 14 |
# Scripts.txt file has been read into an array, keeping just the codepoints
|
| 15 |
# and the script name. The lines are in one of two formats:
|
| 16 |
#
|
| 17 |
# xxxx name
|
| 18 |
# xxxx..yyyy name
|
| 19 |
#
|
| 20 |
# where xxxx and yyyy are code points.
|
| 21 |
|
| 22 |
sub script{
|
| 23 |
my($n) = $_[0];
|
| 24 |
foreach $line (@scriptlist)
|
| 25 |
{
|
| 26 |
my($a,$z,$s);
|
| 27 |
|
| 28 |
if ($line =~ /\.\./)
|
| 29 |
{
|
| 30 |
($a,$z,$s) = $line =~ /^([^\.]+)\.\.(\S+)\s+(.*)/;
|
| 31 |
}
|
| 32 |
else
|
| 33 |
{
|
| 34 |
($a,$s) = $line =~ /^(\S+)\s+(.*)/;
|
| 35 |
$z = $a;
|
| 36 |
}
|
| 37 |
|
| 38 |
die "Problem on this scripts data line:\n$line"
|
| 39 |
if (!defined $a || !defined $z);
|
| 40 |
|
| 41 |
if ($n >= hex($a) && $n <= hex($z))
|
| 42 |
{
|
| 43 |
my($x) = $scriptnum{$s};
|
| 44 |
return $x if defined $x;
|
| 45 |
die "Can't find script number for $s\n";
|
| 46 |
}
|
| 47 |
}
|
| 48 |
|
| 49 |
# All code points not explicitly listed are "Common"
|
| 50 |
|
| 51 |
return $scriptnum{"Common"};
|
| 52 |
}
|
| 53 |
|
| 54 |
|
| 55 |
# Subroutine: given a category name, return its number
|
| 56 |
|
| 57 |
sub category {
|
| 58 |
my($x) = $gencat{$_[0]};
|
| 59 |
return $x if defined $x;
|
| 60 |
die "Can't find number for general category $_[0]\n";
|
| 61 |
}
|
| 62 |
|
| 63 |
|
| 64 |
# Subroutine: output an entry for a range, unless it isn't really a range,
|
| 65 |
# in which case just output a single entry.
|
| 66 |
|
| 67 |
sub outrange{
|
| 68 |
my($cp,$ncp,$gc) = @_;
|
| 69 |
my($flag) = ($cp != $ncp)? 0x00800000 : 0;
|
| 70 |
printf " { 0x%08x, 0x%08x },\n",
|
| 71 |
$cp | $flag | (script($cp) << 24),
|
| 72 |
(category($gc) << 26) | $ncp - $cp;
|
| 73 |
}
|
| 74 |
|
| 75 |
|
| 76 |
# Entry point: An argument giving the Unicode version is required.
|
| 77 |
|
| 78 |
die "Require a single argument, the Unicode version"
|
| 79 |
if ($#ARGV != 0);
|
| 80 |
$Uversion = shift @ARGV;
|
| 81 |
|
| 82 |
|
| 83 |
# Read in the Scripts.txt file, retaining only the code points
|
| 84 |
# and script names.
|
| 85 |
|
| 86 |
open(IN, "Unicode.tables/Scripts.txt") ||
|
| 87 |
die "Can't open Unicode.tables/Scripts.txt: $!\n";
|
| 88 |
|
| 89 |
while (<IN>)
|
| 90 |
{
|
| 91 |
next if !/^[0-9A-Z]/;
|
| 92 |
my($range,$name) = $_ =~ /^(\S+)\s*;\s*(\S+)/;
|
| 93 |
push @scriptlist, "$range $name";
|
| 94 |
}
|
| 95 |
close(IN);
|
| 96 |
|
| 97 |
|
| 98 |
# Now read the ucp.h file to get the values for the general categories
|
| 99 |
# and for the scripts.
|
| 100 |
|
| 101 |
open(IN, "ucp.h") || die "Can't open ucp.h: $!\n";
|
| 102 |
|
| 103 |
while (<IN>) { last if /^enum/; }
|
| 104 |
while (<IN>) { last if /^enum/; }
|
| 105 |
|
| 106 |
|
| 107 |
# The second enum are the general categories.
|
| 108 |
|
| 109 |
$count = 0;
|
| 110 |
while (<IN>)
|
| 111 |
{
|
| 112 |
last if $_ !~ /^\s+ucp_(..)/;
|
| 113 |
$gencat{$1} = $count++;
|
| 114 |
}
|
| 115 |
|
| 116 |
while (<IN>) { last if /^enum/; }
|
| 117 |
|
| 118 |
# The third enum are script names.
|
| 119 |
|
| 120 |
$count = 0;
|
| 121 |
while (<IN>)
|
| 122 |
{
|
| 123 |
last if $_ !~ /^\s+ucp_(\w+)/;
|
| 124 |
$scriptnum{$1} = $count++;
|
| 125 |
}
|
| 126 |
|
| 127 |
close(IN);
|
| 128 |
|
| 129 |
# Write out the initial boilerplace.
|
| 130 |
|
| 131 |
print "/* This source module is automatically generated from the Unicode\n" .
|
| 132 |
"property table. See ucpinternal.h for a description of the layout.\n" .
|
| 133 |
"This version was made from the Unicode $Uversion tables. */\n\n" .
|
| 134 |
"static cnode ucp_table[] = {\n";
|
| 135 |
|
| 136 |
# Now read the input file and generate the output.
|
| 137 |
|
| 138 |
open(IN, "Unicode.tables/UnicodeData.txt") ||
|
| 139 |
die "Can't open Unicode.tables/UnicodeData.txt: $!\n";
|
| 140 |
|
| 141 |
while (<IN>)
|
| 142 |
{
|
| 143 |
@fields = split /;/;
|
| 144 |
|
| 145 |
$cp = hex($fields[0]);
|
| 146 |
$gc = $fields[2];
|
| 147 |
$uc = $fields[12];
|
| 148 |
$lc = $fields[13];
|
| 149 |
|
| 150 |
# If this line has no "other case" data, it might be the start or end of
|
| 151 |
# a range, either one that is explicit in the data, or one that we can
|
| 152 |
# create by scanning forwards.
|
| 153 |
|
| 154 |
if ($uc eq "" && $lc eq "")
|
| 155 |
{
|
| 156 |
if ($fields[1] =~ /First>$/)
|
| 157 |
{
|
| 158 |
$_ = <IN>;
|
| 159 |
@fields = split /;/;
|
| 160 |
die "First not followed by Last\n", if $fields[1] !~ /Last>$/;
|
| 161 |
die "First and last have different categories\n",
|
| 162 |
if $gc ne $fields[2];
|
| 163 |
outrange($cp, hex($fields[0]), $gc);
|
| 164 |
}
|
| 165 |
|
| 166 |
else
|
| 167 |
{
|
| 168 |
my($ncp) = $cp + 1;
|
| 169 |
while (<IN>)
|
| 170 |
{
|
| 171 |
@fields = split /;/;
|
| 172 |
last if (hex($fields[0]) != $ncp ||
|
| 173 |
$fields[2] ne $gc ||
|
| 174 |
$fields[12] ne "" ||
|
| 175 |
$fields[13] ne "");
|
| 176 |
|
| 177 |
$ncp++;
|
| 178 |
}
|
| 179 |
|
| 180 |
$ncp--;
|
| 181 |
outrange($cp, $ncp, $gc);
|
| 182 |
redo if defined $_; # Reprocess terminating line
|
| 183 |
}
|
| 184 |
}
|
| 185 |
|
| 186 |
# If there is an "other case" character, we output a single-char line
|
| 187 |
|
| 188 |
else
|
| 189 |
{
|
| 190 |
my($co) = (hex(($uc eq "")? $lc : $uc) - $cp) & 0xffff;
|
| 191 |
printf " { 0x%08x, 0x%08x },\n",
|
| 192 |
$cp | (script($cp) << 24), (category($gc) << 26) | $co;
|
| 193 |
}
|
| 194 |
}
|
| 195 |
|
| 196 |
close(IN);
|
| 197 |
|
| 198 |
# Final boilerplate
|
| 199 |
|
| 200 |
print "};\n";
|
| 201 |
|
| 202 |
# End
|