| 1 |
############################################################
|
| 2 |
############################################################
|
| 3 |
## As of PCRE 8.0 this file is OBSOLETE. A different way ##
|
| 4 |
## of handling Unicode property data is now used. See the ##
|
| 5 |
## maint/README document. ##
|
| 6 |
## PH 02 July 2008 ##
|
| 7 |
############################################################
|
| 8 |
############################################################
|
| 9 |
|
| 10 |
#! /usr/bin/perl -w
|
| 11 |
|
| 12 |
# This is a Perl script to create the table of character properties. For
|
| 13 |
# information on the format, see ucpinternal.h. The Unicode files are expected
|
| 14 |
# to be in Unicode.tables/{Scripts,UnicodeData}.txt. The ../ucp.h file is also
|
| 15 |
# required. The table is written to the standard output.
|
| 16 |
|
| 17 |
# The script is rather slow because it just searches linearly through the
|
| 18 |
# Scripts data in order to find the script for each character or character
|
| 19 |
# range. It could be made faster by sorting that data, or something, but hey,
|
| 20 |
# it is only ever run once in a blue moon. (It's even slower after I mended the
|
| 21 |
# "forgot to check for script number before amalgamation" bug, but even so,
|
| 22 |
# the effort of improving it isn't worth it.)
|
| 23 |
|
| 24 |
# Subroutine: Given a character number, return the script number. The
|
| 25 |
# Scripts.txt file has been read into an array, keeping just the codepoints
|
| 26 |
# and the script name. The lines are in one of two formats:
|
| 27 |
#
|
| 28 |
# xxxx name
|
| 29 |
# xxxx..yyyy name
|
| 30 |
#
|
| 31 |
# where xxxx and yyyy are code points.
|
| 32 |
|
| 33 |
sub script{
|
| 34 |
my($n) = $_[0];
|
| 35 |
foreach $line (@scriptlist)
|
| 36 |
{
|
| 37 |
my($a,$z,$s);
|
| 38 |
|
| 39 |
if ($line =~ /\.\./)
|
| 40 |
{
|
| 41 |
($a,$z,$s) = $line =~ /^([^\.]+)\.\.(\S+)\s+(.*)/;
|
| 42 |
}
|
| 43 |
else
|
| 44 |
{
|
| 45 |
($a,$s) = $line =~ /^(\S+)\s+(.*)/;
|
| 46 |
$z = $a;
|
| 47 |
}
|
| 48 |
|
| 49 |
die "Problem on this scripts data line:\n$line"
|
| 50 |
if (!defined $a || !defined $z);
|
| 51 |
|
| 52 |
if ($n >= hex($a) && $n <= hex($z))
|
| 53 |
{
|
| 54 |
my($x) = $scriptnum{$s};
|
| 55 |
return $x if defined $x;
|
| 56 |
die "Can't find script number for $s\n";
|
| 57 |
}
|
| 58 |
}
|
| 59 |
|
| 60 |
# All code points not explicitly listed are "Common"
|
| 61 |
|
| 62 |
return $scriptnum{"Common"};
|
| 63 |
}
|
| 64 |
|
| 65 |
|
| 66 |
# Subroutine: given a category name, return its number
|
| 67 |
|
| 68 |
sub category {
|
| 69 |
my($x) = $gencat{$_[0]};
|
| 70 |
return $x if defined $x;
|
| 71 |
die "Can't find number for general category $_[0]\n";
|
| 72 |
}
|
| 73 |
|
| 74 |
|
| 75 |
# Subroutine: output an entry for a range, unless it isn't really a range,
|
| 76 |
# in which case just output a single entry.
|
| 77 |
|
| 78 |
sub outrange{
|
| 79 |
my($cp,$ncp,$gc) = @_;
|
| 80 |
my($flag) = ($cp != $ncp)? 0x00800000 : 0;
|
| 81 |
printf " { 0x%08x, 0x%08x },\n",
|
| 82 |
$cp | $flag | (script($cp) << 24),
|
| 83 |
(category($gc) << 26) | $ncp - $cp;
|
| 84 |
}
|
| 85 |
|
| 86 |
|
| 87 |
# Entry point: An argument giving the Unicode version is required.
|
| 88 |
|
| 89 |
die "Require a single argument, the Unicode version"
|
| 90 |
if ($#ARGV != 0);
|
| 91 |
$Uversion = shift @ARGV;
|
| 92 |
|
| 93 |
|
| 94 |
# Read in the Scripts.txt file, retaining only the code points
|
| 95 |
# and script names.
|
| 96 |
|
| 97 |
open(IN, "Unicode.tables/Scripts.txt") ||
|
| 98 |
die "Can't open Unicode.tables/Scripts.txt: $!\n";
|
| 99 |
|
| 100 |
while (<IN>)
|
| 101 |
{
|
| 102 |
next if !/^[0-9A-Z]/;
|
| 103 |
my($range,$name) = $_ =~ /^(\S+)\s*;\s*(\S+)/;
|
| 104 |
push @scriptlist, "$range $name";
|
| 105 |
}
|
| 106 |
close(IN);
|
| 107 |
|
| 108 |
|
| 109 |
# Now read the ucp.h file to get the values for the general categories
|
| 110 |
# and for the scripts.
|
| 111 |
|
| 112 |
open(IN, "../ucp.h") || die "Can't open ../ucp.h: $!\n";
|
| 113 |
|
| 114 |
while (<IN>) { last if /^enum/; }
|
| 115 |
while (<IN>) { last if /^enum/; }
|
| 116 |
|
| 117 |
|
| 118 |
# The second enum are the general categories.
|
| 119 |
|
| 120 |
$count = 0;
|
| 121 |
while (<IN>)
|
| 122 |
{
|
| 123 |
last if $_ !~ /^\s+ucp_(..)/;
|
| 124 |
$gencat{$1} = $count++;
|
| 125 |
}
|
| 126 |
|
| 127 |
while (<IN>) { last if /^enum/; }
|
| 128 |
|
| 129 |
# The third enum are script names.
|
| 130 |
|
| 131 |
$count = 0;
|
| 132 |
while (<IN>)
|
| 133 |
{
|
| 134 |
last if $_ !~ /^\s+ucp_(\w+)/;
|
| 135 |
$scriptnum{$1} = $count++;
|
| 136 |
}
|
| 137 |
|
| 138 |
close(IN);
|
| 139 |
|
| 140 |
# Write out the initial boilerplace.
|
| 141 |
|
| 142 |
print "/* This source module is automatically generated from the Unicode\n" .
|
| 143 |
"property table. See ucpinternal.h for a description of the layout.\n" .
|
| 144 |
"This version was made from the Unicode $Uversion tables. */\n\n" .
|
| 145 |
"static const cnode ucp_table[] = {\n";
|
| 146 |
|
| 147 |
# Now read the input file and generate the output.
|
| 148 |
|
| 149 |
open(IN, "Unicode.tables/UnicodeData.txt") ||
|
| 150 |
die "Can't open Unicode.tables/UnicodeData.txt: $!\n";
|
| 151 |
|
| 152 |
while (<IN>)
|
| 153 |
{
|
| 154 |
@fields = split /;/;
|
| 155 |
|
| 156 |
$cp = hex($fields[0]);
|
| 157 |
$gc = $fields[2];
|
| 158 |
$uc = $fields[12];
|
| 159 |
$lc = $fields[13];
|
| 160 |
|
| 161 |
# If this line has no "other case" data, it might be the start or end of
|
| 162 |
# a range, either one that is explicit in the data, or one that we can
|
| 163 |
# create by scanning forwards.
|
| 164 |
|
| 165 |
if ($uc eq "" && $lc eq "")
|
| 166 |
{
|
| 167 |
if ($fields[1] =~ /First>$/)
|
| 168 |
{
|
| 169 |
$_ = <IN>;
|
| 170 |
@fields = split /;/;
|
| 171 |
die "First not followed by Last\n", if $fields[1] !~ /Last>$/;
|
| 172 |
die "First and last have different categories\n",
|
| 173 |
if $gc ne $fields[2];
|
| 174 |
outrange($cp, hex($fields[0]), $gc);
|
| 175 |
}
|
| 176 |
|
| 177 |
else
|
| 178 |
{
|
| 179 |
my($startscript) = script($cp);
|
| 180 |
my($ncp) = $cp + 1;
|
| 181 |
while (<IN>)
|
| 182 |
{
|
| 183 |
@fields = split /;/;
|
| 184 |
last if (hex($fields[0]) != $ncp ||
|
| 185 |
$fields[2] ne $gc ||
|
| 186 |
$fields[12] ne "" ||
|
| 187 |
$fields[13] ne "" ||
|
| 188 |
script($ncp) != $startscript);
|
| 189 |
|
| 190 |
$ncp++;
|
| 191 |
}
|
| 192 |
|
| 193 |
$ncp--;
|
| 194 |
outrange($cp, $ncp, $gc);
|
| 195 |
redo if defined $_; # Reprocess terminating line
|
| 196 |
}
|
| 197 |
}
|
| 198 |
|
| 199 |
# If there is an "other case" character, we output a single-char line
|
| 200 |
|
| 201 |
else
|
| 202 |
{
|
| 203 |
my($co) = (hex(($uc eq "")? $lc : $uc) - $cp) & 0xffff;
|
| 204 |
printf " { 0x%08x, 0x%08x },\n",
|
| 205 |
$cp | (script($cp) << 24), (category($gc) << 26) | $co;
|
| 206 |
}
|
| 207 |
}
|
| 208 |
|
| 209 |
close(IN);
|
| 210 |
|
| 211 |
# Final boilerplate
|
| 212 |
|
| 213 |
print "};\n";
|
| 214 |
|
| 215 |
# End
|