/[pcre]/code/trunk/maint/Builducptable
ViewVC logotype

Contents of /code/trunk/maint/Builducptable

Parent Directory Parent Directory | Revision Log Revision Log


Revision 277 - (show annotations) (download)
Mon Nov 26 17:03:45 2007 UTC (6 years, 11 months ago) by ph10
File size: 5119 byte(s)
Fix incorrect script bug in Unicode character table.

1 #! /usr/bin/perl -w
2
3 # This is a Perl script to create the table of character properties. For
4 # information on the format, see ucpinternal.h. The Unicode files are expected
5 # to be in Unicode.tables/{Scripts,UnicodeData}.txt. The ../ucp.h file is also
6 # required. The table is written to the standard output.
7
8 # The script is rather slow because it just searches linearly through the
9 # Scripts data in order to find the script for each character or character
10 # range. It could be made faster by sorting that data, or something, but hey,
11 # it is only ever run once in a blue moon. (It's even slower after I mended the
12 # "forgot to check for script number before amalgamation" bug, but even so,
13 # the effort of improving it isn't worth it.)
14
15 # Subroutine: Given a character number, return the script number. The
16 # Scripts.txt file has been read into an array, keeping just the codepoints
17 # and the script name. The lines are in one of two formats:
18 #
19 # xxxx name
20 # xxxx..yyyy name
21 #
22 # where xxxx and yyyy are code points.
23
24 sub script{
25 my($n) = $_[0];
26 foreach $line (@scriptlist)
27 {
28 my($a,$z,$s);
29
30 if ($line =~ /\.\./)
31 {
32 ($a,$z,$s) = $line =~ /^([^\.]+)\.\.(\S+)\s+(.*)/;
33 }
34 else
35 {
36 ($a,$s) = $line =~ /^(\S+)\s+(.*)/;
37 $z = $a;
38 }
39
40 die "Problem on this scripts data line:\n$line"
41 if (!defined $a || !defined $z);
42
43 if ($n >= hex($a) && $n <= hex($z))
44 {
45 my($x) = $scriptnum{$s};
46 return $x if defined $x;
47 die "Can't find script number for $s\n";
48 }
49 }
50
51 # All code points not explicitly listed are "Common"
52
53 return $scriptnum{"Common"};
54 }
55
56
57 # Subroutine: given a category name, return its number
58
59 sub category {
60 my($x) = $gencat{$_[0]};
61 return $x if defined $x;
62 die "Can't find number for general category $_[0]\n";
63 }
64
65
66 # Subroutine: output an entry for a range, unless it isn't really a range,
67 # in which case just output a single entry.
68
69 sub outrange{
70 my($cp,$ncp,$gc) = @_;
71 my($flag) = ($cp != $ncp)? 0x00800000 : 0;
72 printf " { 0x%08x, 0x%08x },\n",
73 $cp | $flag | (script($cp) << 24),
74 (category($gc) << 26) | $ncp - $cp;
75 }
76
77
78 # Entry point: An argument giving the Unicode version is required.
79
80 die "Require a single argument, the Unicode version"
81 if ($#ARGV != 0);
82 $Uversion = shift @ARGV;
83
84
85 # Read in the Scripts.txt file, retaining only the code points
86 # and script names.
87
88 open(IN, "Unicode.tables/Scripts.txt") ||
89 die "Can't open Unicode.tables/Scripts.txt: $!\n";
90
91 while (<IN>)
92 {
93 next if !/^[0-9A-Z]/;
94 my($range,$name) = $_ =~ /^(\S+)\s*;\s*(\S+)/;
95 push @scriptlist, "$range $name";
96 }
97 close(IN);
98
99
100 # Now read the ucp.h file to get the values for the general categories
101 # and for the scripts.
102
103 open(IN, "../ucp.h") || die "Can't open ../ucp.h: $!\n";
104
105 while (<IN>) { last if /^enum/; }
106 while (<IN>) { last if /^enum/; }
107
108
109 # The second enum are the general categories.
110
111 $count = 0;
112 while (<IN>)
113 {
114 last if $_ !~ /^\s+ucp_(..)/;
115 $gencat{$1} = $count++;
116 }
117
118 while (<IN>) { last if /^enum/; }
119
120 # The third enum are script names.
121
122 $count = 0;
123 while (<IN>)
124 {
125 last if $_ !~ /^\s+ucp_(\w+)/;
126 $scriptnum{$1} = $count++;
127 }
128
129 close(IN);
130
131 # Write out the initial boilerplace.
132
133 print "/* This source module is automatically generated from the Unicode\n" .
134 "property table. See ucpinternal.h for a description of the layout.\n" .
135 "This version was made from the Unicode $Uversion tables. */\n\n" .
136 "static const cnode ucp_table[] = {\n";
137
138 # Now read the input file and generate the output.
139
140 open(IN, "Unicode.tables/UnicodeData.txt") ||
141 die "Can't open Unicode.tables/UnicodeData.txt: $!\n";
142
143 while (<IN>)
144 {
145 @fields = split /;/;
146
147 $cp = hex($fields[0]);
148 $gc = $fields[2];
149 $uc = $fields[12];
150 $lc = $fields[13];
151
152 # If this line has no "other case" data, it might be the start or end of
153 # a range, either one that is explicit in the data, or one that we can
154 # create by scanning forwards.
155
156 if ($uc eq "" && $lc eq "")
157 {
158 if ($fields[1] =~ /First>$/)
159 {
160 $_ = <IN>;
161 @fields = split /;/;
162 die "First not followed by Last\n", if $fields[1] !~ /Last>$/;
163 die "First and last have different categories\n",
164 if $gc ne $fields[2];
165 outrange($cp, hex($fields[0]), $gc);
166 }
167
168 else
169 {
170 my($startscript) = script($cp);
171 my($ncp) = $cp + 1;
172 while (<IN>)
173 {
174 @fields = split /;/;
175 last if (hex($fields[0]) != $ncp ||
176 $fields[2] ne $gc ||
177 $fields[12] ne "" ||
178 $fields[13] ne "" ||
179 script($ncp) != $startscript);
180
181 $ncp++;
182 }
183
184 $ncp--;
185 outrange($cp, $ncp, $gc);
186 redo if defined $_; # Reprocess terminating line
187 }
188 }
189
190 # If there is an "other case" character, we output a single-char line
191
192 else
193 {
194 my($co) = (hex(($uc eq "")? $lc : $uc) - $cp) & 0xffff;
195 printf " { 0x%08x, 0x%08x },\n",
196 $cp | (script($cp) << 24), (category($gc) << 26) | $co;
197 }
198 }
199
200 close(IN);
201
202 # Final boilerplate
203
204 print "};\n";
205
206 # End

Properties

Name Value
svn:executable *

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12