/[pcre]/code/trunk/maint/Builducptable
ViewVC logotype

Contents of /code/trunk/maint/Builducptable

Parent Directory Parent Directory | Revision Log Revision Log


Revision 350 - (show annotations) (download)
Wed Jul 2 19:18:41 2008 UTC (6 years, 4 months ago) by ph10
File size: 5608 byte(s)
More of the UCP speedup update.

1 ############################################################
2 ############################################################
3 ## As of PCRE 8.0 this file is OBSOLETE. A different way ##
4 ## of handling Unicode property data is now used. See the ##
5 ## maint/README document. ##
6 ## PH 02 July 2008 ##
7 ############################################################
8 ############################################################
9
10 #! /usr/bin/perl -w
11
12 # This is a Perl script to create the table of character properties. For
13 # information on the format, see ucpinternal.h. The Unicode files are expected
14 # to be in Unicode.tables/{Scripts,UnicodeData}.txt. The ../ucp.h file is also
15 # required. The table is written to the standard output.
16
17 # The script is rather slow because it just searches linearly through the
18 # Scripts data in order to find the script for each character or character
19 # range. It could be made faster by sorting that data, or something, but hey,
20 # it is only ever run once in a blue moon. (It's even slower after I mended the
21 # "forgot to check for script number before amalgamation" bug, but even so,
22 # the effort of improving it isn't worth it.)
23
24 # Subroutine: Given a character number, return the script number. The
25 # Scripts.txt file has been read into an array, keeping just the codepoints
26 # and the script name. The lines are in one of two formats:
27 #
28 # xxxx name
29 # xxxx..yyyy name
30 #
31 # where xxxx and yyyy are code points.
32
33 sub script{
34 my($n) = $_[0];
35 foreach $line (@scriptlist)
36 {
37 my($a,$z,$s);
38
39 if ($line =~ /\.\./)
40 {
41 ($a,$z,$s) = $line =~ /^([^\.]+)\.\.(\S+)\s+(.*)/;
42 }
43 else
44 {
45 ($a,$s) = $line =~ /^(\S+)\s+(.*)/;
46 $z = $a;
47 }
48
49 die "Problem on this scripts data line:\n$line"
50 if (!defined $a || !defined $z);
51
52 if ($n >= hex($a) && $n <= hex($z))
53 {
54 my($x) = $scriptnum{$s};
55 return $x if defined $x;
56 die "Can't find script number for $s\n";
57 }
58 }
59
60 # All code points not explicitly listed are "Common"
61
62 return $scriptnum{"Common"};
63 }
64
65
66 # Subroutine: given a category name, return its number
67
68 sub category {
69 my($x) = $gencat{$_[0]};
70 return $x if defined $x;
71 die "Can't find number for general category $_[0]\n";
72 }
73
74
75 # Subroutine: output an entry for a range, unless it isn't really a range,
76 # in which case just output a single entry.
77
78 sub outrange{
79 my($cp,$ncp,$gc) = @_;
80 my($flag) = ($cp != $ncp)? 0x00800000 : 0;
81 printf " { 0x%08x, 0x%08x },\n",
82 $cp | $flag | (script($cp) << 24),
83 (category($gc) << 26) | $ncp - $cp;
84 }
85
86
87 # Entry point: An argument giving the Unicode version is required.
88
89 die "Require a single argument, the Unicode version"
90 if ($#ARGV != 0);
91 $Uversion = shift @ARGV;
92
93
94 # Read in the Scripts.txt file, retaining only the code points
95 # and script names.
96
97 open(IN, "Unicode.tables/Scripts.txt") ||
98 die "Can't open Unicode.tables/Scripts.txt: $!\n";
99
100 while (<IN>)
101 {
102 next if !/^[0-9A-Z]/;
103 my($range,$name) = $_ =~ /^(\S+)\s*;\s*(\S+)/;
104 push @scriptlist, "$range $name";
105 }
106 close(IN);
107
108
109 # Now read the ucp.h file to get the values for the general categories
110 # and for the scripts.
111
112 open(IN, "../ucp.h") || die "Can't open ../ucp.h: $!\n";
113
114 while (<IN>) { last if /^enum/; }
115 while (<IN>) { last if /^enum/; }
116
117
118 # The second enum are the general categories.
119
120 $count = 0;
121 while (<IN>)
122 {
123 last if $_ !~ /^\s+ucp_(..)/;
124 $gencat{$1} = $count++;
125 }
126
127 while (<IN>) { last if /^enum/; }
128
129 # The third enum are script names.
130
131 $count = 0;
132 while (<IN>)
133 {
134 last if $_ !~ /^\s+ucp_(\w+)/;
135 $scriptnum{$1} = $count++;
136 }
137
138 close(IN);
139
140 # Write out the initial boilerplace.
141
142 print "/* This source module is automatically generated from the Unicode\n" .
143 "property table. See ucpinternal.h for a description of the layout.\n" .
144 "This version was made from the Unicode $Uversion tables. */\n\n" .
145 "static const cnode ucp_table[] = {\n";
146
147 # Now read the input file and generate the output.
148
149 open(IN, "Unicode.tables/UnicodeData.txt") ||
150 die "Can't open Unicode.tables/UnicodeData.txt: $!\n";
151
152 while (<IN>)
153 {
154 @fields = split /;/;
155
156 $cp = hex($fields[0]);
157 $gc = $fields[2];
158 $uc = $fields[12];
159 $lc = $fields[13];
160
161 # If this line has no "other case" data, it might be the start or end of
162 # a range, either one that is explicit in the data, or one that we can
163 # create by scanning forwards.
164
165 if ($uc eq "" && $lc eq "")
166 {
167 if ($fields[1] =~ /First>$/)
168 {
169 $_ = <IN>;
170 @fields = split /;/;
171 die "First not followed by Last\n", if $fields[1] !~ /Last>$/;
172 die "First and last have different categories\n",
173 if $gc ne $fields[2];
174 outrange($cp, hex($fields[0]), $gc);
175 }
176
177 else
178 {
179 my($startscript) = script($cp);
180 my($ncp) = $cp + 1;
181 while (<IN>)
182 {
183 @fields = split /;/;
184 last if (hex($fields[0]) != $ncp ||
185 $fields[2] ne $gc ||
186 $fields[12] ne "" ||
187 $fields[13] ne "" ||
188 script($ncp) != $startscript);
189
190 $ncp++;
191 }
192
193 $ncp--;
194 outrange($cp, $ncp, $gc);
195 redo if defined $_; # Reprocess terminating line
196 }
197 }
198
199 # If there is an "other case" character, we output a single-char line
200
201 else
202 {
203 my($co) = (hex(($uc eq "")? $lc : $uc) - $cp) & 0xffff;
204 printf " { 0x%08x, 0x%08x },\n",
205 $cp | (script($cp) << 24), (category($gc) << 26) | $co;
206 }
207 }
208
209 close(IN);
210
211 # Final boilerplate
212
213 print "};\n";
214
215 # End

Properties

Name Value
svn:executable *

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12