/[pcre]/code/trunk/maint/Builducptable
ViewVC logotype

Contents of /code/trunk/maint/Builducptable

Parent Directory Parent Directory | Revision Log Revision Log


Revision 114 - (show annotations) (download)
Fri Mar 9 10:15:12 2007 UTC (7 years, 1 month ago) by ph10
File size: 4871 byte(s)
Rename "maintain" as "maint".

1 #! /usr/bin/perl -w
2
3 # This is a Perl script to create the table of character properties. For
4 # information on the format, see ucpinternal.h. The Unicode files are expected
5 # to be in Unicode.tables/{Scripts,UnicodeData}.txt. The ../ucp.h file is also
6 # required. The table is written to the standard output.
7
8 # The script is rather slow because it just searches linearly through the
9 # Scripts data in order to find the script for each character or character
10 # range. It could be made faster by sorting that data, or something, but hey,
11 # it is only ever run once in a blue moon.
12
13 # Subroutine: Given a character number, return the script number. The
14 # Scripts.txt file has been read into an array, keeping just the codepoints
15 # and the script name. The lines are in one of two formats:
16 #
17 # xxxx name
18 # xxxx..yyyy name
19 #
20 # where xxxx and yyyy are code points.
21
22 sub script{
23 my($n) = $_[0];
24 foreach $line (@scriptlist)
25 {
26 my($a,$z,$s);
27
28 if ($line =~ /\.\./)
29 {
30 ($a,$z,$s) = $line =~ /^([^\.]+)\.\.(\S+)\s+(.*)/;
31 }
32 else
33 {
34 ($a,$s) = $line =~ /^(\S+)\s+(.*)/;
35 $z = $a;
36 }
37
38 die "Problem on this scripts data line:\n$line"
39 if (!defined $a || !defined $z);
40
41 if ($n >= hex($a) && $n <= hex($z))
42 {
43 my($x) = $scriptnum{$s};
44 return $x if defined $x;
45 die "Can't find script number for $s\n";
46 }
47 }
48
49 # All code points not explicitly listed are "Common"
50
51 return $scriptnum{"Common"};
52 }
53
54
55 # Subroutine: given a category name, return its number
56
57 sub category {
58 my($x) = $gencat{$_[0]};
59 return $x if defined $x;
60 die "Can't find number for general category $_[0]\n";
61 }
62
63
64 # Subroutine: output an entry for a range, unless it isn't really a range,
65 # in which case just output a single entry.
66
67 sub outrange{
68 my($cp,$ncp,$gc) = @_;
69 my($flag) = ($cp != $ncp)? 0x00800000 : 0;
70 printf " { 0x%08x, 0x%08x },\n",
71 $cp | $flag | (script($cp) << 24),
72 (category($gc) << 26) | $ncp - $cp;
73 }
74
75
76 # Entry point: An argument giving the Unicode version is required.
77
78 die "Require a single argument, the Unicode version"
79 if ($#ARGV != 0);
80 $Uversion = shift @ARGV;
81
82
83 # Read in the Scripts.txt file, retaining only the code points
84 # and script names.
85
86 open(IN, "Unicode.tables/Scripts.txt") ||
87 die "Can't open Unicode.tables/Scripts.txt: $!\n";
88
89 while (<IN>)
90 {
91 next if !/^[0-9A-Z]/;
92 my($range,$name) = $_ =~ /^(\S+)\s*;\s*(\S+)/;
93 push @scriptlist, "$range $name";
94 }
95 close(IN);
96
97
98 # Now read the ucp.h file to get the values for the general categories
99 # and for the scripts.
100
101 open(IN, "../ucp.h") || die "Can't open ../ucp.h: $!\n";
102
103 while (<IN>) { last if /^enum/; }
104 while (<IN>) { last if /^enum/; }
105
106
107 # The second enum are the general categories.
108
109 $count = 0;
110 while (<IN>)
111 {
112 last if $_ !~ /^\s+ucp_(..)/;
113 $gencat{$1} = $count++;
114 }
115
116 while (<IN>) { last if /^enum/; }
117
118 # The third enum are script names.
119
120 $count = 0;
121 while (<IN>)
122 {
123 last if $_ !~ /^\s+ucp_(\w+)/;
124 $scriptnum{$1} = $count++;
125 }
126
127 close(IN);
128
129 # Write out the initial boilerplace.
130
131 print "/* This source module is automatically generated from the Unicode\n" .
132 "property table. See ucpinternal.h for a description of the layout.\n" .
133 "This version was made from the Unicode $Uversion tables. */\n\n" .
134 "static const cnode ucp_table[] = {\n";
135
136 # Now read the input file and generate the output.
137
138 open(IN, "Unicode.tables/UnicodeData.txt") ||
139 die "Can't open Unicode.tables/UnicodeData.txt: $!\n";
140
141 while (<IN>)
142 {
143 @fields = split /;/;
144
145 $cp = hex($fields[0]);
146 $gc = $fields[2];
147 $uc = $fields[12];
148 $lc = $fields[13];
149
150 # If this line has no "other case" data, it might be the start or end of
151 # a range, either one that is explicit in the data, or one that we can
152 # create by scanning forwards.
153
154 if ($uc eq "" && $lc eq "")
155 {
156 if ($fields[1] =~ /First>$/)
157 {
158 $_ = <IN>;
159 @fields = split /;/;
160 die "First not followed by Last\n", if $fields[1] !~ /Last>$/;
161 die "First and last have different categories\n",
162 if $gc ne $fields[2];
163 outrange($cp, hex($fields[0]), $gc);
164 }
165
166 else
167 {
168 my($ncp) = $cp + 1;
169 while (<IN>)
170 {
171 @fields = split /;/;
172 last if (hex($fields[0]) != $ncp ||
173 $fields[2] ne $gc ||
174 $fields[12] ne "" ||
175 $fields[13] ne "");
176
177 $ncp++;
178 }
179
180 $ncp--;
181 outrange($cp, $ncp, $gc);
182 redo if defined $_; # Reprocess terminating line
183 }
184 }
185
186 # If there is an "other case" character, we output a single-char line
187
188 else
189 {
190 my($co) = (hex(($uc eq "")? $lc : $uc) - $cp) & 0xffff;
191 printf " { 0x%08x, 0x%08x },\n",
192 $cp | (script($cp) << 24), (category($gc) << 26) | $co;
193 }
194 }
195
196 close(IN);
197
198 # Final boilerplate
199
200 print "};\n";
201
202 # End

Properties

Name Value
svn:executable *

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12