/[pcre]/code/trunk/maintain/Builducptable
ViewVC logotype

Contents of /code/trunk/maintain/Builducptable

Parent Directory Parent Directory | Revision Log Revision Log


Revision 97 - (show annotations) (download)
Mon Mar 5 12:36:47 2007 UTC (7 years, 4 months ago) by ph10
File size: 4866 byte(s)
Applied Bob and Daniel's patches to convert the build system to automake. Added 
the maintain directory, containing files that are used for maintenance, but are 
not distributed. This is an intermediate step.

1 #! /usr/bin/perl -w
2
3 # This is a Perl script to create the table of character properties. For
4 # information on the format, see ucpinternal.h. The Unicode files are
5 # expected to be in maintain/Unicode.tables/{Scripts,UnicodeData}.txt. The
6 # ucp.h file is also required. The table is written to the standard output.
7
8 # The script is rather slow because it just searches linearly through the
9 # Scripts data in order to find the script for each character or character
10 # range. It could be made faster by sorting that data, or something, but hey,
11 # it is only ever run once in a blue moon.
12
13 # Subroutine: Given a character number, return the script number. The
14 # Scripts.txt file has been read into an array, keeping just the codepoints
15 # and the script name. The lines are in one of two formats:
16 #
17 # xxxx name
18 # xxxx..yyyy name
19 #
20 # where xxxx and yyyy are code points.
21
22 sub script{
23 my($n) = $_[0];
24 foreach $line (@scriptlist)
25 {
26 my($a,$z,$s);
27
28 if ($line =~ /\.\./)
29 {
30 ($a,$z,$s) = $line =~ /^([^\.]+)\.\.(\S+)\s+(.*)/;
31 }
32 else
33 {
34 ($a,$s) = $line =~ /^(\S+)\s+(.*)/;
35 $z = $a;
36 }
37
38 die "Problem on this scripts data line:\n$line"
39 if (!defined $a || !defined $z);
40
41 if ($n >= hex($a) && $n <= hex($z))
42 {
43 my($x) = $scriptnum{$s};
44 return $x if defined $x;
45 die "Can't find script number for $s\n";
46 }
47 }
48
49 # All code points not explicitly listed are "Common"
50
51 return $scriptnum{"Common"};
52 }
53
54
55 # Subroutine: given a category name, return its number
56
57 sub category {
58 my($x) = $gencat{$_[0]};
59 return $x if defined $x;
60 die "Can't find number for general category $_[0]\n";
61 }
62
63
64 # Subroutine: output an entry for a range, unless it isn't really a range,
65 # in which case just output a single entry.
66
67 sub outrange{
68 my($cp,$ncp,$gc) = @_;
69 my($flag) = ($cp != $ncp)? 0x00800000 : 0;
70 printf " { 0x%08x, 0x%08x },\n",
71 $cp | $flag | (script($cp) << 24),
72 (category($gc) << 26) | $ncp - $cp;
73 }
74
75
76 # Entry point: An argument giving the Unicode version is required.
77
78 die "Require a single argument, the Unicode version"
79 if ($#ARGV != 0);
80 $Uversion = shift @ARGV;
81
82
83 # Read in the Scripts.txt file, retaining only the code points
84 # and script names.
85
86 open(IN, "Unicode.tables/Scripts.txt") ||
87 die "Can't open Unicode.tables/Scripts.txt: $!\n";
88
89 while (<IN>)
90 {
91 next if !/^[0-9A-Z]/;
92 my($range,$name) = $_ =~ /^(\S+)\s*;\s*(\S+)/;
93 push @scriptlist, "$range $name";
94 }
95 close(IN);
96
97
98 # Now read the ucp.h file to get the values for the general categories
99 # and for the scripts.
100
101 open(IN, "ucp.h") || die "Can't open ucp.h: $!\n";
102
103 while (<IN>) { last if /^enum/; }
104 while (<IN>) { last if /^enum/; }
105
106
107 # The second enum are the general categories.
108
109 $count = 0;
110 while (<IN>)
111 {
112 last if $_ !~ /^\s+ucp_(..)/;
113 $gencat{$1} = $count++;
114 }
115
116 while (<IN>) { last if /^enum/; }
117
118 # The third enum are script names.
119
120 $count = 0;
121 while (<IN>)
122 {
123 last if $_ !~ /^\s+ucp_(\w+)/;
124 $scriptnum{$1} = $count++;
125 }
126
127 close(IN);
128
129 # Write out the initial boilerplace.
130
131 print "/* This source module is automatically generated from the Unicode\n" .
132 "property table. See ucpinternal.h for a description of the layout.\n" .
133 "This version was made from the Unicode $Uversion tables. */\n\n" .
134 "static cnode ucp_table[] = {\n";
135
136 # Now read the input file and generate the output.
137
138 open(IN, "Unicode.tables/UnicodeData.txt") ||
139 die "Can't open Unicode.tables/UnicodeData.txt: $!\n";
140
141 while (<IN>)
142 {
143 @fields = split /;/;
144
145 $cp = hex($fields[0]);
146 $gc = $fields[2];
147 $uc = $fields[12];
148 $lc = $fields[13];
149
150 # If this line has no "other case" data, it might be the start or end of
151 # a range, either one that is explicit in the data, or one that we can
152 # create by scanning forwards.
153
154 if ($uc eq "" && $lc eq "")
155 {
156 if ($fields[1] =~ /First>$/)
157 {
158 $_ = <IN>;
159 @fields = split /;/;
160 die "First not followed by Last\n", if $fields[1] !~ /Last>$/;
161 die "First and last have different categories\n",
162 if $gc ne $fields[2];
163 outrange($cp, hex($fields[0]), $gc);
164 }
165
166 else
167 {
168 my($ncp) = $cp + 1;
169 while (<IN>)
170 {
171 @fields = split /;/;
172 last if (hex($fields[0]) != $ncp ||
173 $fields[2] ne $gc ||
174 $fields[12] ne "" ||
175 $fields[13] ne "");
176
177 $ncp++;
178 }
179
180 $ncp--;
181 outrange($cp, $ncp, $gc);
182 redo if defined $_; # Reprocess terminating line
183 }
184 }
185
186 # If there is an "other case" character, we output a single-char line
187
188 else
189 {
190 my($co) = (hex(($uc eq "")? $lc : $uc) - $cp) & 0xffff;
191 printf " { 0x%08x, 0x%08x },\n",
192 $cp | (script($cp) << 24), (category($gc) << 26) | $co;
193 }
194 }
195
196 close(IN);
197
198 # Final boilerplate
199
200 print "};\n";
201
202 # End

Properties

Name Value
svn:executable *

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12