/[pcre]/code/trunk/maint/Builducptable
ViewVC logotype

Contents of /code/trunk/maint/Builducptable

Parent Directory Parent Directory | Revision Log Revision Log


Revision 277 - (hide annotations) (download)
Mon Nov 26 17:03:45 2007 UTC (5 years, 5 months ago) by ph10
File size: 5119 byte(s)
Fix incorrect script bug in Unicode character table.

1 ph10 97 #! /usr/bin/perl -w
2    
3 ph10 98 # This is a Perl script to create the table of character properties. For
4     # information on the format, see ucpinternal.h. The Unicode files are expected
5     # to be in Unicode.tables/{Scripts,UnicodeData}.txt. The ../ucp.h file is also
6     # required. The table is written to the standard output.
7 ph10 97
8     # The script is rather slow because it just searches linearly through the
9     # Scripts data in order to find the script for each character or character
10     # range. It could be made faster by sorting that data, or something, but hey,
11 ph10 277 # it is only ever run once in a blue moon. (It's even slower after I mended the
12     # "forgot to check for script number before amalgamation" bug, but even so,
13     # the effort of improving it isn't worth it.)
14 ph10 97
15     # Subroutine: Given a character number, return the script number. The
16     # Scripts.txt file has been read into an array, keeping just the codepoints
17     # and the script name. The lines are in one of two formats:
18     #
19     # xxxx name
20     # xxxx..yyyy name
21     #
22     # where xxxx and yyyy are code points.
23    
24     sub script{
25     my($n) = $_[0];
26     foreach $line (@scriptlist)
27     {
28     my($a,$z,$s);
29    
30     if ($line =~ /\.\./)
31     {
32     ($a,$z,$s) = $line =~ /^([^\.]+)\.\.(\S+)\s+(.*)/;
33     }
34     else
35     {
36     ($a,$s) = $line =~ /^(\S+)\s+(.*)/;
37     $z = $a;
38     }
39    
40     die "Problem on this scripts data line:\n$line"
41     if (!defined $a || !defined $z);
42    
43     if ($n >= hex($a) && $n <= hex($z))
44     {
45     my($x) = $scriptnum{$s};
46     return $x if defined $x;
47     die "Can't find script number for $s\n";
48     }
49     }
50    
51     # All code points not explicitly listed are "Common"
52    
53     return $scriptnum{"Common"};
54     }
55    
56    
57     # Subroutine: given a category name, return its number
58    
59     sub category {
60     my($x) = $gencat{$_[0]};
61     return $x if defined $x;
62     die "Can't find number for general category $_[0]\n";
63     }
64    
65    
66     # Subroutine: output an entry for a range, unless it isn't really a range,
67     # in which case just output a single entry.
68    
69     sub outrange{
70     my($cp,$ncp,$gc) = @_;
71     my($flag) = ($cp != $ncp)? 0x00800000 : 0;
72     printf " { 0x%08x, 0x%08x },\n",
73     $cp | $flag | (script($cp) << 24),
74     (category($gc) << 26) | $ncp - $cp;
75     }
76    
77    
78     # Entry point: An argument giving the Unicode version is required.
79    
80     die "Require a single argument, the Unicode version"
81     if ($#ARGV != 0);
82     $Uversion = shift @ARGV;
83    
84    
85     # Read in the Scripts.txt file, retaining only the code points
86     # and script names.
87    
88     open(IN, "Unicode.tables/Scripts.txt") ||
89     die "Can't open Unicode.tables/Scripts.txt: $!\n";
90    
91     while (<IN>)
92     {
93     next if !/^[0-9A-Z]/;
94     my($range,$name) = $_ =~ /^(\S+)\s*;\s*(\S+)/;
95     push @scriptlist, "$range $name";
96     }
97     close(IN);
98    
99    
100     # Now read the ucp.h file to get the values for the general categories
101     # and for the scripts.
102    
103 ph10 98 open(IN, "../ucp.h") || die "Can't open ../ucp.h: $!\n";
104 ph10 97
105     while (<IN>) { last if /^enum/; }
106     while (<IN>) { last if /^enum/; }
107    
108    
109     # The second enum are the general categories.
110    
111     $count = 0;
112     while (<IN>)
113     {
114     last if $_ !~ /^\s+ucp_(..)/;
115     $gencat{$1} = $count++;
116     }
117    
118     while (<IN>) { last if /^enum/; }
119    
120     # The third enum are script names.
121    
122     $count = 0;
123     while (<IN>)
124     {
125     last if $_ !~ /^\s+ucp_(\w+)/;
126     $scriptnum{$1} = $count++;
127     }
128    
129     close(IN);
130    
131     # Write out the initial boilerplace.
132    
133     print "/* This source module is automatically generated from the Unicode\n" .
134     "property table. See ucpinternal.h for a description of the layout.\n" .
135     "This version was made from the Unicode $Uversion tables. */\n\n" .
136 ph10 105 "static const cnode ucp_table[] = {\n";
137 ph10 97
138     # Now read the input file and generate the output.
139    
140     open(IN, "Unicode.tables/UnicodeData.txt") ||
141     die "Can't open Unicode.tables/UnicodeData.txt: $!\n";
142    
143     while (<IN>)
144     {
145     @fields = split /;/;
146    
147     $cp = hex($fields[0]);
148     $gc = $fields[2];
149     $uc = $fields[12];
150     $lc = $fields[13];
151    
152     # If this line has no "other case" data, it might be the start or end of
153     # a range, either one that is explicit in the data, or one that we can
154     # create by scanning forwards.
155    
156     if ($uc eq "" && $lc eq "")
157     {
158     if ($fields[1] =~ /First>$/)
159     {
160     $_ = <IN>;
161     @fields = split /;/;
162     die "First not followed by Last\n", if $fields[1] !~ /Last>$/;
163     die "First and last have different categories\n",
164     if $gc ne $fields[2];
165     outrange($cp, hex($fields[0]), $gc);
166     }
167    
168     else
169 ph10 277 {
170     my($startscript) = script($cp);
171 ph10 97 my($ncp) = $cp + 1;
172     while (<IN>)
173     {
174     @fields = split /;/;
175     last if (hex($fields[0]) != $ncp ||
176     $fields[2] ne $gc ||
177     $fields[12] ne "" ||
178 ph10 277 $fields[13] ne "" ||
179     script($ncp) != $startscript);
180 ph10 97
181     $ncp++;
182     }
183    
184     $ncp--;
185     outrange($cp, $ncp, $gc);
186     redo if defined $_; # Reprocess terminating line
187     }
188     }
189    
190     # If there is an "other case" character, we output a single-char line
191    
192     else
193     {
194     my($co) = (hex(($uc eq "")? $lc : $uc) - $cp) & 0xffff;
195     printf " { 0x%08x, 0x%08x },\n",
196     $cp | (script($cp) << 24), (category($gc) << 26) | $co;
197     }
198     }
199    
200     close(IN);
201    
202     # Final boilerplate
203    
204     print "};\n";
205    
206     # End

Properties

Name Value
svn:executable *

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12