/[pcre]/code/trunk/maintain/Builducptable
ViewVC logotype

Contents of /code/trunk/maintain/Builducptable

Parent Directory Parent Directory | Revision Log Revision Log


Revision 97 - (hide annotations) (download)
Mon Mar 5 12:36:47 2007 UTC (7 years, 1 month ago) by ph10
File size: 4866 byte(s)
Applied Bob and Daniel's patches to convert the build system to automake. Added 
the maintain directory, containing files that are used for maintenance, but are 
not distributed. This is an intermediate step.

1 ph10 97 #! /usr/bin/perl -w
2    
3     # This is a Perl script to create the table of character properties. For
4     # information on the format, see ucpinternal.h. The Unicode files are
5     # expected to be in maintain/Unicode.tables/{Scripts,UnicodeData}.txt. The
6     # ucp.h file is also required. The table is written to the standard output.
7    
8     # The script is rather slow because it just searches linearly through the
9     # Scripts data in order to find the script for each character or character
10     # range. It could be made faster by sorting that data, or something, but hey,
11     # it is only ever run once in a blue moon.
12    
13     # Subroutine: Given a character number, return the script number. The
14     # Scripts.txt file has been read into an array, keeping just the codepoints
15     # and the script name. The lines are in one of two formats:
16     #
17     # xxxx name
18     # xxxx..yyyy name
19     #
20     # where xxxx and yyyy are code points.
21    
22     sub script{
23     my($n) = $_[0];
24     foreach $line (@scriptlist)
25     {
26     my($a,$z,$s);
27    
28     if ($line =~ /\.\./)
29     {
30     ($a,$z,$s) = $line =~ /^([^\.]+)\.\.(\S+)\s+(.*)/;
31     }
32     else
33     {
34     ($a,$s) = $line =~ /^(\S+)\s+(.*)/;
35     $z = $a;
36     }
37    
38     die "Problem on this scripts data line:\n$line"
39     if (!defined $a || !defined $z);
40    
41     if ($n >= hex($a) && $n <= hex($z))
42     {
43     my($x) = $scriptnum{$s};
44     return $x if defined $x;
45     die "Can't find script number for $s\n";
46     }
47     }
48    
49     # All code points not explicitly listed are "Common"
50    
51     return $scriptnum{"Common"};
52     }
53    
54    
55     # Subroutine: given a category name, return its number
56    
57     sub category {
58     my($x) = $gencat{$_[0]};
59     return $x if defined $x;
60     die "Can't find number for general category $_[0]\n";
61     }
62    
63    
64     # Subroutine: output an entry for a range, unless it isn't really a range,
65     # in which case just output a single entry.
66    
67     sub outrange{
68     my($cp,$ncp,$gc) = @_;
69     my($flag) = ($cp != $ncp)? 0x00800000 : 0;
70     printf " { 0x%08x, 0x%08x },\n",
71     $cp | $flag | (script($cp) << 24),
72     (category($gc) << 26) | $ncp - $cp;
73     }
74    
75    
76     # Entry point: An argument giving the Unicode version is required.
77    
78     die "Require a single argument, the Unicode version"
79     if ($#ARGV != 0);
80     $Uversion = shift @ARGV;
81    
82    
83     # Read in the Scripts.txt file, retaining only the code points
84     # and script names.
85    
86     open(IN, "Unicode.tables/Scripts.txt") ||
87     die "Can't open Unicode.tables/Scripts.txt: $!\n";
88    
89     while (<IN>)
90     {
91     next if !/^[0-9A-Z]/;
92     my($range,$name) = $_ =~ /^(\S+)\s*;\s*(\S+)/;
93     push @scriptlist, "$range $name";
94     }
95     close(IN);
96    
97    
98     # Now read the ucp.h file to get the values for the general categories
99     # and for the scripts.
100    
101     open(IN, "ucp.h") || die "Can't open ucp.h: $!\n";
102    
103     while (<IN>) { last if /^enum/; }
104     while (<IN>) { last if /^enum/; }
105    
106    
107     # The second enum are the general categories.
108    
109     $count = 0;
110     while (<IN>)
111     {
112     last if $_ !~ /^\s+ucp_(..)/;
113     $gencat{$1} = $count++;
114     }
115    
116     while (<IN>) { last if /^enum/; }
117    
118     # The third enum are script names.
119    
120     $count = 0;
121     while (<IN>)
122     {
123     last if $_ !~ /^\s+ucp_(\w+)/;
124     $scriptnum{$1} = $count++;
125     }
126    
127     close(IN);
128    
129     # Write out the initial boilerplace.
130    
131     print "/* This source module is automatically generated from the Unicode\n" .
132     "property table. See ucpinternal.h for a description of the layout.\n" .
133     "This version was made from the Unicode $Uversion tables. */\n\n" .
134     "static cnode ucp_table[] = {\n";
135    
136     # Now read the input file and generate the output.
137    
138     open(IN, "Unicode.tables/UnicodeData.txt") ||
139     die "Can't open Unicode.tables/UnicodeData.txt: $!\n";
140    
141     while (<IN>)
142     {
143     @fields = split /;/;
144    
145     $cp = hex($fields[0]);
146     $gc = $fields[2];
147     $uc = $fields[12];
148     $lc = $fields[13];
149    
150     # If this line has no "other case" data, it might be the start or end of
151     # a range, either one that is explicit in the data, or one that we can
152     # create by scanning forwards.
153    
154     if ($uc eq "" && $lc eq "")
155     {
156     if ($fields[1] =~ /First>$/)
157     {
158     $_ = <IN>;
159     @fields = split /;/;
160     die "First not followed by Last\n", if $fields[1] !~ /Last>$/;
161     die "First and last have different categories\n",
162     if $gc ne $fields[2];
163     outrange($cp, hex($fields[0]), $gc);
164     }
165    
166     else
167     {
168     my($ncp) = $cp + 1;
169     while (<IN>)
170     {
171     @fields = split /;/;
172     last if (hex($fields[0]) != $ncp ||
173     $fields[2] ne $gc ||
174     $fields[12] ne "" ||
175     $fields[13] ne "");
176    
177     $ncp++;
178     }
179    
180     $ncp--;
181     outrange($cp, $ncp, $gc);
182     redo if defined $_; # Reprocess terminating line
183     }
184     }
185    
186     # If there is an "other case" character, we output a single-char line
187    
188     else
189     {
190     my($co) = (hex(($uc eq "")? $lc : $uc) - $cp) & 0xffff;
191     printf " { 0x%08x, 0x%08x },\n",
192     $cp | (script($cp) << 24), (category($gc) << 26) | $co;
193     }
194     }
195    
196     close(IN);
197    
198     # Final boilerplate
199    
200     print "};\n";
201    
202     # End

Properties

Name Value
svn:executable *

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12