/[pcre]/code/trunk/maintain/Builducptable
ViewVC logotype

Contents of /code/trunk/maintain/Builducptable

Parent Directory Parent Directory | Revision Log Revision Log


Revision 105 - (hide annotations) (download)
Tue Mar 6 16:32:53 2007 UTC (7 years, 7 months ago) by ph10
File size: 4871 byte(s)
Mark ucp_table (in ucptable.h) and ucp_gentype (in pcre_ucp_searchfuncs.c)
as "const".

1 ph10 97 #! /usr/bin/perl -w
2    
3 ph10 98 # This is a Perl script to create the table of character properties. For
4     # information on the format, see ucpinternal.h. The Unicode files are expected
5     # to be in Unicode.tables/{Scripts,UnicodeData}.txt. The ../ucp.h file is also
6     # required. The table is written to the standard output.
7 ph10 97
8     # The script is rather slow because it just searches linearly through the
9     # Scripts data in order to find the script for each character or character
10     # range. It could be made faster by sorting that data, or something, but hey,
11     # it is only ever run once in a blue moon.
12    
13     # Subroutine: Given a character number, return the script number. The
14     # Scripts.txt file has been read into an array, keeping just the codepoints
15     # and the script name. The lines are in one of two formats:
16     #
17     # xxxx name
18     # xxxx..yyyy name
19     #
20     # where xxxx and yyyy are code points.
21    
22     sub script{
23     my($n) = $_[0];
24     foreach $line (@scriptlist)
25     {
26     my($a,$z,$s);
27    
28     if ($line =~ /\.\./)
29     {
30     ($a,$z,$s) = $line =~ /^([^\.]+)\.\.(\S+)\s+(.*)/;
31     }
32     else
33     {
34     ($a,$s) = $line =~ /^(\S+)\s+(.*)/;
35     $z = $a;
36     }
37    
38     die "Problem on this scripts data line:\n$line"
39     if (!defined $a || !defined $z);
40    
41     if ($n >= hex($a) && $n <= hex($z))
42     {
43     my($x) = $scriptnum{$s};
44     return $x if defined $x;
45     die "Can't find script number for $s\n";
46     }
47     }
48    
49     # All code points not explicitly listed are "Common"
50    
51     return $scriptnum{"Common"};
52     }
53    
54    
55     # Subroutine: given a category name, return its number
56    
57     sub category {
58     my($x) = $gencat{$_[0]};
59     return $x if defined $x;
60     die "Can't find number for general category $_[0]\n";
61     }
62    
63    
64     # Subroutine: output an entry for a range, unless it isn't really a range,
65     # in which case just output a single entry.
66    
67     sub outrange{
68     my($cp,$ncp,$gc) = @_;
69     my($flag) = ($cp != $ncp)? 0x00800000 : 0;
70     printf " { 0x%08x, 0x%08x },\n",
71     $cp | $flag | (script($cp) << 24),
72     (category($gc) << 26) | $ncp - $cp;
73     }
74    
75    
76     # Entry point: An argument giving the Unicode version is required.
77    
78     die "Require a single argument, the Unicode version"
79     if ($#ARGV != 0);
80     $Uversion = shift @ARGV;
81    
82    
83     # Read in the Scripts.txt file, retaining only the code points
84     # and script names.
85    
86     open(IN, "Unicode.tables/Scripts.txt") ||
87     die "Can't open Unicode.tables/Scripts.txt: $!\n";
88    
89     while (<IN>)
90     {
91     next if !/^[0-9A-Z]/;
92     my($range,$name) = $_ =~ /^(\S+)\s*;\s*(\S+)/;
93     push @scriptlist, "$range $name";
94     }
95     close(IN);
96    
97    
98     # Now read the ucp.h file to get the values for the general categories
99     # and for the scripts.
100    
101 ph10 98 open(IN, "../ucp.h") || die "Can't open ../ucp.h: $!\n";
102 ph10 97
103     while (<IN>) { last if /^enum/; }
104     while (<IN>) { last if /^enum/; }
105    
106    
107     # The second enum are the general categories.
108    
109     $count = 0;
110     while (<IN>)
111     {
112     last if $_ !~ /^\s+ucp_(..)/;
113     $gencat{$1} = $count++;
114     }
115    
116     while (<IN>) { last if /^enum/; }
117    
118     # The third enum are script names.
119    
120     $count = 0;
121     while (<IN>)
122     {
123     last if $_ !~ /^\s+ucp_(\w+)/;
124     $scriptnum{$1} = $count++;
125     }
126    
127     close(IN);
128    
129     # Write out the initial boilerplace.
130    
131     print "/* This source module is automatically generated from the Unicode\n" .
132     "property table. See ucpinternal.h for a description of the layout.\n" .
133     "This version was made from the Unicode $Uversion tables. */\n\n" .
134 ph10 105 "static const cnode ucp_table[] = {\n";
135 ph10 97
136     # Now read the input file and generate the output.
137    
138     open(IN, "Unicode.tables/UnicodeData.txt") ||
139     die "Can't open Unicode.tables/UnicodeData.txt: $!\n";
140    
141     while (<IN>)
142     {
143     @fields = split /;/;
144    
145     $cp = hex($fields[0]);
146     $gc = $fields[2];
147     $uc = $fields[12];
148     $lc = $fields[13];
149    
150     # If this line has no "other case" data, it might be the start or end of
151     # a range, either one that is explicit in the data, or one that we can
152     # create by scanning forwards.
153    
154     if ($uc eq "" && $lc eq "")
155     {
156     if ($fields[1] =~ /First>$/)
157     {
158     $_ = <IN>;
159     @fields = split /;/;
160     die "First not followed by Last\n", if $fields[1] !~ /Last>$/;
161     die "First and last have different categories\n",
162     if $gc ne $fields[2];
163     outrange($cp, hex($fields[0]), $gc);
164     }
165    
166     else
167     {
168     my($ncp) = $cp + 1;
169     while (<IN>)
170     {
171     @fields = split /;/;
172     last if (hex($fields[0]) != $ncp ||
173     $fields[2] ne $gc ||
174     $fields[12] ne "" ||
175     $fields[13] ne "");
176    
177     $ncp++;
178     }
179    
180     $ncp--;
181     outrange($cp, $ncp, $gc);
182     redo if defined $_; # Reprocess terminating line
183     }
184     }
185    
186     # If there is an "other case" character, we output a single-char line
187    
188     else
189     {
190     my($co) = (hex(($uc eq "")? $lc : $uc) - $cp) & 0xffff;
191     printf " { 0x%08x, 0x%08x },\n",
192     $cp | (script($cp) << 24), (category($gc) << 26) | $co;
193     }
194     }
195    
196     close(IN);
197    
198     # Final boilerplate
199    
200     print "};\n";
201    
202     # End

Properties

Name Value
svn:executable *

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12