/[pcre]/code/trunk/maint/MultiStage2.py
ViewVC logotype

Contents of /code/trunk/maint/MultiStage2.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 942 - (show annotations) (download) (as text)
Tue Feb 28 14:50:31 2012 UTC (2 years, 7 months ago) by ph10
File MIME type: text/x-python
File size: 15590 byte(s)
Update for Unicode 6.1.0.

1 #! /usr/bin/python
2
3 # Multistage table builder
4 # (c) Peter Kankowski, 2008
5
6 ##############################################################################
7 # This script was submitted to the PCRE project by Peter Kankowski as part of
8 # the upgrading of Unicode property support. The new code speeds up property
9 # matching many times. The script is for the use of PCRE maintainers, to
10 # generate the pcre_ucd.c file that contains a digested form of the Unicode
11 # data tables.
12 #
13 # The script should be run in the maint subdirectory, using the command
14 #
15 # ./MultiStage2.py >../pcre_ucd.c
16 #
17 # It requires three Unicode data tables, DerivedGeneralCategory.txt,
18 # Scripts.txt, and UnicodeData.txt, to be in the Unicode.tables subdirectory.
19 # The first of these is found in the "extracted" subdirectory of the Unicode
20 # database (UCD) on the Unicode web site; the other two are directly in the
21 # UCD directory.
22 #
23 # Minor modifications made to this script:
24 # Added #! line at start
25 # Removed tabs
26 # Made it work with Python 2.4 by rewriting two statements that needed 2.5
27 # Consequent code tidy
28 # Adjusted data file names to take from the Unicode.tables directory
29 # Adjusted global table names by prefixing _pcre_.
30 # Commented out stuff relating to the casefolding table, which isn't used.
31 # Corrected size calculation
32 # Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
33 #
34 # The tables generated by this script are used by macros defined in
35 # pcre_internal.h. They look up Unicode character properties using short
36 # sequences of code that contains no branches, which makes for greater speed.
37 #
38 # Conceptually, there is a table of records (of type ucd_record), containing a
39 # script number, character type, and offset to the character's other case for
40 # every character. However, a real table covering all Unicode characters would
41 # be far too big. It can be efficiently compressed by observing that many
42 # characters have the same record, and many blocks of characters (taking 128
43 # characters in a block) have the same set of records as other blocks. This
44 # leads to a 2-stage lookup process.
45 #
46 # This script constructs three tables. The _pcre_ucd_records table contains
47 # one instance of every unique record that is required. The _pcre_ucd_stage1
48 # table is indexed by a character's block number, and yields what is in effect
49 # a "virtual" block number. The _pcre_ucd_stage2 table is a table of "virtual"
50 # blocks; each block is indexed by the offset of a character within its own
51 # block, and the result is the offset of the required record.
52 #
53 # Example: lowercase "a" (U+0061) is in block 0
54 # lookup 0 in stage1 table yields 0
55 # lookup 97 in the first table in stage2 yields 12
56 # record 12 is { 33, 5, -32 } (Latin, lowercase, upper is U+0041)
57 #
58 # All lowercase latin characters resolve to the same record.
59 #
60 # Example: hiragana letter A (U+3042) is in block 96 (0x60)
61 # lookup 96 in stage1 table yields 83
62 # lookup 66 in the 83rd table in stage2 yields 348
63 # record 348 is { 26, 7, 0 } (Hiragana, other letter, no other case)
64 #
65 # In these examples, no other blocks resolve to the same "virtual" block, as it
66 # happens, but plenty of other blocks do share "virtual" blocks.
67 #
68 # There is a fourth table, maintained by hand, which translates from the
69 # individual character types such as ucp_Cc to the general types like ucp_C.
70 #
71 # Philip Hazel, 03 July 2008
72 #
73 # 01-March-2010: Updated list of scripts for Unicode 5.2.0
74 # 30-April-2011: Updated list of scripts for Unicode 6.0.0
75 ##############################################################################
76
77
78 import re
79 import string
80 import sys
81
82 MAX_UNICODE = 0x110000
83 NOTACHAR = 0xffffffff
84
85 # Parse a line of CaseFolding.txt, Scripts.txt, and DerivedGeneralCategory.txt file
86 def make_get_names(enum):
87 return lambda chardata: enum.index(chardata[1])
88
89 #def get_case_folding_value(chardata):
90 # if chardata[1] != 'C' and chardata[1] != 'S':
91 # return 0
92 # return int(chardata[2], 16) - int(chardata[0], 16)
93
94 def get_other_case(chardata):
95 if chardata[12] != '':
96 return int(chardata[12], 16) - int(chardata[0], 16)
97 if chardata[13] != '':
98 return int(chardata[13], 16) - int(chardata[0], 16)
99 return 0
100
101 # Read the whole table in memory
102 def read_table(file_name, get_value, default_value):
103 file = open(file_name, 'r')
104 table = [default_value] * MAX_UNICODE
105 for line in file:
106 line = re.sub(r'#.*', '', line)
107 chardata = map(string.strip, line.split(';'))
108 if len(chardata) <= 1:
109 continue
110 value = get_value(chardata)
111 m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
112 char = int(m.group(1), 16)
113 if m.group(3) is None:
114 last = char
115 else:
116 last = int(m.group(3), 16)
117 for i in range(char, last + 1):
118 table[i] = value
119 file.close()
120 return table
121
122 # Get the smallest possible C language type for the values
123 def get_type_size(table):
124 type_size = [("pcre_uint8", 1), ("pcre_uint16", 2), ("pcre_uint32", 4),
125 ("signed char", 1), ("pcre_int16", 2), ("pcre_int32", 4)]
126 limits = [(0, 255), (0, 65535), (0, 4294967295),
127 (-128, 127), (-32768, 32767), (-2147483648, 2147483647)]
128 minval = min(table)
129 maxval = max(table)
130 for num, (minlimit, maxlimit) in enumerate(limits):
131 if minlimit <= minval and maxval <= maxlimit:
132 return type_size[num]
133 else:
134 raise OverflowError, "Too large to fit into C types"
135
136 def get_tables_size(*tables):
137 total_size = 0
138 for table in tables:
139 type, size = get_type_size(table)
140 total_size += size * len(table)
141 return total_size
142
143 # Compress the table into the two stages
144 def compress_table(table, block_size):
145 blocks = {} # Dictionary for finding identical blocks
146 stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
147 stage2 = [] # Stage 2 table contains the blocks with property values
148 table = tuple(table)
149 for i in range(0, len(table), block_size):
150 block = table[i:i+block_size]
151 start = blocks.get(block)
152 if start is None:
153 # Allocate a new block
154 start = len(stage2) / block_size
155 stage2 += block
156 blocks[block] = start
157 stage1.append(start)
158
159 return stage1, stage2
160
161 # Print a table
162 def print_table(table, table_name, block_size = None):
163 type, size = get_type_size(table)
164 ELEMS_PER_LINE = 16
165
166 s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
167 if block_size:
168 s += ", block = %d" % block_size
169 print s + " */"
170 table = tuple(table)
171 if block_size is None:
172 fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
173 mult = MAX_UNICODE / len(table)
174 for i in range(0, len(table), ELEMS_PER_LINE):
175 print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))
176 else:
177 if block_size > ELEMS_PER_LINE:
178 el = ELEMS_PER_LINE
179 else:
180 el = block_size
181 fmt = "%3d," * el + "\n"
182 if block_size > ELEMS_PER_LINE:
183 fmt = fmt * (block_size / ELEMS_PER_LINE)
184 for i in range(0, len(table), block_size):
185 print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])
186 print "};\n"
187
188 # Extract the unique combinations of properties into records
189 def combine_tables(*tables):
190 records = {}
191 index = []
192 for t in zip(*tables):
193 i = records.get(t)
194 if i is None:
195 i = records[t] = len(records)
196 index.append(i)
197 return index, records
198
199 def get_record_size_struct(records):
200 size = 0
201 structure = '/* When recompiling tables with a new Unicode version,\n' + \
202 'please check types in the structure definition from pcre_internal.h:\ntypedef struct {\n'
203 for i in range(len(records[0])):
204 record_slice = map(lambda record: record[i], records)
205 slice_type, slice_size = get_type_size(record_slice)
206 # add padding: round up to the nearest power of slice_size
207 size = (size + slice_size - 1) & -slice_size
208 size += slice_size
209 structure += '%s property_%d;\n' % (slice_type, i)
210
211 # round up to the first item of the next structure in array
212 record_slice = map(lambda record: record[0], records)
213 slice_type, slice_size = get_type_size(record_slice)
214 size = (size + slice_size - 1) & -slice_size
215
216 structure += '} ucd_record; */\n\n'
217 return size, structure
218
219 def test_record_size():
220 tests = [ \
221 ( [(3,), (6,), (6,), (1,)], 1 ), \
222 ( [(300,), (600,), (600,), (100,)], 2 ), \
223 ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
224 ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
225 ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
226 ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
227 ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
228 ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
229 ]
230 for test in tests:
231 size, struct = get_record_size_struct(test[0])
232 assert(size == test[1])
233 #print struct
234
235 def print_records(records, record_size):
236 print 'const ucd_record PRIV(ucd_records)[] = { ' + \
237 '/* %d bytes, record size %d */' % (len(records) * record_size, record_size)
238 records = zip(records.keys(), records.values())
239 records.sort(None, lambda x: x[1])
240 for i, record in enumerate(records):
241 print (' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))
242 print '};\n'
243
244 script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
245 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
246 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
247 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
248 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
249 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
250 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
251 # New for Unicode 5.0
252 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
253 # New for Unicode 5.1
254 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
255 # New for Unicode 5.2
256 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
257 'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
258 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
259 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
260 # New for Unicode 6.0.0
261 'Batak', 'Brahmi', 'Mandaic', \
262 # New for Unicode 6.1.0
263 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri'
264 ]
265
266 category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
267 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
268 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
269
270 test_record_size()
271
272 script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
273 category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
274 other_case = read_table('Unicode.tables/UnicodeData.txt', get_other_case, 0)
275 # case_fold = read_table('CaseFolding.txt', get_case_folding_value, 0)
276
277 table, records = combine_tables(script, category, other_case)
278 record_size, record_struct = get_record_size_struct(records.keys())
279
280 # Find the optimum block size for the two-stage table
281 min_size = sys.maxint
282 for block_size in [2 ** i for i in range(5,10)]:
283 size = len(records) * record_size
284 stage1, stage2 = compress_table(table, block_size)
285 size += get_tables_size(stage1, stage2)
286 #print "/* block size %5d => %5d bytes */" % (block_size, size)
287 if size < min_size:
288 min_size = size
289 min_stage1, min_stage2 = stage1, stage2
290 min_block_size = block_size
291
292 print "#ifdef HAVE_CONFIG_H"
293 print "#include \"config.h\""
294 print "#endif"
295 print
296 print "#include \"pcre_internal.h\""
297 print
298 print "/* Unicode character database. */"
299 print "/* This file was autogenerated by the MultiStage2.py script. */"
300 print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
301 print
302 print "/* The tables herein are needed only when UCP support is built */"
303 print "/* into PCRE. This module should not be referenced otherwise, so */"
304 print "/* it should not matter whether it is compiled or not. However */"
305 print "/* a comment was received about space saving - maybe the guy linked */"
306 print "/* all the modules rather than using a library - so we include a */"
307 print "/* condition to cut out the tables when not needed. But don't leave */"
308 print "/* a totally empty module because some compilers barf at that. */"
309 print "/* Instead, just supply small dummy tables. */"
310 print
311 print "#ifndef SUPPORT_UCP"
312 print "const ucd_record PRIV(ucd_records)[] = {{0,0,0 }};"
313 print "const pcre_uint8 PRIV(ucd_stage1)[] = {0};"
314 print "const pcre_uint16 PRIV(ucd_stage2)[] = {0};"
315 print "#else"
316 print
317 print record_struct
318 print_records(records, record_size)
319 print_table(min_stage1, 'PRIV(ucd_stage1)')
320 print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
321 print "#if UCD_BLOCK_SIZE != %d" % min_block_size
322 print "#error Please correct UCD_BLOCK_SIZE in pcre_internal.h"
323 print "#endif"
324 print "#endif /* SUPPORT_UCP */"
325
326 """
327
328 # Three-stage tables:
329
330 # Find the optimum block size for 3-stage table
331 min_size = sys.maxint
332 for stage3_block in [2 ** i for i in range(2,6)]:
333 stage_i, stage3 = compress_table(table, stage3_block)
334 for stage2_block in [2 ** i for i in range(5,10)]:
335 size = len(records) * 4
336 stage1, stage2 = compress_table(stage_i, stage2_block)
337 size += get_tables_size(stage1, stage2, stage3)
338 # print "/* %5d / %3d => %5d bytes */" % (stage2_block, stage3_block, size)
339 if size < min_size:
340 min_size = size
341 min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3
342 min_stage2_block, min_stage3_block = stage2_block, stage3_block
343
344 print "/* Total size: %d bytes" % min_size */
345 print_records(records)
346 print_table(min_stage1, 'ucd_stage1')
347 print_table(min_stage2, 'ucd_stage2', min_stage2_block)
348 print_table(min_stage3, 'ucd_stage3', min_stage3_block)
349
350 """

Properties

Name Value
svn:executable *

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12