| 25 |
# Adjusted data file names to take from the Unicode.tables directory |
# Adjusted data file names to take from the Unicode.tables directory |
| 26 |
# Adjusted global table names by prefixing _pcre_. |
# Adjusted global table names by prefixing _pcre_. |
| 27 |
# Commented out stuff relating to the casefolding table, which isn't used. |
# Commented out stuff relating to the casefolding table, which isn't used. |
| 28 |
|
# Corrected size calculation |
| 29 |
# |
# |
| 30 |
# The tables generated by this script are used by macros defined in |
# The tables generated by this script are used by macros defined in |
| 31 |
# pcre_internal.h. They look up Unicode character properties using short |
# pcre_internal.h. They look up Unicode character properties using short |
| 190 |
index.append(i) |
index.append(i) |
| 191 |
return index, records |
return index, records |
| 192 |
|
|
| 193 |
def print_records(records): |
def get_record_size_struct(records):
    """Estimate the C size of one ucd_record and describe its layout.

    records must be a non-empty sequence of equal-length tuples, one value
    per field.  For every field position the values of that field across
    all records are handed to get_type_size(), which yields a pair
    (C type name, size in bytes) for the field.

    Returns a pair (size, structure):
      size      -- bytes occupied by one record when laid out as a C struct
                   in an array, including inter-field and trailing padding
      structure -- a C comment showing the typedef implied by the chosen
                   field types, for comparison with pcre_internal.h
    """
    size = 0
    # Strictest field alignment seen so far; fields are assumed to be
    # aligned to their own size.
    alignment = 1
    structure = '/* When recompiling tables with a new Unicode version,\n' + \
        'please check types in the structure definition from pcre_internal.h:\ntypedef struct {\n'
    for i in range(len(records[0])):
        # A real list (not a lazy map object) so the helper can scan it freely.
        record_slice = [record[i] for record in records]
        slice_type, slice_size = get_type_size(record_slice)
        # Add padding: round up to the next multiple of slice_size.
        # The mask trick is only valid when slice_size is a power of two.
        size = (size + slice_size - 1) & -slice_size
        size += slice_size
        alignment = max(alignment, slice_size)
        structure += '%s property_%d;\n' % (slice_type, i)

    # Trailing padding: in an array the next record must start on a boundary
    # that suits the most strictly aligned field, not merely the first one
    # (e.g. uint8, int32, uint8 occupies 12 bytes, not 9).
    size = (size + alignment - 1) & -alignment

    structure += '} ucd_record; */\n\n'
    return size, structure
| 212 |
|
|
| 213 |
|
def test_record_size():
    """Sanity-check get_record_size_struct() against hand-computed sizes.

    Every case pairs a small list of sample records with the record size
    (in bytes) that is expected for it; a mismatch trips an assertion.
    """
    cases = [
        ([(3,), (6,), (6,), (1,)], 1),
        ([(300,), (600,), (600,), (100,)], 2),
        ([(25, 3), (6, 6), (34, 6), (68, 1)], 2),
        ([(300, 3), (6, 6), (340, 6), (690, 1)], 4),
        ([(3, 300), (6, 6), (6, 340), (1, 690)], 4),
        ([(300, 300), (6, 6), (6, 340), (1, 690)], 4),
        ([(3, 100000), (6, 6), (6, 123456), (1, 690)], 8),
        ([(100000, 300), (6, 6), (123456, 6), (1, 690)], 8),
    ]
    for sample_records, expected_size in cases:
        # The layout text is not checked here, only the computed size.
        actual_size, _layout = get_record_size_struct(sample_records)
        assert(actual_size == expected_size)
| 228 |
|
|
| 229 |
|
def print_records(records, record_size): |
| 230 |
|
print 'const ucd_record _pcre_ucd_records[] = { ' + \ |
| 231 |
|
'/* %d bytes, record size %d */' % (len(records) * record_size, record_size) |
| 232 |
records = zip(records.keys(), records.values()) |
records = zip(records.keys(), records.values()) |
| 233 |
records.sort(None, lambda x: x[1]) |
records.sort(None, lambda x: x[1]) |
| 234 |
for i, record in enumerate(records): |
for i, record in enumerate(records): |
| 251 |
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps', |
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps', |
| 252 |
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ] |
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ] |
| 253 |
|
|
| 254 |
|
test_record_size() |
| 255 |
|
|
| 256 |
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common')) |
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common')) |
| 257 |
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn')) |
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn')) |
| 259 |
# case_fold = read_table('CaseFolding.txt', get_case_folding_value, 0) |
# case_fold = read_table('CaseFolding.txt', get_case_folding_value, 0) |
| 260 |
|
|
| 261 |
table, records = combine_tables(script, category, other_case) |
table, records = combine_tables(script, category, other_case) |
| 262 |
|
record_size, record_struct = get_record_size_struct(records.keys()) |
| 263 |
|
|
| 264 |
# Find the optimum block size for the two-stage table |
# Find the optimum block size for the two-stage table |
| 265 |
min_size = sys.maxint |
min_size = sys.maxint |
| 266 |
for block_size in [2 ** i for i in range(5,10)]: |
for block_size in [2 ** i for i in range(5,10)]: |
| 267 |
size = len(records) * 4 |
size = len(records) * record_size |
| 268 |
stage1, stage2 = compress_table(table, block_size) |
stage1, stage2 = compress_table(table, block_size) |
| 269 |
size += get_tables_size(stage1, stage2) |
size += get_tables_size(stage1, stage2) |
| 270 |
#print "/* block size %5d => %5d bytes */" % (block_size, size) |
#print "/* block size %5d => %5d bytes */" % (block_size, size) |
| 281 |
print "/* Unicode character database. */" |
print "/* Unicode character database. */" |
| 282 |
print "/* This file was autogenerated by the MultiStage2.py script. */" |
print "/* This file was autogenerated by the MultiStage2.py script. */" |
| 283 |
print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size) |
print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size) |
| 284 |
print_records(records) |
print record_struct |
| 285 |
|
print_records(records, record_size) |
| 286 |
print_table(min_stage1, '_pcre_ucd_stage1') |
print_table(min_stage1, '_pcre_ucd_stage1') |
| 287 |
print_table(min_stage2, '_pcre_ucd_stage2', min_block_size) |
print_table(min_stage2, '_pcre_ucd_stage2', min_block_size) |
| 288 |
print "#if UCD_BLOCK_SIZE != %d" % min_block_size |
print "#if UCD_BLOCK_SIZE != %d" % min_block_size |