| 1 |
#! /usr/bin/python
|
| 2 |
|
| 3 |
# Multistage table builder
|
| 4 |
# (c) Peter Kankowski, 2008
|
| 5 |
|
| 6 |
# This script was submitted to the PCRE project by Peter Kankowski as part of
|
| 7 |
# the upgrading of Unicode property support. The new code speeds up property
|
| 8 |
# matching many times. The script is for the use of PCRE maintainers, to
|
| 9 |
# generate the pcre_ucd.c file that contains a digested form of the Unicode
|
| 10 |
# data tables.
|
| 11 |
|
| 12 |
# The script should be run in the maint subdirectory, using the command
|
| 13 |
#
|
| 14 |
# ./MultiStage2.py >../pcre_ucd.c
|
| 15 |
#
|
| 16 |
# It requires three Unicode data tables, DerivedGeneralCategory.txt,
|
| 17 |
# Scripts.txt, and UnicodeData.txt, to be in the Unicode.tables subdirectory.
|
| 18 |
|
| 19 |
# Added with minor modifications:
|
| 20 |
# Added #! line at start
|
| 21 |
# Removed tabs
|
| 22 |
# Made it work with Python 2.4 by rewriting two statements that needed 2.5
|
| 23 |
# Consequent code tidy
|
| 24 |
# Adjusted file names to Unicode.tables directory
|
| 25 |
#
|
| 26 |
# Philip Hazel, 02 July 2008
|
| 27 |
|
| 28 |
|
| 29 |
import re
|
| 30 |
import string
|
| 31 |
import sys
|
| 32 |
|
| 33 |
MAX_UNICODE = 0x110000
|
| 34 |
NOTACHAR = 0xffffffff
|
| 35 |
|
| 36 |
# Parse a line of CaseFolding.txt, Scripts.txt, and DerivedGeneralCategory.txt file
|
| 37 |
def make_get_names(enum):
|
| 38 |
return lambda chardata: enum.index(chardata[1])
|
| 39 |
|
| 40 |
def get_case_folding_value(chardata):
|
| 41 |
if chardata[1] != 'C' and chardata[1] != 'S':
|
| 42 |
return 0
|
| 43 |
return int(chardata[2], 16) - int(chardata[0], 16)
|
| 44 |
|
| 45 |
def get_other_case(chardata):
|
| 46 |
if chardata[12] != '':
|
| 47 |
return int(chardata[12], 16) - int(chardata[0], 16)
|
| 48 |
if chardata[13] != '':
|
| 49 |
return int(chardata[13], 16) - int(chardata[0], 16)
|
| 50 |
return 0
|
| 51 |
|
| 52 |
# Read the whole table in memory
|
| 53 |
def read_table(file_name, get_value, default_value):
|
| 54 |
file = open(file_name, 'r')
|
| 55 |
table = [default_value] * MAX_UNICODE
|
| 56 |
for line in file:
|
| 57 |
line = re.sub(r'#.*', '', line)
|
| 58 |
chardata = map(string.strip, line.split(';'))
|
| 59 |
if len(chardata) <= 1:
|
| 60 |
continue
|
| 61 |
value = get_value(chardata)
|
| 62 |
|
| 63 |
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
|
| 64 |
char = int(m.group(1), 16)
|
| 65 |
# PH last = char if m.group(3) is None else int(m.group(3), 16)
|
| 66 |
if m.group(3) is None:
|
| 67 |
last = char
|
| 68 |
else:
|
| 69 |
last = int(m.group(3), 16)
|
| 70 |
for i in range(char, last + 1):
|
| 71 |
table[i] = value
|
| 72 |
file.close()
|
| 73 |
return table
|
| 74 |
|
| 75 |
# Get the smallest possible C language type for the values
|
| 76 |
def get_type_size(table):
|
| 77 |
type_size = [("uschar", 1), ("pcre_uint16", 2), ("pcre_uint32", 4),
|
| 78 |
("signed char", 1), ("pcre_int16", 2), ("pcre_int32", 4)]
|
| 79 |
limits = [(0, 255), (0, 65535), (0, 4294967295),
|
| 80 |
(-128, 127), (-32768, 32767), (-2147483648, 2147483647)]
|
| 81 |
minval = min(table)
|
| 82 |
maxval = max(table)
|
| 83 |
for num, (minlimit, maxlimit) in enumerate(limits):
|
| 84 |
if minlimit <= minval and maxval <= maxlimit:
|
| 85 |
return type_size[num]
|
| 86 |
else:
|
| 87 |
raise OverflowError, "Too large to fit into C types"
|
| 88 |
|
| 89 |
def get_tables_size(*tables):
|
| 90 |
total_size = 0
|
| 91 |
for table in tables:
|
| 92 |
type, size = get_type_size(table)
|
| 93 |
total_size += size * len(table)
|
| 94 |
return total_size
|
| 95 |
|
| 96 |
# Compress the table into the two stages
|
| 97 |
def compress_table(table, block_size):
|
| 98 |
blocks = {} # Dictionary for finding identical blocks
|
| 99 |
stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
|
| 100 |
stage2 = [] # Stage 2 table contains the blocks with property values
|
| 101 |
table = tuple(table)
|
| 102 |
for i in range(0, len(table), block_size):
|
| 103 |
block = table[i:i+block_size]
|
| 104 |
start = blocks.get(block)
|
| 105 |
if start is None:
|
| 106 |
# Allocate a new block
|
| 107 |
start = len(stage2) / block_size
|
| 108 |
stage2 += block
|
| 109 |
blocks[block] = start
|
| 110 |
stage1.append(start)
|
| 111 |
|
| 112 |
return stage1, stage2
|
| 113 |
|
| 114 |
# Print a table
|
| 115 |
def print_table(table, table_name, block_size = None):
|
| 116 |
type, size = get_type_size(table)
|
| 117 |
ELEMS_PER_LINE = 16
|
| 118 |
|
| 119 |
s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
|
| 120 |
if block_size:
|
| 121 |
s += ", block = %d" % block_size
|
| 122 |
print s + " */"
|
| 123 |
table = tuple(table)
|
| 124 |
if block_size is None:
|
| 125 |
fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
|
| 126 |
mult = MAX_UNICODE / len(table)
|
| 127 |
for i in range(0, len(table), ELEMS_PER_LINE):
|
| 128 |
print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))
|
| 129 |
else:
|
| 130 |
# PH fmt = "%3d," * (ELEMS_PER_LINE if block_size > ELEMS_PER_LINE else block_size) + "\n"
|
| 131 |
if block_size > ELEMS_PER_LINE:
|
| 132 |
fmt = "%3d," * ELEMS_PER_LINE + "\n"
|
| 133 |
fmt = fmt * (block_size / ELEMS_PER_LINE)
|
| 134 |
else:
|
| 135 |
fmt = "%3d," * block_size + "\n"
|
| 136 |
# PH if block_size > ELEMS_PER_LINE:
|
| 137 |
# PH fmt = fmt * (block_size / ELEMS_PER_LINE)
|
| 138 |
for i in range(0, len(table), block_size):
|
| 139 |
print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])
|
| 140 |
print "};\n"
|
| 141 |
|
| 142 |
# Extract the unique combinations of properties into records
|
| 143 |
def combine_tables(*tables):
|
| 144 |
records = {}
|
| 145 |
index = []
|
| 146 |
for t in zip(*tables):
|
| 147 |
i = records.get(t)
|
| 148 |
if i is None:
|
| 149 |
i = records[t] = len(records)
|
| 150 |
index.append(i)
|
| 151 |
return index, records
|
| 152 |
|
| 153 |
def print_records(records):
|
| 154 |
print 'const ucd_record ucd_records[] = { /* %d bytes */' % (len(records) * 4)
|
| 155 |
records = zip(records.keys(), records.values())
|
| 156 |
records.sort(None, lambda x: x[1])
|
| 157 |
for i, record in enumerate(records):
|
| 158 |
print (' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))
|
| 159 |
print '};\n'
|
| 160 |
|
| 161 |
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
| 162 |
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
|
| 163 |
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
|
| 164 |
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
|
| 165 |
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
|
| 166 |
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
|
| 167 |
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
|
| 168 |
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician']
|
| 169 |
|
| 170 |
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
|
| 171 |
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
|
| 172 |
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
|
| 173 |
|
| 174 |
|
| 175 |
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
|
| 176 |
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
|
| 177 |
other_case = read_table('Unicode.tables/UnicodeData.txt', get_other_case, 0)
|
| 178 |
# case_fold = read_table('CaseFolding.txt', get_case_folding_value, 0)
|
| 179 |
|
| 180 |
table, records = combine_tables(script, category, other_case)
|
| 181 |
|
| 182 |
# Find the optimum block size for the two-stage table
|
| 183 |
min_size = sys.maxint
|
| 184 |
for block_size in [2 ** i for i in range(5,10)]:
|
| 185 |
size = len(records) * 4
|
| 186 |
stage1, stage2 = compress_table(table, block_size)
|
| 187 |
size += get_tables_size(stage1, stage2)
|
| 188 |
#print "/* block size %5d => %5d bytes */" % (block_size, size)
|
| 189 |
if size < min_size:
|
| 190 |
min_size = size
|
| 191 |
min_stage1, min_stage2 = stage1, stage2
|
| 192 |
min_block_size = block_size
|
| 193 |
|
| 194 |
print "#ifdef HAVE_CONFIG_H"
|
| 195 |
print "#include \"config.h\""
|
| 196 |
print "#endif"
|
| 197 |
print "#include \"pcre_internal.h\""
|
| 198 |
print
|
| 199 |
print "/* Unicode character database. */"
|
| 200 |
print "/* This file was autogenerated by MultiStage2.py script. */"
|
| 201 |
print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
|
| 202 |
print_records(records)
|
| 203 |
print_table(min_stage1, 'ucd_stage1')
|
| 204 |
print_table(min_stage2, 'ucd_stage2', min_block_size)
|
| 205 |
print "#if UCD_BLOCK_SIZE != %d" % min_block_size
|
| 206 |
print "#error Please correct UCD_BLOCK_SIZE in pcre_internal.h"
|
| 207 |
print "#endif"
|
| 208 |
|
| 209 |
"""
|
| 210 |
|
| 211 |
# Three-stage tables:
|
| 212 |
|
| 213 |
# Find the optimum block size for 3-stage table
|
| 214 |
min_size = sys.maxint
|
| 215 |
for stage3_block in [2 ** i for i in range(2,6)]:
|
| 216 |
stage_i, stage3 = compress_table(table, stage3_block)
|
| 217 |
for stage2_block in [2 ** i for i in range(5,10)]:
|
| 218 |
size = len(records) * 4
|
| 219 |
stage1, stage2 = compress_table(stage_i, stage2_block)
|
| 220 |
size += get_tables_size(stage1, stage2, stage3)
|
| 221 |
# print "/* %5d / %3d => %5d bytes */" % (stage2_block, stage3_block, size)
|
| 222 |
if size < min_size:
|
| 223 |
min_size = size
|
| 224 |
min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3
|
| 225 |
min_stage2_block, min_stage3_block = stage2_block, stage3_block
|
| 226 |
|
| 227 |
print "/* Total size: %d bytes" % min_size */
|
| 228 |
print_records(records)
|
| 229 |
print_table(min_stage1, 'ucd_stage1')
|
| 230 |
print_table(min_stage2, 'ucd_stage2', min_stage2_block)
|
| 231 |
print_table(min_stage3, 'ucd_stage3', min_stage3_block)
|
| 232 |
|
| 233 |
"""
|