| 1 |
#! /usr/bin/python
|
| 2 |
|
| 3 |
# Multistage table builder
|
| 4 |
# (c) Peter Kankowski, 2008
|
| 5 |
|
| 6 |
##############################################################################
|
| 7 |
# This script was submitted to the PCRE project by Peter Kankowski as part of
|
| 8 |
# the upgrading of Unicode property support. The new code speeds up property
|
| 9 |
# matching many times. The script is for the use of PCRE maintainers, to
|
| 10 |
# generate the pcre_ucd.c file that contains a digested form of the Unicode
|
| 11 |
# data tables.
|
| 12 |
#
|
| 13 |
# The script should be run in the maint subdirectory, using the command
|
| 14 |
#
|
| 15 |
# ./MultiStage2.py >../pcre_ucd.c
|
| 16 |
#
|
| 17 |
# It requires four Unicode data tables, DerivedGeneralCategory.txt,
|
| 18 |
# GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
|
| 19 |
# Unicode.tables subdirectory. The first of these is found in the "extracted"
|
| 20 |
# subdirectory of the Unicode database (UCD) on the Unicode web site; the
|
| 21 |
# second is in the "auxiliary" subdirectory; the other two are directly in the
|
| 22 |
# UCD directory.
|
| 23 |
#
|
| 24 |
# Minor modifications made to this script:
|
| 25 |
# Added #! line at start
|
| 26 |
# Removed tabs
|
| 27 |
# Made it work with Python 2.4 by rewriting two statements that needed 2.5
|
| 28 |
# Consequent code tidy
|
| 29 |
# Adjusted data file names to take from the Unicode.tables directory
|
| 30 |
# Adjusted global table names by prefixing _pcre_.
|
| 31 |
# Commented out stuff relating to the casefolding table, which isn't used;
|
| 32 |
# removed completely in 2012.
|
| 33 |
# Corrected size calculation
|
| 34 |
# Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
|
| 35 |
#
|
| 36 |
# Major modifications made to this script:
|
| 37 |
# Added code to add a grapheme break property field to records.
|
| 38 |
#
|
| 39 |
# Added code to search for sets of more than two characters that must match
|
| 40 |
# each other caselessly. A new table is output containing these sets, and
|
| 41 |
# offsets into the table are added to the main output records. This new
|
| 42 |
# code scans CaseFolding.txt instead of UnicodeData.txt.
|
| 43 |
#
|
| 44 |
# The main tables generated by this script are used by macros defined in
|
| 45 |
# pcre_internal.h. They look up Unicode character properties using short
|
| 46 |
# sequences of code that contains no branches, which makes for greater speed.
|
| 47 |
#
|
| 48 |
# Conceptually, there is a table of records (of type ucd_record), containing a
|
| 49 |
# script number, character type, grapheme break type, offset to caseless
|
| 50 |
# matching set, and offset to the character's other case for every character.
|
| 51 |
# However, a real table covering all Unicode characters would be far too big.
|
| 52 |
# It can be efficiently compressed by observing that many characters have the
|
| 53 |
# same record, and many blocks of characters (taking 128 characters in a block)
|
| 54 |
# have the same set of records as other blocks. This leads to a 2-stage lookup
|
| 55 |
# process.
|
| 56 |
#
|
| 57 |
# This script constructs four tables. The ucd_caseless_sets table contains
|
| 58 |
# lists of characters that all match each other caselessly. Each list is
|
| 59 |
# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
|
| 60 |
# any valid character. The first list is empty; this is used for characters
|
| 61 |
# that are not part of any list.
|
| 62 |
#
|
| 63 |
# The ucd_records table contains one instance of every unique record that is
|
| 64 |
# required. The ucd_stage1 table is indexed by a character's block number, and
|
| 65 |
# yields what is in effect a "virtual" block number. The ucd_stage2 table is a
|
| 66 |
# table of "virtual" blocks; each block is indexed by the offset of a character
|
| 67 |
# within its own block, and the result is the offset of the required record.
|
| 68 |
#
|
| 69 |
# Example: lowercase "a" (U+0061) is in block 0
|
| 70 |
# lookup 0 in stage1 table yields 0
|
| 71 |
# lookup 97 in the first table in stage2 yields 16
|
| 72 |
# record 17 is { 33, 5, 11, 0, -32 }
|
| 73 |
# 33 = ucp_Latin => Latin script
|
| 74 |
# 5 = ucp_Ll => Lower case letter
|
| 75 |
# 11 = ucp_gbOther => Grapheme break property "Other"
|
| 76 |
# 0 => not part of a caseless set
|
| 77 |
# -32 => Other case is U+0041
|
| 78 |
#
|
| 79 |
# Almost all lowercase latin characters resolve to the same record. One or two
|
| 80 |
# are different because they are part of a multi-character caseless set (for
|
| 81 |
# example, k, K and the Kelvin symbol are such a set).
|
| 82 |
#
|
| 83 |
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
|
| 84 |
# lookup 96 in stage1 table yields 88
|
| 85 |
# lookup 66 in the 88th table in stage2 yields 467
|
| 86 |
# record 470 is { 26, 7, 11, 0, 0 }
|
| 87 |
# 26 = ucp_Hiragana => Hiragana script
|
| 88 |
# 7 = ucp_Lo => Other letter
|
| 89 |
# 11 = ucp_gbOther => Grapheme break property "Other"
|
| 90 |
# 0 => not part of a caseless set
|
| 91 |
# 0 => No other case
|
| 92 |
#
|
| 93 |
# In these examples, no other blocks resolve to the same "virtual" block, as it
|
| 94 |
# happens, but plenty of other blocks do share "virtual" blocks.
|
| 95 |
#
|
| 96 |
# There is a fourth table, maintained by hand, which translates from the
|
| 97 |
# individual character types such as ucp_Cc to the general types like ucp_C.
|
| 98 |
#
|
| 99 |
# Philip Hazel, 03 July 2008
|
| 100 |
#
|
| 101 |
# 01-March-2010: Updated list of scripts for Unicode 5.2.0
|
| 102 |
# 30-April-2011: Updated list of scripts for Unicode 6.0.0
|
| 103 |
# July-2012: Updated list of scripts for Unicode 6.1.0
|
| 104 |
# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
|
| 105 |
# field in the record to hold the value. Luckily, the
|
| 106 |
# structure had a hole in it, so the resulting table is
|
| 107 |
# not much bigger than before.
|
| 108 |
# 18-September-2012: Added code for multiple caseless sets. This uses the
|
| 109 |
# final hole in the structure.
|
| 110 |
# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
|
| 111 |
##############################################################################
|
| 112 |
|
| 113 |
|
| 114 |
import re
|
| 115 |
import string
|
| 116 |
import sys
|
| 117 |
|
| 118 |
MAX_UNICODE = 0x110000
|
| 119 |
NOTACHAR = 0xffffffff
|
| 120 |
|
| 121 |
# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt
|
| 122 |
def make_get_names(enum):
|
| 123 |
return lambda chardata: enum.index(chardata[1])
|
| 124 |
|
| 125 |
# Parse a line of CaseFolding.txt
|
| 126 |
def get_other_case(chardata):
|
| 127 |
if chardata[1] == 'C' or chardata[1] == 'S':
|
| 128 |
return int(chardata[2], 16) - int(chardata[0], 16)
|
| 129 |
return 0
|
| 130 |
|
| 131 |
|
| 132 |
# Read the whole table in memory
|
| 133 |
def read_table(file_name, get_value, default_value):
|
| 134 |
file = open(file_name, 'r')
|
| 135 |
table = [default_value] * MAX_UNICODE
|
| 136 |
for line in file:
|
| 137 |
line = re.sub(r'#.*', '', line)
|
| 138 |
chardata = map(string.strip, line.split(';'))
|
| 139 |
if len(chardata) <= 1:
|
| 140 |
continue
|
| 141 |
value = get_value(chardata)
|
| 142 |
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
|
| 143 |
char = int(m.group(1), 16)
|
| 144 |
if m.group(3) is None:
|
| 145 |
last = char
|
| 146 |
else:
|
| 147 |
last = int(m.group(3), 16)
|
| 148 |
for i in range(char, last + 1):
|
| 149 |
# It is important not to overwrite a previously set
|
| 150 |
# value because in the CaseFolding file there are lines
|
| 151 |
# to be ignored (returning the default value of 0)
|
| 152 |
# which often come after a line which has already set
|
| 153 |
# data.
|
| 154 |
if table[i] == default_value:
|
| 155 |
table[i] = value
|
| 156 |
file.close()
|
| 157 |
return table
|
| 158 |
|
| 159 |
# Get the smallest possible C language type for the values
|
| 160 |
def get_type_size(table):
|
| 161 |
type_size = [("pcre_uint8", 1), ("pcre_uint16", 2), ("pcre_uint32", 4),
|
| 162 |
("signed char", 1), ("pcre_int16", 2), ("pcre_int32", 4)]
|
| 163 |
limits = [(0, 255), (0, 65535), (0, 4294967295),
|
| 164 |
(-128, 127), (-32768, 32767), (-2147483648, 2147483647)]
|
| 165 |
minval = min(table)
|
| 166 |
maxval = max(table)
|
| 167 |
for num, (minlimit, maxlimit) in enumerate(limits):
|
| 168 |
if minlimit <= minval and maxval <= maxlimit:
|
| 169 |
return type_size[num]
|
| 170 |
else:
|
| 171 |
raise OverflowError, "Too large to fit into C types"
|
| 172 |
|
| 173 |
def get_tables_size(*tables):
|
| 174 |
total_size = 0
|
| 175 |
for table in tables:
|
| 176 |
type, size = get_type_size(table)
|
| 177 |
total_size += size * len(table)
|
| 178 |
return total_size
|
| 179 |
|
| 180 |
# Compress the table into the two stages
|
| 181 |
def compress_table(table, block_size):
|
| 182 |
blocks = {} # Dictionary for finding identical blocks
|
| 183 |
stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
|
| 184 |
stage2 = [] # Stage 2 table contains the blocks with property values
|
| 185 |
table = tuple(table)
|
| 186 |
for i in range(0, len(table), block_size):
|
| 187 |
block = table[i:i+block_size]
|
| 188 |
start = blocks.get(block)
|
| 189 |
if start is None:
|
| 190 |
# Allocate a new block
|
| 191 |
start = len(stage2) / block_size
|
| 192 |
stage2 += block
|
| 193 |
blocks[block] = start
|
| 194 |
stage1.append(start)
|
| 195 |
|
| 196 |
return stage1, stage2
|
| 197 |
|
| 198 |
# Print a table
|
| 199 |
def print_table(table, table_name, block_size = None):
|
| 200 |
type, size = get_type_size(table)
|
| 201 |
ELEMS_PER_LINE = 16
|
| 202 |
|
| 203 |
s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
|
| 204 |
if block_size:
|
| 205 |
s += ", block = %d" % block_size
|
| 206 |
print s + " */"
|
| 207 |
table = tuple(table)
|
| 208 |
if block_size is None:
|
| 209 |
fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
|
| 210 |
mult = MAX_UNICODE / len(table)
|
| 211 |
for i in range(0, len(table), ELEMS_PER_LINE):
|
| 212 |
print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))
|
| 213 |
else:
|
| 214 |
if block_size > ELEMS_PER_LINE:
|
| 215 |
el = ELEMS_PER_LINE
|
| 216 |
else:
|
| 217 |
el = block_size
|
| 218 |
fmt = "%3d," * el + "\n"
|
| 219 |
if block_size > ELEMS_PER_LINE:
|
| 220 |
fmt = fmt * (block_size / ELEMS_PER_LINE)
|
| 221 |
for i in range(0, len(table), block_size):
|
| 222 |
print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])
|
| 223 |
print "};\n"
|
| 224 |
|
| 225 |
# Extract the unique combinations of properties into records
|
| 226 |
def combine_tables(*tables):
|
| 227 |
records = {}
|
| 228 |
index = []
|
| 229 |
for t in zip(*tables):
|
| 230 |
i = records.get(t)
|
| 231 |
if i is None:
|
| 232 |
i = records[t] = len(records)
|
| 233 |
index.append(i)
|
| 234 |
return index, records
|
| 235 |
|
| 236 |
def get_record_size_struct(records):
|
| 237 |
size = 0
|
| 238 |
structure = '/* When recompiling tables with a new Unicode version, please check the\n' + \
|
| 239 |
'types in this structure definition from pcre_internal.h (the actual\n' + \
|
| 240 |
'field names will be different):\n\ntypedef struct {\n'
|
| 241 |
for i in range(len(records[0])):
|
| 242 |
record_slice = map(lambda record: record[i], records)
|
| 243 |
slice_type, slice_size = get_type_size(record_slice)
|
| 244 |
# add padding: round up to the nearest power of slice_size
|
| 245 |
size = (size + slice_size - 1) & -slice_size
|
| 246 |
size += slice_size
|
| 247 |
structure += '%s property_%d;\n' % (slice_type, i)
|
| 248 |
|
| 249 |
# round up to the first item of the next structure in array
|
| 250 |
record_slice = map(lambda record: record[0], records)
|
| 251 |
slice_type, slice_size = get_type_size(record_slice)
|
| 252 |
size = (size + slice_size - 1) & -slice_size
|
| 253 |
|
| 254 |
structure += '} ucd_record;\n*/\n\n'
|
| 255 |
return size, structure
|
| 256 |
|
| 257 |
def test_record_size():
|
| 258 |
tests = [ \
|
| 259 |
( [(3,), (6,), (6,), (1,)], 1 ), \
|
| 260 |
( [(300,), (600,), (600,), (100,)], 2 ), \
|
| 261 |
( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
|
| 262 |
( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
|
| 263 |
( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
|
| 264 |
( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
|
| 265 |
( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
|
| 266 |
( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
|
| 267 |
]
|
| 268 |
for test in tests:
|
| 269 |
size, struct = get_record_size_struct(test[0])
|
| 270 |
assert(size == test[1])
|
| 271 |
#print struct
|
| 272 |
|
| 273 |
def print_records(records, record_size):
|
| 274 |
print 'const ucd_record PRIV(ucd_records)[] = { ' + \
|
| 275 |
'/* %d bytes, record size %d */' % (len(records) * record_size, record_size)
|
| 276 |
records = zip(records.keys(), records.values())
|
| 277 |
records.sort(None, lambda x: x[1])
|
| 278 |
for i, record in enumerate(records):
|
| 279 |
print (' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))
|
| 280 |
print '};\n'
|
| 281 |
|
| 282 |
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
| 283 |
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
|
| 284 |
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
|
| 285 |
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
|
| 286 |
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
|
| 287 |
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
|
| 288 |
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
|
| 289 |
# New for Unicode 5.0
|
| 290 |
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
|
| 291 |
# New for Unicode 5.1
|
| 292 |
'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
|
| 293 |
# New for Unicode 5.2
|
| 294 |
'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
|
| 295 |
'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
|
| 296 |
'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
|
| 297 |
'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
|
| 298 |
# New for Unicode 6.0.0
|
| 299 |
'Batak', 'Brahmi', 'Mandaic', \
|
| 300 |
# New for Unicode 6.1.0
|
| 301 |
'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri'
|
| 302 |
]
|
| 303 |
|
| 304 |
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
|
| 305 |
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
|
| 306 |
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
|
| 307 |
|
| 308 |
break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend',
|
| 309 |
'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'Regional_Indicator', 'Other' ]
|
| 310 |
|
| 311 |
test_record_size()
|
| 312 |
|
| 313 |
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
|
| 314 |
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
|
| 315 |
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other'))
|
| 316 |
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
|
| 317 |
|
| 318 |
|
| 319 |
# This block of code was added by PH in September 2012. I am not a Python
|
| 320 |
# programmer, so the style is probably dreadful, but it does the job. It scans
|
| 321 |
# the other_case table to find sets of more than two characters that must all
|
| 322 |
# match each other caselessly. Later in this script a table of these sets is
|
| 323 |
# written out. However, we have to do this work here in order to compute the
|
| 324 |
# offsets in the table that are inserted into the main table.
|
| 325 |
|
| 326 |
# The CaseFolding.txt file lists pairs, but the common logic for reading data
|
| 327 |
# sets only one value, so first we go through the table and set "return"
|
| 328 |
# offsets for those that are not already set.
|
| 329 |
|
| 330 |
for c in range(0x10ffff):
|
| 331 |
if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
|
| 332 |
other_case[c + other_case[c]] = -other_case[c]
|
| 333 |
|
| 334 |
# Now scan again and create equivalence sets.
|
| 335 |
|
| 336 |
sets = []
|
| 337 |
|
| 338 |
for c in range(0x10ffff):
|
| 339 |
o = c + other_case[c]
|
| 340 |
|
| 341 |
# Trigger when this character's other case does not point back here. We
|
| 342 |
# now have three characters that are case-equivalent.
|
| 343 |
|
| 344 |
if other_case[o] != -other_case[c]:
|
| 345 |
t = o + other_case[o]
|
| 346 |
|
| 347 |
# Scan the existing sets to see if any of the three characters are already
|
| 348 |
# part of a set. If so, unite the existing set with the new set.
|
| 349 |
|
| 350 |
appended = 0
|
| 351 |
for s in sets:
|
| 352 |
found = 0
|
| 353 |
for x in s:
|
| 354 |
if x == c or x == o or x == t:
|
| 355 |
found = 1
|
| 356 |
|
| 357 |
# Add new characters to an existing set
|
| 358 |
|
| 359 |
if found:
|
| 360 |
found = 0
|
| 361 |
for y in [c, o, t]:
|
| 362 |
for x in s:
|
| 363 |
if x == y:
|
| 364 |
found = 1
|
| 365 |
if not found:
|
| 366 |
s.append(y)
|
| 367 |
appended = 1
|
| 368 |
|
| 369 |
# If we have not added to an existing set, create a new one.
|
| 370 |
|
| 371 |
if not appended:
|
| 372 |
sets.append([c, o, t])
|
| 373 |
|
| 374 |
# End of loop looking for caseless sets.
|
| 375 |
|
| 376 |
# Now scan the sets and set appropriate offsets for the characters.
|
| 377 |
|
| 378 |
caseless_offsets = [0] * MAX_UNICODE
|
| 379 |
|
| 380 |
offset = 1;
|
| 381 |
for s in sets:
|
| 382 |
for x in s:
|
| 383 |
caseless_offsets[x] = offset
|
| 384 |
offset += len(s) + 1
|
| 385 |
|
| 386 |
# End of block of code for creating offsets for caseless matching sets.
|
| 387 |
|
| 388 |
|
| 389 |
# Combine the tables
|
| 390 |
|
| 391 |
table, records = combine_tables(script, category, break_props,
|
| 392 |
caseless_offsets, other_case)
|
| 393 |
|
| 394 |
record_size, record_struct = get_record_size_struct(records.keys())
|
| 395 |
|
| 396 |
# Find the optimum block size for the two-stage table
|
| 397 |
min_size = sys.maxint
|
| 398 |
for block_size in [2 ** i for i in range(5,10)]:
|
| 399 |
size = len(records) * record_size
|
| 400 |
stage1, stage2 = compress_table(table, block_size)
|
| 401 |
size += get_tables_size(stage1, stage2)
|
| 402 |
#print "/* block size %5d => %5d bytes */" % (block_size, size)
|
| 403 |
if size < min_size:
|
| 404 |
min_size = size
|
| 405 |
min_stage1, min_stage2 = stage1, stage2
|
| 406 |
min_block_size = block_size
|
| 407 |
|
| 408 |
print "/* This module is generated by the maint/MultiStage2.py script."
|
| 409 |
print "Do not modify it by hand. Instead modify the script and run it"
|
| 410 |
print "to regenerate this code."
|
| 411 |
print
|
| 412 |
print "As well as being part of the PCRE library, this module is #included"
|
| 413 |
print "by the pcretest program, which redefines the PRIV macro to change"
|
| 414 |
print "table names from _pcre_xxx to xxxx, thereby avoiding name clashes"
|
| 415 |
print "with the library. At present, just one of these tables is actually"
|
| 416 |
print "needed. */"
|
| 417 |
print
|
| 418 |
print "#ifndef PCRE_INCLUDED"
|
| 419 |
print
|
| 420 |
print "#ifdef HAVE_CONFIG_H"
|
| 421 |
print "#include \"config.h\""
|
| 422 |
print "#endif"
|
| 423 |
print
|
| 424 |
print "#include \"pcre_internal.h\""
|
| 425 |
print
|
| 426 |
print "#endif /* PCRE_INCLUDED */"
|
| 427 |
print
|
| 428 |
print "/* Unicode character database. */"
|
| 429 |
print "/* This file was autogenerated by the MultiStage2.py script. */"
|
| 430 |
print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
|
| 431 |
print
|
| 432 |
print "/* The tables herein are needed only when UCP support is built"
|
| 433 |
print "into PCRE. This module should not be referenced otherwise, so"
|
| 434 |
print "it should not matter whether it is compiled or not. However"
|
| 435 |
print "a comment was received about space saving - maybe the guy linked"
|
| 436 |
print "all the modules rather than using a library - so we include a"
|
| 437 |
print "condition to cut out the tables when not needed. But don't leave"
|
| 438 |
print "a totally empty module because some compilers barf at that."
|
| 439 |
print "Instead, just supply small dummy tables. */"
|
| 440 |
print
|
| 441 |
print "#ifndef SUPPORT_UCP"
|
| 442 |
print "const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};"
|
| 443 |
print "const pcre_uint8 PRIV(ucd_stage1)[] = {0};"
|
| 444 |
print "const pcre_uint16 PRIV(ucd_stage2)[] = {0};"
|
| 445 |
print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};"
|
| 446 |
print "#else"
|
| 447 |
print
|
| 448 |
print record_struct
|
| 449 |
|
| 450 |
# --- Added by PH: output the table of caseless character sets ---
|
| 451 |
|
| 452 |
print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {"
|
| 453 |
print " NOTACHAR,"
|
| 454 |
for s in sets:
|
| 455 |
s = sorted(s)
|
| 456 |
for x in s:
|
| 457 |
print ' 0x%04x,' % x,
|
| 458 |
print ' NOTACHAR,'
|
| 459 |
print '};'
|
| 460 |
print
|
| 461 |
|
| 462 |
# ------
|
| 463 |
|
| 464 |
print "/* When #included in pcretest, we don't need this large table. */"
|
| 465 |
print
|
| 466 |
print "#ifndef PCRE_INCLUDED"
|
| 467 |
print
|
| 468 |
print_records(records, record_size)
|
| 469 |
print_table(min_stage1, 'PRIV(ucd_stage1)')
|
| 470 |
print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
|
| 471 |
print "#if UCD_BLOCK_SIZE != %d" % min_block_size
|
| 472 |
print "#error Please correct UCD_BLOCK_SIZE in pcre_internal.h"
|
| 473 |
print "#endif"
|
| 474 |
print "#endif /* SUPPORT_UCP */"
|
| 475 |
print
|
| 476 |
print "#endif /* PCRE_INCLUDED */"
|
| 477 |
|
| 478 |
"""
|
| 479 |
|
| 480 |
# Three-stage tables:
|
| 481 |
|
| 482 |
# Find the optimum block size for 3-stage table
|
| 483 |
min_size = sys.maxint
|
| 484 |
for stage3_block in [2 ** i for i in range(2,6)]:
|
| 485 |
stage_i, stage3 = compress_table(table, stage3_block)
|
| 486 |
for stage2_block in [2 ** i for i in range(5,10)]:
|
| 487 |
size = len(records) * 4
|
| 488 |
stage1, stage2 = compress_table(stage_i, stage2_block)
|
| 489 |
size += get_tables_size(stage1, stage2, stage3)
|
| 490 |
# print "/* %5d / %3d => %5d bytes */" % (stage2_block, stage3_block, size)
|
| 491 |
if size < min_size:
|
| 492 |
min_size = size
|
| 493 |
min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3
|
| 494 |
min_stage2_block, min_stage3_block = stage2_block, stage3_block
|
| 495 |
|
| 496 |
print "/* Total size: %d bytes" % min_size */
|
| 497 |
print_records(records)
|
| 498 |
print_table(min_stage1, 'ucd_stage1')
|
| 499 |
print_table(min_stage2, 'ucd_stage2', min_stage2_block)
|
| 500 |
print_table(min_stage3, 'ucd_stage3', min_stage3_block)
|
| 501 |
|
| 502 |
"""
|