| 3 |
# Multistage table builder |
# Multistage table builder |
| 4 |
# (c) Peter Kankowski, 2008 |
# (c) Peter Kankowski, 2008 |
| 5 |
|
|
| 6 |
|
############################################################################## |
| 7 |
# This script was submitted to the PCRE project by Peter Kankowski as part of |
# This script was submitted to the PCRE project by Peter Kankowski as part of |
| 8 |
# the upgrading of Unicode property support. The new code speeds up property |
# the upgrading of Unicode property support. The new code speeds up property |
| 9 |
# matching many times. The script is for the use of PCRE maintainers, to |
# matching many times. The script is for the use of PCRE maintainers, to |
| 10 |
# generate the pcre_ucd.c file that contains a digested form of the Unicode |
# generate the pcre_ucd.c file that contains a digested form of the Unicode |
| 11 |
# data tables. |
# data tables. |
| 12 |
|
# |
| 13 |
# The script should be run in the maint subdirectory, using the command |
# The script should be run in the maint subdirectory, using the command |
| 14 |
# |
# |
| 15 |
# ./MultiStage2.py >../pcre_ucd.c |
# ./MultiStage2.py >../pcre_ucd.c |
| 16 |
# |
# |
| 17 |
# It requires three Unicode data tables, DerivedGeneralCategory.txt, |
# It requires three Unicode data tables, DerivedGeneralCategory.txt, |
| 18 |
# Scripts.txt, and UnicodeData.txt, to be in the Unicode.tables subdirectory. |
# Scripts.txt, and UnicodeData.txt, to be in the Unicode.tables subdirectory. |
| 19 |
|
# |
| 20 |
# Added with minor modifications: |
# Minor modifications made to this script: |
| 21 |
# Added #! line at start |
# Added #! line at start |
| 22 |
# Removed tabs |
# Removed tabs |
| 23 |
# Made it work with Python 2.4 by rewriting two statements that needed 2.5 |
# Made it work with Python 2.4 by rewriting two statements that needed 2.5 |
| 24 |
# Consequent code tidy |
# Consequent code tidy |
| 25 |
# Adjusted file names to Unicode.tables directory |
# Adjusted data file names to take from the Unicode.tables directory |
| 26 |
|
# Adjusted global table names by prefixing _pcre_. |
| 27 |
|
# Commented out stuff relating to the casefolding table, which isn't used. |
| 28 |
|
# |
| 29 |
|
# The tables generated by this script are used by macros defined in |
| 30 |
|
# pcre_internal.h. They look up Unicode character properties using short |
| 31 |
|
# sequences of code that contain no branches, which makes for greater speed. |
| 32 |
|
# |
| 33 |
|
# Conceptually, there is a table of records (of type ucd_record), containing a |
| 34 |
|
# script number, character type, and offset to the character's other case for |
| 35 |
|
# every character. However, a real table covering all Unicode characters would |
| 36 |
|
# be far too big. It can be efficiently compressed by observing that many |
| 37 |
|
# characters have the same record, and many blocks of characters (taking 128 |
| 38 |
|
# characters in a block) have the same set of records as other blocks. This |
| 39 |
|
# leads to a 2-stage lookup process. |
| 40 |
|
# |
| 41 |
|
# This script constructs three tables. The _pcre_ucd_records table contains |
| 42 |
|
# one instance of every unique record that is required. The _pcre_ucd_stage1 |
| 43 |
|
# table is indexed by a character's block number, and yields what is in effect |
| 44 |
|
# a "virtual" block number. The _pcre_ucd_stage2 table is a table of "virtual" |
| 45 |
|
# blocks; each block is indexed by the offset of a character within its own |
| 46 |
|
# block, and the result is the offset of the required record. |
| 47 |
|
# |
| 48 |
|
# Example: lowercase "a" (U+0061) is in block 0 |
| 49 |
|
# lookup 0 in stage1 table yields 0 |
| 50 |
|
# lookup 97 in the first table in stage2 yields 12 |
| 51 |
|
# record 12 is { 33, 5, -32 } (Latin, lowercase, upper is U+0041) |
| 52 |
|
# |
| 53 |
|
# All lowercase Latin characters resolve to the same record. |
| 54 |
# |
# |
| 55 |
# Philip Hazel, 02 July 2008 |
# Example: hiragana letter A (U+3042) is in block 96 (0x60) |
| 56 |
|
# lookup 96 in stage1 table yields 83 |
| 57 |
|
# lookup 66 in the 83rd table in stage2 yields 348 |
| 58 |
|
# record 348 is { 26, 7, 0 } (Hiragana, other letter, no other case) |
| 59 |
|
# |
| 60 |
|
# In these examples, no other blocks resolve to the same "virtual" block, as it |
| 61 |
|
# happens, but plenty of other blocks do share "virtual" blocks. |
| 62 |
|
# |
| 63 |
|
# There is a fourth table, maintained by hand, which translates from the |
| 64 |
|
# individual character types such as ucp_Cc to the general types like ucp_C. |
| 65 |
|
# |
| 66 |
|
# Philip Hazel, 03 July 2008 |
| 67 |
|
############################################################################## |
| 68 |
|
|
| 69 |
|
|
| 70 |
import re |
import re |
| 78 |
def make_get_names(enum): |
def make_get_names(enum): |
| 79 |
return lambda chardata: enum.index(chardata[1]) |
return lambda chardata: enum.index(chardata[1]) |
| 80 |
|
|
| 81 |
def get_case_folding_value(chardata): |
#def get_case_folding_value(chardata): |
| 82 |
if chardata[1] != 'C' and chardata[1] != 'S': |
# if chardata[1] != 'C' and chardata[1] != 'S': |
| 83 |
return 0 |
# return 0 |
| 84 |
return int(chardata[2], 16) - int(chardata[0], 16) |
# return int(chardata[2], 16) - int(chardata[0], 16) |
| 85 |
|
|
| 86 |
def get_other_case(chardata): |
def get_other_case(chardata): |
| 87 |
if chardata[12] != '': |
if chardata[12] != '': |
| 103 |
|
|
| 104 |
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0]) |
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0]) |
| 105 |
char = int(m.group(1), 16) |
char = int(m.group(1), 16) |
|
# PH last = char if m.group(3) is None else int(m.group(3), 16) |
|
| 106 |
if m.group(3) is None: |
if m.group(3) is None: |
| 107 |
last = char |
last = char |
| 108 |
else: |
else: |
| 167 |
for i in range(0, len(table), ELEMS_PER_LINE): |
for i in range(0, len(table), ELEMS_PER_LINE): |
| 168 |
print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,)) |
print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,)) |
| 169 |
else: |
else: |
|
# PH fmt = "%3d," * (ELEMS_PER_LINE if block_size > ELEMS_PER_LINE else block_size) + "\n" |
|
| 170 |
if block_size > ELEMS_PER_LINE: |
if block_size > ELEMS_PER_LINE: |
| 171 |
fmt = "%3d," * ELEMS_PER_LINE + "\n" |
el = ELEMS_PER_LINE |
|
fmt = fmt * (block_size / ELEMS_PER_LINE) |
|
| 172 |
else: |
else: |
| 173 |
fmt = "%3d," * block_size + "\n" |
el = block_size |
| 174 |
# PH if block_size > ELEMS_PER_LINE: |
fmt = "%3d," * el + "\n" |
| 175 |
# PH fmt = fmt * (block_size / ELEMS_PER_LINE) |
if block_size > ELEMS_PER_LINE: |
| 176 |
|
fmt = fmt * (block_size / ELEMS_PER_LINE) |
| 177 |
for i in range(0, len(table), block_size): |
for i in range(0, len(table), block_size): |
| 178 |
print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size]) |
print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size]) |
| 179 |
print "};\n" |
print "};\n" |
| 190 |
return index, records |
return index, records |
| 191 |
|
|
| 192 |
def print_records(records): |
def print_records(records): |
| 193 |
print 'const ucd_record ucd_records[] = { /* %d bytes */' % (len(records) * 4) |
print 'const ucd_record _pcre_ucd_records[] = { /* %d bytes */' % (len(records) * 4) |
| 194 |
records = zip(records.keys(), records.values()) |
records = zip(records.keys(), records.values()) |
| 195 |
records.sort(None, lambda x: x[1]) |
records.sort(None, lambda x: x[1]) |
| 196 |
for i, record in enumerate(records): |
for i, record in enumerate(records): |
| 204 |
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \ |
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \ |
| 205 |
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \ |
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \ |
| 206 |
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \ |
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \ |
| 207 |
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician'] |
# New for Unicode 5.0 |
| 208 |
|
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \ |
| 209 |
|
# New for Unicode 5.1 |
| 210 |
|
'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai'] |
| 211 |
|
|
| 212 |
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', |
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', |
| 213 |
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps', |
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps', |
| 239 |
print "#include \"pcre_internal.h\"" |
print "#include \"pcre_internal.h\"" |
| 240 |
print |
print |
| 241 |
print "/* Unicode character database. */" |
print "/* Unicode character database. */" |
| 242 |
print "/* This file was autogenerated by MultiStage2.py script. */" |
print "/* This file was autogenerated by the MultiStage2.py script. */" |
| 243 |
print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size) |
print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size) |
| 244 |
print_records(records) |
print_records(records) |
| 245 |
print_table(min_stage1, 'ucd_stage1') |
print_table(min_stage1, '_pcre_ucd_stage1') |
| 246 |
print_table(min_stage2, 'ucd_stage2', min_block_size) |
print_table(min_stage2, '_pcre_ucd_stage2', min_block_size) |
| 247 |
print "#if UCD_BLOCK_SIZE != %d" % min_block_size |
print "#if UCD_BLOCK_SIZE != %d" % min_block_size |
| 248 |
print "#error Please correct UCD_BLOCK_SIZE in pcre_internal.h" |
print "#error Please correct UCD_BLOCK_SIZE in pcre_internal.h" |
| 249 |
print "#endif" |
print "#endif" |