#! /usr/bin/python

# Generate utt tables.

# The source file pcre_tables.c contains (amongst other things) a table that
# is indexed by script name. In order to reduce the number of relocations when
# loading the library, the names are held as a single large string, with
# offsets in the table. This is tedious to maintain by hand. Therefore, this
# script is used to generate the table. The output is sent to stdout.

# Modified by PH 17-March-2009 to generate the more verbose form that works
# for UTF-support in EBCDIC as well as ASCII environments.
# Modified by PH 01-March-2010 to add new scripts from Unicode 5.2.0.
# Modified by PH 04-May-2010 to add new "X.." special categories.

# Unicode script names. Each one becomes a PT_SC entry in the generated table.
# The order here does not affect the output, because the combined table is
# sorted before anything is printed.
script_names = [
    'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese',
    'Buhid', 'Canadian_Aboriginal',
    'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret',
    'Devanagari', 'Ethiopic', 'Georgian',
    'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul',
    'Hanunoo', 'Hebrew', 'Hiragana',
    'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin',
    'Limbu', 'Linear_B', 'Malayalam',
    'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic',
    'Old_Persian', 'Oriya', 'Osmanya', 'Runic',
    'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa',
    'Tai_Le', 'Tamil', 'Telugu', 'Thaana',
    'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi',
    # New for Unicode 5.0
    'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',
    # New for Unicode 5.1
    'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki',
    'Rejang', 'Saurashtra', 'Sundanese', 'Vai',
    # New for Unicode 5.2
    'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
    'Inscriptional_Pahlavi', 'Inscriptional_Parthian',
    'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek',
    'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet',
]

# Two-letter Unicode category names. Each one becomes a PT_PC entry in the
# generated table.
category_names = [
    'Cc', 'Cf', 'Cn', 'Co', 'Cs',
    'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
    'Mc', 'Me', 'Mn',
    'Nd', 'Nl', 'No',
    'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
    'Sc', 'Sk', 'Sm', 'So',
    'Zl', 'Zp', 'Zs',
]

# One-letter general category names. Each one becomes a PT_GC entry in the
# generated table.
general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']

# Our own special property names, paired with their PT_ types. These are added
# to the table alongside the Unicode script and category names. Their table
# entries carry the value 0 instead of a ucp_<name> constant.
special_properties = [
    ('Any', 'PT_ANY'),
    ('L&', 'PT_LAMP'),
    ('Xan', 'PT_ALNUM'),
    ('Xps', 'PT_PXSPACE'),
    ('Xsp', 'PT_SPACE'),
    ('Xwd', 'PT_WORD'),
]

# Derived from the list above so the two can never get out of step.
_special_types = frozenset(ptype for _, ptype in special_properties)


def build_utt_table(scripts, categories, general_categories):
    """Return the sorted list of (name, property type) pairs.

    Script names are tagged PT_SC, category names PT_PC and general category
    names PT_GC; the special properties are then appended and the whole list
    is sorted. A real list is built explicitly (rather than relying on zip()
    returning one) so that this works on Python 3 as well as Python 2.
    """
    table = [(name, 'PT_SC') for name in scripts]
    table.extend((name, 'PT_PC') for name in categories)
    table.extend((name, 'PT_GC') for name in general_categories)
    table.extend(special_properties)
    table.sort()
    return table


def _string_macro(name):
    """Return the STRING_<name>0 macro identifier used for a property name."""
    return 'STRING_%s0' % name.replace('&', '_AMPERSAND')


def _char_macro(char):
    """Return the STR_ macro that stands for a single character of a name."""
    if char == '_':
        return 'STR_UNDERSCORE'
    if char == '&':
        return 'STR_AMPERSAND'
    return 'STR_%s' % char


def define_lines(utt_table):
    """Return one '#define STRING_<name>0 ...' line per table entry.

    We have to use STR_ macros to define the strings so that it all works in
    UTF-8 mode on EBCDIC platforms. (Before the EBCDIC-compatible
    modification, plain "<name>\\0" string literals were emitted instead.)
    """
    lines = []
    for name, _ in utt_table:
        pieces = ['#define ' + _string_macro(name)]
        pieces.extend(_char_macro(char) for char in name)
        pieces.append('"\\0"')
        lines.append(' '.join(pieces))
    return lines


def names_lines(utt_table):
    """Return the lines defining _pcre_utt_names, using the string macros.

    The terminating ';' is attached to the entry in the final position. The
    position is used, not a comparison with the last entry's value, so that a
    repeated final entry cannot receive the terminator early.
    """
    lines = ['const char _pcre_utt_names[] = ']
    final = len(utt_table) - 1
    for index, (name, _) in enumerate(utt_table):
        terminator = ';' if index == final else ''
        lines.append(' %s%s' % (_string_macro(name), terminator))
    return lines


def table_lines(utt_table):
    """Return the lines defining the _pcre_utt table.

    Each entry holds the offset of its name within the names string, its
    property type, and either ucp_<name> or 0 for the special properties.
    Every entry except the one in the final position is followed by a comma.
    """
    lines = ['\nconst ucp_type_table _pcre_utt[] = { ']
    final = len(utt_table) - 1
    offset = 0
    for index, (name, ptype) in enumerate(utt_table):
        value = '0' if ptype in _special_types else 'ucp_' + name
        separator = '' if index == final else ','
        lines.append(' { %3d, %s, %s }%s ' % (offset, ptype, value, separator))
        # Each name occupies its own length plus the terminating zero.
        offset += len(name) + 1
    lines.append('};')
    return lines


def generate_output(utt_table):
    """Return the complete generated text, without the final newline."""
    lines = define_lines(utt_table)
    lines.append('')
    lines.extend(names_lines(utt_table))
    lines.extend(table_lines(utt_table))
    return '\n'.join(lines)


def main():
    """Build the table from the name lists and send the output to stdout."""
    utt_table = build_utt_table(script_names, category_names,
                                general_category_names)
    # A single parenthesised argument prints identically on Python 2 and 3.
    print(generate_output(utt_table))


if __name__ == '__main__':
    main()