/[pcre]/code/trunk/maint/GenerateUtt.py
ViewVC logotype

Contents of /code/trunk/maint/GenerateUtt.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 592 - (hide annotations) (download) (as text)
Sat Apr 30 17:37:37 2011 UTC (3 years, 3 months ago) by ph10
File MIME type: text/x-python
File size: 4245 byte(s)
Update tables to Unicode 6.0.0.

#! /usr/bin/python

# Generate utt tables.

# The source file pcre_tables.c contains (amongst other things), a table that
# is indexed by script name. In order to reduce the number of relocations when
# loading the library, the names are held as a single large string, with
# offsets in the table. This is tedious to maintain by hand. Therefore, this
# script is used to generate the table. The output is sent to stdout; usually
# that should be directed to a temporary file. Then pcre_tables.c can be edited
# by replacing the relevant definitions and table therein with the temporary
# file.

# Modified by PH 17-March-2009 to generate the more verbose form that works
# for UTF-support in EBCDIC as well as ASCII environments.
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
# Modified by PH 04-May-2010 to add new "X.." special categories.
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0

# Every output line is built as a complete string and emitted with a
# single-argument print(...), which behaves identically under Python 2 and
# Python 3. No print-statement trailing commas and no list operations on the
# result of zip() are used, since neither works under Python 3.

script_names = [
    'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese',
    'Buhid', 'Canadian_Aboriginal', 'Cherokee', 'Common', 'Coptic', 'Cypriot',
    'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', 'Glagolitic',
    'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo',
    'Hebrew', 'Hiragana', 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi',
    'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', 'Mongolian',
    'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya',
    'Osmanya', 'Runic', 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac',
    'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', 'Thai',
    'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi',
    # New for Unicode 5.0
    'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',
    # New for Unicode 5.1
    'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki',
    'Rejang', 'Saurashtra', 'Sundanese', 'Vai',
    # New for Unicode 5.2
    'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
    'Inscriptional_Pahlavi', 'Inscriptional_Parthian',
    'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek',
    'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet',
    # New for Unicode 6.0.0
    'Batak', 'Brahmi', 'Mandaic',
]

category_names = [
    'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
    'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
    'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
]

general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']

# Our own special properties, as (name, property type) pairs. Entries of
# these types get the value '0' in the generated table instead of a
# 'ucp_<name>' identifier.
special_properties = [
    ('Any', 'PT_ANY'),
    ('L&', 'PT_LAMP'),
    ('Xan', 'PT_ALNUM'),
    ('Xps', 'PT_PXSPACE'),
    ('Xsp', 'PT_SPACE'),
    ('Xwd', 'PT_WORD'),
]
special_types = tuple([ptype for (_, ptype) in special_properties])


def string_macro(name):
    """Return the STRING_<name>0 macro identifier for a property name.

    '&' is not valid in a C identifier, so it is spelled out as
    '_AMPERSAND'.
    """
    return 'STRING_%s0' % name.replace('&', '_AMPERSAND')


def char_macro(char):
    """Return the STR_ macro used to spell one character of a name."""
    if char == '_':
        return 'STR_UNDERSCORE'
    if char == '&':
        return 'STR_AMPERSAND'
    return 'STR_%s' % char


def define_line(name):
    """Return the '#define STRING_<name>0 ...' line for a property name.

    The name is spelled out as one STR_ macro per character, followed by a
    "\\0" terminator, all separated by single spaces.
    """
    pieces = ['#define ' + string_macro(name)]
    pieces.extend([char_macro(char) for char in name])
    pieces.append('"\\0"')
    return ' '.join(pieces)


def names_array_lines(table):
    """Return the lines that define the _pcre_utt_names string.

    table is a sequence of (name, property type) pairs. One STRING_ macro is
    emitted per entry; the final entry carries the terminating ';'.
    """
    lines = ['const char _pcre_utt_names[] = ']
    final = len(table) - 1
    for index, (name, _) in enumerate(table):
        terminator = ';' if index == final else ''
        lines.append(' %s%s' % (string_macro(name), terminator))
        # This was how it was done before the EBCDIC-compatible modification.
        # lines.append(' "%s\\0"%s' % (name, terminator))
    return lines


def type_table_lines(table):
    """Return the lines that define the _pcre_utt table.

    table is a sequence of (name, property type) pairs. Each entry records
    the running offset of its name, which advances by the name length plus
    one for the "\\0" terminator. Every entry except the last is followed by
    a comma.
    """
    lines = ['\nconst ucp_type_table _pcre_utt[] = { ']
    offset = 0
    final = len(table) - 1
    for index, (name, ptype) in enumerate(table):
        value = '0' if ptype in special_types else 'ucp_' + name
        separator = '' if index == final else ','
        lines.append(' { %3d, %s, %s }%s ' % (offset, ptype, value, separator))
        offset += len(name) + 1
    lines.append('};')
    return lines


# First add the Unicode script and category names.

utt_table = [(name, 'PT_SC') for name in script_names]
utt_table += [(name, 'PT_PC') for name in category_names]
utt_table += [(name, 'PT_GC') for name in general_category_names]

# Now add our own specials.

utt_table += special_properties

# Sort the table.

utt_table.sort()

# We have to use STR_ macros to define the strings so that it all works in
# UTF-8 mode on EBCDIC platforms.

for utt in utt_table:
    print(define_line(utt[0]))

# Print the actual table, using the string names

print('')
for line in names_array_lines(utt_table):
    print(line)

for line in type_table_lines(utt_table):
    print(line)

Properties

Name Value
svn:executable *

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12