| 1 |
############################################################
|
| 2 |
############################################################
|
| 3 |
## As of PCRE 8.0 this file is OBSOLETE. A different way ##
|
| 4 |
## of handling Unicode property data is now used. See the ##
|
| 5 |
## maint/README document. ##
|
| 6 |
## PH 02 July 2008 ##
|
| 7 |
############################################################
|
| 8 |
############################################################
|
| 9 |
|
| 10 |
|
| 11 |
/*************************************************
|
| 12 |
* Unicode Property Table handler *
|
| 13 |
*************************************************/
|
| 14 |
|
| 15 |
#ifndef _UCPINTERNAL_H
|
| 16 |
#define _UCPINTERNAL_H
|
| 17 |
|
| 18 |
/* Internal header file defining the layout of the bits in each pair of 32-bit
words that form a data item in the table. */
|
| 20 |
|
| 21 |
typedef struct cnode {
|
| 22 |
pcre_uint32 f0;
|
| 23 |
pcre_uint32 f1;
|
| 24 |
} cnode;
|
| 25 |
|
| 26 |
/* Things for the f0 field */
|
| 27 |
|
| 28 |
/* f0 layout, most significant bit first: 8-bit script number, 1-bit range
flag, 2 spare bits, 21-bit code point. */

#define f0_scriptmask   0xff000000  /* Mask for script field (top byte) */
#define f0_scriptshift          24  /* Right shift that isolates the script value */
#define f0_rangeflag    0x00800000  /* Set when the entry describes a range */
#define f0_charmask     0x001fffff  /* Mask for code point value (low 21 bits) */
|
| 32 |
|
| 33 |
/* Things for the f1 field */
|
| 34 |
|
| 35 |
/* f1 layout, most significant bit first: 6-bit character type, 10 spare bits,
16-bit offset. The offset is an unsigned distance to the top of the range for
range entries, or a signed distance to the "other case" character for
single-character entries. */

#define f1_typemask   0xfc000000  /* Mask for char type field (top 6 bits) */
#define f1_typeshift          26  /* Right shift that isolates the type value */
#define f1_rangemask  0x0000ffff  /* Mask for a range offset (unsigned) */
#define f1_casemask   0x0000ffff  /* Mask for a case offset (signed, 16 bits) */
/* Bits for negation: bit 15 of the offset plus the upper 16 bits. Presumably
tested to detect a negative case offset and OR-ed in to sign-extend it to 32
bits - confirm against the lookup code. */
#define f1_caseneg    0xffff8000
|
| 40 |
|
| 41 |
/* The data consists of a vector of structures of type cnode. The two unsigned
|
| 42 |
32-bit integers are used as follows:
|
| 43 |
|
| 44 |
(f0) (1) The most significant byte holds the script number. The numbers are
|
| 45 |
defined by the enum in ucp.h.
|
| 46 |
|
| 47 |
(2) The 0x00800000 bit is set if this entry defines a range of characters.
It is not set if this entry defines a single character.
|
| 49 |
|
| 50 |
(3) The 0x00600000 bits are spare.
|
| 51 |
|
| 52 |
(4) The 0x001fffff bits contain the code point. No Unicode code point will
|
| 53 |
ever be greater than 0x0010ffff, so this should be OK for ever.
|
| 54 |
|
| 55 |
(f1) (1) The 0xfc000000 bits contain the character type number. The numbers are
|
| 56 |
defined by an enum in ucp.h.
|
| 57 |
|
| 58 |
(2) The 0x03ff0000 bits are spare.
|
| 59 |
|
| 60 |
(3) The 0x0000ffff bits contain EITHER the unsigned offset to the top of
|
| 61 |
range if this entry defines a range, OR the *signed* offset to the
|
| 62 |
character's "other case" partner if this entry defines a single
|
| 63 |
character. There is no partner if the value is zero.
|
| 64 |
|
| 65 |
-------------------------------------------------------------------------------
| script (8) |.|.|.| codepoint (21) || type (6) |.|.| spare (8) | offset (16) |
-------------------------------------------------------------------------------
              | | |                              | |
              | | |-> spare                      | |-> spare
              | |                                |
              | |-> spare                        |-> spare
              |
              |-> range flag
|
| 74 |
|
| 75 |
The upper/lower casing information is set only for characters that come in
|
| 76 |
pairs. The non-one-to-one mappings in the Unicode data are ignored.
|
| 77 |
|
| 78 |
When searching the data, proceed as follows:
|
| 79 |
|
| 80 |
(1) Set up for a binary chop search.
|
| 81 |
|
| 82 |
(2) If the top is not greater than the bottom, the character is not in the
|
| 83 |
table. Its type must therefore be "Cn" ("Undefined").
|
| 84 |
|
| 85 |
(3) Find the middle vector element.
|
| 86 |
|
| 87 |
(4) Extract the code point and compare. If equal, we are done.
|
| 88 |
|
| 89 |
(5) If the test character is smaller, set the top to the current point, and
|
| 90 |
goto (2).
|
| 91 |
|
| 92 |
(6) If the current entry defines a range, compute the last character by adding
|
| 93 |
the offset, and see if the test character is within the range. If it is,
|
| 94 |
we are done.
|
| 95 |
|
| 96 |
(7) Otherwise, set the bottom to one element past the current point and goto
|
| 97 |
(2).
|
| 98 |
*/
|
| 99 |
|
| 100 |
#endif /* _UCPINTERNAL_H */
|
| 101 |
|
| 102 |
/* End of ucpinternal.h */
|