| 1 |
/*************************************************
|
| 2 |
* libucp - Unicode Property Table handler *
|
| 3 |
*************************************************/
|
| 4 |
|
| 5 |
/* Internal header file defining the layout of compact nodes in the tree. */
|
| 6 |
|
| 7 |
typedef struct cnode {
|
| 8 |
unsigned short int f0;
|
| 9 |
unsigned short int f1;
|
| 10 |
unsigned short int f2;
|
| 11 |
} cnode;
|
| 12 |
|
| 13 |
/* Things for the f0 field */
|
| 14 |
|
| 15 |
#define f0_leftexists 0x8000 /* Left child exists */
|
| 16 |
#define f0_typemask 0x3f00 /* Type bits */
|
| 17 |
#define f0_typeshift 8 /* Type shift */
|
| 18 |
#define f0_chhmask 0x00ff /* Character high bits */
|
| 19 |
|
| 20 |
/* Things for the f2 field */
|
| 21 |
|
| 22 |
#define f2_rightmask 0xf000 /* Mask for right offset bits */
|
| 23 |
#define f2_rightshift 12 /* Shift for right offset */
|
| 24 |
#define f2_casemask 0x0fff /* Mask for case offset */
|
| 25 |
|
| 26 |
/* The tree consists of a vector of structures of type cnode, with the root
|
| 27 |
node as the first element. The three short ints (16-bits) are used as follows:
|
| 28 |
|
| 29 |
(f0) (1) The 0x8000 bit of f0 is set if a left child exists. The child's node
|
| 30 |
is the next node in the vector.
|
| 31 |
(2) The 0x4000 bits of f0 is spare.
|
| 32 |
(3) The 0x3f00 bits of f0 contain the character type; this is a number
|
| 33 |
defined by the enumeration in ucp.h (e.g. ucp_Lu).
|
| 34 |
(4) The bottom 8 bits of f0 contain the most significant byte of the
|
| 35 |
character's 24-bit codepoint.
|
| 36 |
|
| 37 |
(f1) (1) The f1 field contains the two least significant bytes of the
|
| 38 |
codepoint.
|
| 39 |
|
| 40 |
(f2) (1) The 0xf000 bits of f2 contain zero if there is no right child of this
|
| 41 |
node. Otherwise, they contain one plus the exponent of the power of
|
| 42 |
two of the offset to the right node (e.g. a value of 3 means 8). The
|
| 43 |
units of the offset are node items.
|
| 44 |
|
| 45 |
(2) The 0x0fff bits of f2 contain the signed offset from this character to
|
| 46 |
its alternate cased value. They are zero if there is no such
|
| 47 |
character.
|
| 48 |
|
| 49 |
|
| 50 |
-----------------------------------------------------------------------------
|
| 51 |
||.|.| type (6) | ms char (8) || ls char (16) ||....| case offset (12) ||
|
| 52 |
-----------------------------------------------------------------------------
|
| 53 |
| | |
|
| 54 |
| |-> spare |
|
| 55 |
| exponent of right
|
| 56 |
|-> left child exists child offset
|
| 57 |
|
| 58 |
|
| 59 |
The upper/lower casing information is set only for characters that come in
|
| 60 |
pairs. There are (at present) four non-one-to-one mappings in the Unicode data.
|
| 61 |
These are ignored. They are:
|
| 62 |
|
| 63 |
1FBE Greek Prosgegrammeni (lower, with upper -> capital iota)
|
| 64 |
2126 Ohm
|
| 65 |
212A Kelvin
|
| 66 |
212B Angstrom
|
| 67 |
|
| 68 |
Certainly for the last three, having an alternate case would seem to be a
|
| 69 |
mistake. I don't know any Greek, so cannot comment on the first one.
|
| 70 |
|
| 71 |
|
| 72 |
When searching the tree, proceed as follows:
|
| 73 |
|
| 74 |
(1) Start at the first node.
|
| 75 |
|
| 76 |
(2) Extract the character value from f1 and the bottom 8 bits of f0;
|
| 77 |
|
| 78 |
(3) Compare with the character being sought. If equal, we are done.
|
| 79 |
|
| 80 |
(4) If the test character is smaller, inspect the f0_leftexists flag. If it is
|
| 81 |
not set, the character is not in the tree. If it is set, move to the next
|
| 82 |
node, and go to (2).
|
| 83 |
|
| 84 |
(5) If the test character is bigger, extract the f2_rightmask bits from f2, and
|
| 85 |
shift them right by f2_rightshift. If the result is zero, the character is
|
| 86 |
not in the tree. Otherwise, calculate the number of nodes to skip by
|
| 87 |
shifting the value 1 left by this number minus one. Go to (2).
|
| 88 |
*/
|
| 89 |
|
| 90 |
|
| 91 |
/* End of internal.h */
|