| 1 |
nigel |
75 |
/************************************************* |
| 2 |
|
|
* libucp - Unicode Property Table handler * |
| 3 |
|
|
*************************************************/ |
| 4 |
|
|
|
| 5 |
|
|
/* Internal header file defining the layout of compact nodes in the tree. */ |
| 6 |
|
|
|
| 7 |
|
|
typedef struct cnode { |
| 8 |
|
|
unsigned short int f0; |
| 9 |
|
|
unsigned short int f1; |
| 10 |
|
|
unsigned short int f2; |
| 11 |
|
|
} cnode; |
| 12 |
|
|
|
| 13 |
|
|
/* Things for the f0 field */ |
| 14 |
|
|
|
| 15 |
|
|
#define f0_leftexists 0x8000 /* Left child exists */ |
| 16 |
|
|
#define f0_typemask 0x3f00 /* Type bits */ |
| 17 |
|
|
#define f0_typeshift 8 /* Type shift */ |
| 18 |
|
|
#define f0_chhmask 0x00ff /* Character high bits */ |
| 19 |
|
|
|
| 20 |
|
|
/* Things for the f2 field */ |
| 21 |
|
|
|
| 22 |
|
|
#define f2_rightmask 0xf000 /* Mask for right offset bits */ |
| 23 |
|
|
#define f2_rightshift 12 /* Shift for right offset */ |
| 24 |
|
|
#define f2_casemask 0x0fff /* Mask for case offset */ |
| 25 |
|
|
|
| 26 |
|
|
/* The tree consists of a vector of structures of type cnode, with the root |
| 27 |
|
|
node as the first element. The three short ints (16-bits) are used as follows: |
| 28 |
|
|
|
| 29 |
|
|
(f0) (1) The 0x8000 bit of f0 is set if a left child exists. The child's node |
| 30 |
|
|
is the next node in the vector. |
| 31 |
|
|
(2) The 0x4000 bits of f0 is spare. |
| 32 |
|
|
(3) The 0x3f00 bits of f0 contain the character type; this is a number |
| 33 |
|
|
defined by the enumeration in ucp.h (e.g. ucp_Lu). |
| 34 |
|
|
(4) The bottom 8 bits of f0 contain the most significant byte of the |
| 35 |
|
|
character's 24-bit codepoint. |
| 36 |
|
|
|
| 37 |
|
|
(f1) (1) The f1 field contains the two least significant bytes of the |
| 38 |
|
|
codepoint. |
| 39 |
|
|
|
| 40 |
|
|
(f2) (1) The 0xf000 bits of f2 contain zero if there is no right child of this |
| 41 |
|
|
node. Otherwise, they contain one plus the exponent of the power of |
| 42 |
|
|
two of the offset to the right node (e.g. a value of 3 means 8). The |
| 43 |
|
|
units of the offset are node items. |
| 44 |
|
|
|
| 45 |
|
|
(2) The 0x0fff bits of f2 contain the signed offset from this character to |
| 46 |
|
|
its alternate cased value. They are zero if there is no such |
| 47 |
|
|
character. |
| 48 |
|
|
|
| 49 |
|
|
|
| 50 |
|
|
----------------------------------------------------------------------------- |
| 51 |
|
|
||.|.| type (6) | ms char (8) || ls char (16) ||....| case offset (12) || |
| 52 |
|
|
----------------------------------------------------------------------------- |
| 53 |
|
|
| | | |
| 54 |
|
|
| |-> spare | |
| 55 |
|
|
| exponent of right |
| 56 |
|
|
|-> left child exists child offset |
| 57 |
|
|
|
| 58 |
|
|
|
| 59 |
|
|
The upper/lower casing information is set only for characters that come in |
| 60 |
|
|
pairs. There are (at present) four non-one-to-one mappings in the Unicode data. |
| 61 |
|
|
These are ignored. They are: |
| 62 |
|
|
|
| 63 |
|
|
1FBE Greek Prosgegrammeni (lower, with upper -> capital iota) |
| 64 |
|
|
2126 Ohm |
| 65 |
|
|
212A Kelvin |
| 66 |
|
|
212B Angstrom |
| 67 |
|
|
|
| 68 |
|
|
Certainly for the last three, having an alternate case would seem to be a |
| 69 |
|
|
mistake. I don't know any Greek, so cannot comment on the first one. |
| 70 |
|
|
|
| 71 |
|
|
|
| 72 |
|
|
When searching the tree, proceed as follows: |
| 73 |
|
|
|
| 74 |
|
|
(1) Start at the first node. |
| 75 |
|
|
|
| 76 |
|
|
(2) Extract the character value from f1 and the bottom 8 bits of f0; |
| 77 |
|
|
|
| 78 |
|
|
(3) Compare with the character being sought. If equal, we are done. |
| 79 |
|
|
|
| 80 |
|
|
(4) If the test character is smaller, inspect the f0_leftexists flag. If it is |
| 81 |
|
|
not set, the character is not in the tree. If it is set, move to the next |
| 82 |
|
|
node, and go to (2). |
| 83 |
|
|
|
| 84 |
|
|
(5) If the test character is bigger, extract the f2_rightmask bits from f2, and |
| 85 |
|
|
shift them right by f2_rightshift. If the result is zero, the character is |
| 86 |
|
|
not in the tree. Otherwise, calculate the number of nodes to skip by |
| 87 |
|
|
shifting the value 1 left by this number minus one. Go to (2). |
| 88 |
|
|
*/ |
| 89 |
|
|
|
| 90 |
|
|
|
| 91 |
|
|
/* End of internal.h */ |