| 1 |
/************************************************* |
/************************************************* |
| 2 |
* libucp - Unicode Property Table handler * |
* Unicode Property Table handler * |
| 3 |
*************************************************/ |
*************************************************/ |
| 4 |
|
|
| 5 |
/* Internal header file defining the layout of compact nodes in the tree. */ |
/* Internal header file defining the layout of the bits in each pair of 32-bit |
| 6 |
|
words that form a data item in the table. */ |
| 7 |
|
|
| 8 |
typedef struct cnode { |
typedef struct cnode { |
| 9 |
unsigned short int f0; |
pcre_uint32 f0; |
| 10 |
unsigned short int f1; |
pcre_uint32 f1; |
|
unsigned short int f2; |
|
| 11 |
} cnode; |
} cnode; |
| 12 |
|
|
| 13 |
/* Things for the f0 field */ |
/* Things for the f0 field */ |
| 14 |
|
|
| 15 |
#define f0_leftexists 0x8000 /* Left child exists */ |
#define f0_scriptmask 0xff000000 /* Mask for script field */ |
| 16 |
#define f0_typemask 0x3f00 /* Type bits */ |
#define f0_scriptshift 24 /* Shift for script value */ |
| 17 |
#define f0_typeshift 8 /* Type shift */ |
#define f0_rangeflag 0x00f00000 /* Flag for a range item */ |
| 18 |
#define f0_chhmask 0x00ff /* Character high bits */ |
#define f0_charmask 0x001fffff /* Mask for code point value */ |
| 19 |
|
|
| 20 |
/* Things for the f2 field */ |
/* Things for the f1 field */ |
| 21 |
|
|
| 22 |
#define f2_rightmask 0xf000 /* Mask for right offset bits */ |
#define f1_typemask 0xfc000000 /* Mask for char type field */ |
| 23 |
#define f2_rightshift 12 /* Shift for right offset */ |
#define f1_typeshift 26 /* Shift for the type field */ |
| 24 |
#define f2_casemask 0x0fff /* Mask for case offset */ |
#define f1_rangemask 0x0000ffff /* Mask for a range offset */ |
| 25 |
|
#define f1_casemask 0x0000ffff /* Mask for a case offset */ |
| 26 |
/* The tree consists of a vector of structures of type cnode, with the root |
#define f1_caseneg 0xffff8000 /* Bits for negation */ |
| 27 |
node as the first element. The three short ints (16 bits each) are used as follows: |
|
| 28 |
|
/* The data consists of a vector of structures of type cnode. The two unsigned |
| 29 |
(f0) (1) The 0x8000 bit of f0 is set if a left child exists. The child's node |
32-bit integers are used as follows: |
| 30 |
is the next node in the vector. |
|
| 31 |
(2) The 0x4000 bit of f0 is spare. |
(f0) (1) The most significant byte holds the script number. The numbers are |
| 32 |
(3) The 0x3f00 bits of f0 contain the character type; this is a number |
defined by the enum in ucp.h. |
| 33 |
defined by the enumeration in ucp.h (e.g. ucp_Lu). |
|
| 34 |
(4) The bottom 8 bits of f0 contain the most significant byte of the |
(2) The 0x00800000 bit is set if this entry defines a range of characters. |
| 35 |
character's 24-bit codepoint. |
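It is not set if this entry defines a single character. NOTE(review): f0_rangeflag is defined as 0x00f00000, which is wider than this one bit and overlaps the top bit of f0_charmask - confirm which is intended. |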
| 36 |
|
|
| 37 |
(f1) (1) The f1 field contains the two least significant bytes of the |
(3) The 0x00600000 bits are spare. |
| 38 |
codepoint. |
|
| 39 |
|
(4) The 0x001fffff bits contain the code point. No Unicode code point will |
| 40 |
(f2) (1) The 0xf000 bits of f2 contain zero if there is no right child of this |
ever be greater than 0x0010ffff, so this should be OK for ever. |
| 41 |
node. Otherwise, they contain one plus the exponent of the power of |
|
| 42 |
two of the offset to the right node (e.g. a value of 3 means 8). The |
(f1) (1) The 0xfc000000 bits contain the character type number. The numbers are |
| 43 |
units of the offset are node items. |
defined by an enum in ucp.h. |
| 44 |
|
|
| 45 |
(2) The 0x0fff bits of f2 contain the signed offset from this character to |
(2) The 0x03ff0000 bits are spare. |
| 46 |
its alternate cased value. They are zero if there is no such |
|
| 47 |
character. |
(3) The 0x0000ffff bits contain EITHER the unsigned offset to the top of |
| 48 |
|
range if this entry defines a range, OR the *signed* offset to the |
| 49 |
|
character's "other case" partner if this entry defines a single |
| 50 |
----------------------------------------------------------------------------- |
character. There is no partner if the value is zero. |
| 51 |
||.|.| type (6) | ms char (8) || ls char (16) ||....| case offset (12) || |
|
| 52 |
----------------------------------------------------------------------------- |
------------------------------------------------------------------------------- |
| 53 |
| | | |
| script (8) |.|.|.| codepoint (21) || type (6) |.|.| spare (8) | offset (16) | |
| 54 |
| |-> spare | |
------------------------------------------------------------------------------- |
| 55 |
| exponent of right |
| | | | | |
| 56 |
|-> left child exists child offset |
| | |-> spare | |-> spare |
| 57 |
|
| | | |
| 58 |
|
| |-> spare |-> spare |
| 59 |
|
| |
| 60 |
|
|-> range flag |
| 61 |
|
|
| 62 |
The upper/lower casing information is set only for characters that come in |
The upper/lower casing information is set only for characters that come in |
| 63 |
pairs. There are (at present) four non-one-to-one mappings in the Unicode data. |
pairs. The non-one-to-one mappings in the Unicode data are ignored. |
|
These are ignored. They are: |
|
|
|
|
|
1FBE Greek Prosgegrammeni (lower, with upper -> capital iota) |
|
|
2126 Ohm |
|
|
212A Kelvin |
|
|
212B Angstrom |
|
| 64 |
|
|
| 65 |
Certainly for the last three, having an alternate case would seem to be a |
When searching the data, proceed as follows: |
|
mistake. I don't know any Greek, so cannot comment on the first one. |
|
| 66 |
|
|
| 67 |
|
(1) Set up for a binary chop search. |
| 68 |
|
|
| 69 |
When searching the tree, proceed as follows: |
(2) If the top is not greater than the bottom, the character is not in the |
| 70 |
|
table. Its type must therefore be "Cn" ("Undefined"). |
| 71 |
|
|
| 72 |
(1) Start at the first node. |
(3) Find the middle vector element. |
| 73 |
|
|
| 74 |
(2) Extract the character value from f1 and the bottom 8 bits of f0; |
(4) Extract the code point and compare. If equal, we are done. |
| 75 |
|
|
| 76 |
(3) Compare with the character being sought. If equal, we are done. |
(5) If the test character is smaller, set the top to the current point, and |
| 77 |
|
go to (2). |
| 78 |
|
|
| 79 |
(4) If the test character is smaller, inspect the f0_leftexists flag. If it is |
(6) If the current entry defines a range, compute the last character by adding |
| 80 |
not set, the character is not in the tree. If it is set, move to the next |
the offset, and see if the test character is within the range. If it is, |
| 81 |
node, and go to (2). |
we are done. |
| 82 |
|
|
| 83 |
(5) If the test character is bigger, extract the f2_rightmask bits from f2, and |
(7) Otherwise, set the bottom to one element past the current point and go to |
| 84 |
shift them right by f2_rightshift. If the result is zero, the character is |
(2). |
|
not in the tree. Otherwise, calculate the number of nodes to skip by |
|
|
shifting the value 1 left by this number minus one. Go to (2). |
|
| 85 |
*/ |
*/ |
| 86 |
|
|
| 87 |
|
/* End of ucpinternal.h */ |
|
/* End of internal.h */ |
|