| 1 |
nigel |
75 |
/************************************************* |
| 2 |
nigel |
87 |
* Unicode Property Table handler * |
| 3 |
nigel |
75 |
*************************************************/ |
| 4 |
|
|
|
| 5 |
nigel |
93 |
#ifndef _UCPINTERNAL_H |
| 6 |
|
|
#define _UCPINTERNAL_H |
| 7 |
|
|
|
| 8 |
nigel |
87 |
/* Internal header file defining the layout of the bits in each pair of 32-bit |
| 9 |
|
|
words that form a data item in the table. */ |
| 10 |
nigel |
75 |
|
| 11 |
|
|
typedef struct cnode { /* One data item of the table: a pair of 32-bit words */ |
| 12 |
nigel |
87 |
pcre_uint32 f0; /* Script number, range flag and code point; decode with the f0_* macros */ |
| 13 |
|
|
pcre_uint32 f1; /* Character type and 16-bit range/case offset; decode with the f1_* macros */ |
| 14 |
nigel |
75 |
} cnode; |
| 15 |
|
|
|
| 16 |
|
|
/* Things for the f0 field */ |
| 17 |
|
|
|
| 18 |
nigel |
87 |
#define f0_scriptmask 0xff000000 /* Mask for the script field (most significant 8 bits) */ |
| 19 |
|
|
#define f0_scriptshift 24 /* Shift count that aligns the script value with bit 0 */ |
| 20 |
ph10 |
346 |
#define f0_rangeflag 0x00800000 /* Flag bit: set when the item defines a range of characters */ |
| 21 |
nigel |
87 |
#define f0_charmask 0x001fffff /* Mask for the code point value (least significant 21 bits) */ |
| 22 |
nigel |
75 |
|
| 23 |
nigel |
87 |
/* Things for the f1 field */ |
| 24 |
nigel |
75 |
|
| 25 |
nigel |
87 |
#define f1_typemask 0xfc000000 /* Mask for the character type field (most significant 6 bits) */ |
| 26 |
|
|
#define f1_typeshift 26 /* Shift count that aligns the type value with bit 0 */ |
| 27 |
|
|
#define f1_rangemask 0x0000ffff /* Mask for the unsigned offset to the top of a range (range items) */ |
| 28 |
|
|
#define f1_casemask 0x0000ffff /* Mask for the signed "other case" offset (single-character items) */ |
| 29 |
|
|
#define f1_caseneg 0xffff8000 /* Bit 15 (sign bit of the 16-bit case offset) plus the upper 16 bits; presumably used to detect and sign-extend a negative offset - confirm against the lookup code */ |
| 30 |
nigel |
75 |
|
| 31 |
nigel |
87 |
/* The data consists of a vector of structures of type cnode. The two unsigned |
| 32 |
|
|
32-bit integers are used as follows: |
| 33 |
nigel |
75 |
|
| 34 |
nigel |
87 |
(f0) (1) The most significant byte holds the script number. The numbers are |
| 35 |
|
|
defined by the enum in ucp.h. |
| 36 |
nigel |
75 |
|
| 37 |
nigel |
87 |
(2) The 0x00800000 bit is set if this entry defines a range of characters. |
| 38 |
|
|
It is not set if this entry defines a single character. |
| 39 |
nigel |
75 |
|
| 40 |
nigel |
87 |
(3) The 0x00600000 bits are spare. |
| 41 |
nigel |
75 |
|
| 42 |
nigel |
87 |
(4) The 0x001fffff bits contain the code point. No Unicode code point will |
| 43 |
|
|
ever be greater than 0x0010ffff, so this should be OK for ever. |
| 44 |
nigel |
75 |
|
| 45 |
nigel |
87 |
(f1) (1) The 0xfc000000 bits contain the character type number. The numbers are |
| 46 |
|
|
defined by an enum in ucp.h. |
| 47 |
nigel |
75 |
|
| 48 |
nigel |
87 |
(2) The 0x03ff0000 bits are spare. |
| 49 |
nigel |
75 |
|
| 50 |
nigel |
87 |
(3) The 0x0000ffff bits contain EITHER the unsigned offset to the top of |
| 51 |
|
|
range if this entry defines a range, OR the *signed* offset to the |
| 52 |
|
|
character's "other case" partner if this entry defines a single |
| 53 |
|
|
character. There is no partner if the value is zero. |
| 54 |
nigel |
75 |
|
| 55 |
nigel |
87 |
------------------------------------------------------------------------------- |
| 56 |
|
|
| script (8) |.|.|.| codepoint (21) || type (6) |.|.| spare (8) | offset (16) | |
| 57 |
|
|
------------------------------------------------------------------------------- |
| 58 |
|
|
              | | |                              | | |
| 59 |
|
|
              | | |-> spare                      | |-> spare |
| 60 |
|
|
              | |                                | |
| 61 |
|
|
              | |-> spare                        |-> spare |
| 62 |
|
|
              | |
| 63 |
|
|
              |-> range flag |
| 64 |
|
|
|
| 65 |
nigel |
75 |
The upper/lower casing information is set only for characters that come in |
| 66 |
nigel |
87 |
pairs. The non-one-to-one mappings in the Unicode data are ignored. |
| 67 |
nigel |
75 |
|
| 68 |
nigel |
87 |
When searching the data, proceed as follows: |
| 69 |
nigel |
75 |
|
| 70 |
nigel |
87 |
(1) Set up for a binary chop search. |
| 71 |
nigel |
75 |
|
| 72 |
nigel |
87 |
(2) If the top is not greater than the bottom, the character is not in the |
| 73 |
|
|
table. Its type must therefore be "Cn" (the Unicode general category "Unassigned"). |
| 74 |
nigel |
75 |
|
| 75 |
nigel |
87 |
(3) Find the middle vector element. |
| 76 |
nigel |
75 |
|
| 77 |
nigel |
87 |
(4) Extract the code point and compare. If equal, we are done. |
| 78 |
nigel |
75 |
|
| 79 |
nigel |
87 |
(5) If the test character is smaller, set the top to the current point, and |
| 80 |
|
|
goto (2). |
| 81 |
nigel |
75 |
|
| 82 |
nigel |
87 |
(6) If the current entry defines a range, compute the last character by adding |
| 83 |
|
|
the offset, and see if the test character is within the range. If it is, |
| 84 |
|
|
we are done. |
| 85 |
nigel |
75 |
|
| 86 |
nigel |
87 |
(7) Otherwise, set the bottom to one element past the current point and goto |
| 87 |
|
|
(2). |
| 88 |
nigel |
75 |
*/ |
| 89 |
|
|
|
| 90 |
nigel |
93 |
#endif /* _UCPINTERNAL_H */ |
| 91 |
|
|
|
| 92 |
nigel |
87 |
/* End of ucpinternal.h */ |