/[pcre]/code/trunk/ucpinternal.h
ViewVC logotype

Diff of /code/trunk/ucpinternal.h

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 75 by nigel, Sat Feb 24 21:40:37 2007 UTC revision 87 by nigel, Sat Feb 24 21:41:21 2007 UTC
# Line 1  Line 1 
1  /*************************************************  /*************************************************
2  *     libucp - Unicode Property Table handler    *  *           Unicode Property Table handler       *
3  *************************************************/  *************************************************/
4    
5  /* Internal header file defining the layout of compact nodes in the tree. */  /* Internal header file defining the layout of the bits in each pair of 32-bit
6    words that form a data item in the table. */
7    
8  typedef struct cnode {  typedef struct cnode {
9    unsigned short int f0;    pcre_uint32 f0;
10    unsigned short int f1;    pcre_uint32 f1;
   unsigned short int f2;  
11  } cnode;  } cnode;
12    
13  /* Things for the f0 field */  /* Things for the f0 field */
14    
15  #define f0_leftexists   0x8000    /* Left child exists */  #define f0_scriptmask   0xff000000  /* Mask for script field */
16  #define f0_typemask     0x3f00    /* Type bits */  #define f0_scriptshift          24  /* Shift for script value */
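17  #define f0_typeshift         8    /* Type shift */  #define f0_rangeflag    0x00f00000  /* Flag for a range item; NOTE(review): the layout description below gives the range flag as the 0x00800000 bit, and 0x00f00000 overlaps f0_charmask at 0x00100000 - confirm intended value */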
18  #define f0_chhmask      0x00ff    /* Character high bits */  #define f0_charmask     0x001fffff  /* Mask for code point value */
19    
20  /* Things for the f2 field */  /* Things for the f1 field */
21    
22  #define f2_rightmask    0xf000    /* Mask for right offset bits */  #define f1_typemask     0xfc000000  /* Mask for char type field */
23  #define f2_rightshift       12    /* Shift for right offset */  #define f1_typeshift            26  /* Shift for the type field */
24  #define f2_casemask     0x0fff    /* Mask for case offset */  #define f1_rangemask    0x0000ffff  /* Mask for a range offset */
25    #define f1_casemask     0x0000ffff  /* Mask for a case offset */
26  /* The tree consists of a vector of structures of type cnode, with the root  #define f1_caseneg      0xffff8000  /* Bits for negation */
27  node as the first element. The three short ints (16-bits) are used as follows:  
28    /* The data consists of a vector of structures of type cnode. The two unsigned
29  (f0) (1) The 0x8000 bit of f0 is set if a left child exists. The child's node  32-bit integers are used as follows:
30           is the next node in the vector.  
31       (2) The 0x4000 bit of f0 is spare.  (f0) (1) The most significant byte holds the script number. The numbers are
32       (3) The 0x3f00 bits of f0 contain the character type; this is a number           defined by the enum in ucp.h.
33           defined by the enumeration in ucp.h (e.g. ucp_Lu).  
34       (4) The bottom 8 bits of f0 contain the most significant byte of the       (2) The 0x00800000 bit is set if this entry defines a range of characters.
35           character's 24-bit codepoint.           It is not set if this entry defines a single character.
36    
37  (f1) (1) The f1 field contains the two least significant bytes of the       (3) The 0x00600000 bits are spare.
38           codepoint.  
39         (4) The 0x001fffff bits contain the code point. No Unicode code point will
40  (f2) (1) The 0xf000 bits of f2 contain zero if there is no right child of this           ever be greater than 0x0010ffff, so this should be OK for ever.
41           node. Otherwise, they contain one plus the exponent of the power of  
42           two of the offset to the right node (e.g. a value of 3 means 8). The  (f1) (1) The 0xfc000000 bits contain the character type number. The numbers are
43           units of the offset are node items.           defined by an enum in ucp.h.
44    
45       (2) The 0x0fff bits of f2 contain the signed offset from this character to       (2) The 0x03ff0000 bits are spare.
46           its alternate cased value. They are zero if there is no such  
47           character.       (3) The 0x0000ffff bits contain EITHER the unsigned offset to the top of
48             range if this entry defines a range, OR the *signed* offset to the
49             character's "other case" partner if this entry defines a single
50  -----------------------------------------------------------------------------           character. There is no partner if the value is zero.
51  ||.|.| type (6) | ms char (8) ||  ls char (16)  ||....|  case offset (12)  ||  
52  -----------------------------------------------------------------------------  -------------------------------------------------------------------------------
53    | |                                              |  | script (8) |.|.|.| codepoint (21) || type (6) |.|.| spare (8) | offset (16) |
54    | |-> spare                                      |  -------------------------------------------------------------------------------
55    |                                        exponent of right                | | |                              | |
56    |-> left child exists                       child offset                | | |-> spare                      | |-> spare
57                  | |                                |
58                  | |-> spare                        |-> spare
59                  |
60                  |-> range flag
61    
62  The upper/lower casing information is set only for characters that come in  The upper/lower casing information is set only for characters that come in
63  pairs. There are (at present) four non-one-to-one mappings in the Unicode data.  pairs. The non-one-to-one mappings in the Unicode data are ignored.
 These are ignored. They are:  
   
   1FBE Greek Prosgegrammeni (lower, with upper -> capital iota)  
   2126 Ohm  
   212A Kelvin  
   212B Angstrom  
64    
65  Certainly for the last three, having an alternate case would seem to be a  When searching the data, proceed as follows:
 mistake. I don't know any Greek, so cannot comment on the first one.  
66    
67    (1) Set up for a binary chop search.
68    
69  When searching the tree, proceed as follows:  (2) If the top is not greater than the bottom, the character is not in the
70        table. Its type must therefore be "Cn" ("Unassigned").
71    
72  (1) Start at the first node.  (3) Find the middle vector element.
73    
74  (2) Extract the character value from f1 and the bottom 8 bits of f0;  (4) Extract the code point and compare. If equal, we are done.
75    
76  (3) Compare with the character being sought. If equal, we are done.  (5) If the test character is smaller, set the top to the current point, and
77        goto (2).
78    
79  (4) If the test character is smaller, inspect the f0_leftexists flag. If it is  (6) If the current entry defines a range, compute the last character by adding
80      not set, the character is not in the tree. If it is set, move to the next      the offset, and see if the test character is within the range. If it is,
81      node, and go to (2).      we are done.
82    
83  (5) If the test character is bigger, extract the f2_rightmask bits from f2, and  (7) Otherwise, set the bottom to one element past the current point and goto
84      shift them right by f2_rightshift. If the result is zero, the character is      (2).
     not in the tree. Otherwise, calculate the number of nodes to skip by  
     shifting the value 1 left by this number minus one. Go to (2).  
85  */  */
86    
87    /* End of ucpinternal.h */
 /* End of internal.h */  

Legend:
Removed from v.75  
changed lines
  Added in v.87

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12