| 9 |
|
|
| 10 |
Written by: Philip Hazel <ph10@cam.ac.uk> |
Written by: Philip Hazel <ph10@cam.ac.uk> |
| 11 |
|
|
| 12 |
Copyright (c) 1997-2002 University of Cambridge |
Copyright (c) 1997-2003 University of Cambridge |
| 13 |
|
|
| 14 |
----------------------------------------------------------------------------- |
----------------------------------------------------------------------------- |
| 15 |
Permission is granted to anyone to use this software for any purpose on any |
Permission is granted to anyone to use this software for any purpose on any |
| 297 |
/* Character class where all the information is in a bit map: set the |
/* Character class where all the information is in a bit map: set the |
| 298 |
bits and either carry on or not, according to the repeat count. If it was |
bits and either carry on or not, according to the repeat count. If it was |
| 299 |
a negative class, and we are operating with UTF-8 characters, any byte |
a negative class, and we are operating with UTF-8 characters, any byte |
| 300 |
with the top-bit set is a potentially valid starter because it may start |
with a value >= 0xc4 is a potentially valid starter because it starts a |
| 301 |
a character with a value > 255. (This is sub-optimal in that the |
character with a value > 255. */ |
|
character may be in the range 128-255, and those characters might be |
|
|
unwanted, but that's as far as we go for the moment.) */ |
|
| 302 |
|
|
| 303 |
case OP_NCLASS: |
case OP_NCLASS: |
| 304 |
if (utf8) memset(start_bits+16, 0xff, 16); |
if (utf8) |
| 305 |
|
{ |
| 306 |
|
start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc7 */ |
| 307 |
|
memset(start_bits+25, 0xff, 7); /* Bits for 0xc8 - 0xff */ |
| 308 |
|
} |
| 309 |
/* Fall through */ |
/* Fall through */ |
| 310 |
|
|
| 311 |
case OP_CLASS: |
case OP_CLASS: |
| 312 |
{ |
{ |
| 313 |
tcode++; |
tcode++; |
| 314 |
for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; |
|
| 315 |
|
/* In UTF-8 mode, the bits in a bit map correspond to character |
| 316 |
|
values, not to byte values. However, the bit map we are constructing is |
| 317 |
|
for byte values. So we have to do a conversion for characters whose |
| 318 |
|
value is > 127. In fact, there are only two possible starting bytes for |
| 319 |
|
characters in the range 128 - 255. */ |
| 320 |
|
|
| 321 |
|
if (utf8) |
| 322 |
|
{ |
| 323 |
|
for (c = 0; c < 16; c++) start_bits[c] |= tcode[c]; |
| 324 |
|
for (c = 128; c < 256; c++) |
| 325 |
|
{ |
| 326 |
|
if ((tcode[c/8] && (1 << (c&7))) != 0) |
| 327 |
|
{ |
| 328 |
|
int d = (c >> 6) | 0xc0; /* Set bit for this starter */ |
| 329 |
|
start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */ |
| 330 |
|
c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */ |
| 331 |
|
} |
| 332 |
|
} |
| 333 |
|
} |
| 334 |
|
|
| 335 |
|
/* In non-UTF-8 mode, the two bit maps are completely compatible. */ |
| 336 |
|
|
| 337 |
|
else |
| 338 |
|
{ |
| 339 |
|
for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; |
| 340 |
|
} |
| 341 |
|
|
| 342 |
|
/* Advance past the bit map, and act on what follows */ |
| 343 |
|
|
| 344 |
tcode += 32; |
tcode += 32; |
| 345 |
switch (*tcode) |
switch (*tcode) |
| 346 |
{ |
{ |