| 408 |
|
|
| 409 |
/* When UTF-8 encoding is being used, a character is no longer just a single |
/* When UTF-8 encoding is being used, a character is no longer just a single |
| 410 |
byte. The macros for character handling generate simple sequences when used in |
byte. The macros for character handling generate simple sequences when used in |
| 411 |
byte-mode, and more complicated ones for UTF-8 characters. BACKCHAR should |
byte-mode, and more complicated ones for UTF-8 characters. GETCHARLENTEST is |
| 412 |
never be called in byte mode. To make sure it can never even appear when UTF-8 |
not used when UTF-8 is not supported, so it is not defined, and BACKCHAR should |
| 413 |
support is omitted, we don't even define it. */ |
never be called in byte mode. To make sure they can never even appear when |
| 414 |
|
UTF-8 support is omitted, we don't even define them. */ |
| 415 |
|
|
| 416 |
#ifndef SUPPORT_UTF8 |
#ifndef SUPPORT_UTF8 |
| 417 |
#define GETCHAR(c, eptr) c = *eptr; |
#define GETCHAR(c, eptr) c = *eptr; |
| 419 |
#define GETCHARINC(c, eptr) c = *eptr++; |
#define GETCHARINC(c, eptr) c = *eptr++; |
| 420 |
#define GETCHARINCTEST(c, eptr) c = *eptr++; |
#define GETCHARINCTEST(c, eptr) c = *eptr++; |
| 421 |
#define GETCHARLEN(c, eptr, len) c = *eptr; |
#define GETCHARLEN(c, eptr, len) c = *eptr; |
| 422 |
|
/* #define GETCHARLENTEST(c, eptr, len) */ |
| 423 |
/* #define BACKCHAR(eptr) */ |
/* #define BACKCHAR(eptr) */ |
| 424 |
|
|
| 425 |
#else /* SUPPORT_UTF8 */ |
#else /* SUPPORT_UTF8 */ |
| 426 |
|
|
| 427 |
|
/* These macros were originally written in the form of loops that used data |
| 428 |
|
from the tables whose names start with _pcre_utf8_table. They were rewritten by |
| 429 |
|
a user so as not to use loops, because in some environments this gives a |
| 430 |
|
significant performance advantage, and it seems never to do any harm. */ |
| 431 |
|
|
| 432 |
|
/* Base macro to pick up the remaining bytes of a UTF-8 character, not |
| 433 |
|
advancing the pointer. */ |
| 434 |
|
|
| 435 |
|
#define GETUTF8(c, eptr) \ |
| 436 |
|
{ \ |
| 437 |
|
if ((c & 0x20) == 0) \ |
| 438 |
|
c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \ |
| 439 |
|
else if ((c & 0x10) == 0) \ |
| 440 |
|
c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \ |
| 441 |
|
else if ((c & 0x08) == 0) \ |
| 442 |
|
c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \ |
| 443 |
|
((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \ |
| 444 |
|
else if ((c & 0x04) == 0) \ |
| 445 |
|
c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \ |
| 446 |
|
((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \ |
| 447 |
|
(eptr[4] & 0x3f); \ |
| 448 |
|
else \ |
| 449 |
|
c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \ |
| 450 |
|
((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \ |
| 451 |
|
((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \ |
| 452 |
|
} |
| 453 |
|
|
| 454 |
/* Get the next UTF-8 character, not advancing the pointer. This is called when |
/* Get the next UTF-8 character, not advancing the pointer. This is called when |
| 455 |
we know we are in UTF-8 mode. */ |
we know we are in UTF-8 mode. */ |
| 456 |
|
|
| 457 |
#define GETCHAR(c, eptr) \ |
#define GETCHAR(c, eptr) \ |
| 458 |
c = *eptr; \ |
c = *eptr; \ |
| 459 |
if (c >= 0xc0) \ |
if (c >= 0xc0) GETUTF8(c, eptr); |
|
{ \ |
|
|
int gcii; \ |
|
|
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ |
|
|
int gcss = 6*gcaa; \ |
|
|
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ |
|
|
for (gcii = 1; gcii <= gcaa; gcii++) \ |
|
|
{ \ |
|
|
gcss -= 6; \ |
|
|
c |= (eptr[gcii] & 0x3f) << gcss; \ |
|
|
} \ |
|
|
} |
|
| 460 |
|
|
| 461 |
/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the |
/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the |
| 462 |
pointer. */ |
pointer. */ |
| 463 |
|
|
| 464 |
#define GETCHARTEST(c, eptr) \ |
#define GETCHARTEST(c, eptr) \ |
| 465 |
c = *eptr; \ |
c = *eptr; \ |
| 466 |
if (utf8 && c >= 0xc0) \ |
if (utf8 && c >= 0xc0) GETUTF8(c, eptr); |
| 467 |
|
|
| 468 |
|
/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing |
| 469 |
|
the pointer. */ |
| 470 |
|
|
| 471 |
|
#define GETUTF8INC(c, eptr) \ |
| 472 |
{ \ |
{ \ |
| 473 |
int gcii; \ |
if ((c & 0x20) == 0) \ |
| 474 |
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ |
c = ((c & 0x1f) << 6) | (*eptr++ & 0x3f); \ |
| 475 |
int gcss = 6*gcaa; \ |
else if ((c & 0x10) == 0) \ |
| 476 |
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ |
{ \ |
| 477 |
for (gcii = 1; gcii <= gcaa; gcii++) \ |
c = ((c & 0x0f) << 12) | ((*eptr & 0x3f) << 6) | (eptr[1] & 0x3f); \ |
| 478 |
|
eptr += 2; \ |
| 479 |
|
} \ |
| 480 |
|
else if ((c & 0x08) == 0) \ |
| 481 |
|
{ \ |
| 482 |
|
c = ((c & 0x07) << 18) | ((*eptr & 0x3f) << 12) | \ |
| 483 |
|
((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \ |
| 484 |
|
eptr += 3; \ |
| 485 |
|
} \ |
| 486 |
|
else if ((c & 0x04) == 0) \ |
| 487 |
{ \ |
{ \ |
| 488 |
gcss -= 6; \ |
c = ((c & 0x03) << 24) | ((*eptr & 0x3f) << 18) | \ |
| 489 |
c |= (eptr[gcii] & 0x3f) << gcss; \ |
((eptr[1] & 0x3f) << 12) | ((eptr[2] & 0x3f) << 6) | \ |
| 490 |
|
(eptr[3] & 0x3f); \ |
| 491 |
|
eptr += 4; \ |
| 492 |
|
} \ |
| 493 |
|
else \ |
| 494 |
|
{ \ |
| 495 |
|
c = ((c & 0x01) << 30) | ((*eptr & 0x3f) << 24) | \ |
| 496 |
|
((eptr[1] & 0x3f) << 18) | ((eptr[2] & 0x3f) << 12) | \ |
| 497 |
|
((eptr[3] & 0x3f) << 6) | (eptr[4] & 0x3f); \ |
| 498 |
|
eptr += 5; \ |
| 499 |
} \ |
} \ |
| 500 |
} |
} |
| 501 |
|
|
| 504 |
|
|
| 505 |
#define GETCHARINC(c, eptr) \ |
#define GETCHARINC(c, eptr) \ |
| 506 |
c = *eptr++; \ |
c = *eptr++; \ |
| 507 |
if (c >= 0xc0) \ |
if (c >= 0xc0) GETUTF8INC(c, eptr); |
|
{ \ |
|
|
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ |
|
|
int gcss = 6*gcaa; \ |
|
|
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ |
|
|
while (gcaa-- > 0) \ |
|
|
{ \ |
|
|
gcss -= 6; \ |
|
|
c |= (*eptr++ & 0x3f) << gcss; \ |
|
|
} \ |
|
|
} |
|
| 508 |
|
|
| 509 |
/* Get the next character, testing for UTF-8 mode, and advancing the pointer. |
/* Get the next character, testing for UTF-8 mode, and advancing the pointer. |
| 510 |
This is called when we don't know if we are in UTF-8 mode. */ |
This is called when we don't know if we are in UTF-8 mode. */ |
| 511 |
|
|
| 512 |
#define GETCHARINCTEST(c, eptr) \ |
#define GETCHARINCTEST(c, eptr) \ |
| 513 |
c = *eptr++; \ |
c = *eptr++; \ |
| 514 |
if (utf8 && c >= 0xc0) \ |
if (utf8 && c >= 0xc0) GETUTF8INC(c, eptr); |
| 515 |
|
|
| 516 |
|
/* Base macro to pick up the remaining bytes of a UTF-8 character, not |
| 517 |
|
advancing the pointer, incrementing the length. */ |
| 518 |
|
|
| 519 |
|
#define GETUTF8LEN(c, eptr, len) \ |
| 520 |
{ \ |
{ \ |
| 521 |
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ |
if ((c & 0x20) == 0) \ |
| 522 |
int gcss = 6*gcaa; \ |
{ \ |
| 523 |
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ |
c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \ |
| 524 |
while (gcaa-- > 0) \ |
len++; \ |
| 525 |
|
} \ |
| 526 |
|
else if ((c & 0x10) == 0) \ |
| 527 |
{ \ |
{ \ |
| 528 |
gcss -= 6; \ |
c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \ |
| 529 |
c |= (*eptr++ & 0x3f) << gcss; \ |
len += 2; \ |
| 530 |
|
} \ |
| 531 |
|
else if ((c & 0x08) == 0) \ |
| 532 |
|
{\ |
| 533 |
|
c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \ |
| 534 |
|
((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \ |
| 535 |
|
len += 3; \ |
| 536 |
|
} \ |
| 537 |
|
else if ((c & 0x04) == 0) \ |
| 538 |
|
{ \ |
| 539 |
|
c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \ |
| 540 |
|
((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \ |
| 541 |
|
(eptr[4] & 0x3f); \ |
| 542 |
|
len += 4; \ |
| 543 |
|
} \ |
| 544 |
|
else \ |
| 545 |
|
{\ |
| 546 |
|
c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \ |
| 547 |
|
((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \ |
| 548 |
|
((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \ |
| 549 |
|
len += 5; \ |
| 550 |
} \ |
} \ |
| 551 |
} |
} |
| 552 |
|
|
| 555 |
|
|
| 556 |
#define GETCHARLEN(c, eptr, len) \ |
#define GETCHARLEN(c, eptr, len) \ |
| 557 |
c = *eptr; \ |
c = *eptr; \ |
| 558 |
if (c >= 0xc0) \ |
if (c >= 0xc0) GETUTF8LEN(c, eptr, len); |
|
{ \ |
|
|
int gcii; \ |
|
|
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ |
|
|
int gcss = 6*gcaa; \ |
|
|
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ |
|
|
for (gcii = 1; gcii <= gcaa; gcii++) \ |
|
|
{ \ |
|
|
gcss -= 6; \ |
|
|
c |= (eptr[gcii] & 0x3f) << gcss; \ |
|
|
} \ |
|
|
len += gcaa; \ |
|
|
} |
|
| 559 |
|
|
| 560 |
/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the |
/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the |
| 561 |
pointer, incrementing length if there are extra bytes. This is called when we |
pointer, incrementing length if there are extra bytes. This is called when we |
| 563 |
|
|
| 564 |
#define GETCHARLENTEST(c, eptr, len) \ |
#define GETCHARLENTEST(c, eptr, len) \ |
| 565 |
c = *eptr; \ |
c = *eptr; \ |
| 566 |
if (utf8 && c >= 0xc0) \ |
if (utf8 && c >= 0xc0) GETUTF8LEN(c, eptr, len); |
|
{ \ |
|
|
int gcii; \ |
|
|
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ |
|
|
int gcss = 6*gcaa; \ |
|
|
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \ |
|
|
for (gcii = 1; gcii <= gcaa; gcii++) \ |
|
|
{ \ |
|
|
gcss -= 6; \ |
|
|
c |= (eptr[gcii] & 0x3f) << gcss; \ |
|
|
} \ |
|
|
len += gcaa; \ |
|
|
} |
|
| 567 |
|
|
| 568 |
/* If the pointer is not at the start of a character, move it back until |
/* If the pointer is not at the start of a character, move it back until |
| 569 |
it is. This is called only in UTF-8 mode - we don't put a test within the macro |
it is. This is called only in UTF-8 mode - we don't put a test within the macro |
| 571 |
|
|
| 572 |
#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr-- |
#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr-- |
| 573 |
|
|
| 574 |
#endif |
#endif /* SUPPORT_UTF8 */ |
| 575 |
|
|
| 576 |
|
|
| 577 |
/* In case there is no definition of offsetof() provided - though any proper |
/* In case there is no definition of offsetof() provided - though any proper |