| 6 |
and semantics are as close as possible to those of the Perl 5 language. |
and semantics are as close as possible to those of the Perl 5 language. |
| 7 |
|
|
| 8 |
Written by Philip Hazel |
Written by Philip Hazel |
| 9 |
Copyright (c) 1997-2005 University of Cambridge |
Copyright (c) 1997-2006 University of Cambridge |
| 10 |
|
|
| 11 |
----------------------------------------------------------------------------- |
----------------------------------------------------------------------------- |
| 12 |
Redistribution and use in source and binary forms, with or without |
Redistribution and use in source and binary forms, with or without |
| 116 |
|
|
| 117 |
|
|
| 118 |
/* Tables of names of POSIX character classes and their lengths. The list is |
/* Tables of names of POSIX character classes and their lengths. The list is |
| 119 |
terminated by a zero length entry. The first three must be alpha, upper, lower, |
terminated by a zero length entry. The first three must be alpha, lower, upper, |
| 120 |
as this is assumed for handling case independence. */ |
as this is assumed for handling case independence. */ |
| 121 |
|
|
| 122 |
static const char *const posix_names[] = { |
static const char *const posix_names[] = { |
| 127 |
static const uschar posix_name_lengths[] = { |
static const uschar posix_name_lengths[] = { |
| 128 |
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; |
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; |
| 129 |
|
|
| 130 |
/* Table of class bit maps for each POSIX class; up to three may be combined |
/* Table of class bit maps for each POSIX class. Each class is formed from a |
| 131 |
to form the class. The table for [:blank:] is dynamically modified to remove |
base map, with an optional addition or removal of another map. Then, for some |
| 132 |
the vertical space characters. */ |
classes, there is some additional tweaking: for [:blank:] the vertical space |
| 133 |
|
characters are removed, and for [:alpha:] and [:alnum:] the underscore |
| 134 |
|
character is removed. The triples in the table consist of the base map offset, |
| 135 |
|
second map offset or -1 if no second map, and a non-negative value for map |
| 136 |
|
addition or a negative value for map subtraction (if there are two maps). The |
| 137 |
|
absolute value of the third field has these meanings: 0 => no tweaking, 1 => |
| 138 |
|
remove vertical space characters, 2 => remove underscore. */ |
| 139 |
|
|
| 140 |
static const int posix_class_maps[] = { |
static const int posix_class_maps[] = { |
| 141 |
cbit_lower, cbit_upper, -1, /* alpha */ |
cbit_word, cbit_digit, -2, /* alpha */ |
| 142 |
cbit_lower, -1, -1, /* lower */ |
cbit_lower, -1, 0, /* lower */ |
| 143 |
cbit_upper, -1, -1, /* upper */ |
cbit_upper, -1, 0, /* upper */ |
| 144 |
cbit_digit, cbit_lower, cbit_upper, /* alnum */ |
cbit_word, -1, 2, /* alnum - word without underscore */ |
| 145 |
cbit_print, cbit_cntrl, -1, /* ascii */ |
cbit_print, cbit_cntrl, 0, /* ascii */ |
| 146 |
cbit_space, -1, -1, /* blank - a GNU extension */ |
cbit_space, -1, 1, /* blank - a GNU extension */ |
| 147 |
cbit_cntrl, -1, -1, /* cntrl */ |
cbit_cntrl, -1, 0, /* cntrl */ |
| 148 |
cbit_digit, -1, -1, /* digit */ |
cbit_digit, -1, 0, /* digit */ |
| 149 |
cbit_graph, -1, -1, /* graph */ |
cbit_graph, -1, 0, /* graph */ |
| 150 |
cbit_print, -1, -1, /* print */ |
cbit_print, -1, 0, /* print */ |
| 151 |
cbit_punct, -1, -1, /* punct */ |
cbit_punct, -1, 0, /* punct */ |
| 152 |
cbit_space, -1, -1, /* space */ |
cbit_space, -1, 0, /* space */ |
| 153 |
cbit_word, -1, -1, /* word - a Perl extension */ |
cbit_word, -1, 0, /* word - a Perl extension */ |
| 154 |
cbit_xdigit,-1, -1 /* xdigit */ |
cbit_xdigit,-1, 0 /* xdigit */ |
| 155 |
}; |
}; |
| 156 |
|
|
| 157 |
|
|
| 377 |
check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount, |
check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount, |
| 378 |
int options, BOOL isclass) |
int options, BOOL isclass) |
| 379 |
{ |
{ |
| 380 |
const uschar *ptr = *ptrptr; |
BOOL utf8 = (options & PCRE_UTF8) != 0; |
| 381 |
|
const uschar *ptr = *ptrptr + 1; |
| 382 |
int c, i; |
int c, i; |
| 383 |
|
|
| 384 |
|
GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ |
| 385 |
|
ptr--; /* Set pointer back to the last byte */ |
| 386 |
|
|
| 387 |
/* If backslash is at the end of the pattern, it's an error. */ |
/* If backslash is at the end of the pattern, it's an error. */ |
| 388 |
|
|
|
c = *(++ptr); |
|
| 389 |
if (c == 0) *errorcodeptr = ERR1; |
if (c == 0) *errorcodeptr = ERR1; |
| 390 |
|
|
| 391 |
/* Non-alphamerics are literals. For digits or letters, do an initial lookup in |
/* Non-alphamerics are literals. For digits or letters, do an initial lookup in |
| 469 |
c &= 255; /* Take least significant 8 bits */ |
c &= 255; /* Take least significant 8 bits */ |
| 470 |
break; |
break; |
| 471 |
|
|
| 472 |
/* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number |
/* \x is complicated. \x{ddd} is a character number which can be greater |
| 473 |
which can be greater than 0xff, but only if the ddd are hex digits. */ |
than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is |
| 474 |
|
treated as a data character. */ |
| 475 |
|
|
| 476 |
case 'x': |
case 'x': |
| 477 |
#ifdef SUPPORT_UTF8 |
if (ptr[1] == '{') |
|
if (ptr[1] == '{' && (options & PCRE_UTF8) != 0) |
|
| 478 |
{ |
{ |
| 479 |
const uschar *pt = ptr + 2; |
const uschar *pt = ptr + 2; |
| 480 |
register int count = 0; |
int count = 0; |
| 481 |
|
|
| 482 |
c = 0; |
c = 0; |
| 483 |
while ((digitab[*pt] & ctype_xdigit) != 0) |
while ((digitab[*pt] & ctype_xdigit) != 0) |
| 484 |
{ |
{ |
| 485 |
int cc = *pt++; |
register int cc = *pt++; |
| 486 |
|
if (c == 0 && cc == '0') continue; /* Leading zeroes */ |
| 487 |
count++; |
count++; |
| 488 |
|
|
| 489 |
#if !EBCDIC /* ASCII coding */ |
#if !EBCDIC /* ASCII coding */ |
| 490 |
if (cc >= 'a') cc -= 32; /* Convert to upper case */ |
if (cc >= 'a') cc -= 32; /* Convert to upper case */ |
| 491 |
c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10)); |
c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10)); |
| 492 |
#else /* EBCDIC coding */ |
#else /* EBCDIC coding */ |
| 493 |
if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */ |
if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */ |
| 494 |
c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10)); |
c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10)); |
| 495 |
#endif |
#endif |
| 496 |
} |
} |
| 497 |
|
|
| 498 |
if (*pt == '}') |
if (*pt == '}') |
| 499 |
{ |
{ |
| 500 |
if (c < 0 || count > 8) *errorcodeptr = ERR34; |
if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34; |
| 501 |
ptr = pt; |
ptr = pt; |
| 502 |
break; |
break; |
| 503 |
} |
} |
| 504 |
|
|
| 505 |
/* If the sequence of hex digits does not end with '}', then we don't |
/* If the sequence of hex digits does not end with '}', then we don't |
| 506 |
recognize this construct; fall through to the normal \x handling. */ |
recognize this construct; fall through to the normal \x handling. */ |
| 507 |
} |
} |
|
#endif |
|
| 508 |
|
|
| 509 |
/* Read just a single hex char */ |
/* Read just a single-byte hex-defined char */ |
| 510 |
|
|
| 511 |
c = 0; |
c = 0; |
| 512 |
while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0) |
while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0) |
| 582 |
Argument: |
Argument: |
| 583 |
ptrptr points to the pattern position pointer |
ptrptr points to the pattern position pointer |
| 584 |
negptr points to a boolean that is set TRUE for negation else FALSE |
negptr points to a boolean that is set TRUE for negation else FALSE |
| 585 |
|
dptr points to an int that is set to the detailed property value |
| 586 |
errorcodeptr points to the error code variable |
errorcodeptr points to the error code variable |
| 587 |
|
|
| 588 |
Returns: value from ucp_type_table, or -1 for an invalid type |
Returns: type value from ucp_type_table, or -1 for an invalid type |
| 589 |
*/ |
*/ |
| 590 |
|
|
| 591 |
static int |
static int |
| 592 |
get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr) |
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr) |
| 593 |
{ |
{ |
| 594 |
int c, i, bot, top; |
int c, i, bot, top; |
| 595 |
const uschar *ptr = *ptrptr; |
const uschar *ptr = *ptrptr; |
| 596 |
char name[4]; |
char name[32]; |
| 597 |
|
|
| 598 |
c = *(++ptr); |
c = *(++ptr); |
| 599 |
if (c == 0) goto ERROR_RETURN; |
if (c == 0) goto ERROR_RETURN; |
| 600 |
|
|
| 601 |
*negptr = FALSE; |
*negptr = FALSE; |
| 602 |
|
|
| 603 |
/* \P or \p can be followed by a one- or two-character name in {}, optionally |
/* \P or \p can be followed by a name in {}, optionally preceded by ^ for |
| 604 |
preceded by ^ for negation. */ |
negation. */ |
| 605 |
|
|
| 606 |
if (c == '{') |
if (c == '{') |
| 607 |
{ |
{ |
| 610 |
*negptr = TRUE; |
*negptr = TRUE; |
| 611 |
ptr++; |
ptr++; |
| 612 |
} |
} |
| 613 |
for (i = 0; i <= 2; i++) |
for (i = 0; i < sizeof(name) - 1; i++) |
| 614 |
{ |
{ |
| 615 |
c = *(++ptr); |
c = *(++ptr); |
| 616 |
if (c == 0) goto ERROR_RETURN; |
if (c == 0) goto ERROR_RETURN; |
| 617 |
if (c == '}') break; |
if (c == '}') break; |
| 618 |
name[i] = c; |
name[i] = c; |
| 619 |
} |
} |
| 620 |
if (c !='}') /* Try to distinguish error cases */ |
if (c !='}') goto ERROR_RETURN; |
|
{ |
|
|
while (*(++ptr) != 0 && *ptr != '}'); |
|
|
if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN; |
|
|
} |
|
| 621 |
name[i] = 0; |
name[i] = 0; |
| 622 |
} |
} |
| 623 |
|
|
| 638 |
|
|
| 639 |
while (bot < top) |
while (bot < top) |
| 640 |
{ |
{ |
| 641 |
i = (bot + top)/2; |
i = (bot + top) >> 1; |
| 642 |
c = strcmp(name, _pcre_utt[i].name); |
c = strcmp(name, _pcre_utt[i].name); |
| 643 |
if (c == 0) return _pcre_utt[i].value; |
if (c == 0) |
| 644 |
|
{ |
| 645 |
|
*dptr = _pcre_utt[i].value; |
| 646 |
|
return _pcre_utt[i].type; |
| 647 |
|
} |
| 648 |
if (c > 0) bot = i + 1; else top = i; |
if (c > 0) bot = i + 1; else top = i; |
| 649 |
} |
} |
| 650 |
|
|
|
UNKNOWN_RETURN: |
|
| 651 |
*errorcodeptr = ERR47; |
*errorcodeptr = ERR47; |
| 652 |
*ptrptr = ptr; |
*ptrptr = ptr; |
| 653 |
return -1; |
return -1; |
| 950 |
|
|
| 951 |
case OP_PROP: |
case OP_PROP: |
| 952 |
case OP_NOTPROP: |
case OP_NOTPROP: |
| 953 |
cc++; |
cc += 2; |
| 954 |
/* Fall through */ |
/* Fall through */ |
| 955 |
|
|
| 956 |
case OP_NOT_DIGIT: |
case OP_NOT_DIGIT: |
| 1510 |
static BOOL |
static BOOL |
| 1511 |
get_othercase_range(int *cptr, int d, int *ocptr, int *odptr) |
get_othercase_range(int *cptr, int d, int *ocptr, int *odptr) |
| 1512 |
{ |
{ |
| 1513 |
int c, chartype, othercase, next; |
int c, othercase, next; |
| 1514 |
|
|
| 1515 |
for (c = *cptr; c <= d; c++) |
for (c = *cptr; c <= d; c++) |
| 1516 |
{ |
{ if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; } |
|
if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0) |
|
|
break; |
|
|
} |
|
| 1517 |
|
|
| 1518 |
if (c > d) return FALSE; |
if (c > d) return FALSE; |
| 1519 |
|
|
| 1522 |
|
|
| 1523 |
for (++c; c <= d; c++) |
for (++c; c <= d; c++) |
| 1524 |
{ |
{ |
| 1525 |
if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L || |
if (_pcre_ucp_othercase(c) != next) break; |
|
othercase != next) |
|
|
break; |
|
| 1526 |
next++; |
next++; |
| 1527 |
} |
} |
| 1528 |
|
|
| 1739 |
*code++ = OP_ANY; |
*code++ = OP_ANY; |
| 1740 |
break; |
break; |
| 1741 |
|
|
| 1742 |
/* Character classes. If the included characters are all < 255 in value, we |
/* Character classes. If the included characters are all < 256, we build a |
| 1743 |
build a 32-byte bitmap of the permitted characters, except in the special |
32-byte bitmap of the permitted characters, except in the special case |
| 1744 |
case where there is only one such character. For negated classes, we build |
where there is only one such character. For negated classes, we build the |
| 1745 |
the map as usual, then invert it at the end. However, we use a different |
map as usual, then invert it at the end. However, we use a different opcode |
| 1746 |
opcode so that data characters > 255 can be handled correctly. |
so that data characters > 255 can be handled correctly. |
| 1747 |
|
|
| 1748 |
If the class contains characters outside the 0-255 range, a different |
If the class contains characters outside the 0-255 range, a different |
| 1749 |
opcode is compiled. It may optionally have a bit map for characters < 256, |
opcode is compiled. It may optionally have a bit map for characters < 256, |
| 1834 |
check_posix_syntax(ptr, &tempptr, cd)) |
check_posix_syntax(ptr, &tempptr, cd)) |
| 1835 |
{ |
{ |
| 1836 |
BOOL local_negate = FALSE; |
BOOL local_negate = FALSE; |
| 1837 |
int posix_class, i; |
int posix_class, taboffset, tabopt; |
| 1838 |
register const uschar *cbits = cd->cbits; |
register const uschar *cbits = cd->cbits; |
| 1839 |
|
uschar pbits[32]; |
| 1840 |
|
|
| 1841 |
if (ptr[1] != ':') |
if (ptr[1] != ':') |
| 1842 |
{ |
{ |
| 1865 |
if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) |
if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) |
| 1866 |
posix_class = 0; |
posix_class = 0; |
| 1867 |
|
|
| 1868 |
/* Or into the map we are building up to 3 of the static class |
/* We build the bit map for the POSIX class in a chunk of local store |
| 1869 |
tables, or their negations. The [:blank:] class sets up the same |
because we may be adding and subtracting from it, and we don't want to |
| 1870 |
chars as the [:space:] class (all white space). We remove the vertical |
subtract bits that may be in the main map already. At the end we or the |
| 1871 |
white space chars afterwards. */ |
result into the bit map that is being built. */ |
| 1872 |
|
|
| 1873 |
posix_class *= 3; |
posix_class *= 3; |
| 1874 |
for (i = 0; i < 3; i++) |
|
| 1875 |
|
/* Copy in the first table (always present) */ |
| 1876 |
|
|
| 1877 |
|
memcpy(pbits, cbits + posix_class_maps[posix_class], |
| 1878 |
|
32 * sizeof(uschar)); |
| 1879 |
|
|
| 1880 |
|
/* If there is a second table, add or remove it as required. */ |
| 1881 |
|
|
| 1882 |
|
taboffset = posix_class_maps[posix_class + 1]; |
| 1883 |
|
tabopt = posix_class_maps[posix_class + 2]; |
| 1884 |
|
|
| 1885 |
|
if (taboffset >= 0) |
| 1886 |
{ |
{ |
| 1887 |
BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0; |
if (tabopt >= 0) |
| 1888 |
int taboffset = posix_class_maps[posix_class + i]; |
for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset]; |
|
if (taboffset < 0) break; |
|
|
if (local_negate) |
|
|
{ |
|
|
if (i == 0) |
|
|
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset]; |
|
|
else |
|
|
for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset]; |
|
|
if (blankclass) classbits[1] |= 0x3c; |
|
|
} |
|
| 1889 |
else |
else |
| 1890 |
{ |
for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset]; |
|
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset]; |
|
|
if (blankclass) classbits[1] &= ~0x3c; |
|
|
} |
|
| 1891 |
} |
} |
| 1892 |
|
|
| 1893 |
|
/* Now see if we need to remove any special characters. An option |
| 1894 |
|
value of 1 removes vertical space and 2 removes underscore. */ |
| 1895 |
|
|
| 1896 |
|
if (tabopt < 0) tabopt = -tabopt; |
| 1897 |
|
if (tabopt == 1) pbits[1] &= ~0x3c; |
| 1898 |
|
else if (tabopt == 2) pbits[11] &= 0x7f; |
| 1899 |
|
|
| 1900 |
|
/* Add the POSIX table or its complement into the main table that is |
| 1901 |
|
being built and we are done. */ |
| 1902 |
|
|
| 1903 |
|
if (local_negate) |
| 1904 |
|
for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c]; |
| 1905 |
|
else |
| 1906 |
|
for (c = 0; c < 32; c++) classbits[c] |= pbits[c]; |
| 1907 |
|
|
| 1908 |
ptr = tempptr + 1; |
ptr = tempptr + 1; |
| 1909 |
class_charcount = 10; /* Set > 1; assumes more than 1 per class */ |
class_charcount = 10; /* Set > 1; assumes more than 1 per class */ |
| 1910 |
continue; /* End of POSIX syntax handling */ |
continue; /* End of POSIX syntax handling */ |
| 1971 |
case ESC_P: |
case ESC_P: |
| 1972 |
{ |
{ |
| 1973 |
BOOL negated; |
BOOL negated; |
| 1974 |
int property = get_ucp(&ptr, &negated, errorcodeptr); |
int pdata; |
| 1975 |
if (property < 0) goto FAILED; |
int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); |
| 1976 |
|
if (ptype < 0) goto FAILED; |
| 1977 |
class_utf8 = TRUE; |
class_utf8 = TRUE; |
| 1978 |
*class_utf8data++ = ((-c == ESC_p) != negated)? |
*class_utf8data++ = ((-c == ESC_p) != negated)? |
| 1979 |
XCL_PROP : XCL_NOTPROP; |
XCL_PROP : XCL_NOTPROP; |
| 1980 |
*class_utf8data++ = property; |
*class_utf8data++ = ptype; |
| 1981 |
|
*class_utf8data++ = pdata; |
| 1982 |
class_charcount -= 2; /* Not a < 256 character */ |
class_charcount -= 2; /* Not a < 256 character */ |
| 1983 |
} |
} |
| 1984 |
continue; |
continue; |
| 2160 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
| 2161 |
if ((options & PCRE_CASELESS) != 0) |
if ((options & PCRE_CASELESS) != 0) |
| 2162 |
{ |
{ |
|
int chartype; |
|
| 2163 |
int othercase; |
int othercase; |
| 2164 |
if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 && |
if ((othercase = _pcre_ucp_othercase(c)) >= 0) |
|
othercase > 0) |
|
| 2165 |
{ |
{ |
| 2166 |
*class_utf8data++ = XCL_SINGLE; |
*class_utf8data++ = XCL_SINGLE; |
| 2167 |
class_utf8data += _pcre_ord2utf8(othercase, class_utf8data); |
class_utf8data += _pcre_ord2utf8(othercase, class_utf8data); |
| 2446 |
else if (*previous < OP_EODN) |
else if (*previous < OP_EODN) |
| 2447 |
{ |
{ |
| 2448 |
uschar *oldcode; |
uschar *oldcode; |
| 2449 |
int prop_type; |
int prop_type, prop_value; |
| 2450 |
op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ |
op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ |
| 2451 |
c = *previous; |
c = *previous; |
| 2452 |
|
|
| 2453 |
OUTPUT_SINGLE_REPEAT: |
OUTPUT_SINGLE_REPEAT: |
| 2454 |
prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)? |
if (*previous == OP_PROP || *previous == OP_NOTPROP) |
| 2455 |
previous[1] : -1; |
{ |
| 2456 |
|
prop_type = previous[1]; |
| 2457 |
|
prop_value = previous[2]; |
| 2458 |
|
} |
| 2459 |
|
else prop_type = prop_value = -1; |
| 2460 |
|
|
| 2461 |
oldcode = code; |
oldcode = code; |
| 2462 |
code = previous; /* Usually overwrite previous item */ |
code = previous; /* Usually overwrite previous item */ |
| 2517 |
|
|
| 2518 |
/* If the maximum is unlimited, insert an OP_STAR. Before doing so, |
/* If the maximum is unlimited, insert an OP_STAR. Before doing so, |
| 2519 |
we have to insert the character for the previous code. For a repeated |
we have to insert the character for the previous code. For a repeated |
| 2520 |
Unicode property match, there is an extra byte that defines the |
Unicode property match, there are two extra bytes that define the |
| 2521 |
required property. In UTF-8 mode, long characters have their length in |
required property. In UTF-8 mode, long characters have their length in |
| 2522 |
c, with the 0x80 bit as a flag. */ |
c, with the 0x80 bit as a flag. */ |
| 2523 |
|
|
| 2533 |
#endif |
#endif |
| 2534 |
{ |
{ |
| 2535 |
*code++ = c; |
*code++ = c; |
| 2536 |
if (prop_type >= 0) *code++ = prop_type; |
if (prop_type >= 0) |
| 2537 |
|
{ |
| 2538 |
|
*code++ = prop_type; |
| 2539 |
|
*code++ = prop_value; |
| 2540 |
|
} |
| 2541 |
} |
} |
| 2542 |
*code++ = OP_STAR + repeat_type; |
*code++ = OP_STAR + repeat_type; |
| 2543 |
} |
} |
| 2556 |
else |
else |
| 2557 |
#endif |
#endif |
| 2558 |
*code++ = c; |
*code++ = c; |
| 2559 |
if (prop_type >= 0) *code++ = prop_type; |
if (prop_type >= 0) |
| 2560 |
|
{ |
| 2561 |
|
*code++ = prop_type; |
| 2562 |
|
*code++ = prop_value; |
| 2563 |
|
} |
| 2564 |
repeat_max -= repeat_min; |
repeat_max -= repeat_min; |
| 2565 |
*code++ = OP_UPTO + repeat_type; |
*code++ = OP_UPTO + repeat_type; |
| 2566 |
PUT2INC(code, 0, repeat_max); |
PUT2INC(code, 0, repeat_max); |
| 2579 |
#endif |
#endif |
| 2580 |
*code++ = c; |
*code++ = c; |
| 2581 |
|
|
| 2582 |
/* For a repeated Unicode property match, there is an extra byte that |
/* For a repeated Unicode property match, there are two extra bytes that |
| 2583 |
defines the required property. */ |
define the required property. */ |
| 2584 |
|
|
| 2585 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
| 2586 |
if (prop_type >= 0) *code++ = prop_type; |
if (prop_type >= 0) |
| 2587 |
|
{ |
| 2588 |
|
*code++ = prop_type; |
| 2589 |
|
*code++ = prop_value; |
| 2590 |
|
} |
| 2591 |
#endif |
#endif |
| 2592 |
} |
} |
| 2593 |
|
|
| 3055 |
goto FAILED; |
goto FAILED; |
| 3056 |
} |
} |
| 3057 |
|
|
| 3058 |
/* Insert the recursion/subroutine item */ |
/* Insert the recursion/subroutine item, automatically wrapped inside |
| 3059 |
|
"once" brackets. */ |
| 3060 |
|
|
| 3061 |
|
*code = OP_ONCE; |
| 3062 |
|
PUT(code, 1, 2 + 2*LINK_SIZE); |
| 3063 |
|
code += 1 + LINK_SIZE; |
| 3064 |
|
|
| 3065 |
*code = OP_RECURSE; |
*code = OP_RECURSE; |
| 3066 |
PUT(code, 1, called - cd->start_code); |
PUT(code, 1, called - cd->start_code); |
| 3067 |
code += 1 + LINK_SIZE; |
code += 1 + LINK_SIZE; |
| 3068 |
|
|
| 3069 |
|
*code = OP_KET; |
| 3070 |
|
PUT(code, 1, 2 + 2*LINK_SIZE); |
| 3071 |
|
code += 1 + LINK_SIZE; |
| 3072 |
} |
} |
| 3073 |
continue; |
continue; |
| 3074 |
|
|
| 3338 |
else if (-c == ESC_P || -c == ESC_p) |
else if (-c == ESC_P || -c == ESC_p) |
| 3339 |
{ |
{ |
| 3340 |
BOOL negated; |
BOOL negated; |
| 3341 |
int value = get_ucp(&ptr, &negated, errorcodeptr); |
int pdata; |
| 3342 |
|
int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); |
| 3343 |
previous = code; |
previous = code; |
| 3344 |
*code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP; |
*code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP; |
| 3345 |
*code++ = value; |
*code++ = ptype; |
| 3346 |
|
*code++ = pdata; |
| 3347 |
} |
} |
| 3348 |
#endif |
#endif |
| 3349 |
|
|
| 3898 |
with errorptr and erroroffset set |
with errorptr and erroroffset set |
| 3899 |
*/ |
*/ |
| 3900 |
|
|
| 3901 |
PCRE_EXPORT pcre * |
PCRE_DATA_SCOPE pcre * |
| 3902 |
pcre_compile(const char *pattern, int options, const char **errorptr, |
pcre_compile(const char *pattern, int options, const char **errorptr, |
| 3903 |
int *erroroffset, const unsigned char *tables) |
int *erroroffset, const unsigned char *tables) |
| 3904 |
{ |
{ |
| 3906 |
} |
} |
| 3907 |
|
|
| 3908 |
|
|
| 3909 |
PCRE_EXPORT pcre * |
PCRE_DATA_SCOPE pcre * |
| 3910 |
pcre_compile2(const char *pattern, int options, int *errorcodeptr, |
pcre_compile2(const char *pattern, int options, int *errorcodeptr, |
| 3911 |
const char **errorptr, int *erroroffset, const unsigned char *tables) |
const char **errorptr, int *erroroffset, const unsigned char *tables) |
| 3912 |
{ |
{ |
| 4099 |
#endif |
#endif |
| 4100 |
|
|
| 4101 |
/* \P and \p are for Unicode properties, but only when the support has |
/* \P and \p are for Unicode properties, but only when the support has |
| 4102 |
been compiled. Each item needs 2 bytes. */ |
been compiled. Each item needs 3 bytes. */ |
| 4103 |
|
|
| 4104 |
else if (-c == ESC_P || -c == ESC_p) |
else if (-c == ESC_P || -c == ESC_p) |
| 4105 |
{ |
{ |
| 4106 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
| 4107 |
BOOL negated; |
BOOL negated; |
| 4108 |
length += 2; |
BOOL pdata; |
| 4109 |
lastitemlength = 2; |
length += 3; |
| 4110 |
if (get_ucp(&ptr, &negated, &errorcode) < 0) goto PCRE_ERROR_RETURN; |
lastitemlength = 3; |
| 4111 |
|
if (get_ucp(&ptr, &negated, &pdata, &errorcode) < 0) |
| 4112 |
|
goto PCRE_ERROR_RETURN; |
| 4113 |
continue; |
continue; |
| 4114 |
#else |
#else |
| 4115 |
errorcode = ERR45; |
errorcode = ERR45; |
| 4275 |
class_utf8 = TRUE; |
class_utf8 = TRUE; |
| 4276 |
length += LINK_SIZE + 2; |
length += LINK_SIZE + 2; |
| 4277 |
} |
} |
| 4278 |
length += 2; |
length += 3; |
| 4279 |
} |
} |
| 4280 |
#endif |
#endif |
| 4281 |
} |
} |
| 4538 |
errorcode = ERR29; |
errorcode = ERR29; |
| 4539 |
goto PCRE_ERROR_RETURN; |
goto PCRE_ERROR_RETURN; |
| 4540 |
} |
} |
| 4541 |
length += 1 + LINK_SIZE; |
length += 3 + 3*LINK_SIZE; /* Allows for the automatic "once" */ |
| 4542 |
|
|
| 4543 |
/* If this item is quantified, it will get wrapped inside brackets so |
/* If this item is quantified, it will get wrapped inside brackets so |
| 4544 |
as to use the code for quantified brackets. We jump down and use the |
as to use the code for quantified brackets. We jump down and use the |
| 4594 |
|
|
| 4595 |
if (*ptr == '=' || *ptr == '>') |
if (*ptr == '=' || *ptr == '>') |
| 4596 |
{ |
{ |
| 4597 |
|
length += 2 + 2*LINK_SIZE; /* Allow for the automatic "once" */ |
| 4598 |
while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0); |
while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0); |
| 4599 |
if (*ptr != ')') |
if (*ptr != ')') |
| 4600 |
{ |
{ |
| 5088 |
if ((re->options & PCRE_FIRSTSET) != 0) |
if ((re->options & PCRE_FIRSTSET) != 0) |
| 5089 |
{ |
{ |
| 5090 |
int ch = re->first_byte & 255; |
int ch = re->first_byte & 255; |
| 5091 |
const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)"; |
const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? |
| 5092 |
|
"" : " (caseless)"; |
| 5093 |
if (isprint(ch)) printf("First char = %c%s\n", ch, caseless); |
if (isprint(ch)) printf("First char = %c%s\n", ch, caseless); |
| 5094 |
else printf("First char = \\x%02x%s\n", ch, caseless); |
else printf("First char = \\x%02x%s\n", ch, caseless); |
| 5095 |
} |
} |
| 5097 |
if ((re->options & PCRE_REQCHSET) != 0) |
if ((re->options & PCRE_REQCHSET) != 0) |
| 5098 |
{ |
{ |
| 5099 |
int ch = re->req_byte & 255; |
int ch = re->req_byte & 255; |
| 5100 |
const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)"; |
const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? |
| 5101 |
|
"" : " (caseless)"; |
| 5102 |
if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless); |
if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless); |
| 5103 |
else printf("Req char = \\x%02x%s\n", ch, caseless); |
else printf("Req char = \\x%02x%s\n", ch, caseless); |
| 5104 |
} |
} |