| 59 |
can be turned off for maximum performance, but the consequences of supplying |
can be turned off for maximum performance, but the consequences of supplying |
| 60 |
an invalid string are then undefined. |
an invalid string are then undefined. |
| 61 |
|
|
| 62 |
|
Originally, this function checked according to RFC 2279, allowing for values in |
| 63 |
|
the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in |
| 64 |
|
the canonical format. Once somebody had pointed out RFC 3629 to me (it |
| 65 |
|
obsoletes 2279), additional restrictions were applied. The values are now |
| 66 |
|
limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the |
| 67 |
|
subrange 0xd800 to 0xdfff is excluded. |
| 68 |
|
|
| 69 |
Arguments: |
Arguments: |
| 70 |
string points to the string |
string points to the string |
| 71 |
length length of string, or -1 if the string is zero-terminated |
length length of string, or -1 if the string is zero-terminated |
| 92 |
register int c = *p; |
register int c = *p; |
| 93 |
if (c < 128) continue; |
if (c < 128) continue; |
| 94 |
if (c < 0xc0) return p - string; |
if (c < 0xc0) return p - string; |
| 95 |
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ |
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ |
| 96 |
if (length < ab) return p - string; |
if (length < ab || ab > 3) return p - string; |
| 97 |
length -= ab; |
length -= ab; |
| 98 |
|
|
| 99 |
/* Check top bits in the second byte */ |
/* Check top bits in the second byte */ |
| 100 |
if ((*(++p) & 0xc0) != 0x80) return p - string; |
if ((*(++p) & 0xc0) != 0x80) return p - string; |
| 101 |
|
|
| 102 |
/* Check for overlong sequences for each different length */ |
/* Check for overlong sequences for each different length, and for the |
| 103 |
|
excluded range 0xd800 to 0xdfff. */ |
| 104 |
|
|
| 105 |
switch (ab) |
switch (ab) |
| 106 |
{ |
{ |
| 107 |
/* Check for xx00 000x */ |
/* Check for xx00 000x (overlong sequence) */ |
| 108 |
|
|
| 109 |
case 1: |
case 1: |
| 110 |
if ((c & 0x3e) == 0) return p - string; |
if ((c & 0x3e) == 0) return p - string; |
| 111 |
continue; /* We know there aren't any more bytes to check */ |
continue; /* We know there aren't any more bytes to check */ |
| 112 |
|
|
| 113 |
/* Check for 1110 0000, xx0x xxxx */ |
/* Check for 1110 0000, xx0x xxxx (overlong sequence) or |
| 114 |
|
1110 1101, 101x xxxx (0xd800 - 0xdfff) */ |
| 115 |
|
|
| 116 |
case 2: |
case 2: |
| 117 |
if (c == 0xe0 && (*p & 0x20) == 0) return p - string; |
if ((c == 0xe0 && (*p & 0x20) == 0) || |
| 118 |
|
(c == 0xed && *p >= 0xa0)) |
| 119 |
|
return p - string; |
| 120 |
break; |
break; |
| 121 |
|
|
| 122 |
/* Check for 1111 0000, xx00 xxxx */ |
/* Check for 1111 0000, xx00 xxxx (overlong sequence) or |
| 123 |
|
greater than 0x0010ffff (f4 8f bf bf) */ |
| 124 |
|
|
| 125 |
case 3: |
case 3: |
| 126 |
if (c == 0xf0 && (*p & 0x30) == 0) return p - string; |
if ((c == 0xf0 && (*p & 0x30) == 0) || |
| 127 |
|
(c > 0xf4 ) || |
| 128 |
|
(c == 0xf4 && *p > 0x8f)) |
| 129 |
|
return p - string; |
| 130 |
break; |
break; |
| 131 |
|
|
| 132 |
|
#if 0 |
| 133 |
|
/* These cases can no longer occur, as we restrict to a maximum of four |
| 134 |
|
bytes nowadays. Leave the code here in case we ever want to add an option |
| 135 |
|
for longer sequences. */ |
| 136 |
|
|
| 137 |
/* Check for 1111 1000, xx00 0xxx */ |
/* Check for 1111 1000, xx00 0xxx */ |
| 138 |
case 4: |
case 4: |
| 139 |
if (c == 0xf8 && (*p & 0x38) == 0) return p - string; |
if (c == 0xf8 && (*p & 0x38) == 0) return p - string; |
| 144 |
if (c == 0xfe || c == 0xff || |
if (c == 0xfe || c == 0xff || |
| 145 |
(c == 0xfc && (*p & 0x3c) == 0)) return p - string; |
(c == 0xfc && (*p & 0x3c) == 0)) return p - string; |
| 146 |
break; |
break; |
| 147 |
|
#endif |
| 148 |
|
|
| 149 |
} |
} |
| 150 |
|
|
| 151 |
/* Check for valid bytes after the 2nd, if any; all must start 10 */ |
/* Check for valid bytes after the 2nd, if any; all must start 10 */ |