| 72 |
|
|
| 73 |
Returns: < 0 if the string is a valid UTF-8 string |
Returns: < 0 if the string is a valid UTF-8 string |
| 74 |
>= 0 otherwise; the value is the offset of the bad byte |
>= 0 otherwise; the value is the offset of the bad byte |
| 75 |
|
|
| 76 |
|
Bad bytes can be: |
| 77 |
|
|
| 78 |
|
. An isolated byte whose most significant bits are 0x80, because this |
| 79 |
|
can only correctly appear within a UTF-8 character; |
| 80 |
|
|
| 81 |
|
. A byte whose most significant bits are 0xc0, but whose other bits indicate |
| 82 |
|
that there are more than 3 additional bytes (i.e. an RFC 2279 starting |
| 83 |
|
byte, which is no longer valid under RFC 3629); |
| 84 |
|
|
| 85 |
|
. |
| 86 |
|
|
| 87 |
|
The returned offset may also be equal to the length of the string; this means |
| 88 |
|
that one or more bytes are missing from the final UTF-8 character. |
| 89 |
*/ |
*/ |
| 90 |
|
|
| 91 |
int |
int |
| 107 |
if (c < 128) continue; |
if (c < 128) continue; |
| 108 |
if (c < 0xc0) return p - string; |
if (c < 0xc0) return p - string; |
| 109 |
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ |
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ |
| 110 |
if (length < ab || ab > 3) return p - string; |
if (ab > 3) return p - string; /* Too many for RFC 3629 */ |
| 111 |
|
if (length < ab) return p + 1 + length - string; /* Missing bytes */ |
| 112 |
length -= ab; |
length -= ab; |
| 113 |
|
|
| 114 |
/* Check top bits in the second byte */ |
/* Check top bits in the second byte */ |