/[pcre]/code/trunk/pcre_valid_utf8.c
ViewVC logotype

Diff of /code/trunk/pcre_valid_utf8.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 210 by ph10, Wed Aug 1 09:10:40 2007 UTC revision 211 by ph10, Thu Aug 9 09:52:43 2007 UTC
# Line 59  that subsequent code can assume it is de Line 59  that subsequent code can assume it is de
59  can be turned off for maximum performance, but the consequences of supplying  can be turned off for maximum performance, but the consequences of supplying
60  an invalid string are then undefined.  an invalid string are then undefined.
61    
62    Originally, this function checked according to RFC 2279, allowing for values in
63    the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in
64    the canonical format. Once somebody had pointed out RFC 3629 to me (it
65    obsoletes 2279), additional restrictions were applied. The values are now
66    limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
67    subrange 0xd800 to 0xdfff is excluded.
68    
69  Arguments:  Arguments:
70    string       points to the string    string       points to the string
71    length       length of string, or -1 if the string is zero-terminated    length       length of string, or -1 if the string is zero-terminated
# Line 85  for (p = string; length-- > 0; p++) Line 92  for (p = string; length-- > 0; p++)
92    register int c = *p;    register int c = *p;
93    if (c < 128) continue;    if (c < 128) continue;
94    if (c < 0xc0) return p - string;    if (c < 0xc0) return p - string;
95    ab = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */    ab = _pcre_utf8_table4[c & 0x3f];     /* Number of additional bytes */
96    if (length < ab) return p - string;    if (length < ab || ab > 3) return p - string;
97    length -= ab;    length -= ab;
98    
99    /* Check top bits in the second byte */    /* Check top bits in the second byte */
100    if ((*(++p) & 0xc0) != 0x80) return p - string;    if ((*(++p) & 0xc0) != 0x80) return p - string;
101    
102    /* Check for overlong sequences for each different length */    /* Check for overlong sequences for each different length, and for the
103      excluded range 0xd800 to 0xdfff.  */
104    
105    switch (ab)    switch (ab)
106      {      {
107      /* Check for xx00 000x */      /* Check for xx00 000x (overlong sequence) */
108    
109      case 1:      case 1:
110      if ((c & 0x3e) == 0) return p - string;      if ((c & 0x3e) == 0) return p - string;
111      continue;   /* We know there aren't any more bytes to check */      continue;   /* We know there aren't any more bytes to check */
112    
113      /* Check for 1110 0000, xx0x xxxx */      /* Check for 1110 0000, xx0x xxxx (overlong sequence) or
114                     1110 1101, 101x xxxx (0xd800 - 0xdfff) */
115    
116      case 2:      case 2:
117      if (c == 0xe0 && (*p & 0x20) == 0) return p - string;      if ((c == 0xe0 && (*p & 0x20) == 0) ||
118            (c == 0xed && *p >= 0xa0))
119          return p - string;
120      break;      break;
121    
122      /* Check for 1111 0000, xx00 xxxx */      /* Check for 1111 0000, xx00 xxxx (overlong sequence) or
123           greater than 0x0010ffff (f4 8f bf bf) */
124    
125      case 3:      case 3:
126      if (c == 0xf0 && (*p & 0x30) == 0) return p - string;      if ((c == 0xf0 && (*p & 0x30) == 0) ||
127            (c > 0xf4 ) ||
128            (c == 0xf4 && *p > 0x8f))
129          return p - string;
130      break;      break;
131    
132    #if 0
133        /* These cases can no longer occur, as we restrict to a maximum of four
134        bytes nowadays. Leave the code here in case we ever want to add an option
135        for longer sequences. */
136    
137      /* Check for 1111 1000, xx00 0xxx */      /* Check for 1111 1000, xx00 0xxx */
138      case 4:      case 4:
139      if (c == 0xf8 && (*p & 0x38) == 0) return p - string;      if (c == 0xf8 && (*p & 0x38) == 0) return p - string;
# Line 120  for (p = string; length-- > 0; p++) Line 144  for (p = string; length-- > 0; p++)
144      if (c == 0xfe || c == 0xff ||      if (c == 0xfe || c == 0xff ||
145         (c == 0xfc && (*p & 0x3c) == 0)) return p - string;         (c == 0xfc && (*p & 0x3c) == 0)) return p - string;
146      break;      break;
147    #endif
148    
149      }      }
150    
151    /* Check for valid bytes after the 2nd, if any; all must start 10 */    /* Check for valid bytes after the 2nd, if any; all must start 10 */

Legend:
Removed from v.210  
changed lines
  Added in v.211

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12