| 6 |
and semantics are as close as possible to those of the Perl 5 language. |
and semantics are as close as possible to those of the Perl 5 language. |
| 7 |
|
|
| 8 |
Written by Philip Hazel |
Written by Philip Hazel |
| 9 |
Copyright (c) 1997-2006 University of Cambridge |
Copyright (c) 1997-2007 University of Cambridge |
| 10 |
|
|
| 11 |
----------------------------------------------------------------------------- |
----------------------------------------------------------------------------- |
| 12 |
Redistribution and use in source and binary forms, with or without |
Redistribution and use in source and binary forms, with or without |
| 82 |
on. Zero means further processing is needed (for things like \x), or the escape |
on. Zero means further processing is needed (for things like \x), or the escape |
| 83 |
is invalid. */ |
is invalid. */ |
| 84 |
|
|
| 85 |
#if !EBCDIC /* This is the "normal" table for ASCII systems */ |
#ifndef EBCDIC /* This is the "normal" table for ASCII systems */ |
| 86 |
static const short int escapes[] = { |
static const short int escapes[] = { |
| 87 |
0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ |
0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ |
| 88 |
0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ |
0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ |
| 96 |
0, 0, -ESC_z /* x - z */ |
0, 0, -ESC_z /* x - z */ |
| 97 |
}; |
}; |
| 98 |
|
|
| 99 |
#else /* This is the "abnormal" table for EBCDIC systems */ |
#else /* This is the "abnormal" table for EBCDIC systems */ |
| 100 |
static const short int escapes[] = { |
static const short int escapes[] = { |
| 101 |
/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', |
/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', |
| 102 |
/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, |
/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, |
| 208 |
"malformed number or name after (?(", |
"malformed number or name after (?(", |
| 209 |
"conditional group contains more than two branches", |
"conditional group contains more than two branches", |
| 210 |
"assertion expected after (?(", |
"assertion expected after (?(", |
| 211 |
"(?R or (?digits must be followed by )", |
"(?R or (?[+-]digits must be followed by )", |
| 212 |
/* 30 */ |
/* 30 */ |
| 213 |
"unknown POSIX class name", |
"unknown POSIX class name", |
| 214 |
"POSIX collating elements are not supported", |
"POSIX collating elements are not supported", |
| 242 |
/* 55 */ |
/* 55 */ |
| 243 |
"repeating a DEFINE group is not allowed", |
"repeating a DEFINE group is not allowed", |
| 244 |
"inconsistent NEWLINE options", |
"inconsistent NEWLINE options", |
| 245 |
"\\g is not followed by an (optionally braced) non-zero number" |
"\\g is not followed by an (optionally braced) non-zero number", |
| 246 |
|
"(?+ or (?- must be followed by a non-zero number" |
| 247 |
}; |
}; |
| 248 |
|
|
| 249 |
|
|
| 263 |
|
|
| 264 |
Then we can use ctype_digit and ctype_xdigit in the code. */ |
Then we can use ctype_digit and ctype_xdigit in the code. */ |
| 265 |
|
|
| 266 |
#if !EBCDIC /* This is the "normal" case, for ASCII systems */ |
#ifndef EBCDIC /* This is the "normal" case, for ASCII systems */ |
| 267 |
static const unsigned char digitab[] = |
static const unsigned char digitab[] = |
| 268 |
{ |
{ |
| 269 |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ |
| 299 |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ |
| 300 |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ |
| 301 |
|
|
| 302 |
#else /* This is the "abnormal" case, for EBCDIC systems */ |
#else /* This is the "abnormal" case, for EBCDIC systems */ |
| 303 |
static const unsigned char digitab[] = |
static const unsigned char digitab[] = |
| 304 |
{ |
{ |
| 305 |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ |
| 313 |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */ |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */ |
| 314 |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */ |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */ |
| 315 |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */ |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */ |
| 316 |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- ¬ */ |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */ |
| 317 |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */ |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */ |
| 318 |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */ |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */ |
| 319 |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */ |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */ |
| 347 |
0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */ |
0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */ |
| 348 |
0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */ |
0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */ |
| 349 |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */ |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */ |
| 350 |
0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- ¬ */ |
0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */ |
| 351 |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */ |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */ |
| 352 |
0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */ |
0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */ |
| 353 |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */ |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */ |
| 422 |
a table. A non-zero result is something that can be returned immediately. |
a table. A non-zero result is something that can be returned immediately. |
| 423 |
Otherwise further processing may be required. */ |
Otherwise further processing may be required. */ |
| 424 |
|
|
| 425 |
#if !EBCDIC /* ASCII coding */ |
#ifndef EBCDIC /* ASCII coding */ |
| 426 |
else if (c < '0' || c > 'z') {} /* Not alphameric */ |
else if (c < '0' || c > 'z') {} /* Not alphameric */ |
| 427 |
else if ((i = escapes[c - '0']) != 0) c = i; |
else if ((i = escapes[c - '0']) != 0) c = i; |
| 428 |
|
|
| 429 |
#else /* EBCDIC coding */ |
#else /* EBCDIC coding */ |
| 430 |
else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */ |
else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */ |
| 431 |
else if ((i = escapes[c - 0x48]) != 0) c = i; |
else if ((i = escapes[c - 0x48]) != 0) c = i; |
| 432 |
#endif |
#endif |
| 563 |
if (c == 0 && cc == '0') continue; /* Leading zeroes */ |
if (c == 0 && cc == '0') continue; /* Leading zeroes */ |
| 564 |
count++; |
count++; |
| 565 |
|
|
| 566 |
#if !EBCDIC /* ASCII coding */ |
#ifndef EBCDIC /* ASCII coding */ |
| 567 |
if (cc >= 'a') cc -= 32; /* Convert to upper case */ |
if (cc >= 'a') cc -= 32; /* Convert to upper case */ |
| 568 |
c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10)); |
c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10)); |
| 569 |
#else /* EBCDIC coding */ |
#else /* EBCDIC coding */ |
| 570 |
if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */ |
if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */ |
| 571 |
c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10)); |
c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10)); |
| 572 |
#endif |
#endif |
| 590 |
{ |
{ |
| 591 |
int cc; /* Some compilers don't like ++ */ |
int cc; /* Some compilers don't like ++ */ |
| 592 |
cc = *(++ptr); /* in initializers */ |
cc = *(++ptr); /* in initializers */ |
| 593 |
#if !EBCDIC /* ASCII coding */ |
#ifndef EBCDIC /* ASCII coding */ |
| 594 |
if (cc >= 'a') cc -= 32; /* Convert to upper case */ |
if (cc >= 'a') cc -= 32; /* Convert to upper case */ |
| 595 |
c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10)); |
c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10)); |
| 596 |
#else /* EBCDIC coding */ |
#else /* EBCDIC coding */ |
| 597 |
if (cc <= 'z') cc += 64; /* Convert to upper case */ |
if (cc <= 'z') cc += 64; /* Convert to upper case */ |
| 598 |
c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10)); |
c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10)); |
| 599 |
#endif |
#endif |
| 612 |
return 0; |
return 0; |
| 613 |
} |
} |
| 614 |
|
|
| 615 |
#if !EBCDIC /* ASCII coding */ |
#ifndef EBCDIC /* ASCII coding */ |
| 616 |
if (c >= 'a' && c <= 'z') c -= 32; |
if (c >= 'a' && c <= 'z') c -= 32; |
| 617 |
c ^= 0x40; |
c ^= 0x40; |
| 618 |
#else /* EBCDIC coding */ |
#else /* EBCDIC coding */ |
| 619 |
if (c >= 'a' && c <= 'z') c += 64; |
if (c >= 'a' && c <= 'z') c += 64; |
| 620 |
c ^= 0xC0; |
c ^= 0xC0; |
| 621 |
#endif |
#endif |
| 1247 |
else |
else |
| 1248 |
{ |
{ |
| 1249 |
code += _pcre_OP_lengths[c]; |
code += _pcre_OP_lengths[c]; |
| 1250 |
|
#ifdef SUPPORT_UTF8 |
| 1251 |
if (utf8) switch(c) |
if (utf8) switch(c) |
| 1252 |
{ |
{ |
| 1253 |
case OP_CHAR: |
case OP_CHAR: |
| 1268 |
if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; |
if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; |
| 1269 |
break; |
break; |
| 1270 |
} |
} |
| 1271 |
|
#endif |
| 1272 |
} |
} |
| 1273 |
} |
} |
| 1274 |
} |
} |
| 1312 |
else |
else |
| 1313 |
{ |
{ |
| 1314 |
code += _pcre_OP_lengths[c]; |
code += _pcre_OP_lengths[c]; |
| 1315 |
|
#ifdef SUPPORT_UTF8 |
| 1316 |
if (utf8) switch(c) |
if (utf8) switch(c) |
| 1317 |
{ |
{ |
| 1318 |
case OP_CHAR: |
case OP_CHAR: |
| 1333 |
if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; |
if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; |
| 1334 |
break; |
break; |
| 1335 |
} |
} |
| 1336 |
|
#endif |
| 1337 |
} |
} |
| 1338 |
} |
} |
| 1339 |
} |
} |
| 4000 |
|
|
| 4001 |
|
|
| 4002 |
/* ------------------------------------------------------------ */ |
/* ------------------------------------------------------------ */ |
| 4003 |
|
case '-': case '+': |
| 4004 |
case '0': case '1': case '2': case '3': case '4': /* Recursion or */ |
case '0': case '1': case '2': case '3': case '4': /* Recursion or */ |
| 4005 |
case '5': case '6': case '7': case '8': case '9': /* subroutine */ |
case '5': case '6': case '7': case '8': case '9': /* subroutine */ |
| 4006 |
{ |
{ |
| 4007 |
const uschar *called; |
const uschar *called; |
| 4008 |
|
int sign = *ptr; |
| 4009 |
|
|
| 4010 |
|
if (sign == '+') ptr++; |
| 4011 |
|
else if (sign == '-') |
| 4012 |
|
{ |
| 4013 |
|
if ((digitab[ptr[1]] & ctype_digit) == 0) |
| 4014 |
|
goto OTHER_CHAR_AFTER_QUERY; |
| 4015 |
|
ptr++; |
| 4016 |
|
} |
| 4017 |
|
|
| 4018 |
recno = 0; |
recno = 0; |
| 4019 |
while((digitab[*ptr] & ctype_digit) != 0) |
while((digitab[*ptr] & ctype_digit) != 0) |
| 4020 |
recno = recno * 10 + *ptr++ - '0'; |
recno = recno * 10 + *ptr++ - '0'; |
| 4021 |
|
|
| 4022 |
if (*ptr != ')') |
if (*ptr != ')') |
| 4023 |
{ |
{ |
| 4024 |
*errorcodeptr = ERR29; |
*errorcodeptr = ERR29; |
| 4025 |
goto FAILED; |
goto FAILED; |
| 4026 |
} |
} |
| 4027 |
|
|
| 4028 |
|
if (sign == '-') |
| 4029 |
|
{ |
| 4030 |
|
if (recno == 0) |
| 4031 |
|
{ |
| 4032 |
|
*errorcodeptr = ERR58; |
| 4033 |
|
goto FAILED; |
| 4034 |
|
} |
| 4035 |
|
recno = cd->bracount - recno + 1; |
| 4036 |
|
if (recno <= 0) |
| 4037 |
|
{ |
| 4038 |
|
*errorcodeptr = ERR15; |
| 4039 |
|
goto FAILED; |
| 4040 |
|
} |
| 4041 |
|
} |
| 4042 |
|
else if (sign == '+') |
| 4043 |
|
{ |
| 4044 |
|
if (recno == 0) |
| 4045 |
|
{ |
| 4046 |
|
*errorcodeptr = ERR58; |
| 4047 |
|
goto FAILED; |
| 4048 |
|
} |
| 4049 |
|
recno += cd->bracount; |
| 4050 |
|
} |
| 4051 |
|
|
| 4052 |
/* Come here from code above that handles a named recursion */ |
/* Come here from code above that handles a named recursion */ |
| 4053 |
|
|
| 4121 |
|
|
| 4122 |
/* ------------------------------------------------------------ */ |
/* ------------------------------------------------------------ */ |
| 4123 |
default: /* Other characters: check option setting */ |
default: /* Other characters: check option setting */ |
| 4124 |
|
OTHER_CHAR_AFTER_QUERY: |
| 4125 |
set = unset = 0; |
set = unset = 0; |
| 4126 |
optset = &set; |
optset = &set; |
| 4127 |
|
|
| 5081 |
with errorptr and erroroffset set |
with errorptr and erroroffset set |
| 5082 |
*/ |
*/ |
| 5083 |
|
|
| 5084 |
PCRE_DATA_SCOPE pcre * |
PCRE_EXP_DEFN pcre * |
| 5085 |
pcre_compile(const char *pattern, int options, const char **errorptr, |
pcre_compile(const char *pattern, int options, const char **errorptr, |
| 5086 |
int *erroroffset, const unsigned char *tables) |
int *erroroffset, const unsigned char *tables) |
| 5087 |
{ |
{ |
| 5089 |
} |
} |
| 5090 |
|
|
| 5091 |
|
|
| 5092 |
PCRE_DATA_SCOPE pcre * |
PCRE_EXP_DEFN pcre * |
| 5093 |
pcre_compile2(const char *pattern, int options, int *errorcodeptr, |
pcre_compile2(const char *pattern, int options, int *errorcodeptr, |
| 5094 |
const char **errorptr, int *erroroffset, const unsigned char *tables) |
const char **errorptr, int *erroroffset, const unsigned char *tables) |
| 5095 |
{ |
{ |
| 5138 |
if (erroroffset == NULL) |
if (erroroffset == NULL) |
| 5139 |
{ |
{ |
| 5140 |
errorcode = ERR16; |
errorcode = ERR16; |
| 5141 |
goto PCRE_EARLY_ERROR_RETURN; |
goto PCRE_EARLY_ERROR_RETURN2; |
| 5142 |
} |
} |
| 5143 |
|
|
| 5144 |
*erroroffset = 0; |
*erroroffset = 0; |
| 5151 |
(*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0) |
(*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0) |
| 5152 |
{ |
{ |
| 5153 |
errorcode = ERR44; |
errorcode = ERR44; |
| 5154 |
goto PCRE_UTF8_ERROR_RETURN; |
goto PCRE_EARLY_ERROR_RETURN2; |
| 5155 |
} |
} |
| 5156 |
#else |
#else |
| 5157 |
if ((options & PCRE_UTF8) != 0) |
if ((options & PCRE_UTF8) != 0) |
| 5176 |
cd->ctypes = tables + ctypes_offset; |
cd->ctypes = tables + ctypes_offset; |
| 5177 |
|
|
| 5178 |
/* Handle different types of newline. The three bits give seven cases. The |
/* Handle different types of newline. The three bits give seven cases. The |
| 5179 |
current code allows for fixed one- or two-byte sequences, plus "any". */ |
current code allows for fixed one- or two-byte sequences, plus "any" and |
| 5180 |
|
"anycrlf". */ |
| 5181 |
|
|
| 5182 |
switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY)) |
switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY)) |
| 5183 |
{ |
{ |
| 5187 |
case PCRE_NEWLINE_CR+ |
case PCRE_NEWLINE_CR+ |
| 5188 |
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; |
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; |
| 5189 |
case PCRE_NEWLINE_ANY: newline = -1; break; |
case PCRE_NEWLINE_ANY: newline = -1; break; |
| 5190 |
|
case PCRE_NEWLINE_ANYCRLF: newline = -2; break; |
| 5191 |
default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; |
default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; |
| 5192 |
} |
} |
| 5193 |
|
|
| 5194 |
if (newline < 0) |
if (newline == -2) |
| 5195 |
|
{ |
| 5196 |
|
cd->nltype = NLTYPE_ANYCRLF; |
| 5197 |
|
} |
| 5198 |
|
else if (newline < 0) |
| 5199 |
{ |
{ |
| 5200 |
cd->nltype = NLTYPE_ANY; |
cd->nltype = NLTYPE_ANY; |
| 5201 |
} |
} |
| 5369 |
(pcre_free)(re); |
(pcre_free)(re); |
| 5370 |
PCRE_EARLY_ERROR_RETURN: |
PCRE_EARLY_ERROR_RETURN: |
| 5371 |
*erroroffset = ptr - (const uschar *)pattern; |
*erroroffset = ptr - (const uschar *)pattern; |
| 5372 |
#ifdef SUPPORT_UTF8 |
PCRE_EARLY_ERROR_RETURN2: |
|
PCRE_UTF8_ERROR_RETURN: |
|
|
#endif |
|
| 5373 |
*errorptr = error_texts[errorcode]; |
*errorptr = error_texts[errorcode]; |
| 5374 |
if (errorcodeptr != NULL) *errorcodeptr = errorcode; |
if (errorcodeptr != NULL) *errorcodeptr = errorcode; |
| 5375 |
return NULL; |
return NULL; |
| 5459 |
else printf("Req char = \\x%02x%s\n", ch, caseless); |
else printf("Req char = \\x%02x%s\n", ch, caseless); |
| 5460 |
} |
} |
| 5461 |
|
|
| 5462 |
pcre_printint(re, stdout); |
pcre_printint(re, stdout, TRUE); |
| 5463 |
|
|
| 5464 |
/* This check is done here in the debugging case so that the code that |
/* This check is done here in the debugging case so that the code that |
| 5465 |
was compiled can be seen. */ |
was compiled can be seen. */ |