| 6 |
and semantics are as close as possible to those of the Perl 5 language. |
and semantics are as close as possible to those of the Perl 5 language. |
| 7 |
|
|
| 8 |
Written by Philip Hazel |
Written by Philip Hazel |
| 9 |
Copyright (c) 1997-2006 University of Cambridge |
Copyright (c) 1997-2007 University of Cambridge |
| 10 |
|
|
| 11 |
----------------------------------------------------------------------------- |
----------------------------------------------------------------------------- |
| 12 |
Redistribution and use in source and binary forms, with or without |
Redistribution and use in source and binary forms, with or without |
| 74 |
character that is to be tested in some way. This makes it possible to |
character that is to be tested in some way. This makes it possible to |
| 75 |
centralize the loading of these characters. In the case of Type * etc, the |
centralize the loading of these characters. In the case of Type * etc, the |
| 76 |
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a |
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a |
| 77 |
small value. */ |
small value. ***NOTE*** If the start of this table is modified, the two tables |
| 78 |
|
that follow must also be modified. */ |
| 79 |
|
|
| 80 |
static uschar coptable[] = { |
static uschar coptable[] = { |
| 81 |
0, /* End */ |
0, /* End */ |
| 82 |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */ |
0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */ |
| 83 |
|
0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */ |
| 84 |
0, 0, /* Any, Anybyte */ |
0, 0, /* Any, Anybyte */ |
| 85 |
0, 0, 0, 0, /* NOTPROP, PROP, EXTUNI, ANYNL */ |
0, 0, 0, 0, /* NOTPROP, PROP, EXTUNI, ANYNL */ |
| 86 |
0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */ |
0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */ |
| 129 |
and \w */ |
and \w */ |
| 130 |
|
|
| 131 |
static uschar toptable1[] = { |
static uschar toptable1[] = { |
| 132 |
0, 0, 0, 0, 0, |
0, 0, 0, 0, 0, 0, |
| 133 |
ctype_digit, ctype_digit, |
ctype_digit, ctype_digit, |
| 134 |
ctype_space, ctype_space, |
ctype_space, ctype_space, |
| 135 |
ctype_word, ctype_word, |
ctype_word, ctype_word, |
| 137 |
}; |
}; |
| 138 |
|
|
| 139 |
static uschar toptable2[] = { |
static uschar toptable2[] = { |
| 140 |
0, 0, 0, 0, 0, |
0, 0, 0, 0, 0, 0, |
| 141 |
ctype_digit, 0, |
ctype_digit, 0, |
| 142 |
ctype_space, 0, |
ctype_space, 0, |
| 143 |
ctype_word, 0, |
ctype_word, 0, |
| 502 |
const uschar *code; |
const uschar *code; |
| 503 |
int state_offset = current_state->offset; |
int state_offset = current_state->offset; |
| 504 |
int count, codevalue; |
int count, codevalue; |
| 505 |
|
#ifdef SUPPORT_UCP |
| 506 |
int chartype, script; |
int chartype, script; |
| 507 |
|
#endif |
| 508 |
|
|
| 509 |
#ifdef DEBUG |
#ifdef DEBUG |
| 510 |
printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); |
printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); |
| 787 |
break; |
break; |
| 788 |
|
|
| 789 |
|
|
|
#ifdef SUPPORT_UCP |
|
|
|
|
| 790 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
| 791 |
/* Check the next character by Unicode property. We will get here only |
/* Check the next character by Unicode property. We will get here only |
| 792 |
if the support is in the binary; otherwise a compile-time error occurs. |
if the support is in the binary; otherwise a compile-time error occurs. |
| 793 |
*/ |
*/ |
| 794 |
|
|
| 795 |
|
#ifdef SUPPORT_UCP |
| 796 |
case OP_PROP: |
case OP_PROP: |
| 797 |
case OP_NOTPROP: |
case OP_NOTPROP: |
| 798 |
if (clen > 0) |
if (clen > 0) |
| 973 |
argument. It keeps the code above fast for the other cases. The argument |
argument. It keeps the code above fast for the other cases. The argument |
| 974 |
is in the d variable. */ |
is in the d variable. */ |
| 975 |
|
|
| 976 |
|
#ifdef SUPPORT_UCP |
| 977 |
case OP_PROP_EXTRA + OP_TYPEPLUS: |
case OP_PROP_EXTRA + OP_TYPEPLUS: |
| 978 |
case OP_PROP_EXTRA + OP_TYPEMINPLUS: |
case OP_PROP_EXTRA + OP_TYPEMINPLUS: |
| 979 |
case OP_PROP_EXTRA + OP_TYPEPOSPLUS: |
case OP_PROP_EXTRA + OP_TYPEPOSPLUS: |
| 1053 |
ADD_NEW_DATA(-state_offset, count, ncount); |
ADD_NEW_DATA(-state_offset, count, ncount); |
| 1054 |
} |
} |
| 1055 |
break; |
break; |
| 1056 |
|
#endif |
| 1057 |
|
|
| 1058 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
| 1059 |
case OP_ANYNL_EXTRA + OP_TYPEPLUS: |
case OP_ANYNL_EXTRA + OP_TYPEPLUS: |
| 1090 |
break; |
break; |
| 1091 |
|
|
| 1092 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
| 1093 |
|
#ifdef SUPPORT_UCP |
| 1094 |
case OP_PROP_EXTRA + OP_TYPEQUERY: |
case OP_PROP_EXTRA + OP_TYPEQUERY: |
| 1095 |
case OP_PROP_EXTRA + OP_TYPEMINQUERY: |
case OP_PROP_EXTRA + OP_TYPEMINQUERY: |
| 1096 |
case OP_PROP_EXTRA + OP_TYPEPOSQUERY: |
case OP_PROP_EXTRA + OP_TYPEPOSQUERY: |
| 1188 |
ADD_NEW_DATA(-(state_offset + count), 0, ncount); |
ADD_NEW_DATA(-(state_offset + count), 0, ncount); |
| 1189 |
} |
} |
| 1190 |
break; |
break; |
| 1191 |
|
#endif |
| 1192 |
|
|
| 1193 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
| 1194 |
case OP_ANYNL_EXTRA + OP_TYPEQUERY: |
case OP_ANYNL_EXTRA + OP_TYPEQUERY: |
| 1233 |
break; |
break; |
| 1234 |
|
|
| 1235 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
| 1236 |
|
#ifdef SUPPORT_UCP |
| 1237 |
case OP_PROP_EXTRA + OP_TYPEEXACT: |
case OP_PROP_EXTRA + OP_TYPEEXACT: |
| 1238 |
case OP_PROP_EXTRA + OP_TYPEUPTO: |
case OP_PROP_EXTRA + OP_TYPEUPTO: |
| 1239 |
case OP_PROP_EXTRA + OP_TYPEMINUPTO: |
case OP_PROP_EXTRA + OP_TYPEMINUPTO: |
| 1321 |
{ ADD_NEW_DATA(-state_offset, count, ncount); } |
{ ADD_NEW_DATA(-state_offset, count, ncount); } |
| 1322 |
} |
} |
| 1323 |
break; |
break; |
| 1324 |
|
#endif |
| 1325 |
|
|
| 1326 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
| 1327 |
case OP_ANYNL_EXTRA + OP_TYPEEXACT: |
case OP_ANYNL_EXTRA + OP_TYPEEXACT: |
| 2066 |
|
|
| 2067 |
Arguments: |
Arguments: |
| 2068 |
argument_re points to the compiled expression |
argument_re points to the compiled expression |
| 2069 |
extra_data points to extra data or is NULL (not currently used) |
extra_data points to extra data or is NULL |
| 2070 |
subject points to the subject string |
subject points to the subject string |
| 2071 |
length length of subject string (may contain binary zeros) |
length length of subject string (may contain binary zeros) |
| 2072 |
start_offset where to start in the subject string |
start_offset where to start in the subject string |
| 2082 |
< -1 => some kind of unexpected problem |
< -1 => some kind of unexpected problem |
| 2083 |
*/ |
*/ |
| 2084 |
|
|
| 2085 |
PCRE_DATA_SCOPE int |
PCRE_EXP_DEFN int |
| 2086 |
pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, |
pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, |
| 2087 |
const char *subject, int length, int start_offset, int options, int *offsets, |
const char *subject, int length, int start_offset, int options, int *offsets, |
| 2088 |
int offsetcount, int *workspace, int wscount) |
int offsetcount, int *workspace, int wscount) |
| 2172 |
md->moptions = options; |
md->moptions = options; |
| 2173 |
md->poptions = re->options; |
md->poptions = re->options; |
| 2174 |
|
|
| 2175 |
/* Handle different types of newline. The two bits give four cases. If nothing |
/* Handle different types of newline. The three bits give eight cases. If |
| 2176 |
is set at run time, whatever was used at compile time applies. */ |
nothing is set at run time, whatever was used at compile time applies. */ |
| 2177 |
|
|
| 2178 |
switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) & |
switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) & |
| 2179 |
PCRE_NEWLINE_BITS) |
PCRE_NEWLINE_BITS) |
| 2180 |
{ |
{ |
| 2181 |
case 0: newline = NEWLINE; break; /* Compile-time default */ |
case 0: newline = NEWLINE; break; /* Compile-time default */ |
| 2184 |
case PCRE_NEWLINE_CR+ |
case PCRE_NEWLINE_CR+ |
| 2185 |
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; |
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; |
| 2186 |
case PCRE_NEWLINE_ANY: newline = -1; break; |
case PCRE_NEWLINE_ANY: newline = -1; break; |
| 2187 |
|
case PCRE_NEWLINE_ANYCRLF: newline = -2; break; |
| 2188 |
default: return PCRE_ERROR_BADNEWLINE; |
default: return PCRE_ERROR_BADNEWLINE; |
| 2189 |
} |
} |
| 2190 |
|
|
| 2191 |
if (newline < 0) |
if (newline == -2) |
| 2192 |
|
{ |
| 2193 |
|
md->nltype = NLTYPE_ANYCRLF; |
| 2194 |
|
} |
| 2195 |
|
else if (newline < 0) |
| 2196 |
{ |
{ |
| 2197 |
md->nltype = NLTYPE_ANY; |
md->nltype = NLTYPE_ANY; |
| 2198 |
} |
} |
| 2322 |
{ |
{ |
| 2323 |
while (current_subject <= end_subject && !WAS_NEWLINE(current_subject)) |
while (current_subject <= end_subject && !WAS_NEWLINE(current_subject)) |
| 2324 |
current_subject++; |
current_subject++; |
| 2325 |
|
|
| 2326 |
|
/* If we have just passed a CR and the newline option is ANY or |
| 2327 |
|
ANYCRLF, and we are now at a LF, advance the match position by one more |
| 2328 |
|
character. */ |
| 2329 |
|
|
| 2330 |
|
if (current_subject[-1] == '\r' && |
| 2331 |
|
(md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && |
| 2332 |
|
current_subject < end_subject && |
| 2333 |
|
*current_subject == '\n') |
| 2334 |
|
current_subject++; |
| 2335 |
} |
} |
| 2336 |
} |
} |
| 2337 |
|
|
| 2440 |
} |
} |
| 2441 |
if (current_subject > end_subject) break; |
if (current_subject > end_subject) break; |
| 2442 |
|
|
| 2443 |
/* If we have just passed a CR and the newline option is CRLF or ANY, and we |
/* If we have just passed a CR and the newline option is CRLF or ANY or |
| 2444 |
are now at a LF, advance the match position by one more character. */ |
ANYCRLF, and we are now at a LF, advance the match position by one more |
| 2445 |
|
character. */ |
| 2446 |
|
|
| 2447 |
if (current_subject[-1] == '\r' && |
if (current_subject[-1] == '\r' && |
| 2448 |
(md->nltype == NLTYPE_ANY || md->nllen == 2) && |
(md->nltype == NLTYPE_ANY || |
| 2449 |
|
md->nltype == NLTYPE_ANYCRLF || |
| 2450 |
|
md->nllen == 2) && |
| 2451 |
current_subject < end_subject && |
current_subject < end_subject && |
| 2452 |
*current_subject == '\n') |
*current_subject == '\n') |
| 2453 |
current_subject++; |
current_subject++; |