| 67 |
char *pattern; |
char *pattern; |
| 68 |
char *subject; |
char *subject; |
| 69 |
unsigned char *name_table; |
unsigned char *name_table; |
| 70 |
|
unsigned int option_bits; |
| 71 |
int erroffset; |
int erroffset; |
| 72 |
int find_all; |
int find_all; |
| 73 |
|
int crlf_is_newline; |
| 74 |
int namecount; |
int namecount; |
| 75 |
int name_entry_size; |
int name_entry_size; |
| 76 |
int ovector[OVECCOUNT]; |
int ovector[OVECCOUNT]; |
| 77 |
int subject_length; |
int subject_length; |
| 78 |
int rc, i; |
int rc, i; |
| 79 |
|
int utf8; |
| 80 |
|
|
| 81 |
|
|
| 82 |
/************************************************************************** |
/************************************************************************** |
| 258 |
* subject is not a valid match; other possibilities must be tried. The * |
* subject is not a valid match; other possibilities must be tried. The * |
| 259 |
* second flag restricts PCRE to one match attempt at the initial string * |
* second flag restricts PCRE to one match attempt at the initial string * |
| 260 |
* position. If this match succeeds, an alternative to the empty string * |
* position. If this match succeeds, an alternative to the empty string * |
| 261 |
* match has been found, and we can proceed round the loop. * |
* match has been found, and we can print it and proceed round the loop, * |
| 262 |
|
* advancing by the length of whatever was found. If this match does not * |
| 263 |
|
* succeed, we still stay in the loop, advancing by just one character. * |
| 264 |
|
* In UTF-8 mode, which can be set by (*UTF8) in the pattern, this may be * |
| 265 |
|
* more than one byte. * |
| 266 |
|
* * |
| 267 |
|
* However, there is a complication concerned with newlines. When the * |
| 268 |
|
* newline convention is such that CRLF is a valid newline, we must * |
| 269 |
|
* advance by two characters rather than one. The newline convention can * |
| 270 |
|
* be set in the regex by (*CR), etc.; if not, we must find the default. * |
| 271 |
*************************************************************************/ |
*************************************************************************/ |
| 272 |
|
|
| 273 |
if (!find_all) |
if (!find_all) /* Check for -g */ |
| 274 |
{ |
{ |
| 275 |
pcre_free(re); /* Release the memory used for the compiled pattern */ |
pcre_free(re); /* Release the memory used for the compiled pattern */ |
| 276 |
return 0; /* Finish unless -g was given */ |
return 0; /* Finish unless -g was given */ |
| 277 |
} |
} |
| 278 |
|
|
| 279 |
|
/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline |
| 280 |
|
sequence. First, find the options with which the regex was compiled; extract |
| 281 |
|
the UTF-8 state, and mask off all but the newline options. */ |
| 282 |
|
|
| 283 |
|
(void)pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &option_bits); |
| 284 |
|
utf8 = option_bits & PCRE_UTF8; |
| 285 |
|
option_bits &= PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_CRLF| |
| 286 |
|
PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF; |
| 287 |
|
|
| 288 |
|
/* If no newline options were set, find the default newline convention from the |
| 289 |
|
build configuration. */ |
| 290 |
|
|
| 291 |
|
if (option_bits == 0) |
| 292 |
|
{ |
| 293 |
|
int d; |
| 294 |
|
(void)pcre_config(PCRE_CONFIG_NEWLINE, &d); |
| 295 |
|
/* Note that these values are always the ASCII ones, even in |
| 296 |
|
EBCDIC environments. CR = 13, NL = 10. */ |
| 297 |
|
option_bits = (d == 13)? PCRE_NEWLINE_CR : |
| 298 |
|
(d == 10)? PCRE_NEWLINE_LF : |
| 299 |
|
(d == (13<<8 | 10))? PCRE_NEWLINE_CRLF : |
| 300 |
|
(d == -2)? PCRE_NEWLINE_ANYCRLF : |
| 301 |
|
(d == -1)? PCRE_NEWLINE_ANY : 0; |
| 302 |
|
} |
| 303 |
|
|
| 304 |
|
/* See if CRLF is a valid newline sequence. */ |
| 305 |
|
|
| 306 |
|
crlf_is_newline = |
| 307 |
|
option_bits == PCRE_NEWLINE_ANY || |
| 308 |
|
option_bits == PCRE_NEWLINE_CRLF || |
| 309 |
|
option_bits == PCRE_NEWLINE_ANYCRLF; |
| 310 |
|
|
| 311 |
/* Loop for second and subsequent matches */ |
/* Loop for second and subsequent matches */ |
| 312 |
|
|
| 313 |
for (;;) |
for (;;) |
| 341 |
is zero, it just means we have found all possible matches, so the loop ends. |
is zero, it just means we have found all possible matches, so the loop ends. |
| 342 |
Otherwise, it means we have failed to find a non-empty-string match at a |
Otherwise, it means we have failed to find a non-empty-string match at a |
| 343 |
point where there was a previous empty-string match. In this case, we do what |
point where there was a previous empty-string match. In this case, we do what |
| 344 |
Perl does: advance the matching position by one, and continue. We do this by |
Perl does: advance the matching position by one character, and continue. We |
| 345 |
setting the "end of previous match" offset, because that is picked up at the |
do this by setting the "end of previous match" offset, because that is picked |
| 346 |
top of the loop as the point at which to start again. */ |
up at the top of the loop as the point at which to start again. |
| 347 |
|
|
| 348 |
|
There are two complications: (a) When CRLF is a valid newline sequence, and |
| 349 |
|
the current position is just before it, advance by an extra byte. (b) |
| 350 |
|
Otherwise we must ensure that we skip an entire UTF-8 character if we are in |
| 351 |
|
UTF-8 mode. */ |
| 352 |
|
|
| 353 |
if (rc == PCRE_ERROR_NOMATCH) |
if (rc == PCRE_ERROR_NOMATCH) |
| 354 |
{ |
{ |
| 355 |
if (options == 0) break; |
if (options == 0) break; /* All matches found */ |
| 356 |
ovector[1] = start_offset + 1; |
ovector[1] = start_offset + 1; /* Advance one byte */ |
| 357 |
|
if (crlf_is_newline && /* If CRLF is newline & */ |
| 358 |
|
start_offset < subject_length - 1 && /* we are at CRLF, */ |
| 359 |
|
subject[start_offset] == '\er' && |
| 360 |
|
subject[start_offset + 1] == '\en') |
| 361 |
|
ovector[1] += 1; /* Advance by one more. */ |
| 362 |
|
else if (utf8) /* Otherwise, ensure we */ |
| 363 |
|
{ /* advance a whole UTF-8 */ |
| 364 |
|
while (ovector[1] < subject_length) /* character. */ |
| 365 |
|
{ |
| 366 |
|
if ((subject[ovector[1]] & 0xc0) != 0x80) break; |
| 367 |
|
ovector[1] += 1; |
| 368 |
|
} |
| 369 |
|
} |
| 370 |
continue; /* Go round the loop again */ |
continue; /* Go round the loop again */ |
| 371 |
} |
} |
| 372 |
|
|