/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 545 - (show annotations) (download)
Wed Jun 16 10:51:15 2010 UTC (4 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 32174 byte(s)
Tidyup for 8.10-RC2 test release.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #include "pcre_internal.h"
50
51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52
53 /* Returns from set_start_bits() */
54
55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
56
57
58
59 /*************************************************
60 * Find the minimum subject length for a group *
61 *************************************************/
62
63 /* Scan a parenthesized group and compute the minimum length of subject that
64 is needed to match it. This is a lower bound; it does not mean there is a
65 string of that length that matches. In UTF8 mode, the result is in characters
66 rather than bytes.
67
68 Arguments:
69 code pointer to start of group (the bracket)
70 startcode pointer to start of the whole pattern
71 options the compiling options
72
73 Returns: the minimum length
74 -1 if \C was encountered
75 -2 internal error (missing capturing bracket)
76 */
77
78 static int
79 find_minlength(const uschar *code, const uschar *startcode, int options)
80 {
81 int length = -1;
82 BOOL utf8 = (options & PCRE_UTF8) != 0;
83 BOOL had_recurse = FALSE;
84 register int branchlength = 0;
85 register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
86
87 if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
88
89 /* Scan along the opcodes for this branch. If we get to the end of the
90 branch, check the length against that of the other branches. */
91
92 for (;;)
93 {
94 int d, min;
95 uschar *cs, *ce;
96 register int op = *cc;
97
98 switch (op)
99 {
100 case OP_COND:
101 case OP_SCOND:
102
103 /* If there is only one branch in a condition, the implied branch has zero
104 length, so we don't add anything. This covers the DEFINE "condition"
105 automatically. */
106
107 cs = cc + GET(cc, 1);
108 if (*cs != OP_ALT)
109 {
110 cc = cs + 1 + LINK_SIZE;
111 break;
112 }
113
114 /* Otherwise we can fall through and treat it the same as any other
115 subpattern. */
116
117 case OP_CBRA:
118 case OP_SCBRA:
119 case OP_BRA:
120 case OP_SBRA:
121 case OP_ONCE:
122 d = find_minlength(cc, startcode, options);
123 if (d < 0) return d;
124 branchlength += d;
125 do cc += GET(cc, 1); while (*cc == OP_ALT);
126 cc += 1 + LINK_SIZE;
127 break;
128
129 /* Reached end of a branch; if it's a ket it is the end of a nested
130 call. If it's ALT it is an alternation in a nested call. If it is
131 END it's the end of the outer call. All can be handled by the same code. */
132
133 case OP_ALT:
134 case OP_KET:
135 case OP_KETRMAX:
136 case OP_KETRMIN:
137 case OP_END:
138 if (length < 0 || (!had_recurse && branchlength < length))
139 length = branchlength;
140 if (*cc != OP_ALT) return length;
141 cc += 1 + LINK_SIZE;
142 branchlength = 0;
143 had_recurse = FALSE;
144 break;
145
146 /* Skip over assertive subpatterns */
147
148 case OP_ASSERT:
149 case OP_ASSERT_NOT:
150 case OP_ASSERTBACK:
151 case OP_ASSERTBACK_NOT:
152 do cc += GET(cc, 1); while (*cc == OP_ALT);
153 /* Fall through */
154
155 /* Skip over things that don't match chars */
156
157 case OP_REVERSE:
158 case OP_CREF:
159 case OP_NCREF:
160 case OP_RREF:
161 case OP_NRREF:
162 case OP_DEF:
163 case OP_OPT:
164 case OP_CALLOUT:
165 case OP_SOD:
166 case OP_SOM:
167 case OP_EOD:
168 case OP_EODN:
169 case OP_CIRC:
170 case OP_DOLL:
171 case OP_NOT_WORD_BOUNDARY:
172 case OP_WORD_BOUNDARY:
173 cc += _pcre_OP_lengths[*cc];
174 break;
175
176 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
177
178 case OP_BRAZERO:
179 case OP_BRAMINZERO:
180 case OP_SKIPZERO:
181 cc += _pcre_OP_lengths[*cc];
182 do cc += GET(cc, 1); while (*cc == OP_ALT);
183 cc += 1 + LINK_SIZE;
184 break;
185
186 /* Handle literal characters and + repetitions */
187
188 case OP_CHAR:
189 case OP_CHARNC:
190 case OP_NOT:
191 case OP_PLUS:
192 case OP_MINPLUS:
193 case OP_POSPLUS:
194 case OP_NOTPLUS:
195 case OP_NOTMINPLUS:
196 case OP_NOTPOSPLUS:
197 branchlength++;
198 cc += 2;
199 #ifdef SUPPORT_UTF8
200 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
201 #endif
202 break;
203
204 case OP_TYPEPLUS:
205 case OP_TYPEMINPLUS:
206 case OP_TYPEPOSPLUS:
207 branchlength++;
208 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
209 break;
210
211 /* Handle exact repetitions. The count is already in characters, but we
212 need to skip over a multibyte character in UTF8 mode. */
213
214 case OP_EXACT:
215 case OP_NOTEXACT:
216 branchlength += GET2(cc,1);
217 cc += 4;
218 #ifdef SUPPORT_UTF8
219 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
220 #endif
221 break;
222
223 case OP_TYPEEXACT:
224 branchlength += GET2(cc,1);
225 cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
226 break;
227
228 /* Handle single-char non-literal matchers */
229
230 case OP_PROP:
231 case OP_NOTPROP:
232 cc += 2;
233 /* Fall through */
234
235 case OP_NOT_DIGIT:
236 case OP_DIGIT:
237 case OP_NOT_WHITESPACE:
238 case OP_WHITESPACE:
239 case OP_NOT_WORDCHAR:
240 case OP_WORDCHAR:
241 case OP_ANY:
242 case OP_ALLANY:
243 case OP_EXTUNI:
244 case OP_HSPACE:
245 case OP_NOT_HSPACE:
246 case OP_VSPACE:
247 case OP_NOT_VSPACE:
248 branchlength++;
249 cc++;
250 break;
251
252 /* "Any newline" might match two characters */
253
254 case OP_ANYNL:
255 branchlength += 2;
256 cc++;
257 break;
258
259 /* The single-byte matcher means we can't proceed in UTF-8 mode */
260
261 case OP_ANYBYTE:
262 #ifdef SUPPORT_UTF8
263 if (utf8) return -1;
264 #endif
265 branchlength++;
266 cc++;
267 break;
268
269 /* For repeated character types, we have to test for \p and \P, which have
270 an extra two bytes of parameters. */
271
272 case OP_TYPESTAR:
273 case OP_TYPEMINSTAR:
274 case OP_TYPEQUERY:
275 case OP_TYPEMINQUERY:
276 case OP_TYPEPOSSTAR:
277 case OP_TYPEPOSQUERY:
278 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
279 cc += _pcre_OP_lengths[op];
280 break;
281
282 case OP_TYPEUPTO:
283 case OP_TYPEMINUPTO:
284 case OP_TYPEPOSUPTO:
285 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
286 cc += _pcre_OP_lengths[op];
287 break;
288
289 /* Check a class for variable quantification */
290
291 #ifdef SUPPORT_UTF8
292 case OP_XCLASS:
293 cc += GET(cc, 1) - 33;
294 /* Fall through */
295 #endif
296
297 case OP_CLASS:
298 case OP_NCLASS:
299 cc += 33;
300
301 switch (*cc)
302 {
303 case OP_CRPLUS:
304 case OP_CRMINPLUS:
305 branchlength++;
306 /* Fall through */
307
308 case OP_CRSTAR:
309 case OP_CRMINSTAR:
310 case OP_CRQUERY:
311 case OP_CRMINQUERY:
312 cc++;
313 break;
314
315 case OP_CRRANGE:
316 case OP_CRMINRANGE:
317 branchlength += GET2(cc,1);
318 cc += 5;
319 break;
320
321 default:
322 branchlength++;
323 break;
324 }
325 break;
326
327 /* Backreferences and subroutine calls are treated in the same way: we find
328 the minimum length for the subpattern. A recursion, however, causes an
329 a flag to be set that causes the length of this branch to be ignored. The
330 logic is that a recursion can only make sense if there is another
331 alternation that stops the recursing. That will provide the minimum length
332 (when no recursion happens). A backreference within the group that it is
333 referencing behaves in the same way.
334
335 If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
336 matches an empty string (by default it causes a matching failure), so in
337 that case we must set the minimum length to zero. */
338
339 case OP_REF:
340 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
341 {
342 ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
343 if (cs == NULL) return -2;
344 do ce += GET(ce, 1); while (*ce == OP_ALT);
345 if (cc > cs && cc < ce)
346 {
347 d = 0;
348 had_recurse = TRUE;
349 }
350 else d = find_minlength(cs, startcode, options);
351 }
352 else d = 0;
353 cc += 3;
354
355 /* Handle repeated back references */
356
357 switch (*cc)
358 {
359 case OP_CRSTAR:
360 case OP_CRMINSTAR:
361 case OP_CRQUERY:
362 case OP_CRMINQUERY:
363 min = 0;
364 cc++;
365 break;
366
367 case OP_CRRANGE:
368 case OP_CRMINRANGE:
369 min = GET2(cc, 1);
370 cc += 5;
371 break;
372
373 default:
374 min = 1;
375 break;
376 }
377
378 branchlength += min * d;
379 break;
380
381 case OP_RECURSE:
382 cs = ce = (uschar *)startcode + GET(cc, 1);
383 if (cs == NULL) return -2;
384 do ce += GET(ce, 1); while (*ce == OP_ALT);
385 if (cc > cs && cc < ce)
386 had_recurse = TRUE;
387 else
388 branchlength += find_minlength(cs, startcode, options);
389 cc += 1 + LINK_SIZE;
390 break;
391
392 /* Anything else does not or need not match a character. We can get the
393 item's length from the table, but for those that can match zero occurrences
394 of a character, we must take special action for UTF-8 characters. */
395
396 case OP_UPTO:
397 case OP_NOTUPTO:
398 case OP_MINUPTO:
399 case OP_NOTMINUPTO:
400 case OP_POSUPTO:
401 case OP_STAR:
402 case OP_MINSTAR:
403 case OP_NOTMINSTAR:
404 case OP_POSSTAR:
405 case OP_NOTPOSSTAR:
406 case OP_QUERY:
407 case OP_MINQUERY:
408 case OP_NOTMINQUERY:
409 case OP_POSQUERY:
410 case OP_NOTPOSQUERY:
411 cc += _pcre_OP_lengths[op];
412 #ifdef SUPPORT_UTF8
413 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
414 #endif
415 break;
416
417 /* Skip these, but we need to add in the name length. */
418
419 case OP_MARK:
420 case OP_PRUNE_ARG:
421 case OP_SKIP_ARG:
422 case OP_THEN_ARG:
423 cc += _pcre_OP_lengths[op] + cc[1];
424 break;
425
426 /* For the record, these are the opcodes that are matched by "default":
427 OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
428 OP_THEN. */
429
430 default:
431 cc += _pcre_OP_lengths[op];
432 break;
433 }
434 }
435 /* Control never gets here */
436 }
437
438
439
440 /*************************************************
441 * Set a bit and maybe its alternate case *
442 *************************************************/
443
444 /* Given a character, set its first byte's bit in the table, and also the
445 corresponding bit for the other version of a letter if we are caseless. In
446 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
447 when Unicode property support is available.
448
449 Arguments:
450 start_bits points to the bit map
451 p points to the character
452 caseless the caseless flag
453 cd the block with char table pointers
454 utf8 TRUE for UTF-8 mode
455
456 Returns: pointer after the character
457 */
458
459 static const uschar *
460 set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
461 compile_data *cd, BOOL utf8)
462 {
463 unsigned int c = *p;
464
465 SET_BIT(c);
466
467 #ifdef SUPPORT_UTF8
468 if (utf8 && c > 127)
469 {
470 GETCHARINC(c, p);
471 #ifdef SUPPORT_UCP
472 if (caseless)
473 {
474 uschar buff[8];
475 c = UCD_OTHERCASE(c);
476 (void)_pcre_ord2utf8(c, buff);
477 SET_BIT(buff[0]);
478 }
479 #endif
480 return p;
481 }
482 #endif
483
484 /* Not UTF-8 mode, or character is less than 127. */
485
486 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
487 return p + 1;
488 }
489
490
491
492 /*************************************************
493 * Set bits for a positive character type *
494 *************************************************/
495
496 /* This function sets starting bits for a character type. In UTF-8 mode, we can
497 only do a direct setting for bytes less than 128, as otherwise there can be
498 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
499 environment, the tables will only recognize ASCII characters anyway, but in at
500 least one Windows environment, some higher bytes bits were set in the tables.
501 So we deal with that case by considering the UTF-8 encoding.
502
503 Arguments:
504 start_bits the starting bitmap
505 cbit type the type of character wanted
506 table_limit 32 for non-UTF-8; 16 for UTF-8
507 cd the block with char table pointers
508
509 Returns: nothing
510 */
511
512 static void
513 set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
514 compile_data *cd)
515 {
516 register int c;
517 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
518 if (table_limit == 32) return;
519 for (c = 128; c < 256; c++)
520 {
521 if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
522 {
523 uschar buff[8];
524 (void)_pcre_ord2utf8(c, buff);
525 SET_BIT(buff[0]);
526 }
527 }
528 }
529
530
531 /*************************************************
532 * Set bits for a negative character type *
533 *************************************************/
534
535 /* This function sets starting bits for a negative character type such as \D.
536 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
537 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
538 Unlike in the positive case, where we can set appropriate starting bits for
539 specific high-valued UTF-8 characters, in this case we have to set the bits for
540 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
541 0xc0 (192) for simplicity.
542
543 Arguments:
544 start_bits the starting bitmap
545 cbit type the type of character wanted
546 table_limit 32 for non-UTF-8; 16 for UTF-8
547 cd the block with char table pointers
548
549 Returns: nothing
550 */
551
552 static void
553 set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
554 compile_data *cd)
555 {
556 register int c;
557 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
558 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
559 }
560
561
562
563 /*************************************************
564 * Create bitmap of starting bytes *
565 *************************************************/
566
567 /* This function scans a compiled unanchored expression recursively and
568 attempts to build a bitmap of the set of possible starting bytes. As time goes
569 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
570 useful for parenthesized groups in patterns such as (a*)b where the group
571 provides some optional starting bytes but scanning must continue at the outer
572 level to find at least one mandatory byte. At the outermost level, this
573 function fails unless the result is SSB_DONE.
574
575 Arguments:
576 code points to an expression
577 start_bits points to a 32-byte table, initialized to 0
578 caseless the current state of the caseless flag
579 utf8 TRUE if in UTF-8 mode
580 cd the block with char table pointers
581
582 Returns: SSB_FAIL => Failed to find any starting bytes
583 SSB_DONE => Found mandatory starting bytes
584 SSB_CONTINUE => Found optional starting bytes
585 */
586
587 static int
588 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
589 BOOL utf8, compile_data *cd)
590 {
591 register int c;
592 int yield = SSB_DONE;
593 int table_limit = utf8? 16:32;
594
595 #if 0
596 /* ========================================================================= */
597 /* The following comment and code was inserted in January 1999. In May 2006,
598 when it was observed to cause compiler warnings about unused values, I took it
599 out again. If anybody is still using OS/2, they will have to put it back
600 manually. */
601
602 /* This next statement and the later reference to dummy are here in order to
603 trick the optimizer of the IBM C compiler for OS/2 into generating correct
604 code. Apparently IBM isn't going to fix the problem, and we would rather not
605 disable optimization (in this module it actually makes a big difference, and
606 the pcre module can use all the optimization it can get). */
607
608 volatile int dummy;
609 /* ========================================================================= */
610 #endif
611
612 do
613 {
614 const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
615 BOOL try_next = TRUE;
616
617 while (try_next) /* Loop for items in this branch */
618 {
619 int rc;
620 switch(*tcode)
621 {
622 /* Fail if we reach something we don't understand */
623
624 default:
625 return SSB_FAIL;
626
627 /* If we hit a bracket or a positive lookahead assertion, recurse to set
628 bits from within the subpattern. If it can't find anything, we have to
629 give up. If it finds some mandatory character(s), we are done for this
630 branch. Otherwise, carry on scanning after the subpattern. */
631
632 case OP_BRA:
633 case OP_SBRA:
634 case OP_CBRA:
635 case OP_SCBRA:
636 case OP_ONCE:
637 case OP_ASSERT:
638 rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
639 if (rc == SSB_FAIL) return SSB_FAIL;
640 if (rc == SSB_DONE) try_next = FALSE; else
641 {
642 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
643 tcode += 1 + LINK_SIZE;
644 }
645 break;
646
647 /* If we hit ALT or KET, it means we haven't found anything mandatory in
648 this branch, though we might have found something optional. For ALT, we
649 continue with the next alternative, but we have to arrange that the final
650 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
651 return SSB_CONTINUE: if this is the top level, that indicates failure,
652 but after a nested subpattern, it causes scanning to continue. */
653
654 case OP_ALT:
655 yield = SSB_CONTINUE;
656 try_next = FALSE;
657 break;
658
659 case OP_KET:
660 case OP_KETRMAX:
661 case OP_KETRMIN:
662 return SSB_CONTINUE;
663
664 /* Skip over callout */
665
666 case OP_CALLOUT:
667 tcode += 2 + 2*LINK_SIZE;
668 break;
669
670 /* Skip over lookbehind and negative lookahead assertions */
671
672 case OP_ASSERT_NOT:
673 case OP_ASSERTBACK:
674 case OP_ASSERTBACK_NOT:
675 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
676 tcode += 1 + LINK_SIZE;
677 break;
678
679 /* Skip over an option setting, changing the caseless flag */
680
681 case OP_OPT:
682 caseless = (tcode[1] & PCRE_CASELESS) != 0;
683 tcode += 2;
684 break;
685
686 /* BRAZERO does the bracket, but carries on. */
687
688 case OP_BRAZERO:
689 case OP_BRAMINZERO:
690 if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
691 return SSB_FAIL;
692 /* =========================================================================
693 See the comment at the head of this function concerning the next line,
694 which was an old fudge for the benefit of OS/2.
695 dummy = 1;
696 ========================================================================= */
697 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
698 tcode += 1 + LINK_SIZE;
699 break;
700
701 /* SKIPZERO skips the bracket. */
702
703 case OP_SKIPZERO:
704 tcode++;
705 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
706 tcode += 1 + LINK_SIZE;
707 break;
708
709 /* Single-char * or ? sets the bit and tries the next item */
710
711 case OP_STAR:
712 case OP_MINSTAR:
713 case OP_POSSTAR:
714 case OP_QUERY:
715 case OP_MINQUERY:
716 case OP_POSQUERY:
717 tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
718 break;
719
720 /* Single-char upto sets the bit and tries the next */
721
722 case OP_UPTO:
723 case OP_MINUPTO:
724 case OP_POSUPTO:
725 tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
726 break;
727
728 /* At least one single char sets the bit and stops */
729
730 case OP_EXACT: /* Fall through */
731 tcode += 2;
732
733 case OP_CHAR:
734 case OP_CHARNC:
735 case OP_PLUS:
736 case OP_MINPLUS:
737 case OP_POSPLUS:
738 (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
739 try_next = FALSE;
740 break;
741
742 /* Special spacing and line-terminating items. These recognize specific
743 lists of characters. The difference between VSPACE and ANYNL is that the
744 latter can match the two-character CRLF sequence, but that is not
745 relevant for finding the first character, so their code here is
746 identical. */
747
748 case OP_HSPACE:
749 SET_BIT(0x09);
750 SET_BIT(0x20);
751 if (utf8)
752 {
753 SET_BIT(0xC2); /* For U+00A0 */
754 SET_BIT(0xE1); /* For U+1680, U+180E */
755 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
756 SET_BIT(0xE3); /* For U+3000 */
757 }
758 else SET_BIT(0xA0);
759 try_next = FALSE;
760 break;
761
762 case OP_ANYNL:
763 case OP_VSPACE:
764 SET_BIT(0x0A);
765 SET_BIT(0x0B);
766 SET_BIT(0x0C);
767 SET_BIT(0x0D);
768 if (utf8)
769 {
770 SET_BIT(0xC2); /* For U+0085 */
771 SET_BIT(0xE2); /* For U+2028, U+2029 */
772 }
773 else SET_BIT(0x85);
774 try_next = FALSE;
775 break;
776
777 /* Single character types set the bits and stop. Note that if PCRE_UCP
778 is set, we do not see these op codes because \d etc are converted to
779 properties. Therefore, these apply in the case when only characters less
780 than 256 are recognized to match the types. */
781
782 case OP_NOT_DIGIT:
783 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
784 try_next = FALSE;
785 break;
786
787 case OP_DIGIT:
788 set_type_bits(start_bits, cbit_digit, table_limit, cd);
789 try_next = FALSE;
790 break;
791
792 /* The cbit_space table has vertical tab as whitespace; we have to
793 ensure it is set as not whitespace. */
794
795 case OP_NOT_WHITESPACE:
796 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
797 start_bits[1] |= 0x08;
798 try_next = FALSE;
799 break;
800
801 /* The cbit_space table has vertical tab as whitespace; we have to
802 not set it from the table. */
803
804 case OP_WHITESPACE:
805 c = start_bits[1]; /* Save in case it was already set */
806 set_type_bits(start_bits, cbit_space, table_limit, cd);
807 start_bits[1] = (start_bits[1] & ~0x08) | c;
808 try_next = FALSE;
809 break;
810
811 case OP_NOT_WORDCHAR:
812 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
813 try_next = FALSE;
814 break;
815
816 case OP_WORDCHAR:
817 set_type_bits(start_bits, cbit_word, table_limit, cd);
818 try_next = FALSE;
819 break;
820
821 /* One or more character type fudges the pointer and restarts, knowing
822 it will hit a single character type and stop there. */
823
824 case OP_TYPEPLUS:
825 case OP_TYPEMINPLUS:
826 case OP_TYPEPOSPLUS:
827 tcode++;
828 break;
829
830 case OP_TYPEEXACT:
831 tcode += 3;
832 break;
833
834 /* Zero or more repeats of character types set the bits and then
835 try again. */
836
837 case OP_TYPEUPTO:
838 case OP_TYPEMINUPTO:
839 case OP_TYPEPOSUPTO:
840 tcode += 2; /* Fall through */
841
842 case OP_TYPESTAR:
843 case OP_TYPEMINSTAR:
844 case OP_TYPEPOSSTAR:
845 case OP_TYPEQUERY:
846 case OP_TYPEMINQUERY:
847 case OP_TYPEPOSQUERY:
848 switch(tcode[1])
849 {
850 default:
851 case OP_ANY:
852 case OP_ALLANY:
853 return SSB_FAIL;
854
855 case OP_HSPACE:
856 SET_BIT(0x09);
857 SET_BIT(0x20);
858 if (utf8)
859 {
860 SET_BIT(0xC2); /* For U+00A0 */
861 SET_BIT(0xE1); /* For U+1680, U+180E */
862 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
863 SET_BIT(0xE3); /* For U+3000 */
864 }
865 else SET_BIT(0xA0);
866 break;
867
868 case OP_ANYNL:
869 case OP_VSPACE:
870 SET_BIT(0x0A);
871 SET_BIT(0x0B);
872 SET_BIT(0x0C);
873 SET_BIT(0x0D);
874 if (utf8)
875 {
876 SET_BIT(0xC2); /* For U+0085 */
877 SET_BIT(0xE2); /* For U+2028, U+2029 */
878 }
879 else SET_BIT(0x85);
880 break;
881
882 case OP_NOT_DIGIT:
883 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
884 break;
885
886 case OP_DIGIT:
887 set_type_bits(start_bits, cbit_digit, table_limit, cd);
888 break;
889
890 /* The cbit_space table has vertical tab as whitespace; we have to
891 ensure it gets set as not whitespace. */
892
893 case OP_NOT_WHITESPACE:
894 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
895 start_bits[1] |= 0x08;
896 break;
897
898 /* The cbit_space table has vertical tab as whitespace; we have to
899 avoid setting it. */
900
901 case OP_WHITESPACE:
902 c = start_bits[1]; /* Save in case it was already set */
903 set_type_bits(start_bits, cbit_space, table_limit, cd);
904 start_bits[1] = (start_bits[1] & ~0x08) | c;
905 break;
906
907 case OP_NOT_WORDCHAR:
908 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
909 break;
910
911 case OP_WORDCHAR:
912 set_type_bits(start_bits, cbit_word, table_limit, cd);
913 break;
914 }
915
916 tcode += 2;
917 break;
918
919 /* Character class where all the information is in a bit map: set the
920 bits and either carry on or not, according to the repeat count. If it was
921 a negative class, and we are operating with UTF-8 characters, any byte
922 with a value >= 0xc4 is a potentially valid starter because it starts a
923 character with a value > 255. */
924
925 case OP_NCLASS:
926 #ifdef SUPPORT_UTF8
927 if (utf8)
928 {
929 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
930 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
931 }
932 #endif
933 /* Fall through */
934
935 case OP_CLASS:
936 {
937 tcode++;
938
939 /* In UTF-8 mode, the bits in a bit map correspond to character
940 values, not to byte values. However, the bit map we are constructing is
941 for byte values. So we have to do a conversion for characters whose
942 value is > 127. In fact, there are only two possible starting bytes for
943 characters in the range 128 - 255. */
944
945 #ifdef SUPPORT_UTF8
946 if (utf8)
947 {
948 for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
949 for (c = 128; c < 256; c++)
950 {
951 if ((tcode[c/8] && (1 << (c&7))) != 0)
952 {
953 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
954 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
955 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
956 }
957 }
958 }
959
960 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
961
962 else
963 #endif
964 {
965 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
966 }
967
968 /* Advance past the bit map, and act on what follows */
969
970 tcode += 32;
971 switch (*tcode)
972 {
973 case OP_CRSTAR:
974 case OP_CRMINSTAR:
975 case OP_CRQUERY:
976 case OP_CRMINQUERY:
977 tcode++;
978 break;
979
980 case OP_CRRANGE:
981 case OP_CRMINRANGE:
982 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
983 else try_next = FALSE;
984 break;
985
986 default:
987 try_next = FALSE;
988 break;
989 }
990 }
991 break; /* End of bitmap class handling */
992
993 } /* End of switch */
994 } /* End of try_next loop */
995
996 code += GET(code, 1); /* Advance to next branch */
997 }
998 while (*code == OP_ALT);
999 return yield;
1000 }
1001
1002
1003
1004 /*************************************************
1005 * Study a compiled expression *
1006 *************************************************/
1007
1008 /* This function is handed a compiled expression that it must study to produce
1009 information that will speed up the matching. It returns a pcre_extra block
1010 which then gets handed back to pcre_exec().
1011
1012 Arguments:
1013 re points to the compiled expression
1014 options contains option bits
1015 errorptr points to where to place error messages;
1016 set NULL unless error
1017
1018 Returns: pointer to a pcre_extra block, with study_data filled in and the
1019 appropriate flags set;
1020 NULL on error or if no optimization possible
1021 */
1022
1023 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1024 pcre_study(const pcre *external_re, int options, const char **errorptr)
1025 {
1026 int min;
1027 BOOL bits_set = FALSE;
1028 uschar start_bits[32];
1029 pcre_extra *extra;
1030 pcre_study_data *study;
1031 const uschar *tables;
1032 uschar *code;
1033 compile_data compile_block;
1034 const real_pcre *re = (const real_pcre *)external_re;
1035
1036 *errorptr = NULL;
1037
1038 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1039 {
1040 *errorptr = "argument is not a compiled regular expression";
1041 return NULL;
1042 }
1043
1044 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1045 {
1046 *errorptr = "unknown or incorrect option bit(s) set";
1047 return NULL;
1048 }
1049
1050 code = (uschar *)re + re->name_table_offset +
1051 (re->name_count * re->name_entry_size);
1052
1053 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1054 a multiline pattern that matches only at "line starts", there is no point in
1055 seeking a list of starting bytes. */
1056
1057 if ((re->options & PCRE_ANCHORED) == 0 &&
1058 (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1059 {
1060 /* Set the character tables in the block that is passed around */
1061
1062 tables = re->tables;
1063 if (tables == NULL)
1064 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1065 (void *)(&tables));
1066
1067 compile_block.lcc = tables + lcc_offset;
1068 compile_block.fcc = tables + fcc_offset;
1069 compile_block.cbits = tables + cbits_offset;
1070 compile_block.ctypes = tables + ctypes_offset;
1071
1072 /* See if we can find a fixed set of initial characters for the pattern. */
1073
1074 memset(start_bits, 0, 32 * sizeof(uschar));
1075 bits_set = set_start_bits(code, start_bits,
1076 (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
1077 &compile_block) == SSB_DONE;
1078 }
1079
1080 /* Find the minimum length of subject string. */
1081
1082 min = find_minlength(code, code, re->options);
1083
1084 /* Return NULL if no optimization is possible. */
1085
1086 if (!bits_set && min < 0) return NULL;
1087
1088 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
1089 the latter, which is pointed to by the former, which may also get additional
1090 data set later by the calling program. At the moment, the size of
1091 pcre_study_data is fixed. We nevertheless save it in a field for returning via
1092 the pcre_fullinfo() function so that if it becomes variable in the future, we
1093 don't have to change that code. */
1094
1095 extra = (pcre_extra *)(pcre_malloc)
1096 (sizeof(pcre_extra) + sizeof(pcre_study_data));
1097
1098 if (extra == NULL)
1099 {
1100 *errorptr = "failed to get memory";
1101 return NULL;
1102 }
1103
1104 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
1105 extra->flags = PCRE_EXTRA_STUDY_DATA;
1106 extra->study_data = study;
1107
1108 study->size = sizeof(pcre_study_data);
1109 study->flags = 0;
1110
1111 if (bits_set)
1112 {
1113 study->flags |= PCRE_STUDY_MAPPED;
1114 memcpy(study->start_bits, start_bits, sizeof(start_bits));
1115 }
1116
1117 if (min >= 0)
1118 {
1119 study->flags |= PCRE_STUDY_MINLEN;
1120 study->minlength = min;
1121 }
1122
1123 return extra;
1124 }
1125
1126 /* End of pcre_study.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12