/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 455 - (show annotations) (download)
Sat Sep 26 19:12:32 2009 UTC (4 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 26844 byte(s)
Added lower bound length-finding to pcre_study() and use it when matching; make 
the value available via pcre_fullinfo(); also fixed bugs connected with
pcre_study() in pcre_dfa_exec(). 

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #include "pcre_internal.h"
50
51
52 /* Returns from set_start_bits() */
53
54 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
55
56
57
58 /*************************************************
59 * Find the minimum subject length for a group *
60 *************************************************/
61
62 /* Scan a parenthesized group and compute the minimum length of subject that
63 is needed to match it. This is a lower bound; it does not mean there is a
64 string of that length that matches. In UTF8 mode, the result is in characters
65 rather than bytes.
66
67 Arguments:
68 code pointer to start of group (the bracket)
69 startcode pointer to start of the whole pattern
70 options the compiling options
71
72 Returns: the minimum length
73 -1 if \C was encountered
74 -2 internal error (missing capturing bracket)
75 */
76
77 static int
78 find_minlength(const uschar *code, const uschar *startcode, int options)
79 {
80 int length = -1;
81 BOOL utf8 = (options & PCRE_UTF8) != 0;
82 BOOL had_recurse = FALSE;
83 register int branchlength = 0;
84 register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
85
86 if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
87
88 /* Scan along the opcodes for this branch. If we get to the end of the
89 branch, check the length against that of the other branches. */
90
91 for (;;)
92 {
93 int d, min;
94 uschar *cs, *ce;
95 register int op = *cc;
96
97 switch (op)
98 {
99 case OP_CBRA:
100 case OP_SCBRA:
101 case OP_BRA:
102 case OP_SBRA:
103 case OP_ONCE:
104 case OP_COND:
105 case OP_SCOND:
106 d = find_minlength(cc, startcode, options);
107 if (d < 0) return d;
108 branchlength += d;
109 do cc += GET(cc, 1); while (*cc == OP_ALT);
110 cc += 1 + LINK_SIZE;
111 break;
112
113 /* Reached end of a branch; if it's a ket it is the end of a nested
114 call. If it's ALT it is an alternation in a nested call. If it is
115 END it's the end of the outer call. All can be handled by the same code. */
116
117 case OP_ALT:
118 case OP_KET:
119 case OP_KETRMAX:
120 case OP_KETRMIN:
121 case OP_END:
122 if (length < 0 || (!had_recurse && branchlength < length))
123 length = branchlength;
124 if (*cc != OP_ALT) return length;
125 cc += 1 + LINK_SIZE;
126 branchlength = 0;
127 had_recurse = FALSE;
128 break;
129
130 /* Skip over assertive subpatterns */
131
132 case OP_ASSERT:
133 case OP_ASSERT_NOT:
134 case OP_ASSERTBACK:
135 case OP_ASSERTBACK_NOT:
136 do cc += GET(cc, 1); while (*cc == OP_ALT);
137 /* Fall through */
138
139 /* Skip over things that don't match chars */
140
141 case OP_REVERSE:
142 case OP_CREF:
143 case OP_RREF:
144 case OP_DEF:
145 case OP_OPT:
146 case OP_CALLOUT:
147 case OP_SOD:
148 case OP_SOM:
149 case OP_EOD:
150 case OP_EODN:
151 case OP_CIRC:
152 case OP_DOLL:
153 case OP_NOT_WORD_BOUNDARY:
154 case OP_WORD_BOUNDARY:
155 cc += _pcre_OP_lengths[*cc];
156 break;
157
158 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
159
160 case OP_BRAZERO:
161 case OP_BRAMINZERO:
162 case OP_SKIPZERO:
163 cc += _pcre_OP_lengths[*cc];
164 do cc += GET(cc, 1); while (*cc == OP_ALT);
165 cc += 1 + LINK_SIZE;
166 break;
167
168 /* Handle literal characters and + repetitions */
169
170 case OP_CHAR:
171 case OP_CHARNC:
172 case OP_NOT:
173 case OP_PLUS:
174 case OP_MINPLUS:
175 case OP_POSPLUS:
176 case OP_NOTPLUS:
177 case OP_NOTMINPLUS:
178 case OP_NOTPOSPLUS:
179 branchlength++;
180 cc += 2;
181 #ifdef SUPPORT_UTF8
182 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
183 #endif
184 break;
185
186 case OP_TYPEPLUS:
187 case OP_TYPEMINPLUS:
188 case OP_TYPEPOSPLUS:
189 branchlength++;
190 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
191 break;
192
193 /* Handle exact repetitions. The count is already in characters, but we
194 need to skip over a multibyte character in UTF8 mode. */
195
196 case OP_EXACT:
197 case OP_NOTEXACT:
198 branchlength += GET2(cc,1);
199 cc += 4;
200 #ifdef SUPPORT_UTF8
201 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
202 #endif
203 break;
204
205 case OP_TYPEEXACT:
206 branchlength += GET2(cc,1);
207 cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
208 break;
209
210 /* Handle single-char non-literal matchers */
211
212 case OP_PROP:
213 case OP_NOTPROP:
214 cc += 2;
215 /* Fall through */
216
217 case OP_NOT_DIGIT:
218 case OP_DIGIT:
219 case OP_NOT_WHITESPACE:
220 case OP_WHITESPACE:
221 case OP_NOT_WORDCHAR:
222 case OP_WORDCHAR:
223 case OP_ANY:
224 case OP_ALLANY:
225 case OP_EXTUNI:
226 case OP_HSPACE:
227 case OP_NOT_HSPACE:
228 case OP_VSPACE:
229 case OP_NOT_VSPACE:
230 branchlength++;
231 cc++;
232 break;
233
234 /* "Any newline" might match two characters */
235
236 case OP_ANYNL:
237 branchlength += 2;
238 cc++;
239 break;
240
241 /* The single-byte matcher means we can't proceed in UTF-8 mode */
242
243 case OP_ANYBYTE:
244 #ifdef SUPPORT_UTF8
245 if (utf8) return -1;
246 #endif
247 branchlength++;
248 cc++;
249 break;
250
251 /* For repeated character types, we have to test for \p and \P, which have
252 an extra two bytes of parameters. */
253
254 case OP_TYPESTAR:
255 case OP_TYPEMINSTAR:
256 case OP_TYPEQUERY:
257 case OP_TYPEMINQUERY:
258 case OP_TYPEPOSSTAR:
259 case OP_TYPEPOSQUERY:
260 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
261 cc += _pcre_OP_lengths[op];
262 break;
263
264 case OP_TYPEUPTO:
265 case OP_TYPEMINUPTO:
266 case OP_TYPEPOSUPTO:
267 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
268 cc += _pcre_OP_lengths[op];
269 break;
270
271 /* Check a class for variable quantification */
272
273 #ifdef SUPPORT_UTF8
274 case OP_XCLASS:
275 cc += GET(cc, 1) - 33;
276 /* Fall through */
277 #endif
278
279 case OP_CLASS:
280 case OP_NCLASS:
281 cc += 33;
282
283 switch (*cc)
284 {
285 case OP_CRPLUS:
286 case OP_CRMINPLUS:
287 branchlength++;
288 /* Fall through */
289
290 case OP_CRSTAR:
291 case OP_CRMINSTAR:
292 case OP_CRQUERY:
293 case OP_CRMINQUERY:
294 cc++;
295 break;
296
297 case OP_CRRANGE:
298 case OP_CRMINRANGE:
299 branchlength += GET2(cc,1);
300 cc += 5;
301 break;
302
303 default:
304 branchlength++;
305 break;
306 }
307 break;
308
309 /* Backreferences and subroutine calls are treated in the same way: we find
310 the minimum length for the subpattern. A recursion, however, causes an
311 a flag to be set that causes the length of this branch to be ignored. The
312 logic is that a recursion can only make sense if there is another
313 alternation that stops the recursing. That will provide the minimum length
314 (when no recursion happens). A backreference within the group that it is
315 referencing behaves in the same way. */
316
317 case OP_REF:
318 ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
319 if (cs == NULL) return -2;
320 do ce += GET(ce, 1); while (*ce == OP_ALT);
321 if (cc > cs && cc < ce)
322 {
323 d = 0;
324 had_recurse = TRUE;
325 }
326 else d = find_minlength(cs, startcode, options);
327 cc += 3;
328
329 /* Handle repeated back references */
330
331 switch (*cc)
332 {
333 case OP_CRSTAR:
334 case OP_CRMINSTAR:
335 case OP_CRQUERY:
336 case OP_CRMINQUERY:
337 min = 0;
338 cc++;
339 break;
340
341 case OP_CRRANGE:
342 case OP_CRMINRANGE:
343 min = GET2(cc, 1);
344 cc += 5;
345 break;
346
347 default:
348 min = 1;
349 break;
350 }
351
352 branchlength += min * d;
353 break;
354
355 case OP_RECURSE:
356 cs = ce = (uschar *)startcode + GET(cc, 1);
357 if (cs == NULL) return -2;
358 do ce += GET(ce, 1); while (*ce == OP_ALT);
359 if (cc > cs && cc < ce)
360 had_recurse = TRUE;
361 else
362 branchlength += find_minlength(cs, startcode, options);
363 cc += 1 + LINK_SIZE;
364 break;
365
366 /* Anything else does not or need not match a character. We can get the
367 item's length from the table, but for those that can match zero occurrences
368 of a character, we must take special action for UTF-8 characters. */
369
370 case OP_UPTO:
371 case OP_NOTUPTO:
372 case OP_MINUPTO:
373 case OP_NOTMINUPTO:
374 case OP_POSUPTO:
375 case OP_STAR:
376 case OP_MINSTAR:
377 case OP_NOTMINSTAR:
378 case OP_POSSTAR:
379 case OP_NOTPOSSTAR:
380 case OP_QUERY:
381 case OP_MINQUERY:
382 case OP_NOTMINQUERY:
383 case OP_POSQUERY:
384 case OP_NOTPOSQUERY:
385 cc += _pcre_OP_lengths[op];
386 #ifdef SUPPORT_UTF8
387 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
388 #endif
389 break;
390
391 /* For the record, these are the opcodes that are matched by "default":
392 OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
393 OP_THEN. */
394
395 default:
396 cc += _pcre_OP_lengths[op];
397 break;
398 }
399 }
400 /* Control never gets here */
401 }
402
403
404
405 /*************************************************
406 * Set a bit and maybe its alternate case *
407 *************************************************/
408
409 /* Given a character, set its bit in the table, and also the bit for the other
410 version of a letter if we are caseless.
411
412 Arguments:
413 start_bits points to the bit map
414 c is the character
415 caseless the caseless flag
416 cd the block with char table pointers
417
418 Returns: nothing
419 */
420
421 static void
422 set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
423 {
424 start_bits[c/8] |= (1 << (c&7));
425 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
426 start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
427 }
428
429
430
431 /*************************************************
432 * Create bitmap of starting bytes *
433 *************************************************/
434
435 /* This function scans a compiled unanchored expression recursively and
436 attempts to build a bitmap of the set of possible starting bytes. As time goes
437 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
438 useful for parenthesized groups in patterns such as (a*)b where the group
439 provides some optional starting bytes but scanning must continue at the outer
440 level to find at least one mandatory byte. At the outermost level, this
441 function fails unless the result is SSB_DONE.
442
443 Arguments:
444 code points to an expression
445 start_bits points to a 32-byte table, initialized to 0
446 caseless the current state of the caseless flag
447 utf8 TRUE if in UTF-8 mode
448 cd the block with char table pointers
449
450 Returns: SSB_FAIL => Failed to find any starting bytes
451 SSB_DONE => Found mandatory starting bytes
452 SSB_CONTINUE => Found optional starting bytes
453 */
454
455 static int
456 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
457 BOOL utf8, compile_data *cd)
458 {
459 register int c;
460 int yield = SSB_DONE;
461
462 #if 0
463 /* ========================================================================= */
464 /* The following comment and code was inserted in January 1999. In May 2006,
465 when it was observed to cause compiler warnings about unused values, I took it
466 out again. If anybody is still using OS/2, they will have to put it back
467 manually. */
468
469 /* This next statement and the later reference to dummy are here in order to
470 trick the optimizer of the IBM C compiler for OS/2 into generating correct
471 code. Apparently IBM isn't going to fix the problem, and we would rather not
472 disable optimization (in this module it actually makes a big difference, and
473 the pcre module can use all the optimization it can get). */
474
475 volatile int dummy;
476 /* ========================================================================= */
477 #endif
478
479 do
480 {
481 const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
482 BOOL try_next = TRUE;
483
484 while (try_next) /* Loop for items in this branch */
485 {
486 int rc;
487 switch(*tcode)
488 {
489 /* Fail if we reach something we don't understand */
490
491 default:
492 return SSB_FAIL;
493
494 /* If we hit a bracket or a positive lookahead assertion, recurse to set
495 bits from within the subpattern. If it can't find anything, we have to
496 give up. If it finds some mandatory character(s), we are done for this
497 branch. Otherwise, carry on scanning after the subpattern. */
498
499 case OP_BRA:
500 case OP_SBRA:
501 case OP_CBRA:
502 case OP_SCBRA:
503 case OP_ONCE:
504 case OP_ASSERT:
505 rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
506 if (rc == SSB_FAIL) return SSB_FAIL;
507 if (rc == SSB_DONE) try_next = FALSE; else
508 {
509 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
510 tcode += 1 + LINK_SIZE;
511 }
512 break;
513
514 /* If we hit ALT or KET, it means we haven't found anything mandatory in
515 this branch, though we might have found something optional. For ALT, we
516 continue with the next alternative, but we have to arrange that the final
517 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
518 return SSB_CONTINUE: if this is the top level, that indicates failure,
519 but after a nested subpattern, it causes scanning to continue. */
520
521 case OP_ALT:
522 yield = SSB_CONTINUE;
523 try_next = FALSE;
524 break;
525
526 case OP_KET:
527 case OP_KETRMAX:
528 case OP_KETRMIN:
529 return SSB_CONTINUE;
530
531 /* Skip over callout */
532
533 case OP_CALLOUT:
534 tcode += 2 + 2*LINK_SIZE;
535 break;
536
537 /* Skip over lookbehind and negative lookahead assertions */
538
539 case OP_ASSERT_NOT:
540 case OP_ASSERTBACK:
541 case OP_ASSERTBACK_NOT:
542 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
543 tcode += 1 + LINK_SIZE;
544 break;
545
546 /* Skip over an option setting, changing the caseless flag */
547
548 case OP_OPT:
549 caseless = (tcode[1] & PCRE_CASELESS) != 0;
550 tcode += 2;
551 break;
552
553 /* BRAZERO does the bracket, but carries on. */
554
555 case OP_BRAZERO:
556 case OP_BRAMINZERO:
557 if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
558 return SSB_FAIL;
559 /* =========================================================================
560 See the comment at the head of this function concerning the next line,
561 which was an old fudge for the benefit of OS/2.
562 dummy = 1;
563 ========================================================================= */
564 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
565 tcode += 1 + LINK_SIZE;
566 break;
567
568 /* SKIPZERO skips the bracket. */
569
570 case OP_SKIPZERO:
571 tcode++;
572 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
573 tcode += 1 + LINK_SIZE;
574 break;
575
576 /* Single-char * or ? sets the bit and tries the next item */
577
578 case OP_STAR:
579 case OP_MINSTAR:
580 case OP_POSSTAR:
581 case OP_QUERY:
582 case OP_MINQUERY:
583 case OP_POSQUERY:
584 set_bit(start_bits, tcode[1], caseless, cd);
585 tcode += 2;
586 #ifdef SUPPORT_UTF8
587 if (utf8 && tcode[-1] >= 0xc0)
588 tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
589 #endif
590 break;
591
592 /* Single-char upto sets the bit and tries the next */
593
594 case OP_UPTO:
595 case OP_MINUPTO:
596 case OP_POSUPTO:
597 set_bit(start_bits, tcode[3], caseless, cd);
598 tcode += 4;
599 #ifdef SUPPORT_UTF8
600 if (utf8 && tcode[-1] >= 0xc0)
601 tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
602 #endif
603 break;
604
605 /* At least one single char sets the bit and stops */
606
607 case OP_EXACT: /* Fall through */
608 tcode += 2;
609
610 case OP_CHAR:
611 case OP_CHARNC:
612 case OP_PLUS:
613 case OP_MINPLUS:
614 case OP_POSPLUS:
615 set_bit(start_bits, tcode[1], caseless, cd);
616 try_next = FALSE;
617 break;
618
619 /* Single character type sets the bits and stops */
620
621 case OP_NOT_DIGIT:
622 for (c = 0; c < 32; c++)
623 start_bits[c] |= ~cd->cbits[c+cbit_digit];
624 try_next = FALSE;
625 break;
626
627 case OP_DIGIT:
628 for (c = 0; c < 32; c++)
629 start_bits[c] |= cd->cbits[c+cbit_digit];
630 try_next = FALSE;
631 break;
632
633 /* The cbit_space table has vertical tab as whitespace; we have to
634 discard it. */
635
636 case OP_NOT_WHITESPACE:
637 for (c = 0; c < 32; c++)
638 {
639 int d = cd->cbits[c+cbit_space];
640 if (c == 1) d &= ~0x08;
641 start_bits[c] |= ~d;
642 }
643 try_next = FALSE;
644 break;
645
646 /* The cbit_space table has vertical tab as whitespace; we have to
647 discard it. */
648
649 case OP_WHITESPACE:
650 for (c = 0; c < 32; c++)
651 {
652 int d = cd->cbits[c+cbit_space];
653 if (c == 1) d &= ~0x08;
654 start_bits[c] |= d;
655 }
656 try_next = FALSE;
657 break;
658
659 case OP_NOT_WORDCHAR:
660 for (c = 0; c < 32; c++)
661 start_bits[c] |= ~cd->cbits[c+cbit_word];
662 try_next = FALSE;
663 break;
664
665 case OP_WORDCHAR:
666 for (c = 0; c < 32; c++)
667 start_bits[c] |= cd->cbits[c+cbit_word];
668 try_next = FALSE;
669 break;
670
671 /* One or more character type fudges the pointer and restarts, knowing
672 it will hit a single character type and stop there. */
673
674 case OP_TYPEPLUS:
675 case OP_TYPEMINPLUS:
676 tcode++;
677 break;
678
679 case OP_TYPEEXACT:
680 tcode += 3;
681 break;
682
683 /* Zero or more repeats of character types set the bits and then
684 try again. */
685
686 case OP_TYPEUPTO:
687 case OP_TYPEMINUPTO:
688 case OP_TYPEPOSUPTO:
689 tcode += 2; /* Fall through */
690
691 case OP_TYPESTAR:
692 case OP_TYPEMINSTAR:
693 case OP_TYPEPOSSTAR:
694 case OP_TYPEQUERY:
695 case OP_TYPEMINQUERY:
696 case OP_TYPEPOSQUERY:
697 switch(tcode[1])
698 {
699 case OP_ANY:
700 case OP_ALLANY:
701 return SSB_FAIL;
702
703 case OP_NOT_DIGIT:
704 for (c = 0; c < 32; c++)
705 start_bits[c] |= ~cd->cbits[c+cbit_digit];
706 break;
707
708 case OP_DIGIT:
709 for (c = 0; c < 32; c++)
710 start_bits[c] |= cd->cbits[c+cbit_digit];
711 break;
712
713 /* The cbit_space table has vertical tab as whitespace; we have to
714 discard it. */
715
716 case OP_NOT_WHITESPACE:
717 for (c = 0; c < 32; c++)
718 {
719 int d = cd->cbits[c+cbit_space];
720 if (c == 1) d &= ~0x08;
721 start_bits[c] |= ~d;
722 }
723 break;
724
725 /* The cbit_space table has vertical tab as whitespace; we have to
726 discard it. */
727
728 case OP_WHITESPACE:
729 for (c = 0; c < 32; c++)
730 {
731 int d = cd->cbits[c+cbit_space];
732 if (c == 1) d &= ~0x08;
733 start_bits[c] |= d;
734 }
735 break;
736
737 case OP_NOT_WORDCHAR:
738 for (c = 0; c < 32; c++)
739 start_bits[c] |= ~cd->cbits[c+cbit_word];
740 break;
741
742 case OP_WORDCHAR:
743 for (c = 0; c < 32; c++)
744 start_bits[c] |= cd->cbits[c+cbit_word];
745 break;
746 }
747
748 tcode += 2;
749 break;
750
751 /* Character class where all the information is in a bit map: set the
752 bits and either carry on or not, according to the repeat count. If it was
753 a negative class, and we are operating with UTF-8 characters, any byte
754 with a value >= 0xc4 is a potentially valid starter because it starts a
755 character with a value > 255. */
756
757 case OP_NCLASS:
758 #ifdef SUPPORT_UTF8
759 if (utf8)
760 {
761 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
762 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
763 }
764 #endif
765 /* Fall through */
766
767 case OP_CLASS:
768 {
769 tcode++;
770
771 /* In UTF-8 mode, the bits in a bit map correspond to character
772 values, not to byte values. However, the bit map we are constructing is
773 for byte values. So we have to do a conversion for characters whose
774 value is > 127. In fact, there are only two possible starting bytes for
775 characters in the range 128 - 255. */
776
777 #ifdef SUPPORT_UTF8
778 if (utf8)
779 {
780 for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
781 for (c = 128; c < 256; c++)
782 {
783 if ((tcode[c/8] && (1 << (c&7))) != 0)
784 {
785 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
786 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
787 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
788 }
789 }
790 }
791
792 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
793
794 else
795 #endif
796 {
797 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
798 }
799
800 /* Advance past the bit map, and act on what follows */
801
802 tcode += 32;
803 switch (*tcode)
804 {
805 case OP_CRSTAR:
806 case OP_CRMINSTAR:
807 case OP_CRQUERY:
808 case OP_CRMINQUERY:
809 tcode++;
810 break;
811
812 case OP_CRRANGE:
813 case OP_CRMINRANGE:
814 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
815 else try_next = FALSE;
816 break;
817
818 default:
819 try_next = FALSE;
820 break;
821 }
822 }
823 break; /* End of bitmap class handling */
824
825 } /* End of switch */
826 } /* End of try_next loop */
827
828 code += GET(code, 1); /* Advance to next branch */
829 }
830 while (*code == OP_ALT);
831 return yield;
832 }
833
834
835
836 /*************************************************
837 * Study a compiled expression *
838 *************************************************/
839
840 /* This function is handed a compiled expression that it must study to produce
841 information that will speed up the matching. It returns a pcre_extra block
842 which then gets handed back to pcre_exec().
843
844 Arguments:
845 re points to the compiled expression
846 options contains option bits
847 errorptr points to where to place error messages;
848 set NULL unless error
849
850 Returns: pointer to a pcre_extra block, with study_data filled in and the
851 appropriate flags set;
852 NULL on error or if no optimization possible
853 */
854
855 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
856 pcre_study(const pcre *external_re, int options, const char **errorptr)
857 {
858 int min;
859 BOOL bits_set = FALSE;
860 uschar start_bits[32];
861 pcre_extra *extra;
862 pcre_study_data *study;
863 const uschar *tables;
864 uschar *code;
865 compile_data compile_block;
866 const real_pcre *re = (const real_pcre *)external_re;
867
868 *errorptr = NULL;
869
870 if (re == NULL || re->magic_number != MAGIC_NUMBER)
871 {
872 *errorptr = "argument is not a compiled regular expression";
873 return NULL;
874 }
875
876 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
877 {
878 *errorptr = "unknown or incorrect option bit(s) set";
879 return NULL;
880 }
881
882 code = (uschar *)re + re->name_table_offset +
883 (re->name_count * re->name_entry_size);
884
885 /* For an anchored pattern, or an unanchored pattern that has a first char, or
886 a multiline pattern that matches only at "line starts", there is no point in
887 seeking a list of starting bytes. */
888
889 if ((re->options & PCRE_ANCHORED) == 0 &&
890 (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
891 {
892 /* Set the character tables in the block that is passed around */
893
894 tables = re->tables;
895 if (tables == NULL)
896 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
897 (void *)(&tables));
898
899 compile_block.lcc = tables + lcc_offset;
900 compile_block.fcc = tables + fcc_offset;
901 compile_block.cbits = tables + cbits_offset;
902 compile_block.ctypes = tables + ctypes_offset;
903
904 /* See if we can find a fixed set of initial characters for the pattern. */
905
906 memset(start_bits, 0, 32 * sizeof(uschar));
907 bits_set = set_start_bits(code, start_bits,
908 (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
909 &compile_block) == SSB_DONE;
910 }
911
912 /* Find the minimum length of subject string. */
913
914 min = find_minlength(code, code, re->options);
915
916 /* Return NULL if no optimization is possible. */
917
918 if (!bits_set && min < 0) return NULL;
919
920 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
921 the latter, which is pointed to by the former, which may also get additional
922 data set later by the calling program. At the moment, the size of
923 pcre_study_data is fixed. We nevertheless save it in a field for returning via
924 the pcre_fullinfo() function so that if it becomes variable in the future, we
925 don't have to change that code. */
926
927 extra = (pcre_extra *)(pcre_malloc)
928 (sizeof(pcre_extra) + sizeof(pcre_study_data));
929
930 if (extra == NULL)
931 {
932 *errorptr = "failed to get memory";
933 return NULL;
934 }
935
936 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
937 extra->flags = PCRE_EXTRA_STUDY_DATA;
938 extra->study_data = study;
939
940 study->size = sizeof(pcre_study_data);
941 study->flags = 0;
942
943 if (bits_set)
944 {
945 study->flags |= PCRE_STUDY_MAPPED;
946 memcpy(study->start_bits, start_bits, sizeof(start_bits));
947 }
948
949 if (min >= 0)
950 {
951 study->flags |= PCRE_STUDY_MINLEN;
952 study->minlength = min;
953 }
954
955 return extra;
956 }
957
958 /* End of pcre_study.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12