/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 611 - (show annotations) (download)
Wed Jun 29 08:49:21 2011 UTC (3 years ago) by ph10
File MIME type: text/plain
File size: 36708 byte(s)
Fix \R problem with study: incorrect minimum subject length.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #include "pcre_internal.h"
50
51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52
53 /* Returns from set_start_bits() */
54
55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
56
57
58
59 /*************************************************
60 * Find the minimum subject length for a group *
61 *************************************************/
62
63 /* Scan a parenthesized group and compute the minimum length of subject that
64 is needed to match it. This is a lower bound; it does not mean there is a
65 string of that length that matches. In UTF8 mode, the result is in characters
66 rather than bytes.
67
68 Arguments:
69 code pointer to start of group (the bracket)
70 startcode pointer to start of the whole pattern
71 options the compiling options
72
73 Returns: the minimum length
74 -1 if \C was encountered
75 -2 internal error (missing capturing bracket)
76 -3 internal error (opcode not listed)
77 */
78
79 static int
80 find_minlength(const uschar *code, const uschar *startcode, int options)
81 {
82 int length = -1;
83 BOOL utf8 = (options & PCRE_UTF8) != 0;
84 BOOL had_recurse = FALSE;
85 register int branchlength = 0;
86 register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
87
88 if (*code == OP_CBRA || *code == OP_SCBRA ||
89 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2;
90
91 /* Scan along the opcodes for this branch. If we get to the end of the
92 branch, check the length against that of the other branches. */
93
94 for (;;)
95 {
96 int d, min;
97 uschar *cs, *ce;
98 register int op = *cc;
99
100 switch (op)
101 {
102 case OP_COND:
103 case OP_SCOND:
104
105 /* If there is only one branch in a condition, the implied branch has zero
106 length, so we don't add anything. This covers the DEFINE "condition"
107 automatically. */
108
109 cs = cc + GET(cc, 1);
110 if (*cs != OP_ALT)
111 {
112 cc = cs + 1 + LINK_SIZE;
113 break;
114 }
115
116 /* Otherwise we can fall through and treat it the same as any other
117 subpattern. */
118
119 case OP_CBRA:
120 case OP_SCBRA:
121 case OP_BRA:
122 case OP_SBRA:
123 case OP_CBRAPOS:
124 case OP_SCBRAPOS:
125 case OP_BRAPOS:
126 case OP_SBRAPOS:
127 case OP_ONCE:
128 d = find_minlength(cc, startcode, options);
129 if (d < 0) return d;
130 branchlength += d;
131 do cc += GET(cc, 1); while (*cc == OP_ALT);
132 cc += 1 + LINK_SIZE;
133 break;
134
135 /* Reached end of a branch; if it's a ket it is the end of a nested
136 call. If it's ALT it is an alternation in a nested call. If it is
137 END it's the end of the outer call. All can be handled by the same code. */
138
139 case OP_ALT:
140 case OP_KET:
141 case OP_KETRMAX:
142 case OP_KETRMIN:
143 case OP_KETRPOS:
144 case OP_END:
145 if (length < 0 || (!had_recurse && branchlength < length))
146 length = branchlength;
147 if (*cc != OP_ALT) return length;
148 cc += 1 + LINK_SIZE;
149 branchlength = 0;
150 had_recurse = FALSE;
151 break;
152
153 /* Skip over assertive subpatterns */
154
155 case OP_ASSERT:
156 case OP_ASSERT_NOT:
157 case OP_ASSERTBACK:
158 case OP_ASSERTBACK_NOT:
159 do cc += GET(cc, 1); while (*cc == OP_ALT);
160 /* Fall through */
161
162 /* Skip over things that don't match chars */
163
164 case OP_REVERSE:
165 case OP_CREF:
166 case OP_NCREF:
167 case OP_RREF:
168 case OP_NRREF:
169 case OP_DEF:
170 case OP_CALLOUT:
171 case OP_SOD:
172 case OP_SOM:
173 case OP_EOD:
174 case OP_EODN:
175 case OP_CIRC:
176 case OP_CIRCM:
177 case OP_DOLL:
178 case OP_DOLLM:
179 case OP_NOT_WORD_BOUNDARY:
180 case OP_WORD_BOUNDARY:
181 cc += _pcre_OP_lengths[*cc];
182 break;
183
184 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
185
186 case OP_BRAZERO:
187 case OP_BRAMINZERO:
188 case OP_BRAPOSZERO:
189 case OP_SKIPZERO:
190 cc += _pcre_OP_lengths[*cc];
191 do cc += GET(cc, 1); while (*cc == OP_ALT);
192 cc += 1 + LINK_SIZE;
193 break;
194
195 /* Handle literal characters and + repetitions */
196
197 case OP_CHAR:
198 case OP_CHARI:
199 case OP_NOT:
200 case OP_NOTI:
201 case OP_PLUS:
202 case OP_PLUSI:
203 case OP_MINPLUS:
204 case OP_MINPLUSI:
205 case OP_POSPLUS:
206 case OP_POSPLUSI:
207 case OP_NOTPLUS:
208 case OP_NOTPLUSI:
209 case OP_NOTMINPLUS:
210 case OP_NOTMINPLUSI:
211 case OP_NOTPOSPLUS:
212 case OP_NOTPOSPLUSI:
213 branchlength++;
214 cc += 2;
215 #ifdef SUPPORT_UTF8
216 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
217 #endif
218 break;
219
220 case OP_TYPEPLUS:
221 case OP_TYPEMINPLUS:
222 case OP_TYPEPOSPLUS:
223 branchlength++;
224 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
225 break;
226
227 /* Handle exact repetitions. The count is already in characters, but we
228 need to skip over a multibyte character in UTF8 mode. */
229
230 case OP_EXACT:
231 case OP_EXACTI:
232 case OP_NOTEXACT:
233 case OP_NOTEXACTI:
234 branchlength += GET2(cc,1);
235 cc += 4;
236 #ifdef SUPPORT_UTF8
237 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
238 #endif
239 break;
240
241 case OP_TYPEEXACT:
242 branchlength += GET2(cc,1);
243 cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
244 break;
245
246 /* Handle single-char non-literal matchers */
247
248 case OP_PROP:
249 case OP_NOTPROP:
250 cc += 2;
251 /* Fall through */
252
253 case OP_NOT_DIGIT:
254 case OP_DIGIT:
255 case OP_NOT_WHITESPACE:
256 case OP_WHITESPACE:
257 case OP_NOT_WORDCHAR:
258 case OP_WORDCHAR:
259 case OP_ANY:
260 case OP_ALLANY:
261 case OP_EXTUNI:
262 case OP_HSPACE:
263 case OP_NOT_HSPACE:
264 case OP_VSPACE:
265 case OP_NOT_VSPACE:
266 branchlength++;
267 cc++;
268 break;
269
270 /* "Any newline" might match two characters, but it also might match just
271 one. */
272
273 case OP_ANYNL:
274 branchlength += 1;
275 cc++;
276 break;
277
278 /* The single-byte matcher means we can't proceed in UTF-8 mode */
279
280 case OP_ANYBYTE:
281 #ifdef SUPPORT_UTF8
282 if (utf8) return -1;
283 #endif
284 branchlength++;
285 cc++;
286 break;
287
288 /* For repeated character types, we have to test for \p and \P, which have
289 an extra two bytes of parameters. */
290
291 case OP_TYPESTAR:
292 case OP_TYPEMINSTAR:
293 case OP_TYPEQUERY:
294 case OP_TYPEMINQUERY:
295 case OP_TYPEPOSSTAR:
296 case OP_TYPEPOSQUERY:
297 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
298 cc += _pcre_OP_lengths[op];
299 break;
300
301 case OP_TYPEUPTO:
302 case OP_TYPEMINUPTO:
303 case OP_TYPEPOSUPTO:
304 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
305 cc += _pcre_OP_lengths[op];
306 break;
307
308 /* Check a class for variable quantification */
309
310 #ifdef SUPPORT_UTF8
311 case OP_XCLASS:
312 cc += GET(cc, 1) - 33;
313 /* Fall through */
314 #endif
315
316 case OP_CLASS:
317 case OP_NCLASS:
318 cc += 33;
319
320 switch (*cc)
321 {
322 case OP_CRPLUS:
323 case OP_CRMINPLUS:
324 branchlength++;
325 /* Fall through */
326
327 case OP_CRSTAR:
328 case OP_CRMINSTAR:
329 case OP_CRQUERY:
330 case OP_CRMINQUERY:
331 cc++;
332 break;
333
334 case OP_CRRANGE:
335 case OP_CRMINRANGE:
336 branchlength += GET2(cc,1);
337 cc += 5;
338 break;
339
340 default:
341 branchlength++;
342 break;
343 }
344 break;
345
346 /* Backreferences and subroutine calls are treated in the same way: we find
347 the minimum length for the subpattern. A recursion, however, causes an
348 a flag to be set that causes the length of this branch to be ignored. The
349 logic is that a recursion can only make sense if there is another
350 alternation that stops the recursing. That will provide the minimum length
351 (when no recursion happens). A backreference within the group that it is
352 referencing behaves in the same way.
353
354 If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
355 matches an empty string (by default it causes a matching failure), so in
356 that case we must set the minimum length to zero. */
357
358 case OP_REF:
359 case OP_REFI:
360 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
361 {
362 ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
363 if (cs == NULL) return -2;
364 do ce += GET(ce, 1); while (*ce == OP_ALT);
365 if (cc > cs && cc < ce)
366 {
367 d = 0;
368 had_recurse = TRUE;
369 }
370 else d = find_minlength(cs, startcode, options);
371 }
372 else d = 0;
373 cc += 3;
374
375 /* Handle repeated back references */
376
377 switch (*cc)
378 {
379 case OP_CRSTAR:
380 case OP_CRMINSTAR:
381 case OP_CRQUERY:
382 case OP_CRMINQUERY:
383 min = 0;
384 cc++;
385 break;
386
387 case OP_CRPLUS:
388 case OP_CRMINPLUS:
389 min = 1;
390 cc++;
391 break;
392
393 case OP_CRRANGE:
394 case OP_CRMINRANGE:
395 min = GET2(cc, 1);
396 cc += 5;
397 break;
398
399 default:
400 min = 1;
401 break;
402 }
403
404 branchlength += min * d;
405 break;
406
407 case OP_RECURSE:
408 cs = ce = (uschar *)startcode + GET(cc, 1);
409 if (cs == NULL) return -2;
410 do ce += GET(ce, 1); while (*ce == OP_ALT);
411 if (cc > cs && cc < ce)
412 had_recurse = TRUE;
413 else
414 branchlength += find_minlength(cs, startcode, options);
415 cc += 1 + LINK_SIZE;
416 break;
417
418 /* Anything else does not or need not match a character. We can get the
419 item's length from the table, but for those that can match zero occurrences
420 of a character, we must take special action for UTF-8 characters. As it
421 happens, the "NOT" versions of these opcodes are used at present only for
422 ASCII characters, so they could be omitted from this list. However, in
423 future that may change, so we include them here so as not to leave a
424 gotcha for a future maintainer. */
425
426 case OP_UPTO:
427 case OP_UPTOI:
428 case OP_NOTUPTO:
429 case OP_NOTUPTOI:
430 case OP_MINUPTO:
431 case OP_MINUPTOI:
432 case OP_NOTMINUPTO:
433 case OP_NOTMINUPTOI:
434 case OP_POSUPTO:
435 case OP_POSUPTOI:
436 case OP_NOTPOSUPTO:
437 case OP_NOTPOSUPTOI:
438
439 case OP_STAR:
440 case OP_STARI:
441 case OP_NOTSTAR:
442 case OP_NOTSTARI:
443 case OP_MINSTAR:
444 case OP_MINSTARI:
445 case OP_NOTMINSTAR:
446 case OP_NOTMINSTARI:
447 case OP_POSSTAR:
448 case OP_POSSTARI:
449 case OP_NOTPOSSTAR:
450 case OP_NOTPOSSTARI:
451
452 case OP_QUERY:
453 case OP_QUERYI:
454 case OP_NOTQUERY:
455 case OP_NOTQUERYI:
456 case OP_MINQUERY:
457 case OP_MINQUERYI:
458 case OP_NOTMINQUERY:
459 case OP_NOTMINQUERYI:
460 case OP_POSQUERY:
461 case OP_POSQUERYI:
462 case OP_NOTPOSQUERY:
463 case OP_NOTPOSQUERYI:
464
465 cc += _pcre_OP_lengths[op];
466 #ifdef SUPPORT_UTF8
467 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
468 #endif
469 break;
470
471 /* Skip these, but we need to add in the name length. */
472
473 case OP_MARK:
474 case OP_PRUNE_ARG:
475 case OP_SKIP_ARG:
476 cc += _pcre_OP_lengths[op] + cc[1];
477 break;
478
479 case OP_THEN_ARG:
480 cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
481 break;
482
483 /* The remaining opcodes are just skipped over. */
484
485 case OP_ACCEPT:
486 case OP_CLOSE:
487 case OP_COMMIT:
488 case OP_FAIL:
489 case OP_PRUNE:
490 case OP_SET_SOM:
491 case OP_SKIP:
492 case OP_THEN:
493 cc += _pcre_OP_lengths[op];
494 break;
495
496 /* This should not occur: we list all opcodes explicitly so that when
497 new ones get added they are properly considered. */
498
499 default:
500 return -3;
501 }
502 }
503 /* Control never gets here */
504 }
505
506
507
508 /*************************************************
509 * Set a bit and maybe its alternate case *
510 *************************************************/
511
512 /* Given a character, set its first byte's bit in the table, and also the
513 corresponding bit for the other version of a letter if we are caseless. In
514 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
515 when Unicode property support is available.
516
517 Arguments:
518 start_bits points to the bit map
519 p points to the character
520 caseless the caseless flag
521 cd the block with char table pointers
522 utf8 TRUE for UTF-8 mode
523
524 Returns: pointer after the character
525 */
526
527 static const uschar *
528 set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
529 compile_data *cd, BOOL utf8)
530 {
531 unsigned int c = *p;
532
533 SET_BIT(c);
534
535 #ifdef SUPPORT_UTF8
536 if (utf8 && c > 127)
537 {
538 GETCHARINC(c, p);
539 #ifdef SUPPORT_UCP
540 if (caseless)
541 {
542 uschar buff[8];
543 c = UCD_OTHERCASE(c);
544 (void)_pcre_ord2utf8(c, buff);
545 SET_BIT(buff[0]);
546 }
547 #endif
548 return p;
549 }
550 #endif
551
552 /* Not UTF-8 mode, or character is less than 127. */
553
554 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
555 return p + 1;
556 }
557
558
559
560 /*************************************************
561 * Set bits for a positive character type *
562 *************************************************/
563
564 /* This function sets starting bits for a character type. In UTF-8 mode, we can
565 only do a direct setting for bytes less than 128, as otherwise there can be
566 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
567 environment, the tables will only recognize ASCII characters anyway, but in at
568 least one Windows environment, some higher bytes bits were set in the tables.
569 So we deal with that case by considering the UTF-8 encoding.
570
571 Arguments:
572 start_bits the starting bitmap
573 cbit type the type of character wanted
574 table_limit 32 for non-UTF-8; 16 for UTF-8
575 cd the block with char table pointers
576
577 Returns: nothing
578 */
579
580 static void
581 set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
582 compile_data *cd)
583 {
584 register int c;
585 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
586 if (table_limit == 32) return;
587 for (c = 128; c < 256; c++)
588 {
589 if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
590 {
591 uschar buff[8];
592 (void)_pcre_ord2utf8(c, buff);
593 SET_BIT(buff[0]);
594 }
595 }
596 }
597
598
599 /*************************************************
600 * Set bits for a negative character type *
601 *************************************************/
602
603 /* This function sets starting bits for a negative character type such as \D.
604 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
605 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
606 Unlike in the positive case, where we can set appropriate starting bits for
607 specific high-valued UTF-8 characters, in this case we have to set the bits for
608 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
609 0xc0 (192) for simplicity.
610
611 Arguments:
612 start_bits the starting bitmap
613 cbit type the type of character wanted
614 table_limit 32 for non-UTF-8; 16 for UTF-8
615 cd the block with char table pointers
616
617 Returns: nothing
618 */
619
620 static void
621 set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
622 compile_data *cd)
623 {
624 register int c;
625 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
626 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
627 }
628
629
630
631 /*************************************************
632 * Create bitmap of starting bytes *
633 *************************************************/
634
635 /* This function scans a compiled unanchored expression recursively and
636 attempts to build a bitmap of the set of possible starting bytes. As time goes
637 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
638 useful for parenthesized groups in patterns such as (a*)b where the group
639 provides some optional starting bytes but scanning must continue at the outer
640 level to find at least one mandatory byte. At the outermost level, this
641 function fails unless the result is SSB_DONE.
642
643 Arguments:
644 code points to an expression
645 start_bits points to a 32-byte table, initialized to 0
646 utf8 TRUE if in UTF-8 mode
647 cd the block with char table pointers
648
649 Returns: SSB_FAIL => Failed to find any starting bytes
650 SSB_DONE => Found mandatory starting bytes
651 SSB_CONTINUE => Found optional starting bytes
652 SSB_UNKNOWN => Hit an unrecognized opcode
653 */
654
655 static int
656 set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,
657 compile_data *cd)
658 {
659 register int c;
660 int yield = SSB_DONE;
661 int table_limit = utf8? 16:32;
662
663 #if 0
664 /* ========================================================================= */
665 /* The following comment and code was inserted in January 1999. In May 2006,
666 when it was observed to cause compiler warnings about unused values, I took it
667 out again. If anybody is still using OS/2, they will have to put it back
668 manually. */
669
670 /* This next statement and the later reference to dummy are here in order to
671 trick the optimizer of the IBM C compiler for OS/2 into generating correct
672 code. Apparently IBM isn't going to fix the problem, and we would rather not
673 disable optimization (in this module it actually makes a big difference, and
674 the pcre module can use all the optimization it can get). */
675
676 volatile int dummy;
677 /* ========================================================================= */
678 #endif
679
680 do
681 {
682 BOOL try_next = TRUE;
683 const uschar *tcode = code + 1 + LINK_SIZE;
684
685 if (*code == OP_CBRA || *code == OP_SCBRA ||
686 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2;
687
688 while (try_next) /* Loop for items in this branch */
689 {
690 int rc;
691 switch(*tcode)
692 {
693 /* If we reach something we don't understand, it means a new opcode has
694 been created that hasn't been added to this code. Hopefully this problem
695 will be discovered during testing. */
696
697 default:
698 return SSB_UNKNOWN;
699
700 /* Fail for a valid opcode that implies no starting bits. */
701
702 case OP_ACCEPT:
703 case OP_ALLANY:
704 case OP_ANY:
705 case OP_ANYBYTE:
706 case OP_CIRC:
707 case OP_CIRCM:
708 case OP_CLOSE:
709 case OP_COMMIT:
710 case OP_COND:
711 case OP_CREF:
712 case OP_DEF:
713 case OP_DOLL:
714 case OP_DOLLM:
715 case OP_END:
716 case OP_EOD:
717 case OP_EODN:
718 case OP_EXTUNI:
719 case OP_FAIL:
720 case OP_MARK:
721 case OP_NCREF:
722 case OP_NOT:
723 case OP_NOTEXACT:
724 case OP_NOTEXACTI:
725 case OP_NOTI:
726 case OP_NOTMINPLUS:
727 case OP_NOTMINPLUSI:
728 case OP_NOTMINQUERY:
729 case OP_NOTMINQUERYI:
730 case OP_NOTMINSTAR:
731 case OP_NOTMINSTARI:
732 case OP_NOTMINUPTO:
733 case OP_NOTMINUPTOI:
734 case OP_NOTPLUS:
735 case OP_NOTPLUSI:
736 case OP_NOTPOSPLUS:
737 case OP_NOTPOSPLUSI:
738 case OP_NOTPOSQUERY:
739 case OP_NOTPOSQUERYI:
740 case OP_NOTPOSSTAR:
741 case OP_NOTPOSSTARI:
742 case OP_NOTPOSUPTO:
743 case OP_NOTPOSUPTOI:
744 case OP_NOTPROP:
745 case OP_NOTQUERY:
746 case OP_NOTQUERYI:
747 case OP_NOTSTAR:
748 case OP_NOTSTARI:
749 case OP_NOTUPTO:
750 case OP_NOTUPTOI:
751 case OP_NOT_HSPACE:
752 case OP_NOT_VSPACE:
753 case OP_NOT_WORD_BOUNDARY:
754 case OP_NRREF:
755 case OP_PROP:
756 case OP_PRUNE:
757 case OP_PRUNE_ARG:
758 case OP_RECURSE:
759 case OP_REF:
760 case OP_REFI:
761 case OP_REVERSE:
762 case OP_RREF:
763 case OP_SCOND:
764 case OP_SET_SOM:
765 case OP_SKIP:
766 case OP_SKIP_ARG:
767 case OP_SOD:
768 case OP_SOM:
769 case OP_THEN:
770 case OP_THEN_ARG:
771 case OP_WORD_BOUNDARY:
772 case OP_XCLASS:
773 return SSB_FAIL;
774
775 /* If we hit a bracket or a positive lookahead assertion, recurse to set
776 bits from within the subpattern. If it can't find anything, we have to
777 give up. If it finds some mandatory character(s), we are done for this
778 branch. Otherwise, carry on scanning after the subpattern. */
779
780 case OP_BRA:
781 case OP_SBRA:
782 case OP_CBRA:
783 case OP_SCBRA:
784 case OP_BRAPOS:
785 case OP_SBRAPOS:
786 case OP_CBRAPOS:
787 case OP_SCBRAPOS:
788 case OP_ONCE:
789 case OP_ASSERT:
790 rc = set_start_bits(tcode, start_bits, utf8, cd);
791 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
792 if (rc == SSB_DONE) try_next = FALSE; else
793 {
794 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
795 tcode += 1 + LINK_SIZE;
796 }
797 break;
798
799 /* If we hit ALT or KET, it means we haven't found anything mandatory in
800 this branch, though we might have found something optional. For ALT, we
801 continue with the next alternative, but we have to arrange that the final
802 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
803 return SSB_CONTINUE: if this is the top level, that indicates failure,
804 but after a nested subpattern, it causes scanning to continue. */
805
806 case OP_ALT:
807 yield = SSB_CONTINUE;
808 try_next = FALSE;
809 break;
810
811 case OP_KET:
812 case OP_KETRMAX:
813 case OP_KETRMIN:
814 case OP_KETRPOS:
815 return SSB_CONTINUE;
816
817 /* Skip over callout */
818
819 case OP_CALLOUT:
820 tcode += 2 + 2*LINK_SIZE;
821 break;
822
823 /* Skip over lookbehind and negative lookahead assertions */
824
825 case OP_ASSERT_NOT:
826 case OP_ASSERTBACK:
827 case OP_ASSERTBACK_NOT:
828 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
829 tcode += 1 + LINK_SIZE;
830 break;
831
832 /* BRAZERO does the bracket, but carries on. */
833
834 case OP_BRAZERO:
835 case OP_BRAMINZERO:
836 case OP_BRAPOSZERO:
837 rc = set_start_bits(++tcode, start_bits, utf8, cd);
838 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
839 /* =========================================================================
840 See the comment at the head of this function concerning the next line,
841 which was an old fudge for the benefit of OS/2.
842 dummy = 1;
843 ========================================================================= */
844 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
845 tcode += 1 + LINK_SIZE;
846 break;
847
848 /* SKIPZERO skips the bracket. */
849
850 case OP_SKIPZERO:
851 tcode++;
852 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
853 tcode += 1 + LINK_SIZE;
854 break;
855
856 /* Single-char * or ? sets the bit and tries the next item */
857
858 case OP_STAR:
859 case OP_MINSTAR:
860 case OP_POSSTAR:
861 case OP_QUERY:
862 case OP_MINQUERY:
863 case OP_POSQUERY:
864 tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
865 break;
866
867 case OP_STARI:
868 case OP_MINSTARI:
869 case OP_POSSTARI:
870 case OP_QUERYI:
871 case OP_MINQUERYI:
872 case OP_POSQUERYI:
873 tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
874 break;
875
876 /* Single-char upto sets the bit and tries the next */
877
878 case OP_UPTO:
879 case OP_MINUPTO:
880 case OP_POSUPTO:
881 tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);
882 break;
883
884 case OP_UPTOI:
885 case OP_MINUPTOI:
886 case OP_POSUPTOI:
887 tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);
888 break;
889
890 /* At least one single char sets the bit and stops */
891
892 case OP_EXACT:
893 tcode += 2;
894 /* Fall through */
895 case OP_CHAR:
896 case OP_PLUS:
897 case OP_MINPLUS:
898 case OP_POSPLUS:
899 (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
900 try_next = FALSE;
901 break;
902
903 case OP_EXACTI:
904 tcode += 2;
905 /* Fall through */
906 case OP_CHARI:
907 case OP_PLUSI:
908 case OP_MINPLUSI:
909 case OP_POSPLUSI:
910 (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
911 try_next = FALSE;
912 break;
913
914 /* Special spacing and line-terminating items. These recognize specific
915 lists of characters. The difference between VSPACE and ANYNL is that the
916 latter can match the two-character CRLF sequence, but that is not
917 relevant for finding the first character, so their code here is
918 identical. */
919
920 case OP_HSPACE:
921 SET_BIT(0x09);
922 SET_BIT(0x20);
923 if (utf8)
924 {
925 SET_BIT(0xC2); /* For U+00A0 */
926 SET_BIT(0xE1); /* For U+1680, U+180E */
927 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
928 SET_BIT(0xE3); /* For U+3000 */
929 }
930 else SET_BIT(0xA0);
931 try_next = FALSE;
932 break;
933
934 case OP_ANYNL:
935 case OP_VSPACE:
936 SET_BIT(0x0A);
937 SET_BIT(0x0B);
938 SET_BIT(0x0C);
939 SET_BIT(0x0D);
940 if (utf8)
941 {
942 SET_BIT(0xC2); /* For U+0085 */
943 SET_BIT(0xE2); /* For U+2028, U+2029 */
944 }
945 else SET_BIT(0x85);
946 try_next = FALSE;
947 break;
948
949 /* Single character types set the bits and stop. Note that if PCRE_UCP
950 is set, we do not see these op codes because \d etc are converted to
951 properties. Therefore, these apply in the case when only characters less
952 than 256 are recognized to match the types. */
953
954 case OP_NOT_DIGIT:
955 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
956 try_next = FALSE;
957 break;
958
959 case OP_DIGIT:
960 set_type_bits(start_bits, cbit_digit, table_limit, cd);
961 try_next = FALSE;
962 break;
963
964 /* The cbit_space table has vertical tab as whitespace; we have to
965 ensure it is set as not whitespace. */
966
967 case OP_NOT_WHITESPACE:
968 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
969 start_bits[1] |= 0x08;
970 try_next = FALSE;
971 break;
972
973 /* The cbit_space table has vertical tab as whitespace; we have to
974 not set it from the table. */
975
976 case OP_WHITESPACE:
977 c = start_bits[1]; /* Save in case it was already set */
978 set_type_bits(start_bits, cbit_space, table_limit, cd);
979 start_bits[1] = (start_bits[1] & ~0x08) | c;
980 try_next = FALSE;
981 break;
982
983 case OP_NOT_WORDCHAR:
984 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
985 try_next = FALSE;
986 break;
987
988 case OP_WORDCHAR:
989 set_type_bits(start_bits, cbit_word, table_limit, cd);
990 try_next = FALSE;
991 break;
992
993 /* One or more character type fudges the pointer and restarts, knowing
994 it will hit a single character type and stop there. */
995
996 case OP_TYPEPLUS:
997 case OP_TYPEMINPLUS:
998 case OP_TYPEPOSPLUS:
999 tcode++;
1000 break;
1001
1002 case OP_TYPEEXACT:
1003 tcode += 3;
1004 break;
1005
1006 /* Zero or more repeats of character types set the bits and then
1007 try again. */
1008
1009 case OP_TYPEUPTO:
1010 case OP_TYPEMINUPTO:
1011 case OP_TYPEPOSUPTO:
1012 tcode += 2; /* Fall through */
1013
1014 case OP_TYPESTAR:
1015 case OP_TYPEMINSTAR:
1016 case OP_TYPEPOSSTAR:
1017 case OP_TYPEQUERY:
1018 case OP_TYPEMINQUERY:
1019 case OP_TYPEPOSQUERY:
1020 switch(tcode[1])
1021 {
1022 default:
1023 case OP_ANY:
1024 case OP_ALLANY:
1025 return SSB_FAIL;
1026
1027 case OP_HSPACE:
1028 SET_BIT(0x09);
1029 SET_BIT(0x20);
1030 if (utf8)
1031 {
1032 SET_BIT(0xC2); /* For U+00A0 */
1033 SET_BIT(0xE1); /* For U+1680, U+180E */
1034 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1035 SET_BIT(0xE3); /* For U+3000 */
1036 }
1037 else SET_BIT(0xA0);
1038 break;
1039
1040 case OP_ANYNL:
1041 case OP_VSPACE:
1042 SET_BIT(0x0A);
1043 SET_BIT(0x0B);
1044 SET_BIT(0x0C);
1045 SET_BIT(0x0D);
1046 if (utf8)
1047 {
1048 SET_BIT(0xC2); /* For U+0085 */
1049 SET_BIT(0xE2); /* For U+2028, U+2029 */
1050 }
1051 else SET_BIT(0x85);
1052 break;
1053
1054 case OP_NOT_DIGIT:
1055 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1056 break;
1057
1058 case OP_DIGIT:
1059 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1060 break;
1061
1062 /* The cbit_space table has vertical tab as whitespace; we have to
1063 ensure it gets set as not whitespace. */
1064
1065 case OP_NOT_WHITESPACE:
1066 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1067 start_bits[1] |= 0x08;
1068 break;
1069
1070 /* The cbit_space table has vertical tab as whitespace; we have to
1071 avoid setting it. */
1072
1073 case OP_WHITESPACE:
1074 c = start_bits[1]; /* Save in case it was already set */
1075 set_type_bits(start_bits, cbit_space, table_limit, cd);
1076 start_bits[1] = (start_bits[1] & ~0x08) | c;
1077 break;
1078
1079 case OP_NOT_WORDCHAR:
1080 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1081 break;
1082
1083 case OP_WORDCHAR:
1084 set_type_bits(start_bits, cbit_word, table_limit, cd);
1085 break;
1086 }
1087
1088 tcode += 2;
1089 break;
1090
1091 /* Character class where all the information is in a bit map: set the
1092 bits and either carry on or not, according to the repeat count. If it was
1093 a negative class, and we are operating with UTF-8 characters, any byte
1094 with a value >= 0xc4 is a potentially valid starter because it starts a
1095 character with a value > 255. */
1096
1097 case OP_NCLASS:
1098 #ifdef SUPPORT_UTF8
1099 if (utf8)
1100 {
1101 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1102 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1103 }
1104 #endif
1105 /* Fall through */
1106
1107 case OP_CLASS:
1108 {
1109 tcode++;
1110
1111 /* In UTF-8 mode, the bits in a bit map correspond to character
1112 values, not to byte values. However, the bit map we are constructing is
1113 for byte values. So we have to do a conversion for characters whose
1114 value is > 127. In fact, there are only two possible starting bytes for
1115 characters in the range 128 - 255. */
1116
1117 #ifdef SUPPORT_UTF8
1118 if (utf8)
1119 {
1120 for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
1121 for (c = 128; c < 256; c++)
1122 {
1123 if ((tcode[c/8] && (1 << (c&7))) != 0)
1124 {
1125 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1126 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1127 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1128 }
1129 }
1130 }
1131
1132 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1133
1134 else
1135 #endif
1136 {
1137 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
1138 }
1139
1140 /* Advance past the bit map, and act on what follows. For a zero
1141 minimum repeat, continue; otherwise stop processing. */
1142
1143 tcode += 32;
1144 switch (*tcode)
1145 {
1146 case OP_CRSTAR:
1147 case OP_CRMINSTAR:
1148 case OP_CRQUERY:
1149 case OP_CRMINQUERY:
1150 tcode++;
1151 break;
1152
1153 case OP_CRRANGE:
1154 case OP_CRMINRANGE:
1155 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
1156 else try_next = FALSE;
1157 break;
1158
1159 default:
1160 try_next = FALSE;
1161 break;
1162 }
1163 }
1164 break; /* End of bitmap class handling */
1165
1166 } /* End of switch */
1167 } /* End of try_next loop */
1168
1169 code += GET(code, 1); /* Advance to next branch */
1170 }
1171 while (*code == OP_ALT);
1172 return yield;
1173 }
1174
1175
1176
1177
1178
1179 /*************************************************
1180 * Study a compiled expression *
1181 *************************************************/
1182
1183 /* This function is handed a compiled expression that it must study to produce
1184 information that will speed up the matching. It returns a pcre_extra block
1185 which then gets handed back to pcre_exec().
1186
1187 Arguments:
1188 re points to the compiled expression
1189 options contains option bits
1190 errorptr points to where to place error messages;
1191 set NULL unless error
1192
1193 Returns: pointer to a pcre_extra block, with study_data filled in and the
1194 appropriate flags set;
1195 NULL on error or if no optimization possible
1196 */
1197
1198 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1199 pcre_study(const pcre *external_re, int options, const char **errorptr)
1200 {
1201 int min;
1202 BOOL bits_set = FALSE;
1203 uschar start_bits[32];
1204 pcre_extra *extra;
1205 pcre_study_data *study;
1206 const uschar *tables;
1207 uschar *code;
1208 compile_data compile_block;
1209 const real_pcre *re = (const real_pcre *)external_re;
1210
1211 *errorptr = NULL;
1212
1213 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1214 {
1215 *errorptr = "argument is not a compiled regular expression";
1216 return NULL;
1217 }
1218
1219 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1220 {
1221 *errorptr = "unknown or incorrect option bit(s) set";
1222 return NULL;
1223 }
1224
1225 code = (uschar *)re + re->name_table_offset +
1226 (re->name_count * re->name_entry_size);
1227
1228 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1229 a multiline pattern that matches only at "line starts", there is no point in
1230 seeking a list of starting bytes. */
1231
1232 if ((re->options & PCRE_ANCHORED) == 0 &&
1233 (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1234 {
1235 int rc;
1236
1237 /* Set the character tables in the block that is passed around */
1238
1239 tables = re->tables;
1240 if (tables == NULL)
1241 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1242 (void *)(&tables));
1243
1244 compile_block.lcc = tables + lcc_offset;
1245 compile_block.fcc = tables + fcc_offset;
1246 compile_block.cbits = tables + cbits_offset;
1247 compile_block.ctypes = tables + ctypes_offset;
1248
1249 /* See if we can find a fixed set of initial characters for the pattern. */
1250
1251 memset(start_bits, 0, 32 * sizeof(uschar));
1252 rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1253 &compile_block);
1254 bits_set = rc == SSB_DONE;
1255 if (rc == SSB_UNKNOWN) *errorptr = "internal error: opcode not recognized";
1256 }
1257
1258 /* Find the minimum length of subject string. */
1259
1260 switch(min = find_minlength(code, code, re->options))
1261 {
1262 case -2: *errorptr = "internal error: missing capturing bracket"; break;
1263 case -3: *errorptr = "internal error: opcode not recognized"; break;
1264 default: break;
1265 }
1266
1267 /* Return NULL if there's been an error or if no optimization is possible. */
1268
1269 if (*errorptr != NULL || (!bits_set && min < 0)) return NULL;
1270
1271 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
1272 the latter, which is pointed to by the former, which may also get additional
1273 data set later by the calling program. At the moment, the size of
1274 pcre_study_data is fixed. We nevertheless save it in a field for returning via
1275 the pcre_fullinfo() function so that if it becomes variable in the future, we
1276 don't have to change that code. */
1277
1278 extra = (pcre_extra *)(pcre_malloc)
1279 (sizeof(pcre_extra) + sizeof(pcre_study_data));
1280
1281 if (extra == NULL)
1282 {
1283 *errorptr = "failed to get memory";
1284 return NULL;
1285 }
1286
1287 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
1288 extra->flags = PCRE_EXTRA_STUDY_DATA;
1289 extra->study_data = study;
1290
1291 study->size = sizeof(pcre_study_data);
1292 study->flags = 0;
1293
1294 if (bits_set)
1295 {
1296 study->flags |= PCRE_STUDY_MAPPED;
1297 memcpy(study->start_bits, start_bits, sizeof(start_bits));
1298 }
1299
1300 if (min >= 0)
1301 {
1302 study->flags |= PCRE_STUDY_MINLEN;
1303 study->minlength = min;
1304 }
1305
1306 return extra;
1307 }
1308
1309 /* End of pcre_study.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12