/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 602 - (show annotations) (download)
Wed May 25 08:29:03 2011 UTC (3 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 33374 byte(s)
Remove OP_OPT by handling /i and /m entirely at compile time. Fixes bug with 
patterns like /(?i:([^b]))(?1)/, where the /i option was mishandled.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #include "pcre_internal.h"
50
51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52
53 /* Returns from set_start_bits() */
54
55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
56
57
58
59 /*************************************************
60 * Find the minimum subject length for a group *
61 *************************************************/
62
63 /* Scan a parenthesized group and compute the minimum length of subject that
64 is needed to match it. This is a lower bound; it does not mean there is a
65 string of that length that matches. In UTF8 mode, the result is in characters
66 rather than bytes.
67
68 Arguments:
69 code pointer to start of group (the bracket)
70 startcode pointer to start of the whole pattern
71 options the compiling options
72
73 Returns: the minimum length
74 -1 if \C was encountered
75 -2 internal error (missing capturing bracket)
76 */
77
78 static int
79 find_minlength(const uschar *code, const uschar *startcode, int options)
80 {
81 int length = -1;
82 BOOL utf8 = (options & PCRE_UTF8) != 0;
83 BOOL had_recurse = FALSE;
84 register int branchlength = 0;
85 register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
86
87 if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
88
89 /* Scan along the opcodes for this branch. If we get to the end of the
90 branch, check the length against that of the other branches. */
91
92 for (;;)
93 {
94 int d, min;
95 uschar *cs, *ce;
96 register int op = *cc;
97
98 switch (op)
99 {
100 case OP_COND:
101 case OP_SCOND:
102
103 /* If there is only one branch in a condition, the implied branch has zero
104 length, so we don't add anything. This covers the DEFINE "condition"
105 automatically. */
106
107 cs = cc + GET(cc, 1);
108 if (*cs != OP_ALT)
109 {
110 cc = cs + 1 + LINK_SIZE;
111 break;
112 }
113
114 /* Otherwise we can fall through and treat it the same as any other
115 subpattern. */
116
117 case OP_CBRA:
118 case OP_SCBRA:
119 case OP_BRA:
120 case OP_SBRA:
121 case OP_ONCE:
122 d = find_minlength(cc, startcode, options);
123 if (d < 0) return d;
124 branchlength += d;
125 do cc += GET(cc, 1); while (*cc == OP_ALT);
126 cc += 1 + LINK_SIZE;
127 break;
128
129 /* Reached end of a branch; if it's a ket it is the end of a nested
130 call. If it's ALT it is an alternation in a nested call. If it is
131 END it's the end of the outer call. All can be handled by the same code. */
132
133 case OP_ALT:
134 case OP_KET:
135 case OP_KETRMAX:
136 case OP_KETRMIN:
137 case OP_END:
138 if (length < 0 || (!had_recurse && branchlength < length))
139 length = branchlength;
140 if (*cc != OP_ALT) return length;
141 cc += 1 + LINK_SIZE;
142 branchlength = 0;
143 had_recurse = FALSE;
144 break;
145
146 /* Skip over assertive subpatterns */
147
148 case OP_ASSERT:
149 case OP_ASSERT_NOT:
150 case OP_ASSERTBACK:
151 case OP_ASSERTBACK_NOT:
152 do cc += GET(cc, 1); while (*cc == OP_ALT);
153 /* Fall through */
154
155 /* Skip over things that don't match chars */
156
157 case OP_REVERSE:
158 case OP_CREF:
159 case OP_NCREF:
160 case OP_RREF:
161 case OP_NRREF:
162 case OP_DEF:
163 case OP_CALLOUT:
164 case OP_SOD:
165 case OP_SOM:
166 case OP_EOD:
167 case OP_EODN:
168 case OP_CIRC:
169 case OP_CIRCM:
170 case OP_DOLL:
171 case OP_DOLLM:
172 case OP_NOT_WORD_BOUNDARY:
173 case OP_WORD_BOUNDARY:
174 cc += _pcre_OP_lengths[*cc];
175 break;
176
177 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
178
179 case OP_BRAZERO:
180 case OP_BRAMINZERO:
181 case OP_SKIPZERO:
182 cc += _pcre_OP_lengths[*cc];
183 do cc += GET(cc, 1); while (*cc == OP_ALT);
184 cc += 1 + LINK_SIZE;
185 break;
186
187 /* Handle literal characters and + repetitions */
188
189 case OP_CHAR:
190 case OP_CHARI:
191 case OP_NOT:
192 case OP_NOTI:
193 case OP_PLUS:
194 case OP_MINPLUS:
195 case OP_POSPLUS:
196 case OP_NOTPLUS:
197 case OP_NOTMINPLUS:
198 case OP_NOTPOSPLUS:
199 branchlength++;
200 cc += 2;
201 #ifdef SUPPORT_UTF8
202 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
203 #endif
204 break;
205
206 case OP_TYPEPLUS:
207 case OP_TYPEMINPLUS:
208 case OP_TYPEPOSPLUS:
209 branchlength++;
210 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
211 break;
212
213 /* Handle exact repetitions. The count is already in characters, but we
214 need to skip over a multibyte character in UTF8 mode. */
215
216 case OP_EXACT:
217 case OP_NOTEXACT:
218 branchlength += GET2(cc,1);
219 cc += 4;
220 #ifdef SUPPORT_UTF8
221 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
222 #endif
223 break;
224
225 case OP_TYPEEXACT:
226 branchlength += GET2(cc,1);
227 cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
228 break;
229
230 /* Handle single-char non-literal matchers */
231
232 case OP_PROP:
233 case OP_NOTPROP:
234 cc += 2;
235 /* Fall through */
236
237 case OP_NOT_DIGIT:
238 case OP_DIGIT:
239 case OP_NOT_WHITESPACE:
240 case OP_WHITESPACE:
241 case OP_NOT_WORDCHAR:
242 case OP_WORDCHAR:
243 case OP_ANY:
244 case OP_ALLANY:
245 case OP_EXTUNI:
246 case OP_HSPACE:
247 case OP_NOT_HSPACE:
248 case OP_VSPACE:
249 case OP_NOT_VSPACE:
250 branchlength++;
251 cc++;
252 break;
253
254 /* "Any newline" might match two characters */
255
256 case OP_ANYNL:
257 branchlength += 2;
258 cc++;
259 break;
260
261 /* The single-byte matcher means we can't proceed in UTF-8 mode */
262
263 case OP_ANYBYTE:
264 #ifdef SUPPORT_UTF8
265 if (utf8) return -1;
266 #endif
267 branchlength++;
268 cc++;
269 break;
270
271 /* For repeated character types, we have to test for \p and \P, which have
272 an extra two bytes of parameters. */
273
274 case OP_TYPESTAR:
275 case OP_TYPEMINSTAR:
276 case OP_TYPEQUERY:
277 case OP_TYPEMINQUERY:
278 case OP_TYPEPOSSTAR:
279 case OP_TYPEPOSQUERY:
280 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
281 cc += _pcre_OP_lengths[op];
282 break;
283
284 case OP_TYPEUPTO:
285 case OP_TYPEMINUPTO:
286 case OP_TYPEPOSUPTO:
287 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
288 cc += _pcre_OP_lengths[op];
289 break;
290
291 /* Check a class for variable quantification */
292
293 #ifdef SUPPORT_UTF8
294 case OP_XCLASS:
295 cc += GET(cc, 1) - 33;
296 /* Fall through */
297 #endif
298
299 case OP_CLASS:
300 case OP_NCLASS:
301 cc += 33;
302
303 switch (*cc)
304 {
305 case OP_CRPLUS:
306 case OP_CRMINPLUS:
307 branchlength++;
308 /* Fall through */
309
310 case OP_CRSTAR:
311 case OP_CRMINSTAR:
312 case OP_CRQUERY:
313 case OP_CRMINQUERY:
314 cc++;
315 break;
316
317 case OP_CRRANGE:
318 case OP_CRMINRANGE:
319 branchlength += GET2(cc,1);
320 cc += 5;
321 break;
322
323 default:
324 branchlength++;
325 break;
326 }
327 break;
328
329 /* Backreferences and subroutine calls are treated in the same way: we find
330 the minimum length for the subpattern. A recursion, however, causes an
331 a flag to be set that causes the length of this branch to be ignored. The
332 logic is that a recursion can only make sense if there is another
333 alternation that stops the recursing. That will provide the minimum length
334 (when no recursion happens). A backreference within the group that it is
335 referencing behaves in the same way.
336
337 If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
338 matches an empty string (by default it causes a matching failure), so in
339 that case we must set the minimum length to zero. */
340
341 case OP_REF:
342 case OP_REFI:
343 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
344 {
345 ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
346 if (cs == NULL) return -2;
347 do ce += GET(ce, 1); while (*ce == OP_ALT);
348 if (cc > cs && cc < ce)
349 {
350 d = 0;
351 had_recurse = TRUE;
352 }
353 else d = find_minlength(cs, startcode, options);
354 }
355 else d = 0;
356 cc += 3;
357
358 /* Handle repeated back references */
359
360 switch (*cc)
361 {
362 case OP_CRSTAR:
363 case OP_CRMINSTAR:
364 case OP_CRQUERY:
365 case OP_CRMINQUERY:
366 min = 0;
367 cc++;
368 break;
369
370 case OP_CRRANGE:
371 case OP_CRMINRANGE:
372 min = GET2(cc, 1);
373 cc += 5;
374 break;
375
376 default:
377 min = 1;
378 break;
379 }
380
381 branchlength += min * d;
382 break;
383
384 case OP_RECURSE:
385 cs = ce = (uschar *)startcode + GET(cc, 1);
386 if (cs == NULL) return -2;
387 do ce += GET(ce, 1); while (*ce == OP_ALT);
388 if (cc > cs && cc < ce)
389 had_recurse = TRUE;
390 else
391 branchlength += find_minlength(cs, startcode, options);
392 cc += 1 + LINK_SIZE;
393 break;
394
395 /* Anything else does not or need not match a character. We can get the
396 item's length from the table, but for those that can match zero occurrences
397 of a character, we must take special action for UTF-8 characters. As it
398 happens, the "NOT" versions of these opcodes are used at present only for
399 ASCII characters, so they could be omitted from this list. However, in
400 future that may change, so we leave them in this special case. */
401
402 case OP_UPTO:
403 case OP_UPTOI:
404 case OP_NOTUPTO:
405 case OP_NOTUPTOI:
406 case OP_MINUPTO:
407 case OP_MINUPTOI:
408 case OP_NOTMINUPTO:
409 case OP_NOTMINUPTOI:
410 case OP_POSUPTO:
411 case OP_POSUPTOI:
412 case OP_NOTPOSUPTO:
413 case OP_NOTPOSUPTOI:
414
415 case OP_STAR:
416 case OP_STARI:
417 case OP_NOTSTAR:
418 case OP_NOTSTARI:
419 case OP_MINSTAR:
420 case OP_MINSTARI:
421 case OP_NOTMINSTAR:
422 case OP_NOTMINSTARI:
423 case OP_POSSTAR:
424 case OP_POSSTARI:
425 case OP_NOTPOSSTAR:
426 case OP_NOTPOSSTARI:
427
428 case OP_QUERY:
429 case OP_QUERYI:
430 case OP_NOTQUERY:
431 case OP_NOTQUERYI:
432 case OP_MINQUERY:
433 case OP_MINQUERYI:
434 case OP_NOTMINQUERY:
435 case OP_NOTMINQUERYI:
436 case OP_POSQUERY:
437 case OP_POSQUERYI:
438 case OP_NOTPOSQUERY:
439 case OP_NOTPOSQUERYI:
440
441 cc += _pcre_OP_lengths[op];
442 #ifdef SUPPORT_UTF8
443 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
444 #endif
445 break;
446
447 /* Skip these, but we need to add in the name length. */
448
449 case OP_MARK:
450 case OP_PRUNE_ARG:
451 case OP_SKIP_ARG:
452 cc += _pcre_OP_lengths[op] + cc[1];
453 break;
454
455 case OP_THEN_ARG:
456 cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
457 break;
458
459 /* For the record, these are the opcodes that are matched by "default":
460 OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
461 OP_THEN. */
462
463 default:
464 cc += _pcre_OP_lengths[op];
465 break;
466 }
467 }
468 /* Control never gets here */
469 }
470
471
472
473 /*************************************************
474 * Set a bit and maybe its alternate case *
475 *************************************************/
476
477 /* Given a character, set its first byte's bit in the table, and also the
478 corresponding bit for the other version of a letter if we are caseless. In
479 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
480 when Unicode property support is available.
481
482 Arguments:
483 start_bits points to the bit map
484 p points to the character
485 caseless the caseless flag
486 cd the block with char table pointers
487 utf8 TRUE for UTF-8 mode
488
489 Returns: pointer after the character
490 */
491
492 static const uschar *
493 set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
494 compile_data *cd, BOOL utf8)
495 {
496 unsigned int c = *p;
497
498 SET_BIT(c);
499
500 #ifdef SUPPORT_UTF8
501 if (utf8 && c > 127)
502 {
503 GETCHARINC(c, p);
504 #ifdef SUPPORT_UCP
505 if (caseless)
506 {
507 uschar buff[8];
508 c = UCD_OTHERCASE(c);
509 (void)_pcre_ord2utf8(c, buff);
510 SET_BIT(buff[0]);
511 }
512 #endif
513 return p;
514 }
515 #endif
516
517 /* Not UTF-8 mode, or character is less than 127. */
518
519 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
520 return p + 1;
521 }
522
523
524
525 /*************************************************
526 * Set bits for a positive character type *
527 *************************************************/
528
529 /* This function sets starting bits for a character type. In UTF-8 mode, we can
530 only do a direct setting for bytes less than 128, as otherwise there can be
531 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
532 environment, the tables will only recognize ASCII characters anyway, but in at
533 least one Windows environment, some higher bytes bits were set in the tables.
534 So we deal with that case by considering the UTF-8 encoding.
535
536 Arguments:
537 start_bits the starting bitmap
538 cbit type the type of character wanted
539 table_limit 32 for non-UTF-8; 16 for UTF-8
540 cd the block with char table pointers
541
542 Returns: nothing
543 */
544
545 static void
546 set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
547 compile_data *cd)
548 {
549 register int c;
550 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
551 if (table_limit == 32) return;
552 for (c = 128; c < 256; c++)
553 {
554 if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
555 {
556 uschar buff[8];
557 (void)_pcre_ord2utf8(c, buff);
558 SET_BIT(buff[0]);
559 }
560 }
561 }
562
563
564 /*************************************************
565 * Set bits for a negative character type *
566 *************************************************/
567
568 /* This function sets starting bits for a negative character type such as \D.
569 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
570 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
571 Unlike in the positive case, where we can set appropriate starting bits for
572 specific high-valued UTF-8 characters, in this case we have to set the bits for
573 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
574 0xc0 (192) for simplicity.
575
576 Arguments:
577 start_bits the starting bitmap
578 cbit type the type of character wanted
579 table_limit 32 for non-UTF-8; 16 for UTF-8
580 cd the block with char table pointers
581
582 Returns: nothing
583 */
584
585 static void
586 set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
587 compile_data *cd)
588 {
589 register int c;
590 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
591 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
592 }
593
594
595
596 /*************************************************
597 * Create bitmap of starting bytes *
598 *************************************************/
599
600 /* This function scans a compiled unanchored expression recursively and
601 attempts to build a bitmap of the set of possible starting bytes. As time goes
602 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
603 useful for parenthesized groups in patterns such as (a*)b where the group
604 provides some optional starting bytes but scanning must continue at the outer
605 level to find at least one mandatory byte. At the outermost level, this
606 function fails unless the result is SSB_DONE.
607
608 Arguments:
609 code points to an expression
610 start_bits points to a 32-byte table, initialized to 0
611 caseless the current state of the caseless flag
612 utf8 TRUE if in UTF-8 mode
613 cd the block with char table pointers
614
615 Returns: SSB_FAIL => Failed to find any starting bytes
616 SSB_DONE => Found mandatory starting bytes
617 SSB_CONTINUE => Found optional starting bytes
618 */
619
620 static int
621 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
622 BOOL utf8, compile_data *cd)
623 {
624 register int c;
625 int yield = SSB_DONE;
626 int table_limit = utf8? 16:32;
627
628 #if 0
629 /* ========================================================================= */
630 /* The following comment and code was inserted in January 1999. In May 2006,
631 when it was observed to cause compiler warnings about unused values, I took it
632 out again. If anybody is still using OS/2, they will have to put it back
633 manually. */
634
635 /* This next statement and the later reference to dummy are here in order to
636 trick the optimizer of the IBM C compiler for OS/2 into generating correct
637 code. Apparently IBM isn't going to fix the problem, and we would rather not
638 disable optimization (in this module it actually makes a big difference, and
639 the pcre module can use all the optimization it can get). */
640
641 volatile int dummy;
642 /* ========================================================================= */
643 #endif
644
645 do
646 {
647 const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
648 BOOL try_next = TRUE;
649
650 while (try_next) /* Loop for items in this branch */
651 {
652 int rc;
653 switch(*tcode)
654 {
655 /* Fail if we reach something we don't understand */
656
657 default:
658 return SSB_FAIL;
659
660 /* If we hit a bracket or a positive lookahead assertion, recurse to set
661 bits from within the subpattern. If it can't find anything, we have to
662 give up. If it finds some mandatory character(s), we are done for this
663 branch. Otherwise, carry on scanning after the subpattern. */
664
665 case OP_BRA:
666 case OP_SBRA:
667 case OP_CBRA:
668 case OP_SCBRA:
669 case OP_ONCE:
670 case OP_ASSERT:
671 rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
672 if (rc == SSB_FAIL) return SSB_FAIL;
673 if (rc == SSB_DONE) try_next = FALSE; else
674 {
675 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
676 tcode += 1 + LINK_SIZE;
677 }
678 break;
679
680 /* If we hit ALT or KET, it means we haven't found anything mandatory in
681 this branch, though we might have found something optional. For ALT, we
682 continue with the next alternative, but we have to arrange that the final
683 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
684 return SSB_CONTINUE: if this is the top level, that indicates failure,
685 but after a nested subpattern, it causes scanning to continue. */
686
687 case OP_ALT:
688 yield = SSB_CONTINUE;
689 try_next = FALSE;
690 break;
691
692 case OP_KET:
693 case OP_KETRMAX:
694 case OP_KETRMIN:
695 return SSB_CONTINUE;
696
697 /* Skip over callout */
698
699 case OP_CALLOUT:
700 tcode += 2 + 2*LINK_SIZE;
701 break;
702
703 /* Skip over lookbehind and negative lookahead assertions */
704
705 case OP_ASSERT_NOT:
706 case OP_ASSERTBACK:
707 case OP_ASSERTBACK_NOT:
708 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
709 tcode += 1 + LINK_SIZE;
710 break;
711
712 /* BRAZERO does the bracket, but carries on. */
713
714 case OP_BRAZERO:
715 case OP_BRAMINZERO:
716 if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
717 return SSB_FAIL;
718 /* =========================================================================
719 See the comment at the head of this function concerning the next line,
720 which was an old fudge for the benefit of OS/2.
721 dummy = 1;
722 ========================================================================= */
723 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
724 tcode += 1 + LINK_SIZE;
725 break;
726
727 /* SKIPZERO skips the bracket. */
728
729 case OP_SKIPZERO:
730 tcode++;
731 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
732 tcode += 1 + LINK_SIZE;
733 break;
734
735 /* Single-char * or ? sets the bit and tries the next item */
736
737 case OP_STAR:
738 case OP_MINSTAR:
739 case OP_POSSTAR:
740 case OP_QUERY:
741 case OP_MINQUERY:
742 case OP_POSQUERY:
743 tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
744 break;
745
746 case OP_STARI:
747 case OP_MINSTARI:
748 case OP_POSSTARI:
749 case OP_QUERYI:
750 case OP_MINQUERYI:
751 case OP_POSQUERYI:
752 tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
753 break;
754
755 /* Single-char upto sets the bit and tries the next */
756
757 case OP_UPTO:
758 case OP_MINUPTO:
759 case OP_POSUPTO:
760 tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
761 break;
762
763 case OP_UPTOI:
764 case OP_MINUPTOI:
765 case OP_POSUPTOI:
766 tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);
767 break;
768
769 /* At least one single char sets the bit and stops */
770
771 case OP_EXACT:
772 tcode += 2;
773 /* Fall through */
774 case OP_CHAR:
775 case OP_PLUS:
776 case OP_MINPLUS:
777 case OP_POSPLUS:
778 (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
779 try_next = FALSE;
780 break;
781
782 case OP_CHARI:
783 case OP_PLUSI:
784 case OP_MINPLUSI:
785 case OP_POSPLUSI:
786 (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
787 try_next = FALSE;
788 break;
789
790 /* Special spacing and line-terminating items. These recognize specific
791 lists of characters. The difference between VSPACE and ANYNL is that the
792 latter can match the two-character CRLF sequence, but that is not
793 relevant for finding the first character, so their code here is
794 identical. */
795
796 case OP_HSPACE:
797 SET_BIT(0x09);
798 SET_BIT(0x20);
799 if (utf8)
800 {
801 SET_BIT(0xC2); /* For U+00A0 */
802 SET_BIT(0xE1); /* For U+1680, U+180E */
803 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
804 SET_BIT(0xE3); /* For U+3000 */
805 }
806 else SET_BIT(0xA0);
807 try_next = FALSE;
808 break;
809
810 case OP_ANYNL:
811 case OP_VSPACE:
812 SET_BIT(0x0A);
813 SET_BIT(0x0B);
814 SET_BIT(0x0C);
815 SET_BIT(0x0D);
816 if (utf8)
817 {
818 SET_BIT(0xC2); /* For U+0085 */
819 SET_BIT(0xE2); /* For U+2028, U+2029 */
820 }
821 else SET_BIT(0x85);
822 try_next = FALSE;
823 break;
824
825 /* Single character types set the bits and stop. Note that if PCRE_UCP
826 is set, we do not see these op codes because \d etc are converted to
827 properties. Therefore, these apply in the case when only characters less
828 than 256 are recognized to match the types. */
829
830 case OP_NOT_DIGIT:
831 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
832 try_next = FALSE;
833 break;
834
835 case OP_DIGIT:
836 set_type_bits(start_bits, cbit_digit, table_limit, cd);
837 try_next = FALSE;
838 break;
839
840 /* The cbit_space table has vertical tab as whitespace; we have to
841 ensure it is set as not whitespace. */
842
843 case OP_NOT_WHITESPACE:
844 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
845 start_bits[1] |= 0x08;
846 try_next = FALSE;
847 break;
848
849 /* The cbit_space table has vertical tab as whitespace; we have to
850 not set it from the table. */
851
852 case OP_WHITESPACE:
853 c = start_bits[1]; /* Save in case it was already set */
854 set_type_bits(start_bits, cbit_space, table_limit, cd);
855 start_bits[1] = (start_bits[1] & ~0x08) | c;
856 try_next = FALSE;
857 break;
858
859 case OP_NOT_WORDCHAR:
860 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
861 try_next = FALSE;
862 break;
863
864 case OP_WORDCHAR:
865 set_type_bits(start_bits, cbit_word, table_limit, cd);
866 try_next = FALSE;
867 break;
868
869 /* One or more character type fudges the pointer and restarts, knowing
870 it will hit a single character type and stop there. */
871
872 case OP_TYPEPLUS:
873 case OP_TYPEMINPLUS:
874 case OP_TYPEPOSPLUS:
875 tcode++;
876 break;
877
878 case OP_TYPEEXACT:
879 tcode += 3;
880 break;
881
882 /* Zero or more repeats of character types set the bits and then
883 try again. */
884
885 case OP_TYPEUPTO:
886 case OP_TYPEMINUPTO:
887 case OP_TYPEPOSUPTO:
888 tcode += 2; /* Fall through */
889
890 case OP_TYPESTAR:
891 case OP_TYPEMINSTAR:
892 case OP_TYPEPOSSTAR:
893 case OP_TYPEQUERY:
894 case OP_TYPEMINQUERY:
895 case OP_TYPEPOSQUERY:
896 switch(tcode[1])
897 {
898 default:
899 case OP_ANY:
900 case OP_ALLANY:
901 return SSB_FAIL;
902
903 case OP_HSPACE:
904 SET_BIT(0x09);
905 SET_BIT(0x20);
906 if (utf8)
907 {
908 SET_BIT(0xC2); /* For U+00A0 */
909 SET_BIT(0xE1); /* For U+1680, U+180E */
910 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
911 SET_BIT(0xE3); /* For U+3000 */
912 }
913 else SET_BIT(0xA0);
914 break;
915
916 case OP_ANYNL:
917 case OP_VSPACE:
918 SET_BIT(0x0A);
919 SET_BIT(0x0B);
920 SET_BIT(0x0C);
921 SET_BIT(0x0D);
922 if (utf8)
923 {
924 SET_BIT(0xC2); /* For U+0085 */
925 SET_BIT(0xE2); /* For U+2028, U+2029 */
926 }
927 else SET_BIT(0x85);
928 break;
929
930 case OP_NOT_DIGIT:
931 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
932 break;
933
934 case OP_DIGIT:
935 set_type_bits(start_bits, cbit_digit, table_limit, cd);
936 break;
937
938 /* The cbit_space table has vertical tab as whitespace; we have to
939 ensure it gets set as not whitespace. */
940
941 case OP_NOT_WHITESPACE:
942 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
943 start_bits[1] |= 0x08;
944 break;
945
946 /* The cbit_space table has vertical tab as whitespace; we have to
947 avoid setting it. */
948
949 case OP_WHITESPACE:
950 c = start_bits[1]; /* Save in case it was already set */
951 set_type_bits(start_bits, cbit_space, table_limit, cd);
952 start_bits[1] = (start_bits[1] & ~0x08) | c;
953 break;
954
955 case OP_NOT_WORDCHAR:
956 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
957 break;
958
959 case OP_WORDCHAR:
960 set_type_bits(start_bits, cbit_word, table_limit, cd);
961 break;
962 }
963
964 tcode += 2;
965 break;
966
967 /* Character class where all the information is in a bit map: set the
968 bits and either carry on or not, according to the repeat count. If it was
969 a negative class, and we are operating with UTF-8 characters, any byte
970 with a value >= 0xc4 is a potentially valid starter because it starts a
971 character with a value > 255. */
972
973 case OP_NCLASS:
974 #ifdef SUPPORT_UTF8
975 if (utf8)
976 {
977 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
978 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
979 }
980 #endif
981 /* Fall through */
982
983 case OP_CLASS:
984 {
985 tcode++;
986
987 /* In UTF-8 mode, the bits in a bit map correspond to character
988 values, not to byte values. However, the bit map we are constructing is
989 for byte values. So we have to do a conversion for characters whose
990 value is > 127. In fact, there are only two possible starting bytes for
991 characters in the range 128 - 255. */
992
993 #ifdef SUPPORT_UTF8
994 if (utf8)
995 {
996 for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
997 for (c = 128; c < 256; c++)
998 {
999 if ((tcode[c/8] && (1 << (c&7))) != 0)
1000 {
1001 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1002 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1003 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1004 }
1005 }
1006 }
1007
1008 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1009
1010 else
1011 #endif
1012 {
1013 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
1014 }
1015
1016 /* Advance past the bit map, and act on what follows */
1017
1018 tcode += 32;
1019 switch (*tcode)
1020 {
1021 case OP_CRSTAR:
1022 case OP_CRMINSTAR:
1023 case OP_CRQUERY:
1024 case OP_CRMINQUERY:
1025 tcode++;
1026 break;
1027
1028 case OP_CRRANGE:
1029 case OP_CRMINRANGE:
1030 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
1031 else try_next = FALSE;
1032 break;
1033
1034 default:
1035 try_next = FALSE;
1036 break;
1037 }
1038 }
1039 break; /* End of bitmap class handling */
1040
1041 } /* End of switch */
1042 } /* End of try_next loop */
1043
1044 code += GET(code, 1); /* Advance to next branch */
1045 }
1046 while (*code == OP_ALT);
1047 return yield;
1048 }
1049
1050
1051
1052 /*************************************************
1053 * Study a compiled expression *
1054 *************************************************/
1055
1056 /* This function is handed a compiled expression that it must study to produce
1057 information that will speed up the matching. It returns a pcre_extra block
1058 which then gets handed back to pcre_exec().
1059
1060 Arguments:
1061 re points to the compiled expression
1062 options contains option bits
1063 errorptr points to where to place error messages;
1064 set NULL unless error
1065
1066 Returns: pointer to a pcre_extra block, with study_data filled in and the
1067 appropriate flags set;
1068 NULL on error or if no optimization possible
1069 */
1070
1071 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1072 pcre_study(const pcre *external_re, int options, const char **errorptr)
1073 {
1074 int min;
1075 BOOL bits_set = FALSE;
1076 uschar start_bits[32];
1077 pcre_extra *extra;
1078 pcre_study_data *study;
1079 const uschar *tables;
1080 uschar *code;
1081 compile_data compile_block;
1082 const real_pcre *re = (const real_pcre *)external_re;
1083
1084 *errorptr = NULL;
1085
1086 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1087 {
1088 *errorptr = "argument is not a compiled regular expression";
1089 return NULL;
1090 }
1091
1092 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1093 {
1094 *errorptr = "unknown or incorrect option bit(s) set";
1095 return NULL;
1096 }
1097
1098 code = (uschar *)re + re->name_table_offset +
1099 (re->name_count * re->name_entry_size);
1100
1101 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1102 a multiline pattern that matches only at "line starts", there is no point in
1103 seeking a list of starting bytes. */
1104
1105 if ((re->options & PCRE_ANCHORED) == 0 &&
1106 (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1107 {
1108 /* Set the character tables in the block that is passed around */
1109
1110 tables = re->tables;
1111 if (tables == NULL)
1112 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1113 (void *)(&tables));
1114
1115 compile_block.lcc = tables + lcc_offset;
1116 compile_block.fcc = tables + fcc_offset;
1117 compile_block.cbits = tables + cbits_offset;
1118 compile_block.ctypes = tables + ctypes_offset;
1119
1120 /* See if we can find a fixed set of initial characters for the pattern. */
1121
1122 memset(start_bits, 0, 32 * sizeof(uschar));
1123 bits_set = set_start_bits(code, start_bits,
1124 (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
1125 &compile_block) == SSB_DONE;
1126 }
1127
1128 /* Find the minimum length of subject string. */
1129
1130 min = find_minlength(code, code, re->options);
1131
1132 /* Return NULL if no optimization is possible. */
1133
1134 if (!bits_set && min < 0) return NULL;
1135
1136 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
1137 the latter, which is pointed to by the former, which may also get additional
1138 data set later by the calling program. At the moment, the size of
1139 pcre_study_data is fixed. We nevertheless save it in a field for returning via
1140 the pcre_fullinfo() function so that if it becomes variable in the future, we
1141 don't have to change that code. */
1142
1143 extra = (pcre_extra *)(pcre_malloc)
1144 (sizeof(pcre_extra) + sizeof(pcre_study_data));
1145
1146 if (extra == NULL)
1147 {
1148 *errorptr = "failed to get memory";
1149 return NULL;
1150 }
1151
1152 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
1153 extra->flags = PCRE_EXTRA_STUDY_DATA;
1154 extra->study_data = study;
1155
1156 study->size = sizeof(pcre_study_data);
1157 study->flags = 0;
1158
1159 if (bits_set)
1160 {
1161 study->flags |= PCRE_STUDY_MAPPED;
1162 memcpy(study->start_bits, start_bits, sizeof(start_bits));
1163 }
1164
1165 if (min >= 0)
1166 {
1167 study->flags |= PCRE_STUDY_MINLEN;
1168 study->minlength = min;
1169 }
1170
1171 return extra;
1172 }
1173
1174 /* End of pcre_study.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12