/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 603 - (hide annotations) (download)
Fri May 27 10:14:09 2011 UTC (3 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 33954 byte(s)
Fixed some omissions in pcre_study() for the new caseless opcodes.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 475 Copyright (c) 1997-2010 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_study(), along with local
42     supporting functions. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 77 #include "pcre_internal.h"
50    
51 ph10 524 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52 nigel 77
53 nigel 93 /* Returns from set_start_bits() */
54    
55     enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
56    
57    
58 ph10 455
59 nigel 77 /*************************************************
60 ph10 455 * Find the minimum subject length for a group *
61     *************************************************/
62    
63     /* Scan a parenthesized group and compute the minimum length of subject that
64 ph10 461 is needed to match it. This is a lower bound; it does not mean there is a
65 ph10 455 string of that length that matches. In UTF8 mode, the result is in characters
66     rather than bytes.
67    
68     Arguments:
69     code pointer to start of group (the bracket)
70     startcode pointer to start of the whole pattern
71 ph10 461 options the compiling options
72 ph10 455
73     Returns: the minimum length
74     -1 if \C was encountered
75 ph10 461 -2 internal error (missing capturing bracket)
76 ph10 603 -3 internal error (opcode not listed)
77 ph10 455 */
78    
79     static int
80     find_minlength(const uschar *code, const uschar *startcode, int options)
81     {
82     int length = -1;
83     BOOL utf8 = (options & PCRE_UTF8) != 0;
84     BOOL had_recurse = FALSE;
85     register int branchlength = 0;
86     register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
87    
88     if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
89    
90     /* Scan along the opcodes for this branch. If we get to the end of the
91     branch, check the length against that of the other branches. */
92    
93     for (;;)
94     {
95     int d, min;
96 ph10 461 uschar *cs, *ce;
97 ph10 455 register int op = *cc;
98 ph10 461
99 ph10 455 switch (op)
100     {
101 ph10 471 case OP_COND:
102     case OP_SCOND:
103    
104     /* If there is only one branch in a condition, the implied branch has zero
105     length, so we don't add anything. This covers the DEFINE "condition"
106     automatically. */
107    
108     cs = cc + GET(cc, 1);
109     if (*cs != OP_ALT)
110     {
111     cc = cs + 1 + LINK_SIZE;
112     break;
113     }
114    
115     /* Otherwise we can fall through and treat it the same as any other
116     subpattern. */
117    
118 ph10 455 case OP_CBRA:
119 ph10 461 case OP_SCBRA:
120 ph10 455 case OP_BRA:
121 ph10 461 case OP_SBRA:
122 ph10 455 case OP_ONCE:
123     d = find_minlength(cc, startcode, options);
124     if (d < 0) return d;
125     branchlength += d;
126     do cc += GET(cc, 1); while (*cc == OP_ALT);
127     cc += 1 + LINK_SIZE;
128     break;
129    
130     /* Reached end of a branch; if it's a ket it is the end of a nested
131     call. If it's ALT it is an alternation in a nested call. If it is
132     END it's the end of the outer call. All can be handled by the same code. */
133    
134     case OP_ALT:
135     case OP_KET:
136     case OP_KETRMAX:
137     case OP_KETRMIN:
138     case OP_END:
139 ph10 461 if (length < 0 || (!had_recurse && branchlength < length))
140 ph10 455 length = branchlength;
141     if (*cc != OP_ALT) return length;
142     cc += 1 + LINK_SIZE;
143     branchlength = 0;
144 ph10 461 had_recurse = FALSE;
145 ph10 455 break;
146    
147     /* Skip over assertive subpatterns */
148    
149     case OP_ASSERT:
150     case OP_ASSERT_NOT:
151     case OP_ASSERTBACK:
152     case OP_ASSERTBACK_NOT:
153     do cc += GET(cc, 1); while (*cc == OP_ALT);
154     /* Fall through */
155    
156     /* Skip over things that don't match chars */
157    
158     case OP_REVERSE:
159     case OP_CREF:
160 ph10 459 case OP_NCREF:
161 ph10 455 case OP_RREF:
162 ph10 459 case OP_NRREF:
163 ph10 455 case OP_DEF:
164     case OP_CALLOUT:
165     case OP_SOD:
166     case OP_SOM:
167     case OP_EOD:
168     case OP_EODN:
169     case OP_CIRC:
170 ph10 602 case OP_CIRCM:
171 ph10 455 case OP_DOLL:
172 ph10 602 case OP_DOLLM:
173 ph10 455 case OP_NOT_WORD_BOUNDARY:
174     case OP_WORD_BOUNDARY:
175     cc += _pcre_OP_lengths[*cc];
176     break;
177 ph10 461
178 ph10 455 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
179    
180     case OP_BRAZERO:
181 ph10 461 case OP_BRAMINZERO:
182 ph10 455 case OP_SKIPZERO:
183     cc += _pcre_OP_lengths[*cc];
184     do cc += GET(cc, 1); while (*cc == OP_ALT);
185     cc += 1 + LINK_SIZE;
186     break;
187    
188     /* Handle literal characters and + repetitions */
189    
190     case OP_CHAR:
191 ph10 602 case OP_CHARI:
192 ph10 455 case OP_NOT:
193 ph10 602 case OP_NOTI:
194 ph10 455 case OP_PLUS:
195 ph10 603 case OP_PLUSI:
196 ph10 455 case OP_MINPLUS:
197 ph10 603 case OP_MINPLUSI:
198 ph10 455 case OP_POSPLUS:
199 ph10 603 case OP_POSPLUSI:
200 ph10 455 case OP_NOTPLUS:
201 ph10 603 case OP_NOTPLUSI:
202 ph10 455 case OP_NOTMINPLUS:
203 ph10 603 case OP_NOTMINPLUSI:
204 ph10 455 case OP_NOTPOSPLUS:
205 ph10 603 case OP_NOTPOSPLUSI:
206 ph10 455 branchlength++;
207     cc += 2;
208     #ifdef SUPPORT_UTF8
209     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
210     #endif
211     break;
212 ph10 461
213 ph10 455 case OP_TYPEPLUS:
214     case OP_TYPEMINPLUS:
215 ph10 461 case OP_TYPEPOSPLUS:
216 ph10 455 branchlength++;
217     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
218     break;
219    
220     /* Handle exact repetitions. The count is already in characters, but we
221     need to skip over a multibyte character in UTF8 mode. */
222    
223     case OP_EXACT:
224 ph10 603 case OP_EXACTI:
225 ph10 461 case OP_NOTEXACT:
226 ph10 603 case OP_NOTEXACTI:
227 ph10 455 branchlength += GET2(cc,1);
228     cc += 4;
229     #ifdef SUPPORT_UTF8
230     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
231     #endif
232     break;
233    
234     case OP_TYPEEXACT:
235     branchlength += GET2(cc,1);
236     cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
237     break;
238    
239     /* Handle single-char non-literal matchers */
240    
241     case OP_PROP:
242     case OP_NOTPROP:
243     cc += 2;
244     /* Fall through */
245    
246     case OP_NOT_DIGIT:
247     case OP_DIGIT:
248     case OP_NOT_WHITESPACE:
249     case OP_WHITESPACE:
250     case OP_NOT_WORDCHAR:
251     case OP_WORDCHAR:
252     case OP_ANY:
253     case OP_ALLANY:
254     case OP_EXTUNI:
255 ph10 461 case OP_HSPACE:
256 ph10 455 case OP_NOT_HSPACE:
257     case OP_VSPACE:
258 ph10 461 case OP_NOT_VSPACE:
259 ph10 455 branchlength++;
260     cc++;
261     break;
262 ph10 461
263 ph10 455 /* "Any newline" might match two characters */
264 ph10 461
265 ph10 455 case OP_ANYNL:
266     branchlength += 2;
267     cc++;
268 ph10 461 break;
269 ph10 455
270     /* The single-byte matcher means we can't proceed in UTF-8 mode */
271    
272     case OP_ANYBYTE:
273     #ifdef SUPPORT_UTF8
274     if (utf8) return -1;
275     #endif
276     branchlength++;
277     cc++;
278 ph10 461 break;
279 ph10 455
280     /* For repeated character types, we have to test for \p and \P, which have
281     an extra two bytes of parameters. */
282    
283     case OP_TYPESTAR:
284     case OP_TYPEMINSTAR:
285     case OP_TYPEQUERY:
286     case OP_TYPEMINQUERY:
287     case OP_TYPEPOSSTAR:
288     case OP_TYPEPOSQUERY:
289     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
290     cc += _pcre_OP_lengths[op];
291     break;
292    
293     case OP_TYPEUPTO:
294     case OP_TYPEMINUPTO:
295     case OP_TYPEPOSUPTO:
296     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
297     cc += _pcre_OP_lengths[op];
298     break;
299    
300     /* Check a class for variable quantification */
301    
302     #ifdef SUPPORT_UTF8
303     case OP_XCLASS:
304     cc += GET(cc, 1) - 33;
305     /* Fall through */
306     #endif
307    
308     case OP_CLASS:
309     case OP_NCLASS:
310     cc += 33;
311    
312     switch (*cc)
313     {
314     case OP_CRPLUS:
315     case OP_CRMINPLUS:
316     branchlength++;
317 ph10 461 /* Fall through */
318 ph10 455
319     case OP_CRSTAR:
320     case OP_CRMINSTAR:
321     case OP_CRQUERY:
322     case OP_CRMINQUERY:
323 ph10 461 cc++;
324 ph10 455 break;
325 ph10 461
326 ph10 455 case OP_CRRANGE:
327     case OP_CRMINRANGE:
328     branchlength += GET2(cc,1);
329     cc += 5;
330     break;
331 ph10 461
332 ph10 455 default:
333     branchlength++;
334 ph10 461 break;
335 ph10 455 }
336     break;
337 ph10 461
338     /* Backreferences and subroutine calls are treated in the same way: we find
339     the minimum length for the subpattern. A recursion, however, causes an
340 ph10 455 a flag to be set that causes the length of this branch to be ignored. The
341     logic is that a recursion can only make sense if there is another
342     alternation that stops the recursing. That will provide the minimum length
343     (when no recursion happens). A backreference within the group that it is
344 ph10 469 referencing behaves in the same way.
345    
346 ph10 467 If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
347     matches an empty string (by default it causes a matching failure), so in
348     that case we must set the minimum length to zero. */
349 ph10 461
350 ph10 455 case OP_REF:
351 ph10 602 case OP_REFI:
352 ph10 467 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
353 ph10 469 {
354 ph10 467 ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
355     if (cs == NULL) return -2;
356     do ce += GET(ce, 1); while (*ce == OP_ALT);
357     if (cc > cs && cc < ce)
358     {
359     d = 0;
360     had_recurse = TRUE;
361     }
362     else d = find_minlength(cs, startcode, options);
363 ph10 461 }
364 ph10 469 else d = 0;
365 ph10 461 cc += 3;
366 ph10 455
367     /* Handle repeated back references */
368 ph10 461
369 ph10 455 switch (*cc)
370     {
371     case OP_CRSTAR:
372     case OP_CRMINSTAR:
373     case OP_CRQUERY:
374     case OP_CRMINQUERY:
375     min = 0;
376     cc++;
377     break;
378 ph10 461
379 ph10 455 case OP_CRRANGE:
380     case OP_CRMINRANGE:
381     min = GET2(cc, 1);
382     cc += 5;
383     break;
384 ph10 461
385 ph10 455 default:
386     min = 1;
387     break;
388     }
389    
390     branchlength += min * d;
391 ph10 461 break;
392 ph10 455
393 ph10 461 case OP_RECURSE:
394 ph10 455 cs = ce = (uschar *)startcode + GET(cc, 1);
395     if (cs == NULL) return -2;
396     do ce += GET(ce, 1); while (*ce == OP_ALT);
397     if (cc > cs && cc < ce)
398 ph10 461 had_recurse = TRUE;
399     else
400 ph10 455 branchlength += find_minlength(cs, startcode, options);
401     cc += 1 + LINK_SIZE;
402     break;
403    
404     /* Anything else does not or need not match a character. We can get the
405 ph10 461 item's length from the table, but for those that can match zero occurrences
406 ph10 602 of a character, we must take special action for UTF-8 characters. As it
407     happens, the "NOT" versions of these opcodes are used at present only for
408     ASCII characters, so they could be omitted from this list. However, in
409 ph10 603 future that may change, so we include them here so as not to leave a
410     gotcha for a future maintainer. */
411 ph10 461
412 ph10 455 case OP_UPTO:
413 ph10 602 case OP_UPTOI:
414 ph10 461 case OP_NOTUPTO:
415 ph10 602 case OP_NOTUPTOI:
416 ph10 455 case OP_MINUPTO:
417 ph10 602 case OP_MINUPTOI:
418 ph10 461 case OP_NOTMINUPTO:
419 ph10 602 case OP_NOTMINUPTOI:
420 ph10 455 case OP_POSUPTO:
421 ph10 602 case OP_POSUPTOI:
422     case OP_NOTPOSUPTO:
423     case OP_NOTPOSUPTOI:
424    
425 ph10 455 case OP_STAR:
426 ph10 602 case OP_STARI:
427     case OP_NOTSTAR:
428     case OP_NOTSTARI:
429 ph10 455 case OP_MINSTAR:
430 ph10 602 case OP_MINSTARI:
431 ph10 461 case OP_NOTMINSTAR:
432 ph10 602 case OP_NOTMINSTARI:
433 ph10 455 case OP_POSSTAR:
434 ph10 602 case OP_POSSTARI:
435 ph10 461 case OP_NOTPOSSTAR:
436 ph10 602 case OP_NOTPOSSTARI:
437    
438 ph10 455 case OP_QUERY:
439 ph10 602 case OP_QUERYI:
440     case OP_NOTQUERY:
441     case OP_NOTQUERYI:
442 ph10 455 case OP_MINQUERY:
443 ph10 602 case OP_MINQUERYI:
444 ph10 455 case OP_NOTMINQUERY:
445 ph10 602 case OP_NOTMINQUERYI:
446 ph10 455 case OP_POSQUERY:
447 ph10 602 case OP_POSQUERYI:
448 ph10 461 case OP_NOTPOSQUERY:
449 ph10 602 case OP_NOTPOSQUERYI:
450    
451 ph10 455 cc += _pcre_OP_lengths[op];
452 ph10 461 #ifdef SUPPORT_UTF8
453 ph10 455 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
454 ph10 461 #endif
455 ph10 455 break;
456 ph10 512
457 ph10 510 /* Skip these, but we need to add in the name length. */
458 ph10 512
459 ph10 510 case OP_MARK:
460     case OP_PRUNE_ARG:
461     case OP_SKIP_ARG:
462     cc += _pcre_OP_lengths[op] + cc[1];
463 ph10 512 break;
464 ph10 455
465 ph10 550 case OP_THEN_ARG:
466     cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
467     break;
468    
469 ph10 603 /* The remaining opcodes are just skipped over. */
470 ph10 461
471 ph10 603 case OP_ACCEPT:
472     case OP_CLOSE:
473     case OP_COMMIT:
474     case OP_FAIL:
475     case OP_PRUNE:
476     case OP_SET_SOM:
477     case OP_SKIP:
478     case OP_THEN:
479 ph10 455 cc += _pcre_OP_lengths[op];
480     break;
481 ph10 603
482     /* This should not occur: we list all opcodes explicitly so that when
483     new ones get added they are properly considered. */
484    
485     default:
486     return -3;
487 ph10 455 }
488     }
489     /* Control never gets here */
490     }
491    
492    
493    
494     /*************************************************
495 nigel 77 * Set a bit and maybe its alternate case *
496     *************************************************/
497    
498 ph10 535 /* Given a character, set its first byte's bit in the table, and also the
499 ph10 520 corresponding bit for the other version of a letter if we are caseless. In
500     UTF-8 mode, for characters greater than 127, we can only do the caseless thing
501     when Unicode property support is available.
502 nigel 77
503     Arguments:
504     start_bits points to the bit map
505 ph10 520 p points to the character
506 nigel 77 caseless the caseless flag
507     cd the block with char table pointers
508 ph10 535 utf8 TRUE for UTF-8 mode
509 nigel 77
510 ph10 520 Returns: pointer after the character
511 nigel 77 */
512    
513 ph10 520 static const uschar *
514     set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
515     compile_data *cd, BOOL utf8)
516 nigel 77 {
517 ph10 520 unsigned int c = *p;
518    
519 ph10 524 SET_BIT(c);
520    
521 ph10 520 #ifdef SUPPORT_UTF8
522     if (utf8 && c > 127)
523     {
524     GETCHARINC(c, p);
525     #ifdef SUPPORT_UCP
526     if (caseless)
527     {
528 ph10 535 uschar buff[8];
529 ph10 520 c = UCD_OTHERCASE(c);
530 ph10 535 (void)_pcre_ord2utf8(c, buff);
531     SET_BIT(buff[0]);
532     }
533     #endif
534 ph10 520 return p;
535     }
536 ph10 535 #endif
537 ph10 520
538     /* Not UTF-8 mode, or character is less than 127. */
539    
540 ph10 524 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
541 ph10 520 return p + 1;
542 nigel 77 }
543    
544    
545    
546     /*************************************************
547 ph10 539 * Set bits for a positive character type *
548     *************************************************/
549    
550     /* This function sets starting bits for a character type. In UTF-8 mode, we can
551     only do a direct setting for bytes less than 128, as otherwise there can be
552     confusion with bytes in the middle of UTF-8 characters. In a "traditional"
553     environment, the tables will only recognize ASCII characters anyway, but in at
554     least one Windows environment, some higher bytes bits were set in the tables.
555     So we deal with that case by considering the UTF-8 encoding.
556    
557     Arguments:
558     start_bits the starting bitmap
559     cbit type the type of character wanted
560     table_limit 32 for non-UTF-8; 16 for UTF-8
561 ph10 545 cd the block with char table pointers
562 ph10 539
563     Returns: nothing
564     */
565    
566     static void
567     set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
568     compile_data *cd)
569     {
570 ph10 545 register int c;
571 ph10 539 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
572     if (table_limit == 32) return;
573     for (c = 128; c < 256; c++)
574     {
575     if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
576     {
577     uschar buff[8];
578     (void)_pcre_ord2utf8(c, buff);
579 ph10 545 SET_BIT(buff[0]);
580     }
581     }
582 ph10 539 }
583    
584    
585     /*************************************************
586     * Set bits for a negative character type *
587     *************************************************/
588    
589     /* This function sets starting bits for a negative character type such as \D.
590     In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
591     otherwise there can be confusion with bytes in the middle of UTF-8 characters.
592 ph10 545 Unlike in the positive case, where we can set appropriate starting bits for
593 ph10 539 specific high-valued UTF-8 characters, in this case we have to set the bits for
594 ph10 545 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
595 ph10 539 0xc0 (192) for simplicity.
596    
597     Arguments:
598     start_bits the starting bitmap
599     cbit type the type of character wanted
600     table_limit 32 for non-UTF-8; 16 for UTF-8
601 ph10 545 cd the block with char table pointers
602 ph10 539
603     Returns: nothing
604     */
605    
606     static void
607     set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
608     compile_data *cd)
609     {
610 ph10 545 register int c;
611 ph10 539 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
612     if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
613     }
614    
615    
616    
617     /*************************************************
618 nigel 93 * Create bitmap of starting bytes *
619 nigel 77 *************************************************/
620    
621 nigel 93 /* This function scans a compiled unanchored expression recursively and
622     attempts to build a bitmap of the set of possible starting bytes. As time goes
623     by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
624     useful for parenthesized groups in patterns such as (a*)b where the group
625     provides some optional starting bytes but scanning must continue at the outer
626     level to find at least one mandatory byte. At the outermost level, this
627     function fails unless the result is SSB_DONE.
628 nigel 77
629     Arguments:
630     code points to an expression
631     start_bits points to a 32-byte table, initialized to 0
632     utf8 TRUE if in UTF-8 mode
633     cd the block with char table pointers
634    
635 nigel 93 Returns: SSB_FAIL => Failed to find any starting bytes
636     SSB_DONE => Found mandatory starting bytes
637     SSB_CONTINUE => Found optional starting bytes
638 nigel 77 */
639    
640 nigel 93 static int
641 ph10 603 set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,
642     compile_data *cd)
643 nigel 77 {
644     register int c;
645 nigel 93 int yield = SSB_DONE;
646 ph10 538 int table_limit = utf8? 16:32;
647 nigel 77
648 nigel 91 #if 0
649     /* ========================================================================= */
650     /* The following comment and code was inserted in January 1999. In May 2006,
651     when it was observed to cause compiler warnings about unused values, I took it
652     out again. If anybody is still using OS/2, they will have to put it back
653     manually. */
654    
655 nigel 77 /* This next statement and the later reference to dummy are here in order to
656     trick the optimizer of the IBM C compiler for OS/2 into generating correct
657     code. Apparently IBM isn't going to fix the problem, and we would rather not
658     disable optimization (in this module it actually makes a big difference, and
659     the pcre module can use all the optimization it can get). */
660    
661     volatile int dummy;
662 nigel 91 /* ========================================================================= */
663     #endif
664 nigel 77
665     do
666     {
667 nigel 93 const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
668 nigel 77 BOOL try_next = TRUE;
669    
670 nigel 93 while (try_next) /* Loop for items in this branch */
671 nigel 77 {
672 nigel 93 int rc;
673     switch(*tcode)
674 nigel 77 {
675 nigel 93 /* Fail if we reach something we don't understand */
676 nigel 77
677     default:
678 nigel 93 return SSB_FAIL;
679 nigel 77
680 nigel 93 /* If we hit a bracket or a positive lookahead assertion, recurse to set
681     bits from within the subpattern. If it can't find anything, we have to
682     give up. If it finds some mandatory character(s), we are done for this
683     branch. Otherwise, carry on scanning after the subpattern. */
684    
685     case OP_BRA:
686     case OP_SBRA:
687     case OP_CBRA:
688     case OP_SCBRA:
689     case OP_ONCE:
690     case OP_ASSERT:
691 ph10 603 rc = set_start_bits(tcode, start_bits, utf8, cd);
692 nigel 93 if (rc == SSB_FAIL) return SSB_FAIL;
693     if (rc == SSB_DONE) try_next = FALSE; else
694     {
695     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
696     tcode += 1 + LINK_SIZE;
697     }
698     break;
699    
700     /* If we hit ALT or KET, it means we haven't found anything mandatory in
701     this branch, though we might have found something optional. For ALT, we
702     continue with the next alternative, but we have to arrange that the final
703     result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
704     return SSB_CONTINUE: if this is the top level, that indicates failure,
705     but after a nested subpattern, it causes scanning to continue. */
706    
707     case OP_ALT:
708     yield = SSB_CONTINUE;
709     try_next = FALSE;
710     break;
711    
712     case OP_KET:
713     case OP_KETRMAX:
714     case OP_KETRMIN:
715     return SSB_CONTINUE;
716    
717 nigel 77 /* Skip over callout */
718    
719     case OP_CALLOUT:
720     tcode += 2 + 2*LINK_SIZE;
721     break;
722    
723     /* Skip over lookbehind and negative lookahead assertions */
724    
725     case OP_ASSERT_NOT:
726     case OP_ASSERTBACK:
727     case OP_ASSERTBACK_NOT:
728     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
729 nigel 93 tcode += 1 + LINK_SIZE;
730 nigel 77 break;
731    
732     /* BRAZERO does the bracket, but carries on. */
733    
734     case OP_BRAZERO:
735     case OP_BRAMINZERO:
736 ph10 603 if (set_start_bits(++tcode, start_bits, utf8, cd) == SSB_FAIL)
737 nigel 93 return SSB_FAIL;
738 nigel 91 /* =========================================================================
739     See the comment at the head of this function concerning the next line,
740     which was an old fudge for the benefit of OS/2.
741 nigel 77 dummy = 1;
742 nigel 91 ========================================================================= */
743 nigel 77 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
744 nigel 93 tcode += 1 + LINK_SIZE;
745 nigel 77 break;
746    
747 ph10 335 /* SKIPZERO skips the bracket. */
748    
749     case OP_SKIPZERO:
750 ph10 358 tcode++;
751 ph10 335 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
752     tcode += 1 + LINK_SIZE;
753     break;
754    
755 nigel 77 /* Single-char * or ? sets the bit and tries the next item */
756    
757     case OP_STAR:
758     case OP_MINSTAR:
759 nigel 93 case OP_POSSTAR:
760 nigel 77 case OP_QUERY:
761     case OP_MINQUERY:
762 nigel 93 case OP_POSQUERY:
763 ph10 603 tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
764 nigel 77 break;
765    
766 ph10 602 case OP_STARI:
767     case OP_MINSTARI:
768     case OP_POSSTARI:
769     case OP_QUERYI:
770     case OP_MINQUERYI:
771     case OP_POSQUERYI:
772     tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
773     break;
774    
775 nigel 77 /* Single-char upto sets the bit and tries the next */
776    
777     case OP_UPTO:
778     case OP_MINUPTO:
779 nigel 93 case OP_POSUPTO:
780 ph10 603 tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);
781 nigel 77 break;
782    
783 ph10 602 case OP_UPTOI:
784     case OP_MINUPTOI:
785     case OP_POSUPTOI:
786     tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);
787     break;
788    
789 nigel 77 /* At least one single char sets the bit and stops */
790    
791 ph10 602 case OP_EXACT:
792 nigel 77 tcode += 2;
793 ph10 602 /* Fall through */
794 nigel 77 case OP_CHAR:
795     case OP_PLUS:
796     case OP_MINPLUS:
797 nigel 93 case OP_POSPLUS:
798 ph10 603 (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
799 nigel 77 try_next = FALSE;
800     break;
801 ph10 535
802 ph10 603 case OP_EXACTI:
803     tcode += 2;
804     /* Fall through */
805 ph10 602 case OP_CHARI:
806     case OP_PLUSI:
807     case OP_MINPLUSI:
808     case OP_POSPLUSI:
809     (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
810     try_next = FALSE;
811     break;
812    
813 ph10 535 /* Special spacing and line-terminating items. These recognize specific
814     lists of characters. The difference between VSPACE and ANYNL is that the
815     latter can match the two-character CRLF sequence, but that is not
816     relevant for finding the first character, so their code here is
817 ph10 524 identical. */
818 ph10 535
819 ph10 524 case OP_HSPACE:
820     SET_BIT(0x09);
821     SET_BIT(0x20);
822     if (utf8)
823 ph10 535 {
824 ph10 545 SET_BIT(0xC2); /* For U+00A0 */
825 ph10 524 SET_BIT(0xE1); /* For U+1680, U+180E */
826     SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
827 ph10 535 SET_BIT(0xE3); /* For U+3000 */
828 ph10 524 }
829 ph10 538 else SET_BIT(0xA0);
830 ph10 524 try_next = FALSE;
831 ph10 535 break;
832 nigel 77
833 ph10 535 case OP_ANYNL:
834 ph10 524 case OP_VSPACE:
835 ph10 535 SET_BIT(0x0A);
836     SET_BIT(0x0B);
837     SET_BIT(0x0C);
838     SET_BIT(0x0D);
839 ph10 545 if (utf8)
840     {
841     SET_BIT(0xC2); /* For U+0085 */
842 ph10 538 SET_BIT(0xE2); /* For U+2028, U+2029 */
843 ph10 545 }
844 ph10 538 else SET_BIT(0x85);
845 ph10 524 try_next = FALSE;
846 ph10 535 break;
847 ph10 524
848 ph10 535 /* Single character types set the bits and stop. Note that if PCRE_UCP
849     is set, we do not see these op codes because \d etc are converted to
850 ph10 545 properties. Therefore, these apply in the case when only characters less
851 ph10 539 than 256 are recognized to match the types. */
852 nigel 77
853     case OP_NOT_DIGIT:
854 ph10 539 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
855 nigel 77 try_next = FALSE;
856     break;
857    
858     case OP_DIGIT:
859 ph10 539 set_type_bits(start_bits, cbit_digit, table_limit, cd);
860 nigel 77 try_next = FALSE;
861     break;
862    
863 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
864 ph10 539 ensure it is set as not whitespace. */
865 nigel 91
866 nigel 77 case OP_NOT_WHITESPACE:
867 ph10 539 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
868     start_bits[1] |= 0x08;
869 nigel 77 try_next = FALSE;
870     break;
871    
872 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
873 ph10 539 not set it from the table. */
874 nigel 91
875 nigel 77 case OP_WHITESPACE:
876 ph10 539 c = start_bits[1]; /* Save in case it was already set */
877     set_type_bits(start_bits, cbit_space, table_limit, cd);
878     start_bits[1] = (start_bits[1] & ~0x08) | c;
879 nigel 77 try_next = FALSE;
880     break;
881    
882     case OP_NOT_WORDCHAR:
883 ph10 539 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
884 nigel 77 try_next = FALSE;
885     break;
886    
887     case OP_WORDCHAR:
888 ph10 539 set_type_bits(start_bits, cbit_word, table_limit, cd);
889 nigel 77 try_next = FALSE;
890     break;
891 ph10 545
892 nigel 77 /* One or more character type fudges the pointer and restarts, knowing
893     it will hit a single character type and stop there. */
894    
895     case OP_TYPEPLUS:
896     case OP_TYPEMINPLUS:
897 ph10 535 case OP_TYPEPOSPLUS:
898 nigel 77 tcode++;
899     break;
900    
901     case OP_TYPEEXACT:
902     tcode += 3;
903     break;
904    
905     /* Zero or more repeats of character types set the bits and then
906     try again. */
907    
908     case OP_TYPEUPTO:
909     case OP_TYPEMINUPTO:
910 nigel 93 case OP_TYPEPOSUPTO:
911 nigel 77 tcode += 2; /* Fall through */
912    
913     case OP_TYPESTAR:
914     case OP_TYPEMINSTAR:
915 nigel 93 case OP_TYPEPOSSTAR:
916 nigel 77 case OP_TYPEQUERY:
917     case OP_TYPEMINQUERY:
918 nigel 93 case OP_TYPEPOSQUERY:
919 nigel 77 switch(tcode[1])
920     {
921 ph10 523 default:
922 nigel 77 case OP_ANY:
923 ph10 345 case OP_ALLANY:
924 nigel 93 return SSB_FAIL;
925 ph10 535
926 ph10 524 case OP_HSPACE:
927     SET_BIT(0x09);
928     SET_BIT(0x20);
929     if (utf8)
930 ph10 535 {
931 ph10 545 SET_BIT(0xC2); /* For U+00A0 */
932 ph10 524 SET_BIT(0xE1); /* For U+1680, U+180E */
933     SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
934 ph10 535 SET_BIT(0xE3); /* For U+3000 */
935 ph10 524 }
936 ph10 538 else SET_BIT(0xA0);
937 ph10 535 break;
938    
939     case OP_ANYNL:
940 ph10 524 case OP_VSPACE:
941 ph10 535 SET_BIT(0x0A);
942     SET_BIT(0x0B);
943     SET_BIT(0x0C);
944     SET_BIT(0x0D);
945 ph10 545 if (utf8)
946 ph10 538 {
947 ph10 545 SET_BIT(0xC2); /* For U+0085 */
948 ph10 538 SET_BIT(0xE2); /* For U+2028, U+2029 */
949 ph10 545 }
950 ph10 538 else SET_BIT(0x85);
951 ph10 535 break;
952    
953 nigel 77 case OP_NOT_DIGIT:
954 ph10 539 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
955 nigel 77 break;
956    
957     case OP_DIGIT:
958 ph10 539 set_type_bits(start_bits, cbit_digit, table_limit, cd);
959 nigel 77 break;
960    
961 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
962 ph10 539 ensure it gets set as not whitespace. */
963 nigel 91
964 nigel 77 case OP_NOT_WHITESPACE:
965 ph10 539 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
966 ph10 545 start_bits[1] |= 0x08;
967 nigel 77 break;
968    
969 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
970 ph10 539 avoid setting it. */
971 nigel 91
972 nigel 77 case OP_WHITESPACE:
973 ph10 539 c = start_bits[1]; /* Save in case it was already set */
974     set_type_bits(start_bits, cbit_space, table_limit, cd);
975     start_bits[1] = (start_bits[1] & ~0x08) | c;
976 nigel 77 break;
977    
978     case OP_NOT_WORDCHAR:
979 ph10 539 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
980 nigel 77 break;
981    
982     case OP_WORDCHAR:
983 ph10 539 set_type_bits(start_bits, cbit_word, table_limit, cd);
984 nigel 77 break;
985     }
986    
987     tcode += 2;
988     break;
989    
990     /* Character class where all the information is in a bit map: set the
991     bits and either carry on or not, according to the repeat count. If it was
992     a negative class, and we are operating with UTF-8 characters, any byte
993     with a value >= 0xc4 is a potentially valid starter because it starts a
994     character with a value > 255. */
995    
996     case OP_NCLASS:
997 ph10 111 #ifdef SUPPORT_UTF8
998 nigel 77 if (utf8)
999     {
1000     start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1001     memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1002     }
1003 ph10 111 #endif
1004 nigel 77 /* Fall through */
1005    
1006     case OP_CLASS:
1007     {
1008     tcode++;
1009    
1010     /* In UTF-8 mode, the bits in a bit map correspond to character
1011     values, not to byte values. However, the bit map we are constructing is
1012     for byte values. So we have to do a conversion for characters whose
1013     value is > 127. In fact, there are only two possible starting bytes for
1014     characters in the range 128 - 255. */
1015    
1016 ph10 107 #ifdef SUPPORT_UTF8
1017 nigel 77 if (utf8)
1018     {
1019     for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
1020     for (c = 128; c < 256; c++)
1021     {
1022     if ((tcode[c/8] && (1 << (c&7))) != 0)
1023     {
1024     int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1025     start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1026     c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1027     }
1028     }
1029     }
1030    
1031     /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1032    
1033     else
1034 ph10 111 #endif
1035 nigel 77 {
1036     for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
1037     }
1038    
1039     /* Advance past the bit map, and act on what follows */
1040    
1041     tcode += 32;
1042     switch (*tcode)
1043     {
1044     case OP_CRSTAR:
1045     case OP_CRMINSTAR:
1046     case OP_CRQUERY:
1047     case OP_CRMINQUERY:
1048     tcode++;
1049     break;
1050    
1051     case OP_CRRANGE:
1052     case OP_CRMINRANGE:
1053     if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
1054     else try_next = FALSE;
1055     break;
1056    
1057     default:
1058     try_next = FALSE;
1059     break;
1060     }
1061     }
1062     break; /* End of bitmap class handling */
1063    
1064     } /* End of switch */
1065     } /* End of try_next loop */
1066    
1067     code += GET(code, 1); /* Advance to next branch */
1068     }
1069     while (*code == OP_ALT);
1070 nigel 93 return yield;
1071 nigel 77 }
1072    
1073    
1074    
1075     /*************************************************
1076     * Study a compiled expression *
1077     *************************************************/
1078    
1079     /* This function is handed a compiled expression that it must study to produce
1080     information that will speed up the matching. It returns a pcre_extra block
1081     which then gets handed back to pcre_exec().
1082    
1083     Arguments:
1084     re points to the compiled expression
1085     options contains option bits
1086     errorptr points to where to place error messages;
1087     set NULL unless error
1088    
1089     Returns: pointer to a pcre_extra block, with study_data filled in and the
1090 ph10 455 appropriate flags set;
1091 nigel 77 NULL on error or if no optimization possible
1092     */
1093    
1094 ph10 359 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1095 nigel 77 pcre_study(const pcre *external_re, int options, const char **errorptr)
1096     {
1097 ph10 455 int min;
1098     BOOL bits_set = FALSE;
1099 nigel 77 uschar start_bits[32];
1100     pcre_extra *extra;
1101     pcre_study_data *study;
1102     const uschar *tables;
1103 nigel 91 uschar *code;
1104     compile_data compile_block;
1105 nigel 77 const real_pcre *re = (const real_pcre *)external_re;
1106    
1107     *errorptr = NULL;
1108    
1109     if (re == NULL || re->magic_number != MAGIC_NUMBER)
1110     {
1111     *errorptr = "argument is not a compiled regular expression";
1112     return NULL;
1113     }
1114    
1115     if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1116     {
1117     *errorptr = "unknown or incorrect option bit(s) set";
1118     return NULL;
1119     }
1120    
1121 nigel 91 code = (uschar *)re + re->name_table_offset +
1122     (re->name_count * re->name_entry_size);
1123    
1124 nigel 77 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1125 ph10 461 a multiline pattern that matches only at "line starts", there is no point in
1126 ph10 455 seeking a list of starting bytes. */
1127 nigel 77
1128 ph10 455 if ((re->options & PCRE_ANCHORED) == 0 &&
1129     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1130     {
1131     /* Set the character tables in the block that is passed around */
1132 ph10 461
1133 ph10 455 tables = re->tables;
1134     if (tables == NULL)
1135     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1136     (void *)(&tables));
1137 ph10 461
1138 ph10 455 compile_block.lcc = tables + lcc_offset;
1139     compile_block.fcc = tables + fcc_offset;
1140     compile_block.cbits = tables + cbits_offset;
1141     compile_block.ctypes = tables + ctypes_offset;
1142 ph10 461
1143 ph10 455 /* See if we can find a fixed set of initial characters for the pattern. */
1144 ph10 461
1145 ph10 455 memset(start_bits, 0, 32 * sizeof(uschar));
1146 ph10 603 bits_set = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1147 ph10 455 &compile_block) == SSB_DONE;
1148     }
1149 ph10 461
1150 ph10 455 /* Find the minimum length of subject string. */
1151 nigel 77
1152 ph10 603 switch(min = find_minlength(code, code, re->options))
1153     {
1154     case -2: *errorptr = "internal error: missing capturing bracket"; break;
1155     case -3: *errorptr = "internal error: opcode not recognized"; break;
1156     default: break;
1157     }
1158 nigel 77
1159 ph10 455 /* Return NULL if no optimization is possible. */
1160 nigel 77
1161 ph10 455 if (!bits_set && min < 0) return NULL;
1162 nigel 77
1163     /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
1164     the latter, which is pointed to by the former, which may also get additional
1165     data set later by the calling program. At the moment, the size of
1166     pcre_study_data is fixed. We nevertheless save it in a field for returning via
1167     the pcre_fullinfo() function so that if it becomes variable in the future, we
1168     don't have to change that code. */
1169    
1170     extra = (pcre_extra *)(pcre_malloc)
1171     (sizeof(pcre_extra) + sizeof(pcre_study_data));
1172    
1173     if (extra == NULL)
1174     {
1175     *errorptr = "failed to get memory";
1176     return NULL;
1177     }
1178    
1179     study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
1180     extra->flags = PCRE_EXTRA_STUDY_DATA;
1181     extra->study_data = study;
1182    
1183     study->size = sizeof(pcre_study_data);
1184 ph10 455 study->flags = 0;
1185 nigel 77
1186 ph10 455 if (bits_set)
1187     {
1188     study->flags |= PCRE_STUDY_MAPPED;
1189     memcpy(study->start_bits, start_bits, sizeof(start_bits));
1190     }
1191 ph10 461
1192 ph10 455 if (min >= 0)
1193     {
1194     study->flags |= PCRE_STUDY_MINLEN;
1195     study->minlength = min;
1196 ph10 461 }
1197 ph10 455
1198 nigel 77 return extra;
1199     }
1200    
1201     /* End of pcre_study.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12