/[pcre]/code/branches/pcre16/pcre_study.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 602 - (hide annotations) (download)
Wed May 25 08:29:03 2011 UTC (3 years, 2 months ago) by ph10
Original Path: code/trunk/pcre_study.c
File MIME type: text/plain
File size: 33374 byte(s)
Remove OP_OPT by handling /i and /m entirely at compile time. Fixes bug with 
patterns like /(?i:([^b]))(?1)/, where the /i option was mishandled.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 475 Copyright (c) 1997-2010 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_study(), along with local
42     supporting functions. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 77 #include "pcre_internal.h"
50    
51 ph10 524 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52 nigel 77
53 nigel 93 /* Returns from set_start_bits() */
54    
55     enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
56    
57    
58 ph10 455
59 nigel 77 /*************************************************
60 ph10 455 * Find the minimum subject length for a group *
61     *************************************************/
62    
63     /* Scan a parenthesized group and compute the minimum length of subject that
64 ph10 461 is needed to match it. This is a lower bound; it does not mean there is a
65 ph10 455 string of that length that matches. In UTF8 mode, the result is in characters
66     rather than bytes.
67    
68     Arguments:
69     code pointer to start of group (the bracket)
70     startcode pointer to start of the whole pattern
71 ph10 461 options the compiling options
72 ph10 455
73     Returns: the minimum length
74     -1 if \C was encountered
75 ph10 461 -2 internal error (missing capturing bracket)
76 ph10 455 */
77    
78     static int
79     find_minlength(const uschar *code, const uschar *startcode, int options)
80     {
81     int length = -1;
82     BOOL utf8 = (options & PCRE_UTF8) != 0;
83     BOOL had_recurse = FALSE;
84     register int branchlength = 0;
85     register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
86    
87     if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
88    
89     /* Scan along the opcodes for this branch. If we get to the end of the
90     branch, check the length against that of the other branches. */
91    
92     for (;;)
93     {
94     int d, min;
95 ph10 461 uschar *cs, *ce;
96 ph10 455 register int op = *cc;
97 ph10 461
98 ph10 455 switch (op)
99     {
100 ph10 471 case OP_COND:
101     case OP_SCOND:
102    
103     /* If there is only one branch in a condition, the implied branch has zero
104     length, so we don't add anything. This covers the DEFINE "condition"
105     automatically. */
106    
107     cs = cc + GET(cc, 1);
108     if (*cs != OP_ALT)
109     {
110     cc = cs + 1 + LINK_SIZE;
111     break;
112     }
113    
114     /* Otherwise we can fall through and treat it the same as any other
115     subpattern. */
116    
117 ph10 455 case OP_CBRA:
118 ph10 461 case OP_SCBRA:
119 ph10 455 case OP_BRA:
120 ph10 461 case OP_SBRA:
121 ph10 455 case OP_ONCE:
122     d = find_minlength(cc, startcode, options);
123     if (d < 0) return d;
124     branchlength += d;
125     do cc += GET(cc, 1); while (*cc == OP_ALT);
126     cc += 1 + LINK_SIZE;
127     break;
128    
129     /* Reached end of a branch; if it's a ket it is the end of a nested
130     call. If it's ALT it is an alternation in a nested call. If it is
131     END it's the end of the outer call. All can be handled by the same code. */
132    
133     case OP_ALT:
134     case OP_KET:
135     case OP_KETRMAX:
136     case OP_KETRMIN:
137     case OP_END:
138 ph10 461 if (length < 0 || (!had_recurse && branchlength < length))
139 ph10 455 length = branchlength;
140     if (*cc != OP_ALT) return length;
141     cc += 1 + LINK_SIZE;
142     branchlength = 0;
143 ph10 461 had_recurse = FALSE;
144 ph10 455 break;
145    
146     /* Skip over assertive subpatterns */
147    
148     case OP_ASSERT:
149     case OP_ASSERT_NOT:
150     case OP_ASSERTBACK:
151     case OP_ASSERTBACK_NOT:
152     do cc += GET(cc, 1); while (*cc == OP_ALT);
153     /* Fall through */
154    
155     /* Skip over things that don't match chars */
156    
157     case OP_REVERSE:
158     case OP_CREF:
159 ph10 459 case OP_NCREF:
160 ph10 455 case OP_RREF:
161 ph10 459 case OP_NRREF:
162 ph10 455 case OP_DEF:
163     case OP_CALLOUT:
164     case OP_SOD:
165     case OP_SOM:
166     case OP_EOD:
167     case OP_EODN:
168     case OP_CIRC:
169 ph10 602 case OP_CIRCM:
170 ph10 455 case OP_DOLL:
171 ph10 602 case OP_DOLLM:
172 ph10 455 case OP_NOT_WORD_BOUNDARY:
173     case OP_WORD_BOUNDARY:
174     cc += _pcre_OP_lengths[*cc];
175     break;
176 ph10 461
177 ph10 455 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
178    
179     case OP_BRAZERO:
180 ph10 461 case OP_BRAMINZERO:
181 ph10 455 case OP_SKIPZERO:
182     cc += _pcre_OP_lengths[*cc];
183     do cc += GET(cc, 1); while (*cc == OP_ALT);
184     cc += 1 + LINK_SIZE;
185     break;
186    
187     /* Handle literal characters and + repetitions */
188    
189     case OP_CHAR:
190 ph10 602 case OP_CHARI:
191 ph10 455 case OP_NOT:
192 ph10 602 case OP_NOTI:
193 ph10 455 case OP_PLUS:
194     case OP_MINPLUS:
195     case OP_POSPLUS:
196     case OP_NOTPLUS:
197     case OP_NOTMINPLUS:
198     case OP_NOTPOSPLUS:
199     branchlength++;
200     cc += 2;
201     #ifdef SUPPORT_UTF8
202     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
203     #endif
204     break;
205 ph10 461
206 ph10 455 case OP_TYPEPLUS:
207     case OP_TYPEMINPLUS:
208 ph10 461 case OP_TYPEPOSPLUS:
209 ph10 455 branchlength++;
210     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
211     break;
212    
213     /* Handle exact repetitions. The count is already in characters, but we
214     need to skip over a multibyte character in UTF8 mode. */
215    
216     case OP_EXACT:
217 ph10 461 case OP_NOTEXACT:
218 ph10 455 branchlength += GET2(cc,1);
219     cc += 4;
220     #ifdef SUPPORT_UTF8
221     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
222     #endif
223     break;
224    
225     case OP_TYPEEXACT:
226     branchlength += GET2(cc,1);
227     cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
228     break;
229    
230     /* Handle single-char non-literal matchers */
231    
232     case OP_PROP:
233     case OP_NOTPROP:
234     cc += 2;
235     /* Fall through */
236    
237     case OP_NOT_DIGIT:
238     case OP_DIGIT:
239     case OP_NOT_WHITESPACE:
240     case OP_WHITESPACE:
241     case OP_NOT_WORDCHAR:
242     case OP_WORDCHAR:
243     case OP_ANY:
244     case OP_ALLANY:
245     case OP_EXTUNI:
246 ph10 461 case OP_HSPACE:
247 ph10 455 case OP_NOT_HSPACE:
248     case OP_VSPACE:
249 ph10 461 case OP_NOT_VSPACE:
250 ph10 455 branchlength++;
251     cc++;
252     break;
253 ph10 461
254 ph10 455 /* "Any newline" might match two characters */
255 ph10 461
256 ph10 455 case OP_ANYNL:
257     branchlength += 2;
258     cc++;
259 ph10 461 break;
260 ph10 455
261     /* The single-byte matcher means we can't proceed in UTF-8 mode */
262    
263     case OP_ANYBYTE:
264     #ifdef SUPPORT_UTF8
265     if (utf8) return -1;
266     #endif
267     branchlength++;
268     cc++;
269 ph10 461 break;
270 ph10 455
271     /* For repeated character types, we have to test for \p and \P, which have
272     an extra two bytes of parameters. */
273    
274     case OP_TYPESTAR:
275     case OP_TYPEMINSTAR:
276     case OP_TYPEQUERY:
277     case OP_TYPEMINQUERY:
278     case OP_TYPEPOSSTAR:
279     case OP_TYPEPOSQUERY:
280     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
281     cc += _pcre_OP_lengths[op];
282     break;
283    
284     case OP_TYPEUPTO:
285     case OP_TYPEMINUPTO:
286     case OP_TYPEPOSUPTO:
287     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
288     cc += _pcre_OP_lengths[op];
289     break;
290    
291     /* Check a class for variable quantification */
292    
293     #ifdef SUPPORT_UTF8
294     case OP_XCLASS:
295     cc += GET(cc, 1) - 33;
296     /* Fall through */
297     #endif
298    
299     case OP_CLASS:
300     case OP_NCLASS:
301     cc += 33;
302    
303     switch (*cc)
304     {
305     case OP_CRPLUS:
306     case OP_CRMINPLUS:
307     branchlength++;
308 ph10 461 /* Fall through */
309 ph10 455
310     case OP_CRSTAR:
311     case OP_CRMINSTAR:
312     case OP_CRQUERY:
313     case OP_CRMINQUERY:
314 ph10 461 cc++;
315 ph10 455 break;
316 ph10 461
317 ph10 455 case OP_CRRANGE:
318     case OP_CRMINRANGE:
319     branchlength += GET2(cc,1);
320     cc += 5;
321     break;
322 ph10 461
323 ph10 455 default:
324     branchlength++;
325 ph10 461 break;
326 ph10 455 }
327     break;
328 ph10 461
329     /* Backreferences and subroutine calls are treated in the same way: we find
330     the minimum length for the subpattern. A recursion, however, causes an
331 ph10 455 a flag to be set that causes the length of this branch to be ignored. The
332     logic is that a recursion can only make sense if there is another
333     alternation that stops the recursing. That will provide the minimum length
334     (when no recursion happens). A backreference within the group that it is
335 ph10 469 referencing behaves in the same way.
336    
337 ph10 467 If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
338     matches an empty string (by default it causes a matching failure), so in
339     that case we must set the minimum length to zero. */
340 ph10 461
341 ph10 455 case OP_REF:
342 ph10 602 case OP_REFI:
343 ph10 467 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
344 ph10 469 {
345 ph10 467 ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
346     if (cs == NULL) return -2;
347     do ce += GET(ce, 1); while (*ce == OP_ALT);
348     if (cc > cs && cc < ce)
349     {
350     d = 0;
351     had_recurse = TRUE;
352     }
353     else d = find_minlength(cs, startcode, options);
354 ph10 461 }
355 ph10 469 else d = 0;
356 ph10 461 cc += 3;
357 ph10 455
358     /* Handle repeated back references */
359 ph10 461
360 ph10 455 switch (*cc)
361     {
362     case OP_CRSTAR:
363     case OP_CRMINSTAR:
364     case OP_CRQUERY:
365     case OP_CRMINQUERY:
366     min = 0;
367     cc++;
368     break;
369 ph10 461
370 ph10 455 case OP_CRRANGE:
371     case OP_CRMINRANGE:
372     min = GET2(cc, 1);
373     cc += 5;
374     break;
375 ph10 461
376 ph10 455 default:
377     min = 1;
378     break;
379     }
380    
381     branchlength += min * d;
382 ph10 461 break;
383 ph10 455
384 ph10 461 case OP_RECURSE:
385 ph10 455 cs = ce = (uschar *)startcode + GET(cc, 1);
386     if (cs == NULL) return -2;
387     do ce += GET(ce, 1); while (*ce == OP_ALT);
388     if (cc > cs && cc < ce)
389 ph10 461 had_recurse = TRUE;
390     else
391 ph10 455 branchlength += find_minlength(cs, startcode, options);
392     cc += 1 + LINK_SIZE;
393     break;
394    
395     /* Anything else does not or need not match a character. We can get the
396 ph10 461 item's length from the table, but for those that can match zero occurrences
397 ph10 602 of a character, we must take special action for UTF-8 characters. As it
398     happens, the "NOT" versions of these opcodes are used at present only for
399     ASCII characters, so they could be omitted from this list. However, in
400     future that may change, so we leave them in this special case. */
401 ph10 461
402 ph10 455 case OP_UPTO:
403 ph10 602 case OP_UPTOI:
404 ph10 461 case OP_NOTUPTO:
405 ph10 602 case OP_NOTUPTOI:
406 ph10 455 case OP_MINUPTO:
407 ph10 602 case OP_MINUPTOI:
408 ph10 461 case OP_NOTMINUPTO:
409 ph10 602 case OP_NOTMINUPTOI:
410 ph10 455 case OP_POSUPTO:
411 ph10 602 case OP_POSUPTOI:
412     case OP_NOTPOSUPTO:
413     case OP_NOTPOSUPTOI:
414    
415 ph10 455 case OP_STAR:
416 ph10 602 case OP_STARI:
417     case OP_NOTSTAR:
418     case OP_NOTSTARI:
419 ph10 455 case OP_MINSTAR:
420 ph10 602 case OP_MINSTARI:
421 ph10 461 case OP_NOTMINSTAR:
422 ph10 602 case OP_NOTMINSTARI:
423 ph10 455 case OP_POSSTAR:
424 ph10 602 case OP_POSSTARI:
425 ph10 461 case OP_NOTPOSSTAR:
426 ph10 602 case OP_NOTPOSSTARI:
427    
428 ph10 455 case OP_QUERY:
429 ph10 602 case OP_QUERYI:
430     case OP_NOTQUERY:
431     case OP_NOTQUERYI:
432 ph10 455 case OP_MINQUERY:
433 ph10 602 case OP_MINQUERYI:
434 ph10 455 case OP_NOTMINQUERY:
435 ph10 602 case OP_NOTMINQUERYI:
436 ph10 455 case OP_POSQUERY:
437 ph10 602 case OP_POSQUERYI:
438 ph10 461 case OP_NOTPOSQUERY:
439 ph10 602 case OP_NOTPOSQUERYI:
440    
441 ph10 455 cc += _pcre_OP_lengths[op];
442 ph10 461 #ifdef SUPPORT_UTF8
443 ph10 455 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
444 ph10 461 #endif
445 ph10 455 break;
446 ph10 512
447 ph10 510 /* Skip these, but we need to add in the name length. */
448 ph10 512
449 ph10 510 case OP_MARK:
450     case OP_PRUNE_ARG:
451     case OP_SKIP_ARG:
452     cc += _pcre_OP_lengths[op] + cc[1];
453 ph10 512 break;
454 ph10 455
455 ph10 550 case OP_THEN_ARG:
456     cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
457     break;
458    
459 ph10 455 /* For the record, these are the opcodes that are matched by "default":
460     OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
461     OP_THEN. */
462 ph10 461
463 ph10 455 default:
464     cc += _pcre_OP_lengths[op];
465     break;
466     }
467     }
468     /* Control never gets here */
469     }
470    
471    
472    
473     /*************************************************
474 nigel 77 * Set a bit and maybe its alternate case *
475     *************************************************/
476    
477 ph10 535 /* Given a character, set its first byte's bit in the table, and also the
478 ph10 520 corresponding bit for the other version of a letter if we are caseless. In
479     UTF-8 mode, for characters greater than 127, we can only do the caseless thing
480     when Unicode property support is available.
481 nigel 77
482     Arguments:
483     start_bits points to the bit map
484 ph10 520 p points to the character
485 nigel 77 caseless the caseless flag
486     cd the block with char table pointers
487 ph10 535 utf8 TRUE for UTF-8 mode
488 nigel 77
489 ph10 520 Returns: pointer after the character
490 nigel 77 */
491    
492 ph10 520 static const uschar *
493     set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
494     compile_data *cd, BOOL utf8)
495 nigel 77 {
496 ph10 520 unsigned int c = *p;
497    
498 ph10 524 SET_BIT(c);
499    
500 ph10 520 #ifdef SUPPORT_UTF8
501     if (utf8 && c > 127)
502     {
503     GETCHARINC(c, p);
504     #ifdef SUPPORT_UCP
505     if (caseless)
506     {
507 ph10 535 uschar buff[8];
508 ph10 520 c = UCD_OTHERCASE(c);
509 ph10 535 (void)_pcre_ord2utf8(c, buff);
510     SET_BIT(buff[0]);
511     }
512     #endif
513 ph10 520 return p;
514     }
515 ph10 535 #endif
516 ph10 520
517     /* Not UTF-8 mode, or character is less than 127. */
518    
519 ph10 524 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
520 ph10 520 return p + 1;
521 nigel 77 }
522    
523    
524    
525     /*************************************************
526 ph10 539 * Set bits for a positive character type *
527     *************************************************/
528    
529     /* This function sets starting bits for a character type. In UTF-8 mode, we can
530     only do a direct setting for bytes less than 128, as otherwise there can be
531     confusion with bytes in the middle of UTF-8 characters. In a "traditional"
532     environment, the tables will only recognize ASCII characters anyway, but in at
533     least one Windows environment, some higher bytes bits were set in the tables.
534     So we deal with that case by considering the UTF-8 encoding.
535    
536     Arguments:
537     start_bits the starting bitmap
538     cbit type the type of character wanted
539     table_limit 32 for non-UTF-8; 16 for UTF-8
540 ph10 545 cd the block with char table pointers
541 ph10 539
542     Returns: nothing
543     */
544    
545     static void
546     set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
547     compile_data *cd)
548     {
549 ph10 545 register int c;
550 ph10 539 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
551     if (table_limit == 32) return;
552     for (c = 128; c < 256; c++)
553     {
554     if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
555     {
556     uschar buff[8];
557     (void)_pcre_ord2utf8(c, buff);
558 ph10 545 SET_BIT(buff[0]);
559     }
560     }
561 ph10 539 }
562    
563    
564     /*************************************************
565     * Set bits for a negative character type *
566     *************************************************/
567    
568     /* This function sets starting bits for a negative character type such as \D.
569     In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
570     otherwise there can be confusion with bytes in the middle of UTF-8 characters.
571 ph10 545 Unlike in the positive case, where we can set appropriate starting bits for
572 ph10 539 specific high-valued UTF-8 characters, in this case we have to set the bits for
573 ph10 545 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
574 ph10 539 0xc0 (192) for simplicity.
575    
576     Arguments:
577     start_bits the starting bitmap
578     cbit type the type of character wanted
579     table_limit 32 for non-UTF-8; 16 for UTF-8
580 ph10 545 cd the block with char table pointers
581 ph10 539
582     Returns: nothing
583     */
584    
585     static void
586     set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
587     compile_data *cd)
588     {
589 ph10 545 register int c;
590 ph10 539 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
591     if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
592     }
593    
594    
595    
596     /*************************************************
597 nigel 93 * Create bitmap of starting bytes *
598 nigel 77 *************************************************/
599    
600 nigel 93 /* This function scans a compiled unanchored expression recursively and
601     attempts to build a bitmap of the set of possible starting bytes. As time goes
602     by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
603     useful for parenthesized groups in patterns such as (a*)b where the group
604     provides some optional starting bytes but scanning must continue at the outer
605     level to find at least one mandatory byte. At the outermost level, this
606     function fails unless the result is SSB_DONE.
607 nigel 77
608     Arguments:
609     code points to an expression
610     start_bits points to a 32-byte table, initialized to 0
611     caseless the current state of the caseless flag
612     utf8 TRUE if in UTF-8 mode
613     cd the block with char table pointers
614    
615 nigel 93 Returns: SSB_FAIL => Failed to find any starting bytes
616     SSB_DONE => Found mandatory starting bytes
617     SSB_CONTINUE => Found optional starting bytes
618 nigel 77 */
619    
620 nigel 93 static int
621 nigel 77 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
622     BOOL utf8, compile_data *cd)
623     {
624     register int c;
625 nigel 93 int yield = SSB_DONE;
626 ph10 538 int table_limit = utf8? 16:32;
627 nigel 77
628 nigel 91 #if 0
629     /* ========================================================================= */
630     /* The following comment and code was inserted in January 1999. In May 2006,
631     when it was observed to cause compiler warnings about unused values, I took it
632     out again. If anybody is still using OS/2, they will have to put it back
633     manually. */
634    
635 nigel 77 /* This next statement and the later reference to dummy are here in order to
636     trick the optimizer of the IBM C compiler for OS/2 into generating correct
637     code. Apparently IBM isn't going to fix the problem, and we would rather not
638     disable optimization (in this module it actually makes a big difference, and
639     the pcre module can use all the optimization it can get). */
640    
641     volatile int dummy;
642 nigel 91 /* ========================================================================= */
643     #endif
644 nigel 77
645     do
646     {
647 nigel 93 const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
648 nigel 77 BOOL try_next = TRUE;
649    
650 nigel 93 while (try_next) /* Loop for items in this branch */
651 nigel 77 {
652 nigel 93 int rc;
653     switch(*tcode)
654 nigel 77 {
655 nigel 93 /* Fail if we reach something we don't understand */
656 nigel 77
657     default:
658 nigel 93 return SSB_FAIL;
659 nigel 77
660 nigel 93 /* If we hit a bracket or a positive lookahead assertion, recurse to set
661     bits from within the subpattern. If it can't find anything, we have to
662     give up. If it finds some mandatory character(s), we are done for this
663     branch. Otherwise, carry on scanning after the subpattern. */
664    
665     case OP_BRA:
666     case OP_SBRA:
667     case OP_CBRA:
668     case OP_SCBRA:
669     case OP_ONCE:
670     case OP_ASSERT:
671     rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
672     if (rc == SSB_FAIL) return SSB_FAIL;
673     if (rc == SSB_DONE) try_next = FALSE; else
674     {
675     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
676     tcode += 1 + LINK_SIZE;
677     }
678     break;
679    
680     /* If we hit ALT or KET, it means we haven't found anything mandatory in
681     this branch, though we might have found something optional. For ALT, we
682     continue with the next alternative, but we have to arrange that the final
683     result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
684     return SSB_CONTINUE: if this is the top level, that indicates failure,
685     but after a nested subpattern, it causes scanning to continue. */
686    
687     case OP_ALT:
688     yield = SSB_CONTINUE;
689     try_next = FALSE;
690     break;
691    
692     case OP_KET:
693     case OP_KETRMAX:
694     case OP_KETRMIN:
695     return SSB_CONTINUE;
696    
697 nigel 77 /* Skip over callout */
698    
699     case OP_CALLOUT:
700     tcode += 2 + 2*LINK_SIZE;
701     break;
702    
703     /* Skip over lookbehind and negative lookahead assertions */
704    
705     case OP_ASSERT_NOT:
706     case OP_ASSERTBACK:
707     case OP_ASSERTBACK_NOT:
708     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
709 nigel 93 tcode += 1 + LINK_SIZE;
710 nigel 77 break;
711    
712     /* BRAZERO does the bracket, but carries on. */
713    
714     case OP_BRAZERO:
715     case OP_BRAMINZERO:
716 nigel 93 if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
717     return SSB_FAIL;
718 nigel 91 /* =========================================================================
719     See the comment at the head of this function concerning the next line,
720     which was an old fudge for the benefit of OS/2.
721 nigel 77 dummy = 1;
722 nigel 91 ========================================================================= */
723 nigel 77 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
724 nigel 93 tcode += 1 + LINK_SIZE;
725 nigel 77 break;
726    
727 ph10 335 /* SKIPZERO skips the bracket. */
728    
729     case OP_SKIPZERO:
730 ph10 358 tcode++;
731 ph10 335 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
732     tcode += 1 + LINK_SIZE;
733     break;
734    
735 nigel 77 /* Single-char * or ? sets the bit and tries the next item */
736    
737     case OP_STAR:
738     case OP_MINSTAR:
739 nigel 93 case OP_POSSTAR:
740 nigel 77 case OP_QUERY:
741     case OP_MINQUERY:
742 nigel 93 case OP_POSQUERY:
743 ph10 520 tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
744 nigel 77 break;
745    
746 ph10 602 case OP_STARI:
747     case OP_MINSTARI:
748     case OP_POSSTARI:
749     case OP_QUERYI:
750     case OP_MINQUERYI:
751     case OP_POSQUERYI:
752     tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
753     break;
754    
755 nigel 77 /* Single-char upto sets the bit and tries the next */
756    
757     case OP_UPTO:
758     case OP_MINUPTO:
759 nigel 93 case OP_POSUPTO:
760 ph10 520 tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
761 nigel 77 break;
762    
763 ph10 602 case OP_UPTOI:
764     case OP_MINUPTOI:
765     case OP_POSUPTOI:
766     tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);
767     break;
768    
769 nigel 77 /* At least one single char sets the bit and stops */
770    
771 ph10 602 case OP_EXACT:
772 nigel 77 tcode += 2;
773 ph10 602 /* Fall through */
774 nigel 77 case OP_CHAR:
775     case OP_PLUS:
776     case OP_MINPLUS:
777 nigel 93 case OP_POSPLUS:
778 ph10 520 (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
779 nigel 77 try_next = FALSE;
780     break;
781 ph10 535
782 ph10 602 case OP_CHARI:
783     case OP_PLUSI:
784     case OP_MINPLUSI:
785     case OP_POSPLUSI:
786     (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
787     try_next = FALSE;
788     break;
789    
790 ph10 535 /* Special spacing and line-terminating items. These recognize specific
791     lists of characters. The difference between VSPACE and ANYNL is that the
792     latter can match the two-character CRLF sequence, but that is not
793     relevant for finding the first character, so their code here is
794 ph10 524 identical. */
795 ph10 535
796 ph10 524 case OP_HSPACE:
797     SET_BIT(0x09);
798     SET_BIT(0x20);
799     if (utf8)
800 ph10 535 {
801 ph10 545 SET_BIT(0xC2); /* For U+00A0 */
802 ph10 524 SET_BIT(0xE1); /* For U+1680, U+180E */
803     SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
804 ph10 535 SET_BIT(0xE3); /* For U+3000 */
805 ph10 524 }
806 ph10 538 else SET_BIT(0xA0);
807 ph10 524 try_next = FALSE;
808 ph10 535 break;
809 nigel 77
810 ph10 535 case OP_ANYNL:
811 ph10 524 case OP_VSPACE:
812 ph10 535 SET_BIT(0x0A);
813     SET_BIT(0x0B);
814     SET_BIT(0x0C);
815     SET_BIT(0x0D);
816 ph10 545 if (utf8)
817     {
818     SET_BIT(0xC2); /* For U+0085 */
819 ph10 538 SET_BIT(0xE2); /* For U+2028, U+2029 */
820 ph10 545 }
821 ph10 538 else SET_BIT(0x85);
822 ph10 524 try_next = FALSE;
823 ph10 535 break;
824 ph10 524
825 ph10 535 /* Single character types set the bits and stop. Note that if PCRE_UCP
826     is set, we do not see these op codes because \d etc are converted to
827 ph10 545 properties. Therefore, these apply in the case when only characters less
828 ph10 539 than 256 are recognized to match the types. */
829 nigel 77
830     case OP_NOT_DIGIT:
831 ph10 539 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
832 nigel 77 try_next = FALSE;
833     break;
834    
835     case OP_DIGIT:
836 ph10 539 set_type_bits(start_bits, cbit_digit, table_limit, cd);
837 nigel 77 try_next = FALSE;
838     break;
839    
840 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
841 ph10 539 ensure it is set as not whitespace. */
842 nigel 91
843 nigel 77 case OP_NOT_WHITESPACE:
844 ph10 539 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
845     start_bits[1] |= 0x08;
846 nigel 77 try_next = FALSE;
847     break;
848    
849 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
850 ph10 539 not set it from the table. */
851 nigel 91
852 nigel 77 case OP_WHITESPACE:
853 ph10 539 c = start_bits[1]; /* Save in case it was already set */
854     set_type_bits(start_bits, cbit_space, table_limit, cd);
855     start_bits[1] = (start_bits[1] & ~0x08) | c;
856 nigel 77 try_next = FALSE;
857     break;
858    
859     case OP_NOT_WORDCHAR:
860 ph10 539 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
861 nigel 77 try_next = FALSE;
862     break;
863    
864     case OP_WORDCHAR:
865 ph10 539 set_type_bits(start_bits, cbit_word, table_limit, cd);
866 nigel 77 try_next = FALSE;
867     break;
868 ph10 545
869 nigel 77 /* One or more character type fudges the pointer and restarts, knowing
870     it will hit a single character type and stop there. */
871    
872     case OP_TYPEPLUS:
873     case OP_TYPEMINPLUS:
874 ph10 535 case OP_TYPEPOSPLUS:
875 nigel 77 tcode++;
876     break;
877    
878     case OP_TYPEEXACT:
879     tcode += 3;
880     break;
881    
882     /* Zero or more repeats of character types set the bits and then
883     try again. */
884    
885     case OP_TYPEUPTO:
886     case OP_TYPEMINUPTO:
887 nigel 93 case OP_TYPEPOSUPTO:
888 nigel 77 tcode += 2; /* Fall through */
889    
890     case OP_TYPESTAR:
891     case OP_TYPEMINSTAR:
892 nigel 93 case OP_TYPEPOSSTAR:
893 nigel 77 case OP_TYPEQUERY:
894     case OP_TYPEMINQUERY:
895 nigel 93 case OP_TYPEPOSQUERY:
896 nigel 77 switch(tcode[1])
897     {
898 ph10 523 default:
899 nigel 77 case OP_ANY:
900 ph10 345 case OP_ALLANY:
901 nigel 93 return SSB_FAIL;
902 ph10 535
903 ph10 524 case OP_HSPACE:
904     SET_BIT(0x09);
905     SET_BIT(0x20);
906     if (utf8)
907 ph10 535 {
908 ph10 545 SET_BIT(0xC2); /* For U+00A0 */
909 ph10 524 SET_BIT(0xE1); /* For U+1680, U+180E */
910     SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
911 ph10 535 SET_BIT(0xE3); /* For U+3000 */
912 ph10 524 }
913 ph10 538 else SET_BIT(0xA0);
914 ph10 535 break;
915    
916     case OP_ANYNL:
917 ph10 524 case OP_VSPACE:
918 ph10 535 SET_BIT(0x0A);
919     SET_BIT(0x0B);
920     SET_BIT(0x0C);
921     SET_BIT(0x0D);
922 ph10 545 if (utf8)
923 ph10 538 {
924 ph10 545 SET_BIT(0xC2); /* For U+0085 */
925 ph10 538 SET_BIT(0xE2); /* For U+2028, U+2029 */
926 ph10 545 }
927 ph10 538 else SET_BIT(0x85);
928 ph10 535 break;
929    
930 nigel 77 case OP_NOT_DIGIT:
931 ph10 539 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
932 nigel 77 break;
933    
934     case OP_DIGIT:
935 ph10 539 set_type_bits(start_bits, cbit_digit, table_limit, cd);
936 nigel 77 break;
937    
938 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
939 ph10 539 ensure it gets set as not whitespace. */
940 nigel 91
941 nigel 77 case OP_NOT_WHITESPACE:
942 ph10 539 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
943 ph10 545 start_bits[1] |= 0x08;
944 nigel 77 break;
945    
946 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
947 ph10 539 avoid setting it. */
948 nigel 91
949 nigel 77 case OP_WHITESPACE:
950 ph10 539 c = start_bits[1]; /* Save in case it was already set */
951     set_type_bits(start_bits, cbit_space, table_limit, cd);
952     start_bits[1] = (start_bits[1] & ~0x08) | c;
953 nigel 77 break;
954    
955     case OP_NOT_WORDCHAR:
956 ph10 539 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
957 nigel 77 break;
958    
959     case OP_WORDCHAR:
960 ph10 539 set_type_bits(start_bits, cbit_word, table_limit, cd);
961 nigel 77 break;
962     }
963    
964     tcode += 2;
965     break;
966    
967     /* Character class where all the information is in a bit map: set the
968     bits and either carry on or not, according to the repeat count. If it was
969     a negative class, and we are operating with UTF-8 characters, any byte
970     with a value >= 0xc4 is a potentially valid starter because it starts a
971     character with a value > 255. */
972    
973     case OP_NCLASS:
974 ph10 111 #ifdef SUPPORT_UTF8
975 nigel 77 if (utf8)
976     {
977     start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
978     memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
979     }
980 ph10 111 #endif
981 nigel 77 /* Fall through */
982    
983     case OP_CLASS:
984     {
985     tcode++;
986    
987     /* In UTF-8 mode, the bits in a bit map correspond to character
988     values, not to byte values. However, the bit map we are constructing is
989     for byte values. So we have to do a conversion for characters whose
990     value is > 127. In fact, there are only two possible starting bytes for
991     characters in the range 128 - 255. */
992    
993 ph10 107 #ifdef SUPPORT_UTF8
994 nigel 77 if (utf8)
995     {
996     for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
997     for (c = 128; c < 256; c++)
998     {
999     if ((tcode[c/8] && (1 << (c&7))) != 0)
1000     {
1001     int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1002     start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1003     c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1004     }
1005     }
1006     }
1007    
1008     /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1009    
1010     else
1011 ph10 111 #endif
1012 nigel 77 {
1013     for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
1014     }
1015    
1016     /* Advance past the bit map, and act on what follows */
1017    
1018     tcode += 32;
1019     switch (*tcode)
1020     {
1021     case OP_CRSTAR:
1022     case OP_CRMINSTAR:
1023     case OP_CRQUERY:
1024     case OP_CRMINQUERY:
1025     tcode++;
1026     break;
1027    
1028     case OP_CRRANGE:
1029     case OP_CRMINRANGE:
1030     if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
1031     else try_next = FALSE;
1032     break;
1033    
1034     default:
1035     try_next = FALSE;
1036     break;
1037     }
1038     }
1039     break; /* End of bitmap class handling */
1040    
1041     } /* End of switch */
1042     } /* End of try_next loop */
1043    
1044     code += GET(code, 1); /* Advance to next branch */
1045     }
1046     while (*code == OP_ALT);
1047 nigel 93 return yield;
1048 nigel 77 }
1049    
1050    
1051    
1052     /*************************************************
1053     * Study a compiled expression *
1054     *************************************************/
1055    
1056     /* This function is handed a compiled expression that it must study to produce
1057     information that will speed up the matching. It returns a pcre_extra block
1058     which then gets handed back to pcre_exec().
1059    
1060     Arguments:
1061     re points to the compiled expression
1062     options contains option bits
1063     errorptr points to where to place error messages;
1064     set NULL unless error
1065    
1066     Returns: pointer to a pcre_extra block, with study_data filled in and the
1067 ph10 455 appropriate flags set;
1068 nigel 77 NULL on error or if no optimization possible
1069     */
1070    
1071 ph10 359 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1072 nigel 77 pcre_study(const pcre *external_re, int options, const char **errorptr)
1073     {
1074 ph10 455 int min;
1075     BOOL bits_set = FALSE;
1076 nigel 77 uschar start_bits[32];
1077     pcre_extra *extra;
1078     pcre_study_data *study;
1079     const uschar *tables;
1080 nigel 91 uschar *code;
1081     compile_data compile_block;
1082 nigel 77 const real_pcre *re = (const real_pcre *)external_re;
1083    
1084     *errorptr = NULL;
1085    
1086     if (re == NULL || re->magic_number != MAGIC_NUMBER)
1087     {
1088     *errorptr = "argument is not a compiled regular expression";
1089     return NULL;
1090     }
1091    
1092     if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1093     {
1094     *errorptr = "unknown or incorrect option bit(s) set";
1095     return NULL;
1096     }
1097    
1098 nigel 91 code = (uschar *)re + re->name_table_offset +
1099     (re->name_count * re->name_entry_size);
1100    
1101 nigel 77 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1102 ph10 461 a multiline pattern that matches only at "line starts", there is no point in
1103 ph10 455 seeking a list of starting bytes. */
1104 nigel 77
1105 ph10 455 if ((re->options & PCRE_ANCHORED) == 0 &&
1106     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1107     {
1108     /* Set the character tables in the block that is passed around */
1109 ph10 461
1110 ph10 455 tables = re->tables;
1111     if (tables == NULL)
1112     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1113     (void *)(&tables));
1114 ph10 461
1115 ph10 455 compile_block.lcc = tables + lcc_offset;
1116     compile_block.fcc = tables + fcc_offset;
1117     compile_block.cbits = tables + cbits_offset;
1118     compile_block.ctypes = tables + ctypes_offset;
1119 ph10 461
1120 ph10 455 /* See if we can find a fixed set of initial characters for the pattern. */
1121 ph10 461
1122 ph10 455 memset(start_bits, 0, 32 * sizeof(uschar));
1123 ph10 461 bits_set = set_start_bits(code, start_bits,
1124     (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
1125 ph10 455 &compile_block) == SSB_DONE;
1126     }
1127 ph10 461
1128 ph10 455 /* Find the minimum length of subject string. */
1129 nigel 77
1130 ph10 455 min = find_minlength(code, code, re->options);
1131 nigel 77
1132 ph10 455 /* Return NULL if no optimization is possible. */
1133 nigel 77
1134 ph10 455 if (!bits_set && min < 0) return NULL;
1135 nigel 77
1136     /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
1137     the latter, which is pointed to by the former, which may also get additional
1138     data set later by the calling program. At the moment, the size of
1139     pcre_study_data is fixed. We nevertheless save it in a field for returning via
1140     the pcre_fullinfo() function so that if it becomes variable in the future, we
1141     don't have to change that code. */
1142    
1143     extra = (pcre_extra *)(pcre_malloc)
1144     (sizeof(pcre_extra) + sizeof(pcre_study_data));
1145    
1146     if (extra == NULL)
1147     {
1148     *errorptr = "failed to get memory";
1149     return NULL;
1150     }
1151    
1152     study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
1153     extra->flags = PCRE_EXTRA_STUDY_DATA;
1154     extra->study_data = study;
1155    
1156     study->size = sizeof(pcre_study_data);
1157 ph10 455 study->flags = 0;
1158 nigel 77
1159 ph10 455 if (bits_set)
1160     {
1161     study->flags |= PCRE_STUDY_MAPPED;
1162     memcpy(study->start_bits, start_bits, sizeof(start_bits));
1163     }
1164 ph10 461
1165 ph10 455 if (min >= 0)
1166     {
1167     study->flags |= PCRE_STUDY_MINLEN;
1168     study->minlength = min;
1169 ph10 461 }
1170 ph10 455
1171 nigel 77 return extra;
1172     }
1173    
1174     /* End of pcre_study.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12