/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 335 - (hide annotations) (download)
Sat Apr 12 14:36:14 2008 UTC (6 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 17910 byte(s)
Do not discard subpatterns with {0} quantifiers, as they may be called as 
subroutines.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 305 Copyright (c) 1997-2008 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_study(), along with local
42     supporting functions. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 77 #include "pcre_internal.h"
50    
51    
52 nigel 93 /* Returns from set_start_bits() */
53    
54     enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
55    
56    
57 nigel 77 /*************************************************
58     * Set a bit and maybe its alternate case *
59     *************************************************/
60    
61     /* Given a character, set its bit in the table, and also the bit for the other
62     version of a letter if we are caseless.
63    
64     Arguments:
65     start_bits points to the bit map
66     c is the character
67     caseless the caseless flag
68     cd the block with char table pointers
69    
70     Returns: nothing
71     */
72    
73     static void
74     set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
75     {
76     start_bits[c/8] |= (1 << (c&7));
77     if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
78     start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
79     }
80    
81    
82    
83     /*************************************************
84 nigel 93 * Create bitmap of starting bytes *
85 nigel 77 *************************************************/
86    
87 nigel 93 /* This function scans a compiled unanchored expression recursively and
88     attempts to build a bitmap of the set of possible starting bytes. As time goes
89     by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
90     useful for parenthesized groups in patterns such as (a*)b where the group
91     provides some optional starting bytes but scanning must continue at the outer
92     level to find at least one mandatory byte. At the outermost level, this
93     function fails unless the result is SSB_DONE.
94 nigel 77
95     Arguments:
96     code points to an expression
97     start_bits points to a 32-byte table, initialized to 0
98     caseless the current state of the caseless flag
99     utf8 TRUE if in UTF-8 mode
100     cd the block with char table pointers
101    
102 nigel 93 Returns: SSB_FAIL => Failed to find any starting bytes
103     SSB_DONE => Found mandatory starting bytes
104     SSB_CONTINUE => Found optional starting bytes
105 nigel 77 */
106    
107 nigel 93 static int
108 nigel 77 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
109     BOOL utf8, compile_data *cd)
110     {
111     register int c;
112 nigel 93 int yield = SSB_DONE;
113 nigel 77
114 nigel 91 #if 0
115     /* ========================================================================= */
116     /* The following comment and code was inserted in January 1999. In May 2006,
117     when it was observed to cause compiler warnings about unused values, I took it
118     out again. If anybody is still using OS/2, they will have to put it back
119     manually. */
120    
121 nigel 77 /* This next statement and the later reference to dummy are here in order to
122     trick the optimizer of the IBM C compiler for OS/2 into generating correct
123     code. Apparently IBM isn't going to fix the problem, and we would rather not
124     disable optimization (in this module it actually makes a big difference, and
125     the pcre module can use all the optimization it can get). */
126    
127     volatile int dummy;
128 nigel 91 /* ========================================================================= */
129     #endif
130 nigel 77
131     do
132     {
133 nigel 93 const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
134 nigel 77 BOOL try_next = TRUE;
135    
136 nigel 93 while (try_next) /* Loop for items in this branch */
137 nigel 77 {
138 nigel 93 int rc;
139     switch(*tcode)
140 nigel 77 {
141 nigel 93 /* Fail if we reach something we don't understand */
142 nigel 77
143     default:
144 nigel 93 return SSB_FAIL;
145 nigel 77
146 nigel 93 /* If we hit a bracket or a positive lookahead assertion, recurse to set
147     bits from within the subpattern. If it can't find anything, we have to
148     give up. If it finds some mandatory character(s), we are done for this
149     branch. Otherwise, carry on scanning after the subpattern. */
150    
151     case OP_BRA:
152     case OP_SBRA:
153     case OP_CBRA:
154     case OP_SCBRA:
155     case OP_ONCE:
156     case OP_ASSERT:
157     rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
158     if (rc == SSB_FAIL) return SSB_FAIL;
159     if (rc == SSB_DONE) try_next = FALSE; else
160     {
161     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
162     tcode += 1 + LINK_SIZE;
163     }
164     break;
165    
166     /* If we hit ALT or KET, it means we haven't found anything mandatory in
167     this branch, though we might have found something optional. For ALT, we
168     continue with the next alternative, but we have to arrange that the final
169     result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
170     return SSB_CONTINUE: if this is the top level, that indicates failure,
171     but after a nested subpattern, it causes scanning to continue. */
172    
173     case OP_ALT:
174     yield = SSB_CONTINUE;
175     try_next = FALSE;
176     break;
177    
178     case OP_KET:
179     case OP_KETRMAX:
180     case OP_KETRMIN:
181     return SSB_CONTINUE;
182    
183 nigel 77 /* Skip over callout */
184    
185     case OP_CALLOUT:
186     tcode += 2 + 2*LINK_SIZE;
187     break;
188    
189     /* Skip over lookbehind and negative lookahead assertions */
190    
191     case OP_ASSERT_NOT:
192     case OP_ASSERTBACK:
193     case OP_ASSERTBACK_NOT:
194     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
195 nigel 93 tcode += 1 + LINK_SIZE;
196 nigel 77 break;
197    
198     /* Skip over an option setting, changing the caseless flag */
199    
200     case OP_OPT:
201     caseless = (tcode[1] & PCRE_CASELESS) != 0;
202     tcode += 2;
203     break;
204    
205     /* BRAZERO does the bracket, but carries on. */
206    
207     case OP_BRAZERO:
208     case OP_BRAMINZERO:
209 nigel 93 if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
210     return SSB_FAIL;
211 nigel 91 /* =========================================================================
212     See the comment at the head of this function concerning the next line,
213     which was an old fudge for the benefit of OS/2.
214 nigel 77 dummy = 1;
215 nigel 91 ========================================================================= */
216 nigel 77 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
217 nigel 93 tcode += 1 + LINK_SIZE;
218 nigel 77 break;
219    
220 ph10 335 /* SKIPZERO skips the bracket. */
221    
222     case OP_SKIPZERO:
223     do tcode += GET(tcode,1); while (*tcode == OP_ALT);
224     tcode += 1 + LINK_SIZE;
225     break;
226    
227 nigel 77 /* Single-char * or ? sets the bit and tries the next item */
228    
229     case OP_STAR:
230     case OP_MINSTAR:
231 nigel 93 case OP_POSSTAR:
232 nigel 77 case OP_QUERY:
233     case OP_MINQUERY:
234 nigel 93 case OP_POSQUERY:
235 nigel 77 set_bit(start_bits, tcode[1], caseless, cd);
236     tcode += 2;
237     #ifdef SUPPORT_UTF8
238 nigel 93 if (utf8 && tcode[-1] >= 0xc0)
239     tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
240 nigel 77 #endif
241     break;
242    
243     /* Single-char upto sets the bit and tries the next */
244    
245     case OP_UPTO:
246     case OP_MINUPTO:
247 nigel 93 case OP_POSUPTO:
248 nigel 77 set_bit(start_bits, tcode[3], caseless, cd);
249     tcode += 4;
250     #ifdef SUPPORT_UTF8
251 nigel 93 if (utf8 && tcode[-1] >= 0xc0)
252     tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
253 nigel 77 #endif
254     break;
255    
256     /* At least one single char sets the bit and stops */
257    
258     case OP_EXACT: /* Fall through */
259     tcode += 2;
260    
261     case OP_CHAR:
262     case OP_CHARNC:
263     case OP_PLUS:
264     case OP_MINPLUS:
265 nigel 93 case OP_POSPLUS:
266 nigel 77 set_bit(start_bits, tcode[1], caseless, cd);
267     try_next = FALSE;
268     break;
269    
270     /* Single character type sets the bits and stops */
271    
272     case OP_NOT_DIGIT:
273     for (c = 0; c < 32; c++)
274     start_bits[c] |= ~cd->cbits[c+cbit_digit];
275     try_next = FALSE;
276     break;
277    
278     case OP_DIGIT:
279     for (c = 0; c < 32; c++)
280     start_bits[c] |= cd->cbits[c+cbit_digit];
281     try_next = FALSE;
282     break;
283    
284 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
285     discard it. */
286    
287 nigel 77 case OP_NOT_WHITESPACE:
288     for (c = 0; c < 32; c++)
289 nigel 91 {
290     int d = cd->cbits[c+cbit_space];
291     if (c == 1) d &= ~0x08;
292     start_bits[c] |= ~d;
293     }
294 nigel 77 try_next = FALSE;
295     break;
296    
297 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
298     discard it. */
299    
300 nigel 77 case OP_WHITESPACE:
301     for (c = 0; c < 32; c++)
302 nigel 91 {
303     int d = cd->cbits[c+cbit_space];
304     if (c == 1) d &= ~0x08;
305     start_bits[c] |= d;
306     }
307 nigel 77 try_next = FALSE;
308     break;
309    
310     case OP_NOT_WORDCHAR:
311     for (c = 0; c < 32; c++)
312     start_bits[c] |= ~cd->cbits[c+cbit_word];
313     try_next = FALSE;
314     break;
315    
316     case OP_WORDCHAR:
317     for (c = 0; c < 32; c++)
318     start_bits[c] |= cd->cbits[c+cbit_word];
319     try_next = FALSE;
320     break;
321    
322     /* One or more character type fudges the pointer and restarts, knowing
323     it will hit a single character type and stop there. */
324    
325     case OP_TYPEPLUS:
326     case OP_TYPEMINPLUS:
327     tcode++;
328     break;
329    
330     case OP_TYPEEXACT:
331     tcode += 3;
332     break;
333    
334     /* Zero or more repeats of character types set the bits and then
335     try again. */
336    
337     case OP_TYPEUPTO:
338     case OP_TYPEMINUPTO:
339 nigel 93 case OP_TYPEPOSUPTO:
340 nigel 77 tcode += 2; /* Fall through */
341    
342     case OP_TYPESTAR:
343     case OP_TYPEMINSTAR:
344 nigel 93 case OP_TYPEPOSSTAR:
345 nigel 77 case OP_TYPEQUERY:
346     case OP_TYPEMINQUERY:
347 nigel 93 case OP_TYPEPOSQUERY:
348 nigel 77 switch(tcode[1])
349     {
350     case OP_ANY:
351 nigel 93 return SSB_FAIL;
352 nigel 77
353     case OP_NOT_DIGIT:
354     for (c = 0; c < 32; c++)
355     start_bits[c] |= ~cd->cbits[c+cbit_digit];
356     break;
357    
358     case OP_DIGIT:
359     for (c = 0; c < 32; c++)
360     start_bits[c] |= cd->cbits[c+cbit_digit];
361     break;
362    
363 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
364     discard it. */
365    
366 nigel 77 case OP_NOT_WHITESPACE:
367     for (c = 0; c < 32; c++)
368 nigel 91 {
369     int d = cd->cbits[c+cbit_space];
370     if (c == 1) d &= ~0x08;
371     start_bits[c] |= ~d;
372     }
373 nigel 77 break;
374    
375 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
376     discard it. */
377    
378 nigel 77 case OP_WHITESPACE:
379     for (c = 0; c < 32; c++)
380 nigel 91 {
381     int d = cd->cbits[c+cbit_space];
382     if (c == 1) d &= ~0x08;
383     start_bits[c] |= d;
384     }
385 nigel 77 break;
386    
387     case OP_NOT_WORDCHAR:
388     for (c = 0; c < 32; c++)
389     start_bits[c] |= ~cd->cbits[c+cbit_word];
390     break;
391    
392     case OP_WORDCHAR:
393     for (c = 0; c < 32; c++)
394     start_bits[c] |= cd->cbits[c+cbit_word];
395     break;
396     }
397    
398     tcode += 2;
399     break;
400    
401     /* Character class where all the information is in a bit map: set the
402     bits and either carry on or not, according to the repeat count. If it was
403     a negative class, and we are operating with UTF-8 characters, any byte
404     with a value >= 0xc4 is a potentially valid starter because it starts a
405     character with a value > 255. */
406    
407     case OP_NCLASS:
408 ph10 111 #ifdef SUPPORT_UTF8
409 nigel 77 if (utf8)
410     {
411     start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
412     memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
413     }
414 ph10 111 #endif
415 nigel 77 /* Fall through */
416    
417     case OP_CLASS:
418     {
419     tcode++;
420    
421     /* In UTF-8 mode, the bits in a bit map correspond to character
422     values, not to byte values. However, the bit map we are constructing is
423     for byte values. So we have to do a conversion for characters whose
424     value is > 127. In fact, there are only two possible starting bytes for
425     characters in the range 128 - 255. */
426    
427 ph10 107 #ifdef SUPPORT_UTF8
428 nigel 77 if (utf8)
429     {
430     for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
431     for (c = 128; c < 256; c++)
432     {
433     if ((tcode[c/8] && (1 << (c&7))) != 0)
434     {
435     int d = (c >> 6) | 0xc0; /* Set bit for this starter */
436     start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
437     c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
438     }
439     }
440     }
441    
442     /* In non-UTF-8 mode, the two bit maps are completely compatible. */
443    
444     else
445 ph10 111 #endif
446 nigel 77 {
447     for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
448     }
449    
450     /* Advance past the bit map, and act on what follows */
451    
452     tcode += 32;
453     switch (*tcode)
454     {
455     case OP_CRSTAR:
456     case OP_CRMINSTAR:
457     case OP_CRQUERY:
458     case OP_CRMINQUERY:
459     tcode++;
460     break;
461    
462     case OP_CRRANGE:
463     case OP_CRMINRANGE:
464     if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
465     else try_next = FALSE;
466     break;
467    
468     default:
469     try_next = FALSE;
470     break;
471     }
472     }
473     break; /* End of bitmap class handling */
474    
475     } /* End of switch */
476     } /* End of try_next loop */
477    
478     code += GET(code, 1); /* Advance to next branch */
479     }
480     while (*code == OP_ALT);
481 nigel 93 return yield;
482 nigel 77 }
483    
484    
485    
486     /*************************************************
487     * Study a compiled expression *
488     *************************************************/
489    
490     /* This function is handed a compiled expression that it must study to produce
491     information that will speed up the matching. It returns a pcre_extra block
492     which then gets handed back to pcre_exec().
493    
494     Arguments:
495     re points to the compiled expression
496     options contains option bits
497     errorptr points to where to place error messages;
498     set NULL unless error
499    
500     Returns: pointer to a pcre_extra block, with study_data filled in and the
501     appropriate flag set;
502     NULL on error or if no optimization possible
503     */
504    
505 ph10 145 PCRE_EXP_DEFN pcre_extra *
506 nigel 77 pcre_study(const pcre *external_re, int options, const char **errorptr)
507     {
508     uschar start_bits[32];
509     pcre_extra *extra;
510     pcre_study_data *study;
511     const uschar *tables;
512 nigel 91 uschar *code;
513     compile_data compile_block;
514 nigel 77 const real_pcre *re = (const real_pcre *)external_re;
515    
516     *errorptr = NULL;
517    
518     if (re == NULL || re->magic_number != MAGIC_NUMBER)
519     {
520     *errorptr = "argument is not a compiled regular expression";
521     return NULL;
522     }
523    
524     if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
525     {
526     *errorptr = "unknown or incorrect option bit(s) set";
527     return NULL;
528     }
529    
530 nigel 91 code = (uschar *)re + re->name_table_offset +
531     (re->name_count * re->name_entry_size);
532    
533 nigel 77 /* For an anchored pattern, or an unanchored pattern that has a first char, or
534     a multiline pattern that matches only at "line starts", no further processing
535     at present. */
536    
537 ph10 230 if ((re->options & PCRE_ANCHORED) != 0 ||
538     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
539 nigel 77 return NULL;
540    
541     /* Set the character tables in the block that is passed around */
542    
543     tables = re->tables;
544     if (tables == NULL)
545     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
546     (void *)(&tables));
547    
548     compile_block.lcc = tables + lcc_offset;
549     compile_block.fcc = tables + fcc_offset;
550     compile_block.cbits = tables + cbits_offset;
551     compile_block.ctypes = tables + ctypes_offset;
552    
553     /* See if we can find a fixed set of initial characters for the pattern. */
554    
555     memset(start_bits, 0, 32 * sizeof(uschar));
556 nigel 93 if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
557     (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
558 nigel 77
559     /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
560     the latter, which is pointed to by the former, which may also get additional
561     data set later by the calling program. At the moment, the size of
562     pcre_study_data is fixed. We nevertheless save it in a field for returning via
563     the pcre_fullinfo() function so that if it becomes variable in the future, we
564     don't have to change that code. */
565    
566     extra = (pcre_extra *)(pcre_malloc)
567     (sizeof(pcre_extra) + sizeof(pcre_study_data));
568    
569     if (extra == NULL)
570     {
571     *errorptr = "failed to get memory";
572     return NULL;
573     }
574    
575     study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
576     extra->flags = PCRE_EXTRA_STUDY_DATA;
577     extra->study_data = study;
578    
579     study->size = sizeof(pcre_study_data);
580     study->options = PCRE_STUDY_MAPPED;
581     memcpy(study->start_bits, start_bits, sizeof(start_bits));
582    
583     return extra;
584     }
585    
586     /* End of pcre_study.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12