/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 345 - (hide annotations) (download)
Mon Apr 28 15:10:02 2008 UTC (6 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 17934 byte(s)
Tidies for the 7.7-RC1 distribution.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 305 Copyright (c) 1997-2008 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_study(), along with local
42     supporting functions. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 77 #include "pcre_internal.h"
50    
51    
52 nigel 93 /* Returns from set_start_bits() */
53    
54     enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
55    
56    
57 nigel 77 /*************************************************
58     * Set a bit and maybe its alternate case *
59     *************************************************/
60    
61     /* Given a character, set its bit in the table, and also the bit for the other
62     version of a letter if we are caseless.
63    
64     Arguments:
65     start_bits points to the bit map
66     c is the character
67     caseless the caseless flag
68     cd the block with char table pointers
69    
70     Returns: nothing
71     */
72    
73     static void
74     set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
75     {
76     start_bits[c/8] |= (1 << (c&7));
77     if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
78     start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
79     }
80    
81    
82    
83     /*************************************************
84 nigel 93 * Create bitmap of starting bytes *
85 nigel 77 *************************************************/
86    
87 nigel 93 /* This function scans a compiled unanchored expression recursively and
88     attempts to build a bitmap of the set of possible starting bytes. As time goes
89     by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
90     useful for parenthesized groups in patterns such as (a*)b where the group
91     provides some optional starting bytes but scanning must continue at the outer
92     level to find at least one mandatory byte. At the outermost level, this
93     function fails unless the result is SSB_DONE.
94 nigel 77
95     Arguments:
96     code points to an expression
97     start_bits points to a 32-byte table, initialized to 0
98     caseless the current state of the caseless flag
99     utf8 TRUE if in UTF-8 mode
100     cd the block with char table pointers
101    
102 nigel 93 Returns: SSB_FAIL => Failed to find any starting bytes
103     SSB_DONE => Found mandatory starting bytes
104     SSB_CONTINUE => Found optional starting bytes
105 nigel 77 */
106    
107 nigel 93 static int
108 nigel 77 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
109     BOOL utf8, compile_data *cd)
110     {
111     register int c;
112 nigel 93 int yield = SSB_DONE;
113 nigel 77
114 nigel 91 #if 0
115     /* ========================================================================= */
116     /* The following comment and code was inserted in January 1999. In May 2006,
117     when it was observed to cause compiler warnings about unused values, I took it
118     out again. If anybody is still using OS/2, they will have to put it back
119     manually. */
120    
121 nigel 77 /* This next statement and the later reference to dummy are here in order to
122     trick the optimizer of the IBM C compiler for OS/2 into generating correct
123     code. Apparently IBM isn't going to fix the problem, and we would rather not
124     disable optimization (in this module it actually makes a big difference, and
125     the pcre module can use all the optimization it can get). */
126    
127     volatile int dummy;
128 nigel 91 /* ========================================================================= */
129     #endif
130 nigel 77
131     do
132     {
133 nigel 93 const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
134 nigel 77 BOOL try_next = TRUE;
135    
136 nigel 93 while (try_next) /* Loop for items in this branch */
137 nigel 77 {
138 nigel 93 int rc;
139     switch(*tcode)
140 nigel 77 {
141 nigel 93 /* Fail if we reach something we don't understand */
142 nigel 77
143     default:
144 nigel 93 return SSB_FAIL;
145 nigel 77
146 nigel 93 /* If we hit a bracket or a positive lookahead assertion, recurse to set
147     bits from within the subpattern. If it can't find anything, we have to
148     give up. If it finds some mandatory character(s), we are done for this
149     branch. Otherwise, carry on scanning after the subpattern. */
150    
151     case OP_BRA:
152     case OP_SBRA:
153     case OP_CBRA:
154     case OP_SCBRA:
155     case OP_ONCE:
156     case OP_ASSERT:
157     rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
158     if (rc == SSB_FAIL) return SSB_FAIL;
159     if (rc == SSB_DONE) try_next = FALSE; else
160     {
161     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
162     tcode += 1 + LINK_SIZE;
163     }
164     break;
165    
166     /* If we hit ALT or KET, it means we haven't found anything mandatory in
167     this branch, though we might have found something optional. For ALT, we
168     continue with the next alternative, but we have to arrange that the final
169     result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
170     return SSB_CONTINUE: if this is the top level, that indicates failure,
171     but after a nested subpattern, it causes scanning to continue. */
172    
173     case OP_ALT:
174     yield = SSB_CONTINUE;
175     try_next = FALSE;
176     break;
177    
178     case OP_KET:
179     case OP_KETRMAX:
180     case OP_KETRMIN:
181     return SSB_CONTINUE;
182    
183 nigel 77 /* Skip over callout */
184    
185     case OP_CALLOUT:
186     tcode += 2 + 2*LINK_SIZE;
187     break;
188    
189     /* Skip over lookbehind and negative lookahead assertions */
190    
191     case OP_ASSERT_NOT:
192     case OP_ASSERTBACK:
193     case OP_ASSERTBACK_NOT:
194     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
195 nigel 93 tcode += 1 + LINK_SIZE;
196 nigel 77 break;
197    
198     /* Skip over an option setting, changing the caseless flag */
199    
200     case OP_OPT:
201     caseless = (tcode[1] & PCRE_CASELESS) != 0;
202     tcode += 2;
203     break;
204    
205     /* BRAZERO does the bracket, but carries on. */
206    
207     case OP_BRAZERO:
208     case OP_BRAMINZERO:
209 nigel 93 if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
210     return SSB_FAIL;
211 nigel 91 /* =========================================================================
212     See the comment at the head of this function concerning the next line,
213     which was an old fudge for the benefit of OS/2.
214 nigel 77 dummy = 1;
215 nigel 91 ========================================================================= */
216 nigel 77 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
217 nigel 93 tcode += 1 + LINK_SIZE;
218 nigel 77 break;
219    
220 ph10 335 /* SKIPZERO skips the bracket. */
221    
222     case OP_SKIPZERO:
223     do tcode += GET(tcode,1); while (*tcode == OP_ALT);
224     tcode += 1 + LINK_SIZE;
225     break;
226    
227 nigel 77 /* Single-char * or ? sets the bit and tries the next item */
228    
229     case OP_STAR:
230     case OP_MINSTAR:
231 nigel 93 case OP_POSSTAR:
232 nigel 77 case OP_QUERY:
233     case OP_MINQUERY:
234 nigel 93 case OP_POSQUERY:
235 nigel 77 set_bit(start_bits, tcode[1], caseless, cd);
236     tcode += 2;
237     #ifdef SUPPORT_UTF8
238 nigel 93 if (utf8 && tcode[-1] >= 0xc0)
239     tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
240 nigel 77 #endif
241     break;
242    
243     /* Single-char upto sets the bit and tries the next */
244    
245     case OP_UPTO:
246     case OP_MINUPTO:
247 nigel 93 case OP_POSUPTO:
248 nigel 77 set_bit(start_bits, tcode[3], caseless, cd);
249     tcode += 4;
250     #ifdef SUPPORT_UTF8
251 nigel 93 if (utf8 && tcode[-1] >= 0xc0)
252     tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
253 nigel 77 #endif
254     break;
255    
256     /* At least one single char sets the bit and stops */
257    
258     case OP_EXACT: /* Fall through */
259     tcode += 2;
260    
261     case OP_CHAR:
262     case OP_CHARNC:
263     case OP_PLUS:
264     case OP_MINPLUS:
265 nigel 93 case OP_POSPLUS:
266 nigel 77 set_bit(start_bits, tcode[1], caseless, cd);
267     try_next = FALSE;
268     break;
269    
270     /* Single character type sets the bits and stops */
271    
272     case OP_NOT_DIGIT:
273     for (c = 0; c < 32; c++)
274     start_bits[c] |= ~cd->cbits[c+cbit_digit];
275     try_next = FALSE;
276     break;
277    
278     case OP_DIGIT:
279     for (c = 0; c < 32; c++)
280     start_bits[c] |= cd->cbits[c+cbit_digit];
281     try_next = FALSE;
282     break;
283    
284 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
285     discard it. */
286    
287 nigel 77 case OP_NOT_WHITESPACE:
288     for (c = 0; c < 32; c++)
289 nigel 91 {
290     int d = cd->cbits[c+cbit_space];
291     if (c == 1) d &= ~0x08;
292     start_bits[c] |= ~d;
293     }
294 nigel 77 try_next = FALSE;
295     break;
296    
297 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
298     discard it. */
299    
300 nigel 77 case OP_WHITESPACE:
301     for (c = 0; c < 32; c++)
302 nigel 91 {
303     int d = cd->cbits[c+cbit_space];
304     if (c == 1) d &= ~0x08;
305     start_bits[c] |= d;
306     }
307 nigel 77 try_next = FALSE;
308     break;
309    
310     case OP_NOT_WORDCHAR:
311     for (c = 0; c < 32; c++)
312     start_bits[c] |= ~cd->cbits[c+cbit_word];
313     try_next = FALSE;
314     break;
315    
316     case OP_WORDCHAR:
317     for (c = 0; c < 32; c++)
318     start_bits[c] |= cd->cbits[c+cbit_word];
319     try_next = FALSE;
320     break;
321    
322     /* One or more character type fudges the pointer and restarts, knowing
323     it will hit a single character type and stop there. */
324    
325     case OP_TYPEPLUS:
326     case OP_TYPEMINPLUS:
327     tcode++;
328     break;
329    
330     case OP_TYPEEXACT:
331     tcode += 3;
332     break;
333    
334     /* Zero or more repeats of character types set the bits and then
335     try again. */
336    
337     case OP_TYPEUPTO:
338     case OP_TYPEMINUPTO:
339 nigel 93 case OP_TYPEPOSUPTO:
340 nigel 77 tcode += 2; /* Fall through */
341    
342     case OP_TYPESTAR:
343     case OP_TYPEMINSTAR:
344 nigel 93 case OP_TYPEPOSSTAR:
345 nigel 77 case OP_TYPEQUERY:
346     case OP_TYPEMINQUERY:
347 nigel 93 case OP_TYPEPOSQUERY:
348 nigel 77 switch(tcode[1])
349     {
350     case OP_ANY:
351 ph10 345 case OP_ALLANY:
352 nigel 93 return SSB_FAIL;
353 nigel 77
354     case OP_NOT_DIGIT:
355     for (c = 0; c < 32; c++)
356     start_bits[c] |= ~cd->cbits[c+cbit_digit];
357     break;
358    
359     case OP_DIGIT:
360     for (c = 0; c < 32; c++)
361     start_bits[c] |= cd->cbits[c+cbit_digit];
362     break;
363    
364 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
365     discard it. */
366    
367 nigel 77 case OP_NOT_WHITESPACE:
368     for (c = 0; c < 32; c++)
369 nigel 91 {
370     int d = cd->cbits[c+cbit_space];
371     if (c == 1) d &= ~0x08;
372     start_bits[c] |= ~d;
373     }
374 nigel 77 break;
375    
376 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
377     discard it. */
378    
379 nigel 77 case OP_WHITESPACE:
380     for (c = 0; c < 32; c++)
381 nigel 91 {
382     int d = cd->cbits[c+cbit_space];
383     if (c == 1) d &= ~0x08;
384     start_bits[c] |= d;
385     }
386 nigel 77 break;
387    
388     case OP_NOT_WORDCHAR:
389     for (c = 0; c < 32; c++)
390     start_bits[c] |= ~cd->cbits[c+cbit_word];
391     break;
392    
393     case OP_WORDCHAR:
394     for (c = 0; c < 32; c++)
395     start_bits[c] |= cd->cbits[c+cbit_word];
396     break;
397     }
398    
399     tcode += 2;
400     break;
401    
402     /* Character class where all the information is in a bit map: set the
403     bits and either carry on or not, according to the repeat count. If it was
404     a negative class, and we are operating with UTF-8 characters, any byte
405     with a value >= 0xc4 is a potentially valid starter because it starts a
406     character with a value > 255. */
407    
408     case OP_NCLASS:
409 ph10 111 #ifdef SUPPORT_UTF8
410 nigel 77 if (utf8)
411     {
412     start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
413     memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
414     }
415 ph10 111 #endif
416 nigel 77 /* Fall through */
417    
418     case OP_CLASS:
419     {
420     tcode++;
421    
422     /* In UTF-8 mode, the bits in a bit map correspond to character
423     values, not to byte values. However, the bit map we are constructing is
424     for byte values. So we have to do a conversion for characters whose
425     value is > 127. In fact, there are only two possible starting bytes for
426     characters in the range 128 - 255. */
427    
428 ph10 107 #ifdef SUPPORT_UTF8
429 nigel 77 if (utf8)
430     {
431     for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
432     for (c = 128; c < 256; c++)
433     {
434     if ((tcode[c/8] && (1 << (c&7))) != 0)
435     {
436     int d = (c >> 6) | 0xc0; /* Set bit for this starter */
437     start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
438     c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
439     }
440     }
441     }
442    
443     /* In non-UTF-8 mode, the two bit maps are completely compatible. */
444    
445     else
446 ph10 111 #endif
447 nigel 77 {
448     for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
449     }
450    
451     /* Advance past the bit map, and act on what follows */
452    
453     tcode += 32;
454     switch (*tcode)
455     {
456     case OP_CRSTAR:
457     case OP_CRMINSTAR:
458     case OP_CRQUERY:
459     case OP_CRMINQUERY:
460     tcode++;
461     break;
462    
463     case OP_CRRANGE:
464     case OP_CRMINRANGE:
465     if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
466     else try_next = FALSE;
467     break;
468    
469     default:
470     try_next = FALSE;
471     break;
472     }
473     }
474     break; /* End of bitmap class handling */
475    
476     } /* End of switch */
477     } /* End of try_next loop */
478    
479     code += GET(code, 1); /* Advance to next branch */
480     }
481     while (*code == OP_ALT);
482 nigel 93 return yield;
483 nigel 77 }
484    
485    
486    
487     /*************************************************
488     * Study a compiled expression *
489     *************************************************/
490    
491     /* This function is handed a compiled expression that it must study to produce
492     information that will speed up the matching. It returns a pcre_extra block
493     which then gets handed back to pcre_exec().
494    
495     Arguments:
496     re points to the compiled expression
497     options contains option bits
498     errorptr points to where to place error messages;
499     set NULL unless error
500    
501     Returns: pointer to a pcre_extra block, with study_data filled in and the
502     appropriate flag set;
503     NULL on error or if no optimization possible
504     */
505    
506 ph10 145 PCRE_EXP_DEFN pcre_extra *
507 nigel 77 pcre_study(const pcre *external_re, int options, const char **errorptr)
508     {
509     uschar start_bits[32];
510     pcre_extra *extra;
511     pcre_study_data *study;
512     const uschar *tables;
513 nigel 91 uschar *code;
514     compile_data compile_block;
515 nigel 77 const real_pcre *re = (const real_pcre *)external_re;
516    
517     *errorptr = NULL;
518    
519     if (re == NULL || re->magic_number != MAGIC_NUMBER)
520     {
521     *errorptr = "argument is not a compiled regular expression";
522     return NULL;
523     }
524    
525     if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
526     {
527     *errorptr = "unknown or incorrect option bit(s) set";
528     return NULL;
529     }
530    
531 nigel 91 code = (uschar *)re + re->name_table_offset +
532     (re->name_count * re->name_entry_size);
533    
534 nigel 77 /* For an anchored pattern, or an unanchored pattern that has a first char, or
535     a multiline pattern that matches only at "line starts", no further processing
536     at present. */
537    
538 ph10 230 if ((re->options & PCRE_ANCHORED) != 0 ||
539     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
540 nigel 77 return NULL;
541    
542     /* Set the character tables in the block that is passed around */
543    
544     tables = re->tables;
545     if (tables == NULL)
546     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
547     (void *)(&tables));
548    
549     compile_block.lcc = tables + lcc_offset;
550     compile_block.fcc = tables + fcc_offset;
551     compile_block.cbits = tables + cbits_offset;
552     compile_block.ctypes = tables + ctypes_offset;
553    
554     /* See if we can find a fixed set of initial characters for the pattern. */
555    
556     memset(start_bits, 0, 32 * sizeof(uschar));
557 nigel 93 if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
558     (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
559 nigel 77
560     /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
561     the latter, which is pointed to by the former, which may also get additional
562     data set later by the calling program. At the moment, the size of
563     pcre_study_data is fixed. We nevertheless save it in a field for returning via
564     the pcre_fullinfo() function so that if it becomes variable in the future, we
565     don't have to change that code. */
566    
567     extra = (pcre_extra *)(pcre_malloc)
568     (sizeof(pcre_extra) + sizeof(pcre_study_data));
569    
570     if (extra == NULL)
571     {
572     *errorptr = "failed to get memory";
573     return NULL;
574     }
575    
576     study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
577     extra->flags = PCRE_EXTRA_STUDY_DATA;
578     extra->study_data = study;
579    
580     study->size = sizeof(pcre_study_data);
581     study->options = PCRE_STUDY_MAPPED;
582     memcpy(study->start_bits, start_bits, sizeof(start_bits));
583    
584     return extra;
585     }
586    
587     /* End of pcre_study.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12