/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 353 - (hide annotations) (download)
Mon Jul 7 15:44:24 2008 UTC (6 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 17950 byte(s)
Fix SKIPZERO bug in pcre_study().

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 305 Copyright (c) 1997-2008 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_study(), along with local
42     supporting functions. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 77 #include "pcre_internal.h"
50    
51    
52 nigel 93 /* Returns from set_start_bits() */
53    
54     enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
55    
56    
57 nigel 77 /*************************************************
58     * Set a bit and maybe its alternate case *
59     *************************************************/
60    
61     /* Given a character, set its bit in the table, and also the bit for the other
62     version of a letter if we are caseless.
63    
64     Arguments:
65     start_bits points to the bit map
66     c is the character
67     caseless the caseless flag
68     cd the block with char table pointers
69    
70     Returns: nothing
71     */
72    
73     static void
74     set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
75     {
76     start_bits[c/8] |= (1 << (c&7));
77     if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
78     start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
79     }
80    
81    
82    
83     /*************************************************
84 nigel 93 * Create bitmap of starting bytes *
85 nigel 77 *************************************************/
86    
87 nigel 93 /* This function scans a compiled unanchored expression recursively and
88     attempts to build a bitmap of the set of possible starting bytes. As time goes
89     by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
90     useful for parenthesized groups in patterns such as (a*)b where the group
91     provides some optional starting bytes but scanning must continue at the outer
92     level to find at least one mandatory byte. At the outermost level, this
93     function fails unless the result is SSB_DONE.
94 nigel 77
95     Arguments:
96     code points to an expression
97     start_bits points to a 32-byte table, initialized to 0
98     caseless the current state of the caseless flag
99     utf8 TRUE if in UTF-8 mode
100     cd the block with char table pointers
101    
102 nigel 93 Returns: SSB_FAIL => Failed to find any starting bytes
103     SSB_DONE => Found mandatory starting bytes
104     SSB_CONTINUE => Found optional starting bytes
105 nigel 77 */
106    
107 nigel 93 static int
108 nigel 77 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
109     BOOL utf8, compile_data *cd)
110     {
111     register int c;
112 nigel 93 int yield = SSB_DONE;
113 nigel 77
114 nigel 91 #if 0
115     /* ========================================================================= */
116     /* The following comment and code was inserted in January 1999. In May 2006,
117     when it was observed to cause compiler warnings about unused values, I took it
118     out again. If anybody is still using OS/2, they will have to put it back
119     manually. */
120    
121 nigel 77 /* This next statement and the later reference to dummy are here in order to
122     trick the optimizer of the IBM C compiler for OS/2 into generating correct
123     code. Apparently IBM isn't going to fix the problem, and we would rather not
124     disable optimization (in this module it actually makes a big difference, and
125     the pcre module can use all the optimization it can get). */
126    
127     volatile int dummy;
128 nigel 91 /* ========================================================================= */
129     #endif
130 nigel 77
131     do
132     {
133 nigel 93 const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
134 nigel 77 BOOL try_next = TRUE;
135    
136 nigel 93 while (try_next) /* Loop for items in this branch */
137 nigel 77 {
138 nigel 93 int rc;
139     switch(*tcode)
140 nigel 77 {
141 nigel 93 /* Fail if we reach something we don't understand */
142 nigel 77
143     default:
144 nigel 93 return SSB_FAIL;
145 nigel 77
146 nigel 93 /* If we hit a bracket or a positive lookahead assertion, recurse to set
147     bits from within the subpattern. If it can't find anything, we have to
148     give up. If it finds some mandatory character(s), we are done for this
149     branch. Otherwise, carry on scanning after the subpattern. */
150    
151     case OP_BRA:
152     case OP_SBRA:
153     case OP_CBRA:
154     case OP_SCBRA:
155     case OP_ONCE:
156     case OP_ASSERT:
157     rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
158     if (rc == SSB_FAIL) return SSB_FAIL;
159     if (rc == SSB_DONE) try_next = FALSE; else
160     {
161     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
162     tcode += 1 + LINK_SIZE;
163     }
164     break;
165    
166     /* If we hit ALT or KET, it means we haven't found anything mandatory in
167     this branch, though we might have found something optional. For ALT, we
168     continue with the next alternative, but we have to arrange that the final
169     result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
170     return SSB_CONTINUE: if this is the top level, that indicates failure,
171     but after a nested subpattern, it causes scanning to continue. */
172    
173     case OP_ALT:
174     yield = SSB_CONTINUE;
175     try_next = FALSE;
176     break;
177    
178     case OP_KET:
179     case OP_KETRMAX:
180     case OP_KETRMIN:
181     return SSB_CONTINUE;
182    
183 nigel 77 /* Skip over callout */
184    
185     case OP_CALLOUT:
186     tcode += 2 + 2*LINK_SIZE;
187     break;
188    
189     /* Skip over lookbehind and negative lookahead assertions */
190    
191     case OP_ASSERT_NOT:
192     case OP_ASSERTBACK:
193     case OP_ASSERTBACK_NOT:
194     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
195 nigel 93 tcode += 1 + LINK_SIZE;
196 nigel 77 break;
197    
198     /* Skip over an option setting, changing the caseless flag */
199    
200     case OP_OPT:
201     caseless = (tcode[1] & PCRE_CASELESS) != 0;
202     tcode += 2;
203     break;
204    
205     /* BRAZERO does the bracket, but carries on. */
206    
207     case OP_BRAZERO:
208     case OP_BRAMINZERO:
209 nigel 93 if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
210     return SSB_FAIL;
211 nigel 91 /* =========================================================================
212     See the comment at the head of this function concerning the next line,
213     which was an old fudge for the benefit of OS/2.
214 nigel 77 dummy = 1;
215 nigel 91 ========================================================================= */
216 nigel 77 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
217 nigel 93 tcode += 1 + LINK_SIZE;
218 nigel 77 break;
219    
220 ph10 335 /* SKIPZERO skips the bracket. */
221    
222     case OP_SKIPZERO:
223 ph10 353 tcode++;
224 ph10 335 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
225     tcode += 1 + LINK_SIZE;
226     break;
227    
228 nigel 77 /* Single-char * or ? sets the bit and tries the next item */
229    
230     case OP_STAR:
231     case OP_MINSTAR:
232 nigel 93 case OP_POSSTAR:
233 nigel 77 case OP_QUERY:
234     case OP_MINQUERY:
235 nigel 93 case OP_POSQUERY:
236 nigel 77 set_bit(start_bits, tcode[1], caseless, cd);
237     tcode += 2;
238     #ifdef SUPPORT_UTF8
239 nigel 93 if (utf8 && tcode[-1] >= 0xc0)
240     tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
241 nigel 77 #endif
242     break;
243    
244     /* Single-char upto sets the bit and tries the next */
245    
246     case OP_UPTO:
247     case OP_MINUPTO:
248 nigel 93 case OP_POSUPTO:
249 nigel 77 set_bit(start_bits, tcode[3], caseless, cd);
250     tcode += 4;
251     #ifdef SUPPORT_UTF8
252 nigel 93 if (utf8 && tcode[-1] >= 0xc0)
253     tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
254 nigel 77 #endif
255     break;
256    
257     /* At least one single char sets the bit and stops */
258    
259     case OP_EXACT: /* Fall through */
260     tcode += 2;
261    
262     case OP_CHAR:
263     case OP_CHARNC:
264     case OP_PLUS:
265     case OP_MINPLUS:
266 nigel 93 case OP_POSPLUS:
267 nigel 77 set_bit(start_bits, tcode[1], caseless, cd);
268     try_next = FALSE;
269     break;
270    
271     /* Single character type sets the bits and stops */
272    
273     case OP_NOT_DIGIT:
274     for (c = 0; c < 32; c++)
275     start_bits[c] |= ~cd->cbits[c+cbit_digit];
276     try_next = FALSE;
277     break;
278    
279     case OP_DIGIT:
280     for (c = 0; c < 32; c++)
281     start_bits[c] |= cd->cbits[c+cbit_digit];
282     try_next = FALSE;
283     break;
284    
285 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
286     discard it. */
287    
288 nigel 77 case OP_NOT_WHITESPACE:
289     for (c = 0; c < 32; c++)
290 nigel 91 {
291     int d = cd->cbits[c+cbit_space];
292     if (c == 1) d &= ~0x08;
293     start_bits[c] |= ~d;
294     }
295 nigel 77 try_next = FALSE;
296     break;
297    
298 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
299     discard it. */
300    
301 nigel 77 case OP_WHITESPACE:
302     for (c = 0; c < 32; c++)
303 nigel 91 {
304     int d = cd->cbits[c+cbit_space];
305     if (c == 1) d &= ~0x08;
306     start_bits[c] |= d;
307     }
308 nigel 77 try_next = FALSE;
309     break;
310    
311     case OP_NOT_WORDCHAR:
312     for (c = 0; c < 32; c++)
313     start_bits[c] |= ~cd->cbits[c+cbit_word];
314     try_next = FALSE;
315     break;
316    
317     case OP_WORDCHAR:
318     for (c = 0; c < 32; c++)
319     start_bits[c] |= cd->cbits[c+cbit_word];
320     try_next = FALSE;
321     break;
322    
323     /* One or more character type fudges the pointer and restarts, knowing
324     it will hit a single character type and stop there. */
325    
326     case OP_TYPEPLUS:
327     case OP_TYPEMINPLUS:
328     tcode++;
329     break;
330    
331     case OP_TYPEEXACT:
332     tcode += 3;
333     break;
334    
335     /* Zero or more repeats of character types set the bits and then
336     try again. */
337    
338     case OP_TYPEUPTO:
339     case OP_TYPEMINUPTO:
340 nigel 93 case OP_TYPEPOSUPTO:
341 nigel 77 tcode += 2; /* Fall through */
342    
343     case OP_TYPESTAR:
344     case OP_TYPEMINSTAR:
345 nigel 93 case OP_TYPEPOSSTAR:
346 nigel 77 case OP_TYPEQUERY:
347     case OP_TYPEMINQUERY:
348 nigel 93 case OP_TYPEPOSQUERY:
349 nigel 77 switch(tcode[1])
350     {
351     case OP_ANY:
352 ph10 345 case OP_ALLANY:
353 nigel 93 return SSB_FAIL;
354 nigel 77
355     case OP_NOT_DIGIT:
356     for (c = 0; c < 32; c++)
357     start_bits[c] |= ~cd->cbits[c+cbit_digit];
358     break;
359    
360     case OP_DIGIT:
361     for (c = 0; c < 32; c++)
362     start_bits[c] |= cd->cbits[c+cbit_digit];
363     break;
364    
365 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
366     discard it. */
367    
368 nigel 77 case OP_NOT_WHITESPACE:
369     for (c = 0; c < 32; c++)
370 nigel 91 {
371     int d = cd->cbits[c+cbit_space];
372     if (c == 1) d &= ~0x08;
373     start_bits[c] |= ~d;
374     }
375 nigel 77 break;
376    
377 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
378     discard it. */
379    
380 nigel 77 case OP_WHITESPACE:
381     for (c = 0; c < 32; c++)
382 nigel 91 {
383     int d = cd->cbits[c+cbit_space];
384     if (c == 1) d &= ~0x08;
385     start_bits[c] |= d;
386     }
387 nigel 77 break;
388    
389     case OP_NOT_WORDCHAR:
390     for (c = 0; c < 32; c++)
391     start_bits[c] |= ~cd->cbits[c+cbit_word];
392     break;
393    
394     case OP_WORDCHAR:
395     for (c = 0; c < 32; c++)
396     start_bits[c] |= cd->cbits[c+cbit_word];
397     break;
398     }
399    
400     tcode += 2;
401     break;
402    
403     /* Character class where all the information is in a bit map: set the
404     bits and either carry on or not, according to the repeat count. If it was
405     a negative class, and we are operating with UTF-8 characters, any byte
406     with a value >= 0xc4 is a potentially valid starter because it starts a
407     character with a value > 255. */
408    
409     case OP_NCLASS:
410 ph10 111 #ifdef SUPPORT_UTF8
411 nigel 77 if (utf8)
412     {
413     start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
414     memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
415     }
416 ph10 111 #endif
417 nigel 77 /* Fall through */
418    
419     case OP_CLASS:
420     {
421     tcode++;
422    
423     /* In UTF-8 mode, the bits in a bit map correspond to character
424     values, not to byte values. However, the bit map we are constructing is
425     for byte values. So we have to do a conversion for characters whose
426     value is > 127. In fact, there are only two possible starting bytes for
427     characters in the range 128 - 255. */
428    
429 ph10 107 #ifdef SUPPORT_UTF8
430 nigel 77 if (utf8)
431     {
432     for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
433     for (c = 128; c < 256; c++)
434     {
435     if ((tcode[c/8] && (1 << (c&7))) != 0)
436     {
437     int d = (c >> 6) | 0xc0; /* Set bit for this starter */
438     start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
439     c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
440     }
441     }
442     }
443    
444     /* In non-UTF-8 mode, the two bit maps are completely compatible. */
445    
446     else
447 ph10 111 #endif
448 nigel 77 {
449     for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
450     }
451    
452     /* Advance past the bit map, and act on what follows */
453    
454     tcode += 32;
455     switch (*tcode)
456     {
457     case OP_CRSTAR:
458     case OP_CRMINSTAR:
459     case OP_CRQUERY:
460     case OP_CRMINQUERY:
461     tcode++;
462     break;
463    
464     case OP_CRRANGE:
465     case OP_CRMINRANGE:
466     if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
467     else try_next = FALSE;
468     break;
469    
470     default:
471     try_next = FALSE;
472     break;
473     }
474     }
475     break; /* End of bitmap class handling */
476    
477     } /* End of switch */
478     } /* End of try_next loop */
479    
480     code += GET(code, 1); /* Advance to next branch */
481     }
482     while (*code == OP_ALT);
483 nigel 93 return yield;
484 nigel 77 }
485    
486    
487    
488     /*************************************************
489     * Study a compiled expression *
490     *************************************************/
491    
492     /* This function is handed a compiled expression that it must study to produce
493     information that will speed up the matching. It returns a pcre_extra block
494     which then gets handed back to pcre_exec().
495    
496     Arguments:
497     re points to the compiled expression
498     options contains option bits
499     errorptr points to where to place error messages;
500     set NULL unless error
501    
502     Returns: pointer to a pcre_extra block, with study_data filled in and the
503     appropriate flag set;
504     NULL on error or if no optimization possible
505     */
506    
507 ph10 145 PCRE_EXP_DEFN pcre_extra *
508 nigel 77 pcre_study(const pcre *external_re, int options, const char **errorptr)
509     {
510     uschar start_bits[32];
511     pcre_extra *extra;
512     pcre_study_data *study;
513     const uschar *tables;
514 nigel 91 uschar *code;
515     compile_data compile_block;
516 nigel 77 const real_pcre *re = (const real_pcre *)external_re;
517    
518     *errorptr = NULL;
519    
520     if (re == NULL || re->magic_number != MAGIC_NUMBER)
521     {
522     *errorptr = "argument is not a compiled regular expression";
523     return NULL;
524     }
525    
526     if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
527     {
528     *errorptr = "unknown or incorrect option bit(s) set";
529     return NULL;
530     }
531    
532 nigel 91 code = (uschar *)re + re->name_table_offset +
533     (re->name_count * re->name_entry_size);
534    
535 nigel 77 /* For an anchored pattern, or an unanchored pattern that has a first char, or
536     a multiline pattern that matches only at "line starts", no further processing
537     at present. */
538    
539 ph10 230 if ((re->options & PCRE_ANCHORED) != 0 ||
540     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
541 nigel 77 return NULL;
542    
543     /* Set the character tables in the block that is passed around */
544    
545     tables = re->tables;
546     if (tables == NULL)
547     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
548     (void *)(&tables));
549    
550     compile_block.lcc = tables + lcc_offset;
551     compile_block.fcc = tables + fcc_offset;
552     compile_block.cbits = tables + cbits_offset;
553     compile_block.ctypes = tables + ctypes_offset;
554    
555     /* See if we can find a fixed set of initial characters for the pattern. */
556    
557     memset(start_bits, 0, 32 * sizeof(uschar));
558 nigel 93 if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
559     (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
560 nigel 77
561     /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
562     the latter, which is pointed to by the former, which may also get additional
563     data set later by the calling program. At the moment, the size of
564     pcre_study_data is fixed. We nevertheless save it in a field for returning via
565     the pcre_fullinfo() function so that if it becomes variable in the future, we
566     don't have to change that code. */
567    
568     extra = (pcre_extra *)(pcre_malloc)
569     (sizeof(pcre_extra) + sizeof(pcre_study_data));
570    
571     if (extra == NULL)
572     {
573     *errorptr = "failed to get memory";
574     return NULL;
575     }
576    
577     study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
578     extra->flags = PCRE_EXTRA_STUDY_DATA;
579     extra->study_data = study;
580    
581     study->size = sizeof(pcre_study_data);
582     study->options = PCRE_STUDY_MAPPED;
583     memcpy(study->start_bits, start_bits, sizeof(start_bits));
584    
585     return extra;
586     }
587    
588     /* End of pcre_study.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12