/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 145 - (hide annotations) (download)
Wed Apr 4 14:06:52 2007 UTC (7 years ago) by ph10
File MIME type: text/plain
File size: 17668 byte(s)
Reworked all the WIN32 __declspec stuff in the hope of getting it right.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_study(), along with local
42     supporting functions. */
43    
44    
45     #include "pcre_internal.h"
46    
47    
48 nigel 93 /* Returns from set_start_bits() */
49    
50     enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
51    
52    
53 nigel 77 /*************************************************
54     * Set a bit and maybe its alternate case *
55     *************************************************/
56    
57     /* Given a character, set its bit in the table, and also the bit for the other
58     version of a letter if we are caseless.
59    
60     Arguments:
61     start_bits points to the bit map
62     c is the character
63     caseless the caseless flag
64     cd the block with char table pointers
65    
66     Returns: nothing
67     */
68    
69     static void
70     set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
71     {
72     start_bits[c/8] |= (1 << (c&7));
73     if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
74     start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
75     }
76    
77    
78    
79     /*************************************************
80 nigel 93 * Create bitmap of starting bytes *
81 nigel 77 *************************************************/
82    
83 nigel 93 /* This function scans a compiled unanchored expression recursively and
84     attempts to build a bitmap of the set of possible starting bytes. As time goes
85     by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
86     useful for parenthesized groups in patterns such as (a*)b where the group
87     provides some optional starting bytes but scanning must continue at the outer
88     level to find at least one mandatory byte. At the outermost level, this
89     function fails unless the result is SSB_DONE.
90 nigel 77
91     Arguments:
92     code points to an expression
93     start_bits points to a 32-byte table, initialized to 0
94     caseless the current state of the caseless flag
95     utf8 TRUE if in UTF-8 mode
96     cd the block with char table pointers
97    
98 nigel 93 Returns: SSB_FAIL => Failed to find any starting bytes
99     SSB_DONE => Found mandatory starting bytes
100     SSB_CONTINUE => Found optional starting bytes
101 nigel 77 */
102    
103 nigel 93 static int
104 nigel 77 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
105     BOOL utf8, compile_data *cd)
106     {
107     register int c;
108 nigel 93 int yield = SSB_DONE;
109 nigel 77
110 nigel 91 #if 0
111     /* ========================================================================= */
112     /* The following comment and code was inserted in January 1999. In May 2006,
113     when it was observed to cause compiler warnings about unused values, I took it
114     out again. If anybody is still using OS/2, they will have to put it back
115     manually. */
116    
117 nigel 77 /* This next statement and the later reference to dummy are here in order to
118     trick the optimizer of the IBM C compiler for OS/2 into generating correct
119     code. Apparently IBM isn't going to fix the problem, and we would rather not
120     disable optimization (in this module it actually makes a big difference, and
121     the pcre module can use all the optimization it can get). */
122    
123     volatile int dummy;
124 nigel 91 /* ========================================================================= */
125     #endif
126 nigel 77
127     do
128     {
129 nigel 93 const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
130 nigel 77 BOOL try_next = TRUE;
131    
132 nigel 93 while (try_next) /* Loop for items in this branch */
133 nigel 77 {
134 nigel 93 int rc;
135     switch(*tcode)
136 nigel 77 {
137 nigel 93 /* Fail if we reach something we don't understand */
138 nigel 77
139     default:
140 nigel 93 return SSB_FAIL;
141 nigel 77
142 nigel 93 /* If we hit a bracket or a positive lookahead assertion, recurse to set
143     bits from within the subpattern. If it can't find anything, we have to
144     give up. If it finds some mandatory character(s), we are done for this
145     branch. Otherwise, carry on scanning after the subpattern. */
146    
147     case OP_BRA:
148     case OP_SBRA:
149     case OP_CBRA:
150     case OP_SCBRA:
151     case OP_ONCE:
152     case OP_ASSERT:
153     rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
154     if (rc == SSB_FAIL) return SSB_FAIL;
155     if (rc == SSB_DONE) try_next = FALSE; else
156     {
157     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
158     tcode += 1 + LINK_SIZE;
159     }
160     break;
161    
162     /* If we hit ALT or KET, it means we haven't found anything mandatory in
163     this branch, though we might have found something optional. For ALT, we
164     continue with the next alternative, but we have to arrange that the final
165     result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
166     return SSB_CONTINUE: if this is the top level, that indicates failure,
167     but after a nested subpattern, it causes scanning to continue. */
168    
169     case OP_ALT:
170     yield = SSB_CONTINUE;
171     try_next = FALSE;
172     break;
173    
174     case OP_KET:
175     case OP_KETRMAX:
176     case OP_KETRMIN:
177     return SSB_CONTINUE;
178    
179 nigel 77 /* Skip over callout */
180    
181     case OP_CALLOUT:
182     tcode += 2 + 2*LINK_SIZE;
183     break;
184    
185     /* Skip over lookbehind and negative lookahead assertions */
186    
187     case OP_ASSERT_NOT:
188     case OP_ASSERTBACK:
189     case OP_ASSERTBACK_NOT:
190     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
191 nigel 93 tcode += 1 + LINK_SIZE;
192 nigel 77 break;
193    
194     /* Skip over an option setting, changing the caseless flag */
195    
196     case OP_OPT:
197     caseless = (tcode[1] & PCRE_CASELESS) != 0;
198     tcode += 2;
199     break;
200    
201     /* BRAZERO does the bracket, but carries on. */
202    
203     case OP_BRAZERO:
204     case OP_BRAMINZERO:
205 nigel 93 if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
206     return SSB_FAIL;
207 nigel 91 /* =========================================================================
208     See the comment at the head of this function concerning the next line,
209     which was an old fudge for the benefit of OS/2.
210 nigel 77 dummy = 1;
211 nigel 91 ========================================================================= */
212 nigel 77 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
213 nigel 93 tcode += 1 + LINK_SIZE;
214 nigel 77 break;
215    
216     /* Single-char * or ? sets the bit and tries the next item */
217    
218     case OP_STAR:
219     case OP_MINSTAR:
220 nigel 93 case OP_POSSTAR:
221 nigel 77 case OP_QUERY:
222     case OP_MINQUERY:
223 nigel 93 case OP_POSQUERY:
224 nigel 77 set_bit(start_bits, tcode[1], caseless, cd);
225     tcode += 2;
226     #ifdef SUPPORT_UTF8
227 nigel 93 if (utf8 && tcode[-1] >= 0xc0)
228     tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
229 nigel 77 #endif
230     break;
231    
232     /* Single-char upto sets the bit and tries the next */
233    
234     case OP_UPTO:
235     case OP_MINUPTO:
236 nigel 93 case OP_POSUPTO:
237 nigel 77 set_bit(start_bits, tcode[3], caseless, cd);
238     tcode += 4;
239     #ifdef SUPPORT_UTF8
240 nigel 93 if (utf8 && tcode[-1] >= 0xc0)
241     tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
242 nigel 77 #endif
243     break;
244    
245     /* At least one single char sets the bit and stops */
246    
247     case OP_EXACT: /* Fall through */
248     tcode += 2;
249    
250     case OP_CHAR:
251     case OP_CHARNC:
252     case OP_PLUS:
253     case OP_MINPLUS:
254 nigel 93 case OP_POSPLUS:
255 nigel 77 set_bit(start_bits, tcode[1], caseless, cd);
256     try_next = FALSE;
257     break;
258    
259     /* Single character type sets the bits and stops */
260    
261     case OP_NOT_DIGIT:
262     for (c = 0; c < 32; c++)
263     start_bits[c] |= ~cd->cbits[c+cbit_digit];
264     try_next = FALSE;
265     break;
266    
267     case OP_DIGIT:
268     for (c = 0; c < 32; c++)
269     start_bits[c] |= cd->cbits[c+cbit_digit];
270     try_next = FALSE;
271     break;
272    
273 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
274     discard it. */
275    
276 nigel 77 case OP_NOT_WHITESPACE:
277     for (c = 0; c < 32; c++)
278 nigel 91 {
279     int d = cd->cbits[c+cbit_space];
280     if (c == 1) d &= ~0x08;
281     start_bits[c] |= ~d;
282     }
283 nigel 77 try_next = FALSE;
284     break;
285    
286 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
287     discard it. */
288    
289 nigel 77 case OP_WHITESPACE:
290     for (c = 0; c < 32; c++)
291 nigel 91 {
292     int d = cd->cbits[c+cbit_space];
293     if (c == 1) d &= ~0x08;
294     start_bits[c] |= d;
295     }
296 nigel 77 try_next = FALSE;
297     break;
298    
299     case OP_NOT_WORDCHAR:
300     for (c = 0; c < 32; c++)
301     start_bits[c] |= ~cd->cbits[c+cbit_word];
302     try_next = FALSE;
303     break;
304    
305     case OP_WORDCHAR:
306     for (c = 0; c < 32; c++)
307     start_bits[c] |= cd->cbits[c+cbit_word];
308     try_next = FALSE;
309     break;
310    
311     /* One or more character type fudges the pointer and restarts, knowing
312     it will hit a single character type and stop there. */
313    
314     case OP_TYPEPLUS:
315     case OP_TYPEMINPLUS:
316     tcode++;
317     break;
318    
319     case OP_TYPEEXACT:
320     tcode += 3;
321     break;
322    
323     /* Zero or more repeats of character types set the bits and then
324     try again. */
325    
326     case OP_TYPEUPTO:
327     case OP_TYPEMINUPTO:
328 nigel 93 case OP_TYPEPOSUPTO:
329 nigel 77 tcode += 2; /* Fall through */
330    
331     case OP_TYPESTAR:
332     case OP_TYPEMINSTAR:
333 nigel 93 case OP_TYPEPOSSTAR:
334 nigel 77 case OP_TYPEQUERY:
335     case OP_TYPEMINQUERY:
336 nigel 93 case OP_TYPEPOSQUERY:
337 nigel 77 switch(tcode[1])
338     {
339     case OP_ANY:
340 nigel 93 return SSB_FAIL;
341 nigel 77
342     case OP_NOT_DIGIT:
343     for (c = 0; c < 32; c++)
344     start_bits[c] |= ~cd->cbits[c+cbit_digit];
345     break;
346    
347     case OP_DIGIT:
348     for (c = 0; c < 32; c++)
349     start_bits[c] |= cd->cbits[c+cbit_digit];
350     break;
351    
352 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
353     discard it. */
354    
355 nigel 77 case OP_NOT_WHITESPACE:
356     for (c = 0; c < 32; c++)
357 nigel 91 {
358     int d = cd->cbits[c+cbit_space];
359     if (c == 1) d &= ~0x08;
360     start_bits[c] |= ~d;
361     }
362 nigel 77 break;
363    
364 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
365     discard it. */
366    
367 nigel 77 case OP_WHITESPACE:
368     for (c = 0; c < 32; c++)
369 nigel 91 {
370     int d = cd->cbits[c+cbit_space];
371     if (c == 1) d &= ~0x08;
372     start_bits[c] |= d;
373     }
374 nigel 77 break;
375    
376     case OP_NOT_WORDCHAR:
377     for (c = 0; c < 32; c++)
378     start_bits[c] |= ~cd->cbits[c+cbit_word];
379     break;
380    
381     case OP_WORDCHAR:
382     for (c = 0; c < 32; c++)
383     start_bits[c] |= cd->cbits[c+cbit_word];
384     break;
385     }
386    
387     tcode += 2;
388     break;
389    
390     /* Character class where all the information is in a bit map: set the
391     bits and either carry on or not, according to the repeat count. If it was
392     a negative class, and we are operating with UTF-8 characters, any byte
393     with a value >= 0xc4 is a potentially valid starter because it starts a
394     character with a value > 255. */
395    
396     case OP_NCLASS:
397 ph10 111 #ifdef SUPPORT_UTF8
398 nigel 77 if (utf8)
399     {
400     start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
401     memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
402     }
403 ph10 111 #endif
404 nigel 77 /* Fall through */
405    
406     case OP_CLASS:
407     {
408     tcode++;
409    
410     /* In UTF-8 mode, the bits in a bit map correspond to character
411     values, not to byte values. However, the bit map we are constructing is
412     for byte values. So we have to do a conversion for characters whose
413     value is > 127. In fact, there are only two possible starting bytes for
414     characters in the range 128 - 255. */
415    
416 ph10 107 #ifdef SUPPORT_UTF8
417 nigel 77 if (utf8)
418     {
419     for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
420     for (c = 128; c < 256; c++)
421     {
422     if ((tcode[c/8] && (1 << (c&7))) != 0)
423     {
424     int d = (c >> 6) | 0xc0; /* Set bit for this starter */
425     start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
426     c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
427     }
428     }
429     }
430    
431     /* In non-UTF-8 mode, the two bit maps are completely compatible. */
432    
433     else
434 ph10 111 #endif
435 nigel 77 {
436     for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
437     }
438    
439     /* Advance past the bit map, and act on what follows */
440    
441     tcode += 32;
442     switch (*tcode)
443     {
444     case OP_CRSTAR:
445     case OP_CRMINSTAR:
446     case OP_CRQUERY:
447     case OP_CRMINQUERY:
448     tcode++;
449     break;
450    
451     case OP_CRRANGE:
452     case OP_CRMINRANGE:
453     if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
454     else try_next = FALSE;
455     break;
456    
457     default:
458     try_next = FALSE;
459     break;
460     }
461     }
462     break; /* End of bitmap class handling */
463    
464     } /* End of switch */
465     } /* End of try_next loop */
466    
467     code += GET(code, 1); /* Advance to next branch */
468     }
469     while (*code == OP_ALT);
470 nigel 93 return yield;
471 nigel 77 }
472    
473    
474    
475     /*************************************************
476     * Study a compiled expression *
477     *************************************************/
478    
479     /* This function is handed a compiled expression that it must study to produce
480     information that will speed up the matching. It returns a pcre_extra block
481     which then gets handed back to pcre_exec().
482    
483     Arguments:
484     re points to the compiled expression
485     options contains option bits
486     errorptr points to where to place error messages;
487     set NULL unless error
488    
489     Returns: pointer to a pcre_extra block, with study_data filled in and the
490     appropriate flag set;
491     NULL on error or if no optimization possible
492     */
493    
494 ph10 145 PCRE_EXP_DEFN pcre_extra *
495 nigel 77 pcre_study(const pcre *external_re, int options, const char **errorptr)
496     {
497     uschar start_bits[32];
498     pcre_extra *extra;
499     pcre_study_data *study;
500     const uschar *tables;
501 nigel 91 uschar *code;
502     compile_data compile_block;
503 nigel 77 const real_pcre *re = (const real_pcre *)external_re;
504    
505     *errorptr = NULL;
506    
507     if (re == NULL || re->magic_number != MAGIC_NUMBER)
508     {
509     *errorptr = "argument is not a compiled regular expression";
510     return NULL;
511     }
512    
513     if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
514     {
515     *errorptr = "unknown or incorrect option bit(s) set";
516     return NULL;
517     }
518    
519 nigel 91 code = (uschar *)re + re->name_table_offset +
520     (re->name_count * re->name_entry_size);
521    
522 nigel 77 /* For an anchored pattern, or an unanchored pattern that has a first char, or
523     a multiline pattern that matches only at "line starts", no further processing
524     at present. */
525    
526     if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
527     return NULL;
528    
529     /* Set the character tables in the block that is passed around */
530    
531     tables = re->tables;
532     if (tables == NULL)
533     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
534     (void *)(&tables));
535    
536     compile_block.lcc = tables + lcc_offset;
537     compile_block.fcc = tables + fcc_offset;
538     compile_block.cbits = tables + cbits_offset;
539     compile_block.ctypes = tables + ctypes_offset;
540    
541     /* See if we can find a fixed set of initial characters for the pattern. */
542    
543     memset(start_bits, 0, 32 * sizeof(uschar));
544 nigel 93 if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
545     (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
546 nigel 77
547     /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
548     the latter, which is pointed to by the former, which may also get additional
549     data set later by the calling program. At the moment, the size of
550     pcre_study_data is fixed. We nevertheless save it in a field for returning via
551     the pcre_fullinfo() function so that if it becomes variable in the future, we
552     don't have to change that code. */
553    
554     extra = (pcre_extra *)(pcre_malloc)
555     (sizeof(pcre_extra) + sizeof(pcre_study_data));
556    
557     if (extra == NULL)
558     {
559     *errorptr = "failed to get memory";
560     return NULL;
561     }
562    
563     study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
564     extra->flags = PCRE_EXTRA_STUDY_DATA;
565     extra->study_data = study;
566    
567     study->size = sizeof(pcre_study_data);
568     study->options = PCRE_STUDY_MAPPED;
569     memcpy(study->start_bits, start_bits, sizeof(start_bits));
570    
571     return extra;
572     }
573    
574     /* End of pcre_study.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12