/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 91 - (hide annotations) (download)
Sat Feb 24 21:41:34 2007 UTC (7 years, 1 month ago) by nigel
File MIME type: text/plain
File size: 15778 byte(s)
Load pcre-6.7 into code/trunk.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 nigel 87 Copyright (c) 1997-2006 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_study(), along with local
42     supporting functions. */
43    
44    
45     #include "pcre_internal.h"
46    
47    
48     /*************************************************
49     * Set a bit and maybe its alternate case *
50     *************************************************/
51    
52     /* Given a character, set its bit in the table, and also the bit for the other
53     version of a letter if we are caseless.
54    
55     Arguments:
56     start_bits points to the bit map
57     c is the character
58     caseless the caseless flag
59     cd the block with char table pointers
60    
61     Returns: nothing
62     */
63    
64     static void
65     set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
66     {
67     start_bits[c/8] |= (1 << (c&7));
68     if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
69     start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
70     }
71    
72    
73    
74     /*************************************************
75     * Create bitmap of starting chars *
76     *************************************************/
77    
78     /* This function scans a compiled unanchored expression and attempts to build a
79     bitmap of the set of initial characters. If it can't, it returns FALSE. As time
80     goes by, we may be able to get more clever at doing this.
81    
82     Arguments:
83     code points to an expression
84     start_bits points to a 32-byte table, initialized to 0
85     caseless the current state of the caseless flag
86     utf8 TRUE if in UTF-8 mode
87     cd the block with char table pointers
88    
89     Returns: TRUE if table built, FALSE otherwise
90     */
91    
92     static BOOL
93     set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
94     BOOL utf8, compile_data *cd)
95     {
96     register int c;
97    
98 nigel 91 #if 0
99     /* ========================================================================= */
100     /* The following comment and code was inserted in January 1999. In May 2006,
101     when it was observed to cause compiler warnings about unused values, I took it
102     out again. If anybody is still using OS/2, they will have to put it back
103     manually. */
104    
105 nigel 77 /* This next statement and the later reference to dummy are here in order to
106     trick the optimizer of the IBM C compiler for OS/2 into generating correct
107     code. Apparently IBM isn't going to fix the problem, and we would rather not
108     disable optimization (in this module it actually makes a big difference, and
109     the pcre module can use all the optimization it can get). */
110    
111     volatile int dummy;
112 nigel 91 /* ========================================================================= */
113     #endif
114 nigel 77
115     do
116     {
117     const uschar *tcode = code + 1 + LINK_SIZE;
118     BOOL try_next = TRUE;
119    
120     while (try_next)
121     {
122     /* If a branch starts with a bracket or a positive lookahead assertion,
123     recurse to set bits from within them. That's all for this branch. */
124    
125     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
126     {
127     if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
128     return FALSE;
129     try_next = FALSE;
130     }
131    
132     else switch(*tcode)
133     {
134     default:
135     return FALSE;
136    
137     /* Skip over callout */
138    
139     case OP_CALLOUT:
140     tcode += 2 + 2*LINK_SIZE;
141     break;
142    
143     /* Skip over extended extraction bracket number */
144    
145     case OP_BRANUMBER:
146     tcode += 3;
147     break;
148    
149     /* Skip over lookbehind and negative lookahead assertions */
150    
151     case OP_ASSERT_NOT:
152     case OP_ASSERTBACK:
153     case OP_ASSERTBACK_NOT:
154     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
155     tcode += 1+LINK_SIZE;
156     break;
157    
158     /* Skip over an option setting, changing the caseless flag */
159    
160     case OP_OPT:
161     caseless = (tcode[1] & PCRE_CASELESS) != 0;
162     tcode += 2;
163     break;
164    
165     /* BRAZERO does the bracket, but carries on. */
166    
167     case OP_BRAZERO:
168     case OP_BRAMINZERO:
169     if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
170     return FALSE;
171 nigel 91 /* =========================================================================
172     See the comment at the head of this function concerning the next line,
173     which was an old fudge for the benefit of OS/2.
174 nigel 77 dummy = 1;
175 nigel 91 ========================================================================= */
176 nigel 77 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
177     tcode += 1+LINK_SIZE;
178     break;
179    
180     /* Single-char * or ? sets the bit and tries the next item */
181    
182     case OP_STAR:
183     case OP_MINSTAR:
184     case OP_QUERY:
185     case OP_MINQUERY:
186     set_bit(start_bits, tcode[1], caseless, cd);
187     tcode += 2;
188     #ifdef SUPPORT_UTF8
189     if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
190     #endif
191     break;
192    
193     /* Single-char upto sets the bit and tries the next */
194    
195     case OP_UPTO:
196     case OP_MINUPTO:
197     set_bit(start_bits, tcode[3], caseless, cd);
198     tcode += 4;
199     #ifdef SUPPORT_UTF8
200     if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
201     #endif
202     break;
203    
204     /* At least one single char sets the bit and stops */
205    
206     case OP_EXACT: /* Fall through */
207     tcode += 2;
208    
209     case OP_CHAR:
210     case OP_CHARNC:
211     case OP_PLUS:
212     case OP_MINPLUS:
213     set_bit(start_bits, tcode[1], caseless, cd);
214     try_next = FALSE;
215     break;
216    
217     /* Single character type sets the bits and stops */
218    
219     case OP_NOT_DIGIT:
220     for (c = 0; c < 32; c++)
221     start_bits[c] |= ~cd->cbits[c+cbit_digit];
222     try_next = FALSE;
223     break;
224    
225     case OP_DIGIT:
226     for (c = 0; c < 32; c++)
227     start_bits[c] |= cd->cbits[c+cbit_digit];
228     try_next = FALSE;
229     break;
230    
231 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
232     discard it. */
233    
234 nigel 77 case OP_NOT_WHITESPACE:
235     for (c = 0; c < 32; c++)
236 nigel 91 {
237     int d = cd->cbits[c+cbit_space];
238     if (c == 1) d &= ~0x08;
239     start_bits[c] |= ~d;
240     }
241 nigel 77 try_next = FALSE;
242     break;
243    
244 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
245     discard it. */
246    
247 nigel 77 case OP_WHITESPACE:
248     for (c = 0; c < 32; c++)
249 nigel 91 {
250     int d = cd->cbits[c+cbit_space];
251     if (c == 1) d &= ~0x08;
252     start_bits[c] |= d;
253     }
254 nigel 77 try_next = FALSE;
255     break;
256    
257     case OP_NOT_WORDCHAR:
258     for (c = 0; c < 32; c++)
259     start_bits[c] |= ~cd->cbits[c+cbit_word];
260     try_next = FALSE;
261     break;
262    
263     case OP_WORDCHAR:
264     for (c = 0; c < 32; c++)
265     start_bits[c] |= cd->cbits[c+cbit_word];
266     try_next = FALSE;
267     break;
268    
269     /* One or more character type fudges the pointer and restarts, knowing
270     it will hit a single character type and stop there. */
271    
272     case OP_TYPEPLUS:
273     case OP_TYPEMINPLUS:
274     tcode++;
275     break;
276    
277     case OP_TYPEEXACT:
278     tcode += 3;
279     break;
280    
281     /* Zero or more repeats of character types set the bits and then
282     try again. */
283    
284     case OP_TYPEUPTO:
285     case OP_TYPEMINUPTO:
286     tcode += 2; /* Fall through */
287    
288     case OP_TYPESTAR:
289     case OP_TYPEMINSTAR:
290     case OP_TYPEQUERY:
291     case OP_TYPEMINQUERY:
292     switch(tcode[1])
293     {
294     case OP_ANY:
295     return FALSE;
296    
297     case OP_NOT_DIGIT:
298     for (c = 0; c < 32; c++)
299     start_bits[c] |= ~cd->cbits[c+cbit_digit];
300     break;
301    
302     case OP_DIGIT:
303     for (c = 0; c < 32; c++)
304     start_bits[c] |= cd->cbits[c+cbit_digit];
305     break;
306    
307 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
308     discard it. */
309    
310 nigel 77 case OP_NOT_WHITESPACE:
311     for (c = 0; c < 32; c++)
312 nigel 91 {
313     int d = cd->cbits[c+cbit_space];
314     if (c == 1) d &= ~0x08;
315     start_bits[c] |= ~d;
316     }
317 nigel 77 break;
318    
319 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
320     discard it. */
321    
322 nigel 77 case OP_WHITESPACE:
323     for (c = 0; c < 32; c++)
324 nigel 91 {
325     int d = cd->cbits[c+cbit_space];
326     if (c == 1) d &= ~0x08;
327     start_bits[c] |= d;
328     }
329 nigel 77 break;
330    
331     case OP_NOT_WORDCHAR:
332     for (c = 0; c < 32; c++)
333     start_bits[c] |= ~cd->cbits[c+cbit_word];
334     break;
335    
336     case OP_WORDCHAR:
337     for (c = 0; c < 32; c++)
338     start_bits[c] |= cd->cbits[c+cbit_word];
339     break;
340     }
341    
342     tcode += 2;
343     break;
344    
345     /* Character class where all the information is in a bit map: set the
346     bits and either carry on or not, according to the repeat count. If it was
347     a negative class, and we are operating with UTF-8 characters, any byte
348     with a value >= 0xc4 is a potentially valid starter because it starts a
349     character with a value > 255. */
350    
351     case OP_NCLASS:
352     if (utf8)
353     {
354     start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
355     memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
356     }
357     /* Fall through */
358    
359     case OP_CLASS:
360     {
361     tcode++;
362    
363     /* In UTF-8 mode, the bits in a bit map correspond to character
364     values, not to byte values. However, the bit map we are constructing is
365     for byte values. So we have to do a conversion for characters whose
366     value is > 127. In fact, there are only two possible starting bytes for
367     characters in the range 128 - 255. */
368    
369     if (utf8)
370     {
371     for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
372     for (c = 128; c < 256; c++)
373     {
374     if ((tcode[c/8] && (1 << (c&7))) != 0)
375     {
376     int d = (c >> 6) | 0xc0; /* Set bit for this starter */
377     start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
378     c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
379     }
380     }
381     }
382    
383     /* In non-UTF-8 mode, the two bit maps are completely compatible. */
384    
385     else
386     {
387     for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
388     }
389    
390     /* Advance past the bit map, and act on what follows */
391    
392     tcode += 32;
393     switch (*tcode)
394     {
395     case OP_CRSTAR:
396     case OP_CRMINSTAR:
397     case OP_CRQUERY:
398     case OP_CRMINQUERY:
399     tcode++;
400     break;
401    
402     case OP_CRRANGE:
403     case OP_CRMINRANGE:
404     if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
405     else try_next = FALSE;
406     break;
407    
408     default:
409     try_next = FALSE;
410     break;
411     }
412     }
413     break; /* End of bitmap class handling */
414    
415     } /* End of switch */
416     } /* End of try_next loop */
417    
418     code += GET(code, 1); /* Advance to next branch */
419     }
420     while (*code == OP_ALT);
421     return TRUE;
422     }
423    
424    
425    
426     /*************************************************
427     * Study a compiled expression *
428     *************************************************/
429    
430     /* This function is handed a compiled expression that it must study to produce
431     information that will speed up the matching. It returns a pcre_extra block
432     which then gets handed back to pcre_exec().
433    
434     Arguments:
435     re points to the compiled expression
436     options contains option bits
437     errorptr points to where to place error messages;
438     set NULL unless error
439    
440     Returns: pointer to a pcre_extra block, with study_data filled in and the
441     appropriate flag set;
442     NULL on error or if no optimization possible
443     */
444    
445 nigel 87 PCRE_DATA_SCOPE pcre_extra *
446 nigel 77 pcre_study(const pcre *external_re, int options, const char **errorptr)
447     {
448     uschar start_bits[32];
449     pcre_extra *extra;
450     pcre_study_data *study;
451     const uschar *tables;
452 nigel 91 uschar *code;
453     compile_data compile_block;
454 nigel 77 const real_pcre *re = (const real_pcre *)external_re;
455    
456     *errorptr = NULL;
457    
458     if (re == NULL || re->magic_number != MAGIC_NUMBER)
459     {
460     *errorptr = "argument is not a compiled regular expression";
461     return NULL;
462     }
463    
464     if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
465     {
466     *errorptr = "unknown or incorrect option bit(s) set";
467     return NULL;
468     }
469    
470 nigel 91 code = (uschar *)re + re->name_table_offset +
471     (re->name_count * re->name_entry_size);
472    
473 nigel 77 /* For an anchored pattern, or an unanchored pattern that has a first char, or
474     a multiline pattern that matches only at "line starts", no further processing
475     at present. */
476    
477     if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
478     return NULL;
479    
480     /* Set the character tables in the block that is passed around */
481    
482     tables = re->tables;
483     if (tables == NULL)
484     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
485     (void *)(&tables));
486    
487     compile_block.lcc = tables + lcc_offset;
488     compile_block.fcc = tables + fcc_offset;
489     compile_block.cbits = tables + cbits_offset;
490     compile_block.ctypes = tables + ctypes_offset;
491    
492     /* See if we can find a fixed set of initial characters for the pattern. */
493    
494     memset(start_bits, 0, 32 * sizeof(uschar));
495     if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
496     (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
497    
498     /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
499     the latter, which is pointed to by the former, which may also get additional
500     data set later by the calling program. At the moment, the size of
501     pcre_study_data is fixed. We nevertheless save it in a field for returning via
502     the pcre_fullinfo() function so that if it becomes variable in the future, we
503     don't have to change that code. */
504    
505     extra = (pcre_extra *)(pcre_malloc)
506     (sizeof(pcre_extra) + sizeof(pcre_study_data));
507    
508     if (extra == NULL)
509     {
510     *errorptr = "failed to get memory";
511     return NULL;
512     }
513    
514     study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
515     extra->flags = PCRE_EXTRA_STUDY_DATA;
516     extra->study_data = study;
517    
518     study->size = sizeof(pcre_study_data);
519     study->options = PCRE_STUDY_MAPPED;
520     memcpy(study->start_bits, start_bits, sizeof(start_bits));
521    
522     return extra;
523     }
524    
525     /* End of pcre_study.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12