/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 87 - (hide annotations) (download)
Sat Feb 24 21:41:21 2007 UTC (7 years, 6 months ago) by nigel
File MIME type: text/plain
File size: 14380 byte(s)
Load pcre-6.5 into code/trunk.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 nigel 87 Copyright (c) 1997-2006 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_study(), along with local
42     supporting functions. */
43    
44    
45     #include "pcre_internal.h"
46    
47    
48     /*************************************************
49     * Set a bit and maybe its alternate case *
50     *************************************************/
51    
52     /* Given a character, set its bit in the table, and also the bit for the other
53     version of a letter if we are caseless.
54    
55     Arguments:
56     start_bits points to the bit map
57     c is the character
58     caseless the caseless flag
59     cd the block with char table pointers
60    
61     Returns: nothing
62     */
63    
64     static void
65     set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
66     {
67     start_bits[c/8] |= (1 << (c&7));
68     if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
69     start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
70     }
71    
72    
73    
74     /*************************************************
75     * Create bitmap of starting chars *
76     *************************************************/
77    
78     /* This function scans a compiled unanchored expression and attempts to build a
79     bitmap of the set of initial characters. If it can't, it returns FALSE. As time
80     goes by, we may be able to get more clever at doing this.
81    
82     Arguments:
83     code points to an expression
84     start_bits points to a 32-byte table, initialized to 0
85     caseless the current state of the caseless flag
86     utf8 TRUE if in UTF-8 mode
87     cd the block with char table pointers
88    
89     Returns: TRUE if table built, FALSE otherwise
90     */
91    
92     static BOOL
93     set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
94     BOOL utf8, compile_data *cd)
95     {
96     register int c;
97    
98     /* This next statement and the later reference to dummy are here in order to
99     trick the optimizer of the IBM C compiler for OS/2 into generating correct
100     code. Apparently IBM isn't going to fix the problem, and we would rather not
101     disable optimization (in this module it actually makes a big difference, and
102     the pcre module can use all the optimization it can get). */
103    
104     volatile int dummy;
105    
106     do
107     {
108     const uschar *tcode = code + 1 + LINK_SIZE;
109     BOOL try_next = TRUE;
110    
111     while (try_next)
112     {
113     /* If a branch starts with a bracket or a positive lookahead assertion,
114     recurse to set bits from within them. That's all for this branch. */
115    
116     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
117     {
118     if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
119     return FALSE;
120     try_next = FALSE;
121     }
122    
123     else switch(*tcode)
124     {
125     default:
126     return FALSE;
127    
128     /* Skip over callout */
129    
130     case OP_CALLOUT:
131     tcode += 2 + 2*LINK_SIZE;
132     break;
133    
134     /* Skip over extended extraction bracket number */
135    
136     case OP_BRANUMBER:
137     tcode += 3;
138     break;
139    
140     /* Skip over lookbehind and negative lookahead assertions */
141    
142     case OP_ASSERT_NOT:
143     case OP_ASSERTBACK:
144     case OP_ASSERTBACK_NOT:
145     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
146     tcode += 1+LINK_SIZE;
147     break;
148    
149     /* Skip over an option setting, changing the caseless flag */
150    
151     case OP_OPT:
152     caseless = (tcode[1] & PCRE_CASELESS) != 0;
153     tcode += 2;
154     break;
155    
156     /* BRAZERO does the bracket, but carries on. */
157    
158     case OP_BRAZERO:
159     case OP_BRAMINZERO:
160     if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
161     return FALSE;
162     dummy = 1;
163     do tcode += GET(tcode,1); while (*tcode == OP_ALT);
164     tcode += 1+LINK_SIZE;
165     break;
166    
167     /* Single-char * or ? sets the bit and tries the next item */
168    
169     case OP_STAR:
170     case OP_MINSTAR:
171     case OP_QUERY:
172     case OP_MINQUERY:
173     set_bit(start_bits, tcode[1], caseless, cd);
174     tcode += 2;
175     #ifdef SUPPORT_UTF8
176     if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
177     #endif
178     break;
179    
180     /* Single-char upto sets the bit and tries the next */
181    
182     case OP_UPTO:
183     case OP_MINUPTO:
184     set_bit(start_bits, tcode[3], caseless, cd);
185     tcode += 4;
186     #ifdef SUPPORT_UTF8
187     if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
188     #endif
189     break;
190    
191     /* At least one single char sets the bit and stops */
192    
193     case OP_EXACT: /* Fall through */
194     tcode += 2;
195    
196     case OP_CHAR:
197     case OP_CHARNC:
198     case OP_PLUS:
199     case OP_MINPLUS:
200     set_bit(start_bits, tcode[1], caseless, cd);
201     try_next = FALSE;
202     break;
203    
204     /* Single character type sets the bits and stops */
205    
206     case OP_NOT_DIGIT:
207     for (c = 0; c < 32; c++)
208     start_bits[c] |= ~cd->cbits[c+cbit_digit];
209     try_next = FALSE;
210     break;
211    
212     case OP_DIGIT:
213     for (c = 0; c < 32; c++)
214     start_bits[c] |= cd->cbits[c+cbit_digit];
215     try_next = FALSE;
216     break;
217    
218     case OP_NOT_WHITESPACE:
219     for (c = 0; c < 32; c++)
220     start_bits[c] |= ~cd->cbits[c+cbit_space];
221     try_next = FALSE;
222     break;
223    
224     case OP_WHITESPACE:
225     for (c = 0; c < 32; c++)
226     start_bits[c] |= cd->cbits[c+cbit_space];
227     try_next = FALSE;
228     break;
229    
230     case OP_NOT_WORDCHAR:
231     for (c = 0; c < 32; c++)
232     start_bits[c] |= ~cd->cbits[c+cbit_word];
233     try_next = FALSE;
234     break;
235    
236     case OP_WORDCHAR:
237     for (c = 0; c < 32; c++)
238     start_bits[c] |= cd->cbits[c+cbit_word];
239     try_next = FALSE;
240     break;
241    
242     /* One or more character type fudges the pointer and restarts, knowing
243     it will hit a single character type and stop there. */
244    
245     case OP_TYPEPLUS:
246     case OP_TYPEMINPLUS:
247     tcode++;
248     break;
249    
250     case OP_TYPEEXACT:
251     tcode += 3;
252     break;
253    
254     /* Zero or more repeats of character types set the bits and then
255     try again. */
256    
257     case OP_TYPEUPTO:
258     case OP_TYPEMINUPTO:
259     tcode += 2; /* Fall through */
260    
261     case OP_TYPESTAR:
262     case OP_TYPEMINSTAR:
263     case OP_TYPEQUERY:
264     case OP_TYPEMINQUERY:
265     switch(tcode[1])
266     {
267     case OP_ANY:
268     return FALSE;
269    
270     case OP_NOT_DIGIT:
271     for (c = 0; c < 32; c++)
272     start_bits[c] |= ~cd->cbits[c+cbit_digit];
273     break;
274    
275     case OP_DIGIT:
276     for (c = 0; c < 32; c++)
277     start_bits[c] |= cd->cbits[c+cbit_digit];
278     break;
279    
280     case OP_NOT_WHITESPACE:
281     for (c = 0; c < 32; c++)
282     start_bits[c] |= ~cd->cbits[c+cbit_space];
283     break;
284    
285     case OP_WHITESPACE:
286     for (c = 0; c < 32; c++)
287     start_bits[c] |= cd->cbits[c+cbit_space];
288     break;
289    
290     case OP_NOT_WORDCHAR:
291     for (c = 0; c < 32; c++)
292     start_bits[c] |= ~cd->cbits[c+cbit_word];
293     break;
294    
295     case OP_WORDCHAR:
296     for (c = 0; c < 32; c++)
297     start_bits[c] |= cd->cbits[c+cbit_word];
298     break;
299     }
300    
301     tcode += 2;
302     break;
303    
304     /* Character class where all the information is in a bit map: set the
305     bits and either carry on or not, according to the repeat count. If it was
306     a negative class, and we are operating with UTF-8 characters, any byte
307     with a value >= 0xc4 is a potentially valid starter because it starts a
308     character with a value > 255. */
309    
310     case OP_NCLASS:
311     if (utf8)
312     {
313     start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
314     memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
315     }
316     /* Fall through */
317    
318     case OP_CLASS:
319     {
320     tcode++;
321    
322     /* In UTF-8 mode, the bits in a bit map correspond to character
323     values, not to byte values. However, the bit map we are constructing is
324     for byte values. So we have to do a conversion for characters whose
325     value is > 127. In fact, there are only two possible starting bytes for
326     characters in the range 128 - 255. */
327    
328     if (utf8)
329     {
330     for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
331     for (c = 128; c < 256; c++)
332     {
333     if ((tcode[c/8] && (1 << (c&7))) != 0)
334     {
335     int d = (c >> 6) | 0xc0; /* Set bit for this starter */
336     start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
337     c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
338     }
339     }
340     }
341    
342     /* In non-UTF-8 mode, the two bit maps are completely compatible. */
343    
344     else
345     {
346     for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
347     }
348    
349     /* Advance past the bit map, and act on what follows */
350    
351     tcode += 32;
352     switch (*tcode)
353     {
354     case OP_CRSTAR:
355     case OP_CRMINSTAR:
356     case OP_CRQUERY:
357     case OP_CRMINQUERY:
358     tcode++;
359     break;
360    
361     case OP_CRRANGE:
362     case OP_CRMINRANGE:
363     if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
364     else try_next = FALSE;
365     break;
366    
367     default:
368     try_next = FALSE;
369     break;
370     }
371     }
372     break; /* End of bitmap class handling */
373    
374     } /* End of switch */
375     } /* End of try_next loop */
376    
377     code += GET(code, 1); /* Advance to next branch */
378     }
379     while (*code == OP_ALT);
380     return TRUE;
381     }
382    
383    
384    
385     /*************************************************
386     * Study a compiled expression *
387     *************************************************/
388    
389     /* This function is handed a compiled expression that it must study to produce
390     information that will speed up the matching. It returns a pcre_extra block
391     which then gets handed back to pcre_exec().
392    
393     Arguments:
394     re points to the compiled expression
395     options contains option bits
396     errorptr points to where to place error messages;
397     set NULL unless error
398    
399     Returns: pointer to a pcre_extra block, with study_data filled in and the
400     appropriate flag set;
401     NULL on error or if no optimization possible
402     */
403    
404 nigel 87 PCRE_DATA_SCOPE pcre_extra *
405 nigel 77 pcre_study(const pcre *external_re, int options, const char **errorptr)
406     {
407     uschar start_bits[32];
408     pcre_extra *extra;
409     pcre_study_data *study;
410     const uschar *tables;
411     const real_pcre *re = (const real_pcre *)external_re;
412     uschar *code = (uschar *)re + re->name_table_offset +
413     (re->name_count * re->name_entry_size);
414     compile_data compile_block;
415    
416     *errorptr = NULL;
417    
418     if (re == NULL || re->magic_number != MAGIC_NUMBER)
419     {
420     *errorptr = "argument is not a compiled regular expression";
421     return NULL;
422     }
423    
424     if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
425     {
426     *errorptr = "unknown or incorrect option bit(s) set";
427     return NULL;
428     }
429    
430     /* For an anchored pattern, or an unanchored pattern that has a first char, or
431     a multiline pattern that matches only at "line starts", no further processing
432     at present. */
433    
434     if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
435     return NULL;
436    
437     /* Set the character tables in the block that is passed around */
438    
439     tables = re->tables;
440     if (tables == NULL)
441     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
442     (void *)(&tables));
443    
444     compile_block.lcc = tables + lcc_offset;
445     compile_block.fcc = tables + fcc_offset;
446     compile_block.cbits = tables + cbits_offset;
447     compile_block.ctypes = tables + ctypes_offset;
448    
449     /* See if we can find a fixed set of initial characters for the pattern. */
450    
451     memset(start_bits, 0, 32 * sizeof(uschar));
452     if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
453     (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
454    
455     /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
456     the latter, which is pointed to by the former, which may also get additional
457     data set later by the calling program. At the moment, the size of
458     pcre_study_data is fixed. We nevertheless save it in a field for returning via
459     the pcre_fullinfo() function so that if it becomes variable in the future, we
460     don't have to change that code. */
461    
462     extra = (pcre_extra *)(pcre_malloc)
463     (sizeof(pcre_extra) + sizeof(pcre_study_data));
464    
465     if (extra == NULL)
466     {
467     *errorptr = "failed to get memory";
468     return NULL;
469     }
470    
471     study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
472     extra->flags = PCRE_EXTRA_STUDY_DATA;
473     extra->study_data = study;
474    
475     study->size = sizeof(pcre_study_data);
476     study->options = PCRE_STUDY_MAPPED;
477     memcpy(study->start_bits, start_bits, sizeof(start_bits));
478    
479     return extra;
480     }
481    
482     /* End of pcre_study.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12