/[pcre]/code/trunk/study.c
ViewVC logotype

Contents of /code/trunk/study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 75 - (hide annotations) (download)
Sat Feb 24 21:40:37 2007 UTC (6 years, 2 months ago) by nigel
File MIME type: text/plain
File size: 14414 byte(s)
Load pcre-5.0 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 75 Copyright (c) 1997-2004 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15 nigel 75 Redistribution and use in source and binary forms, with or without
16     modification, are permitted provided that the following conditions are met:
17 nigel 3
18 nigel 75 * Redistributions of source code must retain the above copyright notice,
19     this list of conditions and the following disclaimer.
20 nigel 3
21 nigel 75 * Redistributions in binary form must reproduce the above copyright
22     notice, this list of conditions and the following disclaimer in the
23     documentation and/or other materials provided with the distribution.
24 nigel 3
25 nigel 75 * Neither the name of the University of Cambridge nor the names of its
26     contributors may be used to endorse or promote products derived from
27     this software without specific prior written permission.
28 nigel 29
29 nigel 75 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39     POSSIBILITY OF SUCH DAMAGE.
40 nigel 3 -----------------------------------------------------------------------------
41     */
42    
43    
44     /* Include the internals header, which itself includes Standard C headers plus
45     the external pcre header. */
46    
47     #include "internal.h"
48    
49    
50    
51     /*************************************************
52 nigel 23 * Set a bit and maybe its alternate case *
53     *************************************************/
54    
55     /* Given a character, set its bit in the table, and also the bit for the other
56     version of a letter if we are caseless.
57    
58     Arguments:
59     start_bits points to the bit map
60     c is the character
61     caseless the caseless flag
62 nigel 25 cd the block with char table pointers
63 nigel 23
64     Returns: nothing
65     */
66    
67     static void
68 nigel 75 set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
69 nigel 23 {
70     start_bits[c/8] |= (1 << (c&7));
71 nigel 25 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
72     start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
73 nigel 23 }
74    
75    
76    
77     /*************************************************
78 nigel 3 * Create bitmap of starting chars *
79     *************************************************/
80    
81     /* This function scans a compiled unanchored expression and attempts to build a
82     bitmap of the set of initial characters. If it can't, it returns FALSE. As time
83     goes by, we may be able to get more clever at doing this.
84    
85     Arguments:
86     code points to an expression
87     start_bits points to a 32-byte table, initialized to 0
88 nigel 23 caseless the current state of the caseless flag
89 nigel 63 utf8 TRUE if in UTF-8 mode
90 nigel 25 cd the block with char table pointers
91 nigel 3
92     Returns: TRUE if table built, FALSE otherwise
93     */
94    
95     static BOOL
96 nigel 25 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
97 nigel 63 BOOL utf8, compile_data *cd)
98 nigel 3 {
99     register int c;
100    
101 nigel 27 /* This next statement and the later reference to dummy are here in order to
102     trick the optimizer of the IBM C compiler for OS/2 into generating correct
103     code. Apparently IBM isn't going to fix the problem, and we would rather not
104     disable optimization (in this module it actually makes a big difference, and
105     the pcre module can use all the optimization it can get). */
106    
107     volatile int dummy;
108    
109 nigel 3 do
110     {
111 nigel 63 const uschar *tcode = code + 1 + LINK_SIZE;
112 nigel 3 BOOL try_next = TRUE;
113    
114     while (try_next)
115     {
116 nigel 23 /* If a branch starts with a bracket or a positive lookahead assertion,
117     recurse to set bits from within them. That's all for this branch. */
118    
119 nigel 3 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
120     {
121 nigel 63 if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
122 nigel 25 return FALSE;
123 nigel 53 try_next = FALSE;
124 nigel 3 }
125    
126     else switch(*tcode)
127     {
128     default:
129     return FALSE;
130    
131 nigel 63 /* Skip over callout */
132    
133     case OP_CALLOUT:
134 nigel 75 tcode += 2 + 2*LINK_SIZE;
135 nigel 63 break;
136    
137 nigel 53 /* Skip over extended extraction bracket number */
138    
139     case OP_BRANUMBER:
140     tcode += 3;
141     break;
142    
143 nigel 23 /* Skip over lookbehind and negative lookahead assertions */
144    
145     case OP_ASSERT_NOT:
146     case OP_ASSERTBACK:
147     case OP_ASSERTBACK_NOT:
148 nigel 63 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
149     tcode += 1+LINK_SIZE;
150 nigel 23 break;
151    
152     /* Skip over an option setting, changing the caseless flag */
153    
154     case OP_OPT:
155     caseless = (tcode[1] & PCRE_CASELESS) != 0;
156     tcode += 2;
157     break;
158    
159 nigel 3 /* BRAZERO does the bracket, but carries on. */
160    
161     case OP_BRAZERO:
162     case OP_BRAMINZERO:
163 nigel 63 if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
164 nigel 25 return FALSE;
165 nigel 27 dummy = 1;
166 nigel 63 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
167     tcode += 1+LINK_SIZE;
168 nigel 3 break;
169    
170     /* Single-char * or ? sets the bit and tries the next item */
171    
172     case OP_STAR:
173     case OP_MINSTAR:
174     case OP_QUERY:
175     case OP_MINQUERY:
176 nigel 25 set_bit(start_bits, tcode[1], caseless, cd);
177 nigel 3 tcode += 2;
178 nigel 63 #ifdef SUPPORT_UTF8
179     if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
180     #endif
181 nigel 3 break;
182    
183     /* Single-char upto sets the bit and tries the next */
184    
185     case OP_UPTO:
186     case OP_MINUPTO:
187 nigel 25 set_bit(start_bits, tcode[3], caseless, cd);
188 nigel 3 tcode += 4;
189 nigel 63 #ifdef SUPPORT_UTF8
190     if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
191     #endif
192 nigel 3 break;
193    
194     /* At least one single char sets the bit and stops */
195    
196     case OP_EXACT: /* Fall through */
197 nigel 75 tcode += 2;
198 nigel 3
199 nigel 75 case OP_CHAR:
200     case OP_CHARNC:
201 nigel 3 case OP_PLUS:
202     case OP_MINPLUS:
203 nigel 25 set_bit(start_bits, tcode[1], caseless, cd);
204 nigel 53 try_next = FALSE;
205 nigel 3 break;
206    
207     /* Single character type sets the bits and stops */
208    
209     case OP_NOT_DIGIT:
210 nigel 25 for (c = 0; c < 32; c++)
211     start_bits[c] |= ~cd->cbits[c+cbit_digit];
212 nigel 53 try_next = FALSE;
213 nigel 3 break;
214    
215     case OP_DIGIT:
216 nigel 25 for (c = 0; c < 32; c++)
217     start_bits[c] |= cd->cbits[c+cbit_digit];
218 nigel 53 try_next = FALSE;
219 nigel 3 break;
220    
221     case OP_NOT_WHITESPACE:
222 nigel 25 for (c = 0; c < 32; c++)
223     start_bits[c] |= ~cd->cbits[c+cbit_space];
224 nigel 53 try_next = FALSE;
225 nigel 3 break;
226    
227     case OP_WHITESPACE:
228 nigel 25 for (c = 0; c < 32; c++)
229     start_bits[c] |= cd->cbits[c+cbit_space];
230 nigel 53 try_next = FALSE;
231 nigel 3 break;
232    
233     case OP_NOT_WORDCHAR:
234     for (c = 0; c < 32; c++)
235 nigel 43 start_bits[c] |= ~cd->cbits[c+cbit_word];
236 nigel 53 try_next = FALSE;
237 nigel 3 break;
238    
239     case OP_WORDCHAR:
240     for (c = 0; c < 32; c++)
241 nigel 43 start_bits[c] |= cd->cbits[c+cbit_word];
242 nigel 53 try_next = FALSE;
243 nigel 3 break;
244    
245     /* One or more character type fudges the pointer and restarts, knowing
246     it will hit a single character type and stop there. */
247    
248     case OP_TYPEPLUS:
249     case OP_TYPEMINPLUS:
250     tcode++;
251     break;
252    
253     case OP_TYPEEXACT:
254     tcode += 3;
255     break;
256    
257     /* Zero or more repeats of character types set the bits and then
258     try again. */
259    
260     case OP_TYPEUPTO:
261     case OP_TYPEMINUPTO:
262     tcode += 2; /* Fall through */
263    
264     case OP_TYPESTAR:
265     case OP_TYPEMINSTAR:
266     case OP_TYPEQUERY:
267     case OP_TYPEMINQUERY:
268     switch(tcode[1])
269     {
270 nigel 73 case OP_ANY:
271     return FALSE;
272    
273 nigel 3 case OP_NOT_DIGIT:
274 nigel 25 for (c = 0; c < 32; c++)
275     start_bits[c] |= ~cd->cbits[c+cbit_digit];
276 nigel 3 break;
277    
278     case OP_DIGIT:
279 nigel 25 for (c = 0; c < 32; c++)
280     start_bits[c] |= cd->cbits[c+cbit_digit];
281 nigel 3 break;
282    
283     case OP_NOT_WHITESPACE:
284 nigel 25 for (c = 0; c < 32; c++)
285     start_bits[c] |= ~cd->cbits[c+cbit_space];
286 nigel 3 break;
287    
288     case OP_WHITESPACE:
289 nigel 25 for (c = 0; c < 32; c++)
290     start_bits[c] |= cd->cbits[c+cbit_space];
291 nigel 3 break;
292    
293     case OP_NOT_WORDCHAR:
294     for (c = 0; c < 32; c++)
295 nigel 43 start_bits[c] |= ~cd->cbits[c+cbit_word];
296 nigel 3 break;
297    
298     case OP_WORDCHAR:
299     for (c = 0; c < 32; c++)
300 nigel 43 start_bits[c] |= cd->cbits[c+cbit_word];
301 nigel 3 break;
302     }
303    
304     tcode += 2;
305     break;
306    
307 nigel 63 /* Character class where all the information is in a bit map: set the
308     bits and either carry on or not, according to the repeat count. If it was
309     a negative class, and we are operating with UTF-8 characters, any byte
310 nigel 71 with a value >= 0xc4 is a potentially valid starter because it starts a
311     character with a value > 255. */
312 nigel 3
313 nigel 63 case OP_NCLASS:
314 nigel 71 if (utf8)
315     {
316     start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
317     memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
318     }
319 nigel 63 /* Fall through */
320    
321 nigel 3 case OP_CLASS:
322     {
323     tcode++;
324 nigel 71
325     /* In UTF-8 mode, the bits in a bit map correspond to character
326     values, not to byte values. However, the bit map we are constructing is
327     for byte values. So we have to do a conversion for characters whose
328     value is > 127. In fact, there are only two possible starting bytes for
329     characters in the range 128 - 255. */
330    
331     if (utf8)
332     {
333     for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
334     for (c = 128; c < 256; c++)
335     {
336     if ((tcode[c/8] && (1 << (c&7))) != 0)
337     {
338     int d = (c >> 6) | 0xc0; /* Set bit for this starter */
339     start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
340     c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
341     }
342     }
343     }
344    
345     /* In non-UTF-8 mode, the two bit maps are completely compatible. */
346    
347     else
348     {
349     for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
350     }
351    
352     /* Advance past the bit map, and act on what follows */
353    
354 nigel 3 tcode += 32;
355     switch (*tcode)
356     {
357     case OP_CRSTAR:
358     case OP_CRMINSTAR:
359     case OP_CRQUERY:
360     case OP_CRMINQUERY:
361     tcode++;
362     break;
363    
364     case OP_CRRANGE:
365     case OP_CRMINRANGE:
366 nigel 53 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
367     else try_next = FALSE;
368 nigel 3 break;
369 nigel 53
370     default:
371     try_next = FALSE;
372     break;
373 nigel 3 }
374     }
375 nigel 63 break; /* End of bitmap class handling */
376 nigel 3
377     } /* End of switch */
378     } /* End of try_next loop */
379    
380 nigel 63 code += GET(code, 1); /* Advance to next branch */
381 nigel 3 }
382     while (*code == OP_ALT);
383     return TRUE;
384     }
385    
386    
387    
388     /*************************************************
389     * Study a compiled expression *
390     *************************************************/
391    
392     /* This function is handed a compiled expression that it must study to produce
393     information that will speed up the matching. It returns a pcre_extra block
394     which then gets handed back to pcre_exec().
395    
396     Arguments:
397     re points to the compiled expression
398     options contains option bits
399     errorptr points to where to place error messages;
400     set NULL unless error
401    
402 nigel 63 Returns: pointer to a pcre_extra block, with study_data filled in and the
403     appropriate flag set;
404 nigel 3 NULL on error or if no optimization possible
405     */
406    
407 nigel 73 EXPORT pcre_extra *
408 nigel 7 pcre_study(const pcre *external_re, int options, const char **errorptr)
409 nigel 3 {
410     uschar start_bits[32];
411 nigel 63 pcre_extra *extra;
412     pcre_study_data *study;
413 nigel 75 const uschar *tables;
414 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
415 nigel 75 uschar *code = (uschar *)re + re->name_table_offset +
416 nigel 63 (re->name_count * re->name_entry_size);
417 nigel 25 compile_data compile_block;
418 nigel 3
419     *errorptr = NULL;
420    
421     if (re == NULL || re->magic_number != MAGIC_NUMBER)
422     {
423     *errorptr = "argument is not a compiled regular expression";
424     return NULL;
425     }
426    
427     if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
428     {
429     *errorptr = "unknown or incorrect option bit(s) set";
430     return NULL;
431     }
432    
433 nigel 63 /* For an anchored pattern, or an unanchored pattern that has a first char, or
434     a multiline pattern that matches only at "line starts", no further processing
435     at present. */
436 nigel 3
437     if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
438     return NULL;
439    
440 nigel 75 /* Set the character tables in the block that is passed around */
441 nigel 25
442 nigel 75 tables = re->tables;
443     if (tables == NULL)
444     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, &tables);
445 nigel 25
446 nigel 75 compile_block.lcc = tables + lcc_offset;
447     compile_block.fcc = tables + fcc_offset;
448     compile_block.cbits = tables + cbits_offset;
449     compile_block.ctypes = tables + ctypes_offset;
450    
451 nigel 3 /* See if we can find a fixed set of initial characters for the pattern. */
452    
453     memset(start_bits, 0, 32 * sizeof(uschar));
454 nigel 63 if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
455     (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
456 nigel 3
457 nigel 63 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
458     the latter, which is pointed to by the former, which may also get additional
459     data set later by the calling program. At the moment, the size of
460     pcre_study_data is fixed. We nevertheless save it in a field for returning via
461     the pcre_fullinfo() function so that if it becomes variable in the future, we
462     don't have to change that code. */
463 nigel 3
464 nigel 63 extra = (pcre_extra *)(pcre_malloc)
465     (sizeof(pcre_extra) + sizeof(pcre_study_data));
466 nigel 3
467     if (extra == NULL)
468     {
469     *errorptr = "failed to get memory";
470     return NULL;
471     }
472    
473 nigel 63 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
474     extra->flags = PCRE_EXTRA_STUDY_DATA;
475     extra->study_data = study;
476 nigel 3
477 nigel 63 study->size = sizeof(pcre_study_data);
478     study->options = PCRE_STUDY_MAPPED;
479     memcpy(study->start_bits, start_bits, sizeof(start_bits));
480    
481     return extra;
482 nigel 3 }
483    
484     /* End of study.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12