/[pcre]/code/trunk/study.c
ViewVC logotype

Contents of /code/trunk/study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 73 - (hide annotations) (download)
Sat Feb 24 21:40:30 2007 UTC (6 years, 3 months ago) by nigel
File MIME type: text/plain
File size: 13590 byte(s)
Load pcre-4.5 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 71 Copyright (c) 1997-2003 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28 nigel 29
29     4. If PCRE is embedded in any software that is released under the GNU
30     General Purpose Licence (GPL), then the terms of that licence shall
31     supersede any condition above with which it is incompatible.
32 nigel 3 -----------------------------------------------------------------------------
33     */
34    
35    
36     /* Include the internals header, which itself includes Standard C headers plus
37     the external pcre header. */
38    
39     #include "internal.h"
40    
41    
42    
43     /*************************************************
44 nigel 23 * Set a bit and maybe its alternate case *
45     *************************************************/
46    
47     /* Given a character, set its bit in the table, and also the bit for the other
48     version of a letter if we are caseless.
49    
50     Arguments:
51     start_bits points to the bit map
52     c is the character
53     caseless the caseless flag
54 nigel 25 cd the block with char table pointers
55 nigel 23
56     Returns: nothing
57     */
58    
59     static void
60 nigel 25 set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
61 nigel 23 {
62     start_bits[c/8] |= (1 << (c&7));
63 nigel 25 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
64     start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
65 nigel 23 }
66    
67    
68    
69     /*************************************************
70 nigel 3 * Create bitmap of starting chars *
71     *************************************************/
72    
73     /* This function scans a compiled unanchored expression and attempts to build a
74     bitmap of the set of initial characters. If it can't, it returns FALSE. As time
75     goes by, we may be able to get more clever at doing this.
76    
77     Arguments:
78     code points to an expression
79     start_bits points to a 32-byte table, initialized to 0
80 nigel 23 caseless the current state of the caseless flag
81 nigel 63 utf8 TRUE if in UTF-8 mode
82 nigel 25 cd the block with char table pointers
83 nigel 3
84     Returns: TRUE if table built, FALSE otherwise
85     */
86    
87     static BOOL
88 nigel 25 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
89 nigel 63 BOOL utf8, compile_data *cd)
90 nigel 3 {
91     register int c;
92    
93 nigel 27 /* This next statement and the later reference to dummy are here in order to
94     trick the optimizer of the IBM C compiler for OS/2 into generating correct
95     code. Apparently IBM isn't going to fix the problem, and we would rather not
96     disable optimization (in this module it actually makes a big difference, and
97     the pcre module can use all the optimization it can get). */
98    
99     volatile int dummy;
100    
101 nigel 3 do
102     {
103 nigel 63 const uschar *tcode = code + 1 + LINK_SIZE;
104 nigel 3 BOOL try_next = TRUE;
105    
106     while (try_next)
107     {
108 nigel 23 /* If a branch starts with a bracket or a positive lookahead assertion,
109     recurse to set bits from within them. That's all for this branch. */
110    
111 nigel 3 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
112     {
113 nigel 63 if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
114 nigel 25 return FALSE;
115 nigel 53 try_next = FALSE;
116 nigel 3 }
117    
118     else switch(*tcode)
119     {
120     default:
121     return FALSE;
122    
123 nigel 63 /* Skip over callout */
124    
125     case OP_CALLOUT:
126     tcode += 2;
127     break;
128    
129 nigel 53 /* Skip over extended extraction bracket number */
130    
131     case OP_BRANUMBER:
132     tcode += 3;
133     break;
134    
135 nigel 23 /* Skip over lookbehind and negative lookahead assertions */
136    
137     case OP_ASSERT_NOT:
138     case OP_ASSERTBACK:
139     case OP_ASSERTBACK_NOT:
140 nigel 63 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
141     tcode += 1+LINK_SIZE;
142 nigel 23 break;
143    
144     /* Skip over an option setting, changing the caseless flag */
145    
146     case OP_OPT:
147     caseless = (tcode[1] & PCRE_CASELESS) != 0;
148     tcode += 2;
149     break;
150    
151 nigel 3 /* BRAZERO does the bracket, but carries on. */
152    
153     case OP_BRAZERO:
154     case OP_BRAMINZERO:
155 nigel 63 if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
156 nigel 25 return FALSE;
157 nigel 27 dummy = 1;
158 nigel 63 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
159     tcode += 1+LINK_SIZE;
160 nigel 3 break;
161    
162     /* Single-char * or ? sets the bit and tries the next item */
163    
164     case OP_STAR:
165     case OP_MINSTAR:
166     case OP_QUERY:
167     case OP_MINQUERY:
168 nigel 25 set_bit(start_bits, tcode[1], caseless, cd);
169 nigel 3 tcode += 2;
170 nigel 63 #ifdef SUPPORT_UTF8
171     if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
172     #endif
173 nigel 3 break;
174    
175     /* Single-char upto sets the bit and tries the next */
176    
177     case OP_UPTO:
178     case OP_MINUPTO:
179 nigel 25 set_bit(start_bits, tcode[3], caseless, cd);
180 nigel 3 tcode += 4;
181 nigel 63 #ifdef SUPPORT_UTF8
182     if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
183     #endif
184 nigel 3 break;
185    
186     /* At least one single char sets the bit and stops */
187    
188     case OP_EXACT: /* Fall through */
189     tcode++;
190    
191     case OP_CHARS: /* Fall through */
192     tcode++;
193    
194     case OP_PLUS:
195     case OP_MINPLUS:
196 nigel 25 set_bit(start_bits, tcode[1], caseless, cd);
197 nigel 53 try_next = FALSE;
198 nigel 3 break;
199    
200     /* Single character type sets the bits and stops */
201    
202     case OP_NOT_DIGIT:
203 nigel 25 for (c = 0; c < 32; c++)
204     start_bits[c] |= ~cd->cbits[c+cbit_digit];
205 nigel 53 try_next = FALSE;
206 nigel 3 break;
207    
208     case OP_DIGIT:
209 nigel 25 for (c = 0; c < 32; c++)
210     start_bits[c] |= cd->cbits[c+cbit_digit];
211 nigel 53 try_next = FALSE;
212 nigel 3 break;
213    
214     case OP_NOT_WHITESPACE:
215 nigel 25 for (c = 0; c < 32; c++)
216     start_bits[c] |= ~cd->cbits[c+cbit_space];
217 nigel 53 try_next = FALSE;
218 nigel 3 break;
219    
220     case OP_WHITESPACE:
221 nigel 25 for (c = 0; c < 32; c++)
222     start_bits[c] |= cd->cbits[c+cbit_space];
223 nigel 53 try_next = FALSE;
224 nigel 3 break;
225    
226     case OP_NOT_WORDCHAR:
227     for (c = 0; c < 32; c++)
228 nigel 43 start_bits[c] |= ~cd->cbits[c+cbit_word];
229 nigel 53 try_next = FALSE;
230 nigel 3 break;
231    
232     case OP_WORDCHAR:
233     for (c = 0; c < 32; c++)
234 nigel 43 start_bits[c] |= cd->cbits[c+cbit_word];
235 nigel 53 try_next = FALSE;
236 nigel 3 break;
237    
238     /* One or more character type fudges the pointer and restarts, knowing
239     it will hit a single character type and stop there. */
240    
241     case OP_TYPEPLUS:
242     case OP_TYPEMINPLUS:
243     tcode++;
244     break;
245    
246     case OP_TYPEEXACT:
247     tcode += 3;
248     break;
249    
250     /* Zero or more repeats of character types set the bits and then
251     try again. */
252    
253     case OP_TYPEUPTO:
254     case OP_TYPEMINUPTO:
255     tcode += 2; /* Fall through */
256    
257     case OP_TYPESTAR:
258     case OP_TYPEMINSTAR:
259     case OP_TYPEQUERY:
260     case OP_TYPEMINQUERY:
261     switch(tcode[1])
262     {
263 nigel 73 case OP_ANY:
264     return FALSE;
265    
266 nigel 3 case OP_NOT_DIGIT:
267 nigel 25 for (c = 0; c < 32; c++)
268     start_bits[c] |= ~cd->cbits[c+cbit_digit];
269 nigel 3 break;
270    
271     case OP_DIGIT:
272 nigel 25 for (c = 0; c < 32; c++)
273     start_bits[c] |= cd->cbits[c+cbit_digit];
274 nigel 3 break;
275    
276     case OP_NOT_WHITESPACE:
277 nigel 25 for (c = 0; c < 32; c++)
278     start_bits[c] |= ~cd->cbits[c+cbit_space];
279 nigel 3 break;
280    
281     case OP_WHITESPACE:
282 nigel 25 for (c = 0; c < 32; c++)
283     start_bits[c] |= cd->cbits[c+cbit_space];
284 nigel 3 break;
285    
286     case OP_NOT_WORDCHAR:
287     for (c = 0; c < 32; c++)
288 nigel 43 start_bits[c] |= ~cd->cbits[c+cbit_word];
289 nigel 3 break;
290    
291     case OP_WORDCHAR:
292     for (c = 0; c < 32; c++)
293 nigel 43 start_bits[c] |= cd->cbits[c+cbit_word];
294 nigel 3 break;
295     }
296    
297     tcode += 2;
298     break;
299    
300 nigel 63 /* Character class where all the information is in a bit map: set the
301     bits and either carry on or not, according to the repeat count. If it was
302     a negative class, and we are operating with UTF-8 characters, any byte
303 nigel 71 with a value >= 0xc4 is a potentially valid starter because it starts a
304     character with a value > 255. */
305 nigel 3
306 nigel 63 case OP_NCLASS:
307 nigel 71 if (utf8)
308     {
309     start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
310     memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
311     }
312 nigel 63 /* Fall through */
313    
314 nigel 3 case OP_CLASS:
315     {
316     tcode++;
317 nigel 71
318     /* In UTF-8 mode, the bits in a bit map correspond to character
319     values, not to byte values. However, the bit map we are constructing is
320     for byte values. So we have to do a conversion for characters whose
321     value is > 127. In fact, there are only two possible starting bytes for
322     characters in the range 128 - 255. */
323    
324     if (utf8)
325     {
326     for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
327     for (c = 128; c < 256; c++)
328     {
329     if ((tcode[c/8] && (1 << (c&7))) != 0)
330     {
331     int d = (c >> 6) | 0xc0; /* Set bit for this starter */
332     start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
333     c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
334     }
335     }
336     }
337    
338     /* In non-UTF-8 mode, the two bit maps are completely compatible. */
339    
340     else
341     {
342     for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
343     }
344    
345     /* Advance past the bit map, and act on what follows */
346    
347 nigel 3 tcode += 32;
348     switch (*tcode)
349     {
350     case OP_CRSTAR:
351     case OP_CRMINSTAR:
352     case OP_CRQUERY:
353     case OP_CRMINQUERY:
354     tcode++;
355     break;
356    
357     case OP_CRRANGE:
358     case OP_CRMINRANGE:
359 nigel 53 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
360     else try_next = FALSE;
361 nigel 3 break;
362 nigel 53
363     default:
364     try_next = FALSE;
365     break;
366 nigel 3 }
367     }
368 nigel 63 break; /* End of bitmap class handling */
369 nigel 3
370     } /* End of switch */
371     } /* End of try_next loop */
372    
373 nigel 63 code += GET(code, 1); /* Advance to next branch */
374 nigel 3 }
375     while (*code == OP_ALT);
376     return TRUE;
377     }
378    
379    
380    
381     /*************************************************
382     * Study a compiled expression *
383     *************************************************/
384    
385     /* This function is handed a compiled expression that it must study to produce
386     information that will speed up the matching. It returns a pcre_extra block
387     which then gets handed back to pcre_exec().
388    
389     Arguments:
390     re points to the compiled expression
391     options contains option bits
392     errorptr points to where to place error messages;
393     set NULL unless error
394    
395 nigel 63 Returns: pointer to a pcre_extra block, with study_data filled in and the
396     appropriate flag set;
397 nigel 3 NULL on error or if no optimization possible
398     */
399    
400 nigel 73 EXPORT pcre_extra *
401 nigel 7 pcre_study(const pcre *external_re, int options, const char **errorptr)
402 nigel 3 {
403     uschar start_bits[32];
404 nigel 63 pcre_extra *extra;
405     pcre_study_data *study;
406 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
407 nigel 63 uschar *code = (uschar *)re + sizeof(real_pcre) +
408     (re->name_count * re->name_entry_size);
409 nigel 25 compile_data compile_block;
410 nigel 3
411     *errorptr = NULL;
412    
413     if (re == NULL || re->magic_number != MAGIC_NUMBER)
414     {
415     *errorptr = "argument is not a compiled regular expression";
416     return NULL;
417     }
418    
419     if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
420     {
421     *errorptr = "unknown or incorrect option bit(s) set";
422     return NULL;
423     }
424    
425 nigel 63 /* For an anchored pattern, or an unanchored pattern that has a first char, or
426     a multiline pattern that matches only at "line starts", no further processing
427     at present. */
428 nigel 3
429     if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
430     return NULL;
431    
432 nigel 25 /* Set the character tables in the block which is passed around */
433    
434     compile_block.lcc = re->tables + lcc_offset;
435     compile_block.fcc = re->tables + fcc_offset;
436     compile_block.cbits = re->tables + cbits_offset;
437     compile_block.ctypes = re->tables + ctypes_offset;
438    
439 nigel 3 /* See if we can find a fixed set of initial characters for the pattern. */
440    
441     memset(start_bits, 0, 32 * sizeof(uschar));
442 nigel 63 if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
443     (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
444 nigel 3
445 nigel 63 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
446     the latter, which is pointed to by the former, which may also get additional
447     data set later by the calling program. At the moment, the size of
448     pcre_study_data is fixed. We nevertheless save it in a field for returning via
449     the pcre_fullinfo() function so that if it becomes variable in the future, we
450     don't have to change that code. */
451 nigel 3
452 nigel 63 extra = (pcre_extra *)(pcre_malloc)
453     (sizeof(pcre_extra) + sizeof(pcre_study_data));
454 nigel 3
455     if (extra == NULL)
456     {
457     *errorptr = "failed to get memory";
458     return NULL;
459     }
460    
461 nigel 63 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
462     extra->flags = PCRE_EXTRA_STUDY_DATA;
463     extra->study_data = study;
464 nigel 3
465 nigel 63 study->size = sizeof(pcre_study_data);
466     study->options = PCRE_STUDY_MAPPED;
467     memcpy(study->start_bits, start_bits, sizeof(start_bits));
468    
469     return extra;
470 nigel 3 }
471    
472     /* End of study.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12