/[pcre]/code/trunk/study.c
ViewVC logotype

Contents of /code/trunk/study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 63 - (hide annotations) (download)
Sat Feb 24 21:40:03 2007 UTC (7 years, 6 months ago) by nigel
File MIME type: text/plain
File size: 12544 byte(s)
Load pcre-4.0 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 63 Copyright (c) 1997-2002 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28 nigel 29
29     4. If PCRE is embedded in any software that is released under the GNU
30     General Purpose Licence (GPL), then the terms of that licence shall
31     supersede any condition above with which it is incompatible.
32 nigel 3 -----------------------------------------------------------------------------
33     */
34    
35    
36     /* Include the internals header, which itself includes Standard C headers plus
37     the external pcre header. */
38    
39     #include "internal.h"
40    
41    
42    
43     /*************************************************
44 nigel 23 * Set a bit and maybe its alternate case *
45     *************************************************/
46    
47     /* Given a character, set its bit in the table, and also the bit for the other
48     version of a letter if we are caseless.
49    
50     Arguments:
51     start_bits points to the bit map
52     c is the character
53     caseless the caseless flag
54 nigel 25 cd the block with char table pointers
55 nigel 23
56     Returns: nothing
57     */
58    
59     static void
60 nigel 25 set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
61 nigel 23 {
62     start_bits[c/8] |= (1 << (c&7));
63 nigel 25 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
64     start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
65 nigel 23 }
66    
67    
68    
69     /*************************************************
70 nigel 3 * Create bitmap of starting chars *
71     *************************************************/
72    
73     /* This function scans a compiled unanchored expression and attempts to build a
74     bitmap of the set of initial characters. If it can't, it returns FALSE. As time
75     goes by, we may be able to get more clever at doing this.
76    
77     Arguments:
78     code points to an expression
79     start_bits points to a 32-byte table, initialized to 0
80 nigel 23 caseless the current state of the caseless flag
81 nigel 63 utf8 TRUE if in UTF-8 mode
82 nigel 25 cd the block with char table pointers
83 nigel 3
84     Returns: TRUE if table built, FALSE otherwise
85     */
86    
87     static BOOL
88 nigel 25 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
89 nigel 63 BOOL utf8, compile_data *cd)
90 nigel 3 {
91     register int c;
92    
93 nigel 27 /* This next statement and the later reference to dummy are here in order to
94     trick the optimizer of the IBM C compiler for OS/2 into generating correct
95     code. Apparently IBM isn't going to fix the problem, and we would rather not
96     disable optimization (in this module it actually makes a big difference, and
97     the pcre module can use all the optimization it can get). */
98    
99     volatile int dummy;
100    
101 nigel 3 do
102     {
103 nigel 63 const uschar *tcode = code + 1 + LINK_SIZE;
104 nigel 3 BOOL try_next = TRUE;
105    
106     while (try_next)
107     {
108 nigel 23 /* If a branch starts with a bracket or a positive lookahead assertion,
109     recurse to set bits from within them. That's all for this branch. */
110    
111 nigel 3 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
112     {
113 nigel 63 if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
114 nigel 25 return FALSE;
115 nigel 53 try_next = FALSE;
116 nigel 3 }
117    
118     else switch(*tcode)
119     {
120     default:
121     return FALSE;
122    
123 nigel 63 /* Skip over callout */
124    
125     case OP_CALLOUT:
126     tcode += 2;
127     break;
128    
129 nigel 53 /* Skip over extended extraction bracket number */
130    
131     case OP_BRANUMBER:
132     tcode += 3;
133     break;
134    
135 nigel 23 /* Skip over lookbehind and negative lookahead assertions */
136    
137     case OP_ASSERT_NOT:
138     case OP_ASSERTBACK:
139     case OP_ASSERTBACK_NOT:
140 nigel 63 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
141     tcode += 1+LINK_SIZE;
142 nigel 23 break;
143    
144     /* Skip over an option setting, changing the caseless flag */
145    
146     case OP_OPT:
147     caseless = (tcode[1] & PCRE_CASELESS) != 0;
148     tcode += 2;
149     break;
150    
151 nigel 3 /* BRAZERO does the bracket, but carries on. */
152    
153     case OP_BRAZERO:
154     case OP_BRAMINZERO:
155 nigel 63 if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
156 nigel 25 return FALSE;
157 nigel 27 dummy = 1;
158 nigel 63 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
159     tcode += 1+LINK_SIZE;
160 nigel 3 break;
161    
162     /* Single-char * or ? sets the bit and tries the next item */
163    
164     case OP_STAR:
165     case OP_MINSTAR:
166     case OP_QUERY:
167     case OP_MINQUERY:
168 nigel 25 set_bit(start_bits, tcode[1], caseless, cd);
169 nigel 3 tcode += 2;
170 nigel 63 #ifdef SUPPORT_UTF8
171     if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
172     #endif
173 nigel 3 break;
174    
175     /* Single-char upto sets the bit and tries the next */
176    
177     case OP_UPTO:
178     case OP_MINUPTO:
179 nigel 25 set_bit(start_bits, tcode[3], caseless, cd);
180 nigel 3 tcode += 4;
181 nigel 63 #ifdef SUPPORT_UTF8
182     if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
183     #endif
184 nigel 3 break;
185    
186     /* At least one single char sets the bit and stops */
187    
188     case OP_EXACT: /* Fall through */
189     tcode++;
190    
191     case OP_CHARS: /* Fall through */
192     tcode++;
193    
194     case OP_PLUS:
195     case OP_MINPLUS:
196 nigel 25 set_bit(start_bits, tcode[1], caseless, cd);
197 nigel 53 try_next = FALSE;
198 nigel 3 break;
199    
200     /* Single character type sets the bits and stops */
201    
202     case OP_NOT_DIGIT:
203 nigel 25 for (c = 0; c < 32; c++)
204     start_bits[c] |= ~cd->cbits[c+cbit_digit];
205 nigel 53 try_next = FALSE;
206 nigel 3 break;
207    
208     case OP_DIGIT:
209 nigel 25 for (c = 0; c < 32; c++)
210     start_bits[c] |= cd->cbits[c+cbit_digit];
211 nigel 53 try_next = FALSE;
212 nigel 3 break;
213    
214     case OP_NOT_WHITESPACE:
215 nigel 25 for (c = 0; c < 32; c++)
216     start_bits[c] |= ~cd->cbits[c+cbit_space];
217 nigel 53 try_next = FALSE;
218 nigel 3 break;
219    
220     case OP_WHITESPACE:
221 nigel 25 for (c = 0; c < 32; c++)
222     start_bits[c] |= cd->cbits[c+cbit_space];
223 nigel 53 try_next = FALSE;
224 nigel 3 break;
225    
226     case OP_NOT_WORDCHAR:
227     for (c = 0; c < 32; c++)
228 nigel 43 start_bits[c] |= ~cd->cbits[c+cbit_word];
229 nigel 53 try_next = FALSE;
230 nigel 3 break;
231    
232     case OP_WORDCHAR:
233     for (c = 0; c < 32; c++)
234 nigel 43 start_bits[c] |= cd->cbits[c+cbit_word];
235 nigel 53 try_next = FALSE;
236 nigel 3 break;
237    
238     /* One or more character type fudges the pointer and restarts, knowing
239     it will hit a single character type and stop there. */
240    
241     case OP_TYPEPLUS:
242     case OP_TYPEMINPLUS:
243     tcode++;
244     break;
245    
246     case OP_TYPEEXACT:
247     tcode += 3;
248     break;
249    
250     /* Zero or more repeats of character types set the bits and then
251     try again. */
252    
253     case OP_TYPEUPTO:
254     case OP_TYPEMINUPTO:
255     tcode += 2; /* Fall through */
256    
257     case OP_TYPESTAR:
258     case OP_TYPEMINSTAR:
259     case OP_TYPEQUERY:
260     case OP_TYPEMINQUERY:
261     switch(tcode[1])
262     {
263     case OP_NOT_DIGIT:
264 nigel 25 for (c = 0; c < 32; c++)
265     start_bits[c] |= ~cd->cbits[c+cbit_digit];
266 nigel 3 break;
267    
268     case OP_DIGIT:
269 nigel 25 for (c = 0; c < 32; c++)
270     start_bits[c] |= cd->cbits[c+cbit_digit];
271 nigel 3 break;
272    
273     case OP_NOT_WHITESPACE:
274 nigel 25 for (c = 0; c < 32; c++)
275     start_bits[c] |= ~cd->cbits[c+cbit_space];
276 nigel 3 break;
277    
278     case OP_WHITESPACE:
279 nigel 25 for (c = 0; c < 32; c++)
280     start_bits[c] |= cd->cbits[c+cbit_space];
281 nigel 3 break;
282    
283     case OP_NOT_WORDCHAR:
284     for (c = 0; c < 32; c++)
285 nigel 43 start_bits[c] |= ~cd->cbits[c+cbit_word];
286 nigel 3 break;
287    
288     case OP_WORDCHAR:
289     for (c = 0; c < 32; c++)
290 nigel 43 start_bits[c] |= cd->cbits[c+cbit_word];
291 nigel 3 break;
292     }
293    
294     tcode += 2;
295     break;
296    
297 nigel 63 /* Character class where all the information is in a bit map: set the
298     bits and either carry on or not, according to the repeat count. If it was
299     a negative class, and we are operating with UTF-8 characters, any byte
300     with the top-bit set is a potentially valid starter because it may start
301     a character with a value > 255. (This is sub-optimal in that the
302     character may be in the range 128-255, and those characters might be
303     unwanted, but that's as far as we go for the moment.) */
304 nigel 3
305 nigel 63 case OP_NCLASS:
306     if (utf8) memset(start_bits+16, 0xff, 16);
307     /* Fall through */
308    
309 nigel 3 case OP_CLASS:
310     {
311     tcode++;
312     for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
313     tcode += 32;
314     switch (*tcode)
315     {
316     case OP_CRSTAR:
317     case OP_CRMINSTAR:
318     case OP_CRQUERY:
319     case OP_CRMINQUERY:
320     tcode++;
321     break;
322    
323     case OP_CRRANGE:
324     case OP_CRMINRANGE:
325 nigel 53 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
326     else try_next = FALSE;
327 nigel 3 break;
328 nigel 53
329     default:
330     try_next = FALSE;
331     break;
332 nigel 3 }
333     }
334 nigel 63 break; /* End of bitmap class handling */
335 nigel 3
336     } /* End of switch */
337     } /* End of try_next loop */
338    
339 nigel 63 code += GET(code, 1); /* Advance to next branch */
340 nigel 3 }
341     while (*code == OP_ALT);
342     return TRUE;
343     }
344    
345    
346    
347     /*************************************************
348     * Study a compiled expression *
349     *************************************************/
350    
351     /* This function is handed a compiled expression that it must study to produce
352     information that will speed up the matching. It returns a pcre_extra block
353     which then gets handed back to pcre_exec().
354    
355     Arguments:
356     re points to the compiled expression
357     options contains option bits
358     errorptr points to where to place error messages;
359     set NULL unless error
360    
361 nigel 63 Returns: pointer to a pcre_extra block, with study_data filled in and the
362     appropriate flag set;
363 nigel 3 NULL on error or if no optimization possible
364     */
365    
366     pcre_extra *
367 nigel 7 pcre_study(const pcre *external_re, int options, const char **errorptr)
368 nigel 3 {
369     uschar start_bits[32];
370 nigel 63 pcre_extra *extra;
371     pcre_study_data *study;
372 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
373 nigel 63 uschar *code = (uschar *)re + sizeof(real_pcre) +
374     (re->name_count * re->name_entry_size);
375 nigel 25 compile_data compile_block;
376 nigel 3
377     *errorptr = NULL;
378    
379     if (re == NULL || re->magic_number != MAGIC_NUMBER)
380     {
381     *errorptr = "argument is not a compiled regular expression";
382     return NULL;
383     }
384    
385     if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
386     {
387     *errorptr = "unknown or incorrect option bit(s) set";
388     return NULL;
389     }
390    
391 nigel 63 /* For an anchored pattern, or an unanchored pattern that has a first char, or
392     a multiline pattern that matches only at "line starts", no further processing
393     at present. */
394 nigel 3
395     if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
396     return NULL;
397    
398 nigel 25 /* Set the character tables in the block which is passed around */
399    
400     compile_block.lcc = re->tables + lcc_offset;
401     compile_block.fcc = re->tables + fcc_offset;
402     compile_block.cbits = re->tables + cbits_offset;
403     compile_block.ctypes = re->tables + ctypes_offset;
404    
405 nigel 3 /* See if we can find a fixed set of initial characters for the pattern. */
406    
407     memset(start_bits, 0, 32 * sizeof(uschar));
408 nigel 63 if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
409     (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
410 nigel 3
411 nigel 63 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
412     the latter, which is pointed to by the former, which may also get additional
413     data set later by the calling program. At the moment, the size of
414     pcre_study_data is fixed. We nevertheless save it in a field for returning via
415     the pcre_fullinfo() function so that if it becomes variable in the future, we
416     don't have to change that code. */
417 nigel 3
418 nigel 63 extra = (pcre_extra *)(pcre_malloc)
419     (sizeof(pcre_extra) + sizeof(pcre_study_data));
420 nigel 3
421     if (extra == NULL)
422     {
423     *errorptr = "failed to get memory";
424     return NULL;
425     }
426    
427 nigel 63 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
428     extra->flags = PCRE_EXTRA_STUDY_DATA;
429     extra->study_data = study;
430 nigel 3
431 nigel 63 study->size = sizeof(pcre_study_data);
432     study->options = PCRE_STUDY_MAPPED;
433     memcpy(study->start_bits, start_bits, sizeof(start_bits));
434    
435     return extra;
436 nigel 3 }
437    
438     /* End of study.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12