/[pcre]/code/tags/pcre-3.3/study.c
ViewVC logotype

Contents of /code/tags/pcre-3.3/study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 25 - (hide annotations) (download)
Sat Feb 24 21:38:45 2007 UTC (6 years, 2 months ago) by nigel
Original Path: code/trunk/study.c
File MIME type: text/plain
File size: 10325 byte(s)
Load pcre-2.01 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 15 Copyright (c) 1998 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28     -----------------------------------------------------------------------------
29     */
30    
31    
32     /* Include the internals header, which itself includes Standard C headers plus
33     the external pcre header. */
34    
35     #include "internal.h"
36    
37    
38    
39     /*************************************************
40 nigel 23 * Set a bit and maybe its alternate case *
41     *************************************************/
42    
43     /* Given a character, set its bit in the table, and also the bit for the other
44     version of a letter if we are caseless.
45    
46     Arguments:
47     start_bits points to the bit map
48     c is the character
49     caseless the caseless flag
50 nigel 25 cd the block with char table pointers
51 nigel 23
52     Returns: nothing
53     */
54    
55     static void
56 nigel 25 set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
57 nigel 23 {
58     start_bits[c/8] |= (1 << (c&7));
59 nigel 25 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
60     start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
61 nigel 23 }
62    
63    
64    
65     /*************************************************
66 nigel 3 * Create bitmap of starting chars *
67     *************************************************/
68    
69     /* This function scans a compiled unanchored expression and attempts to build a
70     bitmap of the set of initial characters. If it can't, it returns FALSE. As time
71     goes by, we may be able to get more clever at doing this.
72    
73     Arguments:
74     code points to an expression
75     start_bits points to a 32-byte table, initialized to 0
76 nigel 23 caseless the current state of the caseless flag
77 nigel 25 cd the block with char table pointers
78 nigel 3
79     Returns: TRUE if table built, FALSE otherwise
80     */
81    
82     static BOOL
83 nigel 25 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
84     compile_data *cd)
85 nigel 3 {
86     register int c;
87    
88     do
89     {
90 nigel 7 const uschar *tcode = code + 3;
91 nigel 3 BOOL try_next = TRUE;
92    
93     while (try_next)
94     {
95     try_next = FALSE;
96    
97 nigel 23 /* If a branch starts with a bracket or a positive lookahead assertion,
98     recurse to set bits from within them. That's all for this branch. */
99    
100 nigel 3 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
101     {
102 nigel 25 if (!set_start_bits(tcode, start_bits, caseless, cd))
103     return FALSE;
104 nigel 3 }
105    
106     else switch(*tcode)
107     {
108     default:
109     return FALSE;
110    
111 nigel 23 /* Skip over lookbehind and negative lookahead assertions */
112    
113     case OP_ASSERT_NOT:
114     case OP_ASSERTBACK:
115     case OP_ASSERTBACK_NOT:
116     try_next = TRUE;
117     do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
118     tcode += 3;
119     break;
120    
121     /* Skip over an option setting, changing the caseless flag */
122    
123     case OP_OPT:
124     caseless = (tcode[1] & PCRE_CASELESS) != 0;
125     tcode += 2;
126     try_next = TRUE;
127     break;
128    
129 nigel 3 /* BRAZERO does the bracket, but carries on. */
130    
131     case OP_BRAZERO:
132     case OP_BRAMINZERO:
133 nigel 25 if (!set_start_bits(++tcode, start_bits, caseless, cd))
134     return FALSE;
135 nigel 3 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
136     tcode += 3;
137     try_next = TRUE;
138     break;
139    
140     /* Single-char * or ? sets the bit and tries the next item */
141    
142     case OP_STAR:
143     case OP_MINSTAR:
144     case OP_QUERY:
145     case OP_MINQUERY:
146 nigel 25 set_bit(start_bits, tcode[1], caseless, cd);
147 nigel 3 tcode += 2;
148     try_next = TRUE;
149     break;
150    
151     /* Single-char upto sets the bit and tries the next */
152    
153     case OP_UPTO:
154     case OP_MINUPTO:
155 nigel 25 set_bit(start_bits, tcode[3], caseless, cd);
156 nigel 3 tcode += 4;
157     try_next = TRUE;
158     break;
159    
160     /* At least one single char sets the bit and stops */
161    
162     case OP_EXACT: /* Fall through */
163     tcode++;
164    
165     case OP_CHARS: /* Fall through */
166     tcode++;
167    
168     case OP_PLUS:
169     case OP_MINPLUS:
170 nigel 25 set_bit(start_bits, tcode[1], caseless, cd);
171 nigel 3 break;
172    
173     /* Single character type sets the bits and stops */
174    
175     case OP_NOT_DIGIT:
176 nigel 25 for (c = 0; c < 32; c++)
177     start_bits[c] |= ~cd->cbits[c+cbit_digit];
178 nigel 3 break;
179    
180     case OP_DIGIT:
181 nigel 25 for (c = 0; c < 32; c++)
182     start_bits[c] |= cd->cbits[c+cbit_digit];
183 nigel 3 break;
184    
185     case OP_NOT_WHITESPACE:
186 nigel 25 for (c = 0; c < 32; c++)
187     start_bits[c] |= ~cd->cbits[c+cbit_space];
188 nigel 3 break;
189    
190     case OP_WHITESPACE:
191 nigel 25 for (c = 0; c < 32; c++)
192     start_bits[c] |= cd->cbits[c+cbit_space];
193 nigel 3 break;
194    
195     case OP_NOT_WORDCHAR:
196     for (c = 0; c < 32; c++)
197 nigel 25 start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
198 nigel 3 break;
199    
200     case OP_WORDCHAR:
201     for (c = 0; c < 32; c++)
202 nigel 25 start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
203 nigel 3 break;
204    
205     /* One or more character type fudges the pointer and restarts, knowing
206     it will hit a single character type and stop there. */
207    
208     case OP_TYPEPLUS:
209     case OP_TYPEMINPLUS:
210     tcode++;
211     try_next = TRUE;
212     break;
213    
214     case OP_TYPEEXACT:
215     tcode += 3;
216     try_next = TRUE;
217     break;
218    
219     /* Zero or more repeats of character types set the bits and then
220     try again. */
221    
222     case OP_TYPEUPTO:
223     case OP_TYPEMINUPTO:
224     tcode += 2; /* Fall through */
225    
226     case OP_TYPESTAR:
227     case OP_TYPEMINSTAR:
228     case OP_TYPEQUERY:
229     case OP_TYPEMINQUERY:
230     switch(tcode[1])
231     {
232     case OP_NOT_DIGIT:
233 nigel 25 for (c = 0; c < 32; c++)
234     start_bits[c] |= ~cd->cbits[c+cbit_digit];
235 nigel 3 break;
236    
237     case OP_DIGIT:
238 nigel 25 for (c = 0; c < 32; c++)
239     start_bits[c] |= cd->cbits[c+cbit_digit];
240 nigel 3 break;
241    
242     case OP_NOT_WHITESPACE:
243 nigel 25 for (c = 0; c < 32; c++)
244     start_bits[c] |= ~cd->cbits[c+cbit_space];
245 nigel 3 break;
246    
247     case OP_WHITESPACE:
248 nigel 25 for (c = 0; c < 32; c++)
249     start_bits[c] |= cd->cbits[c+cbit_space];
250 nigel 3 break;
251    
252     case OP_NOT_WORDCHAR:
253     for (c = 0; c < 32; c++)
254 nigel 25 start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
255 nigel 3 break;
256    
257     case OP_WORDCHAR:
258     for (c = 0; c < 32; c++)
259 nigel 25 start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
260 nigel 3 break;
261     }
262    
263     tcode += 2;
264     try_next = TRUE;
265     break;
266    
267     /* Character class: set the bits and either carry on or not,
268     according to the repeat count. */
269    
270     case OP_CLASS:
271     {
272     tcode++;
273     for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
274     tcode += 32;
275     switch (*tcode)
276     {
277     case OP_CRSTAR:
278     case OP_CRMINSTAR:
279     case OP_CRQUERY:
280     case OP_CRMINQUERY:
281     tcode++;
282     try_next = TRUE;
283     break;
284    
285     case OP_CRRANGE:
286     case OP_CRMINRANGE:
287     if (((tcode[1] << 8) + tcode[2]) == 0)
288     {
289     tcode += 5;
290     try_next = TRUE;
291     }
292     break;
293     }
294     }
295     break; /* End of class handling */
296    
297     } /* End of switch */
298     } /* End of try_next loop */
299    
300     code += (code[1] << 8) + code[2]; /* Advance to next branch */
301     }
302     while (*code == OP_ALT);
303     return TRUE;
304     }
305    
306    
307    
308     /*************************************************
309     * Study a compiled expression *
310     *************************************************/
311    
312     /* This function is handed a compiled expression that it must study to produce
313     information that will speed up the matching. It returns a pcre_extra block
314     which then gets handed back to pcre_exec().
315    
316     Arguments:
317     re points to the compiled expression
318     options contains option bits
319     errorptr points to where to place error messages;
320     set NULL unless error
321    
322     Returns: pointer to a pcre_extra block,
323     NULL on error or if no optimization possible
324     */
325    
326     pcre_extra *
327 nigel 7 pcre_study(const pcre *external_re, int options, const char **errorptr)
328 nigel 3 {
329     uschar start_bits[32];
330     real_pcre_extra *extra;
331 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
332 nigel 25 compile_data compile_block;
333 nigel 3
334     *errorptr = NULL;
335    
336     if (re == NULL || re->magic_number != MAGIC_NUMBER)
337     {
338     *errorptr = "argument is not a compiled regular expression";
339     return NULL;
340     }
341    
342     if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
343     {
344     *errorptr = "unknown or incorrect option bit(s) set";
345     return NULL;
346     }
347    
348     /* For an anchored pattern, or an unchored pattern that has a first char, or a
349     multiline pattern that matches only at "line starts", no further processing at
350     present. */
351    
352     if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
353     return NULL;
354    
355 nigel 25 /* Set the character tables in the block which is passed around */
356    
357     compile_block.lcc = re->tables + lcc_offset;
358     compile_block.fcc = re->tables + fcc_offset;
359     compile_block.cbits = re->tables + cbits_offset;
360     compile_block.ctypes = re->tables + ctypes_offset;
361    
362 nigel 3 /* See if we can find a fixed set of initial characters for the pattern. */
363    
364     memset(start_bits, 0, 32 * sizeof(uschar));
365 nigel 25 if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
366     &compile_block)) return NULL;
367 nigel 3
368     /* Get an "extra" block and put the information therein. */
369    
370     extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
371    
372     if (extra == NULL)
373     {
374     *errorptr = "failed to get memory";
375     return NULL;
376     }
377    
378 nigel 23 extra->options = PCRE_STUDY_MAPPED;
379 nigel 3 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
380    
381     return (pcre_extra *)extra;
382     }
383    
384     /* End of study.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12