/[pcre]/code/tags/pcre-3.5/study.c
ViewVC logotype

Contents of /code/tags/pcre-3.5/study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 54 - (hide annotations) (download)
Sat Feb 24 21:39:44 2007 UTC (7 years, 6 months ago) by nigel
File MIME type: text/plain
File size: 10978 byte(s)
Tag code/trunk as code/tags/pcre-3.5.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 53 Copyright (c) 1997-2001 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28 nigel 29
29     4. If PCRE is embedded in any software that is released under the GNU
30     General Purpose Licence (GPL), then the terms of that licence shall
31     supersede any condition above with which it is incompatible.
32 nigel 3 -----------------------------------------------------------------------------
33     */
34    
35    
36     /* Include the internals header, which itself includes Standard C headers plus
37     the external pcre header. */
38    
39     #include "internal.h"
40    
41    
42    
43     /*************************************************
44 nigel 23 * Set a bit and maybe its alternate case *
45     *************************************************/
46    
47     /* Given a character, set its bit in the table, and also the bit for the other
48     version of a letter if we are caseless.
49    
50     Arguments:
51     start_bits points to the bit map
52     c is the character
53     caseless the caseless flag
54 nigel 25 cd the block with char table pointers
55 nigel 23
56     Returns: nothing
57     */
58    
59     static void
60 nigel 25 set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
61 nigel 23 {
62     start_bits[c/8] |= (1 << (c&7));
63 nigel 25 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
64     start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
65 nigel 23 }
66    
67    
68    
69     /*************************************************
70 nigel 3 * Create bitmap of starting chars *
71     *************************************************/
72    
73     /* This function scans a compiled unanchored expression and attempts to build a
74     bitmap of the set of initial characters. If it can't, it returns FALSE. As time
75     goes by, we may be able to get more clever at doing this.
76    
77     Arguments:
78     code points to an expression
79     start_bits points to a 32-byte table, initialized to 0
80 nigel 23 caseless the current state of the caseless flag
81 nigel 25 cd the block with char table pointers
82 nigel 3
83     Returns: TRUE if table built, FALSE otherwise
84     */
85    
86     static BOOL
87 nigel 25 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
88     compile_data *cd)
89 nigel 3 {
90     register int c;
91    
92 nigel 27 /* This next statement and the later reference to dummy are here in order to
93     trick the optimizer of the IBM C compiler for OS/2 into generating correct
94     code. Apparently IBM isn't going to fix the problem, and we would rather not
95     disable optimization (in this module it actually makes a big difference, and
96     the pcre module can use all the optimization it can get). */
97    
98     volatile int dummy;
99    
100 nigel 3 do
101     {
102 nigel 7 const uschar *tcode = code + 3;
103 nigel 3 BOOL try_next = TRUE;
104    
105     while (try_next)
106     {
107 nigel 23 /* If a branch starts with a bracket or a positive lookahead assertion,
108     recurse to set bits from within them. That's all for this branch. */
109    
110 nigel 3 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
111     {
112 nigel 25 if (!set_start_bits(tcode, start_bits, caseless, cd))
113     return FALSE;
114 nigel 53 try_next = FALSE;
115 nigel 3 }
116    
117     else switch(*tcode)
118     {
119     default:
120     return FALSE;
121    
122 nigel 53 /* Skip over extended extraction bracket number */
123    
124     case OP_BRANUMBER:
125     tcode += 3;
126     break;
127    
128 nigel 23 /* Skip over lookbehind and negative lookahead assertions */
129    
130     case OP_ASSERT_NOT:
131     case OP_ASSERTBACK:
132     case OP_ASSERTBACK_NOT:
133     do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
134     tcode += 3;
135     break;
136    
137     /* Skip over an option setting, changing the caseless flag */
138    
139     case OP_OPT:
140     caseless = (tcode[1] & PCRE_CASELESS) != 0;
141     tcode += 2;
142     break;
143    
144 nigel 3 /* BRAZERO does the bracket, but carries on. */
145    
146     case OP_BRAZERO:
147     case OP_BRAMINZERO:
148 nigel 25 if (!set_start_bits(++tcode, start_bits, caseless, cd))
149     return FALSE;
150 nigel 27 dummy = 1;
151 nigel 3 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
152     tcode += 3;
153     break;
154    
155     /* Single-char * or ? sets the bit and tries the next item */
156    
157     case OP_STAR:
158     case OP_MINSTAR:
159     case OP_QUERY:
160     case OP_MINQUERY:
161 nigel 25 set_bit(start_bits, tcode[1], caseless, cd);
162 nigel 3 tcode += 2;
163     break;
164    
165     /* Single-char upto sets the bit and tries the next */
166    
167     case OP_UPTO:
168     case OP_MINUPTO:
169 nigel 25 set_bit(start_bits, tcode[3], caseless, cd);
170 nigel 3 tcode += 4;
171     break;
172    
173     /* At least one single char sets the bit and stops */
174    
175     case OP_EXACT: /* Fall through */
176     tcode++;
177    
178     case OP_CHARS: /* Fall through */
179     tcode++;
180    
181     case OP_PLUS:
182     case OP_MINPLUS:
183 nigel 25 set_bit(start_bits, tcode[1], caseless, cd);
184 nigel 53 try_next = FALSE;
185 nigel 3 break;
186    
187     /* Single character type sets the bits and stops */
188    
189     case OP_NOT_DIGIT:
190 nigel 25 for (c = 0; c < 32; c++)
191     start_bits[c] |= ~cd->cbits[c+cbit_digit];
192 nigel 53 try_next = FALSE;
193 nigel 3 break;
194    
195     case OP_DIGIT:
196 nigel 25 for (c = 0; c < 32; c++)
197     start_bits[c] |= cd->cbits[c+cbit_digit];
198 nigel 53 try_next = FALSE;
199 nigel 3 break;
200    
201     case OP_NOT_WHITESPACE:
202 nigel 25 for (c = 0; c < 32; c++)
203     start_bits[c] |= ~cd->cbits[c+cbit_space];
204 nigel 53 try_next = FALSE;
205 nigel 3 break;
206    
207     case OP_WHITESPACE:
208 nigel 25 for (c = 0; c < 32; c++)
209     start_bits[c] |= cd->cbits[c+cbit_space];
210 nigel 53 try_next = FALSE;
211 nigel 3 break;
212    
213     case OP_NOT_WORDCHAR:
214     for (c = 0; c < 32; c++)
215 nigel 43 start_bits[c] |= ~cd->cbits[c+cbit_word];
216 nigel 53 try_next = FALSE;
217 nigel 3 break;
218    
219     case OP_WORDCHAR:
220     for (c = 0; c < 32; c++)
221 nigel 43 start_bits[c] |= cd->cbits[c+cbit_word];
222 nigel 53 try_next = FALSE;
223 nigel 3 break;
224    
225     /* One or more character type fudges the pointer and restarts, knowing
226     it will hit a single character type and stop there. */
227    
228     case OP_TYPEPLUS:
229     case OP_TYPEMINPLUS:
230     tcode++;
231     break;
232    
233     case OP_TYPEEXACT:
234     tcode += 3;
235     break;
236    
237     /* Zero or more repeats of character types set the bits and then
238     try again. */
239    
240     case OP_TYPEUPTO:
241     case OP_TYPEMINUPTO:
242     tcode += 2; /* Fall through */
243    
244     case OP_TYPESTAR:
245     case OP_TYPEMINSTAR:
246     case OP_TYPEQUERY:
247     case OP_TYPEMINQUERY:
248     switch(tcode[1])
249     {
250     case OP_NOT_DIGIT:
251 nigel 25 for (c = 0; c < 32; c++)
252     start_bits[c] |= ~cd->cbits[c+cbit_digit];
253 nigel 3 break;
254    
255     case OP_DIGIT:
256 nigel 25 for (c = 0; c < 32; c++)
257     start_bits[c] |= cd->cbits[c+cbit_digit];
258 nigel 3 break;
259    
260     case OP_NOT_WHITESPACE:
261 nigel 25 for (c = 0; c < 32; c++)
262     start_bits[c] |= ~cd->cbits[c+cbit_space];
263 nigel 3 break;
264    
265     case OP_WHITESPACE:
266 nigel 25 for (c = 0; c < 32; c++)
267     start_bits[c] |= cd->cbits[c+cbit_space];
268 nigel 3 break;
269    
270     case OP_NOT_WORDCHAR:
271     for (c = 0; c < 32; c++)
272 nigel 43 start_bits[c] |= ~cd->cbits[c+cbit_word];
273 nigel 3 break;
274    
275     case OP_WORDCHAR:
276     for (c = 0; c < 32; c++)
277 nigel 43 start_bits[c] |= cd->cbits[c+cbit_word];
278 nigel 3 break;
279     }
280    
281     tcode += 2;
282     break;
283    
284     /* Character class: set the bits and either carry on or not,
285     according to the repeat count. */
286    
287     case OP_CLASS:
288     {
289     tcode++;
290     for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
291     tcode += 32;
292     switch (*tcode)
293     {
294     case OP_CRSTAR:
295     case OP_CRMINSTAR:
296     case OP_CRQUERY:
297     case OP_CRMINQUERY:
298     tcode++;
299     break;
300    
301     case OP_CRRANGE:
302     case OP_CRMINRANGE:
303 nigel 53 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
304     else try_next = FALSE;
305 nigel 3 break;
306 nigel 53
307     default:
308     try_next = FALSE;
309     break;
310 nigel 3 }
311     }
312     break; /* End of class handling */
313    
314     } /* End of switch */
315     } /* End of try_next loop */
316    
317     code += (code[1] << 8) + code[2]; /* Advance to next branch */
318     }
319     while (*code == OP_ALT);
320     return TRUE;
321     }
322    
323    
324    
325     /*************************************************
326     * Study a compiled expression *
327     *************************************************/
328    
329     /* This function is handed a compiled expression that it must study to produce
330     information that will speed up the matching. It returns a pcre_extra block
331     which then gets handed back to pcre_exec().
332    
333     Arguments:
334     re points to the compiled expression
335     options contains option bits
336     errorptr points to where to place error messages;
337     set NULL unless error
338    
339     Returns: pointer to a pcre_extra block,
340     NULL on error or if no optimization possible
341     */
342    
343     pcre_extra *
344 nigel 7 pcre_study(const pcre *external_re, int options, const char **errorptr)
345 nigel 3 {
346     uschar start_bits[32];
347     real_pcre_extra *extra;
348 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
349 nigel 25 compile_data compile_block;
350 nigel 3
351     *errorptr = NULL;
352    
353     if (re == NULL || re->magic_number != MAGIC_NUMBER)
354     {
355     *errorptr = "argument is not a compiled regular expression";
356     return NULL;
357     }
358    
359     if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
360     {
361     *errorptr = "unknown or incorrect option bit(s) set";
362     return NULL;
363     }
364    
365     /* For an anchored pattern, or an unchored pattern that has a first char, or a
366     multiline pattern that matches only at "line starts", no further processing at
367     present. */
368    
369     if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
370     return NULL;
371    
372 nigel 25 /* Set the character tables in the block which is passed around */
373    
374     compile_block.lcc = re->tables + lcc_offset;
375     compile_block.fcc = re->tables + fcc_offset;
376     compile_block.cbits = re->tables + cbits_offset;
377     compile_block.ctypes = re->tables + ctypes_offset;
378    
379 nigel 3 /* See if we can find a fixed set of initial characters for the pattern. */
380    
381     memset(start_bits, 0, 32 * sizeof(uschar));
382 nigel 25 if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
383     &compile_block)) return NULL;
384 nigel 3
385     /* Get an "extra" block and put the information therein. */
386    
387     extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
388    
389     if (extra == NULL)
390     {
391     *errorptr = "failed to get memory";
392     return NULL;
393     }
394    
395 nigel 23 extra->options = PCRE_STUDY_MAPPED;
396 nigel 3 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
397    
398     return (pcre_extra *)extra;
399     }
400    
401     /* End of study.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12