/[pcre]/code/tags/pcre-3.1/study.c
ViewVC logotype

Contents of /code/tags/pcre-3.1/study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 23 - (hide annotations) (download)
Sat Feb 24 21:38:41 2007 UTC (6 years, 2 months ago) by nigel
Original Path: code/trunk/study.c
File MIME type: text/plain
File size: 9794 byte(s)
Load pcre-2.00 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 15 Copyright (c) 1998 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28     -----------------------------------------------------------------------------
29     */
30    
31    
32     /* Include the internals header, which itself includes Standard C headers plus
33     the external pcre header. */
34    
35     #include "internal.h"
36    
37    
38    
39     /*************************************************
40 nigel 23 * Set a bit and maybe its alternate case *
41     *************************************************/
42    
43     /* Given a character, set its bit in the table, and also the bit for the other
44     version of a letter if we are caseless.
45    
46     Arguments:
47     start_bits points to the bit map
48     c is the character
49     caseless the caseless flag
50    
51     Returns: nothing
52     */
53    
54     static void
55     set_bit(uschar *start_bits, int c, BOOL caseless)
56     {
57     start_bits[c/8] |= (1 << (c&7));
58     if (caseless && (pcre_ctypes[c] & ctype_letter) != 0)
59     start_bits[pcre_fcc[c]/8] |= (1 << (pcre_fcc[c]&7));
60     }
61    
62    
63    
64     /*************************************************
65 nigel 3 * Create bitmap of starting chars *
66     *************************************************/
67    
68     /* This function scans a compiled unanchored expression and attempts to build a
69     bitmap of the set of initial characters. If it can't, it returns FALSE. As time
70     goes by, we may be able to get more clever at doing this.
71    
72     Arguments:
73     code points to an expression
74     start_bits points to a 32-byte table, initialized to 0
75 nigel 23 caseless the current state of the caseless flag
76 nigel 3
77     Returns: TRUE if table built, FALSE otherwise
78     */
79    
80     static BOOL
81 nigel 23 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless)
82 nigel 3 {
83     register int c;
84    
85     do
86     {
87 nigel 7 const uschar *tcode = code + 3;
88 nigel 3 BOOL try_next = TRUE;
89    
90     while (try_next)
91     {
92     try_next = FALSE;
93    
94 nigel 23 /* If a branch starts with a bracket or a positive lookahead assertion,
95     recurse to set bits from within them. That's all for this branch. */
96    
97 nigel 3 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
98     {
99 nigel 23 if (!set_start_bits(tcode, start_bits, caseless)) return FALSE;
100 nigel 3 }
101    
102     else switch(*tcode)
103     {
104     default:
105     return FALSE;
106    
107 nigel 23 /* Skip over lookbehind and negative lookahead assertions */
108    
109     case OP_ASSERT_NOT:
110     case OP_ASSERTBACK:
111     case OP_ASSERTBACK_NOT:
112     try_next = TRUE;
113     do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
114     tcode += 3;
115     break;
116    
117     /* Skip over an option setting, changing the caseless flag */
118    
119     case OP_OPT:
120     caseless = (tcode[1] & PCRE_CASELESS) != 0;
121     tcode += 2;
122     try_next = TRUE;
123     break;
124    
125 nigel 3 /* BRAZERO does the bracket, but carries on. */
126    
127     case OP_BRAZERO:
128     case OP_BRAMINZERO:
129 nigel 23 if (!set_start_bits(++tcode, start_bits, caseless)) return FALSE;
130 nigel 3 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
131     tcode += 3;
132     try_next = TRUE;
133     break;
134    
135     /* Single-char * or ? sets the bit and tries the next item */
136    
137     case OP_STAR:
138     case OP_MINSTAR:
139     case OP_QUERY:
140     case OP_MINQUERY:
141 nigel 23 set_bit(start_bits, tcode[1], caseless);
142 nigel 3 tcode += 2;
143     try_next = TRUE;
144     break;
145    
146     /* Single-char upto sets the bit and tries the next */
147    
148     case OP_UPTO:
149     case OP_MINUPTO:
150 nigel 23 set_bit(start_bits, tcode[3], caseless);
151 nigel 3 tcode += 4;
152     try_next = TRUE;
153     break;
154    
155     /* At least one single char sets the bit and stops */
156    
157     case OP_EXACT: /* Fall through */
158     tcode++;
159    
160     case OP_CHARS: /* Fall through */
161     tcode++;
162    
163     case OP_PLUS:
164     case OP_MINPLUS:
165 nigel 23 set_bit(start_bits, tcode[1], caseless);
166 nigel 3 break;
167    
168     /* Single character type sets the bits and stops */
169    
170     case OP_NOT_DIGIT:
171     for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
172     break;
173    
174     case OP_DIGIT:
175     for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
176     break;
177    
178     case OP_NOT_WHITESPACE:
179     for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
180     break;
181    
182     case OP_WHITESPACE:
183     for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
184     break;
185    
186     case OP_NOT_WORDCHAR:
187     for (c = 0; c < 32; c++)
188     start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
189     break;
190    
191     case OP_WORDCHAR:
192     for (c = 0; c < 32; c++)
193     start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
194     break;
195    
196     /* One or more character type fudges the pointer and restarts, knowing
197     it will hit a single character type and stop there. */
198    
199     case OP_TYPEPLUS:
200     case OP_TYPEMINPLUS:
201     tcode++;
202     try_next = TRUE;
203     break;
204    
205     case OP_TYPEEXACT:
206     tcode += 3;
207     try_next = TRUE;
208     break;
209    
210     /* Zero or more repeats of character types set the bits and then
211     try again. */
212    
213     case OP_TYPEUPTO:
214     case OP_TYPEMINUPTO:
215     tcode += 2; /* Fall through */
216    
217     case OP_TYPESTAR:
218     case OP_TYPEMINSTAR:
219     case OP_TYPEQUERY:
220     case OP_TYPEMINQUERY:
221     switch(tcode[1])
222     {
223     case OP_NOT_DIGIT:
224     for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
225     break;
226    
227     case OP_DIGIT:
228     for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
229     break;
230    
231     case OP_NOT_WHITESPACE:
232     for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
233     break;
234    
235     case OP_WHITESPACE:
236     for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
237     break;
238    
239     case OP_NOT_WORDCHAR:
240     for (c = 0; c < 32; c++)
241     start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
242     break;
243    
244     case OP_WORDCHAR:
245     for (c = 0; c < 32; c++)
246     start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
247     break;
248     }
249    
250     tcode += 2;
251     try_next = TRUE;
252     break;
253    
254     /* Character class: set the bits and either carry on or not,
255     according to the repeat count. */
256    
257     case OP_CLASS:
258     {
259     tcode++;
260     for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
261     tcode += 32;
262     switch (*tcode)
263     {
264     case OP_CRSTAR:
265     case OP_CRMINSTAR:
266     case OP_CRQUERY:
267     case OP_CRMINQUERY:
268     tcode++;
269     try_next = TRUE;
270     break;
271    
272     case OP_CRRANGE:
273     case OP_CRMINRANGE:
274     if (((tcode[1] << 8) + tcode[2]) == 0)
275     {
276     tcode += 5;
277     try_next = TRUE;
278     }
279     break;
280     }
281     }
282     break; /* End of class handling */
283    
284     } /* End of switch */
285     } /* End of try_next loop */
286    
287     code += (code[1] << 8) + code[2]; /* Advance to next branch */
288     }
289     while (*code == OP_ALT);
290     return TRUE;
291     }
292    
293    
294    
295     /*************************************************
296     * Study a compiled expression *
297     *************************************************/
298    
299     /* This function is handed a compiled expression that it must study to produce
300     information that will speed up the matching. It returns a pcre_extra block
301     which then gets handed back to pcre_exec().
302    
303     Arguments:
304     re points to the compiled expression
305     options contains option bits
306     errorptr points to where to place error messages;
307     set NULL unless error
308    
309     Returns: pointer to a pcre_extra block,
310     NULL on error or if no optimization possible
311     */
312    
313     pcre_extra *
314 nigel 7 pcre_study(const pcre *external_re, int options, const char **errorptr)
315 nigel 3 {
316     uschar start_bits[32];
317     real_pcre_extra *extra;
318 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
319 nigel 3
320     *errorptr = NULL;
321    
322     if (re == NULL || re->magic_number != MAGIC_NUMBER)
323     {
324     *errorptr = "argument is not a compiled regular expression";
325     return NULL;
326     }
327    
328     if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
329     {
330     *errorptr = "unknown or incorrect option bit(s) set";
331     return NULL;
332     }
333    
334     /* For an anchored pattern, or an unchored pattern that has a first char, or a
335     multiline pattern that matches only at "line starts", no further processing at
336     present. */
337    
338     if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
339     return NULL;
340    
341     /* See if we can find a fixed set of initial characters for the pattern. */
342    
343     memset(start_bits, 0, 32 * sizeof(uschar));
344 nigel 23 if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0))
345     return NULL;
346 nigel 3
347     /* Get an "extra" block and put the information therein. */
348    
349     extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
350    
351     if (extra == NULL)
352     {
353     *errorptr = "failed to get memory";
354     return NULL;
355     }
356    
357 nigel 23 extra->options = PCRE_STUDY_MAPPED;
358 nigel 3 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
359    
360     return (pcre_extra *)extra;
361     }
362    
363     /* End of study.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12