/[pcre]/code/tags/pcre-2.08/study.c
ViewVC logotype

Contents of /code/tags/pcre-2.08/study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 27 - (hide annotations) (download)
Sat Feb 24 21:38:49 2007 UTC (7 years, 4 months ago) by nigel
Original Path: code/trunk/study.c
File MIME type: text/plain
File size: 10736 byte(s)
Load pcre-2.02 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 27 Copyright (c) 1997-1999 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28     -----------------------------------------------------------------------------
29     */
30    
31    
32     /* Include the internals header, which itself includes Standard C headers plus
33     the external pcre header. */
34    
35     #include "internal.h"
36    
37    
38    
39     /*************************************************
40 nigel 23 * Set a bit and maybe its alternate case *
41     *************************************************/
42    
43     /* Given a character, set its bit in the table, and also the bit for the other
44     version of a letter if we are caseless.
45    
46     Arguments:
47     start_bits points to the bit map
48     c is the character
49     caseless the caseless flag
50 nigel 25 cd the block with char table pointers
51 nigel 23
52     Returns: nothing
53     */
54    
55     static void
56 nigel 25 set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
57 nigel 23 {
58     start_bits[c/8] |= (1 << (c&7));
59 nigel 25 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
60     start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
61 nigel 23 }
62    
63    
64    
65     /*************************************************
66 nigel 3 * Create bitmap of starting chars *
67     *************************************************/
68    
69     /* This function scans a compiled unanchored expression and attempts to build a
70     bitmap of the set of initial characters. If it can't, it returns FALSE. As time
71     goes by, we may be able to get more clever at doing this.
72    
73     Arguments:
74     code points to an expression
75     start_bits points to a 32-byte table, initialized to 0
76 nigel 23 caseless the current state of the caseless flag
77 nigel 25 cd the block with char table pointers
78 nigel 3
79     Returns: TRUE if table built, FALSE otherwise
80     */
81    
82     static BOOL
83 nigel 25 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
84     compile_data *cd)
85 nigel 3 {
86     register int c;
87    
88 nigel 27 /* This next statement and the later reference to dummy are here in order to
89     trick the optimizer of the IBM C compiler for OS/2 into generating correct
90     code. Apparently IBM isn't going to fix the problem, and we would rather not
91     disable optimization (in this module it actually makes a big difference, and
92     the pcre module can use all the optimization it can get). */
93    
94     volatile int dummy;
95    
96 nigel 3 do
97     {
98 nigel 7 const uschar *tcode = code + 3;
99 nigel 3 BOOL try_next = TRUE;
100    
101     while (try_next)
102     {
103     try_next = FALSE;
104    
105 nigel 23 /* If a branch starts with a bracket or a positive lookahead assertion,
106     recurse to set bits from within them. That's all for this branch. */
107    
108 nigel 3 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
109     {
110 nigel 25 if (!set_start_bits(tcode, start_bits, caseless, cd))
111     return FALSE;
112 nigel 3 }
113    
114     else switch(*tcode)
115     {
116     default:
117     return FALSE;
118    
119 nigel 23 /* Skip over lookbehind and negative lookahead assertions */
120    
121     case OP_ASSERT_NOT:
122     case OP_ASSERTBACK:
123     case OP_ASSERTBACK_NOT:
124     try_next = TRUE;
125     do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
126     tcode += 3;
127     break;
128    
129     /* Skip over an option setting, changing the caseless flag */
130    
131     case OP_OPT:
132     caseless = (tcode[1] & PCRE_CASELESS) != 0;
133     tcode += 2;
134     try_next = TRUE;
135     break;
136    
137 nigel 3 /* BRAZERO does the bracket, but carries on. */
138    
139     case OP_BRAZERO:
140     case OP_BRAMINZERO:
141 nigel 25 if (!set_start_bits(++tcode, start_bits, caseless, cd))
142     return FALSE;
143 nigel 27 dummy = 1;
144 nigel 3 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
145     tcode += 3;
146     try_next = TRUE;
147     break;
148    
149     /* Single-char * or ? sets the bit and tries the next item */
150    
151     case OP_STAR:
152     case OP_MINSTAR:
153     case OP_QUERY:
154     case OP_MINQUERY:
155 nigel 25 set_bit(start_bits, tcode[1], caseless, cd);
156 nigel 3 tcode += 2;
157     try_next = TRUE;
158     break;
159    
160     /* Single-char upto sets the bit and tries the next */
161    
162     case OP_UPTO:
163     case OP_MINUPTO:
164 nigel 25 set_bit(start_bits, tcode[3], caseless, cd);
165 nigel 3 tcode += 4;
166     try_next = TRUE;
167     break;
168    
169     /* At least one single char sets the bit and stops */
170    
171     case OP_EXACT: /* Fall through */
172     tcode++;
173    
174     case OP_CHARS: /* Fall through */
175     tcode++;
176    
177     case OP_PLUS:
178     case OP_MINPLUS:
179 nigel 25 set_bit(start_bits, tcode[1], caseless, cd);
180 nigel 3 break;
181    
182     /* Single character type sets the bits and stops */
183    
184     case OP_NOT_DIGIT:
185 nigel 25 for (c = 0; c < 32; c++)
186     start_bits[c] |= ~cd->cbits[c+cbit_digit];
187 nigel 3 break;
188    
189     case OP_DIGIT:
190 nigel 25 for (c = 0; c < 32; c++)
191     start_bits[c] |= cd->cbits[c+cbit_digit];
192 nigel 3 break;
193    
194     case OP_NOT_WHITESPACE:
195 nigel 25 for (c = 0; c < 32; c++)
196     start_bits[c] |= ~cd->cbits[c+cbit_space];
197 nigel 3 break;
198    
199     case OP_WHITESPACE:
200 nigel 25 for (c = 0; c < 32; c++)
201     start_bits[c] |= cd->cbits[c+cbit_space];
202 nigel 3 break;
203    
204     case OP_NOT_WORDCHAR:
205     for (c = 0; c < 32; c++)
206 nigel 25 start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
207 nigel 3 break;
208    
209     case OP_WORDCHAR:
210     for (c = 0; c < 32; c++)
211 nigel 25 start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
212 nigel 3 break;
213    
214     /* One or more character type fudges the pointer and restarts, knowing
215     it will hit a single character type and stop there. */
216    
217     case OP_TYPEPLUS:
218     case OP_TYPEMINPLUS:
219     tcode++;
220     try_next = TRUE;
221     break;
222    
223     case OP_TYPEEXACT:
224     tcode += 3;
225     try_next = TRUE;
226     break;
227    
228     /* Zero or more repeats of character types set the bits and then
229     try again. */
230    
231     case OP_TYPEUPTO:
232     case OP_TYPEMINUPTO:
233     tcode += 2; /* Fall through */
234    
235     case OP_TYPESTAR:
236     case OP_TYPEMINSTAR:
237     case OP_TYPEQUERY:
238     case OP_TYPEMINQUERY:
239     switch(tcode[1])
240     {
241     case OP_NOT_DIGIT:
242 nigel 25 for (c = 0; c < 32; c++)
243     start_bits[c] |= ~cd->cbits[c+cbit_digit];
244 nigel 3 break;
245    
246     case OP_DIGIT:
247 nigel 25 for (c = 0; c < 32; c++)
248     start_bits[c] |= cd->cbits[c+cbit_digit];
249 nigel 3 break;
250    
251     case OP_NOT_WHITESPACE:
252 nigel 25 for (c = 0; c < 32; c++)
253     start_bits[c] |= ~cd->cbits[c+cbit_space];
254 nigel 3 break;
255    
256     case OP_WHITESPACE:
257 nigel 25 for (c = 0; c < 32; c++)
258     start_bits[c] |= cd->cbits[c+cbit_space];
259 nigel 3 break;
260    
261     case OP_NOT_WORDCHAR:
262     for (c = 0; c < 32; c++)
263 nigel 25 start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
264 nigel 3 break;
265    
266     case OP_WORDCHAR:
267     for (c = 0; c < 32; c++)
268 nigel 25 start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
269 nigel 3 break;
270     }
271    
272     tcode += 2;
273     try_next = TRUE;
274     break;
275    
276     /* Character class: set the bits and either carry on or not,
277     according to the repeat count. */
278    
279     case OP_CLASS:
280     {
281     tcode++;
282     for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
283     tcode += 32;
284     switch (*tcode)
285     {
286     case OP_CRSTAR:
287     case OP_CRMINSTAR:
288     case OP_CRQUERY:
289     case OP_CRMINQUERY:
290     tcode++;
291     try_next = TRUE;
292     break;
293    
294     case OP_CRRANGE:
295     case OP_CRMINRANGE:
296     if (((tcode[1] << 8) + tcode[2]) == 0)
297     {
298     tcode += 5;
299     try_next = TRUE;
300     }
301     break;
302     }
303     }
304     break; /* End of class handling */
305    
306     } /* End of switch */
307     } /* End of try_next loop */
308    
309     code += (code[1] << 8) + code[2]; /* Advance to next branch */
310     }
311     while (*code == OP_ALT);
312     return TRUE;
313     }
314    
315    
316    
317     /*************************************************
318     * Study a compiled expression *
319     *************************************************/
320    
321     /* This function is handed a compiled expression that it must study to produce
322     information that will speed up the matching. It returns a pcre_extra block
323     which then gets handed back to pcre_exec().
324    
325     Arguments:
326     re points to the compiled expression
327     options contains option bits
328     errorptr points to where to place error messages;
329     set NULL unless error
330    
331     Returns: pointer to a pcre_extra block,
332     NULL on error or if no optimization possible
333     */
334    
335     pcre_extra *
336 nigel 7 pcre_study(const pcre *external_re, int options, const char **errorptr)
337 nigel 3 {
338     uschar start_bits[32];
339     real_pcre_extra *extra;
340 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
341 nigel 25 compile_data compile_block;
342 nigel 3
343     *errorptr = NULL;
344    
345     if (re == NULL || re->magic_number != MAGIC_NUMBER)
346     {
347     *errorptr = "argument is not a compiled regular expression";
348     return NULL;
349     }
350    
351     if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
352     {
353     *errorptr = "unknown or incorrect option bit(s) set";
354     return NULL;
355     }
356    
357     /* For an anchored pattern, or an unchored pattern that has a first char, or a
358     multiline pattern that matches only at "line starts", no further processing at
359     present. */
360    
361     if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
362     return NULL;
363    
364 nigel 25 /* Set the character tables in the block which is passed around */
365    
366     compile_block.lcc = re->tables + lcc_offset;
367     compile_block.fcc = re->tables + fcc_offset;
368     compile_block.cbits = re->tables + cbits_offset;
369     compile_block.ctypes = re->tables + ctypes_offset;
370    
371 nigel 3 /* See if we can find a fixed set of initial characters for the pattern. */
372    
373     memset(start_bits, 0, 32 * sizeof(uschar));
374 nigel 25 if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
375     &compile_block)) return NULL;
376 nigel 3
377     /* Get an "extra" block and put the information therein. */
378    
379     extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
380    
381     if (extra == NULL)
382     {
383     *errorptr = "failed to get memory";
384     return NULL;
385     }
386    
387 nigel 23 extra->options = PCRE_STUDY_MAPPED;
388 nigel 3 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
389    
390     return (pcre_extra *)extra;
391     }
392    
393     /* End of study.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12