/[pcre]/code/trunk/study.c
ViewVC logotype

Contents of /code/trunk/study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 43 - (hide annotations) (download)
Sat Feb 24 21:39:21 2007 UTC (7 years, 1 month ago) by nigel
File MIME type: text/plain
File size: 10874 byte(s)
Load pcre-3.0 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 43 Copyright (c) 1997-2000 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28 nigel 29
29     4. If PCRE is embedded in any software that is released under the GNU
30     General Purpose Licence (GPL), then the terms of that licence shall
31     supersede any condition above with which it is incompatible.
32 nigel 3 -----------------------------------------------------------------------------
33     */
34    
35    
36     /* Include the internals header, which itself includes Standard C headers plus
37     the external pcre header. */
38    
39     #include "internal.h"
40    
41    
42    
43     /*************************************************
44 nigel 23 * Set a bit and maybe its alternate case *
45     *************************************************/
46    
47     /* Given a character, set its bit in the table, and also the bit for the other
48     version of a letter if we are caseless.
49    
50     Arguments:
51     start_bits points to the bit map
52     c is the character
53     caseless the caseless flag
54 nigel 25 cd the block with char table pointers
55 nigel 23
56     Returns: nothing
57     */
58    
59     static void
60 nigel 25 set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
61 nigel 23 {
62     start_bits[c/8] |= (1 << (c&7));
63 nigel 25 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
64     start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
65 nigel 23 }
66    
67    
68    
69     /*************************************************
70 nigel 3 * Create bitmap of starting chars *
71     *************************************************/
72    
73     /* This function scans a compiled unanchored expression and attempts to build a
74     bitmap of the set of initial characters. If it can't, it returns FALSE. As time
75     goes by, we may be able to get more clever at doing this.
76    
77     Arguments:
78     code points to an expression
79     start_bits points to a 32-byte table, initialized to 0
80 nigel 23 caseless the current state of the caseless flag
81 nigel 25 cd the block with char table pointers
82 nigel 3
83     Returns: TRUE if table built, FALSE otherwise
84     */
85    
86     static BOOL
87 nigel 25 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
88     compile_data *cd)
89 nigel 3 {
90     register int c;
91    
92 nigel 27 /* This next statement and the later reference to dummy are here in order to
93     trick the optimizer of the IBM C compiler for OS/2 into generating correct
94     code. Apparently IBM isn't going to fix the problem, and we would rather not
95     disable optimization (in this module it actually makes a big difference, and
96     the pcre module can use all the optimization it can get). */
97    
98     volatile int dummy;
99    
100 nigel 3 do
101     {
102 nigel 7 const uschar *tcode = code + 3;
103 nigel 3 BOOL try_next = TRUE;
104    
105     while (try_next)
106     {
107     try_next = FALSE;
108    
109 nigel 23 /* If a branch starts with a bracket or a positive lookahead assertion,
110     recurse to set bits from within them. That's all for this branch. */
111    
112 nigel 3 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
113     {
114 nigel 25 if (!set_start_bits(tcode, start_bits, caseless, cd))
115     return FALSE;
116 nigel 3 }
117    
118     else switch(*tcode)
119     {
120     default:
121     return FALSE;
122    
123 nigel 23 /* Skip over lookbehind and negative lookahead assertions */
124    
125     case OP_ASSERT_NOT:
126     case OP_ASSERTBACK:
127     case OP_ASSERTBACK_NOT:
128     try_next = TRUE;
129     do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
130     tcode += 3;
131     break;
132    
133     /* Skip over an option setting, changing the caseless flag */
134    
135     case OP_OPT:
136     caseless = (tcode[1] & PCRE_CASELESS) != 0;
137     tcode += 2;
138     try_next = TRUE;
139     break;
140    
141 nigel 3 /* BRAZERO does the bracket, but carries on. */
142    
143     case OP_BRAZERO:
144     case OP_BRAMINZERO:
145 nigel 25 if (!set_start_bits(++tcode, start_bits, caseless, cd))
146     return FALSE;
147 nigel 27 dummy = 1;
148 nigel 3 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
149     tcode += 3;
150     try_next = TRUE;
151     break;
152    
153     /* Single-char * or ? sets the bit and tries the next item */
154    
155     case OP_STAR:
156     case OP_MINSTAR:
157     case OP_QUERY:
158     case OP_MINQUERY:
159 nigel 25 set_bit(start_bits, tcode[1], caseless, cd);
160 nigel 3 tcode += 2;
161     try_next = TRUE;
162     break;
163    
164     /* Single-char upto sets the bit and tries the next */
165    
166     case OP_UPTO:
167     case OP_MINUPTO:
168 nigel 25 set_bit(start_bits, tcode[3], caseless, cd);
169 nigel 3 tcode += 4;
170     try_next = TRUE;
171     break;
172    
173     /* At least one single char sets the bit and stops */
174    
175     case OP_EXACT: /* Fall through */
176     tcode++;
177    
178     case OP_CHARS: /* Fall through */
179     tcode++;
180    
181     case OP_PLUS:
182     case OP_MINPLUS:
183 nigel 25 set_bit(start_bits, tcode[1], caseless, cd);
184 nigel 3 break;
185    
186     /* Single character type sets the bits and stops */
187    
188     case OP_NOT_DIGIT:
189 nigel 25 for (c = 0; c < 32; c++)
190     start_bits[c] |= ~cd->cbits[c+cbit_digit];
191 nigel 3 break;
192    
193     case OP_DIGIT:
194 nigel 25 for (c = 0; c < 32; c++)
195     start_bits[c] |= cd->cbits[c+cbit_digit];
196 nigel 3 break;
197    
198     case OP_NOT_WHITESPACE:
199 nigel 25 for (c = 0; c < 32; c++)
200     start_bits[c] |= ~cd->cbits[c+cbit_space];
201 nigel 3 break;
202    
203     case OP_WHITESPACE:
204 nigel 25 for (c = 0; c < 32; c++)
205     start_bits[c] |= cd->cbits[c+cbit_space];
206 nigel 3 break;
207    
208     case OP_NOT_WORDCHAR:
209     for (c = 0; c < 32; c++)
210 nigel 43 start_bits[c] |= ~cd->cbits[c+cbit_word];
211 nigel 3 break;
212    
213     case OP_WORDCHAR:
214     for (c = 0; c < 32; c++)
215 nigel 43 start_bits[c] |= cd->cbits[c+cbit_word];
216 nigel 3 break;
217    
218     /* One or more character type fudges the pointer and restarts, knowing
219     it will hit a single character type and stop there. */
220    
221     case OP_TYPEPLUS:
222     case OP_TYPEMINPLUS:
223     tcode++;
224     try_next = TRUE;
225     break;
226    
227     case OP_TYPEEXACT:
228     tcode += 3;
229     try_next = TRUE;
230     break;
231    
232     /* Zero or more repeats of character types set the bits and then
233     try again. */
234    
235     case OP_TYPEUPTO:
236     case OP_TYPEMINUPTO:
237     tcode += 2; /* Fall through */
238    
239     case OP_TYPESTAR:
240     case OP_TYPEMINSTAR:
241     case OP_TYPEQUERY:
242     case OP_TYPEMINQUERY:
243     switch(tcode[1])
244     {
245     case OP_NOT_DIGIT:
246 nigel 25 for (c = 0; c < 32; c++)
247     start_bits[c] |= ~cd->cbits[c+cbit_digit];
248 nigel 3 break;
249    
250     case OP_DIGIT:
251 nigel 25 for (c = 0; c < 32; c++)
252     start_bits[c] |= cd->cbits[c+cbit_digit];
253 nigel 3 break;
254    
255     case OP_NOT_WHITESPACE:
256 nigel 25 for (c = 0; c < 32; c++)
257     start_bits[c] |= ~cd->cbits[c+cbit_space];
258 nigel 3 break;
259    
260     case OP_WHITESPACE:
261 nigel 25 for (c = 0; c < 32; c++)
262     start_bits[c] |= cd->cbits[c+cbit_space];
263 nigel 3 break;
264    
265     case OP_NOT_WORDCHAR:
266     for (c = 0; c < 32; c++)
267 nigel 43 start_bits[c] |= ~cd->cbits[c+cbit_word];
268 nigel 3 break;
269    
270     case OP_WORDCHAR:
271     for (c = 0; c < 32; c++)
272 nigel 43 start_bits[c] |= cd->cbits[c+cbit_word];
273 nigel 3 break;
274     }
275    
276     tcode += 2;
277     try_next = TRUE;
278     break;
279    
280     /* Character class: set the bits and either carry on or not,
281     according to the repeat count. */
282    
283     case OP_CLASS:
284     {
285     tcode++;
286     for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
287     tcode += 32;
288     switch (*tcode)
289     {
290     case OP_CRSTAR:
291     case OP_CRMINSTAR:
292     case OP_CRQUERY:
293     case OP_CRMINQUERY:
294     tcode++;
295     try_next = TRUE;
296     break;
297    
298     case OP_CRRANGE:
299     case OP_CRMINRANGE:
300     if (((tcode[1] << 8) + tcode[2]) == 0)
301     {
302     tcode += 5;
303     try_next = TRUE;
304     }
305     break;
306     }
307     }
308     break; /* End of class handling */
309    
310     } /* End of switch */
311     } /* End of try_next loop */
312    
313     code += (code[1] << 8) + code[2]; /* Advance to next branch */
314     }
315     while (*code == OP_ALT);
316     return TRUE;
317     }
318    
319    
320    
321     /*************************************************
322     * Study a compiled expression *
323     *************************************************/
324    
325     /* This function is handed a compiled expression that it must study to produce
326     information that will speed up the matching. It returns a pcre_extra block
327     which then gets handed back to pcre_exec().
328    
329     Arguments:
330     re points to the compiled expression
331     options contains option bits
332     errorptr points to where to place error messages;
333     set NULL unless error
334    
335     Returns: pointer to a pcre_extra block,
336     NULL on error or if no optimization possible
337     */
338    
339     pcre_extra *
340 nigel 7 pcre_study(const pcre *external_re, int options, const char **errorptr)
341 nigel 3 {
342     uschar start_bits[32];
343     real_pcre_extra *extra;
344 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
345 nigel 25 compile_data compile_block;
346 nigel 3
347     *errorptr = NULL;
348    
349     if (re == NULL || re->magic_number != MAGIC_NUMBER)
350     {
351     *errorptr = "argument is not a compiled regular expression";
352     return NULL;
353     }
354    
355     if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
356     {
357     *errorptr = "unknown or incorrect option bit(s) set";
358     return NULL;
359     }
360    
361     /* For an anchored pattern, or an unchored pattern that has a first char, or a
362     multiline pattern that matches only at "line starts", no further processing at
363     present. */
364    
365     if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
366     return NULL;
367    
368 nigel 25 /* Set the character tables in the block which is passed around */
369    
370     compile_block.lcc = re->tables + lcc_offset;
371     compile_block.fcc = re->tables + fcc_offset;
372     compile_block.cbits = re->tables + cbits_offset;
373     compile_block.ctypes = re->tables + ctypes_offset;
374    
375 nigel 3 /* See if we can find a fixed set of initial characters for the pattern. */
376    
377     memset(start_bits, 0, 32 * sizeof(uschar));
378 nigel 25 if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
379     &compile_block)) return NULL;
380 nigel 3
381     /* Get an "extra" block and put the information therein. */
382    
383     extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
384    
385     if (extra == NULL)
386     {
387     *errorptr = "failed to get memory";
388     return NULL;
389     }
390    
391 nigel 23 extra->options = PCRE_STUDY_MAPPED;
392 nigel 3 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
393    
394     return (pcre_extra *)extra;
395     }
396    
397     /* End of study.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12