/[pcre]/code/tags/pcre-2.04/study.c
ViewVC logotype

Contents of /code/tags/pcre-2.04/study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 25 - (show annotations) (download)
Sat Feb 24 21:38:45 2007 UTC (7 years, 6 months ago) by nigel
Original Path: code/trunk/study.c
File MIME type: text/plain
File size: 10325 byte(s)
Load pcre-2.01 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1998 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28 -----------------------------------------------------------------------------
29 */
30
31
32 /* Include the internals header, which itself includes Standard C headers plus
33 the external pcre header. */
34
35 #include "internal.h"
36
37
38
39 /*************************************************
40 * Set a bit and maybe its alternate case *
41 *************************************************/
42
43 /* Given a character, set its bit in the table, and also the bit for the other
44 version of a letter if we are caseless.
45
46 Arguments:
47 start_bits points to the bit map
48 c is the character
49 caseless the caseless flag
50 cd the block with char table pointers
51
52 Returns: nothing
53 */
54
55 static void
56 set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
57 {
58 start_bits[c/8] |= (1 << (c&7));
59 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
60 start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
61 }
62
63
64
65 /*************************************************
66 * Create bitmap of starting chars *
67 *************************************************/
68
69 /* This function scans a compiled unanchored expression and attempts to build a
70 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
71 goes by, we may be able to get more clever at doing this.
72
73 Arguments:
74 code points to an expression
75 start_bits points to a 32-byte table, initialized to 0
76 caseless the current state of the caseless flag
77 cd the block with char table pointers
78
79 Returns: TRUE if table built, FALSE otherwise
80 */
81
82 static BOOL
83 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
84 compile_data *cd)
85 {
86 register int c;
87
88 do
89 {
90 const uschar *tcode = code + 3;
91 BOOL try_next = TRUE;
92
93 while (try_next)
94 {
95 try_next = FALSE;
96
97 /* If a branch starts with a bracket or a positive lookahead assertion,
98 recurse to set bits from within them. That's all for this branch. */
99
100 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
101 {
102 if (!set_start_bits(tcode, start_bits, caseless, cd))
103 return FALSE;
104 }
105
106 else switch(*tcode)
107 {
108 default:
109 return FALSE;
110
111 /* Skip over lookbehind and negative lookahead assertions */
112
113 case OP_ASSERT_NOT:
114 case OP_ASSERTBACK:
115 case OP_ASSERTBACK_NOT:
116 try_next = TRUE;
117 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
118 tcode += 3;
119 break;
120
121 /* Skip over an option setting, changing the caseless flag */
122
123 case OP_OPT:
124 caseless = (tcode[1] & PCRE_CASELESS) != 0;
125 tcode += 2;
126 try_next = TRUE;
127 break;
128
129 /* BRAZERO does the bracket, but carries on. */
130
131 case OP_BRAZERO:
132 case OP_BRAMINZERO:
133 if (!set_start_bits(++tcode, start_bits, caseless, cd))
134 return FALSE;
135 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
136 tcode += 3;
137 try_next = TRUE;
138 break;
139
140 /* Single-char * or ? sets the bit and tries the next item */
141
142 case OP_STAR:
143 case OP_MINSTAR:
144 case OP_QUERY:
145 case OP_MINQUERY:
146 set_bit(start_bits, tcode[1], caseless, cd);
147 tcode += 2;
148 try_next = TRUE;
149 break;
150
151 /* Single-char upto sets the bit and tries the next */
152
153 case OP_UPTO:
154 case OP_MINUPTO:
155 set_bit(start_bits, tcode[3], caseless, cd);
156 tcode += 4;
157 try_next = TRUE;
158 break;
159
160 /* At least one single char sets the bit and stops */
161
162 case OP_EXACT: /* Fall through */
163 tcode++;
164
165 case OP_CHARS: /* Fall through */
166 tcode++;
167
168 case OP_PLUS:
169 case OP_MINPLUS:
170 set_bit(start_bits, tcode[1], caseless, cd);
171 break;
172
173 /* Single character type sets the bits and stops */
174
175 case OP_NOT_DIGIT:
176 for (c = 0; c < 32; c++)
177 start_bits[c] |= ~cd->cbits[c+cbit_digit];
178 break;
179
180 case OP_DIGIT:
181 for (c = 0; c < 32; c++)
182 start_bits[c] |= cd->cbits[c+cbit_digit];
183 break;
184
185 case OP_NOT_WHITESPACE:
186 for (c = 0; c < 32; c++)
187 start_bits[c] |= ~cd->cbits[c+cbit_space];
188 break;
189
190 case OP_WHITESPACE:
191 for (c = 0; c < 32; c++)
192 start_bits[c] |= cd->cbits[c+cbit_space];
193 break;
194
195 case OP_NOT_WORDCHAR:
196 for (c = 0; c < 32; c++)
197 start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
198 break;
199
200 case OP_WORDCHAR:
201 for (c = 0; c < 32; c++)
202 start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
203 break;
204
205 /* One or more character type fudges the pointer and restarts, knowing
206 it will hit a single character type and stop there. */
207
208 case OP_TYPEPLUS:
209 case OP_TYPEMINPLUS:
210 tcode++;
211 try_next = TRUE;
212 break;
213
214 case OP_TYPEEXACT:
215 tcode += 3;
216 try_next = TRUE;
217 break;
218
219 /* Zero or more repeats of character types set the bits and then
220 try again. */
221
222 case OP_TYPEUPTO:
223 case OP_TYPEMINUPTO:
224 tcode += 2; /* Fall through */
225
226 case OP_TYPESTAR:
227 case OP_TYPEMINSTAR:
228 case OP_TYPEQUERY:
229 case OP_TYPEMINQUERY:
230 switch(tcode[1])
231 {
232 case OP_NOT_DIGIT:
233 for (c = 0; c < 32; c++)
234 start_bits[c] |= ~cd->cbits[c+cbit_digit];
235 break;
236
237 case OP_DIGIT:
238 for (c = 0; c < 32; c++)
239 start_bits[c] |= cd->cbits[c+cbit_digit];
240 break;
241
242 case OP_NOT_WHITESPACE:
243 for (c = 0; c < 32; c++)
244 start_bits[c] |= ~cd->cbits[c+cbit_space];
245 break;
246
247 case OP_WHITESPACE:
248 for (c = 0; c < 32; c++)
249 start_bits[c] |= cd->cbits[c+cbit_space];
250 break;
251
252 case OP_NOT_WORDCHAR:
253 for (c = 0; c < 32; c++)
254 start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
255 break;
256
257 case OP_WORDCHAR:
258 for (c = 0; c < 32; c++)
259 start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
260 break;
261 }
262
263 tcode += 2;
264 try_next = TRUE;
265 break;
266
267 /* Character class: set the bits and either carry on or not,
268 according to the repeat count. */
269
270 case OP_CLASS:
271 {
272 tcode++;
273 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
274 tcode += 32;
275 switch (*tcode)
276 {
277 case OP_CRSTAR:
278 case OP_CRMINSTAR:
279 case OP_CRQUERY:
280 case OP_CRMINQUERY:
281 tcode++;
282 try_next = TRUE;
283 break;
284
285 case OP_CRRANGE:
286 case OP_CRMINRANGE:
287 if (((tcode[1] << 8) + tcode[2]) == 0)
288 {
289 tcode += 5;
290 try_next = TRUE;
291 }
292 break;
293 }
294 }
295 break; /* End of class handling */
296
297 } /* End of switch */
298 } /* End of try_next loop */
299
300 code += (code[1] << 8) + code[2]; /* Advance to next branch */
301 }
302 while (*code == OP_ALT);
303 return TRUE;
304 }
305
306
307
308 /*************************************************
309 * Study a compiled expression *
310 *************************************************/
311
312 /* This function is handed a compiled expression that it must study to produce
313 information that will speed up the matching. It returns a pcre_extra block
314 which then gets handed back to pcre_exec().
315
316 Arguments:
317 re points to the compiled expression
318 options contains option bits
319 errorptr points to where to place error messages;
320 set NULL unless error
321
322 Returns: pointer to a pcre_extra block,
323 NULL on error or if no optimization possible
324 */
325
326 pcre_extra *
327 pcre_study(const pcre *external_re, int options, const char **errorptr)
328 {
329 uschar start_bits[32];
330 real_pcre_extra *extra;
331 const real_pcre *re = (const real_pcre *)external_re;
332 compile_data compile_block;
333
334 *errorptr = NULL;
335
336 if (re == NULL || re->magic_number != MAGIC_NUMBER)
337 {
338 *errorptr = "argument is not a compiled regular expression";
339 return NULL;
340 }
341
342 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
343 {
344 *errorptr = "unknown or incorrect option bit(s) set";
345 return NULL;
346 }
347
348 /* For an anchored pattern, or an unchored pattern that has a first char, or a
349 multiline pattern that matches only at "line starts", no further processing at
350 present. */
351
352 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
353 return NULL;
354
355 /* Set the character tables in the block which is passed around */
356
357 compile_block.lcc = re->tables + lcc_offset;
358 compile_block.fcc = re->tables + fcc_offset;
359 compile_block.cbits = re->tables + cbits_offset;
360 compile_block.ctypes = re->tables + ctypes_offset;
361
362 /* See if we can find a fixed set of initial characters for the pattern. */
363
364 memset(start_bits, 0, 32 * sizeof(uschar));
365 if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
366 &compile_block)) return NULL;
367
368 /* Get an "extra" block and put the information therein. */
369
370 extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
371
372 if (extra == NULL)
373 {
374 *errorptr = "failed to get memory";
375 return NULL;
376 }
377
378 extra->options = PCRE_STUDY_MAPPED;
379 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
380
381 return (pcre_extra *)extra;
382 }
383
384 /* End of study.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12