/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 77 - (show annotations) (download)
Sat Feb 24 21:40:45 2007 UTC (7 years, 5 months ago) by nigel
File MIME type: text/plain
File size: 14371 byte(s)
Load pcre-6.0 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2005 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43
44
45 #include "pcre_internal.h"
46
47
48 /*************************************************
49 * Set a bit and maybe its alternate case *
50 *************************************************/
51
52 /* Given a character, set its bit in the table, and also the bit for the other
53 version of a letter if we are caseless.
54
55 Arguments:
56 start_bits points to the bit map
57 c is the character
58 caseless the caseless flag
59 cd the block with char table pointers
60
61 Returns: nothing
62 */
63
64 static void
65 set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
66 {
67 start_bits[c/8] |= (1 << (c&7));
68 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
69 start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
70 }
71
72
73
74 /*************************************************
75 * Create bitmap of starting chars *
76 *************************************************/
77
78 /* This function scans a compiled unanchored expression and attempts to build a
79 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
80 goes by, we may be able to get more clever at doing this.
81
82 Arguments:
83 code points to an expression
84 start_bits points to a 32-byte table, initialized to 0
85 caseless the current state of the caseless flag
86 utf8 TRUE if in UTF-8 mode
87 cd the block with char table pointers
88
89 Returns: TRUE if table built, FALSE otherwise
90 */
91
92 static BOOL
93 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
94 BOOL utf8, compile_data *cd)
95 {
96 register int c;
97
98 /* This next statement and the later reference to dummy are here in order to
99 trick the optimizer of the IBM C compiler for OS/2 into generating correct
100 code. Apparently IBM isn't going to fix the problem, and we would rather not
101 disable optimization (in this module it actually makes a big difference, and
102 the pcre module can use all the optimization it can get). */
103
104 volatile int dummy;
105
106 do
107 {
108 const uschar *tcode = code + 1 + LINK_SIZE;
109 BOOL try_next = TRUE;
110
111 while (try_next)
112 {
113 /* If a branch starts with a bracket or a positive lookahead assertion,
114 recurse to set bits from within them. That's all for this branch. */
115
116 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
117 {
118 if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
119 return FALSE;
120 try_next = FALSE;
121 }
122
123 else switch(*tcode)
124 {
125 default:
126 return FALSE;
127
128 /* Skip over callout */
129
130 case OP_CALLOUT:
131 tcode += 2 + 2*LINK_SIZE;
132 break;
133
134 /* Skip over extended extraction bracket number */
135
136 case OP_BRANUMBER:
137 tcode += 3;
138 break;
139
140 /* Skip over lookbehind and negative lookahead assertions */
141
142 case OP_ASSERT_NOT:
143 case OP_ASSERTBACK:
144 case OP_ASSERTBACK_NOT:
145 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
146 tcode += 1+LINK_SIZE;
147 break;
148
149 /* Skip over an option setting, changing the caseless flag */
150
151 case OP_OPT:
152 caseless = (tcode[1] & PCRE_CASELESS) != 0;
153 tcode += 2;
154 break;
155
156 /* BRAZERO does the bracket, but carries on. */
157
158 case OP_BRAZERO:
159 case OP_BRAMINZERO:
160 if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
161 return FALSE;
162 dummy = 1;
163 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
164 tcode += 1+LINK_SIZE;
165 break;
166
167 /* Single-char * or ? sets the bit and tries the next item */
168
169 case OP_STAR:
170 case OP_MINSTAR:
171 case OP_QUERY:
172 case OP_MINQUERY:
173 set_bit(start_bits, tcode[1], caseless, cd);
174 tcode += 2;
175 #ifdef SUPPORT_UTF8
176 if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
177 #endif
178 break;
179
180 /* Single-char upto sets the bit and tries the next */
181
182 case OP_UPTO:
183 case OP_MINUPTO:
184 set_bit(start_bits, tcode[3], caseless, cd);
185 tcode += 4;
186 #ifdef SUPPORT_UTF8
187 if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
188 #endif
189 break;
190
191 /* At least one single char sets the bit and stops */
192
193 case OP_EXACT: /* Fall through */
194 tcode += 2;
195
196 case OP_CHAR:
197 case OP_CHARNC:
198 case OP_PLUS:
199 case OP_MINPLUS:
200 set_bit(start_bits, tcode[1], caseless, cd);
201 try_next = FALSE;
202 break;
203
204 /* Single character type sets the bits and stops */
205
206 case OP_NOT_DIGIT:
207 for (c = 0; c < 32; c++)
208 start_bits[c] |= ~cd->cbits[c+cbit_digit];
209 try_next = FALSE;
210 break;
211
212 case OP_DIGIT:
213 for (c = 0; c < 32; c++)
214 start_bits[c] |= cd->cbits[c+cbit_digit];
215 try_next = FALSE;
216 break;
217
218 case OP_NOT_WHITESPACE:
219 for (c = 0; c < 32; c++)
220 start_bits[c] |= ~cd->cbits[c+cbit_space];
221 try_next = FALSE;
222 break;
223
224 case OP_WHITESPACE:
225 for (c = 0; c < 32; c++)
226 start_bits[c] |= cd->cbits[c+cbit_space];
227 try_next = FALSE;
228 break;
229
230 case OP_NOT_WORDCHAR:
231 for (c = 0; c < 32; c++)
232 start_bits[c] |= ~cd->cbits[c+cbit_word];
233 try_next = FALSE;
234 break;
235
236 case OP_WORDCHAR:
237 for (c = 0; c < 32; c++)
238 start_bits[c] |= cd->cbits[c+cbit_word];
239 try_next = FALSE;
240 break;
241
242 /* One or more character type fudges the pointer and restarts, knowing
243 it will hit a single character type and stop there. */
244
245 case OP_TYPEPLUS:
246 case OP_TYPEMINPLUS:
247 tcode++;
248 break;
249
250 case OP_TYPEEXACT:
251 tcode += 3;
252 break;
253
254 /* Zero or more repeats of character types set the bits and then
255 try again. */
256
257 case OP_TYPEUPTO:
258 case OP_TYPEMINUPTO:
259 tcode += 2; /* Fall through */
260
261 case OP_TYPESTAR:
262 case OP_TYPEMINSTAR:
263 case OP_TYPEQUERY:
264 case OP_TYPEMINQUERY:
265 switch(tcode[1])
266 {
267 case OP_ANY:
268 return FALSE;
269
270 case OP_NOT_DIGIT:
271 for (c = 0; c < 32; c++)
272 start_bits[c] |= ~cd->cbits[c+cbit_digit];
273 break;
274
275 case OP_DIGIT:
276 for (c = 0; c < 32; c++)
277 start_bits[c] |= cd->cbits[c+cbit_digit];
278 break;
279
280 case OP_NOT_WHITESPACE:
281 for (c = 0; c < 32; c++)
282 start_bits[c] |= ~cd->cbits[c+cbit_space];
283 break;
284
285 case OP_WHITESPACE:
286 for (c = 0; c < 32; c++)
287 start_bits[c] |= cd->cbits[c+cbit_space];
288 break;
289
290 case OP_NOT_WORDCHAR:
291 for (c = 0; c < 32; c++)
292 start_bits[c] |= ~cd->cbits[c+cbit_word];
293 break;
294
295 case OP_WORDCHAR:
296 for (c = 0; c < 32; c++)
297 start_bits[c] |= cd->cbits[c+cbit_word];
298 break;
299 }
300
301 tcode += 2;
302 break;
303
304 /* Character class where all the information is in a bit map: set the
305 bits and either carry on or not, according to the repeat count. If it was
306 a negative class, and we are operating with UTF-8 characters, any byte
307 with a value >= 0xc4 is a potentially valid starter because it starts a
308 character with a value > 255. */
309
310 case OP_NCLASS:
311 if (utf8)
312 {
313 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
314 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
315 }
316 /* Fall through */
317
318 case OP_CLASS:
319 {
320 tcode++;
321
322 /* In UTF-8 mode, the bits in a bit map correspond to character
323 values, not to byte values. However, the bit map we are constructing is
324 for byte values. So we have to do a conversion for characters whose
325 value is > 127. In fact, there are only two possible starting bytes for
326 characters in the range 128 - 255. */
327
328 if (utf8)
329 {
330 for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
331 for (c = 128; c < 256; c++)
332 {
333 if ((tcode[c/8] && (1 << (c&7))) != 0)
334 {
335 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
336 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
337 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
338 }
339 }
340 }
341
342 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
343
344 else
345 {
346 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
347 }
348
349 /* Advance past the bit map, and act on what follows */
350
351 tcode += 32;
352 switch (*tcode)
353 {
354 case OP_CRSTAR:
355 case OP_CRMINSTAR:
356 case OP_CRQUERY:
357 case OP_CRMINQUERY:
358 tcode++;
359 break;
360
361 case OP_CRRANGE:
362 case OP_CRMINRANGE:
363 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
364 else try_next = FALSE;
365 break;
366
367 default:
368 try_next = FALSE;
369 break;
370 }
371 }
372 break; /* End of bitmap class handling */
373
374 } /* End of switch */
375 } /* End of try_next loop */
376
377 code += GET(code, 1); /* Advance to next branch */
378 }
379 while (*code == OP_ALT);
380 return TRUE;
381 }
382
383
384
385 /*************************************************
386 * Study a compiled expression *
387 *************************************************/
388
389 /* This function is handed a compiled expression that it must study to produce
390 information that will speed up the matching. It returns a pcre_extra block
391 which then gets handed back to pcre_exec().
392
393 Arguments:
394 re points to the compiled expression
395 options contains option bits
396 errorptr points to where to place error messages;
397 set NULL unless error
398
399 Returns: pointer to a pcre_extra block, with study_data filled in and the
400 appropriate flag set;
401 NULL on error or if no optimization possible
402 */
403
404 EXPORT pcre_extra *
405 pcre_study(const pcre *external_re, int options, const char **errorptr)
406 {
407 uschar start_bits[32];
408 pcre_extra *extra;
409 pcre_study_data *study;
410 const uschar *tables;
411 const real_pcre *re = (const real_pcre *)external_re;
412 uschar *code = (uschar *)re + re->name_table_offset +
413 (re->name_count * re->name_entry_size);
414 compile_data compile_block;
415
416 *errorptr = NULL;
417
418 if (re == NULL || re->magic_number != MAGIC_NUMBER)
419 {
420 *errorptr = "argument is not a compiled regular expression";
421 return NULL;
422 }
423
424 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
425 {
426 *errorptr = "unknown or incorrect option bit(s) set";
427 return NULL;
428 }
429
430 /* For an anchored pattern, or an unanchored pattern that has a first char, or
431 a multiline pattern that matches only at "line starts", no further processing
432 at present. */
433
434 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
435 return NULL;
436
437 /* Set the character tables in the block that is passed around */
438
439 tables = re->tables;
440 if (tables == NULL)
441 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
442 (void *)(&tables));
443
444 compile_block.lcc = tables + lcc_offset;
445 compile_block.fcc = tables + fcc_offset;
446 compile_block.cbits = tables + cbits_offset;
447 compile_block.ctypes = tables + ctypes_offset;
448
449 /* See if we can find a fixed set of initial characters for the pattern. */
450
451 memset(start_bits, 0, 32 * sizeof(uschar));
452 if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
453 (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
454
455 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
456 the latter, which is pointed to by the former, which may also get additional
457 data set later by the calling program. At the moment, the size of
458 pcre_study_data is fixed. We nevertheless save it in a field for returning via
459 the pcre_fullinfo() function so that if it becomes variable in the future, we
460 don't have to change that code. */
461
462 extra = (pcre_extra *)(pcre_malloc)
463 (sizeof(pcre_extra) + sizeof(pcre_study_data));
464
465 if (extra == NULL)
466 {
467 *errorptr = "failed to get memory";
468 return NULL;
469 }
470
471 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
472 extra->flags = PCRE_EXTRA_STUDY_DATA;
473 extra->study_data = study;
474
475 study->size = sizeof(pcre_study_data);
476 study->options = PCRE_STUDY_MAPPED;
477 memcpy(study->start_bits, start_bits, sizeof(start_bits));
478
479 return extra;
480 }
481
482 /* End of pcre_study.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12