/[pcre]/code/trunk/study.c
ViewVC logotype

Contents of /code/trunk/study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 23 - (show annotations) (download)
Sat Feb 24 21:38:41 2007 UTC (7 years, 1 month ago) by nigel
File MIME type: text/plain
File size: 9794 byte(s)
Load pcre-2.00 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1998 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28 -----------------------------------------------------------------------------
29 */
30
31
32 /* Include the internals header, which itself includes Standard C headers plus
33 the external pcre header. */
34
35 #include "internal.h"
36
37
38
39 /*************************************************
40 * Set a bit and maybe its alternate case *
41 *************************************************/
42
43 /* Given a character, set its bit in the table, and also the bit for the other
44 version of a letter if we are caseless.
45
46 Arguments:
47 start_bits points to the bit map
48 c is the character
49 caseless the caseless flag
50
51 Returns: nothing
52 */
53
54 static void
55 set_bit(uschar *start_bits, int c, BOOL caseless)
56 {
57 start_bits[c/8] |= (1 << (c&7));
58 if (caseless && (pcre_ctypes[c] & ctype_letter) != 0)
59 start_bits[pcre_fcc[c]/8] |= (1 << (pcre_fcc[c]&7));
60 }
61
62
63
64 /*************************************************
65 * Create bitmap of starting chars *
66 *************************************************/
67
68 /* This function scans a compiled unanchored expression and attempts to build a
69 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
70 goes by, we may be able to get more clever at doing this.
71
72 Arguments:
73 code points to an expression
74 start_bits points to a 32-byte table, initialized to 0
75 caseless the current state of the caseless flag
76
77 Returns: TRUE if table built, FALSE otherwise
78 */
79
80 static BOOL
81 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless)
82 {
83 register int c;
84
85 do
86 {
87 const uschar *tcode = code + 3;
88 BOOL try_next = TRUE;
89
90 while (try_next)
91 {
92 try_next = FALSE;
93
94 /* If a branch starts with a bracket or a positive lookahead assertion,
95 recurse to set bits from within them. That's all for this branch. */
96
97 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
98 {
99 if (!set_start_bits(tcode, start_bits, caseless)) return FALSE;
100 }
101
102 else switch(*tcode)
103 {
104 default:
105 return FALSE;
106
107 /* Skip over lookbehind and negative lookahead assertions */
108
109 case OP_ASSERT_NOT:
110 case OP_ASSERTBACK:
111 case OP_ASSERTBACK_NOT:
112 try_next = TRUE;
113 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
114 tcode += 3;
115 break;
116
117 /* Skip over an option setting, changing the caseless flag */
118
119 case OP_OPT:
120 caseless = (tcode[1] & PCRE_CASELESS) != 0;
121 tcode += 2;
122 try_next = TRUE;
123 break;
124
125 /* BRAZERO does the bracket, but carries on. */
126
127 case OP_BRAZERO:
128 case OP_BRAMINZERO:
129 if (!set_start_bits(++tcode, start_bits, caseless)) return FALSE;
130 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
131 tcode += 3;
132 try_next = TRUE;
133 break;
134
135 /* Single-char * or ? sets the bit and tries the next item */
136
137 case OP_STAR:
138 case OP_MINSTAR:
139 case OP_QUERY:
140 case OP_MINQUERY:
141 set_bit(start_bits, tcode[1], caseless);
142 tcode += 2;
143 try_next = TRUE;
144 break;
145
146 /* Single-char upto sets the bit and tries the next */
147
148 case OP_UPTO:
149 case OP_MINUPTO:
150 set_bit(start_bits, tcode[3], caseless);
151 tcode += 4;
152 try_next = TRUE;
153 break;
154
155 /* At least one single char sets the bit and stops */
156
157 case OP_EXACT: /* Fall through */
158 tcode++;
159
160 case OP_CHARS: /* Fall through */
161 tcode++;
162
163 case OP_PLUS:
164 case OP_MINPLUS:
165 set_bit(start_bits, tcode[1], caseless);
166 break;
167
168 /* Single character type sets the bits and stops */
169
170 case OP_NOT_DIGIT:
171 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
172 break;
173
174 case OP_DIGIT:
175 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
176 break;
177
178 case OP_NOT_WHITESPACE:
179 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
180 break;
181
182 case OP_WHITESPACE:
183 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
184 break;
185
186 case OP_NOT_WORDCHAR:
187 for (c = 0; c < 32; c++)
188 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
189 break;
190
191 case OP_WORDCHAR:
192 for (c = 0; c < 32; c++)
193 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
194 break;
195
196 /* One or more character type fudges the pointer and restarts, knowing
197 it will hit a single character type and stop there. */
198
199 case OP_TYPEPLUS:
200 case OP_TYPEMINPLUS:
201 tcode++;
202 try_next = TRUE;
203 break;
204
205 case OP_TYPEEXACT:
206 tcode += 3;
207 try_next = TRUE;
208 break;
209
210 /* Zero or more repeats of character types set the bits and then
211 try again. */
212
213 case OP_TYPEUPTO:
214 case OP_TYPEMINUPTO:
215 tcode += 2; /* Fall through */
216
217 case OP_TYPESTAR:
218 case OP_TYPEMINSTAR:
219 case OP_TYPEQUERY:
220 case OP_TYPEMINQUERY:
221 switch(tcode[1])
222 {
223 case OP_NOT_DIGIT:
224 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
225 break;
226
227 case OP_DIGIT:
228 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
229 break;
230
231 case OP_NOT_WHITESPACE:
232 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
233 break;
234
235 case OP_WHITESPACE:
236 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
237 break;
238
239 case OP_NOT_WORDCHAR:
240 for (c = 0; c < 32; c++)
241 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
242 break;
243
244 case OP_WORDCHAR:
245 for (c = 0; c < 32; c++)
246 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
247 break;
248 }
249
250 tcode += 2;
251 try_next = TRUE;
252 break;
253
254 /* Character class: set the bits and either carry on or not,
255 according to the repeat count. */
256
257 case OP_CLASS:
258 {
259 tcode++;
260 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
261 tcode += 32;
262 switch (*tcode)
263 {
264 case OP_CRSTAR:
265 case OP_CRMINSTAR:
266 case OP_CRQUERY:
267 case OP_CRMINQUERY:
268 tcode++;
269 try_next = TRUE;
270 break;
271
272 case OP_CRRANGE:
273 case OP_CRMINRANGE:
274 if (((tcode[1] << 8) + tcode[2]) == 0)
275 {
276 tcode += 5;
277 try_next = TRUE;
278 }
279 break;
280 }
281 }
282 break; /* End of class handling */
283
284 } /* End of switch */
285 } /* End of try_next loop */
286
287 code += (code[1] << 8) + code[2]; /* Advance to next branch */
288 }
289 while (*code == OP_ALT);
290 return TRUE;
291 }
292
293
294
295 /*************************************************
296 * Study a compiled expression *
297 *************************************************/
298
299 /* This function is handed a compiled expression that it must study to produce
300 information that will speed up the matching. It returns a pcre_extra block
301 which then gets handed back to pcre_exec().
302
303 Arguments:
304 re points to the compiled expression
305 options contains option bits
306 errorptr points to where to place error messages;
307 set NULL unless error
308
309 Returns: pointer to a pcre_extra block,
310 NULL on error or if no optimization possible
311 */
312
313 pcre_extra *
314 pcre_study(const pcre *external_re, int options, const char **errorptr)
315 {
316 uschar start_bits[32];
317 real_pcre_extra *extra;
318 const real_pcre *re = (const real_pcre *)external_re;
319
320 *errorptr = NULL;
321
322 if (re == NULL || re->magic_number != MAGIC_NUMBER)
323 {
324 *errorptr = "argument is not a compiled regular expression";
325 return NULL;
326 }
327
328 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
329 {
330 *errorptr = "unknown or incorrect option bit(s) set";
331 return NULL;
332 }
333
334 /* For an anchored pattern, or an unchored pattern that has a first char, or a
335 multiline pattern that matches only at "line starts", no further processing at
336 present. */
337
338 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
339 return NULL;
340
341 /* See if we can find a fixed set of initial characters for the pattern. */
342
343 memset(start_bits, 0, 32 * sizeof(uschar));
344 if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0))
345 return NULL;
346
347 /* Get an "extra" block and put the information therein. */
348
349 extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
350
351 if (extra == NULL)
352 {
353 *errorptr = "failed to get memory";
354 return NULL;
355 }
356
357 extra->options = PCRE_STUDY_MAPPED;
358 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
359
360 return (pcre_extra *)extra;
361 }
362
363 /* End of study.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12