/[pcre]/code/tags/pcre-1.05/study.c
ViewVC logotype

Contents of /code/tags/pcre-1.05/study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 14 - (show annotations) (download)
Sat Feb 24 21:38:23 2007 UTC (7 years, 9 months ago) by nigel
File MIME type: text/plain
File size: 9003 byte(s)
Tag code/trunk as code/tags/pcre-1.05.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28 -----------------------------------------------------------------------------
29 */
30
31
32 /* Include the internals header, which itself includes Standard C headers plus
33 the external pcre header. */
34
35 #include "internal.h"
36
37
38
39 /*************************************************
40 * Create bitmap of starting chars *
41 *************************************************/
42
43 /* This function scans a compiled unanchored expression and attempts to build a
44 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
45 goes by, we may be able to get more clever at doing this.
46
47 Arguments:
48 code points to an expression
49 start_bits points to a 32-byte table, initialized to 0
50
51 Returns: TRUE if table built, FALSE otherwise
52 */
53
54 static BOOL
55 set_start_bits(const uschar *code, uschar *start_bits)
56 {
57 register int c;
58
59 do
60 {
61 const uschar *tcode = code + 3;
62 BOOL try_next = TRUE;
63
64 while (try_next)
65 {
66 try_next = FALSE;
67
68 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
69 {
70 if (!set_start_bits(tcode, start_bits)) return FALSE;
71 }
72
73 else switch(*tcode)
74 {
75 default:
76 return FALSE;
77
78 /* BRAZERO does the bracket, but carries on. */
79
80 case OP_BRAZERO:
81 case OP_BRAMINZERO:
82 if (!set_start_bits(++tcode, start_bits)) return FALSE;
83 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
84 tcode += 3;
85 try_next = TRUE;
86 break;
87
88 /* Single-char * or ? sets the bit and tries the next item */
89
90 case OP_STAR:
91 case OP_MINSTAR:
92 case OP_QUERY:
93 case OP_MINQUERY:
94 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
95 tcode += 2;
96 try_next = TRUE;
97 break;
98
99 /* Single-char upto sets the bit and tries the next */
100
101 case OP_UPTO:
102 case OP_MINUPTO:
103 start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));
104 tcode += 4;
105 try_next = TRUE;
106 break;
107
108 /* At least one single char sets the bit and stops */
109
110 case OP_EXACT: /* Fall through */
111 tcode++;
112
113 case OP_CHARS: /* Fall through */
114 tcode++;
115
116 case OP_PLUS:
117 case OP_MINPLUS:
118 start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));
119 break;
120
121 /* Single character type sets the bits and stops */
122
123 case OP_NOT_DIGIT:
124 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
125 break;
126
127 case OP_DIGIT:
128 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
129 break;
130
131 case OP_NOT_WHITESPACE:
132 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
133 break;
134
135 case OP_WHITESPACE:
136 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
137 break;
138
139 case OP_NOT_WORDCHAR:
140 for (c = 0; c < 32; c++)
141 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
142 break;
143
144 case OP_WORDCHAR:
145 for (c = 0; c < 32; c++)
146 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
147 break;
148
149 /* One or more character type fudges the pointer and restarts, knowing
150 it will hit a single character type and stop there. */
151
152 case OP_TYPEPLUS:
153 case OP_TYPEMINPLUS:
154 tcode++;
155 try_next = TRUE;
156 break;
157
158 case OP_TYPEEXACT:
159 tcode += 3;
160 try_next = TRUE;
161 break;
162
163 /* Zero or more repeats of character types set the bits and then
164 try again. */
165
166 case OP_TYPEUPTO:
167 case OP_TYPEMINUPTO:
168 tcode += 2; /* Fall through */
169
170 case OP_TYPESTAR:
171 case OP_TYPEMINSTAR:
172 case OP_TYPEQUERY:
173 case OP_TYPEMINQUERY:
174 switch(tcode[1])
175 {
176 case OP_NOT_DIGIT:
177 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];
178 break;
179
180 case OP_DIGIT:
181 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];
182 break;
183
184 case OP_NOT_WHITESPACE:
185 for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];
186 break;
187
188 case OP_WHITESPACE:
189 for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];
190 break;
191
192 case OP_NOT_WORDCHAR:
193 for (c = 0; c < 32; c++)
194 start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);
195 break;
196
197 case OP_WORDCHAR:
198 for (c = 0; c < 32; c++)
199 start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);
200 break;
201 }
202
203 tcode += 2;
204 try_next = TRUE;
205 break;
206
207 /* Character class: set the bits and either carry on or not,
208 according to the repeat count. */
209
210 case OP_CLASS:
211 case OP_NEGCLASS:
212 {
213 tcode++;
214 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
215 tcode += 32;
216 switch (*tcode)
217 {
218 case OP_CRSTAR:
219 case OP_CRMINSTAR:
220 case OP_CRQUERY:
221 case OP_CRMINQUERY:
222 tcode++;
223 try_next = TRUE;
224 break;
225
226 case OP_CRRANGE:
227 case OP_CRMINRANGE:
228 if (((tcode[1] << 8) + tcode[2]) == 0)
229 {
230 tcode += 5;
231 try_next = TRUE;
232 }
233 break;
234 }
235 }
236 break; /* End of class handling */
237
238 } /* End of switch */
239 } /* End of try_next loop */
240
241 code += (code[1] << 8) + code[2]; /* Advance to next branch */
242 }
243 while (*code == OP_ALT);
244 return TRUE;
245 }
246
247
248
249 /*************************************************
250 * Study a compiled expression *
251 *************************************************/
252
253 /* This function is handed a compiled expression that it must study to produce
254 information that will speed up the matching. It returns a pcre_extra block
255 which then gets handed back to pcre_exec().
256
257 Arguments:
258 re points to the compiled expression
259 options contains option bits
260 errorptr points to where to place error messages;
261 set NULL unless error
262
263 Returns: pointer to a pcre_extra block,
264 NULL on error or if no optimization possible
265 */
266
267 pcre_extra *
268 pcre_study(const pcre *external_re, int options, const char **errorptr)
269 {
270 BOOL caseless;
271 uschar start_bits[32];
272 real_pcre_extra *extra;
273 const real_pcre *re = (const real_pcre *)external_re;
274
275 *errorptr = NULL;
276
277 if (re == NULL || re->magic_number != MAGIC_NUMBER)
278 {
279 *errorptr = "argument is not a compiled regular expression";
280 return NULL;
281 }
282
283 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
284 {
285 *errorptr = "unknown or incorrect option bit(s) set";
286 return NULL;
287 }
288
289 /* Caseless can either be from the compiled regex or from options. */
290
291 caseless = ((re->options | options) & PCRE_CASELESS) != 0;
292
293 /* For an anchored pattern, or an unchored pattern that has a first char, or a
294 multiline pattern that matches only at "line starts", no further processing at
295 present. */
296
297 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
298 return NULL;
299
300 /* See if we can find a fixed set of initial characters for the pattern. */
301
302 memset(start_bits, 0, 32 * sizeof(uschar));
303 if (!set_start_bits(re->code, start_bits)) return NULL;
304
305 /* If this studying is caseless, scan the created bit map and duplicate the
306 bits for any letters. */
307
308 if (caseless)
309 {
310 register int c;
311 for (c = 0; c < 256; c++)
312 {
313 if ((start_bits[c/8] & (1 << (c&7))) != 0 &&
314 (pcre_ctypes[c] & ctype_letter) != 0)
315 {
316 int d = pcre_fcc[c];
317 start_bits[d/8] |= (1 << (d&7));
318 }
319 }
320 }
321
322 /* Get an "extra" block and put the information therein. */
323
324 extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
325
326 if (extra == NULL)
327 {
328 *errorptr = "failed to get memory";
329 return NULL;
330 }
331
332 extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);
333 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
334
335 return (pcre_extra *)extra;
336 }
337
338 /* End of study.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12