/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 230 - (hide annotations) (download)
Mon Sep 10 13:23:56 2007 UTC (6 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 17743 byte(s)
(1) Move internal flags out of the options field, to make room.
(2) \r and \n must be explicit to trigger the special CRLF handline exception.
(3) (?J) at the start now sets JCHANGED as well as DUPNAMES.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_study(), along with local
42     supporting functions. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 199 #include <config.h>
47 ph10 200 #endif
48 ph10 199
49 nigel 77 #include "pcre_internal.h"
50    
51    
52 nigel 93 /* Returns from set_start_bits() */
53    
54     enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
55    
56    
57 nigel 77 /*************************************************
58     * Set a bit and maybe its alternate case *
59     *************************************************/
60    
61     /* Given a character, set its bit in the table, and also the bit for the other
62     version of a letter if we are caseless.
63    
64     Arguments:
65     start_bits points to the bit map
66     c is the character
67     caseless the caseless flag
68     cd the block with char table pointers
69    
70     Returns: nothing
71     */
72    
73     static void
74     set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
75     {
76     start_bits[c/8] |= (1 << (c&7));
77     if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
78     start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
79     }
80    
81    
82    
83     /*************************************************
84 nigel 93 * Create bitmap of starting bytes *
85 nigel 77 *************************************************/
86    
87 nigel 93 /* This function scans a compiled unanchored expression recursively and
88     attempts to build a bitmap of the set of possible starting bytes. As time goes
89     by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
90     useful for parenthesized groups in patterns such as (a*)b where the group
91     provides some optional starting bytes but scanning must continue at the outer
92     level to find at least one mandatory byte. At the outermost level, this
93     function fails unless the result is SSB_DONE.
94 nigel 77
95     Arguments:
96     code points to an expression
97     start_bits points to a 32-byte table, initialized to 0
98     caseless the current state of the caseless flag
99     utf8 TRUE if in UTF-8 mode
100     cd the block with char table pointers
101    
102 nigel 93 Returns: SSB_FAIL => Failed to find any starting bytes
103     SSB_DONE => Found mandatory starting bytes
104     SSB_CONTINUE => Found optional starting bytes
105 nigel 77 */
106    
107 nigel 93 static int
108 nigel 77 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
109     BOOL utf8, compile_data *cd)
110     {
111     register int c;
112 nigel 93 int yield = SSB_DONE;
113 nigel 77
114 nigel 91 #if 0
115     /* ========================================================================= */
116     /* The following comment and code was inserted in January 1999. In May 2006,
117     when it was observed to cause compiler warnings about unused values, I took it
118     out again. If anybody is still using OS/2, they will have to put it back
119     manually. */
120    
121 nigel 77 /* This next statement and the later reference to dummy are here in order to
122     trick the optimizer of the IBM C compiler for OS/2 into generating correct
123     code. Apparently IBM isn't going to fix the problem, and we would rather not
124     disable optimization (in this module it actually makes a big difference, and
125     the pcre module can use all the optimization it can get). */
126    
127     volatile int dummy;
128 nigel 91 /* ========================================================================= */
129     #endif
130 nigel 77
131     do
132     {
133 nigel 93 const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
134 nigel 77 BOOL try_next = TRUE;
135    
136 nigel 93 while (try_next) /* Loop for items in this branch */
137 nigel 77 {
138 nigel 93 int rc;
139     switch(*tcode)
140 nigel 77 {
141 nigel 93 /* Fail if we reach something we don't understand */
142 nigel 77
143     default:
144 nigel 93 return SSB_FAIL;
145 nigel 77
146 nigel 93 /* If we hit a bracket or a positive lookahead assertion, recurse to set
147     bits from within the subpattern. If it can't find anything, we have to
148     give up. If it finds some mandatory character(s), we are done for this
149     branch. Otherwise, carry on scanning after the subpattern. */
150    
151     case OP_BRA:
152     case OP_SBRA:
153     case OP_CBRA:
154     case OP_SCBRA:
155     case OP_ONCE:
156     case OP_ASSERT:
157     rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
158     if (rc == SSB_FAIL) return SSB_FAIL;
159     if (rc == SSB_DONE) try_next = FALSE; else
160     {
161     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
162     tcode += 1 + LINK_SIZE;
163     }
164     break;
165    
166     /* If we hit ALT or KET, it means we haven't found anything mandatory in
167     this branch, though we might have found something optional. For ALT, we
168     continue with the next alternative, but we have to arrange that the final
169     result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
170     return SSB_CONTINUE: if this is the top level, that indicates failure,
171     but after a nested subpattern, it causes scanning to continue. */
172    
173     case OP_ALT:
174     yield = SSB_CONTINUE;
175     try_next = FALSE;
176     break;
177    
178     case OP_KET:
179     case OP_KETRMAX:
180     case OP_KETRMIN:
181     return SSB_CONTINUE;
182    
183 nigel 77 /* Skip over callout */
184    
185     case OP_CALLOUT:
186     tcode += 2 + 2*LINK_SIZE;
187     break;
188    
189     /* Skip over lookbehind and negative lookahead assertions */
190    
191     case OP_ASSERT_NOT:
192     case OP_ASSERTBACK:
193     case OP_ASSERTBACK_NOT:
194     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
195 nigel 93 tcode += 1 + LINK_SIZE;
196 nigel 77 break;
197    
198     /* Skip over an option setting, changing the caseless flag */
199    
200     case OP_OPT:
201     caseless = (tcode[1] & PCRE_CASELESS) != 0;
202     tcode += 2;
203     break;
204    
205     /* BRAZERO does the bracket, but carries on. */
206    
207     case OP_BRAZERO:
208     case OP_BRAMINZERO:
209 nigel 93 if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
210     return SSB_FAIL;
211 nigel 91 /* =========================================================================
212     See the comment at the head of this function concerning the next line,
213     which was an old fudge for the benefit of OS/2.
214 nigel 77 dummy = 1;
215 nigel 91 ========================================================================= */
216 nigel 77 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
217 nigel 93 tcode += 1 + LINK_SIZE;
218 nigel 77 break;
219    
220     /* Single-char * or ? sets the bit and tries the next item */
221    
222     case OP_STAR:
223     case OP_MINSTAR:
224 nigel 93 case OP_POSSTAR:
225 nigel 77 case OP_QUERY:
226     case OP_MINQUERY:
227 nigel 93 case OP_POSQUERY:
228 nigel 77 set_bit(start_bits, tcode[1], caseless, cd);
229     tcode += 2;
230     #ifdef SUPPORT_UTF8
231 nigel 93 if (utf8 && tcode[-1] >= 0xc0)
232     tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
233 nigel 77 #endif
234     break;
235    
236     /* Single-char upto sets the bit and tries the next */
237    
238     case OP_UPTO:
239     case OP_MINUPTO:
240 nigel 93 case OP_POSUPTO:
241 nigel 77 set_bit(start_bits, tcode[3], caseless, cd);
242     tcode += 4;
243     #ifdef SUPPORT_UTF8
244 nigel 93 if (utf8 && tcode[-1] >= 0xc0)
245     tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
246 nigel 77 #endif
247     break;
248    
249     /* At least one single char sets the bit and stops */
250    
251     case OP_EXACT: /* Fall through */
252     tcode += 2;
253    
254     case OP_CHAR:
255     case OP_CHARNC:
256     case OP_PLUS:
257     case OP_MINPLUS:
258 nigel 93 case OP_POSPLUS:
259 nigel 77 set_bit(start_bits, tcode[1], caseless, cd);
260     try_next = FALSE;
261     break;
262    
263     /* Single character type sets the bits and stops */
264    
265     case OP_NOT_DIGIT:
266     for (c = 0; c < 32; c++)
267     start_bits[c] |= ~cd->cbits[c+cbit_digit];
268     try_next = FALSE;
269     break;
270    
271     case OP_DIGIT:
272     for (c = 0; c < 32; c++)
273     start_bits[c] |= cd->cbits[c+cbit_digit];
274     try_next = FALSE;
275     break;
276    
277 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
278     discard it. */
279    
280 nigel 77 case OP_NOT_WHITESPACE:
281     for (c = 0; c < 32; c++)
282 nigel 91 {
283     int d = cd->cbits[c+cbit_space];
284     if (c == 1) d &= ~0x08;
285     start_bits[c] |= ~d;
286     }
287 nigel 77 try_next = FALSE;
288     break;
289    
290 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
291     discard it. */
292    
293 nigel 77 case OP_WHITESPACE:
294     for (c = 0; c < 32; c++)
295 nigel 91 {
296     int d = cd->cbits[c+cbit_space];
297     if (c == 1) d &= ~0x08;
298     start_bits[c] |= d;
299     }
300 nigel 77 try_next = FALSE;
301     break;
302    
303     case OP_NOT_WORDCHAR:
304     for (c = 0; c < 32; c++)
305     start_bits[c] |= ~cd->cbits[c+cbit_word];
306     try_next = FALSE;
307     break;
308    
309     case OP_WORDCHAR:
310     for (c = 0; c < 32; c++)
311     start_bits[c] |= cd->cbits[c+cbit_word];
312     try_next = FALSE;
313     break;
314    
315     /* One or more character type fudges the pointer and restarts, knowing
316     it will hit a single character type and stop there. */
317    
318     case OP_TYPEPLUS:
319     case OP_TYPEMINPLUS:
320     tcode++;
321     break;
322    
323     case OP_TYPEEXACT:
324     tcode += 3;
325     break;
326    
327     /* Zero or more repeats of character types set the bits and then
328     try again. */
329    
330     case OP_TYPEUPTO:
331     case OP_TYPEMINUPTO:
332 nigel 93 case OP_TYPEPOSUPTO:
333 nigel 77 tcode += 2; /* Fall through */
334    
335     case OP_TYPESTAR:
336     case OP_TYPEMINSTAR:
337 nigel 93 case OP_TYPEPOSSTAR:
338 nigel 77 case OP_TYPEQUERY:
339     case OP_TYPEMINQUERY:
340 nigel 93 case OP_TYPEPOSQUERY:
341 nigel 77 switch(tcode[1])
342     {
343     case OP_ANY:
344 nigel 93 return SSB_FAIL;
345 nigel 77
346     case OP_NOT_DIGIT:
347     for (c = 0; c < 32; c++)
348     start_bits[c] |= ~cd->cbits[c+cbit_digit];
349     break;
350    
351     case OP_DIGIT:
352     for (c = 0; c < 32; c++)
353     start_bits[c] |= cd->cbits[c+cbit_digit];
354     break;
355    
356 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
357     discard it. */
358    
359 nigel 77 case OP_NOT_WHITESPACE:
360     for (c = 0; c < 32; c++)
361 nigel 91 {
362     int d = cd->cbits[c+cbit_space];
363     if (c == 1) d &= ~0x08;
364     start_bits[c] |= ~d;
365     }
366 nigel 77 break;
367    
368 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
369     discard it. */
370    
371 nigel 77 case OP_WHITESPACE:
372     for (c = 0; c < 32; c++)
373 nigel 91 {
374     int d = cd->cbits[c+cbit_space];
375     if (c == 1) d &= ~0x08;
376     start_bits[c] |= d;
377     }
378 nigel 77 break;
379    
380     case OP_NOT_WORDCHAR:
381     for (c = 0; c < 32; c++)
382     start_bits[c] |= ~cd->cbits[c+cbit_word];
383     break;
384    
385     case OP_WORDCHAR:
386     for (c = 0; c < 32; c++)
387     start_bits[c] |= cd->cbits[c+cbit_word];
388     break;
389     }
390    
391     tcode += 2;
392     break;
393    
394     /* Character class where all the information is in a bit map: set the
395     bits and either carry on or not, according to the repeat count. If it was
396     a negative class, and we are operating with UTF-8 characters, any byte
397     with a value >= 0xc4 is a potentially valid starter because it starts a
398     character with a value > 255. */
399    
400     case OP_NCLASS:
401 ph10 111 #ifdef SUPPORT_UTF8
402 nigel 77 if (utf8)
403     {
404     start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
405     memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
406     }
407 ph10 111 #endif
408 nigel 77 /* Fall through */
409    
410     case OP_CLASS:
411     {
412     tcode++;
413    
414     /* In UTF-8 mode, the bits in a bit map correspond to character
415     values, not to byte values. However, the bit map we are constructing is
416     for byte values. So we have to do a conversion for characters whose
417     value is > 127. In fact, there are only two possible starting bytes for
418     characters in the range 128 - 255. */
419    
420 ph10 107 #ifdef SUPPORT_UTF8
421 nigel 77 if (utf8)
422     {
423     for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
424     for (c = 128; c < 256; c++)
425     {
426     if ((tcode[c/8] && (1 << (c&7))) != 0)
427     {
428     int d = (c >> 6) | 0xc0; /* Set bit for this starter */
429     start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
430     c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
431     }
432     }
433     }
434    
435     /* In non-UTF-8 mode, the two bit maps are completely compatible. */
436    
437     else
438 ph10 111 #endif
439 nigel 77 {
440     for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
441     }
442    
443     /* Advance past the bit map, and act on what follows */
444    
445     tcode += 32;
446     switch (*tcode)
447     {
448     case OP_CRSTAR:
449     case OP_CRMINSTAR:
450     case OP_CRQUERY:
451     case OP_CRMINQUERY:
452     tcode++;
453     break;
454    
455     case OP_CRRANGE:
456     case OP_CRMINRANGE:
457     if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
458     else try_next = FALSE;
459     break;
460    
461     default:
462     try_next = FALSE;
463     break;
464     }
465     }
466     break; /* End of bitmap class handling */
467    
468     } /* End of switch */
469     } /* End of try_next loop */
470    
471     code += GET(code, 1); /* Advance to next branch */
472     }
473     while (*code == OP_ALT);
474 nigel 93 return yield;
475 nigel 77 }
476    
477    
478    
479     /*************************************************
480     * Study a compiled expression *
481     *************************************************/
482    
483     /* This function is handed a compiled expression that it must study to produce
484     information that will speed up the matching. It returns a pcre_extra block
485     which then gets handed back to pcre_exec().
486    
487     Arguments:
488     re points to the compiled expression
489     options contains option bits
490     errorptr points to where to place error messages;
491     set NULL unless error
492    
493     Returns: pointer to a pcre_extra block, with study_data filled in and the
494     appropriate flag set;
495     NULL on error or if no optimization possible
496     */
497    
498 ph10 145 PCRE_EXP_DEFN pcre_extra *
499 nigel 77 pcre_study(const pcre *external_re, int options, const char **errorptr)
500     {
501     uschar start_bits[32];
502     pcre_extra *extra;
503     pcre_study_data *study;
504     const uschar *tables;
505 nigel 91 uschar *code;
506     compile_data compile_block;
507 nigel 77 const real_pcre *re = (const real_pcre *)external_re;
508    
509     *errorptr = NULL;
510    
511     if (re == NULL || re->magic_number != MAGIC_NUMBER)
512     {
513     *errorptr = "argument is not a compiled regular expression";
514     return NULL;
515     }
516    
517     if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
518     {
519     *errorptr = "unknown or incorrect option bit(s) set";
520     return NULL;
521     }
522    
523 nigel 91 code = (uschar *)re + re->name_table_offset +
524     (re->name_count * re->name_entry_size);
525    
526 nigel 77 /* For an anchored pattern, or an unanchored pattern that has a first char, or
527     a multiline pattern that matches only at "line starts", no further processing
528     at present. */
529    
530 ph10 230 if ((re->options & PCRE_ANCHORED) != 0 ||
531     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
532 nigel 77 return NULL;
533    
534     /* Set the character tables in the block that is passed around */
535    
536     tables = re->tables;
537     if (tables == NULL)
538     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
539     (void *)(&tables));
540    
541     compile_block.lcc = tables + lcc_offset;
542     compile_block.fcc = tables + fcc_offset;
543     compile_block.cbits = tables + cbits_offset;
544     compile_block.ctypes = tables + ctypes_offset;
545    
546     /* See if we can find a fixed set of initial characters for the pattern. */
547    
548     memset(start_bits, 0, 32 * sizeof(uschar));
549 nigel 93 if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
550     (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
551 nigel 77
552     /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
553     the latter, which is pointed to by the former, which may also get additional
554     data set later by the calling program. At the moment, the size of
555     pcre_study_data is fixed. We nevertheless save it in a field for returning via
556     the pcre_fullinfo() function so that if it becomes variable in the future, we
557     don't have to change that code. */
558    
559     extra = (pcre_extra *)(pcre_malloc)
560     (sizeof(pcre_extra) + sizeof(pcre_study_data));
561    
562     if (extra == NULL)
563     {
564     *errorptr = "failed to get memory";
565     return NULL;
566     }
567    
568     study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
569     extra->flags = PCRE_EXTRA_STUDY_DATA;
570     extra->study_data = study;
571    
572     study->size = sizeof(pcre_study_data);
573     study->options = PCRE_STUDY_MAPPED;
574     memcpy(study->start_bits, start_bits, sizeof(start_bits));
575    
576     return extra;
577     }
578    
579     /* End of pcre_study.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12