/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 850 - (hide annotations) (download)
Wed Jan 4 17:29:11 2012 UTC (2 years, 6 months ago) by zherczeg
File MIME type: text/plain
File size: 42453 byte(s)
add pcre16_callout_block and pcre16_extra
1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 836 Copyright (c) 1997-2012 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_study(), along with local
42     supporting functions. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 77 #include "pcre_internal.h"
50    
51 ph10 524 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52 nigel 77
53 nigel 93 /* Returns from set_start_bits() */
54    
55 ph10 605 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
56 nigel 93
57    
58 ph10 455
59 nigel 77 /*************************************************
60 ph10 455 * Find the minimum subject length for a group *
61     *************************************************/
62    
63     /* Scan a parenthesized group and compute the minimum length of subject that
64 ph10 461 is needed to match it. This is a lower bound; it does not mean there is a
65 ph10 455 string of that length that matches. In UTF8 mode, the result is in characters
66     rather than bytes.
67    
68     Arguments:
69 ph10 697 code pointer to start of group (the bracket)
70     startcode pointer to start of the whole pattern
71     options the compiling options
72     int RECURSE depth
73 ph10 455
74     Returns: the minimum length
75 ph10 697 -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
76 ph10 461 -2 internal error (missing capturing bracket)
77 ph10 605 -3 internal error (opcode not listed)
78 ph10 455 */
79    
80     static int
81 ph10 836 find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
82 ph10 697 int recurse_depth)
83 ph10 455 {
84     int length = -1;
85 ph10 836 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
86     BOOL utf = (options & PCRE_UTF8) != 0;
87 ph10 455 BOOL had_recurse = FALSE;
88     register int branchlength = 0;
89 ph10 836 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
90 ph10 455
91 ph10 604 if (*code == OP_CBRA || *code == OP_SCBRA ||
92 ph10 836 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
93 ph10 455
94     /* Scan along the opcodes for this branch. If we get to the end of the
95     branch, check the length against that of the other branches. */
96    
97     for (;;)
98     {
99     int d, min;
100 ph10 836 pcre_uchar *cs, *ce;
101 ph10 455 register int op = *cc;
102 ph10 461
103 ph10 455 switch (op)
104     {
105 ph10 471 case OP_COND:
106     case OP_SCOND:
107    
108     /* If there is only one branch in a condition, the implied branch has zero
109     length, so we don't add anything. This covers the DEFINE "condition"
110     automatically. */
111    
112     cs = cc + GET(cc, 1);
113     if (*cs != OP_ALT)
114     {
115     cc = cs + 1 + LINK_SIZE;
116     break;
117     }
118    
119     /* Otherwise we can fall through and treat it the same as any other
120     subpattern. */
121    
122 ph10 455 case OP_CBRA:
123 ph10 461 case OP_SCBRA:
124 ph10 455 case OP_BRA:
125 ph10 461 case OP_SBRA:
126 ph10 604 case OP_CBRAPOS:
127     case OP_SCBRAPOS:
128     case OP_BRAPOS:
129     case OP_SBRAPOS:
130 ph10 455 case OP_ONCE:
131 ph10 723 case OP_ONCE_NC:
132 ph10 697 d = find_minlength(cc, startcode, options, recurse_depth);
133 ph10 455 if (d < 0) return d;
134     branchlength += d;
135     do cc += GET(cc, 1); while (*cc == OP_ALT);
136     cc += 1 + LINK_SIZE;
137     break;
138    
139 ph10 697 /* ACCEPT makes things far too complicated; we have to give up. */
140 ph10 455
141 ph10 654 case OP_ACCEPT:
142     case OP_ASSERT_ACCEPT:
143 ph10 708 return -1;
144 ph10 697
145     /* Reached end of a branch; if it's a ket it is the end of a nested
146     call. If it's ALT it is an alternation in a nested call. If it is END it's
147 ph10 708 the end of the outer call. All can be handled by the same code. If an
148     ACCEPT was previously encountered, use the length that was in force at that
149 ph10 697 time, and pass back the shortest ACCEPT length. */
150 ph10 708
151 ph10 455 case OP_ALT:
152     case OP_KET:
153     case OP_KETRMAX:
154     case OP_KETRMIN:
155 ph10 605 case OP_KETRPOS:
156 ph10 455 case OP_END:
157 ph10 461 if (length < 0 || (!had_recurse && branchlength < length))
158 ph10 455 length = branchlength;
159 ph10 612 if (op != OP_ALT) return length;
160 ph10 455 cc += 1 + LINK_SIZE;
161     branchlength = 0;
162 ph10 461 had_recurse = FALSE;
163 ph10 455 break;
164    
165     /* Skip over assertive subpatterns */
166    
167     case OP_ASSERT:
168     case OP_ASSERT_NOT:
169     case OP_ASSERTBACK:
170     case OP_ASSERTBACK_NOT:
171     do cc += GET(cc, 1); while (*cc == OP_ALT);
172     /* Fall through */
173    
174     /* Skip over things that don't match chars */
175    
176     case OP_REVERSE:
177     case OP_CREF:
178 ph10 459 case OP_NCREF:
179 ph10 455 case OP_RREF:
180 ph10 459 case OP_NRREF:
181 ph10 455 case OP_DEF:
182     case OP_CALLOUT:
183     case OP_SOD:
184     case OP_SOM:
185     case OP_EOD:
186     case OP_EODN:
187     case OP_CIRC:
188 ph10 602 case OP_CIRCM:
189 ph10 455 case OP_DOLL:
190 ph10 602 case OP_DOLLM:
191 ph10 455 case OP_NOT_WORD_BOUNDARY:
192     case OP_WORD_BOUNDARY:
193 ph10 836 cc += PRIV(OP_lengths)[*cc];
194 ph10 455 break;
195 ph10 461
196 ph10 455 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
197    
198     case OP_BRAZERO:
199 ph10 461 case OP_BRAMINZERO:
200 ph10 604 case OP_BRAPOSZERO:
201 ph10 455 case OP_SKIPZERO:
202 ph10 836 cc += PRIV(OP_lengths)[*cc];
203 ph10 455 do cc += GET(cc, 1); while (*cc == OP_ALT);
204     cc += 1 + LINK_SIZE;
205     break;
206    
207     /* Handle literal characters and + repetitions */
208    
209     case OP_CHAR:
210 ph10 602 case OP_CHARI:
211 ph10 455 case OP_NOT:
212 ph10 602 case OP_NOTI:
213 ph10 455 case OP_PLUS:
214 ph10 603 case OP_PLUSI:
215 ph10 455 case OP_MINPLUS:
216 ph10 603 case OP_MINPLUSI:
217 ph10 455 case OP_POSPLUS:
218 ph10 603 case OP_POSPLUSI:
219 ph10 455 case OP_NOTPLUS:
220 ph10 603 case OP_NOTPLUSI:
221 ph10 455 case OP_NOTMINPLUS:
222 ph10 603 case OP_NOTMINPLUSI:
223 ph10 455 case OP_NOTPOSPLUS:
224 ph10 603 case OP_NOTPOSPLUSI:
225 ph10 455 branchlength++;
226     cc += 2;
227 ph10 836 #ifdef SUPPORT_UTF
228     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
229 ph10 455 #endif
230     break;
231 ph10 461
232 ph10 455 case OP_TYPEPLUS:
233     case OP_TYPEMINPLUS:
234 ph10 461 case OP_TYPEPOSPLUS:
235 ph10 455 branchlength++;
236     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
237     break;
238    
239     /* Handle exact repetitions. The count is already in characters, but we
240     need to skip over a multibyte character in UTF8 mode. */
241    
242     case OP_EXACT:
243 ph10 603 case OP_EXACTI:
244 ph10 461 case OP_NOTEXACT:
245 ph10 603 case OP_NOTEXACTI:
246 ph10 455 branchlength += GET2(cc,1);
247 ph10 836 cc += 2 + IMM2_SIZE;
248     #ifdef SUPPORT_UTF
249     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
250 ph10 455 #endif
251     break;
252    
253     case OP_TYPEEXACT:
254     branchlength += GET2(cc,1);
255 ph10 836 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
256     || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
257 ph10 455 break;
258    
259     /* Handle single-char non-literal matchers */
260    
261     case OP_PROP:
262     case OP_NOTPROP:
263     cc += 2;
264     /* Fall through */
265    
266     case OP_NOT_DIGIT:
267     case OP_DIGIT:
268     case OP_NOT_WHITESPACE:
269     case OP_WHITESPACE:
270     case OP_NOT_WORDCHAR:
271     case OP_WORDCHAR:
272     case OP_ANY:
273     case OP_ALLANY:
274     case OP_EXTUNI:
275 ph10 461 case OP_HSPACE:
276 ph10 455 case OP_NOT_HSPACE:
277     case OP_VSPACE:
278 ph10 461 case OP_NOT_VSPACE:
279 ph10 455 branchlength++;
280     cc++;
281     break;
282 ph10 461
283 ph10 654 /* "Any newline" might match two characters, but it also might match just
284 ph10 611 one. */
285 ph10 461
286 ph10 455 case OP_ANYNL:
287 ph10 611 branchlength += 1;
288 ph10 455 cc++;
289 ph10 461 break;
290 ph10 455
291 ph10 836 /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
292     non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
293 ph10 754 appear, but leave the code, just in case.) */
294 ph10 455
295     case OP_ANYBYTE:
296 ph10 836 #ifdef SUPPORT_UTF
297     if (utf) return -1;
298 ph10 455 #endif
299     branchlength++;
300     cc++;
301 ph10 461 break;
302 ph10 455
303     /* For repeated character types, we have to test for \p and \P, which have
304     an extra two bytes of parameters. */
305    
306     case OP_TYPESTAR:
307     case OP_TYPEMINSTAR:
308     case OP_TYPEQUERY:
309     case OP_TYPEMINQUERY:
310     case OP_TYPEPOSSTAR:
311     case OP_TYPEPOSQUERY:
312     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
313 ph10 836 cc += PRIV(OP_lengths)[op];
314 ph10 455 break;
315    
316     case OP_TYPEUPTO:
317     case OP_TYPEMINUPTO:
318     case OP_TYPEPOSUPTO:
319 ph10 836 if (cc[1 + IMM2_SIZE] == OP_PROP
320     || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
321     cc += PRIV(OP_lengths)[op];
322 ph10 455 break;
323    
324     /* Check a class for variable quantification */
325    
326 ph10 836 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
327 ph10 455 case OP_XCLASS:
328 ph10 836 cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
329 ph10 455 /* Fall through */
330     #endif
331    
332     case OP_CLASS:
333     case OP_NCLASS:
334 ph10 836 cc += PRIV(OP_lengths)[OP_CLASS];
335 ph10 455
336     switch (*cc)
337     {
338     case OP_CRPLUS:
339     case OP_CRMINPLUS:
340     branchlength++;
341 ph10 461 /* Fall through */
342 ph10 455
343     case OP_CRSTAR:
344     case OP_CRMINSTAR:
345     case OP_CRQUERY:
346     case OP_CRMINQUERY:
347 ph10 461 cc++;
348 ph10 455 break;
349 ph10 461
350 ph10 455 case OP_CRRANGE:
351     case OP_CRMINRANGE:
352     branchlength += GET2(cc,1);
353 ph10 836 cc += 1 + 2 * IMM2_SIZE;
354 ph10 455 break;
355 ph10 461
356 ph10 455 default:
357     branchlength++;
358 ph10 461 break;
359 ph10 455 }
360     break;
361 ph10 461
362     /* Backreferences and subroutine calls are treated in the same way: we find
363     the minimum length for the subpattern. A recursion, however, causes an
364 ph10 455 a flag to be set that causes the length of this branch to be ignored. The
365     logic is that a recursion can only make sense if there is another
366     alternation that stops the recursing. That will provide the minimum length
367     (when no recursion happens). A backreference within the group that it is
368 ph10 469 referencing behaves in the same way.
369    
370 ph10 467 If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
371     matches an empty string (by default it causes a matching failure), so in
372     that case we must set the minimum length to zero. */
373 ph10 461
374 ph10 455 case OP_REF:
375 ph10 602 case OP_REFI:
376 ph10 467 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
377 ph10 469 {
378 ph10 836 ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
379 ph10 467 if (cs == NULL) return -2;
380     do ce += GET(ce, 1); while (*ce == OP_ALT);
381     if (cc > cs && cc < ce)
382     {
383     d = 0;
384     had_recurse = TRUE;
385     }
386 ph10 654 else
387 ph10 612 {
388 ph10 697 d = find_minlength(cs, startcode, options, recurse_depth);
389 ph10 654 }
390 ph10 461 }
391 ph10 469 else d = 0;
392 ph10 836 cc += 1 + IMM2_SIZE;
393 ph10 455
394     /* Handle repeated back references */
395 ph10 461
396 ph10 455 switch (*cc)
397     {
398     case OP_CRSTAR:
399     case OP_CRMINSTAR:
400     case OP_CRQUERY:
401     case OP_CRMINQUERY:
402     min = 0;
403     cc++;
404     break;
405 ph10 605
406     case OP_CRPLUS:
407     case OP_CRMINPLUS:
408     min = 1;
409     cc++;
410     break;
411    
412 ph10 455 case OP_CRRANGE:
413     case OP_CRMINRANGE:
414     min = GET2(cc, 1);
415 ph10 836 cc += 1 + 2 * IMM2_SIZE;
416 ph10 455 break;
417 ph10 461
418 ph10 455 default:
419     min = 1;
420     break;
421     }
422    
423     branchlength += min * d;
424 ph10 461 break;
425 ph10 455
426 ph10 654 /* We can easily detect direct recursion, but not mutual recursion. This is
427     caught by a recursion depth count. */
428    
429 ph10 461 case OP_RECURSE:
430 ph10 836 cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
431 ph10 455 do ce += GET(ce, 1); while (*ce == OP_ALT);
432 ph10 620 if ((cc > cs && cc < ce) || recurse_depth > 10)
433 ph10 461 had_recurse = TRUE;
434     else
435 ph10 654 {
436 ph10 697 branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
437 ph10 654 }
438 ph10 455 cc += 1 + LINK_SIZE;
439     break;
440    
441     /* Anything else does not or need not match a character. We can get the
442 ph10 461 item's length from the table, but for those that can match zero occurrences
443 ph10 602 of a character, we must take special action for UTF-8 characters. As it
444     happens, the "NOT" versions of these opcodes are used at present only for
445     ASCII characters, so they could be omitted from this list. However, in
446 ph10 605 future that may change, so we include them here so as not to leave a
447 ph10 603 gotcha for a future maintainer. */
448 ph10 461
449 ph10 455 case OP_UPTO:
450 ph10 602 case OP_UPTOI:
451 ph10 461 case OP_NOTUPTO:
452 ph10 602 case OP_NOTUPTOI:
453 ph10 455 case OP_MINUPTO:
454 ph10 602 case OP_MINUPTOI:
455 ph10 461 case OP_NOTMINUPTO:
456 ph10 602 case OP_NOTMINUPTOI:
457 ph10 455 case OP_POSUPTO:
458 ph10 602 case OP_POSUPTOI:
459     case OP_NOTPOSUPTO:
460     case OP_NOTPOSUPTOI:
461    
462 ph10 455 case OP_STAR:
463 ph10 602 case OP_STARI:
464     case OP_NOTSTAR:
465     case OP_NOTSTARI:
466 ph10 455 case OP_MINSTAR:
467 ph10 602 case OP_MINSTARI:
468 ph10 461 case OP_NOTMINSTAR:
469 ph10 602 case OP_NOTMINSTARI:
470 ph10 455 case OP_POSSTAR:
471 ph10 602 case OP_POSSTARI:
472 ph10 461 case OP_NOTPOSSTAR:
473 ph10 602 case OP_NOTPOSSTARI:
474    
475 ph10 455 case OP_QUERY:
476 ph10 602 case OP_QUERYI:
477     case OP_NOTQUERY:
478     case OP_NOTQUERYI:
479 ph10 455 case OP_MINQUERY:
480 ph10 602 case OP_MINQUERYI:
481 ph10 455 case OP_NOTMINQUERY:
482 ph10 602 case OP_NOTMINQUERYI:
483 ph10 455 case OP_POSQUERY:
484 ph10 602 case OP_POSQUERYI:
485 ph10 461 case OP_NOTPOSQUERY:
486 ph10 602 case OP_NOTPOSQUERYI:
487    
488 ph10 836 cc += PRIV(OP_lengths)[op];
489     #ifdef SUPPORT_UTF
490     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
491 ph10 461 #endif
492 ph10 455 break;
493 ph10 512
494 ph10 510 /* Skip these, but we need to add in the name length. */
495 ph10 512
496 ph10 510 case OP_MARK:
497     case OP_PRUNE_ARG:
498     case OP_SKIP_ARG:
499 ph10 729 case OP_THEN_ARG:
500 ph10 836 cc += PRIV(OP_lengths)[op] + cc[1];
501 ph10 512 break;
502 ph10 455
503 ph10 603 /* The remaining opcodes are just skipped over. */
504 ph10 461
505 ph10 603 case OP_CLOSE:
506     case OP_COMMIT:
507     case OP_FAIL:
508     case OP_PRUNE:
509     case OP_SET_SOM:
510     case OP_SKIP:
511 ph10 605 case OP_THEN:
512 ph10 836 cc += PRIV(OP_lengths)[op];
513 ph10 455 break;
514 ph10 605
515 ph10 603 /* This should not occur: we list all opcodes explicitly so that when
516     new ones get added they are properly considered. */
517    
518     default:
519     return -3;
520 ph10 455 }
521     }
522     /* Control never gets here */
523     }
524    
525    
526    
527     /*************************************************
528 nigel 77 * Set a bit and maybe its alternate case *
529     *************************************************/
530    
531 ph10 535 /* Given a character, set its first byte's bit in the table, and also the
532 ph10 520 corresponding bit for the other version of a letter if we are caseless. In
533     UTF-8 mode, for characters greater than 127, we can only do the caseless thing
534     when Unicode property support is available.
535 nigel 77
536     Arguments:
537     start_bits points to the bit map
538 ph10 520 p points to the character
539 nigel 77 caseless the caseless flag
540     cd the block with char table pointers
541 ph10 836 utf TRUE for UTF-8 / UTF-16 mode
542 nigel 77
543 ph10 520 Returns: pointer after the character
544 nigel 77 */
545    
546 ph10 836 static const pcre_uchar *
547     set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
548     compile_data *cd, BOOL utf)
549 nigel 77 {
550 ph10 520 unsigned int c = *p;
551    
552 ph10 836 #ifdef COMPILE_PCRE8
553 ph10 524 SET_BIT(c);
554    
555 ph10 836 #ifdef SUPPORT_UTF
556     if (utf && c > 127)
557 ph10 520 {
558     GETCHARINC(c, p);
559     #ifdef SUPPORT_UCP
560     if (caseless)
561     {
562 ph10 836 pcre_uchar buff[6];
563 ph10 520 c = UCD_OTHERCASE(c);
564 ph10 836 (void)PRIV(ord2utf)(c, buff);
565 ph10 535 SET_BIT(buff[0]);
566     }
567     #endif
568 ph10 520 return p;
569     }
570 ph10 535 #endif
571 ph10 520
572     /* Not UTF-8 mode, or character is less than 127. */
573    
574 ph10 524 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
575 ph10 520 return p + 1;
576 ph10 836 #endif
577    
578     #ifdef COMPILE_PCRE16
579     if (c > 0xff)
580     {
581     c = 0xff;
582     caseless = FALSE;
583     }
584     SET_BIT(c);
585    
586     #ifdef SUPPORT_UTF
587     if (utf && c > 127)
588     {
589     GETCHARINC(c, p);
590     #ifdef SUPPORT_UCP
591     if (caseless)
592     {
593     c = UCD_OTHERCASE(c);
594     if (c > 0xff)
595     c = 0xff;
596     SET_BIT(c);
597     }
598     #endif
599     return p;
600     }
601     #endif
602    
603     if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
604     return p + 1;
605     #endif
606 nigel 77 }
607    
608    
609    
610     /*************************************************
611 ph10 539 * Set bits for a positive character type *
612     *************************************************/
613    
614     /* This function sets starting bits for a character type. In UTF-8 mode, we can
615     only do a direct setting for bytes less than 128, as otherwise there can be
616     confusion with bytes in the middle of UTF-8 characters. In a "traditional"
617     environment, the tables will only recognize ASCII characters anyway, but in at
618     least one Windows environment, some higher bytes bits were set in the tables.
619     So we deal with that case by considering the UTF-8 encoding.
620    
621     Arguments:
622     start_bits the starting bitmap
623     cbit type the type of character wanted
624     table_limit 32 for non-UTF-8; 16 for UTF-8
625 ph10 545 cd the block with char table pointers
626 ph10 539
627     Returns: nothing
628     */
629    
630     static void
631 ph10 836 set_type_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
632 ph10 539 compile_data *cd)
633     {
634 ph10 545 register int c;
635 ph10 539 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
636 ph10 836 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
637 ph10 539 if (table_limit == 32) return;
638     for (c = 128; c < 256; c++)
639     {
640     if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
641     {
642 ph10 836 pcre_uchar buff[6];
643     (void)PRIV(ord2utf)(c, buff);
644 ph10 545 SET_BIT(buff[0]);
645     }
646     }
647 ph10 836 #endif
648 ph10 539 }
649    
650    
651     /*************************************************
652     * Set bits for a negative character type *
653     *************************************************/
654    
655     /* This function sets starting bits for a negative character type such as \D.
656     In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
657     otherwise there can be confusion with bytes in the middle of UTF-8 characters.
658 ph10 545 Unlike in the positive case, where we can set appropriate starting bits for
659 ph10 539 specific high-valued UTF-8 characters, in this case we have to set the bits for
660 ph10 545 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
661 ph10 539 0xc0 (192) for simplicity.
662    
663     Arguments:
664     start_bits the starting bitmap
665     cbit type the type of character wanted
666     table_limit 32 for non-UTF-8; 16 for UTF-8
667 ph10 545 cd the block with char table pointers
668 ph10 539
669     Returns: nothing
670     */
671    
672     static void
673 ph10 836 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
674 ph10 539 compile_data *cd)
675     {
676 ph10 545 register int c;
677 ph10 539 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
678 ph10 836 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
679 ph10 539 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
680 ph10 836 #endif
681 ph10 539 }
682    
683    
684    
685     /*************************************************
686 nigel 93 * Create bitmap of starting bytes *
687 nigel 77 *************************************************/
688    
689 nigel 93 /* This function scans a compiled unanchored expression recursively and
690     attempts to build a bitmap of the set of possible starting bytes. As time goes
691     by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
692     useful for parenthesized groups in patterns such as (a*)b where the group
693     provides some optional starting bytes but scanning must continue at the outer
694     level to find at least one mandatory byte. At the outermost level, this
695     function fails unless the result is SSB_DONE.
696 nigel 77
697     Arguments:
698     code points to an expression
699     start_bits points to a 32-byte table, initialized to 0
700 ph10 836 utf TRUE if in UTF-8 / UTF-16 mode
701 nigel 77 cd the block with char table pointers
702    
703 nigel 93 Returns: SSB_FAIL => Failed to find any starting bytes
704     SSB_DONE => Found mandatory starting bytes
705     SSB_CONTINUE => Found optional starting bytes
706 ph10 605 SSB_UNKNOWN => Hit an unrecognized opcode
707 nigel 77 */
708    
709 nigel 93 static int
710 ph10 836 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
711 ph10 603 compile_data *cd)
712 nigel 77 {
713     register int c;
714 nigel 93 int yield = SSB_DONE;
715 ph10 836 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
716     int table_limit = utf? 16:32;
717     #else
718     int table_limit = 32;
719     #endif
720 nigel 77
721 nigel 91 #if 0
722     /* ========================================================================= */
723     /* The following comment and code was inserted in January 1999. In May 2006,
724     when it was observed to cause compiler warnings about unused values, I took it
725     out again. If anybody is still using OS/2, they will have to put it back
726     manually. */
727    
728 nigel 77 /* This next statement and the later reference to dummy are here in order to
729     trick the optimizer of the IBM C compiler for OS/2 into generating correct
730     code. Apparently IBM isn't going to fix the problem, and we would rather not
731     disable optimization (in this module it actually makes a big difference, and
732     the pcre module can use all the optimization it can get). */
733    
734     volatile int dummy;
735 nigel 91 /* ========================================================================= */
736     #endif
737 nigel 77
738     do
739     {
740     BOOL try_next = TRUE;
741 ph10 836 const pcre_uchar *tcode = code + 1 + LINK_SIZE;
742 ph10 605
743 ph10 604 if (*code == OP_CBRA || *code == OP_SCBRA ||
744 ph10 836 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
745 nigel 77
746 nigel 93 while (try_next) /* Loop for items in this branch */
747 nigel 77 {
748 nigel 93 int rc;
749 ph10 612
750 nigel 93 switch(*tcode)
751 nigel 77 {
752 ph10 605 /* If we reach something we don't understand, it means a new opcode has
753     been created that hasn't been added to this code. Hopefully this problem
754     will be discovered during testing. */
755 nigel 77
756     default:
757 ph10 605 return SSB_UNKNOWN;
758    
759     /* Fail for a valid opcode that implies no starting bits. */
760    
761     case OP_ACCEPT:
762 ph10 654 case OP_ASSERT_ACCEPT:
763 ph10 605 case OP_ALLANY:
764     case OP_ANY:
765     case OP_ANYBYTE:
766     case OP_CIRC:
767 ph10 654 case OP_CIRCM:
768 ph10 605 case OP_CLOSE:
769     case OP_COMMIT:
770     case OP_COND:
771 ph10 654 case OP_CREF:
772 ph10 605 case OP_DEF:
773     case OP_DOLL:
774 ph10 654 case OP_DOLLM:
775 ph10 605 case OP_END:
776     case OP_EOD:
777 ph10 654 case OP_EODN:
778 ph10 605 case OP_EXTUNI:
779     case OP_FAIL:
780     case OP_MARK:
781     case OP_NCREF:
782     case OP_NOT:
783     case OP_NOTEXACT:
784     case OP_NOTEXACTI:
785 ph10 654 case OP_NOTI:
786 ph10 605 case OP_NOTMINPLUS:
787     case OP_NOTMINPLUSI:
788     case OP_NOTMINQUERY:
789     case OP_NOTMINQUERYI:
790     case OP_NOTMINSTAR:
791     case OP_NOTMINSTARI:
792     case OP_NOTMINUPTO:
793     case OP_NOTMINUPTOI:
794     case OP_NOTPLUS:
795     case OP_NOTPLUSI:
796     case OP_NOTPOSPLUS:
797     case OP_NOTPOSPLUSI:
798     case OP_NOTPOSQUERY:
799     case OP_NOTPOSQUERYI:
800     case OP_NOTPOSSTAR:
801     case OP_NOTPOSSTARI:
802     case OP_NOTPOSUPTO:
803     case OP_NOTPOSUPTOI:
804     case OP_NOTPROP:
805     case OP_NOTQUERY:
806     case OP_NOTQUERYI:
807     case OP_NOTSTAR:
808     case OP_NOTSTARI:
809     case OP_NOTUPTO:
810     case OP_NOTUPTOI:
811     case OP_NOT_HSPACE:
812     case OP_NOT_VSPACE:
813     case OP_NRREF:
814     case OP_PROP:
815     case OP_PRUNE:
816     case OP_PRUNE_ARG:
817     case OP_RECURSE:
818     case OP_REF:
819     case OP_REFI:
820     case OP_REVERSE:
821     case OP_RREF:
822 ph10 654 case OP_SCOND:
823 ph10 605 case OP_SET_SOM:
824     case OP_SKIP:
825     case OP_SKIP_ARG:
826     case OP_SOD:
827     case OP_SOM:
828     case OP_THEN:
829     case OP_THEN_ARG:
830 ph10 836 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
831 ph10 605 case OP_XCLASS:
832 ph10 836 #endif
833 nigel 93 return SSB_FAIL;
834 nigel 77
835 ph10 657 /* We can ignore word boundary tests. */
836 ph10 659
837 ph10 657 case OP_WORD_BOUNDARY:
838     case OP_NOT_WORD_BOUNDARY:
839 ph10 659 tcode++;
840     break;
841 ph10 657
842 nigel 93 /* If we hit a bracket or a positive lookahead assertion, recurse to set
843     bits from within the subpattern. If it can't find anything, we have to
844     give up. If it finds some mandatory character(s), we are done for this
845     branch. Otherwise, carry on scanning after the subpattern. */
846    
847     case OP_BRA:
848     case OP_SBRA:
849     case OP_CBRA:
850     case OP_SCBRA:
851 ph10 604 case OP_BRAPOS:
852     case OP_SBRAPOS:
853     case OP_CBRAPOS:
854     case OP_SCBRAPOS:
855 nigel 93 case OP_ONCE:
856 ph10 723 case OP_ONCE_NC:
857 nigel 93 case OP_ASSERT:
858 ph10 836 rc = set_start_bits(tcode, start_bits, utf, cd);
859 ph10 605 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
860 nigel 93 if (rc == SSB_DONE) try_next = FALSE; else
861     {
862     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
863     tcode += 1 + LINK_SIZE;
864     }
865     break;
866    
867     /* If we hit ALT or KET, it means we haven't found anything mandatory in
868     this branch, though we might have found something optional. For ALT, we
869     continue with the next alternative, but we have to arrange that the final
870     result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
871     return SSB_CONTINUE: if this is the top level, that indicates failure,
872     but after a nested subpattern, it causes scanning to continue. */
873    
874     case OP_ALT:
875     yield = SSB_CONTINUE;
876     try_next = FALSE;
877     break;
878    
879     case OP_KET:
880     case OP_KETRMAX:
881     case OP_KETRMIN:
882 ph10 605 case OP_KETRPOS:
883 nigel 93 return SSB_CONTINUE;
884    
885 nigel 77 /* Skip over callout */
886    
887     case OP_CALLOUT:
888     tcode += 2 + 2*LINK_SIZE;
889     break;
890    
891     /* Skip over lookbehind and negative lookahead assertions */
892    
893     case OP_ASSERT_NOT:
894     case OP_ASSERTBACK:
895     case OP_ASSERTBACK_NOT:
896     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
897 nigel 93 tcode += 1 + LINK_SIZE;
898 nigel 77 break;
899    
900     /* BRAZERO does the bracket, but carries on. */
901    
902     case OP_BRAZERO:
903     case OP_BRAMINZERO:
904 ph10 604 case OP_BRAPOSZERO:
905 ph10 836 rc = set_start_bits(++tcode, start_bits, utf, cd);
906 ph10 605 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
907 nigel 91 /* =========================================================================
908     See the comment at the head of this function concerning the next line,
909     which was an old fudge for the benefit of OS/2.
910 nigel 77 dummy = 1;
911 nigel 91 ========================================================================= */
912 nigel 77 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
913 nigel 93 tcode += 1 + LINK_SIZE;
914 nigel 77 break;
915    
916 ph10 335 /* SKIPZERO skips the bracket. */
917    
918     case OP_SKIPZERO:
919 ph10 358 tcode++;
920 ph10 335 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
921     tcode += 1 + LINK_SIZE;
922     break;
923    
924 nigel 77 /* Single-char * or ? sets the bit and tries the next item */
925    
926     case OP_STAR:
927     case OP_MINSTAR:
928 nigel 93 case OP_POSSTAR:
929 nigel 77 case OP_QUERY:
930     case OP_MINQUERY:
931 nigel 93 case OP_POSQUERY:
932 ph10 836 tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
933 nigel 77 break;
934    
935 ph10 602 case OP_STARI:
936     case OP_MINSTARI:
937     case OP_POSSTARI:
938     case OP_QUERYI:
939     case OP_MINQUERYI:
940     case OP_POSQUERYI:
941 ph10 836 tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
942 ph10 602 break;
943    
944 nigel 77 /* Single-char upto sets the bit and tries the next */
945    
946     case OP_UPTO:
947     case OP_MINUPTO:
948 nigel 93 case OP_POSUPTO:
949 ph10 836 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
950 nigel 77 break;
951    
952 ph10 602 case OP_UPTOI:
953     case OP_MINUPTOI:
954     case OP_POSUPTOI:
955 ph10 836 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
956 ph10 602 break;
957    
958 nigel 77 /* At least one single char sets the bit and stops */
959    
960 ph10 602 case OP_EXACT:
961 ph10 836 tcode += IMM2_SIZE;
962 ph10 602 /* Fall through */
963 nigel 77 case OP_CHAR:
964     case OP_PLUS:
965     case OP_MINPLUS:
966 nigel 93 case OP_POSPLUS:
967 ph10 836 (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
968 nigel 77 try_next = FALSE;
969     break;
970 ph10 535
971 ph10 603 case OP_EXACTI:
972 ph10 836 tcode += IMM2_SIZE;
973 ph10 603 /* Fall through */
974 ph10 602 case OP_CHARI:
975     case OP_PLUSI:
976     case OP_MINPLUSI:
977     case OP_POSPLUSI:
978 ph10 836 (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
979 ph10 602 try_next = FALSE;
980     break;
981    
982 ph10 535 /* Special spacing and line-terminating items. These recognize specific
983     lists of characters. The difference between VSPACE and ANYNL is that the
984     latter can match the two-character CRLF sequence, but that is not
985     relevant for finding the first character, so their code here is
986 ph10 524 identical. */
987 ph10 535
988 ph10 524 case OP_HSPACE:
989     SET_BIT(0x09);
990     SET_BIT(0x20);
991 ph10 836 #ifdef SUPPORT_UTF
992     if (utf)
993 ph10 535 {
994 ph10 836 #ifdef COMPILE_PCRE8
995 ph10 545 SET_BIT(0xC2); /* For U+00A0 */
996 ph10 524 SET_BIT(0xE1); /* For U+1680, U+180E */
997     SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
998 ph10 535 SET_BIT(0xE3); /* For U+3000 */
999 ph10 836 #endif
1000     #ifdef COMPILE_PCRE16
1001     SET_BIT(0xA0);
1002     SET_BIT(0xFF); /* For characters > 255 */
1003     #endif
1004 ph10 524 }
1005 ph10 836 else
1006     #endif /* SUPPORT_UTF */
1007     {
1008     SET_BIT(0xA0);
1009     #ifdef COMPILE_PCRE16
1010     SET_BIT(0xFF); /* For characters > 255 */
1011     #endif
1012     }
1013 ph10 524 try_next = FALSE;
1014 ph10 535 break;
1015 nigel 77
1016 ph10 535 case OP_ANYNL:
1017 ph10 524 case OP_VSPACE:
1018 ph10 535 SET_BIT(0x0A);
1019     SET_BIT(0x0B);
1020     SET_BIT(0x0C);
1021     SET_BIT(0x0D);
1022 ph10 836 #ifdef SUPPORT_UTF
1023     if (utf)
1024 ph10 545 {
1025 ph10 836 #ifdef COMPILE_PCRE8
1026 ph10 545 SET_BIT(0xC2); /* For U+0085 */
1027 ph10 538 SET_BIT(0xE2); /* For U+2028, U+2029 */
1028 ph10 836 #endif
1029     #ifdef COMPILE_PCRE16
1030     SET_BIT(0x85);
1031     SET_BIT(0xFF); /* For characters > 255 */
1032     #endif
1033 ph10 545 }
1034 ph10 836 else
1035     #endif /* SUPPORT_UTF */
1036     {
1037     SET_BIT(0x85);
1038     #ifdef COMPILE_PCRE16
1039     SET_BIT(0xFF); /* For characters > 255 */
1040     #endif
1041     }
1042 ph10 524 try_next = FALSE;
1043 ph10 535 break;
1044 ph10 524
1045 ph10 535 /* Single character types set the bits and stop. Note that if PCRE_UCP
1046     is set, we do not see these op codes because \d etc are converted to
1047 ph10 545 properties. Therefore, these apply in the case when only characters less
1048 ph10 539 than 256 are recognized to match the types. */
1049 nigel 77
1050     case OP_NOT_DIGIT:
1051 ph10 539 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1052 nigel 77 try_next = FALSE;
1053     break;
1054    
1055     case OP_DIGIT:
1056 ph10 539 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1057 nigel 77 try_next = FALSE;
1058     break;
1059    
1060 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
1061 ph10 539 ensure it is set as not whitespace. */
1062 nigel 91
1063 nigel 77 case OP_NOT_WHITESPACE:
1064 ph10 539 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1065     start_bits[1] |= 0x08;
1066 nigel 77 try_next = FALSE;
1067     break;
1068    
1069 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
1070 ph10 539 not set it from the table. */
1071 nigel 91
1072 nigel 77 case OP_WHITESPACE:
1073 ph10 539 c = start_bits[1]; /* Save in case it was already set */
1074     set_type_bits(start_bits, cbit_space, table_limit, cd);
1075     start_bits[1] = (start_bits[1] & ~0x08) | c;
1076 nigel 77 try_next = FALSE;
1077     break;
1078    
1079     case OP_NOT_WORDCHAR:
1080 ph10 539 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1081 nigel 77 try_next = FALSE;
1082     break;
1083    
1084     case OP_WORDCHAR:
1085 ph10 539 set_type_bits(start_bits, cbit_word, table_limit, cd);
1086 nigel 77 try_next = FALSE;
1087     break;
1088 ph10 545
1089 nigel 77 /* One or more character type fudges the pointer and restarts, knowing
1090     it will hit a single character type and stop there. */
1091    
1092     case OP_TYPEPLUS:
1093     case OP_TYPEMINPLUS:
1094 ph10 535 case OP_TYPEPOSPLUS:
1095 nigel 77 tcode++;
1096     break;
1097    
1098     case OP_TYPEEXACT:
1099 ph10 836 tcode += 1 + IMM2_SIZE;
1100 nigel 77 break;
1101    
1102     /* Zero or more repeats of character types set the bits and then
1103     try again. */
1104    
1105     case OP_TYPEUPTO:
1106     case OP_TYPEMINUPTO:
1107 nigel 93 case OP_TYPEPOSUPTO:
1108 ph10 836 tcode += IMM2_SIZE; /* Fall through */
1109 nigel 77
1110     case OP_TYPESTAR:
1111     case OP_TYPEMINSTAR:
1112 nigel 93 case OP_TYPEPOSSTAR:
1113 nigel 77 case OP_TYPEQUERY:
1114     case OP_TYPEMINQUERY:
1115 nigel 93 case OP_TYPEPOSQUERY:
1116 nigel 77 switch(tcode[1])
1117     {
1118 ph10 523 default:
1119 nigel 77 case OP_ANY:
1120 ph10 345 case OP_ALLANY:
1121 nigel 93 return SSB_FAIL;
1122 ph10 535
1123 ph10 524 case OP_HSPACE:
1124     SET_BIT(0x09);
1125     SET_BIT(0x20);
1126 ph10 836 #ifdef COMPILE_PCRE8
1127     if (utf)
1128 ph10 535 {
1129 ph10 836 #ifdef COMPILE_PCRE8
1130 ph10 545 SET_BIT(0xC2); /* For U+00A0 */
1131 ph10 524 SET_BIT(0xE1); /* For U+1680, U+180E */
1132     SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1133 ph10 535 SET_BIT(0xE3); /* For U+3000 */
1134 ph10 836 #endif
1135     #ifdef COMPILE_PCRE16
1136     SET_BIT(0xA0);
1137     SET_BIT(0xFF); /* For characters > 255 */
1138     #endif
1139 ph10 524 }
1140 ph10 836 else
1141     #endif /* SUPPORT_UTF */
1142     SET_BIT(0xA0);
1143 ph10 535 break;
1144    
1145     case OP_ANYNL:
1146 ph10 524 case OP_VSPACE:
1147 ph10 535 SET_BIT(0x0A);
1148     SET_BIT(0x0B);
1149     SET_BIT(0x0C);
1150     SET_BIT(0x0D);
1151 ph10 836 #ifdef COMPILE_PCRE8
1152     if (utf)
1153 ph10 538 {
1154 ph10 836 #ifdef COMPILE_PCRE8
1155 ph10 545 SET_BIT(0xC2); /* For U+0085 */
1156 ph10 538 SET_BIT(0xE2); /* For U+2028, U+2029 */
1157 ph10 836 #endif
1158     #ifdef COMPILE_PCRE16
1159     SET_BIT(0x85);
1160     SET_BIT(0xFF); /* For characters > 255 */
1161     #endif
1162 ph10 545 }
1163 ph10 836 else
1164     #endif /* SUPPORT_UTF */
1165     SET_BIT(0x85);
1166 ph10 535 break;
1167    
1168 nigel 77 case OP_NOT_DIGIT:
1169 ph10 539 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1170 nigel 77 break;
1171    
1172     case OP_DIGIT:
1173 ph10 539 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1174 nigel 77 break;
1175    
1176 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
1177 ph10 539 ensure it gets set as not whitespace. */
1178 nigel 91
1179 nigel 77 case OP_NOT_WHITESPACE:
1180 ph10 539 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1181 ph10 545 start_bits[1] |= 0x08;
1182 nigel 77 break;
1183    
1184 nigel 91 /* The cbit_space table has vertical tab as whitespace; we have to
1185 ph10 539 avoid setting it. */
1186 nigel 91
1187 nigel 77 case OP_WHITESPACE:
1188 ph10 539 c = start_bits[1]; /* Save in case it was already set */
1189     set_type_bits(start_bits, cbit_space, table_limit, cd);
1190     start_bits[1] = (start_bits[1] & ~0x08) | c;
1191 nigel 77 break;
1192    
1193     case OP_NOT_WORDCHAR:
1194 ph10 539 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1195 nigel 77 break;
1196    
1197     case OP_WORDCHAR:
1198 ph10 539 set_type_bits(start_bits, cbit_word, table_limit, cd);
1199 nigel 77 break;
1200     }
1201    
1202     tcode += 2;
1203     break;
1204    
1205     /* Character class where all the information is in a bit map: set the
1206     bits and either carry on or not, according to the repeat count. If it was
1207     a negative class, and we are operating with UTF-8 characters, any byte
1208     with a value >= 0xc4 is a potentially valid starter because it starts a
1209     character with a value > 255. */
1210    
1211     case OP_NCLASS:
1212 ph10 836 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1213     if (utf)
1214 nigel 77 {
1215     start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1216     memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1217     }
1218 ph10 111 #endif
1219 ph10 836 #ifdef COMPILE_PCRE16
1220     SET_BIT(0xFF); /* For characters > 255 */
1221     #endif
1222 nigel 77 /* Fall through */
1223    
1224     case OP_CLASS:
1225     {
1226 ph10 836 pcre_uint8 *map;
1227 nigel 77 tcode++;
1228 ph10 836 map = (pcre_uint8 *)tcode;
1229 nigel 77
1230     /* In UTF-8 mode, the bits in a bit map correspond to character
1231     values, not to byte values. However, the bit map we are constructing is
1232     for byte values. So we have to do a conversion for characters whose
1233     value is > 127. In fact, there are only two possible starting bytes for
1234     characters in the range 128 - 255. */
1235    
1236 ph10 836 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1237     if (utf)
1238 nigel 77 {
1239 ph10 836 for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1240 nigel 77 for (c = 128; c < 256; c++)
1241     {
1242 ph10 836 if ((map[c/8] && (1 << (c&7))) != 0)
1243 nigel 77 {
1244     int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1245     start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1246     c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1247     }
1248     }
1249     }
1250     else
1251 ph10 111 #endif
1252 nigel 77 {
1253 ph10 836 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1254     for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1255 nigel 77 }
1256    
1257 ph10 654 /* Advance past the bit map, and act on what follows. For a zero
1258 ph10 605 minimum repeat, continue; otherwise stop processing. */
1259 nigel 77
1260 ph10 836 tcode += 32 / sizeof(pcre_uchar);
1261 nigel 77 switch (*tcode)
1262     {
1263     case OP_CRSTAR:
1264     case OP_CRMINSTAR:
1265     case OP_CRQUERY:
1266     case OP_CRMINQUERY:
1267     tcode++;
1268     break;
1269    
1270     case OP_CRRANGE:
1271     case OP_CRMINRANGE:
1272 ph10 836 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1273 nigel 77 else try_next = FALSE;
1274     break;
1275 ph10 654
1276 nigel 77 default:
1277     try_next = FALSE;
1278     break;
1279     }
1280     }
1281     break; /* End of bitmap class handling */
1282    
1283     } /* End of switch */
1284     } /* End of try_next loop */
1285    
1286     code += GET(code, 1); /* Advance to next branch */
1287     }
1288     while (*code == OP_ALT);
1289 nigel 93 return yield;
1290 nigel 77 }
1291    
1292    
1293    
1294 ph10 605
1295    
1296 nigel 77 /*************************************************
1297     * Study a compiled expression *
1298     *************************************************/
1299    
1300     /* This function is handed a compiled expression that it must study to produce
1301 zherczeg 850 information that will speed up the matching. It returns a pcre[16]_extra block
1302 nigel 77 which then gets handed back to pcre_exec().
1303    
1304     Arguments:
1305     re points to the compiled expression
1306     options contains option bits
1307     errorptr points to where to place error messages;
1308     set NULL unless error
1309    
1310 zherczeg 850 Returns: pointer to a pcre[16]_extra block, with study_data filled in and
1311     the appropriate flags set;
1312 nigel 77 NULL on error or if no optimization possible
1313     */
1314    
1315 ph10 836 #ifdef COMPILE_PCRE8
1316 ph10 359 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1317 nigel 77 pcre_study(const pcre *external_re, int options, const char **errorptr)
1318 ph10 836 #else
1319 zherczeg 850 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1320 ph10 836 pcre16_study(const pcre *external_re, int options, const char **errorptr)
1321     #endif
1322 nigel 77 {
1323 ph10 455 int min;
1324     BOOL bits_set = FALSE;
1325 ph10 836 pcre_uint8 start_bits[32];
1326 zherczeg 850 PUBL(extra) *extra = NULL;
1327 nigel 77 pcre_study_data *study;
1328 ph10 836 const pcre_uint8 *tables;
1329     pcre_uchar *code;
1330 nigel 91 compile_data compile_block;
1331 nigel 77 const real_pcre *re = (const real_pcre *)external_re;
1332    
1333     *errorptr = NULL;
1334    
1335     if (re == NULL || re->magic_number != MAGIC_NUMBER)
1336     {
1337     *errorptr = "argument is not a compiled regular expression";
1338     return NULL;
1339     }
1340    
1341 ph10 836 if ((re->flags & PCRE_MODE) == 0)
1342     {
1343     #ifdef COMPILE_PCRE8
1344     *errorptr = "argument is compiled in 16 bit mode";
1345     #else
1346     *errorptr = "argument is compiled in 8 bit mode";
1347     #endif
1348     return NULL;
1349     }
1350    
1351 nigel 77 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1352     {
1353     *errorptr = "unknown or incorrect option bit(s) set";
1354     return NULL;
1355     }
1356    
1357 ph10 836 code = (pcre_uchar *)re + re->name_table_offset +
1358 nigel 91 (re->name_count * re->name_entry_size);
1359    
1360 nigel 77 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1361 ph10 461 a multiline pattern that matches only at "line starts", there is no point in
1362 ph10 455 seeking a list of starting bytes. */
1363 nigel 77
1364 ph10 455 if ((re->options & PCRE_ANCHORED) == 0 &&
1365     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1366     {
1367 ph10 605 int rc;
1368    
1369 ph10 455 /* Set the character tables in the block that is passed around */
1370 ph10 461
1371 ph10 455 tables = re->tables;
1372 ph10 836
1373     #ifdef COMPILE_PCRE8
1374 ph10 455 if (tables == NULL)
1375     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1376     (void *)(&tables));
1377 ph10 836 #else
1378     if (tables == NULL)
1379     (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1380     (void *)(&tables));
1381     #endif
1382 ph10 461
1383 ph10 455 compile_block.lcc = tables + lcc_offset;
1384     compile_block.fcc = tables + fcc_offset;
1385     compile_block.cbits = tables + cbits_offset;
1386     compile_block.ctypes = tables + ctypes_offset;
1387 ph10 461
1388 ph10 455 /* See if we can find a fixed set of initial characters for the pattern. */
1389 ph10 461
1390 ph10 836 memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1391 ph10 605 rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1392     &compile_block);
1393     bits_set = rc == SSB_DONE;
1394 ph10 691 if (rc == SSB_UNKNOWN)
1395 ph10 688 {
1396     *errorptr = "internal error: opcode not recognized";
1397     return NULL;
1398 ph10 691 }
1399 ph10 455 }
1400 ph10 461
1401 ph10 455 /* Find the minimum length of subject string. */
1402 nigel 77
1403 ph10 697 switch(min = find_minlength(code, code, re->options, 0))
1404 ph10 603 {
1405 ph10 688 case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1406     case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1407 ph10 603 default: break;
1408 ph10 605 }
1409 ph10 708
1410 ph10 688 /* If a set of starting bytes has been identified, or if the minimum length is
1411 zherczeg 850 greater than zero, or if JIT optimization has been requested, get a
1412     pcre[16]_extra block and a pcre_study_data block. The study data is put in the
1413     latter, which is pointed to by the former, which may also get additional data
1414     set later by the calling program. At the moment, the size of pcre_study_data
1415     is fixed. We nevertheless save it in a field for returning via the
1416     pcre_fullinfo() function so that if it becomes variable in the future,
1417     we don't have to change that code. */
1418 nigel 77
1419 ph10 691 if (bits_set || min > 0
1420 ph10 688 #ifdef SUPPORT_JIT
1421     || (options & PCRE_STUDY_JIT_COMPILE) != 0
1422     #endif
1423     )
1424 nigel 77 {
1425 zherczeg 850 extra = (PUBL(extra) *)(PUBL(malloc))
1426     (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1427 ph10 688 if (extra == NULL)
1428     {
1429     *errorptr = "failed to get memory";
1430     return NULL;
1431     }
1432 ph10 691
1433 zherczeg 850 study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1434 ph10 688 extra->flags = PCRE_EXTRA_STUDY_DATA;
1435     extra->study_data = study;
1436 ph10 691
1437 ph10 688 study->size = sizeof(pcre_study_data);
1438     study->flags = 0;
1439 ph10 797
1440 ph10 836 /* Set the start bits always, to avoid unset memory errors if the
1441     study data is written to a file, but set the flag only if any of the bits
1442     are set, to save time looking when none are. */
1443    
1444 ph10 797 if (bits_set)
1445 ph10 688 {
1446     study->flags |= PCRE_STUDY_MAPPED;
1447     memcpy(study->start_bits, start_bits, sizeof(start_bits));
1448     }
1449 ph10 836 else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1450 ph10 691
1451 ph10 836 #ifdef PCRE_DEBUG
1452     if (bits_set)
1453     {
1454     pcre_uint8 *ptr = (pcre_uint32 *)start_bits;
1455     int i;
1456    
1457     printf("Start bits:\n");
1458     for (i = 0; i < 32; i++)
1459     printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1460     }
1461     #endif
1462    
1463 ph10 691 /* Always set the minlength value in the block, because the JIT compiler
1464     makes use of it. However, don't set the bit unless the length is greater than
1465     zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1466 ph10 697 checking the zero case. */
1467 ph10 691
1468 ph10 708 if (min > 0)
1469 ph10 697 {
1470     study->flags |= PCRE_STUDY_MINLEN;
1471     study->minlength = min;
1472     }
1473 ph10 708 else study->minlength = 0;
1474 ph10 691
1475 ph10 688 /* If JIT support was compiled and requested, attempt the JIT compilation.
1476     If no starting bytes were found, and the minimum length is zero, and JIT
1477 ph10 689 compilation fails, abandon the extra block and return NULL. */
1478 ph10 691
1479 ph10 666 #ifdef SUPPORT_JIT
1480 ph10 688 extra->executable_jit = NULL;
1481 ph10 836 if ((options & PCRE_STUDY_JIT_COMPILE) != 0) PRIV(jit_compile)(re, extra);
1482 ph10 689 if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0)
1483 ph10 688 {
1484 ph10 836 #ifdef COMPILE_PCRE8
1485 ph10 688 pcre_free_study(extra);
1486 ph10 836 #endif
1487     #ifdef COMPILE_PCRE16
1488     pcre16_free_study(extra);
1489     #endif
1490 ph10 688 extra = NULL;
1491 ph10 691 }
1492 ph10 666 #endif
1493 ph10 688 }
1494 ph10 666
1495 nigel 77 return extra;
1496     }
1497    
1498 ph10 666
1499     /*************************************************
1500     * Free the study data *
1501     *************************************************/
1502    
1503     /* This function frees the memory that was obtained by pcre_study().
1504    
1505 zherczeg 850 Argument: a pointer to the pcre[16]_extra block
1506 ph10 666 Returns: nothing
1507     */
1508    
1509 ph10 836 #ifdef COMPILE_PCRE8
1510 ph10 666 PCRE_EXP_DEFN void
1511     pcre_free_study(pcre_extra *extra)
1512 ph10 836 #else
1513     PCRE_EXP_DEFN void
1514 zherczeg 850 pcre16_free_study(pcre16_extra *extra)
1515 ph10 836 #endif
1516 ph10 666 {
1517 ph10 836 if (extra == NULL)
1518     return;
1519 ph10 666 #ifdef SUPPORT_JIT
1520 ph10 691 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1521 ph10 666 extra->executable_jit != NULL)
1522 ph10 836 PRIV(jit_free)(extra->executable_jit);
1523 ph10 666 #endif
1524 ph10 836 PUBL(free)(extra);
1525 ph10 666 }
1526    
1527 nigel 77 /* End of pcre_study.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12