/[pcre]/code/trunk/pcre_study.c
ViewVC logotype

Contents of /code/trunk/pcre_study.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1548 - (hide annotations) (download)
Tue Apr 14 17:02:30 2015 UTC (12 days, 20 hours ago) by ph10
File MIME type: text/plain
File size: 48356 byte(s)
Documentation and tidies preparatory to 8.37 release.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 836 Copyright (c) 1997-2012 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_study(), along with local
42     supporting functions. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 77 #include "pcre_internal.h"
50    
51 ph10 524 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52 nigel 77
53 nigel 93 /* Returns from set_start_bits() */
54    
55 ph10 605 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
56 nigel 93
57    
58 ph10 455
59 nigel 77 /*************************************************
60 ph10 455 * Find the minimum subject length for a group *
61     *************************************************/
62    
63     /* Scan a parenthesized group and compute the minimum length of subject that
64 ph10 461 is needed to match it. This is a lower bound; it does not mean there is a
65 ph10 455 string of that length that matches. In UTF8 mode, the result is in characters
66     rather than bytes.
67    
68     Arguments:
69 ph10 1361 re compiled pattern block
70 ph10 697 code pointer to start of group (the bracket)
71 ph10 1361 startcode pointer to start of the whole pattern's code
72 ph10 697 options the compiling options
73 ph10 1547 recurses chain of recurse_check to catch mutual recursion
74 ph10 455
75     Returns: the minimum length
76 ph10 697 -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
77 ph10 461 -2 internal error (missing capturing bracket)
78 ph10 605 -3 internal error (opcode not listed)
79 ph10 455 */
80    
81     static int
82 ph10 1404 find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
83 ph10 1547 const pcre_uchar *startcode, int options, recurse_check *recurses)
84 ph10 455 {
85     int length = -1;
86 ph10 836 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
87     BOOL utf = (options & PCRE_UTF8) != 0;
88 ph10 455 BOOL had_recurse = FALSE;
89 ph10 1547 recurse_check this_recurse;
90 ph10 455 register int branchlength = 0;
91 ph10 836 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
92 ph10 455
93 ph10 604 if (*code == OP_CBRA || *code == OP_SCBRA ||
94 ph10 836 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
95 ph10 455
96     /* Scan along the opcodes for this branch. If we get to the end of the
97     branch, check the length against that of the other branches. */
98    
99     for (;;)
100     {
101     int d, min;
102 ph10 836 pcre_uchar *cs, *ce;
103 chpe 1084 register pcre_uchar op = *cc;
104 ph10 461
105 ph10 455 switch (op)
106     {
107 ph10 471 case OP_COND:
108     case OP_SCOND:
109    
110     /* If there is only one branch in a condition, the implied branch has zero
111     length, so we don't add anything. This covers the DEFINE "condition"
112     automatically. */
113    
114     cs = cc + GET(cc, 1);
115     if (*cs != OP_ALT)
116     {
117     cc = cs + 1 + LINK_SIZE;
118     break;
119     }
120    
121     /* Otherwise we can fall through and treat it the same as any other
122     subpattern. */
123    
124 ph10 455 case OP_CBRA:
125 ph10 461 case OP_SCBRA:
126 ph10 455 case OP_BRA:
127 ph10 461 case OP_SBRA:
128 ph10 604 case OP_CBRAPOS:
129     case OP_SCBRAPOS:
130     case OP_BRAPOS:
131     case OP_SBRAPOS:
132 ph10 455 case OP_ONCE:
133 ph10 723 case OP_ONCE_NC:
134 ph10 1547 d = find_minlength(re, cc, startcode, options, recurses);
135 ph10 455 if (d < 0) return d;
136     branchlength += d;
137     do cc += GET(cc, 1); while (*cc == OP_ALT);
138     cc += 1 + LINK_SIZE;
139     break;
140    
141 ph10 697 /* ACCEPT makes things far too complicated; we have to give up. */
142 ph10 455
143 ph10 654 case OP_ACCEPT:
144     case OP_ASSERT_ACCEPT:
145 ph10 708 return -1;
146 ph10 697
147     /* Reached end of a branch; if it's a ket it is the end of a nested
148     call. If it's ALT it is an alternation in a nested call. If it is END it's
149 ph10 708 the end of the outer call. All can be handled by the same code. If an
150     ACCEPT was previously encountered, use the length that was in force at that
151 ph10 697 time, and pass back the shortest ACCEPT length. */
152 ph10 708
153 ph10 455 case OP_ALT:
154     case OP_KET:
155     case OP_KETRMAX:
156     case OP_KETRMIN:
157 ph10 605 case OP_KETRPOS:
158 ph10 455 case OP_END:
159 ph10 461 if (length < 0 || (!had_recurse && branchlength < length))
160 ph10 455 length = branchlength;
161 ph10 612 if (op != OP_ALT) return length;
162 ph10 455 cc += 1 + LINK_SIZE;
163     branchlength = 0;
164 ph10 461 had_recurse = FALSE;
165 ph10 455 break;
166    
167     /* Skip over assertive subpatterns */
168    
169     case OP_ASSERT:
170     case OP_ASSERT_NOT:
171     case OP_ASSERTBACK:
172     case OP_ASSERTBACK_NOT:
173     do cc += GET(cc, 1); while (*cc == OP_ALT);
174     /* Fall through */
175    
176     /* Skip over things that don't match chars */
177    
178     case OP_REVERSE:
179     case OP_CREF:
180 ph10 1365 case OP_DNCREF:
181 ph10 455 case OP_RREF:
182 ph10 1365 case OP_DNRREF:
183 ph10 455 case OP_DEF:
184     case OP_CALLOUT:
185     case OP_SOD:
186     case OP_SOM:
187     case OP_EOD:
188     case OP_EODN:
189     case OP_CIRC:
190 ph10 602 case OP_CIRCM:
191 ph10 455 case OP_DOLL:
192 ph10 602 case OP_DOLLM:
193 ph10 455 case OP_NOT_WORD_BOUNDARY:
194     case OP_WORD_BOUNDARY:
195 ph10 836 cc += PRIV(OP_lengths)[*cc];
196 ph10 455 break;
197 ph10 461
198 ph10 455 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
199    
200     case OP_BRAZERO:
201 ph10 461 case OP_BRAMINZERO:
202 ph10 604 case OP_BRAPOSZERO:
203 ph10 455 case OP_SKIPZERO:
204 ph10 836 cc += PRIV(OP_lengths)[*cc];
205 ph10 455 do cc += GET(cc, 1); while (*cc == OP_ALT);
206     cc += 1 + LINK_SIZE;
207     break;
208    
209     /* Handle literal characters and + repetitions */
210    
211     case OP_CHAR:
212 ph10 602 case OP_CHARI:
213 ph10 455 case OP_NOT:
214 ph10 602 case OP_NOTI:
215 ph10 455 case OP_PLUS:
216 ph10 603 case OP_PLUSI:
217 ph10 455 case OP_MINPLUS:
218 ph10 603 case OP_MINPLUSI:
219 ph10 455 case OP_POSPLUS:
220 ph10 603 case OP_POSPLUSI:
221 ph10 455 case OP_NOTPLUS:
222 ph10 603 case OP_NOTPLUSI:
223 ph10 455 case OP_NOTMINPLUS:
224 ph10 603 case OP_NOTMINPLUSI:
225 ph10 455 case OP_NOTPOSPLUS:
226 ph10 603 case OP_NOTPOSPLUSI:
227 ph10 455 branchlength++;
228     cc += 2;
229 chpe 1114 #ifdef SUPPORT_UTF
230 ph10 836 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
231 ph10 455 #endif
232     break;
233 ph10 461
234 ph10 455 case OP_TYPEPLUS:
235     case OP_TYPEMINPLUS:
236 ph10 461 case OP_TYPEPOSPLUS:
237 ph10 455 branchlength++;
238     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
239     break;
240    
241     /* Handle exact repetitions. The count is already in characters, but we
242     need to skip over a multibyte character in UTF8 mode. */
243    
244     case OP_EXACT:
245 ph10 603 case OP_EXACTI:
246 ph10 461 case OP_NOTEXACT:
247 ph10 603 case OP_NOTEXACTI:
248 ph10 455 branchlength += GET2(cc,1);
249 ph10 836 cc += 2 + IMM2_SIZE;
250 chpe 1114 #ifdef SUPPORT_UTF
251 ph10 836 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
252 ph10 455 #endif
253     break;
254    
255     case OP_TYPEEXACT:
256     branchlength += GET2(cc,1);
257 ph10 836 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
258     || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
259 ph10 455 break;
260    
261     /* Handle single-char non-literal matchers */
262    
263     case OP_PROP:
264     case OP_NOTPROP:
265     cc += 2;
266     /* Fall through */
267    
268     case OP_NOT_DIGIT:
269     case OP_DIGIT:
270     case OP_NOT_WHITESPACE:
271     case OP_WHITESPACE:
272     case OP_NOT_WORDCHAR:
273     case OP_WORDCHAR:
274     case OP_ANY:
275     case OP_ALLANY:
276     case OP_EXTUNI:
277 ph10 461 case OP_HSPACE:
278 ph10 455 case OP_NOT_HSPACE:
279     case OP_VSPACE:
280 ph10 461 case OP_NOT_VSPACE:
281 ph10 455 branchlength++;
282     cc++;
283     break;
284 ph10 461
285 ph10 654 /* "Any newline" might match two characters, but it also might match just
286 ph10 611 one. */
287 ph10 461
288 ph10 455 case OP_ANYNL:
289 ph10 611 branchlength += 1;
290 ph10 455 cc++;
291 ph10 461 break;
292 ph10 455
293 ph10 836 /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
294     non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
295 ph10 754 appear, but leave the code, just in case.) */
296 ph10 455
297     case OP_ANYBYTE:
298 ph10 836 #ifdef SUPPORT_UTF
299     if (utf) return -1;
300 ph10 455 #endif
301     branchlength++;
302     cc++;
303 ph10 461 break;
304 ph10 455
305     /* For repeated character types, we have to test for \p and \P, which have
306     an extra two bytes of parameters. */
307    
308     case OP_TYPESTAR:
309     case OP_TYPEMINSTAR:
310     case OP_TYPEQUERY:
311     case OP_TYPEMINQUERY:
312     case OP_TYPEPOSSTAR:
313     case OP_TYPEPOSQUERY:
314     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
315 ph10 836 cc += PRIV(OP_lengths)[op];
316 ph10 455 break;
317    
318     case OP_TYPEUPTO:
319     case OP_TYPEMINUPTO:
320     case OP_TYPEPOSUPTO:
321 ph10 836 if (cc[1 + IMM2_SIZE] == OP_PROP
322     || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
323     cc += PRIV(OP_lengths)[op];
324 ph10 455 break;
325    
326     /* Check a class for variable quantification */
327    
328     case OP_CLASS:
329     case OP_NCLASS:
330 zherczeg 1148 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
331     case OP_XCLASS:
332     /* The original code caused an unsigned overflow in 64 bit systems,
333     so now we use a conditional statement. */
334     if (op == OP_XCLASS)
335     cc += GET(cc, 1);
336     else
337     cc += PRIV(OP_lengths)[OP_CLASS];
338     #else
339 ph10 836 cc += PRIV(OP_lengths)[OP_CLASS];
340 zherczeg 1148 #endif
341 ph10 455
342     switch (*cc)
343     {
344     case OP_CRPLUS:
345     case OP_CRMINPLUS:
346 ph10 1379 case OP_CRPOSPLUS:
347 ph10 455 branchlength++;
348 ph10 461 /* Fall through */
349 ph10 455
350     case OP_CRSTAR:
351     case OP_CRMINSTAR:
352     case OP_CRQUERY:
353     case OP_CRMINQUERY:
354 ph10 1379 case OP_CRPOSSTAR:
355     case OP_CRPOSQUERY:
356 ph10 461 cc++;
357 ph10 455 break;
358 ph10 461
359 ph10 455 case OP_CRRANGE:
360     case OP_CRMINRANGE:
361 ph10 1379 case OP_CRPOSRANGE:
362 ph10 455 branchlength += GET2(cc,1);
363 ph10 836 cc += 1 + 2 * IMM2_SIZE;
364 ph10 455 break;
365 ph10 461
366 ph10 455 default:
367     branchlength++;
368 ph10 461 break;
369 ph10 455 }
370     break;
371 ph10 461
372     /* Backreferences and subroutine calls are treated in the same way: we find
373     the minimum length for the subpattern. A recursion, however, causes an
374 ph10 455 a flag to be set that causes the length of this branch to be ignored. The
375     logic is that a recursion can only make sense if there is another
376     alternation that stops the recursing. That will provide the minimum length
377     (when no recursion happens). A backreference within the group that it is
378 ph10 469 referencing behaves in the same way.
379    
380 ph10 467 If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
381     matches an empty string (by default it causes a matching failure), so in
382     that case we must set the minimum length to zero. */
383 ph10 1404
384 ph10 1361 case OP_DNREF: /* Duplicate named pattern back reference */
385     case OP_DNREFI:
386     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
387     {
388 ph10 1404 int count = GET2(cc, 1+IMM2_SIZE);
389     pcre_uchar *slot = (pcre_uchar *)re +
390 ph10 1361 re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
391     d = INT_MAX;
392     while (count-- > 0)
393     {
394     ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
395     if (cs == NULL) return -2;
396     do ce += GET(ce, 1); while (*ce == OP_ALT);
397 ph10 1547 if (cc > cs && cc < ce) /* Simple recursion */
398 ph10 1361 {
399     d = 0;
400     had_recurse = TRUE;
401 ph10 1404 break;
402 ph10 1361 }
403     else
404 ph10 1548 {
405     recurse_check *r = recurses;
406 ph10 1547 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
407     if (r != NULL) /* Mutual recursion */
408 ph10 1548 {
409     d = 0;
410     had_recurse = TRUE;
411     break;
412     }
413 ph10 1547 else
414     {
415     int dd;
416     this_recurse.prev = recurses;
417 ph10 1548 this_recurse.group = cs;
418 ph10 1547 dd = find_minlength(re, cs, startcode, options, &this_recurse);
419     if (dd < d) d = dd;
420     }
421 ph10 1548 }
422 ph10 1361 slot += re->name_entry_size;
423 ph10 1404 }
424 ph10 1361 }
425 ph10 1404 else d = 0;
426 ph10 1361 cc += 1 + 2*IMM2_SIZE;
427 ph10 1404 goto REPEAT_BACK_REFERENCE;
428 ph10 461
429 ph10 1361 case OP_REF: /* Single back reference */
430 ph10 602 case OP_REFI:
431 ph10 467 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
432 ph10 469 {
433 ph10 836 ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
434 ph10 467 if (cs == NULL) return -2;
435     do ce += GET(ce, 1); while (*ce == OP_ALT);
436 ph10 1547 if (cc > cs && cc < ce) /* Simple recursion */
437 ph10 467 {
438     d = 0;
439     had_recurse = TRUE;
440     }
441 ph10 654 else
442 ph10 1548 {
443     recurse_check *r = recurses;
444 ph10 1547 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
445     if (r != NULL) /* Mutual recursion */
446 ph10 1548 {
447     d = 0;
448     had_recurse = TRUE;
449     }
450 ph10 1547 else
451     {
452     this_recurse.prev = recurses;
453 ph10 1548 this_recurse.group = cs;
454 ph10 1547 d = find_minlength(re, cs, startcode, options, &this_recurse);
455 ph10 1548 }
456 ph10 654 }
457 ph10 461 }
458 ph10 469 else d = 0;
459 ph10 836 cc += 1 + IMM2_SIZE;
460 ph10 455
461     /* Handle repeated back references */
462 ph10 461
463 ph10 1361 REPEAT_BACK_REFERENCE:
464 ph10 455 switch (*cc)
465     {
466     case OP_CRSTAR:
467     case OP_CRMINSTAR:
468     case OP_CRQUERY:
469     case OP_CRMINQUERY:
470 ph10 1379 case OP_CRPOSSTAR:
471     case OP_CRPOSQUERY:
472 ph10 455 min = 0;
473     cc++;
474     break;
475 ph10 605
476     case OP_CRPLUS:
477     case OP_CRMINPLUS:
478 ph10 1379 case OP_CRPOSPLUS:
479 ph10 605 min = 1;
480     cc++;
481     break;
482    
483 ph10 455 case OP_CRRANGE:
484     case OP_CRMINRANGE:
485 ph10 1379 case OP_CRPOSRANGE:
486 ph10 455 min = GET2(cc, 1);
487 ph10 836 cc += 1 + 2 * IMM2_SIZE;
488 ph10 455 break;
489 ph10 461
490 ph10 455 default:
491     min = 1;
492     break;
493     }
494    
495     branchlength += min * d;
496 ph10 461 break;
497 ph10 455
498 ph10 654 /* We can easily detect direct recursion, but not mutual recursion. This is
499     caught by a recursion depth count. */
500    
501 ph10 461 case OP_RECURSE:
502 ph10 836 cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
503 ph10 455 do ce += GET(ce, 1); while (*ce == OP_ALT);
504 ph10 1547 if (cc > cs && cc < ce) /* Simple recursion */
505 ph10 461 had_recurse = TRUE;
506     else
507 ph10 1548 {
508     recurse_check *r = recurses;
509 ph10 1547 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
510     if (r != NULL) /* Mutual recursion */
511 ph10 1548 had_recurse = TRUE;
512 ph10 1547 else
513     {
514     this_recurse.prev = recurses;
515 ph10 1548 this_recurse.group = cs;
516 ph10 1547 branchlength += find_minlength(re, cs, startcode, options,
517     &this_recurse);
518 ph10 1548 }
519 ph10 654 }
520 ph10 455 cc += 1 + LINK_SIZE;
521     break;
522    
523     /* Anything else does not or need not match a character. We can get the
524 ph10 461 item's length from the table, but for those that can match zero occurrences
525 ph10 602 of a character, we must take special action for UTF-8 characters. As it
526     happens, the "NOT" versions of these opcodes are used at present only for
527     ASCII characters, so they could be omitted from this list. However, in
528 ph10 605 future that may change, so we include them here so as not to leave a
529 ph10 603 gotcha for a future maintainer. */
530 ph10 461
531 ph10 455 case OP_UPTO:
532 ph10 602 case OP_UPTOI:
533 ph10 461 case OP_NOTUPTO:
534 ph10 602 case OP_NOTUPTOI:
535 ph10 455 case OP_MINUPTO:
536 ph10 602 case OP_MINUPTOI:
537 ph10 461 case OP_NOTMINUPTO:
538 ph10 602 case OP_NOTMINUPTOI:
539 ph10 455 case OP_POSUPTO:
540 ph10 602 case OP_POSUPTOI:
541     case OP_NOTPOSUPTO:
542     case OP_NOTPOSUPTOI:
543    
544 ph10 455 case OP_STAR:
545 ph10 602 case OP_STARI:
546     case OP_NOTSTAR:
547     case OP_NOTSTARI:
548 ph10 455 case OP_MINSTAR:
549 ph10 602 case OP_MINSTARI:
550 ph10 461 case OP_NOTMINSTAR:
551 ph10 602 case OP_NOTMINSTARI:
552 ph10 455 case OP_POSSTAR:
553 ph10 602 case OP_POSSTARI:
554 ph10 461 case OP_NOTPOSSTAR:
555 ph10 602 case OP_NOTPOSSTARI:
556    
557 ph10 455 case OP_QUERY:
558 ph10 602 case OP_QUERYI:
559     case OP_NOTQUERY:
560     case OP_NOTQUERYI:
561 ph10 455 case OP_MINQUERY:
562 ph10 602 case OP_MINQUERYI:
563 ph10 455 case OP_NOTMINQUERY:
564 ph10 602 case OP_NOTMINQUERYI:
565 ph10 455 case OP_POSQUERY:
566 ph10 602 case OP_POSQUERYI:
567 ph10 461 case OP_NOTPOSQUERY:
568 ph10 602 case OP_NOTPOSQUERYI:
569    
570 ph10 836 cc += PRIV(OP_lengths)[op];
571 chpe 1114 #ifdef SUPPORT_UTF
572 ph10 836 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
573 ph10 461 #endif
574 ph10 455 break;
575 ph10 512
576 ph10 510 /* Skip these, but we need to add in the name length. */
577 ph10 512
578 ph10 510 case OP_MARK:
579     case OP_PRUNE_ARG:
580     case OP_SKIP_ARG:
581 ph10 729 case OP_THEN_ARG:
582 ph10 836 cc += PRIV(OP_lengths)[op] + cc[1];
583 ph10 512 break;
584 ph10 455
585 ph10 603 /* The remaining opcodes are just skipped over. */
586 ph10 461
587 ph10 603 case OP_CLOSE:
588     case OP_COMMIT:
589     case OP_FAIL:
590     case OP_PRUNE:
591     case OP_SET_SOM:
592     case OP_SKIP:
593 ph10 605 case OP_THEN:
594 ph10 836 cc += PRIV(OP_lengths)[op];
595 ph10 455 break;
596 ph10 605
597 ph10 603 /* This should not occur: we list all opcodes explicitly so that when
598     new ones get added they are properly considered. */
599    
600     default:
601     return -3;
602 ph10 455 }
603     }
604     /* Control never gets here */
605     }
606    
607    
608    
609     /*************************************************
610 nigel 77 * Set a bit and maybe its alternate case *
611     *************************************************/
612    
613 ph10 535 /* Given a character, set its first byte's bit in the table, and also the
614 ph10 520 corresponding bit for the other version of a letter if we are caseless. In
615     UTF-8 mode, for characters greater than 127, we can only do the caseless thing
616     when Unicode property support is available.
617 nigel 77
618     Arguments:
619     start_bits points to the bit map
620 ph10 520 p points to the character
621 nigel 77 caseless the caseless flag
622     cd the block with char table pointers
623 chpe 1055 utf TRUE for UTF-8 / UTF-16 / UTF-32 mode
624 nigel 77
625 ph10 520 Returns: pointer after the character
626 nigel 77 */
627    
628 ph10 836 static const pcre_uchar *
629     set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
630     compile_data *cd, BOOL utf)
631 nigel 77 {
632 chpe 1102 pcre_uint32 c = *p;
633 ph10 520
634 ph10 836 #ifdef COMPILE_PCRE8
635 ph10 524 SET_BIT(c);
636    
637 ph10 836 #ifdef SUPPORT_UTF
638     if (utf && c > 127)
639 ph10 520 {
640     GETCHARINC(c, p);
641     #ifdef SUPPORT_UCP
642     if (caseless)
643     {
644 ph10 836 pcre_uchar buff[6];
645 ph10 520 c = UCD_OTHERCASE(c);
646 ph10 836 (void)PRIV(ord2utf)(c, buff);
647 ph10 535 SET_BIT(buff[0]);
648     }
649 ph10 1032 #endif /* Not SUPPORT_UCP */
650 ph10 520 return p;
651     }
652 ph10 1033 #else /* Not SUPPORT_UTF */
653 ph10 1221 (void)(utf); /* Stops warning for unused parameter */
654 ph10 1033 #endif /* SUPPORT_UTF */
655 ph10 520
656     /* Not UTF-8 mode, or character is less than 127. */
657    
658 ph10 524 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
659 ph10 520 return p + 1;
660 ph10 1033 #endif /* COMPILE_PCRE8 */
661 ph10 836
662 chpe 1055 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
663 ph10 836 if (c > 0xff)
664     {
665     c = 0xff;
666     caseless = FALSE;
667     }
668     SET_BIT(c);
669    
670     #ifdef SUPPORT_UTF
671     if (utf && c > 127)
672     {
673     GETCHARINC(c, p);
674     #ifdef SUPPORT_UCP
675     if (caseless)
676     {
677     c = UCD_OTHERCASE(c);
678     if (c > 0xff)
679     c = 0xff;
680     SET_BIT(c);
681     }
682 ph10 1033 #endif /* SUPPORT_UCP */
683 ph10 836 return p;
684     }
685 ph10 1033 #else /* Not SUPPORT_UTF */
686 ph10 1221 (void)(utf); /* Stops warning for unused parameter */
687 ph10 1033 #endif /* SUPPORT_UTF */
688 ph10 836
689     if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
690     return p + 1;
691     #endif
692 nigel 77 }
693    
694    
695    
696     /*************************************************
697 ph10 539 * Set bits for a positive character type *
698     *************************************************/
699    
700     /* This function sets starting bits for a character type. In UTF-8 mode, we can
701     only do a direct setting for bytes less than 128, as otherwise there can be
702     confusion with bytes in the middle of UTF-8 characters. In a "traditional"
703     environment, the tables will only recognize ASCII characters anyway, but in at
704     least one Windows environment, some higher bytes bits were set in the tables.
705     So we deal with that case by considering the UTF-8 encoding.
706    
707     Arguments:
708     start_bits the starting bitmap
709     cbit type the type of character wanted
710     table_limit 32 for non-UTF-8; 16 for UTF-8
711 ph10 545 cd the block with char table pointers
712 ph10 539
713     Returns: nothing
714     */
715    
716     static void
717 chpe 1134 set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
718 ph10 539 compile_data *cd)
719     {
720 chpe 1095 register pcre_uint32 c;
721 ph10 539 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
722 ph10 836 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
723 ph10 539 if (table_limit == 32) return;
724     for (c = 128; c < 256; c++)
725     {
726     if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
727     {
728 ph10 836 pcre_uchar buff[6];
729     (void)PRIV(ord2utf)(c, buff);
730 ph10 545 SET_BIT(buff[0]);
731     }
732     }
733 ph10 836 #endif
734 ph10 539 }
735    
736    
737     /*************************************************
738     * Set bits for a negative character type *
739     *************************************************/
740    
741     /* This function sets starting bits for a negative character type such as \D.
742     In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
743     otherwise there can be confusion with bytes in the middle of UTF-8 characters.
744 ph10 545 Unlike in the positive case, where we can set appropriate starting bits for
745 ph10 539 specific high-valued UTF-8 characters, in this case we have to set the bits for
746 ph10 545 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
747 ph10 539 0xc0 (192) for simplicity.
748    
749     Arguments:
750     start_bits the starting bitmap
751     cbit type the type of character wanted
752     table_limit 32 for non-UTF-8; 16 for UTF-8
753 ph10 545 cd the block with char table pointers
754 ph10 539
755     Returns: nothing
756     */
757    
758     static void
759 chpe 1134 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
760 ph10 539 compile_data *cd)
761     {
762 chpe 1095 register pcre_uint32 c;
763 ph10 539 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
764 ph10 836 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
765 ph10 539 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
766 ph10 836 #endif
767 ph10 539 }
768    
769    
770    
771     /*************************************************
772 nigel 93 * Create bitmap of starting bytes *
773 nigel 77 *************************************************/
774    
775 nigel 93 /* This function scans a compiled unanchored expression recursively and
776     attempts to build a bitmap of the set of possible starting bytes. As time goes
777     by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
778     useful for parenthesized groups in patterns such as (a*)b where the group
779     provides some optional starting bytes but scanning must continue at the outer
780     level to find at least one mandatory byte. At the outermost level, this
781     function fails unless the result is SSB_DONE.
782 nigel 77
783     Arguments:
784     code points to an expression
785     start_bits points to a 32-byte table, initialized to 0
786 chpe 1055 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
787 nigel 77 cd the block with char table pointers
788    
789 nigel 93 Returns: SSB_FAIL => Failed to find any starting bytes
790     SSB_DONE => Found mandatory starting bytes
791     SSB_CONTINUE => Found optional starting bytes
792 ph10 605 SSB_UNKNOWN => Hit an unrecognized opcode
793 nigel 77 */
794    
795 nigel 93 static int
796 ph10 836 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
797 ph10 603 compile_data *cd)
798 nigel 77 {
799 chpe 1095 register pcre_uint32 c;
800 nigel 93 int yield = SSB_DONE;
801 ph10 836 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
802     int table_limit = utf? 16:32;
803     #else
804     int table_limit = 32;
805     #endif
806 nigel 77
807 nigel 91 #if 0
808     /* ========================================================================= */
809     /* The following comment and code was inserted in January 1999. In May 2006,
810     when it was observed to cause compiler warnings about unused values, I took it
811     out again. If anybody is still using OS/2, they will have to put it back
812     manually. */
813    
814 nigel 77 /* This next statement and the later reference to dummy are here in order to
815     trick the optimizer of the IBM C compiler for OS/2 into generating correct
816     code. Apparently IBM isn't going to fix the problem, and we would rather not
817     disable optimization (in this module it actually makes a big difference, and
818     the pcre module can use all the optimization it can get). */
819    
820     volatile int dummy;
821 nigel 91 /* ========================================================================= */
822     #endif
823 nigel 77
824     do
825     {
826     BOOL try_next = TRUE;
827 ph10 836 const pcre_uchar *tcode = code + 1 + LINK_SIZE;
828 ph10 605
829 ph10 604 if (*code == OP_CBRA || *code == OP_SCBRA ||
830 ph10 836 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
831 nigel 77
832 nigel 93 while (try_next) /* Loop for items in this branch */
833 nigel 77 {
834 nigel 93 int rc;
835 ph10 612
836 nigel 93 switch(*tcode)
837 nigel 77 {
838 ph10 605 /* If we reach something we don't understand, it means a new opcode has
839     been created that hasn't been added to this code. Hopefully this problem
840     will be discovered during testing. */
841 nigel 77
842     default:
843 ph10 605 return SSB_UNKNOWN;
844    
845     /* Fail for a valid opcode that implies no starting bits. */
846    
847     case OP_ACCEPT:
848 ph10 654 case OP_ASSERT_ACCEPT:
849 ph10 605 case OP_ALLANY:
850     case OP_ANY:
851     case OP_ANYBYTE:
852     case OP_CIRC:
853 ph10 654 case OP_CIRCM:
854 ph10 605 case OP_CLOSE:
855     case OP_COMMIT:
856     case OP_COND:
857 ph10 654 case OP_CREF:
858 ph10 605 case OP_DEF:
859 ph10 1365 case OP_DNCREF:
860     case OP_DNREF:
861     case OP_DNREFI:
862     case OP_DNRREF:
863 ph10 605 case OP_DOLL:
864 ph10 654 case OP_DOLLM:
865 ph10 605 case OP_END:
866     case OP_EOD:
867 ph10 654 case OP_EODN:
868 ph10 605 case OP_EXTUNI:
869     case OP_FAIL:
870     case OP_MARK:
871     case OP_NOT:
872     case OP_NOTEXACT:
873     case OP_NOTEXACTI:
874 ph10 654 case OP_NOTI:
875 ph10 605 case OP_NOTMINPLUS:
876     case OP_NOTMINPLUSI:
877     case OP_NOTMINQUERY:
878     case OP_NOTMINQUERYI:
879     case OP_NOTMINSTAR:
880     case OP_NOTMINSTARI:
881     case OP_NOTMINUPTO:
882     case OP_NOTMINUPTOI:
883     case OP_NOTPLUS:
884     case OP_NOTPLUSI:
885     case OP_NOTPOSPLUS:
886     case OP_NOTPOSPLUSI:
887     case OP_NOTPOSQUERY:
888     case OP_NOTPOSQUERYI:
889     case OP_NOTPOSSTAR:
890     case OP_NOTPOSSTARI:
891     case OP_NOTPOSUPTO:
892     case OP_NOTPOSUPTOI:
893     case OP_NOTPROP:
894     case OP_NOTQUERY:
895     case OP_NOTQUERYI:
896     case OP_NOTSTAR:
897     case OP_NOTSTARI:
898     case OP_NOTUPTO:
899     case OP_NOTUPTOI:
900     case OP_NOT_HSPACE:
901     case OP_NOT_VSPACE:
902     case OP_PRUNE:
903     case OP_PRUNE_ARG:
904     case OP_RECURSE:
905     case OP_REF:
906     case OP_REFI:
907     case OP_REVERSE:
908     case OP_RREF:
909 ph10 654 case OP_SCOND:
910 ph10 605 case OP_SET_SOM:
911     case OP_SKIP:
912     case OP_SKIP_ARG:
913     case OP_SOD:
914     case OP_SOM:
915     case OP_THEN:
916     case OP_THEN_ARG:
917 nigel 93 return SSB_FAIL;
918 ph10 1502
919 ph10 1488 /* A "real" property test implies no starting bits, but the fake property
920     PT_CLIST identifies a list of characters. These lists are short, as they
921     are used for characters with more than one "other case", so there is no
922     point in recognizing them for OP_NOTPROP. */
923 ph10 1502
924     case OP_PROP:
925     if (tcode[1] != PT_CLIST) return SSB_FAIL;
926     {
927     const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2];
928     while ((c = *p++) < NOTACHAR)
929     {
930     #if defined SUPPORT_UTF && defined COMPILE_PCRE8
931     if (utf)
932     {
933     pcre_uchar buff[6];
934 ph10 1488 (void)PRIV(ord2utf)(c, buff);
935     c = buff[0];
936 ph10 1502 }
937     #endif
938 ph10 1488 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
939 ph10 1502 }
940     }
941     try_next = FALSE;
942     break;
943 nigel 77
944 ph10 657 /* We can ignore word boundary tests. */
945 ph10 659
946 ph10 657 case OP_WORD_BOUNDARY:
947     case OP_NOT_WORD_BOUNDARY:
948 ph10 659 tcode++;
949     break;
950 ph10 657
951 nigel 93 /* If we hit a bracket or a positive lookahead assertion, recurse to set
952     bits from within the subpattern. If it can't find anything, we have to
953     give up. If it finds some mandatory character(s), we are done for this
954     branch. Otherwise, carry on scanning after the subpattern. */
955    
956     case OP_BRA:
957     case OP_SBRA:
958     case OP_CBRA:
959     case OP_SCBRA:
960 ph10 604 case OP_BRAPOS:
961     case OP_SBRAPOS:
962     case OP_CBRAPOS:
963     case OP_SCBRAPOS:
964 nigel 93 case OP_ONCE:
965 ph10 723 case OP_ONCE_NC:
966 nigel 93 case OP_ASSERT:
967 ph10 836 rc = set_start_bits(tcode, start_bits, utf, cd);
968 ph10 605 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
969 nigel 93 if (rc == SSB_DONE) try_next = FALSE; else
970     {
971     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
972     tcode += 1 + LINK_SIZE;
973     }
974     break;
975    
976     /* If we hit ALT or KET, it means we haven't found anything mandatory in
977     this branch, though we might have found something optional. For ALT, we
978     continue with the next alternative, but we have to arrange that the final
979     result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
980     return SSB_CONTINUE: if this is the top level, that indicates failure,
981     but after a nested subpattern, it causes scanning to continue. */
982    
983     case OP_ALT:
984     yield = SSB_CONTINUE;
985     try_next = FALSE;
986     break;
987    
988     case OP_KET:
989     case OP_KETRMAX:
990     case OP_KETRMIN:
991 ph10 605 case OP_KETRPOS:
992 nigel 93 return SSB_CONTINUE;
993    
994 nigel 77 /* Skip over callout */
995    
996     case OP_CALLOUT:
997     tcode += 2 + 2*LINK_SIZE;
998     break;
999    
1000     /* Skip over lookbehind and negative lookahead assertions */
1001    
1002     case OP_ASSERT_NOT:
1003     case OP_ASSERTBACK:
1004     case OP_ASSERTBACK_NOT:
1005     do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1006 nigel 93 tcode += 1 + LINK_SIZE;
1007 nigel 77 break;
1008    
1009     /* BRAZERO does the bracket, but carries on. */
1010    
1011     case OP_BRAZERO:
1012     case OP_BRAMINZERO:
1013 ph10 604 case OP_BRAPOSZERO:
1014 ph10 836 rc = set_start_bits(++tcode, start_bits, utf, cd);
1015 ph10 605 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1016 nigel 91 /* =========================================================================
1017     See the comment at the head of this function concerning the next line,
1018     which was an old fudge for the benefit of OS/2.
1019 nigel 77 dummy = 1;
1020 nigel 91 ========================================================================= */
1021 nigel 77 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1022 nigel 93 tcode += 1 + LINK_SIZE;
1023 nigel 77 break;
1024    
1025 ph10 335 /* SKIPZERO skips the bracket. */
1026    
1027     case OP_SKIPZERO:
1028 ph10 358 tcode++;
1029 ph10 335 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1030     tcode += 1 + LINK_SIZE;
1031     break;
1032    
1033 nigel 77 /* Single-char * or ? sets the bit and tries the next item */
1034    
1035     case OP_STAR:
1036     case OP_MINSTAR:
1037 nigel 93 case OP_POSSTAR:
1038 nigel 77 case OP_QUERY:
1039     case OP_MINQUERY:
1040 nigel 93 case OP_POSQUERY:
1041 ph10 836 tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1042 nigel 77 break;
1043    
1044 ph10 602 case OP_STARI:
1045     case OP_MINSTARI:
1046     case OP_POSSTARI:
1047     case OP_QUERYI:
1048     case OP_MINQUERYI:
1049     case OP_POSQUERYI:
1050 ph10 836 tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1051 ph10 602 break;
1052    
1053 nigel 77 /* Single-char upto sets the bit and tries the next */
1054    
1055     case OP_UPTO:
1056     case OP_MINUPTO:
1057 nigel 93 case OP_POSUPTO:
1058 ph10 836 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
1059 nigel 77 break;
1060    
1061 ph10 602 case OP_UPTOI:
1062     case OP_MINUPTOI:
1063     case OP_POSUPTOI:
1064 ph10 836 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1065 ph10 602 break;
1066    
1067 nigel 77 /* At least one single char sets the bit and stops */
1068    
1069 ph10 602 case OP_EXACT:
1070 ph10 836 tcode += IMM2_SIZE;
1071 ph10 602 /* Fall through */
1072 nigel 77 case OP_CHAR:
1073     case OP_PLUS:
1074     case OP_MINPLUS:
1075 nigel 93 case OP_POSPLUS:
1076 ph10 836 (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1077 nigel 77 try_next = FALSE;
1078     break;
1079 ph10 535
1080 ph10 603 case OP_EXACTI:
1081 ph10 836 tcode += IMM2_SIZE;
1082 ph10 603 /* Fall through */
1083 ph10 602 case OP_CHARI:
1084     case OP_PLUSI:
1085     case OP_MINPLUSI:
1086     case OP_POSPLUSI:
1087 ph10 836 (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1088 ph10 602 try_next = FALSE;
1089     break;
1090    
1091 ph10 535 /* Special spacing and line-terminating items. These recognize specific
1092     lists of characters. The difference between VSPACE and ANYNL is that the
1093     latter can match the two-character CRLF sequence, but that is not
1094     relevant for finding the first character, so their code here is
1095 ph10 524 identical. */
1096 ph10 535
1097 ph10 524 case OP_HSPACE:
1098 ph10 1033 SET_BIT(CHAR_HT);
1099     SET_BIT(CHAR_SPACE);
1100 ph10 836 #ifdef SUPPORT_UTF
1101     if (utf)
1102 ph10 535 {
1103 ph10 836 #ifdef COMPILE_PCRE8
1104 ph10 545 SET_BIT(0xC2); /* For U+00A0 */
1105 ph10 524 SET_BIT(0xE1); /* For U+1680, U+180E */
1106     SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1107 ph10 535 SET_BIT(0xE3); /* For U+3000 */
1108 chpe 1055 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1109 ph10 836 SET_BIT(0xA0);
1110     SET_BIT(0xFF); /* For characters > 255 */
1111 chpe 1055 #endif /* COMPILE_PCRE[8|16|32] */
1112 ph10 524 }
1113 ph10 836 else
1114     #endif /* SUPPORT_UTF */
1115     {
1116 ph10 1221 #ifndef EBCDIC
1117 ph10 836 SET_BIT(0xA0);
1118 ph10 1221 #endif /* Not EBCDIC */
1119 chpe 1055 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1120 ph10 836 SET_BIT(0xFF); /* For characters > 255 */
1121 chpe 1055 #endif /* COMPILE_PCRE[16|32] */
1122 ph10 836 }
1123 ph10 524 try_next = FALSE;
1124 ph10 535 break;
1125 nigel 77
1126 ph10 535 case OP_ANYNL:
1127 ph10 524 case OP_VSPACE:
1128 ph10 1033 SET_BIT(CHAR_LF);
1129     SET_BIT(CHAR_VT);
1130     SET_BIT(CHAR_FF);
1131     SET_BIT(CHAR_CR);
1132 ph10 836 #ifdef SUPPORT_UTF
1133     if (utf)
1134 ph10 545 {
1135 ph10 836 #ifdef COMPILE_PCRE8
1136 ph10 545 SET_BIT(0xC2); /* For U+0085 */
1137 ph10 538 SET_BIT(0xE2); /* For U+2028, U+2029 */
1138 chpe 1055 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1139 ph10 1033 SET_BIT(CHAR_NEL);
1140 ph10 836 SET_BIT(0xFF); /* For characters > 255 */
1141 chpe 1055 #endif /* COMPILE_PCRE[8|16|32] */
1142 ph10 545 }
1143 ph10 836 else
1144     #endif /* SUPPORT_UTF */
1145     {
1146 ph10 1033 SET_BIT(CHAR_NEL);
1147 chpe 1055 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1148 ph10 836 SET_BIT(0xFF); /* For characters > 255 */
1149     #endif
1150     }
1151 ph10 524 try_next = FALSE;
1152 ph10 535 break;
1153 ph10 524
1154 ph10 535 /* Single character types set the bits and stop. Note that if PCRE_UCP
1155     is set, we do not see these op codes because \d etc are converted to
1156 ph10 545 properties. Therefore, these apply in the case when only characters less
1157 ph10 539 than 256 are recognized to match the types. */
1158 nigel 77
1159     case OP_NOT_DIGIT:
1160 ph10 539 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1161 nigel 77 try_next = FALSE;
1162     break;
1163    
1164     case OP_DIGIT:
1165 ph10 539 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1166 nigel 77 try_next = FALSE;
1167     break;
1168    
1169 ph10 1486 /* The cbit_space table has vertical tab as whitespace; we no longer
1170     have to play fancy tricks because Perl added VT to its whitespace at
1171     release 5.18. PCRE added it at release 8.34. */
1172 nigel 91
1173 nigel 77 case OP_NOT_WHITESPACE:
1174 ph10 539 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1175 nigel 77 try_next = FALSE;
1176     break;
1177    
1178     case OP_WHITESPACE:
1179 ph10 539 set_type_bits(start_bits, cbit_space, table_limit, cd);
1180 nigel 77 try_next = FALSE;
1181     break;
1182    
1183     case OP_NOT_WORDCHAR:
1184 ph10 539 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1185 nigel 77 try_next = FALSE;
1186     break;
1187    
1188     case OP_WORDCHAR:
1189 ph10 539 set_type_bits(start_bits, cbit_word, table_limit, cd);
1190 nigel 77 try_next = FALSE;
1191     break;
1192 ph10 545
1193 nigel 77 /* One or more character type fudges the pointer and restarts, knowing
1194     it will hit a single character type and stop there. */
1195    
1196     case OP_TYPEPLUS:
1197     case OP_TYPEMINPLUS:
1198 ph10 535 case OP_TYPEPOSPLUS:
1199 nigel 77 tcode++;
1200     break;
1201    
1202     case OP_TYPEEXACT:
1203 ph10 836 tcode += 1 + IMM2_SIZE;
1204 nigel 77 break;
1205    
1206     /* Zero or more repeats of character types set the bits and then
1207     try again. */
1208    
1209     case OP_TYPEUPTO:
1210     case OP_TYPEMINUPTO:
1211 nigel 93 case OP_TYPEPOSUPTO:
1212 ph10 836 tcode += IMM2_SIZE; /* Fall through */
1213 nigel 77
1214     case OP_TYPESTAR:
1215     case OP_TYPEMINSTAR:
1216 nigel 93 case OP_TYPEPOSSTAR:
1217 nigel 77 case OP_TYPEQUERY:
1218     case OP_TYPEMINQUERY:
1219 nigel 93 case OP_TYPEPOSQUERY:
1220 nigel 77 switch(tcode[1])
1221     {
1222 ph10 523 default:
1223 nigel 77 case OP_ANY:
1224 ph10 345 case OP_ALLANY:
1225 nigel 93 return SSB_FAIL;
1226 ph10 535
1227 ph10 524 case OP_HSPACE:
1228 ph10 1033 SET_BIT(CHAR_HT);
1229     SET_BIT(CHAR_SPACE);
1230 ph10 983 #ifdef SUPPORT_UTF
1231 ph10 836 if (utf)
1232 ph10 535 {
1233 ph10 836 #ifdef COMPILE_PCRE8
1234 ph10 545 SET_BIT(0xC2); /* For U+00A0 */
1235 ph10 524 SET_BIT(0xE1); /* For U+1680, U+180E */
1236     SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1237 ph10 535 SET_BIT(0xE3); /* For U+3000 */
1238 chpe 1055 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1239 ph10 836 SET_BIT(0xA0);
1240     SET_BIT(0xFF); /* For characters > 255 */
1241 chpe 1055 #endif /* COMPILE_PCRE[8|16|32] */
1242 ph10 524 }
1243 ph10 836 else
1244     #endif /* SUPPORT_UTF */
1245 ph10 1033 #ifndef EBCDIC
1246 ph10 836 SET_BIT(0xA0);
1247 ph10 1221 #endif /* Not EBCDIC */
1248 ph10 535 break;
1249    
1250     case OP_ANYNL:
1251 ph10 524 case OP_VSPACE:
1252 ph10 1033 SET_BIT(CHAR_LF);
1253     SET_BIT(CHAR_VT);
1254     SET_BIT(CHAR_FF);
1255     SET_BIT(CHAR_CR);
1256 ph10 983 #ifdef SUPPORT_UTF
1257 ph10 836 if (utf)
1258 ph10 538 {
1259 ph10 836 #ifdef COMPILE_PCRE8
1260 ph10 545 SET_BIT(0xC2); /* For U+0085 */
1261 ph10 538 SET_BIT(0xE2); /* For U+2028, U+2029 */
1262 chpe 1055 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1263 ph10 1033 SET_BIT(CHAR_NEL);
1264 ph10 836 SET_BIT(0xFF); /* For characters > 255 */
1265 ph10 1033 #endif /* COMPILE_PCRE16 */
1266 ph10 545 }
1267 ph10 836 else
1268     #endif /* SUPPORT_UTF */
1269 ph10 1033 SET_BIT(CHAR_NEL);
1270 ph10 535 break;
1271    
1272 nigel 77 case OP_NOT_DIGIT:
1273 ph10 539 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1274 nigel 77 break;
1275    
1276     case OP_DIGIT:
1277 ph10 539 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1278 nigel 77 break;
1279    
1280 ph10 1404 /* The cbit_space table has vertical tab as whitespace; we no longer
1281     have to play fancy tricks because Perl added VT to its whitespace at
1282 ph10 1364 release 5.18. PCRE added it at release 8.34. */
1283 nigel 91
1284 nigel 77 case OP_NOT_WHITESPACE:
1285 ph10 539 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1286 nigel 77 break;
1287    
1288     case OP_WHITESPACE:
1289 ph10 539 set_type_bits(start_bits, cbit_space, table_limit, cd);
1290 nigel 77 break;
1291    
1292     case OP_NOT_WORDCHAR:
1293 ph10 539 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1294 nigel 77 break;
1295    
1296     case OP_WORDCHAR:
1297 ph10 539 set_type_bits(start_bits, cbit_word, table_limit, cd);
1298 nigel 77 break;
1299     }
1300    
1301     tcode += 2;
1302     break;
1303    
1304     /* Character class where all the information is in a bit map: set the
1305     bits and either carry on or not, according to the repeat count. If it was
1306     a negative class, and we are operating with UTF-8 characters, any byte
1307     with a value >= 0xc4 is a potentially valid starter because it starts a
1308     character with a value > 255. */
1309    
1310 zherczeg 1414 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1311     case OP_XCLASS:
1312     if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
1313     return SSB_FAIL;
1314     /* All bits are set. */
1315     if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
1316     return SSB_FAIL;
1317     #endif
1318     /* Fall through */
1319    
1320 nigel 77 case OP_NCLASS:
1321 ph10 836 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1322     if (utf)
1323 nigel 77 {
1324     start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1325     memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1326     }
1327 ph10 111 #endif
1328 chpe 1055 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1329 ph10 836 SET_BIT(0xFF); /* For characters > 255 */
1330     #endif
1331 nigel 77 /* Fall through */
1332    
1333     case OP_CLASS:
1334     {
1335 ph10 836 pcre_uint8 *map;
1336 zherczeg 1414 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1337     map = NULL;
1338     if (*tcode == OP_XCLASS)
1339     {
1340     if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
1341     map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
1342     tcode += GET(tcode, 1);
1343     }
1344     else
1345     #endif
1346     {
1347     tcode++;
1348     map = (pcre_uint8 *)tcode;
1349     tcode += 32 / sizeof(pcre_uchar);
1350     }
1351 nigel 77
1352     /* In UTF-8 mode, the bits in a bit map correspond to character
1353     values, not to byte values. However, the bit map we are constructing is
1354     for byte values. So we have to do a conversion for characters whose
1355     value is > 127. In fact, there are only two possible starting bytes for
1356     characters in the range 128 - 255. */
1357    
1358 zherczeg 1414 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1359     if (map != NULL)
1360     #endif
1361     {
1362 ph10 836 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1363 zherczeg 1414 if (utf)
1364 nigel 77 {
1365 zherczeg 1414 for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1366     for (c = 128; c < 256; c++)
1367 nigel 77 {
1368 zherczeg 1414 if ((map[c/8] && (1 << (c&7))) != 0)
1369     {
1370     int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1371     start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1372     c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1373     }
1374 nigel 77 }
1375     }
1376 zherczeg 1414 else
1377 ph10 111 #endif
1378 zherczeg 1414 {
1379     /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1380     for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1381     }
1382 nigel 77 }
1383    
1384 ph10 654 /* Advance past the bit map, and act on what follows. For a zero
1385 ph10 605 minimum repeat, continue; otherwise stop processing. */
1386 nigel 77
1387     switch (*tcode)
1388     {
1389     case OP_CRSTAR:
1390     case OP_CRMINSTAR:
1391     case OP_CRQUERY:
1392     case OP_CRMINQUERY:
1393 ph10 1379 case OP_CRPOSSTAR:
1394     case OP_CRPOSQUERY:
1395 nigel 77 tcode++;
1396     break;
1397    
1398     case OP_CRRANGE:
1399     case OP_CRMINRANGE:
1400 ph10 1379 case OP_CRPOSRANGE:
1401 ph10 836 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1402 nigel 77 else try_next = FALSE;
1403     break;
1404 ph10 654
1405 nigel 77 default:
1406     try_next = FALSE;
1407     break;
1408     }
1409     }
1410     break; /* End of bitmap class handling */
1411    
1412     } /* End of switch */
1413     } /* End of try_next loop */
1414    
1415     code += GET(code, 1); /* Advance to next branch */
1416     }
1417     while (*code == OP_ALT);
1418 nigel 93 return yield;
1419 nigel 77 }
1420    
1421    
1422    
1423 ph10 605
1424    
1425 nigel 77 /*************************************************
1426     * Study a compiled expression *
1427     *************************************************/
1428    
1429     /* This function is handed a compiled expression that it must study to produce
1430 zherczeg 850 information that will speed up the matching. It returns a pcre[16]_extra block
1431 nigel 77 which then gets handed back to pcre_exec().
1432    
1433     Arguments:
1434     re points to the compiled expression
1435     options contains option bits
1436     errorptr points to where to place error messages;
1437     set NULL unless error
1438    
1439 zherczeg 850 Returns: pointer to a pcre[16]_extra block, with study_data filled in and
1440     the appropriate flags set;
1441 nigel 77 NULL on error or if no optimization possible
1442     */
1443    
1444 chpe 1055 #if defined COMPILE_PCRE8
1445 ph10 359 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1446 nigel 77 pcre_study(const pcre *external_re, int options, const char **errorptr)
1447 chpe 1055 #elif defined COMPILE_PCRE16
1448 zherczeg 850 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1449 zherczeg 852 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1450 chpe 1055 #elif defined COMPILE_PCRE32
1451     PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1452     pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1453 ph10 836 #endif
1454 nigel 77 {
1455 ph10 455 int min;
1456     BOOL bits_set = FALSE;
1457 ph10 836 pcre_uint8 start_bits[32];
1458 zherczeg 850 PUBL(extra) *extra = NULL;
1459 nigel 77 pcre_study_data *study;
1460 ph10 836 const pcre_uint8 *tables;
1461     pcre_uchar *code;
1462 nigel 91 compile_data compile_block;
1463 zherczeg 852 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1464 nigel 77
1465 ph10 1361
1466 nigel 77 *errorptr = NULL;
1467    
1468     if (re == NULL || re->magic_number != MAGIC_NUMBER)
1469     {
1470     *errorptr = "argument is not a compiled regular expression";
1471     return NULL;
1472     }
1473    
1474 ph10 836 if ((re->flags & PCRE_MODE) == 0)
1475     {
1476 chpe 1055 #if defined COMPILE_PCRE8
1477     *errorptr = "argument not compiled in 8 bit mode";
1478     #elif defined COMPILE_PCRE16
1479     *errorptr = "argument not compiled in 16 bit mode";
1480     #elif defined COMPILE_PCRE32
1481     *errorptr = "argument not compiled in 32 bit mode";
1482 ph10 836 #endif
1483     return NULL;
1484     }
1485    
1486 nigel 77 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1487     {
1488     *errorptr = "unknown or incorrect option bit(s) set";
1489     return NULL;
1490     }
1491    
1492 ph10 836 code = (pcre_uchar *)re + re->name_table_offset +
1493 nigel 91 (re->name_count * re->name_entry_size);
1494    
1495 nigel 77 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1496 ph10 461 a multiline pattern that matches only at "line starts", there is no point in
1497 ph10 455 seeking a list of starting bytes. */
1498 nigel 77
1499 ph10 455 if ((re->options & PCRE_ANCHORED) == 0 &&
1500     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1501     {
1502 ph10 605 int rc;
1503    
1504 ph10 455 /* Set the character tables in the block that is passed around */
1505 ph10 461
1506 ph10 455 tables = re->tables;
1507 ph10 836
1508 chpe 1055 #if defined COMPILE_PCRE8
1509 ph10 455 if (tables == NULL)
1510     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1511     (void *)(&tables));
1512 chpe 1055 #elif defined COMPILE_PCRE16
1513 ph10 836 if (tables == NULL)
1514     (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1515     (void *)(&tables));
1516 chpe 1055 #elif defined COMPILE_PCRE32
1517     if (tables == NULL)
1518     (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1519     (void *)(&tables));
1520 ph10 836 #endif
1521 ph10 461
1522 ph10 455 compile_block.lcc = tables + lcc_offset;
1523     compile_block.fcc = tables + fcc_offset;
1524     compile_block.cbits = tables + cbits_offset;
1525     compile_block.ctypes = tables + ctypes_offset;
1526 ph10 461
1527 ph10 455 /* See if we can find a fixed set of initial characters for the pattern. */
1528 ph10 461
1529 ph10 836 memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1530 ph10 605 rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1531     &compile_block);
1532     bits_set = rc == SSB_DONE;
1533 ph10 691 if (rc == SSB_UNKNOWN)
1534 ph10 688 {
1535     *errorptr = "internal error: opcode not recognized";
1536     return NULL;
1537 ph10 691 }
1538 ph10 455 }
1539 ph10 461
1540 ph10 455 /* Find the minimum length of subject string. */
1541 nigel 77
1542 ph10 1547 switch(min = find_minlength(re, code, code, re->options, NULL))
1543 ph10 603 {
1544 ph10 688 case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1545     case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1546 ph10 603 default: break;
1547 ph10 605 }
1548 ph10 708
1549 ph10 688 /* If a set of starting bytes has been identified, or if the minimum length is
1550 ph10 1221 greater than zero, or if JIT optimization has been requested, or if
1551 ph10 1022 PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1552     pcre_study_data block. The study data is put in the latter, which is pointed to
1553     by the former, which may also get additional data set later by the calling
1554     program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1555     save it in a field for returning via the pcre_fullinfo() function so that if it
1556     becomes variable in the future, we don't have to change that code. */
1557 nigel 77
1558 ph10 1022 if (bits_set || min > 0 || (options & (
1559 ph10 688 #ifdef SUPPORT_JIT
1560 ph10 1022 PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1561 ph10 1221 PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1562 ph10 688 #endif
1563 ph10 1022 PCRE_STUDY_EXTRA_NEEDED)) != 0)
1564 nigel 77 {
1565 zherczeg 850 extra = (PUBL(extra) *)(PUBL(malloc))
1566     (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1567 ph10 688 if (extra == NULL)
1568     {
1569     *errorptr = "failed to get memory";
1570     return NULL;
1571     }
1572 ph10 691
1573 zherczeg 850 study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1574 ph10 688 extra->flags = PCRE_EXTRA_STUDY_DATA;
1575     extra->study_data = study;
1576 ph10 691
1577 ph10 688 study->size = sizeof(pcre_study_data);
1578     study->flags = 0;
1579 ph10 797
1580 ph10 836 /* Set the start bits always, to avoid unset memory errors if the
1581     study data is written to a file, but set the flag only if any of the bits
1582     are set, to save time looking when none are. */
1583    
1584 ph10 797 if (bits_set)
1585 ph10 688 {
1586     study->flags |= PCRE_STUDY_MAPPED;
1587     memcpy(study->start_bits, start_bits, sizeof(start_bits));
1588     }
1589 ph10 836 else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1590 ph10 691
1591 ph10 836 #ifdef PCRE_DEBUG
1592     if (bits_set)
1593     {
1594 zherczeg 905 pcre_uint8 *ptr = start_bits;
1595 ph10 836 int i;
1596    
1597     printf("Start bits:\n");
1598     for (i = 0; i < 32; i++)
1599     printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1600     }
1601     #endif
1602    
1603 ph10 691 /* Always set the minlength value in the block, because the JIT compiler
1604     makes use of it. However, don't set the bit unless the length is greater than
1605     zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1606 ph10 697 checking the zero case. */
1607 ph10 691
1608 ph10 708 if (min > 0)
1609 ph10 697 {
1610     study->flags |= PCRE_STUDY_MINLEN;
1611     study->minlength = min;
1612     }
1613 ph10 708 else study->minlength = 0;
1614 ph10 691
1615 ph10 688 /* If JIT support was compiled and requested, attempt the JIT compilation.
1616     If no starting bytes were found, and the minimum length is zero, and JIT
1617 ph10 1221 compilation fails, abandon the extra block and return NULL, unless
1618 ph10 1022 PCRE_STUDY_EXTRA_NEEDED is set. */
1619 ph10 691
1620 ph10 666 #ifdef SUPPORT_JIT
1621 ph10 688 extra->executable_jit = NULL;
1622 zherczeg 914 if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1623     PRIV(jit_compile)(re, extra, JIT_COMPILE);
1624     if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1625     PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1626     if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1627     PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1628    
1629 ph10 1022 if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1630     (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1631 ph10 688 {
1632 chpe 1055 #if defined COMPILE_PCRE8
1633 ph10 688 pcre_free_study(extra);
1634 chpe 1055 #elif defined COMPILE_PCRE16
1635 ph10 836 pcre16_free_study(extra);
1636 chpe 1055 #elif defined COMPILE_PCRE32
1637     pcre32_free_study(extra);
1638 ph10 836 #endif
1639 ph10 688 extra = NULL;
1640 ph10 691 }
1641 ph10 666 #endif
1642 ph10 688 }
1643 ph10 666
1644 nigel 77 return extra;
1645     }
1646    
1647 ph10 666
1648     /*************************************************
1649     * Free the study data *
1650     *************************************************/
1651    
1652     /* This function frees the memory that was obtained by pcre_study().
1653    
1654 zherczeg 850 Argument: a pointer to the pcre[16]_extra block
1655 ph10 666 Returns: nothing
1656     */
1657    
1658 chpe 1055 #if defined COMPILE_PCRE8
1659 ph10 666 PCRE_EXP_DEFN void
1660     pcre_free_study(pcre_extra *extra)
1661 chpe 1055 #elif defined COMPILE_PCRE16
1662 ph10 836 PCRE_EXP_DEFN void
1663 zherczeg 850 pcre16_free_study(pcre16_extra *extra)
1664 chpe 1055 #elif defined COMPILE_PCRE32
1665     PCRE_EXP_DEFN void
1666     pcre32_free_study(pcre32_extra *extra)
1667 ph10 836 #endif
1668 ph10 666 {
1669 ph10 836 if (extra == NULL)
1670     return;
1671 ph10 666 #ifdef SUPPORT_JIT
1672 ph10 691 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1673 ph10 666 extra->executable_jit != NULL)
1674 ph10 836 PRIV(jit_free)(extra->executable_jit);
1675 ph10 666 #endif
1676 ph10 836 PUBL(free)(extra);
1677 ph10 666 }
1678    
1679 nigel 77 /* End of pcre_study.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12