/[pcre]/code/trunk/study.c
ViewVC logotype

Diff of /code/trunk/study.c

Parent Directory | Revision Log | View Patch

revision 13 by nigel, Sat Feb 24 21:38:21 2007 UTC revision 27 by nigel, Sat Feb 24 21:38:49 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997 University of Cambridge             Copyright (c) 1997-1999 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 37  the external pcre header. */ Line 37  the external pcre header. */
37    
38    
39  /*************************************************  /*************************************************
40    *      Set a bit and maybe its alternate case    *
41    *************************************************/
42    
43    /* Given a character, set its bit in the table, and also the bit for the other
44    version of a letter if we are caseless.
45    
46    Arguments:
47      start_bits    points to the bit map
48      c             is the character
49      caseless      the caseless flag
50      cd            the block with char table pointers
51    
52    Returns:        nothing
53    */
54    
55    static void
56    set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
57    {
58    start_bits[c/8] |= (1 << (c&7));
59    if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
60      start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
61    }
62    
63    
64    
65    /*************************************************
66  *          Create bitmap of starting chars       *  *          Create bitmap of starting chars       *
67  *************************************************/  *************************************************/
68    
# Line 47  goes by, we may be able to get more clev Line 73  goes by, we may be able to get more clev
73  Arguments:  Arguments:
74    code         points to an expression    code         points to an expression
75    start_bits   points to a 32-byte table, initialized to 0    start_bits   points to a 32-byte table, initialized to 0
76      caseless     the current state of the caseless flag
77      cd           the block with char table pointers
78    
79  Returns:       TRUE if table built, FALSE otherwise  Returns:       TRUE if table built, FALSE otherwise
80  */  */
81    
82  static BOOL  static BOOL
83  set_start_bits(const uschar *code, uschar *start_bits)  set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
84      compile_data *cd)
85  {  {
86  register int c;  register int c;
87    
88    /* This next statement and the later reference to dummy are here in order to
89    trick the optimizer of the IBM C compiler for OS/2 into generating correct
90    code. Apparently IBM isn't going to fix the problem, and we would rather not
91    disable optimization (in this module it actually makes a big difference, and
92    the pcre module can use all the optimization it can get). */
93    
94    volatile int dummy;
95    
96  do  do
97    {    {
98    const uschar *tcode = code + 3;    const uschar *tcode = code + 3;
# Line 65  do Line 102  do
102      {      {
103      try_next = FALSE;      try_next = FALSE;
104    
105        /* If a branch starts with a bracket or a positive lookahead assertion,
106        recurse to set bits from within them. That's all for this branch. */
107    
108      if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)      if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
109        {        {
110        if (!set_start_bits(tcode, start_bits)) return FALSE;        if (!set_start_bits(tcode, start_bits, caseless, cd))
111            return FALSE;
112        }        }
113    
114      else switch(*tcode)      else switch(*tcode)
# Line 75  do Line 116  do
116        default:        default:
117        return FALSE;        return FALSE;
118    
119          /* Skip over lookbehind and negative lookahead assertions */
120    
121          case OP_ASSERT_NOT:
122          case OP_ASSERTBACK:
123          case OP_ASSERTBACK_NOT:
124          try_next = TRUE;
125          do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
126          tcode += 3;
127          break;
128    
129          /* Skip over an option setting, changing the caseless flag */
130    
131          case OP_OPT:
132          caseless = (tcode[1] & PCRE_CASELESS) != 0;
133          tcode += 2;
134          try_next = TRUE;
135          break;
136    
137        /* BRAZERO does the bracket, but carries on. */        /* BRAZERO does the bracket, but carries on. */
138    
139        case OP_BRAZERO:        case OP_BRAZERO:
140        case OP_BRAMINZERO:        case OP_BRAMINZERO:
141        if (!set_start_bits(++tcode, start_bits)) return FALSE;        if (!set_start_bits(++tcode, start_bits, caseless, cd))
142            return FALSE;
143          dummy = 1;
144        do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);        do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
145        tcode += 3;        tcode += 3;
146        try_next = TRUE;        try_next = TRUE;
# Line 91  do Line 152  do
152        case OP_MINSTAR:        case OP_MINSTAR:
153        case OP_QUERY:        case OP_QUERY:
154        case OP_MINQUERY:        case OP_MINQUERY:
155        start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));        set_bit(start_bits, tcode[1], caseless, cd);
156        tcode += 2;        tcode += 2;
157        try_next = TRUE;        try_next = TRUE;
158        break;        break;
# Line 100  do Line 161  do
161    
162        case OP_UPTO:        case OP_UPTO:
163        case OP_MINUPTO:        case OP_MINUPTO:
164        start_bits[tcode[3]/8] |= (1 << (tcode[3]&7));        set_bit(start_bits, tcode[3], caseless, cd);
165        tcode += 4;        tcode += 4;
166        try_next = TRUE;        try_next = TRUE;
167        break;        break;
# Line 115  do Line 176  do
176    
177        case OP_PLUS:        case OP_PLUS:
178        case OP_MINPLUS:        case OP_MINPLUS:
179        start_bits[tcode[1]/8] |= (1 << (tcode[1]&7));        set_bit(start_bits, tcode[1], caseless, cd);
180        break;        break;
181    
182        /* Single character type sets the bits and stops */        /* Single character type sets the bits and stops */
183    
184        case OP_NOT_DIGIT:        case OP_NOT_DIGIT:
185        for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];        for (c = 0; c < 32; c++)
186            start_bits[c] |= ~cd->cbits[c+cbit_digit];
187        break;        break;
188    
189        case OP_DIGIT:        case OP_DIGIT:
190        for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];        for (c = 0; c < 32; c++)
191            start_bits[c] |= cd->cbits[c+cbit_digit];
192        break;        break;
193    
194        case OP_NOT_WHITESPACE:        case OP_NOT_WHITESPACE:
195        for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];        for (c = 0; c < 32; c++)
196            start_bits[c] |= ~cd->cbits[c+cbit_space];
197        break;        break;
198    
199        case OP_WHITESPACE:        case OP_WHITESPACE:
200        for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];        for (c = 0; c < 32; c++)
201            start_bits[c] |= cd->cbits[c+cbit_space];
202        break;        break;
203    
204        case OP_NOT_WORDCHAR:        case OP_NOT_WORDCHAR:
205        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
206          start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);          start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
207        break;        break;
208    
209        case OP_WORDCHAR:        case OP_WORDCHAR:
210        for (c = 0; c < 32; c++)        for (c = 0; c < 32; c++)
211          start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);          start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
212        break;        break;
213    
214        /* One or more character type fudges the pointer and restarts, knowing        /* One or more character type fudges the pointer and restarts, knowing
# Line 174  do Line 239  do
239        switch(tcode[1])        switch(tcode[1])
240          {          {
241          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
242          for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit];          for (c = 0; c < 32; c++)
243              start_bits[c] |= ~cd->cbits[c+cbit_digit];
244          break;          break;
245    
246          case OP_DIGIT:          case OP_DIGIT:
247          for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit];          for (c = 0; c < 32; c++)
248              start_bits[c] |= cd->cbits[c+cbit_digit];
249          break;          break;
250    
251          case OP_NOT_WHITESPACE:          case OP_NOT_WHITESPACE:
252          for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space];          for (c = 0; c < 32; c++)
253              start_bits[c] |= ~cd->cbits[c+cbit_space];
254          break;          break;
255    
256          case OP_WHITESPACE:          case OP_WHITESPACE:
257          for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space];          for (c = 0; c < 32; c++)
258              start_bits[c] |= cd->cbits[c+cbit_space];
259          break;          break;
260    
261          case OP_NOT_WORDCHAR:          case OP_NOT_WORDCHAR:
262          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
263            start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]);            start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
264          break;          break;
265    
266          case OP_WORDCHAR:          case OP_WORDCHAR:
267          for (c = 0; c < 32; c++)          for (c = 0; c < 32; c++)
268            start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]);            start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
269          break;          break;
270          }          }
271    
# Line 208  do Line 277  do
277        according to the repeat count. */        according to the repeat count. */
278    
279        case OP_CLASS:        case OP_CLASS:
       case OP_NEGCLASS:  
280          {          {
281          tcode++;          tcode++;
282          for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];          for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
# Line 267  Returns: pointer to a pcre_extra bloc Line 335  Returns: pointer to a pcre_extra bloc
335  pcre_extra *  pcre_extra *
336  pcre_study(const pcre *external_re, int options, const char **errorptr)  pcre_study(const pcre *external_re, int options, const char **errorptr)
337  {  {
 BOOL caseless;  
338  uschar start_bits[32];  uschar start_bits[32];
339  real_pcre_extra *extra;  real_pcre_extra *extra;
340  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
341    compile_data compile_block;
342    
343  *errorptr = NULL;  *errorptr = NULL;
344    
# Line 286  if ((options & ~PUBLIC_STUDY_OPTIONS) != Line 354  if ((options & ~PUBLIC_STUDY_OPTIONS) !=
354    return NULL;    return NULL;
355    }    }
356    
 /* Caseless can either be from the compiled regex or from options. */  
   
 caseless = ((re->options | options) & PCRE_CASELESS) != 0;  
   
357  /* For an anchored pattern, or an unanchored pattern that has a first char, or a  /* For an anchored pattern, or an unanchored pattern that has a first char, or a
358  multiline pattern that matches only at "line starts", no further processing at  multiline pattern that matches only at "line starts", no further processing at
359  present. */  present. */
# Line 297  present. */ Line 361  present. */
361  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)  if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
362    return NULL;    return NULL;
363    
364  /* See if we can find a fixed set of initial characters for the pattern. */  /* Set the character tables in the block which is passed around */
365    
366  memset(start_bits, 0, 32 * sizeof(uschar));  compile_block.lcc = re->tables + lcc_offset;
367  if (!set_start_bits(re->code, start_bits)) return NULL;  compile_block.fcc = re->tables + fcc_offset;
368    compile_block.cbits = re->tables + cbits_offset;
369    compile_block.ctypes = re->tables + ctypes_offset;
370    
371  /* If this studying is caseless, scan the created bit map and duplicate the  /* See if we can find a fixed set of initial characters for the pattern. */
 bits for any letters. */  
372    
373  if (caseless)  memset(start_bits, 0, 32 * sizeof(uschar));
374    {  if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
375    register int c;    &compile_block)) return NULL;
   for (c = 0; c < 256; c++)  
     {  
     if ((start_bits[c/8] & (1 << (c&7))) != 0 &&  
         (pcre_ctypes[c] & ctype_letter) != 0)  
       {  
       int d = pcre_fcc[c];  
       start_bits[d/8] |= (1 << (d&7));  
       }  
     }  
   }  
376    
377  /* Get an "extra" block and put the information therein. */  /* Get an "extra" block and put the information therein. */
378    
# Line 329  if (extra == NULL) Line 384  if (extra == NULL)
384    return NULL;    return NULL;
385    }    }
386    
387  extra->options = PCRE_STUDY_MAPPED | (caseless? PCRE_STUDY_CASELESS : 0);  extra->options = PCRE_STUDY_MAPPED;
388  memcpy(extra->start_bits, start_bits, sizeof(start_bits));  memcpy(extra->start_bits, start_bits, sizeof(start_bits));
389    
390  return (pcre_extra *)extra;  return (pcre_extra *)extra;

Legend:
Removed from v.13  
changed lines
  Added in v.27

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12