/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 31 by nigel, Sat Feb 24 21:38:57 2007 UTC revision 63 by nigel, Sat Feb 24 21:40:03 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1997-1999 University of Cambridge             Copyright (c) 1997-2003 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 32  restrictions: Line 32  restrictions:
32  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
33  */  */
34    
   
35  /* Define DEBUG to get debugging output on stdout. */  /* Define DEBUG to get debugging output on stdout. */
36    
37  /* #define DEBUG */  /* #define DEBUG */
# Line 60  the external pcre header. */ Line 59  the external pcre header. */
59  #endif  #endif
60    
61    
62  /* Number of items on the nested bracket stacks at compile time. This should  /* Maximum number of items on the nested bracket stacks at compile time. This
63  not be set greater than 200. */  applies to the nesting of all kinds of parentheses. It does not limit
64    un-nested, non-capturing parentheses. This number can be made bigger if
65    necessary - it is used to dimension one int and one unsigned char vector at
66    compile time. */
67    
68  #define BRASTACK_SIZE 200  #define BRASTACK_SIZE 200
69    
70    
71    
72    /* Maximum number of ints of offset to save on the stack for recursive calls.
73    If the offset vector is bigger, malloc is used. This should be a multiple of 3,
74    because the offset vector is always a multiple of 3 long. */
75    
76    #define REC_STACK_SAVE_MAX 30
77    
78    
79    /* The number of bytes in a literal character string above which we can't add
80    any more is set at 250 in order to allow for UTF-8 characters. (In theory it
81    could be 255 when UTF-8 support is excluded, but that means that some of the
82    test output would be different, which just complicates things.) */
83    
84    #define MAXLIT 250
85    
86    
87    /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
88    the definition is next to the definition of the opcodes in internal.h. */
89    
90    static uschar OP_lengths[] = { OP_LENGTHS };
91    
92  /* Min and max values for the common repeats; for the maxima, 0 => infinity */  /* Min and max values for the common repeats; for the maxima, 0 => infinity */
93    
94  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
95  static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };  static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
96    
 /* Text forms of OP_ values and things, for debugging (not all used) */  
   
 #ifdef DEBUG  
 static const char *OP_names[] = {  
   "End", "\\A", "\\B", "\\b", "\\D", "\\d",  
   "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",  
   "Opt", "^", "$", "Any", "chars", "not",  
   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",  
   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",  
   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",  
   "*", "*?", "+", "+?", "?", "??", "{", "{",  
   "class", "Ref",  
   "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",  
   "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",  
   "Brazero", "Braminzero", "Bra"  
 };  
 #endif  
   
97  /* Table for handling escaped characters in the range '0'-'z'. Positive returns  /* Table for handling escaped characters in the range '0'-'z'. Positive returns
98  are simple data values; negative values are for special things like \d and so  are simple data values; negative values are for special things like \d and so
99  on. Zero means further processing is needed (for things like \x), or the escape  on. Zero means further processing is needed (for things like \x), or the escape
# Line 97  is invalid. */ Line 102  is invalid. */
102  static const short int escapes[] = {  static const short int escapes[] = {
103      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
104      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
105    '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */    '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
106      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
107      0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */      0, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
108      0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */      0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
109    '`',      7, -ESC_b,      0, -ESC_d,     27,   '\f',      0,   /* ` - g */    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
110      0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */      0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */
111      0,      0,   '\r', -ESC_s,   '\t',      0,      0, -ESC_w,   /* p - w */      0,      0,  ESC_r, -ESC_s,  ESC_t,      0,      0, -ESC_w,   /* p - w */
112      0,      0, -ESC_z                                            /* x - z */      0,      0, -ESC_z                                            /* x - z */
113  };  };
114    
115    /* Tables of names of POSIX character classes and their lengths. The list is
116    terminated by a zero length entry. The first three must be alpha, lower, upper,
117    as this is assumed for handling case independence. */
118    
119    static const char *posix_names[] = {
120      "alpha", "lower", "upper",
121      "alnum", "ascii", "blank", "cntrl", "digit", "graph",
122      "print", "punct", "space", "word",  "xdigit" };
123    
124    static const uschar posix_name_lengths[] = {
125      5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
126    
127    /* Table of class bit maps for each POSIX class; up to three may be combined
128    to form the class. The table for [:blank:] is dynamically modified to remove
129    the vertical space characters. */
130    
131    static const int posix_class_maps[] = {
132      cbit_lower, cbit_upper, -1,             /* alpha */
133      cbit_lower, -1,         -1,             /* lower */
134      cbit_upper, -1,         -1,             /* upper */
135      cbit_digit, cbit_lower, cbit_upper,     /* alnum */
136      cbit_print, cbit_cntrl, -1,             /* ascii */
137      cbit_space, -1,         -1,             /* blank - a GNU extension */
138      cbit_cntrl, -1,         -1,             /* cntrl */
139      cbit_digit, -1,         -1,             /* digit */
140      cbit_graph, -1,         -1,             /* graph */
141      cbit_print, -1,         -1,             /* print */
142      cbit_punct, -1,         -1,             /* punct */
143      cbit_space, -1,         -1,             /* space */
144      cbit_word,  -1,         -1,             /* word - a Perl extension */
145      cbit_xdigit,-1,         -1              /* xdigit */
146    };
147    
148    
149  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
150    
151  static BOOL  static BOOL
152    compile_regex(int, int, int *, uschar **, const uschar **, const char **,    compile_regex(int, int, int *, uschar **, const uschar **, const char **,
153      BOOL, int, compile_data *);      BOOL, int, int *, int *, branch_chain *, compile_data *);
154    
155    /* Structure for building a chain of data that actually lives on the
156    stack, for holding the values of the subject pointer at the start of each
157    subpattern, so as to detect when an empty string has been matched by a
158    subpattern - to break infinite loops. */
159    
160    typedef struct eptrblock {
161      struct eptrblock *prev;
162      const uschar *saved_eptr;
163    } eptrblock;
164    
165    /* Flag bits for the match() function */
166    
167    #define match_condassert   0x01    /* Called to check a condition assertion */
168    #define match_isgroup      0x02    /* Set if start of bracketed group */
169    
170    /* Non-error returns from the match() function. Error returns are externally
171    defined PCRE_ERROR_xxx codes, which are all negative. */
172    
173    #define MATCH_MATCH        1
174    #define MATCH_NOMATCH      0
175    
176    
177    
# Line 121  static BOOL Line 181  static BOOL
181    
182  /* PCRE is thread-clean and doesn't use any global variables in the normal  /* PCRE is thread-clean and doesn't use any global variables in the normal
183  sense. However, it calls memory allocation and free functions via the two  sense. However, it calls memory allocation and free functions via the two
184  indirections below, which are can be changed by the caller, but are shared  indirections below, and it can optionally do callouts. These values can be
185  between all threads. */  changed by the caller, but are shared between all threads. However, when
186    compiling for Virtual Pascal, things are done differently (see pcre.in). */
187    
188    #ifndef VPCOMPAT
189  void *(*pcre_malloc)(size_t) = malloc;  void *(*pcre_malloc)(size_t) = malloc;
190  void  (*pcre_free)(void *) = free;  void  (*pcre_free)(void *) = free;
191    int   (*pcre_callout)(pcre_callout_block *) = NULL;
192    #endif
193    
194    
195    /*************************************************
196    *    Macros and tables for character handling    *
197    *************************************************/
198    
199    /* When UTF-8 encoding is being used, a character is no longer just a single
200    byte. The macros for character handling generate simple sequences when used in
201    byte-mode, and more complicated ones for UTF-8 characters. */
202    
203    #ifndef SUPPORT_UTF8
204    #define GETCHAR(c, eptr) c = *eptr;
205    #define GETCHARINC(c, eptr) c = *eptr++;
206    #define GETCHARINCTEST(c, eptr) c = *eptr++;
207    #define GETCHARLEN(c, eptr, len) c = *eptr;
208    #define BACKCHAR(eptr)
209    
210    #else   /* SUPPORT_UTF8 */
211    
212    /* Get the next UTF-8 character, not advancing the pointer. This is called when
213    we know we are in UTF-8 mode. */
214    
215    #define GETCHAR(c, eptr) \
216      c = *eptr; \
217      if ((c & 0xc0) == 0xc0) \
218        { \
219        int i; \
220        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
221        int s = 6*a; \
222        c = (c & utf8_table3[a]) << s; \
223        for (i = 1; i <= a; i++) \
224          { \
225          s -= 6; \
226          c |= (eptr[i] & 0x3f) << s; \
227          } \
228        }
229    
230    /* Get the next UTF-8 character, advancing the pointer. This is called when we
231    know we are in UTF-8 mode. */
232    
233    #define GETCHARINC(c, eptr) \
234      c = *eptr++; \
235      if ((c & 0xc0) == 0xc0) \
236        { \
237        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
238        int s = 6*a; \
239        c = (c & utf8_table3[a]) << s; \
240        while (a-- > 0) \
241          { \
242          s -= 6; \
243          c |= (*eptr++ & 0x3f) << s; \
244          } \
245        }
246    
247    /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
248    
249    #define GETCHARINCTEST(c, eptr) \
250      c = *eptr++; \
251      if (md->utf8 && (c & 0xc0) == 0xc0) \
252        { \
253        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
254        int s = 6*a; \
255        c = (c & utf8_table3[a]) << s; \
256        while (a-- > 0) \
257          { \
258          s -= 6; \
259          c |= (*eptr++ & 0x3f) << s; \
260          } \
261        }
262    
263    /* Get the next UTF-8 character, not advancing the pointer, incrementing length
264    if there are extra bytes. This is called when we know we are in UTF-8 mode. */
265    
266    #define GETCHARLEN(c, eptr, len) \
267      c = *eptr; \
268      if ((c & 0xc0) == 0xc0) \
269        { \
270        int i; \
271        int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
272        int s = 6*a; \
273        c = (c & utf8_table3[a]) << s; \
274        for (i = 1; i <= a; i++) \
275          { \
276          s -= 6; \
277          c |= (eptr[i] & 0x3f) << s; \
278          } \
279        len += a; \
280        }
281    
282    /* If the pointer is not at the start of a character, move it back until
283    it is. Called only in UTF-8 mode. */
284    
285    #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
286    
287    #endif
288    
289    
290    
# Line 144  tables. */ Line 302  tables. */
302    
303    
304    
305    #ifdef SUPPORT_UTF8
306    /*************************************************
307    *           Tables for UTF-8 support             *
308    *************************************************/
309    
310    /* These are the breakpoints for different numbers of bytes in a UTF-8
311    character. */
312    
313    static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
314    
315    /* These are the indicator bits and the mask for the data bits to set in the
316    first byte of a character, indexed by the number of additional bytes. */
317    
318    static int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
319    static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
320    
321    /* Table of the number of extra bytes, indexed by the first byte of a character
322    masked with 0x3f. The highest number for a valid UTF-8 character is in fact
323    0x3d. */
324    
325    static uschar utf8_table4[] = {
326      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
327      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
328      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
329      3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
330    
331    
332    /*************************************************
333    *       Convert character value to UTF-8         *
334    *************************************************/
335    
336    /* This function takes an integer value in the range 0 - 0x7fffffff
337    and encodes it as a UTF-8 character in 0 to 6 bytes.
338    
339    Arguments:
340      cvalue     the character value
341      buffer     pointer to buffer for result - at least 6 bytes long
342    
343    Returns:     number of characters placed in the buffer
344    */
345    
346    static int
347    ord2utf8(int cvalue, uschar *buffer)
348    {
349    register int i, j;
350    for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
351      if (cvalue <= utf8_table1[i]) break;
352    buffer += i;
353    for (j = i; j > 0; j--)
354     {
355     *buffer-- = 0x80 | (cvalue & 0x3f);
356     cvalue >>= 6;
357     }
358    *buffer = utf8_table2[i] | cvalue;
359    return i + 1;
360    }
361    #endif
362    
363    
364    
365    /*************************************************
366    *         Print compiled regex                   *
367    *************************************************/
368    
369    /* The code for doing this is held in a separate file that is also included in
370    pcretest.c. It defines a function called print_internals(). */
371    
372    #ifdef DEBUG
373    #include "printint.c"
374    #endif
375    
376    
377    
378  /*************************************************  /*************************************************
379  *          Return version string                 *  *          Return version string                 *
380  *************************************************/  *************************************************/
381    
382    #define STRING(a)  # a
383    #define XSTRING(s) STRING(s)
384    
385  const char *  const char *
386  pcre_version(void)  pcre_version(void)
387  {  {
388  return PCRE_VERSION;  return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
389  }  }
390    
391    
392    
393    
394  /*************************************************  /*************************************************
395  *       Return info about a compiled pattern     *  * (Obsolete) Return info about compiled pattern  *
396  *************************************************/  *************************************************/
397    
398  /* This function picks potentially useful data out of the private  /* This is the original "info" function. It picks potentially useful data out
399  structure.  of the private structure, but its interface was too rigid. It remains for
400    backwards compatibility. The public options are passed back in an int - though
401    the re->options field has been expanded to a long int, all the public options
402    are at the low end of it, and so even on 16-bit systems this will still be OK.
403    Therefore, I haven't changed the API for pcre_info().
404    
405  Arguments:  Arguments:
406    external_re   points to compiled code    external_re   points to compiled code
407    optptr        where to pass back the options    optptr        where to pass back the options
408    first_char    where to pass back the first character,    first_byte    where to pass back the first character,
409                  or -1 if multiline and all branches start ^,                  or -1 if multiline and all branches start ^,
410                  or -2 otherwise                  or -2 otherwise
411    
412  Returns:        number of identifying extraction brackets  Returns:        number of capturing subpatterns
413                  or negative values on error                  or negative values on error
414  */  */
415    
416  int  int
417  pcre_info(const pcre *external_re, int *optptr, int *first_char)  pcre_info(const pcre *external_re, int *optptr, int *first_byte)
418  {  {
419  const real_pcre *re = (const real_pcre *)external_re;  const real_pcre *re = (const real_pcre *)external_re;
420  if (re == NULL) return PCRE_ERROR_NULL;  if (re == NULL) return PCRE_ERROR_NULL;
421  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;  if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
422  if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS);  if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
423  if (first_char != NULL)  if (first_byte != NULL)
424    *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :    *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
425       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;       ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
426  return re->top_bracket;  return re->top_bracket;
427  }  }
428    
429    
430    
431    /*************************************************
432    *        Return info about compiled pattern      *
433    *************************************************/
434    
435    /* This is a newer "info" function which has an extensible interface so
436    that additional items can be added compatibly.
437    
438    Arguments:
439      external_re      points to compiled code
440      extra_data       points extra data, or NULL
441      what             what information is required
442      where            where to put the information
443    
444    Returns:           0 if data returned, negative on error
445    */
446    
447    int
448    pcre_fullinfo(const pcre *external_re, const pcre_extra *extra_data, int what,
449      void *where)
450    {
451    const real_pcre *re = (const real_pcre *)external_re;
452    const pcre_study_data *study = NULL;
453    
454    if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
455    if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
456    
457    if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
458      study = extra_data->study_data;
459    
460    switch (what)
461      {
462      case PCRE_INFO_OPTIONS:
463      *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
464      break;
465    
466      case PCRE_INFO_SIZE:
467      *((size_t *)where) = re->size;
468      break;
469    
470      case PCRE_INFO_STUDYSIZE:
471      *((size_t *)where) = (study == NULL)? 0 : study->size;
472      break;
473    
474      case PCRE_INFO_CAPTURECOUNT:
475      *((int *)where) = re->top_bracket;
476      break;
477    
478      case PCRE_INFO_BACKREFMAX:
479      *((int *)where) = re->top_backref;
480      break;
481    
482      case PCRE_INFO_FIRSTBYTE:
483      *((int *)where) =
484        ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
485        ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
486      break;
487    
488      case PCRE_INFO_FIRSTTABLE:
489      *((const uschar **)where) =
490        (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
491          study->start_bits : NULL;
492      break;
493    
494      case PCRE_INFO_LASTLITERAL:
495      *((int *)where) =
496        ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
497      break;
498    
499      case PCRE_INFO_NAMEENTRYSIZE:
500      *((int *)where) = re->name_entry_size;
501      break;
502    
503      case PCRE_INFO_NAMECOUNT:
504      *((int *)where) = re->name_count;
505      break;
506    
507      case PCRE_INFO_NAMETABLE:
508      *((const uschar **)where) = (const uschar *)re + sizeof(real_pcre);
509      break;
510    
511      default: return PCRE_ERROR_BADOPTION;
512      }
513    
514    return 0;
515    }
516    
517    
518    
519    /*************************************************
520    * Return info about what features are configured *
521    *************************************************/
522    
523    /* This is function which has an extensible interface so that additional items
524    can be added compatibly.
525    
526    Arguments:
527      what             what information is required
528      where            where to put the information
529    
530    Returns:           0 if data returned, negative on error
531    */
532    
533    int
534    pcre_config(int what, void *where)
535    {
536    switch (what)
537      {
538      case PCRE_CONFIG_UTF8:
539      #ifdef SUPPORT_UTF8
540      *((int *)where) = 1;
541      #else
542      *((int *)where) = 0;
543      #endif
544      break;
545    
546      case PCRE_CONFIG_NEWLINE:
547      *((int *)where) = NEWLINE;
548      break;
549    
550      case PCRE_CONFIG_LINK_SIZE:
551      *((int *)where) = LINK_SIZE;
552      break;
553    
554      case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
555      *((int *)where) = POSIX_MALLOC_THRESHOLD;
556      break;
557    
558      case PCRE_CONFIG_MATCH_LIMIT:
559      *((unsigned int *)where) = MATCH_LIMIT;
560      break;
561    
562      default: return PCRE_ERROR_BADOPTION;
563      }
564    
565    return 0;
566    }
567    
568    
569    
570  #ifdef DEBUG  #ifdef DEBUG
571  /*************************************************  /*************************************************
# Line 227  while (length-- > 0) Line 603  while (length-- > 0)
603    
604  /* This function is called when a \ has been encountered. It either returns a  /* This function is called when a \ has been encountered. It either returns a
605  positive value for a simple escape such as \n, or a negative value which  positive value for a simple escape such as \n, or a negative value which
606  encodes one of the more complicated things such as \d. On entry, ptr is  encodes one of the more complicated things such as \d. When UTF-8 is enabled,
607  pointing at the \. On exit, it is on the final character of the escape  a positive value greater than 255 may be returned. On entry, ptr is pointing at
608  sequence.  the \. On exit, it is on the final character of the escape sequence.
609    
610  Arguments:  Arguments:
611    ptrptr     points to the pattern position pointer    ptrptr     points to the pattern position pointer
# Line 249  check_escape(const uschar **ptrptr, cons Line 625  check_escape(const uschar **ptrptr, cons
625    int options, BOOL isclass, compile_data *cd)    int options, BOOL isclass, compile_data *cd)
626  {  {
627  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
628  int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */  int c, i;
 int i;  
629    
630    /* If backslash is at the end of the pattern, it's an error. */
631    
632    c = *(++ptr);
633  if (c == 0) *errorptr = ERR1;  if (c == 0) *errorptr = ERR1;
634    
635  /* Digits or letters may have special meaning; all others are literals. */  /* Digits or letters may have special meaning; all others are literals. */
# Line 270  else Line 648  else
648    const uschar *oldptr;    const uschar *oldptr;
649    switch (c)    switch (c)
650      {      {
651        /* A number of Perl escapes are not handled by PCRE. We give an explicit
652        error. */
653    
654        case 'l':
655        case 'L':
656        case 'N':
657        case 'p':
658        case 'P':
659        case 'u':
660        case 'U':
661        case 'X':
662        *errorptr = ERR37;
663        break;
664    
665      /* The handling of escape sequences consisting of a string of digits      /* The handling of escape sequences consisting of a string of digits
666      starting with one that is not zero is not straightforward. By experiment,      starting with one that is not zero is not straightforward. By experiment,
667      the way Perl works seems to be as follows:      the way Perl works seems to be as follows:
# Line 311  else Line 703  else
703        }        }
704    
705      /* \0 always starts an octal number, but we may drop through to here with a      /* \0 always starts an octal number, but we may drop through to here with a
706      larger first octal digit */      larger first octal digit. */
707    
708      case '0':      case '0':
709      c -= '0';      c -= '0';
710      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&      while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
711        ptr[1] != '8' && ptr[1] != '9')        ptr[1] != '8' && ptr[1] != '9')
712          c = c * 8 + *(++ptr) - '0';          c = c * 8 + *(++ptr) - '0';
713        c &= 255;     /* Take least significant 8 bits */
714      break;      break;
715    
716      /* Special escapes not starting with a digit are straightforward */      /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
717        which can be greater than 0xff, but only if the ddd are hex digits. */
718    
719      case 'x':      case 'x':
720    #ifdef SUPPORT_UTF8
721        if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
722          {
723          const uschar *pt = ptr + 2;
724          register int count = 0;
725          c = 0;
726          while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
727            {
728            count++;
729            c = c * 16 + cd->lcc[*pt] -
730              (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
731            pt++;
732            }
733          if (*pt == '}')
734            {
735            if (c < 0 || count > 8) *errorptr = ERR34;
736            ptr = pt;
737            break;
738            }
739          /* If the sequence of hex digits does not end with '}', then we don't
740          recognize this construct; fall through to the normal \x handling. */
741          }
742    #endif
743    
744        /* Read just a single-byte hex-defined char (at most two hex digits) */
745    
746      c = 0;      c = 0;
747      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)      while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
748        {        {
# Line 332  else Line 752  else
752        }        }
753      break;      break;
754    
755        /* Other special escapes not starting with a digit are straightforward */
756    
757      case 'c':      case 'c':
758      c = *(++ptr);      c = *(++ptr);
759      if (c == 0)      if (c == 0)
# Line 461  return p; Line 883  return p;
883    
884    
885  /*************************************************  /*************************************************
886    *      Find first significant op code            *
887    *************************************************/
888    
889    /* This is called by several functions that scan a compiled expression looking
890    for a fixed first character, or an anchoring op code etc. It skips over things
891    that do not influence this. For some calls, a change of option is important.
892    
893    Arguments:
894      code       pointer to the start of the group
895      options    pointer to external options
896      optbit     the option bit whose changing is significant, or
897                   zero if none are
898    
899    Returns:     pointer to the first significant opcode
900    */
901    
902    static const uschar*
903    first_significant_code(const uschar *code, int *options, int optbit)
904    {
905    for (;;)
906      {
907      switch ((int)*code)
908        {
909        case OP_OPT:
910        if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
911          *options = (int)code[1];
912        code += 2;
913        break;
914    
915        case OP_ASSERT_NOT:
916        case OP_ASSERTBACK:
917        case OP_ASSERTBACK_NOT:
918        do code += GET(code, 1); while (*code == OP_ALT);
919        /* Fall through */
920    
921        case OP_CALLOUT:
922        case OP_CREF:
923        case OP_BRANUMBER:
924        case OP_WORD_BOUNDARY:
925        case OP_NOT_WORD_BOUNDARY:
926        code += OP_lengths[*code];
927        break;
928    
929        default:
930        return code;
931        }
932      }
933    /* Control never reaches here */
934    }
935    
936    
937    
938    
939    /*************************************************
940  *        Find the fixed length of a pattern      *  *        Find the fixed length of a pattern      *
941  *************************************************/  *************************************************/
942    
943  /* Scan a pattern and compute the fixed length of subject that will match it,  /* Scan a pattern and compute the fixed length of subject that will match it,
944  if the length is fixed. This is needed for dealing with backward assertions.  if the length is fixed. This is needed for dealing with backward assertions.
945    In UTF8 mode, the result is in characters rather than bytes.
946    
947  Arguments:  Arguments:
948    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
949      options  the compiling options
950    
951  Returns:   the fixed length, or -1 if there is no fixed length  Returns:   the fixed length, or -1 if there is no fixed length,
952                 or -2 if \C was encountered
953  */  */
954    
955  static int  static int
956  find_fixedlength(uschar *code)  find_fixedlength(uschar *code, int options)
957  {  {
958  int length = -1;  int length = -1;
959    
960  register int branchlength = 0;  register int branchlength = 0;
961  register uschar *cc = code + 3;  register uschar *cc = code + 1 + LINK_SIZE;
962    
963  /* Scan along the opcodes for this branch. If we get to the end of the  /* Scan along the opcodes for this branch. If we get to the end of the
964  branch, check the length against that of the other branches. */  branch, check the length against that of the other branches. */
# Line 495  for (;;) Line 974  for (;;)
974      case OP_BRA:      case OP_BRA:
975      case OP_ONCE:      case OP_ONCE:
976      case OP_COND:      case OP_COND:
977      d = find_fixedlength(cc);      d = find_fixedlength(cc, options);
978      if (d < 0) return -1;      if (d < 0) return d;
979      branchlength += d;      branchlength += d;
980      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
981      cc += 3;      cc += 1 + LINK_SIZE;
982      break;      break;
983    
984      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested
# Line 514  for (;;) Line 993  for (;;)
993      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
994        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
995      if (*cc != OP_ALT) return length;      if (*cc != OP_ALT) return length;
996      cc += 3;      cc += 1 + LINK_SIZE;
997      branchlength = 0;      branchlength = 0;
998      break;      break;
999    
# Line 524  for (;;) Line 1003  for (;;)
1003      case OP_ASSERT_NOT:      case OP_ASSERT_NOT:
1004      case OP_ASSERTBACK:      case OP_ASSERTBACK:
1005      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
1006      do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
1007      cc += 3;      /* Fall through */
     break;  
1008    
1009      /* Skip over things that don't match chars */      /* Skip over things that don't match chars */
1010    
1011      case OP_REVERSE:      case OP_REVERSE:
1012      cc++;      case OP_BRANUMBER:
   
1013      case OP_CREF:      case OP_CREF:
1014      case OP_OPT:      case OP_OPT:
1015      cc++;      case OP_CALLOUT:
     /* Fall through */  
   
1016      case OP_SOD:      case OP_SOD:
1017        case OP_SOM:
1018      case OP_EOD:      case OP_EOD:
1019      case OP_EODN:      case OP_EODN:
1020      case OP_CIRC:      case OP_CIRC:
1021      case OP_DOLL:      case OP_DOLL:
1022      case OP_NOT_WORD_BOUNDARY:      case OP_NOT_WORD_BOUNDARY:
1023      case OP_WORD_BOUNDARY:      case OP_WORD_BOUNDARY:
1024      cc++;      cc += OP_lengths[*cc];
1025      break;      break;
1026    
1027      /* Handle char strings */      /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
1028        This requires a scan of the string, unfortunately. We assume valid UTF-8
1029        strings, so all we do is reduce the length by one for every byte whose bits
1030        are 10xxxxxx. */
1031    
1032      case OP_CHARS:      case OP_CHARS:
1033      branchlength += *(++cc);      branchlength += *(++cc);
1034    #ifdef SUPPORT_UTF8
1035        if ((options & PCRE_UTF8) != 0)
1036          for (d = 1; d <= *cc; d++)
1037            if ((cc[d] & 0xc0) == 0x80) branchlength--;
1038    #endif
1039      cc += *cc + 1;      cc += *cc + 1;
1040      break;      break;
1041    
1042      /* Handle exact repetitions */      /* Handle exact repetitions. The count is already in characters, but we
1043        need to skip over a multibyte character in UTF8 mode.  */
1044    
1045      case OP_EXACT:      case OP_EXACT:
1046        branchlength += GET2(cc,1);
1047        cc += 4;
1048    #ifdef SUPPORT_UTF8
1049        if ((options & PCRE_UTF8) != 0)
1050          {
1051          while((*cc & 0x80) == 0x80) cc++;
1052          }
1053    #endif
1054        break;
1055    
1056      case OP_TYPEEXACT:      case OP_TYPEEXACT:
1057      branchlength += (cc[1] << 8) + cc[2];      branchlength += GET2(cc,1);
1058      cc += 4;      cc += 4;
1059      break;      break;
1060    
# Line 576  for (;;) Line 1071  for (;;)
1071      cc++;      cc++;
1072      break;      break;
1073    
1074        /* The single-byte matcher isn't allowed */
1075    
1076        case OP_ANYBYTE:
1077        return -2;
1078    
1079      /* Check a class for variable quantification */      /* Check a class for variable quantification */
1080    
1081    #ifdef SUPPORT_UTF8
1082        case OP_XCLASS:
1083        cc += GET(cc, 1) - 33;
1084        /* Fall through */
1085    #endif
1086    
1087      case OP_CLASS:      case OP_CLASS:
1088      cc += (*cc == OP_REF)? 2 : 33;      case OP_NCLASS:
1089        cc += 33;
1090    
1091      switch (*cc)      switch (*cc)
1092        {        {
# Line 592  for (;;) Line 1098  for (;;)
1098    
1099        case OP_CRRANGE:        case OP_CRRANGE:
1100        case OP_CRMINRANGE:        case OP_CRMINRANGE:
1101        if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;        if (GET2(cc,1) != GET2(cc,3)) return -1;
1102        branchlength += (cc[1] << 8) + cc[2];        branchlength += GET2(cc,1);
1103        cc += 5;        cc += 5;
1104        break;        break;
1105    
# Line 615  for (;;) Line 1121  for (;;)
1121    
1122    
1123  /*************************************************  /*************************************************
1124  *           Compile one branch                   *  *    Scan compiled regex for numbered bracket    *
1125  *************************************************/  *************************************************/
1126    
1127  /* Scan the pattern, compiling it into the code vector.  /* This little function scans through a compiled pattern until it finds a
1128    capturing bracket with the given number.
1129    
1130  Arguments:  Arguments:
1131    options      the option bits    code        points to start of expression
1132    brackets     points to number of brackets used    utf8        TRUE in UTF-8 mode
1133    code         points to the pointer to the current code point    number      the required bracket number
   ptrptr       points to the current pattern pointer  
   errorptr     points to pointer to error message  
   optchanged   set to the value of the last OP_OPT item compiled  
   cd           contains pointers to tables  
1134    
1135  Returns:       TRUE on success  Returns:      pointer to the opcode for the bracket, or NULL if not found
                FALSE, with *errorptr set on error  
1136  */  */
1137    
1138  static BOOL  static const uschar *
1139  compile_branch(int options, int *brackets, uschar **codeptr,  find_bracket(const uschar *code, BOOL utf8, int number)
   const uschar **ptrptr, const char **errorptr, int *optchanged,  
   compile_data *cd)  
1140  {  {
1141  int repeat_type, op_type;  for (;;)
1142  int repeat_min, repeat_max;    {
1143  int bravalue, length;    register int c = *code;
1144  int greedy_default, greedy_non_default;    if (c == OP_END) return NULL;
1145  register int c;    else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
1146  register uschar *code = *codeptr;    else if (c > OP_BRA)
1147  uschar *tempcode;      {
1148  const uschar *ptr = *ptrptr;      int n = c - OP_BRA;
1149  const uschar *tempptr;      if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1150  uschar *previous = NULL;      if (n == number) return (uschar *)code;
1151  uschar class[32];      code += OP_lengths[OP_BRA];
1152        }
1153      else
1154        {
1155        code += OP_lengths[c];
1156    
1157  /* Set up the default and non-default settings for greediness */      /* In UTF-8 mode, opcodes that are followed by a character may be followed
1158        by a multi-byte character. The length in the table is a minimum, so we have
1159        to scan along to skip the extra characters. All opcodes are less than 128,
1160        so we can use relatively efficient code. */
1161    
1162    #ifdef SUPPORT_UTF8
1163        if (utf8) switch(c)
1164          {
1165          case OP_EXACT:
1166          case OP_UPTO:
1167          case OP_MINUPTO:
1168          case OP_STAR:
1169          case OP_MINSTAR:
1170          case OP_PLUS:
1171          case OP_MINPLUS:
1172          case OP_QUERY:
1173          case OP_MINQUERY:
1174          while ((*code & 0xc0) == 0x80) code++;
1175          break;
1176          }
1177    #endif
1178        }
1179      }
1180    }
1181    
 greedy_default = ((options & PCRE_UNGREEDY) != 0);  
 greedy_non_default = greedy_default ^ 1;  
1182    
 /* Switch on next character until the end of the branch */  
1183    
1184  for (;; ptr++)  /*************************************************
1185    *    Scan compiled branch for non-emptiness      *
1186    *************************************************/
1187    
1188    /* This function scans through a branch of a compiled pattern to see whether it
1189    can match the empty string or not. It is called only from could_be_empty()
1190    below. Note that first_significant_code() skips over assertions. If we hit an
1191    unclosed bracket, we return "empty" - this means we've struck an inner bracket
1192    whose current branch will already have been scanned.
1193    
1194    Arguments:
1195      code        points to start of search
1196      endcode     points to where to stop
1197      utf8        TRUE if in UTF8 mode
1198    
1199    Returns:      TRUE if what is matched could be empty
1200    */
1201    
1202    static BOOL
1203    could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1204    {
1205    register int c;
1206    for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
1207         code < endcode;
1208         code = first_significant_code(code + OP_lengths[c], NULL, 0))
1209    {    {
1210    BOOL negate_class;    const uschar *ccode;
   int class_charcount;  
   int class_lastchar;  
   int newoptions;  
   int condref;  
1211    
1212    c = *ptr;    c = *code;
1213    if ((options & PCRE_EXTENDED) != 0)  
1214      if (c >= OP_BRA)
1215      {      {
1216      if ((cd->ctypes[c] & ctype_space) != 0) continue;      BOOL empty_branch;
1217      if (c == '#')      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1218    
1219        /* Scan a closed bracket */
1220    
1221        empty_branch = FALSE;
1222        do
1223        {        {
1224        while ((c = *(++ptr)) != 0 && c != '\n');        if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1225        continue;          empty_branch = TRUE;
1226        }        code += GET(code, 1);
1227          }
1228        while (*code == OP_ALT);
1229        if (!empty_branch) return FALSE;   /* All branches are non-empty */
1230        code += 1 + LINK_SIZE;
1231        c = *code;
1232      }      }
1233    
1234    switch(c)    else switch (c)
1235      {      {
1236      /* The branch terminates at end of string, |, or ). */      /* Check for quantifiers after a class */
1237    
1238      case 0:  #ifdef SUPPORT_UTF8
1239      case '|':      case OP_XCLASS:
1240        ccode = code + GET(code, 1);
1241        goto CHECK_CLASS_REPEAT;
1242    #endif
1243    
1244        case OP_CLASS:
1245        case OP_NCLASS:
1246        ccode = code + 33;
1247    
1248    #ifdef SUPPORT_UTF8
1249        CHECK_CLASS_REPEAT:
1250    #endif
1251    
1252        switch (*ccode)
1253          {
1254          case OP_CRSTAR:            /* These could be empty; continue */
1255          case OP_CRMINSTAR:
1256          case OP_CRQUERY:
1257          case OP_CRMINQUERY:
1258          break;
1259    
1260          default:                   /* Non-repeat => class must match */
1261          case OP_CRPLUS:            /* These repeats aren't empty */
1262          case OP_CRMINPLUS:
1263          return FALSE;
1264    
1265          case OP_CRRANGE:
1266          case OP_CRMINRANGE:
1267          if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1268          break;
1269          }
1270        break;
1271    
1272        /* Opcodes that must match a character */
1273    
1274        case OP_NOT_DIGIT:
1275        case OP_DIGIT:
1276        case OP_NOT_WHITESPACE:
1277        case OP_WHITESPACE:
1278        case OP_NOT_WORDCHAR:
1279        case OP_WORDCHAR:
1280        case OP_ANY:
1281        case OP_ANYBYTE:
1282        case OP_CHARS:
1283        case OP_NOT:
1284        case OP_PLUS:
1285        case OP_MINPLUS:
1286        case OP_EXACT:
1287        case OP_NOTPLUS:
1288        case OP_NOTMINPLUS:
1289        case OP_NOTEXACT:
1290        case OP_TYPEPLUS:
1291        case OP_TYPEMINPLUS:
1292        case OP_TYPEEXACT:
1293        return FALSE;
1294    
1295        /* End of branch */
1296    
1297        case OP_KET:
1298        case OP_KETRMAX:
1299        case OP_KETRMIN:
1300        case OP_ALT:
1301        return TRUE;
1302    
1303        /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be
1304        followed by a multibyte character */
1305    
1306    #ifdef SUPPORT_UTF8
1307        case OP_STAR:
1308        case OP_MINSTAR:
1309        case OP_QUERY:
1310        case OP_MINQUERY:
1311        case OP_UPTO:
1312        case OP_MINUPTO:
1313        if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1314        break;
1315    #endif
1316        }
1317      }
1318    
1319    return TRUE;
1320    }
1321    
1322    
1323    
1324    /*************************************************
1325    *    Scan compiled regex for non-emptiness       *
1326    *************************************************/
1327    
1328    /* This function is called to check for left recursive calls. We want to check
1329    the current branch of the current pattern to see if it could match the empty
1330    string. If it could, we must look outwards for branches at other levels,
1331    stopping when we pass beyond the bracket which is the subject of the recursion.
1332    
1333    Arguments:
1334      code        points to start of the recursion
1335      endcode     points to where to stop (current RECURSE item)
1336      bcptr       points to the chain of current (unclosed) branch starts
1337      utf8        TRUE if in UTF-8 mode
1338    
1339    Returns:      TRUE if what is matched could be empty
1340    */
1341    
1342    static BOOL
1343    could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1344      BOOL utf8)
1345    {
1346    while (bcptr != NULL && bcptr->current >= code)
1347      {
1348      if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1349      bcptr = bcptr->outer;
1350      }
1351    return TRUE;
1352    }
1353    
1354    
1355    
1356    /*************************************************
1357    *           Check for POSIX class syntax         *
1358    *************************************************/
1359    
1360    /* This function is called when the sequence "[:" or "[." or "[=" is
1361    encountered in a character class. It checks whether this is followed by an
1362    optional ^ and then a sequence of letters, terminated by a matching ":]" or
1363    ".]" or "=]".
1364    
1365    Argument:
1366      ptr      pointer to the initial [
1367      endptr   where to return the end pointer
1368      cd       pointer to compile data
1369    
1370    Returns:   TRUE or FALSE
1371    */
1372    
1373    static BOOL
1374    check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1375    {
1376    int terminator;          /* Don't combine these lines; the Solaris cc */
1377    terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1378    if (*(++ptr) == '^') ptr++;
1379    while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1380    if (*ptr == terminator && ptr[1] == ']')
1381      {
1382      *endptr = ptr;
1383      return TRUE;
1384      }
1385    return FALSE;
1386    }
1387    
1388    
1389    
1390    
1391    /*************************************************
1392    *          Check POSIX class name                *
1393    *************************************************/
1394    
1395    /* This function is called to check the name given in a POSIX-style class entry
1396    such as [:alnum:].
1397    
1398    Arguments:
1399      ptr        points to the first letter
1400      len        the length of the name
1401    
1402    Returns:     a value representing the name, or -1 if unknown
1403    */
1404    
1405    static int
1406    check_posix_name(const uschar *ptr, int len)
1407    {
1408    register int yield = 0;
1409    while (posix_name_lengths[yield] != 0)
1410      {
1411      if (len == posix_name_lengths[yield] &&
1412        strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1413      yield++;
1414      }
1415    return -1;
1416    }
1417    
1418    
1419    
1420    
1421    /*************************************************
1422    *           Compile one branch                   *
1423    *************************************************/
1424    
1425    /* Scan the pattern, compiling it into the code vector. If the options are
1426    changed during the branch, the pointer is used to change the external options
1427    bits.
1428    
1429    Arguments:
1430      optionsptr     pointer to the option bits
1431      brackets       points to number of extracting brackets used
1432      code           points to the pointer to the current code point
1433      ptrptr         points to the current pattern pointer
1434      errorptr       points to pointer to error message
1435      firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1436      reqbyteptr     set to the last literal character required, else < 0
1437      bcptr          points to current branch chain
1438      cd             contains pointers to tables etc.
1439    
1440    Returns:         TRUE on success
1441                     FALSE, with *errorptr set on error
1442    */
1443    
1444    static BOOL
1445    compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1446      const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
1447      int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1448    {
1449    int repeat_type, op_type;
1450    int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
1451    int bravalue = 0;
1452    int length;
1453    int greedy_default, greedy_non_default;
1454    int firstbyte, reqbyte;
1455    int zeroreqbyte, zerofirstbyte;
1456    int req_caseopt;
1457    int condcount = 0;
1458    int options = *optionsptr;
1459    register int c;
1460    register uschar *code = *codeptr;
1461    uschar *tempcode;
1462    BOOL inescq = FALSE;
1463    BOOL groupsetfirstbyte = FALSE;
1464    const uschar *ptr = *ptrptr;
1465    const uschar *tempptr;
1466    uschar *previous = NULL;
1467    uschar class[32];
1468    
1469    #ifdef SUPPORT_UTF8
1470    BOOL class_utf8;
1471    BOOL utf8 = (options & PCRE_UTF8) != 0;
1472    uschar *class_utf8data;
1473    uschar utf8_char[6];
1474    #else
1475    BOOL utf8 = FALSE;
1476    #endif
1477    
1478    /* Set up the default and non-default settings for greediness */
1479    
1480    greedy_default = ((options & PCRE_UNGREEDY) != 0);
1481    greedy_non_default = greedy_default ^ 1;
1482    
1483    /* Initialize no first char, no required char. REQ_UNSET means "no char
1484    matching encountered yet". It gets changed to REQ_NONE if we hit something that
1485    matches a non-fixed char first char; reqbyte just remains unset if we never
1486    find one.
1487    
1488    When we hit a repeat whose minimum is zero, we may have to adjust these values
1489    to take the zero repeat into account. This is implemented by setting them to
1490    zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1491    item types that can be repeated set these backoff variables appropriately. */
1492    
1493    firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1494    
1495    /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1496    according to the current setting of the caseless flag. REQ_CASELESS is a bit
1497    value > 255. It is added into the firstbyte or reqbyte variables to record the
1498    case status of the value. */
1499    
1500    req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1501    
1502    /* Switch on next character until the end of the branch */
1503    
1504    for (;; ptr++)
1505      {
1506      BOOL negate_class;
1507      BOOL possessive_quantifier;
1508      int class_charcount;
1509      int class_lastchar;
1510      int newoptions;
1511      int recno;
1512      int skipbytes;
1513      int subreqbyte;
1514      int subfirstbyte;
1515    
1516      c = *ptr;
1517      if (inescq && c != 0) goto NORMAL_CHAR;
1518    
1519      if ((options & PCRE_EXTENDED) != 0)
1520        {
1521        if ((cd->ctypes[c] & ctype_space) != 0) continue;
1522        if (c == '#')
1523          {
1524          /* The space before the ; is to avoid a warning on a silly compiler
1525          on the Macintosh. */
1526          while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1527          if (c != 0) continue;   /* Else fall through to handle end of string */
1528          }
1529        }
1530    
1531      switch(c)
1532        {
1533        /* The branch terminates at end of string, |, or ). */
1534    
1535        case 0:
1536        case '|':
1537      case ')':      case ')':
1538        *firstbyteptr = firstbyte;
1539        *reqbyteptr = reqbyte;
1540      *codeptr = code;      *codeptr = code;
1541      *ptrptr = ptr;      *ptrptr = ptr;
1542      return TRUE;      return TRUE;
1543    
1544      /* Handle single-character metacharacters */      /* Handle single-character metacharacters. In multiline mode, ^ disables
1545        the setting of any following char as a first character. */
1546    
1547      case '^':      case '^':
1548        if ((options & PCRE_MULTILINE) != 0)
1549          {
1550          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1551          }
1552      previous = NULL;      previous = NULL;
1553      *code++ = OP_CIRC;      *code++ = OP_CIRC;
1554      break;      break;
# Line 699  for (;; ptr++) Line 1558  for (;; ptr++)
1558      *code++ = OP_DOLL;      *code++ = OP_DOLL;
1559      break;      break;
1560    
1561        /* There can never be a first char if '.' is first, whatever happens about
1562        repeats. The value of reqbyte doesn't change either. */
1563    
1564      case '.':      case '.':
1565        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1566        zerofirstbyte = firstbyte;
1567        zeroreqbyte = reqbyte;
1568      previous = code;      previous = code;
1569      *code++ = OP_ANY;      *code++ = OP_ANY;
1570      break;      break;
1571    
1572      /* Character classes. These always build a 32-byte bitmap of the permitted      /* Character classes. If the included characters are all < 256 in value, we
1573      characters, except in the special case where there is only one character.      build a 32-byte bitmap of the permitted characters, except in the special
1574      For negated classes, we build the map as usual, then invert it at the end.      case where there is only one such character. For negated classes, we build
1575        the map as usual, then invert it at the end. However, we use a different
1576        opcode so that data characters > 255 can be handled correctly.
1577    
1578        If the class contains characters outside the 0-255 range, a different
1579        opcode is compiled. It may optionally have a bit map for characters < 256,
1580        but those above are explicitly listed afterwards. A flag byte tells
1581        whether the bitmap is present, and whether this is a negated class or not.
1582      */      */
1583    
1584      case '[':      case '[':
1585      previous = code;      previous = code;
1586      *code++ = OP_CLASS;  
1587        /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1588        they are encountered at the top level, so we'll do that too. */
1589    
1590        if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1591            check_posix_syntax(ptr, &tempptr, cd))
1592          {
1593          *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
1594          goto FAILED;
1595          }
1596    
1597      /* If the first character is '^', set the negation flag and skip it. */      /* If the first character is '^', set the negation flag and skip it. */
1598    
# Line 720  for (;; ptr++) Line 1601  for (;; ptr++)
1601        negate_class = TRUE;        negate_class = TRUE;
1602        c = *(++ptr);        c = *(++ptr);
1603        }        }
1604      else negate_class = FALSE;      else
1605          {
1606          negate_class = FALSE;
1607          }
1608    
1609      /* Keep a count of chars so that we can optimize the case of just a single      /* Keep a count of chars with values < 256 so that we can optimize the case
1610      character. */      of just a single character (as long as it's < 256). For higher valued UTF-8
1611        characters, we don't yet do any optimization. */
1612    
1613      class_charcount = 0;      class_charcount = 0;
1614      class_lastchar = -1;      class_lastchar = -1;
1615    
1616    #ifdef SUPPORT_UTF8
1617        class_utf8 = FALSE;                       /* No chars >= 256 */
1618        class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */
1619    #endif
1620    
1621      /* Initialize the 32-char bit map to all zeros. We have to build the      /* Initialize the 32-char bit map to all zeros. We have to build the
1622      map in a temporary bit of store, in case the class contains only 1      map in a temporary bit of store, in case the class contains only 1
1623      character, because in that case the compiled code doesn't use the      character (< 256), because in that case the compiled code doesn't use the
1624      bit map. */      bit map. */
1625    
1626      memset(class, 0, 32 * sizeof(uschar));      memset(class, 0, 32 * sizeof(uschar));
1627    
1628      /* Process characters until ] is reached. By writing this as a "do" it      /* Process characters until ] is reached. By writing this as a "do" it
1629      means that an initial ] is taken as a data character. */      means that an initial ] is taken as a data character. The first pass
1630        through the regex checked the overall syntax, so we don't need to be very
1631        strict here. At the start of the loop, c contains the first byte of the
1632        character. */
1633    
1634      do      do
1635        {        {
1636        if (c == 0)  #ifdef SUPPORT_UTF8
1637          if (utf8 && c > 127) GETCHARLEN(c, ptr, ptr);
1638    #endif
1639    
1640          /* Inside \Q...\E everything is literal except \E */
1641    
1642          if (inescq)
1643          {          {
1644          *errorptr = ERR6;          if (c == '\\' && ptr[1] == 'E')
1645          goto FAILED;            {
1646              inescq = FALSE;
1647              ptr++;
1648              continue;
1649              }
1650            else goto LONE_SINGLE_CHARACTER;
1651            }
1652    
1653          /* Handle POSIX class names. Perl allows a negation extension of the
1654          form [:^name:]. A square bracket that doesn't match the syntax is
1655          treated as a literal. We also recognize the POSIX constructions
1656          [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1657          5.6 and 5.8 do. */
1658    
1659          if (c == '[' &&
1660              (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1661              check_posix_syntax(ptr, &tempptr, cd))
1662            {
1663            BOOL local_negate = FALSE;
1664            int posix_class, i;
1665            register const uschar *cbits = cd->cbits;
1666    
1667            if (ptr[1] != ':')
1668              {
1669              *errorptr = ERR31;
1670              goto FAILED;
1671              }
1672    
1673            ptr += 2;
1674            if (*ptr == '^')
1675              {
1676              local_negate = TRUE;
1677              ptr++;
1678              }
1679    
1680            posix_class = check_posix_name(ptr, tempptr - ptr);
1681            if (posix_class < 0)
1682              {
1683              *errorptr = ERR30;
1684              goto FAILED;
1685              }
1686    
1687            /* If matching is caseless, upper and lower are converted to
1688            alpha. This relies on the fact that the class table starts with
1689            alpha, lower, upper as the first 3 entries. */
1690    
1691            if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1692              posix_class = 0;
1693    
1694            /* Or into the map we are building up to 3 of the static class
1695            tables, or their negations. The [:blank:] class sets up the same
1696            chars as the [:space:] class (all white space). We remove the vertical
1697            white space chars afterwards. */
1698    
1699            posix_class *= 3;
1700            for (i = 0; i < 3; i++)
1701              {
1702              BOOL isblank = strncmp(ptr, "blank", 5) == 0;
1703              int taboffset = posix_class_maps[posix_class + i];
1704              if (taboffset < 0) break;
1705              if (local_negate)
1706                {
1707                for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1708                if (isblank) class[1] |= 0x3c;
1709                }
1710              else
1711                {
1712                for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1713                if (isblank) class[1] &= ~0x3c;
1714                }
1715              }
1716    
1717            ptr = tempptr + 1;
1718            class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1719            continue;    /* End of POSIX syntax handling */
1720          }          }
1721    
1722        /* Backslash may introduce a single character, or it may introduce one        /* Backslash may introduce a single character, or it may introduce one
# Line 752  for (;; ptr++) Line 1725  for (;; ptr++)
1725        Inside a class (and only there) it is treated as backspace. Elsewhere        Inside a class (and only there) it is treated as backspace. Elsewhere
1726        it marks a word boundary. Other escapes have preset maps ready to        it marks a word boundary. Other escapes have preset maps ready to
1727        or into the one we are building. We assume they have more than one        or into the one we are building. We assume they have more than one
1728        character in them, so set class_count bigger than one. */        character in them, so set class_charcount bigger than one. */
1729    
1730        if (c == '\\')        if (c == '\\')
1731          {          {
1732          c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);          c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1733          if (-c == ESC_b) c = '\b';          if (-c == ESC_b) c = '\b';  /* \b is backspace in a class */
1734    
1735            if (-c == ESC_Q)            /* Handle start of quoted string */
1736              {
1737              if (ptr[1] == '\\' && ptr[2] == 'E')
1738                {
1739                ptr += 2; /* avoid empty string */
1740                }
1741              else inescq = TRUE;
1742              continue;
1743              }
1744    
1745          else if (c < 0)          else if (c < 0)
1746            {            {
1747            register const uschar *cbits = cd->cbits;            register const uschar *cbits = cd->cbits;
1748            class_charcount = 10;            class_charcount = 10;     /* Greater than 1 is what matters */
1749            switch (-c)            switch (-c)
1750              {              {
1751              case ESC_d:              case ESC_d:
# Line 773  for (;; ptr++) Line 1757  for (;; ptr++)
1757              continue;              continue;
1758    
1759              case ESC_w:              case ESC_w:
1760              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
               class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);  
1761              continue;              continue;
1762    
1763              case ESC_W:              case ESC_W:
1764              for (c = 0; c < 32; c++)              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
               class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);  
1765              continue;              continue;
1766    
1767              case ESC_s:              case ESC_s:
1768              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1769                class[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
1770              continue;              continue;
1771    
1772              case ESC_S:              case ESC_S:
1773              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];              for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1774                class[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
1775              continue;              continue;
1776    
1777                /* Unrecognized escapes are faulted if PCRE is running in its
1778                strict mode. By default, for compatibility with Perl, they are
1779                treated as literals. */
1780    
1781              default:              default:
1782              *errorptr = ERR7;              if ((options & PCRE_EXTRA) != 0)
1783              goto FAILED;                {
1784                  *errorptr = ERR7;
1785                  goto FAILED;
1786                  }
1787                c = *ptr;    /* The final character */
1788              }              }
1789            }            }
1790          /* Fall through if single character */  
1791          }          /* Fall through if we have a single character (c >= 0). This may be
1792            > 255 in UTF-8 mode. */
1793    
1794            }   /* End of backslash handling */
1795    
1796        /* A single character may be followed by '-' to form a range. However,        /* A single character may be followed by '-' to form a range. However,
1797        Perl does not permit ']' to be the end of the range. A '-' character        Perl does not permit ']' to be the end of the range. A '-' character
# Line 806  for (;; ptr++) Line 1801  for (;; ptr++)
1801          {          {
1802          int d;          int d;
1803          ptr += 2;          ptr += 2;
         d = *ptr;  
1804    
1805          if (d == 0)  #ifdef SUPPORT_UTF8
1806            {          if (utf8)
1807            *errorptr = ERR6;            {                           /* Braces are required because the */
1808            goto FAILED;            GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
1809            }            }
1810            else
1811    #endif
1812            d = *ptr;
1813    
1814          /* The second part of a range can be a single-character escape, but          /* The second part of a range can be a single-character escape, but
1815          not any of the other escapes. */          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1816            in such circumstances. */
1817    
1818          if (d == '\\')          if (d == '\\')
1819            {            {
1820              const uschar *oldptr = ptr;
1821            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);            d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1822    
1823              /* \b is backspace; any other special means the '-' was literal */
1824    
1825            if (d < 0)            if (d < 0)
1826              {              {
1827              if (d == -ESC_b) d = '\b'; else              if (d == -ESC_b) d = '\b'; else
1828                {                {
1829                *errorptr = ERR7;                ptr = oldptr - 2;
1830                goto FAILED;                goto LONE_SINGLE_CHARACTER;  /* A few lines below */
1831                }                }
1832              }              }
1833            }            }
1834    
1835            /* Check that the two values are in the correct order */
1836    
1837          if (d < c)          if (d < c)
1838            {            {
1839            *errorptr = ERR8;            *errorptr = ERR8;
1840            goto FAILED;            goto FAILED;
1841            }            }
1842    
1843            /* If d is greater than 255, we can't just use the bit map, so set up
1844            for the UTF-8 supporting class type. If we are not caseless, we can
1845            just set up a single range. If we are caseless, the characters < 256
1846            are handled with a bitmap, in order to get the case-insensitive
1847            handling. */
1848    
1849    #ifdef SUPPORT_UTF8
1850            if (d > 255)
1851              {
1852              class_utf8 = TRUE;
1853              *class_utf8data++ = XCL_RANGE;
1854              if ((options & PCRE_CASELESS) == 0)
1855                {
1856                class_utf8data += ord2utf8(c, class_utf8data);
1857                class_utf8data += ord2utf8(d, class_utf8data);
1858                continue;  /* Go get the next char in the class */
1859                }
1860              class_utf8data += ord2utf8(256, class_utf8data);
1861              class_utf8data += ord2utf8(d, class_utf8data);
1862              d = 255;
1863              /* Fall through */
1864              }
1865    #endif
1866            /* We use the bit map if the range is entirely < 256, or if part of it
1867            is < 256 and matching is caseless. */
1868    
1869          for (; c <= d; c++)          for (; c <= d; c++)
1870            {            {
1871            class[c/8] |= (1 << (c&7));            class[c/8] |= (1 << (c&7));
# Line 847  for (;; ptr++) Line 1877  for (;; ptr++)
1877            class_charcount++;                /* in case a one-char range */            class_charcount++;                /* in case a one-char range */
1878            class_lastchar = c;            class_lastchar = c;
1879            }            }
1880    
1881          continue;   /* Go get the next char in the class */          continue;   /* Go get the next char in the class */
1882          }          }
1883    
1884        /* Handle a lone single character - we can get here for a normal        /* Handle a lone single character - we can get here for a normal
1885        non-escape char, or after \ that introduces a single character. */        non-escape char, or after \ that introduces a single character. */
1886    
1887        class [c/8] |= (1 << (c&7));        LONE_SINGLE_CHARACTER:
1888        if ((options & PCRE_CASELESS) != 0)  
1889          /* Handle a multibyte character */
1890    
1891    #ifdef SUPPORT_UTF8
1892          if (utf8 && c > 255)
1893          {          {
1894          c = cd->fcc[c];   /* flip case */          class_utf8 = TRUE;
1895          class[c/8] |= (1 << (c&7));          *class_utf8data++ = XCL_SINGLE;
1896            class_utf8data += ord2utf8(c, class_utf8data);
1897            }
1898          else
1899    #endif
1900          /* Handle a single-byte character */
1901            {
1902            class [c/8] |= (1 << (c&7));
1903            if ((options & PCRE_CASELESS) != 0)
1904              {
1905              c = cd->fcc[c];   /* flip case */
1906              class[c/8] |= (1 << (c&7));
1907              }
1908            class_charcount++;
1909            class_lastchar = c;
1910          }          }
       class_charcount++;  
       class_lastchar = c;  
1911        }        }
1912    
1913      /* Loop until ']' reached; the check for end of string happens inside the      /* Loop until ']' reached; the check for end of string happens inside the
1914      loop. This "while" is the end of the "do" above. */      loop. This "while" is the end of the "do" above. */
1915    
1916      while ((c = *(++ptr)) != ']');      while ((c = *(++ptr)) != ']' || inescq);
1917    
1918      /* If class_charcount is 1 and class_lastchar is not negative, we saw      /* If class_charcount is 1, we saw precisely one character with a value <
1919      precisely one character. This doesn't need the whole 32-byte bit map.      256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
1920      We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if      the one character is < 128. In non-UTF-8 mode we can always optimize.
1921      it's negative. */  
1922        The optimization throws away the bit map. We turn the item into a
1923        1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
1924        that OP_NOT does not support multibyte characters. In the positive case, it
1925        can cause firstbyte to be set. Otherwise, there can be no first char if
1926        this item is first, whatever repeat count may follow. In the case of
1927        reqbyte, save the previous value for reinstating. */
1928    
1929      if (class_charcount == 1 && class_lastchar >= 0)  #ifdef SUPPORT_UTF8
1930        if (!class_utf8 && class_charcount == 1 && class_lastchar < 128)
1931    #else
1932        if (class_charcount == 1)
1933    #endif
1934        {        {
1935          zeroreqbyte = reqbyte;
1936        if (negate_class)        if (negate_class)
1937          {          {
1938          code[-1] = OP_NOT;          if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1939            zerofirstbyte = firstbyte;
1940            *code++ = OP_NOT;
1941          }          }
1942        else        else
1943          {          {
1944          code[-1] = OP_CHARS;          if (firstbyte == REQ_UNSET)
1945              {
1946              zerofirstbyte = REQ_NONE;
1947              firstbyte = class_lastchar | req_caseopt;
1948              }
1949            else
1950              {
1951              zerofirstbyte = firstbyte;
1952              reqbyte = class_lastchar | req_caseopt;
1953              }
1954            *code++ = OP_CHARS;
1955          *code++ = 1;          *code++ = 1;
1956          }          }
1957        *code++ = class_lastchar;        *code++ = class_lastchar;
1958          break;  /* End of class handling */
1959          }       /* End of 1-byte optimization */
1960    
1961        /* Otherwise, if this is the first thing in the branch, there can be no
1962        first char setting, whatever the repeat count. Any reqbyte setting must
1963        remain unchanged after any kind of repeat. */
1964    
1965        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1966        zerofirstbyte = firstbyte;
1967        zeroreqbyte = reqbyte;
1968    
1969        /* If there are characters with values > 255, we have to compile an
1970        extended class, with its own opcode. If there are no characters < 256,
1971        we can omit the bitmap. */
1972    
1973    #ifdef SUPPORT_UTF8
1974        if (class_utf8)
1975          {
1976          *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
1977          *code++ = OP_XCLASS;
1978          code += LINK_SIZE;
1979          *code = negate_class? XCL_NOT : 0;
1980    
1981          /* If the map is required, install it, and move on to the end of
1982          the extra data */
1983    
1984          if (class_charcount > 0)
1985            {
1986            *code++ |= XCL_MAP;
1987            memcpy(code, class, 32);
1988            code = class_utf8data;
1989            }
1990    
1991          /* If the map is not required, slide down the extra data. */
1992    
1993          else
1994            {
1995            int len = class_utf8data - (code + 33);
1996            memmove(code + 1, code + 33, len);
1997            code += len + 1;
1998            }
1999    
2000          /* Now fill in the complete length of the item */
2001    
2002          PUT(previous, 1, code - previous);
2003          break;   /* End of class handling */
2004        }        }
2005    #endif
2006    
2007      /* Otherwise, negate the 32-byte map if necessary, and copy it into      /* If there are no characters > 255, negate the 32-byte map if necessary,
2008      the code vector. */      and copy it into the code vector. If this is the first thing in the branch,
2009        there can be no first char setting, whatever the repeat count. Any reqbyte
2010        setting must remain unchanged after any kind of repeat. */
2011    
2012        if (negate_class)
2013          {
2014          *code++ = OP_NCLASS;
2015          for (c = 0; c < 32; c++) code[c] = ~class[c];
2016          }
2017      else      else
2018        {        {
2019        if (negate_class)        *code++ = OP_CLASS;
2020          for (c = 0; c < 32; c++) code[c] = ~class[c];        memcpy(code, class, 32);
       else  
         memcpy(code, class, 32);  
       code += 32;  
2021        }        }
2022        code += 32;
2023      break;      break;
2024    
2025      /* Various kinds of repeat */      /* Various kinds of repeat */
# Line 929  for (;; ptr++) Line 2051  for (;; ptr++)
2051        goto FAILED;        goto FAILED;
2052        }        }
2053    
2054      /* If the next character is '?' this is a minimizing repeat, by default,      if (repeat_min == 0)
2055      but if PCRE_UNGREEDY is set, it works the other way round. Advance to the        {
2056      next character. */        firstbyte = zerofirstbyte;   /* Adjust for zero repeat */
2057          reqbyte = zeroreqbyte;       /* Ditto */
2058          }
2059    
2060      if (ptr[1] == '?')      op_type = 0;                    /* Default single-char op codes */
2061        { repeat_type = greedy_non_default; ptr++; }      possessive_quantifier = FALSE;  /* Default not possessive quantifier */
2062    
2063        /* Save start of previous item, in case we have to move it up to make space
2064        for an inserted OP_ONCE for the additional '+' extension. */
2065    
2066        tempcode = previous;
2067    
2068        /* If the next character is '+', we have a possessive quantifier. This
2069        implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2070        If the next character is '?' this is a minimizing repeat, by default,
2071        but if PCRE_UNGREEDY is set, it works the other way round. We change the
2072        repeat type to the non-default. */
2073    
2074        if (ptr[1] == '+')
2075          {
2076          repeat_type = 0;                  /* Force greedy */
2077          possessive_quantifier = TRUE;
2078          ptr++;
2079          }
2080        else if (ptr[1] == '?')
2081          {
2082          repeat_type = greedy_non_default;
2083          ptr++;
2084          }
2085      else repeat_type = greedy_default;      else repeat_type = greedy_default;
2086    
2087      /* If the maximum is zero then the minimum must also be zero; Perl allows      /* If previous was a recursion, we need to wrap it inside brackets so that
2088      this case, so we do too - by simply omitting the item altogether. */      it can be replicated if necessary. */
2089    
2090      if (repeat_max == 0) code = previous;      if (*previous == OP_RECURSE)
2091          {
2092          memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2093          code += 1 + LINK_SIZE;
2094          *previous = OP_BRA;
2095          PUT(previous, 1, code - previous);
2096          *code = OP_KET;
2097          PUT(code, 1, code - previous);
2098          code += 1 + LINK_SIZE;
2099          }
2100    
2101      /* If previous was a string of characters, chop off the last one and use it      /* If previous was a string of characters, chop off the last one and use it
2102      as the subject of the repeat. If there was only one character, we can      as the subject of the repeat. If there was only one character, we can
2103      abolish the previous item altogether. */      abolish the previous item altogether. If a one-char item has a minimum of
2104        more than one, ensure that it is set in reqbyte - it might not be if a
2105      else if (*previous == OP_CHARS)      sequence such as x{3} is the first thing in a branch because the x will
2106        {      have gone into firstbyte instead.  */
2107        int len = previous[1];  
2108        if (len == 1)      if (*previous == OP_CHARS)
2109          {        {
2110          c = previous[2];        /* Deal with UTF-8 characters that take up more than one byte. It's
2111          code = previous;        easier to write this out separately than try to macrify it. Use c to
2112          hold the length of the character in bytes, plus 0x80 to flag that it's a
2113          length rather than a small character. */
2114    
2115    #ifdef SUPPORT_UTF8
2116          if (utf8 && (code[-1] & 0x80) != 0)
2117            {
2118            uschar *lastchar = code - 1;
2119            while((*lastchar & 0xc0) == 0x80) lastchar--;
2120            c = code - lastchar;            /* Length of UTF-8 character */
2121            memcpy(utf8_char, lastchar, c); /* Save the char */
2122            if (lastchar == previous + 2)   /* There was only one character */
2123              {
2124              code = previous;              /* Abolish the previous item */
2125              }
2126            else
2127              {
2128              previous[1] -= c;             /* Adjust length of previous */
2129              code = lastchar;              /* Lost char off the end */
2130              tempcode = code;              /* Adjust position to be moved for '+' */
2131              }
2132            c |= 0x80;                      /* Flag c as a length */
2133          }          }
2134        else        else
2135    #endif
2136    
2137          /* Handle the case of a single byte - either with no UTF8 support, or
2138          with UTF-8 disabled, or for a UTF-8 character < 128. */
2139    
2140          {          {
2141          c = previous[len+1];          c = *(--code);
2142          previous[1]--;          if (code == previous + 2)   /* There was only one character */
2143          code--;            {
2144              code = previous;              /* Abolish the previous item */
2145              if (repeat_min > 1) reqbyte = c | req_caseopt;
2146              }
2147            else
2148              {
2149              previous[1]--;             /* adjust length */
2150              tempcode = code;           /* Adjust position to be moved for '+' */
2151              }
2152          }          }
2153        op_type = 0;                 /* Use single-char op codes */  
2154        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
2155        }        }
2156    
2157      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
2158      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
2159      character repeats by adding a suitable offset into repeat_type. */      character repeats by setting op_type to add a suitable offset into
2160        repeat_type. OP_NOT is currently used only for single-byte chars. */
2161    
2162      else if ((int)*previous == OP_NOT)      else if (*previous == OP_NOT)
2163        {        {
2164        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */        op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
2165        c = previous[1];        c = previous[1];
# Line 978  for (;; ptr++) Line 2169  for (;; ptr++)
2169    
2170      /* If previous was a character type match (\d or similar), abolish it and      /* If previous was a character type match (\d or similar), abolish it and
2171      create a suitable repeat item. The code is shared with single-character      create a suitable repeat item. The code is shared with single-character
2172      repeats by adding a suitable offset into repeat_type. */      repeats by setting op_type to add a suitable offset into repeat_type. */
2173    
2174      else if ((int)*previous < OP_EODN || *previous == OP_ANY)      else if (*previous < OP_EODN)
2175        {        {
2176        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */        op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
2177        c = *previous;        c = *previous;
2178        code = previous;        code = previous;
2179    
2180        OUTPUT_SINGLE_REPEAT:        OUTPUT_SINGLE_REPEAT:
2181        repeat_type += op_type;      /* Combine both values for many cases */  
2182          /* If the maximum is zero then the minimum must also be zero; Perl allows
2183          this case, so we do too - by simply omitting the item altogether. */
2184    
2185          if (repeat_max == 0) goto END_REPEAT;
2186    
2187          /* Combine the op_type with the repeat_type */
2188    
2189          repeat_type += op_type;
2190    
2191        /* A minimum of zero is handled either as the special case * or ?, or as        /* A minimum of zero is handled either as the special case * or ?, or as
2192        an UPTO, with the maximum given. */        an UPTO, with the maximum given. */
# Line 999  for (;; ptr++) Line 2198  for (;; ptr++)
2198          else          else
2199            {            {
2200            *code++ = OP_UPTO + repeat_type;            *code++ = OP_UPTO + repeat_type;
2201            *code++ = repeat_max >> 8;            PUT2INC(code, 0, repeat_max);
           *code++ = (repeat_max & 255);  
2202            }            }
2203          }          }
2204    
# Line 1017  for (;; ptr++) Line 2215  for (;; ptr++)
2215          if (repeat_min != 1)          if (repeat_min != 1)
2216            {            {
2217            *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */            *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
2218            *code++ = repeat_min >> 8;            PUT2INC(code, 0, repeat_min);
           *code++ = (repeat_min & 255);  
2219            }            }
2220    
2221          /* If the minimum is 1 and the previous item was a character string,          /* If the minimum is 1 and the previous item was a character string,
# Line 1026  for (;; ptr++) Line 2223  for (;; ptr++)
2223          length was 1, or add the character back onto the end of a longer          length was 1, or add the character back onto the end of a longer
2224          string. For a character type nothing need be done; it will just get          string. For a character type nothing need be done; it will just get
2225          put back naturally. Note that the final character is always going to          put back naturally. Note that the final character is always going to
2226          get added below. */          get added below, so we leave code ready for its insertion. */
2227    
2228          else if (*previous == OP_CHARS)          else if (*previous == OP_CHARS)
2229            {            {
2230            if (code == previous) code += 2; else previous[1]++;            if (code == previous) code += 2; else
2231    
2232              /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
2233              bit set as a flag. The length will always be between 2 and 6. */
2234    
2235    #ifdef SUPPORT_UTF8
2236              if (utf8 && c >= 128) previous[1] += c & 7; else
2237    #endif
2238              previous[1]++;
2239            }            }
2240    
2241          /*  For a single negated character we also have to put back the          /*  For a single negated character we also have to put back the
2242          item that got cancelled. */          item that got cancelled. At present this applies only to single byte
2243            characters in any mode. */
2244    
2245          else if (*previous == OP_NOT) code++;          else if (*previous == OP_NOT) code++;
2246    
2247          /* If the maximum is unlimited, insert an OP_STAR. */          /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2248            we have to insert the character for the previous code. In UTF-8 mode,
2249            long characters have their length in c, with the 0x80 bit as a flag. */
2250    
2251          if (repeat_max < 0)          if (repeat_max < 0)
2252            {            {
2253    #ifdef SUPPORT_UTF8
2254              if (utf8 && c >= 128)
2255                {
2256                memcpy(code, utf8_char, c & 7);
2257                code += c & 7;
2258                }
2259              else
2260    #endif
2261            *code++ = c;            *code++ = c;
2262            *code++ = OP_STAR + repeat_type;            *code++ = OP_STAR + repeat_type;
2263            }            }
2264    
2265          /* Else insert an UPTO if the max is greater than the min. */          /* Else insert an UPTO if the max is greater than the min, again
2266            preceded by the character, for the previously inserted code. */
2267    
2268          else if (repeat_max != repeat_min)          else if (repeat_max != repeat_min)
2269            {            {
2270    #ifdef SUPPORT_UTF8
2271              if (utf8 && c >= 128)
2272                {
2273                memcpy(code, utf8_char, c & 7);
2274                code += c & 7;
2275                }
2276              else
2277    #endif
2278            *code++ = c;            *code++ = c;
2279            repeat_max -= repeat_min;            repeat_max -= repeat_min;
2280            *code++ = OP_UPTO + repeat_type;            *code++ = OP_UPTO + repeat_type;
2281            *code++ = repeat_max >> 8;            PUT2INC(code, 0, repeat_max);
           *code++ = (repeat_max & 255);  
2282            }            }
2283          }          }
2284    
2285        /* The character or character type itself comes last in all cases. */        /* The character or character type itself comes last in all cases. */
2286    
2287    #ifdef SUPPORT_UTF8
2288          if (utf8 && c >= 128)
2289            {
2290            memcpy(code, utf8_char, c & 7);
2291            code += c & 7;
2292            }
2293          else
2294    #endif
2295    
2296        *code++ = c;        *code++ = c;
2297        }        }
2298    
2299      /* If previous was a character class or a back reference, we put the repeat      /* If previous was a character class or a back reference, we put the repeat
2300      stuff after it. */      stuff after it, but just skip the item if the repeat was {0,0}. */
2301    
2302      else if (*previous == OP_CLASS || *previous == OP_REF)      else if (*previous == OP_CLASS ||
2303                 *previous == OP_NCLASS ||
2304    #ifdef SUPPORT_UTF8
2305                 *previous == OP_XCLASS ||
2306    #endif
2307                 *previous == OP_REF)
2308        {        {
2309          if (repeat_max == 0)
2310            {
2311            code = previous;
2312            goto END_REPEAT;
2313            }
2314        if (repeat_min == 0 && repeat_max == -1)        if (repeat_min == 0 && repeat_max == -1)
2315          *code++ = OP_CRSTAR + repeat_type;          *code++ = OP_CRSTAR + repeat_type;
2316        else if (repeat_min == 1 && repeat_max == -1)        else if (repeat_min == 1 && repeat_max == -1)
# Line 1077  for (;; ptr++) Line 2320  for (;; ptr++)
2320        else        else
2321          {          {
2322          *code++ = OP_CRRANGE + repeat_type;          *code++ = OP_CRRANGE + repeat_type;
2323          *code++ = repeat_min >> 8;          PUT2INC(code, 0, repeat_min);
         *code++ = repeat_min & 255;  
2324          if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */          if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
2325          *code++ = repeat_max >> 8;          PUT2INC(code, 0, repeat_max);
         *code++ = repeat_max & 255;  
2326          }          }
2327        }        }
2328    
2329      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
2330      cases. */      cases. */
2331    
2332      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||      else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2333               (int)*previous == OP_COND)               *previous == OP_COND)
2334        {        {
2335        register int i;        register int i;
2336        int ketoffset = 0;        int ketoffset = 0;
# Line 1105  for (;; ptr++) Line 2346  for (;; ptr++)
2346        if (repeat_max == -1)        if (repeat_max == -1)
2347          {          {
2348          register uschar *ket = previous;          register uschar *ket = previous;
2349          do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);          do ket += GET(ket, 1); while (*ket != OP_KET);
2350          ketoffset = code - ket;          ketoffset = code - ket;
2351          }          }
2352    
# Line 1113  for (;; ptr++) Line 2354  for (;; ptr++)
2354        OP_BRAZERO in front of it, and because the group appears once in the        OP_BRAZERO in front of it, and because the group appears once in the
2355        data, whereas in other cases it appears the minimum number of times. For        data, whereas in other cases it appears the minimum number of times. For
2356        this reason, it is simplest to treat this case separately, as otherwise        this reason, it is simplest to treat this case separately, as otherwise
2357        the code gets far too mess. There are several special subcases when the        the code gets far too messy. There are several special subcases when the
2358        minimum is zero. */        minimum is zero. */
2359    
2360        if (repeat_min == 0)        if (repeat_min == 0)
# Line 1124  for (;; ptr++) Line 2365  for (;; ptr++)
2365          if (repeat_max == 0)          if (repeat_max == 0)
2366            {            {
2367            code = previous;            code = previous;
2368            previous = NULL;            goto END_REPEAT;
           break;  
2369            }            }
2370    
2371          /* If the maximum is 1 or unlimited, we just have to stick in the          /* If the maximum is 1 or unlimited, we just have to stick in the
# Line 1148  for (;; ptr++) Line 2388  for (;; ptr++)
2388          else          else
2389            {            {
2390            int offset;            int offset;
2391            memmove(previous+4, previous, len);            memmove(previous + 2 + LINK_SIZE, previous, len);
2392            code += 4;            code += 2 + LINK_SIZE;
2393            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
2394            *previous++ = OP_BRA;            *previous++ = OP_BRA;
2395    
# Line 1158  for (;; ptr++) Line 2398  for (;; ptr++)
2398    
2399            offset = (bralink == NULL)? 0 : previous - bralink;            offset = (bralink == NULL)? 0 : previous - bralink;
2400            bralink = previous;            bralink = previous;
2401            *previous++ = offset >> 8;            PUTINC(previous, 0, offset);
           *previous++ = offset & 255;  
2402            }            }
2403    
2404          repeat_max--;          repeat_max--;
# Line 1167  for (;; ptr++) Line 2406  for (;; ptr++)
2406    
2407        /* If the minimum is greater than zero, replicate the group as many        /* If the minimum is greater than zero, replicate the group as many
2408        times as necessary, and adjust the maximum to the number of subsequent        times as necessary, and adjust the maximum to the number of subsequent
2409        copies that we need. */        copies that we need. If we set a first char from the group, and didn't
2410          set a required char, copy the latter from the former. */
2411    
2412        else        else
2413          {          {
2414          for (i = 1; i < repeat_min; i++)          if (repeat_min > 1)
2415            {            {
2416            memcpy(code, previous, len);            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2417            code += len;            for (i = 1; i < repeat_min; i++)
2418                {
2419                memcpy(code, previous, len);
2420                code += len;
2421                }
2422            }            }
2423          if (repeat_max > 0) repeat_max -= repeat_min;          if (repeat_max > 0) repeat_max -= repeat_min;
2424          }          }
# Line 1200  for (;; ptr++) Line 2444  for (;; ptr++)
2444              *code++ = OP_BRA;              *code++ = OP_BRA;
2445              offset = (bralink == NULL)? 0 : code - bralink;              offset = (bralink == NULL)? 0 : code - bralink;
2446              bralink = code;              bralink = code;
2447              *code++ = offset >> 8;              PUTINC(code, 0, offset);
             *code++ = offset & 255;  
2448              }              }
2449    
2450            memcpy(code, previous, len);            memcpy(code, previous, len);
# Line 1216  for (;; ptr++) Line 2459  for (;; ptr++)
2459            int oldlinkoffset;            int oldlinkoffset;
2460            int offset = code - bralink + 1;            int offset = code - bralink + 1;
2461            uschar *bra = code - offset;            uschar *bra = code - offset;
2462            oldlinkoffset = (bra[1] << 8) + bra[2];            oldlinkoffset = GET(bra, 1);
2463            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2464            *code++ = OP_KET;            *code++ = OP_KET;
2465            *code++ = bra[1] = offset >> 8;            PUTINC(code, 0, offset);
2466            *code++ = bra[2] = (offset & 255);            PUT(bra, 1, offset);
2467            }            }
2468          }          }
2469    
# Line 1230  for (;; ptr++) Line 2473  for (;; ptr++)
2473        correct offset was computed above. */        correct offset was computed above. */
2474    
2475        else code[-ketoffset] = OP_KETRMAX + repeat_type;        else code[-ketoffset] = OP_KETRMAX + repeat_type;
   
   
 #ifdef NEVER  
       /* If the minimum is greater than zero, and the maximum is unlimited or  
       equal to the minimum, the first copy remains where it is, and is  
       replicated up to the minimum number of times. This case includes the +  
       repeat, but of course no replication is needed in that case. */  
   
       if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min))  
         {  
         for (i = 1; i < repeat_min; i++)  
           {  
           memcpy(code, previous, len);  
           code += len;  
           }  
         }  
   
       /* If the minimum is zero, stick BRAZERO in front of the first copy.  
       Then, if there is a fixed upper limit, replicated up to that many times,  
       sticking BRAZERO in front of all the optional ones. */  
   
       else  
         {  
         if (repeat_min == 0)  
           {  
           memmove(previous+1, previous, len);  
           code++;  
           *previous++ = OP_BRAZERO + repeat_type;  
           }  
   
         for (i = 1; i < repeat_min; i++)  
           {  
           memcpy(code, previous, len);  
           code += len;  
           }  
   
         for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++)  
           {  
           *code++ = OP_BRAZERO + repeat_type;  
           memcpy(code, previous, len);  
           code += len;  
           }  
         }  
   
       /* If the maximum is unlimited, set a repeater in the final copy. We  
       can't just offset backwards from the current code point, because we  
       don't know if there's been an options resetting after the ket. The  
       correct offset was computed above. */  
   
       if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;  
 #endif  
   
   
2476        }        }
2477    
2478      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1293  for (;; ptr++) Line 2483  for (;; ptr++)
2483        goto FAILED;        goto FAILED;
2484        }        }
2485    
2486        /* If the character following a repeat is '+', we wrap the entire repeated
2487        item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2488        Sun's Java package. The repeated item starts at tempcode, not at previous,
2489        which might be the first part of a string whose (former) last char we
2490        repeated. However, we don't support '+' after a greediness '?'. */
2491    
2492        if (possessive_quantifier)
2493          {
2494          int len = code - tempcode;
2495          memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2496          code += 1 + LINK_SIZE;
2497          len += 1 + LINK_SIZE;
2498          tempcode[0] = OP_ONCE;
2499          *code++ = OP_KET;
2500          PUTINC(code, 0, len);
2501          PUT(tempcode, 1, len);
2502          }
2503    
2504      /* In all cases we no longer have a previous item. */      /* In all cases we no longer have a previous item. */
2505    
2506        END_REPEAT:
2507      previous = NULL;      previous = NULL;
2508      break;      break;
2509    
# Line 1308  for (;; ptr++) Line 2517  for (;; ptr++)
2517    
2518      case '(':      case '(':
2519      newoptions = options;      newoptions = options;
2520      condref = -1;      skipbytes = 0;
2521    
2522      if (*(++ptr) == '?')      if (*(++ptr) == '?')
2523        {        {
# Line 1329  for (;; ptr++) Line 2538  for (;; ptr++)
2538    
2539          case '(':          case '(':
2540          bravalue = OP_COND;       /* Conditional group */          bravalue = OP_COND;       /* Conditional group */
2541          if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)  
2542            /* Condition to test for recursion */
2543    
2544            if (ptr[1] == 'R')
2545              {
2546              code[1+LINK_SIZE] = OP_CREF;
2547              PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2548              skipbytes = 3;
2549              ptr += 3;
2550              }
2551    
2552            /* Condition to test for a numbered subpattern match */
2553    
2554            else if ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
2555            {            {
2556            condref = *ptr - '0';            int condref = *(++ptr) - '0';
2557            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';            while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2558              if (condref == 0)
2559                {
2560                *errorptr = ERR35;
2561                goto FAILED;
2562                }
2563            ptr++;            ptr++;
2564              code[1+LINK_SIZE] = OP_CREF;
2565              PUT2(code, 2+LINK_SIZE, condref);
2566              skipbytes = 3;
2567            }            }
2568          else ptr--;          /* For conditions that are assertions, we just fall through, having
2569            set bravalue above. */
2570          break;          break;
2571    
2572          case '=':                 /* Positive lookahead */          case '=':                 /* Positive lookahead */
# Line 1360  for (;; ptr++) Line 2591  for (;; ptr++)
2591            bravalue = OP_ASSERTBACK_NOT;            bravalue = OP_ASSERTBACK_NOT;
2592            ptr++;            ptr++;
2593            break;            break;
   
           default:                /* Syntax error */  
           *errorptr = ERR24;  
           goto FAILED;  
2594            }            }
2595          break;          break;
2596    
# Line 1372  for (;; ptr++) Line 2599  for (;; ptr++)
2599          ptr++;          ptr++;
2600          break;          break;
2601    
2602            case 'C':                 /* Callout - may be followed by digits */
2603            *code++ = OP_CALLOUT;
2604              {
2605              int n = 0;
2606              while ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
2607                n = n * 10 + *ptr - '0';
2608              if (n > 255)
2609                {
2610                *errorptr = ERR38;
2611                goto FAILED;
2612                }
2613              *code++ = n;
2614              }
2615            previous = NULL;
2616            continue;
2617    
2618            case 'P':                 /* Named subpattern handling */
2619            if (*(++ptr) == '<')      /* Definition */
2620              {
2621              int i, namelen;
2622              const uschar *name = ++ptr;
2623              uschar *slot = cd->name_table;
2624    
2625              while (*ptr++ != '>');
2626              namelen = ptr - name - 1;
2627    
2628              for (i = 0; i < cd->names_found; i++)
2629                {
2630                int c = memcmp(name, slot+2, namelen + 1);
2631                if (c == 0)
2632                  {
2633                  *errorptr = ERR43;
2634                  goto FAILED;
2635                  }
2636                if (c < 0)
2637                  {
2638                  memmove(slot + cd->name_entry_size, slot,
2639                    (cd->names_found - i) * cd->name_entry_size);
2640                  break;
2641                  }
2642                slot += cd->name_entry_size;
2643                }
2644    
2645              PUT2(slot, 0, *brackets + 1);
2646              memcpy(slot + 2, name, namelen);
2647              slot[2+namelen] = 0;
2648              cd->names_found++;
2649              goto NUMBERED_GROUP;
2650              }
2651    
2652            if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */
2653              {
2654              int i, namelen;
2655              int type = *ptr++;
2656              const uschar *name = ptr;
2657              uschar *slot = cd->name_table;
2658    
2659              while (*ptr != ')') ptr++;
2660              namelen = ptr - name;
2661    
2662              for (i = 0; i < cd->names_found; i++)
2663                {
2664                if (strncmp(name, slot+2, namelen) == 0) break;
2665                slot += cd->name_entry_size;
2666                }
2667              if (i >= cd->names_found)
2668                {
2669                *errorptr = ERR15;
2670                goto FAILED;
2671                }
2672    
2673              recno = GET2(slot, 0);
2674    
2675              if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */
2676    
2677              /* Back reference */
2678    
2679              previous = code;
2680              *code++ = OP_REF;
2681              PUT2INC(code, 0, recno);
2682              cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2683              if (recno > cd->top_backref) cd->top_backref = recno;
2684              continue;
2685              }
2686    
2687            /* Should never happen */
2688            break;
2689    
2690            case 'R':                 /* Pattern recursion */
2691            ptr++;                    /* Same as (?0)      */
2692            /* Fall through */
2693    
2694            /* Recursion or "subroutine" call */
2695    
2696            case '0': case '1': case '2': case '3': case '4':
2697            case '5': case '6': case '7': case '8': case '9':
2698              {
2699              const uschar *called;
2700              recno = 0;
2701    
2702              while ((cd->ctypes[*ptr] & ctype_digit) != 0)
2703                recno = recno * 10 + *ptr++ - '0';
2704    
2705              /* Come here from code above that handles a named recursion */
2706    
2707              HANDLE_RECURSION:
2708    
2709              previous = code;
2710    
2711              /* Find the bracket that is being referenced. Temporarily end the
2712              regex in case it doesn't exist. */
2713    
2714              *code = OP_END;
2715              called = (recno == 0)?
2716                cd->start_code : find_bracket(cd->start_code, utf8, recno);
2717    
2718              if (called == NULL)
2719                {
2720                *errorptr = ERR15;
2721                goto FAILED;
2722                }
2723    
2724              /* If the subpattern is still open, this is a recursive call. We
2725              check to see if this is a left recursion that could loop for ever,
2726              and diagnose that case. */
2727    
2728              if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
2729                {
2730                *errorptr = ERR40;
2731                goto FAILED;
2732                }
2733    
2734              /* Insert the recursion/subroutine item */
2735    
2736              *code = OP_RECURSE;
2737              PUT(code, 1, called - cd->start_code);
2738              code += 1 + LINK_SIZE;
2739              }
2740            continue;
2741    
2742            /* Character after (? not specially recognized */
2743    
2744          default:                  /* Option setting */          default:                  /* Option setting */
2745          set = unset = 0;          set = unset = 0;
2746          optset = &set;          optset = &set;
# Line 1388  for (;; ptr++) Line 2757  for (;; ptr++)
2757              case 'x': *optset |= PCRE_EXTENDED; break;              case 'x': *optset |= PCRE_EXTENDED; break;
2758              case 'U': *optset |= PCRE_UNGREEDY; break;              case 'U': *optset |= PCRE_UNGREEDY; break;
2759              case 'X': *optset |= PCRE_EXTRA; break;              case 'X': *optset |= PCRE_EXTRA; break;
   
             default:  
             *errorptr = ERR12;  
             goto FAILED;  
2760              }              }
2761            }            }
2762    
# Line 1400  for (;; ptr++) Line 2765  for (;; ptr++)
2765          newoptions = (options | set) & (~unset);          newoptions = (options | set) & (~unset);
2766    
2767          /* If the options ended with ')' this is not the start of a nested          /* If the options ended with ')' this is not the start of a nested
2768          group with option changes, so the options change at this level. At top          group with option changes, so the options change at this level. Compile
2769          level there is nothing else to be done (the options will in fact have          code to change the ims options if this setting actually changes any of
2770          been set from the start of compiling as a result of the first pass) but          them. We also pass the new setting back so that it can be put at the
2771          at an inner level we must compile code to change the ims options if          start of any following branches, and when this group ends (if we are in
2772          necessary, and pass the new setting back so that it can be put at the          a group), a resetting item can be compiled.
2773          start of any following branches, and when this group ends, a resetting  
2774          item can be compiled. */          Note that if this item is right at the start of the pattern, the
2775            options will have been abstracted and made global, so there will be no
2776            change to compile. */
2777    
2778          if (*ptr == ')')          if (*ptr == ')')
2779            {            {
2780            if ((options & PCRE_INGROUP) != 0 &&            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
               (options & PCRE_IMS) != (newoptions & PCRE_IMS))  
2781              {              {
2782              *code++ = OP_OPT;              *code++ = OP_OPT;
2783              *code++ = *optchanged = newoptions & PCRE_IMS;              *code++ = newoptions & PCRE_IMS;
2784              }              }
2785            options = newoptions;  /* Change options at this level */  
2786              /* Change options at this level, and pass them back for use
2787              in subsequent branches. Reset the greedy defaults and the case
2788              value for firstbyte and reqbyte. */
2789    
2790              *optionsptr = options = newoptions;
2791              greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
2792              greedy_non_default = greedy_default ^ 1;
2793              req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2794    
2795            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
2796            continue;              /* It is complete */            continue;              /* It is complete */
2797            }            }
# Line 1431  for (;; ptr++) Line 2806  for (;; ptr++)
2806          }          }
2807        }        }
2808    
2809      /* Else we have a referencing group; adjust the opcode. */      /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
2810        non-capturing and behave like (?:...) brackets */
2811    
2812        else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
2813          {
2814          bravalue = OP_BRA;
2815          }
2816    
2817        /* Else we have a referencing group; adjust the opcode. If the bracket
2818        number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
2819        arrange for the true number to follow later, in an OP_BRANUMBER item. */
2820    
2821      else      else
2822        {        {
2823        if (++(*brackets) > EXTRACT_MAX)        NUMBERED_GROUP:
2824          if (++(*brackets) > EXTRACT_BASIC_MAX)
2825          {          {
2826          *errorptr = ERR13;          bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
2827          goto FAILED;          code[1+LINK_SIZE] = OP_BRANUMBER;
2828            PUT2(code, 2+LINK_SIZE, *brackets);
2829            skipbytes = 3;
2830          }          }
2831        bravalue = OP_BRA + *brackets;        else bravalue = OP_BRA + *brackets;
2832        }        }
2833    
2834      /* Process nested bracketed re. Assertions may not be repeated, but other      /* Process nested bracketed re. Assertions may not be repeated, but other
# Line 1453  for (;; ptr++) Line 2841  for (;; ptr++)
2841      tempcode = code;      tempcode = code;
2842    
2843      if (!compile_regex(      if (!compile_regex(
2844           options | PCRE_INGROUP,       /* Set for all nested groups */           newoptions,                   /* The complete new option state */
2845           ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?           options & PCRE_IMS,           /* The previous ims option state */
2846             newoptions & PCRE_IMS : -1, /* Pass ims options if changed */           brackets,                     /* Extracting bracket count */
          brackets,                     /* Bracket level */  
2847           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
2848           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
2849           errorptr,                     /* Where to put an error message */           errorptr,                     /* Where to put an error message */
2850           (bravalue == OP_ASSERTBACK ||           (bravalue == OP_ASSERTBACK ||
2851            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */            bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
2852           condref,                      /* Condition reference number */           skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
2853             &subfirstbyte,                /* For possible first char */
2854             &subreqbyte,                  /* For possible last char */
2855             bcptr,                        /* Current branch chain */
2856           cd))                          /* Tables block */           cd))                          /* Tables block */
2857        goto FAILED;        goto FAILED;
2858    
# Line 1474  for (;; ptr++) Line 2864  for (;; ptr++)
2864      /* If this is a conditional bracket, check that there are no more than      /* If this is a conditional bracket, check that there are no more than
2865      two branches in the group. */      two branches in the group. */
2866    
2867      if (bravalue == OP_COND)      else if (bravalue == OP_COND)
2868        {        {
       int branchcount = 0;  
2869        uschar *tc = code;        uschar *tc = code;
2870          condcount = 0;
2871    
2872        do {        do {
2873           branchcount++;           condcount++;
2874           tc += (tc[1] << 8) | tc[2];           tc += GET(tc,1);
2875           }           }
2876        while (*tc != OP_KET);        while (*tc != OP_KET);
2877    
2878        if (branchcount > 2)        if (condcount > 2)
2879          {          {
2880          *errorptr = ERR27;          *errorptr = ERR27;
2881          goto FAILED;          goto FAILED;
2882          }          }
2883    
2884          /* If there is just one branch, we must not make use of its firstbyte or
2885          reqbyte, because this is equivalent to an empty second branch. */
2886    
2887          if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
2888          }
2889    
2890        /* Handle updating of the required and first characters. Update for normal
2891        brackets of all kinds, and conditions with two branches (see code above).
2892        If the bracket is followed by a quantifier with zero repeat, we have to
2893        back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
2894        main loop so that they can be accessed for the back off. */
2895    
2896        zeroreqbyte = reqbyte;
2897        zerofirstbyte = firstbyte;
2898        groupsetfirstbyte = FALSE;
2899    
2900        if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
2901          {
2902          /* If we have not yet set a firstbyte in this branch, take it from the
2903          subpattern, remembering that it was set here so that a repeat of more
2904          than one can replicate it as reqbyte if necessary. If the subpattern has
2905          no firstbyte, set "none" for the whole branch. In both cases, a zero
2906          repeat forces firstbyte to "none". */
2907    
2908          if (firstbyte == REQ_UNSET)
2909            {
2910            if (subfirstbyte >= 0)
2911              {
2912              firstbyte = subfirstbyte;
2913              groupsetfirstbyte = TRUE;
2914              }
2915            else firstbyte = REQ_NONE;
2916            zerofirstbyte = REQ_NONE;
2917            }
2918    
2919          /* If firstbyte was previously set, convert the subpattern's firstbyte
2920          into reqbyte if there wasn't one. */
2921    
2922          else if (subfirstbyte >= 0 && subreqbyte < 0) subreqbyte = subfirstbyte;
2923    
2924          /* If the subpattern set a required char (or set a first char that isn't
2925          really the first char - see above), set it. */
2926    
2927          if (subreqbyte >= 0) reqbyte = subreqbyte;
2928        }        }
2929    
2930        /* For a forward assertion, we take the reqbyte, if set. This can be
2931        helpful if the pattern that follows the assertion doesn't set a different
2932        char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
2933        for an assertion, however, because it leads to an incorrect effect for patterns
2934        such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
2935        of a firstbyte. This is overcome by a scan at the end if there's no
2936        firstbyte, looking for an asserted first char. */
2937    
2938        else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
2939    
2940      /* Now update the main code pointer to the end of the group. */      /* Now update the main code pointer to the end of the group. */
2941    
2942      code = tempcode;      code = tempcode;
# Line 1522  for (;; ptr++) Line 2967  for (;; ptr++)
2967    
2968      if (c < 0)      if (c < 0)
2969        {        {
2970          if (-c == ESC_Q)            /* Handle start of quoted string */
2971            {
2972            if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
2973              else inescq = TRUE;
2974            continue;
2975            }
2976    
2977          /* For metasequences that actually match a character, we disable the
2978          setting of a first character if it hasn't already been set. */
2979    
2980          if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
2981            firstbyte = REQ_NONE;
2982    
2983          /* Set values to reset to if this is followed by a zero repeat. */
2984    
2985          zerofirstbyte = firstbyte;
2986          zeroreqbyte = reqbyte;
2987    
2988          /* Back references are handled specially */
2989    
2990        if (-c >= ESC_REF)        if (-c >= ESC_REF)
2991          {          {
2992            int number = -c - ESC_REF;
2993          previous = code;          previous = code;
2994          *code++ = OP_REF;          *code++ = OP_REF;
2995          *code++ = -c - ESC_REF;          PUT2INC(code, 0, number);
2996          }          }
2997        else        else
2998          {          {
# Line 1545  for (;; ptr++) Line 3011  for (;; ptr++)
3011      The first character is guaranteed not to be whitespace or # when the      The first character is guaranteed not to be whitespace or # when the
3012      extended flag is set. */      extended flag is set. */
3013    
3014      NORMAL_CHAR:      NORMAL_CHAR:
3015      default:      default:
3016      previous = code;      previous = code;
3017      *code = OP_CHARS;      *code = OP_CHARS;
3018      code += 2;      code += 2;
3019      length = 0;      length = 0;
3020    
3021        do
3022          {
3023          /* If in \Q...\E, check for the end; if not, we always have a literal */
3024    
3025          if (inescq)
3026            {
3027            if (c == '\\' && ptr[1] == 'E')
3028              {
3029              inescq = FALSE;
3030              ptr++;
3031              }
3032            else
3033              {
3034              *code++ = c;
3035              length++;
3036              }
3037            continue;
3038            }
3039    
3040          /* Skip white space and comments for /x patterns */
3041    
3042          if ((options & PCRE_EXTENDED) != 0)
3043            {
3044            if ((cd->ctypes[c] & ctype_space) != 0) continue;
3045            if (c == '#')
3046              {
3047              /* The space before the ; is to avoid a warning on a silly compiler
3048              on the Macintosh. */
3049              while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3050              if (c == 0) break;
3051              continue;
3052              }
3053            }
3054    
3055          /* Backslash may introduce a data char or a metacharacter. Escaped items
3056          are checked for validity in the pre-compiling pass. Stop the string
3057          before a metaitem. */
3058    
3059          if (c == '\\')
3060            {
3061            tempptr = ptr;
3062            c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
3063            if (c < 0) { ptr = tempptr; break; }
3064    
3065            /* If a character is > 127 in UTF-8 mode, we have to turn it into
3066            two or more characters in the UTF-8 encoding. */
3067    
3068    #ifdef SUPPORT_UTF8
3069            if (utf8 && c > 127)
3070              {
3071              uschar buffer[8];
3072              int len = ord2utf8(c, buffer);
3073              for (c = 0; c < len; c++) *code++ = buffer[c];
3074              length += len;
3075              continue;
3076              }
3077    #endif
3078            }
3079    
3080          /* Ordinary character or single-char escape */
3081    
3082          *code++ = c;
3083          length++;
3084          }
3085    
3086        /* This "while" is the end of the "do" above. */
3087    
3088        while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
3089    
3090        /* Update the first and last requirements. These are always bytes, even in
3091        UTF-8 mode. However, there is a special case to be considered when there
3092        are only one or two characters. Because this gets messy in UTF-8 mode, the
3093        code is kept separate. When we get here "length" contains the number of
3094        bytes. */
3095    
3096    #ifdef SUPPORT_UTF8
3097        if (utf8 && length > 1)
3098          {
3099          uschar *t = previous + 3;                      /* After this code, t */
3100          while (t < code && (*t & 0xc0) == 0x80) t++;   /* follows the 1st char */
3101    
3102          /* Handle the case when there is only one multibyte character. It must
3103          have at least two bytes because of the "length > 1" test above. */
3104    
3105          if (t == code)
3106            {
3107            /* If no previous first byte, set it from this character, but revert to
3108            none on a zero repeat. */
3109    
3110            if (firstbyte == REQ_UNSET)
3111              {
3112              zerofirstbyte = REQ_NONE;
3113              firstbyte = previous[2];
3114              }
3115    
3116            /* Otherwise, leave the first byte value alone, and don't change it on
3117            a zero repeat */
3118    
3119            else zerofirstbyte = firstbyte;
3120    
3121            /* In both cases, a zero repeat resets the previous required byte */
3122    
3123            zeroreqbyte = reqbyte;
3124            }
3125    
3126          /* Handle the case when there is more than one character. These may be
3127          single-byte or multibyte characters */
3128    
3129          else
3130            {
3131            uschar *t = code - 1;               /* After this code, t is at the */
3132            while ((*t & 0xc0) == 0x80) t--;    /* start of the last character */
3133    
3134            /* If no previous first byte, set it from the first character, and
3135            retain it on a zero repeat (of the last character). The required byte
3136            is reset on a zero repeat, either to the byte before the last
3137            character, unless this is the first byte of the string. In that case,
3138            it reverts to its previous value. */
3139    
3140            if (firstbyte == REQ_UNSET)
3141              {
3142              zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3143              zeroreqbyte = (t - 1 == previous + 2)? reqbyte : t[-1] | req_caseopt;
3144              }
3145    
3146            /* If there was a previous first byte, leave it alone, and don't change
3147            it on a zero repeat. The required byte is reset on a zero repeat to the
3148            byte before the last character. */
3149    
3150            else
3151              {
3152              zerofirstbyte = firstbyte;
3153              zeroreqbyte = t[-1] | req_caseopt;
3154              }
3155            }
3156    
3157          /* In all cases (we know length > 1), the new required byte is the last
3158          byte of the string. */
3159    
3160          reqbyte = code[-1] | req_caseopt;
3161          }
3162    
3163        else   /* End of UTF-8 coding */
3164    #endif
3165    
3166        /* This is the code for non-UTF-8 operation, either without UTF-8 support,
3167        or when UTF-8 is not enabled. */
3168    
     do  
3169        {        {
3170        if ((options & PCRE_EXTENDED) != 0)        /* firstbyte was not previously set; take it from this string */
3171    
3172          if (firstbyte == REQ_UNSET)
3173          {          {
3174          if ((cd->ctypes[c] & ctype_space) != 0) continue;          if (length == 1)
         if (c == '#')  
3175            {            {
3176            while ((c = *(++ptr)) != 0 && c != '\n');            zerofirstbyte = REQ_NONE;
3177            if (c == 0) break;            firstbyte = previous[2] | req_caseopt;
3178            continue;            zeroreqbyte = reqbyte;
3179              }
3180            else
3181              {
3182              zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3183              zeroreqbyte = (length > 2)? (code[-2] | req_caseopt) : reqbyte;
3184              reqbyte = code[-1] | req_caseopt;
3185            }            }
3186          }          }
3187    
3188        /* Backslash may introduce a data char or a metacharacter. Escaped items        /* firstbyte was previously set */
       are checked for validity in the pre-compiling pass. Stop the string  
       before a metaitem. */  
3189    
3190        if (c == '\\')        else
3191          {          {
3192          tempptr = ptr;          zerofirstbyte = firstbyte;
3193          c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);          zeroreqbyte = (length == 1)? reqbyte : code[-2] | req_caseopt;
3194          if (c < 0) { ptr = tempptr; break; }          reqbyte = code[-1] | req_caseopt;
3195          }          }
   
       /* Ordinary character or single-char escape */  
   
       *code++ = c;  
       length++;  
3196        }        }
3197    
3198      /* This "while" is the end of the "do" above. */      /* Set the length in the data vector, and advance to the next state. */
   
     while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);  
   
     /* Compute the length and set it in the data vector, and advance to  
     the next state. */  
3199    
3200      previous[1] = length;      previous[1] = length;
3201      if (length < 255) ptr--;      if (length < MAXLIT) ptr--;
3202      break;      break;
3203      }      }
3204    }                   /* end of big loop */    }                   /* end of big loop */
# Line 1620  following branch to ensure they get set Line 3228  following branch to ensure they get set
3228  the new options into every subsequent branch compile.  the new options into every subsequent branch compile.
3229    
3230  Argument:  Argument:
3231    options     the option bits    options        option bits, including any changes for this subpattern
3232    optchanged  new ims options to set as if (?ims) were at the start, or -1    oldims         previous settings of ims option bits
3233                 for no change    brackets       -> int containing the number of extracting brackets used
3234    brackets    -> int containing the number of extracting brackets used    codeptr        -> the address of the current code pointer
3235    codeptr     -> the address of the current code pointer    ptrptr         -> the address of the current pattern pointer
3236    ptrptr      -> the address of the current pattern pointer    errorptr       -> pointer to error message
3237    errorptr    -> pointer to error message    lookbehind     TRUE if this is a lookbehind assertion
3238    lookbehind  TRUE if this is a lookbehind assertion    skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3239    condref     > 0 for OPT_CREF setting at start of conditional group    firstbyteptr   place to put the first required character, or a negative number
3240    cd          points to the data block with tables pointers    reqbyteptr     place to put the last required character, or a negative number
3241      bcptr          pointer to the chain of currently open branches
3242      cd             points to the data block with tables pointers etc.
3243    
3244  Returns:      TRUE on success  Returns:      TRUE on success
3245  */  */
3246    
3247  static BOOL  static BOOL
3248  compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,  compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3249    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,    const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3250    compile_data *cd)    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3251  {  {
3252  const uschar *ptr = *ptrptr;  const uschar *ptr = *ptrptr;
3253  uschar *code = *codeptr;  uschar *code = *codeptr;
3254  uschar *last_branch = code;  uschar *last_branch = code;
3255  uschar *start_bracket = code;  uschar *start_bracket = code;
3256  uschar *reverse_count = NULL;  uschar *reverse_count = NULL;
3257  int oldoptions = options & PCRE_IMS;  int firstbyte, reqbyte;
3258    int branchfirstbyte, branchreqbyte;
3259    branch_chain bc;
3260    
3261  code += 3;  bc.outer = bcptr;
3262    bc.current = code;
3263    
3264  /* At the start of a reference-based conditional group, insert the reference  firstbyte = reqbyte = REQ_UNSET;
 number as an OP_CREF item. */  
3265    
3266  if (condref > 0)  /* Offset is set zero to mark that this bracket is still open */
3267    {  
3268    *code++ = OP_CREF;  PUT(code, 1, 0);
3269    *code++ = condref;  code += 1 + LINK_SIZE + skipbytes;
   }  
3270    
3271  /* Loop for each alternative branch */  /* Loop for each alternative branch */
3272    
3273  for (;;)  for (;;)
3274    {    {
3275    int length;    /* Handle a change of ims options at the start of the branch */
3276    
3277    /* Handle change of options */    if ((options & PCRE_IMS) != oldims)
   
   if (optchanged >= 0)  
3278      {      {
3279      *code++ = OP_OPT;      *code++ = OP_OPT;
3280      *code++ = optchanged;      *code++ = options & PCRE_IMS;
     options = (options & ~PCRE_IMS) | optchanged;  
3281      }      }
3282    
3283    /* Set up dummy OP_REVERSE if lookbehind assertion */    /* Set up dummy OP_REVERSE if lookbehind assertion */
# Line 1678  for (;;) Line 3286  for (;;)
3286      {      {
3287      *code++ = OP_REVERSE;      *code++ = OP_REVERSE;
3288      reverse_count = code;      reverse_count = code;
3289      *code++ = 0;      PUTINC(code, 0, 0);
     *code++ = 0;  
3290      }      }
3291    
3292    /* Now compile the branch */    /* Now compile the branch */
3293    
3294    if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd))    if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
3295            &branchfirstbyte, &branchreqbyte, &bc, cd))
3296      {      {
3297      *ptrptr = ptr;      *ptrptr = ptr;
3298      return FALSE;      return FALSE;
3299      }      }
3300    
3301    /* Fill in the length of the last branch */    /* If this is the first branch, the firstbyte and reqbyte values for the
3302      branch become the values for the regex. */
3303    
3304      if (*last_branch != OP_ALT)
3305        {
3306        firstbyte = branchfirstbyte;
3307        reqbyte = branchreqbyte;
3308        }
3309    
3310      /* If this is not the first branch, the first char and reqbyte have to
3311      match the values from all the previous branches. */
3312    
3313      else
3314        {
3315        /* If we previously had a firstbyte, but it doesn't match the new branch,
3316        we have to abandon the firstbyte for the regex, but if there was previously
3317        no reqbyte, it takes on the value of the old firstbyte. */
3318    
3319        if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3320          {
3321          if (reqbyte < 0) reqbyte = firstbyte;
3322          firstbyte = REQ_NONE;
3323          }
3324    
3325        /* If we (now or from before) have no firstbyte, a firstbyte from the
3326        branch becomes a reqbyte if there isn't a branch reqbyte. */
3327    
3328    length = code - last_branch;      if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3329    last_branch[1] = length >> 8;          branchreqbyte = branchfirstbyte;
3330    last_branch[2] = length & 255;  
3331        /* Now ensure that the reqbytes match */
3332    
3333        if (reqbyte != branchreqbyte) reqbyte = REQ_NONE;
3334        }
3335    
3336    /* If lookbehind, check that this branch matches a fixed-length string,    /* If lookbehind, check that this branch matches a fixed-length string,
3337    and put the length into the OP_REVERSE item. Temporarily mark the end of    and put the length into the OP_REVERSE item. Temporarily mark the end of
# Line 1702  for (;;) Line 3339  for (;;)
3339    
3340    if (lookbehind)    if (lookbehind)
3341      {      {
3342        int length;
3343      *code = OP_END;      *code = OP_END;
3344      length = find_fixedlength(last_branch);      length = find_fixedlength(last_branch, options);
3345      DPRINTF(("fixed length = %d\n", length));      DPRINTF(("fixed length = %d\n", length));
3346      if (length < 0)      if (length < 0)
3347        {        {
3348        *errorptr = ERR25;        *errorptr = (length == -2)? ERR36 : ERR25;
3349        *ptrptr = ptr;        *ptrptr = ptr;
3350        return FALSE;        return FALSE;
3351        }        }
3352      reverse_count[0] = (length >> 8);      PUT(reverse_count, 0, length);
     reverse_count[1] = length & 255;  
3353      }      }
3354    
3355    /* Reached end of expression, either ')' or end of pattern. Insert a    /* Reached end of expression, either ')' or end of pattern. Go back through
3356    terminating ket and the length of the whole bracketed item, and return,    the alternative branches and reverse the chain of offsets, with the field in
3357    leaving the pointer at the terminating char. If any of the ims options    the BRA item now becoming an offset to the first alternative. If there are
3358    were changed inside the group, compile a resetting op-code following. */    no alternatives, it points to the end of the group. The length in the
3359      terminating ket is always the length of the whole bracketed item. If any of
3360      the ims options were changed inside the group, compile a resetting op-code
3361      following, except at the very end of the pattern. Return leaving the pointer
3362      at the terminating char. */
3363    
3364    if (*ptr != '|')    if (*ptr != '|')
3365      {      {
3366      length = code - start_bracket;      int length = code - last_branch;
3367      *code++ = OP_KET;      do
     *code++ = length >> 8;  
     *code++ = length & 255;  
     if (optchanged >= 0)  
3368        {        {
3369        *code++ = OP_OPT;        int prev_length = GET(last_branch, 1);
3370        *code++ = oldoptions;        PUT(last_branch, 1, length);
3371          length = prev_length;
3372          last_branch -= length;
3373        }        }
3374      *codeptr = code;      while (length > 0);
     *ptrptr = ptr;  
     return TRUE;  
     }  
   
   /* Another branch follows; insert an "or" node and advance the pointer. */  
   
   *code = OP_ALT;  
   last_branch = code;  
   code += 3;  
   ptr++;  
   }  
 /* Control never reaches here */  
 }  
3375    
3376        /* Fill in the ket */
3377    
3378        *code = OP_KET;
3379        PUT(code, 1, code - start_bracket);
3380        code += 1 + LINK_SIZE;
3381    
3382        /* Resetting option if needed */
3383    
3384  /*************************************************      if ((options & PCRE_IMS) != oldims && *ptr == ')')
 *      Find first significant op code            *  
 *************************************************/  
   
 /* This is called by several functions that scan a compiled expression looking  
 for a fixed first character, or an anchoring op code etc. It skips over things  
 that do not influence this. For one application, a change of caseless option is  
 important.  
   
 Arguments:  
   code       pointer to the start of the group  
   options    pointer to external options  
   optbit     the option bit whose changing is significant, or  
              zero if none are  
   optstop    TRUE to return on option change, otherwise change the options  
                value and continue  
   
 Returns:     pointer to the first significant opcode  
 */  
   
 static const uschar*  
 first_significant_code(const uschar *code, int *options, int optbit,  
   BOOL optstop)  
 {  
 for (;;)  
   {  
   switch ((int)*code)  
     {  
     case OP_OPT:  
     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))  
3385        {        {
3386        if (optstop) return code;        *code++ = OP_OPT;
3387        *options = (int)code[1];        *code++ = oldims;
3388        }        }
     code += 2;  
     break;  
   
     case OP_CREF:  
     code += 2;  
     break;  
3389    
3390      case OP_ASSERT_NOT:      /* Set values to pass back */
     case OP_ASSERTBACK:  
     case OP_ASSERTBACK_NOT:  
     do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);  
     code += 3;  
     break;  
3391    
3392      default:      *codeptr = code;
3393      return code;      *ptrptr = ptr;
3394        *firstbyteptr = firstbyte;
3395        *reqbyteptr = reqbyte;
3396        return TRUE;
3397      }      }
3398    
3399      /* Another branch follows; insert an "or" node. Its length field points back
3400      to the previous branch while the bracket remains open. At the end the chain
3401      is reversed. It's done like this so that the start of the bracket has a
3402      zero offset until it is closed, making it possible to detect recursion. */
3403    
3404      *code = OP_ALT;
3405      PUT(code, 1, code - last_branch);
3406      bc.current = last_branch = code;
3407      code += 1 + LINK_SIZE;
3408      ptr++;
3409    }    }
3410  /* Control never reaches here */  /* Control never reaches here */
3411  }  }
# Line 1817  all of whose alternatives start with OP_ Line 3423  all of whose alternatives start with OP_
3423  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
3424  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
3425    
3426  A branch is also implicitly anchored if it starts with .* because that will try  We can also consider a regex to be anchored if OP_SOM starts all its branches.
3427  the rest of the pattern at all possible matching points, so there is no point  This is the code for \G, which means "match at start of match position, taking
3428  trying them again.  into account the match offset".
3429    
3430    A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3431    because that will try the rest of the pattern at all possible matching points,
3432    so there is no point trying again.... er ....
3433    
3434    .... except when the .* appears inside capturing parentheses, and there is a
3435    subsequent back reference to those parentheses. We haven't enough information
3436    to catch that case precisely.
3437    
3438    At first, the best we could do was to detect when .* was in capturing brackets
3439    and the highest back reference was greater than or equal to that level.
3440    However, by keeping a bitmap of the first 31 back references, we can catch some
3441    of the more common cases more precisely.
3442    
3443  Arguments:  Arguments:
3444    code       points to start of expression (the bracket)    code           points to start of expression (the bracket)
3445    options    points to the options setting    options        points to the options setting
3446      bracket_map    a bitmap of which brackets we are inside while testing; this
3447                      handles up to substring 31; after that we just have to take
3448                      the less precise approach
3449      backref_map    the back reference bitmap
3450    
3451  Returns:     TRUE or FALSE  Returns:     TRUE or FALSE
3452  */  */
3453    
3454  static BOOL  static BOOL
3455  is_anchored(register const uschar *code, int *options)  is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3456      unsigned int backref_map)
3457  {  {
3458  do {  do {
3459     const uschar *scode = first_significant_code(code + 3, options,     const uschar *scode =
3460       PCRE_MULTILINE, FALSE);       first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
3461     register int op = *scode;     register int op = *scode;
3462     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)  
3463       { if (!is_anchored(scode, options)) return FALSE; }     /* Capturing brackets */
3464     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)  
3465       { if (scode[1] != OP_ANY) return FALSE; }     if (op > OP_BRA)
3466     else if (op != OP_SOD &&       {
3467         int new_map;
3468         op -= OP_BRA;
3469         if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3470         new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3471         if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3472         }
3473    
3474       /* Other brackets */
3475    
3476       else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3477         {
3478         if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3479         }
3480    
3481       /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3482       are or may be referenced. */
3483    
3484       else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3485                (*options & PCRE_DOTALL) != 0)
3486         {
3487         if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3488         }
3489    
3490       /* Check for explicit anchoring */
3491    
3492       else if (op != OP_SOD && op != OP_SOM &&
3493             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3494       return FALSE;       return FALSE;
3495     code += (code[1] << 8) + code[2];     code += GET(code, 1);
3496     }     }
3497  while (*code == OP_ALT);  while (*code == OP_ALT);   /* Loop for each alternative */
3498  return TRUE;  return TRUE;
3499  }  }
3500    
3501    
3502    
3503  /*************************************************  /*************************************************
3504  *     Check for start with \n line expression    *  *         Check for starting with ^ or .*        *
3505  *************************************************/  *************************************************/
3506    
3507  /* This is called for multiline expressions to try to find out if every branch  /* This is called to find out if every branch starts with ^ or .* so that
3508  starts with ^ so that "first char" processing can be done to speed things up.  "first char" processing can be done to speed things up in multiline
3509    matching and for non-DOTALL patterns that start with .* (which must start at
3510    the beginning or after \n). As in the case of is_anchored() (see above), we
3511    have to take account of back references to capturing brackets that contain .*
3512    because in that case we can't make the assumption.
3513    
3514  Argument:  points to start of expression (the bracket)  Arguments:
3515  Returns:   TRUE or FALSE    code           points to start of expression (the bracket)
3516      bracket_map    a bitmap of which brackets we are inside while testing; this
3517                      handles up to substring 31; after that we just have to take
3518                      the less precise approach
3519      backref_map    the back reference bitmap
3520    
3521    Returns:         TRUE or FALSE
3522  */  */
3523    
3524  static BOOL  static BOOL
3525  is_startline(const uschar *code)  is_startline(const uschar *code, unsigned int bracket_map,
3526      unsigned int backref_map)
3527  {  {
3528  do {  do {
3529     const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);     const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
3530     register int op = *scode;     register int op = *scode;
3531     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)  
3532       { if (!is_startline(scode)) return FALSE; }     /* Capturing brackets */
3533    
3534       if (op > OP_BRA)
3535         {
3536         int new_map;
3537         op -= OP_BRA;
3538         if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3539         new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3540         if (!is_startline(scode, new_map, backref_map)) return FALSE;
3541         }
3542    
3543       /* Other brackets */
3544    
3545       else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3546         { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3547    
3548       /* .* means "start at start or after \n" if it isn't in brackets that
3549       may be referenced. */
3550    
3551       else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3552         {
3553         if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3554         }
3555    
3556       /* Check for explicit circumflex */
3557    
3558     else if (op != OP_CIRC) return FALSE;     else if (op != OP_CIRC) return FALSE;
3559     code += (code[1] << 8) + code[2];     code += GET(code, 1);
3560     }     }
3561  while (*code == OP_ALT);  while (*code == OP_ALT);  /* Loop for each alternative */
3562  return TRUE;  return TRUE;
3563  }  }
3564    
3565    
3566    
3567  /*************************************************  /*************************************************
3568  *          Check for fixed first char            *  *       Check for asserted fixed first char      *
3569  *************************************************/  *************************************************/
3570    
3571  /* Try to find out if there is a fixed first character. This is called for  /* During compilation, the "first char" settings from forward assertions are
3572  unanchored expressions, as it speeds up their processing quite considerably.  discarded, because they can cause conflicts with actual literals that follow.
3573  Consider each alternative branch. If they all start with the same char, or with  However, if we end up without a first char setting for an unanchored pattern,
3574  a bracket all of whose alternatives start with the same char (recurse ad lib),  it is worth scanning the regex to see if there is an initial asserted first
3575  then we return that char, otherwise -1.  char. If all branches start with the same asserted char, or with a bracket all
3576    of whose alternatives start with the same asserted char (recurse ad lib), then
3577    we return that char, otherwise -1.
3578    
3579  Arguments:  Arguments:
3580    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
3581    options    pointer to the options (used to check casing changes)    options    pointer to the options (used to check casing changes)
3582      inassert   TRUE if in an assertion
3583    
3584  Returns:     -1 or the fixed first char  Returns:     -1 or the fixed first char
3585  */  */
3586    
3587  static int  static int
3588  find_firstchar(const uschar *code, int *options)  find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3589  {  {
3590  register int c = -1;  register int c = -1;
3591  do {  do {
3592     int d;     int d;
3593     const uschar *scode = first_significant_code(code + 3, options,     const uschar *scode =
3594       PCRE_CASELESS, TRUE);       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
3595     register int op = *scode;     register int op = *scode;
3596    
3597     if (op >= OP_BRA) op = OP_BRA;     if (op >= OP_BRA) op = OP_BRA;
# Line 1916  do { Line 3605  do {
3605       case OP_ASSERT:       case OP_ASSERT:
3606       case OP_ONCE:       case OP_ONCE:
3607       case OP_COND:       case OP_COND:
3608       if ((d = find_firstchar(scode, options)) < 0) return -1;       if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3609           return -1;
3610       if (c < 0) c = d; else if (c != d) return -1;       if (c < 0) c = d; else if (c != d) return -1;
3611       break;       break;
3612    
# Line 1928  do { Line 3618  do {
3618    
3619       case OP_PLUS:       case OP_PLUS:
3620       case OP_MINPLUS:       case OP_MINPLUS:
3621       if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;       if (!inassert) return -1;
3622         if (c < 0)
3623           {
3624           c = scode[1];
3625           if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3626           }
3627         else if (c != scode[1]) return -1;
3628       break;       break;
3629       }       }
3630    
3631     code += (code[1] << 8) + code[2];     code += GET(code, 1);
3632     }     }
3633  while (*code == OP_ALT);  while (*code == OP_ALT);
3634  return c;  return c;
# Line 1941  return c; Line 3637  return c;
3637    
3638    
3639    
   
3640  /*************************************************  /*************************************************
3641  *        Compile a Regular Expression            *  *        Compile a Regular Expression            *
3642  *************************************************/  *************************************************/
# Line 1965  pcre_compile(const char *pattern, int op Line 3660  pcre_compile(const char *pattern, int op
3660    int *erroroffset, const unsigned char *tables)    int *erroroffset, const unsigned char *tables)
3661  {  {
3662  real_pcre *re;  real_pcre *re;
3663  int length = 3;      /* For initial BRA plus length */  int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
3664  int runlength;  int runlength;
3665  int c, size;  int c, firstbyte, reqbyte;
3666  int bracount = 0;  int bracount = 0;
 int top_backref = 0;  
3667  int branch_extra = 0;  int branch_extra = 0;
3668  int branch_newextra;  int branch_newextra;
3669    int item_count = -1;
3670    int name_count = 0;
3671    int max_name_size = 0;
3672    #ifdef SUPPORT_UTF8
3673    int lastcharlength = 0;
3674    BOOL utf8;
3675    BOOL class_utf8;
3676    #endif
3677    BOOL inescq = FALSE;
3678  unsigned int brastackptr = 0;  unsigned int brastackptr = 0;
3679    size_t size;
3680  uschar *code;  uschar *code;
3681    const uschar *codestart;
3682  const uschar *ptr;  const uschar *ptr;
3683  compile_data compile_block;  compile_data compile_block;
3684  int brastack[BRASTACK_SIZE];  int brastack[BRASTACK_SIZE];
3685  uschar bralenstack[BRASTACK_SIZE];  uschar bralenstack[BRASTACK_SIZE];
3686    
 #ifdef DEBUG  
 uschar *code_base, *code_end;  
 #endif  
   
3687  /* We can't pass back an error message if errorptr is NULL; I guess the best we  /* We can't pass back an error message if errorptr is NULL; I guess the best we
3688  can do is just return NULL. */  can do is just return NULL. */
3689    
# Line 1998  if (erroroffset == NULL) Line 3699  if (erroroffset == NULL)
3699    }    }
3700  *erroroffset = 0;  *erroroffset = 0;
3701    
3702    /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3703    
3704    #ifdef SUPPORT_UTF8
3705    utf8 = (options & PCRE_UTF8) != 0;
3706    #else
3707    if ((options & PCRE_UTF8) != 0)
3708      {
3709      *errorptr = ERR32;
3710      return NULL;
3711      }
3712    #endif
3713    
3714  if ((options & ~PUBLIC_OPTIONS) != 0)  if ((options & ~PUBLIC_OPTIONS) != 0)
3715    {    {
3716    *errorptr = ERR17;    *errorptr = ERR17;
# Line 2012  compile_block.fcc = tables + fcc_offset; Line 3725  compile_block.fcc = tables + fcc_offset;
3725  compile_block.cbits = tables + cbits_offset;  compile_block.cbits = tables + cbits_offset;
3726  compile_block.ctypes = tables + ctypes_offset;  compile_block.ctypes = tables + ctypes_offset;
3727    
3728    /* Maximum back reference and backref bitmap. This is updated for numeric
3729    references during the first pass, but for named references during the actual
3730    compile pass. The bitmap records up to 31 back references to help in deciding
3731    whether (.*) can be treated as anchored or not. */
3732    
3733    compile_block.top_backref = 0;
3734    compile_block.backref_map = 0;
3735    
3736  /* Reflect pattern for debugging output */  /* Reflect pattern for debugging output */
3737    
3738  DPRINTF(("------------------------------------------------------------------\n"));  DPRINTF(("------------------------------------------------------------------\n"));
# Line 2020  DPRINTF(("%s\n", pattern)); Line 3741  DPRINTF(("%s\n", pattern));
3741  /* The first thing to do is to make a pass over the pattern to compute the  /* The first thing to do is to make a pass over the pattern to compute the
3742  amount of store required to hold the compiled code. This does not have to be  amount of store required to hold the compiled code. This does not have to be
3743  perfect as long as errors are overestimates. At the same time we can detect any  perfect as long as errors are overestimates. At the same time we can detect any
3744  internal flag settings. Make an attempt to correct for any counted white space  flag settings right at the start, and extract them. Make an attempt to correct
3745  if an "extended" flag setting appears late in the pattern. We can't be so  for any counted white space if an "extended" flag setting appears late in the
3746  clever for #-comments. */  pattern. We can't be so clever for #-comments. */
3747    
3748  ptr = (const uschar *)(pattern - 1);  ptr = (const uschar *)(pattern - 1);
3749  while ((c = *(++ptr)) != 0)  while ((c = *(++ptr)) != 0)
3750    {    {
3751    int min, max;    int min, max;
3752    int class_charcount;    int class_optcount;
3753      int bracket_length;
3754      int duplength;
3755    
3756      /* If we are inside a \Q...\E sequence, all chars are literal */
3757    
3758      if (inescq) goto NORMAL_CHAR;
3759    
3760      /* Otherwise, first check for ignored whitespace and comments */
3761    
3762    if ((options & PCRE_EXTENDED) != 0)    if ((options & PCRE_EXTENDED) != 0)
3763      {      {
3764      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;      if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3765      if (c == '#')      if (c == '#')
3766        {        {
3767        while ((c = *(++ptr)) != 0 && c != '\n');        /* The space before the ; is to avoid a warning on a silly compiler
3768          on the Macintosh. */
3769          while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3770          if (c == 0) break;
3771        continue;        continue;
3772        }        }
3773      }      }
3774    
3775      item_count++;    /* Is zero for the first non-comment item */
3776    
3777    switch(c)    switch(c)
3778      {      {
3779      /* A backslashed item may be an escaped "normal" character or a      /* A backslashed item may be an escaped "normal" character or a
# Line 2059  while ((c = *(++ptr)) != 0) Line 3793  while ((c = *(++ptr)) != 0)
3793          goto NORMAL_CHAR;          goto NORMAL_CHAR;
3794          }          }
3795        }        }
3796    
3797        /* If \Q, enter "literal" mode */
3798    
3799        if (-c == ESC_Q)
3800          {
3801          inescq = TRUE;
3802          continue;
3803          }
3804    
3805        /* Other escapes need one byte, and are of length one for repeats */
3806    
3807      length++;      length++;
3808    #ifdef SUPPORT_UTF8
3809        lastcharlength = 1;
3810    #endif
3811    
3812      /* A back reference needs an additional char, plus either one or 5      /* A back reference needs an additional 2 bytes, plus either one or 5
3813      bytes for a repeat. We also need to keep the value of the highest      bytes for a repeat. We also need to keep the value of the highest
3814      back reference. */      back reference. */
3815    
3816      if (c <= -ESC_REF)      if (c <= -ESC_REF)
3817        {        {
3818        int refnum = -c - ESC_REF;        int refnum = -c - ESC_REF;
3819        if (refnum > top_backref) top_backref = refnum;        compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
3820        length++;   /* For single back reference */        if (refnum > compile_block.top_backref)
3821            compile_block.top_backref = refnum;
3822          length += 2;   /* For single back reference */
3823        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))        if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
3824          {          {
3825          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);          ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
# Line 2083  while ((c = *(++ptr)) != 0) Line 3833  while ((c = *(++ptr)) != 0)
3833        }        }
3834      continue;      continue;
3835    
3836      case '^':      case '^':     /* Single-byte metacharacters */
3837      case '.':      case '.':
3838      case '$':      case '$':
     case '*':     /* These repeats won't be after brackets; */  
     case '+':     /* those are handled separately */  
     case '?':  
3839      length++;      length++;
3840    #ifdef SUPPORT_UTF8
3841        lastcharlength = 1;
3842    #endif
3843      continue;      continue;
3844    
3845      /* This covers the cases of repeats after a single char, metachar, class,      case '*':            /* These repeats won't be after brackets; */
3846      or back reference. */      case '+':            /* those are handled separately */
3847        case '?':
3848        length++;
3849        goto POSESSIVE;      /* A few lines below */
3850    
3851        /* This covers the cases of braced repeats after a single char, metachar,
3852        class, or back reference. */
3853    
3854      case '{':      case '{':
3855      if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;      if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
3856      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);      ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
3857      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3858    
3859        /* These special cases just insert one extra opcode */
3860    
3861      if ((min == 0 && (max == 1 || max == -1)) ||      if ((min == 0 && (max == 1 || max == -1)) ||
3862        (min == 1 && max == -1))        (min == 1 && max == -1))
3863          length++;          length++;
3864    
3865        /* These cases might insert additional copies of a preceding character. */
3866    
3867      else      else
3868        {        {
3869        length--;   /* Uncount the original char or metachar */  #ifdef SUPPORT_UTF8
3870        if (min == 1) length++; else if (min > 0) length += 4;        /* In UTF-8 mode, we should find the length in lastcharlength */
3871        if (max > 0) length += 4; else length += 2;        if (utf8)
3872            {
3873            if (min != 1)
3874              {
3875              length -= lastcharlength;   /* Uncount the original char or metachar */
3876              if (min > 0) length += 3 + lastcharlength;
3877              }
3878            length += lastcharlength + ((max > 0)? 3 : 1);
3879            }
3880          else
3881    #endif
3882    
3883          /* Not UTF-8 mode: all characters are one byte */
3884            {
3885            if (min != 1)
3886              {
3887              length--;   /* Uncount the original char or metachar */
3888              if (min > 0) length += 4;
3889              }
3890    
3891            length += (max > 0)? 4 : 2;
3892            }
3893          }
3894    
3895        if (ptr[1] == '?') ptr++;      /* Needs no extra length */
3896    
3897        POSESSIVE:                     /* Test for possessive quantifier */
3898        if (ptr[1] == '+')
3899          {
3900          ptr++;
3901          length += 2 + 2*LINK_SIZE;   /* Allow for atomic brackets */
3902        }        }
     if (ptr[1] == '?') ptr++;  
3903      continue;      continue;
3904    
3905      /* An alternation contains an offset to the next branch or ket. If any ims      /* An alternation contains an offset to the next branch or ket. If any ims
3906      options changed in the previous branch(es), and/or if we are in a      options changed in the previous branch(es), and/or if we are in a
3907      lookbehind assertion, extra space will be needed at the start of the      lookbehind assertion, extra space will be needed at the start of the
3908      branch. This is handled by branch_extra. */      branch. This is handled by branch_extra. */
3909    
3910        case '|':
3911        length += 1 + LINK_SIZE + branch_extra;
3912        continue;
3913    
3914        /* A character class uses 33 characters provided that all the character
3915        values are less than 256. Otherwise, it uses a bit map for low valued
3916        characters, and individual items for others. Don't worry about character
3917        types that aren't allowed in classes - they'll get picked up during the
3918        compile. A character class that contains only one single-byte character
3919        uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
3920        where we can. (In UTF-8 mode we can do this only for chars < 128.) */
3921    
3922        case '[':
3923        class_optcount = 0;
3924    
3925    #ifdef SUPPORT_UTF8
3926        class_utf8 = FALSE;
3927    #endif
3928    
3929        if (*(++ptr) == '^') ptr++;
3930    
3931        /* Written as a "do" so that an initial ']' is taken as data */
3932    
3933        if (*ptr != 0) do
3934          {
3935          /* Inside \Q...\E everything is literal except \E */
3936    
3937          if (inescq)
3938            {
3939            if (*ptr != '\\' || ptr[1] != 'E') goto NON_SPECIAL_CHARACTER;
3940            inescq = FALSE;
3941            ptr += 1;
3942            continue;
3943            }