/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 406 by ph10, Mon Mar 23 12:05:43 2009 UTC | revision 894 by ph10, Fri Jan 20 11:58:19 2012 UTC
# Line 7  and semantics are as close as possible t Line 7  and semantics are as close as possible t
7  below for why this module is different).  below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2009 University of Cambridge             Copyright (c) 1997-2012 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 45  FSM). This is NOT Perl-compatible, but Line 45  FSM). This is NOT Perl-compatible, but
45  applications. */  applications. */
46    
47    
48    /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49    the performance of his patterns greatly. I could not use it as it stood, as it
50    was not thread safe, and made assumptions about pattern sizes. Also, it caused
51    test 7 to loop, and test 9 to crash with a segfault.
52    
53    The issue is the check for duplicate states, which is done by a simple linear
54    search up the state list. (Grep for "duplicate" below to find the code.) For
55    many patterns, there will never be many states active at one time, so a simple
56    linear search is fine. In patterns that have many active states, it might be a
57    bottleneck. The suggested code used an indexing scheme to remember which states
58    had previously been used for each character, and avoided the linear search when
59    it knew there was no chance of a duplicate. This was implemented when adding
60    states to the state lists.
61    
62    I wrote some thread-safe, not-limited code to try something similar at the time
63    of checking for duplicates (instead of when adding states), using index vectors
64    on the stack. It did give a 13% improvement with one specially constructed
65    pattern for certain subject strings, but on other strings and on many of the
66    simpler patterns in the test suite it did worse. The major problem, I think,
67    was the extra time to initialize the index. This had to be done for each call
68    of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69    only once - I suspect this was the cause of the problems with the tests.)
70    
71    Overall, I concluded that the gains in some cases did not outweigh the losses
72    in others, so I abandoned this code. */
73    
74    
75    
76  #ifdef HAVE_CONFIG_H  #ifdef HAVE_CONFIG_H
77  #include "config.h"  #include "config.h"
78  #endif  #endif
# Line 78  never stored, so we push them well clear Line 106  never stored, so we push them well clear
106    
107    
108  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
109  character that is to be tested in some way. This makes is possible to  character that is to be tested in some way. This makes it possible to
110  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
111  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112  small value. ***NOTE*** If the start of this table is modified, the two tables  small value. Non-zero values in the table are the offsets from the opcode where
113  that follow must also be modified. */  the character is to be found. ***NOTE*** If the start of this table is
114    modified, the three tables that follow must also be modified. */
115    
116  static const uschar coptable[] = {  static const pcre_uint8 coptable[] = {
117    0,                             /* End                                    */    0,                             /* End                                    */
118    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
119    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
120    0, 0, 0,                       /* Any, AllAny, Anybyte                   */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
121    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */    0, 0,                          /* \P, \p                                 */
122    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
123    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0,                             /* \X                                     */
124      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
125    1,                             /* Char                                   */    1,                             /* Char                                   */
126    1,                             /* Charnc                                 */    1,                             /* Chari                                  */
127    1,                             /* not                                    */    1,                             /* not                                    */
128      1,                             /* noti                                   */
129    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
130    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
131    3, 3, 3,                       /* upto, minupto, exact                   */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
132    1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */    1+IMM2_SIZE,                   /* exact                                  */
133      1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
134      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
135      1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
136      1+IMM2_SIZE,                   /* exact I                                */
137      1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
138    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
139    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
140    3, 3, 3,                       /* NOT upto, minupto, exact               */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
141    1, 1, 1, 3,                    /* NOT *+, ++, ?+, updo+                  */    1+IMM2_SIZE,                   /* NOT exact                              */
142      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
143      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
144      1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
145      1+IMM2_SIZE,                   /* NOT exact I                            */
146      1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
147    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
148    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
149    3, 3, 3,                       /* Type upto, minupto, exact              */    1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
150    1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */    1+IMM2_SIZE,                   /* Type exact                             */
151      1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
152    /* Character class & ref repeats                                         */    /* Character class & ref repeats                                         */
153    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */    0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
154    0, 0,                          /* CRRANGE, CRMINRANGE                    */    0, 0,                          /* CRRANGE, CRMINRANGE                    */
# Line 114  static const uschar coptable[] = { Line 156  static const uschar coptable[] = {
156    0,                             /* NCLASS                                 */    0,                             /* NCLASS                                 */
157    0,                             /* XCLASS - variable length               */    0,                             /* XCLASS - variable length               */
158    0,                             /* REF                                    */    0,                             /* REF                                    */
159      0,                             /* REFI                                   */
160    0,                             /* RECURSE                                */    0,                             /* RECURSE                                */
161    0,                             /* CALLOUT                                */    0,                             /* CALLOUT                                */
162    0,                             /* Alt                                    */    0,                             /* Alt                                    */
163    0,                             /* Ket                                    */    0,                             /* Ket                                    */
164    0,                             /* KetRmax                                */    0,                             /* KetRmax                                */
165    0,                             /* KetRmin                                */    0,                             /* KetRmin                                */
166      0,                             /* KetRpos                                */
167      0,                             /* Reverse                                */
168    0,                             /* Assert                                 */    0,                             /* Assert                                 */
169    0,                             /* Assert not                             */    0,                             /* Assert not                             */
170    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
171    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
172      0, 0,                          /* ONCE, ONCE_NC                          */
173      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
174      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
175      0, 0,                          /* CREF, NCREF                            */
176      0, 0,                          /* RREF, NRREF                            */
177      0,                             /* DEF                                    */
178      0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
179      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
180      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
181      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
182      0, 0                           /* CLOSE, SKIPZERO  */
183    };
184    
185    /* This table identifies those opcodes that inspect a character. It is used to
186    remember the fact that a character could have been inspected when the end of
187    the subject is reached. ***NOTE*** If the start of this table is modified, the
188    two tables that follow must also be modified. */
189    
190    static const pcre_uint8 poptable[] = {
191      0,                             /* End                                    */
192      0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
193      1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
194      1, 1, 1,                       /* Any, AllAny, Anybyte                   */
195      1, 1,                          /* \P, \p                                 */
196      1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
197      1,                             /* \X                                     */
198      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
199      1,                             /* Char                                   */
200      1,                             /* Chari                                  */
201      1,                             /* not                                    */
202      1,                             /* noti                                   */
203      /* Positive single-char repeats                                          */
204      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
205      1, 1, 1,                       /* upto, minupto, exact                   */
206      1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
207      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
208      1, 1, 1,                       /* upto I, minupto I, exact I             */
209      1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
210      /* Negative single-char repeats - only for chars < 256                   */
211      1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
212      1, 1, 1,                       /* NOT upto, minupto, exact               */
213      1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
214      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
215      1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
216      1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
217      /* Positive type repeats                                                 */
218      1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
219      1, 1, 1,                       /* Type upto, minupto, exact              */
220      1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
221      /* Character class & ref repeats                                         */
222      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
223      1, 1,                          /* CRRANGE, CRMINRANGE                    */
224      1,                             /* CLASS                                  */
225      1,                             /* NCLASS                                 */
226      1,                             /* XCLASS - variable length               */
227      0,                             /* REF                                    */
228      0,                             /* REFI                                   */
229      0,                             /* RECURSE                                */
230      0,                             /* CALLOUT                                */
231      0,                             /* Alt                                    */
232      0,                             /* Ket                                    */
233      0,                             /* KetRmax                                */
234      0,                             /* KetRmin                                */
235      0,                             /* KetRpos                                */
236    0,                             /* Reverse                                */    0,                             /* Reverse                                */
237    0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */    0,                             /* Assert                                 */
238    0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */    0,                             /* Assert not                             */
239    0,                             /* CREF                                   */    0,                             /* Assert behind                          */
240    0,                             /* RREF                                   */    0,                             /* Assert behind not                      */
241      0, 0,                          /* ONCE, ONCE_NC                          */
242      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
243      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
244      0, 0,                          /* CREF, NCREF                            */
245      0, 0,                          /* RREF, NRREF                            */
246    0,                             /* DEF                                    */    0,                             /* DEF                                    */
247    0, 0,                          /* BRAZERO, BRAMINZERO                    */    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
248    0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */    0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
249    0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */    0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
250      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
251      0, 0                           /* CLOSE, SKIPZERO                        */
252  };  };
253    
254  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
255  and \w */  and \w */
256    
257  static const uschar toptable1[] = {  static const pcre_uint8 toptable1[] = {
258    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
259    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
260    ctype_space, ctype_space,    ctype_space, ctype_space,
# Line 146  static const uschar toptable1[] = { Line 262  static const uschar toptable1[] = {
262    0, 0                            /* OP_ANY, OP_ALLANY */    0, 0                            /* OP_ANY, OP_ALLANY */
263  };  };
264    
265  static const uschar toptable2[] = {  static const pcre_uint8 toptable2[] = {
266    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
267    ctype_digit, 0,    ctype_digit, 0,
268    ctype_space, 0,    ctype_space, 0,
# Line 163  these structures in, is a vector of ints Line 279  these structures in, is a vector of ints
279  typedef struct stateblock {  typedef struct stateblock {
280    int offset;                     /* Offset to opcode */    int offset;                     /* Offset to opcode */
281    int count;                      /* Count for repeats */    int count;                      /* Count for repeats */
   int ims;                        /* ims flag bits */  
282    int data;                       /* Some use extra data */    int data;                       /* Some use extra data */
283  } stateblock;  } stateblock;
284    
285  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
286    
287    
288  #ifdef DEBUG  #ifdef PCRE_DEBUG
289  /*************************************************  /*************************************************
290  *             Print character string             *  *             Print character string             *
291  *************************************************/  *************************************************/
# Line 186  Returns: nothing Line 301  Returns: nothing
301  */  */
302    
303  static void  static void
304  pchars(unsigned char *p, int length, FILE *f)  pchars(const pcre_uchar *p, int length, FILE *f)
305  {  {
306  int c;  int c;
307  while (length-- > 0)  while (length-- > 0)
# Line 219  Arguments: Line 334  Arguments:
334    offsetcount       size of same    offsetcount       size of same
335    workspace         vector of workspace    workspace         vector of workspace
336    wscount           size of same    wscount           size of same
   ims               the current ims flags  
337    rlevel            function call recursion level    rlevel            function call recursion level
   recursing         regex recursive call level  
338    
339  Returns:            > 0 => number of match offset pairs placed in offsets  Returns:            > 0 => number of match offset pairs placed in offsets
340                      = 0 => offsets overflowed; longest matches are present                      = 0 => offsets overflowed; longest matches are present
# Line 236  for the current character, one for the f Line 349  for the current character, one for the f
349      { \      { \
350      next_active_state->offset = (x); \      next_active_state->offset = (x); \
351      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
352      next_active_state++; \      next_active_state++; \
353      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
354      } \      } \
# Line 247  for the current character, one for the f Line 359  for the current character, one for the f
359      { \      { \
360      next_active_state->offset = (x); \      next_active_state->offset = (x); \
361      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
362      next_active_state->data   = (z); \      next_active_state->data   = (z); \
363      next_active_state++; \      next_active_state++; \
364      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 259  for the current character, one for the f Line 370  for the current character, one for the f
370      { \      { \
371      next_new_state->offset = (x); \      next_new_state->offset = (x); \
372      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
373      next_new_state++; \      next_new_state++; \
374      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
375      } \      } \
# Line 270  for the current character, one for the f Line 380  for the current character, one for the f
380      { \      { \
381      next_new_state->offset = (x); \      next_new_state->offset = (x); \
382      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
383      next_new_state->data   = (z); \      next_new_state->data   = (z); \
384      next_new_state++; \      next_new_state++; \
385      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 282  for the current character, one for the f Line 391  for the current character, one for the f
391  static int  static int
392  internal_dfa_exec(  internal_dfa_exec(
393    dfa_match_data *md,    dfa_match_data *md,
394    const uschar *this_start_code,    const pcre_uchar *this_start_code,
395    const uschar *current_subject,    const pcre_uchar *current_subject,
396    int start_offset,    int start_offset,
397    int *offsets,    int *offsets,
398    int offsetcount,    int offsetcount,
399    int *workspace,    int *workspace,
400    int wscount,    int wscount,
401    int ims,    int  rlevel)
   int  rlevel,  
   int  recursing)  
402  {  {
403  stateblock *active_states, *new_states, *temp_states;  stateblock *active_states, *new_states, *temp_states;
404  stateblock *next_active_state, *next_new_state;  stateblock *next_active_state, *next_new_state;
405    
406  const uschar *ctypes, *lcc, *fcc;  const pcre_uint8 *ctypes, *lcc, *fcc;
407  const uschar *ptr;  const pcre_uchar *ptr;
408  const uschar *end_code, *first_op;  const pcre_uchar *end_code, *first_op;
409    
410    dfa_recursion_info new_recursive;
411    
412  int active_count, new_count, match_count;  int active_count, new_count, match_count;
413    
414  /* Some fields in the md block are frequently referenced, so we load them into  /* Some fields in the md block are frequently referenced, so we load them into
415  independent variables in the hope that this will perform better. */  independent variables in the hope that this will perform better. */
416    
417  const uschar *start_subject = md->start_subject;  const pcre_uchar *start_subject = md->start_subject;
418  const uschar *end_subject = md->end_subject;  const pcre_uchar *end_subject = md->end_subject;
419  const uschar *start_code = md->start_code;  const pcre_uchar *start_code = md->start_code;
420    
421  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
422  BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;  BOOL utf = (md->poptions & PCRE_UTF8) != 0;
423  #else  #else
424  BOOL utf8 = FALSE;  BOOL utf = FALSE;
425  #endif  #endif
426    
427  rlevel++;  rlevel++;
# Line 323  wscount = (wscount - (wscount % (INTS_PE Line 432  wscount = (wscount - (wscount % (INTS_PE
432            (2 * INTS_PER_STATEBLOCK);            (2 * INTS_PER_STATEBLOCK);
433    
434  DPRINTF(("\n%.*s---------------------\n"  DPRINTF(("\n%.*s---------------------\n"
435    "%.*sCall to internal_dfa_exec f=%d r=%d\n",    "%.*sCall to internal_dfa_exec f=%d\n",
436    rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));    rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
437    
438  ctypes = md->tables + ctypes_offset;  ctypes = md->tables + ctypes_offset;
439  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
# Line 337  next_new_state = new_states = active_sta Line 446  next_new_state = new_states = active_sta
446  new_count = 0;  new_count = 0;
447    
448  first_op = this_start_code + 1 + LINK_SIZE +  first_op = this_start_code + 1 + LINK_SIZE +
449    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
450        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
451        ? IMM2_SIZE:0);
452    
453  /* The first thing in any (sub) pattern is a bracket of some sort. Push all  /* The first thing in any (sub) pattern is a bracket of some sort. Push all
454  the alternative states onto the list, and find out where the end is. This  the alternative states onto the list, and find out where the end is. This
# Line 365  if (*first_op == OP_REVERSE) Line 476  if (*first_op == OP_REVERSE)
476    /* If we can't go back the amount required for the longest lookbehind    /* If we can't go back the amount required for the longest lookbehind
477    pattern, go back as far as we can; some alternatives may still be viable. */    pattern, go back as far as we can; some alternatives may still be viable. */
478    
479  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
480    /* In character mode we have to step back character by character */    /* In character mode we have to step back character by character */
481    
482    if (utf8)    if (utf)
483      {      {
484      for (gone_back = 0; gone_back < max_back; gone_back++)      for (gone_back = 0; gone_back < max_back; gone_back++)
485        {        {
486        if (current_subject <= start_subject) break;        if (current_subject <= start_subject) break;
487        current_subject--;        current_subject--;
488        while (current_subject > start_subject &&        ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
              (*current_subject & 0xc0) == 0x80)  
         current_subject--;  
489        }        }
490      }      }
491    else    else
# Line 386  if (*first_op == OP_REVERSE) Line 495  if (*first_op == OP_REVERSE)
495    
496      {      {
497      gone_back = (current_subject - max_back < start_subject)?      gone_back = (current_subject - max_back < start_subject)?
498        current_subject - start_subject : max_back;        (int)(current_subject - start_subject) : max_back;
499      current_subject -= gone_back;      current_subject -= gone_back;
500      }      }
501    
502      /* Save the earliest consulted character */
503    
504      if (current_subject < md->start_used_ptr)
505        md->start_used_ptr = current_subject;
506    
507    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
508    
509    end_code = this_start_code;    end_code = this_start_code;
# Line 398  if (*first_op == OP_REVERSE) Line 512  if (*first_op == OP_REVERSE)
512      int back = GET(end_code, 2+LINK_SIZE);      int back = GET(end_code, 2+LINK_SIZE);
513      if (back <= gone_back)      if (back <= gone_back)
514        {        {
515        int bstate = end_code - start_code + 2 + 2*LINK_SIZE;        int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
516        ADD_NEW_DATA(-bstate, 0, gone_back - back);        ADD_NEW_DATA(-bstate, 0, gone_back - back);
517        }        }
518      end_code += GET(end_code, 1);      end_code += GET(end_code, 1);
# Line 431  else Line 545  else
545    else    else
546      {      {
547      int length = 1 + LINK_SIZE +      int length = 1 + LINK_SIZE +
548        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
549            *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
550            ? IMM2_SIZE:0);
551      do      do
552        {        {
553        ADD_NEW(end_code - start_code + length, 0);        ADD_NEW((int)(end_code - start_code + length), 0);
554        end_code += GET(end_code, 1);        end_code += GET(end_code, 1);
555        length = 1 + LINK_SIZE;        length = 1 + LINK_SIZE;
556        }        }
# Line 444  else Line 560  else
560    
561  workspace[0] = 0;    /* Bit indicating which vector is current */  workspace[0] = 0;    /* Bit indicating which vector is current */
562    
563  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));  DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
564    
565  /* Loop for scanning the subject */  /* Loop for scanning the subject */
566    
# Line 454  for (;;) Line 570  for (;;)
570    int i, j;    int i, j;
571    int clen, dlen;    int clen, dlen;
572    unsigned int c, d;    unsigned int c, d;
573      int forced_fail = 0;
574      BOOL could_continue = FALSE;
575    
576    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
577    new state list. */    new state list. */
# Line 467  for (;;) Line 585  for (;;)
585    workspace[0] ^= 1;              /* Remember for the restarting feature */    workspace[0] ^= 1;              /* Remember for the restarting feature */
586    workspace[1] = active_count;    workspace[1] = active_count;
587    
588  #ifdef DEBUG  #ifdef PCRE_DEBUG
589    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
590    pchars((uschar *)ptr, strlen((char *)ptr), stdout);    pchars(ptr, STRLEN_UC(ptr), stdout);
591    printf("\"\n");    printf("\"\n");
592    
593    printf("%.*sActive states: ", rlevel*2-2, SP);    printf("%.*sActive states: ", rlevel*2-2, SP);
# Line 490  for (;;) Line 608  for (;;)
608    if (ptr < end_subject)    if (ptr < end_subject)
609      {      {
610      clen = 1;        /* Number of bytes in the character */      clen = 1;        /* Number of bytes in the character */
611  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
612      if (utf8) { GETCHARLEN(c, ptr, clen); } else      if (utf) { GETCHARLEN(c, ptr, clen); } else
613  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
614      c = *ptr;      c = *ptr;
615      }      }
616    else    else
# Line 509  for (;;) Line 627  for (;;)
627    for (i = 0; i < active_count; i++)    for (i = 0; i < active_count; i++)
628      {      {
629      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
630      const uschar *code;      BOOL caseless = FALSE;
631        const pcre_uchar *code;
632      int state_offset = current_state->offset;      int state_offset = current_state->offset;
633      int count, codevalue, rrc;      int count, codevalue, rrc;
634    
635  #ifdef DEBUG  #ifdef PCRE_DEBUG
636      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
637      if (clen == 0) printf("EOL\n");      if (clen == 0) printf("EOL\n");
638        else if (c > 32 && c < 127) printf("'%c'\n", c);        else if (c > 32 && c < 127) printf("'%c'\n", c);
639          else printf("0x%02x\n", c);          else printf("0x%02x\n", c);
640  #endif  #endif
641    
     /* This variable is referred to implicity in the ADD_xxx macros. */  
   
     ims = current_state->ims;  
   
642      /* A negative offset is a special case meaning "hold off going to this      /* A negative offset is a special case meaning "hold off going to this
643      (negated) state until the number of characters in the data field have      (negated) state until the number of characters in the data field have
644      been skipped". */      been skipped". */
# Line 543  for (;;) Line 658  for (;;)
658          }          }
659        }        }
660    
661      /* Check for a duplicate state with the same count, and skip if found. */      /* Check for a duplicate state with the same count, and skip if found.
662        See the note at the head of this module about the possibility of improving
663        performance here. */
664    
665      for (j = 0; j < i; j++)      for (j = 0; j < i; j++)
666        {        {
# Line 560  for (;;) Line 677  for (;;)
677      code = start_code + state_offset;      code = start_code + state_offset;
678      codevalue = *code;      codevalue = *code;
679    
680        /* If this opcode inspects a character, but we are at the end of the
681        subject, remember the fact for use when testing for a partial match. */
682    
683        if (clen == 0 && poptable[codevalue] != 0)
684          could_continue = TRUE;
685    
686      /* If this opcode is followed by an inline character, load it. It is      /* If this opcode is followed by an inline character, load it. It is
687      tempting to test for the presence of a subject character here, but that      tempting to test for the presence of a subject character here, but that
688      is wrong, because sometimes zero repetitions of the subject are      is wrong, because sometimes zero repetitions of the subject are
# Line 574  for (;;) Line 697  for (;;)
697      if (coptable[codevalue] > 0)      if (coptable[codevalue] > 0)
698        {        {
699        dlen = 1;        dlen = 1;
700  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
701        if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else        if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
702  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
703        d = code[coptable[codevalue]];        d = code[coptable[codevalue]];
704        if (codevalue >= OP_TYPESTAR)        if (codevalue >= OP_TYPESTAR)
705          {          {
# Line 606  for (;;) Line 729  for (;;)
729    
730      switch (codevalue)      switch (codevalue)
731        {        {
732    /* ========================================================================== */
733          /* These cases are never obeyed. This is a fudge that causes a compile-
734          time error if the vectors coptable or poptable, which are indexed by
735          opcode, are not the correct length. It seems to be the only way to do
736          such a check at compile time, as the sizeof() operator does not work
737          in the C preprocessor. */
738    
739          case OP_TABLE_LENGTH:
740          case OP_TABLE_LENGTH +
741            ((sizeof(coptable) == OP_TABLE_LENGTH) &&
742             (sizeof(poptable) == OP_TABLE_LENGTH)):
743          break;
744    
745  /* ========================================================================== */  /* ========================================================================== */
746        /* Reached a closing bracket. If not at the end of the pattern, carry        /* Reached a closing bracket. If not at the end of the pattern, carry
747        on with the next opcode. Otherwise, unless we have an empty string and        on with the next opcode. For repeating opcodes, also add the repeat
748        PCRE_NOTEMPTY is set, save the match data, shifting up all previous        state. Note that KETRPOS will always be encountered at the end of the
749          subpattern, because the possessive subpattern repeats are always handled
750          using recursive calls. Thus, it never adds any new states.
751    
752          At the end of the (sub)pattern, unless we have an empty string and
753          PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
754          start of the subject, save the match data, shifting up all previous
755        matches so we always have the longest first. */        matches so we always have the longest first. */
756    
757        case OP_KET:        case OP_KET:
758        case OP_KETRMIN:        case OP_KETRMIN:
759        case OP_KETRMAX:        case OP_KETRMAX:
760          case OP_KETRPOS:
761        if (code != end_code)        if (code != end_code)
762          {          {
763          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
# Line 624  for (;;) Line 766  for (;;)
766            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
767            }            }
768          }          }
769        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
770          {          {
771          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          if (ptr > current_subject ||
772            else if (match_count > 0 && ++match_count * 2 >= offsetcount)              ((md->moptions & PCRE_NOTEMPTY) == 0 &&
773              match_count = 0;                ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
774          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;                  current_subject > start_subject + md->start_offset)))
775          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));            {
776          if (offsetcount >= 2)            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
777            {              else if (match_count > 0 && ++match_count * 2 > offsetcount)
778            offsets[0] = current_subject - start_subject;                match_count = 0;
779            offsets[1] = ptr - start_subject;            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
780            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
781              offsets[1] - offsets[0], current_subject));            if (offsetcount >= 2)
782            }              {
783          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              offsets[0] = (int)(current_subject - start_subject);
784            {              offsets[1] = (int)(ptr - start_subject);
785            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
786              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,                offsets[1] - offsets[0], current_subject));
787              match_count, rlevel*2-2, SP));              }
788            return match_count;            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
789                {
790                DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
791                  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
792                  match_count, rlevel*2-2, SP));
793                return match_count;
794                }
795            }            }
796          }          }
797        break;        break;
# Line 655  for (;;) Line 803  for (;;)
803        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
804        case OP_ALT:        case OP_ALT:
805        do { code += GET(code, 1); } while (*code == OP_ALT);        do { code += GET(code, 1); } while (*code == OP_ALT);
806        ADD_ACTIVE(code - start_code, 0);        ADD_ACTIVE((int)(code - start_code), 0);
807        break;        break;
808    
809        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 663  for (;;) Line 811  for (;;)
811        case OP_SBRA:        case OP_SBRA:
812        do        do
813          {          {
814          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
815          code += GET(code, 1);          code += GET(code, 1);
816          }          }
817        while (*code == OP_ALT);        while (*code == OP_ALT);
# Line 672  for (;;) Line 820  for (;;)
820        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
821        case OP_CBRA:        case OP_CBRA:
822        case OP_SCBRA:        case OP_SCBRA:
823        ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
824        code += GET(code, 1);        code += GET(code, 1);
825        while (*code == OP_ALT)        while (*code == OP_ALT)
826          {          {
827          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
828          code += GET(code, 1);          code += GET(code, 1);
829          }          }
830        break;        break;
# Line 687  for (;;) Line 835  for (;;)
835        ADD_ACTIVE(state_offset + 1, 0);        ADD_ACTIVE(state_offset + 1, 0);
836        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
837        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
838        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
839        break;        break;
840    
841        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
842        case OP_SKIPZERO:        case OP_SKIPZERO:
843        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
844        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
845        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
846        break;        break;
847    
848        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
849        case OP_CIRC:        case OP_CIRC:
850        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
           ((ims & PCRE_MULTILINE) != 0 &&  
             ptr != end_subject &&  
             WAS_NEWLINE(ptr)))  
851          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
852        break;        break;
853    
854        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
855        case OP_EOD:        case OP_CIRCM:
856        if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
857              (ptr != end_subject && WAS_NEWLINE(ptr)))
858            { ADD_ACTIVE(state_offset + 1, 0); }
859        break;        break;
860    
861        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
862        case OP_OPT:        case OP_EOD:
863        ims = code[1];        if (ptr >= end_subject)
864        ADD_ACTIVE(state_offset + 2, 0);          {
865            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
866              could_continue = TRUE;
867            else { ADD_ACTIVE(state_offset + 1, 0); }
868            }
869        break;        break;
870    
871        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 748  for (;;) Line 899  for (;;)
899    
900        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
901        case OP_EODN:        case OP_EODN:
902        if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
903            could_continue = TRUE;
904          else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
905          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
906        break;        break;
907    
# Line 756  for (;;) Line 909  for (;;)
909        case OP_DOLL:        case OP_DOLL:
910        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
911          {          {
912          if (clen == 0 ||          if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
913              could_continue = TRUE;
914            else if (clen == 0 ||
915              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&              ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
916                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)                 (ptr == end_subject - md->nllen)
917              ))              ))
918            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
919          }          }
920        else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))        break;
921    
922          /*-----------------------------------------------------------------*/
923          case OP_DOLLM:
924          if ((md->moptions & PCRE_NOTEOL) == 0)
925            {
926            if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
927              could_continue = TRUE;
928            else if (clen == 0 ||
929                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
930              { ADD_ACTIVE(state_offset + 1, 0); }
931            }
932          else if (IS_NEWLINE(ptr))
933          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
934        break;        break;
935    
# Line 793  for (;;) Line 960  for (;;)
960    
961          if (ptr > start_subject)          if (ptr > start_subject)
962            {            {
963            const uschar *temp = ptr - 1;            const pcre_uchar *temp = ptr - 1;
964  #ifdef SUPPORT_UTF8            if (temp < md->start_used_ptr) md->start_used_ptr = temp;
965            if (utf8) BACKCHAR(temp);  #ifdef SUPPORT_UTF
966              if (utf) { BACKCHAR(temp); }
967  #endif  #endif
968            GETCHARTEST(d, temp);            GETCHARTEST(d, temp);
969    #ifdef SUPPORT_UCP
970              if ((md->poptions & PCRE_UCP) != 0)
971                {
972                if (d == '_') left_word = TRUE; else
973                  {
974                  int cat = UCD_CATEGORY(d);
975                  left_word = (cat == ucp_L || cat == ucp_N);
976                  }
977                }
978              else
979    #endif
980            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
981            }            }
982          else left_word = 0;          else left_word = FALSE;
983    
984          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
985            else right_word = 0;            {
986    #ifdef SUPPORT_UCP
987              if ((md->poptions & PCRE_UCP) != 0)
988                {
989                if (c == '_') right_word = TRUE; else
990                  {
991                  int cat = UCD_CATEGORY(c);
992                  right_word = (cat == ucp_L || cat == ucp_N);
993                  }
994                }
995              else
996    #endif
997              right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
998              }
999            else right_word = FALSE;
1000    
1001          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1002            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 830  for (;;) Line 1023  for (;;)
1023            break;            break;
1024    
1025            case PT_LAMP:            case PT_LAMP:
1026            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1027                   prop->chartype == ucp_Lt;
1028            break;            break;
1029    
1030            case PT_GC:            case PT_GC:
1031            OK = _pcre_ucp_gentype[prop->chartype] == code[2];            OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1032            break;            break;
1033    
1034            case PT_PC:            case PT_PC:
# Line 845  for (;;) Line 1039  for (;;)
1039            OK = prop->script == code[2];            OK = prop->script == code[2];
1040            break;            break;
1041    
1042              /* These are specials for combination cases. */
1043    
1044              case PT_ALNUM:
1045              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1046                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1047              break;
1048    
1049              case PT_SPACE:    /* Perl space */
1050              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1051                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1052              break;
1053    
1054              case PT_PXSPACE:  /* POSIX space */
1055              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1056                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1057                   c == CHAR_FF || c == CHAR_CR;
1058              break;
1059    
1060              case PT_WORD:
1061              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1062                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1063                   c == CHAR_UNDERSCORE;
1064              break;
1065    
1066            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1067    
1068            default:            default:
# Line 943  for (;;) Line 1161  for (;;)
1161                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1162            {            {
1163            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1164              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1165            else            else
1166              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1167            }            }
# Line 954  for (;;) Line 1172  for (;;)
1172        case OP_TYPEUPTO:        case OP_TYPEUPTO:
1173        case OP_TYPEMINUPTO:        case OP_TYPEMINUPTO:
1174        case OP_TYPEPOSUPTO:        case OP_TYPEPOSUPTO:
1175        ADD_ACTIVE(state_offset + 4, 0);        ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1176        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1177        if (clen > 0)        if (clen > 0)
1178          {          {
# Line 969  for (;;) Line 1187  for (;;)
1187              next_active_state--;              next_active_state--;
1188              }              }
1189            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1190              { ADD_NEW(state_offset + 4, 0); }              { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1191            else            else
1192              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1193            }            }
# Line 999  for (;;) Line 1217  for (;;)
1217            break;            break;
1218    
1219            case PT_LAMP:            case PT_LAMP:
1220            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1221                prop->chartype == ucp_Lt;
1222            break;            break;
1223    
1224            case PT_GC:            case PT_GC:
1225            OK = _pcre_ucp_gentype[prop->chartype] == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1226            break;            break;
1227    
1228            case PT_PC:            case PT_PC:
# Line 1014  for (;;) Line 1233  for (;;)
1233            OK = prop->script == code[3];            OK = prop->script == code[3];
1234            break;            break;
1235    
1236              /* These are specials for combination cases. */
1237    
1238              case PT_ALNUM:
1239              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1240                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1241              break;
1242    
1243              case PT_SPACE:    /* Perl space */
1244              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1245                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1246              break;
1247    
1248              case PT_PXSPACE:  /* POSIX space */
1249              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1250                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1251                   c == CHAR_FF || c == CHAR_CR;
1252              break;
1253    
1254              case PT_WORD:
1255              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1256                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1257                   c == CHAR_UNDERSCORE;
1258              break;
1259    
1260            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1261    
1262            default:            default:
# Line 1042  for (;;) Line 1285  for (;;)
1285        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1286        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1287          {          {
1288          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1289          int ncount = 0;          int ncount = 0;
1290          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)          if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1291            {            {
# Line 1221  for (;;) Line 1464  for (;;)
1464            break;            break;
1465    
1466            case PT_LAMP:            case PT_LAMP:
1467            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1468                prop->chartype == ucp_Lt;
1469            break;            break;
1470    
1471            case PT_GC:            case PT_GC:
1472            OK = _pcre_ucp_gentype[prop->chartype] == code[3];            OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1473            break;            break;
1474    
1475            case PT_PC:            case PT_PC:
# Line 1236  for (;;) Line 1480  for (;;)
1480            OK = prop->script == code[3];            OK = prop->script == code[3];
1481            break;            break;
1482    
1483              /* These are specials for combination cases. */
1484    
1485              case PT_ALNUM:
1486              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1487                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1488              break;
1489    
1490              case PT_SPACE:    /* Perl space */
1491              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1492                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1493              break;
1494    
1495              case PT_PXSPACE:  /* POSIX space */
1496              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1497                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1498                   c == CHAR_FF || c == CHAR_CR;
1499              break;
1500    
1501              case PT_WORD:
1502              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1503                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1504                   c == CHAR_UNDERSCORE;
1505              break;
1506    
1507            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
1508    
1509            default:            default:
# Line 1273  for (;;) Line 1541  for (;;)
1541        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1542        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1543          {          {
1544          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1545          int ncount = 0;          int ncount = 0;
1546          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1547              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)              codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
# Line 1455  for (;;) Line 1723  for (;;)
1723        case OP_PROP_EXTRA + OP_TYPEMINUPTO:        case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1724        case OP_PROP_EXTRA + OP_TYPEPOSUPTO:        case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1725        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1726          { ADD_ACTIVE(state_offset + 6, 0); }          { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1727        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1728        if (clen > 0)        if (clen > 0)
1729          {          {
1730          BOOL OK;          BOOL OK;
1731          const ucd_record * prop = GET_UCD(c);          const ucd_record * prop = GET_UCD(c);
1732          switch(code[4])          switch(code[1 + IMM2_SIZE + 1])
1733            {            {
1734            case PT_ANY:            case PT_ANY:
1735            OK = TRUE;            OK = TRUE;
1736            break;            break;
1737    
1738            case PT_LAMP:            case PT_LAMP:
1739            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1740                prop->chartype == ucp_Lt;
1741            break;            break;
1742    
1743            case PT_GC:            case PT_GC:
1744            OK = _pcre_ucp_gentype[prop->chartype] == code[5];            OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1745            break;            break;
1746    
1747            case PT_PC:            case PT_PC:
1748            OK = prop->chartype == code[5];            OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1749            break;            break;
1750    
1751            case PT_SC:            case PT_SC:
1752            OK = prop->script == code[5];            OK = prop->script == code[1 + IMM2_SIZE + 2];
1753              break;
1754    
1755              /* These are specials for combination cases. */
1756    
1757              case PT_ALNUM:
1758              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1759                   PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1760              break;
1761    
1762              case PT_SPACE:    /* Perl space */
1763              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1764                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1765              break;
1766    
1767              case PT_PXSPACE:  /* POSIX space */
1768              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1769                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1770                   c == CHAR_FF || c == CHAR_CR;
1771              break;
1772    
1773              case PT_WORD:
1774              OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1775                   PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1776                   c == CHAR_UNDERSCORE;
1777            break;            break;
1778    
1779            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1498  for (;;) Line 1791  for (;;)
1791              next_active_state--;              next_active_state--;
1792              }              }
1793            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1794              { ADD_NEW(state_offset + 6, 0); }              { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1795            else            else
1796              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
1797            }            }
# Line 1511  for (;;) Line 1804  for (;;)
1804        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1805        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:        case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1806        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1807          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1808        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1809        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1810          {          {
1811          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
1812          int ncount = 0;          int ncount = 0;
1813          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)          if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1814            {            {
# Line 1532  for (;;) Line 1825  for (;;)
1825            nptr += ndlen;            nptr += ndlen;
1826            }            }
1827          if (++count >= GET2(code, 1))          if (++count >= GET2(code, 1))
1828            { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1829          else          else
1830            { ADD_NEW_DATA(-state_offset, count, ncount); }            { ADD_NEW_DATA(-state_offset, count, ncount); }
1831          }          }
# Line 1545  for (;;) Line 1838  for (;;)
1838        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:        case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1839        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:        case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1840        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1841          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1842        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1843        if (clen > 0)        if (clen > 0)
1844          {          {
# Line 1572  for (;;) Line 1865  for (;;)
1865              next_active_state--;              next_active_state--;
1866              }              }
1867            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1868              { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1869            else            else
1870              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, ncount); }
1871            break;            break;
# Line 1589  for (;;) Line 1882  for (;;)
1882        case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:        case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1883        case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:        case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1884        if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1885          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1886        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1887        if (clen > 0)        if (clen > 0)
1888          {          {
# Line 1618  for (;;) Line 1911  for (;;)
1911              next_active_state--;              next_active_state--;
1912              }              }
1913            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1914              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1915            else            else
1916              { ADD_NEW_DATA(-state_offset, count, 0); }              { ADD_NEW_DATA(-state_offset, count, 0); }
1917            }            }
# Line 1631  for (;;) Line 1924  for (;;)
1924        case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:        case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1925        case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:        case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1926        if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1927          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1928        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1929        if (clen > 0)        if (clen > 0)
1930          {          {
# Line 1673  for (;;) Line 1966  for (;;)
1966              next_active_state--;              next_active_state--;
1967              }              }
1968            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
1969              { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }              { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1970            else            else
1971              { ADD_NEW_DATA(-state_offset, count, 0); }              { ADD_NEW_DATA(-state_offset, count, 0); }
1972            }            }
# Line 1692  for (;;) Line 1985  for (;;)
1985        break;        break;
1986    
1987        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1988        case OP_CHARNC:        case OP_CHARI:
1989        if (clen == 0) break;        if (clen == 0) break;
1990    
1991  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
1992        if (utf8)        if (utf)
1993          {          {
1994          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else          if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1995            {            {
1996            unsigned int othercase;            unsigned int othercase;
1997            if (c < 128) othercase = fcc[c]; else            if (c < 128)
1998                othercase = fcc[c];
1999            /* If we have Unicode property support, we can use it to test the            else
2000            other case of the character. */              /* If we have Unicode property support, we can use it to test the
2001                other case of the character. */
2002  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2003            othercase = UCD_OTHERCASE(c);              othercase = UCD_OTHERCASE(c);
2004  #else  #else
2005            othercase = NOTACHAR;              othercase = NOTACHAR;
2006  #endif  #endif
2007    
2008            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }            if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2009            }            }
2010          }          }
2011        else        else
2012  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2013          /* Not UTF mode */
       /* Non-UTF-8 mode */  
2014          {          {
2015          if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }          if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2016              { ADD_NEW(state_offset + 2, 0); }
2017          }          }
2018        break;        break;
2019    
# Line 1734  for (;;) Line 2027  for (;;)
2027        case OP_EXTUNI:        case OP_EXTUNI:
2028        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2029          {          {
2030          const uschar *nptr = ptr + clen;          const pcre_uchar *nptr = ptr + clen;
2031          int ncount = 0;          int ncount = 0;
2032          while (nptr < end_subject)          while (nptr < end_subject)
2033            {            {
# Line 1878  for (;;) Line 2171  for (;;)
2171        break;        break;
2172    
2173        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2174        /* Match a negated single character. This is only used for one-byte        /* Match a negated single character casefully. This is only used for
2175        characters, that is, we know that d < 256. The character we are        one-byte characters, that is, we know that d < 256. The character we are
2176        checking (c) can be multibyte. */        checking (c) can be multibyte. */
2177    
2178        case OP_NOT:        case OP_NOT:
2179        if (clen > 0)        if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2180          {        break;
2181          unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;  
2182          if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }        /*-----------------------------------------------------------------*/
2183          }        /* Match a negated single character caselessly. This is only used for
2184          one-byte characters, that is, we know that d < 256. The character we are
2185          checking (c) can be multibyte. */
2186    
2187          case OP_NOTI:
2188          if (clen > 0 && c != d && c != fcc[d])
2189            { ADD_NEW(state_offset + dlen + 1, 0); }
2190        break;        break;
2191    
2192        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2193          case OP_PLUSI:
2194          case OP_MINPLUSI:
2195          case OP_POSPLUSI:
2196          case OP_NOTPLUSI:
2197          case OP_NOTMINPLUSI:
2198          case OP_NOTPOSPLUSI:
2199          caseless = TRUE;
2200          codevalue -= OP_STARI - OP_STAR;
2201    
2202          /* Fall through */
2203        case OP_PLUS:        case OP_PLUS:
2204        case OP_MINPLUS:        case OP_MINPLUS:
2205        case OP_POSPLUS:        case OP_POSPLUS:
# Line 1902  for (;;) Line 2211  for (;;)
2211        if (clen > 0)        if (clen > 0)
2212          {          {
2213          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2214          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2215            {            {
2216  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2217            if (utf8 && d >= 128)            if (utf && d >= 128)
2218              {              {
2219  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2220              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2221  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2222              }              }
2223            else            else
2224  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2225            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2226            }            }
2227          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2228            {            {
# Line 1930  for (;;) Line 2239  for (;;)
2239        break;        break;
2240    
2241        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2242          case OP_QUERYI:
2243          case OP_MINQUERYI:
2244          case OP_POSQUERYI:
2245          case OP_NOTQUERYI:
2246          case OP_NOTMINQUERYI:
2247          case OP_NOTPOSQUERYI:
2248          caseless = TRUE;
2249          codevalue -= OP_STARI - OP_STAR;
2250          /* Fall through */
2251        case OP_QUERY:        case OP_QUERY:
2252        case OP_MINQUERY:        case OP_MINQUERY:
2253        case OP_POSQUERY:        case OP_POSQUERY:
# Line 1940  for (;;) Line 2258  for (;;)
2258        if (clen > 0)        if (clen > 0)
2259          {          {
2260          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2261          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2262            {            {
2263  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2264            if (utf8 && d >= 128)            if (utf && d >= 128)
2265              {              {
2266  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2267              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2268  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2269              }              }
2270            else            else
2271  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2272            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2273            }            }
2274          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2275            {            {
# Line 1966  for (;;) Line 2284  for (;;)
2284        break;        break;
2285    
2286        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2287          case OP_STARI:
2288          case OP_MINSTARI:
2289          case OP_POSSTARI:
2290          case OP_NOTSTARI:
2291          case OP_NOTMINSTARI:
2292          case OP_NOTPOSSTARI:
2293          caseless = TRUE;
2294          codevalue -= OP_STARI - OP_STAR;
2295          /* Fall through */
2296        case OP_STAR:        case OP_STAR:
2297        case OP_MINSTAR:        case OP_MINSTAR:
2298        case OP_POSSTAR:        case OP_POSSTAR:
# Line 1976  for (;;) Line 2303  for (;;)
2303        if (clen > 0)        if (clen > 0)
2304          {          {
2305          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2306          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2307            {            {
2308  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2309            if (utf8 && d >= 128)            if (utf && d >= 128)
2310              {              {
2311  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2312              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2313  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2314              }              }
2315            else            else
2316  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2317            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2318            }            }
2319          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2320            {            {
# Line 2002  for (;;) Line 2329  for (;;)
2329        break;        break;
2330    
2331        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2332          case OP_EXACTI:
2333          case OP_NOTEXACTI:
2334          caseless = TRUE;
2335          codevalue -= OP_STARI - OP_STAR;
2336          /* Fall through */
2337        case OP_EXACT:        case OP_EXACT:
2338        case OP_NOTEXACT:        case OP_NOTEXACT:
2339        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2340        if (clen > 0)        if (clen > 0)
2341          {          {
2342          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2343          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2344            {            {
2345  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2346            if (utf8 && d >= 128)            if (utf && d >= 128)
2347              {              {
2348  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2349              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2350  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2351              }              }
2352            else            else
2353  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2354            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2355            }            }
2356          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2357            {            {
2358            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2359              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2360            else            else
2361              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2362            }            }
# Line 2032  for (;;) Line 2364  for (;;)
2364        break;        break;
2365    
2366        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2367          case OP_UPTOI:
2368          case OP_MINUPTOI:
2369          case OP_POSUPTOI:
2370          case OP_NOTUPTOI:
2371          case OP_NOTMINUPTOI:
2372          case OP_NOTPOSUPTOI:
2373          caseless = TRUE;
2374          codevalue -= OP_STARI - OP_STAR;
2375          /* Fall through */
2376        case OP_UPTO:        case OP_UPTO:
2377        case OP_MINUPTO:        case OP_MINUPTO:
2378        case OP_POSUPTO:        case OP_POSUPTO:
2379        case OP_NOTUPTO:        case OP_NOTUPTO:
2380        case OP_NOTMINUPTO:        case OP_NOTMINUPTO:
2381        case OP_NOTPOSUPTO:        case OP_NOTPOSUPTO:
2382        ADD_ACTIVE(state_offset + dlen + 3, 0);        ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2383        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2384        if (clen > 0)        if (clen > 0)
2385          {          {
2386          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2387          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2388            {            {
2389  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
2390            if (utf8 && d >= 128)            if (utf && d >= 128)
2391              {              {
2392  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2393              otherd = UCD_OTHERCASE(d);              otherd = UCD_OTHERCASE(d);
2394  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2395              }              }
2396            else            else
2397  #endif  /* SUPPORT_UTF8 */  #endif  /* SUPPORT_UTF */
2398            otherd = fcc[d];            otherd = TABLE_GET(d, fcc, d);
2399            }            }
2400          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))          if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2401            {            {
# Line 2064  for (;;) Line 2405  for (;;)
2405              next_active_state--;              next_active_state--;
2406              }              }
2407            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
2408              { ADD_NEW(state_offset + dlen + 3, 0); }              { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2409            else            else
2410              { ADD_NEW(state_offset, count); }              { ADD_NEW(state_offset, count); }
2411            }            }
# Line 2081  for (;;) Line 2422  for (;;)
2422          {          {
2423          BOOL isinclass = FALSE;          BOOL isinclass = FALSE;
2424          int next_state_offset;          int next_state_offset;
2425          const uschar *ecode;          const pcre_uchar *ecode;
2426    
2427          /* For a simple class, there is always just a 32-byte table, and we          /* For a simple class, there is always just a 32-byte table, and we
2428          can set isinclass from it. */          can set isinclass from it. */
2429    
2430          if (codevalue != OP_XCLASS)          if (codevalue != OP_XCLASS)
2431            {            {
2432            ecode = code + 33;            ecode = code + 1 + (32 / sizeof(pcre_uchar));
2433            if (clen > 0)            if (clen > 0)
2434              {              {
2435              isinclass = (c > 255)? (codevalue == OP_NCLASS) :              isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2436                ((code[1 + c/8] & (1 << (c&7))) != 0);                ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2437              }              }
2438            }            }
2439    
# Line 2103  for (;;) Line 2444  for (;;)
2444          else          else
2445           {           {
2446           ecode = code + GET(code, 1);           ecode = code + GET(code, 1);
2447           if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);           if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2448           }           }
2449    
2450          /* At this point, isinclass is set for all kinds of class, and ecode          /* At this point, isinclass is set for all kinds of class, and ecode
2451          points to the byte after the end of the class. If there is a          points to the byte after the end of the class. If there is a
2452          quantifier, this is where it will be. */          quantifier, this is where it will be. */
2453    
2454          next_state_offset = ecode - start_code;          next_state_offset = (int)(ecode - start_code);
2455    
2456          switch (*ecode)          switch (*ecode)
2457            {            {
# Line 2137  for (;;) Line 2478  for (;;)
2478            case OP_CRMINRANGE:            case OP_CRMINRANGE:
2479            count = current_state->count;  /* Already matched */            count = current_state->count;  /* Already matched */
2480            if (count >= GET2(ecode, 1))            if (count >= GET2(ecode, 1))
2481              { ADD_ACTIVE(next_state_offset + 5, 0); }              { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2482            if (isinclass)            if (isinclass)
2483              {              {
2484              int max = GET2(ecode, 3);              int max = GET2(ecode, 1 + IMM2_SIZE);
2485              if (++count >= max && max != 0)   /* Max 0 => no limit */              if (++count >= max && max != 0)   /* Max 0 => no limit */
2486                { ADD_NEW(next_state_offset + 5, 0); }                { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2487              else              else
2488                { ADD_NEW(state_offset, count); }                { ADD_NEW(state_offset, count); }
2489              }              }
# Line 2157  for (;;) Line 2498  for (;;)
2498    
2499  /* ========================================================================== */  /* ========================================================================== */
2500        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2501        to use recursion in order to handle them. The "always failing" assersion        to use recursion in order to handle them. The "always failing" assertion
2502        (?!) is optimised when compiling to OP_FAIL, so we have to support that,        (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2503        though the other "backtracking verbs" are not supported. */        though the other "backtracking verbs" are not supported. */
2504    
2505        case OP_FAIL:        case OP_FAIL:
2506          forced_fail++;    /* Count FAILs for multiple states */
2507        break;        break;
2508    
2509        case OP_ASSERT:        case OP_ASSERT:
# Line 2172  for (;;) Line 2514  for (;;)
2514          int rc;          int rc;
2515          int local_offsets[2];          int local_offsets[2];
2516          int local_workspace[1000];          int local_workspace[1000];
2517          const uschar *endasscode = code + GET(code, 1);          const pcre_uchar *endasscode = code + GET(code, 1);
2518    
2519          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2520    
# Line 2180  for (;;) Line 2522  for (;;)
2522            md,                                   /* static match data */            md,                                   /* static match data */
2523            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2524            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2525            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2526            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2527            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2528            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2529            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2530            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2531    
2532            if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2533          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2534              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2535          }          }
2536        break;        break;
2537    
# Line 2210  for (;;) Line 2551  for (;;)
2551          if (code[LINK_SIZE+1] == OP_CALLOUT)          if (code[LINK_SIZE+1] == OP_CALLOUT)
2552            {            {
2553            rrc = 0;            rrc = 0;
2554            if (pcre_callout != NULL)            if (PUBL(callout) != NULL)
2555              {              {
2556              pcre_callout_block cb;              PUBL(callout_block) cb;
2557              cb.version          = 1;   /* Version 1 of the callout block */              cb.version          = 1;   /* Version 1 of the callout block */
2558              cb.callout_number   = code[LINK_SIZE+2];              cb.callout_number   = code[LINK_SIZE+2];
2559              cb.offset_vector    = offsets;              cb.offset_vector    = offsets;
2560    #ifdef COMPILE_PCRE8
2561              cb.subject          = (PCRE_SPTR)start_subject;              cb.subject          = (PCRE_SPTR)start_subject;
2562              cb.subject_length   = end_subject - start_subject;  #else
2563              cb.start_match      = current_subject - start_subject;              cb.subject          = (PCRE_SPTR16)start_subject;
2564              cb.current_position = ptr - start_subject;  #endif
2565                cb.subject_length   = (int)(end_subject - start_subject);
2566                cb.start_match      = (int)(current_subject - start_subject);
2567                cb.current_position = (int)(ptr - start_subject);
2568              cb.pattern_position = GET(code, LINK_SIZE + 3);              cb.pattern_position = GET(code, LINK_SIZE + 3);
2569              cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);              cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2570              cb.capture_top      = 1;              cb.capture_top      = 1;
2571              cb.capture_last     = -1;              cb.capture_last     = -1;
2572              cb.callout_data     = md->callout_data;              cb.callout_data     = md->callout_data;
2573              if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */              cb.mark             = NULL;   /* No (*MARK) support */
2574                if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2575              }              }
2576            if (rrc > 0) break;                      /* Fail this thread */            if (rrc > 0) break;                      /* Fail this thread */
2577            code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */            code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2578            }            }
2579    
2580          condcode = code[LINK_SIZE+1];          condcode = code[LINK_SIZE+1];
2581    
2582          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2583    
2584          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;          if (condcode == OP_CREF || condcode == OP_NCREF)
2585              return PCRE_ERROR_DFA_UCOND;
2586    
2587          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2588    
# Line 2246  for (;;) Line 2593  for (;;)
2593          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
2594          recursed groups. */          recursed groups. */
2595    
2596          else if (condcode == OP_RREF)          else if (condcode == OP_RREF || condcode == OP_NRREF)
2597            {            {
2598            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE + 2);
2599            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2600            if (recursing > 0)            if (md->recursive != NULL)
2601              { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2602            else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }            else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2603            }            }
2604    
# Line 2260  for (;;) Line 2607  for (;;)
2607          else          else
2608            {            {
2609            int rc;            int rc;
2610            const uschar *asscode = code + LINK_SIZE + 1;            const pcre_uchar *asscode = code + LINK_SIZE + 1;
2611            const uschar *endasscode = asscode + GET(asscode, 1);            const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2612    
2613            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);            while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2614    
# Line 2269  for (;;) Line 2616  for (;;)
2616              md,                                   /* fixed match data */              md,                                   /* fixed match data */
2617              asscode,                              /* this subexpression's code */              asscode,                              /* this subexpression's code */
2618              ptr,                                  /* where we currently are */              ptr,                                  /* where we currently are */
2619              ptr - start_subject,                  /* start offset */              (int)(ptr - start_subject),           /* start offset */
2620              local_offsets,                        /* offset vector */              local_offsets,                        /* offset vector */
2621              sizeof(local_offsets)/sizeof(int),    /* size of same */              sizeof(local_offsets)/sizeof(int),    /* size of same */
2622              local_workspace,                      /* workspace vector */              local_workspace,                      /* workspace vector */
2623              sizeof(local_workspace)/sizeof(int),  /* size of same */              sizeof(local_workspace)/sizeof(int),  /* size of same */
2624              ims,                                  /* the current ims flags */              rlevel);                              /* function recursion level */
             rlevel,                               /* function recursion level */  
             recursing);                           /* pass on regex recursion */  
2625    
2626              if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2627            if ((rc >= 0) ==            if ((rc >= 0) ==
2628                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2629              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2630            else            else
2631              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2632            }            }
# Line 2290  for (;;) Line 2636  for (;;)
2636        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2637        case OP_RECURSE:        case OP_RECURSE:
2638          {          {
2639            dfa_recursion_info *ri;
2640          int local_offsets[1000];          int local_offsets[1000];
2641          int local_workspace[1000];          int local_workspace[1000];
2642            const pcre_uchar *callpat = start_code + GET(code, 1);
2643            int recno = (callpat == md->start_code)? 0 :
2644              GET2(callpat, 1 + LINK_SIZE);
2645          int rc;          int rc;
2646    
2647          DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,          DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2648            recursing + 1));  
2649            /* Check for repeating a recursion without advancing the subject
2650            pointer. This should catch convoluted mutual recursions. (Some simple
2651            cases are caught at compile time.) */
2652    
2653            for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2654              if (recno == ri->group_num && ptr == ri->subject_position)
2655                return PCRE_ERROR_RECURSELOOP;
2656    
2657            /* Remember this recursion and where we started it so as to
2658            catch infinite loops. */
2659    
2660            new_recursive.group_num = recno;
2661            new_recursive.subject_position = ptr;
2662            new_recursive.prevrec = md->recursive;
2663            md->recursive = &new_recursive;
2664    
2665          rc = internal_dfa_exec(          rc = internal_dfa_exec(
2666            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2667            start_code + GET(code, 1),            /* this subexpression's code */            callpat,                              /* this subexpression's code */
2668            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2669            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2670            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2671            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2672            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2673            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2674            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
2675            rlevel,                               /* function recursion level */  
2676            recursing + 1);                       /* regex recurse level */          md->recursive = new_recursive.prevrec;  /* Done this recursion */
2677    
2678          DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,          DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2679            recursing + 1, rc));            rc));
2680    
2681          /* Ran out of internal offsets */          /* Ran out of internal offsets */
2682    
# Line 2325  for (;;) Line 2690  for (;;)
2690            {            {
2691            for (rc = rc*2 - 2; rc >= 0; rc -= 2)            for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2692              {              {
             const uschar *p = start_subject + local_offsets[rc];  
             const uschar *pp = start_subject + local_offsets[rc+1];  
2693              int charcount = local_offsets[rc+1] - local_offsets[rc];              int charcount = local_offsets[rc+1] - local_offsets[rc];
2694              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;  #ifdef SUPPORT_UTF
2695                const pcre_uchar *p = start_subject + local_offsets[rc];
2696                const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2697                while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2698    #endif
2699              if (charcount > 0)              if (charcount > 0)
2700                {                {
2701                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));                ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
# Line 2344  for (;;) Line 2711  for (;;)
2711        break;        break;
2712    
2713        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2714          case OP_BRAPOS:
2715          case OP_SBRAPOS:
2716          case OP_CBRAPOS:
2717          case OP_SCBRAPOS:
2718          case OP_BRAPOSZERO:
2719            {
2720            int charcount, matched_count;
2721            const pcre_uchar *local_ptr = ptr;
2722            BOOL allow_zero;
2723    
2724            if (codevalue == OP_BRAPOSZERO)
2725              {
2726              allow_zero = TRUE;
2727              codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2728              }
2729            else allow_zero = FALSE;
2730    
2731            /* Loop to match the subpattern as many times as possible as if it were
2732            a complete pattern. */
2733    
2734            for (matched_count = 0;; matched_count++)
2735              {
2736              int local_offsets[2];
2737              int local_workspace[1000];
2738    
2739              int rc = internal_dfa_exec(
2740                md,                                   /* fixed match data */
2741                code,                                 /* this subexpression's code */
2742                local_ptr,                            /* where we currently are */
2743                (int)(ptr - start_subject),           /* start offset */
2744                local_offsets,                        /* offset vector */
2745                sizeof(local_offsets)/sizeof(int),    /* size of same */
2746                local_workspace,                      /* workspace vector */
2747                sizeof(local_workspace)/sizeof(int),  /* size of same */
2748                rlevel);                              /* function recursion level */
2749    
2750              /* Failed to match */
2751    
2752              if (rc < 0)
2753                {
2754                if (rc != PCRE_ERROR_NOMATCH) return rc;
2755                break;
2756                }
2757    
2758              /* Matched: break the loop if zero characters matched. */
2759    
2760              charcount = local_offsets[1] - local_offsets[0];
2761              if (charcount == 0) break;
2762              local_ptr += charcount;    /* Advance temporary position ptr */
2763              }
2764    
2765            /* At this point we have matched the subpattern matched_count
2766            times, and local_ptr is pointing to the character after the end of the
2767            last match. */
2768    
2769            if (matched_count > 0 || allow_zero)
2770              {
2771              const pcre_uchar *end_subpattern = code;
2772              int next_state_offset;
2773    
2774              do { end_subpattern += GET(end_subpattern, 1); }
2775                while (*end_subpattern == OP_ALT);
2776              next_state_offset =
2777                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2778    
2779              /* Optimization: if there are no more active states, and there
2780              are no new states yet set up, then skip over the subject string
2781              right here, to save looping. Otherwise, set up the new state to swing
2782              into action when the end of the matched substring is reached. */
2783    
2784              if (i + 1 >= active_count && new_count == 0)
2785                {
2786                ptr = local_ptr;
2787                clen = 0;
2788                ADD_NEW(next_state_offset, 0);
2789                }
2790              else
2791                {
2792                const pcre_uchar *p = ptr;
2793                const pcre_uchar *pp = local_ptr;
2794                charcount = (int)(pp - p);
2795    #ifdef SUPPORT_UTF
2796                while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2797    #endif
2798                ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2799                }
2800              }
2801            }
2802          break;
2803    
2804          /*-----------------------------------------------------------------*/
2805        case OP_ONCE:        case OP_ONCE:
2806          case OP_ONCE_NC:
2807          {          {
2808          int local_offsets[2];          int local_offsets[2];
2809          int local_workspace[1000];          int local_workspace[1000];
# Line 2353  for (;;) Line 2812  for (;;)
2812            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2813            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2814            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2815            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2816            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2817            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2818            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2819            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2820            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2821    
2822          if (rc >= 0)          if (rc >= 0)
2823            {            {
2824            const uschar *end_subpattern = code;            const pcre_uchar *end_subpattern = code;
2825            int charcount = local_offsets[1] - local_offsets[0];            int charcount = local_offsets[1] - local_offsets[0];
2826            int next_state_offset, repeat_state_offset;            int next_state_offset, repeat_state_offset;
2827    
2828            do { end_subpattern += GET(end_subpattern, 1); }            do { end_subpattern += GET(end_subpattern, 1); }
2829              while (*end_subpattern == OP_ALT);              while (*end_subpattern == OP_ALT);
2830            next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;            next_state_offset =
2831                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2832    
2833            /* If the end of this subpattern is KETRMAX or KETRMIN, we must            /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2834            arrange for the repeat state also to be added to the relevant list.            arrange for the repeat state also to be added to the relevant list.
# Line 2378  for (;;) Line 2836  for (;;)
2836    
2837            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2838                                   *end_subpattern == OP_KETRMIN)?                                   *end_subpattern == OP_KETRMIN)?
2839              end_subpattern - start_code - GET(end_subpattern, 1) : -1;              (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2840    
2841            /* If we have matched an empty string, add the next state at the            /* If we have matched an empty string, add the next state at the
2842            current character pointer. This is important so that the duplicate            current character pointer. This is important so that the duplicate
# Line 2393  for (;;) Line 2851  for (;;)
2851            /* Optimization: if there are no more active states, and there            /* Optimization: if there are no more active states, and there
2852            are no new states yet set up, then skip over the subject string            are no new states yet set up, then skip over the subject string
2853            right here, to save looping. Otherwise, set up the new state to swing            right here, to save looping. Otherwise, set up the new state to swing
2854            into action when the end of the substring is reached. */            into action when the end of the matched substring is reached. */
2855    
2856            else if (i + 1 >= active_count && new_count == 0)            else if (i + 1 >= active_count && new_count == 0)
2857              {              {
# Line 2416  for (;;) Line 2874  for (;;)
2874              }              }
2875            else            else
2876              {              {
2877              const uschar *p = start_subject + local_offsets[0];  #ifdef SUPPORT_UTF
2878              const uschar *pp = start_subject + local_offsets[1];              const pcre_uchar *p = start_subject + local_offsets[0];
2879              while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;              const pcre_uchar *pp = start_subject + local_offsets[1];
2880                while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2881    #endif
2882              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));              ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2883              if (repeat_state_offset >= 0)              if (repeat_state_offset >= 0)
2884                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2885              }              }
   
2886            }            }
2887          else if (rc != PCRE_ERROR_NOMATCH) return rc;          else if (rc != PCRE_ERROR_NOMATCH) return rc;
2888          }          }
# Line 2435  for (;;) Line 2894  for (;;)
2894    
2895        case OP_CALLOUT:        case OP_CALLOUT:
2896        rrc = 0;        rrc = 0;
2897        if (pcre_callout != NULL)        if (PUBL(callout) != NULL)
2898          {          {
2899          pcre_callout_block cb;          PUBL(callout_block) cb;
2900          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2901          cb.callout_number   = code[1];          cb.callout_number   = code[1];
2902          cb.offset_vector    = offsets;          cb.offset_vector    = offsets;
2903    #ifdef COMPILE_PCRE8
2904          cb.subject          = (PCRE_SPTR)start_subject;          cb.subject          = (PCRE_SPTR)start_subject;
2905          cb.subject_length   = end_subject - start_subject;  #else
2906          cb.start_match      = current_subject - start_subject;          cb.subject          = (PCRE_SPTR16)start_subject;
2907          cb.current_position = ptr - start_subject;  #endif
2908            cb.subject_length   = (int)(end_subject - start_subject);
2909            cb.start_match      = (int)(current_subject - start_subject);
2910            cb.current_position = (int)(ptr - start_subject);
2911          cb.pattern_position = GET(code, 2);          cb.pattern_position = GET(code, 2);
2912          cb.next_item_length = GET(code, 2 + LINK_SIZE);          cb.next_item_length = GET(code, 2 + LINK_SIZE);
2913          cb.capture_top      = 1;          cb.capture_top      = 1;
2914          cb.capture_last     = -1;          cb.capture_last     = -1;
2915          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2916          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          cb.mark             = NULL;   /* No (*MARK) support */
2917            if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2918          }          }
2919        if (rrc == 0)        if (rrc == 0)
2920          { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }          { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
2921        break;        break;
2922    
2923    
# Line 2469  for (;;) Line 2933  for (;;)
2933    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
2934    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
2935    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
2936    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions.
2937    
2938      The "forced_fail" variable counts the number of (*F) encountered for the
2939      character. If it is equal to the original active_count (saved in
2940      workspace[1]) it means that (*F) was found on every active state. In this
2941      case we don't want to give a partial match.
2942    
2943      The "could_continue" variable is true if a state could have continued but
2944      for the fact that the end of the subject was reached. */
2945    
2946    if (new_count <= 0)    if (new_count <= 0)
2947      {      {
2948      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
2949          rlevel == 1 &&                         /* Top level match function */          could_continue &&                            /* Some could go on */
2950          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
2951            (                                            /* either... */
2952            (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
2953            ||                                           /* or... */
2954            ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
2955             match_count < 0)                            /* no matches */
2956            ) &&                                         /* And... */
2957          ptr >= end_subject &&                  /* Reached end of subject */          ptr >= end_subject &&                  /* Reached end of subject */
2958          ptr > current_subject)                 /* Matched non-empty string */          ptr > md->start_used_ptr)              /* Inspected non-empty string */
2959        {        {
2960        if (offsetcount >= 2)        if (offsetcount >= 2)
2961          {          {
2962          offsets[0] = current_subject - start_subject;          offsets[0] = (int)(md->start_used_ptr - start_subject);
2963          offsets[1] = end_subject - start_subject;          offsets[1] = (int)(end_subject - start_subject);
2964          }          }
2965        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
2966        }        }
# Line 2536  Returns: > 0 => number of match Line 3014  Returns: > 0 => number of match
3014                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
3015  */  */
3016    
3017    #ifdef COMPILE_PCRE8
3018  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3019  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3020    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
3021    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
3022    #else
3023    PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3024    pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3025      PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3026      int offsetcount, int *workspace, int wscount)
3027    #endif
3028  {  {
3029  real_pcre *re = (real_pcre *)argument_re;  REAL_PCRE *re = (REAL_PCRE *)argument_re;
3030  dfa_match_data match_block;  dfa_match_data match_block;
3031  dfa_match_data *md = &match_block;  dfa_match_data *md = &match_block;
3032  BOOL utf8, anchored, startline, firstline;  BOOL utf, anchored, startline, firstline;
3033  const uschar *current_subject, *end_subject, *lcc;  const pcre_uchar *current_subject, *end_subject;
   
 pcre_study_data internal_study;  
3034  const pcre_study_data *study = NULL;  const pcre_study_data *study = NULL;
 real_pcre internal_re;  
3035    
3036  const uschar *req_byte_ptr;  const pcre_uchar *req_char_ptr;
3037  const uschar *start_bits = NULL;  const pcre_uint8 *start_bits = NULL;
3038  BOOL first_byte_caseless = FALSE;  BOOL has_first_char = FALSE;
3039  BOOL req_byte_caseless = FALSE;  BOOL has_req_char = FALSE;
3040  int first_byte = -1;  pcre_uchar first_char = 0;
3041  int req_byte = -1;  pcre_uchar first_char2 = 0;
3042  int req_byte2 = -1;  pcre_uchar req_char = 0;
3043    pcre_uchar req_char2 = 0;
3044  int newline;  int newline;
3045    
3046  /* Plausibility checks */  /* Plausibility checks */
# Line 2567  if (re == NULL || subject == NULL || wor Line 3050  if (re == NULL || subject == NULL || wor
3050     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3051  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3052  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3053    if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3054    
3055  /* We need to find the pointer to any study data before we test for byte  /* We need to find the pointer to any study data before we test for byte
3056  flipping, so we scan the extra_data block first. This may set two fields in the  flipping, so we scan the extra_data block first. This may set two fields in the
# Line 2591  if (extra_data != NULL) Line 3075  if (extra_data != NULL)
3075    }    }
3076    
3077  /* Check that the first field in the block is the magic number. If it is not,  /* Check that the first field in the block is the magic number. If it is not,
3078  test for a regex that was compiled on a host of opposite endianness. If this is  return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3079  the case, flipped values are put in internal_re and internal_study if there was  REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3080  study data too. */  means that the pattern is likely compiled with different endianness. */
3081    
3082  if (re->magic_number != MAGIC_NUMBER)  if (re->magic_number != MAGIC_NUMBER)
3083    {    return re->magic_number == REVERSED_MAGIC_NUMBER?
3084    re = _pcre_try_flipped(re, &internal_re, study, &internal_study);      PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3085    if (re == NULL) return PCRE_ERROR_BADMAGIC;  if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
   if (study != NULL) study = &internal_study;  
   }  
3086    
3087  /* Set some local values */  /* Set some local values */
3088    
3089  current_subject = (const unsigned char *)subject + start_offset;  current_subject = (const pcre_uchar *)subject + start_offset;
3090  end_subject = (const unsigned char *)subject + length;  end_subject = (const pcre_uchar *)subject + length;
3091  req_byte_ptr = current_subject - 1;  req_char_ptr = current_subject - 1;
3092    
3093  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3094  utf8 = (re->options & PCRE_UTF8) != 0;  /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3095    utf = (re->options & PCRE_UTF8) != 0;
3096  #else  #else
3097  utf8 = FALSE;  utf = FALSE;
3098  #endif  #endif
3099    
3100  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||  anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
# Line 2619  anchored = (options & (PCRE_ANCHORED|PCR Line 3102  anchored = (options & (PCRE_ANCHORED|PCR
3102    
3103  /* The remaining fixed data for passing around. */  /* The remaining fixed data for passing around. */
3104    
3105  md->start_code = (const uschar *)argument_re +  md->start_code = (const pcre_uchar *)argument_re +
3106      re->name_table_offset + re->name_count * re->name_entry_size;      re->name_table_offset + re->name_count * re->name_entry_size;
3107  md->start_subject = (const unsigned char *)subject;  md->start_subject = (const pcre_uchar *)subject;
3108  md->end_subject = end_subject;  md->end_subject = end_subject;
3109    md->start_offset = start_offset;
3110  md->moptions = options;  md->moptions = options;
3111  md->poptions = re->options;  md->poptions = re->options;
3112    
# Line 2681  else Line 3165  else
3165  /* Check a UTF-8 string if required. Unfortunately there's no way of passing  /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3166  back the character offset. */  back the character offset. */
3167    
3168  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3169  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3170    {    {
3171    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    int erroroffset;
3172      return PCRE_ERROR_BADUTF8;    int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3173    if (start_offset > 0 && start_offset < length)    if (errorcode != 0)
3174      {      {
3175      int tb = ((uschar *)subject)[start_offset];      if (offsetcount >= 2)
     if (tb > 127)  
3176        {        {
3177        tb &= 0xc0;        offsets[0] = erroroffset;
3178        if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;        offsets[1] = errorcode;
3179        }        }
3180        return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3181          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3182      }      }
3183      if (start_offset > 0 && start_offset < length &&
3184            NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3185        return PCRE_ERROR_BADUTF8_OFFSET;
3186    }    }
3187  #endif  #endif
3188    
# Line 2702  if (utf8 && (options & PCRE_NO_UTF8_CHEC Line 3190  if (utf8 && (options & PCRE_NO_UTF8_CHEC
3190  is a feature that makes it possible to save compiled regex and re-use them  is a feature that makes it possible to save compiled regex and re-use them
3191  in other programs later. */  in other programs later. */
3192    
3193  if (md->tables == NULL) md->tables = _pcre_default_tables;  if (md->tables == NULL) md->tables = PRIV(default_tables);
3194    
3195  /* The lower casing table and the "must be at the start of a line" flag are  /* The "must be at the start of a line" flags are used in a loop when finding
3196  used in a loop when finding where to start. */  where to start. */
3197    
 lcc = md->tables + lcc_offset;  
3198  startline = (re->flags & PCRE_STARTLINE) != 0;  startline = (re->flags & PCRE_STARTLINE) != 0;
3199  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
3200    
# Line 2721  if (!anchored) Line 3208  if (!anchored)
3208    {    {
3209    if ((re->flags & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
3210      {      {
3211      first_byte = re->first_byte & 255;      has_first_char = TRUE;
3212      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      first_char = first_char2 = re->first_char;
3213        first_byte = lcc[first_byte];      if ((re->flags & PCRE_FCH_CASELESS) != 0)
3214          {
3215          first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3216    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3217          if (utf && first_char > 127)
3218            first_char2 = UCD_OTHERCASE(first_char);
3219    #endif
3220          }
3221      }      }
3222    else    else
3223      {      {
3224      if (startline && study != NULL &&      if (!startline && study != NULL &&
3225           (study->options & PCRE_STUDY_MAPPED) != 0)           (study->flags & PCRE_STUDY_MAPPED) != 0)
3226        start_bits = study->start_bits;        start_bits = study->start_bits;
3227      }      }
3228    }    }
# Line 2738  character" set. */ Line 3232  character" set. */
3232    
3233  if ((re->flags & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
3234    {    {
3235    req_byte = re->req_byte & 255;    has_req_char = TRUE;
3236    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_char = req_char2 = re->req_char;
3237    req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */    if ((re->flags & PCRE_RCH_CASELESS) != 0)
3238        {
3239        req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3240    #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3241        if (utf && req_char > 127)
3242          req_char2 = UCD_OTHERCASE(req_char);
3243    #endif
3244        }
3245    }    }
3246    
3247  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
# Line 2753  for (;;) Line 3254  for (;;)
3254    
3255    if ((options & PCRE_DFA_RESTART) == 0)    if ((options & PCRE_DFA_RESTART) == 0)
3256      {      {
3257      const uschar *save_end_subject = end_subject;      const pcre_uchar *save_end_subject = end_subject;
3258    
3259      /* If firstline is TRUE, the start of the match is constrained to the first      /* If firstline is TRUE, the start of the match is constrained to the first
3260      line of a multiline string. Implement this by temporarily adjusting      line of a multiline string. Implement this by temporarily adjusting
# Line 2762  for (;;) Line 3263  for (;;)
3263    
3264      if (firstline)      if (firstline)
3265        {        {
3266        USPTR t = current_subject;        PCRE_PUCHAR t = current_subject;
3267  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3268        if (utf8)        if (utf)
3269          {          {
3270          while (t < md->end_subject && !IS_NEWLINE(t))          while (t < md->end_subject && !IS_NEWLINE(t))
3271            {            {
3272            t++;            t++;
3273            while (t < end_subject && (*t & 0xc0) == 0x80) t++;            ACROSSCHAR(t < end_subject, *t, t++);
3274            }            }
3275          }          }
3276        else        else
# Line 2779  for (;;) Line 3280  for (;;)
3280        }        }
3281    
3282      /* There are some optimizations that avoid running the match if a known      /* There are some optimizations that avoid running the match if a known
3283      starting point is not found, or if a known later character is not present.      starting point is not found. However, there is an option that disables
3284      However, there is an option that disables these, for testing and for      these, for testing and for ensuring that all callouts do actually occur.
3285      ensuring that all callouts do actually occur. */      The option can be set in the regex by (*NO_START_OPT) or passed in
3286        match-time options. */
3287    
3288      if ((options & PCRE_NO_START_OPTIMIZE) == 0)      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3289        {        {
3290          /* Advance to a known first char. */
3291    
3292        /* Advance to a known first byte. */        if (has_first_char)
   
       if (first_byte >= 0)  
3293          {          {
3294          if (first_byte_caseless)          if (first_char != first_char2)
3295            while (current_subject < end_subject &&            while (current_subject < end_subject &&
3296                   lcc[*current_subject] != first_byte)                *current_subject != first_char && *current_subject != first_char2)
3297              current_subject++;              current_subject++;
3298          else          else
3299            while (current_subject < end_subject &&            while (current_subject < end_subject &&
3300                   *current_subject != first_byte)                   *current_subject != first_char)
3301              current_subject++;              current_subject++;
3302          }          }
3303    
# Line 2806  for (;;) Line 3307  for (;;)
3307          {          {
3308          if (current_subject > md->start_subject + start_offset)          if (current_subject > md->start_subject + start_offset)
3309            {            {
3310  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF
3311            if (utf8)            if (utf)
3312              {              {
3313              while (current_subject < end_subject &&              while (current_subject < end_subject &&
3314                     !WAS_NEWLINE(current_subject))                     !WAS_NEWLINE(current_subject))
3315                {                {
3316                current_subject++;                current_subject++;
3317                while(current_subject < end_subject &&                ACROSSCHAR(current_subject < end_subject, *current_subject,
3318                      (*current_subject & 0xc0) == 0x80)                  current_subject++);
                 current_subject++;  
3319                }                }
3320              }              }
3321            else            else
# Line 2842  for (;;) Line 3342  for (;;)
3342          while (current_subject < end_subject)          while (current_subject < end_subject)
3343            {            {
3344            register unsigned int c = *current_subject;            register unsigned int c = *current_subject;
3345            if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;  #ifndef COMPILE_PCRE8
3346              else break;            if (c > 255) c = 255;
3347    #endif
3348              if ((start_bits[c/8] & (1 << (c&7))) == 0)
3349                {
3350                current_subject++;
3351    #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3352                /* In non 8-bit mode, the iteration will stop for
3353                characters > 255 at the beginning or not stop at all. */
3354                if (utf)
3355                  ACROSSCHAR(current_subject < end_subject, *current_subject,
3356                    current_subject++);
3357    #endif
3358                }
3359              else break;
3360            }            }
3361          }          }
3362        }        }
# Line 2851  for (;;) Line 3364  for (;;)
3364      /* Restore fudged end_subject */      /* Restore fudged end_subject */
3365    
3366      end_subject = save_end_subject;      end_subject = save_end_subject;
     }  
3367    
3368    /* If req_byte is set, we know that that character must appear in the subject      /* The following two optimizations are disabled for partial matching or if
3369    for the match to succeed. If the first character is set, req_byte must be      disabling is explicitly requested (and of course, by the test above, this
3370    later in the subject; otherwise the test starts at the match point. This      code is not obeyed when restarting after a partial match). */
   optimization can save a huge amount of work in patterns with nested unlimited  
   repeats that aren't going to match. Writing separate code for cased/caseless  
   versions makes it go faster, as does using an autoincrement and backing off  
   on a match.  
   
   HOWEVER: when the subject string is very, very long, searching to its end can  
   take a long time, and give bad performance on quite ordinary patterns. This  
   showed up when somebody was matching /^C/ on a 32-megabyte string... so we  
   don't do this when the string is sufficiently long.  
   
   ALSO: this processing is disabled when partial matching is requested, and can  
   also be explicitly deactivated. */  
   
   if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&  
       req_byte >= 0 &&  
       end_subject - current_subject < REQ_BYTE_MAX &&  
       (options & PCRE_PARTIAL) == 0)  
     {  
     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);  
3371    
3372      /* We don't need to repeat the search if we haven't yet reached the      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3373      place we found it at last time. */          (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
   
     if (p > req_byte_ptr)  
3374        {        {
3375        if (req_byte_caseless)        /* If the pattern was studied, a minimum subject length may be set. This
3376          {        is a lower bound; it is possible that no string of that length matches the
3377          while (p < end_subject)        pattern. Although the value is, strictly, in characters, we treat it as
3378            {        bytes to avoid spending too much time in this optimization. */
3379            register int pp = *p++;  
3380            if (pp == req_byte || pp == req_byte2) { p--; break; }        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3381            }            (pcre_uint32)(end_subject - current_subject) < study->minlength)
3382          }          return PCRE_ERROR_NOMATCH;
3383        else  
3384          /* If req_char is set, we know that that character must appear in the
3385          subject for the match to succeed. If the first character is set, req_char
3386          must be later in the subject; otherwise the test starts at the match
3387          point. This optimization can save a huge amount of work in patterns with
3388          nested unlimited repeats that aren't going to match. Writing separate
3389          code for cased/caseless versions makes it go faster, as does using an
3390          autoincrement and backing off on a match.
3391    
3392          HOWEVER: when the subject string is very, very long, searching to its end
3393          can take a long time, and give bad performance on quite ordinary
3394          patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3395          string... so we don't do this when the string is sufficiently long. */
3396    
3397          if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3398          {          {
3399          while (p < end_subject)          register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3400    
3401            /* We don't need to repeat the search if we haven't yet reached the
3402            place we found it at last time. */
3403    
3404            if (p > req_char_ptr)
3405            {            {
3406            if (*p++ == req_byte) { p--; break; }            if (req_char != req_char2)
3407            }              {
3408          }              while (p < end_subject)
3409                  {
3410                  register int pp = *p++;
3411                  if (pp == req_char || pp == req_char2) { p--; break; }
3412                  }
3413                }
3414              else
3415                {
3416                while (p < end_subject)
3417                  {
3418                  if (*p++ == req_char) { p--; break; }
3419                  }
3420                }
3421    
3422        /* If we can't find the required character, break the matching loop,            /* If we can't find the required character, break the matching loop,
3423        which will cause a return of PCRE_ERROR_NOMATCH. */            which will cause a return of PCRE_ERROR_NOMATCH. */
3424    
3425        if (p >= end_subject) break;            if (p >= end_subject) break;
3426    
3427        /* If we have found the required character, save the point where we            /* If we have found the required character, save the point where we
3428        found it, so that we don't search again next time round the loop if            found it, so that we don't search again next time round the loop if
3429        the start hasn't passed this character yet. */            the start hasn't passed this character yet. */
3430    
3431        req_byte_ptr = p;            req_char_ptr = p;
3432              }
3433            }
3434        }        }
3435      }      }   /* End of optimizations that are done when not restarting */
3436    
3437    /* OK, now we can do the business */    /* OK, now we can do the business */
3438    
3439      md->start_used_ptr = current_subject;
3440      md->recursive = NULL;
3441    
3442    rc = internal_dfa_exec(    rc = internal_dfa_exec(
3443      md,                                /* fixed match data */      md,                                /* fixed match data */
3444      md->start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */
# Line 2921  for (;;) Line 3448  for (;;)
3448      offsetcount,                       /* size of same */      offsetcount,                       /* size of same */
3449      workspace,                         /* workspace vector */      workspace,                         /* workspace vector */
3450      wscount,                           /* size of same */      wscount,                           /* size of same */
3451      re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */      0);                                /* function recurse level */
     0,                                 /* function recurse level */  
     0);                                /* regex recurse level */  
3452    
3453    /* Anything other than "no match" means we are done, always; otherwise, carry    /* Anything other than "no match" means we are done, always; otherwise, carry
3454    on only if not anchored. */    on only if not anchored. */
# Line 2935  for (;;) Line 3460  for (;;)
3460    
3461    if (firstline && IS_NEWLINE(current_subject)) break;    if (firstline && IS_NEWLINE(current_subject)) break;
3462    current_subject++;    current_subject++;
3463    if (utf8)  #ifdef SUPPORT_UTF
3464      if (utf)
3465      {      {
3466      while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)      ACROSSCHAR(current_subject < end_subject, *current_subject,
3467        current_subject++;        current_subject++);
3468      }      }
3469    #endif
3470    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
3471    
3472    /* If we have just passed a CR and we are now at a LF, and the pattern does    /* If we have just passed a CR and we are now at a LF, and the pattern does

Legend:
Removed from v.406  
changed lines
  Added in v.894

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12