/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log | View Patch

revision 182 by ph10, Wed Jun 13 15:09:54 2007 UTC | revision 733 by ph10, Tue Oct 11 10:29:36 2011 UTC
# Line 3  Line 3 
3  *************************************************/  *************************************************/
4    
5  /* PCRE is a library of functions to support regular expressions whose syntax  /* PCRE is a library of functions to support regular expressions whose syntax
6  and semantics are as close as possible to those of the Perl 5 language.  and semantics are as close as possible to those of the Perl 5 language (but see
7    below for why this module is different).
8    
9                         Written by Philip Hazel                         Written by Philip Hazel
10             Copyright (c) 1997-2007 University of Cambridge             Copyright (c) 1997-2011 University of Cambridge
11    
12  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
13  Redistribution and use in source and binary forms, with or without  Redistribution and use in source and binary forms, with or without
# Line 44  FSM). This is NOT Perl- compatible, but Line 45  FSM). This is NOT Perl- compatible, but
45  applications. */  applications. */
46    
47    
48    /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49    the performance of his patterns greatly. I could not use it as it stood, as it
50    was not thread safe, and made assumptions about pattern sizes. Also, it caused
51    test 7 to loop, and test 9 to crash with a segfault.
52    
53    The issue is the check for duplicate states, which is done by a simple linear
54    search up the state list. (Grep for "duplicate" below to find the code.) For
55    many patterns, there will never be many states active at one time, so a simple
56    linear search is fine. In patterns that have many active states, it might be a
57    bottleneck. The suggested code used an indexing scheme to remember which states
58    had previously been used for each character, and avoided the linear search when
59    it knew there was no chance of a duplicate. This was implemented when adding
60    states to the state lists.
61    
62    I wrote some thread-safe, not-limited code to try something similar at the time
63    of checking for duplicates (instead of when adding states), using index vectors
64    on the stack. It did give a 13% improvement with one specially constructed
65    pattern for certain subject strings, but on other strings and on many of the
66    simpler patterns in the test suite it did worse. The major problem, I think,
67    was the extra time to initialize the index. This had to be done for each call
68    of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69    only once - I suspect this was the cause of the problems with the tests.)
70    
71    Overall, I concluded that the gains in some cases did not outweigh the losses
72    in others, so I abandoned this code. */
73    
74    
75    
76    #ifdef HAVE_CONFIG_H
77    #include "config.h"
78    #endif
79    
80  #define NLBLOCK md             /* Block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
81  #define PSSTART start_subject  /* Field containing processed string start */  #define PSSTART start_subject  /* Field containing processed string start */
82  #define PSEND   end_subject    /* Field containing processed string end */  #define PSEND   end_subject    /* Field containing processed string end */
# Line 56  applications. */ Line 89  applications. */
89  #define SP "                   "  #define SP "                   "
90    
91    
   
92  /*************************************************  /*************************************************
93  *      Code parameters and static tables         *  *      Code parameters and static tables         *
94  *************************************************/  *************************************************/
# Line 74  never stored, so we push them well clear Line 106  never stored, so we push them well clear
106    
107    
108  /* This table identifies those opcodes that are followed immediately by a  /* This table identifies those opcodes that are followed immediately by a
109  character that is to be tested in some way. This makes is possible to  character that is to be tested in some way. This makes it possible to
110  centralize the loading of these characters. In the case of Type * etc, the  centralize the loading of these characters. In the case of Type * etc, the
111  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a  "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112  small value. ***NOTE*** If the start of this table is modified, the two tables  small value. Non-zero values in the table are the offsets from the opcode where
113  that follow must also be modified. */  the character is to be found. ***NOTE*** If the start of this table is
114    modified, the three tables that follow must also be modified. */
115    
116  static uschar coptable[] = {  static const uschar coptable[] = {
117    0,                             /* End                                    */    0,                             /* End                                    */
118    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */    0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
119    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */    0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
120    0, 0,                          /* Any, Anybyte                           */    0, 0, 0,                       /* Any, AllAny, Anybyte                   */
121    0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */    0, 0,                          /* \P, \p                                 */
122    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */    0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
123    0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */    0,                             /* \X                                     */
124      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
125    1,                             /* Char                                   */    1,                             /* Char                                   */
126    1,                             /* Charnc                                 */    1,                             /* Chari                                  */
127    1,                             /* not                                    */    1,                             /* not                                    */
128      1,                             /* noti                                   */
129    /* Positive single-char repeats                                          */    /* Positive single-char repeats                                          */
130    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */    1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
131    3, 3, 3,                       /* upto, minupto, exact                   */    3, 3, 3,                       /* upto, minupto, exact                   */
132    1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */    1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
133      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
134      3, 3, 3,                       /* upto I, minupto I, exact I             */
135      1, 1, 1, 3,                    /* *+I, ++I, ?+I, upto+I                  */
136    /* Negative single-char repeats - only for chars < 256                   */    /* Negative single-char repeats - only for chars < 256                   */
137    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */    1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
138    3, 3, 3,                       /* NOT upto, minupto, exact               */    3, 3, 3,                       /* NOT upto, minupto, exact               */
139    1, 1, 1, 3,                    /* NOT *+, ++, ?+, updo+                  */    1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */
140      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
141      3, 3, 3,                       /* NOT upto I, minupto I, exact I         */
142      1, 1, 1, 3,                    /* NOT *+I, ++I, ?+I, upto+I              */
143    /* Positive type repeats                                                 */    /* Positive type repeats                                                 */
144    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */    1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
145    3, 3, 3,                       /* Type upto, minupto, exact              */    3, 3, 3,                       /* Type upto, minupto, exact              */
# Line 110  static uschar coptable[] = { Line 151  static uschar coptable[] = {
151    0,                             /* NCLASS                                 */    0,                             /* NCLASS                                 */
152    0,                             /* XCLASS - variable length               */    0,                             /* XCLASS - variable length               */
153    0,                             /* REF                                    */    0,                             /* REF                                    */
154      0,                             /* REFI                                   */
155    0,                             /* RECURSE                                */    0,                             /* RECURSE                                */
156    0,                             /* CALLOUT                                */    0,                             /* CALLOUT                                */
157    0,                             /* Alt                                    */    0,                             /* Alt                                    */
158    0,                             /* Ket                                    */    0,                             /* Ket                                    */
159    0,                             /* KetRmax                                */    0,                             /* KetRmax                                */
160    0,                             /* KetRmin                                */    0,                             /* KetRmin                                */
161      0,                             /* KetRpos                                */
162      0,                             /* Reverse                                */
163    0,                             /* Assert                                 */    0,                             /* Assert                                 */
164    0,                             /* Assert not                             */    0,                             /* Assert not                             */
165    0,                             /* Assert behind                          */    0,                             /* Assert behind                          */
166    0,                             /* Assert behind not                      */    0,                             /* Assert behind not                      */
167      0, 0,                          /* ONCE, ONCE_NC                          */
168      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
169      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
170      0, 0,                          /* CREF, NCREF                            */
171      0, 0,                          /* RREF, NRREF                            */
172      0,                             /* DEF                                    */
173      0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
174      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
175      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
176      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
177      0, 0                           /* CLOSE, SKIPZERO  */
178    };
179    
180    /* This table identifies those opcodes that inspect a character. It is used to
181    remember the fact that a character could have been inspected when the end of
182    the subject is reached. ***NOTE*** If the start of this table is modified, the
183    two tables that follow must also be modified. */
184    
185    static const uschar poptable[] = {
186      0,                             /* End                                    */
187      0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
188      1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
189      1, 1, 1,                       /* Any, AllAny, Anybyte                   */
190      1, 1,                          /* \P, \p                                 */
191      1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
192      1,                             /* \X                                     */
193      0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
194      1,                             /* Char                                   */
195      1,                             /* Chari                                  */
196      1,                             /* not                                    */
197      1,                             /* noti                                   */
198      /* Positive single-char repeats                                          */
199      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
200      1, 1, 1,                       /* upto, minupto, exact                   */
201      1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
202      1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
203      1, 1, 1,                       /* upto I, minupto I, exact I             */
204      1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
205      /* Negative single-char repeats - only for chars < 256                   */
206      1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
207      1, 1, 1,                       /* NOT upto, minupto, exact               */
208      1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
209      1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
210      1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
211      1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
212      /* Positive type repeats                                                 */
213      1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
214      1, 1, 1,                       /* Type upto, minupto, exact              */
215      1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
216      /* Character class & ref repeats                                         */
217      1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
218      1, 1,                          /* CRRANGE, CRMINRANGE                    */
219      1,                             /* CLASS                                  */
220      1,                             /* NCLASS                                 */
221      1,                             /* XCLASS - variable length               */
222      0,                             /* REF                                    */
223      0,                             /* REFI                                   */
224      0,                             /* RECURSE                                */
225      0,                             /* CALLOUT                                */
226      0,                             /* Alt                                    */
227      0,                             /* Ket                                    */
228      0,                             /* KetRmax                                */
229      0,                             /* KetRmin                                */
230      0,                             /* KetRpos                                */
231    0,                             /* Reverse                                */    0,                             /* Reverse                                */
232    0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */    0,                             /* Assert                                 */
233    0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */    0,                             /* Assert not                             */
234    0,                             /* CREF                                   */    0,                             /* Assert behind                          */
235    0,                             /* RREF                                   */    0,                             /* Assert behind not                      */
236      0, 0,                          /* ONCE, ONCE_NC                          */
237      0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
238      0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
239      0, 0,                          /* CREF, NCREF                            */
240      0, 0,                          /* RREF, NRREF                            */
241    0,                             /* DEF                                    */    0,                             /* DEF                                    */
242    0, 0                           /* BRAZERO, BRAMINZERO                    */    0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
243      0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
244      0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
245      0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
246      0, 0                           /* CLOSE, SKIPZERO                        */
247  };  };
248    
249  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,  /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
250  and \w */  and \w */
251    
252  static uschar toptable1[] = {  static const uschar toptable1[] = {
253    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
254    ctype_digit, ctype_digit,    ctype_digit, ctype_digit,
255    ctype_space, ctype_space,    ctype_space, ctype_space,
256    ctype_word,  ctype_word,    ctype_word,  ctype_word,
257    0                               /* OP_ANY */    0, 0                            /* OP_ANY, OP_ALLANY */
258  };  };
259    
260  static uschar toptable2[] = {  static const uschar toptable2[] = {
261    0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0,
262    ctype_digit, 0,    ctype_digit, 0,
263    ctype_space, 0,    ctype_space, 0,
264    ctype_word,  0,    ctype_word,  0,
265    1                               /* OP_ANY */    1, 1                            /* OP_ANY, OP_ALLANY */
266  };  };
267    
268    
# Line 157  these structures in, is a vector of ints Line 274  these structures in, is a vector of ints
274  typedef struct stateblock {  typedef struct stateblock {
275    int offset;                     /* Offset to opcode */    int offset;                     /* Offset to opcode */
276    int count;                      /* Count for repeats */    int count;                      /* Count for repeats */
   int ims;                        /* ims flag bits */  
277    int data;                       /* Some use extra data */    int data;                       /* Some use extra data */
278  } stateblock;  } stateblock;
279    
280  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
281    
282    
283  #ifdef DEBUG  #ifdef PCRE_DEBUG
284  /*************************************************  /*************************************************
285  *             Print character string             *  *             Print character string             *
286  *************************************************/  *************************************************/
# Line 213  Arguments: Line 329  Arguments:
329    offsetcount       size of same    offsetcount       size of same
330    workspace         vector of workspace    workspace         vector of workspace
331    wscount           size of same    wscount           size of same
   ims               the current ims flags  
332    rlevel            function call recursion level    rlevel            function call recursion level
   recursing         regex recursive call level  
333    
334  Returns:            > 0 =>  Returns:            > 0 => number of match offset pairs placed in offsets
335                      = 0 =>                      = 0 => offsets overflowed; longest matches are present
336                       -1 => failed to match                       -1 => failed to match
337                     < -1 => some kind of unexpected problem                     < -1 => some kind of unexpected problem
338    
# Line 230  for the current character, one for the f Line 344  for the current character, one for the f
344      { \      { \
345      next_active_state->offset = (x); \      next_active_state->offset = (x); \
346      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
347      next_active_state++; \      next_active_state++; \
348      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
349      } \      } \
# Line 241  for the current character, one for the f Line 354  for the current character, one for the f
354      { \      { \
355      next_active_state->offset = (x); \      next_active_state->offset = (x); \
356      next_active_state->count  = (y); \      next_active_state->count  = (y); \
     next_active_state->ims    = ims; \  
357      next_active_state->data   = (z); \      next_active_state->data   = (z); \
358      next_active_state++; \      next_active_state++; \
359      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 253  for the current character, one for the f Line 365  for the current character, one for the f
365      { \      { \
366      next_new_state->offset = (x); \      next_new_state->offset = (x); \
367      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
368      next_new_state++; \      next_new_state++; \
369      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
370      } \      } \
# Line 264  for the current character, one for the f Line 375  for the current character, one for the f
375      { \      { \
376      next_new_state->offset = (x); \      next_new_state->offset = (x); \
377      next_new_state->count  = (y); \      next_new_state->count  = (y); \
     next_new_state->ims    = ims; \  
378      next_new_state->data   = (z); \      next_new_state->data   = (z); \
379      next_new_state++; \      next_new_state++; \
380      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
# Line 283  internal_dfa_exec( Line 393  internal_dfa_exec(
393    int offsetcount,    int offsetcount,
394    int *workspace,    int *workspace,
395    int wscount,    int wscount,
396    int ims,    int  rlevel)
   int  rlevel,  
   int  recursing)  
397  {  {
398  stateblock *active_states, *new_states, *temp_states;  stateblock *active_states, *new_states, *temp_states;
399  stateblock *next_active_state, *next_new_state;  stateblock *next_active_state, *next_new_state;
# Line 294  const uschar *ctypes, *lcc, *fcc; Line 402  const uschar *ctypes, *lcc, *fcc;
402  const uschar *ptr;  const uschar *ptr;
403  const uschar *end_code, *first_op;  const uschar *end_code, *first_op;
404    
405    dfa_recursion_info new_recursive;
406    
407  int active_count, new_count, match_count;  int active_count, new_count, match_count;
408    
409  /* Some fields in the md block are frequently referenced, so we load them into  /* Some fields in the md block are frequently referenced, so we load them into
# Line 317  wscount = (wscount - (wscount % (INTS_PE Line 427  wscount = (wscount - (wscount % (INTS_PE
427            (2 * INTS_PER_STATEBLOCK);            (2 * INTS_PER_STATEBLOCK);
428    
429  DPRINTF(("\n%.*s---------------------\n"  DPRINTF(("\n%.*s---------------------\n"
430    "%.*sCall to internal_dfa_exec f=%d r=%d\n",    "%.*sCall to internal_dfa_exec f=%d\n",
431    rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));    rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
432    
433  ctypes = md->tables + ctypes_offset;  ctypes = md->tables + ctypes_offset;
434  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
# Line 331  next_new_state = new_states = active_sta Line 441  next_new_state = new_states = active_sta
441  new_count = 0;  new_count = 0;
442    
443  first_op = this_start_code + 1 + LINK_SIZE +  first_op = this_start_code + 1 + LINK_SIZE +
444    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);    ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
445        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)? 2:0);
446    
447  /* The first thing in any (sub) pattern is a bracket of some sort. Push all  /* The first thing in any (sub) pattern is a bracket of some sort. Push all
448  the alternative states onto the list, and find out where the end is. This  the alternative states onto the list, and find out where the end is. This
# Line 380  if (*first_op == OP_REVERSE) Line 491  if (*first_op == OP_REVERSE)
491    
492      {      {
493      gone_back = (current_subject - max_back < start_subject)?      gone_back = (current_subject - max_back < start_subject)?
494        current_subject - start_subject : max_back;        (int)(current_subject - start_subject) : max_back;
495      current_subject -= gone_back;      current_subject -= gone_back;
496      }      }
497    
498      /* Save the earliest consulted character */
499    
500      if (current_subject < md->start_used_ptr)
501        md->start_used_ptr = current_subject;
502    
503    /* Now we can process the individual branches. */    /* Now we can process the individual branches. */
504    
505    end_code = this_start_code;    end_code = this_start_code;
# Line 392  if (*first_op == OP_REVERSE) Line 508  if (*first_op == OP_REVERSE)
508      int back = GET(end_code, 2+LINK_SIZE);      int back = GET(end_code, 2+LINK_SIZE);
509      if (back <= gone_back)      if (back <= gone_back)
510        {        {
511        int bstate = end_code - start_code + 2 + 2*LINK_SIZE;        int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
512        ADD_NEW_DATA(-bstate, 0, gone_back - back);        ADD_NEW_DATA(-bstate, 0, gone_back - back);
513        }        }
514      end_code += GET(end_code, 1);      end_code += GET(end_code, 1);
# Line 425  else Line 541  else
541    else    else
542      {      {
543      int length = 1 + LINK_SIZE +      int length = 1 + LINK_SIZE +
544        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);        ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
545            *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)?
546            2:0);
547      do      do
548        {        {
549        ADD_NEW(end_code - start_code + length, 0);        ADD_NEW((int)(end_code - start_code + length), 0);
550        end_code += GET(end_code, 1);        end_code += GET(end_code, 1);
551        length = 1 + LINK_SIZE;        length = 1 + LINK_SIZE;
552        }        }
# Line 448  for (;;) Line 566  for (;;)
566    int i, j;    int i, j;
567    int clen, dlen;    int clen, dlen;
568    unsigned int c, d;    unsigned int c, d;
569      int forced_fail = 0;
570      BOOL could_continue = FALSE;
571    
572    /* Make the new state list into the active state list and empty the    /* Make the new state list into the active state list and empty the
573    new state list. */    new state list. */
# Line 461  for (;;) Line 581  for (;;)
581    workspace[0] ^= 1;              /* Remember for the restarting feature */    workspace[0] ^= 1;              /* Remember for the restarting feature */
582    workspace[1] = active_count;    workspace[1] = active_count;
583    
584  #ifdef DEBUG  #ifdef PCRE_DEBUG
585    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);    printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
586    pchars((uschar *)ptr, strlen((char *)ptr), stdout);    pchars((uschar *)ptr, strlen((char *)ptr), stdout);
587    printf("\"\n");    printf("\"\n");
# Line 503  for (;;) Line 623  for (;;)
623    for (i = 0; i < active_count; i++)    for (i = 0; i < active_count; i++)
624      {      {
625      stateblock *current_state = active_states + i;      stateblock *current_state = active_states + i;
626        BOOL caseless = FALSE;
627      const uschar *code;      const uschar *code;
628      int state_offset = current_state->offset;      int state_offset = current_state->offset;
629      int count, codevalue;      int count, codevalue, rrc;
 #ifdef SUPPORT_UCP  
     int chartype, script;  
 #endif  
630    
631  #ifdef DEBUG  #ifdef PCRE_DEBUG
632      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);      printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
633      if (clen == 0) printf("EOL\n");      if (clen == 0) printf("EOL\n");
634        else if (c > 32 && c < 127) printf("'%c'\n", c);        else if (c > 32 && c < 127) printf("'%c'\n", c);
635          else printf("0x%02x\n", c);          else printf("0x%02x\n", c);
636  #endif  #endif
637    
     /* This variable is referred to implicity in the ADD_xxx macros. */  
   
     ims = current_state->ims;  
   
638      /* A negative offset is a special case meaning "hold off going to this      /* A negative offset is a special case meaning "hold off going to this
639      (negated) state until the number of characters in the data field have      (negated) state until the number of characters in the data field have
640      been skipped". */      been skipped". */
# Line 540  for (;;) Line 654  for (;;)
654          }          }
655        }        }
656    
657      /* Check for a duplicate state with the same count, and skip if found. */      /* Check for a duplicate state with the same count, and skip if found.
658        See the note at the head of this module about the possibility of improving
659        performance here. */
660    
661      for (j = 0; j < i; j++)      for (j = 0; j < i; j++)
662        {        {
# Line 557  for (;;) Line 673  for (;;)
673      code = start_code + state_offset;      code = start_code + state_offset;
674      codevalue = *code;      codevalue = *code;
675    
676        /* If this opcode inspects a character, but we are at the end of the
677        subject, remember the fact for use when testing for a partial match. */
678    
679        if (clen == 0 && poptable[codevalue] != 0)
680          could_continue = TRUE;
681    
682      /* If this opcode is followed by an inline character, load it. It is      /* If this opcode is followed by an inline character, load it. It is
683      tempting to test for the presence of a subject character here, but that      tempting to test for the presence of a subject character here, but that
684      is wrong, because sometimes zero repetitions of the subject are      is wrong, because sometimes zero repetitions of the subject are
# Line 603  for (;;) Line 725  for (;;)
725    
726      switch (codevalue)      switch (codevalue)
727        {        {
728    /* ========================================================================== */
729          /* These cases are never obeyed. This is a fudge that causes a compile-
730          time error if the vectors coptable or poptable, which are indexed by
731          opcode, are not the correct length. It seems to be the only way to do
732          such a check at compile time, as the sizeof() operator does not work
733          in the C preprocessor. */
734    
735          case OP_TABLE_LENGTH:
736          case OP_TABLE_LENGTH +
737            ((sizeof(coptable) == OP_TABLE_LENGTH) &&
738             (sizeof(poptable) == OP_TABLE_LENGTH)):
739          break;
740    
741  /* ========================================================================== */  /* ========================================================================== */
742        /* Reached a closing bracket. If not at the end of the pattern, carry        /* Reached a closing bracket. If not at the end of the pattern, carry
743        on with the next opcode. Otherwise, unless we have an empty string and        on with the next opcode. For repeating opcodes, also add the repeat
744        PCRE_NOTEMPTY is set, save the match data, shifting up all previous        state. Note that KETRPOS will always be encountered at the end of the
745          subpattern, because the possessive subpattern repeats are always handled
746          using recursive calls. Thus, it never adds any new states.
747    
748          At the end of the (sub)pattern, unless we have an empty string and
749          PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
750          start of the subject, save the match data, shifting up all previous
751        matches so we always have the longest first. */        matches so we always have the longest first. */
752    
753        case OP_KET:        case OP_KET:
754        case OP_KETRMIN:        case OP_KETRMIN:
755        case OP_KETRMAX:        case OP_KETRMAX:
756          case OP_KETRPOS:
757        if (code != end_code)        if (code != end_code)
758          {          {
759          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);          ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
# Line 621  for (;;) Line 762  for (;;)
762            ADD_ACTIVE(state_offset - GET(code, 1), 0);            ADD_ACTIVE(state_offset - GET(code, 1), 0);
763            }            }
764          }          }
765        else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)        else
766          {          {
767          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;          if (ptr > current_subject ||
768            else if (match_count > 0 && ++match_count * 2 >= offsetcount)              ((md->moptions & PCRE_NOTEMPTY) == 0 &&
769              match_count = 0;                ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
770          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;                  current_subject > start_subject + md->start_offset)))
771          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));            {
772          if (offsetcount >= 2)            if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
773            {              else if (match_count > 0 && ++match_count * 2 > offsetcount)
774            offsets[0] = current_subject - start_subject;                match_count = 0;
775            offsets[1] = ptr - start_subject;            count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
776            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,            if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
777              offsets[1] - offsets[0], current_subject));            if (offsetcount >= 2)
778            }              {
779          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)              offsets[0] = (int)(current_subject - start_subject);
780            {              offsets[1] = (int)(ptr - start_subject);
781            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"              DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
782              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,                offsets[1] - offsets[0], current_subject));
783              match_count, rlevel*2-2, SP));              }
784            return match_count;            if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
785                {
786                DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
787                  "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
788                  match_count, rlevel*2-2, SP));
789                return match_count;
790                }
791            }            }
792          }          }
793        break;        break;
# Line 652  for (;;) Line 799  for (;;)
799        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
800        case OP_ALT:        case OP_ALT:
801        do { code += GET(code, 1); } while (*code == OP_ALT);        do { code += GET(code, 1); } while (*code == OP_ALT);
802        ADD_ACTIVE(code - start_code, 0);        ADD_ACTIVE((int)(code - start_code), 0);
803        break;        break;
804    
805        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 660  for (;;) Line 807  for (;;)
807        case OP_SBRA:        case OP_SBRA:
808        do        do
809          {          {
810          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
811          code += GET(code, 1);          code += GET(code, 1);
812          }          }
813        while (*code == OP_ALT);        while (*code == OP_ALT);
# Line 669  for (;;) Line 816  for (;;)
816        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
817        case OP_CBRA:        case OP_CBRA:
818        case OP_SCBRA:        case OP_SCBRA:
819        ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);        ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE),  0);
820        code += GET(code, 1);        code += GET(code, 1);
821        while (*code == OP_ALT)        while (*code == OP_ALT)
822          {          {
823          ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
824          code += GET(code, 1);          code += GET(code, 1);
825          }          }
826        break;        break;
# Line 684  for (;;) Line 831  for (;;)
831        ADD_ACTIVE(state_offset + 1, 0);        ADD_ACTIVE(state_offset + 1, 0);
832        code += 1 + GET(code, 2);        code += 1 + GET(code, 2);
833        while (*code == OP_ALT) code += GET(code, 1);        while (*code == OP_ALT) code += GET(code, 1);
834        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
835          break;
836    
837          /*-----------------------------------------------------------------*/
838          case OP_SKIPZERO:
839          code += 1 + GET(code, 2);
840          while (*code == OP_ALT) code += GET(code, 1);
841          ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
842        break;        break;
843    
844        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
845        case OP_CIRC:        case OP_CIRC:
846        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||        if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
           ((ims & PCRE_MULTILINE) != 0 &&  
             ptr != end_subject &&  
             WAS_NEWLINE(ptr)))  
847          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
848        break;        break;
849    
850        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
851        case OP_EOD:        case OP_CIRCM:
852        if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }        if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
853              (ptr != end_subject && WAS_NEWLINE(ptr)))
854            { ADD_ACTIVE(state_offset + 1, 0); }
855        break;        break;
856    
857        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
858        case OP_OPT:        case OP_EOD:
859        ims = code[1];        if (ptr >= end_subject)
860        ADD_ACTIVE(state_offset + 2, 0);          {
861            if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
862              could_continue = TRUE;
863            else { ADD_ACTIVE(state_offset + 1, 0); }
864            }
865        break;        break;
866    
867        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
# Line 726  for (;;) Line 883  for (;;)
883    
884        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
885        case OP_ANY:        case OP_ANY:
886        if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))        if (clen > 0 && !IS_NEWLINE(ptr))
887            { ADD_NEW(state_offset + 1, 0); }
888          break;
889    
890          /*-----------------------------------------------------------------*/
891          case OP_ALLANY:
892          if (clen > 0)
893          { ADD_NEW(state_offset + 1, 0); }          { ADD_NEW(state_offset + 1, 0); }
894        break;        break;
895    
896        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
897        case OP_EODN:        case OP_EODN:
898        if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
899            could_continue = TRUE;
900          else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
901          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
902        break;        break;
903    
# Line 740  for (;;) Line 905  for (;;)
905        case OP_DOLL:        case OP_DOLL:
906        if ((md->moptions & PCRE_NOTEOL) == 0)        if ((md->moptions & PCRE_NOTEOL) == 0)
907          {          {
908          if (clen == 0 ||          if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
909              (IS_NEWLINE(ptr) &&            could_continue = TRUE;
910                 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)          else if (clen == 0 ||
911                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
912                   (ptr == end_subject - md->nllen)
913              ))              ))
914            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
915          }          }
916        else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))        break;
917    
918          /*-----------------------------------------------------------------*/
919          case OP_DOLLM:
920          if ((md->moptions & PCRE_NOTEOL) == 0)
921            {
922            if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
923              could_continue = TRUE;
924            else if (clen == 0 ||
925                ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
926              { ADD_ACTIVE(state_offset + 1, 0); }
927            }
928          else if (IS_NEWLINE(ptr))
929          { ADD_ACTIVE(state_offset + 1, 0); }          { ADD_ACTIVE(state_offset + 1, 0); }
930        break;        break;
931    
# Line 778  for (;;) Line 957  for (;;)
957          if (ptr > start_subject)          if (ptr > start_subject)
958            {            {
959            const uschar *temp = ptr - 1;            const uschar *temp = ptr - 1;
960              if (temp < md->start_used_ptr) md->start_used_ptr = temp;
961  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
962            if (utf8) BACKCHAR(temp);            if (utf8) BACKCHAR(temp);
963  #endif  #endif
964            GETCHARTEST(d, temp);            GETCHARTEST(d, temp);
965    #ifdef SUPPORT_UCP
966              if ((md->poptions & PCRE_UCP) != 0)
967                {
968                if (d == '_') left_word = TRUE; else
969                  {
970                  int cat = UCD_CATEGORY(d);
971                  left_word = (cat == ucp_L || cat == ucp_N);
972                  }
973                }
974              else
975    #endif
976            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;            left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
977            }            }
978          else left_word = 0;          else left_word = FALSE;
979    
980          if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;          if (clen > 0)
981            else right_word = 0;            {
982    #ifdef SUPPORT_UCP
983              if ((md->poptions & PCRE_UCP) != 0)
984                {
985                if (c == '_') right_word = TRUE; else
986                  {
987                  int cat = UCD_CATEGORY(c);
988                  right_word = (cat == ucp_L || cat == ucp_N);
989                  }
990                }
991              else
992    #endif
993              right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
994              }
995            else right_word = FALSE;
996    
997          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))          if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
998            { ADD_ACTIVE(state_offset + 1, 0); }            { ADD_ACTIVE(state_offset + 1, 0); }
# Line 806  for (;;) Line 1011  for (;;)
1011        if (clen > 0)        if (clen > 0)
1012          {          {
1013          BOOL OK;          BOOL OK;
1014          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1015          switch(code[1])          switch(code[1])
1016            {            {
1017            case PT_ANY:            case PT_ANY:
# Line 814  for (;;) Line 1019  for (;;)
1019            break;            break;
1020    
1021            case PT_LAMP:            case PT_LAMP:
1022            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1023                   prop->chartype == ucp_Lt;
1024            break;            break;
1025    
1026            case PT_GC:            case PT_GC:
1027            OK = category == code[2];            OK = _pcre_ucp_gentype[prop->chartype] == code[2];
1028            break;            break;
1029    
1030            case PT_PC:            case PT_PC:
1031            OK = chartype == code[2];            OK = prop->chartype == code[2];
1032            break;            break;
1033    
1034            case PT_SC:            case PT_SC:
1035            OK = script == code[2];            OK = prop->script == code[2];
1036              break;
1037    
1038              /* These are specials for combination cases. */
1039    
1040              case PT_ALNUM:
1041              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1042                   _pcre_ucp_gentype[prop->chartype] == ucp_N;
1043              break;
1044    
1045              case PT_SPACE:    /* Perl space */
1046              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1047                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1048              break;
1049    
1050              case PT_PXSPACE:  /* POSIX space */
1051              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1052                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1053                   c == CHAR_FF || c == CHAR_CR;
1054              break;
1055    
1056              case PT_WORD:
1057              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1058                   _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1059                   c == CHAR_UNDERSCORE;
1060            break;            break;
1061    
1062            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 846  for (;;) Line 1076  for (;;)
1076  /* ========================================================================== */  /* ========================================================================== */
1077        /* These opcodes likewise inspect the subject character, but have an        /* These opcodes likewise inspect the subject character, but have an
1078        argument that is not a data character. It is one of these opcodes:        argument that is not a data character. It is one of these opcodes:
1079        OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,        OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
1080        OP_NOT_WORDCHAR. The value is loaded into d. */        OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1081    
1082        case OP_TYPEPLUS:        case OP_TYPEPLUS:
1083        case OP_TYPEMINPLUS:        case OP_TYPEMINPLUS:
# Line 858  for (;;) Line 1088  for (;;)
1088          {          {
1089          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1090              (c < 256 &&              (c < 256 &&
1091                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1092                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1093            {            {
1094            if (count > 0 && codevalue == OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_TYPEPOSPLUS)
# Line 884  for (;;) Line 1111  for (;;)
1111          {          {
1112          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1113              (c < 256 &&              (c < 256 &&
1114                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1115                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1116            {            {
1117            if (codevalue == OP_TYPEPOSQUERY)            if (codevalue == OP_TYPEPOSQUERY)
# Line 909  for (;;) Line 1133  for (;;)
1133          {          {
1134          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1135              (c < 256 &&              (c < 256 &&
1136                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1137                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1138            {            {
1139            if (codevalue == OP_TYPEPOSSTAR)            if (codevalue == OP_TYPEPOSSTAR)
# Line 932  for (;;) Line 1153  for (;;)
1153          {          {
1154          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1155              (c < 256 &&              (c < 256 &&
1156                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1157                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1158            {            {
1159            if (++count >= GET2(code, 1))            if (++count >= GET2(code, 1))
# Line 956  for (;;) Line 1174  for (;;)
1174          {          {
1175          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||          if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1176              (c < 256 &&              (c < 256 &&
1177                (d != OP_ANY ||                (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                (ims & PCRE_DOTALL) != 0 ||  
                !IS_NEWLINE(ptr)  
               ) &&  
1178                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))                ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1179            {            {
1180            if (codevalue == OP_TYPEPOSUPTO)            if (codevalue == OP_TYPEPOSUPTO)
# Line 990  for (;;) Line 1205  for (;;)
1205        if (clen > 0)        if (clen > 0)
1206          {          {
1207          BOOL OK;          BOOL OK;
1208          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1209          switch(code[2])          switch(code[2])
1210            {            {
1211            case PT_ANY:            case PT_ANY:
# Line 998  for (;;) Line 1213  for (;;)
1213            break;            break;
1214    
1215            case PT_LAMP:            case PT_LAMP:
1216            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1217                prop->chartype == ucp_Lt;
1218            break;            break;
1219    
1220            case PT_GC:            case PT_GC:
1221            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1222            break;            break;
1223    
1224            case PT_PC:            case PT_PC:
1225            OK = chartype == code[3];            OK = prop->chartype == code[3];
1226            break;            break;
1227    
1228            case PT_SC:            case PT_SC:
1229            OK = script == code[3];            OK = prop->script == code[3];
1230              break;
1231    
1232              /* These are specials for combination cases. */
1233    
1234              case PT_ALNUM:
1235              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1236                   _pcre_ucp_gentype[prop->chartype] == ucp_N;
1237              break;
1238    
1239              case PT_SPACE:    /* Perl space */
1240              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1241                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1242              break;
1243    
1244              case PT_PXSPACE:  /* POSIX space */
1245              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1246                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1247                   c == CHAR_FF || c == CHAR_CR;
1248              break;
1249    
1250              case PT_WORD:
1251              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1252                   _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1253                   c == CHAR_UNDERSCORE;
1254            break;            break;
1255    
1256            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1039  for (;;) Line 1279  for (;;)
1279        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:        case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1280        count = current_state->count;  /* Already matched */        count = current_state->count;  /* Already matched */
1281        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }        if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1282        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1283          {          {
1284          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1285          int ncount = 0;          int ncount = 0;
# Line 1053  for (;;) Line 1293  for (;;)
1293            int nd;            int nd;
1294            int ndlen = 1;            int ndlen = 1;
1295            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1296            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1297            ncount++;            ncount++;
1298            nptr += ndlen;            nptr += ndlen;
1299            }            }
# Line 1074  for (;;) Line 1314  for (;;)
1314          int ncount = 0;          int ncount = 0;
1315          switch (c)          switch (c)
1316            {            {
           case 0x000d:  
           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;  
           /* Fall through */  
           case 0x000a:  
1317            case 0x000b:            case 0x000b:
1318            case 0x000c:            case 0x000c:
1319            case 0x0085:            case 0x0085:
1320            case 0x2028:            case 0x2028:
1321            case 0x2029:            case 0x2029:
1322              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1323              goto ANYNL01;
1324    
1325              case 0x000d:
1326              if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1327              /* Fall through */
1328    
1329              ANYNL01:
1330              case 0x000a:
1331            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)            if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1332              {              {
1333              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1091  for (;;) Line 1336  for (;;)
1336            count++;            count++;
1337            ADD_NEW_DATA(-state_offset, count, ncount);            ADD_NEW_DATA(-state_offset, count, ncount);
1338            break;            break;
1339    
1340            default:            default:
1341            break;            break;
1342            }            }
# Line 1206  for (;;) Line 1452  for (;;)
1452        if (clen > 0)        if (clen > 0)
1453          {          {
1454          BOOL OK;          BOOL OK;
1455          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1456          switch(code[2])          switch(code[2])
1457            {            {
1458            case PT_ANY:            case PT_ANY:
# Line 1214  for (;;) Line 1460  for (;;)
1460            break;            break;
1461    
1462            case PT_LAMP:            case PT_LAMP:
1463            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1464                prop->chartype == ucp_Lt;
1465            break;            break;
1466    
1467            case PT_GC:            case PT_GC:
1468            OK = category == code[3];            OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1469            break;            break;
1470    
1471            case PT_PC:            case PT_PC:
1472            OK = chartype == code[3];            OK = prop->chartype == code[3];
1473            break;            break;
1474    
1475            case PT_SC:            case PT_SC:
1476            OK = script == code[3];            OK = prop->script == code[3];
1477              break;
1478    
1479              /* These are specials for combination cases. */
1480    
1481              case PT_ALNUM:
1482              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1483                   _pcre_ucp_gentype[prop->chartype] == ucp_N;
1484              break;
1485    
1486              case PT_SPACE:    /* Perl space */
1487              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1488                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1489              break;
1490    
1491              case PT_PXSPACE:  /* POSIX space */
1492              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1493                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1494                   c == CHAR_FF || c == CHAR_CR;
1495              break;
1496    
1497              case PT_WORD:
1498              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1499                   _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1500                   c == CHAR_UNDERSCORE;
1501            break;            break;
1502    
1503            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1264  for (;;) Line 1535  for (;;)
1535        QS2:        QS2:
1536    
1537        ADD_ACTIVE(state_offset + 2, 0);        ADD_ACTIVE(state_offset + 2, 0);
1538        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1539          {          {
1540          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1541          int ncount = 0;          int ncount = 0;
# Line 1279  for (;;) Line 1550  for (;;)
1550            int nd;            int nd;
1551            int ndlen = 1;            int ndlen = 1;
1552            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1553            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1554            ncount++;            ncount++;
1555            nptr += ndlen;            nptr += ndlen;
1556            }            }
# Line 1307  for (;;) Line 1578  for (;;)
1578          int ncount = 0;          int ncount = 0;
1579          switch (c)          switch (c)
1580            {            {
           case 0x000d:  
           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;  
           /* Fall through */  
           case 0x000a:  
1581            case 0x000b:            case 0x000b:
1582            case 0x000c:            case 0x000c:
1583            case 0x0085:            case 0x0085:
1584            case 0x2028:            case 0x2028:
1585            case 0x2029:            case 0x2029:
1586              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1587              goto ANYNL02;
1588    
1589              case 0x000d:
1590              if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1591              /* Fall through */
1592    
1593              ANYNL02:
1594              case 0x000a:
1595            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1596                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)                codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1597              {              {
# Line 1324  for (;;) Line 1600  for (;;)
1600              }              }
1601            ADD_NEW_DATA(-(state_offset + count), 0, ncount);            ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1602            break;            break;
1603    
1604            default:            default:
1605            break;            break;
1606            }            }
# Line 1447  for (;;) Line 1724  for (;;)
1724        if (clen > 0)        if (clen > 0)
1725          {          {
1726          BOOL OK;          BOOL OK;
1727          int category = _pcre_ucp_findprop(c, &chartype, &script);          const ucd_record * prop = GET_UCD(c);
1728          switch(code[4])          switch(code[4])
1729            {            {
1730            case PT_ANY:            case PT_ANY:
# Line 1455  for (;;) Line 1732  for (;;)
1732            break;            break;
1733    
1734            case PT_LAMP:            case PT_LAMP:
1735            OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;            OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1736                prop->chartype == ucp_Lt;
1737            break;            break;
1738    
1739            case PT_GC:            case PT_GC:
1740            OK = category == code[5];            OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1741            break;            break;
1742    
1743            case PT_PC:            case PT_PC:
1744            OK = chartype == code[5];            OK = prop->chartype == code[5];
1745            break;            break;
1746    
1747            case PT_SC:            case PT_SC:
1748            OK = script == code[5];            OK = prop->script == code[5];
1749              break;
1750    
1751              /* These are specials for combination cases. */
1752    
1753              case PT_ALNUM:
1754              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1755                   _pcre_ucp_gentype[prop->chartype] == ucp_N;
1756              break;
1757    
1758              case PT_SPACE:    /* Perl space */
1759              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1760                   c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1761              break;
1762    
1763              case PT_PXSPACE:  /* POSIX space */
1764              OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1765                   c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1766                   c == CHAR_FF || c == CHAR_CR;
1767              break;
1768    
1769              case PT_WORD:
1770              OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1771                   _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1772                   c == CHAR_UNDERSCORE;
1773            break;            break;
1774    
1775            /* Should never occur, but keep compilers from grumbling. */            /* Should never occur, but keep compilers from grumbling. */
# Line 1500  for (;;) Line 1802  for (;;)
1802        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)        if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1803          { ADD_ACTIVE(state_offset + 4, 0); }          { ADD_ACTIVE(state_offset + 4, 0); }
1804        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
1805        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1806          {          {
1807          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
1808          int ncount = 0;          int ncount = 0;
# Line 1514  for (;;) Line 1816  for (;;)
1816            int nd;            int nd;
1817            int ndlen = 1;            int ndlen = 1;
1818            GETCHARLEN(nd, nptr, ndlen);            GETCHARLEN(nd, nptr, ndlen);
1819            if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(nd) != ucp_M) break;
1820            ncount++;            ncount++;
1821            nptr += ndlen;            nptr += ndlen;
1822            }            }
# Line 1539  for (;;) Line 1841  for (;;)
1841          int ncount = 0;          int ncount = 0;
1842          switch (c)          switch (c)
1843            {            {
           case 0x000d:  
           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;  
           /* Fall through */  
           case 0x000a:  
1844            case 0x000b:            case 0x000b:
1845            case 0x000c:            case 0x000c:
1846            case 0x0085:            case 0x0085:
1847            case 0x2028:            case 0x2028:
1848            case 0x2029:            case 0x2029:
1849              if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1850              goto ANYNL03;
1851    
1852              case 0x000d:
1853              if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1854              /* Fall through */
1855    
1856              ANYNL03:
1857              case 0x000a:
1858            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)            if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1859              {              {
1860              active_count--;           /* Remove non-match possibility */              active_count--;           /* Remove non-match possibility */
# Line 1558  for (;;) Line 1865  for (;;)
1865            else            else
1866              { ADD_NEW_DATA(-state_offset, count, ncount); }              { ADD_NEW_DATA(-state_offset, count, ncount); }
1867            break;            break;
1868    
1869            default:            default:
1870            break;            break;
1871            }            }
# Line 1673  for (;;) Line 1981  for (;;)
1981        break;        break;
1982    
1983        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
1984        case OP_CHARNC:        case OP_CHARI:
1985        if (clen == 0) break;        if (clen == 0) break;
1986    
1987  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 1688  for (;;) Line 1996  for (;;)
1996            other case of the character. */            other case of the character. */
1997    
1998  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1999            othercase = _pcre_ucp_othercase(c);            othercase = UCD_OTHERCASE(c);
2000  #else  #else
2001            othercase = NOTACHAR;            othercase = NOTACHAR;
2002  #endif  #endif
# Line 1713  for (;;) Line 2021  for (;;)
2021        to wait for them to pass before continuing. */        to wait for them to pass before continuing. */
2022    
2023        case OP_EXTUNI:        case OP_EXTUNI:
2024        if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)        if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2025          {          {
2026          const uschar *nptr = ptr + clen;          const uschar *nptr = ptr + clen;
2027          int ncount = 0;          int ncount = 0;
# Line 1721  for (;;) Line 2029  for (;;)
2029            {            {
2030            int nclen = 1;            int nclen = 1;
2031            GETCHARLEN(c, nptr, nclen);            GETCHARLEN(c, nptr, nclen);
2032            if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;            if (UCD_CATEGORY(c) != ucp_M) break;
2033            ncount++;            ncount++;
2034            nptr += nclen;            nptr += nclen;
2035            }            }
# Line 1738  for (;;) Line 2046  for (;;)
2046        case OP_ANYNL:        case OP_ANYNL:
2047        if (clen > 0) switch(c)        if (clen > 0) switch(c)
2048          {          {
         case 0x000a:  
2049          case 0x000b:          case 0x000b:
2050          case 0x000c:          case 0x000c:
2051          case 0x0085:          case 0x0085:
2052          case 0x2028:          case 0x2028:
2053          case 0x2029:          case 0x2029:
2054            if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2055    
2056            case 0x000a:
2057          ADD_NEW(state_offset + 1, 0);          ADD_NEW(state_offset + 1, 0);
2058          break;          break;
2059    
2060          case 0x000d:          case 0x000d:
2061          if (ptr + 1 < end_subject && ptr[1] == 0x0a)          if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2062            {            {
# Line 1856  for (;;) Line 2167  for (;;)
2167        break;        break;
2168    
2169        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2170        /* Match a negated single character. This is only used for one-byte        /* Match a negated single character casefully. This is only used for
2171        characters, that is, we know that d < 256. The character we are        one-byte characters, that is, we know that d < 256. The character we are
2172        checking (c) can be multibyte. */        checking (c) can be multibyte. */
2173    
2174        case OP_NOT:        case OP_NOT:
2175        if (clen > 0)        if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
         {  
         unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;  
         if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }  
         }  
2176        break;        break;
2177    
2178        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2179          /* Match a negated single character caselessly. This is only used for
2180          one-byte characters, that is, we know that d < 256. The character we are
2181          checking (c) can be multibyte. */
2182    
2183          case OP_NOTI:
2184          if (clen > 0 && c != d && c != fcc[d])
2185            { ADD_NEW(state_offset + dlen + 1, 0); }
2186          break;
2187    
2188          /*-----------------------------------------------------------------*/
2189          case OP_PLUSI:
2190          case OP_MINPLUSI:
2191          case OP_POSPLUSI:
2192          case OP_NOTPLUSI:
2193          case OP_NOTMINPLUSI:
2194          case OP_NOTPOSPLUSI:
2195          caseless = TRUE;
2196          codevalue -= OP_STARI - OP_STAR;
2197    
2198          /* Fall through */
2199        case OP_PLUS:        case OP_PLUS:
2200        case OP_MINPLUS:        case OP_MINPLUS:
2201        case OP_POSPLUS:        case OP_POSPLUS:
# Line 1880  for (;;) Line 2207  for (;;)
2207        if (clen > 0)        if (clen > 0)
2208          {          {
2209          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2210          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2211            {            {
2212  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2213            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2214              {              {
2215  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2216              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2217  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2218              }              }
2219            else            else
# Line 1908  for (;;) Line 2235  for (;;)
2235        break;        break;
2236    
2237        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2238          case OP_QUERYI:
2239          case OP_MINQUERYI:
2240          case OP_POSQUERYI:
2241          case OP_NOTQUERYI:
2242          case OP_NOTMINQUERYI:
2243          case OP_NOTPOSQUERYI:
2244          caseless = TRUE;
2245          codevalue -= OP_STARI - OP_STAR;
2246          /* Fall through */
2247        case OP_QUERY:        case OP_QUERY:
2248        case OP_MINQUERY:        case OP_MINQUERY:
2249        case OP_POSQUERY:        case OP_POSQUERY:
# Line 1918  for (;;) Line 2254  for (;;)
2254        if (clen > 0)        if (clen > 0)
2255          {          {
2256          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2257          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2258            {            {
2259  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2260            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2261              {              {
2262  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2263              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2264  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2265              }              }
2266            else            else
# Line 1944  for (;;) Line 2280  for (;;)
2280        break;        break;
2281    
2282        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2283          case OP_STARI:
2284          case OP_MINSTARI:
2285          case OP_POSSTARI:
2286          case OP_NOTSTARI:
2287          case OP_NOTMINSTARI:
2288          case OP_NOTPOSSTARI:
2289          caseless = TRUE;
2290          codevalue -= OP_STARI - OP_STAR;
2291          /* Fall through */
2292        case OP_STAR:        case OP_STAR:
2293        case OP_MINSTAR:        case OP_MINSTAR:
2294        case OP_POSSTAR:        case OP_POSSTAR:
# Line 1954  for (;;) Line 2299  for (;;)
2299        if (clen > 0)        if (clen > 0)
2300          {          {
2301          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2302          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2303            {            {
2304  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2305            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2306              {              {
2307  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2308              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2309  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2310              }              }
2311            else            else
# Line 1980  for (;;) Line 2325  for (;;)
2325        break;        break;
2326    
2327        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2328          case OP_EXACTI:
2329          case OP_NOTEXACTI:
2330          caseless = TRUE;
2331          codevalue -= OP_STARI - OP_STAR;
2332          /* Fall through */
2333        case OP_EXACT:        case OP_EXACT:
2334        case OP_NOTEXACT:        case OP_NOTEXACT:
2335        count = current_state->count;  /* Number already matched */        count = current_state->count;  /* Number already matched */
2336        if (clen > 0)        if (clen > 0)
2337          {          {
2338          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2339          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2340            {            {
2341  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2342            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2343              {              {
2344  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2345              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2346  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2347              }              }
2348            else            else
# Line 2010  for (;;) Line 2360  for (;;)
2360        break;        break;
2361    
2362        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2363          case OP_UPTOI:
2364          case OP_MINUPTOI:
2365          case OP_POSUPTOI:
2366          case OP_NOTUPTOI:
2367          case OP_NOTMINUPTOI:
2368          case OP_NOTPOSUPTOI:
2369          caseless = TRUE;
2370          codevalue -= OP_STARI - OP_STAR;
2371          /* Fall through */
2372        case OP_UPTO:        case OP_UPTO:
2373        case OP_MINUPTO:        case OP_MINUPTO:
2374        case OP_POSUPTO:        case OP_POSUPTO:
# Line 2021  for (;;) Line 2380  for (;;)
2380        if (clen > 0)        if (clen > 0)
2381          {          {
2382          unsigned int otherd = NOTACHAR;          unsigned int otherd = NOTACHAR;
2383          if ((ims & PCRE_CASELESS) != 0)          if (caseless)
2384            {            {
2385  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2386            if (utf8 && d >= 128)            if (utf8 && d >= 128)
2387              {              {
2388  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2389              otherd = _pcre_ucp_othercase(d);              otherd = UCD_OTHERCASE(d);
2390  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2391              }              }
2392            else            else
# Line 2088  for (;;) Line 2447  for (;;)
2447          points to the byte after the end of the class. If there is a          points to the byte after the end of the class. If there is a
2448          quantifier, this is where it will be. */          quantifier, this is where it will be. */
2449    
2450          next_state_offset = ecode - start_code;          next_state_offset = (int)(ecode - start_code);
2451    
2452          switch (*ecode)          switch (*ecode)
2453            {            {
# Line 2135  for (;;) Line 2494  for (;;)
2494    
2495  /* ========================================================================== */  /* ========================================================================== */
2496        /* These are the opcodes for fancy brackets of various kinds. We have        /* These are the opcodes for fancy brackets of various kinds. We have
2497        to use recursion in order to handle them. */        to use recursion in order to handle them. The "always failing" assertion
2498          (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2499          though the other "backtracking verbs" are not supported. */
2500    
2501          case OP_FAIL:
2502          forced_fail++;    /* Count FAILs for multiple states */
2503          break;
2504    
2505        case OP_ASSERT:        case OP_ASSERT:
2506        case OP_ASSERT_NOT:        case OP_ASSERT_NOT:
# Line 2153  for (;;) Line 2518  for (;;)
2518            md,                                   /* static match data */            md,                                   /* static match data */
2519            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2520            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2521            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2522            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2523            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2524            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2525            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2526            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2527    
2528            if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2529          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))          if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2530              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2531          }          }
2532        break;        break;
2533    
# Line 2173  for (;;) Line 2537  for (;;)
2537          {          {
2538          int local_offsets[1000];          int local_offsets[1000];
2539          int local_workspace[1000];          int local_workspace[1000];
2540          int condcode = code[LINK_SIZE+1];          int codelink = GET(code, 1);
2541            int condcode;
2542    
2543            /* Because of the way auto-callout works during compile, a callout item
2544            is inserted between OP_COND and an assertion condition. This does not
2545            happen for the other conditions. */
2546    
2547            if (code[LINK_SIZE+1] == OP_CALLOUT)
2548              {
2549              rrc = 0;
2550              if (pcre_callout != NULL)
2551                {
2552                pcre_callout_block cb;
2553                cb.version          = 1;   /* Version 1 of the callout block */
2554                cb.callout_number   = code[LINK_SIZE+2];
2555                cb.offset_vector    = offsets;
2556                cb.subject          = (PCRE_SPTR)start_subject;
2557                cb.subject_length   = (int)(end_subject - start_subject);
2558                cb.start_match      = (int)(current_subject - start_subject);
2559                cb.current_position = (int)(ptr - start_subject);
2560                cb.pattern_position = GET(code, LINK_SIZE + 3);
2561                cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2562                cb.capture_top      = 1;
2563                cb.capture_last     = -1;
2564                cb.callout_data     = md->callout_data;
2565                cb.mark             = NULL;   /* No (*MARK) support */
2566                if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2567                }
2568              if (rrc > 0) break;                      /* Fail this thread */
2569              code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
2570              }
2571    
2572            condcode = code[LINK_SIZE+1];
2573    
2574          /* Back reference conditions are not supported */          /* Back reference conditions are not supported */
2575    
2576          if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;          if (condcode == OP_CREF || condcode == OP_NCREF)
2577              return PCRE_ERROR_DFA_UCOND;
2578    
2579          /* The DEFINE condition is always false */          /* The DEFINE condition is always false */
2580    
2581          if (condcode == OP_DEF)          if (condcode == OP_DEF)
2582            {            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);  
           }  
2583    
2584          /* The only supported version of OP_RREF is for the value RREF_ANY,          /* The only supported version of OP_RREF is for the value RREF_ANY,
2585          which means "test if in any recursion". We can't test for specifically          which means "test if in any recursion". We can't test for specifically
2586          recursed groups. */          recursed groups. */
2587    
2588          else if (condcode == OP_RREF)          else if (condcode == OP_RREF || condcode == OP_NRREF)
2589            {            {
2590            int value = GET2(code, LINK_SIZE+2);            int value = GET2(code, LINK_SIZE+2);
2591            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;            if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2592            if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }            if (md->recursive != NULL)
2593              else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2594              else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2595            }            }
2596    
2597          /* Otherwise, the condition is an assertion */          /* Otherwise, the condition is an assertion */
# Line 2212  for (;;) Line 2608  for (;;)
2608              md,                                   /* fixed match data */              md,                                   /* fixed match data */
2609              asscode,                              /* this subexpression's code */              asscode,                              /* this subexpression's code */
2610              ptr,                                  /* where we currently are */              ptr,                                  /* where we currently are */
2611              ptr - start_subject,                  /* start offset */              (int)(ptr - start_subject),           /* start offset */
2612              local_offsets,                        /* offset vector */              local_offsets,                        /* offset vector */
2613              sizeof(local_offsets)/sizeof(int),    /* size of same */              sizeof(local_offsets)/sizeof(int),    /* size of same */
2614              local_workspace,                      /* workspace vector */              local_workspace,                      /* workspace vector */
2615              sizeof(local_workspace)/sizeof(int),  /* size of same */              sizeof(local_workspace)/sizeof(int),  /* size of same */
2616              ims,                                  /* the current ims flags */              rlevel);                              /* function recursion level */
             rlevel,                               /* function recursion level */  
             recursing);                           /* pass on regex recursion */  
2617    
2618              if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2619            if ((rc >= 0) ==            if ((rc >= 0) ==
2620                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))                  (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2621              { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }              { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2622            else            else
2623              { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }              { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2624            }            }
2625          }          }
2626        break;        break;
# Line 2233  for (;;) Line 2628  for (;;)
2628        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2629        case OP_RECURSE:        case OP_RECURSE:
2630          {          {
2631            dfa_recursion_info *ri;
2632          int local_offsets[1000];          int local_offsets[1000];
2633          int local_workspace[1000];          int local_workspace[1000];
2634            const uschar *callpat = start_code + GET(code, 1);
2635            int recno = (callpat == md->start_code)? 0 :
2636              GET2(callpat, 1 + LINK_SIZE);
2637          int rc;          int rc;
2638    
2639          DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,          DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2640            recursing + 1));  
2641            /* Check for repeating a recursion without advancing the subject
2642            pointer. This should catch convoluted mutual recursions. (Some simple
2643            cases are caught at compile time.) */
2644    
2645            for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2646              if (recno == ri->group_num && ptr == ri->subject_position)
2647                return PCRE_ERROR_RECURSELOOP;
2648    
2649            /* Remember this recursion and where we started it so as to
2650            catch infinite loops. */
2651    
2652            new_recursive.group_num = recno;
2653            new_recursive.subject_position = ptr;
2654            new_recursive.prevrec = md->recursive;
2655            md->recursive = &new_recursive;
2656    
2657          rc = internal_dfa_exec(          rc = internal_dfa_exec(
2658            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2659            start_code + GET(code, 1),            /* this subexpression's code */            callpat,                              /* this subexpression's code */
2660            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2661            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2662            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2663            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2664            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2665            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2666            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
2667            rlevel,                               /* function recursion level */  
2668            recursing + 1);                       /* regex recurse level */          md->recursive = new_recursive.prevrec;  /* Done this recursion */
2669    
2670          DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,          DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2671            recursing + 1, rc));            rc));
2672    
2673          /* Ran out of internal offsets */          /* Ran out of internal offsets */
2674    
# Line 2287  for (;;) Line 2701  for (;;)
2701        break;        break;
2702    
2703        /*-----------------------------------------------------------------*/        /*-----------------------------------------------------------------*/
2704          case OP_BRAPOS:
2705          case OP_SBRAPOS:
2706          case OP_CBRAPOS:
2707          case OP_SCBRAPOS:
2708          case OP_BRAPOSZERO:
2709            {
2710            int charcount, matched_count;
2711            const uschar *local_ptr = ptr;
2712            BOOL allow_zero;
2713    
2714            if (codevalue == OP_BRAPOSZERO)
2715              {
2716              allow_zero = TRUE;
2717              codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2718              }
2719            else allow_zero = FALSE;
2720    
2721            /* Loop to match the subpattern as many times as possible as if it were
2722            a complete pattern. */
2723    
2724            for (matched_count = 0;; matched_count++)
2725              {
2726              int local_offsets[2];
2727              int local_workspace[1000];
2728    
2729              int rc = internal_dfa_exec(
2730                md,                                   /* fixed match data */
2731                code,                                 /* this subexpression's code */
2732                local_ptr,                            /* where we currently are */
2733                (int)(ptr - start_subject),           /* start offset */
2734                local_offsets,                        /* offset vector */
2735                sizeof(local_offsets)/sizeof(int),    /* size of same */
2736                local_workspace,                      /* workspace vector */
2737                sizeof(local_workspace)/sizeof(int),  /* size of same */
2738                rlevel);                              /* function recursion level */
2739    
2740              /* Failed to match */
2741    
2742              if (rc < 0)
2743                {
2744                if (rc != PCRE_ERROR_NOMATCH) return rc;
2745                break;
2746                }
2747    
2748              /* Matched: break the loop if zero characters matched. */
2749    
2750              charcount = local_offsets[1] - local_offsets[0];
2751              if (charcount == 0) break;
2752              local_ptr += charcount;    /* Advance temporary position ptr */
2753              }
2754    
2755            /* At this point we have matched the subpattern matched_count
2756            times, and local_ptr is pointing to the character after the end of the
2757            last match. */
2758    
2759            if (matched_count > 0 || allow_zero)
2760              {
2761              const uschar *end_subpattern = code;
2762              int next_state_offset;
2763    
2764              do { end_subpattern += GET(end_subpattern, 1); }
2765                while (*end_subpattern == OP_ALT);
2766              next_state_offset =
2767                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2768    
2769              /* Optimization: if there are no more active states, and there
2770              are no new states yet set up, then skip over the subject string
2771              right here, to save looping. Otherwise, set up the new state to swing
2772              into action when the end of the matched substring is reached. */
2773    
2774              if (i + 1 >= active_count && new_count == 0)
2775                {
2776                ptr = local_ptr;
2777                clen = 0;
2778                ADD_NEW(next_state_offset, 0);
2779                }
2780              else
2781                {
2782                const uschar *p = ptr;
2783                const uschar *pp = local_ptr;
2784                charcount = pp - p;
2785                while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2786                ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2787                }
2788              }
2789            }
2790          break;
2791    
2792          /*-----------------------------------------------------------------*/
2793        case OP_ONCE:        case OP_ONCE:
2794          case OP_ONCE_NC:
2795          {          {
2796          int local_offsets[2];          int local_offsets[2];
2797          int local_workspace[1000];          int local_workspace[1000];
# Line 2296  for (;;) Line 2800  for (;;)
2800            md,                                   /* fixed match data */            md,                                   /* fixed match data */
2801            code,                                 /* this subexpression's code */            code,                                 /* this subexpression's code */
2802            ptr,                                  /* where we currently are */            ptr,                                  /* where we currently are */
2803            ptr - start_subject,                  /* start offset */            (int)(ptr - start_subject),           /* start offset */
2804            local_offsets,                        /* offset vector */            local_offsets,                        /* offset vector */
2805            sizeof(local_offsets)/sizeof(int),    /* size of same */            sizeof(local_offsets)/sizeof(int),    /* size of same */
2806            local_workspace,                      /* workspace vector */            local_workspace,                      /* workspace vector */
2807            sizeof(local_workspace)/sizeof(int),  /* size of same */            sizeof(local_workspace)/sizeof(int),  /* size of same */
2808            ims,                                  /* the current ims flags */            rlevel);                              /* function recursion level */
           rlevel,                               /* function recursion level */  
           recursing);                           /* pass on regex recursion */  
2809    
2810          if (rc >= 0)          if (rc >= 0)
2811            {            {
# Line 2313  for (;;) Line 2815  for (;;)
2815    
2816            do { end_subpattern += GET(end_subpattern, 1); }            do { end_subpattern += GET(end_subpattern, 1); }
2817              while (*end_subpattern == OP_ALT);              while (*end_subpattern == OP_ALT);
2818            next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;            next_state_offset =
2819                (int)(end_subpattern - start_code + LINK_SIZE + 1);
2820    
2821            /* If the end of this subpattern is KETRMAX or KETRMIN, we must            /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2822            arrange for the repeat state also to be added to the relevant list.            arrange for the repeat state also to be added to the relevant list.
# Line 2321  for (;;) Line 2824  for (;;)
2824    
2825            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||            repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2826                                   *end_subpattern == OP_KETRMIN)?                                   *end_subpattern == OP_KETRMIN)?
2827              end_subpattern - start_code - GET(end_subpattern, 1) : -1;              (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2828    
2829            /* If we have matched an empty string, add the next state at the            /* If we have matched an empty string, add the next state at the
2830            current character pointer. This is important so that the duplicate            current character pointer. This is important so that the duplicate
# Line 2336  for (;;) Line 2839  for (;;)
2839            /* Optimization: if there are no more active states, and there            /* Optimization: if there are no more active states, and there
2840            are no new states yet set up, then skip over the subject string            are no new states yet set up, then skip over the subject string
2841            right here, to save looping. Otherwise, set up the new state to swing            right here, to save looping. Otherwise, set up the new state to swing
2842            into action when the end of the substring is reached. */            into action when the end of the matched substring is reached. */
2843    
2844            else if (i + 1 >= active_count && new_count == 0)            else if (i + 1 >= active_count && new_count == 0)
2845              {              {
# Line 2366  for (;;) Line 2869  for (;;)
2869              if (repeat_state_offset >= 0)              if (repeat_state_offset >= 0)
2870                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }                { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2871              }              }
   
2872            }            }
2873          else if (rc != PCRE_ERROR_NOMATCH) return rc;          else if (rc != PCRE_ERROR_NOMATCH) return rc;
2874          }          }
# Line 2377  for (;;) Line 2879  for (;;)
2879        /* Handle callouts */        /* Handle callouts */
2880    
2881        case OP_CALLOUT:        case OP_CALLOUT:
2882          rrc = 0;
2883        if (pcre_callout != NULL)        if (pcre_callout != NULL)
2884          {          {
         int rrc;  
2885          pcre_callout_block cb;          pcre_callout_block cb;
2886          cb.version          = 1;   /* Version 1 of the callout block */          cb.version          = 1;   /* Version 1 of the callout block */
2887          cb.callout_number   = code[1];          cb.callout_number   = code[1];
2888          cb.offset_vector    = offsets;          cb.offset_vector    = offsets;
2889          cb.subject          = (PCRE_SPTR)start_subject;          cb.subject          = (PCRE_SPTR)start_subject;
2890          cb.subject_length   = end_subject - start_subject;          cb.subject_length   = (int)(end_subject - start_subject);
2891          cb.start_match      = current_subject - start_subject;          cb.start_match      = (int)(current_subject - start_subject);
2892          cb.current_position = ptr - start_subject;          cb.current_position = (int)(ptr - start_subject);
2893          cb.pattern_position = GET(code, 2);          cb.pattern_position = GET(code, 2);
2894          cb.next_item_length = GET(code, 2 + LINK_SIZE);          cb.next_item_length = GET(code, 2 + LINK_SIZE);
2895          cb.capture_top      = 1;          cb.capture_top      = 1;
2896          cb.capture_last     = -1;          cb.capture_last     = -1;
2897          cb.callout_data     = md->callout_data;          cb.callout_data     = md->callout_data;
2898            cb.mark             = NULL;   /* No (*MARK) support */
2899          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */          if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }  
2900          }          }
2901          if (rrc == 0)
2902            { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2903        break;        break;
2904    
2905    
# Line 2411  for (;;) Line 2915  for (;;)
2915    /* We have finished the processing at the current subject character. If no    /* We have finished the processing at the current subject character. If no
2916    new states have been set for the next character, we have found all the    new states have been set for the next character, we have found all the
2917    matches that we are going to find. If we are at the top level and partial    matches that we are going to find. If we are at the top level and partial
2918    matching has been requested, check for appropriate conditions. */    matching has been requested, check for appropriate conditions.
2919    
2920      The "forced_fail" variable counts the number of (*F) encountered for the
2921      character. If it is equal to the original active_count (saved in
2922      workspace[1]) it means that (*F) was found on every active state. In this
2923      case we don't want to give a partial match.
2924    
2925      The "could_continue" variable is true if a state could have continued but
2926      for the fact that the end of the subject was reached. */
2927    
2928    if (new_count <= 0)    if (new_count <= 0)
2929      {      {
2930      if (match_count < 0 &&                     /* No matches found */      if (rlevel == 1 &&                               /* Top level, and */
2931          rlevel == 1 &&                         /* Top level match function */          could_continue &&                            /* Some could go on */
2932          (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */          forced_fail != workspace[1] &&               /* Not all forced fail & */
2933            (                                            /* either... */
2934            (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
2935            ||                                           /* or... */
2936            ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
2937             match_count < 0)                            /* no matches */
2938            ) &&                                         /* And... */
2939          ptr >= end_subject &&                  /* Reached end of subject */          ptr >= end_subject &&                  /* Reached end of subject */
2940          ptr > current_subject)                 /* Matched non-empty string */          ptr > md->start_used_ptr)              /* Inspected non-empty string */
2941        {        {
2942        if (offsetcount >= 2)        if (offsetcount >= 2)
2943          {          {
2944          offsets[0] = current_subject - start_subject;          offsets[0] = (int)(md->start_used_ptr - start_subject);
2945          offsets[1] = end_subject - start_subject;          offsets[1] = (int)(end_subject - start_subject);
2946          }          }
2947        match_count = PCRE_ERROR_PARTIAL;        match_count = PCRE_ERROR_PARTIAL;
2948        }        }
# Line 2478  Returns: > 0 => number of match Line 2996  Returns: > 0 => number of match
2996                   < -1 => some kind of unexpected problem                   < -1 => some kind of unexpected problem
2997  */  */
2998    
2999  PCRE_EXP_DEFN int  PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3000  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,  pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3001    const char *subject, int length, int start_offset, int options, int *offsets,    const char *subject, int length, int start_offset, int options, int *offsets,
3002    int offsetcount, int *workspace, int wscount)    int offsetcount, int *workspace, int wscount)
# Line 2509  if (re == NULL || subject == NULL || wor Line 3027  if (re == NULL || subject == NULL || wor
3027     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3028  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;  if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3029  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;  if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3030    if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3031    
3032  /* We need to find the pointer to any study data before we test for byte  /* We need to find the pointer to any study data before we test for byte
3033  flipping, so we scan the extra_data block first. This may set two fields in the  flipping, so we scan the extra_data block first. This may set two fields in the
# Line 2565  md->start_code = (const uschar *)argumen Line 3084  md->start_code = (const uschar *)argumen
3084      re->name_table_offset + re->name_count * re->name_entry_size;      re->name_table_offset + re->name_count * re->name_entry_size;
3085  md->start_subject = (const unsigned char *)subject;  md->start_subject = (const unsigned char *)subject;
3086  md->end_subject = end_subject;  md->end_subject = end_subject;
3087    md->start_offset = start_offset;
3088  md->moptions = options;  md->moptions = options;
3089  md->poptions = re->options;  md->poptions = re->options;
3090    
3091    /* If the BSR option is not set at match time, copy what was set
3092    at compile time. */
3093    
3094    if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3095      {
3096      if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3097        md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3098    #ifdef BSR_ANYCRLF
3099      else md->moptions |= PCRE_BSR_ANYCRLF;
3100    #endif
3101      }
3102    
3103  /* Handle different types of newline. The three bits give eight cases. If  /* Handle different types of newline. The three bits give eight cases. If
3104  nothing is set at run time, whatever was used at compile time applies. */  nothing is set at run time, whatever was used at compile time applies. */
3105    
# Line 2575  switch ((((options & PCRE_NEWLINE_BITS) Line 3107  switch ((((options & PCRE_NEWLINE_BITS)
3107           PCRE_NEWLINE_BITS)           PCRE_NEWLINE_BITS)
3108    {    {
3109    case 0: newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
3110    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3111    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3112    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
3113         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3114    case PCRE_NEWLINE_ANY: newline = -1; break;    case PCRE_NEWLINE_ANY: newline = -1; break;
3115    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;    case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3116    default: return PCRE_ERROR_BADNEWLINE;    default: return PCRE_ERROR_BADNEWLINE;
# Line 2614  back the character offset. */ Line 3146  back the character offset. */
3146  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3147  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3148    {    {
3149    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    int erroroffset;
3150      return PCRE_ERROR_BADUTF8;    int errorcode = _pcre_valid_utf8((uschar *)subject, length, &erroroffset);
3151    if (start_offset > 0 && start_offset < length)    if (errorcode != 0)
3152      {      {
3153      int tb = ((uschar *)subject)[start_offset];      if (offsetcount >= 2)
     if (tb > 127)  
3154        {        {
3155        tb &= 0xc0;        offsets[0] = erroroffset;
3156        if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;        offsets[1] = errorcode;
3157        }        }
3158        return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3159          PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3160      }      }
3161      if (start_offset > 0 && start_offset < length &&
3162            (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
3163        return PCRE_ERROR_BADUTF8_OFFSET;
3164    }    }
3165  #endif  #endif
3166    
# Line 2638  if (md->tables == NULL) md->tables = _pc Line 3174  if (md->tables == NULL) md->tables = _pc
3174  used in a loop when finding where to start. */  used in a loop when finding where to start. */
3175    
3176  lcc = md->tables + lcc_offset;  lcc = md->tables + lcc_offset;
3177  startline = (re->options & PCRE_STARTLINE) != 0;  startline = (re->flags & PCRE_STARTLINE) != 0;
3178  firstline = (re->options & PCRE_FIRSTLINE) != 0;  firstline = (re->options & PCRE_FIRSTLINE) != 0;
3179    
3180  /* Set up the first character to match, if available. The first_byte value is  /* Set up the first character to match, if available. The first_byte value is
# Line 2649  studied, there may be a bitmap of possib Line 3185  studied, there may be a bitmap of possib
3185    
3186  if (!anchored)  if (!anchored)
3187    {    {
3188    if ((re->options & PCRE_FIRSTSET) != 0)    if ((re->flags & PCRE_FIRSTSET) != 0)
3189      {      {
3190      first_byte = re->first_byte & 255;      first_byte = re->first_byte & 255;
3191      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)      if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
# Line 2657  if (!anchored) Line 3193  if (!anchored)
3193      }      }
3194    else    else
3195      {      {
3196      if (startline && study != NULL &&      if (!startline && study != NULL &&
3197           (study->options & PCRE_STUDY_MAPPED) != 0)           (study->flags & PCRE_STUDY_MAPPED) != 0)
3198        start_bits = study->start_bits;        start_bits = study->start_bits;
3199      }      }
3200    }    }
# Line 2666  if (!anchored) Line 3202  if (!anchored)
3202  /* For anchored or unanchored matches, there may be a "last known required  /* For anchored or unanchored matches, there may be a "last known required
3203  character" set. */  character" set. */
3204    
3205  if ((re->options & PCRE_REQCHSET) != 0)  if ((re->flags & PCRE_REQCHSET) != 0)
3206    {    {
3207    req_byte = re->req_byte & 255;    req_byte = re->req_byte & 255;
3208    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;    req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
# Line 2674  if ((re->options & PCRE_REQCHSET) != 0) Line 3210  if ((re->options & PCRE_REQCHSET) != 0)
3210    }    }
3211    
3212  /* Call the main matching function, looping for a non-anchored regex after a  /* Call the main matching function, looping for a non-anchored regex after a
3213  failed match. Unless restarting, optimize by moving to the first match  failed match. If not restarting, perform certain optimizations at the start of
3214  character if possible, when not anchored. Then unless wanting a partial match,  a match. */
 check for a required later character. */  
3215    
3216  for (;;)  for (;;)
3217    {    {
# Line 2686  for (;;) Line 3221  for (;;)
3221      {      {
3222      const uschar *save_end_subject = end_subject;      const uschar *save_end_subject = end_subject;
3223    
3224      /* Advance to a unique first char if possible. If firstline is TRUE, the      /* If firstline is TRUE, the start of the match is constrained to the first
3225      start of the match is constrained to the first line of a multiline string.      line of a multiline string. Implement this by temporarily adjusting
3226      Implement this by temporarily adjusting end_subject so that we stop      end_subject so that we stop scanning at a newline. If the match fails at
3227      scanning at a newline. If the match fails at the newline, later code breaks      the newline, later code breaks this loop. */
     this loop. */  
3228    
3229      if (firstline)      if (firstline)
3230        {        {
3231        const uschar *t = current_subject;        USPTR t = current_subject;
3232    #ifdef SUPPORT_UTF8
3233          if (utf8)
3234            {
3235            while (t < md->end_subject && !IS_NEWLINE(t))
3236              {
3237              t++;
3238              while (t < end_subject && (*t & 0xc0) == 0x80) t++;
3239              }
3240            }
3241          else
3242    #endif
3243        while (t < md->end_subject && !IS_NEWLINE(t)) t++;        while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3244        end_subject = t;        end_subject = t;
3245        }        }
3246    
3247      if (first_byte >= 0)      /* There are some optimizations that avoid running the match if a known
3248        starting point is not found. However, there is an option that disables
3249        these, for testing and for ensuring that all callouts do actually occur.
3250        The option can be set in the regex by (*NO_START_OPT) or passed in
3251        match-time options. */
3252    
3253        if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3254        {        {
3255        if (first_byte_caseless)        /* Advance to a known first byte. */
         while (current_subject < end_subject &&  
                lcc[*current_subject] != first_byte)  
           current_subject++;  
       else  
         while (current_subject < end_subject && *current_subject != first_byte)  
           current_subject++;  
       }  
3256    
3257      /* Or to just after a linebreak for a multiline match if possible */        if (first_byte >= 0)
3258            {
3259            if (first_byte_caseless)
3260              while (current_subject < end_subject &&
3261                     lcc[*current_subject] != first_byte)
3262                current_subject++;
3263            else
3264              while (current_subject < end_subject &&
3265                     *current_subject != first_byte)
3266                current_subject++;
3267            }
3268    
3269      else if (startline)        /* Or to just after a linebreak for a multiline match if possible */
3270        {  
3271        if (current_subject > md->start_subject + start_offset)        else if (startline)
3272          {          {
3273          while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))          if (current_subject > md->start_subject + start_offset)
3274            current_subject++;            {
3275    #ifdef SUPPORT_UTF8
3276              if (utf8)
3277                {
3278                while (current_subject < end_subject &&
3279                       !WAS_NEWLINE(current_subject))
3280                  {
3281                  current_subject++;
3282                  while(current_subject < end_subject &&
3283                        (*current_subject & 0xc0) == 0x80)
3284                    current_subject++;
3285                  }
3286                }
3287              else
3288    #endif
3289              while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3290                current_subject++;
3291    
3292          /* If we have just passed a CR and the newline option is ANY or            /* If we have just passed a CR and the newline option is ANY or
3293          ANYCRLF, and we are now at a LF, advance the match position by one more            ANYCRLF, and we are now at a LF, advance the match position by one
3294          character. */            more character. */
3295    
3296          if (current_subject[-1] == '\r' &&            if (current_subject[-1] == CHAR_CR &&
3297               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&                 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3298               current_subject < end_subject &&                 current_subject < end_subject &&
3299               *current_subject == '\n')                 *current_subject == CHAR_NL)
3300            current_subject++;              current_subject++;
3301              }
3302          }          }
       }  
3303    
3304      /* Or to a non-unique first char after study */        /* Or to a non-unique first char after study */
3305    
3306      else if (start_bits != NULL)        else if (start_bits != NULL)
       {  
       while (current_subject < end_subject)  
3307          {          {
3308          register unsigned int c = *current_subject;          while (current_subject < end_subject)
3309          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;            {
3310              register unsigned int c = *current_subject;
3311              if ((start_bits[c/8] & (1 << (c&7))) == 0)
3312                {
3313                current_subject++;
3314    #ifdef SUPPORT_UTF8
3315                if (utf8)
3316                  while(current_subject < end_subject &&
3317                        (*current_subject & 0xc0) == 0x80) current_subject++;
3318    #endif
3319                }
3320            else break;            else break;
3321              }
3322          }          }
3323        }        }
3324    
3325      /* Restore fudged end_subject */      /* Restore fudged end_subject */
3326    
3327      end_subject = save_end_subject;      end_subject = save_end_subject;
     }  
3328    
3329    /* If req_byte is set, we know that that character must appear in the subject      /* The following two optimizations are disabled for partial matching or if
3330    for the match to succeed. If the first character is set, req_byte must be      disabling is explicitly requested (and of course, by the test above, this
3331    later in the subject; otherwise the test starts at the match point. This      code is not obeyed when restarting after a partial match). */
   optimization can save a huge amount of work in patterns with nested unlimited  
   repeats that aren't going to match. Writing separate code for cased/caseless  
   versions makes it go faster, as does using an autoincrement and backing off  
   on a match.  
   
   HOWEVER: when the subject string is very, very long, searching to its end can  
   take a long time, and give bad performance on quite ordinary patterns. This  
   showed up when somebody was matching /^C/ on a 32-megabyte string... so we  
   don't do this when the string is sufficiently long.  
   
   ALSO: this processing is disabled when partial matching is requested.  
   */  
   
   if (req_byte >= 0 &&  
       end_subject - current_subject < REQ_BYTE_MAX &&  
       (options & PCRE_PARTIAL) == 0)  
     {  
     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);  
3332    
3333      /* We don't need to repeat the search if we haven't yet reached the      if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3334      place we found it at last time. */          (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
   
     if (p > req_byte_ptr)  
3335        {        {
3336        if (req_byte_caseless)        /* If the pattern was studied, a minimum subject length may be set. This
3337          {        is a lower bound; no actual string of that length may actually match the
3338          while (p < end_subject)        pattern. Although the value is, strictly, in characters, we treat it as
3339            {        bytes to avoid spending too much time in this optimization. */
3340            register int pp = *p++;  
3341            if (pp == req_byte || pp == req_byte2) { p--; break; }        if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3342            }            (pcre_uint32)(end_subject - current_subject) < study->minlength)
3343          }          return PCRE_ERROR_NOMATCH;
3344        else  
3345          /* If req_byte is set, we know that that character must appear in the
3346          subject for the match to succeed. If the first character is set, req_byte
3347          must be later in the subject; otherwise the test starts at the match
3348          point. This optimization can save a huge amount of work in patterns with
3349          nested unlimited repeats that aren't going to match. Writing separate
3350          code for cased/caseless versions makes it go faster, as does using an
3351          autoincrement and backing off on a match.
3352    
3353          HOWEVER: when the subject string is very, very long, searching to its end
3354          can take a long time, and give bad performance on quite ordinary
3355          patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3356          string... so we don't do this when the string is sufficiently long. */
3357    
3358          if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3359          {          {
3360          while (p < end_subject)          register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3361    
3362            /* We don't need to repeat the search if we haven't yet reached the
3363            place we found it at last time. */
3364    
3365            if (p > req_byte_ptr)
3366            {            {
3367            if (*p++ == req_byte) { p--; break; }            if (req_byte_caseless)
3368            }              {
3369          }              while (p < end_subject)
3370                  {
3371                  register int pp = *p++;
3372                  if (pp == req_byte || pp == req_byte2) { p--; break; }
3373                  }
3374                }
3375              else
3376                {
3377                while (p < end_subject)
3378                  {
3379                  if (*p++ == req_byte) { p--; break; }
3380                  }
3381                }
3382    
3383        /* If we can't find the required character, break the matching loop,            /* If we can't find the required character, break the matching loop,
3384        which will cause a return of PCRE_ERROR_NOMATCH. */            which will cause a return of PCRE_ERROR_NOMATCH. */
3385    
3386        if (p >= end_subject) break;            if (p >= end_subject) break;
3387    
3388        /* If we have found the required character, save the point where we            /* If we have found the required character, save the point where we
3389        found it, so that we don't search again next time round the loop if            found it, so that we don't search again next time round the loop if
3390        the start hasn't passed this character yet. */            the start hasn't passed this character yet. */
3391    
3392        req_byte_ptr = p;            req_byte_ptr = p;
3393              }
3394            }
3395        }        }
3396      }      }   /* End of optimizations that are done when not restarting */
3397    
3398    /* OK, now we can do the business */    /* OK, now we can do the business */
3399    
3400      md->start_used_ptr = current_subject;
3401      md->recursive = NULL;
3402    
3403    rc = internal_dfa_exec(    rc = internal_dfa_exec(
3404      md,                                /* fixed match data */      md,                                /* fixed match data */
3405      md->start_code,                    /* this subexpression's code */      md->start_code,                    /* this subexpression's code */
# Line 2815  for (;;) Line 3409  for (;;)
3409      offsetcount,                       /* size of same */      offsetcount,                       /* size of same */
3410      workspace,                         /* workspace vector */      workspace,                         /* workspace vector */
3411      wscount,                           /* size of same */      wscount,                           /* size of same */
3412      re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */      0);                                /* function recurse level */
     0,                                 /* function recurse level */  
     0);                                /* regex recurse level */  
3413    
3414    /* Anything other than "no match" means we are done, always; otherwise, carry    /* Anything other than "no match" means we are done, always; otherwise, carry
3415    on only if not anchored. */    on only if not anchored. */
# Line 2836  for (;;) Line 3428  for (;;)
3428      }      }
3429    if (current_subject > end_subject) break;    if (current_subject > end_subject) break;
3430    
3431    /* If we have just passed a CR and the newline option is CRLF or ANY or    /* If we have just passed a CR and we are now at a LF, and the pattern does
3432    ANYCRLF, and we are now at a LF, advance the match position by one more    not contain any explicit matches for \r or \n, and the newline option is CRLF
3433    character. */    or ANY or ANYCRLF, advance the match position by one more character. */
3434    
3435    if (current_subject[-1] == '\r' &&    if (current_subject[-1] == CHAR_CR &&
3436         (md->nltype == NLTYPE_ANY ||        current_subject < end_subject &&
3437          md->nltype == NLTYPE_ANYCRLF ||        *current_subject == CHAR_NL &&
3438          md->nllen == 2) &&        (re->flags & PCRE_HASCRORLF) == 0 &&
3439         current_subject < end_subject &&          (md->nltype == NLTYPE_ANY ||
3440         *current_subject == '\n')           md->nltype == NLTYPE_ANYCRLF ||
3441             md->nllen == 2))
3442      current_subject++;      current_subject++;
3443    
3444    }   /* "Bumpalong" loop */    }   /* "Bumpalong" loop */

Legend:
Removed from v.182  
changed lines
  Added in v.733

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12