/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Diff of /code/trunk/pcre_exec.c

Parent Directory | Revision Log | View Patch

revision 91 by nigel, Sat Feb 24 21:41:34 2007 UTC revision 93 by nigel, Sat Feb 24 21:41:42 2007 UTC
# Line 42  POSSIBILITY OF SUCH DAMAGE. Line 42  POSSIBILITY OF SUCH DAMAGE.
42  pattern matching using an NFA algorithm, trying to mimic Perl as closely as  pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43  possible. There are also some static supporting functions. */  possible. There are also some static supporting functions. */
44    
45  #define NLBLOCK md           /* The block containing newline information */  #define NLBLOCK md             /* Block containing newline information */
46    #define PSSTART start_subject  /* Field containing processed string start */
47    #define PSEND   end_subject    /* Field containing processed string end */
48    
49  #include "pcre_internal.h"  #include "pcre_internal.h"
50    
51    /* The chain of eptrblocks for tail recursions uses memory in stack workspace,
52    obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
53    
54  /* Structure for building a chain of data that actually lives on the  #define EPTR_WORK_SIZE (1000)
 stack, for holding the values of the subject pointer at the start of each  
 subpattern, so as to detect when an empty string has been matched by a  
 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks  
 are on the heap, not on the stack. */  
   
 typedef struct eptrblock {  
   struct eptrblock *epb_prev;  
   USPTR epb_saved_eptr;  
 } eptrblock;  
55    
56  /* Flag bits for the match() function */  /* Flag bits for the match() function */
57    
58  #define match_condassert   0x01    /* Called to check a condition assertion */  #define match_condassert     0x01  /* Called to check a condition assertion */
59  #define match_isgroup      0x02    /* Set if start of bracketed group */  #define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */
60    #define match_tail_recursed  0x04  /* Tail recursive call */
61    
62  /* Non-error returns from the match() function. Error returns are externally  /* Non-error returns from the match() function. Error returns are externally
63  defined PCRE_ERROR_xxx codes, which are all negative. */  defined PCRE_ERROR_xxx codes, which are all negative. */
# Line 101  Returns: nothing Line 98  Returns: nothing
98  static void  static void
99  pchars(const uschar *p, int length, BOOL is_subject, match_data *md)  pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
100  {  {
101  int c;  unsigned int c;
102  if (is_subject && length > md->end_subject - p) length = md->end_subject - p;  if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
103  while (length-- > 0)  while (length-- > 0)
104    if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);    if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
# Line 291  typedef struct heapframe { Line 288  typedef struct heapframe {
288    
289    BOOL Xcur_is_word;    BOOL Xcur_is_word;
290    BOOL Xcondition;    BOOL Xcondition;
   BOOL Xminimize;  
291    BOOL Xprev_is_word;    BOOL Xprev_is_word;
292    
293    unsigned long int Xoriginal_ims;    unsigned long int Xoriginal_ims;
# Line 303  typedef struct heapframe { Line 299  typedef struct heapframe {
299    int Xprop_category;    int Xprop_category;
300    int Xprop_chartype;    int Xprop_chartype;
301    int Xprop_script;    int Xprop_script;
   int *Xprop_test_variable;  
302  #endif  #endif
303    
304    int Xctype;    int Xctype;
305    int Xfc;    unsigned int Xfc;
306    int Xfi;    int Xfi;
307    int Xlength;    int Xlength;
308    int Xmax;    int Xmax;
# Line 340  typedef struct heapframe { Line 335  typedef struct heapframe {
335  *         Match from current position            *  *         Match from current position            *
336  *************************************************/  *************************************************/
337    
338  /* On entry ecode points to the first opcode, and eptr to the first character  /* This function is called recursively in many circumstances. Whenever it
 in the subject string, while eptrb holds the value of eptr at the start of the  
 last bracketed group - used for breaking infinite loops matching zero-length  
 strings. This function is called recursively in many circumstances. Whenever it  
339  returns a negative (error) response, the outer incarnation must also return the  returns a negative (error) response, the outer incarnation must also return the
340  same response.  same response.
341    
# Line 353  performance. Tests using gcc on a SPARC Line 345  performance. Tests using gcc on a SPARC
345  made performance worse.  made performance worse.
346    
347  Arguments:  Arguments:
348     eptr        pointer in subject     eptr        pointer to current character in subject
349     ecode       position in code     ecode       pointer to current position in compiled code
350     offset_top  current top pointer     offset_top  current top pointer
351     md          pointer to "static" info for the match     md          pointer to "static" info for the match
352     ims         current /i, /m, and /s options     ims         current /i, /m, and /s options
# Line 362  Arguments: Line 354  Arguments:
354                   brackets - for testing for empty matches                   brackets - for testing for empty matches
355     flags       can contain     flags       can contain
356                   match_condassert - this is an assertion condition                   match_condassert - this is an assertion condition
357                   match_isgroup - this is the start of a bracketed group                   match_cbegroup - this is the start of an unlimited repeat
358                       group that can match an empty string
359                     match_tail_recursed - this is a tail_recursed group
360     rdepth      the recursion depth     rdepth      the recursion depth
361    
362  Returns:       MATCH_MATCH if matched            )  these values are >= 0  Returns:       MATCH_MATCH if matched            )  these values are >= 0
# Line 377  match(REGISTER USPTR eptr, REGISTER cons Line 371  match(REGISTER USPTR eptr, REGISTER cons
371    int flags, unsigned int rdepth)    int flags, unsigned int rdepth)
372  {  {
373  /* These variables do not need to be preserved over recursion in this function,  /* These variables do not need to be preserved over recursion in this function,
374  so they can be ordinary variables in all cases. Mark them with "register"  so they can be ordinary variables in all cases. Mark some of them with
375  because they are used a lot in loops. */  "register" because they are used a lot in loops. */
376    
377  register int  rrc;         /* Returns from recursive calls */  register int  rrc;         /* Returns from recursive calls */
378  register int  i;           /* Used for loops not involving calls to RMATCH() */  register int  i;           /* Used for loops not involving calls to RMATCH() */
379  register unsigned int  c;  /* Character values not kept over RMATCH() calls */  register unsigned int c;   /* Character values not kept over RMATCH() calls */
380  register BOOL utf8;        /* Local copy of UTF-8 flag for speed */  register BOOL utf8;        /* Local copy of UTF-8 flag for speed */
381    
382    BOOL minimize, possessive; /* Quantifier options */
383    
384  /* When recursion is not being used, all "local" variables that have to be  /* When recursion is not being used, all "local" variables that have to be
385  preserved over calls to RMATCH() are part of a "frame" which is obtained from  preserved over calls to RMATCH() are part of a "frame" which is obtained from
386  heap storage. Set up the top-level frame here; others are obtained from the  heap storage. Set up the top-level frame here; others are obtained from the
# Line 434  HEAP_RECURSE: Line 430  HEAP_RECURSE:
430    
431  #define cur_is_word        frame->Xcur_is_word  #define cur_is_word        frame->Xcur_is_word
432  #define condition          frame->Xcondition  #define condition          frame->Xcondition
 #define minimize           frame->Xminimize  
433  #define prev_is_word       frame->Xprev_is_word  #define prev_is_word       frame->Xprev_is_word
434    
435  #define original_ims       frame->Xoriginal_ims  #define original_ims       frame->Xoriginal_ims
# Line 446  HEAP_RECURSE: Line 441  HEAP_RECURSE:
441  #define prop_category      frame->Xprop_category  #define prop_category      frame->Xprop_category
442  #define prop_chartype      frame->Xprop_chartype  #define prop_chartype      frame->Xprop_chartype
443  #define prop_script        frame->Xprop_script  #define prop_script        frame->Xprop_script
 #define prop_test_variable frame->Xprop_test_variable  
444  #endif  #endif
445    
446  #define ctype              frame->Xctype  #define ctype              frame->Xctype
# Line 470  HEAP_RECURSE: Line 464  HEAP_RECURSE:
464  get preserved during recursion in the normal way. In this environment, fi and  get preserved during recursion in the normal way. In this environment, fi and
465  i, and fc and c, can be the same variables. */  i, and fc and c, can be the same variables. */
466    
467  #else  #else         /* NO_RECURSE not defined */
468  #define fi i  #define fi i
469  #define fc c  #define fc c
470    
# Line 489  recursion_info new_recursive; /* wi Line 483  recursion_info new_recursive; /* wi
483                                     /* that do not have to be preserved over  */                                     /* that do not have to be preserved over  */
484  BOOL cur_is_word;                  /* a recursive call to RMATCH().          */  BOOL cur_is_word;                  /* a recursive call to RMATCH().          */
485  BOOL condition;  BOOL condition;
 BOOL minimize;  
486  BOOL prev_is_word;  BOOL prev_is_word;
487    
488  unsigned long int original_ims;  unsigned long int original_ims;
# Line 501  int prop_fail_result; Line 494  int prop_fail_result;
494  int prop_category;  int prop_category;
495  int prop_chartype;  int prop_chartype;
496  int prop_script;  int prop_script;
 int *prop_test_variable;  
497  #endif  #endif
498    
499  int ctype;  int ctype;
# Line 516  int save_offset1, save_offset2, save_off Line 508  int save_offset1, save_offset2, save_off
508  int stacksave[REC_STACK_SAVE_MAX];  int stacksave[REC_STACK_SAVE_MAX];
509    
510  eptrblock newptrb;  eptrblock newptrb;
511  #endif  #endif     /* NO_RECURSE */
512    
513  /* These statements are here to stop the compiler complaining about uninitialized  /* These statements are here to stop the compiler complaining about uninitialized
514  variables. */  variables. */
# Line 524  variables. */ Line 516  variables. */
516  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
517  prop_value = 0;  prop_value = 0;
518  prop_fail_result = 0;  prop_fail_result = 0;
 prop_test_variable = NULL;  
519  #endif  #endif
520    
521    
522  /* This label is used for tail recursion, which is used in a few cases even  /* This label is used for tail recursion, which is used in a few cases even
523  when NO_RECURSE is not defined, in order to reduce the amount of stack that is  when NO_RECURSE is not defined, in order to reduce the amount of stack that is
524  used. Thanks to Ian Taylor for noticing this possibility and sending the  used. Thanks to Ian Taylor for noticing this possibility and sending the
# Line 556  utf8 = md->utf8; /* Local copy of Line 548  utf8 = md->utf8; /* Local copy of
548  utf8 = FALSE;  utf8 = FALSE;
549  #endif  #endif
550    
551  /* At the start of a bracketed group, add the current subject pointer to the  /* At the start of a group with an unlimited repeat that may match an empty
552  stack of such pointers, to be re-instated at the end of the group when we hit  string, the match_cbegroup flag is set. When this is the case, add the current
553  the closing ket. When match() is called in other circumstances, we don't add to  subject pointer to the chain of such remembered pointers, to be checked when we
554  this stack. */  hit the closing ket, in order to break infinite loops that match no characters.
555    When match() is called in other circumstances, don't add to the chain. If this
556    is a tail recursion, use a block from the workspace, as the one on the stack is
557    already used. */
558    
559  if ((flags & match_isgroup) != 0)  if ((flags & match_cbegroup) != 0)
560    {    {
561    newptrb.epb_prev = eptrb;    eptrblock *p;
562    newptrb.epb_saved_eptr = eptr;    if ((flags & match_tail_recursed) != 0)
563    eptrb = &newptrb;      {
564        if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
565        p = md->eptrchain + md->eptrn++;
566        }
567      else p = &newptrb;
568      p->epb_saved_eptr = eptr;
569      p->epb_prev = eptrb;
570      eptrb = p;
571    }    }
572    
573  /* Now start processing the operations. */  /* Now start processing the opcodes. */
574    
575  for (;;)  for (;;)
576    {    {
577      minimize = possessive = FALSE;
578    op = *ecode;    op = *ecode;
   minimize = FALSE;  
579    
580    /* For partial matching, remember if we ever hit the end of the subject after    /* For partial matching, remember if we ever hit the end of the subject after
581    matching at least one subject character. */    matching at least one subject character. */
# Line 583  for (;;) Line 585  for (;;)
585        eptr > md->start_match)        eptr > md->start_match)
586      md->hitend = TRUE;      md->hitend = TRUE;
587    
588    /* Opening capturing bracket. If there is space in the offset vector, save    switch(op)
   the current subject position in the working slot at the top of the vector. We  
   mustn't change the current values of the data slot, because they may be set  
   from a previous iteration of this group, and be referred to by a reference  
   inside the group.  
   
   If the bracket fails to match, we need to restore this value and also the  
   values of the final offsets, in case they were set by a previous iteration of  
   the same bracket.  
   
   If there isn't enough space in the offset vector, treat this as if it were a  
   non-capturing bracket. Don't worry about setting the flag for the error case  
   here; that is handled in the code for KET. */  
   
   if (op > OP_BRA)  
589      {      {
590      number = op - OP_BRA;      /* Handle a capturing bracket. If there is space in the offset vector, save
591        the current subject position in the working slot at the top of the vector.
592      /* For extended extraction brackets (large number), we have to fish out the      We mustn't change the current values of the data slot, because they may be
593      number from a dummy opcode at the start. */      set from a previous iteration of this group, and be referred to by a
594        reference inside the group.
595      if (number > EXTRACT_BASIC_MAX)  
596        number = GET2(ecode, 2+LINK_SIZE);      If the bracket fails to match, we need to restore this value and also the
597        values of the final offsets, in case they were set by a previous iteration
598        of the same bracket.
599    
600        If there isn't enough space in the offset vector, treat this as if it were
601        a non-capturing bracket. Don't worry about setting the flag for the error
602        case here; that is handled in the code for KET. */
603    
604        case OP_CBRA:
605        case OP_SCBRA:
606        number = GET2(ecode, 1+LINK_SIZE);
607      offset = number << 1;      offset = number << 1;
608    
609  #ifdef DEBUG  #ifdef DEBUG
610      printf("start bracket %d subject=", number);      printf("start bracket %d\n", number);
611        printf("subject=");
612      pchars(eptr, 16, TRUE, md);      pchars(eptr, 16, TRUE, md);
613      printf("\n");      printf("\n");
614  #endif  #endif
# Line 624  for (;;) Line 623  for (;;)
623        DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));        DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
624        md->offset_vector[md->offset_end - number] = eptr - md->start_subject;        md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
625    
626          flags = (op == OP_SCBRA)? match_cbegroup : 0;
627        do        do
628          {          {
629          RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,          RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
630            match_isgroup);            ims, eptrb, flags);
631          if (rrc != MATCH_NOMATCH) RRETURN(rrc);          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
632          md->capture_last = save_capture_last;          md->capture_last = save_capture_last;
633          ecode += GET(ecode, 1);          ecode += GET(ecode, 1);
# Line 643  for (;;) Line 643  for (;;)
643        RRETURN(MATCH_NOMATCH);        RRETURN(MATCH_NOMATCH);
644        }        }
645    
646      /* Insufficient room for saving captured contents */      /* Insufficient room for saving captured contents. Treat as a non-capturing
647        bracket. */
     else op = OP_BRA;  
     }  
   
   /* Other types of node can be handled by a switch */  
648    
649    switch(op)      DPRINTF(("insufficient capture room: treat as non-capturing\n"));
     {  
     case OP_BRA:     /* Non-capturing bracket: optimized */  
     DPRINTF(("start bracket 0\n"));  
   
     /* Loop for all the alternatives */  
650    
651        /* Non-capturing bracket. Loop for all the alternatives. When we get to the
652        final alternative within the brackets, we would return the result of a
653        recursive call to match() whatever happened. We can reduce stack usage by
654        turning this into a tail recursion. */
655    
656        case OP_BRA:
657        case OP_SBRA:
658        DPRINTF(("start non-capturing bracket\n"));
659        flags = (op >= OP_SBRA)? match_cbegroup : 0;
660      for (;;)      for (;;)
661        {        {
       /* When we get to the final alternative within the brackets, we would  
       return the result of a recursive call to match() whatever happened. We  
       can reduce stack usage by turning this into a tail recursion. */  
   
662        if (ecode[GET(ecode, 1)] != OP_ALT)        if (ecode[GET(ecode, 1)] != OP_ALT)
663         {          {
664         ecode += 1 + LINK_SIZE;          ecode += _pcre_OP_lengths[*ecode];
665         flags = match_isgroup;          flags |= match_tail_recursed;
666         DPRINTF(("bracket 0 tail recursion\n"));          DPRINTF(("bracket 0 tail recursion\n"));
667         goto TAIL_RECURSE;          goto TAIL_RECURSE;
668         }          }
669    
670        /* For non-final alternatives, continue the loop for a NOMATCH result;        /* For non-final alternatives, continue the loop for a NOMATCH result;
671        otherwise return. */        otherwise return. */
672    
673        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,        RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
674          match_isgroup);          eptrb, flags);
675        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
676        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
677        }        }
# Line 688  for (;;) Line 684  for (;;)
684      obeyed, we can use tail recursion to avoid using another stack frame. */      obeyed, we can use tail recursion to avoid using another stack frame. */
685    
686      case OP_COND:      case OP_COND:
687      if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */      case OP_SCOND:
688        if (ecode[LINK_SIZE+1] == OP_RREF)         /* Recursion test */
689          {
690          offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/
691          condition = md->recursive != NULL &&
692            (offset == RREF_ANY || offset == md->recursive->group_num);
693          ecode += condition? 3 : GET(ecode, 1);
694          }
695    
696        else if (ecode[LINK_SIZE+1] == OP_CREF)    /* Group used test */
697        {        {
698        offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */        offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
699        condition = (offset == CREF_RECURSE * 2)?        condition = offset < offset_top && md->offset_vector[offset] >= 0;
700          (md->recursive != NULL) :        ecode += condition? 3 : GET(ecode, 1);
701          (offset < offset_top && md->offset_vector[offset] >= 0);        }
702        ecode += condition? (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1));  
703        flags = match_isgroup;      else if (ecode[LINK_SIZE+1] == OP_DEF)     /* DEFINE - always false */
704        goto TAIL_RECURSE;        {
705          condition = FALSE;
706          ecode += GET(ecode, 1);
707        }        }
708    
709      /* The condition is an assertion. Call match() to evaluate it - setting      /* The condition is an assertion. Call match() to evaluate it - setting
710      the final argument TRUE causes it to stop at the end of an assertion. */      the final argument match_condassert causes it to stop at the end of an
711        assertion. */
712    
713      else      else
714        {        {
715        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
716            match_condassert | match_isgroup);            match_condassert);
717        if (rrc == MATCH_MATCH)        if (rrc == MATCH_MATCH)
718          {          {
719          ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);          condition = TRUE;
720            ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
721          while (*ecode == OP_ALT) ecode += GET(ecode, 1);          while (*ecode == OP_ALT) ecode += GET(ecode, 1);
722          }          }
723        else if (rrc != MATCH_NOMATCH)        else if (rrc != MATCH_NOMATCH)
724          {          {
725          RRETURN(rrc);         /* Need braces because of following else */          RRETURN(rrc);         /* Need braces because of following else */
726          }          }
727        else ecode += GET(ecode, 1);        else
728            {
729            condition = FALSE;
730            ecode += GET(ecode, 1);
731            }
732          }
733    
734        /* We are now at the branch that is to be obeyed. As there is only one,      /* We are now at the branch that is to be obeyed. As there is only one,
735        we can use tail recursion to avoid using another stack frame. */      we can use tail recursion to avoid using another stack frame. If the second
736        alternative doesn't exist, we can just plough on. */
737    
738        if (condition || *ecode == OP_ALT)
739          {
740        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
741        flags = match_isgroup;        flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
742        goto TAIL_RECURSE;        goto TAIL_RECURSE;
743        }        }
744      /* Control never reaches here */      else
745          {
746      /* Skip over conditional reference or large extraction number data if        ecode += 1 + LINK_SIZE;
747      encountered. */        }
   
     case OP_CREF:  
     case OP_BRANUMBER:  
     ecode += 3;  
748      break;      break;
749    
750      /* End of the pattern. If we are in a recursion, we should restore the  
751      offsets appropriately and continue from after the call. */      /* End of the pattern. If we are in a top-level recursion, we should
752        restore the offsets appropriately and continue from after the call. */
753    
754      case OP_END:      case OP_END:
755      if (md->recursive != NULL && md->recursive->group_num == 0)      if (md->recursive != NULL && md->recursive->group_num == 0)
# Line 777  for (;;) Line 791  for (;;)
791      case OP_ASSERTBACK:      case OP_ASSERTBACK:
792      do      do
793        {        {
794        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
         match_isgroup);  
795        if (rrc == MATCH_MATCH) break;        if (rrc == MATCH_MATCH) break;
796        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
797        ecode += GET(ecode, 1);        ecode += GET(ecode, 1);
# Line 804  for (;;) Line 817  for (;;)
817      case OP_ASSERTBACK_NOT:      case OP_ASSERTBACK_NOT:
818      do      do
819        {        {
820        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
         match_isgroup);  
821        if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);        if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
822        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
823        ecode += GET(ecode,1);        ecode += GET(ecode,1);
# Line 826  for (;;) Line 838  for (;;)
838  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
839      if (utf8)      if (utf8)
840        {        {
841        c = GET(ecode,1);        i = GET(ecode, 1);
842        for (i = 0; i < c; i++)        while (i-- > 0)
843          {          {
844          eptr--;          eptr--;
845          if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);          if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
# Line 840  for (;;) Line 852  for (;;)
852      /* No UTF-8 support, or not in UTF-8 mode: count is byte count */      /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
853    
854        {        {
855        eptr -= GET(ecode,1);        eptr -= GET(ecode, 1);
856        if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);        if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
857        }        }
858    
# Line 897  for (;;) Line 909  for (;;)
909      case OP_RECURSE:      case OP_RECURSE:
910        {        {
911        callpat = md->start_code + GET(ecode, 1);        callpat = md->start_code + GET(ecode, 1);
912        new_recursive.group_num = *callpat - OP_BRA;        new_recursive.group_num = (callpat == md->start_code)? 0 :
913            GET2(callpat, 1 + LINK_SIZE);
       /* For extended extraction brackets (large number), we have to fish out  
       the number from a dummy opcode at the start. */  
   
       if (new_recursive.group_num > EXTRACT_BASIC_MAX)  
         new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);  
914    
915        /* Add to "recursing stack" */        /* Add to "recursing stack" */
916    
# Line 936  for (;;) Line 943  for (;;)
943        restore the offset and recursion data. */        restore the offset and recursion data. */
944    
945        DPRINTF(("Recursing into group %d\n", new_recursive.group_num));        DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
946          flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
947        do        do
948          {          {
949          RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,          RMATCH(rrc, eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
950              eptrb, match_isgroup);            md, ims, eptrb, flags);
951          if (rrc == MATCH_MATCH)          if (rrc == MATCH_MATCH)
952            {            {
953            DPRINTF(("Recursion matched\n"));            DPRINTF(("Recursion matched\n"));
# Line 983  for (;;) Line 991  for (;;)
991      do      do
992        {        {
993        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
994          eptrb, match_isgroup);          eptrb, 0);
995        if (rrc == MATCH_MATCH) break;        if (rrc == MATCH_MATCH) break;
996        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
997        ecode += GET(ecode,1);        ecode += GET(ecode,1);
# Line 997  for (;;) Line 1005  for (;;)
1005      /* Continue as from after the assertion, updating the offsets high water      /* Continue as from after the assertion, updating the offsets high water
1006      mark, since extracts may have been taken. */      mark, since extracts may have been taken. */
1007    
1008      do ecode += GET(ecode,1); while (*ecode == OP_ALT);      do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1009    
1010      offset_top = md->end_offset_top;      offset_top = md->end_offset_top;
1011      eptr = md->end_match_ptr;      eptr = md->end_match_ptr;
# Line 1031  for (;;) Line 1039  for (;;)
1039        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
1040        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1041        ecode = prev;        ecode = prev;
1042        flags = match_isgroup;        flags = match_tail_recursed;
1043        goto TAIL_RECURSE;        goto TAIL_RECURSE;
1044        }        }
1045      else  /* OP_KETRMAX */      else  /* OP_KETRMAX */
1046        {        {
1047        RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);        RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_cbegroup);
1048        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1049        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
1050        flags = 0;        flags = match_tail_recursed;
1051        goto TAIL_RECURSE;        goto TAIL_RECURSE;
1052        }        }
1053      /* Control never gets here */      /* Control never gets here */
# Line 1060  for (;;) Line 1068  for (;;)
1068      case OP_BRAZERO:      case OP_BRAZERO:
1069        {        {
1070        next = ecode+1;        next = ecode+1;
1071        RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);        RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, 0);
1072        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1073        do next += GET(next,1); while (*next == OP_ALT);        do next += GET(next,1); while (*next == OP_ALT);
1074        ecode = next + 1+LINK_SIZE;        ecode = next + 1 + LINK_SIZE;
1075        }        }
1076      break;      break;
1077    
1078      case OP_BRAMINZERO:      case OP_BRAMINZERO:
1079        {        {
1080        next = ecode+1;        next = ecode+1;
1081        do next += GET(next,1); while (*next == OP_ALT);        do next += GET(next, 1); while (*next == OP_ALT);
1082        RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,        RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
         match_isgroup);  
1083        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1084        ecode++;        ecode++;
1085        }        }
1086      break;      break;
1087    
1088      /* End of a group, repeated or non-repeating. If we are at the end of      /* End of a group, repeated or non-repeating. */
     an assertion "group", stop matching and return MATCH_MATCH, but record the  
     current high water mark for use by positive assertions. Do this also  
     for the "once" (not-backup up) groups. */  
1089    
1090      case OP_KET:      case OP_KET:
1091      case OP_KETRMIN:      case OP_KETRMIN:
1092      case OP_KETRMAX:      case OP_KETRMAX:
1093      prev = ecode - GET(ecode, 1);      prev = ecode - GET(ecode, 1);
     saved_eptr = eptrb->epb_saved_eptr;  
1094    
1095      /* Back up the stack of bracket start pointers. */      /* If this was a group that remembered the subject start, in order to break
1096        infinite repeats of empty string matches, retrieve the subject start from
1097        the chain. Otherwise, set it NULL. */
1098    
1099        if (*prev >= OP_SBRA)
1100          {
1101          saved_eptr = eptrb->epb_saved_eptr;   /* Value at start of group */
1102          eptrb = eptrb->epb_prev;              /* Backup to previous group */
1103          }
1104        else saved_eptr = NULL;
1105    
1106      eptrb = eptrb->epb_prev;      /* If we are at the end of an assertion group, stop matching and return
1107        MATCH_MATCH, but record the current high water mark for use by positive
1108        assertions. Do this also for the "once" (atomic) groups. */
1109    
1110      if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||      if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1111          *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||          *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
# Line 1102  for (;;) Line 1116  for (;;)
1116        RRETURN(MATCH_MATCH);        RRETURN(MATCH_MATCH);
1117        }        }
1118    
1119      /* In all other cases except a conditional group we have to check the      /* For capturing groups we have to check the group number back at the start
1120      group number back at the start and if necessary complete handling an      and if necessary complete handling an extraction by setting the offsets and
1121      extraction by setting the offsets and bumping the high water mark. */      bumping the high water mark. Note that whole-pattern recursion is coded as
1122        a recurse into group 0, so it won't be picked up here. Instead, we catch it
1123        when the OP_END is reached. Other recursion is handled here. */
1124    
1125      if (*prev != OP_COND)      if (*prev == OP_CBRA || *prev == OP_SCBRA)
1126        {        {
1127        number = *prev - OP_BRA;        number = GET2(prev, 1+LINK_SIZE);
   
       /* For extended extraction brackets (large number), we have to fish out  
       the number from a dummy opcode at the start. */  
   
       if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);  
1128        offset = number << 1;        offset = number << 1;
1129    
1130  #ifdef DEBUG  #ifdef DEBUG
# Line 1121  for (;;) Line 1132  for (;;)
1132        printf("\n");        printf("\n");
1133  #endif  #endif
1134    
1135        /* Test for a numbered group. This includes groups called as a result        md->capture_last = number;
1136        of recursion. Note that whole-pattern recursion is coded as a recurse        if (offset >= md->offset_max) md->offset_overflow = TRUE; else
       into group 0, so it won't be picked up here. Instead, we catch it when  
       the OP_END is reached. */  
   
       if (number > 0)  
1137          {          {
1138          md->capture_last = number;          md->offset_vector[offset] =
1139          if (offset >= md->offset_max) md->offset_overflow = TRUE; else            md->offset_vector[md->offset_end - number];
1140            {          md->offset_vector[offset+1] = eptr - md->start_subject;
1141            md->offset_vector[offset] =          if (offset_top <= offset) offset_top = offset + 2;
1142              md->offset_vector[md->offset_end - number];          }
1143            md->offset_vector[offset+1] = eptr - md->start_subject;  
1144            if (offset_top <= offset) offset_top = offset + 2;        /* Handle a recursively called group. Restore the offsets
1145            }        appropriately and continue from after the call. */
1146    
1147          /* Handle a recursively called group. Restore the offsets        if (md->recursive != NULL && md->recursive->group_num == number)
1148          appropriately and continue from after the call. */          {
1149            recursion_info *rec = md->recursive;
1150          if (md->recursive != NULL && md->recursive->group_num == number)          DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1151            {          md->recursive = rec->prevrec;
1152            recursion_info *rec = md->recursive;          md->start_match = rec->save_start;
1153            DPRINTF(("Recursion (%d) succeeded - continuing\n", number));          memcpy(md->offset_vector, rec->offset_save,
1154            md->recursive = rec->prevrec;            rec->saved_max * sizeof(int));
1155            md->start_match = rec->save_start;          ecode = rec->after_call;
1156            memcpy(md->offset_vector, rec->offset_save,          ims = original_ims;
1157              rec->saved_max * sizeof(int));          break;
           ecode = rec->after_call;  
           ims = original_ims;  
           break;  
           }  
1158          }          }
1159        }        }
1160    
1161      /* Reset the value of the ims flags, in case they got changed during      /* For both capturing and non-capturing groups, reset the value of the ims
1162      the group. */      flags, in case they got changed during the group. */
1163    
1164      ims = original_ims;      ims = original_ims;
1165      DPRINTF(("ims reset to %02lx\n", ims));      DPRINTF(("ims reset to %02lx\n", ims));
# Line 1177  for (;;) Line 1180  for (;;)
1180      preceding bracket, in the appropriate order. In the second case, we can use      preceding bracket, in the appropriate order. In the second case, we can use
1181      tail recursion to avoid using another stack frame. */      tail recursion to avoid using another stack frame. */
1182    
1183        flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1184    
1185      if (*ecode == OP_KETRMIN)      if (*ecode == OP_KETRMIN)
1186        {        {
1187        RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);        RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1188        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1189        ecode = prev;        ecode = prev;
1190        flags = match_isgroup;        flags |= match_tail_recursed;
1191        goto TAIL_RECURSE;        goto TAIL_RECURSE;
1192        }        }
1193      else  /* OP_KETRMAX */      else  /* OP_KETRMAX */
1194        {        {
1195        RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);        RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, flags);
1196        if (rrc != MATCH_NOMATCH) RRETURN(rrc);        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1197        ecode += 1 + LINK_SIZE;        ecode += 1 + LINK_SIZE;
1198        flags = 0;        flags = match_tail_recursed;
1199        goto TAIL_RECURSE;        goto TAIL_RECURSE;
1200        }        }
1201      /* Control never gets here */      /* Control never gets here */
# Line 1202  for (;;) Line 1207  for (;;)
1207      if ((ims & PCRE_MULTILINE) != 0)      if ((ims & PCRE_MULTILINE) != 0)
1208        {        {
1209        if (eptr != md->start_subject &&        if (eptr != md->start_subject &&
1210            (eptr == md->end_subject ||            (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
            eptr < md->start_subject + md->nllen ||  
            !IS_NEWLINE(eptr - md->nllen)))  
1211          RRETURN(MATCH_NOMATCH);          RRETURN(MATCH_NOMATCH);
1212        ecode++;        ecode++;
1213        break;        break;
# Line 1244  for (;;) Line 1247  for (;;)
1247        if (!md->endonly)        if (!md->endonly)
1248          {          {
1249          if (eptr != md->end_subject &&          if (eptr != md->end_subject &&
1250              (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr)))              (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1251            RRETURN(MATCH_NOMATCH);            RRETURN(MATCH_NOMATCH);
1252          ecode++;          ecode++;
1253          break;          break;
# Line 1263  for (;;) Line 1266  for (;;)
1266    
1267      case OP_EODN:      case OP_EODN:
1268      if (eptr != md->end_subject &&      if (eptr != md->end_subject &&
1269          (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr)))          (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1270        RRETURN(MATCH_NOMATCH);        RRETURN(MATCH_NOMATCH);
1271      ecode++;      ecode++;
1272      break;      break;
# Line 1319  for (;;) Line 1322  for (;;)
1322      case OP_ANY:      case OP_ANY:
1323      if ((ims & PCRE_DOTALL) == 0)      if ((ims & PCRE_DOTALL) == 0)
1324        {        {
1325        if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))        if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
         RRETURN(MATCH_NOMATCH);  
1326        }        }
1327      if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);      if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1328      if (utf8)      if (utf8)
# Line 1414  for (;;) Line 1416  for (;;)
1416      ecode++;      ecode++;
1417      break;      break;
1418    
1419        case OP_ANYNL:
1420        if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1421        GETCHARINCTEST(c, eptr);
1422        switch(c)
1423          {
1424          default: RRETURN(MATCH_NOMATCH);
1425          case 0x000d:
1426          if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1427          break;
1428          case 0x000a:
1429          case 0x000b:
1430          case 0x000c:
1431          case 0x0085:
1432          case 0x2028:
1433          case 0x2029:
1434          break;
1435          }
1436        ecode++;
1437        break;
1438    
1439  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
1440      /* Check the next character by Unicode property. We will get here only      /* Check the next character by Unicode property. We will get here only
1441      if the support is in the binary; otherwise a compile-time error occurs. */      if the support is in the binary; otherwise a compile-time error occurs. */
# Line 1456  for (;;) Line 1478  for (;;)
1478    
1479          default:          default:
1480          RRETURN(PCRE_ERROR_INTERNAL);          RRETURN(PCRE_ERROR_INTERNAL);
         break;  
1481          }          }
1482    
1483        ecode += 3;        ecode += 3;
# Line 1926  for (;;) Line 1947  for (;;)
1947    
1948        else        else
1949          {          {
1950          int dc;          unsigned int dc;
1951          GETCHARINC(dc, eptr);          GETCHARINC(dc, eptr);
1952          ecode += length;          ecode += length;
1953    
# Line 1953  for (;;) Line 1974  for (;;)
1974        }        }
1975      break;      break;
1976    
1977      /* Match a single character repeatedly; different opcodes share code. */      /* Match a single character repeatedly. */
1978    
1979      case OP_EXACT:      case OP_EXACT:
1980      min = max = GET2(ecode, 1);      min = max = GET2(ecode, 1);
1981      ecode += 3;      ecode += 3;
1982      goto REPEATCHAR;      goto REPEATCHAR;
1983    
1984        case OP_POSUPTO:
1985        possessive = TRUE;
1986        /* Fall through */
1987    
1988      case OP_UPTO:      case OP_UPTO:
1989      case OP_MINUPTO:      case OP_MINUPTO:
1990      min = 0;      min = 0;
# Line 1968  for (;;) Line 1993  for (;;)
1993      ecode += 3;      ecode += 3;
1994      goto REPEATCHAR;      goto REPEATCHAR;
1995    
1996        case OP_POSSTAR:
1997        possessive = TRUE;
1998        min = 0;
1999        max = INT_MAX;
2000        ecode++;
2001        goto REPEATCHAR;
2002    
2003        case OP_POSPLUS:
2004        possessive = TRUE;
2005        min = 1;
2006        max = INT_MAX;
2007        ecode++;
2008        goto REPEATCHAR;
2009    
2010        case OP_POSQUERY:
2011        possessive = TRUE;
2012        min = 0;
2013        max = 1;
2014        ecode++;
2015        goto REPEATCHAR;
2016    
2017      case OP_STAR:      case OP_STAR:
2018      case OP_MINSTAR:      case OP_MINSTAR:
2019      case OP_PLUS:      case OP_PLUS:
# Line 2003  for (;;) Line 2049  for (;;)
2049          uschar occhars[8];          uschar occhars[8];
2050    
2051  #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
2052          int othercase;          unsigned int othercase;
2053          if ((ims & PCRE_CASELESS) != 0 &&          if ((ims & PCRE_CASELESS) != 0 &&
2054              (othercase = _pcre_ucp_othercase(fc)) >= 0 &&              (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
              othercase >= 0)  
2055            oclength = _pcre_ord2utf8(othercase, occhars);            oclength = _pcre_ord2utf8(othercase, occhars);
2056  #endif  /* SUPPORT_UCP */  #endif  /* SUPPORT_UCP */
2057    
# Line 2042  for (;;) Line 2087  for (;;)
2087              }              }
2088            /* Control never gets here */            /* Control never gets here */
2089            }            }
2090          else  
2091            else  /* Maximize */
2092            {            {
2093            pp = eptr;            pp = eptr;
2094            for (i = min; i < max; i++)            for (i = min; i < max; i++)
# Line 2056  for (;;) Line 2102  for (;;)
2102                eptr += oclength;                eptr += oclength;
2103                }                }
2104              }              }
2105    
2106              if (possessive) continue;
2107            while (eptr >= pp)            while (eptr >= pp)
2108             {             {
2109             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 2110  for (;;) Line 2158  for (;;)
2158            }            }
2159          /* Control never gets here */          /* Control never gets here */
2160          }          }
2161        else        else  /* Maximize */
2162          {          {
2163          pp = eptr;          pp = eptr;
2164          for (i = min; i < max; i++)          for (i = min; i < max; i++)
# Line 2118  for (;;) Line 2166  for (;;)
2166            if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;            if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2167            eptr++;            eptr++;
2168            }            }
2169            if (possessive) continue;
2170          while (eptr >= pp)          while (eptr >= pp)
2171            {            {
2172            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 2146  for (;;) Line 2195  for (;;)
2195            }            }
2196          /* Control never gets here */          /* Control never gets here */
2197          }          }
2198        else        else  /* Maximize */
2199          {          {
2200          pp = eptr;          pp = eptr;
2201          for (i = min; i < max; i++)          for (i = min; i < max; i++)
# Line 2154  for (;;) Line 2203  for (;;)
2203            if (eptr >= md->end_subject || fc != *eptr) break;            if (eptr >= md->end_subject || fc != *eptr) break;
2204            eptr++;            eptr++;
2205            }            }
2206            if (possessive) continue;
2207          while (eptr >= pp)          while (eptr >= pp)
2208            {            {
2209            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 2206  for (;;) Line 2256  for (;;)
2256      ecode += 3;      ecode += 3;
2257      goto REPEATNOTCHAR;      goto REPEATNOTCHAR;
2258    
2259        case OP_NOTPOSSTAR:
2260        possessive = TRUE;
2261        min = 0;
2262        max = INT_MAX;
2263        ecode++;
2264        goto REPEATNOTCHAR;
2265    
2266        case OP_NOTPOSPLUS:
2267        possessive = TRUE;
2268        min = 1;
2269        max = INT_MAX;
2270        ecode++;
2271        goto REPEATNOTCHAR;
2272    
2273        case OP_NOTPOSQUERY:
2274        possessive = TRUE;
2275        min = 0;
2276        max = 1;
2277        ecode++;
2278        goto REPEATNOTCHAR;
2279    
2280        case OP_NOTPOSUPTO:
2281        possessive = TRUE;
2282        min = 0;
2283        max = GET2(ecode, 1);
2284        ecode += 3;
2285        goto REPEATNOTCHAR;
2286    
2287      case OP_NOTSTAR:      case OP_NOTSTAR:
2288      case OP_NOTMINSTAR:      case OP_NOTMINSTAR:
2289      case OP_NOTPLUS:      case OP_NOTPLUS:
# Line 2245  for (;;) Line 2323  for (;;)
2323        /* UTF-8 mode */        /* UTF-8 mode */
2324        if (utf8)        if (utf8)
2325          {          {
2326          register int d;          register unsigned int d;
2327          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
2328            {            {
2329            GETCHARINC(d, eptr);            GETCHARINC(d, eptr);
# Line 2270  for (;;) Line 2348  for (;;)
2348          /* UTF-8 mode */          /* UTF-8 mode */
2349          if (utf8)          if (utf8)
2350            {            {
2351            register int d;            register unsigned int d;
2352            for (fi = min;; fi++)            for (fi = min;; fi++)
2353              {              {
2354              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 2306  for (;;) Line 2384  for (;;)
2384          /* UTF-8 mode */          /* UTF-8 mode */
2385          if (utf8)          if (utf8)
2386            {            {
2387            register int d;            register unsigned int d;
2388            for (i = min; i < max; i++)            for (i = min; i < max; i++)
2389              {              {
2390              int len = 1;              int len = 1;
# Line 2316  for (;;) Line 2394  for (;;)
2394              if (fc == d) break;              if (fc == d) break;
2395              eptr += len;              eptr += len;
2396              }              }
2397            for(;;)          if (possessive) continue;
2398            for(;;)
2399              {              {
2400              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2401              if (rrc != MATCH_NOMATCH) RRETURN(rrc);              if (rrc != MATCH_NOMATCH) RRETURN(rrc);
# Line 2333  for (;;) Line 2412  for (;;)
2412              if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;              if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2413              eptr++;              eptr++;
2414              }              }
2415              if (possessive) continue;
2416            while (eptr >= pp)            while (eptr >= pp)
2417              {              {
2418              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 2354  for (;;) Line 2434  for (;;)
2434        /* UTF-8 mode */        /* UTF-8 mode */
2435        if (utf8)        if (utf8)
2436          {          {
2437          register int d;          register unsigned int d;
2438          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
2439            {            {
2440            GETCHARINC(d, eptr);            GETCHARINC(d, eptr);
# Line 2377  for (;;) Line 2457  for (;;)
2457          /* UTF-8 mode */          /* UTF-8 mode */
2458          if (utf8)          if (utf8)
2459            {            {
2460            register int d;            register unsigned int d;
2461            for (fi = min;; fi++)            for (fi = min;; fi++)
2462              {              {
2463              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 2412  for (;;) Line 2492  for (;;)
2492          /* UTF-8 mode */          /* UTF-8 mode */
2493          if (utf8)          if (utf8)
2494            {            {
2495            register int d;            register unsigned int d;
2496            for (i = min; i < max; i++)            for (i = min; i < max; i++)
2497              {              {
2498              int len = 1;              int len = 1;
# Line 2421  for (;;) Line 2501  for (;;)
2501              if (fc == d) break;              if (fc == d) break;
2502              eptr += len;              eptr += len;
2503              }              }
2504              if (possessive) continue;
2505            for(;;)            for(;;)
2506              {              {
2507              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 2438  for (;;) Line 2519  for (;;)
2519              if (eptr >= md->end_subject || fc == *eptr) break;              if (eptr >= md->end_subject || fc == *eptr) break;
2520              eptr++;              eptr++;
2521              }              }
2522              if (possessive) continue;
2523            while (eptr >= pp)            while (eptr >= pp)
2524              {              {
2525              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);              RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 2469  for (;;) Line 2551  for (;;)
2551      ecode += 3;      ecode += 3;
2552      goto REPEATTYPE;      goto REPEATTYPE;
2553    
2554        case OP_TYPEPOSSTAR:
2555        possessive = TRUE;
2556        min = 0;
2557        max = INT_MAX;
2558        ecode++;
2559        goto REPEATTYPE;
2560    
2561        case OP_TYPEPOSPLUS:
2562        possessive = TRUE;
2563        min = 1;
2564        max = INT_MAX;
2565        ecode++;
2566        goto REPEATTYPE;
2567    
2568        case OP_TYPEPOSQUERY:
2569        possessive = TRUE;
2570        min = 0;
2571        max = 1;
2572        ecode++;
2573        goto REPEATTYPE;
2574    
2575        case OP_TYPEPOSUPTO:
2576        possessive = TRUE;
2577        min = 0;
2578        max = GET2(ecode, 1);
2579        ecode += 3;
2580        goto REPEATTYPE;
2581    
2582      case OP_TYPESTAR:      case OP_TYPESTAR:
2583      case OP_TYPEMINSTAR:      case OP_TYPEMINSTAR:
2584      case OP_TYPEPLUS:      case OP_TYPEPLUS:
# Line 2571  for (;;) Line 2681  for (;;)
2681    
2682            default:            default:
2683            RRETURN(PCRE_ERROR_INTERNAL);            RRETURN(PCRE_ERROR_INTERNAL);
           break;  
2684            }            }
2685          }          }
2686    
# Line 2611  for (;;) Line 2720  for (;;)
2720          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
2721            {            {
2722            if (eptr >= md->end_subject ||            if (eptr >= md->end_subject ||
2723                 ((ims & PCRE_DOTALL) == 0 &&                 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
                  eptr <= md->end_subject - md->nllen &&  
                  IS_NEWLINE(eptr)))  
2724              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
2725            eptr++;            eptr++;
2726            while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;            while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
# Line 2624  for (;;) Line 2731  for (;;)
2731          eptr += min;          eptr += min;
2732          break;          break;
2733    
2734            case OP_ANYNL:
2735            for (i = 1; i <= min; i++)
2736              {
2737              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2738              GETCHARINC(c, eptr);
2739              switch(c)
2740                {
2741                default: RRETURN(MATCH_NOMATCH);
2742                case 0x000d:
2743                if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2744                break;
2745                case 0x000a:
2746                case 0x000b:
2747                case 0x000c:
2748                case 0x0085:
2749                case 0x2028:
2750                case 0x2029:
2751                break;
2752                }
2753              }
2754            break;
2755    
2756          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
2757          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
2758            {            {
# Line 2692  for (;;) Line 2821  for (;;)
2821  #endif     /* SUPPORT_UTF8 */  #endif     /* SUPPORT_UTF8 */
2822    
2823        /* Code for the non-UTF-8 case for minimum matching of operators other        /* Code for the non-UTF-8 case for minimum matching of operators other
2824        than OP_PROP and OP_NOTPROP. */        than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
2825          number of bytes present, as this was tested above. */
2826    
2827        switch(ctype)        switch(ctype)
2828          {          {
# Line 2701  for (;;) Line 2831  for (;;)
2831            {            {
2832            for (i = 1; i <= min; i++)            for (i = 1; i <= min; i++)
2833              {              {
2834              if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))              if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
               RRETURN(MATCH_NOMATCH);  
2835              eptr++;              eptr++;
2836              }              }
2837            }            }
# Line 2713  for (;;) Line 2842  for (;;)
2842          eptr += min;          eptr += min;
2843          break;          break;
2844    
2845            /* Because of the CRLF case, we can't assume the minimum number of
2846            bytes are present in this case. */
2847    
2848            case OP_ANYNL:
2849            for (i = 1; i <= min; i++)
2850              {
2851              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2852              switch(*eptr++)
2853                {
2854                default: RRETURN(MATCH_NOMATCH);
2855                case 0x000d:
2856                if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2857                break;
2858                case 0x000a:
2859                case 0x000b:
2860                case 0x000c:
2861                case 0x0085:
2862                break;
2863                }
2864              }
2865            break;
2866    
2867          case OP_NOT_DIGIT:          case OP_NOT_DIGIT:
2868          for (i = 1; i <= min; i++)          for (i = 1; i <= min; i++)
2869            if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);            if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
# Line 2774  for (;;) Line 2925  for (;;)
2925              GETCHARINC(c, eptr);              GETCHARINC(c, eptr);
2926              if (prop_fail_result) RRETURN(MATCH_NOMATCH);              if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2927              }              }
2928            break;            /* Control never gets here */
2929    
2930            case PT_LAMP:            case PT_LAMP:
2931            for (fi = min;; fi++)            for (fi = min;; fi++)
# Line 2789  for (;;) Line 2940  for (;;)
2940                   prop_chartype == ucp_Lt) == prop_fail_result)                   prop_chartype == ucp_Lt) == prop_fail_result)
2941                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
2942              }              }
2943            break;            /* Control never gets here */
2944    
2945            case PT_GC:            case PT_GC:
2946            for (fi = min;; fi++)            for (fi = min;; fi++)
# Line 2802  for (;;) Line 2953  for (;;)
2953              if ((prop_category == prop_value) == prop_fail_result)              if ((prop_category == prop_value) == prop_fail_result)
2954                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
2955              }              }
2956            break;            /* Control never gets here */
2957    
2958            case PT_PC:            case PT_PC:
2959            for (fi = min;; fi++)            for (fi = min;; fi++)
# Line 2815  for (;;) Line 2966  for (;;)
2966              if ((prop_chartype == prop_value) == prop_fail_result)              if ((prop_chartype == prop_value) == prop_fail_result)
2967                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
2968              }              }
2969            break;            /* Control never gets here */
2970    
2971            case PT_SC:            case PT_SC:
2972            for (fi = min;; fi++)            for (fi = min;; fi++)
# Line 2828  for (;;) Line 2979  for (;;)
2979              if ((prop_script == prop_value) == prop_fail_result)              if ((prop_script == prop_value) == prop_fail_result)
2980                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
2981              }              }
2982            break;            /* Control never gets here */
2983    
2984            default:            default:
2985            RRETURN(PCRE_ERROR_INTERNAL);            RRETURN(PCRE_ERROR_INTERNAL);
           break;  
2986            }            }
2987          }          }
2988    
# Line 2876  for (;;) Line 3026  for (;;)
3026            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3027            if (fi >= max || eptr >= md->end_subject ||            if (fi >= max || eptr >= md->end_subject ||
3028                 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&                 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3029                  eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))                  IS_NEWLINE(eptr)))
3030              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
3031    
3032            GETCHARINC(c, eptr);            GETCHARINC(c, eptr);
# Line 2888  for (;;) Line 3038  for (;;)
3038              case OP_ANYBYTE:              case OP_ANYBYTE:
3039              break;              break;
3040    
3041                case OP_ANYNL:
3042                switch(c)
3043                  {
3044                  default: RRETURN(MATCH_NOMATCH);
3045                  case 0x000d:
3046                  if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3047                  break;
3048                  case 0x000a:
3049                  case 0x000b:
3050                  case 0x000c:
3051                  case 0x0085:
3052                  case 0x2028:
3053                  case 0x2029:
3054                  break;
3055                  }
3056                break;
3057    
3058              case OP_NOT_DIGIT:              case OP_NOT_DIGIT:
3059              if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)              if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3060                RRETURN(MATCH_NOMATCH);                RRETURN(MATCH_NOMATCH);
# Line 2932  for (;;) Line 3099  for (;;)
3099            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3100            if (rrc != MATCH_NOMATCH) RRETURN(rrc);            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3101            if (fi >= max || eptr >= md->end_subject ||            if (fi >= max || eptr >= md->end_subject ||
3102                 ((ims & PCRE_DOTALL) == 0 &&                 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
                  eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))  
3103              RRETURN(MATCH_NOMATCH);              RRETURN(MATCH_NOMATCH);
3104    
3105            c = *eptr++;            c = *eptr++;
# Line 2945  for (;;) Line 3111  for (;;)
3111              case OP_ANYBYTE:              case OP_ANYBYTE:
3112              break;              break;
3113    
3114                case OP_ANYNL:
3115                switch(c)
3116                  {
3117                  default: RRETURN(MATCH_NOMATCH);
3118                  case 0x000d:
3119                  if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3120                  break;
3121                  case 0x000a:
3122                  case 0x000b:
3123                  case 0x000c:
3124                  case 0x0085:
3125                  break;
3126                  }
3127                break;
3128    
3129              case OP_NOT_DIGIT:              case OP_NOT_DIGIT:
3130              if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);              if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3131              break;              break;
# Line 2977  for (;;) Line 3158  for (;;)
3158        /* Control never gets here */        /* Control never gets here */
3159        }        }
3160    
3161      /* If maximizing it is worth using inline code for speed, doing the type      /* If maximizing, it is worth using inline code for speed, doing the type
3162      test once at the start (i.e. keep it out of the loop). Again, keep the      test once at the start (i.e. keep it out of the loop). Again, keep the
3163      UTF-8 and UCP stuff separate. */      UTF-8 and UCP stuff separate. */
3164    
# Line 3058  for (;;) Line 3239  for (;;)
3239    
3240          /* eptr is now past the end of the maximum run */          /* eptr is now past the end of the maximum run */
3241    
3242            if (possessive) continue;
3243          for(;;)          for(;;)
3244            {            {
3245            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 3093  for (;;) Line 3275  for (;;)
3275    
3276          /* eptr is now past the end of the maximum run */          /* eptr is now past the end of the maximum run */
3277    
3278            if (possessive) continue;
3279          for(;;)          for(;;)
3280            {            {
3281            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 3135  for (;;) Line 3318  for (;;)
3318                {                {
3319                for (i = min; i < max; i++)                for (i = min; i < max; i++)
3320                  {                  {
3321                  if (eptr >= md->end_subject ||                  if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
                     (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))  
                   break;  
3322                  eptr++;                  eptr++;
3323                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;                  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3324                  }                  }
# Line 3161  for (;;) Line 3342  for (;;)
3342                {                {
3343                for (i = min; i < max; i++)                for (i = min; i < max; i++)
3344                  {                  {
3345                  if (eptr >= md->end_subject ||                  if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
                     (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))  
                   break;  
3346                  eptr++;                  eptr++;
3347                  }                  }
3348                break;                break;
# Line 3171  for (;;) Line 3350  for (;;)
3350              else              else
3351                {                {
3352                c = max - min;                c = max - min;
3353                if (c > md->end_subject - eptr) c = md->end_subject - eptr;                if (c > (unsigned int)(md->end_subject - eptr))
3354                    c = md->end_subject - eptr;
3355                eptr += c;                eptr += c;
3356                }                }
3357              }              }
# Line 3181  for (;;) Line 3361  for (;;)
3361    
3362            case OP_ANYBYTE:            case OP_ANYBYTE:
3363            c = max - min;            c = max - min;
3364            if (c > md->end_subject - eptr) c = md->end_subject - eptr;            if (c > (unsigned int)(md->end_subject - eptr))
3365                c = md->end_subject - eptr;
3366            eptr += c;            eptr += c;
3367            break;            break;
3368    
3369              case OP_ANYNL:
3370              for (i = min; i < max; i++)
3371                {
3372                int len = 1;
3373                if (eptr >= md->end_subject) break;
3374                GETCHARLEN(c, eptr, len);
3375                if (c == 0x000d)
3376                  {
3377                  if (++eptr >= md->end_subject) break;
3378                  if (*eptr == 0x000a) eptr++;
3379                  }
3380                else
3381                  {
3382                  if (c != 0x000a && c != 0x000b && c != 0x000c &&
3383                      c != 0x0085 && c != 0x2028 && c != 0x2029)
3384                    break;
3385                  eptr += len;
3386                  }
3387                }
3388              break;
3389    
3390            case OP_NOT_DIGIT:            case OP_NOT_DIGIT:
3391            for (i = min; i < max; i++)            for (i = min; i < max; i++)
3392              {              {
# Line 3257  for (;;) Line 3459  for (;;)
3459    
3460          /* eptr is now past the end of the maximum run */          /* eptr is now past the end of the maximum run */
3461    
3462            if (possessive) continue;
3463          for(;;)          for(;;)
3464            {            {
3465            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 3277  for (;;) Line 3480  for (;;)
3480              {              {
3481              for (i = min; i < max; i++)              for (i = min; i < max; i++)
3482                {                {
3483                if (eptr >= md->end_subject ||                if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
                   (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))  
                 break;  
3484                eptr++;                eptr++;
3485                }                }
3486              break;              break;
# Line 3288  for (;;) Line 3489  for (;;)
3489    
3490            case OP_ANYBYTE:            case OP_ANYBYTE:
3491            c = max - min;            c = max - min;
3492            if (c > md->end_subject - eptr) c = md->end_subject - eptr;            if (c > (unsigned int)(md->end_subject - eptr))
3493                c = md->end_subject - eptr;
3494            eptr += c;            eptr += c;
3495            break;            break;
3496    
3497              case OP_ANYNL:
3498              for (i = min; i < max; i++)
3499                {
3500                if (eptr >= md->end_subject) break;
3501                c = *eptr;
3502                if (c == 0x000d)
3503                  {
3504                  if (++eptr >= md->end_subject) break;
3505                  if (*eptr == 0x000a) eptr++;
3506                  }
3507                else
3508                  {
3509                  if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
3510                    break;
3511                  eptr++;
3512                  }
3513                }
3514              break;
3515    
3516            case OP_NOT_DIGIT:            case OP_NOT_DIGIT:
3517            for (i = min; i < max; i++)            for (i = min; i < max; i++)
3518              {              {
# Line 3352  for (;;) Line 3573  for (;;)
3573    
3574          /* eptr is now past the end of the maximum run */          /* eptr is now past the end of the maximum run */
3575    
3576            if (possessive) continue;
3577          while (eptr >= pp)          while (eptr >= pp)
3578            {            {
3579            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
# Line 3366  for (;;) Line 3588  for (;;)
3588        }        }
3589      /* Control never gets here */      /* Control never gets here */
3590    
3591      /* There's been some horrible disaster. Since all codes > OP_BRA are      /* There's been some horrible disaster. Arrival here can only mean there is
3592      for capturing brackets, and there shouldn't be any gaps between 0 and      something seriously wrong in the code above or the OP_xxx definitions. */
     OP_BRA, arrival here can only mean there is something seriously wrong  
     in the code above or the OP_xxx definitions. */  
3593    
3594      default:      default:
3595      DPRINTF(("Unknown opcode %d\n", *ecode));      DPRINTF(("Unknown opcode %d\n", *ecode));
3596      RRETURN(PCRE_ERROR_UNKNOWN_NODE);      RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
3597      }      }
3598    
3599    /* Do not stick any code in here without much thought; it is assumed    /* Do not stick any code in here without much thought; it is assumed
# Line 3411  Undefine all the macros that were define Line 3631  Undefine all the macros that were define
3631    
3632  #undef cur_is_word  #undef cur_is_word
3633  #undef condition  #undef condition
 #undef minimize  
3634  #undef prev_is_word  #undef prev_is_word
3635    
3636  #undef original_ims  #undef original_ims
# Line 3484  BOOL startline; Line 3703  BOOL startline;
3703  BOOL firstline;  BOOL firstline;
3704  BOOL first_byte_caseless = FALSE;  BOOL first_byte_caseless = FALSE;
3705  BOOL req_byte_caseless = FALSE;  BOOL req_byte_caseless = FALSE;
3706    BOOL utf8;
3707  match_data match_block;  match_data match_block;
3708  match_data *md = &match_block;  match_data *md = &match_block;
3709  const uschar *tables;  const uschar *tables;
# Line 3491  const uschar *start_bits = NULL; Line 3711  const uschar *start_bits = NULL;
3711  USPTR start_match = (USPTR)subject + start_offset;  USPTR start_match = (USPTR)subject + start_offset;
3712  USPTR end_subject;  USPTR end_subject;
3713  USPTR req_byte_ptr = start_match - 1;  USPTR req_byte_ptr = start_match - 1;
3714    eptrblock eptrchain[EPTR_WORK_SIZE];
3715    
3716  pcre_study_data internal_study;  pcre_study_data internal_study;
3717  const pcre_study_data *study;  const pcre_study_data *study;
# Line 3567  md->end_subject = md->start_subject + le Line 3788  md->end_subject = md->start_subject + le
3788  end_subject = md->end_subject;  end_subject = md->end_subject;
3789    
3790  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;  md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3791  md->utf8 = (re->options & PCRE_UTF8) != 0;  utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
3792    
3793  md->notbol = (options & PCRE_NOTBOL) != 0;  md->notbol = (options & PCRE_NOTBOL) != 0;
3794  md->noteol = (options & PCRE_NOTEOL) != 0;  md->noteol = (options & PCRE_NOTEOL) != 0;
# Line 3576  md->partial = (options & PCRE_PARTIAL) ! Line 3797  md->partial = (options & PCRE_PARTIAL) !
3797  md->hitend = FALSE;  md->hitend = FALSE;
3798    
3799  md->recursive = NULL;                   /* No recursion at top level */  md->recursive = NULL;                   /* No recursion at top level */
3800    md->eptrchain = eptrchain;              /* Make workspace generally available */
3801    
3802  md->lcc = tables + lcc_offset;  md->lcc = tables + lcc_offset;
3803  md->ctypes = tables + ctypes_offset;  md->ctypes = tables + ctypes_offset;
# Line 3583  md->ctypes = tables + ctypes_offset; Line 3805  md->ctypes = tables + ctypes_offset;
3805  /* Handle different types of newline. The two bits give four cases. If nothing  /* Handle different types of newline. The two bits give four cases. If nothing
3806  is set at run time, whatever was used at compile time applies. */  is set at run time, whatever was used at compile time applies. */
3807    
3808  switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) &  switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) &
3809           PCRE_NEWLINE_CRLF)         PCRE_NEWLINE_BITS)
3810    {    {
3811    default:              newline = NEWLINE; break;   /* Compile-time default */    case 0: newline = NEWLINE; break;   /* Compile-time default */
3812    case PCRE_NEWLINE_CR: newline = '\r'; break;    case PCRE_NEWLINE_CR: newline = '\r'; break;
3813    case PCRE_NEWLINE_LF: newline = '\n'; break;    case PCRE_NEWLINE_LF: newline = '\n'; break;
3814    case PCRE_NEWLINE_CR+    case PCRE_NEWLINE_CR+
3815         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;         PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
3816      case PCRE_NEWLINE_ANY: newline = -1; break;
3817      default: return PCRE_ERROR_BADNEWLINE;
3818    }    }
3819    
3820  if (newline > 255)  if (newline < 0)
3821    {    {
3822    md->nllen = 2;    md->nltype = NLTYPE_ANY;
   md->nl[0] = (newline >> 8) & 255;  
   md->nl[1] = newline & 255;  
3823    }    }
3824  else  else
3825    {    {
3826    md->nllen = 1;    md->nltype = NLTYPE_FIXED;
3827    md->nl[0] = newline;    if (newline > 255)
3828        {
3829        md->nllen = 2;
3830        md->nl[0] = (newline >> 8) & 255;
3831        md->nl[1] = newline & 255;
3832        }
3833      else
3834        {
3835        md->nllen = 1;
3836        md->nl[0] = newline;
3837        }
3838    }    }
3839    
3840  /* Partial matching is supported only for a restricted set of regexes at the  /* Partial matching is supported only for a restricted set of regexes at the
# Line 3615  if (md->partial && (re->options & PCRE_N Line 3847  if (md->partial && (re->options & PCRE_N
3847  back the character offset. */  back the character offset. */
3848    
3849  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
3850  if (md->utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)  if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3851    {    {
3852    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)    if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3853      return PCRE_ERROR_BADUTF8;      return PCRE_ERROR_BADUTF8;
# Line 3707  if ((re->options & PCRE_REQCHSET) != 0) Line 3939  if ((re->options & PCRE_REQCHSET) != 0)
3939    req_byte2 = (tables + fcc_offset)[req_byte];  /* case flipped */    req_byte2 = (tables + fcc_offset)[req_byte];  /* case flipped */
3940    }    }
3941    
3942    
3943    /* ==========================================================================*/
3944    
3945  /* Loop for handling unanchored repeated matching attempts; for anchored regexs  /* Loop for handling unanchored repeated matching attempts; for anchored regexs
3946  the loop runs just once. */  the loop runs just once. */
3947    
3948  do  for(;;)
3949    {    {
3950    USPTR save_end_subject = end_subject;    USPTR save_end_subject = end_subject;
3951    
# Line 3725  do Line 3960  do
3960    
3961    /* Advance to a unique first char if possible. If firstline is TRUE, the    /* Advance to a unique first char if possible. If firstline is TRUE, the
3962    start of the match is constrained to the first line of a multiline string.    start of the match is constrained to the first line of a multiline string.
3963    Implement this by temporarily adjusting end_subject so that we stop scanning    That is, the match must be before or at the first newline. Implement this by
3964    at a newline. If the match fails at the newline, later code breaks this loop.    temporarily adjusting end_subject so that we stop scanning at a newline. If
3965    */    the match fails at the newline, later code breaks this loop. */
3966    
3967    if (firstline)    if (firstline)
3968      {      {
3969      USPTR t = start_match;      USPTR t = start_match;
3970      while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++;      while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3971      end_subject = t;      end_subject = t;
3972      }      }
3973    
# Line 3753  do Line 3988  do
3988    
3989    else if (startline)    else if (startline)
3990      {      {
3991      if (start_match >= md->start_subject + md->nllen +      if (start_match > md->start_subject + start_offset)
           start_offset)  
3992        {        {
3993        while (start_match <= end_subject &&        while (start_match <= end_subject && !WAS_NEWLINE(start_match))
              !IS_NEWLINE(start_match - md->nllen))  
3994          start_match++;          start_match++;
3995        }        }
3996      }      }
# Line 3793  do Line 4026  do
4026    
4027    HOWEVER: when the subject string is very, very long, searching to its end can    HOWEVER: when the subject string is very, very long, searching to its end can
4028    take a long time, and give bad performance on quite ordinary patterns. This    take a long time, and give bad performance on quite ordinary patterns. This
4029    showed up when somebody was matching /^C/ on a 32-megabyte string... so we    showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4030    don't do this when the string is sufficiently long.    string... so we don't do this when the string is sufficiently long.
4031    
4032    ALSO: this processing is disabled when partial matching is requested.    ALSO: this processing is disabled when partial matching is requested.
4033    */    */
# Line 3826  do Line 4059  do
4059            }            }
4060          }          }
4061    
4062        /* If we can't find the required character, break the matching loop */        /* If we can't find the required character, break the matching loop,
4063          forcing a match failure. */
4064    
4065        if (p >= end_subject) break;        if (p >= end_subject)
4066            {
4067            rc = MATCH_NOMATCH;
4068            break;
4069            }
4070    
4071        /* If we have found the required character, save the point where we        /* If we have found the required character, save the point where we
4072        found it, so that we don't search again next time round the loop if        found it, so that we don't search again next time round the loop if
# Line 3838  do Line 4076  do
4076        }        }
4077      }      }
4078    
4079    /* When a match occurs, substrings will be set for all internal extractions;    /* OK, we can now run the match. */
   we just need to set up the whole thing as substring 0 before returning. If  
   there were too many extractions, set the return code to zero. In the case  
   where we had to get some local store to hold offsets for backreferences, copy  
   those back references that we can. In this case there need not be overflow  
   if certain parts of the pattern were not used. */  
4080    
4081    md->start_match = start_match;    md->start_match = start_match;
4082    md->match_call_count = 0;    md->match_call_count = 0;
4083      md->eptrn = 0;                          /* Next free eptrchain slot */
4084      rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0);
4085    
4086    rc = match(start_match, md->start_code, 2, md, ims, NULL, match_isgroup, 0);    /* Any return other than MATCH_NOMATCH breaks the loop. */
4087    
4088    /* When the result is no match, if the subject's first character was a    if (rc != MATCH_NOMATCH) break;
   newline and the PCRE_FIRSTLINE option is set, break (which will return  
   PCRE_ERROR_NOMATCH). The option requests that a match occur before the first  
   newline in the subject. Otherwise, advance the pointer to the next character  
   and continue - but the continuation will actually happen only when the  
   pattern is not anchored. */  
4089    
4090    if (rc == MATCH_NOMATCH)    /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4091      {    newline in the subject (though it may continue over the newline). Therefore,
4092      if (firstline &&    if we have just failed to match, starting at a newline, do not continue. */
4093          start_match <= md->end_subject - md->nllen &&  
4094          IS_NEWLINE(start_match))    if (firstline && IS_NEWLINE(start_match)) break;
4095        break;  
4096      start_match++;    /* Advance the match position by one character. */
4097    
4098      start_match++;
4099  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
4100      if (md->utf8)    if (utf8)
4101        while(start_match < end_subject && (*start_match & 0xc0) == 0x80)      while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4102          start_match++;        start_match++;
4103  #endif  #endif
     continue;  
     }  
4104    
4105    if (rc != MATCH_MATCH)    /* Break the loop if the pattern is anchored or if we have passed the end of
4106      {    the subject. */
4107      DPRINTF((">>>> error: returning %d\n", rc));  
4108      return rc;    if (anchored || start_match > end_subject) break;
4109      }  
4110      /* If we have just passed a CR and the newline option is CRLF or ANY, and we
4111      are now at a LF, advance the match position by one more character. */
4112    
4113      if (start_match[-1] == '\r' &&
4114           (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
4115           start_match < end_subject &&
4116           *start_match == '\n')
4117        start_match++;
4118    
4119      }   /* End of for(;;) "bumpalong" loop */
4120    
4121    /* ==========================================================================*/
4122    
4123    /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4124    conditions is true:
4125    
4126    /* We have a match! Copy the offset information from temporary store if  (1) The pattern is anchored;
   necessary */  
4127    
4128    (2) We are past the end of the subject;
4129    
4130    (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4131        this option requests that a match occur at or before the first newline in
4132        the subject.
4133    
4134    When we have a match and the offset vector is big enough to deal with any
4135    backreferences, captured substring offsets will already be set up. In the case
4136    where we had to get some local store to hold offsets for backreference
4137    processing, copy those that we can. In this case there need not be overflow if
4138    certain parts of the pattern were not used, even though there are more
4139    capturing parentheses than vector slots. */
4140    
4141    if (rc == MATCH_MATCH)
4142      {
4143    if (using_temporary_offsets)    if (using_temporary_offsets)
4144      {      {
4145      if (offsetcount >= 4)      if (offsetcount >= 4)
# Line 3889  do Line 4148  do
4148          (offsetcount - 2) * sizeof(int));          (offsetcount - 2) * sizeof(int));
4149        DPRINTF(("Copied offsets from temporary memory\n"));        DPRINTF(("Copied offsets from temporary memory\n"));
4150        }        }
4151      if (md->end_offset_top > offsetcount)      if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
       md->offset_overflow = TRUE;  
   
4152      DPRINTF(("Freeing temporary memory\n"));      DPRINTF(("Freeing temporary memory\n"));
4153      (pcre_free)(md->offset_vector);      (pcre_free)(md->offset_vector);
4154      }      }
4155    
4156      /* Set the return code to the number of captured strings, or 0 if there are
4157      too many to fit into the vector. */
4158    
4159    rc = md->offset_overflow? 0 : md->end_offset_top/2;    rc = md->offset_overflow? 0 : md->end_offset_top/2;
4160    
4161      /* If there is space, set up the whole thing as substring 0. */
4162    
4163    if (offsetcount < 2) rc = 0; else    if (offsetcount < 2) rc = 0; else
4164      {      {
4165      offsets[0] = start_match - md->start_subject;      offsets[0] = start_match - md->start_subject;
# Line 3908  do Line 4170  do
4170    return rc;    return rc;
4171    }    }
4172    
4173  /* This "while" is the end of the "do" above */  /* Control gets here if there has been an error, or if the overall match
4174    attempt has failed at all permitted starting positions. */
 while (!anchored && start_match <= end_subject);  
4175    
4176  if (using_temporary_offsets)  if (using_temporary_offsets)
4177    {    {
# Line 3918  if (using_temporary_offsets) Line 4179  if (using_temporary_offsets)
4179    (pcre_free)(md->offset_vector);    (pcre_free)(md->offset_vector);
4180    }    }
4181    
4182  if (md->partial && md->hitend)  if (rc != MATCH_NOMATCH)
4183      {
4184      DPRINTF((">>>> error: returning %d\n", rc));
4185      return rc;
4186      }
4187    else if (md->partial && md->hitend)
4188    {    {
4189    DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));    DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4190    return PCRE_ERROR_PARTIAL;    return PCRE_ERROR_PARTIAL;

Legend:
Removed from v.91  
changed lines
  Added in v.93

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12