/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Diff of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 602 by ph10, Wed May 25 08:29:03 2011 UTC revision 605 by ph10, Fri Jun 3 18:18:30 2011 UTC
# Line 545  static const unsigned char ebcdic_charta Line 545  static const unsigned char ebcdic_charta
545  /* Definition to allow mutual recursion */  /* Definition to allow mutual recursion */
546    
547  static BOOL  static BOOL
548    compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,    compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int *,
549      int *, int *, branch_chain *, compile_data *, int *);      int *, branch_chain *, compile_data *, int *);
550    
551    
552    
# Line 1403  does not. Line 1403  does not.
1403    
1404  Arguments:  Arguments:
1405    code         pointer to the start of the group    code         pointer to the start of the group
   options      pointer to external options  
   optbit       the option bit whose changing is significant, or  
                  zero if none are  
1406    skipassert   TRUE if certain assertions are to be skipped    skipassert   TRUE if certain assertions are to be skipped
1407    
1408  Returns:       pointer to the first significant opcode  Returns:       pointer to the first significant opcode
1409  */  */
1410    
1411  static const uschar*  static const uschar*
1412  first_significant_code(const uschar *code, int *options, int optbit,  first_significant_code(const uschar *code, BOOL skipassert)
   BOOL skipassert)  
1413  {  {
1414  for (;;)  for (;;)
1415    {    {
# Line 1468  and doing the check at the end; a flag s Line 1464  and doing the check at the end; a flag s
1464    
1465  Arguments:  Arguments:
1466    code     points to the start of the pattern (the bracket)    code     points to the start of the pattern (the bracket)
1467    options  the compiling options    utf8     TRUE in UTF-8 mode
1468    atend    TRUE if called when the pattern is complete    atend    TRUE if called when the pattern is complete
1469    cd       the "compile data" structure    cd       the "compile data" structure
1470    
# Line 1479  Returns: the fixed length, Line 1475  Returns: the fixed length,
1475  */  */
1476    
1477  static int  static int
1478  find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)  find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)
1479  {  {
1480  int length = -1;  int length = -1;
1481    
# Line 1496  for (;;) Line 1492  for (;;)
1492    register int op = *cc;    register int op = *cc;
1493    switch (op)    switch (op)
1494      {      {
1495        /* We only need to continue for OP_CBRA (normal capturing bracket) and
1496        OP_BRA (normal non-capturing bracket) because the other variants of these
1497        opcodes are all concerned with unlimited repeated groups, which of course
1498        are not of fixed length. They will cause a -1 response from the default
1499        case of this switch. */
1500    
1501      case OP_CBRA:      case OP_CBRA:
1502      case OP_BRA:      case OP_BRA:
1503      case OP_ONCE:      case OP_ONCE:
1504      case OP_COND:      case OP_COND:
1505      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);      d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
1506      if (d < 0) return d;      if (d < 0) return d;
1507      branchlength += d;      branchlength += d;
1508      do cc += GET(cc, 1); while (*cc == OP_ALT);      do cc += GET(cc, 1); while (*cc == OP_ALT);
# Line 1509  for (;;) Line 1511  for (;;)
1511    
1512      /* Reached end of a branch; if it's a ket it is the end of a nested      /* Reached end of a branch; if it's a ket it is the end of a nested
1513      call. If it's ALT it is an alternation in a nested call. If it is      call. If it's ALT it is an alternation in a nested call. If it is
1514      END it's the end of the outer call. All can be handled by the same code. */      END it's the end of the outer call. All can be handled by the same code.
1515        Note that we must not include the OP_KETRxxx opcodes here, because they
1516        all imply an unlimited repeat. */
1517    
1518      case OP_ALT:      case OP_ALT:
1519      case OP_KET:      case OP_KET:
     case OP_KETRMAX:  
     case OP_KETRMIN:  
1520      case OP_END:      case OP_END:
1521      if (length < 0) length = branchlength;      if (length < 0) length = branchlength;
1522        else if (length != branchlength) return -1;        else if (length != branchlength) return -1;
# Line 1532  for (;;) Line 1534  for (;;)
1534      cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */      cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1535      do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */      do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
1536      if (cc > cs && cc < ce) return -1;                /* Recursion */      if (cc > cs && cc < ce) return -1;                /* Recursion */
1537      d = find_fixedlength(cs + 2, options, atend, cd);      d = find_fixedlength(cs + 2, utf8, atend, cd);
1538      if (d < 0) return d;      if (d < 0) return d;
1539      branchlength += d;      branchlength += d;
1540      cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;
# Line 1575  for (;;) Line 1577  for (;;)
1577      case OP_CHAR:      case OP_CHAR:
1578      case OP_CHARI:      case OP_CHARI:
1579      case OP_NOT:      case OP_NOT:
1580      case OP_NOTI:      case OP_NOTI:
1581      branchlength++;      branchlength++;
1582      cc += 2;      cc += 2;
1583  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1584      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       cc += _pcre_utf8_table4[cc[-1] & 0x3f];  
1585  #endif  #endif
1586      break;      break;
1587    
# Line 1591  for (;;) Line 1592  for (;;)
1592      branchlength += GET2(cc,1);      branchlength += GET2(cc,1);
1593      cc += 4;      cc += 4;
1594  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
1595      if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)      if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
       cc += _pcre_utf8_table4[cc[-1] & 0x3f];  
1596  #endif  #endif
1597      break;      break;
1598    
# Line 1712  for (;;) Line 1712  for (;;)
1712    
1713    /* Handle capturing bracket */    /* Handle capturing bracket */
1714    
1715    else if (c == OP_CBRA)    else if (c == OP_CBRA || c == OP_SCBRA ||
1716               c == OP_CBRAPOS || c == OP_SCBRAPOS)
1717      {      {
1718      int n = GET2(code, 1+LINK_SIZE);      int n = GET2(code, 1+LINK_SIZE);
1719      if (n == number) return (uschar *)code;      if (n == number) return (uschar *)code;
# Line 1954  could_be_empty_branch(const uschar *code Line 1955  could_be_empty_branch(const uschar *code
1955    compile_data *cd)    compile_data *cd)
1956  {  {
1957  register int c;  register int c;
1958  for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);  for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
1959       code < endcode;       code < endcode;
1960       code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))       code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))
1961    {    {
1962    const uschar *ccode;    const uschar *ccode;
1963    
# Line 1972  for (code = first_significant_code(code Line 1973  for (code = first_significant_code(code
1973      continue;      continue;
1974      }      }
1975    
   /* Groups with zero repeats can of course be empty; skip them. */  
   
   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)  
     {  
     code += _pcre_OP_lengths[c];  
     do code += GET(code, 1); while (*code == OP_ALT);  
     c = *code;  
     continue;  
     }  
   
1976    /* For a recursion/subroutine call, if its end has been reached, which    /* For a recursion/subroutine call, if its end has been reached, which
1977    implies a subroutine call, we can scan it. */    implies a subroutine call, we can scan it. */
1978    
# Line 2004  for (code = first_significant_code(code Line 1995  for (code = first_significant_code(code
1995      continue;      continue;
1996      }      }
1997    
1998      /* Groups with zero repeats can of course be empty; skip them. */
1999    
2000      if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2001          c == OP_BRAPOSZERO)
2002        {
2003        code += _pcre_OP_lengths[c];
2004        do code += GET(code, 1); while (*code == OP_ALT);
2005        c = *code;
2006        continue;
2007        }
2008    
2009      /* A nested group that is already marked as "could be empty" can just be
2010      skipped. */
2011    
2012      if (c == OP_SBRA  || c == OP_SBRAPOS ||
2013          c == OP_SCBRA || c == OP_SCBRAPOS)
2014        {
2015        do code += GET(code, 1); while (*code == OP_ALT);
2016        c = *code;
2017        continue;
2018        }
2019    
2020    /* For other groups, scan the branches. */    /* For other groups, scan the branches. */
2021    
2022    if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)    if (c == OP_BRA  || c == OP_BRAPOS ||
2023          c == OP_CBRA || c == OP_CBRAPOS ||
2024          c == OP_ONCE || c == OP_COND)
2025      {      {
2026      BOOL empty_branch;      BOOL empty_branch;
2027      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */      if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
# Line 2135  for (code = first_significant_code(code Line 2150  for (code = first_significant_code(code
2150      case OP_KET:      case OP_KET:
2151      case OP_KETRMAX:      case OP_KETRMAX:
2152      case OP_KETRMIN:      case OP_KETRMIN:
2153        case OP_KETRPOS:
2154      case OP_ALT:      case OP_ALT:
2155      return TRUE;      return TRUE;
2156    
# Line 2682  if (next >= 0) switch(op_code) Line 2698  if (next >= 0) switch(op_code)
2698    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */    return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
2699    
2700    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These    /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
2701    opcodes are not used for multi-byte characters, because they are coded using    opcodes are not used for multi-byte characters, because they are coded using
2702    an XCLASS instead. */    an XCLASS instead. */
2703    
2704    case OP_NOT:    case OP_NOT:
2705    return (c = *previous) == next;    return (c = *previous) == next;
2706    
2707    case OP_NOTI:    case OP_NOTI:
2708    if ((c = *previous) == next) return TRUE;    if ((c = *previous) == next) return TRUE;
2709  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
2710    if (utf8)    if (utf8)
# Line 4201  for (;; ptr++) Line 4217  for (;; ptr++)
4217      if (*previous == OP_CHAR || *previous == OP_CHARI)      if (*previous == OP_CHAR || *previous == OP_CHARI)
4218        {        {
4219        op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;        op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
4220    
4221        /* Deal with UTF-8 characters that take up more than one byte. It's        /* Deal with UTF-8 characters that take up more than one byte. It's
4222        easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
4223        hold the length of the character in bytes, plus 0x80 to flag that it's a        hold the length of the character in bytes, plus 0x80 to flag that it's a
# Line 4246  for (;; ptr++) Line 4262  for (;; ptr++)
4262      /* If previous was a single negated character ([^a] or similar), we use      /* If previous was a single negated character ([^a] or similar), we use
4263      one of the special opcodes, replacing it. The code is shared with single-      one of the special opcodes, replacing it. The code is shared with single-
4264      character repeats by setting op_type to add a suitable offset into      character repeats by setting op_type to add a suitable offset into
4265      repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI      repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI
4266      are currently used only for single-byte chars. */      are currently used only for single-byte chars. */
4267    
4268      else if (*previous == OP_NOT || *previous == OP_NOTI)      else if (*previous == OP_NOT || *previous == OP_NOTI)
# Line 4483  for (;; ptr++) Line 4499  for (;; ptr++)
4499        }        }
4500    
4501      /* If previous was a bracket group, we may have to replicate it in certain      /* If previous was a bracket group, we may have to replicate it in certain
4502      cases. */      cases. Note that at this point we can encounter only the "basic" BRA and
4503        KET opcodes, as this is the place where they get converted into the more
4504        special varieties. */
4505    
4506      else if (*previous == OP_BRA  || *previous == OP_CBRA ||      else if (*previous == OP_BRA  || *previous == OP_CBRA ||
4507               *previous == OP_ONCE || *previous == OP_COND)               *previous == OP_ONCE || *previous == OP_COND)
4508        {        {
4509        register int i;        register int i;
       int ketoffset = 0;  
4510        int len = (int)(code - previous);        int len = (int)(code - previous);
4511        uschar *bralink = NULL;        uschar *bralink = NULL;
4512          uschar *brazeroptr = NULL;
4513    
4514        /* Repeating a DEFINE group is pointless */        /* Repeating a DEFINE group is pointless */
4515    
4516        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)        if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
# Line 4501  for (;; ptr++) Line 4519  for (;; ptr++)
4519          goto FAILED;          goto FAILED;
4520          }          }
4521    
       /* If the maximum repeat count is unlimited, find the end of the bracket  
       by scanning through from the start, and compute the offset back to it  
       from the current code pointer. */  
   
       if (repeat_max == -1)  
         {  
         register uschar *ket = previous;  
         do ket += GET(ket, 1); while (*ket != OP_KET);  
         ketoffset = (int)(code - ket);  
         }  
   
4522        /* The case of a zero minimum is special because of the need to stick        /* The case of a zero minimum is special because of the need to stick
4523        OP_BRAZERO in front of it, and because the group appears once in the        OP_BRAZERO in front of it, and because the group appears once in the
4524        data, whereas in other cases it appears the minimum number of times. For        data, whereas in other cases it appears the minimum number of times. For
# Line 4553  for (;; ptr++) Line 4560  for (;; ptr++)
4560              *previous++ = OP_SKIPZERO;              *previous++ = OP_SKIPZERO;
4561              goto END_REPEAT;              goto END_REPEAT;
4562              }              }
4563              brazeroptr = previous;    /* Save for possessive optimizing */
4564            *previous++ = OP_BRAZERO + repeat_type;            *previous++ = OP_BRAZERO + repeat_type;
4565            }            }
4566    
# Line 4717  for (;; ptr++) Line 4725  for (;; ptr++)
4725            }            }
4726          }          }
4727    
4728        /* If the maximum is unlimited, set a repeater in the final copy. We        /* If the maximum is unlimited, set a repeater in the final copy. For
4729        can't just offset backwards from the current code point, because we        ONCE brackets, that's all we need to do.
4730        don't know if there's been an options resetting after the ket. The  
4731        correct offset was computed above.        Otherwise, if the quantifier was possessive, we convert the BRA code to
4732          the POS form, and the KET code to KETRPOS. (It turns out to be convenient
4733          at runtime to detect this kind of subpattern at both the start and at the
4734          end.) The use of special opcodes makes it possible to reduce greatly the
4735          stack usage in pcre_exec(). If the group is preceded by OP_BRAZERO,
4736          convert this to OP_BRAPOSZERO. Then cancel the possessive flag so that
4737          the default action below, of wrapping everything inside atomic brackets,
4738          does not happen.
4739    
4740        Then, when we are doing the actual compile phase, check to see whether        Then, when we are doing the actual compile phase, check to see whether
4741        this group is a non-atomic one that could match an empty string. If so,        this group is one that could match an empty string. If so, convert the
4742        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so        initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so that runtime
4743        that runtime checking can be done. [This check is also applied to        checking can be done. [This check is also applied to ONCE groups at
4744        atomic groups at runtime, but in a different way.] */        runtime, but in a different way.] */
4745    
4746        else        else
4747          {          {
4748          uschar *ketcode = code - ketoffset;          uschar *ketcode = code - 1 - LINK_SIZE;
4749          uschar *bracode = ketcode - GET(ketcode, 1);          uschar *bracode = ketcode - GET(ketcode, 1);
4750          *ketcode = OP_KETRMAX + repeat_type;  
4751          if (lengthptr == NULL && *bracode != OP_ONCE)          if (*bracode == OP_ONCE)
4752              *ketcode = OP_KETRMAX + repeat_type;
4753            else
4754            {            {
4755            uschar *scode = bracode;            if (possessive_quantifier)
           do  
4756              {              {
4757              if (could_be_empty_branch(scode, ketcode, utf8, cd))              *bracode += 1;                   /* Switch to xxxPOS opcodes */
4758                *ketcode = OP_KETRPOS;
4759                if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
4760                possessive_quantifier = FALSE;
4761                }
4762              else *ketcode = OP_KETRMAX + repeat_type;
4763    
4764              if (lengthptr == NULL)
4765                {
4766                uschar *scode = bracode;
4767                do
4768                {                {
4769                *bracode += OP_SBRA - OP_BRA;                if (could_be_empty_branch(scode, ketcode, utf8, cd))
4770                break;                  {
4771                    *bracode += OP_SBRA - OP_BRA;
4772                    break;
4773                    }
4774                  scode += GET(scode, 1);
4775                }                }
4776              scode += GET(scode, 1);              while (*scode == OP_ALT);
4777              }              }
           while (*scode == OP_ALT);  
4778            }            }
4779          }          }
4780        }        }
# Line 4766  for (;; ptr++) Line 4795  for (;; ptr++)
4795        }        }
4796    
4797      /* If the character following a repeat is '+', or if certain optimization      /* If the character following a repeat is '+', or if certain optimization
4798      tests above succeeded, possessive_quantifier is TRUE. For some of the      tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
4799      simpler opcodes, there is a special alternative opcode for this. For      there are special alternative opcodes for this case. For anything else, we
4800      anything else, we wrap the entire repeated item inside OP_ONCE brackets.      wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
4801      The '+' notation is just syntactic sugar, taken from Sun's Java package,      notation is just syntactic sugar, taken from Sun's Java package, but the
4802      but the special opcodes can optimize it a bit. The repeated item starts at      special opcodes can optimize it.
4803      tempcode, not at previous, which might be the first part of a string whose  
4804      (former) last char we repeated.      Possessively repeated subpatterns have already been handled in the code
4805        just above, so possessive_quantifier is always FALSE for them at this
4806        stage.
4807    
4808        Note that the repeated item starts at tempcode, not at previous, which
4809        might be the first part of a string whose (former) last char we repeated.
4810    
4811      Possessifying an 'exact' quantifier has no effect, so we can ignore it. But      Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4812      an 'upto' may follow. We skip over an 'exact' item, and then test the      an 'upto' may follow. We skip over an 'exact' item, and then test the
# Line 5714  for (;; ptr++) Line 5748  for (;; ptr++)
5748          is necessary to ensure we correctly detect the start of the pattern in          is necessary to ensure we correctly detect the start of the pattern in
5749          both phases.          both phases.
5750    
5751          If we are not at the pattern start, compile code to change the ims          If we are not at the pattern start, reset the greedy defaults and the
5752          options if this setting actually changes any of them, and reset the          case value for firstbyte and reqbyte. */
         greedy defaults and the case value for firstbyte and reqbyte. */  
5753    
5754          if (*ptr == CHAR_RIGHT_PARENTHESIS)          if (*ptr == CHAR_RIGHT_PARENTHESIS)
5755            {            {
# Line 5733  for (;; ptr++) Line 5766  for (;; ptr++)
5766              }              }
5767    
5768            /* Change options at this level, and pass them back for use            /* Change options at this level, and pass them back for use
5769            in subsequent branches. When not at the start of the pattern, this            in subsequent branches. */
           information is also necessary so that a resetting item can be  
           compiled at the end of a group (if we are in a group). */  
5770    
5771            *optionsptr = options = newoptions;            *optionsptr = options = newoptions;
5772            previous = NULL;       /* This item can't be repeated */            previous = NULL;       /* This item can't be repeated */
# Line 5773  for (;; ptr++) Line 5804  for (;; ptr++)
5804    
5805      /* Process nested bracketed regex. Assertions may not be repeated, but      /* Process nested bracketed regex. Assertions may not be repeated, but
5806      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a      other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5807      non-register variable in order to be able to pass its address because some      non-register variable (tempcode) in order to be able to pass its address
5808      compilers complain otherwise. Pass in a new setting for the ims options if      because some compilers complain otherwise. */
     they have changed. */  
5809    
5810      previous = (bravalue >= OP_ONCE)? code : NULL;      previous = (bravalue >= OP_ONCE)? code : NULL;
5811      *code = bravalue;      *code = bravalue;
# Line 5785  for (;; ptr++) Line 5815  for (;; ptr++)
5815    
5816      if (!compile_regex(      if (!compile_regex(
5817           newoptions,                   /* The complete new option state */           newoptions,                   /* The complete new option state */
          options & PCRE_IMS,           /* The previous ims option state */  
5818           &tempcode,                    /* Where to put code (updated) */           &tempcode,                    /* Where to put code (updated) */
5819           &ptr,                         /* Input pointer (updated) */           &ptr,                         /* Input pointer (updated) */
5820           errorcodeptr,                 /* Where to put an error message */           errorcodeptr,                 /* Where to put an error message */
# Line 5872  for (;; ptr++) Line 5901  for (;; ptr++)
5901          goto FAILED;          goto FAILED;
5902          }          }
5903        *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;        *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5904        *code++ = OP_BRA;        code++;   /* This already contains bravalue */
5905        PUTINC(code, 0, 1 + LINK_SIZE);        PUTINC(code, 0, 1 + LINK_SIZE);
5906        *code++ = OP_KET;        *code++ = OP_KET;
5907        PUTINC(code, 0, 1 + LINK_SIZE);        PUTINC(code, 0, 1 + LINK_SIZE);
# Line 6242  value of lengthptr distinguishes the two Line 6271  value of lengthptr distinguishes the two
6271    
6272  Arguments:  Arguments:
6273    options        option bits, including any changes for this subpattern    options        option bits, including any changes for this subpattern
   oldims         previous settings of ims option bits  
6274    codeptr        -> the address of the current code pointer    codeptr        -> the address of the current code pointer
6275    ptrptr         -> the address of the current pattern pointer    ptrptr         -> the address of the current pattern pointer
6276    errorcodeptr   -> pointer to error code variable    errorcodeptr   -> pointer to error code variable
# Line 6260  Returns: TRUE on success Line 6288  Returns: TRUE on success
6288  */  */
6289    
6290  static BOOL  static BOOL
6291  compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,  compile_regex(int options, uschar **codeptr, const uschar **ptrptr,
6292    int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,    int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
6293    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,    int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
6294    int *lengthptr)    int *lengthptr)
# Line 6277  int branchfirstbyte, branchreqbyte; Line 6305  int branchfirstbyte, branchreqbyte;
6305  int length;  int length;
6306  int orig_bracount;  int orig_bracount;
6307  int max_bracount;  int max_bracount;
 int old_external_options = cd->external_options;  
6308  branch_chain bc;  branch_chain bc;
6309    
6310  bc.outer = bcptr;  bc.outer = bcptr;
# Line 6301  pre-compile phase to find out whether an Line 6328  pre-compile phase to find out whether an
6328    
6329  /* If this is a capturing subpattern, add to the chain of open capturing items  /* If this is a capturing subpattern, add to the chain of open capturing items
6330  so that we can detect them if (*ACCEPT) is encountered. This is also used to  so that we can detect them if (*ACCEPT) is encountered. This is also used to
6331  detect groups that contain recursive back references to themselves. */  detect groups that contain recursive back references to themselves. Note that
6332    only OP_CBRA need be tested here; changing this opcode to one of its variants,
6333    e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
6334    
6335  if (*code == OP_CBRA)  if (*code == OP_CBRA)
6336    {    {
# Line 6347  for (;;) Line 6376  for (;;)
6376      return FALSE;      return FALSE;
6377      }      }
6378    
   /* If the external options have changed during this branch, it means that we  
   are at the top level, and a leading option setting has been encountered. We  
   need to re-set the original option values to take account of this so that,  
   during the pre-compile phase, we know to allow for a re-set at the start of  
   subsequent branches. */  
   
   if (old_external_options != cd->external_options)  
     oldims = cd->external_options & PCRE_IMS;  
   
6379    /* Keep the highest bracket count in case (?| was used and some branch    /* Keep the highest bracket count in case (?| was used and some branch
6380    has fewer than the rest. */    has fewer than the rest. */
6381    
# Line 6416  for (;;) Line 6436  for (;;)
6436        {        {
6437        int fixed_length;        int fixed_length;
6438        *code = OP_END;        *code = OP_END;
6439        fixed_length = find_fixedlength(last_branch, options, FALSE, cd);        fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
6440            FALSE, cd);
6441        DPRINTF(("fixed length = %d\n", fixed_length));        DPRINTF(("fixed length = %d\n", fixed_length));
6442        if (fixed_length == -3)        if (fixed_length == -3)
6443          {          {
# Line 6437  for (;;) Line 6458  for (;;)
6458    of offsets, with the field in the BRA item now becoming an offset to the    of offsets, with the field in the BRA item now becoming an offset to the
6459    first alternative. If there are no alternatives, it points to the end of the    first alternative. If there are no alternatives, it points to the end of the
6460    group. The length in the terminating ket is always the length of the whole    group. The length in the terminating ket is always the length of the whole
6461    bracketed item. If any of the ims options were changed inside the group,    bracketed item. Return leaving the pointer at the terminating char. */
   compile a resetting op-code following, except at the very end of the pattern.  
   Return leaving the pointer at the terminating char. */  
6462    
6463    if (*ptr != CHAR_VERTICAL_LINE)    if (*ptr != CHAR_VERTICAL_LINE)
6464      {      {
# Line 6564  of the more common cases more precisely. Line 6583  of the more common cases more precisely.
6583    
6584  Arguments:  Arguments:
6585    code           points to start of expression (the bracket)    code           points to start of expression (the bracket)
   options        points to the options setting  
6586    bracket_map    a bitmap of which brackets we are inside while testing; this    bracket_map    a bitmap of which brackets we are inside while testing; this
6587                    handles up to substring 31; after that we just have to take                    handles up to substring 31; after that we just have to take
6588                    the less precise approach                    the less precise approach
# Line 6574  Returns: TRUE or FALSE Line 6592  Returns: TRUE or FALSE
6592  */  */
6593    
6594  static BOOL  static BOOL
6595  is_anchored(register const uschar *code, int *options, unsigned int bracket_map,  is_anchored(register const uschar *code, unsigned int bracket_map,
6596    unsigned int backref_map)    unsigned int backref_map)
6597  {  {
6598  do {  do {
6599     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6600       options, PCRE_MULTILINE, FALSE);       FALSE);
6601     register int op = *scode;     register int op = *scode;
6602    
6603     /* Non-capturing brackets */     /* Non-capturing brackets */
6604    
6605     if (op == OP_BRA)     if (op == OP_BRA  || op == OP_BRAPOS ||
6606           op == OP_SBRA || op == OP_SBRAPOS)
6607       {       {
6608       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
6609       }       }
6610    
6611     /* Capturing brackets */     /* Capturing brackets */
6612    
6613     else if (op == OP_CBRA)     else if (op == OP_CBRA  || op == OP_CBRAPOS ||
6614                op == OP_SCBRA || op == OP_SCBRAPOS)
6615       {       {
6616       int n = GET2(scode, 1+LINK_SIZE);       int n = GET2(scode, 1+LINK_SIZE);
6617       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6618       if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;       if (!is_anchored(scode, new_map, backref_map)) return FALSE;
6619       }       }
6620    
6621     /* Other brackets */     /* Other brackets */
6622    
6623     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
6624       {       {
6625       if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;       if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
6626       }       }
6627    
6628     /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and     /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
# Line 6653  is_startline(const uschar *code, unsigne Line 6673  is_startline(const uschar *code, unsigne
6673  {  {
6674  do {  do {
6675     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],     const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6676       NULL, 0, FALSE);       FALSE);
6677     register int op = *scode;     register int op = *scode;
6678    
6679     /* If we are at the start of a conditional assertion group, *both* the     /* If we are at the start of a conditional assertion group, *both* the
# Line 6680  do { Line 6700  do {
6700         scode += 1 + LINK_SIZE;         scode += 1 + LINK_SIZE;
6701         break;         break;
6702         }         }
6703       scode = first_significant_code(scode, NULL, 0, FALSE);       scode = first_significant_code(scode, FALSE);
6704       op = *scode;       op = *scode;
6705       }       }
6706    
6707     /* Non-capturing brackets */     /* Non-capturing brackets */
6708    
6709     if (op == OP_BRA)     if (op == OP_BRA  || op == OP_BRAPOS ||
6710           op == OP_SBRA || op == OP_SBRAPOS)
6711       {       {
6712       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6713       }       }
6714    
6715     /* Capturing brackets */     /* Capturing brackets */
6716    
6717     else if (op == OP_CBRA)     else if (op == OP_CBRA  || op == OP_CBRAPOS ||
6718                op == OP_SCBRA || op == OP_SCBRAPOS)
6719       {       {
6720       int n = GET2(scode, 1+LINK_SIZE);       int n = GET2(scode, 1+LINK_SIZE);
6721       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);       int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
# Line 6743  we return that char, otherwise -1. Line 6765  we return that char, otherwise -1.
6765    
6766  Arguments:  Arguments:
6767    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
   options    pointer to the options (used to check casing changes)  
6768    inassert   TRUE if in an assertion    inassert   TRUE if in an assertion
6769    
6770  Returns:     -1 or the fixed first char  Returns:     -1 or the fixed first char
6771  */  */
6772    
6773  static int  static int
6774  find_firstassertedchar(const uschar *code, int *options, BOOL inassert)  find_firstassertedchar(const uschar *code, BOOL inassert)
6775  {  {
6776  register int c = -1;  register int c = -1;
6777  do {  do {
6778     int d;     int d;
6779     const uschar *scode =     int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
6780       first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);               *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? 2:0;
6781       const uschar *scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
6782     register int op = *scode;     register int op = *scode;
6783    
6784     switch(op)     switch(op)
# Line 6765  do { Line 6787  do {
6787       return -1;       return -1;
6788    
6789       case OP_BRA:       case OP_BRA:
6790         case OP_BRAPOS:
6791       case OP_CBRA:       case OP_CBRA:
6792         case OP_SCBRA:
6793         case OP_CBRAPOS:
6794         case OP_SCBRAPOS:
6795       case OP_ASSERT:       case OP_ASSERT:
6796       case OP_ONCE:       case OP_ONCE:
6797       case OP_COND:       case OP_COND:
6798       if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)       if ((d = find_firstassertedchar(scode, op == OP_ASSERT)) < 0)
6799         return -1;         return -1;
6800       if (c < 0) c = d; else if (c != d) return -1;       if (c < 0) c = d; else if (c != d) return -1;
6801       break;       break;
6802    
6803       case OP_EXACT:       /* Fall through */       case OP_EXACT:
6804       scode += 2;       scode += 2;
6805         /* Fall through */
6806    
6807       case OP_CHAR:       case OP_CHAR:
      case OP_CHARI:  
6808       case OP_PLUS:       case OP_PLUS:
6809       case OP_MINPLUS:       case OP_MINPLUS:
6810       case OP_POSPLUS:       case OP_POSPLUS:
6811       if (!inassert) return -1;       if (!inassert) return -1;
6812       if (c < 0)       if (c < 0) c = scode[1];
6813         {         else if (c != scode[1]) return -1;
6814         c = scode[1];       break;
6815         if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;  
6816         }       case OP_EXACTI:
6817       else if (c != scode[1]) return -1;       scode += 2;
6818         /* Fall through */
6819    
6820         case OP_CHARI:
6821         case OP_PLUSI:
6822         case OP_MINPLUSI:
6823         case OP_POSPLUSI:
6824         if (!inassert) return -1;
6825         if (c < 0) c = scode[1] | REQ_CASELESS;
6826           else if (c != scode[1]) return -1;
6827       break;       break;
6828       }       }
6829    
# Line 6939  while (ptr[skipatstart] == CHAR_LEFT_PAR Line 6974  while (ptr[skipatstart] == CHAR_LEFT_PAR
6974    
6975  utf8 = (options & PCRE_UTF8) != 0;  utf8 = (options & PCRE_UTF8) != 0;
6976    
6977  /* Can't support UTF8 unless PCRE has been compiled to include the code. The  /* Can't support UTF8 unless PCRE has been compiled to include the code. The
6978  return of an error code from _pcre_valid_utf8() is a new feature, introduced in  return of an error code from _pcre_valid_utf8() is a new feature, introduced in
6979  release 8.13. The only use we make of it here is to adjust the offset value to  release 8.13. The only use we make of it here is to adjust the offset value to
6980  the end of the string for a short string error, for compatibility with previous  the end of the string for a short string error, for compatibility with previous
6981  versions. */  versions. */
6982    
6983  #ifdef SUPPORT_UTF8  #ifdef SUPPORT_UTF8
# Line 7063  outside can help speed up starting point Line 7098  outside can help speed up starting point
7098  ptr += skipatstart;  ptr += skipatstart;
7099  code = cworkspace;  code = cworkspace;
7100  *code = OP_BRA;  *code = OP_BRA;
7101  (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,  (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
7102    &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,    FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);
   &length);  
7103  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;  if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
7104    
7105  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,  DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
# Line 7137  of the function here. */ Line 7171  of the function here. */
7171  ptr = (const uschar *)pattern + skipatstart;  ptr = (const uschar *)pattern + skipatstart;
7172  code = (uschar *)codestart;  code = (uschar *)codestart;
7173  *code = OP_BRA;  *code = OP_BRA;
7174  (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,  (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0,
7175    &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);    &firstbyte, &reqbyte, NULL, cd, NULL);
7176  re->top_bracket = cd->bracount;  re->top_bracket = cd->bracount;
7177  re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
7178  re->flags = cd->external_flags;  re->flags = cd->external_flags;
# Line 7204  if (cd->check_lookbehind) Line 7238  if (cd->check_lookbehind)
7238        uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);        uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
7239        int end_op = *be;        int end_op = *be;
7240        *be = OP_END;        *be = OP_END;
7241        fixed_length = find_fixedlength(cc, re->options, TRUE, cd);        fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
7242            cd);
7243        *be = end_op;        *be = end_op;
7244        DPRINTF(("fixed length = %d\n", fixed_length));        DPRINTF(("fixed length = %d\n", fixed_length));
7245        if (fixed_length < 0)        if (fixed_length < 0)
# Line 7243  start with ^. and also when all branches Line 7278  start with ^. and also when all branches
7278    
7279  if ((re->options & PCRE_ANCHORED) == 0)  if ((re->options & PCRE_ANCHORED) == 0)
7280    {    {
7281    int temp_options = re->options;   /* May get changed during these scans */    if (is_anchored(codestart, 0, cd->backref_map))
   if (is_anchored(codestart, &temp_options, 0, cd->backref_map))  
7282      re->options |= PCRE_ANCHORED;      re->options |= PCRE_ANCHORED;
7283    else    else
7284      {      {
7285      if (firstbyte < 0)      if (firstbyte < 0)
7286        firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);        firstbyte = find_firstassertedchar(codestart, FALSE);
7287      if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */      if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
7288        {        {
7289        int ch = firstbyte & 255;        int ch = firstbyte & 255;

Legend:
Removed from v.602  
changed lines
  Added in v.605

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12