/[pcre]/code/trunk/pcre.c
ViewVC logotype

Diff of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 25 by nigel, Sat Feb 24 21:38:45 2007 UTC revision 33 by nigel, Sat Feb 24 21:39:01 2007 UTC
# Line 9  the file Tech.Notes for some information Line 9  the file Tech.Notes for some information
9    
10  Written by: Philip Hazel <ph10@cam.ac.uk>  Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12             Copyright (c) 1998 University of Cambridge             Copyright (c) 1997-1999 University of Cambridge
13    
14  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
15  Permission is granted to anyone to use this software for any purpose on any  Permission is granted to anyone to use this software for any purpose on any
# Line 25  restrictions: Line 25  restrictions:
25    
26  3. Altered versions must be plainly marked as such, and must not be  3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.     misrepresented as being the original software.
28    
29    4. If PCRE is embedded in any software that is released under the GNU
30       General Public Licence (GPL), then the terms of that licence shall
31       supersede any condition above with which it is incompatible.
32  -----------------------------------------------------------------------------  -----------------------------------------------------------------------------
33  */  */
34    
# Line 1087  for (;; ptr++) Line 1091  for (;; ptr++)
1091      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||      else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1092               (int)*previous == OP_COND)               (int)*previous == OP_COND)
1093        {        {
1094        int i, ketoffset = 0;        register int i;
1095          int ketoffset = 0;
1096        int len = code - previous;        int len = code - previous;
1097          uschar *bralink = NULL;
1098    
1099        /* If the maximum repeat count is unlimited, find the end of the bracket        /* If the maximum repeat count is unlimited, find the end of the bracket
1100        by scanning through from the start, and compute the offset back to it        by scanning through from the start, and compute the offset back to it
# Line 1103  for (;; ptr++) Line 1109  for (;; ptr++)
1109          ketoffset = code - ket;          ketoffset = code - ket;
1110          }          }
1111    
1112          /* The case of a zero minimum is special because of the need to stick
1113          OP_BRAZERO in front of it, and because the group appears once in the
1114          data, whereas in other cases it appears the minimum number of times. For
1115          this reason, it is simplest to treat this case separately, as otherwise
1116          the code gets far too messy. There are several special subcases when the
1117          minimum is zero. */
1118    
1119          if (repeat_min == 0)
1120            {
1121            /* If the maximum is also zero, we just omit the group from the output
1122            altogether. */
1123    
1124            if (repeat_max == 0)
1125              {
1126              code = previous;
1127              previous = NULL;
1128              break;
1129              }
1130    
1131            /* If the maximum is 1 or unlimited, we just have to stick in the
1132            BRAZERO and do no more at this point. */
1133    
1134            if (repeat_max <= 1)
1135              {
1136              memmove(previous+1, previous, len);
1137              code++;
1138              *previous++ = OP_BRAZERO + repeat_type;
1139              }
1140    
1141            /* If the maximum is greater than 1 and limited, we have to replicate
1142            in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1143            The first one has to be handled carefully because it's the original
1144            copy, which has to be moved up. The remainder can be handled by code
1145            that is common with the non-zero minimum case below. We just have to
1146            adjust the value of repeat_max, since one less copy is required. */
1147    
1148            else
1149              {
1150              int offset;
1151              memmove(previous+4, previous, len);
1152              code += 4;
1153              *previous++ = OP_BRAZERO + repeat_type;
1154              *previous++ = OP_BRA;
1155    
1156              /* We chain together the bracket offset fields that have to be
1157              filled in later when the ends of the brackets are reached. */
1158    
1159              offset = (bralink == NULL)? 0 : previous - bralink;
1160              bralink = previous;
1161              *previous++ = offset >> 8;
1162              *previous++ = offset & 255;
1163              }
1164    
1165            repeat_max--;
1166            }
1167    
1168          /* If the minimum is greater than zero, replicate the group as many
1169          times as necessary, and adjust the maximum to the number of subsequent
1170          copies that we need. */
1171    
1172          else
1173            {
1174            for (i = 1; i < repeat_min; i++)
1175              {
1176              memcpy(code, previous, len);
1177              code += len;
1178              }
1179            if (repeat_max > 0) repeat_max -= repeat_min;
1180            }
1181    
1182          /* This code is common to both the zero and non-zero minimum cases. If
1183          the maximum is limited, it replicates the group in a nested fashion,
1184          remembering the bracket starts on a stack. In the case of a zero minimum,
1185          the first one was set up above. In all cases the repeat_max now specifies
1186          the number of additional copies needed. */
1187    
1188          if (repeat_max >= 0)
1189            {
1190            for (i = repeat_max - 1; i >= 0; i--)
1191              {
1192              *code++ = OP_BRAZERO + repeat_type;
1193    
1194              /* All but the final copy start a new nesting, maintaining the
1195              chain of brackets outstanding. */
1196    
1197              if (i != 0)
1198                {
1199                int offset;
1200                *code++ = OP_BRA;
1201                offset = (bralink == NULL)? 0 : code - bralink;
1202                bralink = code;
1203                *code++ = offset >> 8;
1204                *code++ = offset & 255;
1205                }
1206    
1207              memcpy(code, previous, len);
1208              code += len;
1209              }
1210    
1211            /* Now chain through the pending brackets, and fill in their length
1212            fields (which are holding the chain links pro tem). */
1213    
1214            while (bralink != NULL)
1215              {
1216              int oldlinkoffset;
1217              int offset = code - bralink + 1;
1218              uschar *bra = code - offset;
1219              oldlinkoffset = (bra[1] << 8) + bra[2];
1220              bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1221              *code++ = OP_KET;
1222              *code++ = bra[1] = offset >> 8;
1223              *code++ = bra[2] = (offset & 255);
1224              }
1225            }
1226    
1227          /* If the maximum is unlimited, set a repeater in the final copy. We
1228          can't just offset backwards from the current code point, because we
1229          don't know if there's been an options resetting after the ket. The
1230          correct offset was computed above. */
1231    
1232          else code[-ketoffset] = OP_KETRMAX + repeat_type;
1233    
1234    
1235    #ifdef NEVER
1236        /* If the minimum is greater than zero, and the maximum is unlimited or        /* If the minimum is greater than zero, and the maximum is unlimited or
1237        equal to the minimum, the first copy remains where it is, and is        equal to the minimum, the first copy remains where it is, and is
1238        replicated up to the minimum number of times. This case includes the +        replicated up to the minimum number of times. This case includes the +
# Line 1150  for (;; ptr++) Line 1280  for (;; ptr++)
1280        correct offset was computed above. */        correct offset was computed above. */
1281    
1282        if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;        if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type;
1283    #endif
1284    
1285    
1286        }        }
1287    
1288      /* Else there's some kind of shambles */      /* Else there's some kind of shambles */
# Line 1684  all of whose alternatives start with OP_ Line 1817  all of whose alternatives start with OP_
1817  it's anchored. However, if this is a multiline pattern, then only OP_SOD  it's anchored. However, if this is a multiline pattern, then only OP_SOD
1818  counts, since OP_CIRC can match in the middle.  counts, since OP_CIRC can match in the middle.
1819    
1820  A branch is also implicitly anchored if it starts with .* because that will try  A branch is also implicitly anchored if it starts with .* and DOTALL is set,
1821  the rest of the pattern at all possible matching points, so there is no point  because that will try the rest of the pattern at all possible matching points,
1822  trying them again.  so there is no point trying them again.
1823    
1824  Arguments:  Arguments:
1825    code       points to start of expression (the bracket)    code       points to start of expression (the bracket)
# Line 1704  do { Line 1837  do {
1837     register int op = *scode;     register int op = *scode;
1838     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1839       { if (!is_anchored(scode, options)) return FALSE; }       { if (!is_anchored(scode, options)) return FALSE; }
1840     else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)     else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
1841                (*options & PCRE_DOTALL) != 0)
1842       { if (scode[1] != OP_ANY) return FALSE; }       { if (scode[1] != OP_ANY) return FALSE; }
1843     else if (op != OP_SOD &&     else if (op != OP_SOD &&
1844             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))             ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
# Line 1718  return TRUE; Line 1852  return TRUE;
1852    
1853    
1854  /*************************************************  /*************************************************
1855  *     Check for start with \n line expression    *  *         Check for starting with ^ or .*        *
1856  *************************************************/  *************************************************/
1857    
1858  /* This is called for multiline expressions to try to find out if every branch  /* This is called to find out if every branch starts with ^ or .* so that
1859  starts with ^ so that "first char" processing can be done to speed things up.  "first char" processing can be done to speed things up in multiline
1860    matching and for non-DOTALL patterns that start with .* (which must start at
1861    the beginning or after \n).
1862    
1863  Argument:  points to start of expression (the bracket)  Argument:  points to start of expression (the bracket)
1864  Returns:   TRUE or FALSE  Returns:   TRUE or FALSE
# Line 1736  do { Line 1872  do {
1872     register int op = *scode;     register int op = *scode;
1873     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)     if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1874       { if (!is_startline(scode)) return FALSE; }       { if (!is_startline(scode)) return FALSE; }
1875       else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1876         { if (scode[1] != OP_ANY) return FALSE; }
1877     else if (op != OP_CIRC) return FALSE;     else if (op != OP_CIRC) return FALSE;
1878     code += (code[1] << 8) + code[2];     code += (code[1] << 8) + code[2];
1879     }     }
# Line 2268  while ((c = *(++ptr)) != 0) Line 2406  while ((c = *(++ptr)) != 0)
2406        else if (c == '+') { maxval = -1; ptr++; }        else if (c == '+') { maxval = -1; ptr++; }
2407        else if (c == '?') { minval = 0; ptr++; }        else if (c == '?') { minval = 0; ptr++; }
2408    
2409        /* If there is a minimum > 1 we have to replicate up to minval-1 times;        /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2410        if there is a limited maximum we have to replicate up to maxval-1 times        group, and if the maximum is greater than zero, we have to replicate
2411        and allow for a BRAZERO item before each optional copy, as we also have        maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2412        to do before the first copy if the minimum is zero. */        bracket set - hence the 7. */
2413    
2414        if (minval == 0) length++;        if (minval == 0)
2415          else if (minval > 1) length += (minval - 1) * duplength;          {
2416        if (maxval > minval) length += (maxval - minval) * (duplength + 1);          length++;
2417            if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2418            }
2419    
2420          /* When the minimum is greater than zero, we have to replicate up to
2421          minval-1 times, with no additions required in the copies. Then, if
2422          there is a limited maximum we have to replicate up to maxval-1 times
2423          allowing for a BRAZERO item before each optional copy and nesting
2424          brackets for all but one of the optional copies. */
2425    
2426          else
2427            {
2428            length += (minval - 1) * duplength;
2429            if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
2430              length += (maxval - minval) * (duplength + 7) - 6;
2431            }
2432        }        }
2433      continue;      continue;
2434    
# Line 2398  if (*errorptr != NULL) Line 2551  if (*errorptr != NULL)
2551    return NULL;    return NULL;
2552    }    }
2553    
2554  /* If the anchored option was not passed, set flag if we can determine that it  /* If the anchored option was not passed, set flag if we can determine that the
2555  is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if  pattern is anchored by virtue of ^ characters or \A or anything else (such as
2556  we can determine what the first character has to be, because that speeds up  starting with .* when DOTALL is set).
2557  unanchored matches no end. In the case of multiline matches, an alternative is  
2558  to set the PCRE_STARTLINE flag if all branches start with ^. */  Otherwise, see if we can determine what the first character has to be, because
2559    that speeds up unanchored matches no end. If not, see if we can set the
2560    PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2561    start with ^, and also when all branches start with .* for non-DOTALL matches.
2562    */
2563    
2564  if ((options & PCRE_ANCHORED) == 0)  if ((options & PCRE_ANCHORED) == 0)
2565    {    {
# Line 2771  for (;;) Line 2928  for (;;)
2928      int number = op - OP_BRA;      int number = op - OP_BRA;
2929      int offset = number << 1;      int offset = number << 1;
2930    
2931      DPRINTF(("start bracket %d\n", number));  #ifdef DEBUG
2932        printf("start bracket %d subject=", number);
2933        pchars(eptr, 16, TRUE, md);
2934        printf("\n");
2935    #endif
2936    
2937      if (offset < md->offset_max)      if (offset < md->offset_max)
2938        {        {
# Line 4033  in the pattern. */ Line 4194  in the pattern. */
4194  resetcount = 2 + re->top_bracket * 2;  resetcount = 2 + re->top_bracket * 2;
4195  if (resetcount > offsetcount) resetcount = ocount;  if (resetcount > offsetcount) resetcount = ocount;
4196    
4197    /* Reset the working variable associated with each extraction. These should
4198    never be used unless previously set, but they get saved and restored, and so we
4199    initialize them to avoid reading uninitialized locations. */
4200    
4201    if (match_block.offset_vector != NULL)
4202      {
4203      register int *iptr = match_block.offset_vector + ocount;
4204      register int *iend = iptr - resetcount/2 + 1;
4205      while (--iptr >= iend) *iptr = -1;
4206      }
4207    
4208  /* Set up the first character to match, if available. The first_char value is  /* Set up the first character to match, if available. The first_char value is
4209  never set for an anchored regular expression, but the anchoring may be forced  never set for an anchored regular expression, but the anchoring may be forced
4210  at run time, so we have to test for anchoring. The first char may be unset for  at run time, so we have to test for anchoring. The first char may be unset for

Legend:
Removed from v.25  
changed lines
  Added in v.33

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12