/[pcre]/code/trunk/pcrecpp.cc
ViewVC logotype

Diff of /code/trunk/pcrecpp.cc

Parent Directory | Revision Log | View Patch

revision 137 by ph10, Thu Mar 29 13:56:00 2007 UTC revision 328 by ph10, Wed Mar 26 17:39:06 2008 UTC
# Line 30  Line 30 
30  // Author: Sanjay Ghemawat  // Author: Sanjay Ghemawat
31    
32  #ifdef HAVE_CONFIG_H  #ifdef HAVE_CONFIG_H
33  #  include <config.h>  #include "config.h"
34  #endif  #endif
35    
36  #include <stdlib.h>  #include <stdlib.h>
# Line 41  Line 41 
41  #include <errno.h>  #include <errno.h>
42  #include <string>  #include <string>
43  #include <algorithm>  #include <algorithm>
44  // We need this to compile the proper dll on windows/msys.  This is copied  
45  // from pcre_internal.h.  It would probably be better just to include that.  #include "pcrecpp_internal.h"
46  #define PCRE_DEFINITION  /* Win32 __declspec(export) trigger for .dll */  #include "pcre.h"
 #include <pcre.h>  
 #include "pcre_stringpiece.h"  
47  #include "pcrecpp.h"  #include "pcrecpp.h"
48    #include "pcre_stringpiece.h"
49    
50    
51  namespace pcrecpp {  namespace pcrecpp {
# Line 56  static const int kMaxArgs = 16; Line 55  static const int kMaxArgs = 16;
55  static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace  static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace
56    
57  // Special object that stands-in for no argument  // Special object that stands-in for no argument
58  Arg no_arg((void*)NULL);  Arg RE::no_arg((void*)NULL);
59    
60    // This is for ABI compatibility with old versions of pcre (pre-7.6),
61    // which defined a global no_arg variable instead of putting it in the
62    // RE class.  This works on GCC >= 3, at least.  We could probably
63    // have a more inclusive test if we ever needed it.  (Note that not
64    // only the __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
65    // gnu-specific.)
66    #if defined(__GNUC__) && __GNUC__ >= 3
67    #if defined(__ELF__)
68    extern Arg no_arg
69      __attribute__((alias(__USER_LABEL_PREFIX__ "_ZN7pcrecpp2RE6no_argE")));
70    #else
71    // While we know elf supports strong aliases, not all formats do (Mach
72    // doesn't, for instance).  So make aliases weak by default.  This is
73    // a smidge less safe in theory (conceivably, someone could override
74    // this symbol in their own binary), but perfectly ok in practice.
75    extern Arg no_arg
76      __attribute__((weak, alias(__USER_LABEL_PREFIX__ "_ZN7pcrecpp2RE6no_argE")));
77    #endif
78    #endif
79    
80  // If a regular expression has no error, its error_ field points here  // If a regular expression has no error, its error_ field points here
81  static const string empty_string;  static const string empty_string;
# Line 77  void RE::Init(const string& pat, const R Line 96  void RE::Init(const string& pat, const R
96    
97    re_partial_ = Compile(UNANCHORED);    re_partial_ = Compile(UNANCHORED);
98    if (re_partial_ != NULL) {    if (re_partial_ != NULL) {
99      // Check for complicated patterns.  The following change is      re_full_ = Compile(ANCHOR_BOTH);
     // conservative in that it may treat some "simple" patterns  
     // as "complex" (e.g., if the vertical bar is in a character  
     // class or is escaped).  But it seems good enough.  
     if (strchr(pat.c_str(), '|') == NULL) {  
       // Simple pattern: we can use position-based checks to perform  
       // fully anchored matches  
       re_full_ = re_partial_;  
     } else {  
       // We need a special pattern for anchored matches  
       re_full_ = Compile(ANCHOR_BOTH);  
     }  
100    }    }
101  }  }
102    
103  void RE::Cleanup() {  void RE::Cleanup() {
104    if (re_full_ != NULL && re_full_ != re_partial_) (*pcre_free)(re_full_);    if (re_full_ != NULL)         (*pcre_free)(re_full_);
105    if (re_partial_ != NULL)                         (*pcre_free)(re_partial_);    if (re_partial_ != NULL)      (*pcre_free)(re_partial_);
106    if (error_ != &empty_string)                     delete error_;    if (error_ != &empty_string)  delete error_;
107  }  }
108    
109    
# Line 343  bool RE::Replace(const StringPiece& rewr Line 351  bool RE::Replace(const StringPiece& rewr
351    
352  // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.  // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
353  // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.  // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
354    // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
355    
356  static int NewlineMode(int pcre_options) {  static int NewlineMode(int pcre_options) {
357    // TODO: if we can make it threadsafe, cache this var    // TODO: if we can make it threadsafe, cache this var
358    int newline_mode = 0;    int newline_mode = 0;
359    /* if (newline_mode) return newline_mode; */  // do this once it's cached    /* if (newline_mode) return newline_mode; */  // do this once it's cached
360    if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)) {    if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
361                          PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
362      newline_mode = (pcre_options &      newline_mode = (pcre_options &
363                      (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF));                      (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
364                         PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
365    } else {    } else {
366      int newline;      int newline;
367      pcre_config(PCRE_CONFIG_NEWLINE, &newline);      pcre_config(PCRE_CONFIG_NEWLINE, &newline);
# Line 359  static int NewlineMode(int pcre_options) Line 371  static int NewlineMode(int pcre_options)
371        newline_mode = PCRE_NEWLINE_CR;        newline_mode = PCRE_NEWLINE_CR;
372      else if (newline == 3338)      else if (newline == 3338)
373        newline_mode = PCRE_NEWLINE_CRLF;        newline_mode = PCRE_NEWLINE_CRLF;
374        else if (newline == -1)
375          newline_mode = PCRE_NEWLINE_ANY;
376        else if (newline == -2)
377          newline_mode = PCRE_NEWLINE_ANYCRLF;
378      else      else
379        assert("" == "Unexpected return value from pcre_config(NEWLINE)");        assert("" == "Unexpected return value from pcre_config(NEWLINE)");
380    }    }
# Line 373  int RE::GlobalReplace(const StringPiece& Line 389  int RE::GlobalReplace(const StringPiece&
389    int start = 0;    int start = 0;
390    int lastend = -1;    int lastend = -1;
391    
392    for (; start <= static_cast<int>(str->length()); count++) {    while (start <= static_cast<int>(str->length())) {
393      int matches = TryMatch(*str, start, UNANCHORED, vec, kVecSize);      int matches = TryMatch(*str, start, UNANCHORED, vec, kVecSize);
394      if (matches <= 0)      if (matches <= 0)
395        break;        break;
# Line 388  int RE::GlobalReplace(const StringPiece& Line 404  int RE::GlobalReplace(const StringPiece&
404        // Note it's better to call pcre_fullinfo() than to examine        // Note it's better to call pcre_fullinfo() than to examine
405        // all_options(), since options_ could have changed between        // all_options(), since options_ could have changed between
406        // compile-time and now, but this is simpler and safe enough.        // compile-time and now, but this is simpler and safe enough.
407          // Modified by PH to add ANY and ANYCRLF.
408        if (start+1 < static_cast<int>(str->length()) &&        if (start+1 < static_cast<int>(str->length()) &&
409            (*str)[start] == '\r' && (*str)[start+1] == '\n' &&            (*str)[start] == '\r' && (*str)[start+1] == '\n' &&
410            NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF) {            (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
411               NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
412               NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)
413              ) {
414          matchend++;          matchend++;
415        }        }
416        // We also need to advance more than one char if we're in utf8 mode.        // We also need to advance more than one char if we're in utf8 mode.
# Line 441  bool RE::Extract(const StringPiece& rewr Line 461  bool RE::Extract(const StringPiece& rewr
461    // Note that it's legal to escape a character even if it has no    // Note that it's legal to escape a character even if it has no
462    // special meaning in a regular expression -- so this function does    // special meaning in a regular expression -- so this function does
463    // that.  (This also makes it identical to the perl function of the    // that.  (This also makes it identical to the perl function of the
464    // same name; see `perldoc -f quotemeta`.)    // same name; see `perldoc -f quotemeta`.)  The one exception is
465      // escaping NUL: rather than doing backslash + NUL, like perl does,
466      // we do '\0', because pcre itself doesn't take embedded NUL chars.
467    for (int ii = 0; ii < unquoted.size(); ++ii) {    for (int ii = 0; ii < unquoted.size(); ++ii) {
468      // Note that using 'isalnum' here raises the benchmark time from      // Note that using 'isalnum' here raises the benchmark time from
469      // 32ns to 58ns:      // 32ns to 58ns:
470      if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&      if (unquoted[ii] == '\0') {
471          (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&        result += "\\0";
472          (unquoted[ii] < '0' || unquoted[ii] > '9') &&      } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
473          unquoted[ii] != '_' &&                 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
474          // If this is the part of a UTF8 or Latin1 character, we need                 (unquoted[ii] < '0' || unquoted[ii] > '9') &&
475          // to copy this byte without escaping.  Experimentally this is                 unquoted[ii] != '_' &&
476          // what works correctly with the regexp library.                 // If this is the part of a UTF8 or Latin1 character, we need
477          !(unquoted[ii] & 128)) {                 // to copy this byte without escaping.  Experimentally this is
478                   // what works correctly with the regexp library.
479                   !(unquoted[ii] & 128)) {
480        result += '\\';        result += '\\';
481          result += unquoted[ii];
482        } else {
483          result += unquoted[ii];
484      }      }
     result += unquoted[ii];  
485    }    }
486    
487    return result;    return result;
# Line 474  int RE::TryMatch(const StringPiece& text Line 500  int RE::TryMatch(const StringPiece& text
500      return 0;      return 0;
501    }    }
502    
503    pcre_extra extra = { 0 };    pcre_extra extra = { 0, 0, 0, 0, 0, 0 };
504    if (options_.match_limit() > 0) {    if (options_.match_limit() > 0) {
505      extra.flags |= PCRE_EXTRA_MATCH_LIMIT;      extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
506      extra.match_limit = options_.match_limit();      extra.match_limit = options_.match_limit();
# Line 507  int RE::TryMatch(const StringPiece& text Line 533  int RE::TryMatch(const StringPiece& text
533      rc = vecsize / 2;      rc = vecsize / 2;
534    }    }
535    
   if ((anchor == ANCHOR_BOTH) && (re_full_ == re_partial_)) {  
     // We need an extra check to make sure that the match extended  
     // to the end of the input string  
     assert(vec[0] == 0);                 // PCRE_ANCHORED forces starting match  
     if (vec[1] != text.size()) return 0; // Did not get ending match  
   }  
   
536    return rc;    return rc;
537  }  }
538    
# Line 625  bool Arg::parse_null(const char* str, in Line 644  bool Arg::parse_null(const char* str, in
644  }  }
645    
646  bool Arg::parse_string(const char* str, int n, void* dest) {  bool Arg::parse_string(const char* str, int n, void* dest) {
647      if (dest == NULL) return true;
648    reinterpret_cast<string*>(dest)->assign(str, n);    reinterpret_cast<string*>(dest)->assign(str, n);
649    return true;    return true;
650  }  }
651    
652  bool Arg::parse_stringpiece(const char* str, int n, void* dest) {  bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
653      if (dest == NULL) return true;
654    reinterpret_cast<StringPiece*>(dest)->set(str, n);    reinterpret_cast<StringPiece*>(dest)->set(str, n);
655    return true;    return true;
656  }  }
657    
658  bool Arg::parse_char(const char* str, int n, void* dest) {  bool Arg::parse_char(const char* str, int n, void* dest) {
659    if (n != 1) return false;    if (n != 1) return false;
660      if (dest == NULL) return true;
661    *(reinterpret_cast<char*>(dest)) = str[0];    *(reinterpret_cast<char*>(dest)) = str[0];
662    return true;    return true;
663  }  }
664    
665  bool Arg::parse_uchar(const char* str, int n, void* dest) {  bool Arg::parse_uchar(const char* str, int n, void* dest) {
666    if (n != 1) return false;    if (n != 1) return false;
667      if (dest == NULL) return true;
668    *(reinterpret_cast<unsigned char*>(dest)) = str[0];    *(reinterpret_cast<unsigned char*>(dest)) = str[0];
669    return true;    return true;
670  }  }
# Line 690  bool Arg::parse_long_radix(const char* s Line 713  bool Arg::parse_long_radix(const char* s
713    long r = strtol(str, &end, radix);    long r = strtol(str, &end, radix);
714    if (end != str + n) return false;   // Leftover junk    if (end != str + n) return false;   // Leftover junk
715    if (errno) return false;    if (errno) return false;
716      if (dest == NULL) return true;
717    *(reinterpret_cast<long*>(dest)) = r;    *(reinterpret_cast<long*>(dest)) = r;
718    return true;    return true;
719  }  }
# Line 707  bool Arg::parse_ulong_radix(const char* Line 731  bool Arg::parse_ulong_radix(const char*
731    unsigned long r = strtoul(str, &end, radix);    unsigned long r = strtoul(str, &end, radix);
732    if (end != str + n) return false;   // Leftover junk    if (end != str + n) return false;   // Leftover junk
733    if (errno) return false;    if (errno) return false;
734      if (dest == NULL) return true;
735    *(reinterpret_cast<unsigned long*>(dest)) = r;    *(reinterpret_cast<unsigned long*>(dest)) = r;
736    return true;    return true;
737  }  }
# Line 718  bool Arg::parse_short_radix(const char* Line 743  bool Arg::parse_short_radix(const char*
743    long r;    long r;
744    if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse    if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
745    if (r < SHRT_MIN || r > SHRT_MAX) return false;       // Out of range    if (r < SHRT_MIN || r > SHRT_MAX) return false;       // Out of range
746    *(reinterpret_cast<short*>(dest)) = r;    if (dest == NULL) return true;
747      *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
748    return true;    return true;
749  }  }
750    
# Line 729  bool Arg::parse_ushort_radix(const char* Line 755  bool Arg::parse_ushort_radix(const char*
755    unsigned long r;    unsigned long r;
756    if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse    if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
757    if (r > USHRT_MAX) return false;                      // Out of range    if (r > USHRT_MAX) return false;                      // Out of range
758    *(reinterpret_cast<unsigned short*>(dest)) = r;    if (dest == NULL) return true;
759      *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
760    return true;    return true;
761  }  }
762    
# Line 740  bool Arg::parse_int_radix(const char* st Line 767  bool Arg::parse_int_radix(const char* st
767    long r;    long r;
768    if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse    if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
769    if (r < INT_MIN || r > INT_MAX) return false;         // Out of range    if (r < INT_MIN || r > INT_MAX) return false;         // Out of range
770      if (dest == NULL) return true;
771    *(reinterpret_cast<int*>(dest)) = r;    *(reinterpret_cast<int*>(dest)) = r;
772    return true;    return true;
773  }  }
# Line 751  bool Arg::parse_uint_radix(const char* s Line 779  bool Arg::parse_uint_radix(const char* s
779    unsigned long r;    unsigned long r;
780    if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse    if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
781    if (r > UINT_MAX) return false;                       // Out of range    if (r > UINT_MAX) return false;                       // Out of range
782      if (dest == NULL) return true;
783    *(reinterpret_cast<unsigned int*>(dest)) = r;    *(reinterpret_cast<unsigned int*>(dest)) = r;
784    return true;    return true;
785  }  }
# Line 771  bool Arg::parse_longlong_radix(const cha Line 800  bool Arg::parse_longlong_radix(const cha
800    long long r = strtoq(str, &end, radix);    long long r = strtoq(str, &end, radix);
801  #elif defined HAVE_STRTOLL  #elif defined HAVE_STRTOLL
802    long long r = strtoll(str, &end, radix);    long long r = strtoll(str, &end, radix);
803    #elif defined HAVE__STRTOI64
804      long long r = _strtoi64(str, &end, radix);
805  #else  #else
806  #error parse_longlong_radix: cannot convert input to a long-long  #error parse_longlong_radix: cannot convert input to a long-long
807  #endif  #endif
808    if (end != str + n) return false;   // Leftover junk    if (end != str + n) return false;   // Leftover junk
809    if (errno) return false;    if (errno) return false;
810      if (dest == NULL) return true;
811    *(reinterpret_cast<long long*>(dest)) = r;    *(reinterpret_cast<long long*>(dest)) = r;
812    return true;    return true;
813  #endif   /* HAVE_LONG_LONG */  #endif   /* HAVE_LONG_LONG */
# Line 798  bool Arg::parse_ulonglong_radix(const ch Line 830  bool Arg::parse_ulonglong_radix(const ch
830    unsigned long long r = strtouq(str, &end, radix);    unsigned long long r = strtouq(str, &end, radix);
831  #elif defined HAVE_STRTOLL  #elif defined HAVE_STRTOLL
832    unsigned long long r = strtoull(str, &end, radix);    unsigned long long r = strtoull(str, &end, radix);
833    #elif defined HAVE__STRTOI64
834      unsigned long long r = _strtoui64(str, &end, radix);
835  #else  #else
836  #error parse_ulonglong_radix: cannot convert input to a long-long  #error parse_ulonglong_radix: cannot convert input to a long-long
837  #endif  #endif
838    if (end != str + n) return false;   // Leftover junk    if (end != str + n) return false;   // Leftover junk
839    if (errno) return false;    if (errno) return false;
840      if (dest == NULL) return true;
841    *(reinterpret_cast<unsigned long long*>(dest)) = r;    *(reinterpret_cast<unsigned long long*>(dest)) = r;
842    return true;    return true;
843  #endif   /* HAVE_UNSIGNED_LONG_LONG */  #endif   /* HAVE_UNSIGNED_LONG_LONG */
# Line 820  bool Arg::parse_double(const char* str, Line 855  bool Arg::parse_double(const char* str,
855    double r = strtod(buf, &end);    double r = strtod(buf, &end);
856    if (end != buf + n) return false;   // Leftover junk    if (end != buf + n) return false;   // Leftover junk
857    if (errno) return false;    if (errno) return false;
858      if (dest == NULL) return true;
859    *(reinterpret_cast<double*>(dest)) = r;    *(reinterpret_cast<double*>(dest)) = r;
860    return true;    return true;
861  }  }
# Line 827  bool Arg::parse_double(const char* str, Line 863  bool Arg::parse_double(const char* str,
863  bool Arg::parse_float(const char* str, int n, void* dest) {  bool Arg::parse_float(const char* str, int n, void* dest) {
864    double r;    double r;
865    if (!parse_double(str, n, &r)) return false;    if (!parse_double(str, n, &r)) return false;
866      if (dest == NULL) return true;
867    *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);    *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
868    return true;    return true;
869  }  }

Legend:
Removed from v.137  
changed lines
  Added in v.328

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12