// Copyright (c) 2005, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author: Sanjay Ghemawat
//
// Regular-expression based scanner for parsing an input stream.
//
// Example 1: parse a sequence of "var = number" entries from input:
//
//      Scanner scanner(input);
//      string var;
//      int number;
//      scanner.SetSkipExpression("\\s+"); // Skip any white space we encounter
//      while (scanner.Consume("(\\w+) = (\\d+)", &var, &number)) {
//        ...;
//      }
| 43 |
|
|
|
| 44 |
|
|
#ifndef _PCRE_SCANNER_H |
| 45 |
|
|
#define _PCRE_SCANNER_H |
| 46 |
|
|
|
| 47 |
|
|
#include <assert.h> |
| 48 |
|
|
#include <string> |
| 49 |
|
|
#include <vector> |
| 50 |
ph10 |
199 |
|
| 51 |
nigel |
77 |
#include <pcrecpp.h> |
| 52 |
|
|
#include <pcre_stringpiece.h> |
| 53 |
|
|
|
| 54 |
|
|
namespace pcrecpp { |
| 55 |
|
|
|
| 56 |
ph10 |
199 |
class PCRECPP_EXP_DEFN Scanner { |
| 57 |
nigel |
77 |
public: |
| 58 |
|
|
Scanner(); |
| 59 |
|
|
explicit Scanner(const std::string& input); |
| 60 |
|
|
~Scanner(); |
| 61 |
|
|
|
| 62 |
|
|
// Return current line number. The returned line-number is |
| 63 |
|
|
// one-based. I.e. it returns 1 + the number of consumed newlines. |
| 64 |
|
|
// |
| 65 |
|
|
// Note: this method may be slow. It may take time proportional to |
| 66 |
|
|
// the size of the input. |
| 67 |
|
|
int LineNumber() const; |
| 68 |
|
|
|
| 69 |
|
|
// Return the byte-offset that the scanner is looking in the |
| 70 |
|
|
// input data; |
| 71 |
|
|
int Offset() const; |
| 72 |
|
|
|
| 73 |
|
|
// Return true iff the start of the remaining input matches "re" |
| 74 |
|
|
bool LookingAt(const RE& re) const; |
| 75 |
|
|
|
| 76 |
|
|
// Return true iff all of the following are true |
| 77 |
|
|
// a. the start of the remaining input matches "re", |
| 78 |
|
|
// b. if any arguments are supplied, matched sub-patterns can be |
| 79 |
|
|
// parsed and stored into the arguments. |
| 80 |
|
|
// If it returns true, it skips over the matched input and any |
| 81 |
|
|
// following input that matches the "skip" regular expression. |
| 82 |
|
|
bool Consume(const RE& re, |
| 83 |
|
|
const Arg& arg0 = no_arg, |
| 84 |
|
|
const Arg& arg1 = no_arg, |
| 85 |
|
|
const Arg& arg2 = no_arg |
| 86 |
|
|
// TODO: Allow more arguments? |
| 87 |
|
|
); |
| 88 |
|
|
|
| 89 |
|
|
// Set the "skip" regular expression. If after consuming some data, |
| 90 |
|
|
// a prefix of the input matches this RE, it is automatically |
| 91 |
|
|
// skipped. For example, a programming language scanner would use |
| 92 |
|
|
// a skip RE that matches white space and comments. |
| 93 |
|
|
// |
| 94 |
nigel |
93 |
// scanner.SetSkipExpression("\\s+|//.*|/[*](.|\n)*?[*]/"); |
| 95 |
nigel |
77 |
// |
| 96 |
nigel |
93 |
// Skipping repeats as long as it succeeds. We used to let people do |
| 97 |
|
|
// this by writing "(...)*" in the regular expression, but that added |
| 98 |
|
|
// up to lots of recursive calls within the pcre library, so now we |
| 99 |
|
|
// control repetition explicitly via the function call API. |
| 100 |
|
|
// |
| 101 |
nigel |
77 |
// You can pass NULL for "re" if you do not want any data to be skipped. |
| 102 |
nigel |
93 |
void Skip(const char* re); // DEPRECATED; does *not* repeat |
| 103 |
|
|
void SetSkipExpression(const char* re); |
| 104 |
nigel |
77 |
|
| 105 |
|
|
// Temporarily pause "skip"ing. This |
| 106 |
|
|
// Skip("Foo"); code ; DisableSkip(); code; EnableSkip() |
| 107 |
|
|
// is similar to |
| 108 |
|
|
// Skip("Foo"); code ; Skip(NULL); code ; Skip("Foo"); |
| 109 |
|
|
// but avoids creating/deleting new RE objects. |
| 110 |
|
|
void DisableSkip(); |
| 111 |
|
|
|
| 112 |
|
|
// Reenable previously paused skipping. Any prefix of the input |
| 113 |
|
|
// that matches the skip pattern is immediately dropped. |
| 114 |
|
|
void EnableSkip(); |
| 115 |
|
|
|
| 116 |
|
|
/***** Special wrappers around SetSkip() for some common idioms *****/ |
| 117 |
|
|
|
| 118 |
|
|
// Arranges to skip whitespace, C comments, C++ comments. |
| 119 |
nigel |
93 |
// The overall RE is a disjunction of the following REs: |
| 120 |
nigel |
77 |
// \\s whitespace |
| 121 |
|
|
// //.*\n C++ comment |
| 122 |
|
|
// /[*](.|\n)*?[*]/ C comment (x*? means minimal repetitions of x) |
| 123 |
nigel |
93 |
// We get repetition via the semantics of SetSkipExpression, not by using * |
| 124 |
nigel |
77 |
void SkipCXXComments() { |
| 125 |
nigel |
93 |
SetSkipExpression("\\s|//.*\n|/[*](?:\n|.)*?[*]/"); |
| 126 |
nigel |
77 |
} |
| 127 |
|
|
|
| 128 |
|
|
void set_save_comments(bool comments) { |
| 129 |
|
|
save_comments_ = comments; |
| 130 |
|
|
} |
| 131 |
|
|
|
| 132 |
|
|
bool save_comments() { |
| 133 |
|
|
return save_comments_; |
| 134 |
|
|
} |
| 135 |
|
|
|
| 136 |
|
|
// Append to vector ranges the comments found in the |
| 137 |
|
|
// byte range [start,end] (inclusive) of the input data. |
| 138 |
|
|
// Only comments that were extracted entirely within that |
| 139 |
|
|
// range are returned: no range splitting of atomically-extracted |
| 140 |
|
|
// comments is performed. |
| 141 |
|
|
void GetComments(int start, int end, std::vector<StringPiece> *ranges); |
| 142 |
|
|
|
| 143 |
|
|
// Append to vector ranges the comments added |
| 144 |
|
|
// since the last time this was called. This |
| 145 |
|
|
// functionality is provided for efficiency when |
| 146 |
|
|
// interleaving scanning with parsing. |
| 147 |
|
|
void GetNextComments(std::vector<StringPiece> *ranges); |
| 148 |
|
|
|
| 149 |
|
|
private: |
| 150 |
|
|
std::string data_; // All the input data |
| 151 |
|
|
StringPiece input_; // Unprocessed input |
| 152 |
|
|
RE* skip_; // If non-NULL, RE for skipping input |
| 153 |
|
|
bool should_skip_; // If true, use skip_ |
| 154 |
nigel |
93 |
bool skip_repeat_; // If true, repeat skip_ as long as it works |
| 155 |
nigel |
77 |
bool save_comments_; // If true, aggregate the skip expression |
| 156 |
|
|
|
| 157 |
|
|
// the skipped comments |
| 158 |
|
|
// TODO: later consider requiring that the StringPieces be added |
| 159 |
|
|
// in order by their start position |
| 160 |
|
|
std::vector<StringPiece> *comments_; |
| 161 |
|
|
|
| 162 |
|
|
// the offset into comments_ that has been returned by GetNextComments |
| 163 |
|
|
int comments_offset_; |
| 164 |
|
|
|
| 165 |
|
|
// helper function to consume *skip_ and honour |
| 166 |
|
|
// save_comments_ |
| 167 |
|
|
void ConsumeSkip(); |
| 168 |
|
|
}; |
| 169 |
|
|
|
| 170 |
|
|
} // namespace pcrecpp |
| 171 |
|
|
|
| 172 |
|
|
#endif /* _PCRE_SCANNER_H */ |