/[pcre]/code/trunk/pcrecpp.cc
ViewVC logotype

Contents of /code/trunk/pcrecpp.cc

Parent Directory Parent Directory | Revision Log Revision Log


Revision 329 - (hide annotations) (download)
Fri Mar 28 12:06:36 2008 UTC (6 years, 6 months ago) by ph10
File size: 31625 byte(s)
Craig's second patch to fix the previous one.

1 nigel 77 // Copyright (c) 2005, Google Inc.
2     // All rights reserved.
3     //
4     // Redistribution and use in source and binary forms, with or without
5     // modification, are permitted provided that the following conditions are
6     // met:
7     //
8     // * Redistributions of source code must retain the above copyright
9     // notice, this list of conditions and the following disclaimer.
10     // * Redistributions in binary form must reproduce the above
11     // copyright notice, this list of conditions and the following disclaimer
12     // in the documentation and/or other materials provided with the
13     // distribution.
14     // * Neither the name of Google Inc. nor the names of its
15     // contributors may be used to endorse or promote products derived from
16     // this software without specific prior written permission.
17     //
18     // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19     // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20     // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21     // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22     // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23     // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24     // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25     // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26     // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27     // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28     // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29     //
30     // Author: Sanjay Ghemawat
31    
32 ph10 97 #ifdef HAVE_CONFIG_H
33 ph10 236 #include "config.h"
34 ph10 97 #endif
35    
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <limits.h> /* for SHRT_MIN, USHRT_MAX, etc */
#include <string.h> /* for memcpy */
#include <assert.h>
#include <errno.h>
#include <string>
#include <algorithm>
44 ph10 199
45     #include "pcrecpp_internal.h"
46 ph10 236 #include "pcre.h"
47 nigel 77 #include "pcrecpp.h"
48 ph10 199 #include "pcre_stringpiece.h"
49 nigel 77
50    
51     namespace pcrecpp {
52    
53     // Maximum number of args we can set
54     static const int kMaxArgs = 16;
55     static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace
56    
57     // Special object that stands-in for no argument
58 ph10 308 Arg RE::no_arg((void*)NULL);
59 nigel 77
60 ph10 322 // This is for ABI compatibility with old versions of pcre (pre-7.6),
61     // which defined a global no_arg variable instead of putting it in the
62 ph10 328 // RE class. This works on GCC >= 3, at least. We could probably
63     // have a more inclusive test if we ever needed it. (Note that not
64     // only the __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
65     // gnu-specific.)
66 ph10 322 #if defined(__GNUC__) && __GNUC__ >= 3
67 ph10 329 # define AS_STRING(x) AS_STRING_INTERNAL(x)
68     # define AS_STRING_INTERNAL(x) #x
69     # define USER_LABEL_PREFIX AS_STRING(__USER_LABEL_PREFIX__)
70     # if defined(__ELF__)
71 ph10 328 extern Arg no_arg
72 ph10 329 __attribute__((alias(USER_LABEL_PREFIX "_ZN7pcrecpp2RE6no_argE")));
73     # else
74 ph10 328 // While we know elf supports strong aliases, not all formats do (Mach
75     // doesn't, for instance). So make aliases weak by default. This is
76     // a smidge less safe in theory (conceivably, someone could override
77     // this symbol in their own binary), but perfectly ok in practice.
78     extern Arg no_arg
79 ph10 329 __attribute__((weak, alias(USER_LABEL_PREFIX "_ZN7pcrecpp2RE6no_argE")));
80     # endif
81 ph10 322 #endif
82    
83 nigel 77 // If a regular expression has no error, its error_ field points here
84     static const string empty_string;
85    
86     // If the user doesn't ask for any options, we just use this one
87     static RE_Options default_options;
88    
89 nigel 93 void RE::Init(const string& pat, const RE_Options* options) {
90 nigel 77 pattern_ = pat;
91     if (options == NULL) {
92     options_ = default_options;
93     } else {
94     options_ = *options;
95     }
96     error_ = &empty_string;
97     re_full_ = NULL;
98     re_partial_ = NULL;
99    
100     re_partial_ = Compile(UNANCHORED);
101     if (re_partial_ != NULL) {
102 ph10 179 re_full_ = Compile(ANCHOR_BOTH);
103 nigel 77 }
104     }
105    
106 nigel 93 void RE::Cleanup() {
107 ph10 179 if (re_full_ != NULL) (*pcre_free)(re_full_);
108     if (re_partial_ != NULL) (*pcre_free)(re_partial_);
109     if (error_ != &empty_string) delete error_;
110 nigel 77 }
111    
112 nigel 93
113     RE::~RE() {
114     Cleanup();
115     }
116    
117    
118 nigel 77 pcre* RE::Compile(Anchor anchor) {
119     // First, convert RE_Options into pcre options
120     int pcre_options = 0;
121 nigel 81 pcre_options = options_.all_options();
122 nigel 77
123     // Special treatment for anchoring. This is needed because at
124     // runtime pcre only provides an option for anchoring at the
125     // beginning of a string (unless you use offset).
126     //
127     // There are three types of anchoring we want:
128     // UNANCHORED Compile the original pattern, and use
129     // a pcre unanchored match.
130     // ANCHOR_START Compile the original pattern, and use
131     // a pcre anchored match.
132     // ANCHOR_BOTH Tack a "\z" to the end of the original pattern
133     // and use a pcre anchored match.
134    
135     const char* compile_error;
136     int eoffset;
137     pcre* re;
138     if (anchor != ANCHOR_BOTH) {
139     re = pcre_compile(pattern_.c_str(), pcre_options,
140     &compile_error, &eoffset, NULL);
141     } else {
142     // Tack a '\z' at the end of RE. Parenthesize it first so that
143     // the '\z' applies to all top-level alternatives in the regexp.
144     string wrapped = "(?:"; // A non-counting grouping operator
145     wrapped += pattern_;
146     wrapped += ")\\z";
147     re = pcre_compile(wrapped.c_str(), pcre_options,
148     &compile_error, &eoffset, NULL);
149     }
150     if (re == NULL) {
151     if (error_ == &empty_string) error_ = new string(compile_error);
152     }
153     return re;
154     }
155    
156     /***** Matching interfaces *****/
157    
158     bool RE::FullMatch(const StringPiece& text,
159     const Arg& ptr1,
160     const Arg& ptr2,
161     const Arg& ptr3,
162     const Arg& ptr4,
163     const Arg& ptr5,
164     const Arg& ptr6,
165     const Arg& ptr7,
166     const Arg& ptr8,
167     const Arg& ptr9,
168     const Arg& ptr10,
169     const Arg& ptr11,
170     const Arg& ptr12,
171     const Arg& ptr13,
172     const Arg& ptr14,
173     const Arg& ptr15,
174     const Arg& ptr16) const {
175     const Arg* args[kMaxArgs];
176     int n = 0;
177     if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1;
178     if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2;
179     if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3;
180     if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4;
181     if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5;
182     if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6;
183     if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7;
184     if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8;
185     if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9;
186     if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
187     if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
188     if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
189     if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
190     if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
191     if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
192     if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
193     done:
194    
195     int consumed;
196     int vec[kVecSize];
197     return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
198     }
199    
200     bool RE::PartialMatch(const StringPiece& text,
201     const Arg& ptr1,
202     const Arg& ptr2,
203     const Arg& ptr3,
204     const Arg& ptr4,
205     const Arg& ptr5,
206     const Arg& ptr6,
207     const Arg& ptr7,
208     const Arg& ptr8,
209     const Arg& ptr9,
210     const Arg& ptr10,
211     const Arg& ptr11,
212     const Arg& ptr12,
213     const Arg& ptr13,
214     const Arg& ptr14,
215     const Arg& ptr15,
216     const Arg& ptr16) const {
217     const Arg* args[kMaxArgs];
218     int n = 0;
219     if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1;
220     if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2;
221     if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3;
222     if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4;
223     if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5;
224     if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6;
225     if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7;
226     if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8;
227     if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9;
228     if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
229     if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
230     if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
231     if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
232     if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
233     if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
234     if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
235     done:
236    
237     int consumed;
238     int vec[kVecSize];
239     return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
240     }
241    
242     bool RE::Consume(StringPiece* input,
243     const Arg& ptr1,
244     const Arg& ptr2,
245     const Arg& ptr3,
246     const Arg& ptr4,
247     const Arg& ptr5,
248     const Arg& ptr6,
249     const Arg& ptr7,
250     const Arg& ptr8,
251     const Arg& ptr9,
252     const Arg& ptr10,
253     const Arg& ptr11,
254     const Arg& ptr12,
255     const Arg& ptr13,
256     const Arg& ptr14,
257     const Arg& ptr15,
258     const Arg& ptr16) const {
259     const Arg* args[kMaxArgs];
260     int n = 0;
261     if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1;
262     if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2;
263     if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3;
264     if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4;
265     if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5;
266     if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6;
267     if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7;
268     if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8;
269     if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9;
270     if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
271     if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
272     if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
273     if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
274     if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
275     if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
276     if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
277     done:
278    
279     int consumed;
280     int vec[kVecSize];
281     if (DoMatchImpl(*input, ANCHOR_START, &consumed,
282     args, n, vec, kVecSize)) {
283     input->remove_prefix(consumed);
284     return true;
285     } else {
286     return false;
287     }
288     }
289    
290     bool RE::FindAndConsume(StringPiece* input,
291     const Arg& ptr1,
292     const Arg& ptr2,
293     const Arg& ptr3,
294     const Arg& ptr4,
295     const Arg& ptr5,
296     const Arg& ptr6,
297     const Arg& ptr7,
298     const Arg& ptr8,
299     const Arg& ptr9,
300     const Arg& ptr10,
301     const Arg& ptr11,
302     const Arg& ptr12,
303     const Arg& ptr13,
304     const Arg& ptr14,
305     const Arg& ptr15,
306     const Arg& ptr16) const {
307     const Arg* args[kMaxArgs];
308     int n = 0;
309     if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1;
310     if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2;
311     if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3;
312     if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4;
313     if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5;
314     if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6;
315     if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7;
316     if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8;
317     if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9;
318     if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
319     if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
320     if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
321     if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
322     if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
323     if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
324     if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
325     done:
326    
327     int consumed;
328     int vec[kVecSize];
329     if (DoMatchImpl(*input, UNANCHORED, &consumed,
330     args, n, vec, kVecSize)) {
331     input->remove_prefix(consumed);
332     return true;
333     } else {
334     return false;
335     }
336     }
337    
338     bool RE::Replace(const StringPiece& rewrite,
339     string *str) const {
340     int vec[kVecSize];
341     int matches = TryMatch(*str, 0, UNANCHORED, vec, kVecSize);
342     if (matches == 0)
343     return false;
344    
345     string s;
346     if (!Rewrite(&s, rewrite, *str, vec, matches))
347     return false;
348    
349     assert(vec[0] >= 0);
350     assert(vec[1] >= 0);
351     str->replace(vec[0], vec[1] - vec[0], s);
352     return true;
353     }
354    
355 nigel 91 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
356     // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
357 ph10 253 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
358    
359 nigel 91 static int NewlineMode(int pcre_options) {
360     // TODO: if we can make it threadsafe, cache this var
361     int newline_mode = 0;
362     /* if (newline_mode) return newline_mode; */ // do this once it's cached
363 ph10 253 if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
364     PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
365 nigel 91 newline_mode = (pcre_options &
366 ph10 253 (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
367     PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
368 nigel 91 } else {
369     int newline;
370     pcre_config(PCRE_CONFIG_NEWLINE, &newline);
371     if (newline == 10)
372     newline_mode = PCRE_NEWLINE_LF;
373     else if (newline == 13)
374     newline_mode = PCRE_NEWLINE_CR;
375     else if (newline == 3338)
376     newline_mode = PCRE_NEWLINE_CRLF;
377 ph10 253 else if (newline == -1)
378     newline_mode = PCRE_NEWLINE_ANY;
379     else if (newline == -2)
380 ph10 259 newline_mode = PCRE_NEWLINE_ANYCRLF;
381 nigel 91 else
382     assert("" == "Unexpected return value from pcre_config(NEWLINE)");
383     }
384     return newline_mode;
385     }
386    
387 nigel 77 int RE::GlobalReplace(const StringPiece& rewrite,
388     string *str) const {
389     int count = 0;
390     int vec[kVecSize];
391     string out;
392     int start = 0;
393     int lastend = -1;
394    
395 ph10 297 while (start <= static_cast<int>(str->length())) {
396 nigel 77 int matches = TryMatch(*str, start, UNANCHORED, vec, kVecSize);
397     if (matches <= 0)
398     break;
399     int matchstart = vec[0], matchend = vec[1];
400     assert(matchstart >= start);
401     assert(matchend >= matchstart);
402     if (matchstart == matchend && matchstart == lastend) {
403     // advance one character if we matched an empty string at the same
404     // place as the last match occurred
405 nigel 91 matchend = start + 1;
406     // If the current char is CR and we're in CRLF mode, skip LF too.
407     // Note it's better to call pcre_fullinfo() than to examine
408     // all_options(), since options_ could have changed bewteen
409     // compile-time and now, but this is simpler and safe enough.
410 ph10 259 // Modified by PH to add ANY and ANYCRLF.
411 nigel 91 if (start+1 < static_cast<int>(str->length()) &&
412     (*str)[start] == '\r' && (*str)[start+1] == '\n' &&
413 ph10 253 (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
414     NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
415     NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)
416     ) {
417 nigel 91 matchend++;
418     }
419     // We also need to advance more than one char if we're in utf8 mode.
420     #ifdef SUPPORT_UTF8
421     if (options_.utf8()) {
422     while (matchend < static_cast<int>(str->length()) &&
423     ((*str)[matchend] & 0xc0) == 0x80)
424     matchend++;
425     }
426     #endif
427     if (matchend <= static_cast<int>(str->length()))
428     out.append(*str, start, matchend - start);
429     start = matchend;
430 nigel 77 } else {
431     out.append(*str, start, matchstart - start);
432     Rewrite(&out, rewrite, *str, vec, matches);
433     start = matchend;
434     lastend = matchend;
435 ph10 297 count++;
436 nigel 77 }
437     }
438    
439     if (count == 0)
440     return 0;
441    
442     if (start < static_cast<int>(str->length()))
443     out.append(*str, start, str->length() - start);
444     swap(out, *str);
445     return count;
446     }
447    
448     bool RE::Extract(const StringPiece& rewrite,
449     const StringPiece& text,
450     string *out) const {
451     int vec[kVecSize];
452     int matches = TryMatch(text, 0, UNANCHORED, vec, kVecSize);
453     if (matches == 0)
454     return false;
455 nigel 81 out->erase();
456 nigel 77 return Rewrite(out, rewrite, text, vec, matches);
457     }
458    
459 nigel 93 /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
460     string result;
461    
462     // Escape any ascii character not in [A-Za-z_0-9].
463     //
464     // Note that it's legal to escape a character even if it has no
465     // special meaning in a regular expression -- so this function does
466     // that. (This also makes it identical to the perl function of the
467 ph10 326 // same name; see `perldoc -f quotemeta`.) The one exception is
468     // escaping NUL: rather than doing backslash + NUL, like perl does,
469     // we do '\0', because pcre itself doesn't take embedded NUL chars.
470 nigel 93 for (int ii = 0; ii < unquoted.size(); ++ii) {
471     // Note that using 'isalnum' here raises the benchmark time from
472     // 32ns to 58ns:
473 ph10 326 if (unquoted[ii] == '\0') {
474     result += "\\0";
475     } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
476     (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
477     (unquoted[ii] < '0' || unquoted[ii] > '9') &&
478     unquoted[ii] != '_' &&
479     // If this is the part of a UTF8 or Latin1 character, we need
480     // to copy this byte without escaping. Experimentally this is
481     // what works correctly with the regexp library.
482     !(unquoted[ii] & 128)) {
483 nigel 93 result += '\\';
484 ph10 326 result += unquoted[ii];
485     } else {
486     result += unquoted[ii];
487 nigel 93 }
488     }
489    
490     return result;
491     }
492    
493 nigel 77 /***** Actual matching and rewriting code *****/
494    
495     int RE::TryMatch(const StringPiece& text,
496     int startpos,
497     Anchor anchor,
498     int *vec,
499     int vecsize) const {
500     pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
501     if (re == NULL) {
502     //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
503     return 0;
504     }
505    
506 ph10 199 pcre_extra extra = { 0, 0, 0, 0, 0, 0 };
507 nigel 77 if (options_.match_limit() > 0) {
508 nigel 87 extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
509 nigel 77 extra.match_limit = options_.match_limit();
510     }
511 nigel 87 if (options_.match_limit_recursion() > 0) {
512     extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
513     extra.match_limit_recursion = options_.match_limit_recursion();
514     }
515 nigel 77 int rc = pcre_exec(re, // The regular expression object
516     &extra,
517 nigel 87 (text.data() == NULL) ? "" : text.data(),
518 nigel 77 text.size(),
519     startpos,
520     (anchor == UNANCHORED) ? 0 : PCRE_ANCHORED,
521     vec,
522     vecsize);
523    
524     // Handle errors
525     if (rc == PCRE_ERROR_NOMATCH) {
526     return 0;
527     } else if (rc < 0) {
528     //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
529     // re, pattern_.c_str());
530     return 0;
531     } else if (rc == 0) {
532     // pcre_exec() returns 0 as a special case when the number of
533     // capturing subpatterns exceeds the size of the vector.
534     // When this happens, there is a match and the output vector
535     // is filled, but we miss out on the positions of the extra subpatterns.
536     rc = vecsize / 2;
537     }
538    
539     return rc;
540     }
541    
542     bool RE::DoMatchImpl(const StringPiece& text,
543     Anchor anchor,
544     int* consumed,
545     const Arg* const* args,
546     int n,
547     int* vec,
548     int vecsize) const {
549     assert((1 + n) * 3 <= vecsize); // results + PCRE workspace
550     int matches = TryMatch(text, 0, anchor, vec, vecsize);
551     assert(matches >= 0); // TryMatch never returns negatives
552     if (matches == 0)
553     return false;
554    
555     *consumed = vec[1];
556    
557 nigel 87 if (n == 0 || args == NULL) {
558 nigel 77 // We are not interested in results
559     return true;
560     }
561    
562 nigel 87 if (NumberOfCapturingGroups() < n) {
563     // RE has fewer capturing groups than number of arg pointers passed in
564     return false;
565     }
566    
567 nigel 77 // If we got here, we must have matched the whole pattern.
568     // We do not need (can not do) any more checks on the value of 'matches' here
569     // -- see the comment for TryMatch.
570     for (int i = 0; i < n; i++) {
571     const int start = vec[2*(i+1)];
572     const int limit = vec[2*(i+1)+1];
573     if (!args[i]->Parse(text.data() + start, limit-start)) {
574     // TODO: Should we indicate what the error was?
575     return false;
576     }
577     }
578    
579     return true;
580     }
581    
582     bool RE::DoMatch(const StringPiece& text,
583     Anchor anchor,
584     int* consumed,
585     const Arg* const args[],
586     int n) const {
587     assert(n >= 0);
588     size_t const vecsize = (1 + n) * 3; // results + PCRE workspace
589     // (as for kVecSize)
590     int space[21]; // use stack allocation for small vecsize (common case)
591     int* vec = vecsize <= 21 ? space : new int[vecsize];
592     bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize);
593     if (vec != space) delete [] vec;
594     return retval;
595     }
596    
597     bool RE::Rewrite(string *out, const StringPiece &rewrite,
598     const StringPiece &text, int *vec, int veclen) const {
599     for (const char *s = rewrite.data(), *end = s + rewrite.size();
600     s < end; s++) {
601     int c = *s;
602     if (c == '\\') {
603     c = *++s;
604     if (isdigit(c)) {
605     int n = (c - '0');
606     if (n >= veclen) {
607     //fprintf(stderr, requested group %d in regexp %.*s\n",
608     // n, rewrite.size(), rewrite.data());
609     return false;
610     }
611     int start = vec[2 * n];
612     if (start >= 0)
613     out->append(text.data() + start, vec[2 * n + 1] - start);
614     } else if (c == '\\') {
615     out->push_back('\\');
616     } else {
617     //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
618     // rewrite.size(), rewrite.data());
619     return false;
620     }
621     } else {
622     out->push_back(c);
623     }
624     }
625     return true;
626     }
627    
628     // Return the number of capturing subpatterns, or -1 if the
629     // regexp wasn't valid on construction.
630 nigel 87 int RE::NumberOfCapturingGroups() const {
631 nigel 77 if (re_partial_ == NULL) return -1;
632    
633     int result;
634     int pcre_retval = pcre_fullinfo(re_partial_, // The regular expression object
635     NULL, // We did not study the pattern
636     PCRE_INFO_CAPTURECOUNT,
637     &result);
638     assert(pcre_retval == 0);
639     return result;
640     }
641    
642     /***** Parsers for various types *****/
643    
644     bool Arg::parse_null(const char* str, int n, void* dest) {
645     // We fail if somebody asked us to store into a non-NULL void* pointer
646     return (dest == NULL);
647     }
648    
649     bool Arg::parse_string(const char* str, int n, void* dest) {
650 ph10 263 if (dest == NULL) return true;
651 nigel 77 reinterpret_cast<string*>(dest)->assign(str, n);
652     return true;
653     }
654    
655     bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
656 ph10 263 if (dest == NULL) return true;
657 nigel 77 reinterpret_cast<StringPiece*>(dest)->set(str, n);
658     return true;
659     }
660    
661     bool Arg::parse_char(const char* str, int n, void* dest) {
662     if (n != 1) return false;
663 ph10 263 if (dest == NULL) return true;
664 nigel 77 *(reinterpret_cast<char*>(dest)) = str[0];
665     return true;
666     }
667    
668     bool Arg::parse_uchar(const char* str, int n, void* dest) {
669     if (n != 1) return false;
670 ph10 263 if (dest == NULL) return true;
671 nigel 77 *(reinterpret_cast<unsigned char*>(dest)) = str[0];
672     return true;
673     }
674    
// Largest number spec that we are willing to parse
static const int kMaxNumberLength = 32;

// Prepares the n-byte number text at "str" for the strtoxxx() routines,
// which need the digits to stop at a non-digit character.
//
// REQUIRES "buf" must have length at least kMaxNumberLength+1
// REQUIRES "n > 0"
// REQUIRES str[n] must be readable (it is inspected as a lookahead)
//
// Returns one of:
//   a. "str" if no termination is needed, i.e. the character right
//      after the input cannot be mistaken for another (hex) digit
//   b. "buf" if the string was copied and null-terminated
//   c. ""    if the input was invalid and has no hope of being parsed
//            (leading whitespace, or longer than kMaxNumberLength when
//            a copy would be needed)
static const char* TerminateNumber(char* buf, const char* str, int n) {
  // The <ctype.h> classifiers are only defined for values representable
  // as unsigned char (or EOF); a plain char holding a high-bit byte may
  // be negative, so convert before classifying.
  if ((n > 0) && isspace(static_cast<unsigned char>(*str))) {
    // We are less forgiving than the strtoxxx() routines and do not
    // allow leading spaces.
    return "";
  }

  // See if the character right after the input text may potentially
  // look like a digit.
  const char next = str[n];
  const bool next_is_digit_like =
      isdigit(static_cast<unsigned char>(next)) ||
      ((next >= 'a') && (next <= 'f')) ||
      ((next >= 'A') && (next <= 'F'));
  if (!next_is_digit_like) {
    // We can parse right out of the supplied string, so return it.
    return str;
  }

  if (n > kMaxNumberLength) return "";  // Input too big to be a valid number
  memcpy(buf, str, n);
  buf[n] = '\0';
  return buf;
}
706    
707     bool Arg::parse_long_radix(const char* str,
708     int n,
709     void* dest,
710     int radix) {
711     if (n == 0) return false;
712     char buf[kMaxNumberLength+1];
713     str = TerminateNumber(buf, str, n);
714     char* end;
715     errno = 0;
716     long r = strtol(str, &end, radix);
717     if (end != str + n) return false; // Leftover junk
718     if (errno) return false;
719 ph10 263 if (dest == NULL) return true;
720 nigel 77 *(reinterpret_cast<long*>(dest)) = r;
721     return true;
722     }
723    
724     bool Arg::parse_ulong_radix(const char* str,
725     int n,
726     void* dest,
727     int radix) {
728     if (n == 0) return false;
729     char buf[kMaxNumberLength+1];
730     str = TerminateNumber(buf, str, n);
731 nigel 87 if (str[0] == '-') return false; // strtoul() on a negative number?!
732 nigel 77 char* end;
733     errno = 0;
734     unsigned long r = strtoul(str, &end, radix);
735     if (end != str + n) return false; // Leftover junk
736     if (errno) return false;
737 ph10 263 if (dest == NULL) return true;
738 nigel 77 *(reinterpret_cast<unsigned long*>(dest)) = r;
739     return true;
740     }
741    
742     bool Arg::parse_short_radix(const char* str,
743     int n,
744     void* dest,
745     int radix) {
746     long r;
747     if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
748     if (r < SHRT_MIN || r > SHRT_MAX) return false; // Out of range
749 ph10 263 if (dest == NULL) return true;
750 ph10 256 *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
751 nigel 77 return true;
752     }
753    
754     bool Arg::parse_ushort_radix(const char* str,
755     int n,
756     void* dest,
757     int radix) {
758     unsigned long r;
759     if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
760     if (r > USHRT_MAX) return false; // Out of range
761 ph10 263 if (dest == NULL) return true;
762 ph10 256 *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
763 nigel 77 return true;
764     }
765    
766     bool Arg::parse_int_radix(const char* str,
767     int n,
768     void* dest,
769     int radix) {
770     long r;
771     if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
772     if (r < INT_MIN || r > INT_MAX) return false; // Out of range
773 ph10 263 if (dest == NULL) return true;
774 nigel 77 *(reinterpret_cast<int*>(dest)) = r;
775     return true;
776     }
777    
778     bool Arg::parse_uint_radix(const char* str,
779     int n,
780     void* dest,
781     int radix) {
782     unsigned long r;
783     if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
784     if (r > UINT_MAX) return false; // Out of range
785 ph10 263 if (dest == NULL) return true;
786 nigel 77 *(reinterpret_cast<unsigned int*>(dest)) = r;
787     return true;
788     }
789    
790     bool Arg::parse_longlong_radix(const char* str,
791     int n,
792     void* dest,
793     int radix) {
794     #ifndef HAVE_LONG_LONG
795     return false;
796     #else
797     if (n == 0) return false;
798     char buf[kMaxNumberLength+1];
799     str = TerminateNumber(buf, str, n);
800     char* end;
801     errno = 0;
802     #if defined HAVE_STRTOQ
803     long long r = strtoq(str, &end, radix);
804     #elif defined HAVE_STRTOLL
805     long long r = strtoll(str, &end, radix);
806 ph10 257 #elif defined HAVE__STRTOI64
807     long long r = _strtoi64(str, &end, radix);
808 nigel 77 #else
809     #error parse_longlong_radix: cannot convert input to a long-long
810     #endif
811     if (end != str + n) return false; // Leftover junk
812     if (errno) return false;
813 ph10 263 if (dest == NULL) return true;
814 nigel 77 *(reinterpret_cast<long long*>(dest)) = r;
815     return true;
816     #endif /* HAVE_LONG_LONG */
817     }
818    
819     bool Arg::parse_ulonglong_radix(const char* str,
820     int n,
821     void* dest,
822     int radix) {
823     #ifndef HAVE_UNSIGNED_LONG_LONG
824     return false;
825     #else
826     if (n == 0) return false;
827     char buf[kMaxNumberLength+1];
828     str = TerminateNumber(buf, str, n);
829 nigel 87 if (str[0] == '-') return false; // strtoull() on a negative number?!
830 nigel 77 char* end;
831     errno = 0;
832     #if defined HAVE_STRTOQ
833     unsigned long long r = strtouq(str, &end, radix);
834     #elif defined HAVE_STRTOLL
835     unsigned long long r = strtoull(str, &end, radix);
836 ph10 257 #elif defined HAVE__STRTOI64
837     unsigned long long r = _strtoui64(str, &end, radix);
838 nigel 77 #else
839     #error parse_ulonglong_radix: cannot convert input to a long-long
840     #endif
841     if (end != str + n) return false; // Leftover junk
842     if (errno) return false;
843 ph10 263 if (dest == NULL) return true;
844 nigel 77 *(reinterpret_cast<unsigned long long*>(dest)) = r;
845     return true;
846     #endif /* HAVE_UNSIGNED_LONG_LONG */
847     }
848    
849     bool Arg::parse_double(const char* str, int n, void* dest) {
850     if (n == 0) return false;
851     static const int kMaxLength = 200;
852     char buf[kMaxLength];
853     if (n >= kMaxLength) return false;
854     memcpy(buf, str, n);
855     buf[n] = '\0';
856     errno = 0;
857     char* end;
858     double r = strtod(buf, &end);
859     if (end != buf + n) return false; // Leftover junk
860     if (errno) return false;
861 ph10 263 if (dest == NULL) return true;
862 nigel 77 *(reinterpret_cast<double*>(dest)) = r;
863     return true;
864     }
865    
866     bool Arg::parse_float(const char* str, int n, void* dest) {
867     double r;
868     if (!parse_double(str, n, &r)) return false;
869 ph10 263 if (dest == NULL) return true;
870 nigel 77 *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
871     return true;
872     }
873    
874    
875     #define DEFINE_INTEGER_PARSERS(name) \
876     bool Arg::parse_##name(const char* str, int n, void* dest) { \
877     return parse_##name##_radix(str, n, dest, 10); \
878     } \
879     bool Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
880     return parse_##name##_radix(str, n, dest, 16); \
881     } \
882     bool Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
883     return parse_##name##_radix(str, n, dest, 8); \
884     } \
885     bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
886     return parse_##name##_radix(str, n, dest, 0); \
887     }
888    
889 nigel 93 DEFINE_INTEGER_PARSERS(short) /* */
890     DEFINE_INTEGER_PARSERS(ushort) /* */
891     DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */
892     DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */
893     DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */
894     DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */
895     DEFINE_INTEGER_PARSERS(longlong) /* */
896     DEFINE_INTEGER_PARSERS(ulonglong) /* */
897 nigel 77
898     #undef DEFINE_INTEGER_PARSERS
899    
900     } // namespace pcrecpp

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12