/[pcre]/code/branches/oldtrunk8/pcrecpp.cc
ViewVC logotype

Contents of /code/branches/oldtrunk8/pcrecpp.cc

Parent Directory | Revision Log


Revision 328 - (hide annotations) (download)
Wed Mar 26 17:39:06 2008 UTC (6 years, 8 months ago) by ph10
Original Path: code/trunk/pcrecpp.cc
File size: 31487 byte(s)
Patch to pcrecpp.cc to fix an OS X linking problem.

1 nigel 77 // Copyright (c) 2005, Google Inc.
2     // All rights reserved.
3     //
4     // Redistribution and use in source and binary forms, with or without
5     // modification, are permitted provided that the following conditions are
6     // met:
7     //
8     // * Redistributions of source code must retain the above copyright
9     // notice, this list of conditions and the following disclaimer.
10     // * Redistributions in binary form must reproduce the above
11     // copyright notice, this list of conditions and the following disclaimer
12     // in the documentation and/or other materials provided with the
13     // distribution.
14     // * Neither the name of Google Inc. nor the names of its
15     // contributors may be used to endorse or promote products derived from
16     // this software without specific prior written permission.
17     //
18     // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19     // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20     // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21     // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22     // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23     // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24     // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25     // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26     // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27     // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28     // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29     //
30     // Author: Sanjay Ghemawat
31    
32 ph10 97 #ifdef HAVE_CONFIG_H
33 ph10 236 #include "config.h"
34 ph10 97 #endif
35    
36 nigel 77 #include <stdlib.h>
37     #include <stdio.h>
38     #include <ctype.h>
39     #include <limits.h> /* for SHRT_MIN, USHRT_MAX, etc */
40     #include <assert.h>
41     #include <errno.h>
42     #include <string>
43 nigel 81 #include <algorithm>
44 ph10 199
45     #include "pcrecpp_internal.h"
46 ph10 236 #include "pcre.h"
47 nigel 77 #include "pcrecpp.h"
48 ph10 199 #include "pcre_stringpiece.h"
49 nigel 77
50    
51     namespace pcrecpp {
52    
// Upper bound on the number of output arguments one match call can fill.
static const int kMaxArgs = 16;
// Length of the offset vector handed to PCRE: three ints for the overall
// match plus three per argument (results + PCRE workspace).
static const int kVecSize = 3 * (kMaxArgs + 1);
56    
57     // Special object that stands-in for no argument
58 ph10 308 Arg RE::no_arg((void*)NULL);
59 nigel 77
60 ph10 322 // This is for ABI compatibility with old versions of pcre (pre-7.6),
61     // which defined a global no_arg variable instead of putting it in the
62 ph10 328 // RE class. This works on GCC >= 3, at least. We could probably
63     // have a more inclusive test if we ever needed it. (Note that not
64     // only the __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
65     // gnu-specific.)
66 ph10 322 #if defined(__GNUC__) && __GNUC__ >= 3
67 ph10 328 #if defined(__ELF__)
68     extern Arg no_arg
69     __attribute__((alias(__USER_LABEL_PREFIX__ "_ZN7pcrecpp2RE6no_argE")));
70     #else
71     // While we know elf supports strong aliases, not all formats do (Mach
72     // doesn't, for instance). So make aliases weak by default. This is
73     // a smidge less safe in theory (conceivably, someone could override
74     // this symbol in their own binary), but perfectly ok in practice.
75     extern Arg no_arg
76     __attribute__((weak, alias(__USER_LABEL_PREFIX__ "_ZN7pcrecpp2RE6no_argE")));
77 ph10 322 #endif
78 ph10 328 #endif
79 ph10 322
// File-local objects shared by every RE in this translation unit.
//
// empty_string: RE::Init() points error_ at this object, RE::Compile() only
// replaces error_ while it still points here, and RE::Cleanup() deletes
// error_ only when it no longer points here -- so it is compared by
// address, never by value.
80 nigel 77 // If a regular expression has no error, its error_ field points here
81     static const string empty_string;
82    
// default_options: copied into options_ by RE::Init() when the caller
// passes a NULL options pointer.
83     // If the user doesn't ask for any options, we just use this one
84     static RE_Options default_options;
85    
86 nigel 93 void RE::Init(const string& pat, const RE_Options* options) {
87 nigel 77 pattern_ = pat;
88     if (options == NULL) {
89     options_ = default_options;
90     } else {
91     options_ = *options;
92     }
93     error_ = &empty_string;
94     re_full_ = NULL;
95     re_partial_ = NULL;
96    
97     re_partial_ = Compile(UNANCHORED);
98     if (re_partial_ != NULL) {
99 ph10 179 re_full_ = Compile(ANCHOR_BOTH);
100 nigel 77 }
101     }
102    
103 nigel 93 void RE::Cleanup() {
104 ph10 179 if (re_full_ != NULL) (*pcre_free)(re_full_);
105     if (re_partial_ != NULL) (*pcre_free)(re_partial_);
106     if (error_ != &empty_string) delete error_;
107 nigel 77 }
108    
109 nigel 93
110     RE::~RE() {
111     Cleanup();
112     }
113    
114    
115 nigel 77 pcre* RE::Compile(Anchor anchor) {
116     // First, convert RE_Options into pcre options
117     int pcre_options = 0;
118 nigel 81 pcre_options = options_.all_options();
119 nigel 77
120     // Special treatment for anchoring. This is needed because at
121     // runtime pcre only provides an option for anchoring at the
122     // beginning of a string (unless you use offset).
123     //
124     // There are three types of anchoring we want:
125     // UNANCHORED Compile the original pattern, and use
126     // a pcre unanchored match.
127     // ANCHOR_START Compile the original pattern, and use
128     // a pcre anchored match.
129     // ANCHOR_BOTH Tack a "\z" to the end of the original pattern
130     // and use a pcre anchored match.
131    
132     const char* compile_error;
133     int eoffset;
134     pcre* re;
135     if (anchor != ANCHOR_BOTH) {
136     re = pcre_compile(pattern_.c_str(), pcre_options,
137     &compile_error, &eoffset, NULL);
138     } else {
139     // Tack a '\z' at the end of RE. Parenthesize it first so that
140     // the '\z' applies to all top-level alternatives in the regexp.
141     string wrapped = "(?:"; // A non-counting grouping operator
142     wrapped += pattern_;
143     wrapped += ")\\z";
144     re = pcre_compile(wrapped.c_str(), pcre_options,
145     &compile_error, &eoffset, NULL);
146     }
147     if (re == NULL) {
148     if (error_ == &empty_string) error_ = new string(compile_error);
149     }
150     return re;
151     }
152    
153     /***** Matching interfaces *****/
154    
155     bool RE::FullMatch(const StringPiece& text,
156     const Arg& ptr1,
157     const Arg& ptr2,
158     const Arg& ptr3,
159     const Arg& ptr4,
160     const Arg& ptr5,
161     const Arg& ptr6,
162     const Arg& ptr7,
163     const Arg& ptr8,
164     const Arg& ptr9,
165     const Arg& ptr10,
166     const Arg& ptr11,
167     const Arg& ptr12,
168     const Arg& ptr13,
169     const Arg& ptr14,
170     const Arg& ptr15,
171     const Arg& ptr16) const {
172     const Arg* args[kMaxArgs];
173     int n = 0;
174     if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1;
175     if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2;
176     if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3;
177     if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4;
178     if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5;
179     if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6;
180     if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7;
181     if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8;
182     if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9;
183     if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
184     if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
185     if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
186     if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
187     if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
188     if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
189     if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
190     done:
191    
192     int consumed;
193     int vec[kVecSize];
194     return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
195     }
196    
197     bool RE::PartialMatch(const StringPiece& text,
198     const Arg& ptr1,
199     const Arg& ptr2,
200     const Arg& ptr3,
201     const Arg& ptr4,
202     const Arg& ptr5,
203     const Arg& ptr6,
204     const Arg& ptr7,
205     const Arg& ptr8,
206     const Arg& ptr9,
207     const Arg& ptr10,
208     const Arg& ptr11,
209     const Arg& ptr12,
210     const Arg& ptr13,
211     const Arg& ptr14,
212     const Arg& ptr15,
213     const Arg& ptr16) const {
214     const Arg* args[kMaxArgs];
215     int n = 0;
216     if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1;
217     if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2;
218     if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3;
219     if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4;
220     if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5;
221     if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6;
222     if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7;
223     if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8;
224     if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9;
225     if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
226     if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
227     if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
228     if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
229     if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
230     if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
231     if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
232     done:
233    
234     int consumed;
235     int vec[kVecSize];
236     return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
237     }
238    
239     bool RE::Consume(StringPiece* input,
240     const Arg& ptr1,
241     const Arg& ptr2,
242     const Arg& ptr3,
243     const Arg& ptr4,
244     const Arg& ptr5,
245     const Arg& ptr6,
246     const Arg& ptr7,
247     const Arg& ptr8,
248     const Arg& ptr9,
249     const Arg& ptr10,
250     const Arg& ptr11,
251     const Arg& ptr12,
252     const Arg& ptr13,
253     const Arg& ptr14,
254     const Arg& ptr15,
255     const Arg& ptr16) const {
256     const Arg* args[kMaxArgs];
257     int n = 0;
258     if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1;
259     if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2;
260     if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3;
261     if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4;
262     if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5;
263     if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6;
264     if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7;
265     if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8;
266     if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9;
267     if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
268     if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
269     if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
270     if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
271     if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
272     if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
273     if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
274     done:
275    
276     int consumed;
277     int vec[kVecSize];
278     if (DoMatchImpl(*input, ANCHOR_START, &consumed,
279     args, n, vec, kVecSize)) {
280     input->remove_prefix(consumed);
281     return true;
282     } else {
283     return false;
284     }
285     }
286    
287     bool RE::FindAndConsume(StringPiece* input,
288     const Arg& ptr1,
289     const Arg& ptr2,
290     const Arg& ptr3,
291     const Arg& ptr4,
292     const Arg& ptr5,
293     const Arg& ptr6,
294     const Arg& ptr7,
295     const Arg& ptr8,
296     const Arg& ptr9,
297     const Arg& ptr10,
298     const Arg& ptr11,
299     const Arg& ptr12,
300     const Arg& ptr13,
301     const Arg& ptr14,
302     const Arg& ptr15,
303     const Arg& ptr16) const {
304     const Arg* args[kMaxArgs];
305     int n = 0;
306     if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1;
307     if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2;
308     if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3;
309     if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4;
310     if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5;
311     if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6;
312     if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7;
313     if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8;
314     if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9;
315     if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
316     if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
317     if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
318     if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
319     if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
320     if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
321     if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
322     done:
323    
324     int consumed;
325     int vec[kVecSize];
326     if (DoMatchImpl(*input, UNANCHORED, &consumed,
327     args, n, vec, kVecSize)) {
328     input->remove_prefix(consumed);
329     return true;
330     } else {
331     return false;
332     }
333     }
334    
335     bool RE::Replace(const StringPiece& rewrite,
336     string *str) const {
337     int vec[kVecSize];
338     int matches = TryMatch(*str, 0, UNANCHORED, vec, kVecSize);
339     if (matches == 0)
340     return false;
341    
342     string s;
343     if (!Rewrite(&s, rewrite, *str, vec, matches))
344     return false;
345    
346     assert(vec[0] >= 0);
347     assert(vec[1] >= 0);
348     str->replace(vec[0], vec[1] - vec[0], s);
349     return true;
350     }
351    
352 nigel 91 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
353     // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
354 ph10 253 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
355    
356 nigel 91 static int NewlineMode(int pcre_options) {
357     // TODO: if we can make it threadsafe, cache this var
358     int newline_mode = 0;
359     /* if (newline_mode) return newline_mode; */ // do this once it's cached
360 ph10 253 if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
361     PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
362 nigel 91 newline_mode = (pcre_options &
363 ph10 253 (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
364     PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
365 nigel 91 } else {
366     int newline;
367     pcre_config(PCRE_CONFIG_NEWLINE, &newline);
368     if (newline == 10)
369     newline_mode = PCRE_NEWLINE_LF;
370     else if (newline == 13)
371     newline_mode = PCRE_NEWLINE_CR;
372     else if (newline == 3338)
373     newline_mode = PCRE_NEWLINE_CRLF;
374 ph10 253 else if (newline == -1)
375     newline_mode = PCRE_NEWLINE_ANY;
376     else if (newline == -2)
377 ph10 259 newline_mode = PCRE_NEWLINE_ANYCRLF;
378 nigel 91 else
379     assert("" == "Unexpected return value from pcre_config(NEWLINE)");
380     }
381     return newline_mode;
382     }
383    
384 nigel 77 int RE::GlobalReplace(const StringPiece& rewrite,
385     string *str) const {
386     int count = 0;
387     int vec[kVecSize];
388     string out;
389     int start = 0;
390     int lastend = -1;
391    
392 ph10 297 while (start <= static_cast<int>(str->length())) {
393 nigel 77 int matches = TryMatch(*str, start, UNANCHORED, vec, kVecSize);
394     if (matches <= 0)
395     break;
396     int matchstart = vec[0], matchend = vec[1];
397     assert(matchstart >= start);
398     assert(matchend >= matchstart);
399     if (matchstart == matchend && matchstart == lastend) {
400     // advance one character if we matched an empty string at the same
401     // place as the last match occurred
402 nigel 91 matchend = start + 1;
403     // If the current char is CR and we're in CRLF mode, skip LF too.
404     // Note it's better to call pcre_fullinfo() than to examine
405     // all_options(), since options_ could have changed bewteen
406     // compile-time and now, but this is simpler and safe enough.
407 ph10 259 // Modified by PH to add ANY and ANYCRLF.
408 nigel 91 if (start+1 < static_cast<int>(str->length()) &&
409     (*str)[start] == '\r' && (*str)[start+1] == '\n' &&
410 ph10 253 (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
411     NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
412     NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)
413     ) {
414 nigel 91 matchend++;
415     }
416     // We also need to advance more than one char if we're in utf8 mode.
417     #ifdef SUPPORT_UTF8
418     if (options_.utf8()) {
419     while (matchend < static_cast<int>(str->length()) &&
420     ((*str)[matchend] & 0xc0) == 0x80)
421     matchend++;
422     }
423     #endif
424     if (matchend <= static_cast<int>(str->length()))
425     out.append(*str, start, matchend - start);
426     start = matchend;
427 nigel 77 } else {
428     out.append(*str, start, matchstart - start);
429     Rewrite(&out, rewrite, *str, vec, matches);
430     start = matchend;
431     lastend = matchend;
432 ph10 297 count++;
433 nigel 77 }
434     }
435    
436     if (count == 0)
437     return 0;
438    
439     if (start < static_cast<int>(str->length()))
440     out.append(*str, start, str->length() - start);
441     swap(out, *str);
442     return count;
443     }
444    
445     bool RE::Extract(const StringPiece& rewrite,
446     const StringPiece& text,
447     string *out) const {
448     int vec[kVecSize];
449     int matches = TryMatch(text, 0, UNANCHORED, vec, kVecSize);
450     if (matches == 0)
451     return false;
452 nigel 81 out->erase();
453 nigel 77 return Rewrite(out, rewrite, text, vec, matches);
454     }
455    
456 nigel 93 /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
457     string result;
458    
459     // Escape any ascii character not in [A-Za-z_0-9].
460     //
461     // Note that it's legal to escape a character even if it has no
462     // special meaning in a regular expression -- so this function does
463     // that. (This also makes it identical to the perl function of the
464 ph10 326 // same name; see `perldoc -f quotemeta`.) The one exception is
465     // escaping NUL: rather than doing backslash + NUL, like perl does,
466     // we do '\0', because pcre itself doesn't take embedded NUL chars.
467 nigel 93 for (int ii = 0; ii < unquoted.size(); ++ii) {
468     // Note that using 'isalnum' here raises the benchmark time from
469     // 32ns to 58ns:
470 ph10 326 if (unquoted[ii] == '\0') {
471     result += "\\0";
472     } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
473     (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
474     (unquoted[ii] < '0' || unquoted[ii] > '9') &&
475     unquoted[ii] != '_' &&
476     // If this is the part of a UTF8 or Latin1 character, we need
477     // to copy this byte without escaping. Experimentally this is
478     // what works correctly with the regexp library.
479     !(unquoted[ii] & 128)) {
480 nigel 93 result += '\\';
481 ph10 326 result += unquoted[ii];
482     } else {
483     result += unquoted[ii];
484 nigel 93 }
485     }
486    
487     return result;
488     }
489    
490 nigel 77 /***** Actual matching and rewriting code *****/
491    
492     int RE::TryMatch(const StringPiece& text,
493     int startpos,
494     Anchor anchor,
495     int *vec,
496     int vecsize) const {
497     pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
498     if (re == NULL) {
499     //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
500     return 0;
501     }
502    
503 ph10 199 pcre_extra extra = { 0, 0, 0, 0, 0, 0 };
504 nigel 77 if (options_.match_limit() > 0) {
505 nigel 87 extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
506 nigel 77 extra.match_limit = options_.match_limit();
507     }
508 nigel 87 if (options_.match_limit_recursion() > 0) {
509     extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
510     extra.match_limit_recursion = options_.match_limit_recursion();
511     }
512 nigel 77 int rc = pcre_exec(re, // The regular expression object
513     &extra,
514 nigel 87 (text.data() == NULL) ? "" : text.data(),
515 nigel 77 text.size(),
516     startpos,
517     (anchor == UNANCHORED) ? 0 : PCRE_ANCHORED,
518     vec,
519     vecsize);
520    
521     // Handle errors
522     if (rc == PCRE_ERROR_NOMATCH) {
523     return 0;
524     } else if (rc < 0) {
525     //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
526     // re, pattern_.c_str());
527     return 0;
528     } else if (rc == 0) {
529     // pcre_exec() returns 0 as a special case when the number of
530     // capturing subpatterns exceeds the size of the vector.
531     // When this happens, there is a match and the output vector
532     // is filled, but we miss out on the positions of the extra subpatterns.
533     rc = vecsize / 2;
534     }
535    
536     return rc;
537     }
538    
539     bool RE::DoMatchImpl(const StringPiece& text,
540     Anchor anchor,
541     int* consumed,
542     const Arg* const* args,
543     int n,
544     int* vec,
545     int vecsize) const {
546     assert((1 + n) * 3 <= vecsize); // results + PCRE workspace
547     int matches = TryMatch(text, 0, anchor, vec, vecsize);
548     assert(matches >= 0); // TryMatch never returns negatives
549     if (matches == 0)
550     return false;
551    
552     *consumed = vec[1];
553    
554 nigel 87 if (n == 0 || args == NULL) {
555 nigel 77 // We are not interested in results
556     return true;
557     }
558    
559 nigel 87 if (NumberOfCapturingGroups() < n) {
560     // RE has fewer capturing groups than number of arg pointers passed in
561     return false;
562     }
563    
564 nigel 77 // If we got here, we must have matched the whole pattern.
565     // We do not need (can not do) any more checks on the value of 'matches' here
566     // -- see the comment for TryMatch.
567     for (int i = 0; i < n; i++) {
568     const int start = vec[2*(i+1)];
569     const int limit = vec[2*(i+1)+1];
570     if (!args[i]->Parse(text.data() + start, limit-start)) {
571     // TODO: Should we indicate what the error was?
572     return false;
573     }
574     }
575    
576     return true;
577     }
578    
579     bool RE::DoMatch(const StringPiece& text,
580     Anchor anchor,
581     int* consumed,
582     const Arg* const args[],
583     int n) const {
584     assert(n >= 0);
585     size_t const vecsize = (1 + n) * 3; // results + PCRE workspace
586     // (as for kVecSize)
587     int space[21]; // use stack allocation for small vecsize (common case)
588     int* vec = vecsize <= 21 ? space : new int[vecsize];
589     bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize);
590     if (vec != space) delete [] vec;
591     return retval;
592     }
593    
594     bool RE::Rewrite(string *out, const StringPiece &rewrite,
595     const StringPiece &text, int *vec, int veclen) const {
596     for (const char *s = rewrite.data(), *end = s + rewrite.size();
597     s < end; s++) {
598     int c = *s;
599     if (c == '\\') {
600     c = *++s;
601     if (isdigit(c)) {
602     int n = (c - '0');
603     if (n >= veclen) {
604     //fprintf(stderr, requested group %d in regexp %.*s\n",
605     // n, rewrite.size(), rewrite.data());
606     return false;
607     }
608     int start = vec[2 * n];
609     if (start >= 0)
610     out->append(text.data() + start, vec[2 * n + 1] - start);
611     } else if (c == '\\') {
612     out->push_back('\\');
613     } else {
614     //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
615     // rewrite.size(), rewrite.data());
616     return false;
617     }
618     } else {
619     out->push_back(c);
620     }
621     }
622     return true;
623     }
624    
625     // Return the number of capturing subpatterns, or -1 if the
626     // regexp wasn't valid on construction.
627 nigel 87 int RE::NumberOfCapturingGroups() const {
628 nigel 77 if (re_partial_ == NULL) return -1;
629    
630     int result;
631     int pcre_retval = pcre_fullinfo(re_partial_, // The regular expression object
632     NULL, // We did not study the pattern
633     PCRE_INFO_CAPTURECOUNT,
634     &result);
635     assert(pcre_retval == 0);
636     return result;
637     }
638    
639     /***** Parsers for various types *****/
640    
641     bool Arg::parse_null(const char* str, int n, void* dest) {
642     // We fail if somebody asked us to store into a non-NULL void* pointer
643     return (dest == NULL);
644     }
645    
646     bool Arg::parse_string(const char* str, int n, void* dest) {
647 ph10 263 if (dest == NULL) return true;
648 nigel 77 reinterpret_cast<string*>(dest)->assign(str, n);
649     return true;
650     }
651    
652     bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
653 ph10 263 if (dest == NULL) return true;
654 nigel 77 reinterpret_cast<StringPiece*>(dest)->set(str, n);
655     return true;
656     }
657    
658     bool Arg::parse_char(const char* str, int n, void* dest) {
659     if (n != 1) return false;
660 ph10 263 if (dest == NULL) return true;
661 nigel 77 *(reinterpret_cast<char*>(dest)) = str[0];
662     return true;
663     }
664    
665     bool Arg::parse_uchar(const char* str, int n, void* dest) {
666     if (n != 1) return false;
667 ph10 263 if (dest == NULL) return true;
668 nigel 77 *(reinterpret_cast<unsigned char*>(dest)) = str[0];
669     return true;
670     }
671    
// Largest number spec that we are willing to parse
static const int kMaxNumberLength = 32;

// Prepares the n-byte numeric text at "str" for the strtoxxx() family,
// which needs NUL-terminated input.
//
// REQUIRES "buf" must have length at least kMaxNumberLength+1
// REQUIRES "n > 0"
//
// Returns one of:
//   a. "buf", holding a NUL-terminated copy of the text, whenever the text
//      fits (n <= kMaxNumberLength).  Copying unconditionally means the
//      byte after the text is never inspected, and strtoxxx() cannot run on
//      into following bytes (e.g. "0" followed by "x1F" being read as one
//      hex literal).
//   b. "str" itself, for over-long text whose following character cannot
//      extend a number; it is then parsed in place.
//   c. "" if the input was invalid and has no hope of being parsed
//      (leading whitespace, negative length, or over-long text followed by
//      a digit-like character).
static const char* TerminateNumber(char* buf, const char* str, int n) {
  if (n < 0) return "";

  // <ctype.h> functions require a value representable as unsigned char.
  if ((n > 0) && isspace(static_cast<unsigned char>(*str))) {
    // We are less forgiving than the strtoxxx() routines and do not
    // allow leading spaces.
    return "";
  }

  if (n <= kMaxNumberLength) {
    memcpy(buf, str, n);
    buf[n] = '\0';
    return buf;
  }

  // Too long to copy.  See if the character right after the input text may
  // potentially look like a digit; if so the number cannot be delimited.
  const unsigned char next = static_cast<unsigned char>(str[n]);
  if (isdigit(next) ||
      ((next >= 'a') && (next <= 'f')) ||
      ((next >= 'A') && (next <= 'F'))) {
    return "";  // Input too big to be a valid number
  }

  // We can parse right out of the supplied string, so return it.
  return str;
}
703    
704     bool Arg::parse_long_radix(const char* str,
705     int n,
706     void* dest,
707     int radix) {
708     if (n == 0) return false;
709     char buf[kMaxNumberLength+1];
710     str = TerminateNumber(buf, str, n);
711     char* end;
712     errno = 0;
713     long r = strtol(str, &end, radix);
714     if (end != str + n) return false; // Leftover junk
715     if (errno) return false;
716 ph10 263 if (dest == NULL) return true;
717 nigel 77 *(reinterpret_cast<long*>(dest)) = r;
718     return true;
719     }
720    
721     bool Arg::parse_ulong_radix(const char* str,
722     int n,
723     void* dest,
724     int radix) {
725     if (n == 0) return false;
726     char buf[kMaxNumberLength+1];
727     str = TerminateNumber(buf, str, n);
728 nigel 87 if (str[0] == '-') return false; // strtoul() on a negative number?!
729 nigel 77 char* end;
730     errno = 0;
731     unsigned long r = strtoul(str, &end, radix);
732     if (end != str + n) return false; // Leftover junk
733     if (errno) return false;
734 ph10 263 if (dest == NULL) return true;
735 nigel 77 *(reinterpret_cast<unsigned long*>(dest)) = r;
736     return true;
737     }
738    
739     bool Arg::parse_short_radix(const char* str,
740     int n,
741     void* dest,
742     int radix) {
743     long r;
744     if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
745     if (r < SHRT_MIN || r > SHRT_MAX) return false; // Out of range
746 ph10 263 if (dest == NULL) return true;
747 ph10 256 *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
748 nigel 77 return true;
749     }
750    
751     bool Arg::parse_ushort_radix(const char* str,
752     int n,
753     void* dest,
754     int radix) {
755     unsigned long r;
756     if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
757     if (r > USHRT_MAX) return false; // Out of range
758 ph10 263 if (dest == NULL) return true;
759 ph10 256 *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
760 nigel 77 return true;
761     }
762    
763     bool Arg::parse_int_radix(const char* str,
764     int n,
765     void* dest,
766     int radix) {
767     long r;
768     if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
769     if (r < INT_MIN || r > INT_MAX) return false; // Out of range
770 ph10 263 if (dest == NULL) return true;
771 nigel 77 *(reinterpret_cast<int*>(dest)) = r;
772     return true;
773     }
774    
775     bool Arg::parse_uint_radix(const char* str,
776     int n,
777     void* dest,
778     int radix) {
779     unsigned long r;
780     if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
781     if (r > UINT_MAX) return false; // Out of range
782 ph10 263 if (dest == NULL) return true;
783 nigel 77 *(reinterpret_cast<unsigned int*>(dest)) = r;
784     return true;
785     }
786    
787     bool Arg::parse_longlong_radix(const char* str,
788     int n,
789     void* dest,
790     int radix) {
791     #ifndef HAVE_LONG_LONG
792     return false;
793     #else
794     if (n == 0) return false;
795     char buf[kMaxNumberLength+1];
796     str = TerminateNumber(buf, str, n);
797     char* end;
798     errno = 0;
799     #if defined HAVE_STRTOQ
800     long long r = strtoq(str, &end, radix);
801     #elif defined HAVE_STRTOLL
802     long long r = strtoll(str, &end, radix);
803 ph10 257 #elif defined HAVE__STRTOI64
804     long long r = _strtoi64(str, &end, radix);
805 nigel 77 #else
806     #error parse_longlong_radix: cannot convert input to a long-long
807     #endif
808     if (end != str + n) return false; // Leftover junk
809     if (errno) return false;
810 ph10 263 if (dest == NULL) return true;
811 nigel 77 *(reinterpret_cast<long long*>(dest)) = r;
812     return true;
813     #endif /* HAVE_LONG_LONG */
814     }
815    
816     bool Arg::parse_ulonglong_radix(const char* str,
817     int n,
818     void* dest,
819     int radix) {
820     #ifndef HAVE_UNSIGNED_LONG_LONG
821     return false;
822     #else
823     if (n == 0) return false;
824     char buf[kMaxNumberLength+1];
825     str = TerminateNumber(buf, str, n);
826 nigel 87 if (str[0] == '-') return false; // strtoull() on a negative number?!
827 nigel 77 char* end;
828     errno = 0;
829     #if defined HAVE_STRTOQ
830     unsigned long long r = strtouq(str, &end, radix);
831     #elif defined HAVE_STRTOLL
832     unsigned long long r = strtoull(str, &end, radix);
833 ph10 257 #elif defined HAVE__STRTOI64
834     unsigned long long r = _strtoui64(str, &end, radix);
835 nigel 77 #else
836     #error parse_ulonglong_radix: cannot convert input to a long-long
837     #endif
838     if (end != str + n) return false; // Leftover junk
839     if (errno) return false;
840 ph10 263 if (dest == NULL) return true;
841 nigel 77 *(reinterpret_cast<unsigned long long*>(dest)) = r;
842     return true;
843     #endif /* HAVE_UNSIGNED_LONG_LONG */
844     }
845    
846     bool Arg::parse_double(const char* str, int n, void* dest) {
847     if (n == 0) return false;
848     static const int kMaxLength = 200;
849     char buf[kMaxLength];
850     if (n >= kMaxLength) return false;
851     memcpy(buf, str, n);
852     buf[n] = '\0';
853     errno = 0;
854     char* end;
855     double r = strtod(buf, &end);
856     if (end != buf + n) return false; // Leftover junk
857     if (errno) return false;
858 ph10 263 if (dest == NULL) return true;
859 nigel 77 *(reinterpret_cast<double*>(dest)) = r;
860     return true;
861     }
862    
863     bool Arg::parse_float(const char* str, int n, void* dest) {
864     double r;
865     if (!parse_double(str, n, &r)) return false;
866 ph10 263 if (dest == NULL) return true;
867 nigel 77 *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
868     return true;
869     }
870    
871    
872     #define DEFINE_INTEGER_PARSERS(name) \
873     bool Arg::parse_##name(const char* str, int n, void* dest) { \
874     return parse_##name##_radix(str, n, dest, 10); \
875     } \
876     bool Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
877     return parse_##name##_radix(str, n, dest, 16); \
878     } \
879     bool Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
880     return parse_##name##_radix(str, n, dest, 8); \
881     } \
882     bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
883     return parse_##name##_radix(str, n, dest, 0); \
884     }
885    
886 nigel 93 DEFINE_INTEGER_PARSERS(short) /* */
887     DEFINE_INTEGER_PARSERS(ushort) /* */
888     DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */
889     DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */
890     DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */
891     DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */
892     DEFINE_INTEGER_PARSERS(longlong) /* */
893     DEFINE_INTEGER_PARSERS(ulonglong) /* */
894 nigel 77
895     #undef DEFINE_INTEGER_PARSERS
896    
897     } // namespace pcrecpp

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12