/[pcre]/code/trunk/pcredemo.c
ViewVC logotype

Contents of /code/trunk/pcredemo.c

Parent Directory | Revision Log


Revision 945 - (hide annotations) (download)
Wed Feb 29 09:37:15 2012 UTC (2 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 15520 byte(s)
Fix typo in comment.

1 nigel 63 /*************************************************
2     * PCRE DEMONSTRATION PROGRAM *
3     *************************************************/
4 nigel 53
5 nigel 63 /* This is a demonstration program to illustrate the most straightforward ways
6     of calling the PCRE regular expression library from a C program. See the
7 ph10 319 pcresample documentation for a short discussion ("man pcresample" if you have
8 ph10 315 the PCRE man pages installed).
9 nigel 63
10 ph10 487 In Unix-like environments, if PCRE is installed in your standard system
11 ph10 477 libraries, you should be able to compile this program using this command:
12 ph10 315
13 ph10 477 gcc -Wall pcredemo.c -lpcre -o pcredemo
14 nigel 63
15 ph10 477 If PCRE is not installed in a standard place, it is likely to be installed with
16     support for the pkg-config mechanism. If you have pkg-config, you can compile
17     this program using this command:
18    
19     gcc -Wall pcredemo.c `pkg-config --cflags --libs libpcre` -o pcredemo
20    
21     If you do not have pkg-config, you may have to use this:
22    
23     gcc -Wall pcredemo.c -I/usr/local/include -L/usr/local/lib \
24     -R/usr/local/lib -lpcre -o pcredemo
25    
26 nigel 75 Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
27 ph10 477 library files for PCRE are installed on your system. Only some operating
28 nigel 63 systems (e.g. Solaris) use the -R option.
29 nigel 53
30 ph10 315 Building under Windows:
31 nigel 63
32 ph10 315 If you want to statically link this program against a non-dll .a file, you must
33     define PCRE_STATIC before including pcre.h, otherwise the pcre_malloc() and
34     pcre_free() exported functions will be declared __declspec(dllimport), with
35     unwanted results. So in this environment, uncomment the following line. */
36    
37     /* #define PCRE_STATIC */
38    
39 nigel 63 #include <stdio.h>
40     #include <string.h>
41     #include <pcre.h>
42    
43 nigel 53 #define OVECCOUNT 30 /* should be a multiple of 3 */
44    
45 nigel 63
46 nigel 53 int main(int argc, char **argv)
47     {
48     pcre *re;
49     const char *error;
50 nigel 63 char *pattern;
51     char *subject;
52     unsigned char *name_table;
53 ph10 566 unsigned int option_bits;
54 nigel 53 int erroffset;
55 nigel 63 int find_all;
56 ph10 566 int crlf_is_newline;
57 nigel 63 int namecount;
58     int name_entry_size;
59 nigel 53 int ovector[OVECCOUNT];
60 nigel 63 int subject_length;
61 nigel 53 int rc, i;
62 ph10 566 int utf8;
63 nigel 53
64 nigel 63
65 nigel 75 /**************************************************************************
66     * First, sort out the command line. There is only one possible option at *
67     * the moment, "-g" to request repeated matching to find all occurrences, *
68     * like Perl's /g option. We set the variable find_all to a non-zero value *
69     * if the -g option is present. Apart from that, there must be exactly two *
70     * arguments. *
71     **************************************************************************/
72 nigel 63
73     find_all = 0;
74     for (i = 1; i < argc; i++)
75 nigel 53 {
76 nigel 63 if (strcmp(argv[i], "-g") == 0) find_all = 1;
77     else break;
78     }
79    
80     /* After the options, we require exactly two arguments, which are the pattern,
81     and the subject string. */
82    
83     if (argc - i != 2)
84     {
85 nigel 53 printf("Two arguments required: a regex and a subject string\n");
86     return 1;
87     }
88    
89 nigel 63 pattern = argv[i];
90     subject = argv[i+1];
91     subject_length = (int)strlen(subject);
92 nigel 53
93 nigel 63
94     /*************************************************************************
95     * Now we are going to compile the regular expression pattern, and handle *
96     * any errors that are detected. *
97     *************************************************************************/
98    
99 nigel 53 re = pcre_compile(
100 nigel 63 pattern, /* the pattern */
101 nigel 53 0, /* default options */
102     &error, /* for error message */
103     &erroffset, /* for error offset */
104     NULL); /* use default character tables */
105    
106     /* Compilation failed: print the error message and exit */
107    
108     if (re == NULL)
109     {
110     printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
111     return 1;
112     }
113    
114    
115 nigel 63 /*************************************************************************
116     * If the compilation succeeded, we call PCRE again, in order to do a *
117 nigel 75 * pattern match against the subject string. This does just ONE match. If *
118 nigel 63 * further matching is needed, it will be done below. *
119     *************************************************************************/
120    
121 nigel 53 rc = pcre_exec(
122     re, /* the compiled pattern */
123     NULL, /* no extra data - we didn't study the pattern */
124 nigel 63 subject, /* the subject string */
125     subject_length, /* the length of the subject */
126 nigel 53 0, /* start at offset 0 in the subject */
127     0, /* default options */
128     ovector, /* output vector for substring information */
129     OVECCOUNT); /* number of elements in the output vector */
130    
131     /* Matching failed: handle error cases */
132    
133     if (rc < 0)
134     {
135     switch(rc)
136     {
137     case PCRE_ERROR_NOMATCH: printf("No match\n"); break;
138     /*
139     Handle other special cases if you like
140     */
141     default: printf("Matching error %d\n", rc); break;
142     }
143 nigel 91 pcre_free(re); /* Release memory used for the compiled pattern */
144 nigel 53 return 1;
145     }
146    
147     /* Match succeeded */
148    
149 nigel 63 printf("\nMatch succeeded at offset %d\n", ovector[0]);
150 nigel 53
151 nigel 63
152     /*************************************************************************
153     * We have found the first match within the subject string. If the output *
154 ph10 315 * vector wasn't big enough, say so. Then output any substrings that were *
155     * captured. *
156 nigel 63 *************************************************************************/
157    
158 nigel 53 /* The output vector wasn't big enough */
159    
160     if (rc == 0)
161     {
162     rc = OVECCOUNT/3;
163     printf("ovector only has room for %d captured substrings\n", rc - 1);
164     }
165    
166 nigel 63 /* Show substrings stored in the output vector by number. Obviously, in a real
167     application you might want to do things other than print them. */
168 nigel 53
169     for (i = 0; i < rc; i++)
170     {
171 nigel 63 char *substring_start = subject + ovector[2*i];
172 nigel 53 int substring_length = ovector[2*i+1] - ovector[2*i];
173     printf("%2d: %.*s\n", i, substring_length, substring_start);
174     }
175    
176 nigel 63
177 nigel 75 /**************************************************************************
178     * That concludes the basic part of this demonstration program. We have *
179     * compiled a pattern, and performed a single match. The code that follows *
180 ph10 315 * shows first how to access named substrings, and then how to code for *
181 nigel 75 * repeated matches on the same subject. *
182     **************************************************************************/
183 nigel 63
184     /* See if there are any named substrings, and if so, show them by name. First
185     we have to extract the count of named parentheses from the pattern. */
186    
187     (void)pcre_fullinfo(
188     re, /* the compiled pattern */
189     NULL, /* no extra data - we didn't study the pattern */
190     PCRE_INFO_NAMECOUNT, /* number of named substrings */
191     &namecount); /* where to put the answer */
192    
193     if (namecount <= 0) printf("No named substrings\n"); else
194     {
195     unsigned char *tabptr;
196     printf("Named substrings\n");
197    
198     /* Before we can access the substrings, we must extract the table for
199     translating names to numbers, and the size of each entry in the table. */
200    
201     (void)pcre_fullinfo(
202     re, /* the compiled pattern */
203     NULL, /* no extra data - we didn't study the pattern */
204     PCRE_INFO_NAMETABLE, /* address of the table */
205     &name_table); /* where to put the answer */
206    
207     (void)pcre_fullinfo(
208     re, /* the compiled pattern */
209     NULL, /* no extra data - we didn't study the pattern */
210     PCRE_INFO_NAMEENTRYSIZE, /* size of each entry in the table */
211     &name_entry_size); /* where to put the answer */
212    
213     /* Now we can scan the table and, for each entry, print the number, the name,
214     and the substring itself. */
215    
216     tabptr = name_table;
217     for (i = 0; i < namecount; i++)
218     {
219     int n = (tabptr[0] << 8) | tabptr[1];
220     printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
221     ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]);
222     tabptr += name_entry_size;
223     }
224     }
225    
226    
227     /*************************************************************************
228     * If the "-g" option was given on the command line, we want to continue *
229     * to search for additional matches in the subject string, in a similar *
230     * way to the /g option in Perl. This turns out to be trickier than you *
231     * might think because of the possibility of matching an empty string. *
232     * What happens is as follows: *
233     * *
234     * If the previous match was NOT for an empty string, we can just start *
235     * the next match at the end of the previous one. *
236     * *
237     * If the previous match WAS for an empty string, we can't do that, as it *
238     * would lead to an infinite loop. Instead, a special call of pcre_exec() *
239 ph10 442 * is made with the PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED flags set. *
240     * The first of these tells PCRE that an empty string at the start of the *
241     * subject is not a valid match; other possibilities must be tried. The *
242     * second flag restricts PCRE to one match attempt at the initial string *
243     * position. If this match succeeds, an alternative to the empty string *
244 ph10 566 * match has been found, and we can print it and proceed round the loop, *
245     * advancing by the length of whatever was found. If this match does not *
246     * succeed, we still stay in the loop, advancing by just one character. *
247     * In UTF-8 mode, which can be set by (*UTF8) in the pattern, this may be *
248     * more than one byte. *
249     * *
250     * However, there is a complication concerned with newlines. When the *
251 ph10 945 * newline convention is such that CRLF is a valid newline, we must *
252 ph10 566 * advance by two characters rather than one. The newline convention can *
253     * be set in the regex by (*CR), etc.; if not, we must find the default. *
254 nigel 63 *************************************************************************/
255    
256 ph10 566 if (!find_all) /* Check for -g */
257 nigel 75 {
258 nigel 91 pcre_free(re); /* Release the memory used for the compiled pattern */
259     return 0; /* Finish unless -g was given */
260 nigel 75 }
261 nigel 63
262 ph10 566 /* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
263 ph10 579 sequence. First, find the options with which the regex was compiled; extract
264 ph10 566 the UTF-8 state, and mask off all but the newline options. */
265    
266     (void)pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &option_bits);
267     utf8 = option_bits & PCRE_UTF8;
268     option_bits &= PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_CRLF|
269     PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF;
270    
271     /* If no newline options were set, find the default newline convention from the
272     build configuration. */
273    
274     if (option_bits == 0)
275     {
276     int d;
277     (void)pcre_config(PCRE_CONFIG_NEWLINE, &d);
278     /* Note that these values are always the ASCII ones, even in
279     EBCDIC environments. CR = 13, NL = 10. */
280     option_bits = (d == 13)? PCRE_NEWLINE_CR :
281     (d == 10)? PCRE_NEWLINE_LF :
282     (d == (13<<8 | 10))? PCRE_NEWLINE_CRLF :
283     (d == -2)? PCRE_NEWLINE_ANYCRLF :
284     (d == -1)? PCRE_NEWLINE_ANY : 0;
285     }
286    
287     /* See if CRLF is a valid newline sequence. */
288    
289 ph10 579 crlf_is_newline =
290 ph10 566 option_bits == PCRE_NEWLINE_ANY ||
291     option_bits == PCRE_NEWLINE_CRLF ||
292     option_bits == PCRE_NEWLINE_ANYCRLF;
293    
294 nigel 63 /* Loop for second and subsequent matches */
295    
296     for (;;)
297     {
298     int options = 0; /* Normally no options */
299     int start_offset = ovector[1]; /* Start at end of previous match */
300    
301     /* If the previous match was for an empty string, we are finished if we are
302     at the end of the subject. Otherwise, arrange to run another match at the
303     same point to see if a non-empty match can be found. */
304    
305     if (ovector[0] == ovector[1])
306     {
307     if (ovector[0] == subject_length) break;
308 ph10 442 options = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED;
309 nigel 63 }
310    
311     /* Run the next matching operation */
312    
313     rc = pcre_exec(
314     re, /* the compiled pattern */
315     NULL, /* no extra data - we didn't study the pattern */
316     subject, /* the subject string */
317     subject_length, /* the length of the subject */
318     start_offset, /* starting offset in the subject */
319     options, /* options */
320     ovector, /* output vector for substring information */
321     OVECCOUNT); /* number of elements in the output vector */
322    
323     /* This time, a result of NOMATCH isn't an error. If the value in "options"
324     is zero, it just means we have found all possible matches, so the loop ends.
325     Otherwise, it means we have failed to find a non-empty-string match at a
326     point where there was a previous empty-string match. In this case, we do what
327 ph10 566 Perl does: advance the matching position by one character, and continue. We
328     do this by setting the "end of previous match" offset, because that is picked
329     up at the top of the loop as the point at which to start again.
330 nigel 63
331 ph10 566 There are two complications: (a) When CRLF is a valid newline sequence, and
332     the current position is just before it, advance by an extra byte. (b)
333     Otherwise we must ensure that we skip an entire UTF-8 character if we are in
334     UTF-8 mode. */
335    
336 nigel 63 if (rc == PCRE_ERROR_NOMATCH)
337     {
338 ph10 566 if (options == 0) break; /* All matches found */
339     ovector[1] = start_offset + 1; /* Advance one byte */
340     if (crlf_is_newline && /* If CRLF is newline & */
341     start_offset < subject_length - 1 && /* we are at CRLF, */
342     subject[start_offset] == '\r' &&
343     subject[start_offset + 1] == '\n')
344     ovector[1] += 1; /* Advance by one more. */
345     else if (utf8) /* Otherwise, ensure we */
346     { /* advance a whole UTF-8 */
347     while (ovector[1] < subject_length) /* character. */
348 ph10 579 {
349 ph10 566 if ((subject[ovector[1]] & 0xc0) != 0x80) break;
350     ovector[1] += 1;
351     }
352 ph10 579 }
353 nigel 63 continue; /* Go round the loop again */
354     }
355    
356     /* Other matching errors are not recoverable. */
357    
358     if (rc < 0)
359     {
360     printf("Matching error %d\n", rc);
361 nigel 91 pcre_free(re); /* Release memory used for the compiled pattern */
362 nigel 63 return 1;
363     }
364    
365     /* Match succeeded */
366    
367     printf("\nMatch succeeded again at offset %d\n", ovector[0]);
368    
369     /* The match succeeded, but the output vector wasn't big enough. */
370    
371     if (rc == 0)
372     {
373     rc = OVECCOUNT/3;
374     printf("ovector only has room for %d captured substrings\n", rc - 1);
375     }
376    
377     /* As before, show substrings stored in the output vector by number, and then
378     also any named substrings. */
379    
380     for (i = 0; i < rc; i++)
381     {
382     char *substring_start = subject + ovector[2*i];
383     int substring_length = ovector[2*i+1] - ovector[2*i];
384     printf("%2d: %.*s\n", i, substring_length, substring_start);
385     }
386    
387     if (namecount <= 0) printf("No named substrings\n"); else
388     {
389     unsigned char *tabptr = name_table;
390     printf("Named substrings\n");
391     for (i = 0; i < namecount; i++)
392     {
393     int n = (tabptr[0] << 8) | tabptr[1];
394     printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
395     ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]);
396     tabptr += name_entry_size;
397     }
398     }
399     } /* End of loop to find second and subsequent matches */
400    
401     printf("\n");
402 nigel 91 pcre_free(re); /* Release memory used for the compiled pattern */
403 nigel 53 return 0;
404     }
405    
406 nigel 63 /* End of pcredemo.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12