/[pcre]/code/trunk/doc/html/pcredemo.html
ViewVC logotype

Contents of /code/trunk/doc/html/pcredemo.html

Parent Directory Parent Directory | Revision Log Revision Log


Revision 954 - (hide annotations) (download) (as text)
Sat Mar 31 18:09:26 2012 UTC (2 years, 4 months ago) by ph10
File MIME type: text/html
File size: 16192 byte(s)
Add date and PCRE version to .TH macros of all man pages.

1 ph10 429 <html>
2     <head>
3     <title>pcredemo specification</title>
4     </head>
5     <body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
6     <h1>pcredemo man page</h1>
7     <p>
8     Return to the <a href="index.html">PCRE index page</a>.
9     </p>
10     <p>
11     This page is part of the PCRE HTML documentation. It was generated automatically
12     from the original man page. If there is any nonsense in it, please consult the
13     man page, in case the conversion went wrong.
14     <br>
15     <ul>
16     </ul>
17     <PRE>
18     /*************************************************
19     * PCRE DEMONSTRATION PROGRAM *
20     *************************************************/
21    
22     /* This is a demonstration program to illustrate the most straightforward ways
23     of calling the PCRE regular expression library from a C program. See the
24     pcresample documentation for a short discussion ("man pcresample" if you have
25     the PCRE man pages installed).
26    
27 ph10 487 In Unix-like environments, if PCRE is installed in your standard system
28     libraries, you should be able to compile this program using this command:
29 ph10 429
30 ph10 487 gcc -Wall pcredemo.c -lpcre -o pcredemo
31 ph10 429
32 ph10 487 If PCRE is not installed in a standard place, it is likely to be installed with
33     support for the pkg-config mechanism. If you have pkg-config, you can compile
34     this program using this command:
35    
36     gcc -Wall pcredemo.c `pkg-config --cflags --libs libpcre` -o pcredemo
37    
38     If you do not have pkg-config, you may have to use this:
39    
40     gcc -Wall pcredemo.c -I/usr/local/include -L/usr/local/lib \
41     -R/usr/local/lib -lpcre -o pcredemo
42    
43 ph10 429 Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
44 ph10 487 library files for PCRE are installed on your system. Only some operating
45 ph10 429 systems (e.g. Solaris) use the -R option.
46    
47     Building under Windows:
48    
49     If you want to statically link this program against a non-dll .a file, you must
50     define PCRE_STATIC before including pcre.h, otherwise the pcre_malloc() and
51     pcre_free() exported functions will be declared __declspec(dllimport), with
52     unwanted results. So in this environment, uncomment the following line. */
53    
54     /* #define PCRE_STATIC */
55    
56     #include &lt;stdio.h&gt;
57     #include &lt;string.h&gt;
58     #include &lt;pcre.h&gt;
59    
60     #define OVECCOUNT 30 /* should be a multiple of 3 */
61    
62    
63     int main(int argc, char **argv)
64     {
65     pcre *re;
66     const char *error;
67     char *pattern;
68     char *subject;
69     unsigned char *name_table;
70 ph10 567 unsigned int option_bits;
71 ph10 429 int erroffset;
72     int find_all;
73 ph10 567 int crlf_is_newline;
74 ph10 429 int namecount;
75     int name_entry_size;
76     int ovector[OVECCOUNT];
77     int subject_length;
78     int rc, i;
79 ph10 567 int utf8;
80 ph10 429
81    
82     /**************************************************************************
83     * First, sort out the command line. There is only one possible option at *
84     * the moment, "-g" to request repeated matching to find all occurrences, *
85     * like Perl's /g option. We set the variable find_all to a non-zero value *
86     * if the -g option is present. Apart from that, there must be exactly two *
87     * arguments. *
88     **************************************************************************/
89    
90     find_all = 0;
91     for (i = 1; i &lt; argc; i++)
92     {
93     if (strcmp(argv[i], "-g") == 0) find_all = 1;
94     else break;
95     }
96    
97     /* After the options, we require exactly two arguments, which are the pattern,
98     and the subject string. */
99    
100     if (argc - i != 2)
101     {
102     printf("Two arguments required: a regex and a subject string\n");
103     return 1;
104     }
105    
106     pattern = argv[i];
107     subject = argv[i+1];
108     subject_length = (int)strlen(subject);
109    
110    
111     /*************************************************************************
112     * Now we are going to compile the regular expression pattern, and handle *
113     * any errors that are detected. *
114     *************************************************************************/
115    
116     re = pcre_compile(
117     pattern, /* the pattern */
118     0, /* default options */
119     &amp;error, /* for error message */
120     &amp;erroffset, /* for error offset */
121     NULL); /* use default character tables */
122    
123     /* Compilation failed: print the error message and exit */
124    
125     if (re == NULL)
126     {
127     printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
128     return 1;
129     }
130    
131    
132     /*************************************************************************
133     * If the compilation succeeded, we call PCRE again, in order to do a *
134     * pattern match against the subject string. This does just ONE match. If *
135     * further matching is needed, it will be done below. *
136     *************************************************************************/
137    
138     rc = pcre_exec(
139     re, /* the compiled pattern */
140     NULL, /* no extra data - we didn't study the pattern */
141     subject, /* the subject string */
142     subject_length, /* the length of the subject */
143     0, /* start at offset 0 in the subject */
144     0, /* default options */
145     ovector, /* output vector for substring information */
146     OVECCOUNT); /* number of elements in the output vector */
147    
148     /* Matching failed: handle error cases */
149    
150     if (rc &lt; 0)
151     {
152     switch(rc)
153     {
154     case PCRE_ERROR_NOMATCH: printf("No match\n"); break;
155     /*
156     Handle other special cases if you like
157     */
158     default: printf("Matching error %d\n", rc); break;
159     }
160     pcre_free(re); /* Release memory used for the compiled pattern */
161     return 1;
162     }
163    
164     /* Match succeeded */
165    
166     printf("\nMatch succeeded at offset %d\n", ovector[0]);
167    
168    
169     /*************************************************************************
170     * We have found the first match within the subject string. If the output *
171     * vector wasn't big enough, say so. Then output any substrings that were *
172     * captured. *
173     *************************************************************************/
174    
175     /* The output vector wasn't big enough */
176    
177     if (rc == 0)
178     {
179     rc = OVECCOUNT/3;
180     printf("ovector only has room for %d captured substrings\n", rc - 1);
181     }
182    
183     /* Show substrings stored in the output vector by number. Obviously, in a real
184     application you might want to do things other than print them. */
185    
186     for (i = 0; i &lt; rc; i++)
187     {
188     char *substring_start = subject + ovector[2*i];
189     int substring_length = ovector[2*i+1] - ovector[2*i];
190     printf("%2d: %.*s\n", i, substring_length, substring_start);
191     }
192    
193    
194     /**************************************************************************
195     * That concludes the basic part of this demonstration program. We have *
196     * compiled a pattern, and performed a single match. The code that follows *
197     * shows first how to access named substrings, and then how to code for *
198     * repeated matches on the same subject. *
199     **************************************************************************/
200    
201     /* See if there are any named substrings, and if so, show them by name. First
202     we have to extract the count of named parentheses from the pattern. */
203    
204     (void)pcre_fullinfo(
205     re, /* the compiled pattern */
206     NULL, /* no extra data - we didn't study the pattern */
207     PCRE_INFO_NAMECOUNT, /* number of named substrings */
208     &amp;namecount); /* where to put the answer */
209    
210     if (namecount &lt;= 0) printf("No named substrings\n"); else
211     {
212     unsigned char *tabptr;
213     printf("Named substrings\n");
214    
215     /* Before we can access the substrings, we must extract the table for
216     translating names to numbers, and the size of each entry in the table. */
217    
218     (void)pcre_fullinfo(
219     re, /* the compiled pattern */
220     NULL, /* no extra data - we didn't study the pattern */
221     PCRE_INFO_NAMETABLE, /* address of the table */
222     &amp;name_table); /* where to put the answer */
223    
224     (void)pcre_fullinfo(
225     re, /* the compiled pattern */
226     NULL, /* no extra data - we didn't study the pattern */
227     PCRE_INFO_NAMEENTRYSIZE, /* size of each entry in the table */
228     &amp;name_entry_size); /* where to put the answer */
229    
230     /* Now we can scan the table and, for each entry, print the number, the name,
231     and the substring itself. */
232    
233     tabptr = name_table;
234     for (i = 0; i &lt; namecount; i++)
235     {
236     int n = (tabptr[0] &lt;&lt; 8) | tabptr[1];
237     printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
238     ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]);
239     tabptr += name_entry_size;
240     }
241     }
242    
243    
244     /*************************************************************************
245     * If the "-g" option was given on the command line, we want to continue *
246     * to search for additional matches in the subject string, in a similar *
247     * way to the /g option in Perl. This turns out to be trickier than you *
248     * might think because of the possibility of matching an empty string. *
249     * What happens is as follows: *
250     * *
251     * If the previous match was NOT for an empty string, we can just start *
252     * the next match at the end of the previous one. *
253     * *
254     * If the previous match WAS for an empty string, we can't do that, as it *
255     * would lead to an infinite loop. Instead, a special call of pcre_exec() *
256 ph10 453 * is made with the PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED flags set. *
257     * The first of these tells PCRE that an empty string at the start of the *
258     * subject is not a valid match; other possibilities must be tried. The *
259     * second flag restricts PCRE to one match attempt at the initial string *
260     * position. If this match succeeds, an alternative to the empty string *
261 ph10 567 * match has been found, and we can print it and proceed round the loop, *
262     * advancing by the length of whatever was found. If this match does not *
263     * succeed, we still stay in the loop, advancing by just one character. *
264     * In UTF-8 mode, which can be set by (*UTF8) in the pattern, this may be *
265     * more than one byte. *
266     * *
267     * However, there is a complication concerned with newlines. When the *
268 ph10 954 * newline convention is such that CRLF is a valid newline, we must *
269 ph10 567 * advance by two characters rather than one. The newline convention can *
270     * be set in the regex by (*CR), etc.; if not, we must find the default. *
271 ph10 429 *************************************************************************/
272    
273 ph10 567 if (!find_all) /* Check for -g */
274 ph10 429 {
275     pcre_free(re); /* Release the memory used for the compiled pattern */
276     return 0; /* Finish unless -g was given */
277     }
278    
279 ph10 567 /* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
280 ph10 579 sequence. First, find the options with which the regex was compiled; extract
281 ph10 567 the UTF-8 state, and mask off all but the newline options. */
282    
283     (void)pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &amp;option_bits);
284     utf8 = option_bits &amp; PCRE_UTF8;
285     option_bits &amp;= PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_CRLF|
286     PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF;
287    
288     /* If no newline options were set, find the default newline convention from the
289     build configuration. */
290    
291     if (option_bits == 0)
292     {
293     int d;
294     (void)pcre_config(PCRE_CONFIG_NEWLINE, &amp;d);
295     /* Note that these values are always the ASCII ones, even in
296     EBCDIC environments. CR = 13, NL = 10. */
297     option_bits = (d == 13)? PCRE_NEWLINE_CR :
298     (d == 10)? PCRE_NEWLINE_LF :
299     (d == (13&lt;&lt;8 | 10))? PCRE_NEWLINE_CRLF :
300     (d == -2)? PCRE_NEWLINE_ANYCRLF :
301     (d == -1)? PCRE_NEWLINE_ANY : 0;
302     }
303    
304     /* See if CRLF is a valid newline sequence. */
305    
306 ph10 579 crlf_is_newline =
307 ph10 567 option_bits == PCRE_NEWLINE_ANY ||
308     option_bits == PCRE_NEWLINE_CRLF ||
309     option_bits == PCRE_NEWLINE_ANYCRLF;
310    
311 ph10 429 /* Loop for second and subsequent matches */
312    
313     for (;;)
314     {
315     int options = 0; /* Normally no options */
316     int start_offset = ovector[1]; /* Start at end of previous match */
317    
318     /* If the previous match was for an empty string, we are finished if we are
319     at the end of the subject. Otherwise, arrange to run another match at the
320     same point to see if a non-empty match can be found. */
321    
322     if (ovector[0] == ovector[1])
323     {
324     if (ovector[0] == subject_length) break;
325 ph10 453 options = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED;
326 ph10 429 }
327    
328     /* Run the next matching operation */
329    
330     rc = pcre_exec(
331     re, /* the compiled pattern */
332     NULL, /* no extra data - we didn't study the pattern */
333     subject, /* the subject string */
334     subject_length, /* the length of the subject */
335     start_offset, /* starting offset in the subject */
336     options, /* options */
337     ovector, /* output vector for substring information */
338     OVECCOUNT); /* number of elements in the output vector */
339    
340     /* This time, a result of NOMATCH isn't an error. If the value in "options"
341     is zero, it just means we have found all possible matches, so the loop ends.
342     Otherwise, it means we have failed to find a non-empty-string match at a
343     point where there was a previous empty-string match. In this case, we do what
344 ph10 567 Perl does: advance the matching position by one character, and continue. We
345     do this by setting the "end of previous match" offset, because that is picked
346     up at the top of the loop as the point at which to start again.
347 ph10 429
348 ph10 567 There are two complications: (a) When CRLF is a valid newline sequence, and
349     the current position is just before it, advance by an extra byte. (b)
350     Otherwise we must ensure that we skip an entire UTF-8 character if we are in
351     UTF-8 mode. */
352    
353 ph10 429 if (rc == PCRE_ERROR_NOMATCH)
354     {
355 ph10 567 if (options == 0) break; /* All matches found */
356     ovector[1] = start_offset + 1; /* Advance one byte */
357     if (crlf_is_newline &amp;&amp; /* If CRLF is newline &amp; */
358     start_offset &lt; subject_length - 1 &amp;&amp; /* we are at CRLF, */
359     subject[start_offset] == '\r' &amp;&amp;
360     subject[start_offset + 1] == '\n')
361     ovector[1] += 1; /* Advance by one more. */
362     else if (utf8) /* Otherwise, ensure we */
363     { /* advance a whole UTF-8 */
364     while (ovector[1] &lt; subject_length) /* character. */
365 ph10 579 {
366 ph10 567 if ((subject[ovector[1]] &amp; 0xc0) != 0x80) break;
367     ovector[1] += 1;
368     }
369 ph10 579 }
370 ph10 429 continue; /* Go round the loop again */
371     }
372    
373     /* Other matching errors are not recoverable. */
374    
375     if (rc &lt; 0)
376     {
377     printf("Matching error %d\n", rc);
378     pcre_free(re); /* Release memory used for the compiled pattern */
379     return 1;
380     }
381    
382     /* Match succeeded */
383    
384     printf("\nMatch succeeded again at offset %d\n", ovector[0]);
385    
386     /* The match succeeded, but the output vector wasn't big enough. */
387    
388     if (rc == 0)
389     {
390     rc = OVECCOUNT/3;
391     printf("ovector only has room for %d captured substrings\n", rc - 1);
392     }
393    
394     /* As before, show substrings stored in the output vector by number, and then
395     also any named substrings. */
396    
397     for (i = 0; i &lt; rc; i++)
398     {
399     char *substring_start = subject + ovector[2*i];
400     int substring_length = ovector[2*i+1] - ovector[2*i];
401     printf("%2d: %.*s\n", i, substring_length, substring_start);
402     }
403    
404     if (namecount &lt;= 0) printf("No named substrings\n"); else
405     {
406     unsigned char *tabptr = name_table;
407     printf("Named substrings\n");
408     for (i = 0; i &lt; namecount; i++)
409     {
410     int n = (tabptr[0] &lt;&lt; 8) | tabptr[1];
411     printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
412     ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]);
413     tabptr += name_entry_size;
414     }
415     }
416     } /* End of loop to find second and subsequent matches */
417    
418     printf("\n");
419     pcre_free(re); /* Release memory used for the compiled pattern */
420     return 0;
421     }
422    
423     /* End of pcredemo.c */
424     </PRE><p>
425     Return to the <a href="index.html">PCRE index page</a>.
426     </p>

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12