Libpcre: Difference between revisions
(→Basic) |
No edit summary |
||
Line 63: | Line 63: | ||
* stringnumber Number of the required substring | * stringnumber Number of the required substring | ||
* stringptr Where to put the string pointer | * stringptr Where to put the string pointer | ||
== Example == | |||
=== URI parser === | |||
Parses the given URI into its protocol, hostname, port, and resource. | |||
<source lang=c> | |||
// main.c | |||
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <pcre.h>
#define REGEX_URI "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?" // RFC 3986 | |||
#define REGEX_HOSTNAME "^(.+):(.*)" | |||
#define IDX_PROTOCOL 2 | |||
#define IDX_ENDPOINT 4 | |||
#define IDX_RESOURCE 5 | |||
#define IDX_HOSTNAME 1 | |||
#define IDX_PORT 2 | |||
/**
 * \brief Components of a parsed URI.
 *
 * Each string member is either NULL or a heap-allocated string.
 */
typedef struct _st_hostinfo {
    char *uri;      /* original URI, e.g. http://example.com:80/test1/test2/test3/index.html */
    char *protocol; /* e.g. http */
    char *endpoint; /* e.g. example.com:80 */
    char *hostname; /* e.g. example.com */
    char *resource; /* e.g. /test1/test2/test3/index.html */
    int port;       /* e.g. 80 */
} st_hostinfo;
static bool parse_uri(st_hostinfo* hostinfo); | |||
static bool parse_endpoint(st_hostinfo* hostinfo); | |||
/** | |||
* \brief create hostinfo. Initiate all items to NULL | |||
*/ | |||
static st_hostinfo* create_hostinfo(void) { | |||
st_hostinfo* hostinfo; | |||
hostinfo = calloc(sizeof(st_hostinfo), 1); | |||
hostinfo->uri = NULL; | |||
hostinfo->protocol = NULL; | |||
hostinfo->hostname = NULL; | |||
hostinfo->resource = NULL; | |||
hostinfo->port = -1; | |||
return hostinfo; | |||
} | |||
/** | |||
* \brief destroy the hostinfo. Release all items in the given value. | |||
*/ | |||
static void destroy_hostinfo(st_hostinfo* hostinfo) { | |||
if(hostinfo == NULL) { | |||
return; | |||
} | |||
if(hostinfo->uri != NULL) { | |||
free(hostinfo->uri); | |||
} | |||
if(hostinfo->protocol != NULL) { | |||
free(hostinfo->protocol); | |||
} | |||
if(hostinfo->endpoint != NULL) { | |||
free(hostinfo->endpoint); | |||
} | |||
if(hostinfo->hostname != NULL) { | |||
free(hostinfo->hostname); | |||
} | |||
if(hostinfo->resource != NULL) { | |||
free(hostinfo->resource); | |||
} | |||
free(hostinfo); | |||
return; | |||
} | |||
/** | |||
* \brief initiate hostinfo. parse the given uri. | |||
*/ | |||
static bool init_hostinfo(st_hostinfo* hostinfo, const char* uri) | |||
{ | |||
int ret; | |||
if(hostinfo->uri != NULL) { | |||
free(hostinfo->uri); | |||
} | |||
hostinfo->uri = strdup(uri); | |||
ret = parse_uri(hostinfo); | |||
if(ret == false) { | |||
printf("Could not initiate hostinfo.\n"); | |||
return false; | |||
} | |||
ret = parse_endpoint(hostinfo); | |||
if(ret == false) { | |||
// copy the endpoint to hostname. | |||
hostinfo->hostname = strdup(hostinfo->endpoint); | |||
if(strcmp(hostinfo->protocol, "http") == 0) { | |||
hostinfo->port = 80; | |||
} | |||
else if(strcmp(hostinfo->protocol, "ftp") == 0) { | |||
hostinfo->port = 21; | |||
} | |||
else { | |||
printf("Could not get correct port info.\n"); | |||
return false; | |||
} | |||
} | |||
return true; | |||
} | |||
/** | |||
* \brief parse uri info form given hostinfo.\n | |||
* it parsing only the below items using uri. | |||
* protocol, endpoint, resource | |||
*/ | |||
static bool parse_uri(st_hostinfo* hostinfo) | |||
{ | |||
const char* tmp_err; | |||
int offset; | |||
pcre* re; | |||
pcre_extra* re_ex; | |||
int subStrVec[30]; | |||
int ret; | |||
const char* tmp_const; | |||
const char* uri; | |||
if((hostinfo == NULL) || (hostinfo->uri == NULL)) { | |||
printf("Wrong input parameter.\n"); | |||
return false; | |||
} | |||
uri = hostinfo->uri; | |||
re = pcre_compile(REGEX_URI, 0, &tmp_err, &offset, NULL); | |||
if(re == NULL) { | |||
printf("Could not compile the regex. regex=%s, err=%s\n", REGEX_URI, tmp_err); | |||
return false; | |||
} | |||
// Optimize the regex | |||
re_ex = pcre_study(re, 0, &tmp_err); | |||
if(tmp_err != NULL) { | |||
printf("Could not optimize the regex. regex=%s, err=%s\n", REGEX_URI, tmp_err); | |||
pcre_free(re); | |||
return false; | |||
} | |||
ret = pcre_exec(re, | |||
re_ex, | |||
uri, | |||
strlen(uri), // length of uri | |||
0, // Start looking at this point | |||
0, // OPTIONS | |||
subStrVec, | |||
sizeof(subStrVec) // Length of subStrVec | |||
); | |||
pcre_free(re); | |||
pcre_free(re_ex); | |||
if(ret < 0) { | |||
printf("Could not parsed input uri. uri[%s]\n", uri); | |||
return false; | |||
} | |||
// get protocol | |||
pcre_get_substring(uri, subStrVec, ret, IDX_PROTOCOL, &tmp_const); | |||
if(tmp_const == NULL) { | |||
printf("Could not get hostname info.\n"); | |||
return false; | |||
} | |||
hostinfo->protocol = strdup(tmp_const); | |||
printf("The resource. protocol[%s]\n", hostinfo->protocol); | |||
pcre_free_substring(tmp_const); | |||
// get endpoint | |||
pcre_get_substring(uri, subStrVec, ret, IDX_ENDPOINT, &tmp_const); | |||
if(tmp_const == NULL) { | |||
printf("Could not get endpoint info.\n"); | |||
return false; | |||
} | |||
hostinfo->endpoint = strdup(tmp_const); | |||
printf("The endpoint. endpoint[%s]\n", hostinfo->endpoint); | |||
pcre_free_substring(tmp_const); | |||
// get resource | |||
pcre_get_substring(uri, subStrVec, ret, IDX_RESOURCE, &tmp_const); | |||
if(tmp_const == NULL) { | |||
printf("Could not get hostname info.\n"); | |||
return false; | |||
} | |||
hostinfo->resource = strdup(tmp_const); | |||
printf("The resource. resource[%s]\n", hostinfo->resource); | |||
pcre_free_substring(tmp_const); | |||
return true; | |||
} | |||
/**
 * \brief Split hostinfo->endpoint into hostname and port.
 *
 * The endpoint must have the form "<hostname>:<port>" where the port is a
 * decimal number in the range 0-65535. hostinfo is only modified when both
 * parts are valid.
 *
 * \param hostinfo Object whose endpoint member is already set.
 * \return true when hostname and port were stored, false otherwise.
 */
static bool parse_endpoint(st_hostinfo* hostinfo)
{
    pcre* re;
    pcre_extra* re_ex;
    const char* tmp_err = NULL;
    const char* endpoint;
    const char* host_str = NULL;
    const char* port_str = NULL;
    char* hostname;
    char* end = NULL;
    long port;
    int subStrVec[30];
    int offset;
    int ret;
    bool ok = false;

    // parameter check.
    if((hostinfo == NULL) || (hostinfo->endpoint == NULL)) {
        printf("Could not set hostname and port info. Wrong input parameter.\n");
        return false;
    }

    re = pcre_compile(REGEX_HOSTNAME, 0, &tmp_err, &offset, NULL);
    if(re == NULL) {
        printf("Could not compile the regex. regex[%s], err[%s]\n", REGEX_HOSTNAME, tmp_err);
        return false;
    }

    // A NULL result without an error message is not a failure.
    re_ex = pcre_study(re, 0, &tmp_err);
    if(tmp_err != NULL) {
        printf("Could not optimize the regex. regex[%s], err[%s]\n", REGEX_HOSTNAME, tmp_err);
        pcre_free(re);
        return false;
    }

    endpoint = hostinfo->endpoint;
    ret = pcre_exec(re,
                    re_ex,
                    endpoint,
                    (int)strlen(endpoint),  // length of endpoint
                    0,                      // Start looking at this point
                    0,                      // OPTIONS
                    subStrVec,
                    // ovecsize counts ints, not bytes
                    (int)(sizeof(subStrVec) / sizeof(subStrVec[0]))
                    );

    if(re_ex != NULL) {
        pcre_free_study(re_ex);
    }
    pcre_free(re);

    if(ret < 0) {
        printf("Could not parse endpoint info.\n");
        return false;
    }

    // get hostname
    if(pcre_get_substring(endpoint, subStrVec, ret, IDX_HOSTNAME, &host_str) < 0) {
        printf("Could not get hostname info.\n");
        return false;
    }

    // get port
    if(pcre_get_substring(endpoint, subStrVec, ret, IDX_PORT, &port_str) < 0) {
        printf("Could not get port info.\n");
        pcre_free_substring(host_str);
        return false;
    }

    /* Require plain digits: the greedy hostname pattern also matches
     * endpoints such as "[::1]", where the "port" would be "1]". An
     * overflowing value comes back as LONG_MAX and fails the range check. */
    port = -1;
    if((port_str[0] >= '0') && (port_str[0] <= '9')) {
        port = strtol(port_str, &end, 10);
        if(*end != '\0') {
            port = -1;
        }
    }

    if((port < 0) || (port > 65535)) {
        printf("Could not get port info.\n");
    }
    else {
        hostname = strdup(host_str);
        if(hostname == NULL) {
            printf("Could not get hostname info.\n");
        }
        else {
            free(hostinfo->hostname);
            hostinfo->hostname = hostname;
            hostinfo->port = (int)port;
            printf("The hostname. hostname[%s]\n", hostinfo->hostname);
            printf("The port. port[%d]\n", hostinfo->port);
            ok = true;
        }
    }

    pcre_free_substring(host_str);
    pcre_free_substring(port_str);

    return ok;
}
int main(int argc, char** argv) | |||
{ | |||
int ret; | |||
st_hostinfo* hostinfo; | |||
if(argc < 2) { | |||
printf("Usage:\n"); | |||
printf(" %s <fully qualified hostname>\n", argv[0]); | |||
return 1; | |||
} | |||
hostinfo = create_hostinfo(); | |||
ret = init_hostinfo(hostinfo, argv[1]); | |||
if(ret != true) { | |||
destroy_hostinfo(hostinfo); | |||
return 0; | |||
} | |||
printf("Parsing info. uri[%s], protocol[%s], hostname[%s], port[%d], resource[%s]\n", | |||
hostinfo->uri, | |||
hostinfo->protocol, | |||
hostinfo->hostname, | |||
hostinfo->port, | |||
hostinfo->resource | |||
); | |||
destroy_hostinfo(hostinfo); | |||
return 0; | |||
} | |||
</source> | |||
'''Run''' | |||
<pre> | |||
$ valgrind --tool=memcheck --leak-check=full ./main http://test.com:8080/test1/test2.html | |||
==32580== Memcheck, a memory error detector | |||
==32580== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al. | |||
==32580== Using Valgrind-3.10.1 and LibVEX; rerun with -h for copyright info | |||
==32580== Command: ./main http://test.com:8080/test1/test2.html | |||
==32580== | |||
The resource. protocol[http] | |||
The endpoint. endpoint[test.com:8080] | |||
The resource. resource[/test1/test2.html] | |||
The hostname. hostname[test.com] | |||
The port. port[8080] | |||
Parsing info. uri[http://test.com:8080/test1/test2.html], protocol[http], hostname[test.com], port[8080], resource[/test1/test2.html] | |||
==32580== | |||
==32580== HEAP SUMMARY: | |||
==32580== in use at exit: 0 bytes in 0 blocks | |||
==32580== total heap usage: 14 allocs, 14 frees, 617 bytes allocated | |||
==32580== | |||
==32580== All heap blocks were freed -- no leaks are possible | |||
==32580== | |||
==32580== For counts of detected and suppressed errors, rerun with: -v | |||
==32580== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0) | |||
</pre> | |||
[[category:c]] | [[category:c]] | ||
[[category:regex]] | [[category:regex]] |
Revision as of 13:39, 19 August 2016
Overview
libpcre 내용 정리
Basic
pcre_exec/pcre16_exec
<source lang=c>
#include <pcre.h>
int pcre_exec(const pcre *code, const pcre_extra *extra,
const char *subject, int length, int startoffset, int options, int *ovector, int ovecsize);
int pcre16_exec(const pcre16 *code, const pcre16_extra *extra,
PCRE_SPTR16 subject, int length, int startoffset, int options, int *ovector, int ovecsize);
</source>
- code : Points to the compiled pattern.
- extra : Points to an associated pcre[16]_extra structure, or is NULL.
- subject : Points to the subject string.
- length : Length of the subject string, in bytes.
- startoffset : Offset in bytes in the subject at which to start matching.
- options : Option bits.
- ovector : Points to a vector of ints for result offsets.
- ovecsize : Number of elements in the vector (a multiple of 3)
Options
- PCRE_ANCHORED : Match only at the first position
- PCRE_BSR_ANYCRLF : \R matches only CR, LF, or CRLF
- PCRE_BSR_UNICODE : \R matches all Unicode line endings
- PCRE_NEWLINE_ANY : Recognize any Unicode newline sequence
- PCRE_NEWLINE_ANYCRLF : Recognize CR, LF, & CRLF as newline sequences
- PCRE_NEWLINE_CR : Recognize CR as the only newline sequence
- PCRE_NEWLINE_CRLF : Recognize CRLF as the only newline sequence
- PCRE_NEWLINE_LF : Recognize LF as the only newline sequence
- PCRE_NOTBOL : Subject string is not the beginning of a line
- PCRE_NOTEOL : Subject string is not the end of a line
- PCRE_NOTEMPTY : An empty string is not a valid match
- PCRE_NOTEMPTY_ATSTART : An empty string at the start of the subject is not a valid match
- PCRE_NO_START_OPTIMIZE : Do not do "start-match" optimizations
- PCRE_NO_UTF16_CHECK : Do not check the subject for UTF-16 validity (only relevant if PCRE_UTF16 was set at compile time)
- PCRE_NO_UTF8_CHECK : Do not check the subject for UTF-8 validity (only relevant if PCRE_UTF8 was set at compile time)
- PCRE_PARTIAL, PCRE_PARTIAL_SOFT : Return PCRE_ERROR_PARTIAL for a partial match if no full matches are found
- PCRE_PARTIAL_HARD : Return PCRE_ERROR_PARTIAL for a partial match if that is found before a full match
pcre_get_substring/pcre16_get_substring
<source lang=c>
#include <pcre.h>
int pcre_get_substring(const char *subject, int *ovector,
int stringcount, int stringnumber, const char **stringptr);
int pcre16_get_substring(PCRE_SPTR16 subject, int *ovector,
int stringcount, int stringnumber, PCRE_SPTR16 *stringptr);
</source>
- subject Subject that has been successfully matched
- ovector Offset vector that pcre[16]_exec() used
- stringcount Value returned by pcre[16]_exec()
- stringnumber Number of the required substring
- stringptr Where to put the string pointer
Example
URI parser
Parses the given URI into its protocol, hostname, port, and resource.
<source lang=c>
// main.c
#include <stdio.h>
#include <pcre.h>
#include <string.h>
#include <stdbool.h>
#define REGEX_URI "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?" // RFC 3986
#define REGEX_HOSTNAME "^(.+):(.*)"
#define IDX_PROTOCOL 2
#define IDX_ENDPOINT 4
#define IDX_RESOURCE 5
#define IDX_HOSTNAME 1
#define IDX_PORT 2
/**
 * \brief Components of a parsed URI.
 *
 * Each string member is either NULL or a heap-allocated string.
 */
typedef struct _st_hostinfo {
    char *uri;      /* original URI, e.g. http://example.com:80/test1/test2/test3/index.html */
    char *protocol; /* e.g. http */
    char *endpoint; /* e.g. example.com:80 */
    char *hostname; /* e.g. example.com */
    char *resource; /* e.g. /test1/test2/test3/index.html */
    int port;       /* e.g. 80 */
} st_hostinfo;
static bool parse_uri(st_hostinfo* hostinfo); static bool parse_endpoint(st_hostinfo* hostinfo);
/**
* \brief create hostinfo. Initiate all items to NULL */
static st_hostinfo* create_hostinfo(void) {
st_hostinfo* hostinfo; hostinfo = calloc(sizeof(st_hostinfo), 1); hostinfo->uri = NULL; hostinfo->protocol = NULL; hostinfo->hostname = NULL; hostinfo->resource = NULL; hostinfo->port = -1; return hostinfo;
}
/**
 * \brief Release a hostinfo object together with every string it holds.
 *
 * A NULL hostinfo is accepted and ignored.
 */
static void destroy_hostinfo(st_hostinfo* hostinfo) {
    if(hostinfo != NULL) {
        /* free() ignores NULL, so unset members need no individual guard. */
        free(hostinfo->uri);
        free(hostinfo->protocol);
        free(hostinfo->endpoint);
        free(hostinfo->hostname);
        free(hostinfo->resource);
        free(hostinfo);
    }
}
/**
 * \brief Initiate hostinfo by parsing the given uri.
 *
 * Stores a copy of uri, splits it with parse_uri() and then splits the
 * endpoint with parse_endpoint(). When the endpoint cannot be split into
 * hostname and port, the whole endpoint is used as the hostname and the
 * port is derived from the protocol (http: 80, ftp: 21).
 *
 * \param hostinfo Object to fill in. Must not be NULL.
 * \param uri      URI to parse. Must not be NULL.
 * \return true on success, false otherwise.
 */
static bool init_hostinfo(st_hostinfo* hostinfo, const char* uri)
{
    char* uri_copy;

    if((hostinfo == NULL) || (uri == NULL)) {
        printf("Could not initiate hostinfo.\n");
        return false;
    }

    /* Duplicate before releasing the old value, so that passing
     * hostinfo->uri itself as uri does not read freed memory. */
    uri_copy = strdup(uri);
    if(uri_copy == NULL) {
        printf("Could not initiate hostinfo.\n");
        return false;
    }
    free(hostinfo->uri);
    hostinfo->uri = uri_copy;

    if(parse_uri(hostinfo) == false) {
        printf("Could not initiate hostinfo.\n");
        return false;
    }

    if(parse_endpoint(hostinfo) == false) {
        if((hostinfo->endpoint == NULL) || (hostinfo->protocol == NULL)) {
            printf("Could not get correct port info.\n");
            return false;
        }

        // copy the endpoint to hostname, releasing any previous hostname.
        free(hostinfo->hostname);
        hostinfo->hostname = strdup(hostinfo->endpoint);
        if(hostinfo->hostname == NULL) {
            printf("Could not initiate hostinfo.\n");
            return false;
        }

        if(strcmp(hostinfo->protocol, "http") == 0) {
            hostinfo->port = 80;
        }
        else if(strcmp(hostinfo->protocol, "ftp") == 0) {
            hostinfo->port = 21;
        }
        else {
            printf("Could not get correct port info.\n");
            return false;
        }
    }

    return true;
}
/**
 * \brief Copy one captured group of a match into *dest.
 *
 * The previous string in *dest is released once the new copy exists.
 *
 * \return true on success, false if the group is unavailable or memory ran out.
 */
static bool uri_store_group(const char* subject, int* ovector, int count, int group, char** dest)
{
    const char* captured = NULL;
    char* copy;

    /* pcre_get_substring() reports failure through a negative return
     * value and leaves the output pointer untouched in that case. */
    if(pcre_get_substring(subject, ovector, count, group, &captured) < 0) {
        return false;
    }

    copy = strdup(captured);
    pcre_free_substring(captured);
    if(copy == NULL) {
        return false;
    }

    free(*dest);
    *dest = copy;
    return true;
}

/**
 * \brief Parse the uri stored in the given hostinfo.
 *
 * Only the items below are filled in from hostinfo->uri:
 * protocol, endpoint, resource
 *
 * \param hostinfo Object whose uri member is already set.
 * \return true on success, false otherwise.
 */
static bool parse_uri(st_hostinfo* hostinfo)
{
    const char* tmp_err = NULL;
    int offset;
    pcre* re;
    pcre_extra* re_ex;
    int subStrVec[30];
    int ret;
    const char* uri;

    if((hostinfo == NULL) || (hostinfo->uri == NULL)) {
        printf("Wrong input parameter.\n");
        return false;
    }
    uri = hostinfo->uri;

    re = pcre_compile(REGEX_URI, 0, &tmp_err, &offset, NULL);
    if(re == NULL) {
        printf("Could not compile the regex. regex=%s, err=%s\n", REGEX_URI, tmp_err);
        return false;
    }

    // Optimize the regex. A NULL result without an error message is not a failure.
    re_ex = pcre_study(re, 0, &tmp_err);
    if(tmp_err != NULL) {
        printf("Could not optimize the regex. regex=%s, err=%s\n", REGEX_URI, tmp_err);
        pcre_free(re);
        return false;
    }

    ret = pcre_exec(re,
                    re_ex,
                    uri,
                    (int)strlen(uri),   // length of uri
                    0,                  // Start looking at this point
                    0,                  // OPTIONS
                    subStrVec,
                    // ovecsize counts ints, not bytes
                    (int)(sizeof(subStrVec) / sizeof(subStrVec[0]))
                    );

    if(re_ex != NULL) {
        pcre_free_study(re_ex);
    }
    pcre_free(re);

    if(ret < 0) {
        printf("Could not parsed input uri. uri[%s]\n", uri);
        return false;
    }

    // get protocol
    if(uri_store_group(uri, subStrVec, ret, IDX_PROTOCOL, &hostinfo->protocol) == false) {
        printf("Could not get protocol info.\n");
        return false;
    }
    printf("The resource. protocol[%s]\n", hostinfo->protocol);

    // get endpoint
    if(uri_store_group(uri, subStrVec, ret, IDX_ENDPOINT, &hostinfo->endpoint) == false) {
        printf("Could not get endpoint info.\n");
        return false;
    }
    printf("The endpoint. endpoint[%s]\n", hostinfo->endpoint);

    // get resource
    if(uri_store_group(uri, subStrVec, ret, IDX_RESOURCE, &hostinfo->resource) == false) {
        printf("Could not get resource info.\n");
        return false;
    }
    printf("The resource. resource[%s]\n", hostinfo->resource);

    return true;
}
/**
 * \brief Split hostinfo->endpoint into hostname and port.
 *
 * The endpoint must have the form "<hostname>:<port>" where the port is a
 * decimal number in the range 0-65535. hostinfo is only modified when both
 * parts are valid.
 *
 * \param hostinfo Object whose endpoint member is already set.
 * \return true when hostname and port were stored, false otherwise.
 */
static bool parse_endpoint(st_hostinfo* hostinfo)
{
    pcre* re;
    pcre_extra* re_ex;
    const char* tmp_err = NULL;
    const char* endpoint;
    const char* host_str = NULL;
    const char* port_str = NULL;
    char* hostname;
    char* end = NULL;
    long port;
    int subStrVec[30];
    int offset;
    int ret;
    bool ok = false;

    // parameter check.
    if((hostinfo == NULL) || (hostinfo->endpoint == NULL)) {
        printf("Could not set hostname and port info. Wrong input parameter.\n");
        return false;
    }

    re = pcre_compile(REGEX_HOSTNAME, 0, &tmp_err, &offset, NULL);
    if(re == NULL) {
        printf("Could not compile the regex. regex[%s], err[%s]\n", REGEX_HOSTNAME, tmp_err);
        return false;
    }

    // A NULL result without an error message is not a failure.
    re_ex = pcre_study(re, 0, &tmp_err);
    if(tmp_err != NULL) {
        printf("Could not optimize the regex. regex[%s], err[%s]\n", REGEX_HOSTNAME, tmp_err);
        pcre_free(re);
        return false;
    }

    endpoint = hostinfo->endpoint;
    ret = pcre_exec(re,
                    re_ex,
                    endpoint,
                    (int)strlen(endpoint),  // length of endpoint
                    0,                      // Start looking at this point
                    0,                      // OPTIONS
                    subStrVec,
                    // ovecsize counts ints, not bytes
                    (int)(sizeof(subStrVec) / sizeof(subStrVec[0]))
                    );

    if(re_ex != NULL) {
        pcre_free_study(re_ex);
    }
    pcre_free(re);

    if(ret < 0) {
        printf("Could not parse endpoint info.\n");
        return false;
    }

    // get hostname
    if(pcre_get_substring(endpoint, subStrVec, ret, IDX_HOSTNAME, &host_str) < 0) {
        printf("Could not get hostname info.\n");
        return false;
    }

    // get port
    if(pcre_get_substring(endpoint, subStrVec, ret, IDX_PORT, &port_str) < 0) {
        printf("Could not get port info.\n");
        pcre_free_substring(host_str);
        return false;
    }

    /* Require plain digits: the greedy hostname pattern also matches
     * endpoints such as "[::1]", where the "port" would be "1]". An
     * overflowing value comes back as LONG_MAX and fails the range check. */
    port = -1;
    if((port_str[0] >= '0') && (port_str[0] <= '9')) {
        port = strtol(port_str, &end, 10);
        if(*end != '\0') {
            port = -1;
        }
    }

    if((port < 0) || (port > 65535)) {
        printf("Could not get port info.\n");
    }
    else {
        hostname = strdup(host_str);
        if(hostname == NULL) {
            printf("Could not get hostname info.\n");
        }
        else {
            free(hostinfo->hostname);
            hostinfo->hostname = hostname;
            hostinfo->port = (int)port;
            printf("The hostname. hostname[%s]\n", hostinfo->hostname);
            printf("The port. port[%d]\n", hostinfo->port);
            ok = true;
        }
    }

    pcre_free_substring(host_str);
    pcre_free_substring(port_str);

    return ok;
}
int main(int argc, char** argv) {
int ret; st_hostinfo* hostinfo; if(argc < 2) { printf("Usage:\n"); printf(" %s <fully qualified hostname>\n", argv[0]); return 1; } hostinfo = create_hostinfo(); ret = init_hostinfo(hostinfo, argv[1]); if(ret != true) { destroy_hostinfo(hostinfo); return 0; } printf("Parsing info. uri[%s], protocol[%s], hostname[%s], port[%d], resource[%s]\n", hostinfo->uri, hostinfo->protocol, hostinfo->hostname, hostinfo->port, hostinfo->resource ); destroy_hostinfo(hostinfo); return 0;
} </source>
Run
$ valgrind --tool=memcheck --leak-check=full ./main http://test.com:8080/test1/test2.html
==32580== Memcheck, a memory error detector
==32580== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al.
==32580== Using Valgrind-3.10.1 and LibVEX; rerun with -h for copyright info
==32580== Command: ./main http://test.com:8080/test1/test2.html
==32580==
The resource. protocol[http]
The endpoint. endpoint[test.com:8080]
The resource. resource[/test1/test2.html]
The hostname. hostname[test.com]
The port. port[8080]
Parsing info. uri[http://test.com:8080/test1/test2.html], protocol[http], hostname[test.com], port[8080], resource[/test1/test2.html]
==32580==
==32580== HEAP SUMMARY:
==32580== in use at exit: 0 bytes in 0 blocks
==32580== total heap usage: 14 allocs, 14 frees, 617 bytes allocated
==32580==
==32580== All heap blocks were freed -- no leaks are possible
==32580==
==32580== For counts of detected and suppressed errors, rerun with: -v
==32580== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)