Libpcre
Overview
libpcre 내용 정리
Basic
pcre_exec/pcre16_exec
<source lang=c>
#include <pcre.h>
int pcre_exec(const pcre *code, const pcre_extra *extra,
const char *subject, int length, int startoffset, int options, int *ovector, int ovecsize);
int pcre16_exec(const pcre16 *code, const pcre16_extra *extra,
PCRE_SPTR16 subject, int length, int startoffset, int options, int *ovector, int ovecsize);
</source>
- code : Points to the compiled pattern.
- extra : Points to an associated pcre[16]_extra structure, or is NULL.
- subject : Points to the subject string.
- length : Length of the subject string, in bytes.
- startoffset : Offset in bytes in the subject at which to start matching.
- options : Option bits.
- ovector : Points to a vector of ints for result offsets.
- ovecsize : Number of elements in the vector (a multiple of 3).
Options
- PCRE_ANCHORED : Match only at the first position
- PCRE_BSR_ANYCRLF : \R matches only CR, LF, or CRLF
- PCRE_BSR_UNICODE : \R matches all Unicode line endings
- PCRE_NEWLINE_ANY : Recognize any Unicode newline sequence
- PCRE_NEWLINE_ANYCRLF : Recognize CR, LF, & CRLF as newline sequences
- PCRE_NEWLINE_CR : Recognize CR as the only newline sequence
- PCRE_NEWLINE_CRLF : Recognize CRLF as the only newline sequence
- PCRE_NEWLINE_LF : Recognize LF as the only newline sequence
- PCRE_NOTBOL : Subject string is not the beginning of a line
- PCRE_NOTEOL : Subject string is not the end of a line
- PCRE_NOTEMPTY : An empty string is not a valid match
- PCRE_NOTEMPTY_ATSTART : An empty string at the start of the subject is not a valid match
- PCRE_NO_START_OPTIMIZE : Do not do "start-match" optimizations
- PCRE_NO_UTF16_CHECK : Do not check the subject for UTF-16 validity (only relevant if PCRE_UTF16 was set at compile time)
- PCRE_NO_UTF8_CHECK : Do not check the subject for UTF-8 validity (only relevant if PCRE_UTF8 was set at compile time)
- PCRE_PARTIAL, PCRE_PARTIAL_SOFT : Return PCRE_ERROR_PARTIAL for a partial match if no full matches are found
- PCRE_PARTIAL_HARD : Return PCRE_ERROR_PARTIAL for a partial match if that is found before a full match
EXTRACTING CAPTURED SUBSTRINGS BY NUMBER
<source lang=c> int pcre_copy_substring(const char *subject, int *ovector,
int stringcount, int stringnumber, char *buffer, int buffersize);
int pcre_get_substring(const char *subject, int *ovector,
int stringcount, int stringnumber, const char **stringptr);
int pcre_get_substring_list(const char *subject,
int *ovector, int stringcount, const char ***listptr);
</source>
EXTRACTING CAPTURED SUBSTRINGS BY NAME
<source lang=c> int pcre_get_stringnumber(const pcre *code,
const char *name);
int pcre_copy_named_substring(const pcre *code,
const char *subject, int *ovector, int stringcount, const char *stringname, char *buffer, int buffersize);
int pcre_get_named_substring(const pcre *code,
const char *subject, int *ovector, int stringcount, const char *stringname, const char **stringptr);
</source>
Example
URI parser
Parses the given URI into its protocol, hostname, port, resource, etc. <source lang=c> // main.c
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <pcre.h>
// Regex that splits a URI into its components.
#define REGEX_URI "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?" // RFC 3986
// Regex that splits an endpoint into "<hostname>:<port>".
#define REGEX_HOSTNAME "^(.+):(.*)"
// Capture group numbers inside REGEX_URI.
#define IDX_PROTOCOL 2
#define IDX_ENDPOINT 4
#define IDX_RESOURCE 5
// Capture group numbers inside REGEX_HOSTNAME.
#define IDX_HOSTNAME 1
#define IDX_PORT 2
// Size of the pcre_exec() offset vector: REGEX_URI has 9 groups plus the
// whole match, and each of those 10 entries needs 3 ints.
#define DEF_SUBVEC_SIZE 30
/**
 * \brief Pieces of a parsed uri plus the resolved address of its host.
 *
 * Every string member is either NULL or heap memory that
 * destroy_hostinfo() releases with free().
 */
typedef struct _st_hostinfo {
    char* uri;       // original uri. "http://example.com:80/test1/test2/test3/index.html"
    char* protocol;  // "http"
    char* endpoint;  // "example.com:80"
    char* hostname;  // "example.com"
    char* hostaddr;  // "127.0.0.1"
    char* resource;  // "/test1/test2/test3/index.html"
    int port;        // 80
} st_hostinfo;
/* Forward declarations of the helpers that init_hostinfo() calls. */
static bool parse_uri(st_hostinfo *hostinfo);
static bool parse_endpoint(st_hostinfo *hostinfo);
static bool resolve_addr(st_hostinfo *hostinfo);
/**
* \brief create hostinfo. Initiate all items to NULL */
static st_hostinfo* create_hostinfo(void) {
st_hostinfo* hostinfo; hostinfo = calloc(sizeof(st_hostinfo), 1); hostinfo->uri = NULL; hostinfo->protocol = NULL; hostinfo->hostname = NULL; hostinfo->hostaddr = NULL; hostinfo->resource = NULL; hostinfo->port = -1; return hostinfo;
}
/**
 * \brief Release a hostinfo together with every string it holds.
 *
 * A NULL hostinfo is accepted and ignored.
 */
static void destroy_hostinfo(st_hostinfo* hostinfo) {
    if(!hostinfo) {
        return;
    }

    // free(NULL) is a no-op, so members that were never filled need no guard.
    free(hostinfo->uri);
    free(hostinfo->protocol);
    free(hostinfo->endpoint);
    free(hostinfo->hostname);
    free(hostinfo->hostaddr);
    free(hostinfo->resource);

    free(hostinfo);
}
/**
 * \brief initiate hostinfo. parse the given uri and resolve its host.
 *
 * parse_uri() fills protocol, endpoint and resource, parse_endpoint() fills
 * hostname and port, and resolve_addr() fills hostaddr. When parse_endpoint()
 * fails, the whole endpoint is used as the hostname and the port is guessed
 * from the protocol ("http": 80, "ftp": 21).
 *
 * \param hostinfo hostinfo to fill.
 * \param uri      uri to parse. It is copied; the caller keeps its own string.
 * \return Success:true, Fail:false
 */
static bool init_hostinfo(st_hostinfo* hostinfo, const char* uri) {
    if((hostinfo == NULL) || (uri == NULL)) {
        printf("Could not initiate hostinfo. Wrong input parameter.\n");
        return false;
    }

    hostinfo->uri = strdup(uri);
    if(hostinfo->uri == NULL) {
        printf("Could not initiate hostinfo. Out of memory.\n");
        return false;
    }

    // parsing the protocol, endpoint, resource
    if(!parse_uri(hostinfo)) {
        printf("Could not initiate hostinfo.\n");
        return false;
    }

    // parsing the hostname and port
    if(!parse_endpoint(hostinfo)) {
        // No given port number.
        // copy the endpoint to hostname.
        // guess the common port number using protocol.

        // Both strings are dereferenced below; refuse to continue without them.
        if((hostinfo->endpoint == NULL) || (hostinfo->protocol == NULL)) {
            printf("Could not initiate hostinfo.\n");
            return false;
        }

        // Drop a hostname that a partly successful parse_endpoint() may have stored.
        free(hostinfo->hostname);
        hostinfo->hostname = strdup(hostinfo->endpoint);
        if(hostinfo->hostname == NULL) {
            printf("Could not initiate hostinfo. Out of memory.\n");
            return false;
        }

        if(strcmp(hostinfo->protocol, "http") == 0) {
            hostinfo->port = 80;
        }
        else if(strcmp(hostinfo->protocol, "ftp") == 0) {
            hostinfo->port = 21;
        }
        else {
            printf("Could not get correct port info.\n");
            return false;
        }
    }

    if(!resolve_addr(hostinfo)) {
        printf("Could not initiate hostinfo.\n");
        return false;
    }

    return true;
}
/**
 * \brief Duplicate one captured substring with strdup().
 *
 * pcre_get_substring() returns memory that has to be released with
 * pcre_free_substring(), while destroy_hostinfo() releases the hostinfo
 * members with free(); copying keeps the two allocators apart.
 *
 * \return Newly allocated copy, or NULL when the group cannot be extracted
 *         or the copy cannot be allocated.
 */
static char* parse_uri_dup_group(const char* subject, int* ovector, int stringcount, int group) {
    const char* captured = NULL;
    char* copy;

    // The output pointer is only meaningful when the call reports success.
    if(pcre_get_substring(subject, ovector, stringcount, group, &captured) < 0) {
        return NULL;
    }

    copy = strdup(captured);
    pcre_free_substring(captured);
    return copy;
}

/**
 * \brief parse the uri stored in the given hostinfo.
 *
 * Only the items below are extracted, using REGEX_URI:
 * protocol, endpoint, resource
 *
 * \param hostinfo hostinfo whose uri member is already set.
 * \return Success:true, Fail:false
 */
static bool parse_uri(st_hostinfo* hostinfo) {
    const char* tmp_err = NULL;
    int err_offset = 0;
    pcre* re;
    pcre_extra* re_ex;
    int subStrVec[DEF_SUBVEC_SIZE];
    int ret;
    const char* uri;
    char* piece;

    if((hostinfo == NULL) || (hostinfo->uri == NULL)) {
        printf("Wrong input parameter.\n");
        return false;
    }
    uri = hostinfo->uri;

    re = pcre_compile(REGEX_URI, 0, &tmp_err, &err_offset, NULL);
    if(re == NULL) {
        printf("Could not compile the regex. regex=%s, err=%s\n", REGEX_URI, tmp_err);
        return false;
    }

    // Optimize the regex. A NULL result alone is not an error; a set error text is.
    re_ex = pcre_study(re, 0, &tmp_err);
    if(tmp_err != NULL) {
        printf("Could not optimize the regex. regex=%s, err=%s\n", REGEX_URI, tmp_err);
        pcre_free(re);
        return false;
    }

    ret = pcre_exec(re,
                    re_ex,
                    uri,
                    (int)strlen(uri),   // length of uri
                    0,                  // Start looking at this point
                    0,                  // OPTIONS
                    subStrVec,
                    DEF_SUBVEC_SIZE     // Length of subStrVec
                    );

    // Study data has its own release function.
    if(re_ex != NULL) {
        pcre_free_study(re_ex);
    }
    pcre_free(re);

    if(ret < 0) {
        printf("Could not parsed input uri. uri[%s]\n", uri);
        return false;
    }
    if(ret == 0) {
        // The match succeeded but the vector was too small for all groups;
        // only the first DEF_SUBVEC_SIZE / 3 pairs are valid.
        ret = DEF_SUBVEC_SIZE / 3;
    }

    // get protocol
    piece = parse_uri_dup_group(uri, subStrVec, ret, IDX_PROTOCOL);
    if(piece == NULL) {
        printf("Could not get protocol info.\n");
        return false;
    }
    hostinfo->protocol = piece;
    printf("The resource. protocol[%s]\n", hostinfo->protocol);

    // get endpoint
    piece = parse_uri_dup_group(uri, subStrVec, ret, IDX_ENDPOINT);
    if(piece == NULL) {
        printf("Could not get endpoint info.\n");
        return false;
    }
    hostinfo->endpoint = piece;
    printf("The endpoint. endpoint[%s]\n", hostinfo->endpoint);

    // get resource
    piece = parse_uri_dup_group(uri, subStrVec, ret, IDX_RESOURCE);
    if(piece == NULL) {
        printf("Could not get resource info.\n");
        return false;
    }
    hostinfo->resource = piece;
    printf("The resource. resource[%s]\n", hostinfo->resource);

    return true;
}
/**
 * \brief parse the endpoint stored in the given hostinfo into hostname and port.
 *
 * The endpoint has to match REGEX_HOSTNAME and the port has to be a plain
 * decimal number in the range 0-65535. hostinfo is only modified when both
 * pieces are valid, so a failed call leaves hostname and port untouched.
 *
 * \param hostinfo hostinfo whose endpoint member is already set.
 * \return Success:true, Fail:false
 */
static bool parse_endpoint(st_hostinfo* hostinfo) {
    pcre* re;
    pcre_extra* re_ex;
    const char* tmp_err = NULL;
    const char* endpoint;
    const char* host_str = NULL;
    const char* port_str = NULL;
    char* port_end = NULL;
    char* hostname;
    long port;
    int subStrVec[DEF_SUBVEC_SIZE];
    int err_offset = 0;
    int ret;

    // parameter check.
    if((hostinfo == NULL) || (hostinfo->endpoint == NULL)) {
        printf("Could not set hostname and port info. Wrong input parameter.\n");
        return false;
    }

    re = pcre_compile(REGEX_HOSTNAME, 0, &tmp_err, &err_offset, NULL);
    if(re == NULL) {
        printf("Could not compile the reges. regex[%s], err[%s]\n", REGEX_HOSTNAME, tmp_err);
        return false;
    }

    // A NULL study result alone is not an error; a set error text is.
    re_ex = pcre_study(re, 0, &tmp_err);
    if(tmp_err != NULL) {
        printf("Could not optimize the regex. regex[%s], err[%s]\n", REGEX_HOSTNAME, tmp_err);
        pcre_free(re);
        return false;
    }

    endpoint = hostinfo->endpoint;
    ret = pcre_exec(re,
                    re_ex,
                    endpoint,
                    (int)strlen(endpoint),  // length of endpoint
                    0,                      // Start looking at this point
                    0,                      // OPTIONS
                    subStrVec,
                    DEF_SUBVEC_SIZE         // Length of subStrVec
                    );

    // Study data has its own release function.
    if(re_ex != NULL) {
        pcre_free_study(re_ex);
    }
    pcre_free(re);

    if(ret < 0) {
        printf("Could not parse endpoint info.\n");
        return false;
    }
    if(ret == 0) {
        // Matched, but the vector was too small; only DEF_SUBVEC_SIZE / 3 pairs are valid.
        ret = DEF_SUBVEC_SIZE / 3;
    }

    // get hostname
    if(pcre_get_substring(endpoint, subStrVec, ret, IDX_HOSTNAME, &host_str) < 0) {
        printf("Could not get hostname info.\n");
        return false;
    }

    // get port
    if(pcre_get_substring(endpoint, subStrVec, ret, IDX_PORT, &port_str) < 0) {
        printf("Could not get port info.\n");
        pcre_free_substring(host_str);
        return false;
    }

    // Require digits only: strtol() alone would also accept blanks and a sign,
    // and the range check rejects the LONG_MAX it returns on overflow.
    port = -1;
    if((port_str[0] >= '0') && (port_str[0] <= '9')) {
        port = strtol(port_str, &port_end, 10);
        if(*port_end != '\0') {
            port = -1;
        }
    }
    if((port < 0) || (port > 65535)) {
        printf("Could not get port info.\n");
        pcre_free_substring(host_str);
        pcre_free_substring(port_str);
        return false;
    }

    // Copy with strdup() because destroy_hostinfo() releases hostname with free().
    hostname = strdup(host_str);
    pcre_free_substring(host_str);
    pcre_free_substring(port_str);
    if(hostname == NULL) {
        printf("Could not get hostname info.\n");
        return false;
    }

    hostinfo->hostname = hostname;
    printf("The hostname. hostname[%s]\n", hostinfo->hostname);

    hostinfo->port = (int)port;
    printf("The port. port[%d]\n", hostinfo->port);

    return true;
}
/**
 * \brief resolve the hostname to ip address.
 *
 * Looks the hostname up with gethostbyname() and stores the first address,
 * formatted as an IPv4 dotted string, in hostinfo->hostaddr.
 *
 * \param hostinfo hostinfo whose hostname member is already set.
 * \return Success:true, Fail:false
 */
static bool resolve_addr(st_hostinfo* hostinfo) {
    struct hostent* tmp_hostent;
    char addr[INET_ADDRSTRLEN];
    char* addr_copy;

    if((hostinfo == NULL) || (hostinfo->hostname == NULL)) {
        printf("Could not resolve hostaddr. Wrong input parameter.\n");
        return false;
    }

    // get ip address
    tmp_hostent = gethostbyname(hostinfo->hostname);
    if(tmp_hostent == NULL) {
        printf("Could not get ip address.\n");
        return false;
    }

    // The buffer and the AF_INET conversion below only fit an IPv4 address,
    // and the address list may be empty.
    if((tmp_hostent->h_addrtype != AF_INET)
        || (tmp_hostent->h_addr_list == NULL)
        || (tmp_hostent->h_addr_list[0] == NULL)) {
        printf("Could not get ip address.\n");
        return false;
    }

    // inet_ntop() reports failure through its return value.
    if(inet_ntop(AF_INET, tmp_hostent->h_addr_list[0], addr, sizeof(addr)) == NULL) {
        printf("Could not get ip address.\n");
        return false;
    }

    addr_copy = strdup(addr);
    if(addr_copy == NULL) {
        printf("Could not get ip address.\n");
        return false;
    }
    hostinfo->hostaddr = addr_copy;
    printf("The hostaddr. hostaddr[%s]\n", hostinfo->hostaddr);

    return true;
}
/**
 * \brief Parse the uri given as the first argument and print its parts.
 * \return 0 on success, 1 on wrong usage or when the uri cannot be processed.
 */
int main(int argc, char** argv) {
    st_hostinfo* hostinfo;

    if(argc < 2) {
        printf("Usage:\n");
        printf(" %s <uri>\n", argv[0]);
        return 1;
    }

    // A NULL hostinfo is rejected by init_hostinfo() and ignored by
    // destroy_hostinfo(), so no separate check is needed here.
    hostinfo = create_hostinfo();
    if(!init_hostinfo(hostinfo, argv[1])) {
        destroy_hostinfo(hostinfo);
        return 1;
    }

    printf("Parsing info. uri[%s], protocol[%s], hostname[%s], hostaddr[%s], port[%d], resource[%s]\n",
        hostinfo->uri,
        hostinfo->protocol,
        hostinfo->hostname,
        hostinfo->hostaddr,
        hostinfo->port,
        hostinfo->resource
        );

    destroy_hostinfo(hostinfo);
    return 0;
} </source>
Run
$ valgrind --tool=memcheck --leak-check=full ./main http://test.com:8080/test1/test2.html ==32580== Memcheck, a memory error detector ==32580== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al. ==32580== Using Valgrind-3.10.1 and LibVEX; rerun with -h for copyright info ==32580== Command: ./main http://test.com:8080/test1/test2.html ==32580== The resource. protocol[http] The endpoint. endpoint[test.com:8080] The resource. resource[/test1/test2.html] The hostname. hostname[test.com] The port. port[8080] Parsing info. uri[http://test.com:8080/test1/test2.html], protocol[http], hostname[test.com], port[8080], resource[/test1/test2.html] ==32580== ==32580== HEAP SUMMARY: ==32580== in use at exit: 0 bytes in 0 blocks ==32580== total heap usage: 14 allocs, 14 frees, 617 bytes allocated ==32580== ==32580== All heap blocks were freed -- no leaks are possible ==32580== ==32580== For counts of detected and suppressed errors, rerun with: -v ==32580== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)
hostname parser
<source lang=c>
#include <stdio.h>
#include <string.h>
#include <pcre.h>
// Regex with two capture groups separated by ':'.
#define DEF_TEST_REGEX "(.+):(.+)"
// Size of the pcre_exec() offset vector; it has to be a multiple of 3.
// The whole match plus the two groups need 3 entries of 3 ints each.
#define DEF_MATCHES_SIZE 9
/**
 * \brief Match the first argument against DEF_TEST_REGEX and print the
 *        whole match followed by its two capture groups.
 * \return 0 on success, 1 on wrong usage or when matching fails.
 */
int main(int argc, char** argv) {
    pcre* re;
    int re_res;
    const char* pcreErrorStr = NULL;
    int pcreErrorOffset = 0;
    int matches[DEF_MATCHES_SIZE];
    char tmp[100];
    int group;

    if(argc < 2) {
        printf("Usage:\n");
        printf(" %s <test text>\n", argv[0]);
        return 1;
    }
    printf("regex info. string[%s]\n", argv[1]);

    re = pcre_compile(DEF_TEST_REGEX, 0, &pcreErrorStr, &pcreErrorOffset, NULL);
    if(re == NULL) {
        printf("Could not compile the regex. regex[%s]\n", DEF_TEST_REGEX);
        return 1;
    }

    re_res = pcre_exec(re, NULL, argv[1], (int)strlen(argv[1]), 0, 0, matches, DEF_MATCHES_SIZE);
    if(re_res < 0) {
        printf("Could not parse correctly.\n");
        pcre_free(re);
        return 1;
    }

    // Group 0 is the whole match, groups 1 and 2 are the two captures.
    // pcre_copy_substring() writes into tmp, so nothing has to be freed per group.
    for(group = 0; group <= 2; group++) {
        if(pcre_copy_substring(argv[1], matches, re_res, group, tmp, (int)sizeof(tmp)) < 0) {
            printf("Could not copy the substring. group[%d]\n", group);
            pcre_free(re);
            return 1;
        }
        printf("Result. tmp[%s]\n", tmp);
    }

    pcre_free(re);
    return 0;
} </source>
Run
$ valgrind --tool=memcheck --leak-check=full ./main test.com:80 ==10874== Memcheck, a memory error detector ==10874== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al. ==10874== Using Valgrind-3.10.1 and LibVEX; rerun with -h for copyright info ==10874== Command: ./main test.com:80 ==10874== regex info. string[test.com:80] Result. tmp[test.com:80] Result. tmp[test.com] Result. tmp[80] ==10874== ==10874== HEAP SUMMARY: ==10874== in use at exit: 0 bytes in 0 blocks ==10874== total heap usage: 1 allocs, 1 frees, 77 bytes allocated ==10874== ==10874== All heap blocks were freed -- no leaks are possible ==10874== ==10874== For counts of detected and suppressed errors, rerun with: -v ==10874== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)