Libpcre
Overview
libpcre 내용 정리
Basic
pcre_exec/pcre16_exec
<source lang=c>
#include <pcre.h>
int pcre_exec(const pcre *code, const pcre_extra *extra,
const char *subject, int length, int startoffset, int options, int *ovector, int ovecsize);
int pcre16_exec(const pcre16 *code, const pcre16_extra *extra,
PCRE_SPTR16 subject, int length, int startoffset, int options, int *ovector, int ovecsize);
</source>
- code : Points to the compiled pattern.
- extra : Points to an associated pcre[16]_extra structure, or is NULL.
- subject : Points to the subject string.
- length : Length of the subject string, in bytes.
- startoffset : Offset in bytes in the subject at which to start matching.
- options : Option bits.
- ovector : Points to a vector of ints for result offsets.
- ovecsize : Number of elements in the vector (a multiple of 3).
Options
- PCRE_ANCHORED : Match only at the first position
- PCRE_BSR_ANYCRLF : \R matches only CR, LF, or CRLF
- PCRE_BSR_UNICODE : \R matches all Unicode line endings
- PCRE_NEWLINE_ANY : Recognize any Unicode newline sequence
- PCRE_NEWLINE_ANYCRLF : Recognize CR, LF, & CRLF as newline sequences
- PCRE_NEWLINE_CR : Recognize CR as the only newline sequence
- PCRE_NEWLINE_CRLF : Recognize CRLF as the only newline sequence
- PCRE_NEWLINE_LF : Recognize LF as the only newline sequence
- PCRE_NOTBOL : Subject string is not the beginning of a line
- PCRE_NOTEOL : Subject string is not the end of a line
- PCRE_NOTEMPTY : An empty string is not a valid match
- PCRE_NOTEMPTY_ATSTART : An empty string at the start of the subject is not a valid match
- PCRE_NO_START_OPTIMIZE : Do not do "start-match" optimizations
- PCRE_NO_UTF16_CHECK : Do not check the subject for UTF-16 validity (only relevant if PCRE_UTF16 was set at compile time)
- PCRE_NO_UTF8_CHECK : Do not check the subject for UTF-8 validity (only relevant if PCRE_UTF8 was set at compile time)
- PCRE_PARTIAL, PCRE_PARTIAL_SOFT : Return PCRE_ERROR_PARTIAL for a partial match if no full matches are found
- PCRE_PARTIAL_HARD : Return PCRE_ERROR_PARTIAL for a partial match if that is found before a full match
EXTRACTING CAPTURED SUBSTRINGS BY NUMBER
<source lang=c> int pcre_copy_substring(const char *subject, int *ovector,
int stringcount, int stringnumber, char *buffer, int buffersize);
int pcre_get_substring(const char *subject, int *ovector,
int stringcount, int stringnumber, const char **stringptr);
int pcre_get_substring_list(const char *subject,
int *ovector, int stringcount, const char ***listptr);
</source>
EXTRACTING CAPTURED SUBSTRINGS BY NAME
<source lang=c> int pcre_get_stringnumber(const pcre *code,
const char *name);
int pcre_copy_named_substring(const pcre *code,
const char *subject, int *ovector, int stringcount, const char *stringname, char *buffer, int buffersize);
int pcre_get_named_substring(const pcre *code,
const char *subject, int *ovector, int stringcount, const char *stringname, const char **stringptr);
</source>
Example
URI parser
Parse the given URI into its protocol, hostname, port, resource, etc. <source lang=c> // main.c
#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <pcre.h>
// Regular expression that splits a URI reference into its components.
#define REGEX_URI "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?" // RFC 3986
// Splits "host:port"; the first group is greedy, so the split happens at the last ':'.
#define REGEX_HOSTNAME "^(.+):(.*)"
// Capture-group numbers used with REGEX_URI.
#define IDX_PROTOCOL 2
#define IDX_ENDPOINT 4
#define IDX_RESOURCE 5
// Capture-group numbers used with REGEX_HOSTNAME.
#define IDX_HOSTNAME 1
#define IDX_PORT 2
// Number of ints in the output vector handed to pcre_exec() (a multiple of 3).
#define DEF_SUBVEC_SIZE 30
/**
 * \brief The pieces of one URI plus the address its host resolves to.
 *
 * The string members are heap copies (or NULL while unset) that
 * destroy_hostinfo() releases with free().
 */
typedef struct _st_hostinfo {
    char* uri;      // original uri. "http://example.com:80/test1/test2/test3/index.html"
    char* protocol; // "http"
    char* endpoint; // "example.com:80"
    char* hostname; // "example.com"
    char* hostaddr; // "127.0.0.1"
    char* resource; // "/test1/test2/test3/index.html"
    int port;       // 80
} st_hostinfo;
/* Prototypes for the helpers that init_hostinfo() calls before their definitions appear. */
static bool parse_uri(st_hostinfo* hostinfo);
static bool parse_endpoint(st_hostinfo* hostinfo);
static bool resolve_addr(st_hostinfo* hostinfo);
/**
* \brief create hostinfo. Initiate all items to NULL */
static st_hostinfo* create_hostinfo(void) {
st_hostinfo* hostinfo; hostinfo = calloc(sizeof(st_hostinfo), 1); hostinfo->uri = NULL; hostinfo->protocol = NULL; hostinfo->hostname = NULL; hostinfo->hostaddr = NULL; hostinfo->resource = NULL; hostinfo->port = -1; return hostinfo;
}
/**
 * \brief destroy the hostinfo.
 *
 * Frees every string member and then the hostinfo itself.
 * Passing NULL is allowed and does nothing.
 */
static void destroy_hostinfo(st_hostinfo* hostinfo) {
    if(hostinfo != NULL) {
        // Same release order as the members are declared in the struct.
        char* owned[] = {
            hostinfo->uri,
            hostinfo->protocol,
            hostinfo->endpoint,
            hostinfo->hostname,
            hostinfo->hostaddr,
            hostinfo->resource
        };
        size_t idx;

        for(idx = 0; idx < sizeof(owned) / sizeof(owned[0]); idx++) {
            free(owned[idx]); // free(NULL) is a no-op
        }
        free(hostinfo);
    }
}
/**
 * \brief initiate hostinfo. parse the given uri and resolve its host.
 *
 * Stores a copy of the uri, splits it with parse_uri() and parse_endpoint(),
 * then resolves the hostname with resolve_addr(). When the endpoint carries
 * no usable port, the whole endpoint is used as the hostname and the port is
 * guessed from the protocol ("http" -> 80, "ftp" -> 21).
 *
 * \param hostinfo object obtained from create_hostinfo()
 * \param uri      uri text to parse; it is copied, not kept
 * \return Success:true, Fail:false. Members filled in before a failure stay
 *         set and are released by destroy_hostinfo().
 */
static bool init_hostinfo(st_hostinfo* hostinfo, const char* uri) {
    if((hostinfo == NULL) || (uri == NULL)) {
        printf("Could not initiate hostinfo. Wrong input parameter.\n");
        return false;
    }

    hostinfo->uri = strdup(uri);
    if(hostinfo->uri == NULL) {
        printf("Could not initiate hostinfo. Out of memory.\n");
        return false;
    }

    // parsing the protocol, endpoint, resource
    if(parse_uri(hostinfo) == false) {
        printf("Could not initiate hostinfo.\n");
        return false;
    }

    // parsing the hostname and port
    if(parse_endpoint(hostinfo) == false) {
        // No given port number.
        // copy the endpoint to hostname.
        // guess the common port number using protocol.
        if((hostinfo->endpoint == NULL) || (hostinfo->protocol == NULL)) {
            // strdup()/strcmp() below must never be handed NULL.
            printf("Could not initiate hostinfo.\n");
            return false;
        }

        // parse_endpoint() may have stored a hostname before it failed.
        free(hostinfo->hostname);
        hostinfo->hostname = strdup(hostinfo->endpoint);
        if(hostinfo->hostname == NULL) {
            printf("Could not initiate hostinfo. Out of memory.\n");
            return false;
        }

        if(strcmp(hostinfo->protocol, "http") == 0) {
            hostinfo->port = 80;
        }
        else if(strcmp(hostinfo->protocol, "ftp") == 0) {
            hostinfo->port = 21;
        }
        else {
            printf("Could not get correct port info.\n");
            return false;
        }
    }

    if(resolve_addr(hostinfo) == false) {
        printf("Could not initiate hostinfo.\n");
        return false;
    }

    return true;
}
/**
 * \brief copy one captured group of the uri match into a new heap string.
 * \param what name of the item, used only in the error message
 * \return Success: string to release with free(), Fail: NULL
 */
static char* dup_uri_group(const char* uri, int* ovector, int count, int index, const char* what) {
    const char* captured = NULL;
    char* copy;
    int rc;

    // On error pcre_get_substring() leaves the output pointer untouched, so
    // the return code has to be checked as well.
    rc = pcre_get_substring(uri, ovector, count, index, &captured);
    if((rc < 0) || (captured == NULL)) {
        printf("Could not get %s info.\n", what);
        return NULL;
    }

    copy = strdup(captured);
    pcre_free_substring(captured);
    if(copy == NULL) {
        printf("Could not get %s info.\n", what);
    }
    return copy;
}

/**
 * \brief parse uri info from the given hostinfo.
 *
 * Only the items below are filled in, from hostinfo->uri:
 * protocol, endpoint, resource.
 * All three are stored together; on failure hostinfo is left unchanged.
 *
 * \return Success:true, Fail:false
 */
static bool parse_uri(st_hostinfo* hostinfo) {
    const char* tmp_err = NULL;
    const char* uri;
    int offset = 0;
    pcre* re;
    pcre_extra* re_ex;
    int subStrVec[DEF_SUBVEC_SIZE];
    int ret;
    size_t uri_len;
    char* protocol;
    char* endpoint;
    char* resource;

    if((hostinfo == NULL) || (hostinfo->uri == NULL)) {
        printf("Wrong input parameter.\n");
        return false;
    }
    uri = hostinfo->uri;

    // pcre_exec() takes the subject length as int.
    uri_len = strlen(uri);
    if(uri_len > (size_t)INT_MAX) {
        printf("Could not parsed input uri. uri[%s]\n", uri);
        return false;
    }

    re = pcre_compile(REGEX_URI, 0, &tmp_err, &offset, NULL);
    if(re == NULL) {
        printf("Could not compile the regex. regex=%s, err=%s\n", REGEX_URI, tmp_err);
        return false;
    }

    // Optimize the regex. A NULL result with no error message only means
    // there is nothing extra to use.
    re_ex = pcre_study(re, 0, &tmp_err);
    if(tmp_err != NULL) {
        printf("Could not optimize the regex. regex=%s, err=%s\n", REGEX_URI, tmp_err);
        pcre_free(re);
        return false;
    }

    ret = pcre_exec(re, re_ex,
            uri,
            (int)uri_len,       // length of uri
            0,                  // Start looking at this point
            0,                  // OPTIONS
            subStrVec,
            DEF_SUBVEC_SIZE     // Length of subStrVec
            );
    if(re_ex != NULL) {
        pcre_free_study(re_ex);
    }
    pcre_free(re);

    if(ret < 0) {
        printf("Could not parsed input uri. uri[%s]\n", uri);
        return false;
    }
    if(ret == 0) {
        // The match succeeded but the vector was too small for every group;
        // the first DEF_SUBVEC_SIZE / 3 entries are still valid.
        ret = DEF_SUBVEC_SIZE / 3;
    }

    protocol = dup_uri_group(uri, subStrVec, ret, IDX_PROTOCOL, "protocol");
    endpoint = dup_uri_group(uri, subStrVec, ret, IDX_ENDPOINT, "endpoint");
    resource = dup_uri_group(uri, subStrVec, ret, IDX_RESOURCE, "resource");
    if((protocol == NULL) || (endpoint == NULL) || (resource == NULL)) {
        free(protocol);
        free(endpoint);
        free(resource);
        return false;
    }

    hostinfo->protocol = protocol;
    printf("The protocol. protocol[%s]\n", hostinfo->protocol);
    hostinfo->endpoint = endpoint;
    printf("The endpoint. endpoint[%s]\n", hostinfo->endpoint);
    hostinfo->resource = resource;
    printf("The resource. resource[%s]\n", hostinfo->resource);

    return true;
}
/**
 * \brief split hostinfo->endpoint ("host:port") into hostname and port.
 *
 * The port must be a plain decimal number in the range 0-65535. hostname and
 * port are stored together; on failure hostinfo is left unchanged.
 *
 * \return Success:true, Fail:false (also when the endpoint has no usable port)
 */
static bool parse_endpoint(st_hostinfo* hostinfo) {
    pcre* re;
    pcre_extra* re_ex;
    const char* tmp_err = NULL;
    const char* endpoint;
    const char* host_str = NULL;
    const char* port_str = NULL;
    char* hostname;
    char* end = NULL;
    int subStrVec[DEF_SUBVEC_SIZE];
    int offset = 0;
    int ret;
    long port;
    size_t endpoint_len;

    // parameter check.
    if((hostinfo == NULL) || (hostinfo->endpoint == NULL)) {
        printf("Could not set hostname and port info. Wrong input parameter.\n");
        return false;
    }
    endpoint = hostinfo->endpoint;

    // pcre_exec() takes the subject length as int.
    endpoint_len = strlen(endpoint);
    if(endpoint_len > (size_t)INT_MAX) {
        printf("Could not parse endpoint info.\n");
        return false;
    }

    re = pcre_compile(REGEX_HOSTNAME, 0, &tmp_err, &offset, NULL);
    if(re == NULL) {
        printf("Could not compile the regex. regex[%s], err[%s]\n", REGEX_HOSTNAME, tmp_err);
        return false;
    }

    re_ex = pcre_study(re, 0, &tmp_err);
    if(tmp_err != NULL) {
        printf("Could not optimize the regex. regex[%s], err[%s]\n", REGEX_HOSTNAME, tmp_err);
        pcre_free(re);
        return false;
    }

    ret = pcre_exec(re, re_ex,
            endpoint,
            (int)endpoint_len,  // length of endpoint
            0,                  // Start looking at this point
            0,                  // OPTIONS
            subStrVec,
            DEF_SUBVEC_SIZE     // Length of subStrVec
            );
    if(re_ex != NULL) {
        pcre_free_study(re_ex);
    }
    pcre_free(re);

    if(ret < 0) {
        printf("Could not parse endpoint info.\n");
        return false;
    }
    if(ret == 0) {
        // Matched, but the vector was too small for every group.
        ret = DEF_SUBVEC_SIZE / 3;
    }

    // get port first, so nothing is stored when it turns out to be unusable.
    if((pcre_get_substring(endpoint, subStrVec, ret, IDX_PORT, &port_str) < 0) || (port_str == NULL)) {
        printf("Could not get port info.\n");
        return false;
    }
    errno = 0;
    port = strtol(port_str, &end, 10);
    if((port_str[0] < '0') || (port_str[0] > '9')   // rejects "", sign and leading blanks
            || (*end != '\0')                        // rejects trailing garbage
            || (errno == ERANGE) || (port > 65535)) {
        printf("Could not get port info.\n");
        pcre_free_substring(port_str);
        return false;
    }
    pcre_free_substring(port_str);

    // get hostname
    if((pcre_get_substring(endpoint, subStrVec, ret, IDX_HOSTNAME, &host_str) < 0) || (host_str == NULL)) {
        printf("Could not get hostname info.\n");
        return false;
    }
    hostname = strdup(host_str);
    pcre_free_substring(host_str);
    if(hostname == NULL) {
        printf("Could not get hostname info.\n");
        return false;
    }

    hostinfo->hostname = hostname;
    printf("The hostname. hostname[%s]\n", hostinfo->hostname);
    hostinfo->port = (int)port;
    printf("The port. port[%d]\n", hostinfo->port);

    return true;
}
/**
 * \brief resolve the hostname to ip address.
 *
 * Looks up hostinfo->hostname and stores the first IPv4 address, in dotted
 * text form, in hostinfo->hostaddr.
 *
 * \return Success:true, Fail:false
 */
static bool resolve_addr(st_hostinfo* hostinfo) {
    struct hostent* tmp_hostent;
    char addr[INET_ADDRSTRLEN];
    char* resolved;

    if((hostinfo == NULL) || (hostinfo->hostname == NULL)) {
        printf("Could not resolve hostaddr. Wrong input parameter.\n");
        return false;
    }

    // get ip address
    tmp_hostent = gethostbyname(hostinfo->hostname);
    if((tmp_hostent == NULL)
            || (tmp_hostent->h_addrtype != AF_INET)     // addr[] only fits an IPv4 string
            || (tmp_hostent->h_addr_list == NULL)
            || (tmp_hostent->h_addr_list[0] == NULL)) {
        printf("Could not get ip address.\n");
        return false;
    }

    // inet_ntop() reports failure through its return value.
    if(inet_ntop(AF_INET, tmp_hostent->h_addr_list[0], addr, sizeof(addr)) == NULL) {
        printf("Could not get ip address.\n");
        return false;
    }

    resolved = strdup(addr);
    if(resolved == NULL) {
        printf("Could not get ip address.\n");
        return false;
    }
    hostinfo->hostaddr = resolved;
    printf("The hostaddr. hostaddr[%s]\n", hostinfo->hostaddr);

    return true;
}
int main(int argc, char** argv) {
int ret; st_hostinfo* hostinfo; if(argc < 2) { printf("Usage:\n"); printf(" %s <fully qualified hostname>\n", argv[0]); return 1; } hostinfo = create_hostinfo(); ret = init_hostinfo(hostinfo, argv[1]); if(ret != true) { destroy_hostinfo(hostinfo); return 0; } printf("Parsing info. uri[%s], protocol[%s], hostname[%s], hostaddr[%s], port[%d], resource[%s]\n", hostinfo->uri, hostinfo->protocol, hostinfo->hostname, hostinfo->hostaddr, hostinfo->port, hostinfo->resource ); destroy_hostinfo(hostinfo); return 0;
} </source>
Run
$ valgrind --tool=memcheck --leak-check=full ./main http://test.com:8080/test1/test2.html
==32580== Memcheck, a memory error detector
==32580== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al.
==32580== Using Valgrind-3.10.1 and LibVEX; rerun with -h for copyright info
==32580== Command: ./main http://test.com:8080/test1/test2.html
==32580==
The resource. protocol[http]
The endpoint. endpoint[test.com:8080]
The resource. resource[/test1/test2.html]
The hostname. hostname[test.com]
The port. port[8080]
Parsing info. uri[http://test.com:8080/test1/test2.html], protocol[http], hostname[test.com], port[8080], resource[/test1/test2.html]
==32580==
==32580== HEAP SUMMARY:
==32580== in use at exit: 0 bytes in 0 blocks
==32580== total heap usage: 14 allocs, 14 frees, 617 bytes allocated
==32580==
==32580== All heap blocks were freed -- no leaks are possible
==32580==
==32580== For counts of detected and suppressed errors, rerun with: -v
==32580== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)