Libpcre

From 탱이의 잡동사니
Jump to navigation Jump to search

Overview

libpcre 내용 정리

Basic

pcre_exec/pcre16_exec

<source lang=c>

#include <pcre.h>

int pcre_exec(const pcre *code, const pcre_extra *extra,

   const char *subject, int length, int startoffset,
   int options, int *ovector, int ovecsize);

int pcre16_exec(const pcre16 *code, const pcre16_extra *extra,

   PCRE_SPTR16 subject, int length, int startoffset,
   int options, int *ovector, int ovecsize);

</source>

  • code : Points to the compiled pattern.
  • extra : Points to an associated pcre[16]_extra structure, or is NULL.
  • subject : Points to the subject string.
  • length : Length of the subject string, in bytes.
  • startoffset : Offset in bytes in the subject at which to start matching.
  • options : Option bits.
  • ovector : Points to a vector of ints for result offsets.
  • ovecsize : Number of elements in the vector (a multiple of 3).

Options

PCRE_ANCHORED          Match only at the first position
PCRE_BSR_ANYCRLF       \R matches only CR, LF, or CRLF
PCRE_BSR_UNICODE       \R matches all Unicode line endings
PCRE_NEWLINE_ANY       Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF   Recognize CR, LF, & CRLF as newline sequences
PCRE_NEWLINE_CR        Recognize CR as the only newline sequence
PCRE_NEWLINE_CRLF      Recognize CRLF as the only newline sequence
PCRE_NEWLINE_LF        Recognize LF as the only newline sequence
PCRE_NOTBOL            Subject string is not the beginning of a line
PCRE_NOTEOL            Subject string is not the end of a line
PCRE_NOTEMPTY          An empty string is not a valid match
PCRE_NOTEMPTY_ATSTART  An empty string at the start of the subject is not a valid match
PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
PCRE_NO_UTF16_CHECK    Do not check the subject for UTF-16 validity (only relevant if PCRE_UTF16 was set at compile time)
PCRE_NO_UTF8_CHECK     Do not check the subject for UTF-8 validity (only relevant if PCRE_UTF8 was set at compile time)
PCRE_PARTIAL           Return PCRE_ERROR_PARTIAL for a partial match if no full matches are found (same as PCRE_PARTIAL_SOFT)
PCRE_PARTIAL_SOFT      Return PCRE_ERROR_PARTIAL for a partial match if no full matches are found
PCRE_PARTIAL_HARD      Return PCRE_ERROR_PARTIAL for a partial match if that is found before a full match

EXTRACTING CAPTURED SUBSTRINGS BY NUMBER

<source lang=c> int pcre_copy_substring(const char *subject, int *ovector,

   int stringcount, int stringnumber, char *buffer,
   int buffersize);

int pcre_get_substring(const char *subject, int *ovector,

   int stringcount, int stringnumber,
   const char **stringptr);

int pcre_get_substring_list(const char *subject,

   int *ovector, int stringcount, const char ***listptr);

</source>

EXTRACTING CAPTURED SUBSTRINGS BY NAME

<source lang=c> int pcre_get_stringnumber(const pcre *code,

   const char *name);

int pcre_copy_named_substring(const pcre *code,

   const char *subject, int *ovector,
   int stringcount, const char *stringname,
   char *buffer, int buffersize);

int pcre_get_named_substring(const pcre *code,

   const char *subject, int *ovector,
   int stringcount, const char *stringname,
   const char **stringptr);

</source>

Example

URI parser

Parses the given URI into its protocol, hostname, port, resource, etc. <source lang=c> // main.c

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <arpa/inet.h>
#include <netdb.h>

#include <pcre.h>


  1. define REGEX_URI "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?" // RFC 3986
  2. define REGEX_HOSTNAME "^(.+):(.*)"
  1. define IDX_PROTOCOL 2
  2. define IDX_ENDPOINT 4
  3. define IDX_RESOURCE 5
  1. define IDX_HOSTNAME 1
  2. define IDX_PORT 2
  1. define DEF_SUBVEC_SIZE 30

// Parsed pieces of one URI. Every string member is either NULL or a heap
// copy (strdup) that destroy_hostinfo() releases.
typedef struct _st_hostinfo {

   char* uri;      // original uri. "http://example.com:80/test1/test2/test3/index.html"
   
   char* protocol; // "http"
   char* endpoint; // "example.com:80"
   char* hostname; // "example.com"
   char* hostaddr; // "127.0.0.1" (textual address written by resolve_addr)
   char* resource; // "/test1/test2/test3/index.html"
   int port;       // 80 (create_hostinfo sets -1 until a port is known)
   

} st_hostinfo;

static bool parse_uri(st_hostinfo* hostinfo); static bool parse_endpoint(st_hostinfo* hostinfo); static bool resolve_addr(st_hostinfo* hostinfo);


/**

* \brief create hostinfo. Initiate all items to NULL
*/

static st_hostinfo* create_hostinfo(void) {

   st_hostinfo* hostinfo;
   
   hostinfo = calloc(sizeof(st_hostinfo), 1);
   
   hostinfo->uri = NULL;
   hostinfo->protocol = NULL;
   hostinfo->hostname = NULL;
   hostinfo->hostaddr = NULL;
   hostinfo->resource = NULL;
   hostinfo->port = -1;
   
   return hostinfo;

}

/**
 * \brief Release a hostinfo together with every string it owns.
 *
 * Accepts NULL, in which case nothing happens.
 */
static void destroy_hostinfo(st_hostinfo* hostinfo) {
    if(hostinfo != NULL) {
        // free(NULL) is a no-op, so members that were never set need no guard.
        free(hostinfo->uri);
        free(hostinfo->protocol);
        free(hostinfo->endpoint);
        free(hostinfo->hostname);
        free(hostinfo->hostaddr);
        free(hostinfo->resource);

        free(hostinfo);
    }
}

/**
 * \brief Fill a hostinfo from the given uri.
 *
 * Stores a copy of the uri, splits it into protocol/endpoint/resource,
 * splits the endpoint into hostname/port and resolves the hostname.
 * When the endpoint carries no usable port, the whole endpoint is taken as
 * the hostname and the port is guessed from the protocol ("http" -> 80,
 * "ftp" -> 21).
 *
 * \param hostinfo Target created by create_hostinfo().
 * \param uri      Uri text to parse; it is copied, not retained.
 * \return Success:true, Fail:false. On failure the hostinfo may be partly
 *         filled; destroy_hostinfo() still releases it.
 */
static bool init_hostinfo(st_hostinfo* hostinfo, const char* uri) {

    if((hostinfo == NULL) || (uri == NULL)) {
        printf("Could not initiate hostinfo. Wrong input parameter.\n");
        return false;
    }

    hostinfo->uri = strdup(uri);
    if(hostinfo->uri == NULL) {
        printf("Could not initiate hostinfo. Out of memory.\n");
        return false;
    }

    // parsing the protocol, endpoint, resource
    if(!parse_uri(hostinfo)) {
        printf("Could not initiate hostinfo.\n");
        return false;
    }

    // parsing the hostname and port
    if(!parse_endpoint(hostinfo)) {
        // No given port number.
        // copy the endpoint to hostname.
        // guess the common port number using protocol.
        if((hostinfo->endpoint == NULL) || (hostinfo->protocol == NULL)) {
            printf("Could not initiate hostinfo.\n");
            return false;
        }

        // parse_endpoint may have stored a hostname before it failed;
        // release it instead of leaking it.
        free(hostinfo->hostname);
        hostinfo->hostname = strdup(hostinfo->endpoint);
        if(hostinfo->hostname == NULL) {
            printf("Could not initiate hostinfo. Out of memory.\n");
            return false;
        }

        if(strcmp(hostinfo->protocol, "http") == 0) {
            hostinfo->port = 80;
        }
        else if(strcmp(hostinfo->protocol, "ftp") == 0) {
            hostinfo->port = 21;
        }
        else {
            printf("Could not get correct port info.\n");
            return false;
        }
    }

    if(!resolve_addr(hostinfo)) {
        printf("Could not initiate hostinfo.\n");
        return false;
    }

    return true;

}

/**
 * \brief Copy one captured group of a pcre_exec() match into a heap string.
 * \param subject Subject string that was matched.
 * \param ovector Offset vector filled by pcre_exec().
 * \param count   Return value of pcre_exec().
 * \param group   Number of the capture group to copy.
 * \return New string the caller must free(), or NULL when the group could
 *         not be extracted or memory ran out.
 */
static char* dup_uri_group(const char* subject, int* ovector, int count, int group) {
    const char* captured = NULL;
    char* copy;

    // The output pointer is only meaningful when the call itself succeeds,
    // so the return code is what has to be checked.
    if(pcre_get_substring(subject, ovector, count, group, &captured) < 0) {
        return NULL;
    }

    copy = strdup(captured);
    pcre_free_substring(captured);
    return copy;
}

/**
 * \brief Parse hostinfo->uri with REGEX_URI.
 *
 * Fills only these members: protocol, endpoint, resource.
 *
 * \return Success:true, Fail:false
 */
static bool parse_uri(st_hostinfo* hostinfo) {

    const char* tmp_err = NULL;
    int offset = 0;
    pcre* re;
    pcre_extra* re_ex;
    int subStrVec[DEF_SUBVEC_SIZE];
    int ret;
    const char* uri;

    if((hostinfo == NULL) || (hostinfo->uri == NULL)) {
        printf("Wrong input parameter.\n");
        return false;
    }
    uri = hostinfo->uri;

    re = pcre_compile(REGEX_URI, 0, &tmp_err, &offset, NULL);
    if(re == NULL) {
        printf("Could not compile the regex. regex=%s, err=%s\n", REGEX_URI, tmp_err);
        return false;
    }

    // Optimize the regex. A NULL result without an error message only means
    // that studying found nothing to add.
    re_ex = pcre_study(re, 0, &tmp_err);
    if(tmp_err != NULL) {
        printf("Could not optimize the regex. regex=%s, err=%s\n", REGEX_URI, tmp_err);
        pcre_free(re);
        return false;
    }

    ret = pcre_exec(re,
        re_ex,
        uri,
        (int)strlen(uri),   // length of uri
        0,                  // Start looking at this point
        0,                  // OPTIONS
        subStrVec,
        DEF_SUBVEC_SIZE     // Length of subStrVec
        );
    pcre_free(re);
    // NOTE(review): PCRE >= 8.20 offers pcre_free_study() for study data;
    // switch to it if JIT studying is ever enabled.
    if(re_ex != NULL) {
        pcre_free(re_ex);
    }
    if(ret < 0) {
        printf("Could not parsed input uri. uri[%s]\n", uri);
        return false;
    }

    // get protocol
    hostinfo->protocol = dup_uri_group(uri, subStrVec, ret, IDX_PROTOCOL);
    if(hostinfo->protocol == NULL) {
        printf("Could not get protocol info.\n");
        return false;
    }
    printf("The resource. protocol[%s]\n", hostinfo->protocol);

    // get endpoint
    hostinfo->endpoint = dup_uri_group(uri, subStrVec, ret, IDX_ENDPOINT);
    if(hostinfo->endpoint == NULL) {
        printf("Could not get endpoint info.\n");
        return false;
    }
    printf("The endpoint. endpoint[%s]\n", hostinfo->endpoint);

    // get resource
    hostinfo->resource = dup_uri_group(uri, subStrVec, ret, IDX_RESOURCE);
    if(hostinfo->resource == NULL) {
        printf("Could not get resource info.\n");
        return false;
    }
    printf("The resource. resource[%s]\n", hostinfo->resource);
    return true;

}

/**
 * \brief Split hostinfo->endpoint ("host:port") with REGEX_HOSTNAME.
 *
 * On success hostname and port are both stored. On failure neither member
 * is modified, so the caller can apply its own fallback.
 * The port must be a decimal number in the range 0..65535.
 *
 * \return Success:true, Fail:false
 */
static bool parse_endpoint(st_hostinfo* hostinfo) {

    enum { MAX_PORT = 65535 };

    pcre* re;
    pcre_extra* re_ex;
    const char* tmp_err = NULL;
    const char* endpoint;
    const char* host_part = NULL;
    const char* port_part = NULL;
    int subStrVec[DEF_SUBVEC_SIZE];
    int offset = 0;
    int ret;
    char* hostname;
    char* end = NULL;
    long port;
    bool port_ok;

    // parameter check.
    if((hostinfo == NULL) || (hostinfo->endpoint == NULL)) {
        printf("Could not set hostname and port info. Wrong input parameter.\n");
        return false;
    }

    re = pcre_compile(REGEX_HOSTNAME, 0, &tmp_err, &offset, NULL);
    if(re == NULL) {
        printf("Could not compile the reges. regex[%s], err[%s]\n", REGEX_HOSTNAME, tmp_err);
        return false;
    }
    re_ex = pcre_study(re, 0, &tmp_err);
    if(tmp_err != NULL) {
        printf("Could not optimize the regex. regex[%s], err[%s]\n", REGEX_HOSTNAME, tmp_err);
        pcre_free(re);
        return false;
    }

    endpoint = hostinfo->endpoint;
    ret = pcre_exec(re,
        re_ex,
        endpoint,
        (int)strlen(endpoint),  // length of endpoint
        0,                      // Start looking at this point
        0,                      // OPTIONS
        subStrVec,
        DEF_SUBVEC_SIZE         // Length of subStrVec
        );
    pcre_free(re);
    // pcre_study may legitimately return NULL when there is nothing to add.
    if(re_ex != NULL) {
        pcre_free(re_ex);
    }
    if(ret < 0) {
        printf("Could not parse endpoint info.\n");
        return false;
    }

    // get hostname
    if(pcre_get_substring(endpoint, subStrVec, ret, IDX_HOSTNAME, &host_part) < 0) {
        printf("Could not get hostname info.\n");
        return false;
    }

    // get port
    if(pcre_get_substring(endpoint, subStrVec, ret, IDX_PORT, &port_part) < 0) {
        printf("Could not get port info.\n");
        pcre_free_substring(host_part);
        return false;
    }

    // strtol instead of atoi so that "", "abc" or "80x" are rejected
    // rather than silently becoming 0 or 80.
    port = strtol(port_part, &end, 10);
    port_ok = (end != port_part) && (*end == '\0') && (port >= 0) && (port <= MAX_PORT);
    pcre_free_substring(port_part);
    if(!port_ok) {
        printf("Could not get port info.\n");
        pcre_free_substring(host_part);
        return false;
    }

    hostname = strdup(host_part);
    pcre_free_substring(host_part);
    if(hostname == NULL) {
        printf("Could not get hostname info.\n");
        return false;
    }

    // Commit only after everything succeeded.
    free(hostinfo->hostname);
    hostinfo->hostname = hostname;
    hostinfo->port = (int)port;

    printf("The hostname. hostname[%s]\n", hostinfo->hostname);
    printf("The port. port[%d]\n", hostinfo->port);
    return true;

}

/**
 * \brief Resolve hostinfo->hostname and store the first IPv4 address, in
 *        text form, in hostinfo->hostaddr.
 *
 * NOTE(review): gethostbyname() is kept from the original; getaddrinfo()
 * would be the modern replacement and could also handle IPv6.
 *
 * \return Success:true, Fail:false
 */
static bool resolve_addr(st_hostinfo* hostinfo) {

    struct hostent* tmp_hostent;
    char addr[INET_ADDRSTRLEN];
    char* hostaddr;

    if((hostinfo == NULL) || (hostinfo->hostname == NULL)) {
        printf("Could not resolve hostaddr. Wrong input parameter.\n");
        return false;
    }

    // get ip address
    tmp_hostent = gethostbyname(hostinfo->hostname);
    if(tmp_hostent == NULL) {
        printf("Could not get ip address.\n");
        return false;
    }

    // addr is sized for IPv4 text only, and the list may be empty.
    if((tmp_hostent->h_addrtype != AF_INET)
        || (tmp_hostent->h_addr_list == NULL)
        || (tmp_hostent->h_addr_list[0] == NULL)) {
        printf("Could not get ip address.\n");
        return false;
    }

    // inet_ntop reports failure through its return value; the buffer itself
    // is an array and can never compare equal to NULL.
    if(inet_ntop(AF_INET, tmp_hostent->h_addr_list[0], addr, sizeof(addr)) == NULL) {
        printf("Could not get ip address.\n");
        return false;
    }

    hostaddr = strdup(addr);
    if(hostaddr == NULL) {
        printf("Could not get ip address.\n");
        return false;
    }

    free(hostinfo->hostaddr);
    hostinfo->hostaddr = hostaddr;
    printf("The hostaddr. hostaddr[%s]\n", hostinfo->hostaddr);
    return true;

}

/**
 * \brief Parse the uri given as the first argument and print its parts.
 * \return 0 when the uri was parsed and resolved, 1 otherwise.
 */
int main(int argc, char** argv) {

    st_hostinfo* hostinfo;

    if(argc < 2) {
        printf("Usage:\n");
        printf("    %s <fully qualified hostname>\n", argv[0]);
        return 1;
    }

    hostinfo = create_hostinfo();
    if(hostinfo == NULL) {
        printf("Could not create hostinfo.\n");
        return 1;
    }

    if(!init_hostinfo(hostinfo, argv[1])) {
        destroy_hostinfo(hostinfo);
        // Report the failure to the caller instead of exiting with 0.
        return 1;
    }

    printf("Parsing info. uri[%s], protocol[%s], hostname[%s], hostaddr[%s], port[%d], resource[%s]\n",
        hostinfo->uri,
        hostinfo->protocol,
        hostinfo->hostname,
        hostinfo->hostaddr,
        hostinfo->port,
        hostinfo->resource
        );

    destroy_hostinfo(hostinfo);

    return 0;

} </source>

Run

$ valgrind --tool=memcheck --leak-check=full ./main http://test.com:8080/test1/test2.html
==32580== Memcheck, a memory error detector
==32580== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al.
==32580== Using Valgrind-3.10.1 and LibVEX; rerun with -h for copyright info
==32580== Command: ./main http://test.com:8080/test1/test2.html
==32580== 
The resource. protocol[http]
The endpoint. endpoint[test.com:8080]
The resource. resource[/test1/test2.html]
The hostname. hostname[test.com]
The port. port[8080]
Parsing info. uri[http://test.com:8080/test1/test2.html], protocol[http], hostname[test.com], port[8080], resource[/test1/test2.html]
==32580== 
==32580== HEAP SUMMARY:
==32580==     in use at exit: 0 bytes in 0 blocks
==32580==   total heap usage: 14 allocs, 14 frees, 617 bytes allocated
==32580== 
==32580== All heap blocks were freed -- no leaks are possible
==32580== 
==32580== For counts of detected and suppressed errors, rerun with: -v
==32580== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)

hostname parser

<source lang=c>

#include <stdio.h>
#include <string.h>
#include <pcre.h>
  1. define DEF_TEST_REGEX "(.+):(.+)"
  1. define DEF_MATCHES_SIZE 10

/**
 * \brief Match the first argument against DEF_TEST_REGEX and print the
 *        whole match followed by both captured groups.
 * \return 0 on success, 1 on usage error or when matching/copying fails.
 */
int main(int argc, char** argv) {

    enum { LAST_GROUP = 2 };    // DEF_TEST_REGEX has two capture groups

    pcre* re;
    int re_res;
    const char* pcreErrorStr = NULL;
    int pcreErrorOffset = 0;
    int matches[DEF_MATCHES_SIZE];
    char tmp[100];
    int group;
    int status = 0;

    if(argc < 2) {
        printf("Usage:\n");
        printf("    %s <test text>\n", argv[0]);
        return 1;
    }

    printf("regex info. string[%s]\n", argv[1]);

    re = pcre_compile(DEF_TEST_REGEX, 0, &pcreErrorStr, &pcreErrorOffset, NULL);
    if(re == NULL) {
        printf("Could not compile the regex. regex[%s]\n", DEF_TEST_REGEX);
        return 1;
    }

    re_res = pcre_exec(re, NULL, argv[1], (int)strlen(argv[1]), 0, 0, matches, DEF_MATCHES_SIZE);
    if(re_res < 0) {
        printf("Could not parse correctly.\n");
        status = 1;
    }

    // Group 0 is the whole match, 1..LAST_GROUP are the captures.
    for(group = 0; (status == 0) && (group <= LAST_GROUP); group++) {
        // On failure (e.g. the text does not fit into tmp) the buffer is
        // not filled in, so it must not be printed.
        if(pcre_copy_substring(argv[1], matches, re_res, group, tmp, (int)sizeof(tmp)) < 0) {
            printf("Could not copy the substring. group[%d]\n", group);
            status = 1;
            break;
        }
        printf("Result. tmp[%s]\n", tmp);
    }

    // Single release point, reached on success and on failure alike.
    pcre_free(re);
    return status;

} </source>

Run

$ valgrind --tool=memcheck --leak-check=full ./main test.com:80
==10874== Memcheck, a memory error detector
==10874== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al.
==10874== Using Valgrind-3.10.1 and LibVEX; rerun with -h for copyright info
==10874== Command: ./main test.com:80
==10874== 
regex info. string[test.com:80]
Result. tmp[test.com:80]
Result. tmp[test.com]
Result. tmp[80]
==10874== 
==10874== HEAP SUMMARY:
==10874==     in use at exit: 0 bytes in 0 blocks
==10874==   total heap usage: 1 allocs, 1 frees, 77 bytes allocated
==10874== 
==10874== All heap blocks were freed -- no leaks are possible
==10874== 
==10874== For counts of detected and suppressed errors, rerun with: -v
==10874== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)

See also