Libpcre: Difference between revisions

From 탱이의 잡동사니
Jump to navigation Jump to search
Line 69: Line 69:
<source lang=c>
<source lang=c>
// main.c
// main.c
#include <stdio.h>
#include <stdio.h>
#include <pcre.h>
#include <pcre.h>
#include <string.h>
#include <string.h>
#include <stdbool.h>
#include <stdbool.h>
#include <arpa/inet.h>
#include <netdb.h>


#define REGEX_URI "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?" // RFC 3986
#define REGEX_URI "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?" // RFC 3986
Line 83: Line 87:
#define IDX_HOSTNAME 1
#define IDX_HOSTNAME 1
#define IDX_PORT    2
#define IDX_PORT    2
#define DEF_SUBVEC_SIZE 30


typedef struct _st_hostinfo {     
typedef struct _st_hostinfo {     
     char* uri; // http://example.com:80/test1/test2/test3/index.html   // original URI
     char* uri;     // original uri. "http://example.com:80/test1/test2/test3/index.html"
      
      
     char* protocol; // http
     char* protocol; // "http"
     char* endpoint; // exmaple.com:80
     char* endpoint; // "exmaple.com:80"
     char* hostname; // example.com
     char* hostname; // "example.com"
     char* resource; // /test1/test2/test3/index.html
    char* hostaddr; // "127.0.0.1"
     int port;   // 80
     char* resource; // "/test1/test2/test3/index.html"
     int port;       // 80
      
      
} st_hostinfo;
} st_hostinfo;
Line 97: Line 104:
static bool parse_uri(st_hostinfo* hostinfo);
static bool parse_uri(st_hostinfo* hostinfo);
static bool parse_endpoint(st_hostinfo* hostinfo);
static bool parse_endpoint(st_hostinfo* hostinfo);
static bool resolve_addr(st_hostinfo* hostinfo);


/**
/**
Line 109: Line 118:
     hostinfo->protocol = NULL;
     hostinfo->protocol = NULL;
     hostinfo->hostname = NULL;
     hostinfo->hostname = NULL;
    hostinfo->hostaddr = NULL;
     hostinfo->resource = NULL;
     hostinfo->resource = NULL;
     hostinfo->port = -1;
     hostinfo->port = -1;
Line 138: Line 148:
     if(hostinfo->hostname != NULL) {
     if(hostinfo->hostname != NULL) {
         free(hostinfo->hostname);
         free(hostinfo->hostname);
    }
   
    if(hostinfo->hostaddr != NULL) {
        free(hostinfo->hostaddr);
     }
     }
      
      
Line 156: Line 170:
     int ret;
     int ret;
          
          
     if(hostinfo->uri != NULL) {
     if((hostinfo == NULL) || (uri == NULL)) {
         free(hostinfo->uri);
         printf("Could not initiate hostinfo. Wrong input parameter.\n");
        return false;
     }
     }
      
      
     hostinfo->uri = strdup(uri);
     hostinfo->uri = strdup(uri);
      
      
    // parsing the protocol, endpoint, resource
     ret = parse_uri(hostinfo);
     ret = parse_uri(hostinfo);
     if(ret == false) {
     if(ret == false) {
Line 168: Line 184:
     }
     }
      
      
    // parsing the hostname and port
     ret = parse_endpoint(hostinfo);     
     ret = parse_endpoint(hostinfo);     
     if(ret == false) {
     if(ret == false) {
        // No given port number.
         // copy the endpoint to hostname.
         // copy the endpoint to hostname.
        // guess the common port number using protocol.
         hostinfo->hostname = strdup(hostinfo->endpoint);
         hostinfo->hostname = strdup(hostinfo->endpoint);
          
          
Line 183: Line 202:
             return false;
             return false;
         }
         }
    }
   
    ret = resolve_addr(hostinfo);
    if(ret == false) {
        printf("Could not initiate hostinfo.\n");
        return false;
     }
     }
      
      
Line 199: Line 224:
     pcre* re;
     pcre* re;
     pcre_extra* re_ex;
     pcre_extra* re_ex;
     int subStrVec[30];
     int subStrVec[DEF_SUBVEC_SIZE];
     int ret;
     int ret;
     const char* tmp_const;
     const char* tmp_const;
Line 231: Line 256:
         0,                  // OPTIONS
         0,                  // OPTIONS
         subStrVec,
         subStrVec,
         sizeof(subStrVec)  // Length of subStrVec
         DEF_SUBVEC_SIZE    // Length of subStrVec
         );
         );
     pcre_free(re);
     pcre_free(re);
Line 262: Line 287:
     // get resource
     // get resource
     pcre_get_substring(uri, subStrVec, ret, IDX_RESOURCE, &tmp_const);
     pcre_get_substring(uri, subStrVec, ret, IDX_RESOURCE, &tmp_const);
        if(tmp_const == NULL) {
    if(tmp_const == NULL) {
         printf("Could not get hostname info.\n");
         printf("Could not get hostname info.\n");
         return false;
         return false;
Line 280: Line 305:
     const char* endpoint;
     const char* endpoint;
     const char* tmp_const;
     const char* tmp_const;
     int subStrVec[30];
     int subStrVec[DEF_SUBVEC_SIZE];
     int offset;
     int offset;
     int ret;
     int ret;
Line 313: Line 338:
         0,                  // OPTIONS
         0,                  // OPTIONS
         subStrVec,
         subStrVec,
         sizeof(subStrVec)  // Length of subStrVec
         DEF_SUBVEC_SIZE    // Length of subStrVec
         );
         );
     pcre_free(re);
     pcre_free(re);
Line 341: Line 366:
     printf("The port. port[%d]\n", hostinfo->port);
     printf("The port. port[%d]\n", hostinfo->port);
     pcre_free_substring(tmp_const);
     pcre_free_substring(tmp_const);
    return true;
}
/**
* \brief resolve the hostname to ip address.
* \return Success:true, Fail:false
*/
static bool resolve_addr(st_hostinfo* hostinfo)
{
    struct hostent* tmp_hostent;
    char addr[INET_ADDRSTRLEN];
    char** addr_ptr;
   
    if((hostinfo == NULL) || (hostinfo->hostname == NULL)) {
        printf("Could not resolve hostaddr. Wrong input parameter.\n");
        return false;
    }
    // get ip address
    tmp_hostent = gethostbyname(hostinfo->hostname);
    if(tmp_hostent == NULL) {
        printf("Could not get ip address.\n");
        return false;
    }
   
    addr_ptr = tmp_hostent->h_addr_list;
    inet_ntop(AF_INET,(void *)*addr_ptr, addr, sizeof(addr));
    if(addr == NULL) {
        printf("Could not get ip address.\n");
        return false;
    }
   
    hostinfo->hostaddr = strdup(addr);
    printf("The hostaddr. hostaddr[%s]\n", hostinfo->hostaddr);


     return true;
     return true;
Line 363: Line 423:
     }
     }
      
      
     printf("Parsing info. uri[%s], protocol[%s], hostname[%s], port[%d], resource[%s]\n",
     printf("Parsing info. uri[%s], protocol[%s], hostname[%s], hostaddr[%s], port[%d], resource[%s]\n",
         hostinfo->uri,
         hostinfo->uri,
         hostinfo->protocol,
         hostinfo->protocol,
         hostinfo->hostname,
         hostinfo->hostname,
        hostinfo->hostaddr,
         hostinfo->port,
         hostinfo->port,
         hostinfo->resource
         hostinfo->resource

Revision as of 08:52, 24 August 2016

Overview

libpcre 내용 정리

Basic

pcre_exec/pcre16_exec

<source lang=c>

#include <pcre.h>

int pcre_exec(const pcre *code, const pcre_extra *extra,

   const char *subject, int length, int startoffset,
   int options, int *ovector, int ovecsize);

int pcre16_exec(const pcre16 *code, const pcre16_extra *extra,

   PCRE_SPTR16 subject, int length, int startoffset,
   int options, int *ovector, int ovecsize);

</source>

  • code : Points to the compiled pattern.
  • extra : Points to an associated pcre[16]_extra structure, or is NULL.
  • subject : Points to the subject string.
  • length : Length of the subject string, in bytes.
  • startoffset : Offset in bytes in the subject at which to start matching.
  • options : Option bits.
  • ovector : Points to a vector of ints for result offsets.
  • ovecsize : Number of elements in the vector (a multiple of 3).

Options

PCRE_ANCHORED          Match only at the first position
PCRE_BSR_ANYCRLF       \R matches only CR, LF, or CRLF
PCRE_BSR_UNICODE       \R matches all Unicode line endings
PCRE_NEWLINE_ANY       Recognize any Unicode newline sequence
PCRE_NEWLINE_ANYCRLF   Recognize CR, LF, & CRLF as newline sequences
PCRE_NEWLINE_CR        Recognize CR as the only newline sequence
PCRE_NEWLINE_CRLF      Recognize CRLF as the only newline sequence
PCRE_NEWLINE_LF        Recognize LF as the only newline sequence
PCRE_NOTBOL            Subject string is not the beginning of a line
PCRE_NOTEOL            Subject string is not the end of a line
PCRE_NOTEMPTY          An empty string is not a valid match
PCRE_NOTEMPTY_ATSTART  An empty string at the start of the subject is not a valid match
PCRE_NO_START_OPTIMIZE Do not do "start-match" optimizations
PCRE_NO_UTF16_CHECK    Do not check the subject for UTF-16 validity (only relevant if PCRE_UTF16 was set at compile time)
PCRE_NO_UTF8_CHECK     Do not check the subject for UTF-8 validity (only relevant if PCRE_UTF8 was set at compile time)
PCRE_PARTIAL           Return PCRE_ERROR_PARTIAL for a partial match if no full matches are found
PCRE_PARTIAL_SOFT      Same as PCRE_PARTIAL (the two names are synonyms)
PCRE_PARTIAL_HARD      Return PCRE_ERROR_PARTIAL for a partial match if that is found before a full match

pcre_get_substring/pcre16_get_substring

<source lang=c>

#include <pcre.h>

int pcre_get_substring(const char *subject, int *ovector,

   int stringcount, int stringnumber,
   const char **stringptr);

int pcre16_get_substring(PCRE_SPTR16 subject, int *ovector,

   int stringcount, int stringnumber,
   PCRE_SPTR16 *stringptr);

</source>

  • subject Subject that has been successfully matched
  • ovector Offset vector that pcre[16]_exec() used
  • stringcount Value returned by pcre[16]_exec()
  • stringnumber Number of the required substring
  • stringptr Where to put the string pointer

Example

URI parser

Parses the given URI into its protocol, hostname, port, resource, etc. <source lang=c> // main.c

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <arpa/inet.h>
#include <netdb.h>

#include <pcre.h>


  1. define REGEX_URI "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?" // RFC 3986
  2. define REGEX_HOSTNAME "^(.+):(.*)"
  1. define IDX_PROTOCOL 2
  2. define IDX_ENDPOINT 4
  3. define IDX_RESOURCE 5
  1. define IDX_HOSTNAME 1
  2. define IDX_PORT 2
  1. define DEF_SUBVEC_SIZE 30

// Parsed pieces of one URI plus the resolved address of its host.
// Every char* member is either NULL or a heap string that destroy_hostinfo()
// releases with free(); create_hostinfo() starts port at -1.
typedef struct _st_hostinfo {

   char* uri;      // original uri. "http://example.com:80/test1/test2/test3/index.html"
   
   char* protocol; // scheme part of the uri. "http"
   char* endpoint; // authority part of the uri (host plus optional port). "example.com:80"
   char* hostname; // host part of the endpoint. "example.com"
   char* hostaddr; // textual IPv4 address the hostname resolved to. "127.0.0.1"
   char* resource; // path part of the uri. "/test1/test2/test3/index.html"
   int port;       // port number, given in the uri or guessed from the protocol. 80
   

} st_hostinfo;

// Forward declarations of the steps that init_hostinfo() runs in this order.
static bool parse_uri(st_hostinfo* hostinfo);
static bool parse_endpoint(st_hostinfo* hostinfo);
static bool resolve_addr(st_hostinfo* hostinfo);


/**

* \brief create hostinfo. Initiate all items to NULL
*/

static st_hostinfo* create_hostinfo(void) {

   st_hostinfo* hostinfo;
   
   hostinfo = calloc(sizeof(st_hostinfo), 1);
   
   hostinfo->uri = NULL;
   hostinfo->protocol = NULL;
   hostinfo->hostname = NULL;
   hostinfo->hostaddr = NULL;
   hostinfo->resource = NULL;
   hostinfo->port = -1;
   
   return hostinfo;

}

/**
 * \brief Destroy the hostinfo: free every string it owns, then the object.
 *
 * Passing NULL is allowed and does nothing.
 */
static void destroy_hostinfo(st_hostinfo* hostinfo)
{
    if(hostinfo != NULL) {
        // Same release order as the member-by-member version; free(NULL) is
        // a no-op, so unset members need no separate test.
        char* owned[] = {
            hostinfo->uri,
            hostinfo->protocol,
            hostinfo->endpoint,
            hostinfo->hostname,
            hostinfo->hostaddr,
            hostinfo->resource
        };
        size_t idx;

        for(idx = 0; idx < sizeof(owned) / sizeof(owned[0]); idx++) {
            free(owned[idx]);
        }

        free(hostinfo);
    }
}

/**
 * \brief Initiate hostinfo: parse the given uri and resolve its host.
 *
 * Steps: copy the uri, split it into protocol/endpoint/resource, split the
 * endpoint into hostname/port (guessing the port from the protocol when the
 * endpoint carries none), then resolve the hostname to an address.
 *
 * \param hostinfo Object to fill. Its string members must be NULL or
 *                 heap-allocated (as after create_hostinfo() or an earlier
 *                 init_hostinfo()), because they are released here.
 * \param uri      URI to parse, e.g. "http://example.com:80/index.html".
 * \return Success:true, Fail:false. On failure the object may be partially
 *         filled; destroy_hostinfo() still releases it correctly.
 */
static bool init_hostinfo(st_hostinfo* hostinfo, const char* uri)
{
    bool ret;
    char* uri_copy;
    size_t i;

    if((hostinfo == NULL) || (uri == NULL)) {
        printf("Could not initiate hostinfo. Wrong input parameter.\n");
        return false;
    }

    // Copy before releasing anything: uri may be the string this object
    // already holds.
    uri_copy = strdup(uri);
    if(uri_copy == NULL) {
        printf("Could not initiate hostinfo. Out of memory.\n");
        return false;
    }

    // Release the results of an earlier call so that initiating the same
    // object twice does not leak them.
    char** owned[] = {
        &hostinfo->uri,
        &hostinfo->protocol,
        &hostinfo->endpoint,
        &hostinfo->hostname,
        &hostinfo->hostaddr,
        &hostinfo->resource
    };
    for(i = 0; i < sizeof(owned) / sizeof(owned[0]); i++) {
        free(*owned[i]);
        *owned[i] = NULL;
    }
    hostinfo->port = -1;

    hostinfo->uri = uri_copy;

    // parsing the protocol, endpoint, resource
    ret = parse_uri(hostinfo);
    if(ret == false) {
        printf("Could not initiate hostinfo.\n");
        return false;
    }

    // The fallback below reads both members, so make sure they were stored.
    if((hostinfo->protocol == NULL) || (hostinfo->endpoint == NULL)) {
        printf("Could not initiate hostinfo.\n");
        return false;
    }

    // parsing the hostname and port
    ret = parse_endpoint(hostinfo);
    if(ret == false) {
        // No given port number.
        // copy the endpoint to hostname.
        // guess the common port number using protocol.

        // parse_endpoint() may have stored a hostname before it failed.
        free(hostinfo->hostname);
        hostinfo->hostname = strdup(hostinfo->endpoint);
        if(hostinfo->hostname == NULL) {
            printf("Could not initiate hostinfo. Out of memory.\n");
            return false;
        }

        if(strcmp(hostinfo->protocol, "http") == 0) {
            hostinfo->port = 80;
        }
        else if(strcmp(hostinfo->protocol, "ftp") == 0) {
            hostinfo->port = 21;
        }
        else {
            printf("Could not get correct port info.\n");
            return false;
        }
    }

    ret = resolve_addr(hostinfo);
    if(ret == false) {
        printf("Could not initiate hostinfo.\n");
        return false;
    }

    return true;
}

/**
 * \brief Copy one captured group of a successful pcre_exec() match.
 *
 * \param subject Subject string that was matched.
 * \param ovector Offset vector filled by pcre_exec().
 * \param count   Value returned by pcre_exec().
 * \param group   Number of the wanted capture group.
 * \return A heap copy the caller must free(), or NULL on failure.
 */
static char* dup_uri_group(const char* subject, int* ovector, int count, int group)
{
    const char* captured = NULL;
    char* copy;

    // On error pcre_get_substring() does not set the output pointer, so the
    // return code is the only reliable failure indicator.
    if(pcre_get_substring(subject, ovector, count, group, &captured) < 0) {
        return NULL;
    }

    copy = strdup(captured);
    pcre_free_substring(captured);

    return copy;
}

/**
 * \brief Parse uri info from the given hostinfo.\n
 *  It parses only the items below out of hostinfo->uri:
 *  protocol, endpoint, resource
 *
 * \param hostinfo Object whose uri member is already set.
 * \return Success:true, Fail:false
 */
static bool parse_uri(st_hostinfo* hostinfo)
{
    const char* tmp_err;
    int offset;
    pcre* re;
    pcre_extra* re_ex;
    int subStrVec[DEF_SUBVEC_SIZE];
    int ret;
    const char* uri;

    if((hostinfo == NULL) || (hostinfo->uri == NULL)) {
        printf("Wrong input parameter.\n");
        return false;
    }
    uri = hostinfo->uri;

    re = pcre_compile(REGEX_URI, 0, &tmp_err, &offset, NULL);
    if(re == NULL) {
        printf("Could not compile the regex. regex=%s, err=%s\n", REGEX_URI, tmp_err);
        return false;
    }

    // Optimize the regex. A NULL result with no error text only means that
    // studying found nothing useful; pcre_exec() accepts NULL study data.
    re_ex = pcre_study(re, 0, &tmp_err);
    if(tmp_err != NULL) {
        printf("Could not optimize the regex. regex=%s, err=%s\n", REGEX_URI, tmp_err);
        pcre_free(re);
        return false;
    }

    ret = pcre_exec(re,
        re_ex,
        uri,
        (int)strlen(uri),   // length of uri
        0,                  // Start looking at this point
        0,                  // OPTIONS
        subStrVec,
        DEF_SUBVEC_SIZE     // Length of subStrVec
        );
    pcre_free(re);
    if(re_ex != NULL) {
        // Study data has its own release function since PCRE 8.20.
        pcre_free_study(re_ex);
    }
    if(ret < 0) {
        printf("Could not parsed input uri. uri[%s]\n", uri);
        return false;
    }

    // get protocol
    hostinfo->protocol = dup_uri_group(uri, subStrVec, ret, IDX_PROTOCOL);
    if(hostinfo->protocol == NULL) {
        printf("Could not get protocol info.\n");
        return false;
    }
    printf("The resource. protocol[%s]\n", hostinfo->protocol);

    // get endpoint
    hostinfo->endpoint = dup_uri_group(uri, subStrVec, ret, IDX_ENDPOINT);
    if(hostinfo->endpoint == NULL) {
        printf("Could not get endpoint info.\n");
        return false;
    }
    printf("The endpoint. endpoint[%s]\n", hostinfo->endpoint);

    // get resource
    hostinfo->resource = dup_uri_group(uri, subStrVec, ret, IDX_RESOURCE);
    if(hostinfo->resource == NULL) {
        printf("Could not get resource info.\n");
        return false;
    }
    printf("The resource. resource[%s]\n", hostinfo->resource);

    return true;
}

/**
 * \brief Split hostinfo->endpoint ("host:port") into hostname and port.
 *
 * hostname and port are stored only when both parts are valid, so a failed
 * call leaves them untouched. The port must be a decimal number in the
 * range 1-65535.
 *
 * \param hostinfo Object whose endpoint member is already set.
 * \return Success:true, Fail:false (including an endpoint without ":port").
 */
static bool parse_endpoint(st_hostinfo* hostinfo)
{
    pcre* re;
    pcre_extra* re_ex;
    const char* tmp_err;
    const char* endpoint;
    const char* host_str = NULL;
    const char* port_str = NULL;
    char* port_end;
    char* hostname;
    long port;
    int subStrVec[DEF_SUBVEC_SIZE];
    int offset;
    int ret;

    // parameter check.
    if((hostinfo == NULL) || (hostinfo->endpoint == NULL)) {
        printf("Could not set hostname and port info. Wrong input parameter.\n");
        return false;
    }

    re = pcre_compile(REGEX_HOSTNAME, 0, &tmp_err, &offset, NULL);
    if(re == NULL) {
        printf("Could not compile the reges. regex[%s], err[%s]\n", REGEX_HOSTNAME, tmp_err);
        return false;
    }
    re_ex = pcre_study(re, 0, &tmp_err);
    if(tmp_err != NULL) {
        printf("Could not optimize the regex. regex[%s], err[%s]\n", REGEX_HOSTNAME, tmp_err);
        pcre_free(re);
        return false;
    }

    endpoint = hostinfo->endpoint;
    ret = pcre_exec(re,
        re_ex,
        endpoint,
        (int)strlen(endpoint), // length of endpoint
        0,                  // Start looking at this point
        0,                  // OPTIONS
        subStrVec,
        DEF_SUBVEC_SIZE     // Length of subStrVec
        );
    pcre_free(re);
    if(re_ex != NULL) {
        // Study data has its own release function since PCRE 8.20.
        pcre_free_study(re_ex);
    }
    if(ret < 0) {
        printf("Could not parse endpoint info.\n");
        return false;
    }

    // get hostname
    // On error pcre_get_substring() does not set the output pointer, so the
    // return code has to be checked instead of the pointer.
    if(pcre_get_substring(endpoint, subStrVec, ret, IDX_HOSTNAME, &host_str) < 0) {
        printf("Could not get hostname info.\n");
        return false;
    }

    // get port
    if(pcre_get_substring(endpoint, subStrVec, ret, IDX_PORT, &port_str) < 0) {
        printf("Could not get port info.\n");
        pcre_free_substring(host_str);
        return false;
    }

    // strtol() reports where parsing stopped, which atoi() cannot do. The
    // leading-digit test rejects the sign and whitespace strtol() would skip.
    port = strtol(port_str, &port_end, 10);
    if((port_str[0] < '0') || (port_str[0] > '9')
        || (*port_end != '\0')
        || (port < 1) || (port > 65535)) {
        printf("Could not get port info.\n");
        pcre_free_substring(host_str);
        pcre_free_substring(port_str);
        return false;
    }

    hostname = strdup(host_str);
    pcre_free_substring(host_str);
    pcre_free_substring(port_str);
    if(hostname == NULL) {
        printf("Could not get hostname info.\n");
        return false;
    }

    hostinfo->hostname = hostname;
    printf("The hostname. hostname[%s]\n", hostinfo->hostname);

    hostinfo->port = (int)port;
    printf("The port. port[%d]\n", hostinfo->port);

    return true;
}

/**
 * \brief Resolve the hostname to an ip address.
 *
 * Looks up hostinfo->hostname and stores the first returned IPv4 address,
 * in dotted text form, in hostinfo->hostaddr.
 *
 * \param hostinfo Object whose hostname member is already set.
 * \return Success:true, Fail:false
 */
static bool resolve_addr(st_hostinfo* hostinfo)
{
    struct hostent* tmp_hostent;
    char addr[INET_ADDRSTRLEN];
    char* addr_copy;

    if((hostinfo == NULL) || (hostinfo->hostname == NULL)) {
        printf("Could not resolve hostaddr. Wrong input parameter.\n");
        return false;
    }

    // get ip address
    // NOTE(review): gethostbyname() is marked obsolete by POSIX; getaddrinfo()
    // is the replacement if IPv6 support is ever wanted.
    tmp_hostent = gethostbyname(hostinfo->hostname);
    if(tmp_hostent == NULL) {
        printf("Could not get ip address.\n");
        return false;
    }

    // addr is sized for IPv4 text only, and an answer may carry no address.
    if((tmp_hostent->h_addrtype != AF_INET)
        || (tmp_hostent->h_addr_list == NULL)
        || (tmp_hostent->h_addr_list[0] == NULL)) {
        printf("Could not get ip address.\n");
        return false;
    }

    // inet_ntop() signals failure through its return value; addr is an array
    // and can never compare equal to NULL.
    if(inet_ntop(AF_INET, tmp_hostent->h_addr_list[0], addr, sizeof(addr)) == NULL) {
        printf("Could not get ip address.\n");
        return false;
    }

    addr_copy = strdup(addr);
    if(addr_copy == NULL) {
        printf("Could not get ip address.\n");
        return false;
    }

    hostinfo->hostaddr = addr_copy;
    printf("The hostaddr. hostaddr[%s]\n", hostinfo->hostaddr);

    return true;
}

/**
 * \brief Parse the uri given as the first argument and print its parts.
 * \return 0 when the uri was parsed and resolved, 1 otherwise.
 */
int main(int argc, char** argv)
{
    int ret;
    st_hostinfo* hostinfo;

    if(argc < 2) {
        printf("Usage:\n");
        // argv[0] is not available when the program is started with argc 0.
        printf("    %s <fully qualified hostname>\n", (argc > 0) ? argv[0] : "main");
        return 1;
    }

    // A failed creation yields NULL, which init_hostinfo() rejects and
    // destroy_hostinfo() ignores.
    hostinfo = create_hostinfo();
    ret = init_hostinfo(hostinfo, argv[1]);
    if(ret != true) {
        destroy_hostinfo(hostinfo);
        // Report the failure through the exit status as well.
        return 1;
    }

    printf("Parsing info. uri[%s], protocol[%s], hostname[%s], hostaddr[%s], port[%d], resource[%s]\n",
        hostinfo->uri,
        hostinfo->protocol,
        hostinfo->hostname,
        hostinfo->hostaddr,
        hostinfo->port,
        hostinfo->resource
        );

    destroy_hostinfo(hostinfo);

    return 0;

} </source>

Run

$ valgrind --tool=memcheck --leak-check=full ./main http://test.com:8080/test1/test2.html
==32580== Memcheck, a memory error detector
==32580== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al.
==32580== Using Valgrind-3.10.1 and LibVEX; rerun with -h for copyright info
==32580== Command: ./main http://test.com:8080/test1/test2.html
==32580== 
The resource. protocol[http]
The endpoint. endpoint[test.com:8080]
The resource. resource[/test1/test2.html]
The hostname. hostname[test.com]
The port. port[8080]
Parsing info. uri[http://test.com:8080/test1/test2.html], protocol[http], hostname[test.com], port[8080], resource[/test1/test2.html]
==32580== 
==32580== HEAP SUMMARY:
==32580==     in use at exit: 0 bytes in 0 blocks
==32580==   total heap usage: 14 allocs, 14 frees, 617 bytes allocated
==32580== 
==32580== All heap blocks were freed -- no leaks are possible
==32580== 
==32580== For counts of detected and suppressed errors, rerun with: -v
==32580== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)

See also