Upload
andrew-henderson
View
244
Download
0
Embed Size (px)
Citation preview
Applying patternApplying pattern
Regular Expression on CRegular Expression on C
작 성 자 : 김선영
메 일 : [email protected]
버 전 : 1.04
Copyright by SunYoung Kim
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
PrefacePreface
Regular Expression ( 정규표현식 ) 의 약칭 REGEX
string pattern 은 문자열의 조합되는 규칙
meta character 는 다른 의미를 수식하는 문자
Pro*C 는 Oracle 사의 상표입니다 .
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
Why needs?Why needs? short term transaction
DB 내부에서 데이터에 변경을 가하면 , 트랜잭션에 소모되는 시간이 길어진다 .
대용량 데이터를 다루는 경우라면 bottle-neck 이 될 가능성이 커진다 .
conclusion
되도록이면 데이터 가공은 프로시저보다 외부에서 하는 것이 좋다 .
통계수치상으로 볼때 데이터의 90% 이상은 텍스트 데이터이다 .
텍스트 처리는 정규표현식을 사용하는 것이 확장성에서 유용하다 .
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
REGEX on CREGEX on C POSIX style
표준화 작업의 산물
호환성이 높음
직관적이며 다른 언어와 API 가 비슷함
BSD style
BSD 초기 방식의 API (old fashion)
PCRE(Perl Compatible Regular Expression)
perl 확장 포함
perl 관련 라이브러리 필요
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
APIAPI pattern buffer : regex_t
패턴을 테스트하고 사용하기 위한 공간
regcomp(3)
정규표현식을 패턴 버퍼로 컴파일 ( 메모리 할당과정 발생 )
regerror(3)
패턴버퍼 컴파일 에러 보고
regexec(3)
패턴에 문자열 적용
regfree(3)
패턴버퍼 해제 ( 메모리 할당 해제 )
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
API: regcompAPI: regcomp int regcomp(regex_t *preg, const char *regex, int cflags)
regex_t *preg : 패턴버퍼
const char *regex : 컴파일할 패턴 문자열
int cflags : 컴파일 플래그
» REG_EXTENDED : POSIX 확장 정규표현식 사용
» REG_ICASE : 대소문자 무시
» REG_NOSUB : 서브스트링을 무시
» REG_NEWLINE: . ( 임의 문자 ) 와 [^...] 가 New line 과는 매칭하지 않으며 , ^ 와 $ 가 각 라인의 시작과 끝에 매칭됨 ( 라인단위 매칭 )
return value : 0 ( 성공 ), 이외의 값 ( 실패 )
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
API: regerrorAPI: regerror size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
int errcode : regcomp() 가 에러발생시 반환값
regex_t *preg : 패턴버퍼
char *errbuf : 에러를 출력해줄 버퍼
size_t errbuf_size : errbuf 인수의 크기 (Byte)
return value : errbuf 에 출력한 에러메시지의 크기
if ((ret = regcomp(&re_expr, p_regex_str, REG_EXTENDED|REG_NEWLINE))){
regerror(ret, &re_expr, errbuf, sizeof(errbuf));
printf("Error regcomp() : %s\n", errbuf);
/* 에러처리 */
}
if ((ret = regcomp(&re_expr, p_regex_str, REG_EXTENDED|REG_NEWLINE))){
regerror(ret, &re_expr, errbuf, sizeof(errbuf));
printf("Error regcomp() : %s\n", errbuf);
/* 에러처리 */
}
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
API: regexecAPI: regexec int regexec(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
regex_t *preg : 패턴버퍼
char *string : 패턴 매칭할 대상 문자열
size_t nmatch : 매칭테이블 pmatch 배열의 개수
regmatch_t pmatch[] : 패턴 매칭 결과의 offset 을 저장해줄 매칭 테이블
eflags
» REG_NOTBOL: (not beginning-of-line) 라인 시작 패턴인 ^ 을 사용못함
» REG_NOTEOL: (not end-of-line) 라인 마지막 패턴인 $ 를 사용못함
return value : 0 ( 성공 ), 이외의 값 ( 실패 )
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
API: regexecAPI: regexec regmatch_t 타입의 형태 : <regex.h> 에 선언됨
/* Match-table entry as declared in <regex.h>; regexec() fills one per (sub)match. */
typedef struct {
regoff_t rm_so; /* offset of the first character of the match; -1 marks an unused entry */
regoff_t rm_eo; /* offset just past the last character, so rm_eo - rm_so is the length */
} regmatch_t;
/* Match-table entry as declared in <regex.h>; regexec() fills one per (sub)match. */
typedef struct {
regoff_t rm_so; /* offset of the first character of the match; -1 marks an unused entry */
regoff_t rm_eo; /* offset just past the last character, so rm_eo - rm_so is the length */
} regmatch_t;
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
API: regfreeAPI: regfree void regfree(regex_t *preg)
regex_t *preg : 패턴버퍼
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
Example of REGEXExample of REGEX Usage : ./posix_regex [dest_string pattern_string]
#define MAX_EXPR_SUB_MATCH 5
#define DEFAULT_REGEX_STR "(</.+>).*<br>"
#define DEFAULT_DEST_STR "<center>align to center</center> \
align to left <br>New Line<br><br><p>"
int main(int argc, char **argv) {
int i, ret;
char *p_regex_str; /* pattern string */
char *p_dest_str; /* destination string to apply pattern */
regex_t re_expr; /* POSIX REGEX pattern buffer */
regmatch_t rm_matchtab[MAX_EXPR_SUB_MATCH]; /* matching table*/
char errbuf[0xff];
#define MAX_EXPR_SUB_MATCH 5
#define DEFAULT_REGEX_STR "(</.+>).*<br>"
#define DEFAULT_DEST_STR "<center>align to center</center> \
align to left <br>New Line<br><br><p>"
int main(int argc, char **argv) {
int i, ret;
char *p_regex_str; /* pattern string */
char *p_dest_str; /* destination string to apply pattern */
regex_t re_expr; /* POSIX REGEX pattern buffer */
regmatch_t rm_matchtab[MAX_EXPR_SUB_MATCH]; /* matching table*/
char errbuf[0xff];
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
Example of REGEX (con't)Example of REGEX (con't)
if (argc != 3) {
printf("Using default string!!\n");
printf("* Dest str : %s\n", DEFAULT_DEST_STR);
printf("* Regex str: %s\n", DEFAULT_REGEX_STR);
p_dest_str = strdup(DEFAULT_DEST_STR);
p_regex_str = strdup(DEFAULT_REGEX_STR);
} else {
p_dest_str = strdup(argv[1]); p_regex_str = strdup(argv[2]);
}
if ((ret = regcomp(&re_expr, p_regex_str, REG_EXTENDED|REG_NEWLINE))){
regerror(ret, &re_expr, errbuf, sizeof(errbuf));
printf("Error regcomp() : %s\n", errbuf);
exit(EXIT_FAILURE);
}
if (argc != 3) {
printf("Using default string!!\n");
printf("* Dest str : %s\n", DEFAULT_DEST_STR);
printf("* Regex str: %s\n", DEFAULT_REGEX_STR);
p_dest_str = strdup(DEFAULT_DEST_STR);
p_regex_str = strdup(DEFAULT_REGEX_STR);
} else {
p_dest_str = strdup(argv[1]); p_regex_str = strdup(argv[2]);
}
if ((ret = regcomp(&re_expr, p_regex_str, REG_EXTENDED|REG_NEWLINE))){
regerror(ret, &re_expr, errbuf, sizeof(errbuf));
printf("Error regcomp() : %s\n", errbuf);
exit(EXIT_FAILURE);
}
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
Example of REGEX (con't)Example of REGEX (con't)
printf("regcomp : %s\n", p_regex_str);
memset(rm_matchtab, 0x00, sizeof(rm_matchtab));
if (regexec(&re_expr, p_dest_str, MAX_EXPR_SUB_MATCH, rm_matchtab, 0)) {
printf("fail to match\n");
} else {
printf("* All Match offset : (%d -> %d), len(%d) : %.*s\n",
rm_matchtab[0].rm_so, rm_matchtab[0].rm_eo,
rm_matchtab[0].rm_eo - rm_matchtab[0].rm_so,
rm_matchtab[0].rm_eo - rm_matchtab[0].rm_so,
&p_dest_str[rm_matchtab[0].rm_so]);
printf("regcomp : %s\n", p_regex_str);
memset(rm_matchtab, 0x00, sizeof(rm_matchtab));
if (regexec(&re_expr, p_dest_str, MAX_EXPR_SUB_MATCH, rm_matchtab, 0)) {
printf("fail to match\n");
} else {
printf("* All Match offset : (%d -> %d), len(%d) : %.*s\n",
rm_matchtab[0].rm_so, rm_matchtab[0].rm_eo,
rm_matchtab[0].rm_eo - rm_matchtab[0].rm_so,
rm_matchtab[0].rm_eo - rm_matchtab[0].rm_so,
&p_dest_str[rm_matchtab[0].rm_so]);
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
Example of REGEX (con't)Example of REGEX (con't)
for (i=1; i<MAX_EXPR_SUB_MATCH; i++) {
if (rm_matchtab[i].rm_so == -1) break;
printf("* Submatch[%d] offset : (%d -> %d), len(%d) : %.*s\n",i,
rm_matchtab[i].rm_so, rm_matchtab[i].rm_eo,
rm_matchtab[i].rm_eo - rm_matchtab[i].rm_so,
rm_matchtab[i].rm_eo - rm_matchtab[i].rm_so,
&p_dest_str[rm_matchtab[i].rm_so]);
} /* end: for */
} /* end: else */
regfree(&re_expr); /* freeing pattern buffer memory */
return 0;
}
for (i=1; i<MAX_EXPR_SUB_MATCH; i++) {
if (rm_matchtab[i].rm_so == -1) break;
printf("* Submatch[%d] offset : (%d -> %d), len(%d) : %.*s\n",i,
rm_matchtab[i].rm_so, rm_matchtab[i].rm_eo,
rm_matchtab[i].rm_eo - rm_matchtab[i].rm_so,
rm_matchtab[i].rm_eo - rm_matchtab[i].rm_so,
&p_dest_str[rm_matchtab[i].rm_so]);
} /* end: for */
} /* end: else */
regfree(&re_expr); /* freeing pattern buffer memory */
return 0;
}
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
Example of REGEX (con't)Example of REGEX (con't) execution
Todo: URL 으로부터 hostname 과 URI 를 분리하는 것을 실습해봅시다
$ ./posix_regex
Using default string!!
* Dest str:<center>align to center</center> align to left<br>New Line<br><br><p>
* Regex str: (</.+>).*<br>
regcomp : (</.+>).*<br>
* All Match offset:(23->66),len(43):</center> align to left<br>New Line<br><br>
* Sub Match offset:(23->62),len(39):</center> align to left<br>New Line<br>
$ ./posix_regex.exe http://news.naver.com/news/read.php "http://([^/]+)(.*)"
regcomp : http://([^/]+)(.*)
* All Match offset : (0 -> 35), len(35) : http://news.naver.com/news/read.php
* Submatch[1] offset : (7 -> 21), len(14) : news.naver.com
* Submatch[2] offset : (21 -> 35), len(14) : /news/read.php
$ ./posix_regex
Using default string!!
* Dest str:<center>align to center</center> align to left<br>New Line<br><br><p>
* Regex str: (</.+>).*<br>
regcomp : (</.+>).*<br>
* All Match offset:(23->66),len(43):</center> align to left<br>New Line<br><br>
* Sub Match offset:(23->62),len(39):</center> align to left<br>New Line<br>
$ ./posix_regex.exe http://news.naver.com/news/read.php "http://([^/]+)(.*)"
regcomp : http://([^/]+)(.*)
* All Match offset : (0 -> 35), len(35) : http://news.naver.com/news/read.php
* Submatch[1] offset : (7 -> 21), len(14) : news.naver.com
* Submatch[2] offset : (21 -> 35), len(14) : /news/read.php
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
apply to Pro*C: raw dataapply to Pro*C: raw data 배치 처리할 데이터
# Raw data seqeunce: [Name] [Age] [Gender] [LOCALE]
--------------- KOR --------------
Korean staff list
YeongHee Lee| 25| Female| Korea
Hoon Kim, 29, Male, Pusan Korea
--------------- USA --------------
Steve, 29, Male, USA
Ken Jacobs, 48, Male,birmingham USA
Dave Roberts, 32, Male, New York USA
--------------- JAP --------------
Rikako, 42, Female, Nagano Japan
Lily, 35, Female,Osaka Japan
--------------- CHI --------------
Xiangping, 32, Male ,HongKong China
Chao Jien, 41, Male|HongKong China
# Raw data seqeunce: [Name] [Age] [Gender] [LOCALE]
--------------- KOR --------------
Korean staff list
YeongHee Lee| 25| Female| Korea
Hoon Kim, 29, Male, Pusan Korea
--------------- USA --------------
Steve, 29, Male, USA
Ken Jacobs, 48, Male,birmingham USA
Dave Roberts, 32, Male, New York USA
--------------- JAP --------------
Rikako, 42, Female, Nagano Japan
Lily, 35, Female,Osaka Japan
--------------- CHI --------------
Xiangping, 32, Male ,HongKong China
Chao Jien, 41, Male|HongKong China
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
apply to Pro*C: schemeapply to Pro*C: scheme DB scheme
WHENEVER SQLERROR CONTINUE;
DROP SEQUENCE SEQ_EMPLIST;
CREATE SEQUENCE SEQ_EMPLIST START WITH 1 INCREMENT BY 1 NOCYCLE NOCACHE;
DROP TABLE EMPLIST CASCADE CONSTRAINTS;
CREATE TABLE EMPLIST (
SNO NUMBER(5),
NAME VARCHAR(30),
GENDER NUMBER(1),
AGE NUMBER(3),
LOCALE VARCHAR(20)
);
WHENEVER SQLERROR CONTINUE;
DROP SEQUENCE SEQ_EMPLIST;
CREATE SEQUENCE SEQ_EMPLIST START WITH 1 INCREMENT BY 1 NOCYCLE NOCACHE;
DROP TABLE EMPLIST CASCADE CONSTRAINTS;
CREATE TABLE EMPLIST (
SNO NUMBER(5),
NAME VARCHAR(30),
GENDER NUMBER(1),
AGE NUMBER(3),
LOCALE VARCHAR(20)
);
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
apply to Pro*Capply to Pro*C pr7_regex.c : (todo) 필터링한 결과를 DB insert 하는것이 목적
예제는 불완전 하므로 기능을 완성하도록 합시다 .
/*
 * One employee row parsed from the raw data file, laid out to mirror the
 * columns of the EMPLIST table.
 */
typedef struct my_record {
    short          sno;          /* SNO column; not assigned by set_record() */
    char           name[30+1];   /* NAME VARCHAR(30) plus terminating NUL */
    short          gender;       /* GENDER: set_record() stores 1 for "Male", 2 otherwise */
    unsigned short age;          /* AGE column */
    char           locale[20+1]; /* LOCALE VARCHAR(20) plus terminating NUL */
} MY_RECORD;
int set_record(MY_RECORD *, const char *, const regmatch_t *);
int insert_rec(const MY_RECORD *s);
#define MAX_EXPR_SUB_MATCH 10
#define DEF_FILENAME "regdata.txt"
#define REGEX_STR "^([a-zA-Z ]+)[|,]([0-9 ]+)[|,]([a-zA-Z ]+)[|,]+([a-zA-Z ]+)"
int main(int argc, char **argv) {
char *p_filename;
regex_t re_expr; /* posix regex patern buffer */
regmatch_t rm_matchtab[MAX_EXPR_SUB_MATCH]; /* pattern matching table */
int i, ret;
FILE *fp;
char errbuf[0xff], buf[0xff];
MY_RECORD a_rec;
/*
 * One employee row parsed from the raw data file, laid out to mirror the
 * columns of the EMPLIST table.
 */
typedef struct my_record {
    short          sno;          /* SNO column; not assigned by set_record() */
    char           name[30+1];   /* NAME VARCHAR(30) plus terminating NUL */
    short          gender;       /* GENDER: set_record() stores 1 for "Male", 2 otherwise */
    unsigned short age;          /* AGE column */
    char           locale[20+1]; /* LOCALE VARCHAR(20) plus terminating NUL */
} MY_RECORD;
int set_record(MY_RECORD *, const char *, const regmatch_t *);
int insert_rec(const MY_RECORD *s);
#define MAX_EXPR_SUB_MATCH 10
#define DEF_FILENAME "regdata.txt"
#define REGEX_STR "^([a-zA-Z ]+)[|,]([0-9 ]+)[|,]([a-zA-Z ]+)[|,]+([a-zA-Z ]+)"
int main(int argc, char **argv) {
char *p_filename;
regex_t re_expr; /* posix regex patern buffer */
regmatch_t rm_matchtab[MAX_EXPR_SUB_MATCH]; /* pattern matching table */
int i, ret;
FILE *fp;
char errbuf[0xff], buf[0xff];
MY_RECORD a_rec;
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
apply to Pro*C (con't)apply to Pro*C (con't) if (argc != 2) {
printf("Using default filename!\n");
p_filename = DEF_FILENAME;
} else {
p_filename = strdup(argv[1]);
}
if ((fp = fopen(p_filename, "r")) == NULL) {
perror("FAIL: fopen");
exit(EXIT_FAILURE);
}
if ((ret = regcomp(&re_expr, REGEX_STR, REG_EXTENDED|REG_NEWLINE))) {
regerror(ret, &re_expr, errbuf, sizeof(errbuf));
printf("Error regcomp() : %s\n", errbuf);
exit(EXIT_FAILURE);
}
printf("regcomp : %s\n", REGEX_STR);
/* ( 연습 ) DB 연결을 만든다 */
if (argc != 2) {
printf("Using default filename!\n");
p_filename = DEF_FILENAME;
} else {
p_filename = strdup(argv[1]);
}
if ((fp = fopen(p_filename, "r")) == NULL) {
perror("FAIL: fopen");
exit(EXIT_FAILURE);
}
if ((ret = regcomp(&re_expr, REGEX_STR, REG_EXTENDED|REG_NEWLINE))) {
regerror(ret, &re_expr, errbuf, sizeof(errbuf));
printf("Error regcomp() : %s\n", errbuf);
exit(EXIT_FAILURE);
}
printf("regcomp : %s\n", REGEX_STR);
/* ( 연습 ) DB 연결을 만든다 */
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
apply to Pro*C (con't)apply to Pro*C (con't) while (!feof(fp)) {
if (fgets(buf, sizeof(buf), fp) == NULL) break;
memset(rm_matchtab, 0x00, sizeof(rm_matchtab));
if (regexec(&re_expr, buf, MAX_EXPR_SUB_MATCH, rm_matchtab, 0)) {
printf("fail to match: (%.30s ...)\n", buf);
} else {
if (set_record(&a_rec, buf, rm_matchtab)) {/* inser to db */
fprintf(stderr,"[%s:%d] FAIL: set_record()\n", __FILE__, __LINE__);
break;
}
if (insert_rec(&a_rec)) {
fprintf(stderr,"[%s:%d] FAIL: insert_rec()\n", __FILE__, __LINE__);
break;
}
EXEC SQL COMMIT;
} /* end: else */
} /* end: while */
while (!feof(fp)) {
if (fgets(buf, sizeof(buf), fp) == NULL) break;
memset(rm_matchtab, 0x00, sizeof(rm_matchtab));
if (regexec(&re_expr, buf, MAX_EXPR_SUB_MATCH, rm_matchtab, 0)) {
printf("fail to match: (%.30s ...)\n", buf);
} else {
if (set_record(&a_rec, buf, rm_matchtab)) {/* inser to db */
fprintf(stderr,"[%s:%d] FAIL: set_record()\n", __FILE__, __LINE__);
break;
}
if (insert_rec(&a_rec)) {
fprintf(stderr,"[%s:%d] FAIL: insert_rec()\n", __FILE__, __LINE__);
break;
}
EXEC SQL COMMIT;
} /* end: else */
} /* end: while */
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
apply to Pro*C (con't)apply to Pro*C (con't) regfree(&re_expr); /* free memory */
/* 연습 : commit 하면서 DB 연결 해제 */;
return 0;
} /* end: main() */
/*
 * COPY_RMTAB(dest, src, matchtab)
 *
 * Copy the substring of `src` selected by the regmatch_t `matchtab`
 * (bytes rm_so .. rm_eo-1) into `dest` and NUL-terminate it.
 *
 * The body is wrapped in do { ... } while (0) so the macro expands to a single
 * statement and stays correct under an unbraced if/else. Arguments are
 * parenthesized, but `matchtab` and `dest` are still evaluated more than once,
 * so do not pass expressions with side effects.
 *
 * No bounds checking is done: `dest` must have room for
 * (rm_eo - rm_so) + 1 bytes and `matchtab` must describe a real match.
 */
#define COPY_RMTAB(dest, src, matchtab) \
    do { \
        memcpy((dest), &(src)[(matchtab).rm_so], \
               (size_t)((matchtab).rm_eo - (matchtab).rm_so)); \
        (dest)[(matchtab).rm_eo - (matchtab).rm_so] = 0x0; \
    } while (0)
regfree(&re_expr); /* free memory */
/* 연습 : commit 하면서 DB 연결 해제 */;
return 0;
} /* end: main() */
/*
 * COPY_RMTAB(dest, src, matchtab)
 *
 * Copy the substring of `src` selected by the regmatch_t `matchtab`
 * (bytes rm_so .. rm_eo-1) into `dest` and NUL-terminate it.
 *
 * The body is wrapped in do { ... } while (0) so the macro expands to a single
 * statement and stays correct under an unbraced if/else. Arguments are
 * parenthesized, but `matchtab` and `dest` are still evaluated more than once,
 * so do not pass expressions with side effects.
 *
 * No bounds checking is done: `dest` must have room for
 * (rm_eo - rm_so) + 1 bytes and `matchtab` must describe a real match.
 */
#define COPY_RMTAB(dest, src, matchtab) \
    do { \
        memcpy((dest), &(src)[(matchtab).rm_so], \
               (size_t)((matchtab).rm_eo - (matchtab).rm_so)); \
        (dest)[(matchtab).rm_eo - (matchtab).rm_so] = 0x0; \
    } while (0)
Copyright by SunYoung Kim <sunyzero (at) gmail (dot) com>
apply to Pro*C (con't)apply to Pro*C (con't)int set_record(MY_RECORD *d, const char *sbuf, const regmatch_t *rmtab)
{
char buf[40]; /* temp. buffuer */
COPY_RMTAB(d->name, sbuf, rmtab[1]); /* name : 1st field => rmtab[1] */
COPY_RMTAB(buf, sbuf, rmtab[3]); /* gender: 3rd field => rmtab[3] */
if (strncmp(buf, "Male", 4) == 0) { /* with logical error! Why? */
d->gender = 1;
} else {
d->gender = 2; /* always '2'. !! */
}
COPY_RMTAB(buf, sbuf, rmtab[2]); d->age = atoi(buf);/* age:2nd field=>rmtab[2] */
COPY_RMTAB(d->locale, sbuf, rmtab[4]); /* locale: 4th field => rmtab[4] */
return 0;
} /* end: set_record() */
/*
 * Insert one record into the EMPLIST table through embedded SQL (Pro*C).
 * The INSERT statement text is elided on the slide (the "..." placeholder)
 * and has to be written out before this compiles; presumably it binds the
 * fields of *s as host variables -- confirm when completing it.
 * Returns SQLCODE after the statement; the caller treats any non-zero value
 * as a failure.
 */
int insert_rec(const MY_RECORD *s) {
EXEC SQL INSERT INTO EMPLIST ... 생략 ...;
return SQLCODE;
} /* end: insert_rec() */
/*
 * Copy the text selected by match-table entry `m` out of `src` into `dst`
 * (capacity `dstsz` bytes, including the terminating NUL), with leading and
 * trailing spaces removed.
 *
 * Returns 0 on success, -1 when the entry is unset (rm_so < 0) or the trimmed
 * text does not fit in `dst`.
 */
static int copy_field(char *dst, size_t dstsz, const char *src,
                      const regmatch_t *m)
{
    regoff_t so = m->rm_so;
    regoff_t eo = m->rm_eo;
    size_t len;

    if (so < 0 || eo < so) {
        return -1;
    }
    while (so < eo && src[so] == ' ') {
        so++;
    }
    while (eo > so && src[eo - 1] == ' ') {
        eo--;
    }

    len = (size_t)(eo - so);
    if (len >= dstsz) {
        return -1; /* would overflow dst: reject instead of truncating silently */
    }
    memcpy(dst, src + so, len);
    dst[len] = '\0';
    return 0;
}

/*
 * Fill *d from one input line `sbuf` that matched REGEX_STR.
 *   rmtab[1] = name, rmtab[2] = age, rmtab[3] = gender, rmtab[4] = locale
 *
 * The capture groups also swallow the blanks that pad each field
 * (", Male," is captured as " Male"), so every field is trimmed before it is
 * stored or compared; comparing the untrimmed text against "Male" never
 * succeeds and gender would always end up as 2.
 * Each field is length-checked against its destination buffer.
 *
 * d->sno is not touched here.
 *
 * Returns 0 on success, -1 if a field is missing or too long, or if the age
 * is not a plain number in 0..999 (AGE is NUMBER(3) in the table). On failure
 * *d may be partially written.
 */
int set_record(MY_RECORD *d, const char *sbuf, const regmatch_t *rmtab)
{
    char buf[40]; /* temp. buffer for the gender and age text */
    char *end;
    long age;

    /* name : 1st field => rmtab[1] */
    if (copy_field(d->name, sizeof(d->name), sbuf, &rmtab[1])) {
        return -1;
    }

    /* gender: 3rd field => rmtab[3]; 1 = "Male", 2 = anything else */
    if (copy_field(buf, sizeof(buf), sbuf, &rmtab[3])) {
        return -1;
    }
    d->gender = (strcmp(buf, "Male") == 0) ? 1 : 2;

    /* age: 2nd field => rmtab[2] */
    if (copy_field(buf, sizeof(buf), sbuf, &rmtab[2])) {
        return -1;
    }
    age = strtol(buf, &end, 10);
    if (end == buf || *end != '\0' || age < 0 || age > 999) {
        return -1; /* empty, embedded blanks ("2 5"), or out of range */
    }
    d->age = (unsigned short)age;

    /* locale: 4th field => rmtab[4] */
    if (copy_field(d->locale, sizeof(d->locale), sbuf, &rmtab[4])) {
        return -1;
    }
    return 0;
} /* end: set_record() */
/*
 * Insert one record into the EMPLIST table through embedded SQL (Pro*C).
 * The INSERT statement text is elided on the slide (the "..." placeholder)
 * and has to be written out before this compiles; presumably it binds the
 * fields of *s as host variables -- confirm when completing it.
 * Returns SQLCODE after the statement; the caller treats any non-zero value
 * as a failure.
 */
int insert_rec(const MY_RECORD *s) {
EXEC SQL INSERT INTO EMPLIST ... 생략 ...;
return SQLCODE;
} /* end: insert_rec() */