基于linux c编程的正则表达式
写这个模块的目的在于更加方便地使用基于c语言的正则匹配。在做通信协议报文解析的时候,正则表达式匹配相当有效;而linux自带的正则表达式相关函数(regcomp/regexec等)直接用起来相对来说还是比较麻烦。
例如,若想把字符串in中的mac地址和timeout和freetime提取出来,可以设计正则表达式为p:
<span style="font-family:Courier New;">char *in = "QueryTimesResult,MAC=00:22:33:44:55:77,timeout=1,freetime=10;00:22:33:44:55:88,timeout=0,freetime=20"; char p[] = "\\s*([A-F0-9:]+)\\s*,\\s*timeout\\s*=\\s*([0-9]+)\\s*,\\s*freetime\\s*=\\s*([0-9]+)";</span>
调用regex_match_all接口,可以把匹配结果都提取到内存块match之中;其中cell_num代表正则表达式的分组数目,即小括号对的数目;match_num代表在字符串in中可以匹配正则表达式p的次数。结构体s_mbc相当重要,它代表正则表达式一个分组(一对小括号)所匹配到的内容(整数,或者字符串的首地址)。
<span style="font-family:Courier New;">regex_match_all(in, p, &match, &cell_num, &match_num);</span>
所以一个match指针代表:
              0                               cell_num - 1
match ------> +-------+-------+-------------+-------+
            0 | s_mbc | s_mbc | ........... | s_mbc |
              +-------+-------+-------------+-------+
              | ................................... |
              +-------------------------------------+
              | ................................... |
              +-------+-------+-------------+-------+
match_num - 1 | s_mbc | s_mbc | ........... | s_mbc |
              +-------+-------+-------------+-------+
匹配结果放入到match中之后,就可以方便的把值拿出来使用。
在match使用完之后,需要释放内存,调用接口函数regex_free_all即可:
<span style="font-family:Courier New;">regex_free_all(void* match, int cell_num, int match_num)</span>
源代码如下:
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L /* <regex.h> and setenv() come from POSIX, not ISO C */
#endif

#include <ctype.h>
#include <limits.h>
#include <mcheck.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SUBSLEN 10
#define EBUFLEN 128
#define BUFLEN 1024

#define reg_comp(a, b, c) regcomp(a, b, c)
#define reg_error(a, b, c, d) regerror(a, b, c, d)
#define reg_exec(a, b, c, d, e) regexec(a, b, c, d, e)
#define reg_free(a) regfree(a)

/*
 * Progress messages are compiled out by default so that library calls do not
 * write to stdout; build with -DREGEX_MATCH_TRACE to get them back.
 */
#ifdef REGEX_MATCH_TRACE
#define REGEX_TRACE(...) printf(__VA_ARGS__)
#else
#define REGEX_TRACE(...) ((void)0)
#endif

typedef enum data_type {
    integer,
    string,
} e_dt;

/* Content matched by one parenthesised group of the pattern. */
typedef struct match_brace_content {
    e_dt type;
    union {
        int integer;
        char *string; /* heap-allocated, owned by the result table */
    } data;
} s_mbc;

/* Return 1 when pattern[begin, end) contains at least one letter, else 0. */
static int span_has_letter(const char *pattern, size_t begin, size_t end)
{
    for (size_t i = begin; i < end; i++) {
        if (isalpha((unsigned char)pattern[i])) {
            return 1;
        }
    }
    return 0;
}

/*
 * Given the index of a '[' that opens a bracket expression, return the index
 * of its closing ']' (or len when the expression is unterminated).
 */
static size_t bracket_end(const char *pattern, size_t open, size_t len)
{
    size_t i = open + 1;

    if (i < len && pattern[i] == '^') {
        i++;
    }
    if (i < len && pattern[i] == ']') {
        i++; /* a ']' in first position is an ordinary member */
    }
    while (i < len && pattern[i] != ']') {
        if (pattern[i] == '[' && i + 1 < len &&
            (pattern[i + 1] == ':' || pattern[i + 1] == '.' || pattern[i + 1] == '=')) {
            /* [:class:], [.coll.] or [=equiv=]: skip to its own terminator */
            char kind = pattern[i + 1];
            size_t j = i + 2;

            while (j + 1 < len && !(pattern[j] == kind && pattern[j + 1] == ']')) {
                j++;
            }
            if (j + 1 >= len) {
                return len;
            }
            i = j + 2;
        } else {
            i++;
        }
    }
    return i;
}

/* Free the strings owned by the first n cells of one result row. */
static void release_cells(s_mbc *row, int n)
{
    for (int i = 0; i < n; i++) {
        if (row[i].type == string) {
            free(row[i].data.string);
            row[i].data.string = NULL;
        }
    }
}

/* Return a NUL-terminated heap copy of the len bytes at start, or NULL. */
static char *copy_span(const char *start, size_t len)
{
    char *copy = malloc(len + 1);

    if (copy != NULL) {
        memcpy(copy, start, len);
        copy[len] = '\0';
    }
    return copy;
}

/* Decimal text to int, clamped to [INT_MIN, INT_MAX]; non-numeric text gives 0. */
static int parse_int(const char *text)
{
    long value = strtol(text, NULL, 10);

    if (value > INT_MAX) {
        return INT_MAX;
    }
    if (value < INT_MIN) {
        return INT_MIN;
    }
    return (int)value;
}

/*
 * Fill one result row from the regexec() offsets of a single match.
 * subs[0] is the whole match, subs[g + 1] belongs to group g.  A group that
 * did not take part in the match (rm_so == -1) yields 0 or an empty string.
 * Returns 0 on success; on allocation failure returns -1 with nothing leaked.
 */
static int fill_row(s_mbc *row, const s_mbc *types, int ncell,
                    const char *base, const regmatch_t *subs)
{
    for (int g = 0; g < ncell; g++) {
        const regmatch_t *m = &subs[g + 1];
        const char *start = base;
        size_t len = 0;
        char *text;

        if (m->rm_so >= 0 && m->rm_eo >= m->rm_so) {
            start = base + m->rm_so;
            len = (size_t)(m->rm_eo - m->rm_so);
        }

        text = copy_span(start, len);
        if (text == NULL) {
            release_cells(row, g);
            return -1;
        }
        REGEX_TRACE("subexpression %d begin: %ld, len = %zu match: %s\n",
                    g + 1, (long)m->rm_so, len, text);

        row[g].type = types[g].type;
        if (types[g].type == integer) {
            row[g].data.integer = parse_int(text);
            free(text);
        } else {
            row[g].data.string = text; /* ownership moves into the row */
        }
    }
    return 0;
}

/**
 * Build the per-group type template for a POSIX extended regular expression.
 *
 * Groups are numbered by the position of their opening parenthesis, the same
 * order regexec() uses.  A group whose pattern text contains a letter is
 * typed `string`, otherwise `integer`.  Escaped parentheses and parentheses
 * inside bracket expressions are not groups; nested groups are supported.
 *
 * @param pattern  NUL-terminated pattern text.
 * @param out      Receives a malloc'ed array of *found entries (only `type`
 *                 is meaningful); the caller frees it.  Set to NULL on failure.
 * @param found    Receives the number of groups; 0 on failure.
 * @return 0 when at least one group was found, -1 otherwise (NULL argument,
 *         no group, unbalanced parentheses or out of memory).
 */
int init_brace_mode(char *pattern, s_mbc **out, int *found)
{
    int ret = -1;
    s_mbc *groups = NULL;
    size_t *starts = NULL;     /* index of the first char inside each group */
    size_t *open_stack = NULL; /* group numbers of the currently open '('   */
    size_t len, i;
    size_t max_groups = 0, count = 0, depth = 0;

    if (pattern == NULL || out == NULL || found == NULL) {
        fprintf(stderr, "null input pointer\n");
        return -1;
    }
    *out = NULL;
    *found = 0;

    len = strlen(pattern);
    for (i = 0; i < len; i++) {
        if (pattern[i] == '(') {
            max_groups++; /* upper bound: includes escaped/bracketed ones */
        }
    }
    if (max_groups == 0) {
        return -1;
    }

    groups = calloc(max_groups, sizeof *groups);
    starts = malloc(max_groups * sizeof *starts);
    open_stack = malloc(max_groups * sizeof *open_stack);
    if (groups == NULL || starts == NULL || open_stack == NULL) {
        fprintf(stderr, "no spare memory!\n");
        goto done;
    }

    for (i = 0; i < len; i++) {
        char c = pattern[i];

        if (c == '\\') {
            i++; /* the escaped character is never a group delimiter */
        } else if (c == '[') {
            i = bracket_end(pattern, i, len);
        } else if (c == '(') {
            starts[count] = i + 1;
            open_stack[depth++] = count++;
        } else if (c == ')' && depth > 0) {
            size_t g = open_stack[--depth];

            /* decide whether the group captures a string or a number */
            if (span_has_letter(pattern, starts[g], i)) {
                groups[g].type = string;
                REGEX_TRACE("to match string!\n");
            } else {
                groups[g].type = integer;
                REGEX_TRACE("to match integer!\n");
            }
        }
    }

    if (count == 0 || depth != 0 || count > (size_t)INT_MAX) {
        goto done;
    }

    *out = groups;
    *found = (int)count;
    groups = NULL; /* ownership handed to the caller */
    ret = 0;

done:
    free(groups);
    free(starts);
    free(open_stack);
    return ret;
}

/**
 * Find every non-overlapping match of `pattern` in `buf` and extract the
 * content of each parenthesised group.
 *
 * The result is a match_num x cell_num table of s_mbc stored row by row in
 * one heap block; release it with regex_free_all().
 *
 * @param buf        NUL-terminated text to search.
 * @param pattern    POSIX extended regular expression.
 * @param match      Receives the table, or NULL when there is nothing to free.
 * @param cell_num   Receives the number of groups per match.
 * @param match_num  Receives the number of matches.
 * @return 0 when at least one match was found; -1 on no match or on any
 *         error (in the error case *match is NULL and *match_num is 0).
 */
int regex_match_all(char *buf, char *pattern, void **match, int *cell_num, int *match_num)
{
    int ret = -1;
    int err;
    int ncell = 0;
    int found = 0;
    regex_t re;
    regmatch_t *subs = NULL;
    s_mbc *cell_types = NULL;
    s_mbc *rows = NULL;
    const char *cursor;
    char errbuf[EBUFLEN];

    if (match == NULL || cell_num == NULL || match_num == NULL) {
        fprintf(stderr, "null input pointer\n");
        return -1;
    }
    *match = NULL;
    *cell_num = 0;
    *match_num = 0;
    if (buf == NULL || pattern == NULL) {
        fprintf(stderr, "null input pointer\n");
        return -1;
    }

    err = reg_comp(&re, pattern, REG_EXTENDED);
    if (err) {
        reg_error(err, &re, errbuf, sizeof errbuf);
        fprintf(stderr, "error: regcomp: %s\n", errbuf);
        return -1;
    }

    /* The template must agree with what the regex engine itself counted. */
    init_brace_mode(pattern, &cell_types, &ncell);
    if (re.re_nsub != (size_t)ncell) {
        fprintf(stderr, "please check function init_brace_mode()!\n");
        goto fail;
    }

    /* One slot for the whole match plus one per group, whatever their number. */
    subs = malloc((re.re_nsub + 1) * sizeof *subs);
    if (subs == NULL) {
        fprintf(stderr, "[%20s:%d]no spare memory!\n", __func__, __LINE__);
        goto fail;
    }

    cursor = buf;
    for (;;) {
        /* After the first match the cursor is no longer at a line start. */
        int eflags = (cursor == buf) ? 0 : REG_NOTBOL;

        err = reg_exec(&re, cursor, re.re_nsub + 1, subs, eflags);
        if (err == REG_NOMATCH) {
            break;
        }
        if (err) {
            reg_error(err, &re, errbuf, sizeof errbuf);
            fprintf(stderr, "error: regexec: %s\n", errbuf);
            goto fail;
        }
        REGEX_TRACE("\nOK, has matched ...\n\n");

        if (ncell > 0) {
            s_mbc *grown = realloc(rows, ((size_t)found + 1) * (size_t)ncell * sizeof *rows);

            if (grown == NULL) {
                fprintf(stderr, "[%20s:%d]no spare memory!\n", __func__, __LINE__);
                goto fail;
            }
            rows = grown;
            if (fill_row(rows + (size_t)found * (size_t)ncell, cell_types, ncell,
                         cursor, subs) != 0) {
                fprintf(stderr, "[%20s:%d]no spare memory!\n", __func__, __LINE__);
                goto fail;
            }
        }
        found++;

        /* Resume after the whole match; step over empty matches so we terminate. */
        if (subs[0].rm_eo > subs[0].rm_so) {
            cursor += subs[0].rm_eo;
        } else {
            if (cursor[subs[0].rm_eo] == '\0') {
                break;
            }
            cursor += subs[0].rm_eo + 1;
        }
    }

    if (found > 0) {
        ret = 0;
    } else {
        REGEX_TRACE("Sorry, no match ...\n");
    }
    goto done;

fail:
    for (int r = 0; r < found; r++) {
        release_cells(rows + (size_t)r * (size_t)ncell, ncell);
    }
    free(rows);
    rows = NULL;
    found = 0;

done:
    free(subs);
    free(cell_types);
    reg_free(&re);
    *match = rows;
    *match_num = found;
    *cell_num = ncell;
    return ret;
}

/**
 * Release a result table produced by regex_match_all().
 *
 * @param match      Table pointer (NULL is accepted and ignored).
 * @param cell_num   Number of groups per match, as reported by regex_match_all().
 * @param match_num  Number of matches, as reported by regex_match_all().
 */
void regex_free_all(void *match, int cell_num, int match_num)
{
    s_mbc *rows = match;

    if (rows == NULL) {
        return;
    }
    if (cell_num > 0) {
        for (int r = 0; r < match_num; r++) {
            release_cells(rows + (size_t)r * (size_t)cell_num, cell_num);
        }
    }
    free(rows);
}

/*
 * Demo program.  Define REGEX_MATCH_NO_MAIN to leave it out when this file is
 * embedded in another program or in a test harness.
 */
#ifndef REGEX_MATCH_NO_MAIN
int main(void)
{
    int cell_num = 0, match_num = 0;
    int rc;
    void *match = NULL;
    char in[] = "QueryTimesResult,MAC=00:22:33:44:55:77,timeout=1,freetime=10;00:22:33:44:55:88,timeout=0,freetime=20";
    char p[] = "\\s*([A-F0-9:]+)\\s*,\\s*timeout\\s*=\\s*([0-9]+)\\s*,\\s*freetime\\s*=\\s*([0-9]+)";

    /* Record every allocation in the file "output" for leak checking with mtrace(1). */
    setenv("MALLOC_TRACE", "output", 1);
    mtrace();

    rc = regex_match_all(in, p, &match, &cell_num, &match_num);
    if (rc != 0) {
        fprintf(stderr, "regex_match_all failed or found nothing\n");
    }

    for (int i = 0; i < match_num; i++) {
        const s_mbc *cell = (const s_mbc *)match + (size_t)i * (size_t)cell_num;

        for (int j = 0; j < cell_num; j++) {
            if (cell[j].type == string) {
                printf("----- string: %s\n", cell[j].data.string);
            } else {
                printf("----- integer: %d\n", cell[j].data.integer);
            }
        }
    }

    regex_free_all(match, cell_num, match_num);
    return rc == 0 ? 0 : 1;
}
#endif /* REGEX_MATCH_NO_MAIN */
打印结果:
----- string: 00:22:33:44:55:77
----- integer: 1
----- integer: 10
----- string: 00:22:33:44:55:88
----- integer: 0
----- integer: 20
郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。