基于linux c编程的正则表达式

写这个模块的目的在于更加方便地使用基于c语言的正则匹配。在做通信协议报文解析的时候,正则表达式匹配相当有效,而linux自带的正则表达式相关函数用起来还是比较麻烦。

例如,若想把字符串in中的mac地址和timeout和freetime提取出来,可以设计正则表达式为p:

<span style="font-family:Courier New;">char *in = "QueryTimesResult,MAC=00:22:33:44:55:77,timeout=1,freetime=10;00:22:33:44:55:88,timeout=0,freetime=20";
char p[] = "\\s*([A-F0-9:]+)\\s*,\\s*timeout\\s*=\\s*([0-9]+)\\s*,\\s*freetime\\s*=\\s*([0-9]+)";</span>

调用regex_match_all接口,可以把匹配结果都提取到内存块match之中;其中cell_num代表正则表达式的分组数目,即小括号对的数目;match_num代表字符串in中匹配正则表达式p的次数。结构体s_mbc相当重要,它代表正则表达式中一个分组(一对小括号)所匹配到的内容(整数,或者字符串的首地址)。

<span style="font-family:Courier New;">regex_match_all(in, p, &match, &cell_num, &match_num);</span>

所以一个match指针代表:

                 0                       cell_num-1

match ----->+-------+-------+------------+-------+

     0      | s_mbc | s_mbc | ...........| s_mbc |

            +-------+-------+------------+-------+

            |  ................................  |

            +------------------------------------+

            |  ................................  |

            +-------+-------+------------+-------+

match_num-1 | s_mbc | s_mbc | ...........| s_mbc |

            +-------+-------+------------+-------+

匹配结果放入到match中之后,就可以方便的把值拿出来使用。

在match使用完之后,需要释放内存,调用接口函数regex_free_all即可:

<span style="font-family:Courier New;">regex_free_all(void* match, int cell_num, int match_num)</span>

源代码如下:

<span style="font-family:Courier New;">#include <stdio.h>
#include <string.h>
#include <regex.h>
#include <stdlib.h>
#include <mcheck.h>

/* Number of regmatch_t slots handed to regexec(); slot 0 is the whole match. */
#define SUBSLEN 10  
/* Size in bytes of the buffer that receives regerror() messages. */
#define EBUFLEN 128 
/* Size in bytes of a buffer holding one matched substring. */
#define BUFLEN 1024 

/*
 * Thin aliases for the POSIX <regex.h> functions; each macro expands to
 * the library call of the same name with its arguments passed through.
 */
#define reg_comp(a, b, c)        regcomp(a, b ,c)
#define reg_error(a, b , c, d)   regerror(a, b, c, d)
#define reg_exec(a, b, c, d, e)  regexec(a, b, c, d, e)
#define reg_free(a)			     regfree(a)	


/*
 * Kind of value stored in an s_mbc cell. init_brace_mode() picks the kind
 * for each group from the pattern text found between its parentheses.
 */
typedef enum data_type {
	integer,	/* the cell's data.integer member is valid */
	string,		/* the cell's data.string member is valid */
} e_dt;

/*
 * Content captured by one parenthesised group of one match.
 * `type` tells which member of `data` is valid.
 */
typedef struct match_brace_content {
	e_dt type;
	union {
		int  integer;	/* matched text converted to an int */
		char *string;	/* heap copy of the matched text; released by regex_free_all() */
	} data;	
} s_mbc;

/*
 * Description:
 * Build the result template for a pattern: one s_mbc per parenthesised
 * group, in order of the closing parenthesis. A group is typed `string`
 * when the pattern text collected for it contains an ASCII letter
 * (a-z or A-Z, bounds included) and `integer` otherwise.
 *
 * Limitations: nested parentheses are not supported, and escaped
 * parentheses or parentheses inside a bracket expression are counted like
 * ordinary group delimiters.
 *
 * pattern: NUL-terminated regular expression text.
 * out:     on success receives a malloc'ed array of *found entries in
 *          which only .type is set; the caller releases it with free().
 * found:   on success receives the number of groups.
 *
 * Returns 0 when at least one group was found. Returns -1 on a NULL
 * argument, on allocation failure, or when the pattern has no group; in
 * that case *out and *found are left untouched.
 */
int init_brace_mode(char* pattern, s_mbc** out, int* found)
{
	int ret = -1;
	int find_left_brace = 0;	/* set once a '(' has been seen */
	int saw_letter = 0;		/* a letter was collected since the last ')' */
	int brace_pair_cnt = 0;
	s_mbc *braces = NULL;
	s_mbc *grown;
	size_t i, str_len;
	char c;

	if (pattern == NULL || out == NULL || found == NULL) {
		printf("null input pointer\n");
		goto err;
	}

	str_len = strlen(pattern);
	for (i = 0; i < str_len; i++) {
		c = pattern[i];

		if (c == '(') {
			find_left_brace = 1;
			continue;
		}

		if (c == ')') {
			/* Grow through a temporary so the array is not leaked on failure. */
			grown = realloc(braces, ((size_t)brace_pair_cnt + 1) * sizeof(*braces));
			if (grown == NULL) {
				printf("no spare memory!\n");
				goto err;
			}
			braces = grown;

			/* Decide whether this group matches a string or a number. */
			if (saw_letter) {
				braces[brace_pair_cnt].type = string;
				printf("to match string!\n");
			} else {
				braces[brace_pair_cnt].type = integer;
				printf("to match integer!\n");
			}

			brace_pair_cnt++;
			find_left_brace = 0;
			saw_letter = 0;
			continue;
		}

		/* Only characters that follow a '(' take part in the classification. */
		if (find_left_brace && ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
			saw_letter = 1;
		}
	}

	if (brace_pair_cnt > 0) {
		*found = brace_pair_cnt;
		*out = braces;
		braces = NULL;	/* ownership moved to the caller */
		ret = 0;
	}

err:
	free(braces);
	return ret;
}

/*
 * Copy the len bytes starting at start into a newly malloc'ed,
 * NUL-terminated buffer. Returns NULL when the allocation fails.
 */
static char *regex_dup_span(const char *start, size_t len)
{
	char *copy = malloc(len + 1);

	if (copy == NULL) {
		return NULL;
	}
	memcpy(copy, start, len);
	copy[len] = '\0';
	return copy;
}

/*
 * Free the string payloads among the first count cells of row.
 */
static void regex_release_cells(s_mbc *row, int count)
{
	int k;

	for (k = 0; k < count; k++) {
		if (row[k].type == string) {
			free(row[k].data.string);
		}
	}
}

/*
 * Find every non-overlapping match of pattern in buf and hand back the
 * contents of the parenthesised groups as a row-major matrix of s_mbc
 * cells: one row per match, one cell per group.
 *
 * buf:       NUL-terminated text to scan.
 * pattern:   POSIX extended regular expression with at least one and at
 *            most SUBSLEN - 1 groups.
 * match:     receives the malloc'ed matrix, or NULL when no row was stored.
 *            Release it with regex_free_all(*match, *cell_num, *match_num).
 * cell_num:  receives the number of groups counted in the pattern.
 * match_num: receives the number of rows stored.
 *
 * Returns 0 when the scan finished without error and stored at least one
 * match, -1 otherwise. When an error interrupts the scan, the rows that
 * were completed before it are still handed back through the
 * out-parameters.
 */
int regex_match_all(char *buf, char* pattern, void** match, int *cell_num, int *match_num) {
	regex_t     re;
	regmatch_t  subs[SUBSLEN];
	char        errbuf[EBUFLEN];
	s_mbc      *tmpl = NULL;	/* per-group types from init_brace_mode() */
	s_mbc      *rows = NULL;	/* result matrix built so far */
	s_mbc      *grown, *row;
	char       *src = buf;
	char       *text;
	const char *start;
	size_t      row_size, len;
	int         groups = 0, find = 0, ret = -1, err, i;

	if (buf == NULL || pattern == NULL || match == NULL ||
	    cell_num == NULL || match_num == NULL) {
		printf("null input pointer\n");
		return -1;
	}

	/* Give the caller well-defined outputs on every path. */
	*match = NULL;
	*match_num = 0;
	*cell_num = 0;

	err = reg_comp(&re, pattern, REG_EXTENDED);
	if (err) {
		reg_error(err, &re, errbuf, sizeof(errbuf));
		printf("error: regcomp: %s\n", errbuf);
		return -1;
	}

	/* The template must exist and agree with the group count regcomp() saw. */
	if (init_brace_mode(pattern, &tmpl, &groups) != 0 || tmpl == NULL ||
	    groups <= 0 || re.re_nsub != (size_t)groups) {
		printf("please check function init_brace_mode()!\n");
		goto done;
	}

	/* subs[] holds the whole match plus at most SUBSLEN - 1 groups. */
	if (re.re_nsub >= (size_t)SUBSLEN) {
		printf("too many groups in pattern, at most %d are supported\n", SUBSLEN - 1);
		goto done;
	}

	row_size = (size_t)groups * sizeof(*rows);

	for (;;) {
		/* Past the first match src is no longer the beginning of the line. */
		err = reg_exec(&re, src, (size_t) SUBSLEN, subs, (src == buf) ? 0 : REG_NOTBOL);
		if (err == REG_NOMATCH) {
			if (find == 0) {
				printf("Sorry, no match ...\n");
			}
			break;
		}
		if (err) {
			reg_error(err, &re, errbuf, sizeof(errbuf));
			printf("error: regexec: %s\n", errbuf);
			goto done;
		}

		printf("\nOK, has matched ...\n\n");

		grown = realloc(rows, ((size_t)find + 1) * row_size);
		if (grown == NULL) {
			printf("[%20s:%d]no spare memory!\n", __func__, __LINE__);
			goto done;
		}
		rows = grown;
		row = rows + (size_t)find * (size_t)groups;

		printf("begin: %ld, len = %ld  ", (long)subs[0].rm_so,
		       (long)(subs[0].rm_eo - subs[0].rm_so));

		/* Group i of the pattern is stored in cell i - 1 of the row. */
		for (i = 1; i <= groups; i++) {
			start = src;
			len = 0;
			/* A group that took no part in the match reports rm_so == -1. */
			if (subs[i].rm_so >= 0 && subs[i].rm_eo >= subs[i].rm_so) {
				start = src + subs[i].rm_so;
				len = (size_t)(subs[i].rm_eo - subs[i].rm_so);
			}
			printf("subexpression %d begin: %ld, len = %zu  ", i, (long)subs[i].rm_so, len);

			text = regex_dup_span(start, len);
			if (text == NULL) {
				printf("[%20s:%d]no spare memory!\n", __func__, __LINE__);
				break;
			}
			printf("match: %s\n", text);

			row[i - 1].type = tmpl[i - 1].type;
			if (row[i - 1].type == integer) {
				row[i - 1].data.integer = (int)strtol(text, NULL, 10);
				free(text);
			} else {
				row[i - 1].data.string = text;
			}
		}
		if (i <= groups) {
			/* Drop the half-filled row so the caller never sees it. */
			regex_release_cells(row, i - 1);
			goto done;
		}
		find++;

		/* Continue after the whole match; always make progress. */
		if (subs[0].rm_eo > 0) {
			src += subs[0].rm_eo;
		} else if (*src != '\0') {
			src++;		/* empty match: step over one character */
		} else {
			break;		/* empty match at the end of the input */
		}
	}
	ret = (find > 0) ? 0 : -1;

done:
	free(tmpl);
	reg_free(&re);
	*match = rows;
	*match_num = find;
	*cell_num = groups;
	return ret;
}

/*
 * Release a result matrix handed out by regex_match_all().
 *
 * match:     start of the matrix; NULL is accepted and ignored.
 * cell_num:  number of cells per row.
 * match_num: number of rows.
 *
 * Every cell typed `string` has its text freed, then the matrix itself.
 */
void regex_free_all(void* match, int cell_num, int match_num)
{
	s_mbc *cells = match;	/* typed view; avoids arithmetic on void* */
	s_mbc *row;
	int i, j;

	if (cells == NULL) {
		return;
	}

	for (i = 0; i < match_num; i++) {
		row = cells + (size_t)i * (size_t)(cell_num > 0 ? cell_num : 0);
		for (j = 0; j < cell_num; j++) {
			if (row[j].type == string) {
				free(row[j].data.string);
			}
		}
	}
	free(match);
}


/*
 * Demo driver: extracts the MAC address, timeout and freetime of every
 * record in a sample message, prints each captured cell, then releases
 * the result. glibc malloc tracing is switched on first, with the trace
 * file name taken from the MALLOC_TRACE environment variable.
 */
int main()
{
	char *in = "QueryTimesResult,MAC=00:22:33:44:55:77,timeout=1,freetime=10;00:22:33:44:55:88,timeout=0,freetime=20";
	char p[] = "\\s*([A-F0-9:]+)\\s*,\\s*timeout\\s*=\\s*([0-9]+)\\s*,\\s*freetime\\s*=\\s*([0-9]+)";
	void *match = NULL;
	int match_num = 0, cell_num = 0;
	int row = 0, col;
	s_mbc *cell;

	setenv("MALLOC_TRACE", "output", 1);
	mtrace();

	regex_match_all(in, p, &match, &cell_num, &match_num);

	/* Walk the matrix row by row; a non-positive match_num skips the loop. */
	while (row < match_num) {
		cell = (s_mbc *)match + (size_t)cell_num * (size_t)row;
		for (col = 0; col < cell_num; col++) {
			if (cell[col].type != string) {
				printf("----- integer: %d\n",cell[col].data.integer);
				continue;
			}
			printf("----- string: %s\n",cell[col].data.string);
		}
		row++;
	}

	regex_free_all(match, cell_num, match_num);

	return 0;
}

</span>

打印结果:

----- string: 00:22:33:44:55:77
----- integer: 1
----- integer: 10
----- string: 00:22:33:44:55:88
----- integer: 0
----- integer: 20

郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。