编译原理学习：TINY语言词法扫描程序实现

浏览数：20 / 时间：2015年06月08日

最近对解释型程序（类似python或者是linux里的bc计算器）非常感兴趣，就开始学习一下编译原理。今天自己实现了TINY语言的词法扫描程序。大部分参考《编译原理及实践》一书。但是我做了一些小小的改进。

先说一下TINY语言：

1、注释：放在一对大括号内。书上的注释不能嵌套，我做了一点改进，允许嵌套。

2、关键字：read write if end repeat until else

3、类型：只支持整型和布尔型。

4、计算：+ - * / ( ) < = :=，其中:=为赋值运算，=为判断。没有〈和<= >=

一个示例的TINY语言程序：

test.tine: （选自《编译原理及实践》）

{ Sample program
  in TINY language -
  computes factorial
}
read x; { input an integer }
if 0 < x then { don't compute if x <= 0 }
	fact := 1;
	repeat
		fact := fact * x;
		x := x - 1;
	until x = 0;
	write fact { output factorial of x }
end

在globals.h中，涉及到一些类型的声明：

#ifndef GLOBALS_H
#define GLOBALS_H

#include <stdio.h>
typedef enum 
{
	ENDFILE, ERROR,
	IF, THEN, ELSE, END, REPEAT, UNTIL, READ, WRITE,
	ID, NUM,
	ASSIGN, EQ, LT, PLUS, MINUS, TIMES, OVER, LPAREN, RPAREN, SEMI
} TokenType;
extern lineno;

/* The max size of identifier of reserved word */
#define MAXTOKENLEN 50

#endif

用于生成词法扫描的flex输入，这是程序的核心部分：

tiny.l

%{
#include <stdio.h>
#include <string.h>
#include "globals.h"
#include "util.h"

char tokenString[MAXTOKENLEN + 1];
%}

digit		[0-9]
number		{digit}+
letter		[a-zA-Z]
identifier	{letter}[a-zA-Z0-9]*
newline		\n
whitespace	[ \t]

%%

"if"			{return IF;}
"then"			{return THEN;}
"else"			{return ELSE;}
"end"			{return END;}
"repeat"		{return REPEAT;}
"until"			{return UNTIL;}
"read"			{return READ;}
"write"			{return WRITE;}
":="			{return ASSIGN;}
"="			{return EQ;}
"<"			{return LT;}
"+"			{return PLUS;}
"-"			{return MINUS;}
"*"			{return TIMES;}
"/"			{return OVER;}
"("			{return LPAREN;}
")"			{return RPAREN;}
";"			{return SEMI;}
{number}		{return NUM;}
{identifier}	<span style="white-space:pre">	</span>{return ID;}
{newline}		{lineno++;}
{whitespace}	<span style="white-space:pre">	</span>{ /* Do nothing */ }
"{"			{ char c;
			  int count = 1;
			  do
			  {
				  c = input();
				  if (c == EOF) break;
				  else if (c == '\n') lineno++;
				  else if (c == '{') count++;
				  else if (c == '}') count--;
			  } while (count != 0);
			}
.			{return ERROR;}

%%

TokenType getToken(void)
{
	TokenType currentToken;
	currentToken = yylex();
	strncpy(tokenString, yytext, MAXTOKENLEN);
	printf("%d: ", lineno);
	printToken(currentToken, tokenString);

	return currentToken;
}

printToken函数在util.c中实现：

util.h:

#ifndef UTIL_H
#define UTIL_H

#include "globals.h"

void printToken(TokenType token, char* tokenString);

TokenType getToken(void);

#endif

util.c:

#include "util.h"
#include <stdio.h>
#include "globals.h"

void printToken(TokenType token, char* tokenString)
{
	switch(token)
	{
		case IF:
		case THEN:
		case ELSE:
		case END:
		case REPEAT:
		case UNTIL:
		case READ:
		case WRITE:
			printf("\treversed word: %s\n", tokenString);
			break;
		case ID:
			printf("\tidentifier: %s\n", tokenString);
			break;
		case NUM:
			printf("\tnumber: %s\n", tokenString);
			break;
		case ASSIGN:
		case EQ:
		case LT:
		case PLUS:
		case MINUS:
		case TIMES:
		case OVER:
		case LPAREN:
		case RPAREN:
		case SEMI:
			printf("\toperator: %s\n", tokenString);
	}
}

这就是所有的文件了！最后，是makefile文件：

scanner.exe: main.o lex.yy.o util.o
	gcc main.o lex.yy.o util.o -o scanner.exe -lfl
main.o: main.c globals.h util.h
	gcc main.c -c
util.o: util.c util.h globals.h
	gcc util.c -c
lex.yy.o: tiny.l
	flex tiny.l
	gcc lex.yy.c -c

于是，一个简单的词法扫描程序就完成了。

由于使用的是默认的输入，所以这个程序直接支持从键盘输入，运行效果如下：