c语言词法分析初试(C++实现)

开篇

所谓词法分析,就是将源代码按照构词规则分解成一系列单词符号。单词是语言中具有独立意义的最小单位,包括关键字、标识符、运算符、界符和常量等。

分析

源程序分解的单词可以归为以下几类:

1. 关键字

c语言关键字有不少,这里只列举其中一部分:
for,while,do,continue,if,else,
char,int,double,return

2. 变量名

变量名的规则为:
由字母数字以及下划线组成,开头必须为下划线或字母
用正规表达式描述为

letter = {a|b|...|z|A|B|...|Z}
digit = {0|1|...|9}
id = (letter|_ )(letter| _| digit)*

转化为NFA图为:
这里写图片描述

3. 数字常量

变量名的规则为:

由字母数字以及下划线组成,开头必须为下划线或字母
用正规表达式描述为

digit = {0|1|...|9}
id = digit((digit)*|.(digit)*)(ℇ | (E|e)digit(digit)*)

转化为NFA图为:

这里写图片描述

4. 界符和运算符

()[]{};+-/等等
本人这里只加入了+-和=,因为
/以及&^%等都是类似的,这里只是试验,不需要全部写。

思路

根据有限自动机的概念,将整个源代码分解的规则转化为一个一个状态,用switch或if/else将状态之间的转移描述出来即可。这里省略了将常量以及变量名插
入到符号表返回指针的那一步。
简单一个伪代码描述这个过程

    初始化 状态=0
    switch(状态):
    {
        case 0:
            接收源码
            if(符合1规则)
                处理,状态=1
            else if(符合2规则)
                处理,状态=2
            else
                ...
        case 1:
        case 2:
        case ...
    }

代码

#include <iostream>
#include <algorithm>
#include <vector>
#include <stack>
#include <cstdlib>
#include <cstdio>
using namespace std;

const int KEY_NUM = 11;
const char* KEY_SET[] = {
    "for",
    "while",
    "do",
    "continue",
    "if",
    "else",
    "char",
    "int",
    "double",
    "float",
    "return"
};

const char* FILE_NAME = "/Users/shiyi/abc.c";
char strBuffer[1026];
char strBox[200];

void retract(char* &str)
{
    str--;
}

char getChar(char* &str)
{
    return *str++;
}

bool isNum(char ch)
{
    if(ch >= '0' && ch <= '9')
        return true;
    return false;
}

bool isLetter(char ch)
{
    if(ch >= 'a' && ch <= 'z')
        return true;
    if(ch >= 'A' && ch <= 'Z')
        return true;
    return false;
}

int findKey(char* ch)
{
    for(int i=0; i<KEY_NUM; i++)
    {
        if(0 == strcmp(ch, KEY_SET[i]))
            return i+1;
    }
    return 0;
}

void output(char* key, char* value)
{
    printf("< %s, %s >\n", key, value);
}

//扫描
void scanner(char* str)
{
    int state = 0;
    char ch = ' ';
    int pos = 0;
    while(ch != '\0')
    {
    switch (state) {
        case 0:
        {
            ch = getChar(str);

            switch (ch) {
                case ' ':
                    pos = 0;
                    break;
                case '[':
                case ']':
                case '(':
                case ')':
                case '{':
                case '}': {
                    char temp[2];
                    temp[0] = ch;
                    output(temp, "-");
                    pos = 0;
                    break;
                }
                case '\'': {
                    state = 0;
                    while((ch = getChar(str)) != '\'' && ch != '\0')
                    {
                        strBox[pos++] = ch;
                    }
                    if(ch == '\0')
                    {
                        //error
                    }
                    strBox[pos] = '\0';
                    output("string", strBox);
                    pos = 0;
                    break;
                }
                case '"': {
                    state = 0;
                    while((ch = getChar(str)) != '"' && ch != '\0')
                    {
                        strBox[pos++] = ch;
                    }
                    if(ch == '\0')
                    {
                        ////
                        output("error", strBox);
                    }
                    else
                    {
                        strBox[pos] = '\0';
                        output("string", strBox);
                        pos = 0;
                    }
                    break;
                }
                case '+': {
                    state = 0;
                    ch = getChar(str);
                    switch (ch) {
                        case '+':
                            output("++", "-");
                            pos = 0;
                            break;
                        case '=':
                            output("+=", "-");
                            pos = 0;
                            break;
                        default:
                            retract(str);
                            output("+", "-");
                            pos = 0;
                            break;
                    }
                }
                case '-': {
                    state = 0;
                    ch = getChar(str);
                    switch (ch) {
                        case '-':
                            output("--", "-");
                            pos = 0;
                            break;
                        case '=':
                            output("-=", "-");
                            pos = 0;
                            break;
                        default:
                            retract(str);
                            output("-", "-");
                            pos = 0;
                            break;
                    }
                }
                case '=': {
                    state = 0;
                    ch = getChar(str);

                    switch (ch) {
                        case '=':
                            output("==", "-");
                            pos = 0;
                            break;
                        default:
                            retract(str);
                            output("=", "-");

                            pos = 0;
                            break;
                    }
                    break;
                }
                default: {

                    if(isNum(ch))
                    {

                        strBox[pos++] = ch;
                        state = 2;
                    }
                    else if(isLetter(ch) || ch == '_')
                    {
                        strBox[pos++] = ch;
                        state = 1;
                    }
                    break;
                }
            }
            break;
        }
        case 1:
        {
            while(true)
            {
                ch = getChar(str);
                if(isLetter(ch) || isNum(ch) || ch == '_')
                    strBox[pos++] = ch;
                else
                {
                    strBox[pos] = '\0';
                    int id = findKey(strBox);
                    if(id == 0)
                        output("id", strBox);
                    else
                        output("key", strBox);
                    retract(str);
                    pos = 0;
                    state = 0;
                    break;
                }
            }
            break;
        }
        case 2:
        {
            while(true)
            {
                ch = getChar(str);

                if(isNum(ch))
                    strBox[pos++] = ch;
                else if(ch == '.')
                {
                    strBox[pos++] = ch;
                    state = 3;
                    break;
                }
                else if(ch == 'E' || ch == 'e')
                {
                    strBox[pos++] = ch;
                    state = 4;
                    break;
                }
                else
                {
                    strBox[pos] = '\0';
                    output("number", strBox);
                    retract(str);
                    pos = 0;
                    state = 0;
                    break;
                }
            }
            break;
        }
        case 3:
        {
            while(true)
            {
                ch = getChar(str);
                if(isNum(ch))
                    strBox[pos++] = ch;
                else if(ch == 'E' || ch == 'e')
                {
                    strBox[pos++] = ch;
                    state = 4;
                }
                else
                {
                    strBox[pos] = '\0';
                    output("number", strBox);
                    retract(str);
                    pos = 0;
                    state = 0;
                    break;
                }
            }
            break;
        }
        case 4:
        {
            while(true)
            {
                ch = getChar(str);
                if(isNum(ch))
                    strBox[pos++] = ch;
                else
                {
                    strBox[pos] = '\0';
                    output("number", strBox);
                    retract(str);
                    pos = 0;
                    state = 0;
                    break;
                }
            }
            break;
        }
        default:
            ////
            output("error", strBox);
            break;
    }
    }
}

int main()
{
    FILE* file = fopen(FILE_NAME, "r");

    while(NULL != fgets(strBuffer, 1024, file))
    {
        scanner(strBuffer);
    }

    return 0;
}

测试结果

源程序

int main()
{
    int a = 5;
    for(int i=0; i<5; i++)
    {
        a++;
    }
    return 0;
}

输出结果

< key, int >
< id, main >
< (, - >
< ), - >
< {, - >
< key, int >
< id, a >
< =, - >
< number, 5 >
< key, for >
< (, - >
< key, int >
< id, i >
< =, - >
< number, 0 >
< id, i >
< number, 5 >
< id, i >
< ++, - >
< -, - >
< =, - >
< ), - >
< {, - >
< id, a >
< ++, - >
< -, - >
< =, - >
< }, - >
< key, return >
< number, 0 >
< }, - >
Program ended with exit code: 0