利用状态机实现一个简单的解释器

Posted by Yinode on Sunday, March 31, 2019

TOC

有限状态机(英语:finite-state machine,缩写:FSM)又称有限状态自动机,简称状态机,是表示有限个状态以及在这些状态之间的转移和动作等行为的数学模型。

事实上状态机在很多领域都是极为重要的一个概念,并不是单单在CS领域,最初这个概念应该是来自于逻辑学。在CS中的应用有编译器,正则等。

今天我就用状态机来实现一个四则运算的编译器。

200行的代码预警

其中状态机的主要职责在于将代码解析成一个TOKEN(Token是代码中的最小单位 比如一个数字 一个操作符)数组,最后将Token解析成AST,最后进行运算。

状态机实现解析Token

// 工具
const isNumber = (char) => {
	return (
		char === '1' ||
		char === '2' ||
		char === '3' ||
		char === '4' ||
		char === '5' ||
		char === '6' ||
		char === '7' ||
		char === '8' ||
		char === '9' ||
		char === '0'
	)
}

var token = []
const start = (char) => {
	if (isNumber(char)) {
		token.push(char)
		return inNumber
	}
	if (char === '-') {
		token.push(char)
		return inMinus
	}
	if (char === '+' || char === '*' || char === '/') {
		emmitToken(char, char)
		return start
	}
	if (char === '(' || char === ')') {
		emmitToken(char, char)
		return start
	}
	if (char === ' ') {
		return start
	}
	if (char === '\r' || char === '\n') {
		return start
	} else {
		emmitToken('EOF', 'EOF')
	}
}

const inMinus = (char) => {
	if (isNumber(char)) {
		token.push(char)
		return inNumber
	} else if (char === ' ') {
		emmitToken(char, char)
		return start
	}
}

const inNumber = (char) => {
	if (isNumber(char)) {
		token.push(char)
		return inNumber
	} else if (char === '.') {
		token.push(char)
		return inNumber
	} else {
		emmitToken('Number', token.join(''))
		token = []
		return start(char) // put back char
	}
}

let tokens = []

function emmitToken(type, value) {
	tokens.push({
		type,
		value
	})
	token = []
}

function getTokens(code) {
	tokens = []
	let state = start
	for (var c of code.split('')) state = state(c)
	state(Symbol('EOF'))
	return tokens
}

console.log(getTokens('250 + 111 * 2'))

/*
[ { type: 'Number', value: '250' },
  { type: '+', value: '+' },
  { type: 'Number', value: '111' },
  { type: '*', value: '*' },
  { type: 'Number', value: '2' },
  { type: 'EOF', value: 'EOF' } 
]
*/

FSM的下一个状态和输出是由输入和当前状态决定的

我们用函数代表当前的状态,用参数(输入)+if语句来表示状态的迁移逻辑,用返回的函数来代表下一个状态。

整个代码的运行过程类似于从前往后扫描,token是一个char的临时集合,比如我们扫描1.3这个数字的时候,他的迭代过程会类似于这样[] > ['1'] > ['1','.'] > ['1','.','3'] > emmitToken > []

完整的代码

接下来是完整的编译器代码,其中包含了我们的token转换器+AST生成器+代码执行器

// 工具
const isNumber = (char) => {
	return (
		char === '1' ||
		char === '2' ||
		char === '3' ||
		char === '4' ||
		char === '5' ||
		char === '6' ||
		char === '7' ||
		char === '8' ||
		char === '9' ||
		char === '0'
	)
}

var token = []
const start = (char) => {
	if (isNumber(char)) {
		token.push(char)
		return inNumber
	}
	if (char === '-') {
		token.push(char)
		return inMinus
	}
	if (char === '+' || char === '*' || char === '/') {
		emmitToken(char, char)
		return start
	}
	if (char === '(' || char === ')') {
		emmitToken(char, char)
		return start
	}
	if (char === ' ') {
		return start
	}
	if (char === '\r' || char === '\n') {
		return start
	} else {
		emmitToken('EOF', 'EOF')
	}
}

const inMinus = (char) => {
	if (isNumber(char)) {
		token.push(char)
		return inNumber
	} else if (char === ' ') {
		emmitToken(char, char)
		return start
	}
}

const inNumber = (char) => {
	if (isNumber(char)) {
		token.push(char)
		return inNumber
	} else if (char === '.') {
		token.push(char)
		return inNumber
	} else {
		emmitToken('Number', token.join(''))
		token = []
		return start(char) // put back char
	}
}

let tokens = []

function emmitToken(type, value) {
	tokens.push({
		type,
		value
	})
	token = []
}

function getTokens(code) {
	tokens = []
	let state = start
	for (var c of code.split('')) state = state(c)
	state(Symbol('EOF'))
	return tokens
}

// tokens to AST
function Expression(source) {
	if (source[0].type === 'AdditiveExpression' && source[1] && source[1].type === 'EOF') {
		let node = {
			type: 'Expression',
			children: [ source.shift(), source.shift() ]
		}
		source.unshift(node)
		return node
	}
	AdditiveExpression(source)
	return Expression(source)
}
function AdditiveExpression(source) {
	if (source[0].type === 'MultiplicativeExpression') {
		let node = {
			type: 'AdditiveExpression',
			children: [ source[0] ]
		}
		source[0] = node
		return AdditiveExpression(source)
	}
	if (source[0].type === 'AdditiveExpression' && source[1] && source[1].type === '+') {
		let node = {
			type: 'AdditiveExpression',
			operator: '+',
			children: []
		}
		node.children.push(source.shift())
    node.children.push(source.shift())
    // 这一步就是区分* 和 + 优先级的关键 我们把加法看成是乘法的结果相加
		MultiplicativeExpression(source)
		node.children.push(source.shift())
		source.unshift(node)
		return AdditiveExpression(source)
	}
	if (source[0].type === 'AdditiveExpression' && source[1] && source[1].type === '-') {
		let node = {
			type: 'AdditiveExpression',
			operator: '-',
			children: []
		}
		node.children.push(source.shift())
		node.children.push(source.shift())
		MultiplicativeExpression(source)
		node.children.push(source.shift())
		source.unshift(node)
		return AdditiveExpression(source)
	}
	if (source[0].type === 'AdditiveExpression') return source[0]
	MultiplicativeExpression(source)
	return AdditiveExpression(source)
}
function MultiplicativeExpression(source) {
	if (source[0].type === 'Number') {
		let node = {
			type: 'MultiplicativeExpression',
			children: [ source[0] ]
		}
		source[0] = node
		return MultiplicativeExpression(source)
	}
	if (source[0].type === 'MultiplicativeExpression' && source[1] && source[1].type === '*') {
		let node = {
			type: 'MultiplicativeExpression',
			operator: '*',
			children: []
		}
		node.children.push(source.shift())
		node.children.push(source.shift())
		node.children.push(source.shift())
		source.unshift(node)
		return MultiplicativeExpression(source)
	}
	if (source[0].type === 'MultiplicativeExpression' && source[1] && source[1].type === '/') {
		let node = {
			type: 'MultiplicativeExpression',
			operator: '/',
			children: []
		}
		node.children.push(source.shift())
		node.children.push(source.shift())
		node.children.push(source.shift())
		source.unshift(node)
		return MultiplicativeExpression(source)
	}
	if (source[0].type === 'MultiplicativeExpression') return source[0]

	return MultiplicativeExpression(source)
}

function evaluate(node) {
	if (node.type === 'Expression') {
		return evaluate(node.children[0])
	}
	if (node.type === 'AdditiveExpression') {
		if (node.operator === '-') {
			return evaluate(node.children[0]) - evaluate(node.children[2])
		}
		if (node.operator === '+') {
			return evaluate(node.children[0]) + evaluate(node.children[2])
		}
		return evaluate(node.children[0])
	}
	if (node.type === 'MultiplicativeExpression') {
		if (node.operator === '*') {
			return evaluate(node.children[0]) * evaluate(node.children[2])
		}
		if (node.operator === '/') {
			return evaluate(node.children[0]) / evaluate(node.children[2])
		}
		return evaluate(node.children[0])
	}
	if (node.type === 'Number') {
		return Number(node.value)
	}
}

let s = getTokens('6.2 + 3 * -2 * 8 + 2 * -256')
let ast = Expression(s)
let res = evaluate(ast)

console.log(res)