def

# 实现一个简单的递归下降分析器

## 解法

``````expr ::= expr + term
| expr - term
| term
term ::= term * factor
| term / factor
| factor
factor ::= ( expr )
| NUM
``````

``````expr ::= term { (+|-) term }*
term ::= factor { (*|/) factor }*
factor ::= ( expr )
| NUM
``````

``````NUM + NUM * NUM
``````

``````expr
expr ::= term { (+|-) term }*
expr ::= factor { (*|/) factor }* { (+|-) term }*
expr ::= NUM { (*|/) factor }* { (+|-) term }*
expr ::= NUM { (+|-) term }*
expr ::= NUM + term { (+|-) term }*
expr ::= NUM + factor { (*|/) factor }* { (+|-) term }*
expr ::= NUM + NUM { (*|/) factor}* { (+|-) term }*
expr ::= NUM + NUM * factor { (*|/) factor }* { (+|-) term }*
expr ::= NUM + NUM * NUM { (*|/) factor }* { (+|-) term }*
expr ::= NUM + NUM * NUM { (+|-) term }*
expr ::= NUM + NUM * NUM
``````

``````import re
import collections
# Token specification
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
MINUS = r'(?P<MINUS>-)'
TIMES = r'(?P<TIMES>\*)'
DIVIDE = r'(?P<DIVIDE>/)'
LPAREN = r'(?P<LPAREN>\()'
RPAREN = r'(?P<RPAREN>\))'
WS = r'(?P<WS>\s+)'
master_pat = re.compile('|'.join([NUM, PLUS, MINUS, TIMES, DIVIDE, LPAREN, RPAREN, WS]))
# Tokenizer
Token = collections.namedtuple('Token', ['type','value'])
def generate_tokens(text):
scanner = master_pat.scanner(text)
for m in iter(scanner.match, None):
tok = Token(m.lastgroup, m.group())
if tok.type != 'WS':
yield tok
# Parser
class ExpressionEvaluator:
'''
Implementation of a recursive descent parser. Each method
implements a single grammar rule. Use the ._accept() method
to test and accept the current lookahead token. Use the ._expect()
method to exactly match and discard the next token on on the input
(or raise a SyntaxError if it doesn't match).
'''
def parse(self,text):
self.tokens = generate_tokens(text)
self.tok = None # Last symbol consumed
self.nexttok = None # Next symbol tokenized
return self.expr()
self.tok, self.nexttok = self.nexttok, next(self.tokens, None)
def _accept(self,toktype):
'Test and consume the next token if it matches toktype'
if self.nexttok and self.nexttok.type == toktype:
return True
else:
return False
def _expect(self,toktype):
'Consume next token if it matches toktype or raise SyntaxError'
if not self._accept(toktype):
raise SyntaxError('Expected ' + toktype)
# Grammar rules follow
def expr(self):
"expression ::= term { ('+'|'-') term }*"
exprval = self.term()
while self._accept('PLUS') or self._accept('MINUS'):
op = self.tok.type
right = self.term()
if op == 'PLUS':
exprval += right
elif op == 'MINUS':
exprval -= right
return exprval
def term(self):
"term ::= factor { ('*'|'/') factor }*"
termval = self.factor()
while self._accept('TIMES') or self._accept('DIVIDE'):
op = self.tok.type
right = self.factor()
if op == 'TIMES':
termval *= right
elif op == 'DIVIDE':
termval /= right
return termval
def factor(self):
"factor ::= NUM | ( expr )"
if self._accept('NUM'):
return int(self.tok.value)
elif self._accept('LPAREN'):
exprval = self.expr()
self._expect('RPAREN')
return exprval
else:
raise SyntaxError('Expected NUMBER or LPAREN')
``````

``````>>> e = ExpressionEvaluator()
>>> e.parse('2')
2
>>> e.parse('2 + 3')
5
>>> e.parse('2 + 3 * 4')
14
>>> e.parse('2 + (3 + 4) * 5')
37
>>> e.parse('2 + (3 + * 4)')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "exprparse.py", line 40, in parse
return self.expr()
File "exprparse.py", line 67, in expr
right = self.term()
File "exprparse.py", line 77, in term
termval = self.factor()
File "exprparse.py", line 93, in factor
exprval = self.expr()
File "exprparse.py", line 67, in expr
right = self.term()
File "exprparse.py", line 77, in term
termval = self.factor()
File "exprparse.py", line 97, in factor
raise SyntaxError("Expected NUMBER or LPAREN")
SyntaxError: Expected NUMBER or LPAREN
>>>
``````

``````class ExpressionTreeBuilder(ExpressionEvaluator):
def expr(self):
"expression ::= term { ('+'|'-') term }"
exprval = self.term()
while self._accept('PLUS') or self._accept('MINUS'):
op = self.tok.type
right = self.term()
if op == 'PLUS':
exprval = ('+', exprval, right)
elif op == 'MINUS':
exprval = ('-', exprval, right)
return exprval
def term(self):
"term ::= factor { ('*'|'/') factor }"
termval = self.factor()
while self._accept('TIMES') or self._accept('DIVIDE'):
op = self.tok.type
right = self.factor()
if op == 'TIMES':
termval = ('*', termval, right)
elif op == 'DIVIDE':
termval = ('/', termval, right)
return termval
def factor(self):
'factor ::= NUM | ( expr )'
if self._accept('NUM'):
return int(self.tok.value)
elif self._accept('LPAREN'):
exprval = self.expr()
self._expect('RPAREN')
return exprval
else:
raise SyntaxError('Expected NUMBER or LPAREN')
``````

``````>>> e = ExpressionTreeBuilder()
>>> e.parse('2 + 3')
('+', 2, 3)
>>> e.parse('2 + 3 * 4')
('+', 2, ('*', 3, 4))
>>> e.parse('2 + (3 + 4) * 5')
('+', 2, ('*', ('+', 3, 4), 5))
>>> e.parse('2 + 3 + 4')
('+', ('+', 2, 3), 4)
>>>
``````

## 讨论

``````expr ::= term { ('+'|'-') term }*
term ::= factor { ('*'|'/') factor }*
factor ::= '(' expr ')'
| NUM
``````

``````class ExpressionEvaluator:
...
def expr(self):
...
def term(self):
...
def factor(self):
...
``````

• 如果规则中的下个符号是另外一个语法规则的名字 (比如 term 或 factor)，就简单的调用同名的方法即可。这就是该算法中” 下降” 的由来 - 控制下降到另一个语法规则中去。有时候规则会调用已经执行的方法 (比如，在 factor ::= '('expr')' 中对 expr 的调用)。这就是算法中” 递归” 的由来。
• 如果规则中下一个符号是个特殊符号 (比如 ()，你得查找下一个令牌并确认是一个精确匹配)。如果不匹配，就产生一个语法错误。这一节中的 expect() 方法就是用来做这一步的。
• 如果规则中下一个符号为一些可能的选择项 (比如 + 或 -)，你必须对每一种可能情况检查下一个令牌，只有当它匹配一个的时候才能继续。这也是本节示例中accept() 方法的目的。它相当于 expect() 方法的弱化版本，因为如果一个匹配找到了它会继续，但是如果没找到，它不会产生错误而是回滚 (允许后续的检查继续进行)。
• 对于有重复部分的规则 (比如在规则表达式 ::= term { ('+'|'-') term }* 中)，重复动作通过一个 while 循环来实现。循环主体会收集或处理所有的重复元素直到没有其他元素可以找到。
• 一旦整个语法规则处理完成，每个方法会返回某种结果给调用者。这就是在解析过程中值是怎样累加的原理。比如，在表达式求值程序中，返回值代表表达式解析后的部分结果。最后所有值会在最顶层的语法规则方法中合并起来。

``````items ::= items ',' item
| item
``````

``````def items(self):
itemsval = self.items()
if itemsval and self._accept(','):
itemsval.append(self.item())
else:
itemsval = [ self.item() ]
``````

``````expr ::= factor { ('+'|'-'|'*'|'/') factor }*
factor ::= '(' expression ')'
| NUM
``````

``````# plyexample.py
#
# Example of parsing with PLY
from ply.lex import lex
from ply.yacc import yacc
# Token list
tokens = [ 'NUM', 'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'LPAREN', 'RPAREN' ]
# Ignored characters
t_ignore = ' \t\n'
# Token specifications (as regexs)
t_PLUS = r'\+'
t_MINUS = r'-'
t_TIMES = r'\*'
t_DIVIDE = r'/'
t_LPAREN = r'\('
t_RPAREN = r'\)'
# Token processing functions
def t_NUM(t):
r'\d+'
t.value = int(t.value)
return t
# Error handler
def t_error(t):
t.skip(1)
# Build the lexer
lexer = lex()
# Grammar rules and handler functions
def p_expr(p):
'''
expr : expr PLUS term
| expr MINUS term
'''
if p[2] == '+':
p[0] = p[1] + p[3]
elif p[2] == '-':
p[0] = p[1] - p[3]
def p_expr_term(p):
'''
expr : term
'''
p[0] = p[1]
def p_term(p):
'''
term : term TIMES factor
| term DIVIDE factor
'''
if p[2] == '*':
p[0] = p[1] * p[3]
elif p[2] == '/':
p[0] = p[1] / p[3]
def p_term_factor(p):
'''
term : factor
'''
p[0] = p[1]
def p_factor(p):
'''
factor : NUM
'''
p[0] = p[1]
def p_factor_group(p):
'''
factor : LPAREN expr RPAREN
'''
p[0] = p[2]
def p_error(p):
print('Syntax error')
parser = yacc()
``````

``````>>> parser.parse('2')
2
>>> parser.parse('2+3')
5
>>> parser.parse('2+(3+4)*5')
37
>>>
``````

https://my.oschina.net/jallenkwong/blog/4717504