mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-19 12:58:07 +00:00
183 lines
7 KiB
Python
183 lines
7 KiB
Python
import bisect
import re

from .Optimizer import optimize
from .Token import Token
|
|
|
|
|
|
class LexicalErrorException(Exception):
    """Raised when the input text contains a character no lexer rule can identify."""
|
|
|
|
|
|
def analyze(text):
    """
    Tokenize *text* and return the list of Tokens found in it.

    At every position all rules are tried and the longest match wins; ties
    are broken by rule order, which is why keywords are listed before the
    generic identifier rule.  WHITESPACE and COMMENT matches are consumed
    but never emitted.

    :param text: source code string to tokenize
    :returns: list of Token objects (with 1-based line/column positions)
    :raises LexicalErrorException: in case of lexical error
    """
    # All patterns are raw strings: the previous plain-string forms such as
    # '\s+' and '\!' are invalid escape sequences in ordinary string
    # literals and trigger SyntaxWarning on modern Python.
    rules = [
        (r'\s+', Token.WHITESPACE),
        (r'void', Token.VOID),
        (r'int', Token.INT),
        (r'bool', Token.INT),  # treat bool as int
        (r'char', Token.INT),  # treat char as int

        (r'true', Token.TRUE),
        (r'false', Token.FALSE),
        (r'&&', Token.AND),
        (r'\|\|', Token.OR),
        (r'!', Token.NOT),
        (r'return', Token.RETURN),
        (r'if', Token.IF),
        (r'else', Token.ELSE),
        (r'while', Token.WHILE),
        (r'for', Token.FOR),
        (r'do', Token.DO),
        (r'print', Token.PRINT),
        (r'switch', Token.SWITCH),
        (r'case', Token.CASE),
        (r'default', Token.DEFAULT),
        (r'break', Token.BREAK),
        (r'continue', Token.CONTINUE),  # todo
        (r':', Token.COLON),
        (r';', Token.SEMICOLON),
        (r',', Token.COMMA),

        (r'\(', Token.LPAREN),
        (r'\)', Token.RPAREN),
        (r'\{', Token.LBRACE),
        (r'\}', Token.RBRACE),
        (r'\[', Token.LBRACK),
        (r'\]', Token.RBRACK),
        (r'=|\+=|-=|\*=|/=|%=|<<=|>>=|&=|\|=|\^=', Token.ASSIGN),
        (r'\?', Token.TERNARY),

        (r'<=|>=|==|!=|<|>', Token.RELOP),
        (r'\+\+', Token.INCREMENT),
        (r'--', Token.DECREMENT),
        (r'\+|-|\*|/|%', Token.BINOP),
        (r'\*\*|//|%%', Token.UNARY_MULTIPLICATIVE),

        (r'<<|>>', Token.BITWISE_SHIFT),
        (r'~', Token.BITWISE_NOT),
        (r'&', Token.BITWISE_AND),
        (r'\|', Token.BITWISE_OR),
        (r'\^', Token.BITWISE_XOR),

        (r'([a-zA-Z_][a-zA-Z0-9_]*)', Token.ID),
        (r'(\d+)', Token.NUM),
        (r'(0x[A-Fa-f\d]+)', Token.NUM),  # hexadecimal number
        (r'(0o[0-7]+)', Token.NUM),       # octal number
        (r'(0b[01]+)', Token.NUM),        # binary number
        (r'\"(\\\"|[^"])*"', Token.STRING),
        (r'\'(\\\'|(\\)?[^\'])\'', Token.CHAR),
        (r'//.*(\n|$)', Token.COMMENT),
        (r'/\*[\s\S]*?\*/', Token.COMMENT),  # multiline comments
        (r'.', Token.UNIDENTIFIED)
    ]

    rules = [(re.compile(r), t) for r, t in rules]

    tokens = []

    # offsets (from the beginning of the text) of the start of each line;
    # sorted, so line lookup per token is a single binary search.
    line_starts = [0] + [m.end() for m in re.finditer(r'\n', text)]

    i = 0
    while i < len(text):
        # pick the token that fits the longest match;
        # if tie - pick the one defined first in the rules list
        # (zero-width matches, ending at i, are ignored)
        best_match, best_token = None, None
        for regex, token_type in rules:
            m = regex.match(text, i)
            if m and m.end() > i and (best_match is None or m.end() > best_match.end()):
                best_match, best_token = m, token_type

        if best_match is None:
            # defensive: cannot happen while the catch-all '.' rule exists,
            # but an AttributeError here would be far less helpful
            raise LexicalErrorException("No rule matches at offset %d" % i)

        # calculate line and column (humans count from 1 :)
        pos = best_match.start()
        row = bisect.bisect_right(line_starts, pos) - 1
        line, column = row + 1, pos - line_starts[row] + 1

        if best_token in (Token.COMMENT, Token.WHITESPACE):
            pass  # do nothing
        elif best_token == Token.UNIDENTIFIED:
            raise LexicalErrorException("Unidentified Character '%s' (line %s column %s)" % (text[i], line, column))
        elif best_token in (Token.STRING, Token.CHAR):
            # remove quotes at beginning and end, un-escape characters
            tokens.append(Token(best_token, line, column,
                                best_match.group()[1:-1].encode("utf8").decode("unicode_escape")))
        elif best_token in (Token.NUM, Token.ID, Token.BINOP, Token.RELOP,
                            Token.ASSIGN, Token.UNARY_MULTIPLICATIVE, Token.BITWISE_SHIFT):
            tokens.append(Token(best_token, line, column, best_match.group()))
        else:
            tokens.append(Token(best_token, line, column))

        i = best_match.end()

    return tokens
|
|
|
|
|
|
def tests():
    """Smoke tests for the lexer and the constant-folding optimizer."""

    def check_types(actual, expected_types):
        # produced tokens must line up one-to-one with the expected type sequence
        assert len(actual) == len(expected_types) and \
            all(tok.type == exp for tok, exp in zip(actual, expected_types))

    def test1():
        # test token priorities: INT should not be confused with ID even if ID contains "int"
        res = analyze("my international int ; int; pints; international;")
        check_types(res, [Token.ID, Token.ID, Token.INT, Token.SEMICOLON, Token.INT,
                          Token.SEMICOLON, Token.ID, Token.SEMICOLON, Token.ID,
                          Token.SEMICOLON])

    def test2():
        res = analyze("true !||!false falsek k||y+-a&&x")
        check_types(res, [Token.TRUE, Token.NOT, Token.OR, Token.NOT, Token.FALSE,
                          Token.ID, Token.ID, Token.OR, Token.ID, Token.BINOP,
                          Token.BINOP, Token.ID, Token.AND, Token.ID])

    def test3():
        binexpr = [Token.NUM, Token.BINOP, Token.NUM]

        toks = analyze("1+2")
        check_types(toks, binexpr)
        optimize(toks)
        assert len(toks) == 1 and toks[0].type == Token.NUM and toks[0].data == "3"

        toks = analyze("1+2+3")
        check_types(toks, binexpr + [Token.BINOP, Token.NUM])
        optimize(toks)
        assert len(toks) == 1 and toks[0].type == Token.NUM and toks[0].data == "6"

        # make sure it is not optimized to 9 (3*3)
        toks = analyze("1+2*3")
        check_types(toks, binexpr + [Token.BINOP, Token.NUM])
        optimize(toks)
        assert len(toks) == 1 and toks[0].type == Token.NUM and toks[0].data == "7"

        # test all arithmetic operations
        toks = analyze("(1+2*3/6)+(1%3)*(6-1)")
        check_types(toks, [Token.LPAREN, Token.NUM, Token.BINOP, Token.NUM, Token.BINOP,
                           Token.NUM, Token.BINOP, Token.NUM, Token.RPAREN, Token.BINOP,
                           Token.LPAREN, Token.NUM, Token.BINOP, Token.NUM, Token.RPAREN,
                           Token.BINOP, Token.LPAREN, Token.NUM, Token.BINOP, Token.NUM,
                           Token.RPAREN])
        optimize(toks)
        assert toks[1].data == "2" and toks[5].data == "1" and toks[9].data == "5"

    # todo find a better way to test?
    test1()
    test2()
    test3()
|
|
|
|
|
|
# Run the self-tests when this module is executed directly.
if __name__ == '__main__':
    tests()
|