Restructure {reasoning_gym, tests}/{core, exercises, curricula}

This commit is contained in:
EduardDurech 2025-02-02 21:14:29 +00:00
parent 8b0f634f4c
commit 10dbb374b0
110 changed files with 0 additions and 0 deletions

View file

@ -1,193 +0,0 @@
#!/usr/bin/env python3
from .Exceptions import BFSemanticError, BFSyntaxError
from .FunctionCompiler import FunctionCompiler
from .Functions import check_function_exists, get_function_object, insert_function_object
from .General import get_literal_token_code, is_token_literal, unpack_literal_tokens_to_array_dimensions
from .Globals import (
create_variable_from_definition,
get_global_variables_size,
get_variable_dimensions,
get_variable_size,
insert_global_variable,
)
from .Lexical_analyzer import analyze
from .LibraryFunctionCompiler import insert_library_functions
from .Optimizer import optimize
from .Parser import Parser
from .Token import Token
"""
This file is responsible for creating FunctionCompiler objects and global variables objects
And finally, return the code of the main function
"""
class Compiler:
    """
    Drives the whole compilation: creates FunctionCompiler objects and global
    variable objects from the token stream, then returns the code of the
    main function (plus global-variable initialization).
    """

    def __init__(self, code, optimize_code=False):
        # lex the source; optionally constant-fold the token stream before parsing
        tokens = analyze(code)
        if optimize_code:
            optimize(tokens)
        self.parser = Parser(tokens)

    # global variables and functions
    def create_function_object(self):
        # function: (INT | VOID) ID LPAREN expression_list RPAREN LBRACE statements RBRACE
        # returns function named tuple
        if self.parser.current_token().type not in [Token.VOID, Token.INT]:
            raise BFSemanticError(
                "Function return type can be either void or int, not '%s'" % str(self.parser.current_token())
            )
        self.parser.check_next_tokens_are([Token.ID, Token.LPAREN])

        # save all tokens of this function
        function_name = self.parser.next_token(next_amount=1).data
        RPAREN_index = self.parser.find_matching(
            starting_index=self.parser.current_token_index + 2
        )  # first find RPAREN
        self.parser.check_next_token_is(Token.LBRACE, starting_index=RPAREN_index)
        RBRACE_index = self.parser.find_matching(starting_index=RPAREN_index + 1)  # then find RBRACE

        # take all tokens between INT and RBRACE and pass them to function object
        function_tokens = self.parser.tokens[self.parser.current_token_index : RBRACE_index + 1]
        # skip function definition
        self.parser.advance_to_token_at_index(RBRACE_index + 1)

        function = FunctionCompiler(function_name, function_tokens)
        return function

    def compile_global_variable_definition(self):
        # INT ID (ASSIGN NUM | (LBRACK NUM RBRACK)+ (ASSIGN LBRACE ... RBRACE)?)? SEMICOLON
        # returns code that initializes this variable, and advances pointer according to variable size
        self.parser.check_current_tokens_are([Token.INT, Token.ID])
        ID_token = self.parser.next_token()
        variable = create_variable_from_definition(self.parser, advance_tokens=True)
        insert_global_variable(variable)

        # if this is set to False, then the compiler assumes that initially all cells are set to zero
        # if this is set to True, then the compiler zeros each cell before using it (may generate a lot of unnecessary BF code)
        ZERO_CELLS_BEFORE_USE = False
        code = "[-]" if ZERO_CELLS_BEFORE_USE else ""

        if get_variable_size(variable) > 1:  # its an array
            if self.parser.current_token().type == Token.SEMICOLON:
                # array definition - INT ID (LBRACK NUM RBRACK)+ SEMICOLON
                self.parser.advance_token()  # skip SEMICOLON
                code = (code + ">") * get_variable_size(variable)  # advance to after this variable
                return code
            elif self.parser.current_token().type == Token.ASSIGN and self.parser.current_token().data == "=":
                # array definition and initialization - INT ID (LBRACK NUM RBRACK)+ ASSIGN ((LBRACE ... RBRACE)+|STRING) SEMICOLON
                self.parser.advance_token()  # skip ASSIGN

                if self.parser.current_token().type not in [Token.LBRACE, Token.STRING]:
                    raise BFSyntaxError("Expected LBRACE or STRING at '%s'" % self.parser.current_token())

                literal_tokens_list = self.parser.compile_array_initialization_list()
                self.parser.check_current_token_is(Token.SEMICOLON)
                self.parser.advance_token()  # skip SEMICOLON

                array_dimensions = get_variable_dimensions(variable)
                # flatten the (possibly nested) initialization list into one literal per cell
                unpacked_literals_list = unpack_literal_tokens_to_array_dimensions(
                    ID_token, array_dimensions, literal_tokens_list
                )

                for literal in unpacked_literals_list:
                    code += get_literal_token_code(literal)  # evaluate this literal and point to next array element
                return code
            else:
                raise BFSyntaxError(
                    "Unexpected %s in array definition. Expected SEMICOLON (;) or ASSIGN (=)"
                    % self.parser.current_token()
                )
        elif self.parser.current_token().type == Token.SEMICOLON:  # no need to initialize
            self.parser.advance_token()  # skip SEMICOLON
            code += ">"  # advance to after this variable
        else:
            # scalar with initializer: INT ID ASSIGN literal SEMICOLON
            self.parser.check_current_token_is(Token.ASSIGN)
            if self.parser.current_token().data != "=":
                raise BFSyntaxError(
                    "Unexpected %s when initializing global variable. Expected ASSIGN (=)" % self.parser.current_token()
                )
            self.parser.advance_token()  # skip ASSIGN

            if not is_token_literal(self.parser.current_token()):
                raise BFSemanticError(
                    "Unexpected '%s'. expected literal (NUM | CHAR | TRUE | FALSE )" % str(self.parser.current_token())
                )

            code += get_literal_token_code(self.parser.current_token())
            self.parser.check_next_token_is(Token.SEMICOLON)
            self.parser.advance_token(amount=2)  # skip (NUM|CHAR|TRUE|FALSE) SEMICOLON
        return code

    def process_global_definitions(self):
        """
        Iterate through all tokens
        When encountering function definition - create Function object and pass it the function's tokens
        When encountering global variable definition - create Variable object
        Returns code that initializes global variables and advances the pointer to after them
        """
        code = ""
        token = self.parser.current_token()
        while token is not None and token.type in [Token.VOID, Token.INT, Token.SEMICOLON]:
            if token.type == Token.SEMICOLON:  # can have random semicolons ;)
                self.parser.advance_token()
                token = self.parser.current_token()
                continue

            self.parser.check_next_token_is(Token.ID)
            # a '(' after the identifier means a function; otherwise a global variable
            if self.parser.next_token(next_amount=2).type == Token.LPAREN:
                function = self.create_function_object()
                insert_function_object(function)
            elif token.type is Token.INT and self.parser.next_token(next_amount=2).type in [
                Token.SEMICOLON,
                Token.ASSIGN,
                Token.LBRACK,
            ]:
                code += self.compile_global_variable_definition()
            else:
                raise BFSyntaxError(
                    "Unexpected '%s' after '%s'. Expected '(' (function definition) or one of: '=', ';', '[' (global variable definition)"
                    % (str(self.parser.next_token(next_amount=2)), str(self.parser.next_token()))
                )
            token = self.parser.current_token()

        if self.parser.current_token() is not None:  # we have not reached the last token
            untouched_tokens = [str(t) for t in self.parser.tokens[self.parser.current_token_index :]]
            raise BFSyntaxError("Did not reach the end of the code. Untouched tokens:\n%s" % untouched_tokens)
        return code

    def compile(self):
        """Compile the whole program and return its Brainfuck code (string)."""
        insert_library_functions()
        code = (
            self.process_global_definitions()
        )  # code that initializes global variables and advances pointer to after them

        # the entry point "main" must exist and take no parameters
        check_function_exists(Token(Token.ID, 0, 0, "main"), 0)
        code += get_function_object("main").get_code(get_global_variables_size())
        code += "<" * get_global_variables_size()  # point to the first cell to end the program nicely :)
        return code
def compile(code, optimize_code=False):
    """
    Translate C-like source code into Brainfuck.

    :param code: C-like code (string)
    :param optimize_code: syntax optimization (bool)
    :return code: Brainfuck code (string)
    """
    return Compiler(code, optimize_code).compile()
if __name__ == "__main__":
    # this module is a library, not a script - explain how to use it instead
    for message in (
        "This file cannot be directly run",
        "Please import it and use the 'compile' function",
        "Which receives a C-like code (string) and returns Brainfuck code (string)",
    ):
        print(message)

View file

@ -1,6 +0,0 @@
class BFSyntaxError(Exception):
    """Raised when the token stream does not form valid syntax."""
class BFSemanticError(Exception):
    """Raised when syntactically valid code has an invalid meaning (e.g. undefined function)."""

File diff suppressed because it is too large Load diff

View file

@ -1,37 +0,0 @@
from copy import deepcopy
from .Exceptions import BFSemanticError
# Global registry mapping function_name --> FunctionCompiler objects
functions = dict()


def insert_function_object(function):
    """Register *function* in the global registry, keyed by its name."""
    functions[function.name] = function
def get_function_object(name):
    """
    Return a deep copy of the registered function named *name*.

    A copy (rather than the shared registry object) is required because
    functions may be compiled recursively, for example:

        int increase(int n) { return n+1;}
        int main() {int x = increase(increase(1));}

    While the outer call is still being compiled, the inner call starts a
    compilation of the same function object; working on separate copies keeps
    their current-token pointers (and similar state) independent.
    """
    return deepcopy(functions[name])
def check_function_exists(function_token, parameters_amount):
    """
    Validate a function call against the registry.

    Raises BFSemanticError if no function with that name was registered, or
    if the call supplies a different number of parameters than the definition.
    """
    name = function_token.data
    if name not in functions:
        raise BFSemanticError("Function '%s' is undefined" % str(function_token))

    defined_amount = len(functions[name].parameters)
    if defined_amount != parameters_amount:
        raise BFSemanticError(
            "Function '%s' has %s parameters (called it with %s parameters)"
            % (str(function_token), defined_amount, parameters_amount)
        )

File diff suppressed because it is too large Load diff

View file

@ -1,83 +0,0 @@
from collections import namedtuple
from .General import dimensions_to_size, get_NUM_token_value
from .Token import Token
"""
This file holds the program's functions and global variables
(as global variables, hehe)
And related functions
"""
# Global list holding every global variable defined in the program
global_variables = list()


# variables
def get_global_variables():
    """Return the (shared) list of global variables."""
    return global_variables
def insert_global_variable(variable):
    """Append *variable* to the global variables list."""
    get_global_variables().append(variable)
def get_global_variables_size():
    """Return the total number of cells occupied by all global variables."""
    total = 0
    for variable in get_global_variables():
        total += get_variable_size(variable)
    return total
def create_variable(name, type, dimensions):
    """
    Create and return a new variable record.

    :param name: variable identifier (string)
    :param type: the variable's token type (the callers in this file pass Token.INT)
    :param dimensions: list of array dimension sizes ([1] for non-arrays)
    :return: mutable record with attributes name, type, dimensions, cell_index

    Bug fix: the original created a new namedtuple *class* on every call and
    assigned the attributes onto that class object - the declared fields
    (including "size", which was never assigned at all) were unused, and the
    "instance" returned was really a class.  A SimpleNamespace provides a
    genuine per-object, mutable record with the exact same attribute interface
    (cell_index must stay assignable, as callers update it later).
    """
    from types import SimpleNamespace

    return SimpleNamespace(
        name=name,
        type=type,
        dimensions=dimensions,  # list of array dimensions sizes (for non-arrays it will be [1])
        cell_index=None,  # will be updated when we insert this variable into an ids map
    )
def get_variable_size(variable):
    """Return the total number of cells the variable occupies, derived from its dimensions."""
    return dimensions_to_size(variable.dimensions)
def get_variable_dimensions(variable):
    """Return the list of dimension sizes recorded for *variable*."""
    return variable.dimensions
def is_variable_array(variable):
    """A variable is an array unless its dimensions are exactly [1]."""
    return not variable.dimensions == [1]
def create_variable_from_definition(parser, index=None, advance_tokens=False):
    """
    processes the variable definition at index, and returns the variable named tuple
    if index is None, then assumes we start at the current_token_index
    if advance_tokens is True, then modifies current_token_index accordingly using parser.advance_token()
    """
    if index is None:
        index = parser.current_token_index

    # a definition always starts with INT (bool/char are lexed as INT as well) followed by the identifier
    assert parser.tokens[index].type == Token.INT
    parser.check_next_token_is(Token.ID, starting_index=index)
    ID = parser.tokens[index + 1].data
    if advance_tokens:
        parser.advance_token(amount=2)  # skip INT ID

    if parser.tokens[index + 2].type == Token.LBRACK:  # array (support multi-dimensional arrays)
        dimensions = []  # element[i] holds the size of dimension[i]
        # consume one (LBRACK NUM RBRACK) group per dimension
        while parser.tokens[index + 2].type == Token.LBRACK:
            parser.check_next_tokens_are([Token.LBRACK, Token.NUM, Token.RBRACK], starting_index=index + 1)
            dimensions.append(get_NUM_token_value(parser.tokens[index + 3]))
            if advance_tokens:
                parser.advance_token(amount=3)  # skip LBRACK NUM RBRACK
            index += 3  # move past this dimension; the loop condition re-checks for another LBRACK
    else:
        # scalars are represented as a single-cell "array"
        dimensions = [1]

    return create_variable(ID, Token.INT, dimensions)

View file

@ -1,233 +0,0 @@
import re
from .Optimizer import optimize
from .Token import Token
class LexicalErrorException(Exception):
    """Raised when the lexer meets a character it cannot tokenize."""
def analyze(text):
    """
    :returns list of tokens in the text
    raises exception in case of lexical error

    Tokenization strategy: at each position, try every rule and keep the
    longest match; on a tie, the rule defined earlier in the list wins
    (so keywords beat the generic ID rule at equal length).
    """
    # each rule is (regex, token type)
    rules = [
        (r"\s+", Token.WHITESPACE),
        ("void", Token.VOID),
        ("int", Token.INT),
        ("bool", Token.INT),  # treat bool as int
        ("char", Token.INT),  # treat char as int
        ("true", Token.TRUE),
        ("false", Token.FALSE),
        ("&&", Token.AND),
        (r"\|\|", Token.OR),
        (r"\!", Token.NOT),
        ("return", Token.RETURN),
        ("if", Token.IF),
        ("else", Token.ELSE),
        ("while", Token.WHILE),
        ("for", Token.FOR),
        ("do", Token.DO),
        ("print", Token.PRINT),
        ("switch", Token.SWITCH),
        ("case", Token.CASE),
        ("default", Token.DEFAULT),
        ("break", Token.BREAK),
        ("continue", Token.CONTINUE),  # todo
        (":", Token.COLON),
        (";", Token.SEMICOLON),
        (",", Token.COMMA),
        (r"\(", Token.LPAREN),
        (r"\)", Token.RPAREN),
        (r"\{", Token.LBRACE),
        (r"\}", Token.RBRACE),
        (r"\[", Token.LBRACK),
        (r"\]", Token.RBRACK),
        (r"=|\+=|-=|\*=|/=|%=|<<=|>>=|&=|\|=|\^=", Token.ASSIGN),
        (r"\?", Token.TERNARY),
        (r"<=|>=|==|!=|<|>", Token.RELOP),
        (r"\+\+", Token.INCREMENT),
        ("--", Token.DECREMENT),
        (r"\+|-|\*|/|%", Token.BINOP),
        (r"\*\*|//|%%", Token.UNARY_MULTIPLICATIVE),
        ("<<|>>", Token.BITWISE_SHIFT),
        ("~", Token.BITWISE_NOT),
        ("&", Token.BITWISE_AND),
        (r"\|", Token.BITWISE_OR),
        (r"\^", Token.BITWISE_XOR),
        ("([a-zA-Z_][a-zA-Z0-9_]*)", Token.ID),
        (r"(\d+)", Token.NUM),
        (r"(0x[A-Fa-f\d]+)", Token.NUM),  # hexadecimal number
        ("(0o[0-7]+)", Token.NUM),  # octal number
        ("(0b[01]+)", Token.NUM),  # binary number
        (r'\"(\\\"|[^"])*"', Token.STRING),
        (r"\'(\\\'|(\\)?[^\'])\'", Token.CHAR),
        ("//.*(\\n|$)", Token.COMMENT),
        (r"/\*[\s\S]*?\*/", Token.COMMENT),  # multiline comments
        (".", Token.UNIDENTIFIED),  # fallback: any other single character is a lexical error
    ]
    rules = [(re.compile(r), t) for r, t in rules]
    tokens = []

    # create a mapping of [line number] to [offset of that line from the beginning of the text]
    newline = re.compile("\n")
    lines = [0] + [m.end() for m in re.finditer(newline, text)]

    i = 0
    while i < len(text):
        # collect every rule that matches at the current offset
        current_matches = []
        for regex, token_type in rules:
            m = regex.match(text, i)
            if m:
                current_matches.append((m, token_type))

        # pick the token that fits the longest match
        # if tie - pick the one defined first in the rules list
        longest_match, max_i, matched_token = None, i, None
        for match, token_type in current_matches:
            if match.end() > max_i:
                longest_match, max_i, matched_token = match, match.end(), token_type

        # calculate line and column
        line, column = None, None
        for line_idx in range(len(lines) - 1):
            if lines[line_idx] <= longest_match.start() < lines[line_idx + 1]:
                line, column = line_idx + 1, (longest_match.start() - lines[line_idx]) + 1  # humans count from 1 :)
                break
        if not line:
            # the match starts on the last line of the text
            line, column = len(lines), (longest_match.start() - lines[-1]) + 1

        if matched_token in [Token.COMMENT, Token.WHITESPACE]:
            pass  # do nothing
        elif matched_token == Token.UNIDENTIFIED:
            raise LexicalErrorException("Unidentified Character '%s' (line %s column %s)" % (text[i], line, column))
        elif matched_token in [Token.STRING, Token.CHAR]:
            # remove quotes at beginning and end, un-escape characters
            tokens.append(
                Token(matched_token, line, column, longest_match.group()[1:-1].encode("utf8").decode("unicode_escape"))
            )
        elif matched_token in [
            Token.NUM,
            Token.ID,
            Token.BINOP,
            Token.RELOP,
            Token.ASSIGN,
            Token.UNARY_MULTIPLICATIVE,
            Token.BITWISE_SHIFT,
        ]:
            # token types whose exact text matters - store the matched lexeme as data
            tokens.append(Token(matched_token, line, column, longest_match.group()))
        else:
            tokens.append(Token(matched_token, line, column))
        i = longest_match.end()
    return tokens
def tests():
    """Self-tests for the lexer (and the constant-folding optimizer)."""

    def test1():
        # test token priorities: INT should not be confused with ID even if ID contains "int"
        text = "my international int ; int; pints; international;"
        res = analyze(text)
        expected = [
            Token.ID,
            Token.ID,
            Token.INT,
            Token.SEMICOLON,
            Token.INT,
            Token.SEMICOLON,
            Token.ID,
            Token.SEMICOLON,
            Token.ID,
            Token.SEMICOLON,
        ]
        assert len(res) == len(expected) and all(res[i].type == expected[i] for i in range(len(res)))

    def test2():
        # boolean / logical operators mixed with identifiers, with no separating whitespace
        text = "true !||!false falsek k||y+-a&&x"
        res = analyze(text)
        expected = [
            Token.TRUE,
            Token.NOT,
            Token.OR,
            Token.NOT,
            Token.FALSE,
            Token.ID,
            Token.ID,
            Token.OR,
            Token.ID,
            Token.BINOP,
            Token.BINOP,
            Token.ID,
            Token.AND,
            Token.ID,
        ]
        assert len(res) == len(expected) and all(res[i].type == expected[i] for i in range(len(res)))

    def test3():
        # constant folding: simple addition collapses to one NUM token
        text = "1+2"
        tokens = analyze(text)
        expected = [Token.NUM, Token.BINOP, Token.NUM]
        assert len(tokens) == len(expected) and all(tokens[i].type == expected[i] for i in range(len(tokens)))
        optimize(tokens)
        assert len(tokens) == 1 and tokens[0].type == Token.NUM and tokens[0].data == "3"

        text = "1+2+3"
        tokens = analyze(text)
        expected = [Token.NUM, Token.BINOP, Token.NUM, Token.BINOP, Token.NUM]
        assert len(tokens) == len(expected) and all(tokens[i].type == expected[i] for i in range(len(tokens)))
        optimize(tokens)
        assert len(tokens) == 1 and tokens[0].type == Token.NUM and tokens[0].data == "6"

        # make sure it is not optimized to 9 (3*3)
        text = "1+2*3"
        tokens = analyze(text)
        expected = [Token.NUM, Token.BINOP, Token.NUM, Token.BINOP, Token.NUM]
        assert len(tokens) == len(expected) and all(tokens[i].type == expected[i] for i in range(len(tokens)))
        optimize(tokens)
        assert len(tokens) == 1 and tokens[0].type == Token.NUM and tokens[0].data == "7"

        # test all arithmetic operations
        text = "(1+2*3/6)+(1%3)*(6-1)"
        tokens = analyze(text)
        expected = [
            Token.LPAREN,
            Token.NUM,
            Token.BINOP,
            Token.NUM,
            Token.BINOP,
            Token.NUM,
            Token.BINOP,
            Token.NUM,
            Token.RPAREN,
            Token.BINOP,
            Token.LPAREN,
            Token.NUM,
            Token.BINOP,
            Token.NUM,
            Token.RPAREN,
            Token.BINOP,
            Token.LPAREN,
            Token.NUM,
            Token.BINOP,
            Token.NUM,
            Token.RPAREN,
        ]
        assert len(tokens) == len(expected) and all(tokens[i].type == expected[i] for i in range(len(tokens)))
        optimize(tokens)
        # parenthesized sub-expressions are folded in place
        assert tokens[1].data == "2" and tokens[5].data == "1" and tokens[9].data == "5"

    # todo find a better way to test?
    test1()
    test2()
    test3()
if __name__ == "__main__":
    # running this module directly executes the lexer's self-tests
    tests()

View file

@ -1,129 +0,0 @@
from .Functions import insert_function_object
from .Token import Token
class LibraryFunctionCompiler:
    """
    A built-in (library) function whose Brainfuck code is pre-generated.

    Mirrors the interface used for compiled functions: get_code() is called
    with the current stack pointer and returns the function's code (the stored
    code is returned as-is, without using the stack pointer).
    """

    def __init__(self, name, type, parameters, code):
        self.name = name  # function identifier
        self.type = type  # return token type
        self.parameters = parameters  # list of parameter token types
        self.code = code  # pre-generated Brainfuck code

    def get_code(self, current_stack_pointer):
        # the stored code is independent of the caller's stack pointer
        return self.code
def get_readint_code():
    # res, tmp, input, loop
    # tmp is used for multiplication
    """
    Return BF code that reads a decimal number from input into the current cell.

    Cell layout (relative to the current cell): res, tmp, input, loop.

    Pseudocode implemented below:
        res = 0
        loop = 1
        while loop
            loop = 0
            input = input()
            if input != newline  # todo add a eof check as well. run it in several interpreters to look for common ways for "end of number" input
                loop = 1
                res = res * 10 + char_to_digit(input)
    """
    code = "[-]"  # clear res = 0
    code += ">[-]"  # tmp = 0
    code += ">>[-]+"  # loop = 1
    code += "["  # while loop == 1
    code += "[-]"  # loop = 0
    code += "<"  # point to input
    code += ","  # input character
    code += "----------"  # sub 10 (check for newline)
    code += "["  # if input is not newline
    code += ">"  # point to loop
    code += "+"  # loop = 1
    # multiply res by 10 and add the input digit
    code += "<<<"  # point to res
    code += "[>+<-]"  # move res to tmp
    code += ">"  # point to tmp
    code += "[<++++++++++>-]"  # res = tmp * 10, tmp = 0
    code += ">"  # point to input
    code += "-" * (
        0x30 - 10
    )  # convert character to a digit by subtracting 0x30 from it (we already subtracted 10 before)
    code += "[<<+>>-]"  # res += input
    code += "]"  # end if
    code += ">"  # point to loop
    code += "]"  # end while
    code += "<<<"  # point to res
    return code
def get_printint_code():
    """
    Return BF code that prints the parameter cell's value as a decimal number,
    up to three digits with leading zeros suppressed (cell values are assumed
    to fit in three decimal digits - presumably 8-bit cells; TODO confirm).

    Cell layout: return_cell value_to_print_cell, followed by scratch cells
    used by the two divisions by 10.
    """
    # return_cell value_to_print_cell
    code = ">"  # point to value_to_print cell
    code += ">[-]" * 8 + "<" * 8  # zero some cells
    code += ">++++++++++<"  # div amount
    code += "[->-[>+>>]>[+[<+>-]>+>>]<<<<<]"  # value_to_print/10
    code += ">[-]"  # zero d-n%d
    code += ">>"  # point to div result
    code += ">++++++++++<"  # div amount
    code += "[->-[>+>>]>[+[<+>-]>+>>]<<<<<]"  # res/10
    code += ">[-]"  # zero d-n%d
    code += ">>"  # point to div result
    code += "["  # if the first digit is not 0
    code += ">++++++[<++++++++>-]<."  # add 48 to the first digit and print it
    code += "<<"
    code += "+>"  # set is_over_100 to true
    code += "+>"  # add 1 to the second digit so it prints even when it's 0
    code += "[-]"  # zero the first digit
    code += "]"  # end if
    code += "<"  # point to the second digit
    code += "["  # if the second digit is not 0
    code += "<[>-<-]"  # if is_over_100 is true then subtract 1 from the second digit
    code += "++++++[>++++++++<-]>."  # add 48 to the second digit and print it
    code += "[-]"  # zero the second digit
    code += "]"  # end if
    code += "<<"  # point to the cell after the third digit
    code += "++++++[<++++++++>-]<."  # add 48 to the third digit and print it
    code += "[-]"  # zero the third digit
    code += "<<"  # point to value_to_print_cell which is 0
    code += "<"  # point to return_cell
    return code
def get_readchar_code():
    """Return BF code that reads one input character into the "return value cell"."""
    # a single ',' reads into the current cell; no pointer movement is needed
    return ","
def get_printchar_code():
    """Return BF code that outputs the parameter cell as a character."""
    # point to parameter, output it, and then point back to the "return value cell"
    return ">.<"
def insert_library_functions():
    """Register all built-in library functions in the global functions table."""
    library = [
        ("readint", Token.INT, list(), get_readint_code()),
        ("printint", Token.VOID, [Token.INT], get_printint_code()),
        ("readchar", Token.INT, list(), get_readchar_code()),
        ("printchar", Token.VOID, [Token.INT], get_printchar_code()),
    ]
    for name, return_type, parameters, code in library:
        insert_function_object(LibraryFunctionCompiler(name, return_type, parameters, code))

View file

@ -1,14 +0,0 @@
def minify(code):
    """
    Shrink Brainfuck code by repeatedly removing no-op pairs ("><", "<>",
    "+-", "-+") and redundant cell-clears right after a loop ("][-]" -> "]",
    since the cell is already zero when a loop exits), until a fixed point
    is reached.
    """
    replacements = (
        ("><", ""),
        ("<>", ""),
        ("+-", ""),
        ("-+", ""),
        ("][-]", "]"),
    )
    while True:
        previous = code
        for old, new in replacements:
            code = code.replace(old, new)
        if code == previous:
            return code

View file

@ -1,437 +0,0 @@
from .Exceptions import BFSemanticError
from .General import (
get_copy_from_variable_code,
get_copy_to_variable_code,
get_literal_token_code,
get_move_left_index_cell_code,
get_move_right_index_cells_code,
get_offset_to_variable,
get_op_between_literals_code,
get_op_boolean_operator_code,
get_token_ID_code,
get_unary_postfix_op_code,
get_unary_prefix_op_code,
get_variable_dimensions_from_token,
is_token_literal,
unpack_literal_tokens_to_array_dimensions,
)
from .Token import Token
"""
This file holds classes that are used to create the parse tree of expressions
Each class implements a get_code() function that receives a "stack pointer" and returns code that evaluates the expression
"""
class Node:
    """
    Base class for expression parse-tree nodes.

    Every node snapshots the ids_map_list as it was when the expression was
    parsed, and exposes get_code(), which receives a "stack pointer" and
    returns code that evaluates the expression.
    """

    def __init__(self, ids_map_list):
        # keep our own copy of the list as it was at parse time
        self.ids_map_list = list(ids_map_list)

    def assign_token_to_op_token(self, assign_token):
        """
        Translate a compound-assignment token (e.g. "+=") into a NodeToken
        wrapping the corresponding operation token (e.g. BINOP "+").
        """
        assert assign_token.data in ["+=", "-=", "*=", "/=", "%=", "<<=", ">>=", "&=", "|=", "^="]
        line, column = assign_token.line, assign_token.column
        data = assign_token.data
        if data in ("+=", "-=", "*=", "/=", "%="):
            # arithmetic: the operator character is the first character of the lexeme
            op_token = Token(Token.BINOP, line, column, data=data[0])
        elif data in ("<<=", ">>="):
            # shifts: drop the trailing '='
            op_token = Token(Token.BITWISE_SHIFT, line, column, data=data[:-1])
        else:
            # bitwise and/or/xor have dedicated token types and carry no data
            bitwise_types = {"&=": Token.BITWISE_AND, "|=": Token.BITWISE_OR, "^=": Token.BITWISE_XOR}
            op_token = Token(bitwise_types[data], line, column)
        return NodeToken(self.ids_map_list, token=op_token)

    def get_code(self, *args, **kwargs):
        # overridden by every concrete node type
        pass
class NodeToken(Node):
    """
    Parse-tree node built around a single token.
    Leaves hold a literal or an ID; inner nodes hold an operator token
    together with .left and .right operand sub-trees.
    """

    def __init__(self, ids_map_list, left=None, token=None, right=None):
        Node.__init__(self, ids_map_list)
        self.left = left
        self.right = right
        self.token = token

    def get_code(self, current_pointer, *args, **kwargs):
        # returns the code that evaluates the parse tree
        if is_token_literal(self.token) or self.token.type == Token.ID:
            # its a literal (leaf)
            assert self.left is None and self.right is None
            if self.token.type == Token.ID:
                return get_token_ID_code(self.ids_map_list, self.token, current_pointer)
            else:
                return get_literal_token_code(self.token)
        elif self.token.type in [
            Token.BINOP,
            Token.RELOP,
            Token.BITWISE_SHIFT,
            Token.BITWISE_AND,
            Token.BITWISE_OR,
            Token.BITWISE_XOR,
        ]:
            # evaluate left operand into the current cell, right operand into the next cell
            code = self.left.get_code(current_pointer)
            code += self.right.get_code(current_pointer + 1)
            code += "<<"  # point to the first operand
            # pass the right operand's token (if it is a simple token) to the op-code generator
            right_token = None
            if isinstance(self.right, NodeToken):
                right_token = self.right.token
            code += get_op_between_literals_code(self.token, right_token)
            return code
        elif self.token.type in [Token.AND, Token.OR]:  # short-circuit evaluation treated differently
            return get_op_boolean_operator_code(self, current_pointer)
        elif self.token.type == Token.ASSIGN:
            assert self.left.token.type == Token.ID
            if self.token.data == "=":
                # id = expression
                code = self.right.get_code(current_pointer)
                # create code to copy from evaluated expression to ID's cell
                code += "<"  # point to evaluated expression cell
                code += get_copy_to_variable_code(self.ids_map_list, self.left.token, current_pointer)
                code += ">"  # point to next available cell
                return code
            else:
                assert self.token.data in ["+=", "-=", "*=", "/=", "%=", "<<=", ">>=", "&=", "|=", "^="]
                # id += expression
                # create a node for id + expression
                op_node = self.assign_token_to_op_token(self.token)
                op_node.left = self.left
                op_node.right = self.right
                # create a node for id = id + expression
                assign_token = Token(Token.ASSIGN, self.token.line, self.token.column, data="=")
                assignment_node = NodeToken(self.ids_map_list, left=self.left, token=assign_token, right=op_node)
                return assignment_node.get_code(current_pointer)
class NodeTernary(Node):
    """Node for the ternary conditional expression: condition ? node_true : node_false."""

    def __init__(self, ids_map_list, condition, node_true, node_false):
        # node_condition ? node_true : node_false;
        Node.__init__(self, ids_map_list)
        self.condition = condition
        self.node_true = node_true
        self.node_false = node_false

    def get_code(self, current_pointer, *args, **kwargs):
        # cells layout:
        # result, bool_evaluate_node_false, condition
        code = ">"  # point to bool_evaluate_node_false
        code += "[-]+"  # bool_evaluate_node_false=1
        code += ">"  # point to condition
        code += self.condition.get_code(current_pointer + 2)  # evaluate condition
        code += "<"  # point to condition
        code += "["  # if condition is non-zero
        code += "<<"  # point to result
        code += self.node_true.get_code(current_pointer)  # evaluate node_true
        # now we point to bool_evaluate_node_false
        code += "[-]"  # zero bool_evaluate_node_false
        code += ">"  # point to condition
        code += "[-]"  # zero condition
        code += "]"  # end if
        code += "<"  # point to bool_evaluate_node_false
        code += "["  # if bool_evaluate_node_false is 1
        code += "<"  # point to result
        code += self.node_false.get_code(current_pointer)  # evaluate node_false
        # now we point to bool_evaluate_node_false
        code += "[-]"  # zero bool_evaluate_node_false
        code += "]"  # end if
        # now we point to one cell after result - next available cell
        return code
class NodeUnaryPrefix(Node):
    """Node for unary prefix operations (e.g. !x, ++x, --x, ~x, -x)."""

    def __init__(self, ids_map_list, operation, literal):
        Node.__init__(self, ids_map_list)
        self.token_operation = operation
        self.node_literal = literal

    def get_code(self, current_pointer, *args, **kwargs):
        # unary prefix (!x or ++x or ~x or -x)
        assert self.token_operation.type in [
            Token.NOT,
            Token.INCREMENT,
            Token.DECREMENT,
            Token.UNARY_MULTIPLICATIVE,
            Token.BITWISE_NOT,
            Token.BINOP,
        ]
        if self.token_operation.type in [Token.NOT, Token.BITWISE_NOT, Token.BINOP]:
            # value-producing operators: evaluate the operand, then apply the op to its cell
            code = self.node_literal.get_code(current_pointer)
            code += "<"  # point to operand
            code += get_unary_prefix_op_code(self.token_operation)
            return code
        else:
            # its INCREMENT / DECREMENT / UNARY_MULTIPLICATIVE - the operand must be assignable
            if isinstance(self.node_literal, NodeArrayGetElement):
                # modify the array element in place, then bring the result back
                token_id, index_node = self.node_literal.token_id, self.node_literal.node_expression
                code = get_move_right_index_cells_code(current_pointer, index_node)
                offset_to_array = get_offset_to_variable(self.ids_map_list, token_id, current_pointer + 2)
                # it is +2 because in "get_move_right_index_cells_code", we moved 2 extra cells to the right, for retrieving the value
                code += get_unary_prefix_op_code(self.token_operation, offset_to_array)
                code += "<"  # point to res
                code += "[<<+>>-]"  # move res to old "index cell"
                code += "<"  # point to new index cell
                code += get_move_left_index_cell_code()
                return code

            # the token to apply on must be an ID
            if isinstance(self.node_literal, NodeToken) is False:
                raise BFSemanticError(
                    "Prefix operator %s can only be applied to a variable" % str(self.token_operation)
                )
            if self.node_literal.token.type != Token.ID:
                raise BFSemanticError(
                    "Prefix operator %s cannot be applied to %s, but only to a variable"
                    % (str(self.token_operation), str(self.node_literal.token))
                )
            offset_to_ID = get_offset_to_variable(self.ids_map_list, self.node_literal.token, current_pointer)
            return get_unary_prefix_op_code(self.token_operation, offset_to_ID)
class NodeUnaryPostfix(Node):
    """Node for unary postfix operations (e.g. x++, x--); the operand must be a variable or array element."""

    def __init__(self, ids_map_list, operation, literal):
        Node.__init__(self, ids_map_list)
        self.token_operation = operation
        self.node_literal = literal

    def get_code(self, current_pointer, *args, **kwargs):
        # its an unary postfix operation (x++)
        assert self.token_operation.type in [Token.INCREMENT, Token.DECREMENT, Token.UNARY_MULTIPLICATIVE]
        if isinstance(self.node_literal, NodeArrayGetElement):
            # modify the array element in place, then bring the result back
            token_id, index_node = self.node_literal.token_id, self.node_literal.node_expression
            code = get_move_right_index_cells_code(current_pointer, index_node)
            offset_to_array = get_offset_to_variable(self.ids_map_list, token_id, current_pointer + 2)
            # it is +2 because in "get_move_right_index_cells_code", we moved 2 extra cells to the right, for retrieving the value
            code += get_unary_postfix_op_code(self.token_operation, offset_to_array)
            code += "<"  # point to res
            code += "[<<+>>-]"  # move res to old "index cell"
            code += "<"  # point to new index cell
            code += get_move_left_index_cell_code()
            return code

        # the token to apply on must be an ID
        if isinstance(self.node_literal, NodeToken) is False:
            raise BFSemanticError("Postfix operator %s can only be applied to a variable" % str(self.token_operation))
        if self.node_literal.token.type != Token.ID:
            raise BFSemanticError(
                "Postfix operator %s cannot be applied to %s, but only to a variable"
                % (str(self.token_operation), str(self.node_literal.token))
            )
        offset_to_ID = get_offset_to_variable(self.ids_map_list, self.node_literal.token, current_pointer)
        return get_unary_postfix_op_code(self.token_operation, offset_to_ID)
class NodeFunctionCall(Node):
    """Node that evaluates a function call: parameters are laid out after a return-value cell."""

    def __init__(self, ids_map_list, function_to_call, parameters):
        """
        receives a FunctionCompiler object
        that implements get_code() which gets a stack pointer and returns code
        receives a list of parameters - Node objects
        each one gets a stack pointer and returns code that evaluates the parameter
        """
        Node.__init__(self, ids_map_list)
        self.function_to_call = function_to_call
        self.parameters = parameters

    def get_code(self, current_pointer, *args, **kwargs):
        # cell layout: return_value_cell, then one cell per parameter
        code = "[-]>"  # return_value_cell=0
        # evaluate parameters from left to right, and put them on the "stack" in that order
        # after each parameter code, the pointer points to the next available cell (one after the parameter)
        for i, parameter in enumerate(self.parameters):
            code += parameter.get_code(
                current_pointer + 1 + i
            )  # evaluate each parameter at its cell offset (starting at one after return_value_cell)

        # at this point we point to one after the last parameter
        code += "<" * len(self.parameters)  # point back to first parameter
        code += "<"  # point to return_value_cell
        code += self.function_to_call.get_code(
            current_stack_pointer=current_pointer
        )  # after this we point to return value cell
        code += ">"  # point to next available cell (one after return value)
        return code
class NodeArrayElement(Node):
    """
    Common base class for array element get/set nodes.

    The idea behind run-time indexing:
    1. evaluate the index expression - its value is only known at run time,
       so a little trick is needed
    2. move <index> steps to the right while counting how many steps were
       taken: hold an index cell and a steps_counter cell and slide them
       right together, decreasing the index and increasing the counter,
       e.g.: 4,0 --> 3,1 --> 2,2 --> 1,3 --> 0,4
       (move right until the index is 0; the counter then holds the old
       index) - this tells us both that we moved <index> steps and how many
       steps to go back when done
    3. move <offset from stack pointer to array> steps left to get/set the
       relevant array element; this offset IS known at compilation time
    """

    def __init__(self, ids_map_list):
        Node.__init__(self, ids_map_list)
class NodeArrayGetElement(NodeArrayElement):
    """
    class for getting element of a one-dimensional array
    it receives an expression, indicating the required index
    and returns a code that gets that element
    """

    def __init__(self, ids_map_list, token_id, node_expression):
        Node.__init__(self, ids_map_list)
        self.token_id = token_id
        self.node_expression = node_expression

    def get_code(self, current_pointer, *args, **kwargs):
        # walk right <index> cells (value known only at runtime), copy the element, then walk back
        code = get_move_right_index_cells_code(current_pointer, self.node_expression)
        code += get_copy_from_variable_code(self.ids_map_list, self.token_id, current_pointer + 2)
        # it is +2 because in "get_move_right_index_cells_code", we moved 2 extra cells to the right, for retrieving the value
        code += "<"  # point to res
        code += "[<<+>>-]"  # move res to old "index cell"
        code += "<"  # point to new index cell
        code += get_move_left_index_cell_code()
        return code
class NodeArraySetElement(NodeArrayElement):
    """
    class for setting element of a one-dimensional array
    it receives:
    1. an expression, indicating the required index
    2. assignment operator (=|+=|-=|*=|/=|%=|<<=|>>=|&=|(|=)|^=)
    3. an expression, indicating the value to be used for the assignment
    and returns code that sets that element
    """

    def __init__(self, ids_map_list, token_id, node_expression_index, assign_token, node_expression_value):
        # Compound assignments (e.g. +=) are desugared here into a plain "="
        # whose value expression is "id[exp] <op> expression".
        Node.__init__(self, ids_map_list)
        self.token_id = token_id
        self.node_expression_index = node_expression_index

        if assign_token.data == "=":
            # id[exp] = expression
            self.assign_token = assign_token
            self.node_expression_value = node_expression_value
        else:
            # id[exp] += expression
            assert assign_token.data in ["+=", "-=", "*=", "/=", "%=", "<<=", ">>=", "&=", "|=", "^="]
            self.assign_token = Token(Token.ASSIGN, assign_token.line, assign_token.column, data="=")

            # create a node for id[exp] + expression
            op_node = self.assign_token_to_op_token(assign_token)
            op_node.left = NodeArrayGetElement(self.ids_map_list, token_id, node_expression_index)
            op_node.right = node_expression_value
            self.node_expression_value = op_node

    def get_code(self, current_pointer, *args, **kwargs):
        """
        Emit code that evaluates the index and value expressions, walks right
        <index> cells while dragging the value along, writes the value into
        the array element, then walks the value's cell back.
        Working cell layout while walking right: index, steps_taken_counter, value.
        """
        # index, steps_taken_counter, value
        code = self.node_expression_index.get_code(current_pointer)
        code += "[-]"  # counter = 0
        code += ">"  # point to value cell
        code += self.node_expression_value.get_code(current_pointer + 2)
        code += "<<<"  # point to index

        # shift the (index, counter, value) triple one cell right per iteration,
        # until index reaches 0; counter then holds the original index
        code += "["  # while index != 0
        code += ">>>"  # point to new_value (one after current value)
        code += "[-]"  # zero new_value
        code += "<"  # move to old value
        code += "[>+<-]"  # move old value into the new value cell
        code += "<"  # point to old counter
        code += "+"  # increase old counter
        code += "[>+<-]"  # move old counter to new counter
        code += "<"  # point to old index
        code += "-"  # decrease old index
        code += "[>+<-]"  # move old index to new index
        code += ">"  # point to new index
        code += "]"  # end while

        code += ">>"  # point to value
        code += get_copy_to_variable_code(self.ids_map_list, self.token_id, current_pointer + 2)
        # it is +2 because we moved 2 extra cells to the right, for pointing to value

        # layout: 0, idx, value (pointing to value)
        # create layout: value, idx
        code += "[<<+>>-]"  # move value to old "index" cell (which is now 0)
        # value, index (pointing to one after index)
        code += "<"  # point to index

        # shift the (value, index) pair one cell left per iteration to walk back
        code += "["  # while index != 0
        code += "<"  # point to value
        code += "[<+>-]"  # move value to the left
        code += ">"  # point to index
        code += "-"  # sub 1 from index
        code += "[<+>-]"  # move index to left
        code += "<"  # point to index
        code += "]"  # end while

        # now value is at the desired cell, and we point to the next available cell
        return code
class NodeArrayAssignment(Node):
    """
    Whole-array assignment from an initialization list,
    e.g. arr = { 1, 2, 3... }
    """

    def __init__(self, ids_map_list, token_id, literal_tokens_list):
        Node.__init__(self, ids_map_list)
        self.token_id = token_id
        self.literal_tokens_list = literal_tokens_list

    def get_code(self, current_pointer, *args, **kwargs):
        dimensions = get_variable_dimensions_from_token(self.ids_map_list, self.token_id)
        flat_literals = unpack_literal_tokens_to_array_dimensions(
            self.token_id, dimensions, self.literal_tokens_list
        )
        offset = get_offset_to_variable(self.ids_map_list, self.token_id, current_pointer)

        pieces = ["<" * offset]  # walk left to the array's first element
        # each literal's code writes its value and leaves the pointer on the next element
        pieces.extend(get_literal_token_code(literal) for literal in flat_literals)
        pieces.append(">" * (offset - len(flat_literals)))  # walk back to the original position
        pieces.append(">")  # point to the next cell
        return "".join(pieces)

View file

@ -1,106 +0,0 @@
from .General import get_NUM_token_value
from .Token import Token
"""
This file holds functions that optimize code on syntax-level. For example:
The tokens corresponding to the code "3*5" will be replaced in-place by a token that represents "15"
"""
def optimize_once(tokens):
    """
    Perform one optimization pass over *tokens*, rewriting the list in-place.

    Optimizations are rule-based: each rule pairs a token-type pattern with a
    function that tries to rewrite the tokens at a matching position and
    returns True on success.
    """

    def optimize_binop(tokens, start_index):
        # Constant-fold an arithmetic operation, e.g. replace "1+2" with "3".
        # Folding must respect precedence AND left-associativity:
        #   - don't fold "1+2" in "1+2*3"  (the 2 binds to the "*")
        #   - don't fold "3-1" in "a*3-1"  (the 3 binds to the "*"; a*2 is wrong)
        #   - don't fold "3-1" in "a-3-1"  ((a-3)-1 != a-(3-1))
        op = tokens[start_index + 1].data
        op_is_multiplicative = op in ["*", "/", "%"]

        # Right-context check: if the token after the window is a
        # higher-precedence operator, the right operand binds to it.
        if not op_is_multiplicative:
            if start_index + 3 < len(tokens) and tokens[start_index + 3].data in ["*", "/", "%"]:
                return False

        # Left-context check: if the left operand is already the right operand
        # of a preceding operator of equal or higher precedence, folding would
        # change the association.
        if start_index >= 1 and tokens[start_index - 1].type == Token.BINOP:
            prev_op = tokens[start_index - 1].data
            if prev_op in ["*", "/", "%"]:
                return False  # left NUM binds to the multiplicative operator
            if prev_op == "-" and not op_is_multiplicative:
                return False  # subtraction is not associative: (x-a)+b != x-(a+b)

        num1, num2 = get_NUM_token_value(tokens[start_index]), get_NUM_token_value(tokens[start_index + 2])
        if op == "+":
            val = num1 + num2
        elif op == "-":
            val = num1 - num2
            if val < 0:  # cannot optimize negative values
                return False
        elif op == "*":
            val = num1 * num2
        elif op in ["/", "%"]:
            if num2 == 0:
                print("WARNING (optimizer) - division by zero at %s" % str(tokens[start_index]))
                return False
            val = num1 // num2 if op == "/" else num1 % num2
        else:
            raise NotImplementedError(op)

        # replace the 3 old tokens with a single NUM token holding the result
        new_token = Token(
            Token.NUM,
            tokens[start_index].line,
            tokens[start_index].column,
            data=str(val),
            original_tokens=tokens[start_index : start_index + 3],
        )
        tokens[start_index : start_index + 3] = [new_token]
        return True

    def optimize_printint(tokens, start_index):
        # Replace printint(<NUM>) with print("<value>"), since printing a
        # string compiles into less Brainfuck code than printing an int.
        if tokens[start_index].data != "printint":
            return False
        line, column = tokens[start_index].line, tokens[start_index].column
        tokens[start_index] = Token(Token.PRINT, line, column, original_tokens=[tokens[start_index]])
        # Normalize the literal through get_NUM_token_value so the printed
        # string matches the run-time value even for non-decimal NUM literals.
        tokens[start_index + 2] = Token(
            Token.STRING,
            line,
            column,
            data=str(get_NUM_token_value(tokens[start_index + 2])),
            original_tokens=[tokens[start_index + 2]],
        )
        return True

    rules = [
        ([Token.NUM, Token.BINOP, Token.NUM], optimize_binop),  # arithmetic constant folding
        ([Token.ID, Token.LPAREN, Token.NUM, Token.RPAREN], optimize_printint),  # printint(50) -> print("50")
    ]

    # try to match one of the rules to the tokens in a "sliding window" style
    i = 0
    while i < len(tokens):
        optimized = False
        for tokens_sequence, optimization_function in rules:
            if i + len(tokens_sequence) <= len(tokens):
                if all(tokens_sequence[n] == tokens[i + n].type for n in range(len(tokens_sequence))):
                    if optimization_function(tokens, i):
                        optimized = True
        if optimized:
            continue  # don't increment i, try to optimize the same location again
        i += 1
def optimize(tokens):
    """
    Repeatedly run optimization passes over <tokens> (in-place) until a pass
    no longer changes the token stream.
    Prints a "." per pass as a simple progress indicator.
    """
    previous_types = [token.type for token in tokens]
    while True:
        optimize_once(tokens)
        print(".", end="")
        current_types = [token.type for token in tokens]
        if current_types == previous_types:
            return
        previous_types = current_types

View file

@ -1,133 +0,0 @@
from .Exceptions import BFSemanticError, BFSyntaxError
from .General import is_token_literal
from .Token import Token
class Parser:
    """
    Used to easily iterate tokens.
    Holds the token list and a cursor (current_token_index) and provides
    lookahead, matching-bracket search, and "expect token" assertion helpers.
    """

    def __init__(self, tokens):
        self.tokens = tokens
        self.current_token_index = 0

    # parsing tokens
    def current_token(self):
        # returns the token under the cursor, or None when past the end
        if self.current_token_index >= len(self.tokens):
            return None
        else:
            return self.token_at_index(self.current_token_index)

    def advance_token(self, amount=1):
        # move the cursor forward by <amount> tokens
        self.current_token_index += amount

    def advance_to_token_at_index(self, token_index):
        # move the cursor directly to <token_index>
        self.current_token_index = token_index

    def token_at_index(self, index):
        assert index < len(self.tokens)
        return self.tokens[index]

    def next_token(self, next_amount=1):
        # lookahead: the token <next_amount> positions after the cursor
        return self.token_at_index(self.current_token_index + next_amount)

    def find_matching(self, starting_index=None):
        """
        :param starting_index: (optional) index of the opening token to match
            (defaults to the current token)
        :return: the index of the token that matches the token at starting_index
        for example, if the token at starting_index is {,
        it returns the index of the matching }
        Supports {}, [] and () pairs.
        :raises BFSemanticError: if the token is not a supported opening token
        :raises BFSyntaxError: if no matching closing token exists
        """
        if starting_index is None:
            starting_index = self.current_token_index
        tokens = self.tokens
        token_to_match = tokens[starting_index]

        if token_to_match.type == Token.LBRACE:
            inc = Token.LBRACE
            dec = Token.RBRACE
        elif token_to_match.type == Token.LBRACK:
            inc = Token.LBRACK
            dec = Token.RBRACK
        elif token_to_match.type == Token.LPAREN:
            inc = Token.LPAREN
            dec = Token.RPAREN
        else:
            raise BFSemanticError("No support for matching %s" % str(token_to_match))

        # counter-based scan: +1 on opener, -1 on closer; 0 means matched
        i = starting_index
        cnt = 0
        while i < len(tokens):
            if tokens[i].type == inc:
                cnt += 1
            elif tokens[i].type == dec:
                cnt -= 1
            if cnt == 0:
                return i
            i += 1
        raise BFSyntaxError("Did not find matching %s for %s" % (dec, str(token_to_match)))

    def check_next_tokens_are(self, tokens_list, starting_index=None):
        # verifies that the tokens AFTER starting_index (i.e. starting at
        # starting_index + 1) have exactly the types listed in tokens_list
        if starting_index is None:
            starting_index = self.current_token_index
        # used for "assertion" and print a nice message to the user
        # (highest index accessed below is starting_index + len(tokens_list))
        if starting_index + len(tokens_list) >= len(self.tokens):
            raise BFSyntaxError("Expected %s after %s" % (str(tokens_list), str(self.tokens[starting_index])))
        for i in range(0, len(tokens_list)):
            if self.tokens[starting_index + 1 + i].type != tokens_list[i]:
                raise BFSyntaxError(
                    "Expected %s after %s"
                    % (str(tokens_list[i]), [str(t) for t in self.tokens[starting_index : starting_index + 1 + i]])
                )

    def check_next_token_is(self, token, starting_index=None):
        # single-token convenience wrapper around check_next_tokens_are
        self.check_next_tokens_are([token], starting_index=starting_index)

    def check_current_tokens_are(self, tokens_list):
        # same check, but starting AT the current token (hence the -1 offset)
        self.check_next_tokens_are(tokens_list, starting_index=self.current_token_index - 1)

    def check_current_token_is(self, token):
        self.check_current_tokens_are([token])

    def compile_array_initialization_list(self):
        """
        Parses an array initialization literal and returns a (possibly nested)
        list of literal tokens (NUM, CHAR, TRUE, FALSE).
        Accepted forms:
            {1, 2, 3, ...}
            {array_initialization_list, array_initialization_list, ...}
            string (each character becomes a NUM token of its ordinal)
        Assumes the cursor is on the STRING / LBRACE token; leaves the cursor
        one token past the end of the literal.
        """
        list_tokens = []

        if self.current_token().type == Token.STRING:
            string_token = self.current_token()
            line, column = string_token.line, string_token.column
            for char in string_token.data:
                list_tokens.append(Token(Token.NUM, line, column, str(ord(char))))
            self.advance_token()  # point to after STRING
            return list_tokens

        assert self.current_token().type == Token.LBRACE
        self.advance_token()  # skip to after LBRACE

        while is_token_literal(self.current_token()) or self.current_token().type == Token.LBRACE:
            if self.current_token().type == Token.LBRACE:  # list of (literals | list)
                list_tokens.append(self.compile_array_initialization_list())
            else:  # literal
                list_tokens.append(self.current_token())
                self.advance_token()  # skip literal

            if self.current_token().type not in [Token.COMMA, Token.RBRACE]:
                raise BFSyntaxError("Unexpected %s (expected comma (,) or RBRACE (}))" % self.current_token())
            if self.current_token().type == Token.COMMA:
                self.advance_token()  # skip comma
            # a trailing comma is allowed: "}" may follow it immediately
            if self.current_token().type == Token.RBRACE:
                break

        self.check_current_token_is(Token.RBRACE)
        self.advance_token()  # skip RBRACE
        return list_tokens

View file

@ -1,70 +0,0 @@
class Token:
    """
    A lexical token: a type tag (one of the class-level constants) plus its
    source position and an optional data payload.
    """

    INT = "INT"
    VOID = "VOID"
    TRUE = "TRUE"
    FALSE = "FALSE"
    AND = "AND"
    OR = "OR"
    NOT = "NOT"
    RETURN = "RETURN"
    IF = "IF"
    ELSE = "ELSE"
    WHILE = "WHILE"
    FOR = "FOR"
    DO = "DO"
    BREAK = "BREAK"
    CONTINUE = "CONTINUE"
    SWITCH = "SWITCH"
    CASE = "CASE"
    DEFAULT = "DEFAULT"
    COLON = "COLON"
    SEMICOLON = "SEMICOLON"
    COMMA = "COMMA"
    LPAREN = "LPAREN"
    RPAREN = "RPAREN"
    LBRACE = "LBRACE"
    RBRACE = "RBRACE"
    LBRACK = "LBRACK"
    RBRACK = "RBRACK"
    ASSIGN = "ASSIGN"
    TERNARY = "TERNARY"
    RELOP = "RELOP"
    BINOP = "BINOP"
    INCREMENT = "INCREMENT"
    DECREMENT = "DECREMENT"
    UNARY_MULTIPLICATIVE = "UNARY_MULTIPLICATIVE"
    BITWISE_SHIFT = "BITWISE_SHIFT"
    BITWISE_NOT = "BITWISE_NOT"
    BITWISE_AND = "BITWISE_AND"
    BITWISE_OR = "BITWISE_OR"
    BITWISE_XOR = "BITWISE_XOR"
    WHITESPACE = "WHITESPACE"
    ID = "ID"
    NUM = "NUM"
    STRING = "STRING"
    CHAR = "CHAR"
    PRINT = "PRINT"
    COMMENT = "COMMENT"
    UNIDENTIFIED = "UNIDENTIFIED"

    def __init__(self, type, line, column, data=None, original_tokens=None):
        """
        :param type: one of the class-level token-type constants
        :param line: source line the token appears on
        :param column: source column the token appears at
        :param data: optional payload (e.g. the digits of a NUM, an ID's name)
        :param original_tokens: when the optimizer folds tokens, the token
            objects this one replaced (kept for diagnostics)
        """
        self.type = type
        self.line = line
        self.column = column
        self.data = data
        self.original_tokens = original_tokens

    def __str__(self):
        # human-readable form used in error messages and warnings
        pieces = [self.type]
        if self.data:
            pieces.append(self.data)
        description = " ".join(pieces)
        description += f" (line {self.line} column {self.column})"
        if self.original_tokens:
            originals = ", ".join(str(tok) for tok in self.original_tokens)
            description += f" (original tokens: {originals})"
        return description