add contrib

2026-04-22 16:49:06 +00:00 · 2025-01-30 15:42:11 +01:00 · 2025-01-30 15:42:11 +01:00 · 0688cadf59
commit 0688cadf59
parent bf053e2266
20 changed files with 3971 additions and 0 deletions
--- a/reasoning_gym/code/contrib/bfit/Compiler/Lexical_analyzer.py
+++ b/reasoning_gym/code/contrib/bfit/Compiler/Lexical_analyzer.py
@ -0,0 +1,183 @@
+import re
+from .Token import Token
+from .Optimizer import optimize
+
+
+class LexicalErrorException(Exception):
+    """Raised by the lexical analyzer when the text contains a character that fits no token rule."""
+
+
+def analyze(text):
+    """
+    Split source text into a list of Token objects.
+
+    At every position all rules are tried; the longest match wins, and on a tie the rule
+    listed first wins (so a keyword beats an identifier of the same length).
+    Whitespace and comments are dropped. STRING and CHAR tokens carry their content with
+    the surrounding quotes removed and escape sequences decoded. NUM, ID, BINOP, RELOP,
+    ASSIGN, UNARY_MULTIPLICATIVE and BITWISE_SHIFT tokens carry the matched text.
+    Every other token is created with its type, line and column only.
+
+    :param text: source code to tokenize
+    :returns list of tokens in the text, each created with a 1-based line and column
+    raises LexicalErrorException in case of lexical error (a character that fits no rule)
+    """
+
+    # all patterns are raw strings so that regex escapes such as \s or \| are never
+    # interpreted (and warned about) as Python string escapes
+    rules = [
+        (r'\s+', Token.WHITESPACE),
+        (r'void', Token.VOID),
+        (r'int', Token.INT),
+        (r'bool', Token.INT),  # treat bool as int
+        (r'char', Token.INT),  # treat char as int
+
+        (r'true', Token.TRUE),
+        (r'false', Token.FALSE),
+        (r'&&', Token.AND),
+        (r'\|\|', Token.OR),
+        (r'\!', Token.NOT),
+        (r'return', Token.RETURN),
+        (r'if', Token.IF),
+        (r'else', Token.ELSE),
+        (r'while', Token.WHILE),
+        (r'for', Token.FOR),
+        (r'do', Token.DO),
+        (r'print', Token.PRINT),
+        (r'switch', Token.SWITCH),
+        (r'case', Token.CASE),
+        (r'default', Token.DEFAULT),
+        (r'break', Token.BREAK),
+        (r'continue', Token.CONTINUE),  # todo
+        (r':', Token.COLON),
+        (r';', Token.SEMICOLON),
+        (r',', Token.COMMA),
+
+        (r'\(', Token.LPAREN),
+        (r'\)', Token.RPAREN),
+        (r'\{', Token.LBRACE),
+        (r'\}', Token.RBRACE),
+        (r'\[', Token.LBRACK),
+        (r'\]', Token.RBRACK),
+        (r'=|\+=|-=|\*=|/=|%=|<<=|>>=|&=|\|=|\^=', Token.ASSIGN),
+        (r'\?', Token.TERNARY),
+
+        (r'<=|>=|==|!=|<|>', Token.RELOP),
+        (r'\+\+', Token.INCREMENT),
+        (r'--', Token.DECREMENT),
+        (r'\+|-|\*|/|%', Token.BINOP),
+        (r'\*\*|//|%%', Token.UNARY_MULTIPLICATIVE),
+
+        (r'<<|>>', Token.BITWISE_SHIFT),
+        (r'~', Token.BITWISE_NOT),
+        (r'&', Token.BITWISE_AND),
+        (r'\|', Token.BITWISE_OR),
+        (r'\^', Token.BITWISE_XOR),
+
+        (r'([a-zA-Z_][a-zA-Z0-9_]*)', Token.ID),
+        (r'(\d+)', Token.NUM),
+        (r'(0x[A-Fa-f\d]+)', Token.NUM),  # hexadecimal number
+        (r'(0o[0-7]+)', Token.NUM),  # octal number
+        (r'(0b[01]+)', Token.NUM),  # binary number
+        (r'\"(\\\"|[^"])*"', Token.STRING),
+        (r'\'(\\\'|(\\)?[^\'])\'', Token.CHAR),
+        (r'//.*(\n|$)', Token.COMMENT),
+        (r'/\*[\s\S]*?\*/', Token.COMMENT),  # multiline comments
+        (r'.', Token.UNIDENTIFIED)
+    ]
+
+    rules = [(re.compile(pattern), token_type) for pattern, token_type in rules]
+
+    tokens = []
+
+    # offset of the first character of every line; entry k belongs to line k+1
+    line_starts = [0] + [m.end() for m in re.finditer(r'\n', text)]
+    line_idx = 0  # index in line_starts of the line that holds the current token
+
+    # token types whose matched text is stored on the token
+    types_with_data = (Token.NUM, Token.ID, Token.BINOP, Token.RELOP, Token.ASSIGN,
+                       Token.UNARY_MULTIPLICATIVE, Token.BITWISE_SHIFT)
+
+    i = 0
+    while i < len(text):
+        # pick the token that fits the longest match
+        # if tie - pick the one defined first in the rules list (strict '>' keeps the earlier rule)
+        longest_match, max_i, matched_token = None, i, None
+        for regex, token_type in rules:
+            m = regex.match(text, i)
+            if m and m.end() > max_i:
+                longest_match, max_i, matched_token = m, m.end(), token_type
+
+        # calculate line and column
+        # every match starts at i and i only grows, so the line cursor only ever moves forward
+        start = longest_match.start()
+        while line_idx + 1 < len(line_starts) and line_starts[line_idx + 1] <= start:
+            line_idx += 1
+        line, column = line_idx + 1, (start - line_starts[line_idx]) + 1  # humans count from 1 :)
+
+        if matched_token in (Token.COMMENT, Token.WHITESPACE):
+            pass  # do nothing
+        elif matched_token == Token.UNIDENTIFIED:
+            raise LexicalErrorException("Unidentified Character '%s' (line %s column %s)" % (text[i], line, column))
+        elif matched_token in (Token.STRING, Token.CHAR):
+            # remove quotes at beginning and end, un-escape characters
+            content = longest_match.group()[1:-1]
+            tokens.append(Token(matched_token, line, column, content.encode("utf8").decode("unicode_escape")))
+        elif matched_token in types_with_data:
+            tokens.append(Token(matched_token, line, column, longest_match.group()))
+        else:
+            tokens.append(Token(matched_token, line, column))
+        i = longest_match.end()
+
+    return tokens
+
+
+def tests():
+    """Run the lexer self-checks in order; the first failing check raises AssertionError."""
+
+    def types_of(token_list):
+        # the token types, in order of appearance
+        return [token.type for token in token_list]
+
+    def check_priorities():
+        # test token priorities: INT should not be confused with ID even if ID contains "int"
+        found = analyze("my international int ; int; pints; international;")
+        wanted = [Token.ID, Token.ID, Token.INT, Token.SEMICOLON, Token.INT, Token.SEMICOLON, Token.ID,
+                  Token.SEMICOLON, Token.ID, Token.SEMICOLON]
+        assert types_of(found) == wanted
+
+    def check_logical_operators():
+        found = analyze("true !||!false falsek  k||y+-a&&x")
+        wanted = [Token.TRUE, Token.NOT, Token.OR, Token.NOT, Token.FALSE, Token.ID, Token.ID, Token.OR, Token.ID,
+                  Token.BINOP, Token.BINOP, Token.ID, Token.AND, Token.ID]
+        assert types_of(found) == wanted
+
+    def check_constant_folding():
+        num, op = Token.NUM, Token.BINOP
+        # (source text, token types before optimizing, data of the single NUM left after optimizing)
+        folding_cases = (
+            ("1+2", [num, op, num], "3"),
+            ("1+2+3", [num, op, num, op, num], "6"),
+            # make sure it is not optimized to 9 (3*3)
+            ("1+2*3", [num, op, num, op, num], "7"),
+        )
+        for source, wanted, folded in folding_cases:
+            found = analyze(source)
+            assert types_of(found) == wanted
+            optimize(found)
+            assert len(found) == 1 and found[0].type == Token.NUM and found[0].data == folded
+
+        # test all arithmetic operations
+        found = analyze("(1+2*3/6)+(1%3)*(6-1)")
+        wanted = [Token.LPAREN, num, op, num, op, num, op, num,
+                  Token.RPAREN, op, Token.LPAREN, num, op, num, Token.RPAREN,
+                  op, Token.LPAREN, num, op, num, Token.RPAREN]
+        assert types_of(found) == wanted
+        optimize(found)
+        assert found[1].data == "2" and found[5].data == "1" and found[9].data == "5"
+
+    # todo find a better way to test?
+    for check in (check_priorities, check_logical_operators, check_constant_folding):
+        check()
+
+
+# run the self-checks when this file is executed as a program
+# (the relative imports at the top of the file need it to be run as a module of its package)
+if __name__ == "__main__":
+    tests()