summaryrefslogtreecommitdiff
path: root/tokenizer.py
blob: 4fd534c0b62914ed90ddf0806f3ac43aa01e641e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from consts import *


class TokenizerError(Exception):
    """Raised when a line of input cannot be tokenized."""
    pass


class Token:
    """A single lexical token produced by :func:`tokenize`.

    Attributes:
        type: the TokenType member identifying the token category.
        value: payload for NUMBER and SYMBOL tokens, and the operator
            string for unary operators.
        subtype: discriminator for OPERATOR (the operator string, or
            TokenType.UNARY) and KEYWORD tokens.
    """

    def __init__(self, node_type, *, value=None, subtype=None):
        self.type = node_type
        self.value = value
        self.subtype = subtype

    def __eq__(self, other):
        # Bug fix: ``subtype`` must participate in equality.  Binary
        # operator tokens carry the operator string only in ``subtype``
        # (``value`` is None), so the old comparison made "+" and "-"
        # operator tokens compare equal.  Also return NotImplemented for
        # non-Token operands instead of raising AttributeError.
        if not isinstance(other, Token):
            return NotImplemented
        return (
            self.type == other.type
            and self.value == other.value
            and self.subtype == other.subtype
        )

    def __repr__(self):
        if self.type == TokenType.LEFT_PARENTHESIS:
            return "Token(LEFT_PARENTHESIS)"
        if self.type == TokenType.RIGHT_PARENTHESIS:
            return "Token(RIGHT_PARENTHESIS)"
        if self.type == TokenType.NUMBER:
            return f"Token({self.value})"
        if self.type == TokenType.OPERATOR:
            return f"Token({self.subtype})"
        if self.type == TokenType.COMMA:
            return "Token(COMMA)"
        if self.type == TokenType.SYMBOL:
            return f"Token(SYMBOL {self.value})"
        if self.type == TokenType.KEYWORD:
            return f"Token(KEYWORD {keywords_repr[self.subtype]})"
        if self.type == TokenType.EQUALS:
            return "Token(=)"
        if self.type == TokenType.SEMICOLON:
            return "Token(;)"
        if self.type == TokenType.LEFT_BRACE:
            return "Token(LEFT_BRACE)"
        if self.type == TokenType.RIGHT_BRACE:
            # Bug fix: this branch previously re-tested LEFT_BRACE, so
            # RIGHT_BRACE tokens fell through to the fallback repr.
            return "Token(RIGHT_BRACE)"

        return "Token(repr not defined)"
    __str__ = __repr__


def parse_token(line, start, charset):
    """Collect the maximal run of characters from *charset* at *start*.

    Returns a ``(token, last_index)`` pair, where ``last_index`` is the
    index of the final consumed character (``start - 1`` when nothing
    matches) so the caller's loop can resume after it.
    """
    end = start
    while end < len(line) and line[end] in charset:
        end += 1
    return line[start:end], end - 1


def parse_operator(line, start):
    """Read a run of operator characters beginning at *start*."""
    token, last = parse_token(line, start, OPERATOR_CHARS)
    return token, last


def parse_number(line, start):
    """Read a numeric literal beginning at *start* and convert it to float.

    NOTE(review): assumes any run of NUMBER_CHARS is a valid float literal;
    float() will raise ValueError otherwise -- confirm against NUMBER_CHARS.
    """
    raw, last = parse_token(line, start, NUMBER_CHARS)
    return float(raw), last


def parse_symbol(line, start):
    """Read an identifier-like run of symbol characters beginning at *start*."""
    token, last = parse_token(line, start, SYMBOL_CHARS)
    return token, last


def tokenize(line):
    """Split *line* into a flat list of :class:`Token` objects.

    A two-state machine distinguishes unary from binary operators: in
    ``State.NAME`` a value is expected, so an operator must be unary; in
    ``State.OPERATOR`` a value just ended, so an operator is binary.

    Raises:
        TokenizerError: if an unrecognized character is encountered.
    """
    state = State.NAME

    tokens = []
    i = 0
    while i < len(line):
        char = line[i]
        if char == "(":
            tokens.append(Token(TokenType.LEFT_PARENTHESIS))
            state = State.NAME
        elif char == ")":
            # A closing parenthesis terminates a value, so a binary
            # operator may legally follow.
            tokens.append(Token(TokenType.RIGHT_PARENTHESIS))
            state = State.OPERATOR
        elif char == ",":
            tokens.append(Token(TokenType.COMMA))
            state = State.NAME
        elif char == ";":
            tokens.append(Token(TokenType.SEMICOLON))
            state = State.NAME
        elif char == "{":
            tokens.append(Token(TokenType.LEFT_BRACE))
            state = State.NAME
        elif char == "}":
            tokens.append(Token(TokenType.RIGHT_BRACE))
            state = State.NAME
        elif char in OPERATOR_CHARS:
            val, i = parse_operator(line, i)
            if val == "=":
                tokens.append(Token(TokenType.EQUALS))
                state = State.NAME
            elif state == State.OPERATOR:
                # Binary operator: the operator string is the subtype.
                tokens.append(Token(TokenType.OPERATOR, subtype=val))
                state = State.NAME
            elif state == State.NAME:
                # Unary operator.  NOTE(review): stored with
                # subtype=TokenType.UNARY and the operator string in
                # ``value`` -- the opposite layout of binary operators;
                # the parser presumably depends on this, so it is kept.
                tokens.append(Token(TokenType.OPERATOR, subtype=TokenType.UNARY, value=val))
                state = State.NAME
        elif char in NUMBER_CHARS:
            val, i = parse_number(line, i)
            tokens.append(Token(TokenType.NUMBER, value=val))
            state = State.OPERATOR
        elif char in SYMBOL_CHARS:
            val, i = parse_symbol(line, i)
            if val in keywords:
                tokens.append(Token(TokenType.KEYWORD, subtype=keywords[val]))
                state = State.NAME
            else:
                tokens.append(Token(TokenType.SYMBOL, value=val))
                state = State.OPERATOR
        elif char != " ":
            # Consistency fix: raise the module's own exception type
            # instead of a bare ValueError (TokenizerError was defined
            # above but never used).
            raise TokenizerError("Line is not a valid expression")
        i += 1

    return tokens