summaryrefslogtreecommitdiff
path: root/tokenizer.py
blob: 13b824a4a834f58306b25c03e55690db73da57c4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from consts import *


class TokenizerError(ValueError):
    """Error raised when a line cannot be tokenized.

    Subclasses ValueError so that code catching ValueError (the type
    ``tokenize`` currently raises for invalid input) also catches this
    exception, keeping adoption backward compatible.
    """


class Token:
    """A single lexical unit produced by ``tokenize``.

    Attributes:
        type: the TokenType category of this token.
        value: payload for NUMBER/SYMBOL tokens; for unary operators it
            carries the operator text (see ``tokenize``); otherwise None.
        subtype: the operator text for binary operators, or UNARY for
            unary ones; None for non-operator tokens.
    """

    def __init__(self, node_type, value=None, subtype=None):
        self.type = node_type
        self.value = value
        self.subtype = subtype

    def __eq__(self, other):
        # Return NotImplemented for foreign types instead of raising
        # AttributeError on other.type.
        if not isinstance(other, Token):
            return NotImplemented
        # Bug fix: subtype must participate in equality — binary operator
        # tokens store the operator text in subtype and keep value=None,
        # so ignoring subtype made all binary operators compare equal.
        return (
            self.type == other.type
            and self.value == other.value
            and self.subtype == other.subtype
        )

    def __hash__(self):
        # Defining __eq__ alone would make Token unhashable; hash over
        # the same fields equality uses.
        return hash((self.type, self.value, self.subtype))

    def __repr__(self):
        if self.type == TokenType.LEFT_PARENTHESIS:
            return "Token(LEFT_PARENTHESIS)"
        if self.type == TokenType.RIGHT_PARENTHESIS:
            return "Token(RIGHT_PARENTHESIS)"
        if self.type == TokenType.NUMBER:
            return f"Token({self.value})"
        if self.type == TokenType.OPERATOR:
            return f"Token({self.subtype})"
        if self.type == TokenType.COMMA:
            return "Token(COMMA)"
        if self.type == TokenType.SYMBOL:
            return f"Token(SYMBOL {self.value})"
        # Bug fix: previously fell through and returned None, which makes
        # repr() raise TypeError for any unhandled token type.
        return f"Token({self.type!r}, {self.value!r}, {self.subtype!r})"
    __str__ = __repr__


def parse_token(line, start, charset):
    """Consume the longest run of characters from *charset* in *line*.

    Scanning begins at index *start*.  Returns a tuple of the consumed
    text and the index of the LAST consumed character (``start - 1``
    when nothing matched) — the caller advances past it afterwards.
    """
    end = start
    limit = len(line)
    while end < limit and line[end] in charset:
        end += 1
    return line[start:end], end - 1


def parse_operator(line, start):
    """Consume a run of operator characters beginning at *start*."""
    return parse_token(line, start, charset=OPERATOR_CHARS)


def parse_number(line, start):
    """Consume a numeric literal beginning at *start*.

    Returns the value converted to float and the index of the last
    consumed character.
    """
    text, last = parse_token(line, start, NUMBER_CHARS)
    return float(text), last


def parse_symbol(line, start):
    """Consume a symbol (name) beginning at *start*."""
    return parse_token(line, start, charset=SYMBOL_CHARS)


def tokenize(line):
    """Split *line* into a list of Token objects.

    A two-state machine tracks context: in the NAME position a value or
    unary operator is expected; in the OPERATOR position a binary
    operator is expected.  Spaces are skipped; any other unrecognized
    character raises ValueError.
    """
    # NOTE(review): TokenizerError is defined in this module but never
    # raised; ValueError is kept here to preserve existing behavior.
    tokens = []
    state = State.NAME
    i = 0
    length = len(line)
    while i < length:
        ch = line[i]
        if ch == "(":
            tokens.append(Token(TokenType.LEFT_PARENTHESIS, None))
            state = State.NAME
        elif ch == ")":
            tokens.append(Token(TokenType.RIGHT_PARENTHESIS, None))
            state = State.OPERATOR
        elif ch == ",":
            tokens.append(Token(TokenType.COMMA, None))
            state = State.NAME
        elif ch in OPERATOR_CHARS:
            op, i = parse_operator(line, i)
            if state == State.OPERATOR:
                # Binary operator: the operator text travels in subtype.
                tokens.append(Token(TokenType.OPERATOR, subtype=op))
            else:
                # NAME position: the run is a unary operator; its text
                # goes in value and subtype marks it UNARY.
                tokens.append(Token(TokenType.OPERATOR, subtype=UNARY, value=op))
            # After any operator, a value (NAME) is expected next.
            state = State.NAME
        elif ch in NUMBER_CHARS:
            num, i = parse_number(line, i)
            tokens.append(Token(TokenType.NUMBER, num))
            state = State.OPERATOR
        elif ch in SYMBOL_CHARS:
            name, i = parse_symbol(line, i)
            tokens.append(Token(TokenType.SYMBOL, name))
            state = State.OPERATOR
        elif ch != " ":
            raise ValueError("Line is not a valid expression")
        i += 1

    return tokens