# Source code for cutile_basic.lexer

"""Lexer for BASIC source code."""

from .tokens import Token, TokenType, KEYWORDS

# Names of BASIC built-in functions (presumably consumed by the parser or
# interpreter — the set is not referenced in this module's visible code).
# NOTE(review): "MMA" matches no classic BASIC built-in; possibly a typo for
# "ATN" or "RND" — confirm against the parser before changing.
BUILTIN_FUNCTIONS = {"ABS", "SQR", "INT", "SIN", "COS", "TAN", "EXP", "LOG", "SGN", "MMA"}


class LexError(Exception):
    """Error raised when the lexer meets input it cannot tokenize.

    Carries the 1-based ``line`` and ``col`` of the offending position in
    addition to the formatted message.
    """

    def __init__(self, msg: str, line: int, col: int):
        # Record position first, then hand the formatted message to Exception.
        self.line = line
        self.col = col
        super().__init__(f"Lex error at L{line}:{col}: {msg}")
def lex(source: str) -> list[Token]:
    """Tokenize BASIC source code into a list of tokens.

    Each physical line is scanned left to right. A NEWLINE token is emitted
    at the end of every line; the final NEWLINE is replaced by a single EOF
    token. Columns and line numbers are 1-based.

    Args:
        source: The complete BASIC program text.

    Returns:
        The token stream, always terminated by an EOF token.

    Raises:
        LexError: On an unterminated string literal or an unexpected
            character.
    """
    tokens: list[Token] = []
    lines = source.split("\n")

    # Operator tables, hoisted out of the scan loop (loop-invariant).
    two_char_map = {
        "<>": TokenType.NEQ,
        "<=": TokenType.LE,
        ">=": TokenType.GE,
    }
    single_map = {
        "+": TokenType.PLUS,
        "-": TokenType.MINUS,
        "*": TokenType.STAR,
        "/": TokenType.SLASH,
        "^": TokenType.CARET,
        "=": TokenType.EQ,
        "<": TokenType.LT,
        ">": TokenType.GT,
        "(": TokenType.LPAREN,
        ")": TokenType.RPAREN,
        ",": TokenType.COMMA,
        ";": TokenType.SEMICOLON,
        ":": TokenType.COLON,
    }

    for line_num, line_text in enumerate(lines, start=1):
        i = 0
        while i < len(line_text):
            ch = line_text[i]
            col = i + 1  # 1-based column for the token we are about to emit

            # Whitespace
            if ch in (" ", "\t"):
                i += 1
                continue

            # REM comment — consumes the rest of the line.  The boundary
            # check keeps identifiers like "REMARK" from matching.
            if line_text[i : i + 3].upper() == "REM" and (
                i + 3 >= len(line_text) or not line_text[i + 3].isalnum()
            ):
                comment = line_text[i + 3 :].strip()
                tokens.append(Token(TokenType.REM, comment, line_num, col))
                break

            # String literal (classic BASIC: no escape sequences).
            if ch == '"':
                j = i + 1
                while j < len(line_text) and line_text[j] != '"':
                    j += 1
                if j >= len(line_text):
                    raise LexError("Unterminated string", line_num, col)
                tokens.append(Token(TokenType.STRING, line_text[i + 1 : j], line_num, col))
                i = j + 1
                continue

            # Number literal: integer, float, or scientific notation.
            if ch.isdigit() or (
                ch == "." and i + 1 < len(line_text) and line_text[i + 1].isdigit()
            ):
                j = i
                has_dot = False
                while j < len(line_text) and (line_text[j].isdigit() or line_text[j] == "."):
                    if line_text[j] == ".":
                        if has_dot:
                            break  # a second dot starts a new token
                        has_dot = True
                    j += 1
                # Scientific notation.  BUG FIX: only consume the "E" (and an
                # optional sign) when digits actually follow, so "1EXP" lexes
                # as INTEGER 1 + IDENTIFIER "EXP" instead of producing the
                # invalid float token "1E".  The old code advanced j
                # unconditionally.
                if j < len(line_text) and line_text[j].upper() == "E":
                    k = j + 1
                    if k < len(line_text) and line_text[k] in "+-":
                        k += 1
                    if k < len(line_text) and line_text[k].isdigit():
                        while k < len(line_text) and line_text[k].isdigit():
                            k += 1
                        j = k
                        has_dot = True  # scientific notation is always a float
                num_str = line_text[i:j]
                kind = TokenType.FLOAT if has_dot else TokenType.INTEGER
                tokens.append(Token(kind, num_str, line_num, col))
                i = j
                continue

            # Identifier or keyword.
            if ch.isalpha() or ch == "_":
                j = i
                while j < len(line_text) and (line_text[j].isalnum() or line_text[j] == "_"):
                    j += 1
                # Allow one trailing type suffix: % (integer) or $ (string).
                if j < len(line_text) and line_text[j] in ("%", "$"):
                    j += 1
                word = line_text[i:j]
                # Keywords are matched case-insensitively, ignoring any suffix.
                upper = word.upper().rstrip("%$")
                if upper in KEYWORDS:
                    tokens.append(Token(KEYWORDS[upper], word, line_num, col))
                else:
                    tokens.append(Token(TokenType.IDENTIFIER, word, line_num, col))
                i = j
                continue

            # Two-character operators take priority over single-character ones.
            two = line_text[i : i + 2]
            if two in two_char_map:
                tokens.append(Token(two_char_map[two], two, line_num, col))
                i += 2
                continue

            # Single-character operators and delimiters.
            if ch in single_map:
                tokens.append(Token(single_map[ch], ch, line_num, col))
                i += 1
                continue

            raise LexError(f"Unexpected character: {ch!r}", line_num, col)

        # End-of-line marker (also emitted after a REM break).
        tokens.append(Token(TokenType.NEWLINE, "\\n", line_num, len(line_text) + 1))

    # The trailing NEWLINE of the last line becomes the EOF token.
    if tokens and tokens[-1].type == TokenType.NEWLINE:
        tokens[-1] = Token(TokenType.EOF, "", tokens[-1].line, tokens[-1].col)
    else:
        # Defensive: split("\n") always yields at least one line, so this
        # branch is not expected to run; kept for safety.
        tokens.append(Token(TokenType.EOF, "", len(lines), 1))
    return tokens