"""Bootstrap compiler for the L2 language.
This file now contains working scaffolding for:
* Parsing definitions, literals, and ordinary word references.
* Respecting immediate/macro words so syntax can be rewritten on the fly.
* Emitting NASM-compatible x86-64 assembly with explicit data and return stacks.
* Driving the toolchain via ``nasm`` + ``ld``.
"""
from __future__ import annotations
import argparse
import subprocess
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable, Dict, Iterable, List, Optional, Sequence, Set, Union
class ParseError(Exception):
    """Raised when the source stream cannot be parsed."""
class CompileError(Exception):
    """Raised when IR cannot be turned into assembly."""
# ---------------------------------------------------------------------------
# Tokenizer / Reader
# ---------------------------------------------------------------------------
@dataclass
class Token:
    # Raw token text exactly as it appeared in the source.
    lexeme: str
    # 1-based source line where the token starts.
    line: int
    # 0-based column where the token starts.
    column: int
    # Absolute character offsets [start, end) into the source string.
    start: int
    end: int

    def __repr__(self) -> str:  # pragma: no cover - debug helper
        return f"Token({self.lexeme!r}@{self.line}:{self.column})"
class Reader:
    """Default reader; users can swap implementations at runtime."""

    def __init__(self) -> None:
        # Current position while scanning: 1-based line, 0-based column.
        self.line = 1
        self.column = 0

    def tokenize(self, source: str) -> Iterable[Token]:
        """Yield whitespace-separated tokens from *source*.

        ``#`` starts a comment that runs to end of line. Each Token records
        the 1-based line / 0-based column where it began plus its absolute
        [start, end) offsets into *source* (used later to slice raw asm
        bodies out of the original text).
        """
        self.line = 1
        self.column = 0
        index = 0
        lexeme: List[str] = []
        token_start = 0
        token_line = 1
        token_column = 0
        source_len = len(source)
        while index < source_len:
            char = source[index]
            if char == "#":
                # Skip to end of line; the newline itself is consumed by the
                # whitespace branch below so line accounting stays in one place.
                while index < source_len and source[index] != "\n":
                    index += 1
                continue
            if char.isspace():
                if lexeme:
                    # Whitespace terminates the pending token.
                    yield Token(
                        "".join(lexeme),
                        token_line,
                        token_column,
                        token_start,
                        index,
                    )
                    lexeme.clear()
                if char == "\n":
                    self.line += 1
                    self.column = 0
                else:
                    self.column += 1
                index += 1
                continue
            if not lexeme:
                # First character of a new token: remember where it began.
                token_start = index
                token_line = self.line
                token_column = self.column
            lexeme.append(char)
            self.column += 1
            index += 1
        if lexeme:
            # Flush a token that runs to end-of-input.
            yield Token("".join(lexeme), token_line, token_column, token_start, index)
# ---------------------------------------------------------------------------
# Dictionary / Words
# ---------------------------------------------------------------------------
class ASTNode:
    """Base class for all AST nodes."""
@dataclass
class WordRef(ASTNode):
    # Name of the word being invoked; resolved against the dictionary at emit time.
    name: str
@dataclass
class Literal(ASTNode):
    # Integer constant pushed onto the data stack.
    value: int
@dataclass
class Definition(ASTNode):
    # Word name being defined with ':' ... ';'.
    name: str
    # Parsed body nodes, in source order.
    body: List[ASTNode]
    # Mirrors Word.immediate once the definition closes.
    immediate: bool = False
@dataclass
class AsmDefinition(ASTNode):
    # Word name being defined with ':asm' ... ';'.
    name: str
    # Verbatim assembly text sliced from between '{' and '}'.
    body: str
    # Mirrors Word.immediate at definition time.
    immediate: bool = False
@dataclass
class Module(ASTNode):
    # Top-level forms in source order (definitions, plus any stray nodes).
    forms: List[ASTNode]
# A macro runs at parse time; it may consume tokens via the parser and may
# return nodes to splice into the current context.
MacroHandler = Callable[["Parser"], Optional[List[ASTNode]]]
# An intrinsic emits inline assembly instead of a 'call' at emit time.
IntrinsicEmitter = Callable[["FunctionEmitter"], None]
@dataclass
class Word:
    # Dictionary key; also used to derive the assembly label.
    name: str
    # Immediate words execute their macro handler at parse time.
    immediate: bool = False
    # Informal stack-effect comment; not machine-checked anywhere.
    stack_effect: str = "( -- )"
    # High-level or raw-asm body once the word has been defined.
    definition: Optional[Union[Definition, AsmDefinition]] = None
    # Parse-time handler; required for immediate words.
    macro: Optional[MacroHandler] = None
    # Emit-time inline generator; when set, no 'call' is emitted.
    intrinsic: Optional[IntrinsicEmitter] = None
@dataclass
class Dictionary:
    """Mutable mapping from word names to their Word records."""

    words: Dict[str, Word] = field(default_factory=dict)

    def register(self, word: Word) -> None:
        """Install *word*, warning on stderr when an existing name is shadowed."""
        previous = self.words.get(word.name)
        if previous is not None:
            sys.stderr.write(f"[warn] redefining word {word.name}\n")
        self.words[word.name] = word

    def lookup(self, name: str) -> Optional[Word]:
        """Return the Word registered under *name*, or None when absent."""
        return self.words.get(name)
# ---------------------------------------------------------------------------
# Parser
# ---------------------------------------------------------------------------
# Parsing contexts: nodes accumulate either into the module itself or into
# the innermost open definition.
Context = Union[Module, Definition]
class Parser:
    """Builds a Module AST from tokens while honouring immediate words.

    Immediate words run their macro at parse time and may consume further
    tokens or inject nodes, which is how the language rewrites its own
    syntax on the fly.
    """

    def __init__(self, dictionary: Dictionary) -> None:
        self.dictionary = dictionary
        self.tokens: List[Token] = []
        self.pos = 0
        # Bottom entry is always the Module; an open ':' definition sits on top.
        self.context_stack: List[Context] = []
        # Word records for definitions currently being parsed.
        self.definition_stack: List[Word] = []
        # Most recently completed definition (used by e.g. 'immediate').
        self.last_defined: Optional[Word] = None
        self.source: str = ""

    # Public helpers for macros ------------------------------------------------
    def next_token(self) -> Token:
        """Consume and return the next token; raises ParseError at EOF."""
        return self._consume()

    def peek_token(self) -> Optional[Token]:
        """Return the upcoming token without consuming it, or None at EOF."""
        return None if self._eof() else self.tokens[self.pos]

    def emit_node(self, node: ASTNode) -> None:
        """Append *node* to the innermost open context."""
        self._append_node(node)

    def most_recent_definition(self) -> Optional[Word]:
        """Return the last word whose definition was completed."""
        return self.last_defined

    # Parsing ------------------------------------------------------------------
    def parse(self, tokens: Iterable[Token], source: str) -> Module:
        """Parse *tokens* (cut from *source*) into a Module.

        Raises ParseError for unbalanced ':'/';' or malformed ':asm' blocks.
        """
        self.tokens = list(tokens)
        self.source = source
        self.pos = 0
        self.context_stack = [Module(forms=[])]
        self.definition_stack.clear()
        self.last_defined = None
        while not self._eof():
            token = self._consume()
            lexeme = token.lexeme
            if lexeme == ":":
                self._begin_definition(token)
                continue
            if lexeme == ";":
                self._end_definition(token)
                continue
            if lexeme == ":asm":
                self._parse_asm_definition(token)
                continue
            self._handle_token(token)
        if len(self.context_stack) != 1:
            raise ParseError("unclosed definition at EOF")
        module = self.context_stack.pop()
        if not isinstance(module, Module):  # pragma: no cover - defensive
            raise ParseError("internal parser state corrupt")
        return module

    # Internal helpers ---------------------------------------------------------
    def _handle_token(self, token: Token) -> None:
        """Dispatch an ordinary token: literal, immediate word, or word ref."""
        if self._try_literal(token):
            return
        word = self.dictionary.lookup(token.lexeme)
        if word and word.immediate:
            if not word.macro:
                raise ParseError(f"immediate word {word.name} lacks macro handler")
            # Run the macro now; it may consume tokens and/or return nodes.
            produced = word.macro(self)
            if produced:
                for node in produced:
                    self._append_node(node)
            return
        # Unknown names still parse to a WordRef; resolution happens at emit.
        self._append_node(WordRef(name=token.lexeme))

    def _begin_definition(self, token: Token) -> None:
        """Open a ':' definition; the word is registered eagerly so the body
        can reference it (e.g. for recursion)."""
        if self._eof():
            raise ParseError(f"definition name missing after ':' at {token.line}:{token.column}")
        name_token = self._consume()
        definition = Definition(name=name_token.lexeme, body=[])
        self.context_stack.append(definition)
        word = self.dictionary.lookup(definition.name)
        if word is None:
            word = Word(name=definition.name)
            self.dictionary.register(word)
        word.definition = definition
        self.definition_stack.append(word)

    def _end_definition(self, token: Token) -> None:
        """Close the innermost ':' definition and append it to the module."""
        if len(self.context_stack) <= 1:
            raise ParseError(f"unexpected ';' at {token.line}:{token.column}")
        ctx = self.context_stack.pop()
        if not isinstance(ctx, Definition):
            raise ParseError("';' can only close definitions")
        word = self.definition_stack.pop()
        # Propagate an 'immediate' flag set while the body was being parsed.
        ctx.immediate = word.immediate
        module = self.context_stack[-1]
        if not isinstance(module, Module):
            raise ParseError("nested definitions are not supported yet")
        module.forms.append(ctx)
        self.last_defined = word

    def _parse_asm_definition(self, token: Token) -> None:
        """Parse ':asm name { raw nasm } ;', capturing the brace-delimited
        span as a verbatim slice of the original source text."""
        if self._eof():
            raise ParseError(f"definition name missing after ':asm' at {token.line}:{token.column}")
        name_token = self._consume()
        brace_token = self._consume()
        if brace_token.lexeme != "{":
            raise ParseError(f"expected '{{' after asm name at {brace_token.line}:{brace_token.column}")
        block_start = brace_token.end
        block_end: Optional[int] = None
        while not self._eof():
            next_token = self._consume()
            if next_token.lexeme == "}":
                block_end = next_token.start
                break
        if block_end is None:
            raise ParseError("missing '}' to terminate asm body")
        # The body is the raw source slice, not the tokenized form, so the
        # author's assembly formatting survives.
        asm_body = self.source[block_start:block_end]
        definition = AsmDefinition(name=name_token.lexeme, body=asm_body)
        word = self.dictionary.lookup(definition.name)
        if word is None:
            word = Word(name=definition.name)
            self.dictionary.register(word)
        word.definition = definition
        definition.immediate = word.immediate
        module = self.context_stack[-1]
        if not isinstance(module, Module):
            raise ParseError("asm definitions must be top-level forms")
        module.forms.append(definition)
        self.last_defined = word
        if self._eof():
            raise ParseError("asm definition missing terminator ';'")
        terminator = self._consume()
        if terminator.lexeme != ";":
            raise ParseError(f"expected ';' after asm definition at {terminator.line}:{terminator.column}")

    def _append_node(self, node: ASTNode) -> None:
        """Append *node* to the innermost context (module or open definition)."""
        target = self.context_stack[-1]
        if isinstance(target, Module):
            target.forms.append(node)
        elif isinstance(target, Definition):
            target.body.append(node)
        else:  # pragma: no cover - defensive
            raise ParseError("unknown parse context")

    def _try_literal(self, token: Token) -> bool:
        """Append an integer Literal and report success.

        base 0 means Python prefix rules apply: 0x.., 0o.., 0b.., decimal.
        """
        try:
            value = int(token.lexeme, 0)
        except ValueError:
            return False
        self._append_node(Literal(value=value))
        return True

    def _consume(self) -> Token:
        """Return the next token, advancing; raises ParseError at EOF."""
        if self._eof():
            raise ParseError("unexpected EOF")
        token = self.tokens[self.pos]
        self.pos += 1
        return token

    def _eof(self) -> bool:
        return self.pos >= len(self.tokens)
# ---------------------------------------------------------------------------
# NASM Emitter
# ---------------------------------------------------------------------------
@dataclass
class Emission:
    """Accumulated assembly output, split by NASM section."""

    text: List[str] = field(default_factory=list)
    data: List[str] = field(default_factory=list)
    bss: List[str] = field(default_factory=list)

    def snapshot(self) -> str:
        """Render all non-empty sections, in text/data/bss order, as one string."""
        sections = (
            ("section .text", self.text),
            ("section .data", self.data),
            ("section .bss", self.bss),
        )
        rendered: List[str] = []
        for header, body in sections:
            if body:
                rendered.append(header)
                rendered.extend(body)
        return "\n".join(rendered)
class FunctionEmitter:
    """Small helper that appends per-word assembly lines to a shared list."""

    def __init__(self, text: List[str]) -> None:
        self.text = text

    def emit(self, line: str) -> None:
        """Append one raw assembly line verbatim."""
        self.text.append(line)

    def comment(self, message: str) -> None:
        """Append *message* as a NASM line comment."""
        self.emit(f" ; {message}")

    def push_literal(self, value: int) -> None:
        """Push the immediate *value* onto the data stack (r12 grows down)."""
        for line in (f" ; push {value}", " sub r12, 8", f" mov qword [r12], {value}"):
            self.emit(line)

    def push_from(self, register: str) -> None:
        """Push *register*'s value onto the data stack."""
        for line in (" sub r12, 8", f" mov [r12], {register}"):
            self.emit(line)

    def pop_to(self, register: str) -> None:
        """Pop the data-stack top into *register*."""
        for line in (f" mov {register}, [r12]", " add r12, 8"):
            self.emit(line)
def sanitize_label(name: str) -> str:
    """Map an arbitrary word name onto a NASM-safe label.

    Alphanumerics and underscores pass through unchanged; every other
    character becomes ``_xx`` (its two-digit hex ordinal). An empty name
    yields ``word_anon``.
    """
    encoded = "".join(
        ch if (ch.isalnum() or ch == "_") else f"_{ord(ch):02x}" for ch in name
    )
    return f"word_{encoded or 'anon'}"
class Assembler:
    """Lower a parsed Module to NASM x86-64 assembly.

    Register conventions (established by the runtime prelude):
      r12 - data stack pointer (grows downward from dstack_top)
      r15 - data stack base, kept so _start can detect a leftover exit value
      r13 - return stack pointer
    """

    def __init__(self, dictionary: Dictionary) -> None:
        self.dictionary = dictionary
        # These sizes drive the %define constants in the prelude. They were
        # previously dead fields whose values were duplicated as hard-coded
        # literals; tuning them per-instance now actually takes effect.
        self.stack_bytes = 65536
        self.io_buffer_bytes = 128

    def emit(self, module: Module) -> Emission:
        """Emit a complete Emission for *module*.

        Raises CompileError for stray top-level forms (non-definitions) or
        when no 'main' definition is present.
        """
        emission = Emission()
        emission.text.extend(self._runtime_prelude())
        valid_defs = (Definition, AsmDefinition)
        definitions = [form for form in module.forms if isinstance(form, valid_defs)]
        stray_forms = [form for form in module.forms if not isinstance(form, valid_defs)]
        if stray_forms:
            raise CompileError("top-level literals or word references are not supported yet")
        if not any(defn.name == "main" for defn in definitions):
            raise CompileError("missing 'main' definition")
        for definition in definitions:
            self._emit_definition(definition, emission.text)
        emission.bss.extend(self._bss_layout())
        return emission

    def _emit_definition(self, definition: Union[Definition, AsmDefinition], text: List[str]) -> None:
        """Emit one word's label, its body, and a trailing 'ret'."""
        label = sanitize_label(definition.name)
        text.append(f"{label}:")
        builder = FunctionEmitter(text)
        if isinstance(definition, Definition):
            for node in definition.body:
                self._emit_node(node, builder)
        elif isinstance(definition, AsmDefinition):
            self._emit_asm_body(definition, builder)
        else:  # pragma: no cover - defensive
            raise CompileError("unknown definition type")
        builder.emit(" ret")

    def _emit_asm_body(self, definition: AsmDefinition, builder: FunctionEmitter) -> None:
        """Copy a raw asm body line-by-line, normalizing whitespace-only
        lines to empty ones while preserving interior blank lines."""
        body = definition.body.strip("\n")
        if not body:
            return
        for line in body.splitlines():
            if line.strip():
                builder.emit(line)
            else:
                builder.emit("")

    def _emit_node(self, node: ASTNode, builder: FunctionEmitter) -> None:
        """Emit one AST node: an integer push or a word reference."""
        if isinstance(node, Literal):
            builder.push_literal(node.value)
            return
        if isinstance(node, WordRef):
            self._emit_wordref(node, builder)
            return
        raise CompileError(f"unsupported AST node {node!r}")

    def _emit_wordref(self, ref: WordRef, builder: FunctionEmitter) -> None:
        """Emit a word invocation: inline intrinsic when available, else 'call'."""
        word = self.dictionary.lookup(ref.name)
        if word is None:
            raise CompileError(f"unknown word '{ref.name}'")
        if word.intrinsic:
            word.intrinsic(builder)
            return
        builder.emit(f" call {sanitize_label(ref.name)}")

    def _runtime_prelude(self) -> List[str]:
        """Emit _start: stack setup, the call to main, and exit via syscall 60.

        The process exit status is the data-stack top when main leaves one
        behind, otherwise 0. Both stacks share self.stack_bytes (they were
        both hard-coded to the same 65536 before).
        """
        return [
            f"%define DSTK_BYTES {self.stack_bytes}",
            f"%define RSTK_BYTES {self.stack_bytes}",
            f"%define PRINT_BUF_BYTES {self.io_buffer_bytes}",
            "global _start",
            "_start:",
            " ; initialize data/return stack pointers",
            " lea r12, [rel dstack_top]",
            " mov r15, r12",
            " lea r13, [rel rstack_top]",
            " call word_main",
            " mov rax, 0",
            " cmp r12, r15",
            " je .no_exit_value",
            " mov rax, [r12]",
            " add r12, 8",
            ".no_exit_value:",
            " mov rdi, rax",
            " mov rax, 60",
            " syscall",
        ]

    def _bss_layout(self) -> List[str]:
        """Reserve the data stack, return stack, and print buffer."""
        return [
            "align 16",
            "dstack: resb DSTK_BYTES",
            "dstack_top:",
            "align 16",
            "rstack: resb RSTK_BYTES",
            "rstack_top:",
            "align 16",
            "print_buf: resb PRINT_BUF_BYTES",
            "print_buf_end:",
        ]

    def write_asm(self, emission: Emission, path: Path) -> None:
        """Write the rendered assembly to *path*.

        A trailing newline is appended so the file ends with a complete
        line, as text-processing tools (and POSIX) expect.
        """
        path.write_text(emission.snapshot() + "\n")
# ---------------------------------------------------------------------------
# Built-in macros and intrinsics
# ---------------------------------------------------------------------------
def macro_immediate(parser: Parser) -> Optional[List[ASTNode]]:
    """Mark the most recently completed definition as immediate.

    Produces no AST nodes. Raises ParseError when no definition precedes it.
    """
    target = parser.most_recent_definition()
    if target is None:
        raise ParseError("'immediate' must follow a definition")
    target.immediate = True
    definition = target.definition
    if definition is not None:
        # Keep the AST node's flag in sync with the dictionary entry.
        definition.immediate = True
    return None
def bootstrap_dictionary() -> Dictionary:
    """Create the seed dictionary, containing only the 'immediate' macro word."""
    dictionary = Dictionary()
    immediate_word = Word(name="immediate", immediate=True, macro=macro_immediate)
    dictionary.register(immediate_word)
    return dictionary
# ---------------------------------------------------------------------------
# Driver
# ---------------------------------------------------------------------------
class Compiler:
    """Wires the reader, dictionary, parser, and assembler together."""

    def __init__(self) -> None:
        self.reader = Reader()
        self.dictionary = bootstrap_dictionary()
        self.parser = Parser(self.dictionary)
        self.assembler = Assembler(self.dictionary)

    def compile_source(self, source: str) -> Emission:
        """Tokenize and parse *source*, then lower it to an Emission."""
        tokens = list(self.reader.tokenize(source))
        module = self.parser.parse(tokens, source)
        return self.assembler.emit(module)

    def compile_file(self, path: Path) -> Emission:
        """Compile *path* after textually splicing in its 'import' directives."""
        source = self._load_with_imports(path.resolve())
        return self.compile_source(source)

    def _load_with_imports(self, path: Path, seen: Optional[Set[Path]] = None) -> str:
        """Return *path*'s text with each 'import <target>' line replaced by
        the recursively-expanded target file (resolved relative to *path*).

        Already-visited files expand to an empty string, which both breaks
        import cycles and de-duplicates shared imports.
        """
        if seen is None:
            seen = set()
        path = path.resolve()
        if path in seen:
            return ""
        seen.add(path)
        try:
            contents = path.read_text()
        except FileNotFoundError as exc:
            raise ParseError(f"cannot import {path}: {exc}") from exc
        lines: List[str] = []
        for idx, line in enumerate(contents.splitlines()):
            stripped = line.strip()
            if stripped.startswith("import "):
                target = stripped.split(None, 1)[1].strip()
                if not target:
                    raise ParseError(f"empty import target in {path}:{idx + 1}")
                target_path = (path.parent / target).resolve()
                lines.append(self._load_with_imports(target_path, seen))
                continue
            lines.append(line)
        return "\n".join(lines) + "\n"
def run_nasm(asm_path: Path, obj_path: Path) -> None:
    """Assemble *asm_path* into an ELF64 object at *obj_path*.

    Raises subprocess.CalledProcessError when nasm exits non-zero.
    """
    command = ["nasm", "-f", "elf64", "-o", str(obj_path), str(asm_path)]
    subprocess.run(command, check=True)
def run_linker(obj_path: Path, exe_path: Path) -> None:
    """Link *obj_path* into the executable *exe_path* using ld.

    Raises subprocess.CalledProcessError when ld exits non-zero.
    """
    command = ["ld", "-o", str(exe_path), str(obj_path)]
    subprocess.run(command, check=True)
def cli(argv: Sequence[str]) -> int:
    """Command-line driver: compile, then optionally assemble and link.

    Returns a process exit status (0 on success; failures raise).
    """
    arg_parser = argparse.ArgumentParser(description="L2 compiler driver")
    arg_parser.add_argument("source", type=Path, help="input .sl file")
    arg_parser.add_argument("-o", dest="output", type=Path, default=Path("a.out"))
    arg_parser.add_argument("--emit-asm", action="store_true", help="stop after generating asm")
    arg_parser.add_argument("--temp-dir", type=Path, default=Path("build"))
    options = arg_parser.parse_args(argv)

    compiler = Compiler()
    emission = compiler.compile_file(options.source)

    options.temp_dir.mkdir(parents=True, exist_ok=True)
    stem = options.source.stem
    asm_path = options.temp_dir / f"{stem}.asm"
    obj_path = options.temp_dir / f"{stem}.o"
    compiler.assembler.write_asm(emission, asm_path)
    if options.emit_asm:
        print(f"[info] wrote {asm_path}")
        return 0

    run_nasm(asm_path, obj_path)
    run_linker(obj_path, options.output)
    print(f"[info] built {options.output}")
    return 0
def main() -> None:
    """Script entry point: run the CLI and exit with its status."""
    sys.exit(cli(sys.argv[1:]))


if __name__ == "__main__":
    main()